New York City Airbnb Market Analysis (2019)¶

Shraddha Chandrashekar
UID: 122092846


Notebook goal: build an end-to-end workflow (cleaning → EDA → hypothesis testing → ML) using the AB_NYC_2019.csv dataset.



Table of Contents¶

  1. Introduction
  2. Data Collection
  3. Data Processing
  4. Exploratory Analysis & Data Visualization
  5. Hypothesis Testing
  6. Machine Learning
  7. Conclusions
  8. Limitations & Future Work
  9. External Links

1. Introduction¶

Short-term rental platforms such as Airbnb have transformed the housing and tourism markets of major metropolitan areas. New York City, one of the most visited cities in the world, represents a particularly complex and competitive Airbnb marketplace due to its diverse neighborhoods, strict housing regulations, and wide variation in listing prices.

For hosts, determining an appropriate nightly price is a challenging decision that depends on multiple factors including location, room type, availability, and host behavior. For policymakers, understanding pricing and listing patterns can help assess housing availability and the impact of short-term rentals on local communities.

In this project, we analyze Airbnb listings in New York City using data from 2019. The goals of this analysis are threefold:

  • To understand how prices vary across boroughs and room types
  • To identify key features that influence Airbnb pricing
  • To build a machine learning model capable of predicting listing prices

This notebook is structured as a step-by-step tutorial, walking through data collection, cleaning, exploratory analysis, hypothesis testing, and predictive modeling.

2. Data Collection¶

The dataset used in this analysis comes from Inside Airbnb, a publicly available project that provides detailed information about Airbnb listings in cities around the world.

Dataset Overview¶

  • File name: AB_NYC_2019.csv
  • Geographic scope: New York City, USA
  • Time period: 2019
  • Number of listings: ~49,000
  • Unit of observation: One Airbnb listing

Each row in the dataset represents a unique Airbnb listing and includes information about its price, location, room type, availability, and host characteristics.

Key Features¶

Some of the most important variables in the dataset include:

  • price: Nightly listing price in USD
  • neighbourhood_group: Borough (e.g., Manhattan, Brooklyn)
  • neighbourhood: Specific neighborhood within a borough
  • room_type: Entire home/apartment, private room, or shared room
  • minimum_nights: Minimum stay requirement
  • number_of_reviews: Total number of reviews
  • availability_365: Number of days available in a year

This dataset does not include booking data or actual transaction prices, which is an important limitation discussed later.

Importing Libraries and Loading Data

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)

df_raw = pd.read_csv("AB_NYC_2019.csv")
df_raw.head()
Out[3]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 2539 Clean & quiet apt home by the park 2787 John Brooklyn Kensington 40.64749 -73.97237 Private room 149 1 9 2018-10-19 0.21 6 365
1 2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225 1 45 2019-05-21 0.38 2 355
2 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth Manhattan Harlem 40.80902 -73.94190 Private room 150 3 0 NaN NaN 1 365
3 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89 1 270 2019-07-05 4.64 1 194
4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80 10 9 2018-11-19 0.10 1 0

Initial Data Inspection¶

Before cleaning, I check:

  • column names and types,
  • missing values,
  • basic summary statistics.

This step helps identify which columns are useful and which columns need cleaning.

In [4]:
df_raw.info()
df_raw.isnull().sum()
df_raw.describe(include='all')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     38843 non-null  object 
 13  reviews_per_month               38843 non-null  float64
 14  calculated_host_listings_count  48895 non-null  int64  
 15  availability_365                48895 non-null  int64  
dtypes: float64(3), int64(7), object(6)
memory usage: 6.0+ MB
Out[4]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
count 4.889500e+04 48879 4.889500e+04 48874 48895 48895 48895.000000 48895.000000 48895 48895.000000 48895.000000 48895.000000 38843 38843.000000 48895.000000 48895.000000
unique NaN 47905 NaN 11452 5 221 NaN NaN 3 NaN NaN NaN 1764 NaN NaN NaN
top NaN Hillside Hotel NaN Michael Manhattan Williamsburg NaN NaN Entire home/apt NaN NaN NaN 2019-06-23 NaN NaN NaN
freq NaN 18 NaN 417 21661 3920 NaN NaN 25409 NaN NaN NaN 1413 NaN NaN NaN
mean 1.901714e+07 NaN 6.762001e+07 NaN NaN NaN 40.728949 -73.952170 NaN 152.720687 7.029962 23.274466 NaN 1.373221 7.143982 112.781327
std 1.098311e+07 NaN 7.861097e+07 NaN NaN NaN 0.054530 0.046157 NaN 240.154170 20.510550 44.550582 NaN 1.680442 32.952519 131.622289
min 2.539000e+03 NaN 2.438000e+03 NaN NaN NaN 40.499790 -74.244420 NaN 0.000000 1.000000 0.000000 NaN 0.010000 1.000000 0.000000
25% 9.471945e+06 NaN 7.822033e+06 NaN NaN NaN 40.690100 -73.983070 NaN 69.000000 1.000000 1.000000 NaN 0.190000 1.000000 0.000000
50% 1.967728e+07 NaN 3.079382e+07 NaN NaN NaN 40.723070 -73.955680 NaN 106.000000 3.000000 5.000000 NaN 0.720000 1.000000 45.000000
75% 2.915218e+07 NaN 1.074344e+08 NaN NaN NaN 40.763115 -73.936275 NaN 175.000000 5.000000 24.000000 NaN 2.020000 2.000000 227.000000
max 3.648724e+07 NaN 2.743213e+08 NaN NaN NaN 40.913060 -73.712990 NaN 10000.000000 1250.000000 629.000000 NaN 58.500000 327.000000 365.000000
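The `info()` output above shows that `last_review` and `reviews_per_month` are missing for roughly 10,000 rows. A quick way to see missingness as percentages is `isnull().mean()`; a minimal sketch on a toy stand-in for `df_raw`:

```python
import numpy as np
import pandas as pd

# Toy stand-in mirroring columns with missing values in AB_NYC_2019.csv
toy = pd.DataFrame({
    'name': ['a', None, 'c', 'd'],
    'reviews_per_month': [0.21, np.nan, 4.64, np.nan],
    'price': [149, 225, 150, 89],
})

# Fraction of nulls per column, expressed as a percentage
missing_pct = toy.isnull().mean().mul(100).round(1).sort_values(ascending=False)
print(missing_pct)
```

On the real data, `df_raw.isnull().mean().mul(100)` gives the same per-column view in one line.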

3. Data Processing¶

Before performing analysis or modeling, the dataset must be cleaned and standardized. Raw real-world data often contains missing values, extreme outliers, and inconsistent formatting that can negatively affect results.

Main goals:

  1. Handle missing values in important columns
  2. Fix invalid values (e.g., price <= 0)
  3. Reduce the effect of extreme outliers in price
  4. Convert categorical columns to appropriate types
  5. Create a few features that make the analysis easier and more interpretable (e.g., log(price), reviews per month)

Copy, Drop Missing, and Remove Invalid Values

In [5]:
df = df_raw.copy()

# Remove rows with missing price or room type
df = df.dropna(subset=['price', 'room_type', 'neighbourhood_group', 'neighbourhood'])

# Remove zero or negative prices
df = df[df['price'] > 0]

# Also remove listings with missing critical numeric fields used later
df = df.dropna(subset=['minimum_nights', 'number_of_reviews', 'availability_365'])

df.shape
Out[5]:
(48884, 16)

Handling Outliers in Price¶

Airbnb prices can have a long right tail (very expensive listings). A few extreme values can distort averages and make models unstable. A simple approach is to cap prices at a reasonable upper bound.

Here I cap prices at $1000. This keeps expensive listings in the data but limits their influence.

Cap price and Check

In [6]:
df['price_capped'] = np.where(df['price'] > 1000, 1000, df['price'])

print("Max original price:", df['price'].max())
print("Max capped price:", df['price_capped'].max())
df[['price', 'price_capped']].describe()
Max original price: 10000
Max capped price: 1000
Out[6]:
price price_capped
count 48884.000000 48884.000000
mean 152.755053 145.510024
std 240.170260 130.946570
min 10.000000 10.000000
25% 69.000000 69.000000
50% 106.000000 106.000000
75% 175.000000 175.000000
max 10000.000000 1000.000000
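The flat $1000 cap is a judgment call. A common alternative is the IQR rule (cap at Q3 + 1.5·IQR), sketched here on a toy stand-in for the price column:

```python
import pandas as pd

# Toy stand-in for df['price']: mostly moderate values plus one extreme listing
prices = pd.Series([69, 90, 106, 120, 175, 5000])

q1, q3 = prices.quantile(0.25), prices.quantile(0.75)
upper = q3 + 1.5 * (q3 - q1)  # classic IQR fence

capped = prices.clip(upper=upper)
print("IQR cap:", upper, "max after capping:", capped.max())
```

Unlike a fixed dollar cap, the IQR fence adapts to the distribution, though on long-tailed data it can cap more aggressively than intended.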

Data Type Conversion¶

Some columns are categorical (borough and room type). Setting them as categorical helps keep the dataset clean and also helps later when encoding features for modeling.

In [9]:
df['neighbourhood_group'] = df['neighbourhood_group'].astype('category')
df['room_type'] = df['room_type'].astype('category')

Feature Engineering¶

A few additional features make analysis easier:

  • log_price: log transform to reduce skewness
  • reviews_per_month: some listings have missing values; fill missing with 0
  • has_reviews: whether the listing has any reviews
  • availability_rate: availability_365 scaled to [0,1]

Create features

In [10]:
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)
df['has_reviews'] = (df['number_of_reviews'] > 0).astype(int)

df['log_price'] = np.log1p(df['price_capped'])
df['availability_rate'] = df['availability_365'] / 365

df[['price_capped', 'log_price', 'reviews_per_month', 'availability_rate']].head()
Out[10]:
price_capped log_price reviews_per_month availability_rate
0 149 5.010635 0.21 1.000000
1 225 5.420535 0.38 0.972603
2 150 5.017280 0.00 1.000000
3 89 4.499810 4.64 0.531507
4 80 4.394449 0.10 0.000000

4. Exploratory Analysis & Data Visualization¶

Now that the data is cleaned, I explore:

  • the overall distribution of prices,
  • how prices vary across boroughs and room types,
  • whether review/availability variables show visible relationships to price,
  • which numeric variables correlate with one another.

For each plot, the goal is to interpret what it suggests about pricing behavior.

Price Distribution (Raw vs Log Transformed)¶

Airbnb prices are usually right-skewed (many moderate prices, few expensive listings). A log transform often makes patterns easier to see.

Histograms

In [11]:
plt.figure(figsize=(7,4))
plt.hist(df['price_capped'], bins=50)
plt.title("Distribution of Price (capped at $1000)")
plt.xlabel("Price")
plt.ylabel("Frequency")
plt.show()

plt.figure(figsize=(7,4))
plt.hist(df['log_price'], bins=50)
plt.title("Distribution of log(1 + price)")
plt.xlabel("log(1 + price)")
plt.ylabel("Frequency")
plt.show()

Price by Borough¶

I expect Manhattan to have the highest prices because of tourism and central location. Brooklyn often follows. Queens, Bronx, and Staten Island tend to be cheaper on average.

Boxplots help compare distributions (not just averages).

Boxplot borough

In [12]:
plt.figure(figsize=(10,6))
sns.boxplot(x='neighbourhood_group', y='price_capped', data=df)
plt.title("Price Distribution by Borough (capped)")
plt.xlabel("Borough")
plt.ylabel("Price (capped)")
plt.show()

Price by Room Type¶

Room type should matter a lot. Entire homes/apartments are expected to be more expensive than private rooms, and shared rooms should be the least expensive.

Boxplot room type

In [13]:
plt.figure(figsize=(10,6))
sns.boxplot(x='room_type', y='price_capped', data=df)
plt.title("Price Distribution by Room Type (capped)")
plt.xlabel("Room Type")
plt.ylabel("Price (capped)")
plt.xticks(rotation=20)
plt.show()

df.groupby("room_type", observed=True)['price_capped'].mean().sort_values()
Out[13]:
room_type
Shared room         69.341969
Private room        86.673552
Entire home/apt    200.667021
Name: price_capped, dtype: float64

Borough + Room Type Together¶

To understand whether borough differences persist within each room type, I compare price by borough and room type together.

Grouped plot

In [14]:
plt.figure(figsize=(12,6))
sns.boxplot(x='neighbourhood_group', y='price_capped', hue='room_type', data=df)
plt.title("Price by Borough and Room Type (capped)")
plt.xlabel("Borough")
plt.ylabel("Price (capped)")
plt.legend(title="Room Type")
plt.show()

Most Expensive Neighborhoods (by Median Price)¶

Borough is a broad label. Neighborhoods can vary a lot even within the same borough. Here I compute median price by neighborhood and look at the top 15 neighborhoods.

Top neighborhoods

In [15]:
neigh_median = (
    df.groupby('neighbourhood')['price_capped']
      .median()
      .sort_values(ascending=False)
      .head(15)
)

plt.figure(figsize=(10,6))
neigh_median.sort_values().plot(kind='barh')
plt.title("Top 15 Neighborhoods by Median Price (capped)")
plt.xlabel("Median Price (capped)")
plt.ylabel("Neighborhood")
plt.show()

neigh_median
Out[15]:
neighbourhood
Fort Wadsworth        800.0
Woodrow               700.0
Tribeca               295.0
Neponsit              274.0
NoHo                  250.0
Willowbrook           249.0
Flatiron District     225.0
Midtown               210.0
West Village          200.0
Financial District    200.0
SoHo                  199.0
Chelsea               199.0
Greenwich Village     197.5
Breezy Point          195.0
Battery Park City     195.0
Name: price_capped, dtype: float64
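Some of these top medians (e.g., Fort Wadsworth, Woodrow) likely come from only a handful of listings, which makes them fragile. A sketch of requiring a minimum listing count before ranking, using a toy stand-in for `df`:

```python
import pandas as pd

# Toy stand-in: neighbourhood 'B' has a single expensive listing
toy = pd.DataFrame({
    'neighbourhood': ['A', 'A', 'A', 'B', 'C', 'C', 'C', 'C'],
    'price_capped':  [100, 120, 110, 800, 150, 160, 155, 170],
})

stats = toy.groupby('neighbourhood')['price_capped'].agg(['median', 'count'])
# Keep only neighbourhoods with at least 3 listings before ranking
reliable = stats[stats['count'] >= 3].sort_values('median', ascending=False)
print(reliable)
```

Applied to the real data, the same filter would likely demote the small Staten Island and Queens neighborhoods at the top of the list.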

Reviews and Availability vs Price¶

Reviews and availability may relate to pricing, but the direction is not obvious. More reviews could mean high demand (possibly higher price), or it could mean lower price leading to more bookings. Availability could reflect host strategy as well.

Scatter plots help show whether there is any visible relationship.

Scatter plots

In [16]:
plt.figure(figsize=(7,4))
plt.scatter(df['number_of_reviews'], df['price_capped'], alpha=0.2)
plt.title("Price vs Number of Reviews")
plt.xlabel("Number of Reviews")
plt.ylabel("Price (capped)")
plt.show()

plt.figure(figsize=(7,4))
plt.scatter(df['availability_365'], df['price_capped'], alpha=0.2)
plt.title("Price vs Availability (days/year)")
plt.xlabel("Availability_365")
plt.ylabel("Price (capped)")
plt.show()

Correlation Heatmap (Numeric Variables)¶

Correlation does not imply causation, but it helps identify whether variables move together. I compute correlations among numeric variables and visualize them.

In [17]:
plt.figure(figsize=(8,6))
sns.heatmap(
    df[['price_capped', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'availability_365']].corr(),
    annot=True
)
plt.title("Correlation Heatmap (Numeric Features)")
plt.show()

Correlation inspection

In [35]:
numeric_features = df.select_dtypes(include=['int64', 'float64'])
correlation_matrix = numeric_features.corr()

# Identify strongest positive and negative correlations
correlation_pairs = correlation_matrix.unstack().sort_values()
strong_negative = correlation_pairs.head(5)
strong_positive = correlation_pairs.tail(5)

print("Strongest Negative Correlations:")
print(strong_negative)

print("\nStrongest Positive Correlations:")
print(strong_positive)
Strongest Negative Correlations:
longitude          log_price           -0.329992
log_price          longitude           -0.329992
number_of_reviews  id                  -0.319800
id                 number_of_reviews   -0.319800
price_capped       longitude           -0.247066
dtype: float64

Strongest Positive Correlations:
price_capped       price_capped         1.0
has_reviews        has_reviews          1.0
availability_rate  availability_rate    1.0
                   availability_365     1.0
availability_365   availability_rate    1.0
dtype: float64
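The "strongest positive correlations" above are dominated by trivial entries: every variable correlates 1.0 with itself, and each pair appears twice (once in each order). A sketch of masking the diagonal and mirrored duplicates before ranking, on a toy stand-in frame:

```python
import numpy as np
import pandas as pd

# Toy stand-in: 'a' and 'b' are strongly related, 'c' is independent noise
rng = np.random.default_rng(0)
x = rng.normal(size=200)
toy = pd.DataFrame({
    'a': x,
    'b': x + rng.normal(scale=0.1, size=200),
    'c': rng.normal(size=200),
})

corr = toy.corr()
# Upper triangle only: excludes the diagonal and mirrored duplicate pairs
mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
pairs = corr.where(mask).stack().dropna().sort_values()

print(pairs.tail(3))
```

With this mask, each variable pair appears exactly once and the 1.0 self-correlations disappear from the ranking.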

5. Hypothesis Testing¶

Based on EDA, room type seems to strongly affect price. A simple test is to compare the mean prices of two room types.

Here I compare:

  • Entire home/apt vs Private room

Null hypothesis (H0): average prices are equal
Alternative hypothesis (H1): average prices differ

t-test

In [20]:
from scipy.stats import ttest_ind

entire = df[df['room_type']=="Entire home/apt"]['price_capped']
private = df[df['room_type']=="Private room"]['price_capped']

t_stat, p_val = ttest_ind(entire, private, equal_var=False)

t_stat, p_val
Out[20]:
(np.float64(108.68412190706182), np.float64(0.0))
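The p-value prints as 0.0 because it falls below floating-point resolution. Since prices are heavily right-skewed even after capping, a nonparametric complement such as the Mann-Whitney U test is a reasonable robustness check; a sketch on synthetic stand-ins for the two price samples:

```python
import numpy as np
from scipy.stats import mannwhitneyu

# Synthetic stand-ins for the entire-home and private-room price samples
rng = np.random.default_rng(42)
entire = rng.lognormal(mean=5.2, sigma=0.5, size=500)
private = rng.lognormal(mean=4.4, sigma=0.5, size=500)

# Rank-based test: robust to the heavy right skew of prices
u_stat, p_val = mannwhitneyu(entire, private, alternative='two-sided')
print("U =", u_stat, " p =", p_val)
```

On the real `entire`/`private` series the conclusion would almost certainly agree with the Welch t-test, but the rank-based version does not depend on approximate normality of the means.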

Testing Price Differences Across Boroughs (ANOVA)¶

To test whether boroughs differ in mean price, ANOVA is a common approach.

Null hypothesis (H0): borough means are equal
Alternative hypothesis (H1): at least one borough differs

In [21]:
from scipy.stats import f_oneway

groups = []
for b in df['neighbourhood_group'].cat.categories:
    groups.append(df[df['neighbourhood_group'] == b]['price_capped'])

anova_stat, anova_p = f_oneway(*groups)
anova_stat, anova_p
Out[21]:
(np.float64(1035.230812771272), np.float64(0.0))
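ANOVA only tells us that at least one borough differs, not which ones. A simple follow-up is pairwise Welch t-tests with a Bonferroni correction for multiple comparisons; a sketch on synthetic stand-ins for the per-borough price samples:

```python
from itertools import combinations

import numpy as np
from scipy.stats import ttest_ind

# Synthetic stand-ins for three per-borough price samples
rng = np.random.default_rng(0)
groups = {
    'Manhattan': rng.normal(196, 50, 300),
    'Brooklyn':  rng.normal(120, 40, 300),
    'Queens':    rng.normal(96, 30, 300),
}

pairs = list(combinations(groups, 2))
alpha = 0.05 / len(pairs)  # Bonferroni-adjusted significance threshold

for a, b in pairs:
    t, p = ttest_ind(groups[a], groups[b], equal_var=False)  # Welch t-test
    print(f"{a} vs {b}: t={t:.1f}, p={p:.2e}, significant={p < alpha}")
```

With the real five boroughs there would be ten pairs, so the adjusted threshold becomes 0.005.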

6. Machine Learning: Predicting Price¶

In this section I build a regression model to predict listing price.

I start with a simple baseline and then train a Random Forest model.

Important modeling notes:

  • Price is skewed; I created log_price as an alternative target, though the models below predict the capped price directly.
  • Categorical variables must be encoded (borough + room type).
  • Evaluation uses MAE (Mean Absolute Error), which is easy to interpret in dollars.
In [25]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Simple baseline: predict the median price for everyone
y = df['price_capped']
y_train, y_test = train_test_split(y, test_size=0.2, random_state=42)

baseline_pred = np.median(y_train) * np.ones_like(y_test)
baseline_mae = mean_absolute_error(y_test, baseline_pred)

baseline_mae
Out[25]:
77.2404623095019

Random Forest Model (Numeric Features Only)¶

As a first model, I use only numeric features:

  • minimum_nights
  • number_of_reviews
  • availability_365

This is a limited feature set, but it helps establish a starting point.

In [36]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Feature selection
X = df[['minimum_nights', 'number_of_reviews', 'availability_365']]
y = df['price_capped']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train Random Forest model
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Model evaluation using MAE
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae:.4f}")

# Compare model error to a naive baseline: the mean of the test target
# (np.full_like casts the mean to y_test's integer dtype; this differs from
# the earlier median-of-train baseline, hence the different Baseline MAE)
baseline_pred = np.full_like(y_test, y_test.mean())
baseline_mae = mean_absolute_error(y_test, baseline_pred)

print(f"Baseline MAE: {baseline_mae:.4f}")
print(f"Improvement over baseline: {baseline_mae - mae:.4f}")
Mean Absolute Error (MAE): 86.4471
Baseline MAE: 83.0890
Improvement over baseline: -3.3581

Feature Importance (Numeric-Only Model)¶

Note that the numeric-only model underperforms the naive baseline on this split, which suggests these three variables alone carry little pricing signal. Feature importance still gives a rough sense of which numeric variables the model relies on most; it does not prove causation, but it helps interpretation.

In [27]:
plt.figure(figsize=(6,4))
plt.bar(X.columns, model.feature_importances_)
plt.title("Feature Importance (Numeric Features)")
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.show()
In [34]:
# Feature importance analysis for interpretability
# (RandomForestRegressor exposes feature_importances_, not coef_)

if hasattr(model, "feature_importances_"):
    feature_importance = pd.Series(
        model.feature_importances_, index=X.columns
    ).sort_values(ascending=False)

    print("Feature Importance:")
    print(feature_importance)

Improved Model: Include Borough and Room Type¶

The earlier model ignores two major drivers of price: borough and room type. To include them, I one-hot encode these categorical variables and re-train the model.

This should improve predictive performance because borough/room type capture major price differences.

One-hot and RF

In [28]:
X2 = df[['minimum_nights', 'number_of_reviews', 'availability_365', 'neighbourhood_group', 'room_type']]
X2 = pd.get_dummies(X2, drop_first=True)

y2 = df['price_capped']

X_train, X_test, y_train, y_test = train_test_split(
    X2, y2, test_size=0.2, random_state=42
)

model2 = RandomForestRegressor(n_estimators=300, random_state=42)
model2.fit(X_train, y_train)

pred2 = model2.predict(X_test)
mae2 = mean_absolute_error(y_test, pred2)

baseline_mae, mae, mae2
Out[28]:
(77.2404623095019, 86.44710796983082, 64.4309596953749)
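Given the skew noted earlier, one natural variant is to train on log_price and convert predictions back with expm1 so that MAE stays in dollars. A minimal sketch on synthetic data (a stand-in, not the notebook's actual features):

```python
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Synthetic stand-in: one feature driving a right-skewed price
rng = np.random.default_rng(0)
X = rng.uniform(0, 1, size=(1000, 1))
price = np.expm1(4 + 2 * X[:, 0] + rng.normal(0, 0.3, 1000))

X_tr, X_te, y_tr, y_te = train_test_split(X, price, test_size=0.2, random_state=42)

model_log = RandomForestRegressor(n_estimators=100, random_state=42)
model_log.fit(X_tr, np.log1p(y_tr))       # fit on the log scale

pred = np.expm1(model_log.predict(X_te))  # convert back to dollars
mae = mean_absolute_error(y_te, pred)
print(f"MAE in dollars: {mae:.2f}")
```

On the real data, the same pattern would reuse X2/y2 from the improved model above; whether it helps depends on how the dollar-scale errors redistribute after the back-transform.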

Interpreting the Improved Model¶

After adding borough and room type, the model performs noticeably better (test MAE drops from roughly 86 to 64). I also compare predicted vs actual prices to see whether the model systematically under- or over-predicts.

In [29]:
plt.figure(figsize=(6,6))
plt.scatter(y_test, pred2, alpha=0.2)
plt.title("Predicted vs Actual Price (capped)")
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.plot([0, 1000], [0, 1000])
plt.show()

7. Conclusions¶

Main takeaways from this analysis:

  • Price is strongly related to borough and room type.
  • Manhattan and entire-home listings tend to be the most expensive.
  • Neighborhoods vary substantially even within the same borough.
  • A Random Forest model improves substantially when borough and room type are included.

Even with a better model, predicting very expensive listings is still difficult because those prices are influenced by factors not captured here (e.g., exact address, amenities, seasonal demand).

8. Limitations and Future Work¶

Limitations:

  • Listed prices may not match booked prices.
  • The dataset does not include time/seasonality (which matters for NYC).
  • No amenities or text descriptions are included here.

Future work:

  • Predict occupancy or booking likelihood (classification task).
  • Add geographic features using latitude/longitude.
  • Use neighborhood-level aggregation and clustering.
  • Include amenities/text data if available.
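As a concrete example of the latitude/longitude idea, a hedged sketch of a distance-to-center feature using the haversine formula (Times Square coordinates assumed as the reference point):

```python
import numpy as np

def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometers between two (lat, lon) points."""
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    return 2 * 6371 * np.arcsin(np.sqrt(a))

# Assumed reference point: Times Square
TS_LAT, TS_LON = 40.7580, -73.9855

# Example: coordinates of the first listing in the dataset (Kensington, Brooklyn)
dist = haversine_km(40.64749, -73.97237, TS_LAT, TS_LON)
print(f"Distance to Times Square: {dist:.1f} km")
```

Because the function is written with numpy operations, it could be applied vectorized to the whole latitude/longitude columns to create a single numeric feature for the model.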

There are multiple directions in which this analysis could be extended in future work. One potential improvement would be incorporating additional data sources to enrich the feature space. External datasets, such as demographic, economic, or temporal information, could help capture broader contextual factors and improve model performance.

Another extension involves experimenting with alternative modeling approaches. Techniques such as ensemble methods, regularization strategies, or nonlinear models may better capture complex relationships within the data. Hyperparameter tuning and cross-validation could also be applied more extensively to optimize model performance and reduce overfitting.

Finally, deploying this analysis as an interactive dashboard or web application would increase its real-world utility. Allowing users to explore trends dynamically or input new data points could transform this tutorial into a practical decision-support tool. These extensions demonstrate how the current tutorial serves as a strong foundation for deeper and more impactful data science projects.

9. External Links¶

  • Airbnb NYC Open Data (Kaggle): https://www.kaggle.com/datasets/dgomonov/new-york-city-airbnb-open-data

  • Airbnb Pricing Economics Study (Harvard Business Review): https://hbr.org/2023/01/hotel-pricing-lessons-from-airbnb

    (Alternative academic study if you prefer a research paper: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2867871)

  • NYC Tourism & Visitor Statistics (NYC’s Official Tourism Reports): https://www.nycgo.com/tourism-visitor-data/

  • scikit-learn (sklearn) Documentation: https://scikit-learn.org/stable/documentation.html

  • seaborn Documentation: https://seaborn.pydata.org/

  • Pandas documentation: https://pandas.pydata.org/docs/

  • Matplotlib visualization guide: https://matplotlib.org/stable/tutorials/index.html