library(tidyverse)
library(tidymodels)
library(lubridate) # In case we need date manipulations
library(sf) # For geospatial data / geometry
library(ggplot2) # For plotting
library(readr) # For reading CSV
library(dplyr) # For data manipulation
library(purrr) # For functional programming
library(parsnip) # For model specification
library(workflows) # For workflows
library(rsample) # For splitting and cross-validation
library(tune) # For tuning
library(yardstick) # For metrics
library(dials) # For tuning parameters
library(recipes) # For feature engineering
library(ggrepel) # For improved label placing on plots (optional)
library(rlang) # For tidy evaluation
library(leaflet) # For interactive maps
library(xgboost) # For XGBoost model
library(scales) # For rescale(), used when sizing map markers
Team 3 - Challenge 2: AirBnB Pricing
Introduction
In this report we analyze Airbnb pricing data in Barcelona to build a predictive model for rental prices and to develop an investment strategy for a real estate speculator. As specified in the instructions, we use XGBoost to predict (log) prices and finally evaluate investment opportunities across different neighborhoods.
Loading Libraries and Initial Data
First, we load the average price data by neighborhood:
avg_prices <- data.frame(
  neighbourhood = c("eixample", "ciutat vella", "sant marti", "sants-montjuic",
                    "sarria-sant gervasi", "nou barris", "horta-guinardo",
                    "gracia", "sant andreu", "les corts"),
  avg_price_m2 = c(5881, 4751, 4728, 4220, 6242, 2610, 3872, 5153, 3624, 5613),
  avg_price = c(684012, 392645, 435215, 299140, 980439, 201074, 310891, 500411, 288534, 779088),
  latitude = c(41.389887, 41.382183, 41.412146, 41.374394, 41.402357,
               41.437541, 41.423826, 41.406744, 41.432717, 41.387295),
  longitude = c(2.161808, 2.176437, 2.204667, 2.140377, 2.134925,
                2.175310, 2.161701, 2.158146, 2.189304, 2.126046),
  pct_year_occupation = c(0.75, 0.8, 0.6, 0.7, 0.7, 0.5, 0.55, 0.7, 0.6, 0.7)
)
Next, we load and examine the main (cleaned) Airbnb dataset to confirm it is actually clean, so we don't run into issues later:
# Load the data
airbnb_clean <- read_csv("DATA/airbnb_clean.csv")

dim(airbnb_clean)
[1] 19833 113
glimpse(airbnb_clean)
Rows: 19,833
Columns: 113
$ host_id <dbl> 71615, 71615, …
$ host_since <date> 2010-01-19, 2…
$ host_response_time <chr> "within an hou…
$ host_response_rate <dbl> 99, 99, 100, 1…
$ host_is_superhost <lgl> FALSE, FALSE, …
$ host_listings_count <dbl> 45, 45, 2, 5, …
$ host_has_profile_pic <lgl> TRUE, TRUE, TR…
$ host_identity_verified <lgl> TRUE, TRUE, TR…
$ neighbourhood_group_cleansed <chr> "sant marti", …
$ latitude <dbl> 41.40889, 41.4…
$ longitude <dbl> 2.18555, 2.173…
$ is_location_exact <lgl> TRUE, TRUE, TR…
$ property_type <chr> "apartment", "…
$ room_type <chr> "entire home/a…
$ accommodates <dbl> 6, 8, 2, 6, 2,…
$ bathrooms <dbl> 1.0, 2.0, 1.0,…
$ bedrooms <dbl> 2, 3, 1, 3, 1,…
$ beds <dbl> 4, 6, 1, 8, 1,…
$ price <dbl> 130, 60, 33, 2…
$ cleaning_fee <dbl> 42, 50, NA, 80…
$ minimum_nights <dbl> 3, 1, 2, 3, 1,…
$ maximum_nights <dbl> 730, 1125, 112…
$ has_availability <lgl> TRUE, TRUE, TR…
$ number_of_reviews <dbl> 1, 15, 119, 45…
$ first_review <date> 2015-10-10, 2…
$ last_review <date> 2015-10-10, 2…
$ review_scores_rating <dbl> 80, 87, 90, 95…
$ review_scores_accuracy <dbl> 10, 9, 10, 10,…
$ review_scores_cleanliness <dbl> 10, 9, 9, 10, …
$ review_scores_checkin <dbl> 2, 10, 10, 10,…
$ review_scores_communication <dbl> 10, 10, 10, 10…
$ review_scores_location <dbl> 10, 9, 9, 9, 1…
$ review_scores_value <dbl> 8, 8, 9, 9, 9,…
$ instant_bookable <lgl> FALSE, TRUE, F…
$ has_verificator__email <lgl> TRUE, TRUE, TR…
$ has_verificator__phone <lgl> TRUE, TRUE, TR…
$ has_verificator__reviews <lgl> TRUE, TRUE, TR…
$ has_verificator__jumio <lgl> TRUE, TRUE, TR…
$ has_verificator__government_id <lgl> TRUE, TRUE, TR…
$ has_verificator__offline_government_id <lgl> FALSE, FALSE, …
$ has_verificator__selfie <lgl> FALSE, FALSE, …
$ has_verificator__identity_manual <lgl> FALSE, FALSE, …
$ has_verificator__facebook <lgl> FALSE, FALSE, …
$ has_verificator__work_email <lgl> FALSE, FALSE, …
$ has_amenity__TV <lgl> TRUE, TRUE, TR…
$ has_amenity__Internet <lgl> TRUE, TRUE, FA…
$ has_amenity__Wifi <lgl> TRUE, TRUE, TR…
$ has_amenity__Air.conditioning <lgl> TRUE, TRUE, FA…
$ has_amenity__Kitchen <lgl> TRUE, TRUE, TR…
$ has_amenity__Elevator <lgl> TRUE, TRUE, TR…
$ has_amenity__Free.street.parking <lgl> TRUE, TRUE, FA…
$ has_amenity__Heating <lgl> TRUE, TRUE, TR…
$ has_amenity__Family.kid.friendly <lgl> TRUE, TRUE, TR…
$ has_amenity__Washer <lgl> TRUE, TRUE, TR…
$ has_amenity__Dryer <lgl> TRUE, FALSE, T…
$ has_amenity__Essentials <lgl> TRUE, TRUE, TR…
$ has_amenity__Shampoo <lgl> TRUE, TRUE, TR…
$ has_amenity__Hair.dryer <lgl> TRUE, TRUE, TR…
$ has_amenity__Hot.water <lgl> TRUE, TRUE, TR…
$ has_amenity__Host.greets.you <lgl> TRUE, TRUE, FA…
$ has_amenity__Paid.parking.on.premises <lgl> TRUE, TRUE, TR…
$ has_amenity__Buzzer.wireless.intercom <lgl> FALSE, TRUE, F…
$ has_amenity__Hangers <lgl> FALSE, TRUE, T…
$ has_amenity__Iron <lgl> FALSE, TRUE, T…
$ has_amenity__Laptop.friendly.workspace <lgl> FALSE, TRUE, T…
$ has_amenity__Crib <lgl> FALSE, TRUE, F…
$ has_amenity__Paid.parking.off.premises <lgl> FALSE, FALSE, …
$ has_amenity__First.aid.kit <lgl> FALSE, FALSE, …
$ has_amenity__Self.check.in <lgl> FALSE, FALSE, …
$ has_amenity__Bed.linens <lgl> FALSE, FALSE, …
$ has_amenity__Extra.pillows.and.blankets <lgl> FALSE, FALSE, …
$ has_amenity__Microwave <lgl> FALSE, FALSE, …
$ has_amenity__Coffee.maker <lgl> FALSE, FALSE, …
$ has_amenity__Refrigerator <lgl> FALSE, FALSE, …
$ has_amenity__Dishwasher <lgl> FALSE, FALSE, …
$ has_amenity__Dishes.and.silverware <lgl> FALSE, FALSE, …
$ has_amenity__Cooking.basics <lgl> FALSE, FALSE, …
$ has_amenity__Oven <lgl> FALSE, FALSE, …
$ has_amenity__Stove <lgl> FALSE, FALSE, …
$ has_amenity__Patio.or.balcony <lgl> FALSE, FALSE, …
$ has_amenity__Luggage.dropoff.allowed <lgl> FALSE, FALSE, …
$ has_amenity__No.stairs.or.steps.to.enter <lgl> FALSE, FALSE, …
$ has_amenity__Wide.entrance.for.guests <lgl> FALSE, FALSE, …
$ has_amenity__Well.lit.path.to.entrance <lgl> FALSE, FALSE, …
$ has_amenity__Wide.entryway <lgl> FALSE, FALSE, …
$ has_amenity__Smoke.detector <lgl> FALSE, FALSE, …
$ has_amenity__Carbon.monoxide.detector <lgl> FALSE, FALSE, …
$ has_amenity__Fire.extinguisher <lgl> FALSE, FALSE, …
$ has_amenity__High.chair <lgl> FALSE, FALSE, …
$ has_amenity__Pack..n.Play.travel.crib <lgl> FALSE, FALSE, …
$ has_amenity__Long.term.stays.allowed <lgl> FALSE, FALSE, …
$ has_amenity__Wide.hallways <lgl> FALSE, FALSE, …
$ has_amenity__Smoking.allowed <lgl> FALSE, FALSE, …
$ has_amenity__Lock.on.bedroom.door <lgl> FALSE, FALSE, …
$ has_amenity__translation.missing..en.hosting_amenity_50 <lgl> FALSE, FALSE, …
$ has_amenity__Private.living.room <lgl> FALSE, FALSE, …
$ has_amenity__Cable.TV <lgl> FALSE, FALSE, …
$ has_amenity__Safety.card <lgl> FALSE, FALSE, …
$ has_amenity__24.hour.check.in <lgl> FALSE, FALSE, …
$ has_amenity__Private.entrance <lgl> FALSE, FALSE, …
$ has_amenity__Breakfast <lgl> FALSE, FALSE, …
$ has_amenity__translation.missing..en.hosting_amenity_49 <lgl> FALSE, FALSE, …
$ has_amenity__Room.darkening.shades <lgl> FALSE, FALSE, …
$ has_amenity__Pets.allowed <lgl> FALSE, FALSE, …
$ has_amenity__Pocket.wifi <lgl> FALSE, FALSE, …
$ has_amenity__Extra.space.around.bed <lgl> FALSE, FALSE, …
$ has_amenity__Accessible.height.bed <lgl> FALSE, FALSE, …
$ has_amenity__Bathtub <lgl> FALSE, FALSE, …
$ has_amenity__Wide.entrance <lgl> FALSE, FALSE, …
$ has_amenity__.toilet <lgl> FALSE, FALSE, …
$ has_amenity__Ethernet.connection <lgl> FALSE, FALSE, …
$ log_price <dbl> 4.867534, 4.09…
$ n_NA_cols <dbl> 0, 0, 1, 0, 1,…
# Identify columns with missing values
missing_cols <- airbnb_clean %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(cols = everything(), names_to = "column", values_to = "missing_count") %>%
  filter(missing_count > 0)

# Identify character columns
char_cols <- airbnb_clean %>%
  select(where(is.character))

# We conclude there is no need to create factors because we will use dummy variables.
# Additionally, there are no numeric variables that need to be converted to factors.
1. Test and Train Split
We start by splitting our data into training and testing sets. Since we're building a regression model to predict log_price, we stratify by this target variable to ensure both sets have similar distributions of the outcome.
set.seed(123)
airbnb_split <- initial_split(airbnb_clean, strata = log_price)
airbnb_train <- training(airbnb_split) # Default 75% train
airbnb_test <- testing(airbnb_split)   # Remaining 25% test
# Confirm splits
nrow(airbnb_train)
[1] 14874
nrow(airbnb_test)
[1] 4959
2. Cross-validation
To properly evaluate our model during training, we implement 10-fold cross-validation. This helps us assess how well our model will generalize to unseen data and avoid overfitting.
set.seed(123)
airbnb_folds <- vfold_cv(airbnb_train, v = 10, strata = log_price)
airbnb_folds
# 10-fold cross-validation using stratification
# A tibble: 10 × 2
splits id
<list> <chr>
1 <split [13384/1490]> Fold01
2 <split [13386/1488]> Fold02
3 <split [13386/1488]> Fold03
4 <split [13386/1488]> Fold04
5 <split [13386/1488]> Fold05
6 <split [13386/1488]> Fold06
7 <split [13388/1486]> Fold07
8 <split [13388/1486]> Fold08
9 <split [13388/1486]> Fold09
10 <split [13388/1486]> Fold10
3. Create Recipe
Now we create a preprocessing recipe for our data. This recipe includes several steps to prepare our data for modeling:
airbnb_recipe <- recipe(log_price ~ ., data = airbnb_train) %>%
  # 1. Remove the date columns that are not numeric
  step_rm(host_since, first_review, last_review) %>%
  # 2. Impute mode for categorical columns
  step_impute_mode(all_nominal_predictors()) %>%
  # 3. Impute median for numeric columns
  step_impute_median(all_numeric_predictors()) %>%
  # 4. Handle unseen levels in nominal columns: at least 3 detected
  step_novel(all_nominal_predictors()) %>%
  # 5. Convert character columns to dummy numeric columns
  step_dummy(all_nominal_predictors()) %>%
  # 6. Remove columns with only 1 value (zero variance): host_response_time_new,
  #    neighbourhood_group_cleansed_new, property_type_new, and room_type_new
  step_zv(all_predictors()) %>%
  # 7. Scale all numeric predictors, after the dummy columns are created;
  #    all_numeric_predictors() already excludes the outcome
  step_scale(all_numeric_predictors()) %>%
  # 8. Convert logical columns to numeric (0/1)
  step_mutate(across(where(is.logical), as.integer))
Our preprocessing steps handle missing values, convert categorical variables to numeric, eliminate zero-variance predictors, and scale numerical features.
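As an optional sanity check (a minimal sketch, not required for the pipeline), the recipe can be prepped and baked on the training data to inspect the columns it actually produces:

# Prep the recipe and bake the training set to inspect the processed columns
airbnb_recipe %>% prep() %>% bake(new_data = NULL) %>% glimpse()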
Next, we define our evaluation metrics and set up an XGBoost regression model with tuneable hyperparameters.
We selected rmse, rsq, and mae as our evaluation metrics.
- RMSE (Root Mean Square Error) highlights the magnitude of prediction errors, penalizing larger errors more heavily, which is critical for spotting outlier predictions in price estimation.
- R-squared measures the proportion of variance explained by our model, giving stakeholders a clear understanding of its explanatory power.
- MAE (Mean Absolute Error) provides a more intuitive measure that directly communicates the average prediction error. Note that since the target is log_price, both RMSE and MAE are on the log scale rather than in euros.
# Metric set: the target is log_price
airbnb_metrics <- metric_set(rmse, rsq, mae)

# XGBoost model specification with tune() placeholders for the hyperparameters
airbnb_model <- boost_tree(
  trees = tune(),
  mtry = tune(),
  min_n = tune(),
  learn_rate = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

# Combine model and recipe into a single workflow
airbnb_wf <- workflow() %>%
  add_model(airbnb_model) %>%
  add_recipe(airbnb_recipe)
4. Tune Hyperparameters
We use a random grid search approach to find the optimal hyperparameters for our XGBoost model. This is more computationally efficient than an exhaustive grid search while still exploring a diverse set of hyperparameter combinations.
# We keep predictions with control_resamples(save_pred = TRUE) to store them
ctrl <- control_resamples(save_pred = TRUE)

# Check if a grid search has already been performed and saved, otherwise re-run it
if (!file.exists("OUTPUT/Models/xgboost_grid.RData")) {
  # Create a grid of hyperparameters and evaluate each combination
  doParallel::registerDoParallel()
  set.seed(123)
  xgboost_grid <- tune_grid(
    airbnb_wf,
    resamples = airbnb_folds,
    grid = grid_random( # random search to save time
      trees(range = c(50, 100)),
      finalize(mtry(), airbnb_train),
      min_n(range = c(2, 10)),
      # NB: dials::learn_rate() operates on a log10 scale, so this range
      # explores eta values of roughly 10^0.01 to 10^0.3 (about 1.02-2.0),
      # consistent with the tuned eta of ~1.59 shown below
      learn_rate(range = c(0.01, 0.3)),
      size = 5
    ),
    metrics = airbnb_metrics,
    control = ctrl
  ) %>%
    collect_metrics() %>%
    arrange(.metric, mean)

  # Save the tuning results so we don't re-run the whole grid every time the notebook runs
  save(xgboost_grid, file = "OUTPUT/Models/xgboost_grid.RData")
}
Now we are loading and extracting the best performing model:
# MODEL SELECTION (BY RMSE)
# Load the results from the hyperparameter tuning process
load("OUTPUT/Models/xgboost_grid.RData")

best_rmse <- xgboost_grid %>%
  filter(.metric == "rmse") %>%
  arrange(mean) %>%  # Sort by lowest RMSE
  dplyr::slice(1)    # Select the best model

print(best_rmse)
# A tibble: 1 × 10
mtry trees min_n learn_rate .metric .estimator mean n std_err .config
<int> <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 101 63 4 1.59 rmse standard 0.155 10 0.0224 Preproces…
After identifying the best hyperparameters based on RMSE, we finalize our workflow and evaluate it on the test set:
# Finalize the workflow by applying the best hyperparameters (selected based on RMSE)
final_airbnb_wf <- finalize_workflow(airbnb_wf, best_rmse)

# Train the finalized model on the entire training set and evaluate it on the test set
airbnb_final_fit <- last_fit(
  final_airbnb_wf,          # The finalized workflow with the best hyperparameters
  split = airbnb_split,     # Train-test split used for final evaluation
  metrics = airbnb_metrics  # Evaluation metrics to assess model performance
)

# Collect and display the performance metrics of the final model
airbnb_final_fit %>% collect_metrics()
# A tibble: 3 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 0.157 Preprocessor1_Model1
2 rsq standard 0.967 Preprocessor1_Model1
3 mae standard 0.0786 Preprocessor1_Model1
On the held-out test set the model reaches an RMSE of 0.157 and an MAE of 0.079 on the log-price scale, with an R-squared of 0.967, indicating strong predictive quality on unseen data.
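Since the target is log_price, these errors are easiest to interpret after exponentiating. A minimal back-of-the-envelope conversion, using the test metrics printed above:

# Convert log-scale errors into approximate relative price errors
test_mae <- 0.0786
test_rmse <- 0.157
exp(test_mae) - 1  # ~0.08: a typical prediction is off by about 8% of the nightly price
exp(test_rmse) - 1 # ~0.17: the RMSE corresponds to roughly 17% relative error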
5. Creating the Airbnb Predictor
We extract the final fitted model from our workflow to use it for predictions:
# Extract the trained workflow from the final fit
airbnb_predictor <- airbnb_final_fit$.workflow[[1]]
airbnb_predictor
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()
── Preprocessor ────────────────────────────────────────────────────────────────
8 Recipe Steps
• step_rm()
• step_impute_mode()
• step_impute_median()
• step_novel()
• step_dummy()
• step_zv()
• step_scale()
• step_mutate()
── Model ───────────────────────────────────────────────────────────────────────
##### xgb.Booster
raw: 231.7 Kb
call:
xgboost::xgb.train(params = list(eta = 1.58545778104285, max_depth = 6,
gamma = 0, colsample_bytree = 1, colsample_bynode = 0.701388888888889,
min_child_weight = 4L, subsample = 1), data = x$data, nrounds = 63L,
watchlist = x$watchlist, verbose = 0, nthread = 1, objective = "reg:squarederror")
params (as set within xgb.train):
eta = "1.58545778104285", max_depth = "6", gamma = "0", colsample_bytree = "1", colsample_bynode = "0.701388888888889", min_child_weight = "4", subsample = "1", nthread = "1", objective = "reg:squarederror", validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 144
niter: 63
nfeatures : 144
evaluation_log:
iter training_rmse
<num> <num>
1 2.26356774
2 1.32678958
--- ---
62 0.04649578
63 0.04610318
6. Predictions and Accuracy
Now we apply our model to predict prices for the entire dataset and evaluate its accuracy:
# Get predictions of log_price for the entire *original* dataset (airbnb_clean)
all_predictions <- predict(airbnb_predictor, new_data = airbnb_clean) %>%
  bind_cols(airbnb_clean %>% select(log_price))

# Rename the columns for clarity
all_predictions <- all_predictions %>%
  rename(
    pred_log_price = .pred,
    actual_log_price = log_price
  )
# Plot: x= actual, y=pred
ggplot(all_predictions, aes(x = actual_log_price, y = pred_log_price,
color = abs(actual_log_price - pred_log_price))) +
geom_point(alpha = 0.2) + # Reduce opacity to minimize overlap
geom_abline(color = "red", linetype = "dashed", linewidth = 1) + # Highlight perfect fit line
scale_color_gradient(low = "blue", high = "red", name = "Prediction Error") + # Color based on error
labs(title = "Actual vs. Predicted log_price",
x = "Actual log_price",
y = "Predicted log_price") +
coord_cartesian(xlim = c(0, max(all_predictions$actual_log_price)),
ylim = c(0, max(all_predictions$pred_log_price))) + # Set limits dynamically
theme_minimal() +
theme(
axis.title.x = element_text(color = "blue", face = "bold", size = 12),
axis.title.y = element_text(color = "darkgreen", face = "bold", size = 12),
panel.grid.major = element_line(color = "grey80", linetype = "dotted"),
panel.grid.minor = element_blank()
)
This plot helps us visualize how well our predictions match actual values. Points closer to the diagonal red line indicate more accurate predictions, while the color gradient highlights larger errors. The model actually looks quite good, with only a few prediction outliers, which is also supported by the high R-squared of 0.967, i.e. roughly 97% of the underlying variance explained. (Keep in mind that these predictions cover the full dataset, including the training observations, so the fit here looks somewhat tighter than the test-set metrics alone.)
In this next part we will see how the model performs in different neighbourhoods and how we can use this information to make investment decisions.
7. Geospatial Analysis of Prediction Errors
7.1 Full Barcelona Map
To understand the spatial distribution of our prediction errors, we create geospatial visualizations. We start with the code and end with the interpretation.
# Calculate the prediction error for every listing
map_data <- airbnb_clean %>%
  select(listing_id = host_id, latitude, longitude, neighbourhood_group_cleansed) %>%
  bind_cols(all_predictions %>% select(pred_log_price, actual_log_price)) %>%
  mutate(
    error = pred_log_price - actual_log_price,
    actual_price = round(exp(actual_log_price), 2),
    predicted_price = round(exp(pred_log_price), 2)
  )

# Convert to sf object
map_data_sf <- st_as_sf(map_data, coords = c("longitude", "latitude"), crs = 4326)

neighborhoods_sf <- map_data_sf %>%
  group_by(neighbourhood_group_cleansed) %>%
  summarise(
    mean_error = mean(error, na.rm = TRUE), # Avg. error per neighborhood
    geometry = st_union(geometry)           # Merge edge points into a single geometry
  ) %>%
  st_convex_hull() # Create the hull for the map polygon

# Define the error color scales for individual and neighbourhood errors
# (the individual palette must use the listing-level errors, not the neighbourhood summary)
error_pal <- colorNumeric(palette = c('#d7191c', '#fdae61', 'white', '#abd9e9', '#2c7bb6'), domain = map_data_sf$error)
neighborhood_pal <- colorNumeric(palette = c('#d7191c', '#fdae61', 'white', '#abd9e9', '#2c7bb6'), domain = neighborhoods_sf$mean_error)
neighborhood_pal
# Create the map
city_map <- leaflet(options = leafletOptions(preferCanvas = TRUE)) %>%
  addTiles() %>%
  # Neighborhood polygons colored by average error
  addPolygons(
    data = neighborhoods_sf,
    fillColor = ~neighborhood_pal(mean_error),
    fillOpacity = 0.25,
    weight = 0.5,
    color = "lightgrey",
    popup = ~paste("Avg Error:", round(mean_error, 2))
  ) %>%
  # Individual error markers
  addCircleMarkers(
    data = map_data_sf,
    radius = ~rescale(abs(error), to = c(1, 15)),
    stroke = FALSE,
    fillColor = ~error_pal(error),
    fillOpacity = ~rescale(abs(error), to = c(0.3, 1)), # opacity must lie in [0, 1]
    popup = ~paste0(
      '<strong>Neighborhood:</strong> ', neighbourhood_group_cleansed, '<br>',
      '<strong>Actual Price:</strong> $', actual_price, '<br>',
      '<strong>Predicted Price:</strong> $', predicted_price, '<br>',
      '<strong>Error:</strong> $', round(predicted_price - actual_price, 2), '<br>',
      '<strong>Log. Error Size:</strong> ', round(error, 2)
    )
  ) %>%
  # Legends for both layers
  addLegend(
    "bottomright",
    pal = error_pal,
    values = map_data$error,
    title = "Individual Prediction Error"
  ) %>%
  addLegend(
    "topright",
    pal = neighborhood_pal,
    values = neighborhoods_sf$mean_error,
    title = "Avg. Neighborhood Error"
  )

city_map
These visualizations help us identify geographic patterns in our model's performance. Areas with consistently high errors might indicate neighborhoods with unique pricing dynamics that our model struggles to capture accurately. The log error here is the difference between the predicted and actual log price, so exponentiating it gives the ratio of predicted to actual price.
The map visualization helps us identify whether the model overestimates or underestimates property prices in different areas. Looking at the average neighborhood error, we can see which areas are generally overpriced or underpriced according to our model. Here we see that the model tends to overestimate rather than underestimate: on average per neighborhood, the predicted price is higher than the actual price. We will go deeper into this after analyzing the individual neighbourhoods, but especially for neighbourhoods like Les Corts or Sant Andreu the model predicts higher prices than those actually charged, suggesting market potential for price increases, whereas in Gracia the errors are centered close to zero. A small summary like the one sketched below makes this pattern easy to verify.
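As a quick check of the neighbourhood-level pattern described above, this minimal sketch summarizes the map_data frame built earlier, tabulating the average error and the share of overestimated listings per neighbourhood:

# Average prediction error per neighbourhood (positive = model predicts above actual)
map_data %>%
  group_by(neighbourhood_group_cleansed) %>%
  summarise(
    mean_error = mean(error, na.rm = TRUE),
    share_overestimated = mean(error > 0, na.rm = TRUE),
    n_listings = n()
  ) %>%
  arrange(desc(mean_error))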
However, instead of viewing model error as a flaw, we can use it as a strategic advantage to detect market inefficiencies. By analyzing these discrepancies, we can help Airbnb hosts, guests, and investors make smarter pricing and investment decisions.
7.2 Neighborhood Maps
In this part we extend our analysis into each neighborhood to understand prediction errors at a more granular level. We create individual maps for each neighborhood, highlighting prediction errors for individual properties. To do so more accurately, we recalculate the relative prediction errors within the neighborhood to better identify outliers and thus understand potential investment opportunities and errors in our model.
# List of unique neighborhoods
neighborhoods <- unique(map_data_sf$neighbourhood_group_cleansed)
# Function to generate an error map for a single neighborhood
create_neigh_map <- function(neigh) {
  # Subset data for the neighborhood
  neigh_data <- map_data %>% filter(neighbourhood_group_cleansed == neigh)
  neigh_sf <- st_as_sf(neigh_data, coords = c("longitude", "latitude"), crs = 4326)
  neigh_outline <- st_convex_hull(st_union(neigh_sf))

  # Calculate the center of the neighborhood for zooming
  center_lon <- mean(neigh_data$longitude)
  center_lat <- mean(neigh_data$latitude)

  # Symmetric color scale around zero, based on the max absolute error within the neighborhood
  error_range <- max(abs(range(neigh_data$error)))
  error_pal <- colorNumeric(
    palette = c('#d7191c', '#fdae61', 'white', '#abd9e9', '#2c7bb6'),
    domain = c(-error_range, 0, error_range)
  )

  n_map <- leaflet(options = leafletOptions(preferCanvas = TRUE)) %>%
    addTiles() %>%
    setView(lng = center_lon, lat = center_lat, zoom = 13.3) %>%
    # Neighborhood polygon outline
    addPolygons(
      data = neigh_outline,
      fillColor = NA,
      fillOpacity = 0,
      weight = 3,
      color = "black"
    ) %>%
    # Individual error markers
    addCircleMarkers(
      data = neigh_data,
      radius = ~rescale(abs(error), to = c(2, 15)),
      stroke = FALSE,
      fillColor = ~error_pal(error),
      fillOpacity = 1,
      popup = ~paste0(
        '<strong>Neighborhood:</strong> ', neighbourhood_group_cleansed, '<br>',
        '<strong>Actual Price:</strong> $', actual_price, '<br>',
        '<strong>Predicted Price:</strong> $', predicted_price, '<br>',
        '<strong>Error:</strong> ', round(predicted_price - actual_price, 2), '<br>',
        '<strong>Error Size:</strong> ', round(error, 2)
      )
    ) %>%
    # Legend for the point layer
    addLegend(
      "bottomright",
      pal = error_pal,
      values = neigh_data$error,
      title = "Individual Prediction Error"
    )

  n_map
}
The interpretation of the maps for each of the 10 districts below is as follows:
- The color gradient indicates the prediction errors (scaled separately for each map): Blue represents overestimated prices (predicted above actual), while Red represents underestimated prices (predicted below actual). Overnight prices estimated correctly are rendered in White, because we don't want to highlight them for now and only focus on the outliers.
- The circles' size corresponds to the prediction error's magnitude, highlighting specific locations where the model identified over- or underpriced properties.
- The black polygon outlines the neighborhood's boundaries, providing spatial context for the prediction errors.
This gives a good indication of where to get a great deal on an AirBnB and which listings are shamelessly overpriced.
Note: The maps are interactive, so feel free to zoom in and explore the individual properties in each neighborhood.
Deep Dive: Gracia
Prediction errors appear to be centered around zero, suggesting a more consistent pricing pattern. However, there are outliers at both ends, with AirBnBs priced well above and well below what the model expects.
create_neigh_map("gracia")
Deep Dive: Eixample
First off, Eixample has a huge number of AirBnBs, which becomes clear from the large number of white data points. There also don't seem to be many outliers. Maybe the model is more accurate here than in other neighbourhoods due to the larger available sample (despite not having trained separate models per barrio). Two or three top-end outliers remain that seem vastly overpriced (by 10-15x).
create_neigh_map("eixample")
Deep Dive: Ciutat Vella
Also a lot of AirBnBs, but with outliers on both ends of the spectrum.
create_neigh_map("ciutat vella")
Deep Dive: Sant Marti
Not as many AirBnBs (a density function would have been cool to calculate, but we’re running out of time…) but also BIG outliers (4x) on both ends.
create_neigh_map("sant marti")
Deep Dive: Sants Montjuic
Sants covers a large and diverse area from Poble-Sec to Sants and the Fira near Plaza Espana. Lots of interesting outliers to look at, mostly on the overpriced side, unfortunately.
create_neigh_map("sants-montjuic")
Deep Dive: Sant Gervasi
Surprisingly few AirBnBs in this area. Since it's a very expensive, residential area, it's notable that the model estimates prices so accurately here. There is only one big prediction error (a predicted 13k€ per night must be wrong…).
create_neigh_map("sarria-sant gervasi")
Deep Dive: Nou Barris
Nou Barris also still has very few AirBnBs, but the model seems to be able to predict the prices quite accurately. There are some underpriced AirBnBs, but also two big outliers. Overall very cheap though, which we will get to in our investment analysis.
create_neigh_map("nou barris")
Deep Dive: Horta Guinardo
Diverse area around Park Güell and the Hospital de Sant Pau. The model seems to be able to predict the prices quite accurately. There is one highly overpriced AirBnB, but overall the prices seem to be quite accurate.
create_neigh_map("horta-guinardo")
Deep Dive: Sant Andreu
Here the prediction errors are quite high, most notably underestimating some AirBnBs. This could be a good area to look for a good deal, together with Nou Barris. Stay tuned for the investment analysis.
create_neigh_map("sant andreu")
Deep Dive: Les Corts
In general Les Corts seems to be predicted quite accurately, but there are two notable outliers, underestimating the prices. Since, once again, the model seems to think that $13,000 is a good price per night, we need to look into this more deeply in future iterations. Especially since the AirBnBs seem to be located in the hospital garden… maybe a free knee surgery is included…
create_neigh_map("les corts")
7.3 Key Takeaways
Okay, so to recap: there are two main types of prediction errors, overestimation and underestimation. We will draw some conclusions and strategies for guests, hosts, and investors based on these errors.
Overestimated Prices (Blue Dots & Areas), where Predicted > Actual
Here the model thinks the nightly price should be higher than it actually is.
Implications:
- For Guests: Potential good deals! If actual prices consistently sit below the model's estimate in certain neighborhoods, these could be undervalued locations where travelers can find affordable stays.
- For Hosts: Hosts in these areas may have pricing power and could charge even more, especially if demand remains strong.
- For Investors: If properties are frequently undervalued, these areas could be attractive for investment, as they may still have room for price increases.
Underestimated Prices (Red Dots & Areas), where Predicted < Actual
Here the model thinks the nightly price should be lower than it actually is.
Implications:
- For Guests: These listings are overpriced, meaning travelers might be overpaying.
- For Hosts: If bookings are still coming in at these higher-than-expected prices, it suggests high demand and potential pricing power.
- For Investors: These areas may already be fully valued or even overpriced, so future price appreciation could be limited.
Strategic Takeaways
1. Optimizing Airbnb’s Pricing Algorithm
- If a neighborhood is frequently undervalued, Airbnb could suggest price increases to hosts.
- If a neighborhood is frequently overvalued, the model may need adjustments to better capture local pricing factors.
2. Identifying Market Opportunities
- Emerging Hotspots: Areas that are consistently undervalued may indicate neighborhoods on the rise, which can be marketed as hidden gems for travelers.
- Pricing Adjustments: Airbnb could educate travelers on where they might be overpaying and suggest alternative areas with better value.
3. Investment Strategies
- If a neighborhood is frequently undervalued, investors could buy properties before the market corrects itself.
- If a neighborhood is frequently overvalued, investors should be cautious of market hype and ensure rental demand justifies the pricing.
This is exactly what we will look at in the next part, where we compare different investment strategies to maximize returns when investing in potential AirBnB properties.
8. Investment Strategies for Real Estate Speculator
In this section, we analyze various investment strategies for a real estate speculator in Barcelona, leveraging data-driven insights. To test the performance of our model, we simulate different scenarios in which we are a real estate speculator with a budget of 3 million euros to acquire properties in Barcelona. Within this framework, we aim to make investment decisions in a data-driven manner, designing three distinct strategies based on the investor's profile and risk aversion.
- Strategy 1: Shortest Recovery Time Approach
- Strategy 2: Occupancy Rate Optimization
- Strategy 3: Hybrid Approach
Strategy 1: Shortest Recovery Time Approach
The first strategy focuses on recovering the invested capital as quickly as possible. To achieve this, the focus is placed on properties where the investment can be recouped in the least time. This strategy is ideal for investors looking to maximize returns in the shortest possible time frame.
Steps:
- Estimate Recovery Time: Calculate the recovery time for each neighborhood based on property cost, predicted nightly price, daily revenue, and occupancy rate.
- Sort Neighborhoods: Arrange neighborhoods by the shortest recovery time.
- Allocate Budget: Invest in properties in neighborhoods with the shortest recovery time until the budget is exhausted.
Output: The investment plan is concentrated in the neighborhoods with the quickest return, which may lead to higher market-specific risk due to low diversification, potentially exposing the investor to local market fluctuations. A worked example of the recovery-time arithmetic follows below.
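For intuition, the recovery-time calculation in the first step reduces to cost / (daily revenue × 365 × occupancy rate). A sketch with the Nou Barris figures, taken from the printed plan further below:

# Worked example: recovery time for nou barris
cost <- 201074                # avg. property price
daily_revenue <- 171.14 + 20  # predicted nightly price plus the 20-euro cleaning fee
occupancy <- 0.5
cost / (daily_revenue * 365 * occupancy) # ~5.8 years to recover the investment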
## Intuition: find the neighbourhood with the shortest recovery time per property,
## then buy properties in that neighbourhood until the budget is exhausted
source("create_sample.R")

budget <- 3000000 # Total budget: 3 million
remaining_budget <- budget
investment_plan <- data.frame()

# Step 1: Precompute estimated recovery time for each neighborhood
neighborhood_data <- data.frame()

for (nbh in avg_prices$neighbourhood) {
  # 1) Retrieve estimated cost of a single property in this neighborhood
  this_cost <- avg_prices %>% filter(neighbourhood == nbh) %>% pull(avg_price)

  # 2) Generate a sample property using create_sample()
  new_apt <- create_sample(
    df = airbnb_clean,
    df_prices = avg_prices,
    origin_sample = 1,
    neighbourhood_new = nbh
  )

  # 3) Predict the nightly price
  log_pred <- predict(airbnb_predictor, new_data = new_apt)$.pred
  predicted_nightly_price <- exp(log_pred)

  # 4) Compute revenue metrics
  daily_revenue <- predicted_nightly_price + 20 # Includes cleaning fee
  occ_rate <- avg_prices %>% filter(neighbourhood == nbh) %>% pull(pct_year_occupation)
  annual_revenue <- daily_revenue * 365 * occ_rate

  # 5) Calculate recovery time
  years_to_recover_one <- this_cost / annual_revenue

  # Store precomputed data
  neighborhood_data <- rbind(
    neighborhood_data,
    data.frame(
      neighbourhood = nbh,
      cost_of_one_property = round(this_cost),
      predicted_nightly = round(predicted_nightly_price, 2),
      daily_revenue = round(daily_revenue, 2),
      occupancy_rate = occ_rate,
      annual_revenue = round(annual_revenue),
      years_to_recover = round(years_to_recover_one, 1)
    )
  )
}
# Step 2: Sort neighborhoods by best recovery time
neighborhood_data <- neighborhood_data %>% arrange(years_to_recover)

# Step 3: Allocate budget across multiple neighborhoods
for (i in 1:nrow(neighborhood_data)) {
  row <- neighborhood_data[i, ]

  # Maximum number of properties we can buy in this neighborhood
  max_can_buy <- floor(remaining_budget / row$cost_of_one_property)

  if (max_can_buy > 0) {
    # Deduct from remaining budget
    remaining_budget <- remaining_budget - (max_can_buy * row$cost_of_one_property)

    # Store in investment plan
    investment_plan <- rbind(
      investment_plan,
      data.frame(
        neighbourhood = row$neighbourhood,
        cost_of_one_property = row$cost_of_one_property,
        num_properties_bought = max_can_buy,
        total_spent = max_can_buy * row$cost_of_one_property,
        predicted_nightly = row$predicted_nightly,
        daily_revenue = row$daily_revenue,
        occupancy_rate = row$occupancy_rate,
        annual_revenue = max_can_buy * row$annual_revenue,
        years_to_recover = row$years_to_recover
      )
    )
  }

  # Stop if budget is exhausted
  if (remaining_budget <= 0) {
    break
  }
}

# Print final investment plan - should match the initial intuition
t(investment_plan)
[,1]
neighbourhood "nou barris"
cost_of_one_property "201074"
num_properties_bought "14"
total_spent "2815036"
predicted_nightly "171.14"
daily_revenue "191.14"
occupancy_rate "0.5"
annual_revenue "488362"
years_to_recover "5.8"
This approach aggressively targets the quickest-return neighborhoods first, which may lead to a portfolio concentrated in specific areas of the city. The strategy assumes that past performance (occupancy rates and pricing) will continue into the future. With this approach, we would be able to buy 14 properties in Nou Barris with a short amortization period of only 5.8 years. A quick arithmetic check below shows why the plan contains only a single neighbourhood.
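The leftover budget after buying 14 properties cannot cover a property in any other neighbourhood, which is why the allocation loop stops there (a sketch using the avg_prices table defined at the start):

# Leftover budget vs. the next-cheapest neighbourhood
3000000 - 14 * 201074 # 184,964 euros left over
min(avg_prices$avg_price[avg_prices$neighbourhood != "nou barris"]) # 288,534 (sant andreu)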
Strategy 2: Occupancy Rate Optimization
This strategy prioritizes neighborhoods with the highest occupancy rates. The goal is to mitigate risk by targeting high-demand neighborhoods where properties are booked more consistently. Although the recovery time might be longer, stable and frequent bookings provide a reliable stream of income.
Steps:
- Identify High Occupancy Neighborhoods: Select neighborhoods with the highest occupancy rates.
- Allocate Budget: Invest in properties in the top occupancy neighborhood until the budget is exhausted.
Output: A more conservative investment plan, focusing on properties with consistent bookings and thereby reducing the risk of extended vacancy periods, even if individual rental prices might be lower.
# Set budget
budget <- 3000000
remaining_budget <- budget
investment_plan_occ <- data.frame()

top_occ_neighborhoods <- neighborhood_data %>%
  arrange(desc(occupancy_rate)) %>%
  head(3) # Keep the three highest-occupancy neighborhoods (only the top one is used below)

# Process investment for the single best neighborhood
row <- top_occ_neighborhoods[1, ]

max_can_buy <- floor(remaining_budget / row$cost_of_one_property)

if (max_can_buy > 0) {
  remaining_budget <- remaining_budget - (max_can_buy * row$cost_of_one_property)

  investment_plan_occ <- rbind(
    investment_plan_occ,
    data.frame(
      neighbourhood = row$neighbourhood,
      cost_of_one_property = row$cost_of_one_property,
      num_properties_bought = max_can_buy,
      total_spent = max_can_buy * row$cost_of_one_property,
      occupancy_rate = row$occupancy_rate
    )
  )
}

t(investment_plan_occ)
[,1]
neighbourhood "ciutat vella"
cost_of_one_property "392645"
num_properties_bought "7"
total_spent "2748515"
occupancy_rate "0.8"
This approach creates a more conservative investment portfolio by focusing on properties that are more likely to be consistently rented throughout the year. While the recovery time is longer, the risk of extended vacancy periods is mitigated, making cash flow more predictable. The trade-off is easy to quantify, as sketched below.
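Using figures from the strategy-1 table, the two headline neighbourhoods compare as follows (a rough sketch; the values are taken from the printed outputs above):

# Recovery time: high-occupancy ciutat vella vs. cheap nou barris
392645 / (148.07 * 365 * 0.8) # ~9.1 years (pricier stock, 80% occupancy)
201074 / (191.14 * 365 * 0.5) # ~5.8 years (cheaper stock, 50% occupancy)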
Strategy 3: Hybrid Approach
This strategy combines occupancy rates with a profit ratio to create a balanced investment portfolio.
Objective: Balance risk reduction (high occupancy) with return optimization (profit ratio) using a weighted scoring system.
Steps:
- Combine Metrics: Use a weighted scoring system that values occupancy rate (60%) and profit ratio (40%); a worked example follows after this list.
- Allocate Budget: Invest in properties across different neighborhoods based on the combined score.
Output: Diversified portfolio with better risk-adjusted returns, likely to perform well across different market conditions.
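To make the scoring concrete, here is the hybrid score recomputed by hand for ciutat vella (a sketch; the inputs come from the strategy-1 table, so the result matches the hybrid_score in the output below up to rounding):

# Hybrid score for ciutat vella: 0.6 * occupancy + 0.4 * profit ratio
occ <- 0.8
annual_rev <- 148.07 * 365 * 0.8 # ~43,236 euros per property per year
cost <- 392645
profit_ratio <- annual_rev / cost # ~0.110
0.6 * occ + 0.4 * profit_ratio    # ~0.524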
So, let’s see what the outcomes are:
# Step 1: Calculate hybrid score & rank neighborhoods
hybrid_neighborhood_data <- neighborhood_data %>%
  mutate(
    profit_ratio = annual_revenue / cost_of_one_property,
    hybrid_score = 0.6 * occupancy_rate + 0.4 * profit_ratio
  ) %>%
  arrange(desc(hybrid_score))

# Step 2: Allocate budget based on hybrid score
investment_plan_hybrid <- data.frame()
remaining_budget <- budget

for (i in 1:nrow(hybrid_neighborhood_data)) {
  row <- hybrid_neighborhood_data[i, ]

  max_can_buy <- floor(remaining_budget / row$cost_of_one_property)

  if (max_can_buy > 0) {
    remaining_budget <- remaining_budget - (max_can_buy * row$cost_of_one_property)

    investment_plan_hybrid <- rbind(
      investment_plan_hybrid,
      data.frame(
        neighbourhood = row$neighbourhood,
        cost_of_one_property = row$cost_of_one_property,
        num_properties_bought = max_can_buy,
        total_spent = max_can_buy * row$cost_of_one_property,
        predicted_nightly = row$predicted_nightly,
        daily_revenue = row$daily_revenue,
        occupancy_rate = row$occupancy_rate,
        annual_revenue = max_can_buy * row$annual_revenue,
        years_to_recover = row$years_to_recover,
        hybrid_score = row$hybrid_score
      )
    )
  }

  if (remaining_budget <= 0) break
}
# Step 3: Compute weighted combined metrics
total_properties <- sum(investment_plan_hybrid$num_properties_bought)
if (total_properties > 0) {
  combined_row <- data.frame(
    neighbourhood = "TOTAL_COMBINED",
    cost_of_one_property = NA,
    num_properties_bought = total_properties,
    total_spent = sum(investment_plan_hybrid$total_spent),
    predicted_nightly = weighted.mean(investment_plan_hybrid$predicted_nightly, investment_plan_hybrid$num_properties_bought),
    daily_revenue = weighted.mean(investment_plan_hybrid$daily_revenue, investment_plan_hybrid$num_properties_bought),
    occupancy_rate = weighted.mean(investment_plan_hybrid$occupancy_rate, investment_plan_hybrid$num_properties_bought),
    annual_revenue = sum(investment_plan_hybrid$annual_revenue),
    years_to_recover = weighted.mean(investment_plan_hybrid$years_to_recover, investment_plan_hybrid$num_properties_bought),
    hybrid_score = NA # Not applicable for the summary row
  )

  # Append the summary row
  investment_plan_hybrid <- rbind(investment_plan_hybrid, combined_row)
}

# Step 4: Print final investment plan
t(investment_plan_hybrid)
[,1] [,2] [,3]
neighbourhood "ciutat vella" "nou barris" "TOTAL_COMBINED"
cost_of_one_property "392645" "201074" NA
num_properties_bought "7" "1" "8"
total_spent "2748515" " 201074" "2949589"
predicted_nightly "128.0700" "171.1400" "133.4538"
daily_revenue "148.0700" "191.1400" "153.4538"
occupancy_rate "0.8000" "0.5000" "0.7625"
annual_revenue "302645" " 34883" "337528"
years_to_recover "9.1000" "5.8000" "8.6875"
hybrid_score "0.5240449" "0.3693934" NA
The hybrid approach successfully balances risk and return by combining high-occupancy properties from Ciutat Vella with a property offering a faster recovery time from Nou Barris. The combined strategy shows a strong weighted occupancy rate (0.76) with a combined recovery time of 8.7 years.
Overall Conclusions and Key Insights
Each strategy represents a different investment philosophy and risk tolerance:
Shortest Recovery Time Strategy: Focuses on rapid returns, potentially concentrating investments in fewer neighborhoods. This approach increases market-specific risk but offers quicker recovery. We see Nou Barris as the clear winner here, which was partly visible already in the model's tendency to detect undervalued AirBnBs in this area.
Occupancy Rate Strategy: Prioritizes stable cash flow by targeting high-demand areas, reducing vacancy risks. This strategy is especially advantageous in volatile or recession-prone markets.
Hybrid Strategy: Strikes a balance between return optimization and risk mitigation, creating a diversified portfolio. While it may not maximize returns or minimize risk to the extreme, it provides resilience across varying market conditions.