28
loading...
This website collects cookies to deliver better user experience
training.csv
.library(tidyverse)
train_raw <- read_csv("train.csv")
price
variable is skewed a lot, as prices often are!train_raw %>%
ggplot(aes(price, fill = neighbourhood_group)) +
geom_histogram(position = "identity", alpha = 0.5, bins = 20) +
scale_x_log10(labels = scales::dollar_format()) +
labs(fill = NULL, x = "price per night")
train_raw %>%
ggplot(aes(longitude, latitude, color = log(price))) +
geom_point(alpha = 0.2) +
scale_color_viridis_c()
train_raw %>%
ggplot(aes(longitude, latitude, z = log(price))) +
stat_summary_hex(alpha = 0.8, bins = 70) +
scale_fill_viridis_c() +
labs(fill = "Mean price (log)")
library(tidymodels)
set.seed(123)
nyc_split <- train_raw %>%
mutate(price = log(price + 1)) %>%
initial_split(strata = price)
nyc_train <- training(nyc_split)
nyc_test <- testing(nyc_split)
set.seed(234)
nyc_folds <- vfold_cv(nyc_train, v = 5, strata = price)
nyc_folds
## # 5-fold cross-validation using stratification
## # A tibble: 5 x 2
## splits id
## <list> <chr>
## 1 <split [20533/5135]> Fold1
## 2 <split [20533/5135]> Fold2
## 3 <split [20534/5134]> Fold3
## 4 <split [20535/5133]> Fold4
## 5 <split [20537/5131]> Fold5
neighborhood
, and create features for machine learning from the text in the name
variable. Read more about creating ML features from natural language in my book with my coauthor Emil Hvitfeldt. For this demonstration, let’s start out with only the top 30 tokens and see how well we do.library(textrecipes)
nyc_rec <-
recipe(price ~ latitude + longitude + neighbourhood + room_type +
minimum_nights + number_of_reviews + availability_365 + name,
data = nyc_train
) %>%
step_novel(neighbourhood) %>%
step_other(neighbourhood, threshold = 0.01) %>%
step_tokenize(name) %>%
step_stopwords(name) %>%
step_tokenfilter(name, max_tokens = 30) %>%
step_tf(name)
nyc_rec
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 8
##
## Operations:
##
## Novel factor level assignment for neighbourhood
## Collapsing factor levels for neighbourhood
## Tokenization for name
## Stop word removal for name
## Text filtering for name
## Term frequency with name
library(baguette)
bag_spec <-
bag_tree(min_n = 10) %>%
set_engine("rpart", times = 25) %>%
set_mode("regression")
bag_wf <-
workflow() %>%
add_recipe(nyc_rec) %>%
add_model(bag_spec)
set.seed(123)
bag_fit <- fit(bag_wf, data = nyc_train)
bag_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: bag_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
##
## • step_novel()
## • step_other()
## • step_tokenize()
## • step_stopwords()
## • step_tokenfilter()
## • step_tf()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Bagged CART (regression with 25 members)
##
## Variable importance scores include:
##
## # A tibble: 37 x 4
## term value std.error used
## <chr> <dbl> <dbl> <int>
## 1 room_type 4800. 15.5 25
## 2 longitude 3084. 13.0 25
## 3 neighbourhood 2619. 13.0 25
## 4 tf_name_room 2044. 9.17 25
## 5 latitude 1822. 14.8 25
## 6 minimum_nights 1642. 9.53 25
## 7 availability_365 1114. 10.6 25
## 8 tf_name_private 996. 7.74 25
## 9 number_of_reviews 914. 9.33 25
## 10 tf_name_studio 189. 2.99 25
## # … with 27 more rows
room_type
and the geographical information are very important for this model.doParallel::registerDoParallel()
set.seed(123)
bag_rs <- fit_resamples(bag_wf, nyc_folds)
collect_metrics(bag_rs)
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 0.437 5 0.00237 Preprocessor1_Model1
## 2 rsq standard 0.603 5 0.00260 Preprocessor1_Model1
test_rs <- augment(bag_fit, nyc_test)
test_rs %>%
ggplot(aes(exp(price), exp(.pred), color = neighbourhood_group)) +
geom_abline(slope = 1, lty = 2, color = "gray50", alpha = 0.5) +
geom_point(alpha = 0.2) +
scale_x_log10(labels = scales::dollar_format()) +
scale_y_log10(labels = scales::dollar_format()) +
labs(color = NULL, x = "True price", y = "Predicted price")
library(rlang)
rmsle_vec <- function(truth, estimate, na_rm = TRUE, ...) {
rmsle_impl <- function(truth, estimate) {
sqrt(mean((log(truth + 1) - log(estimate + 1))^2))
}
metric_vec_template(
metric_impl = rmsle_impl,
truth = truth,
estimate = estimate,
na_rm = na_rm,
cls = "numeric",
...
)
}
rmsle <- function(data, ...) {
UseMethod("rmsle")
}
rmsle <- new_numeric_metric(rmsle, direction = "minimize")
rmsle.data.frame <- function(data, truth, estimate, na_rm = TRUE, ...) {
metric_summarizer(
metric_nm = "rmsle",
metric_fn = rmsle_vec,
data = data,
truth = !!enquo(truth),
estimate = !!enquo(estimate),
na_rm = na_rm,
...
)
}
rmse()
with the results on the log scale and rmsle()
on the results back on the dollar scale.test_rs %>%
rmse(price, .pred)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.435
test_rs %>%
mutate(across(c(price, .pred), exp)) %>%
rmsle(price, .pred)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmsle standard 0.431