#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 1. Setup ---------------------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# NOTE: See DESCRIPTION for library dependencies and R/setup.R for
# variables used in each pipeline stage
# Start the stage timer and clear logs from the prior stage
tictoc::tic.clearlog()
tictoc::tic("Train")
# Load libraries, helpers, and recipes from files
purrr::walk(list.files("R/", "\\.R$", full.names = TRUE), source)
# Print a run note and type for context at the top of CI logs
message("Run note: ", run_note)
message("Run type: ", run_type)
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 2. Prepare Data --------------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
message("Preparing model training data")
# Load the full set of training data, then arrange by sale date in order to
# facilitate out-of-time sampling/validation
# NOTE: It is critical to trim "multicard" sales when training. Multicard means
# there are multiple buildings on a PIN. Since these sales include multiple
# buildings, their prices are typically higher than a "normal" single-building
# sale and must be removed
training_data_full <- read_parquet(paths$input$training$local) %>%
filter(!ind_pin_is_multicard, !sv_is_outlier) %>%
arrange(meta_sale_date)
# Create a train/test split by time, with the most recent observations in the
# test set
# We want our best model(s) to be predictive of the future, since properties are
# assessed on the basis of past sales
split_data <- initial_time_split(
data = training_data_full,
prop = params$cv$split_prop
)
test <- testing(split_data)
train <- training(split_data)
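# Illustrative only (not run): a minimal sanity check of the out-of-time split.
# Since the data is arranged by sale date before splitting, the training set
# should end no later than the test set begins
if (FALSE) {
  range(train$meta_sale_date)
  range(test$meta_sale_date)
  # Should be TRUE: the latest training sale is on or before the earliest test sale
  max(train$meta_sale_date) <= min(test$meta_sale_date)
}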
# Create a recipe for the training data which removes non-predictor columns and
# preps categorical data. See R/recipes.R for details
train_recipe <- model_main_recipe(
data = training_data_full,
pred_vars = params$model$predictor$all,
cat_vars = params$model$predictor$categorical,
id_vars = params$model$predictor$id
)
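# Illustrative only (not run): a quick way to preview what the recipe produces.
# prep() estimates the preprocessing steps and bake() applies them to new data,
# which helps confirm which columns survive and how categoricals are encoded
if (FALSE) {
  train_recipe %>%
    recipes::prep(training = train) %>%
    recipes::bake(new_data = head(train))
}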
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 3. Linear Model --------------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
message("Creating and fitting linear baseline model")
# Create a linear model recipe with additional imputation, transformations,
# and feature interactions
lin_recipe <- model_lin_recipe(
data = training_data_full %>%
mutate(meta_sale_price = log(meta_sale_price)),
pred_vars = params$model$predictor$all,
cat_vars = params$model$predictor$categorical,
id_vars = params$model$predictor$id
)
# Create a linear model specification and workflow
lin_model <- parsnip::linear_reg() %>%
set_mode("regression") %>%
set_engine("lm")
lin_wflow <- workflow() %>%
add_model(lin_model) %>%
add_recipe(
recipe = lin_recipe,
blueprint = hardhat::default_recipe_blueprint(allow_novel_levels = TRUE)
)
# Fit the linear model on the training data
lin_wflow_final_fit <- lin_wflow %>%
fit(data = train %>% mutate(meta_sale_price = log(meta_sale_price)))
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 4. LightGBM Model ------------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
message("Initializing LightGBM model")
# This is the main model used to value 200-class residential property. It uses
# lightgbm as its backend, a boosted tree framework similar to xgboost or
# catboost, but with better performance and faster training time for our use case
# See https://lightgbm.readthedocs.io/ for more information
## 4.1. Model Initialization ---------------------------------------------------
# Initialize a lightgbm model specification. Most hyperparameters are passed to
# lightgbm as "engine arguments" i.e. things specific to lightgbm, as opposed to
# model arguments, which are provided by parsnip's boost_tree()
lgbm_model <- parsnip::boost_tree(
stop_iter = params$model$parameter$stop_iter
) %>%
set_mode("regression") %>%
set_engine(
engine = params$model$engine,
# Parameters required to make the model deterministic i.e. output the same
# predictions if the same hyperparameters are used
seed = params$model$seed,
deterministic = params$model$deterministic,
force_row_wise = params$model$force_row_wise,
### 4.1.1. Manual Parameters -----------------------------------------------
# These are static lightgbm-specific engine parameters passed to lgb.train()
# See lightsnip::train_lightgbm for details
num_threads = num_threads,
verbose = params$model$verbose,
# Set the objective function. This is what lightgbm will try to minimize
objective = params$model$objective,
# Names of integer-encoded categorical columns. This is CRITICAL; otherwise
# lightgbm will treat these columns as numeric
categorical_feature = params$model$predictor$categorical,
# Enable early stopping using a proportion of each training sample as a
# validation set. If lgb.train goes `stop_iter` rounds without improvement
# in the chosen metric, then it will end training early. Saves an immense
# amount of time during CV. WARNING: See GitLab issue #82 for more info
validation = params$model$parameter$validation_prop,
sample_type = params$model$parameter$validation_type,
metric = params$model$parameter$validation_metric,
# Lightsnip custom parameter. Links the value of max_depth to num_leaves
# using floor(log2(num_leaves)) + add_to_linked_depth. Useful since
# otherwise Bayesian opt spends time exploring irrelevant parameter space
link_max_depth = params$model$parameter$link_max_depth,
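# For example (illustrative values only): with num_leaves = 200 and
# add_to_linked_depth = 2, max_depth = floor(log2(200)) + 2 = 7 + 2 = 9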
# Always initialize save_tree_error to FALSE until we're ready to train
# the final model, since it's incompatible with CV
save_tree_error = FALSE,
### 4.1.2. Tuned Parameters ------------------------------------------------
# Typically set manually along with the number of iterations (trees)
learning_rate = tune(),
# Max number of bins that feature values will be bucketed in
max_bin = tune(),
# Main parameters determining model complexity
num_leaves = tune(),
add_to_linked_depth = tune(),
feature_fraction = tune(),
min_gain_to_split = tune(),
min_data_in_leaf = tune(),
# Categorical-specific parameters
max_cat_threshold = tune(),
min_data_per_group = tune(),
cat_smooth = tune(),
cat_l2 = tune(),
# Regularization parameters
lambda_l1 = tune(),
lambda_l2 = tune()
)
# The num_iterations (trees) parameter has three possible values:
# 1. If CV AND early stopping are enabled, set a static parameter using the
# upper bound of the CV search range as the maximum possible number of trees
# 2. If CV is enabled but early stopping is disabled, set the search range to
# the standard CV range
# 3. If CV is disabled, use the default parameter value
if (cv_enable && early_stopping_enable) {
lgbm_model <- lgbm_model %>%
set_args(trees = params$model$hyperparameter$range$num_iterations[2])
} else if (cv_enable && !early_stopping_enable) {
lgbm_model <- lgbm_model %>%
set_args(trees = tune())
} else {
lgbm_model <- lgbm_model %>%
set_args(trees = params$model$hyperparameter$default$num_iterations)
}
# Initialize lightgbm workflow, which contains both the model spec AND the
# pre-processing steps/recipe needed to prepare the raw data
lgbm_wflow <- workflow() %>%
add_model(lgbm_model) %>%
add_recipe(
recipe = train_recipe,
blueprint = hardhat::default_recipe_blueprint(allow_novel_levels = TRUE)
)
## 4.2. Cross-Validation -------------------------------------------------------
# Begin CV tuning if enabled. We use Bayesian tuning since grid and random
# search take a very long time to produce good results due to the high number
# of hyperparameters
if (cv_enable) {
message("Starting cross-validation")
# Create resamples / folds for cross-validation
if (params$model$parameter$validation_type == "random") {
train_folds <- vfold_cv(data = train, v = params$cv$num_folds)
} else if (params$model$parameter$validation_type == "recent") {
# CRITICAL NOTE: When using rolling-origin CV with train_includes_val = TRUE,
# the validation set IS INCLUDED IN THE TRAINING DATA for each fold.
# This is a hack to pass a validation set to LightGBM for early stopping
# (our R wrapper package lightsnip cuts out the last X% of each training
# set to use as a validation set). See GitLab issue #82 for more info
train_folds <- create_rolling_origin_splits(
data = train,
v = params$cv$num_folds,
overlap_months = params$cv$fold_overlap,
date_col = meta_sale_date,
val_prop = params$model$parameter$validation_prop,
train_includes_val = params$model$parameter$validation_prop > 0,
cumulative = FALSE
)
}
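# Illustrative only (not run): peek at the fold boundaries, assuming the folds
# behave like standard rsample splits. Each fold's analysis set should cover a
# progressively later window of sale dates (and, for rolling-origin CV, include
# the trailing validation slice described above)
if (FALSE) {
  purrr::map(
    train_folds$splits,
    ~ range(rsample::analysis(.x)$meta_sale_date)
  )
}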
# Create the parameter search space for hyperparameter optimization
# Parameter boundaries are taken from the lightgbm docs and hand-tuned
# See: https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html
lgbm_range <- params$model$hyperparameter$range
lgbm_params <- lgbm_wflow %>%
hardhat::extract_parameter_set_dials() %>%
update(
# nolint start
learning_rate = lightsnip::learning_rate(lgbm_range$learning_rate),
max_bin = lightsnip::max_bin(lgbm_range$max_bin),
num_leaves = lightsnip::num_leaves(lgbm_range$num_leaves),
add_to_linked_depth = lightsnip::add_to_linked_depth(lgbm_range$add_to_linked_depth),
feature_fraction = lightsnip::feature_fraction(lgbm_range$feature_fraction),
min_gain_to_split = lightsnip::min_gain_to_split(lgbm_range$min_gain_to_split),
min_data_in_leaf = lightsnip::min_data_in_leaf(lgbm_range$min_data_in_leaf),
max_cat_threshold = lightsnip::max_cat_threshold(lgbm_range$max_cat_threshold),
min_data_per_group = lightsnip::min_data_per_group(lgbm_range$min_data_per_group),
cat_smooth = lightsnip::cat_smooth(lgbm_range$cat_smooth),
cat_l2 = lightsnip::cat_l2(lgbm_range$cat_l2),
lambda_l1 = lightsnip::lambda_l1(lgbm_range$lambda_l1),
lambda_l2 = lightsnip::lambda_l2(lgbm_range$lambda_l2)
# nolint end
)
# Only update the tuning param with the CV range if early stopping is disabled
if (!early_stopping_enable) {
lgbm_params <- lgbm_params %>%
update(trees = dials::trees(lgbm_range$num_iterations))
}
# Use Bayesian tuning to find best performing hyperparameters. This part takes
# quite a long time, depending on the compute resources available
lgbm_search <- tune_bayes(
object = lgbm_wflow,
resamples = train_folds,
initial = params$cv$initial_set,
iter = params$cv$max_iterations,
param_info = lgbm_params,
metrics = metric_set(rmse, mape, mae),
control = control_bayes(
verbose = TRUE,
verbose_iter = TRUE,
uncertain = params$cv$uncertain,
no_improve = params$cv$no_improve,
extract = extract_num_iterations,
seed = params$model$seed
)
)
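# Illustrative only (not run): quick ways to inspect the tuning results before
# they are saved, assuming the standard tune helpers
if (FALSE) {
  tune::show_best(lgbm_search, metric = params$cv$best_metric, n = 5)
  tune::collect_metrics(lgbm_search)
}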
# Save tuning results to file. This is a data.frame where each row is one
# CV iteration
lgbm_search %>%
lightsnip::axe_tune_data() %>%
arrow::write_parquet(paths$output$parameter_raw$local)
# Save the parameter ranges searched while tuning
lgbm_params %>%
mutate(range = purrr::map(object, dials::range_get)) %>%
tidyr::unnest_wider(range) %>%
select(
parameter_name = name, parameter_type = component_id,
lower, upper
) %>%
arrow::write_parquet(paths$output$parameter_range$local)
# Choose the best model (whichever set of hyperparameters minimizes the chosen
# performance metric, averaged across CV folds)
lgbm_final_params <- tibble(
engine = params$model$engine,
seed = params$model$seed,
objective = params$model$objective
) %>%
bind_cols(
as_tibble(params$model$parameter) %>%
select(-any_of("num_iterations"))
) %>%
bind_cols(
select_iterations(lgbm_search, metric = params$cv$best_metric)
) %>%
bind_cols(
select_best(lgbm_search, metric = params$cv$best_metric) %>%
select(-any_of("trees"))
) %>%
select(configuration = .config, everything()) %>%
mutate(across(any_of("num_iterations"), as.integer)) %>%
arrow::write_parquet(paths$output$parameter_final$local)
} else {
# If CV is disabled, just use the default set of parameters specified in
# params.yaml, keeping only the ones used in the model specification
lgbm_missing_params <- names(params$model$hyperparameter$default)
lgbm_missing_params <- lgbm_missing_params[
!lgbm_missing_params %in%
c(hardhat::extract_parameter_set_dials(lgbm_wflow)$name, "num_iterations")
]
lgbm_final_params <- tibble(
configuration = "Default",
engine = params$model$engine,
seed = params$model$seed,
objective = params$model$objective
) %>%
bind_cols(as_tibble(params$model$parameter)) %>%
bind_cols(as_tibble(params$model$hyperparameter$default)) %>%
select(-all_of(lgbm_missing_params)) %>%
mutate(across(any_of("num_iterations"), as.integer)) %>%
arrow::write_parquet(paths$output$parameter_final$local)
# If CV is disabled, we still need to write empty stub files for any outputs
# created by CV so DVC has something to hash/look for
arrow::write_parquet(data.frame(), paths$output$parameter_raw$local)
arrow::write_parquet(data.frame(), paths$output$parameter_range$local)
}
## 4.3. Fit Models -------------------------------------------------------------
# Finalize the model specification by disabling early stopping and instead
# fixing the number of trees to the maximum number of iterations reached during
# the best cross-validation round, OR the default `num_iterations` if CV was not
# performed. Also enable comps if they're configured to run, since they're
# incompatible with CV
lgbm_model_final <- lgbm_model %>%
set_args(
stop_iter = NULL,
validation = 0,
trees = lgbm_final_params$num_iterations,
save_tree_error = comp_enable
)
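# Illustrative only (not run): parsnip::translate() can be used to preview the
# engine-level fit call the finalized spec will generate (assuming the lightgbm
# engine is registered by lightsnip)
if (FALSE) {
  parsnip::translate(lgbm_model_final)
}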
# Fit the final model using the training data and our final hyperparameters
# This model is used to measure performance on the test set
message("Fitting final model on training data")
lgbm_wflow_final_fit <- lgbm_wflow %>%
update_model(lgbm_model_final) %>%
finalize_workflow(lgbm_final_params) %>%
fit(data = train)
# Fit the final model using the full data (including the test set) and our final
# hyperparameters. This model is used for actually assessing all properties
message("Fitting final model on full data")
lgbm_wflow_final_full_fit <- lgbm_wflow %>%
update_model(lgbm_model_final) %>%
finalize_workflow(lgbm_final_params) %>%
fit(data = training_data_full)
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 5. Finalize Models -----------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
message("Finalizing and saving trained model")
# Get predictions on the test set using the training data model. These
# predictions are used to evaluate model performance on the unseen test set.
# Keep only the variables necessary for evaluation
test %>%
mutate(
pred_card_initial_fmv = predict(lgbm_wflow_final_fit, test)$.pred,
pred_card_initial_fmv_lin = exp(predict(
lin_wflow_final_fit,
test %>% mutate(meta_sale_price = log(meta_sale_price))
)$.pred)
) %>%
select(
meta_year, meta_pin, meta_class, meta_card_num, meta_triad_code,
all_of(params$ratio_study$geographies), char_bldg_sf,
all_of(c(
"prior_far_tot" = params$ratio_study$far_column,
"prior_near_tot" = params$ratio_study$near_column
)),
pred_card_initial_fmv, pred_card_initial_fmv_lin,
meta_sale_price, meta_sale_date, meta_sale_document_num
) %>%
# Prior year values are assessed values (AV), not fair market values (FMV).
# Multiply by 10 to convert to FMV for residential property
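# e.g. (illustrative numbers) a prior AV of $30,000 becomes an FMV of $300,000
# under the implied 10% residential level of assessment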
mutate(
prior_far_tot = prior_far_tot * 10,
prior_near_tot = prior_near_tot * 10
) %>%
as_tibble() %>%
write_parquet(paths$output$test_card$local)
# Save the finalized model object to file so it can be used elsewhere. Note the
# lgbm_save() function, which uses lgb.save() rather than saveRDS(), since
# lightgbm is picky about how its model objects are stored on disk
lgbm_wflow_final_full_fit %>%
workflows::extract_fit_parsnip() %>%
lightsnip::lgbm_save(paths$output$workflow_fit$local)
# Save the finalized recipe object to file so it can be used to preprocess
# new data. This is critical since it saves the factor levels used to integer-
# encode any categorical columns
lgbm_wflow_final_full_fit %>%
workflows::extract_recipe() %>%
lightsnip::axe_recipe() %>%
saveRDS(paths$output$workflow_recipe$local)
# End the stage timer and write the time elapsed to a temporary file
tictoc::toc(log = TRUE)
bind_rows(tictoc::tic.log(format = FALSE)) %>%
arrow::write_parquet(gsub("//*", "/", file.path(
paths$intermediate$timing$local,
"model_timing_train.parquet"
)))