diff --git a/dvc.lock b/dvc.lock index 1cbfd64..3da0982 100644 --- a/dvc.lock +++ b/dvc.lock @@ -26,35 +26,36 @@ stages: outs: - path: input/assessment_data.parquet hash: md5 - md5: 605ee612ff45dca2edf5c508993a7f56 - size: 69522635 + md5: 4e1ac463b4d74fb9b238fa5e2c19210a + size: 80070368 - path: input/char_data.parquet hash: md5 - md5: ed7b8f4ed02eb491d0450920874a66c3 - size: 131476800 + md5: ea34f077e5a26d8e23174d3f9743ec9c + size: 149213288 - path: input/condo_strata_data.parquet hash: md5 - md5: 0a7462f0afccb09bdd94c58148a3ca8d - size: 40842 + md5: 413f828ff0d1a2ca4d34ae0c6ca33ca8 + size: 40859 - path: input/land_nbhd_rate_data.parquet hash: md5 - md5: e508daf5790982c303d6503fe1cb8e2b + md5: 6c1baaf2acbcba9869025bb336f4ad25 size: 4413 - path: input/training_data.parquet hash: md5 - md5: 51090aa4f5b5311b1441e62b81fd3827 - size: 68987740 + md5: 2fe90bea662f5624408f5927374abaeb + size: 74835195 train: cmd: Rscript pipeline/01-train.R deps: - path: input/training_data.parquet hash: md5 - md5: 97b6ced3adb788e20fb2fc3758cd38a2 - size: 53281136 + md5: 2fe90bea662f5624408f5927374abaeb + size: 74835195 params: params.yaml: cv: split_prop: 0.9 + num_folds: 10 fold_overlap: 9 initial_set: 20 max_iterations: 50 @@ -64,21 +65,21 @@ stages: model.engine: lightgbm model.hyperparameter: default: - num_iterations: 2500 - learning_rate: 0.015 - max_bin: 512 - num_leaves: 159 - add_to_linked_depth: 1 - feature_fraction: 0.688 - min_gain_to_split: 5.58 + num_iterations: 2275 + learning_rate: 0.011 + max_bin: 225 + num_leaves: 200 + add_to_linked_depth: 2 + feature_fraction: 0.661 + min_gain_to_split: 1.58 min_data_in_leaf: 44 - max_cat_threshold: 228 - min_data_per_group: 160 - cat_smooth: 54.52 - cat_l2: 0.11 - lambda_l1: 0.016 - lambda_l2: 2.413 - neighbors: 5 + max_cat_threshold: 87 + min_data_per_group: 200 + cat_smooth: 140.85 + cat_l2: 0.017 + lambda_l1: 0.697 + lambda_l2: 0.002 + neighbors: 15 range: num_iterations: - 100 @@ -143,12 +144,14 @@ stages: - 
char_building_non_units - char_bldg_is_mixed_use - char_building_sf + - char_unit_sf + - char_bedrooms + - char_half_baths + - char_full_baths - loc_longitude - loc_latitude - - loc_env_flood_fema_sfha + - loc_census_tract_geoid - loc_env_flood_fs_factor - - loc_env_flood_fs_risk_direction - - loc_env_airport_noise_dnl - loc_school_elementary_district_geoid - loc_school_secondary_district_geoid - loc_access_cmap_walk_nta_score @@ -158,8 +161,6 @@ stages: - prox_num_bus_stop_in_half_mile - prox_num_foreclosure_per_1000_pin_past_5_years - prox_num_school_in_half_mile - - prox_num_school_with_rating_in_half_mile - - prox_avg_school_rating_in_half_mile - prox_airport_dnl_total - prox_nearest_bike_trail_dist_ft - prox_nearest_cemetery_dist_ft @@ -173,12 +174,13 @@ stages: - prox_nearest_park_dist_ft - prox_nearest_railroad_dist_ft - prox_nearest_secondary_road_dist_ft + - prox_nearest_university_dist_ft + - prox_nearest_vacant_land_dist_ft - prox_nearest_water_dist_ft - prox_nearest_golf_course_dist_ft - acs5_percent_age_children - acs5_percent_age_senior - acs5_median_age_total - - acs5_percent_mobility_no_move - acs5_percent_mobility_moved_from_other_state - acs5_percent_household_family_married - acs5_percent_household_nonfamily_alone @@ -196,8 +198,9 @@ stages: - acs5_percent_household_total_occupied_w_sel_cond - acs5_percent_mobility_moved_in_county - other_tax_bill_rate - - other_school_district_elementary_avg_rating - - other_school_district_secondary_avg_rating + - ccao_is_active_exe_homeowner + - ccao_is_corner_lot + - ccao_n_years_exe_homeowner - time_sale_year - time_sale_day - time_sale_quarter_of_year @@ -211,6 +214,7 @@ stages: categorical: - meta_township_code - meta_nbhd_code + - loc_census_tract_geoid - loc_tax_municipality_name - loc_school_elementary_district_geoid - loc_school_secondary_district_geoid @@ -260,59 +264,59 @@ stages: outs: - path: output/intermediate/timing/model_timing_train.parquet hash: md5 - md5: 2a1bd76cefa0e890a0c44d4c1957b728 - 
size: 2865 + md5: 66e8095fc931829e5dee80997bf73a33 + size: 2872 - path: output/parameter_final/model_parameter_final.parquet hash: md5 - md5: e8bee777cc37b928818f58e5f10c30ef + md5: b234a91486b487642e8738306f87c25c size: 8857 - path: output/parameter_range/model_parameter_range.parquet hash: md5 - md5: 3b2015c65992cfcc2a46b1c029d62212 + md5: 150000269b5873fa1b3eaeeff7887ce2 size: 501 - path: output/parameter_search/model_parameter_search.parquet hash: md5 - md5: 3b2015c65992cfcc2a46b1c029d62212 + md5: 150000269b5873fa1b3eaeeff7887ce2 size: 501 - path: output/test_card/model_test_card.parquet hash: md5 - md5: 0c39e69ea32a78d6ffadf87fc9eab1e0 - size: 1085792 + md5: e7901e36f5ce271eb0b029b22d17a4ac + size: 1398727 - path: output/workflow/fit/model_workflow_fit.zip hash: md5 - md5: d7223e5a080f2bbaaca75ab8eeddfb2b - size: 11610240 + md5: e8bec722aa37a604dd068469c52f5adf + size: 11666878 - path: output/workflow/recipe/model_workflow_recipe.rds hash: md5 - md5: bef3c1299229b126404c8ac251ad981e - size: 3391336 + md5: 1a995afd1800871877dbcbe3695f1032 + size: 4375559 assess: cmd: Rscript pipeline/02-assess.R deps: - path: input/assessment_data.parquet hash: md5 - md5: 9f1a4cb2c2b1533e568b936404913d44 - size: 84715114 + md5: 4e1ac463b4d74fb9b238fa5e2c19210a + size: 80070368 - path: input/condo_strata_data.parquet hash: md5 - md5: 68c07b633902d6de2b7f564ad2e5e304 - size: 40750 + md5: 413f828ff0d1a2ca4d34ae0c6ca33ca8 + size: 40859 - path: input/land_nbhd_rate_data.parquet hash: md5 - md5: e508daf5790982c303d6503fe1cb8e2b + md5: 6c1baaf2acbcba9869025bb336f4ad25 size: 4413 - path: input/training_data.parquet hash: md5 - md5: 97b6ced3adb788e20fb2fc3758cd38a2 - size: 53281136 + md5: 2fe90bea662f5624408f5927374abaeb + size: 74835195 - path: output/workflow/fit/model_workflow_fit.zip hash: md5 - md5: d7223e5a080f2bbaaca75ab8eeddfb2b - size: 11610240 + md5: e8bec722aa37a604dd068469c52f5adf + size: 11666878 - path: output/workflow/recipe/model_workflow_recipe.rds hash: md5 - md5: 
bef3c1299229b126404c8ac251ad981e - size: 3391336 + md5: 1a995afd1800871877dbcbe3695f1032 + size: 4375559 params: params.yaml: assessment: @@ -332,12 +336,14 @@ stages: - char_building_non_units - char_bldg_is_mixed_use - char_building_sf + - char_unit_sf + - char_bedrooms + - char_half_baths + - char_full_baths - loc_longitude - loc_latitude - - loc_env_flood_fema_sfha + - loc_census_tract_geoid - loc_env_flood_fs_factor - - loc_env_flood_fs_risk_direction - - loc_env_airport_noise_dnl - loc_school_elementary_district_geoid - loc_school_secondary_district_geoid - loc_access_cmap_walk_nta_score @@ -347,8 +353,6 @@ stages: - prox_num_bus_stop_in_half_mile - prox_num_foreclosure_per_1000_pin_past_5_years - prox_num_school_in_half_mile - - prox_num_school_with_rating_in_half_mile - - prox_avg_school_rating_in_half_mile - prox_airport_dnl_total - prox_nearest_bike_trail_dist_ft - prox_nearest_cemetery_dist_ft @@ -362,12 +366,13 @@ stages: - prox_nearest_park_dist_ft - prox_nearest_railroad_dist_ft - prox_nearest_secondary_road_dist_ft + - prox_nearest_university_dist_ft + - prox_nearest_vacant_land_dist_ft - prox_nearest_water_dist_ft - prox_nearest_golf_course_dist_ft - acs5_percent_age_children - acs5_percent_age_senior - acs5_median_age_total - - acs5_percent_mobility_no_move - acs5_percent_mobility_moved_from_other_state - acs5_percent_household_family_married - acs5_percent_household_nonfamily_alone @@ -385,8 +390,9 @@ stages: - acs5_percent_household_total_occupied_w_sel_cond - acs5_percent_mobility_moved_in_county - other_tax_bill_rate - - other_school_district_elementary_avg_rating - - other_school_district_secondary_avg_rating + - ccao_is_active_exe_homeowner + - ccao_is_corner_lot + - ccao_n_years_exe_homeowner - time_sale_year - time_sale_day - time_sale_quarter_of_year @@ -398,19 +404,15 @@ stages: - meta_strata_1 - meta_strata_2 pv: - nonlivable_threshold: 1000 - nonlivable_fixed_fmv: 30000 land_pct_of_total_cap: 0.5 round_break: - 1000 - 10000 - - 100000 
round_to_nearest: - 1 - - 500 - - 5000 - - 10000 - round_type: floor + - 10 + - 100 + round_type: ceiling ratio_study: far_year: '2021' far_stage: board @@ -436,16 +438,16 @@ stages: outs: - path: output/assessment_card/model_assessment_card.parquet hash: md5 - md5: 32956ff98cb61bf379d91876075d856a - size: 46538183 + md5: 7af071fdbf4ff8ba35ae158d4b6480f7 + size: 39005384 - path: output/assessment_pin/model_assessment_pin.parquet hash: md5 - md5: e4b201478916e76c05281e80239a1715 - size: 43587426 + md5: 0305e937be3245ca7403c8d2d7b714fa + size: 41683293 - path: output/intermediate/timing/model_timing_assess.parquet hash: md5 - md5: e5aa33e79f26f4c243126e3874f8df2c - size: 2879 + md5: ee8d205dec3fe1a5d77f6180557657e1 + size: 2886 evaluate: cmd: Rscript pipeline/03-evaluate.R deps: diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index b4f92dc..51e19c1 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -45,7 +45,9 @@ training_data <- dbGetQuery( sale.buyer_name AS meta_sale_buyer_name, sale.num_parcels_sale AS meta_sale_num_parcels, sale.sv_is_outlier, - sale.sv_outlier_type, + sale.sv_outlier_reason1, + sale.sv_outlier_reason2, + sale.sv_outlier_reason3, condo.* FROM model.vw_pin_condo_input condo INNER JOIN default.vw_pin_sale sale @@ -121,7 +123,6 @@ rm(AWS_ATHENA_CONN_NOCTUA) - #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 3. Define Functions ---------------------------------------------------------- #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -270,12 +271,12 @@ training_data_klg <- training_data_ms %>% # likely non-arms-length sales. 
ONLY APPLIES to multi-sale properties training_data_fil <- training_data_klg %>% mutate( - sv_outlier_type = case_when( + sv_outlier_reason1 = case_when( meta_sale_price < 50000 & meta_sale_num_parcels == 2 ~ "Low price (multi)", meta_sale_price > 1700000 & meta_sale_num_parcels == 2 ~ "High price (multi)", - TRUE ~ sv_outlier_type + TRUE ~ sv_outlier_reason1 ), sv_is_outlier = ifelse( (meta_sale_price < 50000 & meta_sale_num_parcels == 2) | @@ -285,12 +286,12 @@ training_data_fil <- training_data_klg %>% ), # Kludge sale validation flags based on raw price for sales added later # due to https://github.com/ccao-data/data-architecture/pull/334 - sv_outlier_type = case_when( + sv_outlier_reason1 = case_when( meta_sale_price < 40000 & sv_added_later ~ "Low price (raw)", meta_sale_price > 1500000 & sv_added_later ~ "High price (raw)", - TRUE ~ sv_outlier_type + TRUE ~ sv_outlier_reason1 ), sv_is_outlier = ifelse( (meta_sale_price < 40000 & sv_added_later) | @@ -322,17 +323,25 @@ training_data_clean <- training_data_fil %>% TRUE, sv_is_outlier ), - sv_outlier_type = ifelse( - meta_modeling_group == "NONLIVABLE", - "Non-livable area", - sv_outlier_type + # Put 'Non-livable area' in the first empty reason column. mutate() runs + # sequentially, so later columns check the already-updated earlier ones + sv_outlier_reason1 = case_when( + meta_modeling_group == "NONLIVABLE" & is.na(sv_outlier_reason1) ~ "Non-livable area", + TRUE ~ sv_outlier_reason1 + ), + sv_outlier_reason2 = case_when( + meta_modeling_group == "NONLIVABLE" & sv_outlier_reason1 != "Non-livable area" & is.na(sv_outlier_reason2) ~ "Non-livable area", + TRUE ~ sv_outlier_reason2 + ), + sv_outlier_reason3 = case_when( + meta_modeling_group == "NONLIVABLE" & sv_outlier_reason1 != "Non-livable area" & sv_outlier_reason2 != "Non-livable area" & is.na(sv_outlier_reason3) ~ "Non-livable area", + TRUE ~ sv_outlier_reason3 ) ) %>% # Only exclude explicit outliers from training. 
Sales with missing validation # outcomes will be considered non-outliers mutate( - sv_is_outlier = replace_na(sv_is_outlier, FALSE), - sv_outlier_type = replace_na(sv_outlier_type, "Not outlier") + sv_is_outlier = replace_na(sv_is_outlier, FALSE) ) %>% # Some Athena columns are stored as arrays but are converted to string on # ingest. In such cases, take the first element and clean the string @@ -472,7 +481,7 @@ message("Calculating condo strata") bldg_5yr_sales_avg <- training_data_clean %>% filter( meta_sale_date > make_date(as.numeric(params$input$max_sale_year) - 4), - !sv_is_outlier + !sv_outlier_reason1 %in% c("Non-livable area", "High price (multi)", "Low price (multi)") ) %>% select( meta_pin10, meta_sale_price, meta_sale_date, diff --git a/pipeline/01-train.R b/pipeline/01-train.R index 214f24f..0857cbf 100644 --- a/pipeline/01-train.R +++ b/pipeline/01-train.R @@ -27,7 +27,7 @@ message("Preparing model training data") # Load the full set of training data, then arrange by sale date in order to # facilitate out-of-time sampling/validation training_data_full <- read_parquet(paths$input$training$local) %>% - filter(!sv_is_outlier) %>% + filter(!sv_outlier_reason1 %in% c("Non-livable area", "High price (multi)", "Low price (multi)")) %>% arrange(meta_sale_date) # Create train/test split by time, with most recent observations in the test set diff --git a/pipeline/02-assess.R b/pipeline/02-assess.R index 50449b5..d3a4214 100644 --- a/pipeline/02-assess.R +++ b/pipeline/02-assess.R @@ -202,16 +202,14 @@ sales_data_two_most_recent <- sales_data %>% distinct( meta_pin, meta_year, meta_sale_price, meta_sale_date, meta_sale_document_num, - sv_outlier_type, meta_sale_num_parcels, sv_added_later + sv_outlier_reason1, sv_outlier_reason2, sv_outlier_reason3, + meta_sale_num_parcels, sv_added_later ) %>% # Include outliers, since these data are used for desk review and # not for modeling - rename(meta_sale_outlier_type = sv_outlier_type) %>% - mutate( - 
meta_sale_outlier_type = ifelse( - meta_sale_outlier_type == "Not outlier", NA, meta_sale_outlier_type - ) - ) %>% + rename(meta_sale_outlier_reason1 = sv_outlier_reason1, + meta_sale_outlier_reason2 = sv_outlier_reason2, + meta_sale_outlier_reason3 = sv_outlier_reason3) %>% group_by(meta_pin) %>% slice_max(meta_sale_date, n = 2) %>% mutate(mr = paste0("sale_recent_", row_number())) %>% @@ -222,7 +220,9 @@ sales_data_two_most_recent <- sales_data %>% meta_sale_date, meta_sale_price, meta_sale_document_num, - meta_sale_outlier_type, + meta_sale_outlier_reason1, + meta_sale_outlier_reason2, + meta_sale_outlier_reason3, meta_sale_num_parcels, sv_added_later ), @@ -271,7 +271,7 @@ assessment_data_pin <- assessment_data_merged %>% meta_pin_num_landlines, char_yrblt, # Keep overall building square footage char_total_bldg_sf = char_building_sf, char_unit_sf, char_land_sf, # Keep locations, prior year values, and indicators diff --git a/pipeline/03-evaluate.R b/pipeline/03-evaluate.R index 6828aaa..f9f83ba 100644 --- a/pipeline/03-evaluate.R +++ b/pipeline/03-evaluate.R @@ -12,7 +12,7 @@ tictoc::tic("Evaluate") # Load libraries, helpers, and recipes from files purrr::walk(list.files("R/", "\\.R$", full.names = TRUE), source) # Enable parallel backend for generating stats more quickly plan(multisession, workers = num_threads) # Renaming dictionary for input columns. We want the actual value of the column @@ -351,7 +351,7 @@ pwalk( function(geo, cls) { gen_agg_stats( data = test_data_card, truth = meta_sale_price, estimate = !!pred, bldg_sqft = char_unit_sf, rsn_col = prior_near_tot,