From eb6e14975965dbb037b9f10b9047546189db8403 Mon Sep 17 00:00:00 2001 From: Dan Snow Date: Mon, 30 Oct 2023 18:53:48 -0500 Subject: [PATCH 1/5] Add sale table column defs --- dbt/models/sale/docs.md | 10 ++- dbt/models/sale/schema.yml | 145 ++++++++++++++++++++++++++++++++++- dbt/models/shared_columns.md | 11 +++ 3 files changed, 161 insertions(+), 5 deletions(-) diff --git a/dbt/models/sale/docs.md b/dbt/models/sale/docs.md index e990cb5e5..3c10dd621 100644 --- a/dbt/models/sale/docs.md +++ b/dbt/models/sale/docs.md @@ -2,6 +2,10 @@ This table holds the flag information from the sales val program. {% enddocs %} +{% docs foreclosure %} +Foreclosures +{% enddocs %} + {% docs parameter %} This table holds information about the specifications used to flag outliers in the sales val program. {% enddocs %} @@ -14,6 +18,10 @@ This table holds group mean information which we can utilize to explain exactly View to help the upload process of sales validation flags into iasWorld. {% enddocs %} +{% docs mydec %} +mydec +{% enddocs %} + {% docs vw_ias_salesval_upload %} View to help the upload process of sales validation flags into iasWorld. 
-{% enddocs %} \ No newline at end of file +{% enddocs %} diff --git a/dbt/models/sale/schema.yml b/dbt/models/sale/schema.yml index f73a05788..49be43a14 100644 --- a/dbt/models/sale/schema.yml +++ b/dbt/models/sale/schema.yml @@ -1,17 +1,154 @@ sources: - name: sale - tags: - - load_auto tables: - name: flag description: '{{ doc("flag") }}' - - name: parameter - description: '{{ doc("parameter") }}' + tags: + - load_auto + + columns: + - name: ptax_flag_original + description: | + Whether or not this sale was flagged on Q10 of the + PTAX-203 form (regardless of statistical deviation) + - name: meta_sale_document_number + description: '{{ doc("shared_column_document_number") }}' + - name: rolling_window + description: | + Rolling window period used to calculate statistics + for flagging this sale + - name: run_id + description: '{{ doc("shared_column_sv_run_id") }}' + - name: sv_is_heuristic_outlier + description: '{{ doc("shared_column_sv_is_heuristic_outlier") }}' + - name: sv_is_ptax_outlier + description: '{{ doc("shared_column_sv_is_ptax_outlier") }}' + - name: sv_is_outlier + description: '{{ doc("shared_column_sv_is_outlier") }}' + - name: sv_outlier_type + description: '{{ doc("shared_column_sv_outlier_type") }}' + - name: version + description: '{{ doc("shared_column_sv_version") }}' + + - name: foreclosure + description: '{{ doc("foreclosure") }}' + tags: + - load_manual + - name: group_mean description: '{{ doc("group_mean") }}' + tags: + - load_auto + + columns: + - name: group + description: | + Group string used as a unique identifier. 
+ + Typically a combination of year, township, and class + - name: group_size + description: Number of properties in the group + - name: run_id + description: '{{ doc("shared_column_sv_run_id") }}' + - name: mean_price + description: Mean price of the group, in FMV + - name: mean_price_per_sqft + description: Mean price per sqft (of building) of the group, in FMV + + - name: parameter + description: '{{ doc("parameter") }}' + tags: + - load_auto + + columns: + - name: condo_stat_groups + description: | + Groups used to calculate flagging statistics (std. dev.) + for condominium (class 299, 399) properties + - name: dev_bounds + description: | + Boundaries for standard deviation flagging. + + Sales with prices beyond these boundaries are flagged. + - name: earliest_data_ingest + description: | + Date of earliest sale used in validation. + + This inclusive of the rolling window period used for + calculating statistical groups. In other words, if the earliest + sale to-be-flagged is 2013-12-01 and the rolling window period + is 9 months, then the earliest sale *used* would be 2023-03-01 + - name: iso_forest_cols + description: Columns used as features in the isolation forest model + - name: latest_data_ingest + description: Date of latest sale used in validation + - name: min_group_thresh + description: | + Minimum number of sales required for statistical flagging + - name: ptax_sd + description: | + Boundaries for standard deviation flagging in combination + with a PTAX-203 flag + - name: res_stat_groups + description: | + Groups used to calculate flagging statistics (std. dev.) + for residential (class 2) properties + - name: rolling_window + description: | + Rolling window size, in months. + + For each target sale, calculate statistics (std. dev., + group size) using all sales in the period N months prior + - name: run_id + description: '{{ doc("shared_column_sv_run_id") }}' + - name: sales_flagged + description: | + Total number of sales flagged. 
+ + Inclusive of both sales flagged as outliers *and* sales + flagged as non-outliers + - name: short_term_owner_threshold + description: | + Properties with a significant price change and multiple + sales within this time duration (in days) are flagged + + - name: metadata description: '{{ doc("metadata") }}' + tags: + - load_auto + + columns: + - name: long_commit_sha + description: Full commit SHA of the code used for the model run + - name: run_id + description: '{{ doc("shared_column_sv_run_id") }}' + - name: run_timestamp + description: Start timestamp of the model run + - name: run_type + description: | + Type of model run. + + Variable can be one of `initial_flagging`, `recurring`, + or `manual_update` + - name: short_commit_sha + description: Short commit SHA of the code used for the model run + + - name: mydec + description: '{{ doc("mydec") }}' + tags: + - load_manual models: - name: sale.vw_ias_salesval_upload description: '{{ doc("vw_ias_salesval_upload") }}' + + columns: + - name: run_id + description: '{{ doc("shared_column_sv_run_id") }}' + - name: salekey + description: '{{ doc("shared_column_sale_key") }}' + - name: sv_is_outlier + description: '{{ doc("shared_column_sv_is_outlier") }}' + - name: sv_outlier_type + description: '{{ doc("shared_column_sv_outlier_type") }}' diff --git a/dbt/models/shared_columns.md b/dbt/models/shared_columns.md index a9f218d5d..0fd8d29c4 100644 --- a/dbt/models/shared_columns.md +++ b/dbt/models/shared_columns.md @@ -1077,6 +1077,8 @@ See [model-sales-val](https://github.com/ccao-data/model-sales-val) for full det {% docs shared_column_sv_is_ptax_outlier %} Outlier flagged due to certain answers on Q10 of the PTAX-203 form. +Must have a Q10 flag _in addition to_ a statistical flag. + See [model-sales-val](https://github.com/ccao-data/model-sales-val) for more details {% enddocs %} @@ -1090,6 +1092,15 @@ with `sv_is_ptax_outlier` (using OR logic). NOTE: Outlier flags only exist for sales _after_ 2014. 
{% enddocs %} +## sv_outlier_type + +{% docs shared_column_sv_outlier_type %} +Heuristic or model used to flag an outlier. + +See the [model-sales-val](https://github.com/ccao-data/model-sales-val) repo +for a list of possible flags. +{% enddocs %} + ## sv_run_id {% docs shared_column_sv_run_id %} From a3aed6f9cd0016727d45a322edfec0e91ce80f8b Mon Sep 17 00:00:00 2001 From: Dan Snow Date: Mon, 30 Oct 2023 19:38:15 -0500 Subject: [PATCH 2/5] Add sale table docs --- dbt/models/sale/docs.md | 54 +++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 7 deletions(-) diff --git a/dbt/models/sale/docs.md b/dbt/models/sale/docs.md index 3c10dd621..6928699fd 100644 --- a/dbt/models/sale/docs.md +++ b/dbt/models/sale/docs.md @@ -1,27 +1,67 @@ +# flag + {% docs flag %} -This table holds the flag information from the sales val program. +PIN-level sales validation flags created by +[model-sales-val](https://github.com/ccao-data/model-sales-val). + +This is the primary sales validation output table. Flags within this table +should be possible to reconstruct using the other sales validation tables: +`sale.group_mean`, `sale.parameter`, and `sale.metadata`. + +**Primary Key**: `meta_sale_document_number`, `run_id`, `version` {% enddocs %} +# foreclosure + {% docs foreclosure %} -Foreclosures +Foreclosure data ingested from Illinois Public Records (RIS). + +**Primary Key**: `pin`, `document_number` {% enddocs %} +# parameter + {% docs parameter %} -This table holds information about the specifications used to flag outliers in the sales val program. +Parameters used for each run of +[model-sales-val](https://github.com/ccao-data/model-sales-val), +including the statistical bounds, groupings, window sizes, etc. + +**Primary Key**: `run_id` {% enddocs %} +# group_mean + {% docs group_mean %} -This table holds group mean information which we can utilize to explain exactly why an outlier was flagged. 
+Information about groups used to calculate statistical deviations +for sales validation. + +**Primary Key**: `run_id`, `group` {% enddocs %} +# metadata + {% docs metadata %} -View to help the upload process of sales validation flags into iasWorld. +Information about the code used for a sales validation run, as well as +the start time and type of run. + +**Primary Key**: `run_id` {% enddocs %} +# mydec + {% docs mydec %} -mydec +MyDec data from the Illinois Department of Revenue (IDOR). Includes property +transfer declarations (sales) used to fill in missing data in `iasworld.sales` +and as an input to sales validation flagging. + +**Primary Key**: `document_number`, `year_of_sale` {% enddocs %} +# vw_ias_salesval_upload + {% docs vw_ias_salesval_upload %} -View to help the upload process of sales validation flags into iasWorld. +View for sales validation outputs to create an upload format compatible +with iasWorld. + +**Primary Key**: `salekey`, `run_id` {% enddocs %} From 4db42c886d96933fb69467ec33dc8c6b7fafccca Mon Sep 17 00:00:00 2001 From: Dan Snow Date: Mon, 30 Oct 2023 19:38:26 -0500 Subject: [PATCH 3/5] Capitalize MyDec --- dbt/models/default/schema/default.vw_pin_sale.yml | 2 +- dbt/models/shared_columns.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dbt/models/default/schema/default.vw_pin_sale.yml b/dbt/models/default/schema/default.vw_pin_sale.yml index d37ab7652..3e4df9074 100644 --- a/dbt/models/default/schema/default.vw_pin_sale.yml +++ b/dbt/models/default/schema/default.vw_pin_sale.yml @@ -14,7 +14,7 @@ models: - name: is_multisale description: '{{ doc("shared_column_sale_is_multisale") }}' - name: is_mydec_date - description: Indicator for whether or not the observation uses the myDec sale date + description: Indicator for whether or not the observation uses the MyDec sale date - name: nbhd description: '{{ doc("shared_column_nbhd_code") }}' - name: num_parcels_sale diff --git a/dbt/models/shared_columns.md 
b/dbt/models/shared_columns.md index 0fd8d29c4..1d9107b8b 100644 --- a/dbt/models/shared_columns.md +++ b/dbt/models/shared_columns.md @@ -989,7 +989,7 @@ prorated, but the building value is. {% docs shared_column_buyer_name %} Name of property buyer, as listed on deed. -Can be truncated by myDec/IDOR. See Clerk/Recorder of Deeds for full name. +Can be truncated by MyDec/IDOR. See Clerk/Recorder of Deeds for full name. {% enddocs %} ## document_number @@ -1049,7 +1049,7 @@ iasWorld internal sale identifier {% docs shared_column_sale_price %} Sale price of a PIN, as recorded on the deed. -Sales are sourced from myDec/IDOR. This serves as the outcome variable in regression models +Sales are sourced from MyDec/IDOR. This serves as the outcome variable in regression models {% enddocs %} ## seller_name @@ -1057,7 +1057,7 @@ Sales are sourced from myDec/IDOR. This serves as the outcome variable in regres {% docs shared_column_seller_name %} Name of property seller, as listed on deed. -Can be truncated by myDec/IDOR. See Clerk/Recorder of Deeds for full name. +Can be truncated by MyDec/IDOR. See Clerk/Recorder of Deeds for full name. {% enddocs %} # Sale Validation From 4eb71d588cf355607577748ddc6ca63bd96aa298 Mon Sep 17 00:00:00 2001 From: Dan Snow Date: Tue, 31 Oct 2023 14:56:15 +0000 Subject: [PATCH 4/5] Fix year typo in sales schema --- dbt/models/sale/schema.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/models/sale/schema.yml b/dbt/models/sale/schema.yml index 49be43a14..ec66eeca0 100644 --- a/dbt/models/sale/schema.yml +++ b/dbt/models/sale/schema.yml @@ -77,7 +77,7 @@ sources: This inclusive of the rolling window period used for calculating statistical groups. 
In other words, if the earliest sale to-be-flagged is 2013-12-01 and the rolling window period - is 9 months, then the earliest sale *used* would be 2023-03-01 + is 9 months, then the earliest sale *used* would be 2013-03-01 - name: iso_forest_cols description: Columns used as features in the isolation forest model - name: latest_data_ingest From b57e0a154027b40faef2bfe742ef40622cadebb8 Mon Sep 17 00:00:00 2001 From: Dan Snow Date: Tue, 31 Oct 2023 14:58:51 +0000 Subject: [PATCH 5/5] Add @wagnerlmichael suggestions --- dbt/models/sale/schema.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/dbt/models/sale/schema.yml b/dbt/models/sale/schema.yml index ec66eeca0..4ceb863c1 100644 --- a/dbt/models/sale/schema.yml +++ b/dbt/models/sale/schema.yml @@ -84,7 +84,11 @@ sources: description: Date of latest sale used in validation - name: min_group_thresh description: | - Minimum number of sales required for statistical flagging + Minimum number of sales required for statistical flagging. + + If the number of sales in a group under our methodology + (township, class, rolling window) is below this minimum, those sales + are not flagged and are set to `Not outlier` - name: ptax_sd description: | Boundaries for standard deviation flagging in combination @@ -98,7 +102,8 @@ sources: Rolling window size, in months. For each target sale, calculate statistics (std. dev., - group size) using all sales in the period N months prior + group size) using all sales in the period N months prior, + inclusive of the month of the sale itself - name: run_id description: '{{ doc("shared_column_sv_run_id") }}' - name: sales_flagged