From 348d7ec4a9b3c7e819908787050ac70cbe9d70f1 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Mon, 19 Aug 2024 11:34:17 -0500 Subject: [PATCH] Rename tag for iasWorld data tests and refactor script to run tests (#571) * Update README.md to clarify role of dbt generics * Make some updates to dbt/README.md to sketch out QC doc structure * Add link to QC tests and reports section to root README.md * Continue fleshing out dbt test docs in dbt/README.md * Clean up data testing section of dbt/README.md * Update dbt/README.md to add docs for running QC reports * Finish fleshing out docs on QC reports in dbt/README.md * Move transform_dbt_test_results.py script to dbt/scripts/ subdirectory * Fix small issues in dbt/README.md * Clean up some trailing whitespace in dbt/README.md * Run pip install step in dbt/ subdir in test_dbt_models workflow * Clarify bug described in unit tests section of dbt/README.md * Clarify --rebuild docs in dbt/README.md * Small clarification in dbt/README.md * Apply a few small nitpicks from Dan's code review to dbt/README.md Co-authored-by: Dan Snow <31494343+dfsnow@users.noreply.github.com> * Standardize on "Valuations staff" in dbt/README.md * Use relative paths in dbt/README.md * Refactor meta.export_template option to not expect file extensions for export_models script * Refactor docs to use clearer terminology for iasWorld data tests * Rename tag for iasWorld data tests and refactor script to run tests * Small tweak to dbt/README.md * Better error handling for dbt invocation in run_iasworld_data_tests.py * Rename `data_test_iasworld` selector -> `select_data_test_iasworld` * Add --target flag to run_iasworld_data_tests script --------- Co-authored-by: Dan Snow <31494343+dfsnow@users.noreply.github.com> --- .github/workflows/build_and_test_dbt.yaml | 6 +- .github/workflows/test_dbt_models.yaml | 29 +- dbt/README.md | 46 +- dbt/dbt_project.yml | 21 +- .../default/schema/default.vw_pin_address.yml | 2 +- 
.../default/schema/default.vw_pin_appeal.yml | 4 +- .../default/schema/default.vw_pin_value.yml | 18 +- .../iasworld/schema/iasworld.aasysjur.yml | 2 +- dbt/models/iasworld/schema/iasworld.addn.yml | 8 +- .../iasworld/schema/iasworld.addrindx.yml | 2 +- .../iasworld/schema/iasworld.aprval.yml | 6 +- .../iasworld/schema/iasworld.asmt_all.yml | 18 +- .../iasworld/schema/iasworld.asmt_hist.yml | 2 +- dbt/models/iasworld/schema/iasworld.cname.yml | 2 +- .../iasworld/schema/iasworld.comdat.yml | 16 +- dbt/models/iasworld/schema/iasworld.comnt.yml | 2 +- dbt/models/iasworld/schema/iasworld.cvleg.yml | 2 +- dbt/models/iasworld/schema/iasworld.cvown.yml | 2 +- .../iasworld/schema/iasworld.cvtran.yml | 2 +- dbt/models/iasworld/schema/iasworld.dedit.yml | 2 +- .../iasworld/schema/iasworld.dweldat.yml | 84 +- dbt/models/iasworld/schema/iasworld.enter.yml | 2 +- .../iasworld/schema/iasworld.exadmn.yml | 2 +- dbt/models/iasworld/schema/iasworld.exapp.yml | 2 +- .../iasworld/schema/iasworld.excode.yml | 2 +- dbt/models/iasworld/schema/iasworld.exdet.yml | 2 +- .../iasworld/schema/iasworld.htagnt.yml | 2 +- .../iasworld/schema/iasworld.htdates.yml | 2 +- dbt/models/iasworld/schema/iasworld.htpar.yml | 8 +- dbt/models/iasworld/schema/iasworld.land.yml | 12 +- .../iasworld/schema/iasworld.legdat.yml | 12 +- dbt/models/iasworld/schema/iasworld.lpmod.yml | 2 +- .../iasworld/schema/iasworld.lpnbhd.yml | 2 +- dbt/models/iasworld/schema/iasworld.oby.yml | 14 +- .../iasworld/schema/iasworld.owndat.yml | 8 +- .../iasworld/schema/iasworld.pardat.yml | 10 +- .../iasworld/schema/iasworld.permit.yml | 2 +- dbt/models/iasworld/schema/iasworld.rcoby.yml | 2 +- dbt/models/iasworld/schema/iasworld.sales.yml | 2 +- .../iasworld/schema/iasworld.splcom.yml | 2 +- .../iasworld/schema/iasworld.valclass.yml | 2 +- .../qc/qc.vw_nonlivable_condos_with_chars.sql | 3 - dbt/models/qc/schema.yml | 38 +- dbt/models/reporting/schema.yml | 10 +- dbt/models/tax/schema.yml | 6 +- 
.../requirements.run_iasworld_data_tests.txt | 6 + dbt/scripts/run_iasworld_data_tests.py | 1502 +++++++++++++++++ dbt/scripts/transform_dbt_test_results.py | 154 +- dbt/selectors.yml | 8 +- 49 files changed, 1799 insertions(+), 296 deletions(-) create mode 100644 dbt/scripts/requirements.run_iasworld_data_tests.txt create mode 100644 dbt/scripts/run_iasworld_data_tests.py diff --git a/.github/workflows/build_and_test_dbt.yaml b/.github/workflows/build_and_test_dbt.yaml index 425c3346f..c861714de 100644 --- a/.github/workflows/build_and_test_dbt.yaml +++ b/.github/workflows/build_and_test_dbt.yaml @@ -84,14 +84,14 @@ jobs: if [[ $MODIFIED_RESOURCES_ONLY == 'true' ]]; then if [[ $MANUALLY_DISPATCHED == 'true' ]]; then echo "Running tests on manually selected resources" - dbt test -t "$TARGET" -s ${{ inputs.models }} --exclude "tag:test_qc*" --defer --state "$STATE_DIR" + dbt test -t "$TARGET" -s ${{ inputs.models }} --exclude "tag:data_test_iasworld" --defer --state "$STATE_DIR" else echo "Running tests on modified/new resources only" - dbt test -t "$TARGET" -s state:modified state:new --exclude "tag:test_qc*" --defer --state "$STATE_DIR" + dbt test -t "$TARGET" -s state:modified state:new --exclude "tag:data_test_iasworld" --defer --state "$STATE_DIR" fi else echo "Running tests on all resources" - dbt test -t "$TARGET" --exclude "tag:test_qc*" + dbt test -t "$TARGET" --exclude "tag:data_test_iasworld" fi working-directory: ${{ env.PROJECT_DIR }} shell: bash diff --git a/.github/workflows/test_dbt_models.yaml b/.github/workflows/test_dbt_models.yaml index f32fc7b16..e5fcb65e4 100644 --- a/.github/workflows/test_dbt_models.yaml +++ b/.github/workflows/test_dbt_models.yaml @@ -23,37 +23,14 @@ jobs: role-to-assume: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }} - name: Install Python requirements - run: pip install -r scripts/requirements.transform_dbt_test_results.txt + run: pip install -r scripts/requirements.run_iasworld_data_tests.txt working-directory: ${{ 
env.PROJECT_DIR }} shell: bash - name: Run tests run: | - # dbt doesn't differentiate between test failures and errors, but we - # need to since we expect failures. Do this by capturing the output - # and checking its contents to look for errors - if output=$(dbt test --target "$TARGET" --selector qc_tests --store-failures); then - echo "$output" - else - status_code="$?" - if [[ "$output" =~ "Runtime Error" || "$output" =~ "Compilation Error" ]]; then - # The presence of an error string indicates that these tests - # failed due to an error, so print the output and fail the - # pipeline - echo "$output" - exit "$status_code" - else - # The tests must have failed rather than errored out, so - # print the output but don't raise an error - echo "$output" - fi - fi - working-directory: ${{ env.PROJECT_DIR }} - shell: bash - - - name: Extract test results - run: | - python3 scripts/transform_dbt_test_results.py \ + python3 scripts/run_iasworld_data_tests.py \ + --target "$TARGET" \ --output-dir ./qc_test_results/ working-directory: ${{ env.PROJECT_DIR }} shell: bash diff --git a/dbt/README.md b/dbt/README.md index cd2a3f5ea..b582b6ef2 100644 --- a/dbt/README.md +++ b/dbt/README.md @@ -541,14 +541,9 @@ built for iasWorld data tests. #### Running iasWorld data tests -The iasWorld data test suite can be run using the [`dbt test` -command](https://docs.getdbt.com/reference/commands/test) with a dedicated -[selector](https://docs.getdbt.com/reference/node-selection/yaml-selectors) -and the [`--store-failures` -flag](https://docs.getdbt.com/reference/resource-configs/store_failures), -and its output can be transformed for review and analysis using the -[`transform_dbt_test_results` script](./scripts/transform_dbt_test_results.py). -This script reads the metadata for the most recent `dbt test` run and outputs a number of +The iasWorld data test suite can be run using the [`run_iasworld_data_tests` +script](./scripts/run_iasworld_data_tests.py). 
+This script runs the tests and reads the metadata for the run to output a number of different artifacts with information about the tests: * An Excel workbook with detailed information on each failure to aid in resolving @@ -572,25 +567,19 @@ Typically, Valuations staff will ask for test output for a specific township. We [township code](https://github.com/ccao-data/wiki/blob/master/Data/Townships.md) for this township using the bash variable `$TOWNSHIP_CODE`. -First, run the tests locally using dbt and the [iasWorld data test -selector](./selectors.yml): +Run the tests locally using the [`run_iasworld_data_tests` +script](./scripts/run_iasworld_data_tests.py): ```bash # Make sure you're in the dbt subdirectory with the virtualenv activated cd dbt source venv/bin/activate -# Run the tests and store failures in Athena -dbt test --selector qc_tests --store-failures +# Run the script +python3 scripts/run_iasworld_data_tests.py --township $TOWNSHIP_CODE ``` -Next, transform the results for the township that Valuations staff requested: - -```bash -python3 scripts/transform_dbt_test_results.py --township $TOWNSHIP_CODE -``` - -Finally, spot check the Excel workbook that the script produced to make sure it's formatted +Then, check the Excel workbook that the script produced to make sure it's formatted correctly, and send it to Valuations staff for review. #### Adding iasWorld data tests @@ -601,14 +590,14 @@ by the script: * One of either the test or the model that the test is defined on must be [tagged](https://docs.getdbt.com/reference/resource-configs/tags) with -the tag `test_qc_iasworld` +the tag `data_test_iasworld` * Prefer tagging the model, and fall back to tagging the test if for some reason the model cannot be tagged (e.g. 
if it has some non-QC tests defined on it) * If you would like to disable a data test but you don't want to remove it - altogether, you can tag it or its model with `test_qc_exclude_from_workbook`, + altogether, you can tag it or its model with `data_test_iasworld_exclude_from_workbook`, which will prevent the test (or all of the model's tests, if you tagged - the model) from running as part of the `qc_tests` selector + the model) from running as part of the `select_data_test_iasworld` selector * The test definition must supply a few specific parameters: * `name` must be set and follow the pattern `iasworld__` @@ -622,14 +611,14 @@ the tag `test_qc_iasworld` test you're using * `config.where` should typically set to provide a filter expression that restricts tests to unique rows and to rows matching a date range - set by the `test_qc_year_start` and `test_qc_year_end` + set by the `data_test_iasworld_year_start` and `data_test_iasworld_year_end` [project variables](https://docs.getdbt.com/docs/build/project-variables) * `meta` should be set with a few specific string attributes: * `description` (required): A short human-readable description of the test * `category` (optional): A workbook category for the test, required if a category is not defined for the test's generic in the `TEST_CATEGORIES` - constant in the [`transform_dbt_test_results` - script](./scripts/transform_dbt_test_results.py) + constant in the [`run_iasworld_data_tests` + script](./scripts/run_iasworld_data_tests.py) * `table_name` (optional): The name of the table to report in the output workbook, if the workbook should report a different table name than the name of the model that the test is defined on @@ -674,8 +663,8 @@ do so, you have two options: and other tests. You'll also need to follow a few extra steps that are specific to our environment: 1. 
Add a default category for your generic test in - the `TEST_CATEGORIES` constant in the [`transform_dbt_test_results` - script](./scripts/transform_dbt_test_results.py) + the `TEST_CATEGORIES` constant in the [`run_iasworld_data_tests` + script](./scripts/run_iasworld_data_tests.py) 2. Make sure that your generic test supports the `additional_select_columns` parameter that most of our generic tests support, making use of the `format_additional_select_columns` macro to format the @@ -767,8 +756,7 @@ model during export: reports at the same time, so we tag each model with the `qc_report_town_close` tag such that we can select them all at once when running the `export_models` script using `--select tag:qc_report_town_close`. For consistency, prefer tags that start with the `qc_report_*` - prefix, but beware not to use the `test_qc_*` prefix, which is instead used for [QC - tests](#adding-qc-tests). + prefix. * **Filtering**: Since the `export_models` script can filter your model using the `--where` option, you should define your model such that it selects any fields that you want to use for filtering in the `SELECT` clause. It's common to filter reports by `taxyr` and diff --git a/dbt/dbt_project.yml b/dbt/dbt_project.yml index f38d21355..c9ade8c2d 100644 --- a/dbt/dbt_project.yml +++ b/dbt/dbt_project.yml @@ -22,19 +22,18 @@ clean-targets: - "dbt_packages" # Variables that can change with invocation of dbt commands. Note that we parse -# the test_* variables by name in scripts, so if you change their names, make -# sure you check all of our scripts and adjust them accordingly +# the data_test_iasworld_* variables by name in scripts, so if you change their +# names, make sure you check all of our scripts and adjust them accordingly vars: - # Start year for testing data using test_qc_* tagged dbt tests. Typically - # this should be the current year, since errors in past data cannot usually - # be amended once records are closed. 
Set as an integer for compatibility - # with comparison operators and SQL's BETWEEN - test_qc_year_start: 2024 + # Start year for iasWorld data tests. Typically this should be the current + # year, since errors in past data cannot usually be amended once records are + # closed. Set as an integer for compatibility with comparison operators and + # SQL's BETWEEN + data_test_iasworld_year_start: 2024 - # End year for testing data using test_qc_* tagged dbt tests. Typically set - # to a date in the future, but can also be use to select specific time - # frames for testing - test_qc_year_end: 2030 + # End year for iasWorld data tests. Typically set to a date in the future, + # but can also be used to select specific time frames for testing + data_test_iasworld_year_end: 2030 # Configuring models # Full documentation: https://docs.getdbt.com/docs/configuring-models diff --git a/dbt/models/default/schema/default.vw_pin_address.yml b/dbt/models/default/schema/default.vw_pin_address.yml index 9624c4c35..73ea99864 100644 --- a/dbt/models/default/schema/default.vw_pin_address.yml +++ b/dbt/models/default/schema/default.vw_pin_address.yml @@ -64,7 +64,7 @@ models: - mail_address_zipcode_1 - mail_address_zipcode_2 config: - where: CAST(year AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + where: CAST(year AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} error_if: ">900000" - unique_combination_of_columns: name: default_vw_pin_address_unique_by_14_digit_pin_and_year diff --git a/dbt/models/default/schema/default.vw_pin_appeal.yml b/dbt/models/default/schema/default.vw_pin_appeal.yml index 4e599d836..2a388e929 100644 --- a/dbt/models/default/schema/default.vw_pin_appeal.yml +++ b/dbt/models/default/schema/default.vw_pin_appeal.yml @@ -91,7 +91,7 @@ models: - mailed_tot - certified_tot config: - where: CAST(year AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} 
+ where: CAST(year AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} error_if: ">266719" # `change` should be an enum - expression_is_true: @@ -103,7 +103,7 @@ models: - case_no - change config: - where: CAST(year AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + where: CAST(year AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} - row_count: name: default_vw_pin_appeal_row_count above: 8407667 # as of 2023-11-22 diff --git a/dbt/models/default/schema/default.vw_pin_value.yml b/dbt/models/default/schema/default.vw_pin_value.yml index a3b708e6e..6beb23b2f 100644 --- a/dbt/models/default/schema/default.vw_pin_value.yml +++ b/dbt/models/default/schema/default.vw_pin_value.yml @@ -80,44 +80,44 @@ models: name: default_vw_pin_value_mailed_class_not_null column_name: mailed_class config: - where: CAST(year AS int) < {{ var('test_qc_year_start') }} + where: CAST(year AS int) < {{ var('data_test_iasworld_year_start') }} error_if: ">289" - not_null: name: default_vw_pin_value_certified_class_not_null column_name: certified_class config: - where: CAST(year AS int) < {{ var('test_qc_year_start') }} + where: CAST(year AS int) < {{ var('data_test_iasworld_year_start') }} error_if: ">15" - not_null: name: default_vw_pin_value_board_class_not_null column_name: board_class config: - where: CAST(year AS int) < {{ var('test_qc_year_start') }} - 1 + where: CAST(year AS int) < {{ var('data_test_iasworld_year_start') }} - 1 error_if: ">1260" - not_null: name: default_vw_pin_value_mailed_tot_not_null column_name: mailed_tot config: - where: CAST(year AS int) < {{ var('test_qc_year_start') }} + where: CAST(year AS int) < {{ var('data_test_iasworld_year_start') }} error_if: ">310" - not_null: name: default_vw_pin_value_certified_tot_not_null column_name: certified_tot config: - where: CAST(year AS int) < {{ var('test_qc_year_start') }} + where: 
CAST(year AS int) < {{ var('data_test_iasworld_year_start') }} error_if: ">15" - not_null: name: default_vw_pin_value_board_tot_not_null column_name: board_tot config: - where: CAST(year AS int) < {{ var('test_qc_year_start') }} - 1 + where: CAST(year AS int) < {{ var('data_test_iasworld_year_start') }} - 1 error_if: ">1260" - not_null: name: default_vw_pin_value_mailed_tot_mv_not_null column_name: mailed_tot_mv config: where: | - CAST(year AS int) < {{ var('test_qc_year_start') }} AND + CAST(year AS int) < {{ var('data_test_iasworld_year_start') }} AND year >= '2021' error_if: ">310" - not_null: @@ -125,7 +125,7 @@ models: column_name: certified_tot_mv config: where: | - CAST(year AS int) < {{ var('test_qc_year_start') }} AND + CAST(year AS int) < {{ var('data_test_iasworld_year_start') }} AND year >= '2021' error_if: ">15" - not_null: @@ -133,6 +133,6 @@ models: column_name: board_tot_mv config: where: | - CAST(year AS int) < {{ var('test_qc_year_start') }} - 1 AND + CAST(year AS int) < {{ var('data_test_iasworld_year_start') }} - 1 AND year >= '2020' error_if: ">1260" diff --git a/dbt/models/iasworld/schema/iasworld.aasysjur.yml b/dbt/models/iasworld/schema/iasworld.aasysjur.yml index 70292539d..c72ac0c1f 100644 --- a/dbt/models/iasworld/schema/iasworld.aasysjur.yml +++ b/dbt/models/iasworld/schema/iasworld.aasysjur.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: aasysjur diff --git a/dbt/models/iasworld/schema/iasworld.addn.yml b/dbt/models/iasworld/schema/iasworld.addn.yml index dd5f88e4d..60baa4cbe 100644 --- a/dbt/models/iasworld/schema/iasworld.addn.yml +++ b/dbt/models/iasworld/schema/iasworld.addn.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld - type_res tables: @@ -30,7 +30,7 @@ sources: - wen config: &unique-conditions where: | 
- CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL meta: @@ -75,7 +75,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND class != 'EX' AND cur = 'Y' AND deactivat IS NULL @@ -93,7 +93,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} meta: description: cur should be 'Y' or 'D' - name: deactivat diff --git a/dbt/models/iasworld/schema/iasworld.addrindx.yml b/dbt/models/iasworld/schema/iasworld.addrindx.yml index 097c4c665..64f375caf 100644 --- a/dbt/models/iasworld/schema/iasworld.addrindx.yml +++ b/dbt/models/iasworld/schema/iasworld.addrindx.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: addrindx diff --git a/dbt/models/iasworld/schema/iasworld.aprval.yml b/dbt/models/iasworld/schema/iasworld.aprval.yml index 2ea1a2cf1..2456702d5 100644 --- a/dbt/models/iasworld/schema/iasworld.aprval.yml +++ b/dbt/models/iasworld/schema/iasworld.aprval.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: aprval @@ -34,7 +34,7 @@ sources: - wen config: &unique-conditions where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr 
AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL meta: @@ -121,7 +121,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} meta: description: cur should be 'Y' or 'D' - name: deactivat diff --git a/dbt/models/iasworld/schema/iasworld.asmt_all.yml b/dbt/models/iasworld/schema/iasworld.asmt_all.yml index 7444d2eb6..1f1ea989f 100644 --- a/dbt/models/iasworld/schema/iasworld.asmt_all.yml +++ b/dbt/models/iasworld/schema/iasworld.asmt_all.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: asmt_all @@ -28,7 +28,7 @@ sources: - wen config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND class NOT IN ('EX', 'RR', '999') AND NOT REGEXP_LIKE(class, '[0-9]{3}[A|B]') AND rolltype != 'RR' @@ -50,7 +50,7 @@ sources: agg_func: max config: &unique-conditions where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND rolltype != 'RR' AND procname IN ('CCAOVALUE', 'CCAOFINAL', 'BORVALUE') AND deactivat IS NULL @@ -75,7 +75,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} meta: 
description: cur should be 'Y', 'D', or 'N' - name: deactivat @@ -128,7 +128,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND rolltype != 'RR' AND deactivat IS NULL AND valclass IS NULL @@ -146,7 +146,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL meta: @@ -229,7 +229,7 @@ sources: - wen config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND rolltype != 'RR' AND procname IN ('CCAOVALUE', 'CCAOFINAL', 'BORVALUE') AND deactivat IS NULL @@ -249,7 +249,7 @@ sources: additional_select_columns: *select-columns-with-class config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND rolltype != 'RR' AND procname IN ('CCAOVALUE', 'CCAOFINAL', 'BORVALUE') AND deactivat IS NULL @@ -269,7 +269,7 @@ sources: name: iasworld_asmt_all_impr_class_valasm2_gt_0 config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND rolltype != 'RR' AND procname IN ('CCAOVALUE', 'CCAOFINAL', 'BORVALUE') AND deactivat IS NULL diff --git 
a/dbt/models/iasworld/schema/iasworld.asmt_hist.yml b/dbt/models/iasworld/schema/iasworld.asmt_hist.yml index 4a5847d8d..3f8f18d7e 100644 --- a/dbt/models/iasworld/schema/iasworld.asmt_hist.yml +++ b/dbt/models/iasworld/schema/iasworld.asmt_hist.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: asmt_hist diff --git a/dbt/models/iasworld/schema/iasworld.cname.yml b/dbt/models/iasworld/schema/iasworld.cname.yml index 02288e28b..d89df60d7 100644 --- a/dbt/models/iasworld/schema/iasworld.cname.yml +++ b/dbt/models/iasworld/schema/iasworld.cname.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: cname diff --git a/dbt/models/iasworld/schema/iasworld.comdat.yml b/dbt/models/iasworld/schema/iasworld.comdat.yml index a0468cbac..67dd791c4 100644 --- a/dbt/models/iasworld/schema/iasworld.comdat.yml +++ b/dbt/models/iasworld/schema/iasworld.comdat.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld - type_ic tables: @@ -30,7 +30,7 @@ sources: - wen config: &unique-conditions where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL meta: @@ -69,7 +69,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND calc_meth IS NOT NULL @@ -116,7 +116,7 @@ sources: - mktadj config: where: | 
- CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND (external_occpct IS NOT NULL OR mktadj IS NOT NULL) @@ -135,7 +135,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND NOT REGEXP_LIKE(class, '[0-9]{3}[A|B]') AND class NOT IN ('EX', 'RR') AND cur = 'Y' @@ -165,7 +165,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} meta: description: cur should be 'Y' or 'D' - name: deactivat @@ -238,7 +238,7 @@ sources: - mktadj config: where: - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND chgrsn IN ('5', '5B') @@ -301,7 +301,7 @@ sources: - external_occpct config: where: - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND chgrsn IN ('5', '5B') diff --git a/dbt/models/iasworld/schema/iasworld.comnt.yml b/dbt/models/iasworld/schema/iasworld.comnt.yml index dec0bd17d..ccded887f 100644 --- a/dbt/models/iasworld/schema/iasworld.comnt.yml +++ b/dbt/models/iasworld/schema/iasworld.comnt.yml @@ -3,7 +3,7 @@ sources: 
loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: comnt diff --git a/dbt/models/iasworld/schema/iasworld.cvleg.yml b/dbt/models/iasworld/schema/iasworld.cvleg.yml index e56393023..1cc56c4f1 100644 --- a/dbt/models/iasworld/schema/iasworld.cvleg.yml +++ b/dbt/models/iasworld/schema/iasworld.cvleg.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: cvleg diff --git a/dbt/models/iasworld/schema/iasworld.cvown.yml b/dbt/models/iasworld/schema/iasworld.cvown.yml index de5e53e64..96a800f9d 100644 --- a/dbt/models/iasworld/schema/iasworld.cvown.yml +++ b/dbt/models/iasworld/schema/iasworld.cvown.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: cvown diff --git a/dbt/models/iasworld/schema/iasworld.cvtran.yml b/dbt/models/iasworld/schema/iasworld.cvtran.yml index dd56b8e79..998912af6 100644 --- a/dbt/models/iasworld/schema/iasworld.cvtran.yml +++ b/dbt/models/iasworld/schema/iasworld.cvtran.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: cvtran diff --git a/dbt/models/iasworld/schema/iasworld.dedit.yml b/dbt/models/iasworld/schema/iasworld.dedit.yml index c25dd0ebe..2dd169349 100644 --- a/dbt/models/iasworld/schema/iasworld.dedit.yml +++ b/dbt/models/iasworld/schema/iasworld.dedit.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: dedit diff --git a/dbt/models/iasworld/schema/iasworld.dweldat.yml b/dbt/models/iasworld/schema/iasworld.dweldat.yml index db79c6c40..f672469df 100644 --- 
a/dbt/models/iasworld/schema/iasworld.dweldat.yml +++ b/dbt/models/iasworld/schema/iasworld.dweldat.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld - type_res tables: @@ -60,7 +60,7 @@ sources: # online PDF of all class definitions config: &unique-conditions where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class NOT IN ('201', '213', '218', '219', '220', '221', '224', '225', '236', '240', '241', '290', '294', '297') @@ -124,7 +124,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class NOT IN ('201', '213', '218', '219', '220', '221', '224', '225', '236', '240', '241', '290', '294', '297') @@ -176,7 +176,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND class NOT IN ('EX', 'RR') AND cur = 'Y' AND deactivat IS NULL @@ -206,7 +206,7 @@ sources: additional_pardat_filter: AND class = 'EX' config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class NOT LIKE 'OA%' @@ -251,7 +251,7 @@ sources: additional_select_columns: *select-columns config: 
where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} meta: description: cur should be 'Y' or 'D' - name: deactivat @@ -326,7 +326,7 @@ sources: - mktadj config: where: - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND mktrsn IN ('5', '5B') @@ -616,7 +616,7 @@ sources: - external_occpct config: where: - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND mktrsn IN ('5', '5B') @@ -644,7 +644,7 @@ sources: - mktadj config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND (external_occpct IS NOT NULL OR mktadj IS NOT NULL) @@ -887,7 +887,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND class NOT IN ('211', '212') AND cur = 'Y' AND deactivat IS NULL @@ -900,7 +900,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND class IN ('211', 
'212') AND cur = 'Y' AND deactivat IS NULL @@ -945,7 +945,7 @@ sources: - wen config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '202' @@ -960,7 +960,7 @@ sources: additional_select_columns: *select-columns-with-class config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '203' @@ -975,7 +975,7 @@ sources: additional_select_columns: *select-columns-with-class config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '204' @@ -990,7 +990,7 @@ sources: additional_select_columns: *select-columns-with-class config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '205' @@ -1005,7 +1005,7 @@ sources: additional_select_columns: *select-columns-with-class config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '206' @@ -1020,7 +1020,7 @@ sources: additional_select_columns: *select-columns-with-class config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ 
var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '207' @@ -1035,7 +1035,7 @@ sources: additional_select_columns: *select-columns-with-class config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '208' @@ -1050,7 +1050,7 @@ sources: additional_select_columns: *select-columns-with-class config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '209' @@ -1065,7 +1065,7 @@ sources: additional_select_columns: *select-columns-with-class config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '210' @@ -1080,7 +1080,7 @@ sources: additional_select_columns: *select-columns-with-class config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '211' @@ -1095,7 +1095,7 @@ sources: additional_select_columns: *select-columns-with-class config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' 
AND deactivat IS NULL AND class = '212' @@ -1110,7 +1110,7 @@ sources: additional_select_columns: *select-columns-with-class config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '234' @@ -1125,7 +1125,7 @@ sources: additional_select_columns: *select-columns-with-class config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '278' @@ -1140,7 +1140,7 @@ sources: additional_select_columns: *select-columns-with-class config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '295' @@ -1272,7 +1272,7 @@ sources: - attic config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class NOT IN ('201', '213', '218', '219', '220', '221', '224', '225', '236', '240', '241', '290', '294', '297') @@ -1317,7 +1317,7 @@ sources: - bsmt config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class NOT IN ('201', '213', '218', '219', '220', '221', '224', '225', '236', '240', '241', '290', '294', '297') @@ -1358,7 
+1358,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND class IN ('211', '212') AND cur = 'Y' AND deactivat IS NULL @@ -1372,7 +1372,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND class IN ('211', '212') AND cur = 'Y' AND deactivat IS NULL @@ -1406,7 +1406,7 @@ sources: additional_select_columns: *select-columns-with-class config: &unique-conditions-class-212 where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '212' @@ -1430,7 +1430,7 @@ sources: config: # Exclude non-regression classes, exempt/RR parcels, and 212s where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class NOT IN ('201', '213', '218', '219', '220', '221', '224', '225', '236', '240', '241', '290', '294', '297') @@ -1475,7 +1475,7 @@ sources: - user33 config: &unique-conditions-with-garage-size where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class NOT IN ('201', '213', '218', '219', '220', '221', 
'224', '225', '236', '240', '241', '290', '294', '297') @@ -1542,7 +1542,7 @@ sources: additional_select_columns: *select-columns-with-garage-size config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class NOT IN ('201', '213', '218', '219', '220', '221', '224', '225', '236', '240', '241', '290', '294', '297') @@ -1563,7 +1563,7 @@ sources: an integer between 1 and 4 tags: # Currently too many failures for this signal to be useful - - test_qc_exclude_from_workbook + - data_test_iasworld_exclude_from_workbook - name: useramt description: RCN value of additions - name: userfact @@ -1631,7 +1631,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '205' @@ -1643,7 +1643,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '206' @@ -1655,7 +1655,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '207' @@ -1667,7 +1667,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ 
var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '208' @@ -1679,7 +1679,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '210' @@ -1691,7 +1691,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '278' @@ -1703,7 +1703,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '295' @@ -1757,7 +1757,7 @@ sources: - user12 config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND class = '234' diff --git a/dbt/models/iasworld/schema/iasworld.enter.yml b/dbt/models/iasworld/schema/iasworld.enter.yml index 7be3b9d42..3e59c5aa5 100644 --- a/dbt/models/iasworld/schema/iasworld.enter.yml +++ b/dbt/models/iasworld/schema/iasworld.enter.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld 
+ - data_test_iasworld tables: - name: enter diff --git a/dbt/models/iasworld/schema/iasworld.exadmn.yml b/dbt/models/iasworld/schema/iasworld.exadmn.yml index 7afd310d9..a6f6b1b2f 100644 --- a/dbt/models/iasworld/schema/iasworld.exadmn.yml +++ b/dbt/models/iasworld/schema/iasworld.exadmn.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: exadmn diff --git a/dbt/models/iasworld/schema/iasworld.exapp.yml b/dbt/models/iasworld/schema/iasworld.exapp.yml index d7cb15c0b..03fc53e9f 100644 --- a/dbt/models/iasworld/schema/iasworld.exapp.yml +++ b/dbt/models/iasworld/schema/iasworld.exapp.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: exapp diff --git a/dbt/models/iasworld/schema/iasworld.excode.yml b/dbt/models/iasworld/schema/iasworld.excode.yml index 618b0c4ca..d4b74dc90 100644 --- a/dbt/models/iasworld/schema/iasworld.excode.yml +++ b/dbt/models/iasworld/schema/iasworld.excode.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: excode diff --git a/dbt/models/iasworld/schema/iasworld.exdet.yml b/dbt/models/iasworld/schema/iasworld.exdet.yml index ac8485777..84430fb38 100644 --- a/dbt/models/iasworld/schema/iasworld.exdet.yml +++ b/dbt/models/iasworld/schema/iasworld.exdet.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: exdet diff --git a/dbt/models/iasworld/schema/iasworld.htagnt.yml b/dbt/models/iasworld/schema/iasworld.htagnt.yml index db86f8cfb..014b057a5 100644 --- a/dbt/models/iasworld/schema/iasworld.htagnt.yml +++ b/dbt/models/iasworld/schema/iasworld.htagnt.yml @@ -3,7 +3,7 @@ sources: 
loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: htagnt diff --git a/dbt/models/iasworld/schema/iasworld.htdates.yml b/dbt/models/iasworld/schema/iasworld.htdates.yml index c2a36a424..50db5bd64 100644 --- a/dbt/models/iasworld/schema/iasworld.htdates.yml +++ b/dbt/models/iasworld/schema/iasworld.htdates.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: htdates diff --git a/dbt/models/iasworld/schema/iasworld.htpar.yml b/dbt/models/iasworld/schema/iasworld.htpar.yml index 48b92fc79..a037eb4d6 100644 --- a/dbt/models/iasworld/schema/iasworld.htpar.yml +++ b/dbt/models/iasworld/schema/iasworld.htpar.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: htpar @@ -77,7 +77,7 @@ sources: - wen config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} meta: description: cur should be 'Y' or 'D' - name: curbldg @@ -117,7 +117,7 @@ sources: additional_select_columns: *select-columns config: &unique-conditions where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL meta: @@ -363,7 +363,7 @@ sources: Address columns should have no leading or trailing whitespace tags: # Currently too many failures for this signal to be useful - - test_qc_exclude_from_workbook + - data_test_iasworld_exclude_from_workbook - expression_is_true: name: iasworld_htpar_cpaddr3_length_lte_5 expression: | diff --git 
a/dbt/models/iasworld/schema/iasworld.land.yml b/dbt/models/iasworld/schema/iasworld.land.yml index c3d3856cf..b698a03ee 100644 --- a/dbt/models/iasworld/schema/iasworld.land.yml +++ b/dbt/models/iasworld/schema/iasworld.land.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld - type_land tables: @@ -30,7 +30,7 @@ sources: - sf config: &unique-conditions where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL meta: @@ -83,7 +83,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND class NOT IN ('EX', 'RR', '999') AND NOT REGEXP_LIKE(class, '[0-9]{3}[A|B]') AND cur = 'Y' @@ -107,7 +107,7 @@ sources: agg_func: max config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND class NOT IN ('EX', 'RR') AND cur = 'Y' AND deactivat IS NULL @@ -139,7 +139,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} meta: description: cur should be 'Y' or 'D' - name: deactivat @@ -282,7 +282,7 @@ sources: additional_select_columns: *select-columns config: where: - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} 
+ CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND class NOT IN ('EX', 'RR', '999') AND cur = 'Y' AND deactivat IS NULL diff --git a/dbt/models/iasworld/schema/iasworld.legdat.yml b/dbt/models/iasworld/schema/iasworld.legdat.yml index f207b5871..fb9356847 100644 --- a/dbt/models/iasworld/schema/iasworld.legdat.yml +++ b/dbt/models/iasworld/schema/iasworld.legdat.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: legdat @@ -35,7 +35,7 @@ sources: - wen config: &unique-conditions where: - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL meta: @@ -99,7 +99,7 @@ sources: additional_select_columns: *select-columns config: where: - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} meta: description: cur should be 'Y' or 'D' - name: deactivat @@ -296,7 +296,7 @@ sources: description: zip1 should not be 00000 # Currently too many failures for this signal to be useful tags: - - test_qc_exclude_from_workbook + - data_test_iasworld_exclude_from_workbook - name: zip2 description: '{{ doc("shared_column_prop_address_zipcode_2") }}' data_tests: @@ -309,7 +309,7 @@ sources: description: zip2 should not be 0000 # Currently too many failures for this signal to be useful tags: - - test_qc_exclude_from_workbook + - data_test_iasworld_exclude_from_workbook data_tests: - unique_combination_of_columns: @@ -378,4 +378,4 @@ sources: unitno, cityname, statecode, zip1, zip2 tags: # Currently too many failures for this signal to be useful - - 
test_qc_exclude_from_workbook + - data_test_iasworld_exclude_from_workbook diff --git a/dbt/models/iasworld/schema/iasworld.lpmod.yml b/dbt/models/iasworld/schema/iasworld.lpmod.yml index 611359e60..fb125d717 100644 --- a/dbt/models/iasworld/schema/iasworld.lpmod.yml +++ b/dbt/models/iasworld/schema/iasworld.lpmod.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: lpmod diff --git a/dbt/models/iasworld/schema/iasworld.lpnbhd.yml b/dbt/models/iasworld/schema/iasworld.lpnbhd.yml index 1aee1852a..308148b94 100644 --- a/dbt/models/iasworld/schema/iasworld.lpnbhd.yml +++ b/dbt/models/iasworld/schema/iasworld.lpnbhd.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: lpnbhd diff --git a/dbt/models/iasworld/schema/iasworld.oby.yml b/dbt/models/iasworld/schema/iasworld.oby.yml index 79506a807..f22ce991e 100644 --- a/dbt/models/iasworld/schema/iasworld.oby.yml +++ b/dbt/models/iasworld/schema/iasworld.oby.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld - type_condo tables: @@ -37,7 +37,7 @@ sources: - wen config: &unique-conditions where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL meta: @@ -57,7 +57,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND calc_meth IS 
NOT NULL @@ -95,7 +95,7 @@ sources: - mktadj config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND (external_occpct IS NOT NULL OR mktadj IS NOT NULL) @@ -172,7 +172,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} meta: description: cur should be 'Y' or 'D' - name: curmult @@ -233,7 +233,7 @@ sources: - mktadj config: where: - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND chgrsn IN ('5', '5B') @@ -333,7 +333,7 @@ sources: - external_occpct config: where: - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL AND chgrsn IN ('5', '5B') diff --git a/dbt/models/iasworld/schema/iasworld.owndat.yml b/dbt/models/iasworld/schema/iasworld.owndat.yml index 47cf8cbe3..6afe39058 100644 --- a/dbt/models/iasworld/schema/iasworld.owndat.yml +++ b/dbt/models/iasworld/schema/iasworld.owndat.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: owndat @@ -75,7 +75,7 @@ sources: - wen config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ 
var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} meta: description: cur should be 'Y' or 'D' - name: deactivat @@ -167,7 +167,7 @@ sources: - wen config: &unique-conditions where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL meta: @@ -293,4 +293,4 @@ sources: statecode, zip1, zip2, user27 tags: # Currently too many failures for this signal to be useful - - test_qc_exclude_from_workbook + - data_test_iasworld_exclude_from_workbook diff --git a/dbt/models/iasworld/schema/iasworld.pardat.yml b/dbt/models/iasworld/schema/iasworld.pardat.yml index 06c796fd1..33b506ee7 100644 --- a/dbt/models/iasworld/schema/iasworld.pardat.yml +++ b/dbt/models/iasworld/schema/iasworld.pardat.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: pardat @@ -86,7 +86,7 @@ sources: - wen config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND class NOT IN ('EX', 'RR') AND NOT REGEXP_LIKE(class, '[0-9]{3}[A|B]') AND cur = 'Y' @@ -105,7 +105,7 @@ sources: - luc config: &unique-conditions where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} AND cur = 'Y' AND deactivat IS NULL meta: @@ -120,7 +120,7 @@ sources: additional_select_columns: *select-columns config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ 
var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} meta: description: cur should be 'Y' or 'D' - name: deactivat @@ -293,7 +293,7 @@ sources: tags: # Valuations can't act on these failures since hyphens are # not distinguishable in the iasWorld UI - - test_qc_exclude_from_workbook + - data_test_iasworld_exclude_from_workbook - name: tiebkcd description: Tieback code - name: tiebldgpct diff --git a/dbt/models/iasworld/schema/iasworld.permit.yml b/dbt/models/iasworld/schema/iasworld.permit.yml index 6663f38dd..e086e47ea 100644 --- a/dbt/models/iasworld/schema/iasworld.permit.yml +++ b/dbt/models/iasworld/schema/iasworld.permit.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: permit diff --git a/dbt/models/iasworld/schema/iasworld.rcoby.yml b/dbt/models/iasworld/schema/iasworld.rcoby.yml index 347c49021..79eb3d0a7 100644 --- a/dbt/models/iasworld/schema/iasworld.rcoby.yml +++ b/dbt/models/iasworld/schema/iasworld.rcoby.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: rcoby diff --git a/dbt/models/iasworld/schema/iasworld.sales.yml b/dbt/models/iasworld/schema/iasworld.sales.yml index 31e047584..71f7291aa 100644 --- a/dbt/models/iasworld/schema/iasworld.sales.yml +++ b/dbt/models/iasworld/schema/iasworld.sales.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: sales diff --git a/dbt/models/iasworld/schema/iasworld.splcom.yml b/dbt/models/iasworld/schema/iasworld.splcom.yml index 8273ac69f..6721e21ea 100644 --- a/dbt/models/iasworld/schema/iasworld.splcom.yml +++ b/dbt/models/iasworld/schema/iasworld.splcom.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, 
'%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: splcom diff --git a/dbt/models/iasworld/schema/iasworld.valclass.yml b/dbt/models/iasworld/schema/iasworld.valclass.yml index 3fb97ed14..bf8318244 100644 --- a/dbt/models/iasworld/schema/iasworld.valclass.yml +++ b/dbt/models/iasworld/schema/iasworld.valclass.yml @@ -3,7 +3,7 @@ sources: loaded_at_field: date_parse(loaded_at, '%Y-%m-%d %H:%i:%S.%f') tags: - load_auto - - test_qc_iasworld + - data_test_iasworld tables: - name: valclass diff --git a/dbt/models/qc/qc.vw_nonlivable_condos_with_chars.sql b/dbt/models/qc/qc.vw_nonlivable_condos_with_chars.sql index 585e08cae..ad6a8ff23 100644 --- a/dbt/models/qc/qc.vw_nonlivable_condos_with_chars.sql +++ b/dbt/models/qc/qc.vw_nonlivable_condos_with_chars.sql @@ -20,6 +20,3 @@ LEFT JOIN {{ source('spatial', 'township') }} AS towns WHERE (condos.is_common_area OR condos.is_parking_space) AND (condos.char_bedrooms IS NOT NULL OR condos.char_full_baths IS NOT NULL) - AND CAST( - condos.year AS INT - ) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} diff --git a/dbt/models/qc/schema.yml b/dbt/models/qc/schema.yml index 4c3786d0f..d498853b4 100644 --- a/dbt/models/qc/schema.yml +++ b/dbt/models/qc/schema.yml @@ -6,8 +6,8 @@ models: description: '{{ doc("view_vw_change_in_high_low_value_sales") }}' config: tags: - - test_qc_sales - - test_qc_exclude_from_workbook + - data_test_iasworld + - data_test_iasworld_exclude_from_workbook data_tests: - expression_is_true: name: qc_vw_change_in_high_low_value_sales_ratio_high @@ -42,7 +42,7 @@ models: description: '{{ doc("view_vw_iasworld_asmt_all_joined_to_legdat") }}' config: tags: - - test_qc_iasworld + - data_test_iasworld data_tests: - unique_combination_of_columns: name: iasworld_asmt_all_unique_by_parid_procname_and_taxyr @@ -52,7 +52,7 @@ models: - taxyr config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ 
var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} -- Filter out known dupes in township 74 from one particular day -- that were introduced by a bulk data update error AND township_code != '74' @@ -67,8 +67,8 @@ models: description: '{{ doc("view_vw_iasworld_sales_day_of_month") }}' config: tags: - - test_qc_sales - - test_qc_exclude_from_workbook + - data_test_iasworld + - data_test_iasworld_exclude_from_workbook data_tests: - expression_is_true: name: qc_vw_iasworld_sales_day_of_month_lte_half_observations @@ -88,7 +88,7 @@ models: description: '{{ doc("view_vw_iasworld_sales_high_value_by_class") }}' config: tags: - - test_qc_sales + - data_test_iasworld columns: - name: price data_tests: @@ -108,7 +108,7 @@ models: description: '{{ doc("view_vw_iasworld_sales_null_values") }}' config: tags: - - test_qc_sales + - data_test_iasworld columns: - name: buyer data_tests: @@ -156,7 +156,7 @@ models: description: '{{ doc("view_vw_iasworld_sales_price_diff_sale_mydec") }}' config: tags: - - test_qc_sales + - data_test_iasworld data_tests: # Values of $1000 and below seem to get recoded to $1 in iasWorld - expression_is_true: @@ -178,8 +178,8 @@ models: description: '{{ doc("view_vw_iasworld_sales_rowcount_matches_sale_mydec") }}' config: tags: - - test_qc_sales - - test_qc_exclude_from_workbook + - data_test_iasworld + - data_test_iasworld_exclude_from_workbook columns: - name: comparison data_tests: @@ -197,8 +197,8 @@ models: description: '{{ doc("view_vw_iasworld_sales_unmatched_joins_sale_mydec") }}' config: tags: - - test_qc_sales - - test_qc_exclude_from_workbook + - data_test_iasworld + - data_test_iasworld_exclude_from_workbook columns: - name: mydec_unmatched data_tests: @@ -225,7 +225,7 @@ models: description: '{{ doc("view_vw_neg_asmt_value") }}' config: tags: - - test_qc_inquire + - data_test_iasworld columns: - name: val01 data_tests: @@ -241,7 +241,7 @@ models: - 
wen config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} meta: table_name: asmt description: val01 (FARM LAND) should not be negative @@ -948,7 +948,7 @@ models: config: tags: - qc_report_town_close - - test_qc_inquire + - data_test_iasworld meta: export_name: Res parcels not set to cost approach columns: @@ -970,7 +970,7 @@ models: - wen config: where: | - CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(taxyr AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} meta: table_name: aprval description: revcode should be '1' for 200-class parcels @@ -989,8 +989,8 @@ models: description: '{{ doc("view_vw_sale_mydec_null_values") }}' config: tags: - - test_qc_sales - - test_qc_exclude_from_workbook + - data_test_iasworld + - data_test_iasworld_exclude_from_workbook columns: - name: address data_tests: diff --git a/dbt/models/reporting/schema.yml b/dbt/models/reporting/schema.yml index cd016dd36..fcedfde77 100644 --- a/dbt/models/reporting/schema.yml +++ b/dbt/models/reporting/schema.yml @@ -219,19 +219,19 @@ models: name: reporting_vw_pin_value_long_class_not_null column_name: class config: - where: CAST(year AS int) < {{ var('test_qc_year_start') }} + where: CAST(year AS int) < {{ var('data_test_iasworld_year_start') }} - not_null: name: reporting_vw_pin_value_long_tot_not_null column_name: tot config: - where: CAST(year AS int) < {{ var('test_qc_year_start') }} + where: CAST(year AS int) < {{ var('data_test_iasworld_year_start') }} error_if: ">21" # as of 2024-04-01 - not_null: name: reporting_vw_pin_value_long_tot_mv_not_null column_name: tot_mv config: where: | - CAST(year AS int) < {{ var('test_qc_year_start') }} AND + CAST(year AS int) < {{ var('data_test_iasworld_year_start') }} AND (year 
>= '2021' OR ( year = '2020' AND stage_name = 'BOR CERTIFIED') ) @@ -240,10 +240,10 @@ models: column_name: tot config: tags: - - test_qc_iasworld + - data_test_iasworld max_value: 1 where: | - class = 'EX' AND CAST(year AS int) = {{ var('test_qc_year_start') }} + class = 'EX' AND CAST(year AS int) = {{ var('data_test_iasworld_year_start') }} additional_select_columns: - {column: pin, alias: parid} - class diff --git a/dbt/models/tax/schema.yml b/dbt/models/tax/schema.yml index 22b42da59..5d36cc79c 100644 --- a/dbt/models/tax/schema.yml +++ b/dbt/models/tax/schema.yml @@ -43,7 +43,7 @@ sources: AND external_model.deactivat IS NULL config: where: - CAST(year AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(year AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} severity: warn meta: description: tax_code_num should match legdat.taxdist @@ -59,7 +59,7 @@ sources: column: year config: where: - CAST(year AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(year AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} meta: description: at least one class should match pardat class - relationships: @@ -70,7 +70,7 @@ sources: - year config: where: - CAST(year AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }} + CAST(year AS int) BETWEEN {{ var('data_test_iasworld_year_start') }} AND {{ var('data_test_iasworld_year_end') }} meta: category: class_mismatch_or_issue description: class code must be valid diff --git a/dbt/scripts/requirements.run_iasworld_data_tests.txt b/dbt/scripts/requirements.run_iasworld_data_tests.txt new file mode 100644 index 000000000..980523fb5 --- /dev/null +++ b/dbt/scripts/requirements.run_iasworld_data_tests.txt @@ -0,0 +1,6 @@ +openpyxl~=3.1.2 +PyAthena~=3.0.8 +simplejson~=3.19.2 +lxml~=5.1.0 +numpy==1.26.4 +pyarrow==15.0.2 diff --git 
a/dbt/scripts/run_iasworld_data_tests.py b/dbt/scripts/run_iasworld_data_tests.py new file mode 100644 index 000000000..4d6a11a78 --- /dev/null +++ b/dbt/scripts/run_iasworld_data_tests.py @@ -0,0 +1,1502 @@ +#!/usr/bin/env python3 +# +# Generates an Excel workbook of dbt test failures that can be shared with +# other teams for review and correction, along with metadata parquet files +# that can be uploaded to S3 for long-term result tracking. +# +# Run `python3 run_iasworld_data_tests.py --help` for detailed +# documentation. + +import argparse +import dataclasses +import datetime +import decimal +import enum +import hashlib +import os +import pathlib +import re +import subprocess +import typing + +import openpyxl +import openpyxl.cell +import openpyxl.styles +import openpyxl.styles.colors +import openpyxl.utils +import pyarrow as pa +import pyarrow.parquet +import pyathena +import pyathena.arrow.cursor +import pyathena.cursor +import simplejson as json +import yaml +from dbt.artifacts.schemas.results import TestStatus +from dbt.cli.main import dbtRunner + +DBT = dbtRunner() + +# Prefix for the URL location of a test in the dbt docs +DOCS_URL_PREFIX = "https://ccao-data.github.io/data-architecture/#!/test" +# The S3 bucket where Athena query results are stored +AWS_ATHENA_S3_STAGING_DIR = os.getenv( + "AWS_ATHENA_S3_STAGING_DIR", "s3://ccao-athena-results-us-east-1/" +) +# Field names that are used in the output workbook +SOURCE_TABLE_FIELD = "source_table" +DESCRIPTION_FIELD = "description" +TEST_NAME_FIELD = "test_name" +DOCS_URL_FIELD = "docs_url" +TAXYR_FIELD = "taxyr" +PARID_FIELD = "parid" +CARD_FIELD = "card" +LAND_LINE_FIELD = "lline" +TOWNSHIP_FIELD = "township_code" +CLASS_FIELD = "class" +WHO_FIELD = "who" +WEN_FIELD = "wen" +# Mapping that defines category names that should be reported for tests +# based on their generics +TEST_CATEGORIES = { + "test_accepted_range": "incorrect_values", + "test_accepted_values": "incorrect_values", + 
"test_not_accepted_values": "incorrect_values", + "test_unique_combination_of_columns": "duplicate_records", + "test_not_null": "missing_values", + "test_is_null": "missing_values", + "test_res_class_matches_pardat": "class_mismatch_or_issue", +} +# Fallback for tests whose category we can't determine from either the +# test name, the `meta.category` attribute, or the TEST_CATEGORIES mapping +DEFAULT_TEST_CATEGORY = "miscellaneous" +# Directory to store failed test caches +TEST_CACHE_DIR = "test_cache" + + +class Status(enum.Enum): + """Status of an individual dbt test result.""" + + PASS = "pass" + FAIL = "fail" + WARN = "warn" + + def __repr__(self) -> str: + return self.value.upper() + + +class TestResult: + """Class to store results for an individual test.""" + + def __init__( + self, + name: str, + table_name: str, + column_name: typing.Optional[str], + status: Status, + description: str, + elapsed_time: decimal.Decimal, + failing_rows: typing.Optional[typing.List[typing.Dict]] = None, + ) -> None: + """ + The failing_rows list should be formatted like the rows + returned by a csv.DictReader or a DictCursor, i.e. a list of + dicts mapping `{field_name: row_value}`. + """ + self.name = name + self.table_name = table_name + self.column_name = column_name + self.status = status + self.description = description + self.elapsed_time = elapsed_time + self.failing_rows: typing.List[typing.Dict] = failing_rows or [] + + @property + def fieldnames(self) -> typing.List[str]: + """Return a list of strings representing the fieldnames for any + failing_rows of this test. 
Returns an empty list if the test + passed.""" + fieldnames = [] + for row in self.failing_rows: + for fieldname in row.keys(): + if fieldname not in fieldnames: + fieldnames.append(fieldname) + return fieldnames + + def to_dict(self) -> typing.Dict: + """Serialize the TestResult object as a dictionary.""" + return { + "name": self.name, + "table_name": self.table_name, + "column_name": self.column_name, + "status": self.status.value, + "description": self.description, + "elapsed_time": self.elapsed_time, + "failing_rows": self.failing_rows, + } + + @classmethod + def from_dict(cls, result_dict: typing.Dict) -> "TestResult": + """Deserialize a TestResult object from a dictionary.""" + return TestResult( + name=result_dict["name"], + table_name=result_dict["table_name"], + column_name=result_dict["column_name"], + status=Status(result_dict["status"]), + description=result_dict["description"], + elapsed_time=result_dict["elapsed_time"], + failing_rows=result_dict["failing_rows"], + ) + + def __repr__(self) -> str: + return f"TestResult({self.to_dict()!r})" + + def split_by_township(self) -> typing.List["TownshipTestResult"]: + """Split out this TestResult object into one or more + TownshipTestResults based on the township code of each failing row. 
If + there are no failing rows, or if all the failing rows have a null + township, the return value will be a list of one TownshipTestResult + whose township_code is None.""" + # Split out the failing rows by township so that we know which + # townships are represented in this test's failures + failing_rows_by_township: typing.Dict[ + typing.Optional[str], typing.List[typing.Dict] + ] = {} + for row in self.failing_rows: + township_code = row.get(TOWNSHIP_FIELD) + if not failing_rows_by_township.get(township_code): + failing_rows_by_township[township_code] = [] + failing_rows_by_township[township_code].append(row) + + # These kwargs are shared by all TownshipTestResults, regardless of + # township code or failure status + base_kwargs = { + "name": self.name, + "table_name": self.table_name, + "column_name": self.column_name, + "status": self.status, + "description": self.description, + "elapsed_time": self.elapsed_time, + } + + # If we have any failing rows, split out separate TownshipTestResult + # objects for each township/row group; otherwise, just create one + # object with a null township mapped to the passing test + if failing_rows_by_township: + return [ + TownshipTestResult( + township_code=township_code, + failing_rows=rows, + **base_kwargs, + ) + for township_code, rows in failing_rows_by_township.items() + ] + return [ + TownshipTestResult( + township_code=None, failing_rows=[], **base_kwargs + ) + ] + + +class TownshipTestResult(TestResult): + """A variant of TestResult for a test whose results all share the same + township. 
Note that township_code is only present in the case of failing + tests; the township_code will always be None in the case of a passing test, + since passing tests have no township (or, thinking of it differently, + passing tests encompass all of the townships).""" + + def __init__( + self, township_code: typing.Optional[str], *args, **kwargs + ) -> None: + self.township_code = township_code + super().__init__(*args, **kwargs) + + +class TestCategory: + """Class to store TestResult objects for a group of dbt tests that share + the same category. Provides convenience methods for formatting those + results for output to a workbook and saving them to a cache.""" + + # Names of fields that are used for debugging + possible_debugging_fieldnames = [TEST_NAME_FIELD, DOCS_URL_FIELD] + # Names of fields that identify the test + possible_test_metadata_fieldnames = [ + *[SOURCE_TABLE_FIELD, DESCRIPTION_FIELD], + *possible_debugging_fieldnames, + ] + # Names of fields that are used for diagnostics + possible_diagnostic_fieldnames = [ + TAXYR_FIELD, + PARID_FIELD, + CARD_FIELD, + LAND_LINE_FIELD, + CLASS_FIELD, + TOWNSHIP_FIELD, + WHO_FIELD, + WEN_FIELD, + ] + # The complete set of fixed fields + possible_fixed_fieldnames = [ + *possible_test_metadata_fieldnames, + *possible_diagnostic_fieldnames, + ] + + def __init__( + self, + category: str, + results: typing.Optional[typing.List[TestResult]] = None, + ) -> None: + self.category = category + self.test_results: typing.List[TestResult] = results or [] + + def to_dict(self) -> typing.Dict: + """Serialize the TestCategory object as a dictionary.""" + return { + "category": self.category, + "test_results": [result.to_dict() for result in self.test_results], + } + + @classmethod + def from_dict(cls, category_dict: typing.Dict) -> "TestCategory": + """Deserialize a TestCategory object from a dictionary.""" + return TestCategory( + category=category_dict["category"], + results=[ + TestResult.from_dict(result_dict) + for result_dict in 
category_dict["test_results"] + ], + ) + + def __repr__(self) -> str: + num_failing_rows = sum( + len(result.failing_rows) for result in self.test_results + ) + return ( + f"TestCategory(category={self.category!r}, " + f"status={self.status!r}, " + f"num_tests={len(self.test_results)}, " + f"num_failing_rows={num_failing_rows})" + ) + + @property + def fieldnames(self) -> typing.List[str]: + """Get a list of fieldnames that encapsulates all of the fieldnames + for all of the rows of tests tracked by this group.""" + fieldnames = [] + for result in self.test_results: + for fieldname in result.fieldnames: + if fieldname not in fieldnames: + fieldnames.append(fieldname) + + # Remove any fixed fieldnames from the ordered list that are not + # present in this group + fixed_field_order = [ + field + for field in self.possible_fixed_fieldnames + if field in fieldnames + ] + + # Reorder the fieldnames so that diagnostic fields are presented in the + # correct order + for field in reversed(fixed_field_order): + fieldnames.insert(0, fieldnames.pop(fieldnames.index(field))) + + return fieldnames + + @property + def rows(self) -> typing.List[typing.List]: + """Format the rows of tests tracked by this group, with + fieldname data excluded. 
The combination of this property and the + `fieldnames` property can be used to write to a csv.Writer or + to an openpyxl.Workbook sheet for the tests tracked by this group.""" + fieldnames = self.fieldnames + return [ + [row.get(fieldname) for fieldname in fieldnames] + for result in self.test_results + for row in result.failing_rows + ] + + @property + def status(self) -> Status: + """Return an aggregate status for this category based on the statuses + of its TestResult objects.""" + statuses = set(result.status for result in self.test_results) + # case/match syntax doesn't work with sets, unfortunately + if statuses == {Status.PASS}: + return Status.PASS + if statuses == {Status.WARN}: + return Status.WARN + if statuses == {Status.FAIL}: + return Status.FAIL + if statuses == {Status.PASS, Status.WARN}: + return Status.WARN + if statuses == {Status.PASS, Status.FAIL}: + return Status.FAIL + if statuses == {Status.WARN, Status.FAIL}: + return Status.FAIL + raise ValueError(f"Unexpected combination of statuses: {statuses}") + + def add_to_workbook(self, workbook: openpyxl.Workbook) -> None: + """Add a sheet of failed dbt tests to an openpyxl Workbook using data + from the TestCategory object. Note that we expect the workbook to be + initialized with write_only=True.""" + # Only add this category to the workbook if it has any failing tests + if self.status in (Status.PASS, Status.WARN): + print( + f"Skipping add_to_workbook for category {self.category} since " + f"its status is '{self.status!r}'" + ) + return + + # openpyxl Workbooks are typically created with one untitled active + # sheet by default, but write-only sheets are an exception to this + # rule, so we always have to create a new sheet + sheet = workbook.create_sheet() + sheet.title = self.category + + # Freeze the header row. The syntax for the freeze_panes attribute is + # undocumented, but it freezes all rows above and all columns to the + # left of the given cell identifier. 
Note that freeze operations must + # be performed before any data is added to a sheet in a write-only + # workbook + data_header_idx = 3 # We have 3 headers; 2 for grouping and 1 for data + freeze_pane_letter = openpyxl.utils.get_column_letter( + len(self.test_metadata_fieldnames) + 1 + ) + freeze_pane_number = data_header_idx + 1 + sheet.freeze_panes = f"{freeze_pane_letter}{freeze_pane_number}" + + # Hide columns that are intended for debugging only, so that they don't + # get in the way of non-technical workbook consumers + for col_idx in self.debugging_field_indexes: + sheet.column_dimensions[col_idx].hidden = True + + # Create groupings for columns with a special group header + bold_font = openpyxl.styles.Font(bold=True) + italic_font = openpyxl.styles.Font(italic=True) + title_row, subtitle_row, header_row, merged_cell_range = [], [], [], [] + column_groups = { + self.test_metadata_field_indexes: { + "title": "Test description fields", + "subtitle": "These fields identify a failing test.", + "fieldnames": self.test_metadata_fieldnames, + "style": "20 % - Accent4", + "header_style": "Accent4", + }, + self.diagnostic_field_indexes: { + "title": "Unique identifier fields", + "subtitle": ( + "These fields identify the row that is failing a test." + ), + "fieldnames": self.diagnostic_fieldnames, + "style": "20 % - Accent1", + "header_style": "Accent1", + }, + self.nonfixed_field_indexes: { + "title": "Problematic fields", + "subtitle": ( + "These fields contain values that are causing the test " + "to fail." 
+ ), + "fieldnames": self.nonfixed_fieldnames, + "style": "20 % - Accent2", + "header_style": "Accent2", + }, + } + for col_group_indexes, col_metadata in column_groups.items(): + # Sometimes there are no problematic fields for a given test; + # if this is the case, skip it + if not col_group_indexes: + continue + + # Save merged cell info + for cell_range in [ + f"{col_group_indexes[0]}1:{col_group_indexes[-1]}1", + f"{col_group_indexes[0]}2:{col_group_indexes[-1]}2", + ]: + merged_cell_range.append(cell_range) + + # Fill out and format grouping header + title_cell = openpyxl.cell.WriteOnlyCell( + sheet, value=col_metadata["title"] + ) + title_cell.style = "Note" + title_cell.font = bold_font + title_row.append(title_cell) + # Flesh out the empty title row cells that will be merged later on + for _ in range(len(col_group_indexes) - 1): + title_row.append("") + + subtitle_cell = openpyxl.cell.WriteOnlyCell( + sheet, value=col_metadata["subtitle"] + ) + subtitle_cell.style = "Note" + subtitle_cell.font = italic_font + subtitle_row.append(subtitle_cell) + for _ in range(len(col_group_indexes) - 1): + subtitle_row.append("") + + # Fill out and format the data header + for fieldname in col_metadata["fieldnames"]: + header_cell = openpyxl.cell.WriteOnlyCell( + sheet, value=fieldname + ) + header_cell.style = col_metadata["header_style"] + header_cell.font = openpyxl.styles.Font( + bold=True, color=openpyxl.styles.colors.WHITE + ) + header_row.append(header_cell) + + # Initialize the column widths based on the length of values in + # the header row + column_widths = { + openpyxl.utils.get_column_letter(idx + 1): len(fieldname) + 2 + for idx, fieldname in enumerate(self.fieldnames) + } + # Iterate the rows to extract data and optionally update the column + # widths if the length of the cell value exceeds the length of the + # header value + data_rows = [] + for row in self.rows: + data_row = [] + # Start enumeration at 1 since openpyxl columns are 1-indexed + for col_idx, 
cell in enumerate(row, 1): + # Convert row values to string so that Excel doesn't apply + # autoformatting + cell_str = str(cell) if cell is not None else "" + cell = openpyxl.cell.WriteOnlyCell(sheet, value=cell_str) + + # Retrieve the cell style from the column groupings if one + # exists + cell_style = None + column_letter = openpyxl.utils.get_column_letter(col_idx) + for col_group_indexes, col_metadata in column_groups.items(): + if column_letter in col_group_indexes: + cell_style = col_metadata["style"] + if cell_style: + cell.style = cell_style + data_row.append(cell) + + # Check if this cell is longer than the longest cell we've seen + # so far, and adjust the column dimensions accordingly + column_letter = openpyxl.utils.get_column_letter(col_idx) + column_widths[column_letter] = max( + column_widths.get(column_letter, 0), len(cell_str) + ) + + data_rows.append(data_row) + + # Update column widths so that they fit the longest column + for ( + column_letter, + column_width, + ) in column_widths.items(): + # Pad with an extra two characters to account for the fact that + # non-monospace fonts do not have consistent character widths, + # and set a hard limit of 75 characters so no one field takes over + # the viewport of the spreadsheet + width = min(column_width + 2, 75) + sheet.column_dimensions[column_letter].width = width + + # Add filters to fixed columns (i.e. columns that appear in every sheet + # in the same position) + fixed_field_indexes = self.fixed_field_indexes + sheet_max_row_idx = data_header_idx + len(data_rows) + min_fixed_idx = f"{fixed_field_indexes[0]}{data_header_idx}" + max_fixed_idx = f"{fixed_field_indexes[-1]}{sheet_max_row_idx}" + fixed_field_range = f"{min_fixed_idx}:{max_fixed_idx}" + sheet.auto_filter.ref = fixed_field_range + + # Add the data to the sheet. 
This should be one of the last steps in + # this function, since write-only sheets require all formatting to be + # set before data is added + sheet.append(title_row) + sheet.append(subtitle_row) + sheet.append(header_row) + for data_row in data_rows: + sheet.append(data_row) + + # Merge cells in the grouping headers. This approach is a bit of a hack + # since merged cells are not fully supported in write-only workbooks, + # hence why it takes place _after_ rows have been added to the sheet + # whereas most formatting options for write-only workbooks need to + # happen _before_ data is added. See here for details: + # https://stackoverflow.com/a/66159254 + for cell_range in merged_cell_range: + sheet.merged_cells.ranges.add(cell_range) + + @property + def debugging_fieldnames(self) -> typing.List[str]: + """Get a list of fieldnames (e.g. ["foo", "bar"]) for fields that + are used for debugging.""" + return self._filter_for_existing_fieldnames( + self.possible_debugging_fieldnames + ) + + @property + def debugging_field_indexes(self) -> tuple: + """Get a tuple of field indexes (e.g. ["A", "B"]) for fields that + are used for debugging.""" + return self._filter_for_existing_field_indexes( + self.possible_debugging_fieldnames + ) + + @property + def test_metadata_fieldnames(self) -> typing.List[str]: + """Get a list of fieldnames (e.g. ["foo", "bar"]) for fields that + are used for identifying tests.""" + return self._filter_for_existing_fieldnames( + self.possible_test_metadata_fieldnames + ) + + @property + def test_metadata_field_indexes(self) -> tuple: + """Get a tuple of field indexes (e.g. ["A", "B"]) for fields that + are used for identifying tests.""" + return self._filter_for_existing_field_indexes( + self.possible_test_metadata_fieldnames + ) + + @property + def diagnostic_fieldnames(self) -> typing.List[str]: + """Get a list of fieldnames (e.g. 
["foo", "bar"]) for fields that + are used for diagnostics.""" + return self._filter_for_existing_fieldnames( + self.possible_diagnostic_fieldnames + ) + + @property + def diagnostic_field_indexes(self) -> tuple: + """Get a tuple of field indexes (e.g. ["A", "B"]) for fields that + are used for diagnostics.""" + return self._filter_for_existing_field_indexes( + self.possible_diagnostic_fieldnames + ) + + @property + def fixed_fieldnames(self) -> typing.List[str]: + """Get a list of fieldnames (e.g. ["foo", "bar"]) for fields that + are fixed (i.e. whose position is always at the start of the sheet, + for diagnostic purposes).""" + return self._filter_for_existing_fieldnames( + self.possible_fixed_fieldnames + ) + + @property + def fixed_field_indexes(self) -> tuple: + """Get a list of field indexes (e.g. ["A", "B"]) for fields that + are fixed (i.e. whose position is always at the start of the sheet, + for diagnostic purposes).""" + return self._filter_for_existing_field_indexes( + self.possible_fixed_fieldnames + ) + + @property + def nonfixed_fieldnames(self) -> typing.List[str]: + """Get a list of field names (e.g. ["foo", "bar"]) for fields that + are nonfixed (i.e. whose position comes after the fixed fields in the + sheet and are thus variable).""" + fieldnames = self.fieldnames + fixed_fieldnames = self.possible_fixed_fieldnames + return [field for field in fieldnames if field not in fixed_fieldnames] + + @property + def nonfixed_field_indexes(self) -> tuple: + """Get a list of field indexes (e.g. ["A", "B"]) for fields that + are nonfixed (i.e. 
whose position comes after the fixed fields in the + sheet and are thus variable).""" + nonfixed_fieldnames = self.nonfixed_fieldnames + return self._filter_for_existing_field_indexes(nonfixed_fieldnames) + + def _filter_for_existing_fieldnames( + self, possible_fieldnames: typing.List[str] + ) -> typing.List[str]: + """Helper function to filter a list of `possible_fieldnames` for + only those fields that exist in the test group, returning the + names of the fields (e.g. ["foo", "bar"]).""" + existing_fieldnames = self.fieldnames + return [ + field + for field in possible_fieldnames + if field in existing_fieldnames + ] + + def _filter_for_existing_field_indexes( + self, possible_fieldnames: typing.List[str] + ) -> tuple: + """Helper function to filter a list of `possible_fieldnames` for + only those fields that exist in the test group, returning the + indexes of the fields (e.g. ["A", "B"]).""" + existing_fieldnames = self.fieldnames + return tuple( + openpyxl.utils.get_column_letter( + # openpyxl is 1-indexed while the index() method is 0-indexed + existing_fieldnames.index(field) + 1 + ) + for field in self._filter_for_existing_fieldnames( + possible_fieldnames + ) + ) + + +# Help docstring for the command line interface +CLI_DESCRIPTION = """Runs iasWorld data tests and generates an Excel workbook of dbt test failures that can be shared with other teams +for review and correction, along with metadata parquet files that can be uploaded to S3 for long-term result tracking. + +This script expects that Python dependencies have been installed from requirements.run_iasworld_data_tests.txt. + +Expects one required environment variable to be set: + + 1. USER: The username of the user who ran the script. This is automatically set during login on Unix systems, but should be set manually elsewhere. + +Expects four optional environment variables: + + 1. 
AWS_ATHENA_S3_STAGING_DIR: Location in S3 where Athena query results should be written (defaults to s3://ccao-athena-results-us-east-1) + 2. GIT_SHA: The SHA of the latest git commit (defaults to the output of `git rev-parse`) + 3. GIT_REF: The name of the ref for the latest git commit (defaults to the output of `git rev-parse --abbrev-ref`) + 4. GIT_AUTHOR: The author of the latest git commit (defaults to the output of `git log`) + +Outputs three files to the directory specified by the `--output-dir` flag: + + 1. `iasworld_test_failures_.xlsx`: Excel workbook to share with other teams + 2. `metadata/test_run/run_year=YYYY/*.parquet`: Metadata about this run, partitioned by year of run and prepped for upload to S3 + 3. `metadata/test_run_result/run_year=YYYY/*.parquet`: Metadata about test results (pass, fail, number of failing rows, etc.) in this run, + partitioned by year of run and prepped for upload to S3 + +Each sheet in the output workbook represents a category of test, e.g. "valid_range" or "not_null"; each row in a sheet represents a row in +a database that failed a test, with enough metadata that a reader can figure out what conditions caused the test to fail and investigate the root cause.""" # noqa: E501 + +# Examples to use in the command line interface docstring +CLI_EXAMPLE = """Example usage with no options provided: + + python3 run_iasworld_data_tests.py + +Example usage with all options provided: + + AWS_ATHENA_S3_STAGING_DIR=s3://foo-bar-baz/ python3 run_iasworld_data_tests.py + --output-dir ./iasworld_test_results/ + --township 77 + --no-use-cached + +Example usage to filter for multiple townships: + + python3 run_iasworld_data_tests.py --township 76 77 + +Example usage to skip running tests, and instead reuse results from a previous run: + + python3 run_iasworld_data_tests.py --use-cached + +""" # noqa: E501 + + +def main() -> None: + """Entrypoint to this script. 
Runs dbt tests and writes artifacts + to the output directory with metadata about test results.""" + + parser = argparse.ArgumentParser( + description=CLI_DESCRIPTION, + epilog=CLI_EXAMPLE, + # Parse the description and epilog as raw text so that newlines + # get preserved + formatter_class=argparse.RawTextHelpFormatter, + ) + + parser.add_argument( + "--output-dir", + required=False, + help=( + "The directory to which output artifacts should be written; " + "if the directory does not exist, it will be created. Defaults to " + "'./iasworld_test_results_/'." + ), + ) + parser.add_argument( + "--township", + required=False, + nargs="*", + help=( + "One or more optional township codes which will be used to filter " + "results. Can be provided with multiple space-separated values " + "to select multiple townships. Defaults to all townships, " + "including null townships (which typically indicate invalid PINs)." + ), + ) + parser.add_argument( + "--use-cached", + action=argparse.BooleanOptionalAction, + required=False, + help=( + "Toggle using cached results from the most recent run. Useful when debugging " + "transformation steps. Defaults to False." 
+ ), + ) + parser.add_argument( + "--target", + required=False, + default="dev", + help="dbt target to use for running tests, defaults to 'dev'", + ) + + args = parser.parse_args() + + output_dir = args.output_dir + townships = args.township if args.township else tuple() + use_cached = args.use_cached + target = args.target + + run_results_filepath = os.path.join("target", "run_results.json") + manifest_filepath = os.path.join("target", "manifest.json") + + date_today = datetime.datetime.today().strftime("%Y-%m-%d") + if output_dir is None: + output_dir = f"iasworld_test_results_{date_today}" + + if use_cached: + test_cache_path = get_test_cache_path( + run_results_filepath, manifest_filepath, townships + ) + if os.path.isfile(test_cache_path): + print(f"Loading test results from cache at {test_cache_path}") + test_categories = get_test_categories_from_file(test_cache_path) + else: + raise ValueError( + f"Test cache not found at {test_cache_path}, try rerunning " + "without --use-cached" + ) + else: + print("Running tests") + dbt_run_args = [ + "test", + "--target", + target, + "--selector", + "select_data_test_iasworld", + "--store-failures", + ] + print(f"> dbt {' '.join(dbt_run_args)}") + dbt_test_result = DBT.invoke(dbt_run_args) + + if dbt_test_result.exception is not None: + raise dbt_test_result.exception + + if any( + result.status == TestStatus.Error + for result in getattr(dbt_test_result.result, "results", []) + ): + # No need to report the exception, since the dbt process + # will have printed it already + raise ValueError("Quitting due to error in dbt test run") + + print("Loading test results from Athena") + test_categories = get_test_categories_from_athena( + run_results_filepath, manifest_filepath, townships + ) + + new_test_cache_path = get_test_cache_path( + run_results_filepath, manifest_filepath, townships + ) + print(f"Saving test results to the cache at {new_test_cache_path}") + save_test_categories_to_file(test_categories, 
new_test_cache_path) + + print("Generating the output workbook") + # It's important to use a write-only workbook here because otherwise + # the metadata required to store cell info about a large number of failing + # tests can cause the process to run out of memory + workbook = openpyxl.Workbook(write_only=True) + for test_category in test_categories: + print(f"Adding sheet for {test_category.category}") + test_category.add_to_workbook(workbook) + + pathlib.Path(output_dir).mkdir(exist_ok=True) + workbook_filepath = os.path.join( + output_dir, f"iasworld_test_failures_{date_today}.xlsx" + ) + workbook.save(workbook_filepath) + print(f"Output workbook saved to {workbook_filepath}") + + # Get run metadata from the environment + try: + run_by = os.environ["USER"] + except KeyError: + raise ValueError("USER env variable must be set") + + git_sha = ( + os.environ["GIT_SHA"] + if os.getenv("GIT_SHA") + else subprocess.getoutput("git rev-parse HEAD") + ) + git_ref = ( + os.environ["GIT_REF"] + if os.getenv("GIT_REF") + else subprocess.getoutput("git rev-parse --abbrev-ref HEAD") + ) + git_author = ( + os.environ["GIT_AUTHOR"] + if os.getenv("GIT_AUTHOR") + else subprocess.getoutput("git log -1 --pretty=format:'%an <%ae>'") + ) + + # Generate and save metadata tables as parquet + test_run_metadata = TestRunMetadata.create( + run_results_filepath, run_by, git_sha, git_ref, git_author + ) + test_run_result_metadata_list = TestRunResultMetadata.create_list( + test_categories, run_results_filepath + ) + test_run_failing_row_metadata_list = TestRunFailingRowMetadata.create_list( + test_categories, run_results_filepath + ) + run_date = get_run_date_from_run_results(run_results_filepath) + run_id = get_run_id_from_run_results(run_results_filepath) + + for metadata_list, tablename, partition_cols in [ + ([test_run_metadata], "test_run", ["run_year"]), + (test_run_result_metadata_list, "test_run_result", ["run_year"]), + ( + test_run_failing_row_metadata_list, + 
"test_run_failing_row", + ["run_year"], + ), + ]: + table = pa.Table.from_pylist( + [meta_obj.to_dict() for meta_obj in metadata_list], # type: ignore + ) + metadata_root_path = os.path.join(output_dir, "metadata", tablename) + pyarrow.parquet.write_to_dataset( + table, + metadata_root_path, + partition_cols, + basename_template="%s_%s_{i}.parquet" % (run_date, run_id), + ) + print(f"{tablename} metadata saved to {metadata_root_path}/") + + +@dataclasses.dataclass +class TestRunMetadata: + """Metadata object storing information about a test run.""" + + run_id: str + run_date: str + run_year: str # Separate from run_date for partitioning + run_by: str + elapsed_time: decimal.Decimal + var_year_start: str + var_year_end: str + git_sha: str + git_ref: str + git_author: str + + @classmethod + def create( + cls, + run_results_filepath: str, + run_by: str, + git_sha: str, + git_ref: str, + git_author: str, + ) -> "TestRunMetadata": + """Generate a TestRunMetadata object from a filepath to a + run_results.json file.""" + run_id = get_run_id_from_run_results(run_results_filepath) + run_date = get_run_date_from_run_results(run_results_filepath) + run_year = run_date[:4] + elapsed_time = get_key_from_run_results( + "elapsed_time", run_results_filepath + ) + + # Extract dbt vars + run_vars = get_key_from_run_results("args", run_results_filepath)[ + "vars" + ] + var_year_start = run_vars.get("data_test_iasworld_year_start") + var_year_end = run_vars.get("data_test_iasworld_year_end") + + # If dbt vars weren't set on the command line, the defaults won't exist + # in run_results.json, so we have to parse them from the dbt project + # config + if not var_year_start or not var_year_end: + with open("dbt_project.yml") as project_fobj: + project = yaml.safe_load(project_fobj) + var_year_start = ( + var_year_start + or project["vars"]["data_test_iasworld_year_start"] + ) + var_year_end = ( + var_year_end or project["vars"]["data_test_iasworld_year_end"] + ) + + return cls( + 
run_id=run_id, + run_year=run_year, + run_date=run_date, + run_by=run_by, + elapsed_time=elapsed_time, + var_year_start=var_year_start, + var_year_end=var_year_end, + git_sha=git_sha, + git_ref=git_ref, + git_author=git_author, + ) + + def to_dict(self) -> typing.Dict: + return dataclasses.asdict(self) + + +@dataclasses.dataclass +class TestRunResultMetadata: + """Metadata object storing aggregated information about township-level + test results in a run.""" + + run_id: str + run_year: str # Duplicated with TestRunMetadata for partitioning + test_name: str + table_name: str + column_name: typing.Optional[str] + category: str + description: str + township_code: typing.Optional[str] + status: str # Serialize Status enum to str for output to parquet + elapsed_time: decimal.Decimal + num_failing_rows: int + + @classmethod + def create_list( + cls, + test_categories: typing.List[TestCategory], + run_results_filepath: str, + ) -> typing.List["TestRunResultMetadata"]: + """Generate a list of TestRunResultMetadata objects from a list of + TestCategory objects representing the categories in the run and a + filepath to a run_results.json file.""" + run_id = get_run_id_from_run_results(run_results_filepath) + run_date = get_run_date_from_run_results(run_results_filepath) + run_year = run_date[:4] + + return [ + TestRunResultMetadata( + run_id=run_id, + run_year=run_year, + test_name=township_result.name, + table_name=township_result.table_name, + column_name=township_result.column_name, + category=test_category.category, + description=township_result.description, + township_code=township_result.township_code, + status=township_result.status.value, + elapsed_time=township_result.elapsed_time, + num_failing_rows=len(township_result.failing_rows), + ) + for test_category in test_categories + for test_result in test_category.test_results + for township_result in test_result.split_by_township() + ] + + def to_dict(self) -> typing.Dict: + return dataclasses.asdict(self) + +
+@dataclasses.dataclass +class TestRunFailingRowMetadata: + """Metadata object storing information about row-level individual test + failures in a run.""" + + # Fields that identify the run + run_id: str + run_year: str # Duplicated with TestRunMetadata for partitioning + # Fields that identify the test + test_name: str + table_name: str + column_name: typing.Optional[str] + category: str + description: str + # Fields that identify the failing row. Some of these can occasionally + # be arrays for tests that query multiple rows (e.g. uniqueness tests) + # so for consistency we set them to always be arrays, even when there + # is only one value + parid: typing.Optional[str] + taxyr: typing.Optional[str] + card: typing.Optional[typing.List[int]] + lline: typing.Optional[typing.List[int]] + class_: typing.Optional[typing.List[str]] + township_code: typing.Optional[str] + who: typing.Optional[typing.List[str]] + wen: typing.Optional[typing.List[str]] + # Since the problematic fields can vary so widely, we store them as a + # JSON blob + problematic_fields: typing.Dict + + @classmethod + def create_list( + cls, + test_categories: typing.List[TestCategory], + run_results_filepath: str, + ) -> typing.List["TestRunFailingRowMetadata"]: + """Generate a list of TestRunFailingRowMetadata objects from a list of + TestCategory objects representing the categories in the run and a + filepath to a run_results.json file.""" + run_id = get_run_id_from_run_results(run_results_filepath) + run_date = get_run_date_from_run_results(run_results_filepath) + run_year = run_date[:4] + + def value_to_list(value): + """Tiny helper function to convert not-null column values to lists.
+ Useful in cases where a column can be either a scalar, a list, + or a null value, in which case we want the output to always be + either a null value or a list.""" + if value is None or type(value) is list: + return value + return [value] + + return [ + TestRunFailingRowMetadata( + run_id=run_id, + run_year=run_year, + test_name=township_result.name, + table_name=township_result.table_name, + column_name=township_result.column_name, + category=test_category.category, + description=township_result.description, + township_code=township_result.township_code, + parid=failing_row.get(PARID_FIELD), + taxyr=failing_row.get(TAXYR_FIELD), + card=value_to_list(failing_row.get(CARD_FIELD)), + lline=value_to_list(failing_row.get(LAND_LINE_FIELD)), + class_=value_to_list(failing_row.get(CLASS_FIELD)), + who=value_to_list(failing_row.get(WHO_FIELD)), + wen=value_to_list(failing_row.get(WEN_FIELD)), + problematic_fields={ + key: val + for key, val in failing_row.items() + # Use possible_fixed_fieldnames to avoid having to + # recompute the exact fixed fieldnames on every iteration + # (we could also solve this by expanding out the list + # comprehension, but for now this is easier) + if key not in test_category.possible_fixed_fieldnames + }, + ) + for test_category in test_categories + for test_result in test_category.test_results + for township_result in test_result.split_by_township() + for failing_row in township_result.failing_rows + if township_result.status == Status.FAIL + ] + + def to_dict(self) -> typing.Dict: + output_data = dataclasses.asdict(self) + # Serialize the "class" attribute to a more human-friendly name + output_data["class"] = output_data.pop("class_") + # Dump the problematic fields to string, since parquet can't handle + # the notion of an untyped JSON object + output_data["problematic_fields"] = json.dumps( + output_data["problematic_fields"] + ) + return output_data + + +def get_key_from_run_results( + key: str, run_results_filepath: str +) ->
typing.Any: + """Given a path to a run_results.json file, return a key that's represented + in the run metadata.""" + with open(run_results_filepath) as run_results_fobj: + run_results = json.load(run_results_fobj) + + return run_results[key] + + +def get_run_id_from_run_results(run_results_filepath: str) -> str: + """Given a path to a run_results.json file, return a string representation + of the invocation ID of the run.""" + metadata = get_key_from_run_results("metadata", run_results_filepath) + return metadata["invocation_id"] + + +def get_run_date_from_run_results(run_results_filepath: str) -> str: + """Given a path to a run_results.json file, return a string representation + of the date of the run formatted as YYYY-MM-DD.""" + metadata = get_key_from_run_results("metadata", run_results_filepath) + run_dt_str = metadata["generated_at"] + run_dt = datetime.datetime.strptime(run_dt_str, "%Y-%m-%dT%H:%M:%S.%fZ") + return run_dt.strftime("%Y-%m-%d") + + +def get_test_cache_path( + run_results_filepath: str, + manifest_filepath: str, + townships: typing.Tuple[str], +) -> str: + """Return the path to the cache where test results are stored, or an + empty string if either input file is missing.
The `run_results_filepath` and + `manifest_filepath` are used to generate a hash key that uniquely defines + the cache key for a given test run.""" + if not os.path.isfile(run_results_filepath) or not os.path.isfile( + manifest_filepath + ): + return "" + + with open(run_results_filepath, "rb") as run_results_file: + run_results_hash = hashlib.md5(run_results_file.read()).hexdigest() + + with open(manifest_filepath, "rb") as manifest_file: + manifest_hash = hashlib.md5(manifest_file.read()).hexdigest() + + base_filename = f"run_{run_results_hash}_manifest_{manifest_hash}" + if townships: + base_filename += f"_township_{'_'.join(townships)}" + + return os.path.join(TEST_CACHE_DIR, f"{base_filename}.json") + + +def get_test_categories_from_file( + file_path: str, +) -> typing.List[TestCategory]: + """Load a list of TestCategory objects from a cache located at + `file_path`.""" + with open(file_path) as cache_file: + test_category_dicts = json.load(cache_file, use_decimal=True) + return [ + TestCategory.from_dict(category_dict) + for category_dict in test_category_dicts + ] + + +def save_test_categories_to_file( + test_categories: typing.List[TestCategory], file_path: str +) -> None: + """Save a list of TestCategory objects to a cache located at + `file_path`.""" + test_category_dicts = [ + test_category.to_dict() for test_category in test_categories + ] + os.makedirs(TEST_CACHE_DIR, exist_ok=True) + with open(file_path, "w") as cache_file: + json.dump(test_category_dicts, cache_file, use_decimal=True) + + +def get_test_categories_from_athena( + run_results_filepath: str, + manifest_filepath: str, + townships: typing.Tuple[str], +) -> typing.List[TestCategory]: + """Load a list of TestCategory objects by querying Athena for + test results generated from a `dbt test --store-failures` call, + optionally filtering results by township.""" + with open(run_results_filepath) as run_results_fobj: + run_results = json.load(run_results_fobj) + + with open(manifest_filepath) as
manifest_fobj: + manifest = json.load(manifest_fobj) + + test_categories = get_test_categories(run_results, manifest, townships) + if not test_categories: + raise ValueError(f"{run_results_filepath} contains no test results") + + return test_categories + + +def get_test_categories( + run_results: typing.Dict, + manifest: typing.Dict, + townships: typing.Tuple[str], +) -> typing.List[TestCategory]: + """Given two artifacts from a `dbt test --store-failures` call (a + run_results.json file dict and a manifest.json file dict) and an optional + township filter, generates a list of TestCategory objects storing the + results of the tests.""" + # Define a cursor with unload=True to output query results as parquet. + # This is particularly important when selecting aggregated columns, + # which are deserialized incorrectly by regular cursors + cursor = pyathena.connect( + s3_staging_dir=AWS_ATHENA_S3_STAGING_DIR, + region_name="us-east-1", + cursor_class=pyathena.arrow.cursor.ArrowCursor, + ).cursor(unload=True) + + tests_by_category: typing.Dict[str, TestCategory] = {} + + for run_result in run_results["results"]: + unique_id = run_result["unique_id"] + node = manifest["nodes"].get(unique_id) + if node is None: + raise ValueError(f"Missing dbt manifest node with id {unique_id}") + + test_name = node["name"] + status = run_result["status"] + execution_time = run_result["execution_time"] + + meta = node.get("meta", {}) + category = get_category_from_node(node) + tablename = get_tablename_from_node(node) + column_name = get_column_name_from_node(node) + test_description = meta.get("description") + + # Basic attrs for the test result that apply whether or not the test + # failed + base_result_kwargs = { + "name": test_name, + "table_name": tablename, + "column_name": column_name, + "description": test_description, + "elapsed_time": execution_time, + } + + if not tests_by_category.get(category): + tests_by_category[category] = TestCategory(category=category) + + if status == 
Status.PASS.value: + test_result = TestResult( + status=Status.PASS, failing_rows=[], **base_result_kwargs + ) + + elif status in (Status.FAIL.value, Status.WARN.value): + # Link to the test's page in the dbt docs, for debugging + test_docs_url = f"{DOCS_URL_PREFIX}/{unique_id}" + + # Get the fully-qualified name of the table that stores failures + # for this test so that we can query it + test_results_relation_name = node.get("relation_name") + if test_results_relation_name is None: + raise ValueError( + f"Missing relation_name attribute for test {test_name}. " + "Did you run `dbt test` with the --store-failures flag?" + ) + + print(f"Querying failed rows from {test_results_relation_name}") + # Athena SHOW COLUMNS doesn't allow double quoted tablenames + relation_name_unquoted = test_results_relation_name.replace( + '"', "`" + ) + cursor.execute(f"show columns in {relation_name_unquoted}") + # SHOW COLUMNS often returns field names with trailing whitespace + fieldnames = [row[0].strip() for row in cursor] + + # Construct the query to select the test results that were stored + # in Athena. We want to rehydrate a few fields like township_code + # and class without requiring that all tests select them, so + # check if those fields are missing and join against the correct + # source table in an attempt to rehydrate the missing columns. + # + # The conditionals that follow are a bit ugly, but they reflect the + # fact that we need to rehydrate different columns from different + # source tables, and that those source tables can differ based on + # whether the base model is keyed by card or parcel. 
In the future, + # we might consider simplifying this by joining to a helper view + # with a universe of township and class codes split out by table, + # e.g.: + # + # pin | card | taxyr | township_code | table | class + # ----- | ---- | ----- | ------------- | -------- | ----- + # 12345 | 1 | 2024 | 10 | land | 200 + # 12345 | 1 | 2024 | 10 | dweldat | 211 + test_results_select = "select test_results.*" + test_results_join = "" + # Format the townships tuple as a SQL array for filtering + townships_sql = ( + "(" + ",".join(f"'{code}'" for code in townships) + ")" + ) + # The correct reference for the township column varies + # depending on whether the original test selected a + # township field or not. If the township column has been + # selected, we can always reference it in the filter; otherwise, + # we need to wait to see if we can rehydrate the township based + # on the parid and taxyr + test_results_filter = ( + f" where test_results.{TOWNSHIP_FIELD} in {townships_sql}" + if townships and TOWNSHIP_FIELD in fieldnames + else "" + ) + # We need parid and taxyr at minimum in order to rehydrate any + # missing fields + if PARID_FIELD in fieldnames and TAXYR_FIELD in fieldnames: + if TOWNSHIP_FIELD not in fieldnames: + test_results_select += f", leg.user1 AS {TOWNSHIP_FIELD}" + test_results_join += f""" + left join iasworld.legdat as leg + on leg.{PARID_FIELD} = test_results.{PARID_FIELD} + and leg.{TAXYR_FIELD} = test_results.{TAXYR_FIELD} + and leg.cur = 'Y' + and leg.deactivat is null + """ + if townships: + test_results_filter = ( + f" where leg.user1 in {townships_sql}" + ) + if CLASS_FIELD not in fieldnames: + if ( + LAND_LINE_FIELD in fieldnames + or CARD_FIELD in fieldnames + ): + # Figure out the right table and key to join on in + # order to query the class + card_join_table = "dweldat" + if tablename in ["comdat", "land", "oby"]: + card_join_table = tablename + + card_field = ( + LAND_LINE_FIELD + if tablename == "land" + and LAND_LINE_FIELD in 
fieldnames + else CARD_FIELD + ) + + test_results_select += f", card.class AS {CLASS_FIELD}" + test_results_join += f""" + left join iasworld.{card_join_table} as card + on card.{PARID_FIELD} = + test_results.{PARID_FIELD} + and card.{TAXYR_FIELD} = + test_results.{TAXYR_FIELD} + and card.{card_field} = + test_results.{card_field} + and card.cur = 'Y' + and card.deactivat is null + """ + else: + test_results_select += f", par.class AS {CLASS_FIELD}" + test_results_join += f""" + left join iasworld.pardat as par + on par.{PARID_FIELD} = + test_results.{PARID_FIELD} + and par.{TAXYR_FIELD} = + test_results.{TAXYR_FIELD} + and par.cur = 'Y' + and par.deactivat is null + """ + + test_results_query = ( + f"{test_results_select} " + f"from {test_results_relation_name} as test_results " + f"{test_results_join}" + f"{test_results_filter}" + ) + # Use the cursor with unload=True in this query, since otherwise + # aggregated columns can be deserialized incorrectly + cursor.execute(test_results_query) + query_results = cursor.as_arrow().to_pylist() + if len(query_results) == 0: + msg = ( + f"Test {test_name} has status {status!r} but no failing " + "rows in Athena" + ) + if townships: + # Missing rows are most likely due to the township filter, + # so print a warning and skip + print(msg + ", possibly due to the township filter") + test_result = TestResult( + status=Status.PASS, + failing_rows=[], + **base_result_kwargs, + ) + else: + # If there's no township filter, the lack of rows indicates + # an unexpected error + raise ValueError(msg) + else: + # Add custom fields to query results that we don't expect to be + # included in the response + failing_rows = [ + { + TEST_NAME_FIELD: test_name, + DESCRIPTION_FIELD: test_description, + DOCS_URL_FIELD: test_docs_url, + SOURCE_TABLE_FIELD: tablename, + **row, + } + for row in query_results + ] + test_result = TestResult( + status=Status(status), + failing_rows=failing_rows, + **base_result_kwargs, + ) + + else: + raise 
ValueError( + f"Got unrecognized status '{status}' for node {unique_id} " + "in dbt run results" + ) + + tests_by_category[category].test_results.append(test_result) + + # Now that we've accumulated all of the test results and they are grouped + # into categories, we no longer need the category key in the dict, so + # transform the output into a list + return list(tests_by_category.values()) + + +def get_category_from_node(node: typing.Dict) -> str: + """Given a Node for a test extracted from a dbt manifest, return the + category that the test should go in.""" + if meta_category := node.get("meta", {}).get("category"): + return meta_category + + for dependency_macro in node["depends_on"]["macros"]: + # Macro names in the DAG are fully qualified following the pattern + # `macro.<package_name>.<macro_name>`, so remove the prefix to extract + # just the name of the macro + cleaned_macro_name = dependency_macro.split(".")[-1] + if custom_test_name := TEST_CATEGORIES.get(cleaned_macro_name): + return custom_test_name + # Custom generic tests are always formatted like test_<test_name> + if cleaned_macro_name.startswith("test_"): + return cleaned_macro_name.removeprefix("test_") + + return DEFAULT_TEST_CATEGORY + + +def get_tablename_from_node(node: typing.Dict) -> str: + """Given a Node for a test extracted from a dbt manifest, return the name + of the table that the test is testing.""" + if meta_tablename := node.get("meta", {}).get("table_name"): + # If meta.table_name is set, treat it as an override + return meta_tablename + + # Search for the model that is implicated in this test via the + # test_metadata.kwargs.model attribute.
Note that it is common to use the + # elements in the depends_on array for this purpose, but this approach + # is fraught, since the order of parents for tests with multiple + # dependencies is not clear and can differ: + # https://github.com/dbt-labs/dbt-core/issues/6746#issuecomment-1829860236 + test_metadata = node.get("test_metadata", {}) + model_getter_str = test_metadata.get("kwargs", {}).get("model") + if not model_getter_str: + raise ValueError( + "Can't infer tablename: Missing `test_metadata.kwargs.model` " + f"attribute for test {node['name']}. You may need to add a " + "`meta.table_name` attribute to the test config to manually " + "specify the tablename" + ) + + # The test_metadata.kwargs.model attribute is formatted as a Jinja template + # call to the get_where_subquery macro, so we need to extract the ref or + # source tablename from that call + ref_match = re.search(r"ref\('(.+)'\)", model_getter_str) + if ref_match is not None: + fq_model_name = ref_match.group(1) + return fq_model_name.split(".")[-1] + + source_match = re.search(r"source\('iasworld', '(.+)'\)", model_getter_str) + if source_match is not None: + return source_match.group(1) + + raise ValueError( + "Can't infer tablename: Failed to parse model name from " + f'`test_metadata.kwargs.model` attribute "{model_getter_str}" ' + f"for test \"{node['name']}\". Inspect the dbt manifest file " + "for more information" + ) + + +def get_column_name_from_node(node: typing.Dict) -> typing.Optional[str]: + """Given a Node for a test extracted from a dbt manifest, return the name + of the column that the test is testing. Note that the column name is not + always set, e.g.
for tests that are defined on a table instead of on + a column, so the return value can be None.""" + if meta_column_name := node.get("meta", {}).get("column_name"): + # If meta.column_name is set, treat it as an override + return meta_column_name + + return node.get("column_name") + + +if __name__ == "__main__": + main() diff --git a/dbt/scripts/transform_dbt_test_results.py b/dbt/scripts/transform_dbt_test_results.py index 04f6d00af..c2ce44004 100644 --- a/dbt/scripts/transform_dbt_test_results.py +++ b/dbt/scripts/transform_dbt_test_results.py @@ -4,7 +4,7 @@ # other teams for review and correction, along with metadata parquet files # that can be uploaded to S3 for long-term result tracking. # -# Run `python3 transform_dbt_test_results.py --help` for detailed +# Run `python3 run_iasworld_data_tests.py --help` for detailed # documentation. import argparse @@ -31,6 +31,10 @@ import pyathena.cursor import simplejson as json import yaml +from dbt.artifacts.schemas.results import TestStatus +from dbt.cli.main import dbtRunner + +DBT = dbtRunner() # Prefix for the URL location of a test in the dbt docs DOCS_URL_PREFIX = "https://ccao-data.github.io/data-architecture/#!/test" @@ -635,12 +639,10 @@ def _filter_for_existing_field_indexes( # Help docstring for the command line interface -CLI_DESCRIPTION = """Generates an Excel workbook of dbt test failures that can be shared with other teams for review and correction, -along with metadata parquet files that can be uploaded to S3 for long-term result tracking. +CLI_DESCRIPTION = """Runs iasWorld data tests and generates an Excel workbook of dbt test failures that can be shared with other teams +for review and correction, along with metadata parquet files that can be uploaded to S3 for long-term result tracking. -This script assumes that it is being run in sequence after a call to `dbt test --store-failures`, since it depends on two files created -by that operation (target/run_results.json and target/manifest.json). 
It also requires Python dependencies be installed from -requirements.transform_dbt_test_results.txt. +This script expects that Python dependencies have been installed from requirements.run_iasworld_data_tests.txt. Expects one required environment variable to be set: @@ -655,7 +657,7 @@ def _filter_for_existing_field_indexes( Outputs three files to the directory specified by the `--output-dir` flag: - 1. `qc_test_failures_.xlsx`: Excel workbook to share with other teams + 1. `iasworld_test_failures_.xlsx`: Excel workbook to share with other teams 2. `metadata/test_run/run_year=YYYY/*.parquet`: Metadata about this run, partitioned by year of run and prepped for upload to S3 3. `metadata/test_run_result/run_year=YYYY/*.parquet`: Metadata about test results (pass, fail, number of failing rows, etc.) in this run, partitioned by year of run and prepped for upload to S3 @@ -666,24 +668,29 @@ def _filter_for_existing_field_indexes( # Examples to use in the command line interface docstring CLI_EXAMPLE = """Example usage with no options provided: - python3 transform_dbt_test_results.py + python3 run_iasworld_data_tests.py Example usage with all options provided: - AWS_ATHENA_S3_STAGING_DIR=s3://foo-bar-baz/ python3 transform_dbt_test_results.py - --run-results ./target/run_results.json - --manifest ./target/manifest.json - --output-dir ./qc_test_results/ + AWS_ATHENA_S3_STAGING_DIR=s3://foo-bar-baz/ python3 run_iasworld_data_tests.py + --output-dir ./iasworld_test_results/ --township 77 + --no-use-cached Example usage to filter for multiple townships: - python3 transform_dbt_test_results.py --township 76 77""" # noqa: E501 + python3 run_iasworld_data_tests.py --township 76 77 + +Example usage to skip running tests, and instead reuse results from a previous run: + + python3 run_iasworld_data_tests.py --use-cached + +""" # noqa: E501 def main() -> None: - """Entrypoint to this script. 
Parses dbt test results and writes artifacts - to the output directory with metadata about tests.""" + """Entrypoint to this script. Runs dbt tests and writes artifacts + to the output directory with metadata about test results.""" parser = argparse.ArgumentParser( description=CLI_DESCRIPTION, @@ -693,33 +700,13 @@ def main() -> None: formatter_class=argparse.RawTextHelpFormatter, ) - parser.add_argument( - "--run-results", - required=False, - default=os.path.join("target", "run_results.json"), - help=( - "The local path to a run_results.json file generated by a " - "`dbt test --store-failures` run. Defaults to " - "'./target/run_results.json'." - ), - ) - parser.add_argument( - "--manifest", - required=False, - default=os.path.join("target", "manifest.json"), - help=( - "The local path to a manifest.json file with the compiled dbt " - "DAG, generated by all dbt CLI commands (including `dbt test`). " - "Defaults to './target/manifest.json'." - ), - ) parser.add_argument( "--output-dir", required=False, help=( "The directory to which output artifacts should be written; " "if the directory does not exist, it will be created. Defaults to " - "'./qc_test_results_/'." + "'./iasworld_test_results_/'." ), ) parser.add_argument( @@ -733,35 +720,73 @@ def main() -> None: "including null townships (which typically indicate invalid PINs)." ), ) + parser.add_argument( + "--use-cached", + action=argparse.BooleanOptionalAction, + required=False, + help=( + "Toggle using cached results from the most recent run. Useful when debugging " + "transformation steps. Defaults to False." 
+ ), + ) args = parser.parse_args() - run_results_filepath = args.run_results - manifest_filepath = args.manifest output_dir = args.output_dir townships = args.township if args.township else tuple() + use_cached = args.use_cached + + run_results_filepath = os.path.join("target", "run_results.json") + manifest_filepath = os.path.join("target", "manifest.json") date_today = datetime.datetime.today().strftime("%Y-%m-%d") if output_dir is None: - output_dir = f"qc_test_results_{date_today}" - - test_cache_path = get_test_cache_path( - run_results_filepath, manifest_filepath, townships - ) + output_dir = f"iasworld_test_results_{date_today}" - if os.path.isfile(test_cache_path): - print(f"Loading test results from cache at {test_cache_path}") - test_categories = get_test_categories_from_file(test_cache_path) - else: - print( - f"Test cache not found at {test_cache_path}, loading test results " - "from Athena" + if use_cached: + test_cache_path = get_test_cache_path( + run_results_filepath, manifest_filepath, townships ) + if os.path.isfile(test_cache_path): + print(f"Loading test results from cache at {test_cache_path}") + test_categories = get_test_categories_from_file(test_cache_path) + else: + raise ValueError( + f"Test cache not found at {test_cache_path}, try rerunning " + "without --use-cached" + ) + else: + print("Running tests") + dbt_run_args = [ + "test", + "--selector", + "select_data_test_iasworld", + "--store-failures", + ] + print(f"> dbt {' '.join(dbt_run_args)}") + dbt_test_result = DBT.invoke(dbt_run_args) + + if dbt_test_result.exception is not None: + raise dbt_test_result.exception + + if any( + result.status == TestStatus.Error + for result in getattr(dbt_test_result.result, "results", []) + ): + # No need to report the exception, since the dbt process + # will have printed it already + raise ValueError("Quitting due to error in dbt test run") + + print("Loading test results from Athena") test_categories = get_test_categories_from_athena( 
run_results_filepath, manifest_filepath, townships ) - print(f"Saving test results to the cache at {test_cache_path}") - save_test_categories_to_file(test_categories, test_cache_path) + + new_test_cache_path = get_test_cache_path( + run_results_filepath, manifest_filepath, townships + ) + print(f"Saving test results to the cache at {new_test_cache_path}") + save_test_categories_to_file(test_categories, new_test_cache_path) print("Generating the output workbook") # It's important to use a write-only workbook here because otherwise @@ -774,7 +799,7 @@ def main() -> None: pathlib.Path(output_dir).mkdir(exist_ok=True) workbook_filepath = os.path.join( - output_dir, f"qc_test_failures_{date_today}.xlsx" + output_dir, f"iasworld_test_failures_{date_today}.xlsx" ) workbook.save(workbook_filepath) print(f"Output workbook saved to {workbook_filepath}") @@ -873,8 +898,8 @@ def create( run_vars = get_key_from_run_results("args", run_results_filepath)[ "vars" ] - var_year_start = run_vars.get("test_qc_year_start") - var_year_end = run_vars.get("test_qc_year_start") + var_year_start = run_vars.get("data_test_iasworld_year_start") + var_year_end = run_vars.get("data_test_iasworld_year_end") # If dbt vars weren't set on the command line, the defaults won't exist # in run_results.json, so we have to parse them from the dbt project @@ -883,9 +908,12 @@ def create( with open("dbt_project.yml") as project_fobj: project = yaml.safe_load(project_fobj) var_year_start = ( - var_year_start or project["vars"]["test_qc_year_start"] + var_year_start + or project["vars"]["data_test_iasworld_year_start"] + ) + var_year_end = ( + var_year_end or project["vars"]["data_test_iasworld_year_end"] ) - var_year_end = var_year_end or project["vars"]["test_qc_year_end"] return cls( run_id=run_id, @@ -1087,9 +1115,15 @@ def get_test_cache_path( manifest_filepath: str, townships: typing.Tuple[str], ) -> str: - """Return the path to the cache where test results are stored. 
- The `run_results_filepath` and `manifest_filepath` are used to generated - a hash key that uniquely defines the cache key for a given test run.""" + """Return the path to the cache where test results are stored, or an + empty string if no cache exists yet. The `run_results_filepath` and + `manifest_filepath` are used to generated a hash key that uniquely defines + the cache key for a given test run.""" + if not os.path.isfile(run_results_filepath) or not os.path.isfile( + manifest_filepath + ): + return "" + with open(run_results_filepath, "rb") as run_results_file: run_results_hash = hashlib.md5(run_results_file.read()).hexdigest() diff --git a/dbt/selectors.yml b/dbt/selectors.yml index aaf5fddcc..c9e469b0a 100644 --- a/dbt/selectors.yml +++ b/dbt/selectors.yml @@ -1,10 +1,10 @@ selectors: - - name: qc_tests - description: Selector for running QC tests on iasWorld tables + - name: select_data_test_iasworld + description: Selector for running data tests on iasWorld tables definition: union: - method: tag - value: "test_qc_*" + value: data_test_iasworld # Only run tests that exclusively reference selected nodes. Useful # for avoiding an edge case where a test whose base model is not # selected can run because it has an argument that references a model @@ -12,4 +12,4 @@ selectors: indirect_selection: cautious - exclude: - method: tag - value: test_qc_exclude_from_workbook + value: data_test_iasworld_exclude_from_workbook