From d6221b09989a577047e00d9d4672a02511d5e4a1 Mon Sep 17 00:00:00 2001 From: Matthew Solomonson Date: Mon, 2 Oct 2023 11:14:00 -0400 Subject: [PATCH] Move input validation out of tests, into pipeline code itself --- .../pipelines/gnomad_v4_validation.py} | 41 ++++++++----------- .../pipelines/gnomad_v4_variants.py | 15 +++++++ 2 files changed, 33 insertions(+), 23 deletions(-) rename data-pipeline/{tests/v4/test_inputs.py => src/data_pipeline/pipelines/gnomad_v4_validation.py} (58%) diff --git a/data-pipeline/tests/v4/test_inputs.py b/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_validation.py similarity index 58% rename from data-pipeline/tests/v4/test_inputs.py rename to data-pipeline/src/data_pipeline/pipelines/gnomad_v4_validation.py index f4922169a..ba34e0cb0 100644 --- a/data-pipeline/tests/v4/test_inputs.py +++ b/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_validation.py @@ -1,11 +1,10 @@ -import pytest from cattrs import structure, structure_attrs_fromdict import hail as hl import json -from data_pipeline.pipelines.gnomad_v4_variants import ( - pipeline as gnomad_v4_variant_pipeline, -) +from loguru import logger + +from data_pipeline.pipeline import Pipeline from data_pipeline.datasets.gnomad_v4.types.initial_globals import Globals from data_pipeline.datasets.gnomad_v4.types.initial_variant import InitialVariant @@ -13,8 +12,6 @@ from data_pipeline.datasets.gnomad_v4.types.prepare_variants_step2 import Variant as Step2Variant from data_pipeline.datasets.gnomad_v4.types.prepare_variants_step3 import Variant as Step3Variant -step1_task = gnomad_v4_variant_pipeline.get_task("prepare_gnomad_v4_exome_variants") - def ht_to_json(ht: hl.Table, field: str = "row"): if field == "row": @@ -32,45 +29,43 @@ def ht_to_json(ht: hl.Table, field: str = "row"): return objs -@pytest.mark.mock_data -def test_globals_input_validation(): - input_path = gnomad_v4_variant_pipeline.get_task("prepare_gnomad_v4_exome_variants").get_inputs()["input_path"] +def validate_globals_input(pipeline: Pipeline): + input_path = pipeline.get_task("prepare_gnomad_v4_exome_variants").get_inputs()["input_path"] ht = hl.read_table(input_path) result = ht_to_json(ht, "globals")[0] # logger.info(result) structure(result, Globals) + logger.info("Validated prepare_gnomad_v4_exome_variants input globals") -@pytest.mark.mock_data -def test_validate_variant_input(): - input_path = gnomad_v4_variant_pipeline.get_task("prepare_gnomad_v4_exome_variants").get_inputs()["input_path"] +def validate_variant_input(pipeline: Pipeline): + input_path = pipeline.get_task("prepare_gnomad_v4_exome_variants").get_inputs()["input_path"] ht = hl.read_table(input_path) result = ht_to_json(ht) [structure_attrs_fromdict(variant, InitialVariant) for variant in result] + logger.info("Validated prepare_gnomad_v4_exome_variants input variants") -@pytest.mark.mock_data -def test_validate_step1_output(): - output_path = gnomad_v4_variant_pipeline.get_task("prepare_gnomad_v4_exome_variants").get_output_path() +def validate_step1_output(pipeline: Pipeline): + output_path = pipeline.get_task("prepare_gnomad_v4_exome_variants").get_output_path() ht = hl.read_table(output_path) # ht = ht.sample(0.1, seed=1234) result = ht_to_json(ht) [structure_attrs_fromdict(variant, Step1Variant) for variant in result] + logger.info("Validated prepare_gnomad_v4_exome_variants (step 1) output") -@pytest.mark.mock_data -def test_validate_step2_output(): - output_path = gnomad_v4_variant_pipeline.get_task("annotate_gnomad_v4_exome_variants").get_output_path() +def validate_step2_output(pipeline: Pipeline): + output_path = pipeline.get_task("annotate_gnomad_v4_exome_variants").get_output_path() ht = hl.read_table(output_path) result = ht_to_json(ht) [structure_attrs_fromdict(variant, Step2Variant) for variant in result] + logger.info("Validated annotate_gnomad_v4_exome_variants (step 2) output") -@pytest.mark.mock_data -def test_validate_step3_output(): - output_path = gnomad_v4_variant_pipeline.get_task( - "annotate_gnomad_v4_exome_transcript_consequences" - ).get_output_path() +def validate_step3_output(pipeline: Pipeline): + output_path = pipeline.get_task("annotate_gnomad_v4_exome_transcript_consequences").get_output_path() ht = hl.read_table(output_path) result = ht_to_json(ht) [structure_attrs_fromdict(variant, Step3Variant) for variant in result] + logger.info("Validated annotate_gnomad_v4_exome_transcript_consequences (step 3) output") diff --git a/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py b/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py index ce1bf6335..47ae0b386 100644 --- a/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py +++ b/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py @@ -1,4 +1,5 @@ import os +from loguru import logger from data_pipeline.config import PipelineConfig, get_data_environment, DataEnvironment @@ -12,6 +13,13 @@ annotate_variants, annotate_transcript_consequences, ) +from data_pipeline.pipelines.gnomad_v4_validation import ( + validate_globals_input, + validate_step1_output, + validate_step2_output, + validate_step3_output, + validate_variant_input, +) DATA_ENV = os.getenv("DATA_ENV", "mock") @@ -106,3 +114,10 @@ if __name__ == "__main__": run_pipeline(pipeline) + + logger.info("Validating pipeline IO formats") + validate_globals_input(pipeline) + validate_variant_input(pipeline) + validate_step1_output(pipeline) + validate_step2_output(pipeline) + validate_step3_output(pipeline)