From ec2db6084ead1b2907174f7556cb88f4a76ff79c Mon Sep 17 00:00:00 2001 From: Matthew Solomonson Date: Tue, 19 Sep 2023 14:30:24 -0400 Subject: [PATCH] Add output validation/schemas for step 2 --- .../gnomad_v4_exome_coverage.ht.schema | 21 + .../gnomad_v4_genome_coverage.ht.schema | 21 + ...ad_v4_exome_variants_annotated_1.ht.schema | 362 ++++++++++++++++++ .../gnomad_v4_exome_variants_base.ht.schema | 334 ++++++++++++++++ .../gnomad_v4/types/prepare_variants_step2.py | 29 ++ data-pipeline/src/data_pipeline/pipeline.py | 1 + .../pipelines/gnomad_v4_coverage.py | 22 +- .../pipelines/gnomad_v4_variants.py | 38 +- data-pipeline/tests/v4/test_inputs.py | 9 + 9 files changed, 808 insertions(+), 29 deletions(-) create mode 100644 data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_variants/exome_coverage_path/gnomad_v4_exome_coverage.ht.schema create mode 100644 data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_variants/genome_coverage_path/gnomad_v4_genome_coverage.ht.schema create mode 100644 data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_variants/output/gnomad_v4_exome_variants_annotated_1.ht.schema create mode 100644 data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_variants/variants_path/gnomad_v4_exome_variants_base.ht.schema create mode 100644 data-pipeline/src/data_pipeline/datasets/gnomad_v4/types/prepare_variants_step2.py diff --git a/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_variants/exome_coverage_path/gnomad_v4_exome_coverage.ht.schema b/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_variants/exome_coverage_path/gnomad_v4_exome_coverage.ht.schema new file mode 100644 index 000000000..cc7c55027 --- /dev/null +++ b/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_variants/exome_coverage_path/gnomad_v4_exome_coverage.ht.schema @@ -0,0 +1,21 @@ +---------------------------------------- +Global fields: + None 
+---------------------------------------- +Row fields: + 'locus': locus + 'xpos': int64 + 'mean': float64 + 'median': int32 + 'over_1': float32 + 'over_5': float32 + 'over_10': float32 + 'over_15': float32 + 'over_20': float32 + 'over_25': float32 + 'over_30': float32 + 'over_50': float32 + 'over_100': float32 +---------------------------------------- +Key: ['locus'] +---------------------------------------- \ No newline at end of file diff --git a/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_variants/genome_coverage_path/gnomad_v4_genome_coverage.ht.schema b/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_variants/genome_coverage_path/gnomad_v4_genome_coverage.ht.schema new file mode 100644 index 000000000..cc7c55027 --- /dev/null +++ b/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_variants/genome_coverage_path/gnomad_v4_genome_coverage.ht.schema @@ -0,0 +1,21 @@ +---------------------------------------- +Global fields: + None +---------------------------------------- +Row fields: + 'locus': locus + 'xpos': int64 + 'mean': float64 + 'median': int32 + 'over_1': float32 + 'over_5': float32 + 'over_10': float32 + 'over_15': float32 + 'over_20': float32 + 'over_25': float32 + 'over_30': float32 + 'over_50': float32 + 'over_100': float32 +---------------------------------------- +Key: ['locus'] +---------------------------------------- \ No newline at end of file diff --git a/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_variants/output/gnomad_v4_exome_variants_annotated_1.ht.schema b/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_variants/output/gnomad_v4_exome_variants_annotated_1.ht.schema new file mode 100644 index 000000000..2d79bcce0 --- /dev/null +++ b/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_variants/output/gnomad_v4_exome_variants_annotated_1.ht.schema @@ -0,0 +1,362 @@ +---------------------------------------- +Global fields: + 
'freq_meta': array> + 'freq_index_dict': dict + 'faf_meta': array> + 'faf_index_dict': dict + 'freq_sample_count': array + 'filtering_model': struct { + model_name: str, + score_name: str, + feature_medians: dict, + variants_by_strata: dict, + features_importance: dict, + features: array, + test_results: array, + rf_snv_cutoff: struct { + bin: float64, + min_score: float64 + }, + rf_indel_cutoff: struct { + bin: float64, + min_score: float64 + }, + inbreeding_cutoff: float64, + model_id: str + } + 'tool_versions': struct { + dbsnp_version: str, + cadd_version: str, + revel_version: str, + splicaai_version: str, + primateai_version: str, + pangolin_version: str, + vrs_version: str + } + 'vep_globals': struct { + vep_version: str, + vep_csq_header: str, + vep_help: str, + vep_config: str + } + 'age_distribution': struct { + bin_edges: array, + bin_freq: array, + n_smaller: int32, + n_larger: int32 + } + 'age_index_dict': dict + 'age_meta': array> + 'grpmax_index_dict': dict + 'grpmax_meta': array> + 'README': dict + 'gnomad_qc_repo': str + 'gnomad_methods_repo': str +---------------------------------------- +Row fields: + 'locus': locus + 'alleles': array + 'grpmax': array + 'rsids': set + 'vep': struct { + allele_string: str, + end: int32, + id: str, + input: str, + intergenic_consequences: array, + impact: str, + variant_allele: str + }>, + most_severe_consequence: str, + motif_feature_consequences: array, + high_inf_pos: str, + impact: str, + motif_feature_id: str, + motif_name: str, + motif_pos: int32, + motif_score_change: float64, + transcription_factors: array, + strand: int32, + variant_allele: str + }>, + regulatory_feature_consequences: array, + impact: str, + regulatory_feature_id: str, + variant_allele: str + }>, + seq_region_name: str, + start: int32, + strand: int32, + transcript_consequences: array, + distance: int32, + domains: array, + exon: str, + flags: str, + gene_id: str, + gene_pheno: int32, + gene_symbol: str, + gene_symbol_source: str, + 
hgnc_id: str, + hgvsc: str, + hgvsp: str, + hgvs_offset: int32, + impact: str, + intron: str, + lof: str, + lof_flags: str, + lof_filter: str, + lof_info: str, + mane_select: str, + mane_plus_clinical: str, + mirna: array, + polyphen_prediction: str, + polyphen_score: float64, + protein_end: int32, + protein_start: int32, + protein_id: str, + sift_prediction: str, + sift_score: float64, + source: str, + strand: int32, + transcript_id: str, + tsl: int32, + uniprot_isoform: array, + variant_allele: str + }>, + variant_class: str + } + 'rf': struct { + rf_positive_label: bool, + rf_negative_label: bool, + rf_label: str, + rf_train: bool, + rf_tp_probability: float64 + } + 'in_silico_predictors': struct { + cadd: struct { + phred: float32, + raw_score: float32, + has_duplicate: bool + }, + revel: struct { + revel_score: float64, + has_duplicate: bool + }, + splice_ai: struct { + splice_ai_score: float32, + splice_consequence: str, + has_duplicate: bool + }, + pangolin: struct { + pangolin_score: float64 + } + } + 'variant_id': str + 'colocated_variants': struct { + all: array, + non_ukb: array + } + 'gnomad': struct { + freq: struct { + all: struct { + ac: int32, + ac_raw: int32, + an: int32, + hemizygote_count: int32, + homozygote_count: int32, + populations: array + }, + non_ukb: struct { + ac: int32, + ac_raw: int32, + an: int32, + hemizygote_count: int32, + homozygote_count: int32, + populations: array + } + }, + faf95: struct { + popmax: float64, + popmax_population: str + }, + faf99: struct { + popmax: float64, + popmax_population: str + }, + age_distribution: struct { + het: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }>, + hom: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }> + }, + filters: set, + quality_metrics: struct { + allele_balance: struct { + alt_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_raw: struct { + bin_edges: array, + bin_freq: array, + 
n_smaller: int64, + n_larger: int64 + } + }, + genotype_depth: struct { + all_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + all_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + genotype_quality: struct { + all_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + all_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + site_quality_metrics: array + } + } + 'subsets': set + 'flags': set + 'coverage': struct { + exome: struct { + mean: float64, + median: int32, + over_1: float32, + over_5: float32, + over_10: float32, + over_15: float32, + over_20: float32, + over_25: float32, + over_30: float32, + over_50: float32, + over_100: float32 + }, + genome: struct { + mean: float64, + median: int32, + over_1: float32, + over_5: float32, + over_10: float32, + over_15: float32, + over_20: float32, + over_25: float32, + over_30: float32, + over_50: float32, + over_100: float32 + } + } +---------------------------------------- +Key: ['locus', 'alleles'] +---------------------------------------- \ No newline at end of file diff --git a/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_variants/variants_path/gnomad_v4_exome_variants_base.ht.schema b/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_variants/variants_path/gnomad_v4_exome_variants_base.ht.schema new file mode 100644 index 000000000..89fba5a59 --- /dev/null +++ 
b/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_variants/variants_path/gnomad_v4_exome_variants_base.ht.schema @@ -0,0 +1,334 @@ +---------------------------------------- +Global fields: + 'freq_meta': array> + 'freq_index_dict': dict + 'faf_meta': array> + 'faf_index_dict': dict + 'freq_sample_count': array + 'filtering_model': struct { + model_name: str, + score_name: str, + feature_medians: dict, + variants_by_strata: dict, + features_importance: dict, + features: array, + test_results: array, + rf_snv_cutoff: struct { + bin: float64, + min_score: float64 + }, + rf_indel_cutoff: struct { + bin: float64, + min_score: float64 + }, + inbreeding_cutoff: float64, + model_id: str + } + 'tool_versions': struct { + dbsnp_version: str, + cadd_version: str, + revel_version: str, + splicaai_version: str, + primateai_version: str, + pangolin_version: str, + vrs_version: str + } + 'vep_globals': struct { + vep_version: str, + vep_csq_header: str, + vep_help: str, + vep_config: str + } + 'age_distribution': struct { + bin_edges: array, + bin_freq: array, + n_smaller: int32, + n_larger: int32 + } + 'age_index_dict': dict + 'age_meta': array> + 'grpmax_index_dict': dict + 'grpmax_meta': array> + 'README': dict + 'gnomad_qc_repo': str + 'gnomad_methods_repo': str +---------------------------------------- +Row fields: + 'locus': locus + 'alleles': array + 'grpmax': array + 'rsids': set + 'vep': struct { + allele_string: str, + end: int32, + id: str, + input: str, + intergenic_consequences: array, + impact: str, + variant_allele: str + }>, + most_severe_consequence: str, + motif_feature_consequences: array, + high_inf_pos: str, + impact: str, + motif_feature_id: str, + motif_name: str, + motif_pos: int32, + motif_score_change: float64, + transcription_factors: array, + strand: int32, + variant_allele: str + }>, + regulatory_feature_consequences: array, + impact: str, + regulatory_feature_id: str, + variant_allele: str + }>, + seq_region_name: str, + start: 
int32, + strand: int32, + transcript_consequences: array, + distance: int32, + domains: array, + exon: str, + flags: str, + gene_id: str, + gene_pheno: int32, + gene_symbol: str, + gene_symbol_source: str, + hgnc_id: str, + hgvsc: str, + hgvsp: str, + hgvs_offset: int32, + impact: str, + intron: str, + lof: str, + lof_flags: str, + lof_filter: str, + lof_info: str, + mane_select: str, + mane_plus_clinical: str, + mirna: array, + polyphen_prediction: str, + polyphen_score: float64, + protein_end: int32, + protein_start: int32, + protein_id: str, + sift_prediction: str, + sift_score: float64, + source: str, + strand: int32, + transcript_id: str, + tsl: int32, + uniprot_isoform: array, + variant_allele: str + }>, + variant_class: str + } + 'rf': struct { + rf_positive_label: bool, + rf_negative_label: bool, + rf_label: str, + rf_train: bool, + rf_tp_probability: float64 + } + 'in_silico_predictors': struct { + cadd: struct { + phred: float32, + raw_score: float32, + has_duplicate: bool + }, + revel: struct { + revel_score: float64, + has_duplicate: bool + }, + splice_ai: struct { + splice_ai_score: float32, + splice_consequence: str, + has_duplicate: bool + }, + pangolin: struct { + pangolin_score: float64 + } + } + 'variant_id': str + 'colocated_variants': struct { + all: array, + non_ukb: array + } + 'gnomad': struct { + freq: struct { + all: struct { + ac: int32, + ac_raw: int32, + an: int32, + hemizygote_count: int32, + homozygote_count: int32, + populations: array + }, + non_ukb: struct { + ac: int32, + ac_raw: int32, + an: int32, + hemizygote_count: int32, + homozygote_count: int32, + populations: array + } + }, + faf95: struct { + popmax: float64, + popmax_population: str + }, + faf99: struct { + popmax: float64, + popmax_population: str + }, + age_distribution: struct { + het: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }>, + hom: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }> + }, + filters: set, + quality_metrics: 
struct { + allele_balance: struct { + alt_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + genotype_depth: struct { + all_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + all_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + genotype_quality: struct { + all_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + all_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + site_quality_metrics: array + } + } + 'subsets': set + 'flags': set +---------------------------------------- +Key: ['locus', 'alleles'] +---------------------------------------- \ No newline at end of file diff --git a/data-pipeline/src/data_pipeline/datasets/gnomad_v4/types/prepare_variants_step2.py b/data-pipeline/src/data_pipeline/datasets/gnomad_v4/types/prepare_variants_step2.py new file mode 100644 index 000000000..144da0965 --- /dev/null +++ b/data-pipeline/src/data_pipeline/datasets/gnomad_v4/types/prepare_variants_step2.py @@ -0,0 +1,29 @@ +import attr +from data_pipeline.datasets.gnomad_v4.types.prepare_variants_step1 import Variant as InputVariant + + +@attr.define +class CoverageDetail: + mean: float + median: int + over_1: float + over_5: float + over_10: float + over_15: float + over_20: float + over_25: float + over_30: float + over_50: float + 
over_100: float + + +@attr.define +class Coverage: + exome: CoverageDetail + genome: CoverageDetail + + +@attr.define +class Variant(InputVariant): + coverage: Coverage + # caids go here too diff --git a/data-pipeline/src/data_pipeline/pipeline.py b/data-pipeline/src/data_pipeline/pipeline.py index 5db3a362f..b395e3fc4 100644 --- a/data-pipeline/src/data_pipeline/pipeline.py +++ b/data-pipeline/src/data_pipeline/pipeline.py @@ -128,6 +128,7 @@ def get_inputs(self): if isinstance(v, (Task, DownloadTask)): paths.update({k: v.get_output_path()}) else: + logger.info(v) new_path = os.path.join(config.data_paths.root, v) paths.update({k: os.path.join(config.data_paths.root, v)}) diff --git a/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_coverage.py b/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_coverage.py index 412c50cea..cfccb0c2d 100644 --- a/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_coverage.py +++ b/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_coverage.py @@ -3,28 +3,28 @@ from data_pipeline.data_types.coverage import prepare_coverage -pipeline = Pipeline() +pipeline = Pipeline(name="gnomad_v4_coverage") pipeline.add_task( - "prepare_gnomad_v4_exome_coverage", - prepare_coverage, - "/gnomad_v4/gnomad_v4_exome_coverage.ht", + name="prepare_gnomad_v4_exome_coverage", + task_function=prepare_coverage, + output_path="/gnomad_v4/gnomad_v4_exome_coverage.ht", # Using v3 coverage as mock for now - { + inputs={ "coverage_path": "gs://gcp-public-data--gnomad/release/3.0.1/coverage/genomes/gnomad.genomes.r3.0.1.coverage.ht", - "filter_intervals": ["chr1:10030-10150"], }, + params={"filter_intervals": ["chr1:10030-10150"]}, ) pipeline.add_task( - "prepare_gnomad_v4_genome_coverage", - prepare_coverage, - "/gnomad_v4/gnomad_v4_genome_coverage.ht", + name="prepare_gnomad_v4_genome_coverage", + task_function=prepare_coverage, + output_path="/gnomad_v4/gnomad_v4_genome_coverage.ht", # Using v3 coverage as mock for now - { + inputs={ "coverage_path": 
"gs://gcp-public-data--gnomad/release/3.0.1/coverage/genomes/gnomad.genomes.r3.0.1.coverage.ht", - "filter_intervals": ["chr1:10030-10150"], }, + params={"filter_intervals": ["chr1:10030-10150"]}, ) ############################################### diff --git a/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py b/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py index 6da147d37..573bc6550 100644 --- a/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py +++ b/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py @@ -3,9 +3,10 @@ from data_pipeline.config import config from data_pipeline.datasets.gnomad_v4.gnomad_v4_variants import prepare_gnomad_v4_variants -# from data_pipeline.data_types.variant import annotate_variants, annotate_transcript_consequences +from data_pipeline.data_types.variant import annotate_variants, annotate_transcript_consequences # from data_pipeline.pipelines.gnomad_v4_coverage import pipeline as coverage_pipeline + # from data_pipeline.pipelines.genes import pipeline as genes_pipeline @@ -22,26 +23,27 @@ ) # pipeline.add_task( -# "prepare_gnomad_v4_genome_variants", -# prepare_gnomad_v4_variants, -# "/gnomad_v4/gnomad_v4_genome_variants_base.ht", -# { -# "path": "gs://gnomad-matt-data-pipeline/external_sources/2023-09-07-exome-variants-v4-mock/mock_v4_release.ht", -# "type": "genome", +# name="prepare_gnomad_v4_genome_variants", +# task_function=prepare_gnomad_v4_variants, +# output_path="/gnomad_v4/gnomad_v4_genome_variants_base.ht", +# inputs={ +# "input_path": "external_datasets/mock_v4_release.ht", # }, # ) -# pipeline.add_task( -# "annotate_gnomad_v4_variants", -# annotate_variants, -# "/gnomad_v4/gnomad_v4_variants_annotated_1.ht", -# { -# "variants_path": pipeline.get_task("prepare_gnomad_v4_exome_variants"), -# "exome_coverage_path": coverage_pipeline.get_output("exome_coverage"), -# "genome_coverage_path": coverage_pipeline.get_output("genome_coverage"), -# # "caids_path": 
"gs://gnomad-browser-data-pipeline/caids/gnomad_v4_caids.ht", -# }, -# ) +pipeline.add_task( + name="annotate_gnomad_v4_exome_variants", + task_function=annotate_variants, + output_path="/gnomad_v4/gnomad_v4_exome_variants_annotated_1.ht", + inputs={ # plain-string inputs are joined onto config.data_paths.root by get_inputs() + "variants_path": pipeline.get_task("prepare_gnomad_v4_exome_variants"), + "exome_coverage_path": "tiny_datasets/gnomad_v4_exome_coverage.ht", + "genome_coverage_path": "tiny_datasets/gnomad_v4_genome_coverage.ht", + # "exome_coverage_path": coverage_pipeline.get_output("exome_coverage"), + # "genome_coverage_path": coverage_pipeline.get_output("genome_coverage"), + # "caids_path": "gs://gnomad-browser-data-pipeline/caids/gnomad_v4_caids.ht", + }, +) # pipeline.add_task( # "annotate_gnomad_v4_transcript_consequences", diff --git a/data-pipeline/tests/v4/test_inputs.py b/data-pipeline/tests/v4/test_inputs.py index e0f40fd25..64987f32c 100644 --- a/data-pipeline/tests/v4/test_inputs.py +++ b/data-pipeline/tests/v4/test_inputs.py @@ -11,6 +11,7 @@ from data_pipeline.datasets.gnomad_v4.types.initial_globals import Globals from data_pipeline.datasets.gnomad_v4.types.initial_variant import InitialVariant from data_pipeline.datasets.gnomad_v4.types.prepare_variants_step1 import Variant as Step1Variant +from data_pipeline.datasets.gnomad_v4.types.prepare_variants_step2 import Variant as Step2Variant step1_task = gnomad_v4_variant_pipeline.get_task("prepare_gnomad_v4_exome_variants") @@ -53,3 +54,11 @@ def test_validate_step1_output(): # ht = ht.sample(0.1, seed=1234) result = ht_to_json(ht) [structure_attrs_fromdict(variant, Step1Variant) for variant in result] + + +def test_validate_step2_output(): + output_path = gnomad_v4_variant_pipeline.get_task("annotate_gnomad_v4_exome_variants").get_output_path() + ht = hl.read_table(output_path) + # ht = ht.sample(0.1, seed=1234) + result = ht_to_json(ht) + [structure_attrs_fromdict(variant, Step2Variant) for variant in result]