diff --git a/data-pipeline/src/data_pipeline/data_types/variant/__init__.py b/data-pipeline/src/data_pipeline/data_types/variant/__init__.py index 1e2584c65..ce421adf8 100644 --- a/data-pipeline/src/data_pipeline/data_types/variant/__init__.py +++ b/data-pipeline/src/data_pipeline/data_types/variant/__init__.py @@ -1,4 +1,4 @@ -from .annotate_variants import annotate_variants, annotate_caids +from .annotate_variants import annotate_variants, annotate_caids, annotate_vrs_ids from .transcript_consequence.annotate_transcript_consequences import annotate_transcript_consequences from .variant_id import variant_id, variant_ids, compressed_variant_id @@ -9,4 +9,5 @@ "variant_id", "variant_ids", "compressed_variant_id", + "annotate_vrs_ids", ] diff --git a/data-pipeline/src/data_pipeline/data_types/variant/annotate_variants.py b/data-pipeline/src/data_pipeline/data_types/variant/annotate_variants.py index c53b4cb7b..e91425a07 100644 --- a/data-pipeline/src/data_pipeline/data_types/variant/annotate_variants.py +++ b/data-pipeline/src/data_pipeline/data_types/variant/annotate_variants.py @@ -23,3 +23,31 @@ def annotate_caids(variants_path, caids_path=None): ds = ds.annotate(caid=caids[ds.key].caid) return ds + + +def annotate_vrs_ids(variants_path, exome_variants_path, genome_variants_path): + ds = hl.read_table(variants_path) + exomes = hl.read_table(exome_variants_path) + genomes = hl.read_table(genome_variants_path) + exome_vrs = exomes.select(vrs=exomes.info.vrs) + genome_vrs = genomes.select(vrs=genomes.info.vrs) + vrs = exome_vrs.union(genome_vrs) + vrs = vrs.group_by(vrs.locus, vrs.alleles).aggregate(vrs=hl.agg.collect(vrs.vrs)[0]) + vrs = vrs.transmute( + vrs=hl.struct( + ref=hl.struct( + allele_id=vrs.vrs.VRS_Allele_IDs[0], + start=vrs.vrs.VRS_Starts[0], + end=vrs.vrs.VRS_Ends[0], + state=vrs.vrs.VRS_States[0], + ), + alt=hl.struct( + allele_id=vrs.vrs.VRS_Allele_IDs[1], + start=vrs.vrs.VRS_Starts[1], + end=vrs.vrs.VRS_Ends[1], + state=vrs.vrs.VRS_States[1], + ), + ) + ) + ds = ds.join(vrs) + return ds diff --git a/data-pipeline/src/data_pipeline/pipeline.py b/data-pipeline/src/data_pipeline/pipeline.py index 8d58d0de4..1e42b7d78 100644 --- a/data-pipeline/src/data_pipeline/pipeline.py +++ b/data-pipeline/src/data_pipeline/pipeline.py @@ -102,7 +102,7 @@ def run(self, force=False): elapsed = stop - start logger.info("Finished %s in %dm%02ds", self._name, elapsed // 60, elapsed % 60) else: - logger.info("Skipping %s", self._name) + logger.info(f"Skipping {self._name}") @attr.define diff --git a/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py b/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py index 709a788a0..9ddfa8b7b 100644 --- a/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py +++ b/data-pipeline/src/data_pipeline/pipelines/export_to_elasticsearch.py @@ -128,10 +128,11 @@ def add_liftover_document_id(ds): "document_id", "variant_id", "rsids", - # "caid", + "caid", "locus", "transcript_consequences.gene_id", "transcript_consequences.transcript_id", + "vrs.alt.allele_id", ], "id_field": "document_id", "num_shards": 48, diff --git a/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py b/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py index b1becc134..4b484e44d 100644 --- a/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py +++ b/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py @@ -28,6 +28,7 @@ annotate_variants, annotate_transcript_consequences, annotate_caids, + annotate_vrs_ids, ) RUN = True @@ -90,11 +91,22 @@ }, ) +pipeline.add_task( + name="annotate_vrs_ids", + task_function=annotate_vrs_ids, + output_path=f"{output_sub_dir}/gnomad_v4_variants_annotated_4.ht", + inputs={ + "variants_path": pipeline.get_task("annotate_gnomad_v4_caids"), + "exome_variants_path": "gs://gcp-public-data--gnomad/release/4.1/ht/exomes/gnomad.exomes.v4.1.sites.ht", + "genome_variants_path": "gs://gcp-public-data--gnomad/release/4.1/ht/genomes/gnomad.genomes.v4.1.sites.ht", + }, +) + ############################################### # Outputs ############################################### -pipeline.set_outputs({"variants": "annotate_gnomad_v4_caids"}) +pipeline.set_outputs({"variants": "annotate_vrs_ids"}) ############################################### # Run @@ -106,12 +118,13 @@ write_schemas( [pipeline], - os.path.join("/home/msolomon", "schemas"), + os.path.expanduser("~/schemas"), task_names=[ "prepare_gnomad_v4_variants", "annotate_gnomad_v4_variants", "annotate_gnomad_v4_transcript_consequences", "annotate_gnomad_v4_caids", + "annotate_vrs_ids", ], ) # copy locally using: