Skip to content

Commit

Permalink
Add VRS data to v4 variant data pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
phildarnowsky-broad committed Jun 24, 2024
1 parent 4172526 commit 747900b
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 5 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .annotate_variants import annotate_variants, annotate_caids
from .annotate_variants import annotate_variants, annotate_caids, annotate_vrs_ids
from .transcript_consequence.annotate_transcript_consequences import annotate_transcript_consequences
from .variant_id import variant_id, variant_ids, compressed_variant_id

Expand All @@ -9,4 +9,5 @@
"variant_id",
"variant_ids",
"compressed_variant_id",
"annotate_vrs_ids",
]
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,31 @@ def annotate_caids(variants_path, caids_path=None):
ds = ds.annotate(caid=caids[ds.key].caid)

return ds


def annotate_vrs_ids(variants_path, exome_variants_path, genome_variants_path):
ds = hl.read_table(variants_path)
exomes = hl.read_table(exome_variants_path)
genomes = hl.read_table(genome_variants_path)
exome_vrs = exomes.select(vrs=exomes.info.vrs)
genome_vrs = genomes.select(vrs=genomes.info.vrs)
vrs = exome_vrs.union(genome_vrs)
vrs = vrs.group_by(vrs.locus, vrs.alleles).aggregate(vrs=hl.agg.collect(vrs.vrs)[0])
vrs = vrs.transmute(
vrs=hl.struct(
ref=hl.struct(
allele_id=vrs.vrs.VRS_Allele_IDs[0],
start=vrs.vrs.VRS_Starts[0],
end=vrs.vrs.VRS_Ends[0],
state=vrs.vrs.VRS_States[0],
),
alt=hl.struct(
allele_id=vrs.vrs.VRS_Allele_IDs[1],
start=vrs.vrs.VRS_Starts[1],
end=vrs.vrs.VRS_Ends[1],
state=vrs.vrs.VRS_States[1],
),
)
)
ds = ds.join(vrs)
return ds
2 changes: 1 addition & 1 deletion data-pipeline/src/data_pipeline/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def run(self, force=False):
elapsed = stop - start
logger.info("Finished %s in %dm%02ds", self._name, elapsed // 60, elapsed % 60)
else:
logger.info("Skipping %s", self._name)
logger.info(f"Skipping {self._name}")


@attr.define
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,11 @@ def add_liftover_document_id(ds):
"document_id",
"variant_id",
"rsids",
# "caid",
"caid",
"locus",
"transcript_consequences.gene_id",
"transcript_consequences.transcript_id",
"vrs.alt.allele_id",
],
"id_field": "document_id",
"num_shards": 48,
Expand Down
17 changes: 15 additions & 2 deletions data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
annotate_variants,
annotate_transcript_consequences,
annotate_caids,
annotate_vrs_ids,
)

RUN = True
Expand Down Expand Up @@ -90,11 +91,22 @@
},
)

pipeline.add_task(
name="annotate_vrs_ids",
task_function=annotate_vrs_ids,
output_path=f"{output_sub_dir}/gnomad_v4_variants_annotated_4.ht",
inputs={
"variants_path": pipeline.get_task("annotate_gnomad_v4_caids"),
"exome_variants_path": "gs://gcp-public-data--gnomad/release/4.1/ht/exomes/gnomad.exomes.v4.1.sites.ht",
"genome_variants_path": "gs://gcp-public-data--gnomad/release/4.1/ht/genomes/gnomad.genomes.v4.1.sites.ht",
},
)

###############################################
# Outputs
###############################################

pipeline.set_outputs({"variants": "annotate_gnomad_v4_caids"})
pipeline.set_outputs({"variants": "annotate_vrs_ids"})

###############################################
# Run
Expand All @@ -106,12 +118,13 @@

write_schemas(
[pipeline],
os.path.join("/home/msolomon", "schemas"),
os.path.expanduser("~/schemas"),
task_names=[
"prepare_gnomad_v4_variants",
"annotate_gnomad_v4_variants",
"annotate_gnomad_v4_transcript_consequences",
"annotate_gnomad_v4_caids",
"annotate_vrs_ids",
],
)
# copy locally using:
Expand Down

0 comments on commit 747900b

Please sign in to comment.