Skip to content

Commit

Permalink
feat(data-pipelines): add steps to prepare v4 variants ht for public …
Browse files Browse the repository at this point in the history
…release
  • Loading branch information
rileyhgrant committed Aug 28, 2024
1 parent 597f61d commit 6f0e887
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def annotate_transcript_consequences(variants_path, transcripts_path, mane_trans

if mane_transcripts_path:
mane_transcripts = hl.read_table(mane_transcripts_path)
mane_transcripts_version = hl.eval(mane_transcripts.globals.version)
mane_select_transcripts_version = hl.eval(mane_transcripts.globals.version)

mane_transcripts = hl.dict([(row.gene_id, row.drop("gene_id")) for row in mane_transcripts.collect()])

Expand Down Expand Up @@ -138,7 +138,7 @@ def annotate_transcript_consequences(variants_path, transcripts_path, mane_trans
)

ds = ds.annotate(transcript_consequences=transcript_consequences).drop("vep")
ds = ds.annotate_globals(mane_transcripts_version=mane_transcripts_version)
ds = ds.annotate_globals(mane_select_version=mane_select_transcripts_version)

else:
transcript_consequences = hl.sorted(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,17 @@ def freq_joint(ds, subset=None, pop=None, sex=None, raw=False):
return ds


def prepare_table_for_release(variants_table_path: str):
    """Trim the browser variants Hail Table down to its public-release form.

    Reads the table at ``variants_table_path``, drops the ``faf95`` / ``faf99``
    fields from the ``exomes`` and ``genomes`` structs and the
    ``faf99_joint`` / ``faf95_joint`` fields from the ``joint`` struct, then
    replaces the table globals with a single ``mane_select_version`` field.

    Args:
        variants_table_path: Path of the Hail Table to read.

    Returns:
        The trimmed Hail Table (not written or persisted here).
    """
    ds = hl.read_table(variants_table_path)
    ds = ds.annotate(
        exomes=ds.exomes.drop("faf95", "faf99"),
        genomes=ds.genomes.drop("faf95", "faf99"),
        joint=ds.joint.drop("faf99_joint", "faf95_joint"),
    )

    # The transcript-consequence annotation step now stores the MANE version
    # global as `mane_select_version`; tables written before that rename still
    # carry `mane_transcripts_version`. Accept either, preferring the new name,
    # so this step works on both freshly built and previously built inputs.
    # If neither is present the lookup below fails, as the original did.
    version_field = (
        "mane_select_version"
        if "mane_select_version" in ds.globals.dtype.fields
        else "mane_transcripts_version"
    )
    ds = ds.select_globals(mane_select_version=ds.globals[version_field])
    return ds


def prepare_gnomad_v4_variants(exome_variants_path: str, genome_variants_path: str, variants_joint_frequency_path: str):
exome_variants = prepare_gnomad_v4_variants_helper(exome_variants_path, "exome")
genome_variants = prepare_gnomad_v4_variants_helper(genome_variants_path, "genome")
Expand Down
12 changes: 12 additions & 0 deletions data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from data_pipeline.datasets.gnomad_v4.gnomad_v4_variants import (
prepare_gnomad_v4_variants,
prepare_table_for_release,
)


Expand Down Expand Up @@ -102,6 +103,17 @@
},
)

# Registers the final release-preparation step: prepare_table_for_release is
# run on the output of the "annotate_vrs_ids" task, removing several
# duplicated values to prepare the table for release to the general public.
# NOTE(review): the original comment also says "constraint" is removed here;
# confirm that against prepare_table_for_release.
# Naming scheme follows the methods naming scheme for consistency.
pipeline.add_task(
name="prepare_table_for_release",
task_function=prepare_table_for_release,
output_path=f"{output_sub_dir}/gnomad.browser.v4.1.sites.ht",
inputs={
"variants_table_path": pipeline.get_task("annotate_vrs_ids"),
},
)

###############################################
# Outputs
###############################################
Expand Down

0 comments on commit 6f0e887

Please sign in to comment.