updates to ClinVar parsing (#311)
MattWellie authored Aug 15, 2023
1 parent acfe235 commit f9de245
Showing 5 changed files with 58 additions and 31 deletions.
8 changes: 4 additions & 4 deletions helpers/clinvar_conf.toml
@@ -1,6 +1,6 @@
[workflow]
name = 'Annotate_Clinvar'
- scatter_count = 50
+ scatter_count = 25
vcf_size_in_gb = 30
sequencing_type = 'genome'

@@ -11,10 +11,10 @@ default_memory = 'highmem'

[images]
vep = 'australia-southeast1-docker.pkg.dev/cpg-common/images/vep:105.0'
- cpg_workflows = 'australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip:1.1.1'
+ cpg_workflows = 'australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip:latest'

[clinvar]
filter_benign = ['illumina laboratory services; illumina']

[cohorts]
placeholder = "placeholder"
[cohorts.acute-care]
clinvar_filter = ['victorian clinical genetics services, murdoch childrens research institute']
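
A minimal sketch, not part of this commit, of how this config might be consumed: the per-cohort clinvar_filter and the global filter_benign values act as submitter blacklists, looked up in the same spirit as the cohort_config.get('clinvar_filter', []) call in summarise_clinvar_entries.py further down. The file path and the use of Python 3.11's tomllib are assumptions for the example.

import tomllib

with open('helpers/clinvar_conf.toml', 'rb') as handle:
    config = tomllib.load(handle)

# presumed meaning: submitters whose benign classifications should be disregarded
benign_filter = config['clinvar'].get('filter_benign', [])

# per-cohort submitter blacklist, empty when a cohort has no entry
cohort_config = config['cohorts'].get('acute-care', {})
blacklist = cohort_config.get('clinvar_filter', [])
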
37 changes: 26 additions & 11 deletions helpers/report_hunter.py
@@ -6,6 +6,8 @@
generate an index HTML page with links to all reports
"""

+ import logging
+ import sys
from dataclasses import dataclass
from os.path import join
from pathlib import Path
@@ -75,7 +77,7 @@ def get_project_analyses(project: str) -> list[dict]:
}
"""
)
- # validate(project_query)

response: dict[str, Any] = query(project_query, variables={'project': project})
return response['project']['analyses']

@@ -93,21 +95,28 @@ def main():

for analysis in get_project_analyses(cohort):
# only look for HTML reanalysis entries
- if 'reanalysis' not in analysis['output']:
+ if 'reanalysis' not in analysis['output'] or not analysis[
+     'output'
+ ].endswith('html'):
continue

# pull the exome/singleton flags
exome_output = analysis['meta'].get('is_exome', False)
singleton_output = analysis['meta'].get('is_singleton', False)

- # incorporate that into a key when gathering
- all_cohorts[f'{cohort}_{exome_output}_{singleton_output}'] = Report(
-     dataset=cohort,
-     address=analysis['meta']['display_url'],
-     genome_or_exome='Exome' if exome_output else 'Genome',
-     subtype='Singleton' if singleton_output else 'Familial',
-     date=analysis['timestampCompleted'].split('T')[0],
- )
+ try:
+     # incorporate that into a key when gathering
+     all_cohorts[f'{cohort}_{exome_output}_{singleton_output}'] = Report(
+         dataset=cohort,
+         address=analysis['meta']['display_url'],
+         genome_or_exome='Exome' if exome_output else 'Genome',
+         subtype='Singleton' if singleton_output else 'Familial',
+         date=analysis['timestampCompleted'].split('T')[0],
+     )
+ except KeyError:
+     logging.info(
+         'Failed to construct a Report entry - is this a report HTML entry?'
+     )
+     logging.info(analysis)

# smoosh into a list for the report context - all reports sortable by date
template_context = {'reports': list(all_cohorts.values())}
@@ -130,4 +139,10 @@ def main():


if __name__ == '__main__':
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s %(levelname)s %(module)s:%(lineno)d - %(message)s',
+     datefmt='%Y-%m-%d %H:%M:%S',
+     stream=sys.stderr,
+ )
main()
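
The combined effect of these report_hunter.py changes, illustrated on made-up records (field names follow the code above; every value here is invented): non-HTML outputs are skipped outright by the tightened guard, and HTML entries missing metadata are logged and skipped instead of aborting the whole index build.

import logging

logging.basicConfig(level=logging.INFO)

# hypothetical analysis records, shaped like the fields used above
sample_analyses = [
    {'output': 'gs://bucket/reanalysis/2023-08-15/report.html',
     'meta': {'is_exome': False, 'is_singleton': True,
              'display_url': 'https://example.org/report.html'},
     'timestampCompleted': '2023-08-15T01:02:03'},
    {'output': 'gs://bucket/reanalysis/2023-08-15/results.json', 'meta': {}},
    {'output': 'gs://bucket/reanalysis/2023-08-15/other.html', 'meta': {},
     'timestampCompleted': '2023-08-15T01:02:03'},
]

for analysis in sample_analyses:
    # non-HTML outputs (the JSON entry) are skipped by the tightened guard
    if 'reanalysis' not in analysis['output'] or not analysis['output'].endswith('html'):
        continue
    try:
        # the second HTML entry lacks display_url, so this raises KeyError
        print(analysis['meta']['display_url'], analysis['timestampCompleted'].split('T')[0])
    except KeyError:
        logging.info('Skipping a non-report entry: %s', analysis)
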
14 changes: 7 additions & 7 deletions reanalysis/reanalysis_global.toml
@@ -93,20 +93,20 @@ genome_calling_interval_lists = 'gs://cpg-common-main/references/hg38/v0/wgs_cal
#gene_prior = if a specific gene list is to be used to determine Cat 2 (new gene-disease associations), provide the filepath here

[cohorts.acute-care]
- clinvar_filter = ['victorian clinical genetics services,murdoch childrens research institute']
+ clinvar_filter = ['victorian clinical genetics services, murdoch childrens research institute']
#cohort_percentage = 80
gene_prior = 'gs://cpg-acute-care-test/reanalysis/pre_panelapp_mendeliome.json'

[cohorts.ag-cardiac]
cohort_panels = [4059]

[cohorts.ag-hidden]
- clinvar_filter = ['victorian clinical genetics services,murdoch childrens research institute']
+ clinvar_filter = ['victorian clinical genetics services, murdoch childrens research institute']
gene_prior = 'gs://cpg-ag-hidden-test/reanalysis/pre_panelapp_mendeliome.json'
cohort_panels = [275]

[cohorts.brain-malf]
- clinvar_filter = ['victorian clinical genetics services,murdoch childrens research institute']
+ clinvar_filter = ['victorian clinical genetics services, murdoch childrens research institute']
#cohort_percentage = 80
gene_prior = 'gs://cpg-brain-malf-test/reanalysis/pre_panelapp_mendeliome.json'
cohort_panels = [3136]
@@ -123,7 +123,7 @@ gene_prior = "gs://cpg-broad-rgp-test-analysis/reanalysis/jan_2020_panels.json"
cohort_panels = [239]

[cohorts.epileptic-enceph]
- clinvar_filter = ['victorian clinical genetics services,murdoch childrens research institute']
+ clinvar_filter = ['victorian clinical genetics services, murdoch childrens research institute']
#cohort_percentage = 80
gene_prior = 'gs://cpg-epileptic-enceph-test/reanalysis/pre_panelapp_mendeliome.json'
cohort_panels = [202]
@@ -142,18 +142,18 @@ cohort_panels = [3120]
cohort_panels = [56]

[cohorts.kidgen]
- clinvar_filter = ['victorian clinical genetics services,murdoch childrens research institute']
+ clinvar_filter = ['victorian clinical genetics services, murdoch childrens research institute']
gene_prior = 'gs://cpg-kidgen-test-analysis/reanalysis/pre_panelapp_mendeliome.json'
cohort_panels = [275]

[cohorts.leukodystrophies]
- clinvar_filter = ['victorian clinical genetics services,murdoch childrens research institute']
+ clinvar_filter = ['victorian clinical genetics services, murdoch childrens research institute']
cohort_panels = [298, 299, 3094]
#cohort_percentage = 80
gene_prior = 'gs://cpg-leukodystrophies-test/reanalysis/pre_panelapp_mendeliome.json'

[cohorts.mito-disease]
- clinvar_filter = ['victorian clinical genetics services,murdoch childrens research institute']
+ clinvar_filter = ['victorian clinical genetics services, murdoch childrens research institute']
#cohort_percentage = 80
gene_prior = 'gs://cpg-mito-disease-test/reanalysis/pre_panelapp_mendeliome.json'
cohort_panels = [203]
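
The only change in this file is whitespace, but it is load-bearing: the blacklist check in summarise_clinvar_entries.py below is a plain membership test (line_sub.submitter in blacklist), so each filter string has to match the submitter name character-for-character. A small illustration; the submitter string and its casing are assumptions about the ClinVar submission file:

old_blacklist = ['victorian clinical genetics services,murdoch childrens research institute']
new_blacklist = ['victorian clinical genetics services, murdoch childrens research institute']

# assumed form of the submitter field (case handling assumed to happen upstream)
submitter = 'victorian clinical genetics services, murdoch childrens research institute'

assert submitter not in old_blacklist   # the space-less entry never matched
assert submitter in new_blacklist       # the corrected entry does
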
4 changes: 2 additions & 2 deletions reanalysis/run_reanalysis.sh
@@ -8,12 +8,12 @@ DATE=${1:-$(date +%F)}
analysis-runner \
--config reanalysis/reanalysis_global.toml \
--config reanalysis/reanalysis_cohort.toml \
- --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip \
+ --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip:latest \
--dataset acute-care \
--description "AIP run" \
-o "reanalysis/${DATE}" \
--access-level test \
reanalysis/interpretation_runner.py \
--i gs://cpg-acute-care-test/reanalysis/2011-11-11/prior_to_annotation.vcf.bgz \
- --pedigree gs://cpg-acute-care-test/reanalysis/acute-care-plink.fam \
+ --pedigree gs://cpg-acute-care-test/reanalysis/pedigree.ped \
--skip_annotation
26 changes: 19 additions & 7 deletions reanalysis/summarise_clinvar_entries.py
@@ -153,15 +153,15 @@ def get_allele_locus_map(summary_file: str) -> dict:
return allele_dict


- def lines_from_gzip(filename: str) -> str:
+ def lines_from_gzip(filename: str) -> list[list[str]]:
"""
generator for gzip reading, copies file locally before reading
Args:
filename (str): the gzipped input file
Returns:
- generator; yields each line
+ generator; yields each line as a list of its elements
"""

if isinstance(to_path(filename), CloudPath):
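
For orientation, a rough sketch of a generator satisfying the new contract, assuming the ClinVar file is tab-delimited; the real function above also copies cloud paths locally before reading, which is omitted here.

import gzip
from collections.abc import Generator

def lines_from_gzip_sketch(filename: str) -> Generator[list[str], None, None]:
    """yield each data line of a gzipped, tab-delimited file as a list of its fields"""
    with gzip.open(filename, 'rt') as handle:
        for line in handle:
            if line.startswith('#'):
                continue  # skip header/comment lines
            yield line.rstrip('\n').split('\t')
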
@@ -306,7 +306,7 @@ def dict_list_to_ht(list_of_dicts: list) -> hl.Table:


def get_all_decisions(
- submission_file: str, threshold_date: datetime, allele_ids: set
+ submission_file: str, threshold_date: datetime | None, allele_ids: set
) -> dict[str, list[Submission]]:
"""
obtains all submissions per-allele which pass basic criteria
@@ -332,17 +332,25 @@ def get_all_decisions(
blacklist = cohort_config.get('clinvar_filter', [])
logging.info(f'Blacklisted sites: {blacklist}')
except (AssertionError, KeyError):
logging.info('Failure to identify blacklisted sites for this project')
blacklist = []

for line in lines_from_gzip(submission_file):

+ # if we have a threshold date, and an un-dated entry
+ # put it straight in the bin
+ if threshold_date is None and line[2] == '-':
+     continue

a_id, line_sub = process_line(line)

# skip rows where the variantID isn't in this mapping
# this saves a little effort on haplotypes, CNVs, and SVs
# pylint: disable=too-many-boolean-expressions
if (
(a_id not in allele_ids)
or (line_sub.submitter in blacklist)
- or (line_sub.date > threshold_date)
+ or (threshold_date is not None and line_sub.date > threshold_date)
or (line_sub.review_status in USELESS_RATINGS)
or (line_sub.classification == Consequence.UNKNOWN)
):
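
Distilled to its essentials, the reworked date clause means a None threshold disables date-based filtering entirely, while a supplied threshold still drops anything submitted after it (the names below are made up for the sketch):

from datetime import datetime

def after_threshold(sub_date: datetime, threshold_date: datetime | None) -> bool:
    """True when a submission should be dropped for post-dating the threshold"""
    return threshold_date is not None and sub_date > threshold_date

assert after_threshold(datetime(2023, 8, 1), datetime(2020, 1, 1)) is True
assert after_threshold(datetime(2023, 8, 1), None) is False
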
@@ -481,7 +489,11 @@ def snv_missense_filter(clinvar_table: hl.Table, vcf_path: str):


def main(
- subs: str, date: datetime, variants: str, out: str, path_snv: str | None = None
+ subs: str,
+ date: datetime | None,
+ variants: str,
+ out: str,
+ path_snv: str | None = None,
):
"""
Redefines what it is to be a clinvar summary
@@ -490,7 +502,7 @@ def main(
subs (str): file path to all submissions (gzipped)
variants (str): file path to variant summary (gzipped)
out (str): path to write JSON out to
- date (str): date threshold to use for filtering submissions
+ date (datetime | None): date threshold to use for filtering submissions
path_snv (str): if defined, path to write SNV VCF file
"""

@@ -570,7 +582,7 @@ def main(
'date, format YYYY-MM-DD. Individual submissions after this date are '
'removed. Un-dated submissions will pass this threshold.'
),
- default=datetime.now(),
+ default=None,
)
parser.add_argument('--path_snv', help='Output VCF, sites-only, Pathogenic SNVs')
args = parser.parse_args()
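
With the default now None, date filtering only happens when a threshold is passed explicitly. A hypothetical helper, not part of the commit, for turning the YYYY-MM-DD string described in the help text into the datetime | None that main() expects:

from datetime import datetime

def parse_threshold(date_string: str | None) -> datetime | None:
    """None means no date filtering; otherwise parse a YYYY-MM-DD string"""
    if date_string is None:
        return None
    return datetime.strptime(date_string, '%Y-%m-%d')

assert parse_threshold(None) is None
assert parse_threshold('2023-08-15') == datetime(2023, 8, 15)
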
