updates to ClinVar parsing (#311)
MattWellie authored Aug 15, 2023
1 parent acfe235 commit f9de245
Showing 5 changed files with 58 additions and 31 deletions.
8 changes: 4 additions & 4 deletions helpers/clinvar_conf.toml
@@ -1,6 +1,6 @@
[workflow]
name = 'Annotate_Clinvar'
- scatter_count = 50
+ scatter_count = 25
vcf_size_in_gb = 30
sequencing_type = 'genome'

@@ -11,10 +11,10 @@ default_memory = 'highmem'

[images]
vep = 'australia-southeast1-docker.pkg.dev/cpg-common/images/vep:105.0'
- cpg_workflows = 'australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip:1.1.1'
+ cpg_workflows = 'australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip:latest'

[clinvar]
filter_benign = ['illumina laboratory services; illumina']

[cohorts]
placeholder = "placeholder"
[cohorts.acute-care]
clinvar_filter = ['victorian clinical genetics services, murdoch childrens research institute']
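
A minimal sketch, not part of this commit, of how this config might be consumed: the per-cohort clinvar_filter and the global filter_benign values act as submitter blacklists, looked up in the same spirit as the cohort_config.get('clinvar_filter', []) call in summarise_clinvar_entries.py further down. The file path and the use of Python 3.11's tomllib are assumptions for the example.

import tomllib

with open('helpers/clinvar_conf.toml', 'rb') as handle:
    config = tomllib.load(handle)

# presumed meaning: submitters whose benign classifications should be disregarded
benign_filter = config['clinvar'].get('filter_benign', [])

# per-cohort submitter blacklist, empty when a cohort has no entry
cohort_config = config['cohorts'].get('acute-care', {})
blacklist = cohort_config.get('clinvar_filter', [])
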
37 changes: 26 additions & 11 deletions helpers/report_hunter.py
@@ -6,6 +6,8 @@
generate an index HTML page with links to all reports
"""

+ import logging
+ import sys
from dataclasses import dataclass
from os.path import join
from pathlib import Path
@@ -75,7 +77,7 @@ def get_project_analyses(project: str) -> list[dict]:
}
"""
)
- # validate(project_query)

response: dict[str, Any] = query(project_query, variables={'project': project})
return response['project']['analyses']

@@ -93,21 +95,28 @@ def main():

for analysis in get_project_analyses(cohort):
# only look for HTML reanalysis entries
- if 'reanalysis' not in analysis['output']:
+ if 'reanalysis' not in analysis['output'] or not analysis[
+     'output'
+ ].endswith('html'):
continue

# pull the exome/singleton flags
exome_output = analysis['meta'].get('is_exome', False)
singleton_output = analysis['meta'].get('is_singleton', False)

- # incorporate that into a key when gathering
- all_cohorts[f'{cohort}_{exome_output}_{singleton_output}'] = Report(
-     dataset=cohort,
-     address=analysis['meta']['display_url'],
-     genome_or_exome='Exome' if exome_output else 'Genome',
-     subtype='Singleton' if singleton_output else 'Familial',
-     date=analysis['timestampCompleted'].split('T')[0],
- )
+ try:
+     # incorporate that into a key when gathering
+     all_cohorts[f'{cohort}_{exome_output}_{singleton_output}'] = Report(
+         dataset=cohort,
+         address=analysis['meta']['display_url'],
+         genome_or_exome='Exome' if exome_output else 'Genome',
+         subtype='Singleton' if singleton_output else 'Familial',
+         date=analysis['timestampCompleted'].split('T')[0],
+     )
+ except KeyError:
+     logging.info(
+         'Failed to construct a Report entry - is this a report HTML entry?'
+     )
+     logging.info(analysis)

# smoosh into a list for the report context - all reports sortable by date
template_context = {'reports': list(all_cohorts.values())}
@@ -130,4 +139,10 @@ def main():


if __name__ == '__main__':
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s %(levelname)s %(module)s:%(lineno)d - %(message)s',
+     datefmt='%Y-%m-%d %H:%M:%S',
+     stream=sys.stderr,
+ )
main()
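
The combined effect of these report_hunter.py changes, illustrated on made-up records (field names follow the code above; every value here is invented): non-HTML outputs are skipped outright by the tightened guard, and HTML entries missing metadata are logged and skipped instead of aborting the whole index build.

import logging

logging.basicConfig(level=logging.INFO)

# hypothetical analysis records, shaped like the fields used above
sample_analyses = [
    {'output': 'gs://bucket/reanalysis/2023-08-15/report.html',
     'meta': {'is_exome': False, 'is_singleton': True,
              'display_url': 'https://example.org/report.html'},
     'timestampCompleted': '2023-08-15T01:02:03'},
    {'output': 'gs://bucket/reanalysis/2023-08-15/results.json', 'meta': {}},
    {'output': 'gs://bucket/reanalysis/2023-08-15/other.html', 'meta': {},
     'timestampCompleted': '2023-08-15T01:02:03'},
]

for analysis in sample_analyses:
    # non-HTML outputs (the JSON entry) are skipped by the tightened guard
    if 'reanalysis' not in analysis['output'] or not analysis['output'].endswith('html'):
        continue
    try:
        # the second HTML entry lacks display_url, so this raises KeyError
        print(analysis['meta']['display_url'], analysis['timestampCompleted'].split('T')[0])
    except KeyError:
        logging.info('Skipping a non-report entry: %s', analysis)
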
14 changes: 7 additions & 7 deletions reanalysis/reanalysis_global.toml
@@ -93,20 +93,20 @@ genome_calling_interval_lists = 'gs://cpg-common-main/references/hg38/v0/wgs_cal
#gene_prior = if a specific gene list is to be used to determine Cat 2 (new gene-disease associations), provide the filepath here

[cohorts.acute-care]
- clinvar_filter = ['victorian clinical genetics services,murdoch childrens research institute']
+ clinvar_filter = ['victorian clinical genetics services, murdoch childrens research institute']
#cohort_percentage = 80
gene_prior = 'gs://cpg-acute-care-test/reanalysis/pre_panelapp_mendeliome.json'

[cohorts.ag-cardiac]
cohort_panels = [4059]

[cohorts.ag-hidden]
- clinvar_filter = ['victorian clinical genetics services,murdoch childrens research institute']
+ clinvar_filter = ['victorian clinical genetics services, murdoch childrens research institute']
gene_prior = 'gs://cpg-ag-hidden-test/reanalysis/pre_panelapp_mendeliome.json'
cohort_panels = [275]

[cohorts.brain-malf]
- clinvar_filter = ['victorian clinical genetics services,murdoch childrens research institute']
+ clinvar_filter = ['victorian clinical genetics services, murdoch childrens research institute']
#cohort_percentage = 80
gene_prior = 'gs://cpg-brain-malf-test/reanalysis/pre_panelapp_mendeliome.json'
cohort_panels = [3136]
@@ -123,7 +123,7 @@ gene_prior = "gs://cpg-broad-rgp-test-analysis/reanalysis/jan_2020_panels.json"
cohort_panels = [239]

[cohorts.epileptic-enceph]
- clinvar_filter = ['victorian clinical genetics services,murdoch childrens research institute']
+ clinvar_filter = ['victorian clinical genetics services, murdoch childrens research institute']
#cohort_percentage = 80
gene_prior = 'gs://cpg-epileptic-enceph-test/reanalysis/pre_panelapp_mendeliome.json'
cohort_panels = [202]
@@ -142,18 +142,18 @@ cohort_panels = [3120]
cohort_panels = [56]

[cohorts.kidgen]
- clinvar_filter = ['victorian clinical genetics services,murdoch childrens research institute']
+ clinvar_filter = ['victorian clinical genetics services, murdoch childrens research institute']
gene_prior = 'gs://cpg-kidgen-test-analysis/reanalysis/pre_panelapp_mendeliome.json'
cohort_panels = [275]

[cohorts.leukodystrophies]
- clinvar_filter = ['victorian clinical genetics services,murdoch childrens research institute']
+ clinvar_filter = ['victorian clinical genetics services, murdoch childrens research institute']
cohort_panels = [298, 299, 3094]
#cohort_percentage = 80
gene_prior = 'gs://cpg-leukodystrophies-test/reanalysis/pre_panelapp_mendeliome.json'

[cohorts.mito-disease]
- clinvar_filter = ['victorian clinical genetics services,murdoch childrens research institute']
+ clinvar_filter = ['victorian clinical genetics services, murdoch childrens research institute']
#cohort_percentage = 80
gene_prior = 'gs://cpg-mito-disease-test/reanalysis/pre_panelapp_mendeliome.json'
cohort_panels = [203]
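
The only change in this file is whitespace, but it is load-bearing: the blacklist check in summarise_clinvar_entries.py below is a plain membership test (line_sub.submitter in blacklist), so each filter string has to match the submitter name character-for-character. A small illustration; the submitter string and its casing are assumptions about the ClinVar submission file:

old_blacklist = ['victorian clinical genetics services,murdoch childrens research institute']
new_blacklist = ['victorian clinical genetics services, murdoch childrens research institute']

# assumed form of the submitter field (case handling assumed to happen upstream)
submitter = 'victorian clinical genetics services, murdoch childrens research institute'

assert submitter not in old_blacklist   # the space-less entry never matched
assert submitter in new_blacklist       # the corrected entry does
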
4 changes: 2 additions & 2 deletions reanalysis/run_reanalysis.sh
@@ -8,12 +8,12 @@ DATE=${1:-$(date +%F)}
analysis-runner \
--config reanalysis/reanalysis_global.toml \
--config reanalysis/reanalysis_cohort.toml \
- --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip \
+ --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip:latest \
--dataset acute-care \
--description "AIP run" \
-o "reanalysis/${DATE}" \
--access-level test \
reanalysis/interpretation_runner.py \
--i gs://cpg-acute-care-test/reanalysis/2011-11-11/prior_to_annotation.vcf.bgz \
- --pedigree gs://cpg-acute-care-test/reanalysis/acute-care-plink.fam \
+ --pedigree gs://cpg-acute-care-test/reanalysis/pedigree.ped \
--skip_annotation
26 changes: 19 additions & 7 deletions reanalysis/summarise_clinvar_entries.py
@@ -153,15 +153,15 @@ def get_allele_locus_map(summary_file: str) -> dict:
return allele_dict


- def lines_from_gzip(filename: str) -> str:
+ def lines_from_gzip(filename: str) -> list[list[str]]:
"""
generator for gzip reading, copies file locally before reading
Args:
filename (str): the gzipped input file
Returns:
- generator; yields each line
+ generator; yields each line as a list of its elements
"""

if isinstance(to_path(filename), CloudPath):
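
For orientation, a rough sketch of a generator satisfying the new contract, assuming the ClinVar file is tab-delimited; the real function above also copies cloud paths locally before reading, which is omitted here.

import gzip
from collections.abc import Generator

def lines_from_gzip_sketch(filename: str) -> Generator[list[str], None, None]:
    """yield each data line of a gzipped, tab-delimited file as a list of its fields"""
    with gzip.open(filename, 'rt') as handle:
        for line in handle:
            if line.startswith('#'):
                continue  # skip header/comment lines
            yield line.rstrip('\n').split('\t')
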
@@ -306,7 +306,7 @@ def dict_list_to_ht(list_of_dicts: list) -> hl.Table:


def get_all_decisions(
- submission_file: str, threshold_date: datetime, allele_ids: set
+ submission_file: str, threshold_date: datetime | None, allele_ids: set
) -> dict[str, list[Submission]]:
"""
obtains all submissions per-allele which pass basic criteria
@@ -332,17 +332,25 @@ def get_all_decisions(
blacklist = cohort_config.get('clinvar_filter', [])
logging.info(f'Blacklisted sites: {blacklist}')
except (AssertionError, KeyError):
logging.info('Failure to identify blacklisted sites for this project')
blacklist = []

for line in lines_from_gzip(submission_file):

+ # if we have a threshold date, and an un-dated entry
+ # put it straight in the bin
+ if threshold_date is None and line[2] == '-':
+     continue

a_id, line_sub = process_line(line)

# skip rows where the variantID isn't in this mapping
# this saves a little effort on haplotypes, CNVs, and SVs
# pylint: disable=too-many-boolean-expressions
if (
(a_id not in allele_ids)
or (line_sub.submitter in blacklist)
- or (line_sub.date > threshold_date)
+ or (threshold_date is not None and line_sub.date > threshold_date)
or (line_sub.review_status in USELESS_RATINGS)
or (line_sub.classification == Consequence.UNKNOWN)
):
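
Distilled to its essentials, the reworked date clause means a None threshold disables date-based filtering entirely, while a supplied threshold still drops anything submitted after it (the names below are made up for the sketch):

from datetime import datetime

def after_threshold(sub_date: datetime, threshold_date: datetime | None) -> bool:
    """True when a submission should be dropped for post-dating the threshold"""
    return threshold_date is not None and sub_date > threshold_date

assert after_threshold(datetime(2023, 8, 1), datetime(2020, 1, 1)) is True
assert after_threshold(datetime(2023, 8, 1), None) is False
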
@@ -481,7 +489,11 @@ def snv_missense_filter(clinvar_table: hl.Table, vcf_path: str):


def main(
- subs: str, date: datetime, variants: str, out: str, path_snv: str | None = None
+ subs: str,
+ date: datetime | None,
+ variants: str,
+ out: str,
+ path_snv: str | None = None,
):
"""
Redefines what it is to be a clinvar summary
@@ -490,7 +502,7 @@ def main(
subs (str): file path to all submissions (gzipped)
variants (str): file path to variant summary (gzipped)
out (str): path to write JSON out to
- date (str): date threshold to use for filtering submissions
+ date (datetime | None): date threshold to use for filtering submissions
path_snv (str): if defined, path to write SNV VCF file
"""

@@ -570,7 +582,7 @@ def main(
'date, format YYYY-MM-DD. Individual submissions after this date are '
'removed. Un-dated submissions will pass this threshold.'
),
- default=datetime.now(),
+ default=None,
)
parser.add_argument('--path_snv', help='Output VCF, sites-only, Pathogenic SNVs')
args = parser.parse_args()
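
With the default now None, date filtering only happens when a threshold is passed explicitly. A hypothetical helper, not part of the commit, for turning the YYYY-MM-DD string described in the help text into the datetime | None that main() expects:

from datetime import datetime

def parse_threshold(date_string: str | None) -> datetime | None:
    """None means no date filtering; otherwise parse a YYYY-MM-DD string"""
    if date_string is None:
        return None
    return datetime.strptime(date_string, '%Y-%m-%d')

assert parse_threshold(None) is None
assert parse_threshold('2023-08-15') == datetime(2023, 8, 15)
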
