
Commit

Include dataset everywhere (#335)
* Bump version: 2.0.2 → 2.0.3
MattWellie authored Nov 8, 2023
1 parent 056eae6 commit 338a87e
Showing 11 changed files with 32 additions and 26 deletions.
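
The change itself is mechanical but cross-cutting: every helper that resolves the per-cohort `historic_results` location now takes an explicit `dataset` argument rather than relying on a single implicit cohort from config. A minimal sketch of the new calling convention (signatures are from this diff; the 'talos' dataset name is borrowed from the workflow payload below, and the results object is a placeholder):

    from reanalysis.utils import find_latest_file, save_new_historic

    # callers now state which dataset's historic results they mean
    latest = find_latest_file(dataset='talos', start='panel_')
    save_new_historic(results={'metadata': {}}, dataset='talos', prefix='panel_')
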
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 2.0.2
+current_version = 2.0.3
 commit = True
 tag = False
2 changes: 1 addition & 1 deletion .github/workflows/clinvar_runner.yaml
@@ -32,5 +32,5 @@ jobs:
         curl --fail --silent --show-error -X POST \
           -H "Authorization: Bearer $TOKEN" \
           -H "Content-Type:application/json" \
-          -d '{"output": "generate_clinvar_${{ steps.date.outputs.date }}", "dataset": "talos", "accessLevel": "full", "repo": "automated-interpretation-pipeline", "commit": "${{ github.sha }}", "cwd": "reanalysis", "script": ["./clinvar_runner.py"], "description": "Generate Latest Clinvar Summaries", "image": "australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip:2.0.2", "config": {"workflow": {"sequencing_type": "genome"}, "cohorts": {"talos": {"clinvar_filter": ["victorian clinical genetics services, murdoch childrens research institute"]}}}, "wait": false}' \
+          -d '{"output": "generate_clinvar_${{ steps.date.outputs.date }}", "dataset": "talos", "accessLevel": "full", "repo": "automated-interpretation-pipeline", "commit": "${{ github.sha }}", "cwd": "reanalysis", "script": ["./clinvar_runner.py"], "description": "Generate Latest Clinvar Summaries", "image": "australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip:2.0.3", "config": {"workflow": {"sequencing_type": "genome"}, "cohorts": {"talos": {"clinvar_filter": ["victorian clinical genetics services, murdoch childrens research institute"]}}}, "wait": false}' \
           https://server-a2pko7ameq-ts.a.run.app
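
For ad-hoc testing, the same analysis-runner submission can be driven from Python instead of curl. A hedged sketch using the payload above (assumes the `requests` package and a valid identity token exported as `TOKEN`; the output label and commit hash are filled in by hand here, where the workflow injects them):

    import os

    import requests

    payload = {
        'output': 'generate_clinvar_2023-11-08',  # workflow stamps today's date
        'dataset': 'talos',
        'accessLevel': 'full',
        'repo': 'automated-interpretation-pipeline',
        'commit': '338a87e',
        'cwd': 'reanalysis',
        'script': ['./clinvar_runner.py'],
        'description': 'Generate Latest Clinvar Summaries',
        'image': 'australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip:2.0.3',
        'config': {
            'workflow': {'sequencing_type': 'genome'},
            'cohorts': {
                'talos': {
                    'clinvar_filter': [
                        'victorian clinical genetics services, '
                        'murdoch childrens research institute'
                    ]
                }
            },
        },
        'wait': False,
    }
    response = requests.post(
        'https://server-a2pko7ameq-ts.a.run.app',
        headers={'Authorization': f'Bearer {os.environ["TOKEN"]}'},
        json=payload,
        timeout=60,
    )
    response.raise_for_status()
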
2 changes: 1 addition & 1 deletion .github/workflows/docker.yaml
@@ -5,7 +5,7 @@ on:
     - main
 
 env:
-  VERSION: 2.0.2
+  VERSION: 2.0.3
 
 jobs:
   docker:
4 changes: 2 additions & 2 deletions reanalysis/query_panelapp.py
@@ -318,7 +318,7 @@ def main(panels: str | None, out_path: str, dataset: str | None = None):
 
     # historic data overrides default 'previous' list for cohort
     # open to discussing order of precedence here
-    if old_file := find_latest_file(start='panel_'):
+    if old_file := find_latest_file(dataset=dataset, start='panel_'):
         logging.info(f'Grabbing legacy panel data from {old_file}')
         old_data: dict = read_json_from_path(old_file, default=old_data)
 
@@ -392,7 +392,7 @@ def main(panels: str | None, out_path: str, dataset: str | None = None):
     # write the output to long term storage
     write_output_json(output_path=out_path, object_to_write=gene_dict)
 
-    save_new_historic(old_data, prefix='panel_')
+    save_new_historic(old_data, dataset=dataset, prefix='panel_')
 
 
 if __name__ == '__main__':
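
The walrus-plus-`default` combination above makes a tidy "use historic data if any exists, otherwise keep the current default" fallback. A standalone sketch of the same pattern (helper names are from this diff; the import path and default contents are assumptions):

    from reanalysis.utils import find_latest_file, read_json_from_path

    old_data: dict = {'genes': {}}  # hypothetical default 'previous' content
    # only attempt the read when a dated historic panel file actually exists
    if old_file := find_latest_file(dataset='talos', start='panel_'):
        old_data = read_json_from_path(old_file, default=old_data)
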
4 changes: 2 additions & 2 deletions reanalysis/reanalysis_global.toml
@@ -68,9 +68,9 @@ default_memory = 'highmem'
 
 [images]
 gatk = 'australia-southeast1-docker.pkg.dev/cpg-common/images/gatk:4.2.6.1'
-aip = 'australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip:2.0.2'
+aip = 'australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip:2.0.3'
 vep_110 = "australia-southeast1-docker.pkg.dev/cpg-common/images/vep_110:release_110.1"
-cpg_workflows = "australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_workflows:1.17.1"
+cpg_workflows = "australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_workflows:latest"
 
 [references]
 vep_110_mount = "gs://cpg-common-main/references/vep/110/mount"
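
These pins are consumed at runtime through the shared TOML config. A minimal sketch of reading the [images] section (assuming the cpg_utils config helper the pipeline already uses):

    from cpg_utils.config import get_config  # merged TOML config access

    config = get_config()
    aip_image = config['images']['aip']
    # -> 'australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip:2.0.3'
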
20 changes: 12 additions & 8 deletions reanalysis/utils.py
@@ -1072,7 +1072,7 @@ def find_comp_hets(var_list: list[AbstractVariant], pedigree) -> CompHetDict:
     return comp_het_results
 
 
-def filter_results(results: dict, singletons: bool) -> dict:
+def filter_results(results: dict, singletons: bool, dataset: str) -> dict:
     """
     loads the most recent prior result set (if it exists)
     annotates previously seen variants with the most recent date seen
@@ -1085,7 +1085,7 @@ def filter_results(results: dict, singletons: bool, dataset: str) -> dict:
     Returns: same results annotated with date-first-seen
     """
 
-    historic_folder = get_cohort_seq_type_conf().get('historic_results')
+    historic_folder = get_cohort_seq_type_conf(dataset).get('historic_results')
 
     if historic_folder is None:
         logging.info('No historic data folder, no filtering')
@@ -1100,30 +1100,33 @@ def filter_results(results: dict, singletons: bool, dataset: str) -> dict:
 
     # 2 is the required prefix, i.e. 2022_*, to discriminate vs. 'singletons_'
     # in 1000 years this might cause a problem :/ \s
-    latest_results_path = find_latest_file(start=prefix or '2')
+    latest_results_path = find_latest_file(dataset=dataset, start=prefix or '2')
 
     logging.info(f'latest results: {latest_results_path}')
 
     latest_results: dict = read_json_from_path(latest_results_path)  # type: ignore
 
     results, cumulative = date_annotate_results(results, latest_results)
-    save_new_historic(results=cumulative, prefix=prefix)
+    save_new_historic(results=cumulative, prefix=prefix, dataset=dataset)
 
     return results
 
 
-def save_new_historic(results: dict, prefix: str = '', directory: str | None = None):
+def save_new_historic(
+    results: dict, dataset: str, prefix: str = '', directory: str | None = None
+):
     """
     save the new results in the historic results dir
     Args:
         results (): object to save as a JSON file
+        dataset (str): the dataset to save results for
         prefix (str): name prefix for this file (optional)
         directory (): defaults to historic_data from config
     """
 
     if directory is None:
-        directory = get_cohort_seq_type_conf().get('historic_results')
+        directory = get_cohort_seq_type_conf(dataset).get('historic_results')
         if directory is None:
             logging.info('No historic results directory, nothing written')
             return
@@ -1136,11 +1139,12 @@ def save_new_historic(
 
 
 def find_latest_file(
-    results_folder: str | None = None, start: str = '', ext: str = 'json'
+    dataset: str, results_folder: str | None = None, start: str = '', ext: str = 'json'
 ) -> str | None:
     """
     takes a directory of files, and finds the latest
     Args:
+        dataset (): the dataset to fetch results for
         results_folder (): local or remote folder
         start (str): the start of the filename, if applicable
         ext (): the type of files we're looking for
@@ -1149,7 +1153,7 @@ def find_latest_file(
     """
 
     if results_folder is None:
-        results_folder = get_cohort_seq_type_conf().get('historic_results')
+        results_folder = get_cohort_seq_type_conf(dataset).get('historic_results')
         if results_folder is None:
             logging.info('`historic_results` not present in config')
             return None
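
The `prefix or '2'` trick in filter_results leans on historic result files being date-stamped. A worked example of how the prefix separates the two result streams (the file names shown are illustrative):

    from reanalysis.utils import find_latest_file

    # historic files land as e.g. 2023-11-08_... (family analyses) or
    # singletons_2023-11-08_... (singleton analyses); start='2' matches any
    # date-stamped family file this millennium, while start='singletons'
    # selects only the singleton stream
    singletons = False
    prefix = 'singletons' if singletons else ''
    latest = find_latest_file(dataset='talos', start=prefix or '2')
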
2 changes: 1 addition & 1 deletion reanalysis/validate_categories.py
@@ -542,7 +542,7 @@ def main(
 
     # annotate previously seen results using cumulative data file(s)
     analysis_results = filter_results(
-        analysis_results, singletons=bool('singleton' in pedigree)
+        analysis_results, singletons=bool('singleton' in pedigree), dataset=dataset
    )
 
     # create the full final output file
8 changes: 3 additions & 5 deletions reanalysis/vep_jobs.py
@@ -17,7 +17,7 @@
     command,
     query_command,
 )
-from cpg_workflows.resources import STANDARD
+from cpg_workflows.resources import HIGHMEM, STANDARD
 from cpg_workflows.utils import exists
 
 
@@ -271,9 +271,8 @@ def vep_one(
     j.image(image_path('vep_110'))
 
     # vep is single threaded, with a middling memory requirement
-    # during test it caps out around 4GB, though this could be
-    # larger for some long-running jobs
-    j.memory('highmem').storage('10Gi').cpu(1)
+    # tests have exceeded 8GB, so bump to ~13 (2 * highmem)
+    HIGHMEM.set_resources(j, ncpu=2, storage_gb=10)
 
     if not isinstance(vcf, hb.ResourceFile):
         vcf = b.read_input(str(vcf))
@@ -306,7 +305,6 @@ def vep_one(
         -i {vcf} \\
         --everything \\
         --mane_select \\
-        --no_stats \\
         --allele_number \\
         --minimal \\
         --species homo_sapiens \\
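
The "~13" in the new comment falls out of the highmem machine shape: roughly 6.5 GiB of memory per core, so ncpu=2 buys about 13 GiB. Roughly the same request expressed with the chained job API the old line used (a sketch only, not what the code now does; `j` is the Hail Batch job from vep_one):

    # approximately what HIGHMEM.set_resources(j, ncpu=2, storage_gb=10) requests:
    # 2 cores on a highmem shape ~= 2 * 6.5 GiB ~= 13 GiB of memory
    j.cpu(2).memory('highmem').storage('10Gi')
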
2 changes: 1 addition & 1 deletion reanalysis/version.py
@@ -3,4 +3,4 @@
 """
 
 # Do not edit this file manually
-__version__ = '2.0.2'
+__version__ = '2.0.3'
2 changes: 1 addition & 1 deletion setup.py
@@ -32,7 +32,7 @@ def read_reqs(filename: str) -> list[str]:
     name='automated-interpretation-pipeline',
     description='CPG Variant Prioritisation',
     long_description=readme,
-    version='2.0.2',
+    version='2.0.3',
     author='Matthew Welland, CPG',
     author_email=(
         'matthew.welland@populationgenomics.org.au, '
10 changes: 7 additions & 3 deletions test/test_results_comparison.py
@@ -244,7 +244,7 @@ def test_find_latest(tmp_path):
     touch(join(tmp_str, 'file2.json'))
     sleep(0.2)
     touch(join(tmp_str, 'file3.json'))
-    assert 'file3.json' in find_latest_file(tmp_str)
+    assert 'file3.json' in find_latest_file(results_folder=tmp_str, dataset='cohort')
 
 
 def test_find_latest_singletons(tmp_path):
@@ -257,7 +257,9 @@ def test_find_latest_singletons(tmp_path):
     touch(join(tmp_str, 'file2.json'))
     sleep(0.2)
     touch(join(tmp_str, 'file3.json'))
-    assert 'singletons_file1.json' in find_latest_file(tmp_str, start='singletons')
+    assert 'singletons_file1.json' in find_latest_file(
+        results_folder=tmp_str, start='singletons', dataset='cohort'
+    )
 
 
 def test_find_latest_with_ext(tmp_path):
@@ -270,4 +272,6 @@ def test_find_latest_with_ext(tmp_path):
     touch(join(tmp_str, 'file2.txt'))
     sleep(0.2)
     touch(join(tmp_str, 'file3.json'))
-    assert 'file2.txt' in find_latest_file(tmp_str, ext='txt')
+    assert 'file2.txt' in find_latest_file(
+        results_folder=tmp_str, ext='txt', dataset='cohort'
+    )