
Commit

Include dataset everywhere (#335)
* Bump version: 2.0.2 → 2.0.3
MattWellie authored Nov 8, 2023
1 parent 056eae6 commit 338a87e
Showing 11 changed files with 32 additions and 26 deletions.
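
The change itself is mechanical but cross-cutting: every helper that resolves the per-cohort `historic_results` location now takes an explicit `dataset` argument rather than relying on a single implicit cohort from config. A minimal sketch of the new calling convention (signatures are from this diff; the 'talos' dataset name is borrowed from the workflow payload below, and the results object is a placeholder):

    from reanalysis.utils import find_latest_file, save_new_historic

    # callers now state which dataset's historic results they mean
    latest = find_latest_file(dataset='talos', start='panel_')
    save_new_historic(results={'metadata': {}}, dataset='talos', prefix='panel_')
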
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 2.0.2
+current_version = 2.0.3
 commit = True
 tag = False
2 changes: 1 addition & 1 deletion .github/workflows/clinvar_runner.yaml
@@ -32,5 +32,5 @@ jobs:
         curl --fail --silent --show-error -X POST \
           -H "Authorization: Bearer $TOKEN" \
           -H "Content-Type:application/json" \
-          -d '{"output": "generate_clinvar_${{ steps.date.outputs.date }}", "dataset": "talos", "accessLevel": "full", "repo": "automated-interpretation-pipeline", "commit": "${{ github.sha }}", "cwd": "reanalysis", "script": ["./clinvar_runner.py"], "description": "Generate Latest Clinvar Summaries", "image": "australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip:2.0.2", "config": {"workflow": {"sequencing_type": "genome"}, "cohorts": {"talos": {"clinvar_filter": ["victorian clinical genetics services, murdoch childrens research institute"]}}}, "wait": false}' \
+          -d '{"output": "generate_clinvar_${{ steps.date.outputs.date }}", "dataset": "talos", "accessLevel": "full", "repo": "automated-interpretation-pipeline", "commit": "${{ github.sha }}", "cwd": "reanalysis", "script": ["./clinvar_runner.py"], "description": "Generate Latest Clinvar Summaries", "image": "australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip:2.0.3", "config": {"workflow": {"sequencing_type": "genome"}, "cohorts": {"talos": {"clinvar_filter": ["victorian clinical genetics services, murdoch childrens research institute"]}}}, "wait": false}' \
           https://server-a2pko7ameq-ts.a.run.app
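
For ad-hoc testing, the same analysis-runner submission can be driven from Python instead of curl. A hedged sketch using the payload above (assumes the `requests` package and a valid identity token exported as `TOKEN`; the output label and commit hash are filled in by hand here, where the workflow injects them):

    import os

    import requests

    payload = {
        'output': 'generate_clinvar_2023-11-08',  # workflow stamps today's date
        'dataset': 'talos',
        'accessLevel': 'full',
        'repo': 'automated-interpretation-pipeline',
        'commit': '338a87e',
        'cwd': 'reanalysis',
        'script': ['./clinvar_runner.py'],
        'description': 'Generate Latest Clinvar Summaries',
        'image': 'australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip:2.0.3',
        'config': {
            'workflow': {'sequencing_type': 'genome'},
            'cohorts': {
                'talos': {
                    'clinvar_filter': [
                        'victorian clinical genetics services, '
                        'murdoch childrens research institute'
                    ]
                }
            },
        },
        'wait': False,
    }
    response = requests.post(
        'https://server-a2pko7ameq-ts.a.run.app',
        headers={'Authorization': f'Bearer {os.environ["TOKEN"]}'},
        json=payload,
        timeout=60,
    )
    response.raise_for_status()
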
2 changes: 1 addition & 1 deletion .github/workflows/docker.yaml
@@ -5,7 +5,7 @@ on:
     - main
 
 env:
-  VERSION: 2.0.2
+  VERSION: 2.0.3
 
 jobs:
   docker:
4 changes: 2 additions & 2 deletions reanalysis/query_panelapp.py
@@ -318,7 +318,7 @@ def main(panels: str | None, out_path: str, dataset: str | None = None):
 
     # historic data overrides default 'previous' list for cohort
     # open to discussing order of precedence here
-    if old_file := find_latest_file(start='panel_'):
+    if old_file := find_latest_file(dataset=dataset, start='panel_'):
         logging.info(f'Grabbing legacy panel data from {old_file}')
         old_data: dict = read_json_from_path(old_file, default=old_data)
 
@@ -392,7 +392,7 @@ def main(panels: str | None, out_path: str, dataset: str | None = None):
     # write the output to long term storage
     write_output_json(output_path=out_path, object_to_write=gene_dict)
 
-    save_new_historic(old_data, prefix='panel_')
+    save_new_historic(old_data, dataset=dataset, prefix='panel_')
 
 
 if __name__ == '__main__':
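
The walrus-plus-`default` combination above makes a tidy "use historic data if any exists, otherwise keep the current default" fallback. A standalone sketch of the same pattern (helper names are from this diff; the import path and default contents are assumptions):

    from reanalysis.utils import find_latest_file, read_json_from_path

    old_data: dict = {'genes': {}}  # hypothetical default 'previous' content
    # only attempt the read when a dated historic panel file actually exists
    if old_file := find_latest_file(dataset='talos', start='panel_'):
        old_data = read_json_from_path(old_file, default=old_data)
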
4 changes: 2 additions & 2 deletions reanalysis/reanalysis_global.toml
@@ -68,9 +68,9 @@ default_memory = 'highmem'
 
 [images]
 gatk = 'australia-southeast1-docker.pkg.dev/cpg-common/images/gatk:4.2.6.1'
-aip = 'australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip:2.0.2'
+aip = 'australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip:2.0.3'
 vep_110 = "australia-southeast1-docker.pkg.dev/cpg-common/images/vep_110:release_110.1"
-cpg_workflows = "australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_workflows:1.17.1"
+cpg_workflows = "australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_workflows:latest"
 
 [references]
 vep_110_mount = "gs://cpg-common-main/references/vep/110/mount"
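
These pins are consumed at runtime through the shared TOML config. A minimal sketch of reading the [images] section (assuming the cpg_utils config helper the pipeline already uses):

    from cpg_utils.config import get_config  # merged TOML config access

    config = get_config()
    aip_image = config['images']['aip']
    # -> 'australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_aip:2.0.3'
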
20 changes: 12 additions & 8 deletions reanalysis/utils.py
@@ -1072,7 +1072,7 @@ def find_comp_hets(var_list: list[AbstractVariant], pedigree) -> CompHetDict:
     return comp_het_results
 
 
-def filter_results(results: dict, singletons: bool) -> dict:
+def filter_results(results: dict, singletons: bool, dataset: str) -> dict:
     """
     loads the most recent prior result set (if it exists)
     annotates previously seen variants with the most recent date seen
@@ -1085,7 +1085,7 @@ def filter_results(results: dict, singletons: bool, dataset: str) -> dict:
     Returns: same results annotated with date-first-seen
     """
 
-    historic_folder = get_cohort_seq_type_conf().get('historic_results')
+    historic_folder = get_cohort_seq_type_conf(dataset).get('historic_results')
 
     if historic_folder is None:
         logging.info('No historic data folder, no filtering')
@@ -1100,30 +1100,33 @@ def filter_results(results: dict, singletons: bool, dataset: str) -> dict:
 
     # 2 is the required prefix, i.e. 2022_*, to discriminate vs. 'singletons_'
     # in 1000 years this might cause a problem :/ \s
-    latest_results_path = find_latest_file(start=prefix or '2')
+    latest_results_path = find_latest_file(dataset=dataset, start=prefix or '2')
 
     logging.info(f'latest results: {latest_results_path}')
 
     latest_results: dict = read_json_from_path(latest_results_path)  # type: ignore
 
     results, cumulative = date_annotate_results(results, latest_results)
-    save_new_historic(results=cumulative, prefix=prefix)
+    save_new_historic(results=cumulative, prefix=prefix, dataset=dataset)
 
     return results
 
 
-def save_new_historic(results: dict, prefix: str = '', directory: str | None = None):
+def save_new_historic(
+    results: dict, dataset: str, prefix: str = '', directory: str | None = None
+):
     """
     save the new results in the historic results dir
     Args:
         results (): object to save as a JSON file
+        dataset (str): the dataset to save results for
         prefix (str): name prefix for this file (optional)
         directory (): defaults to historic_data from config
     """
 
     if directory is None:
-        directory = get_cohort_seq_type_conf().get('historic_results')
+        directory = get_cohort_seq_type_conf(dataset).get('historic_results')
         if directory is None:
             logging.info('No historic results directory, nothing written')
             return
@@ -1136,11 +1139,12 @@ def save_new_historic(
 
 
 def find_latest_file(
-    results_folder: str | None = None, start: str = '', ext: str = 'json'
+    dataset: str, results_folder: str | None = None, start: str = '', ext: str = 'json'
 ) -> str | None:
     """
     takes a directory of files, and finds the latest
     Args:
+        dataset (): the dataset to fetch results for
         results_folder (): local or remote folder
         start (str): the start of the filename, if applicable
         ext (): the type of files we're looking for
@@ -1149,7 +1153,7 @@ def find_latest_file(
     """
 
     if results_folder is None:
-        results_folder = get_cohort_seq_type_conf().get('historic_results')
+        results_folder = get_cohort_seq_type_conf(dataset).get('historic_results')
         if results_folder is None:
             logging.info('`historic_results` not present in config')
             return None
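
The `prefix or '2'` trick in filter_results leans on historic result files being date-stamped. A worked example of how the prefix separates the two result streams (the file names shown are illustrative):

    from reanalysis.utils import find_latest_file

    # historic files land as e.g. 2023-11-08_... (family analyses) or
    # singletons_2023-11-08_... (singleton analyses); start='2' matches any
    # date-stamped family file this millennium, while start='singletons'
    # selects only the singleton stream
    singletons = False
    prefix = 'singletons' if singletons else ''
    latest = find_latest_file(dataset='talos', start=prefix or '2')
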
2 changes: 1 addition & 1 deletion reanalysis/validate_categories.py
@@ -542,7 +542,7 @@ def main(
 
     # annotate previously seen results using cumulative data file(s)
     analysis_results = filter_results(
-        analysis_results, singletons=bool('singleton' in pedigree)
+        analysis_results, singletons=bool('singleton' in pedigree), dataset=dataset
    )
 
     # create the full final output file
8 changes: 3 additions & 5 deletions reanalysis/vep_jobs.py
@@ -17,7 +17,7 @@
     command,
     query_command,
 )
-from cpg_workflows.resources import STANDARD
+from cpg_workflows.resources import HIGHMEM, STANDARD
 from cpg_workflows.utils import exists
 
 
@@ -271,9 +271,8 @@ def vep_one(
     j.image(image_path('vep_110'))
 
     # vep is single threaded, with a middling memory requirement
-    # during test it caps out around 4GB, though this could be
-    # larger for some long-running jobs
-    j.memory('highmem').storage('10Gi').cpu(1)
+    # tests have exceeded 8GB, so bump to ~13 (2 * highmem)
+    HIGHMEM.set_resources(j, ncpu=2, storage_gb=10)
 
     if not isinstance(vcf, hb.ResourceFile):
         vcf = b.read_input(str(vcf))
@@ -306,7 +305,6 @@ def vep_one(
         -i {vcf} \\
         --everything \\
         --mane_select \\
-        --no_stats \\
         --allele_number \\
         --minimal \\
         --species homo_sapiens \\
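
The "~13" in the new comment falls out of the highmem machine shape: roughly 6.5 GiB of memory per core, so ncpu=2 buys about 13 GiB. Roughly the same request expressed with the chained job API the old line used (a sketch only, not what the code now does; `j` is the Hail Batch job from vep_one):

    # approximately what HIGHMEM.set_resources(j, ncpu=2, storage_gb=10) requests:
    # 2 cores on a highmem shape ~= 2 * 6.5 GiB ~= 13 GiB of memory
    j.cpu(2).memory('highmem').storage('10Gi')
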
2 changes: 1 addition & 1 deletion reanalysis/version.py
@@ -3,4 +3,4 @@
 """
 
 # Do not edit this file manually
-__version__ = '2.0.2'
+__version__ = '2.0.3'
2 changes: 1 addition & 1 deletion setup.py
@@ -32,7 +32,7 @@ def read_reqs(filename: str) -> list[str]:
     name='automated-interpretation-pipeline',
     description='CPG Variant Prioritisation',
     long_description=readme,
-    version='2.0.2',
+    version='2.0.3',
     author='Matthew Welland, CPG',
     author_email=(
         'matthew.welland@populationgenomics.org.au, '
10 changes: 7 additions & 3 deletions test/test_results_comparison.py
@@ -244,7 +244,7 @@ def test_find_latest(tmp_path):
     touch(join(tmp_str, 'file2.json'))
     sleep(0.2)
     touch(join(tmp_str, 'file3.json'))
-    assert 'file3.json' in find_latest_file(tmp_str)
+    assert 'file3.json' in find_latest_file(results_folder=tmp_str, dataset='cohort')
 
 
 def test_find_latest_singletons(tmp_path):
@@ -257,7 +257,9 @@ def test_find_latest_singletons(tmp_path):
     touch(join(tmp_str, 'file2.json'))
     sleep(0.2)
     touch(join(tmp_str, 'file3.json'))
-    assert 'singletons_file1.json' in find_latest_file(tmp_str, start='singletons')
+    assert 'singletons_file1.json' in find_latest_file(
+        results_folder=tmp_str, start='singletons', dataset='cohort'
+    )
 
 
 def test_find_latest_with_ext(tmp_path):
@@ -270,4 +272,6 @@ def test_find_latest_with_ext(tmp_path):
     touch(join(tmp_str, 'file2.txt'))
     sleep(0.2)
     touch(join(tmp_str, 'file3.json'))
-    assert 'file2.txt' in find_latest_file(tmp_str, ext='txt')
+    assert 'file2.txt' in find_latest_file(
+        results_folder=tmp_str, ext='txt', dataset='cohort'
+    )