From 0bc398dffe7758ebe9be28fddd8ffc1598abc00d Mon Sep 17 00:00:00 2001 From: Hope Tanudisastro <40783500+hopedisastro@users.noreply.github.com> Date: Mon, 21 Aug 2023 13:19:51 +1000 Subject: [PATCH] Update merge_str_prep.py (#84) * Update merge_str_prep.py * linting * Update str/trtools/merge_str_prep.py Co-authored-by: Matt Welland --------- Co-authored-by: Matt Welland --- str/trtools/merge_str_prep.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/str/trtools/merge_str_prep.py b/str/trtools/merge_str_prep.py index 3e2230f1..34d6ab51 100644 --- a/str/trtools/merge_str_prep.py +++ b/str/trtools/merge_str_prep.py @@ -4,7 +4,7 @@ This script prepares GangSTR/EH VCF files for input into mergeSTR. Required input: --caller, --input-dir, and external sample IDs For example: -analysis-runner --access-level test --dataset tob-wgs --description 'tester --output-dir 'tester' merge_prep.py --caller=eh --input-dir=gs://cpg-tob-wgs-main/str/expansionhunter/pure_repeats --dataset=tob-wgs TOBXXXX TOBXXXX +analysis-runner --access-level test --dataset tob-wgs --description 'tester' --output-dir 'tester' merge_prep.py --caller=eh --input-dir=gs://cpg-tob-wgs-main/str/expansionhunter/pure_repeats CPGXXXX CPGXXXX Required packages: sample-metadata, hail, click, os pip install sample-metadata hail click @@ -13,8 +13,6 @@ import os import click -from sample_metadata.apis import SampleApi - from cpg_utils.config import get_config from cpg_utils.hail_batch import output_path from cpg_workflows.batch import get_batch @@ -33,23 +31,17 @@ help='gangstr or eh', type=click.Choice(['eh', 'gangstr'], case_sensitive=True), ) -# dataset -@click.option('--dataset', help='dataset eg tob-wgs') # input directory @click.option('--input-dir', help='gs://...') # input sample ID -@click.argument('external-wgs-ids', nargs=-1) +@click.argument('internal-wgs-ids', nargs=-1) @click.command() def main( - dataset, caller, input_dir, external_wgs_ids: 
list[str] + caller, input_dir, internal_wgs_ids: list[str] ): # pylint: disable=missing-function-docstring # Initializing Batch b = get_batch() - external_id_to_cpg_id: dict[str, str] = SampleApi().get_sample_id_map_by_external( - dataset, list(external_wgs_ids) - ) - # Working with CRAM files requires the reference fasta ref = b.read_input_group( **dict( @@ -62,8 +54,7 @@ def main( input_vcf_dict = {} - for id in list(external_id_to_cpg_id.values()): - input_vcf_dict[id] = os.path.join(input_dir, f'{id}_{caller}.vcf') + input_vcf_dict = {id: os.path.join(input_dir, f'{id}_{caller}.vcf') for id in internal_wgs_ids} for id in list(input_vcf_dict.keys()): bcftools_job = b.new_job(name=f'{id} {caller} Files prep') @@ -93,7 +84,7 @@ def main( """ ) # Output writing - output_path_eh = output_path(f'{id}_eh') + output_path_eh = output_path(f'{id}_eh', 'analysis') b.write_output(bcftools_job.vcf_sorted, output_path_eh) else: @@ -113,7 +104,7 @@ def main( """ ) # Output writing - output_path_gangstr = output_path(f'{id}_gangstr') + output_path_gangstr = output_path(f'{id}_gangstr', 'analysis') b.write_output(bcftools_job.vcf_sorted, output_path_gangstr) b.run(wait=False)