Update merge_str_runner.py
hopedisastro committed Aug 21, 2023
1 parent 7432f2c commit 0d4e15b
Showing 1 changed file with 41 additions and 20 deletions.
61 changes: 41 additions & 20 deletions str/trtools/merge_str_runner.py
@@ -4,7 +4,7 @@
 Please ensure merge_prep.py has been run on the vcf files prior to running mergeSTR.py
 For example:
-analysis-runner --access-level test --dataset tob-wgs --description 'tester --output-dir 'tester' merge_str_runner.py --input-dir=gs://cpg-tob-wgs-main/str/expansionhunter/pure_repeats --dataset=tob-wgs TOBXXXX TOBXXXX
+analysis-runner --access-level test --dataset tob-wgs --description 'tester' --output-dir 'str/expansionhunter/test_chr_22/mergeSTR' merge_str_runner.py --input-dir=gs://cpg-tob-wgs-test-analysis/str/expansionhunter/test_chr_22/mergeSTRprep --dataset=tob-wgs CPG199760 CPG199778
 Required packages: sample-metadata, hail, click, os
 pip install sample-metadata hail click
@@ -13,8 +13,6 @@
 
 import click
 
-from sample_metadata.apis import SampleApi
-
 from cpg_utils.config import get_config
 from cpg_utils.hail_batch import output_path
 from cpg_workflows.batch import get_batch
@@ -33,44 +31,67 @@
 # input directory
 @click.option('--input-dir', help='gs://...')
 # input sample ID
-@click.argument('external-wgs-ids', nargs=-1)
+@click.argument('internal-wgs-ids', nargs=-1)
 @click.command()
 def main(
-    dataset, input_dir, external_wgs_ids: list[str]
+    dataset, input_dir, internal_wgs_ids: list[str]
 ): # pylint: disable=missing-function-docstring
     # Initializing Batch
     b = get_batch()
 
-    external_id_to_cpg_id: dict[str, str] = SampleApi().get_sample_id_map_by_external(
-        dataset, list(external_wgs_ids)
-    )

+    #Initialise TRTools job to run mergeSTR
     trtools_job = b.new_job(name='mergeSTR')
     trtools_job.image(TRTOOLS_IMAGE)
     trtools_job.cpu(8)
-    trtools_job.cloudfuse(f'cpg-{dataset}-main', '/vcffuse')
 
+    #mount using cloudfuse for reading input files
+    trtools_job.cloudfuse(f'cpg-{dataset}-main-analysis', '/vcffuse')
+    trtools_job.declare_resource_group(
+        vcf_output={
+            'vcf': '{root}.vcf'
+        }
+    )

+    #Initialise job to zip and index the mergeSTR VCF prior to writing to output
+    zip_job = b.new_job(name = 'Zip and index')
+    zip_job.image(TRTOOLS_IMAGE)
+    zip_job.cpu(8)
+    zip_job.declare_resource_group(
+        vcf_output={
+            'vcf.gz':'{root}.vcf.gz',
+            'vcf.gz.tbi':'{root}.vcf.gz.tbi'
+        }
+    )


     #read in input file paths
     vcffuse_path = []
-    for id in list(external_id_to_cpg_id.values()):
+    for id in list(internal_wgs_ids):
         vcf = os.path.join(input_dir, f'{id}_eh.reheader.vcf.gz')
         suffix = vcf.removeprefix('gs://').split('/', maxsplit=1)[1]
         vcffuse_path.append(f'/vcffuse/{suffix}')
     num_samples = len(vcffuse_path)
     vcffuse_path = ','.join(vcffuse_path) # string format for input into mergeSTR

-    output_path_vcf = output_path(f'mergeSTR_{num_samples}_samples_eh.vcf.bgz')
-    output_path_vcf = output_path_vcf.removeprefix('gs://').split('/', maxsplit=1)[1]


     trtools_job.command(
         f"""
-        mergeSTR --vcfs {vcffuse_path} --out temp.vcf --vcftype eh
-        bgzip -c temp.vcf > /vcffuse/{output_path_vcf}
-        tabix /vcffuse/{output_path_vcf}
+        mergeSTR --vcfs {vcffuse_path} --out {trtools_job.vcf_output} --vcftype eh
         """
     )

-    b.run(wait=False)
+    zip_job.command(
+        f"""
+        bgzip -c {trtools_job.vcf_output['vcf']} > {zip_job.vcf_output['vcf.gz']}
+        tabix -p vcf {zip_job.vcf_output['vcf.gz']}
+        """
+    )
+
+    output_path_name = output_path(f'mergeSTR_{num_samples}_samples_eh', 'analysis')
+    b.write_output(zip_job.vcf_output, output_path_name)
+
+    b.run(wait=False)

 if __name__ == '__main__':
     main() # pylint: disable=no-value-for-parameter
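
A note on the path handling above: cloudfuse mounts the cpg-{dataset}-main-analysis bucket at /vcffuse, so the loop rewrites each gs:// URI into a path under the mount by stripping the scheme and the bucket name. A minimal sketch of that rewrite, using a hypothetical bucket and sample ID (only the /vcffuse mount point and the _eh.reheader.vcf.gz suffix come from the script):

    import os

    def to_fuse_path(gcs_path: str, mount_point: str = '/vcffuse') -> str:
        # 'gs://<bucket>/<key>' -> '<mount_point>/<key>'
        suffix = gcs_path.removeprefix('gs://').split('/', maxsplit=1)[1]
        return os.path.join(mount_point, suffix)

    # Hypothetical input path:
    print(to_fuse_path('gs://cpg-example-main-analysis/str/CPG000000_eh.reheader.vcf.gz'))
    # -> /vcffuse/str/CPG000000_eh.reheader.vcf.gz

The rewrite only resolves to a real file when the input URI points into the single mounted bucket; an input from any other bucket would yield a path that does not exist under /vcffuse.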
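The main change in this commit is to stop writing merged output through the fuse mount and instead chain two jobs with Hail Batch resource groups: mergeSTR fills {root}.vcf in the first job, the zip job produces {root}.vcf.gz and {root}.vcf.gz.tbi side by side, and b.write_output copies the finished group to the output prefix. A minimal self-contained sketch of the same pattern, assuming hailtop.batch with a local backend and using gzip/touch as stand-ins for bgzip/tabix so it runs without the TRTools image:

    import hailtop.batch as hb

    # LocalBackend runs each job as a local subprocess; the real script gets a
    # service-backed Batch from cpg_workflows' get_batch() instead.
    b = hb.Batch(name='resource-group-sketch', backend=hb.LocalBackend())

    merge_job = b.new_job(name='merge')
    # One named member; '{root}' expands to a job-specific temporary path.
    merge_job.declare_resource_group(vcf_output={'vcf': '{root}.vcf'})
    merge_job.command(f'echo "##fileformat=VCFv4.2" > {merge_job.vcf_output["vcf"]}')

    zip_job = b.new_job(name='zip-and-index')
    # Two members sharing the same '{root}': the compressed VCF and its index.
    zip_job.declare_resource_group(
        vcf_output={'vcf.gz': '{root}.vcf.gz', 'vcf.gz.tbi': '{root}.vcf.gz.tbi'}
    )
    # Referencing merge_job's resource makes zip_job depend on merge_job and
    # stages the intermediate VCF into zip_job's environment automatically.
    zip_job.command(f'gzip -c {merge_job.vcf_output["vcf"]} > {zip_job.vcf_output["vcf.gz"]}')
    zip_job.command(f'touch {zip_job.vcf_output["vcf.gz.tbi"]}')  # stand-in for tabix

    # For a resource group the destination acts as a prefix: the members land at
    # /tmp/merged_sketch.vcf.gz and /tmp/merged_sketch.vcf.gz.tbi.
    b.write_output(zip_job.vcf_output, '/tmp/merged_sketch')

    b.run()

Keeping the intermediate VCF in batch-managed storage avoids the earlier pattern of bgzipping inside the merge job and writing directly to the fuse mount; only the final compressed VCF and its index leave the batch.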


