populationgenomics · hopedisastro · Aug 27, 2023 · Aug 21, 2023 · Aug 21, 2023 · Aug 21, 2023
diff --git a/str/trtools/merge_str_runner.py b/str/trtools/merge_str_runner.py
@@ -4,7 +4,7 @@
 Please ensure merge_prep.py has been run on the vcf files prior to running mergeSTR.py
 
 For example: 
-analysis-runner --access-level test --dataset tob-wgs --description 'tester --output-dir 'tester' merge_str_runner.py --input-dir=gs://cpg-tob-wgs-main/str/expansionhunter/pure_repeats --dataset=tob-wgs TOBXXXX TOBXXXX
+analysis-runner --access-level test --dataset tob-wgs --description 'tester' --output-dir 'str/expansionhunter/test_chr_22/mergeSTR' merge_str_runner.py --input-dir=gs://cpg-tob-wgs-test-analysis/str/expansionhunter/test_chr_22/mergeSTRprep --dataset=tob-wgs CPG199760 CPG199778
 
 Required packages: sample-metadata, hail, click, os
 pip install sample-metadata hail click
@@ -13,8 +13,6 @@
 
 import click
 
-from sample_metadata.apis import SampleApi
-
 from cpg_utils.config import get_config
 from cpg_utils.hail_batch import output_path
 from cpg_workflows.batch import get_batch
@@ -33,42 +31,51 @@
 # input directory
 @click.option('--input-dir', help='gs://...')
 # input sample ID
-@click.argument('external-wgs-ids', nargs=-1)
+@click.argument('internal-wgs-ids', nargs=-1)
 @click.command()
 def main(
-    dataset, input_dir, external_wgs_ids: list[str]
+    dataset, input_dir, internal_wgs_ids: list[str]
 ):  # pylint: disable=missing-function-docstring
     # Initializing Batch
     b = get_batch()
 
-    external_id_to_cpg_id: dict[str, str] = SampleApi().get_sample_id_map_by_external(
-        dataset, list(external_wgs_ids)
-    )
-
+    # Initialise TRTools job to run mergeSTR
     trtools_job = b.new_job(name='mergeSTR')
     trtools_job.image(TRTOOLS_IMAGE)
     trtools_job.cpu(8)
-    trtools_job.cloudfuse(f'cpg-{dataset}-main', '/vcffuse')
+    # mount using cloudfuse for reading input files
+    trtools_job.cloudfuse(f'cpg-{dataset}-main-analysis', '/vcffuse')
+    trtools_job.declare_resource_group(
+        vcf_output={
+            'vcf': '{root}.vcf',
+            'vcf.gz': '{root}.vcf.gz',
+            'vcf.gz.tbi': '{root}.vcf.gz.tbi',
+        }
+    )
 
+    # read in input file paths
     vcffuse_path = []
-    for id in list(external_id_to_cpg_id.values()):
+    for id in list(internal_wgs_ids):
         vcf = os.path.join(input_dir, f'{id}_eh.reheader.vcf.gz')
         suffix = vcf.removeprefix('gs://').split('/', maxsplit=1)[1]
         vcffuse_path.append(f'/vcffuse/{suffix}')
     num_samples = len(vcffuse_path)
     vcffuse_path = ','.join(vcffuse_path)  # string format for input into mergeSTR
 
-    output_path_vcf = output_path(f'mergeSTR_{num_samples}_samples_eh.vcf.bgz')
-    output_path_vcf = output_path_vcf.removeprefix('gs://').split('/', maxsplit=1)[1]
-
     trtools_job.command(
         f"""
-    mergeSTR --vcfs {vcffuse_path} --out temp.vcf --vcftype eh
-    bgzip -c temp.vcf > /vcffuse/{output_path_vcf}
-    tabix /vcffuse/{output_path_vcf}
+    mergeSTR --vcfs {vcffuse_path} --out {trtools_job.vcf_output} --vcftype eh
+    bgzip -c {trtools_job.vcf_output}.vcf > {trtools_job.vcf_output['vcf.gz']}
+    tabix -f -p vcf {trtools_job.vcf_output['vcf.gz']}  > {trtools_job.vcf_output['vcf.gz.tbi']}
     """
     )
 
+    output_path_name = output_path(f'mergeSTR_{num_samples}_samples_eh', 'analysis')
+    b.write_output(trtools_job.vcf_output['vcf.gz'], f'{output_path_name}.vcf.gz')
+    b.write_output(
+        trtools_job.vcf_output['vcf.gz.tbi'], f'{output_path_name}.vcf.gz.tbi'
+    )
+
     b.run(wait=False)