Update merge_str_runner.py
hopedisastro committed Aug 21, 2023
1 parent 7432f2c commit 0d4e15b
Showing 1 changed file with 41 additions and 20 deletions.
61 changes: 41 additions & 20 deletions str/trtools/merge_str_runner.py
@@ -4,7 +4,7 @@
 Please ensure merge_prep.py has been run on the vcf files prior to running mergeSTR.py
 For example:
-analysis-runner --access-level test --dataset tob-wgs --description 'tester --output-dir 'tester' merge_str_runner.py --input-dir=gs://cpg-tob-wgs-main/str/expansionhunter/pure_repeats --dataset=tob-wgs TOBXXXX TOBXXXX
+analysis-runner --access-level test --dataset tob-wgs --description 'tester' --output-dir 'str/expansionhunter/test_chr_22/mergeSTR' merge_str_runner.py --input-dir=gs://cpg-tob-wgs-test-analysis/str/expansionhunter/test_chr_22/mergeSTRprep --dataset=tob-wgs CPG199760 CPG199778
 Required packages: sample-metadata, hail, click, os
 pip install sample-metadata hail click
@@ -13,8 +13,6 @@
 
 import click
 
-from sample_metadata.apis import SampleApi
-
 from cpg_utils.config import get_config
 from cpg_utils.hail_batch import output_path
 from cpg_workflows.batch import get_batch
@@ -33,44 +31,67 @@
 # input directory
 @click.option('--input-dir', help='gs://...')
 # input sample ID
-@click.argument('external-wgs-ids', nargs=-1)
+@click.argument('internal-wgs-ids', nargs=-1)
 @click.command()
 def main(
-    dataset, input_dir, external_wgs_ids: list[str]
+    dataset, input_dir, internal_wgs_ids: list[str]
 ): # pylint: disable=missing-function-docstring
     # Initializing Batch
     b = get_batch()
 
-    external_id_to_cpg_id: dict[str, str] = SampleApi().get_sample_id_map_by_external(
-        dataset, list(external_wgs_ids)
-    )

+    #Initialise TRTools job to run mergeSTR
     trtools_job = b.new_job(name='mergeSTR')
     trtools_job.image(TRTOOLS_IMAGE)
     trtools_job.cpu(8)
-    trtools_job.cloudfuse(f'cpg-{dataset}-main', '/vcffuse')
 
+    #mount using cloudfuse for reading input files
+    trtools_job.cloudfuse(f'cpg-{dataset}-main-analysis', '/vcffuse')
+    trtools_job.declare_resource_group(
+        vcf_output={
+            'vcf': '{root}.vcf'
+        }
+    )

+    #Initialise job to zip and index the mergeSTR VCF prior to writing to output
+    zip_job = b.new_job(name = 'Zip and index')
+    zip_job.image(TRTOOLS_IMAGE)
+    zip_job.cpu(8)
+    zip_job.declare_resource_group(
+        vcf_output={
+            'vcf.gz':'{root}.vcf.gz',
+            'vcf.gz.tbi':'{root}.vcf.gz.tbi'
+        }
+    )


     #read in input file paths
     vcffuse_path = []
-    for id in list(external_id_to_cpg_id.values()):
+    for id in list(internal_wgs_ids):
         vcf = os.path.join(input_dir, f'{id}_eh.reheader.vcf.gz')
         suffix = vcf.removeprefix('gs://').split('/', maxsplit=1)[1]
         vcffuse_path.append(f'/vcffuse/{suffix}')
     num_samples = len(vcffuse_path)
     vcffuse_path = ','.join(vcffuse_path) # string format for input into mergeSTR

-    output_path_vcf = output_path(f'mergeSTR_{num_samples}_samples_eh.vcf.bgz')
-    output_path_vcf = output_path_vcf.removeprefix('gs://').split('/', maxsplit=1)[1]


     trtools_job.command(
         f"""
-        mergeSTR --vcfs {vcffuse_path} --out temp.vcf --vcftype eh
-        bgzip -c temp.vcf > /vcffuse/{output_path_vcf}
-        tabix /vcffuse/{output_path_vcf}
+        mergeSTR --vcfs {vcffuse_path} --out {trtools_job.vcf_output} --vcftype eh
         """
     )

-    b.run(wait=False)
+    zip_job.command(
+        f"""
+        bgzip -c {trtools_job.vcf_output['vcf']} > {zip_job.vcf_output['vcf.gz']}
+        tabix -p vcf {zip_job.vcf_output['vcf.gz']}
+        """
+    )
+
+    output_path_name = output_path(f'mergeSTR_{num_samples}_samples_eh', 'analysis')
+    b.write_output(zip_job.vcf_output, output_path_name)
+
+    b.run(wait=False)

 if __name__ == '__main__':
     main() # pylint: disable=no-value-for-parameter
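
A note on the path handling above: cloudfuse mounts the cpg-{dataset}-main-analysis bucket at /vcffuse, so the loop rewrites each gs:// URI into a path under the mount by stripping the scheme and the bucket name. A minimal sketch of that rewrite, using a hypothetical bucket and sample ID (only the /vcffuse mount point and the _eh.reheader.vcf.gz suffix come from the script):

    import os

    def to_fuse_path(gcs_path: str, mount_point: str = '/vcffuse') -> str:
        # 'gs://<bucket>/<key>' -> '<mount_point>/<key>'
        suffix = gcs_path.removeprefix('gs://').split('/', maxsplit=1)[1]
        return os.path.join(mount_point, suffix)

    # Hypothetical input path:
    print(to_fuse_path('gs://cpg-example-main-analysis/str/CPG000000_eh.reheader.vcf.gz'))
    # -> /vcffuse/str/CPG000000_eh.reheader.vcf.gz

The rewrite only resolves to a real file when the input URI points into the single mounted bucket; an input from any other bucket would yield a path that does not exist under /vcffuse.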
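The main change in this commit is to stop writing merged output through the fuse mount and instead chain two jobs with Hail Batch resource groups: mergeSTR fills {root}.vcf in the first job, the zip job produces {root}.vcf.gz and {root}.vcf.gz.tbi side by side, and b.write_output copies the finished group to the output prefix. A minimal self-contained sketch of the same pattern, assuming hailtop.batch with a local backend and using gzip/touch as stand-ins for bgzip/tabix so it runs without the TRTools image:

    import hailtop.batch as hb

    # LocalBackend runs each job as a local subprocess; the real script gets a
    # service-backed Batch from cpg_workflows' get_batch() instead.
    b = hb.Batch(name='resource-group-sketch', backend=hb.LocalBackend())

    merge_job = b.new_job(name='merge')
    # One named member; '{root}' expands to a job-specific temporary path.
    merge_job.declare_resource_group(vcf_output={'vcf': '{root}.vcf'})
    merge_job.command(f'echo "##fileformat=VCFv4.2" > {merge_job.vcf_output["vcf"]}')

    zip_job = b.new_job(name='zip-and-index')
    # Two members sharing the same '{root}': the compressed VCF and its index.
    zip_job.declare_resource_group(
        vcf_output={'vcf.gz': '{root}.vcf.gz', 'vcf.gz.tbi': '{root}.vcf.gz.tbi'}
    )
    # Referencing merge_job's resource makes zip_job depend on merge_job and
    # stages the intermediate VCF into zip_job's environment automatically.
    zip_job.command(f'gzip -c {merge_job.vcf_output["vcf"]} > {zip_job.vcf_output["vcf.gz"]}')
    zip_job.command(f'touch {zip_job.vcf_output["vcf.gz.tbi"]}')  # stand-in for tabix

    # For a resource group the destination acts as a prefix: the members land at
    # /tmp/merged_sketch.vcf.gz and /tmp/merged_sketch.vcf.gz.tbi.
    b.write_output(zip_job.vcf_output, '/tmp/merged_sketch')

    b.run()

Keeping the intermediate VCF in batch-managed storage avoids the earlier pattern of bgzipping inside the merge job and writing directly to the fuse mount; only the final compressed VCF and its index leave the batch.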


