Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update merge_str_runner.py #85

Merged
merged 9 commits into from
Aug 27, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 24 additions & 17 deletions str/trtools/merge_str_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
Please ensure merge_prep.py has been run on the vcf files prior to running mergeSTR.py

For example:
analysis-runner --access-level test --dataset tob-wgs --description 'tester --output-dir 'tester' merge_str_runner.py --input-dir=gs://cpg-tob-wgs-main/str/expansionhunter/pure_repeats --dataset=tob-wgs TOBXXXX TOBXXXX
analysis-runner --access-level test --dataset tob-wgs --description 'tester' --output-dir 'str/expansionhunter/test_chr_22/mergeSTR' merge_str_runner.py --input-dir=gs://cpg-tob-wgs-test-analysis/str/expansionhunter/test_chr_22/mergeSTRprep --dataset=tob-wgs CPG199760 CPG199778

Required packages: sample-metadata, hail, click, os
pip install sample-metadata hail click
Expand All @@ -13,8 +13,6 @@

import click

from sample_metadata.apis import SampleApi

from cpg_utils.config import get_config
from cpg_utils.hail_batch import output_path
from cpg_workflows.batch import get_batch
Expand All @@ -33,42 +31,51 @@
# input directory
@click.option('--input-dir', help='gs://...')
# input sample ID
@click.argument('external-wgs-ids', nargs=-1)
@click.argument('internal-wgs-ids', nargs=-1)
@click.command()
def main(
dataset, input_dir, external_wgs_ids: list[str]
dataset, input_dir, internal_wgs_ids: list[str]
): # pylint: disable=missing-function-docstring
# Initializing Batch
b = get_batch()

external_id_to_cpg_id: dict[str, str] = SampleApi().get_sample_id_map_by_external(
dataset, list(external_wgs_ids)
)

# Initialise TRTools job to run mergeSTR
trtools_job = b.new_job(name='mergeSTR')
trtools_job.image(TRTOOLS_IMAGE)
trtools_job.cpu(8)
trtools_job.cloudfuse(f'cpg-{dataset}-main', '/vcffuse')
# mount using cloudfuse for reading input files
trtools_job.cloudfuse(f'cpg-{dataset}-main-analysis', '/vcffuse')
trtools_job.declare_resource_group(
vcf_output={
'vcf': '{root}.vcf',
'vcf.gz': '{root}.vcf.gz',
'vcf.gz.tbi': '{root}.vcf.gz.tbi',
}
)

# read in input file paths
vcffuse_path = []
for id in list(external_id_to_cpg_id.values()):
for id in list(internal_wgs_ids):
vcf = os.path.join(input_dir, f'{id}_eh.reheader.vcf.gz')
suffix = vcf.removeprefix('gs://').split('/', maxsplit=1)[1]
vcffuse_path.append(f'/vcffuse/{suffix}')
num_samples = len(vcffuse_path)
vcffuse_path = ','.join(vcffuse_path) # string format for input into mergeSTR

output_path_vcf = output_path(f'mergeSTR_{num_samples}_samples_eh.vcf.bgz')
output_path_vcf = output_path_vcf.removeprefix('gs://').split('/', maxsplit=1)[1]

trtools_job.command(
f"""
mergeSTR --vcfs {vcffuse_path} --out temp.vcf --vcftype eh
bgzip -c temp.vcf > /vcffuse/{output_path_vcf}
tabix /vcffuse/{output_path_vcf}
mergeSTR --vcfs {vcffuse_path} --out {trtools_job.vcf_output} --vcftype eh
bgzip -c {trtools_job.vcf_output}.vcf > {trtools_job.vcf_output['vcf.gz']}
hopedisastro marked this conversation as resolved.
Show resolved Hide resolved
tabix -f -p vcf {trtools_job.vcf_output['vcf.gz']} > {trtools_job.vcf_output['vcf.gz.tbi']}
"""
)

output_path_name = output_path(f'mergeSTR_{num_samples}_samples_eh', 'analysis')
b.write_output(trtools_job.vcf_output['vcf.gz'], f'{output_path_name}.vcf.gz')
b.write_output(
trtools_job.vcf_output['vcf.gz.tbi'], f'{output_path_name}.vcf.gz.tbi'
)

b.run(wait=False)


Expand Down
Loading