Skip to content

Commit

Permalink
Update merge_str_prep.py (#84)
Browse files Browse the repository at this point in the history
* Update merge_str_prep.py

* linting

* Update str/trtools/merge_str_prep.py

Co-authored-by: Matt Welland <mattwellie@gmail.com>

---------

Co-authored-by: Matt Welland <mattwellie@gmail.com>
  • Loading branch information
hopedisastro and MattWellie authored Aug 21, 2023
1 parent 7432f2c commit 0bc398d
Showing 1 changed file with 6 additions and 15 deletions.
21 changes: 6 additions & 15 deletions str/trtools/merge_str_prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
This script prepares GangSTR/EH VCF files for input into mergeSTR.
Required input: --caller, --input-dir, and internal (CPG) sample IDs
For example:
analysis-runner --access-level test --dataset tob-wgs --description 'tester --output-dir 'tester' merge_prep.py --caller=eh --input-dir=gs://cpg-tob-wgs-main/str/expansionhunter/pure_repeats --dataset=tob-wgs TOBXXXX TOBXXXX
analysis-runner --access-level test --dataset tob-wgs --description 'tester' --output-dir 'tester' merge_str_prep.py --caller=eh --input-dir=gs://cpg-tob-wgs-main/str/expansionhunter/pure_repeats CPGXXXX CPGXXXX
Required packages: sample-metadata, hail, click, os
pip install sample-metadata hail click
Expand All @@ -13,8 +13,6 @@
import os
import click

from sample_metadata.apis import SampleApi

from cpg_utils.config import get_config
from cpg_utils.hail_batch import output_path
from cpg_workflows.batch import get_batch
Expand All @@ -33,23 +31,17 @@
help='gangstr or eh',
type=click.Choice(['eh', 'gangstr'], case_sensitive=True),
)
# dataset
@click.option('--dataset', help='dataset eg tob-wgs')
# input directory
@click.option('--input-dir', help='gs://...')
# input sample ID
@click.argument('external-wgs-ids', nargs=-1)
@click.argument('internal-wgs-ids', nargs=-1)
@click.command()
def main(
dataset, caller, input_dir, external_wgs_ids: list[str]
caller, input_dir, internal_wgs_ids: list[str]
): # pylint: disable=missing-function-docstring
# Initializing Batch
b = get_batch()

external_id_to_cpg_id: dict[str, str] = SampleApi().get_sample_id_map_by_external(
dataset, list(external_wgs_ids)
)

# Working with CRAM files requires the reference fasta
ref = b.read_input_group(
**dict(
Expand All @@ -62,8 +54,7 @@ def main(

input_vcf_dict = {}

for id in list(external_id_to_cpg_id.values()):
input_vcf_dict[id] = os.path.join(input_dir, f'{id}_{caller}.vcf')
input_vcf_dict = {id: os.path.join(input_dir, f'{id}_{caller}.vcf') for id in internal_wgs_ids}

for id in list(input_vcf_dict.keys()):
bcftools_job = b.new_job(name=f'{id} {caller} Files prep')
Expand Down Expand Up @@ -93,7 +84,7 @@ def main(
"""
)
# Output writing
output_path_eh = output_path(f'{id}_eh')
output_path_eh = output_path(f'{id}_eh', 'analysis')
b.write_output(bcftools_job.vcf_sorted, output_path_eh)

else:
Expand All @@ -113,7 +104,7 @@ def main(
"""
)
# Output writing
output_path_gangstr = output_path(f'{id}_gangstr')
output_path_gangstr = output_path(f'{id}_gangstr', 'analysis')
b.write_output(bcftools_job.vcf_sorted, output_path_gangstr)

b.run(wait=False)
Expand Down

0 comments on commit 0bc398d

Please sign in to comment.