From 21781ee61499937c75ce4def85fc719cbada1413 Mon Sep 17 00:00:00 2001 From: Hope Tanudisastro <40783500+hopedisastro@users.noreply.github.com> Date: Wed, 5 Jun 2024 07:17:15 +1000 Subject: [PATCH] Use unmasked ref fasta if specify 'hgdp' in dataset param (#224) * try unmasked fasta for hgdp * Update str_iterative_eh_runner.py * check if 'hgdp' in * Update get_cis_numpy_files.py * remove \n from 'sex' variable --- str/associatr/get_cis_numpy_files.py | 2 +- str/runners/str_iterative_eh_runner.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/str/associatr/get_cis_numpy_files.py b/str/associatr/get_cis_numpy_files.py index dc00c09a..2693b7f0 100644 --- a/str/associatr/get_cis_numpy_files.py +++ b/str/associatr/get_cis_numpy_files.py @@ -51,7 +51,7 @@ def extract_genotypes(vcf_file, loci): for record in vcf_reader(f'{chrom}:{pos}-{pos}'): if record.CHROM == chrom and record.POS == pos: gt = record.gt_types - gt[gt == 3] = 2 #HOM ALT is coded as 3; change it to 2 + gt[gt == 3] = 2 # HOM ALT is coded as 3; change it to 2 results[locus] = gt break diff --git a/str/runners/str_iterative_eh_runner.py b/str/runners/str_iterative_eh_runner.py index f13e19f0..024a6b95 100644 --- a/str/runners/str_iterative_eh_runner.py +++ b/str/runners/str_iterative_eh_runner.py @@ -86,7 +86,10 @@ def main( b = get_batch() # Reference fasta - ref_fasta = str(reference_path('broad/ref_fasta')) + if 'hgdp' in dataset: + ref_fasta = 'gs://cpg-common-main/references/hg38/v0/Homo_sapiens_assembly38.fasta' + else: + ref_fasta = str(reference_path('broad/ref_fasta')) ref = b.read_input_group( **dict( base=ref_fasta, @@ -112,6 +115,7 @@ def main( split_line = line.split(',') cpg_id = split_line[0] sex = split_line[2] + sex = sex.replace('\n', '') if cpg_id == 's': # header line continue if sex == 'XY':