Use unmasked ref fasta if 'hgdp' is specified in the dataset param (#224)

* try unmasked fasta for hgdp * Update str_iterative_eh_runner.py * check if 'hgdp' in dataset * Update get_cis_numpy_files.py * remove \n from 'sex' variable
populationgenomics · Jun 4, 2024 · 21781ee · 21781ee
1 parent d44310f
commit 21781ee
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 2 deletions.
diff --git a/str/associatr/get_cis_numpy_files.py b/str/associatr/get_cis_numpy_files.py
@@ -51,7 +51,7 @@ def extract_genotypes(vcf_file, loci):
         for record in vcf_reader(f'{chrom}:{pos}-{pos}'):
             if record.CHROM == chrom and record.POS == pos:
                 gt = record.gt_types
-                gt[gt == 3] = 2 #HOM ALT is coded as 3; change it to 2
+                gt[gt == 3] = 2  # HOM ALT is coded as 3; change it to 2
                 results[locus] = gt
                 break
 

diff --git a/str/runners/str_iterative_eh_runner.py b/str/runners/str_iterative_eh_runner.py
@@ -86,7 +86,10 @@ def main(
     b = get_batch()
 
     # Reference fasta
-    ref_fasta = str(reference_path('broad/ref_fasta'))
+    if 'hgdp' in dataset:
+        ref_fasta = 'gs://cpg-common-main/references/hg38/v0/Homo_sapiens_assembly38.fasta'
+    else:
+        ref_fasta = str(reference_path('broad/ref_fasta'))
     ref = b.read_input_group(
         **dict(
             base=ref_fasta,
@@ -112,6 +115,7 @@ def main(
             split_line = line.split(',')
             cpg_id = split_line[0]
             sex = split_line[2]
+            sex = sex.replace('\n', '')
             if cpg_id == 's':  # header line
                 continue
             if sex == 'XY':