Merge pull request #73 from hall-lab/dup_improvements

Genotyping calculation improvements
hall-lab · Jan 19, 2018 · a9eb890 · a9eb890
2 parents 376bd1d + 8ed6810
commit a9eb890
Show file tree

Hide file tree

Showing 5 changed files with 90 additions and 94 deletions.
diff --git a/svtyper/classic.py b/svtyper/classic.py
@@ -433,7 +433,8 @@ def sv_genotype(bam_string,
                 QR = int(split_weight * ref_seq) + int(disc_weight * ref_span)
                 QA = int(split_weight * alt_splitters) + int(disc_weight * alt_span)
                 gt_lplist = bayes_gt(QR, QA, is_dup)
-                gt_idx = gt_lplist.index(max(gt_lplist))
+                best, second_best = sorted([ (i, e) for i, e in enumerate(gt_lplist) ], key=lambda(x): x[1], reverse=True)[0:2]
+                gt_idx = best[0]
 
                 # print log probabilities of homref, het, homalt
                 if debug:
@@ -468,10 +469,7 @@ def sv_genotype(bam_string,
                 if gt_sum > 0:
                     gt_sum_log = math.log(gt_sum, 10)
                     sample_qual = abs(-10 * (gt_lplist[0] - gt_sum_log)) # phred-scaled probability site is non-reference in this sample
-                    if 1 - (10**gt_lplist[gt_idx] / 10**gt_sum_log) == 0:
-                        phred_gq = 200
-                    else:
-                        phred_gq = abs(-10 * math.log(1 - (10**gt_lplist[gt_idx] / 10**gt_sum_log), 10))
+                    phred_gq = min(-10 * (second_best[1] - best[1]), 200)
                     var.genotype(sample.name).set_format('GQ', int(phred_gq))
                     var.genotype(sample.name).set_format('SQ', sample_qual)
                     var.qual += sample_qual

diff --git a/svtyper/singlesample.py b/svtyper/singlesample.py
@@ -413,7 +413,8 @@ def bayesian_genotype(breakpoint, counts, split_weight, disc_weight, debug):
 
     # the actual bayesian calculation and decision
     gt_lplist = bayes_gt(QR, QA, is_dup)
-    gt_idx = gt_lplist.index(max(gt_lplist))
+    best, second_best = sorted([ (i, e) for i, e in enumerate(gt_lplist) ], key=lambda(x): x[1], reverse=True)[0:2]
+    gt_idx = best[0]
 
     # print log probabilities of homref, het, homalt
     if debug:
@@ -450,10 +451,7 @@ def bayesian_genotype(breakpoint, counts, split_weight, disc_weight, debug):
     if gt_sum > 0:
         gt_sum_log = math.log(gt_sum, 10)
         sample_qual = abs(-10 * (gt_lplist[0] - gt_sum_log)) # phred-scaled probability site is non-reference in this sample
-        if 1 - (10**gt_lplist[gt_idx] / 10**gt_sum_log) == 0:
-            phred_gq = 200
-        else:
-            phred_gq = abs(-10 * math.log(1 - (10**gt_lplist[gt_idx] / 10**gt_sum_log), 10))
+        phred_gq = min(-10 * (second_best[1] - best[1]), 200)
         result['formats']['GQ'] = int(phred_gq)
         result['formats']['SQ'] = sample_qual
         result['qual'] += sample_qual

diff --git a/svtyper/statistics.py b/svtyper/statistics.py
@@ -23,7 +23,7 @@ def log_choose(n, k):
 def bayes_gt(ref, alt, is_dup):
     # probability of seeing an alt read with true genotype of hom_ref, het, hom_alt respectively
     if is_dup: # specialized logic to handle non-destructive events such as duplications
-        p_alt = [1e-2, 1/3.0, 0.5]
+        p_alt = [1e-2, 0.2, 1/3.0]
     else:
         p_alt = [1e-3, 0.5, 0.9]
 

diff --git a/svtyper/version.py b/svtyper/version.py
@@ -1,2 +1,2 @@
 __author__ = "Colby Chiang (colbychiang@wustl.edu)"
-__version__ = "v0.4.0"
+__version__ = "v0.5.0"