Merge pull request #45 from msk-access/hotfix-sexmismatch-nan-noregions

Hotfix sexmismatch nan noregions
msk-access · Jun 8, 2021 · f9bb131 · f9bb131
2 parents 5820383 + f75abc6
commit f9bb131
Show file tree

Hide file tree

Showing 5 changed files with 73 additions and 11 deletions.
diff --git a/biometrics/VERSION b/biometrics/VERSION
@@ -1 +1 @@
-0.2.11
+0.2.12
diff --git a/biometrics/extract.py b/biometrics/extract.py
@@ -55,7 +55,7 @@ def _parse_bed_file(self):
         # only keep Y chrom regions
         self.regions = self.regions[self.regions[0].isin(['Y', 'chrY'])]
         if len(self.regions) == 0:
-            print('There are not Y chromosome regions!')
+            print('There are no Y chromosome regions. Cannot determine if there is a sex mismatch.')
 
         self.regions.columns = range(self.regions.shape[1])
 
@@ -87,9 +87,9 @@ def _extract_regions(self, sample):
                 'end': end,
                 'count': count})
 
-        region_counts = pd.DataFrame(region_counts)
-
-        sample.region_counts = region_counts
+        if len(region_counts) > 0:
+            region_counts = pd.DataFrame(region_counts)
+            sample.region_counts = region_counts
 
         return sample
 

diff --git a/biometrics/sample.py b/biometrics/sample.py
@@ -29,9 +29,9 @@ def __init__(self, sample_name=None, sample_bam=None, sample_group=None,
 
         if self.sample_name is not None:
             if db is not None:
-                self.extraction_file = os.path.join(db, self.sample_name + '.pk')
+                self.extraction_file = os.path.join(db, self.sample_name + '.pickle')
             else:
-                self.extraction_file = self.sample_name + '.pk'
+                self.extraction_file = self.sample_name + '.pickle'
 
     def save_to_file(self):
 

diff --git a/tests/test_biometrics.py b/tests/test_biometrics.py
@@ -8,6 +8,7 @@
 from unittest import TestCase
 from unittest import mock
 
+import pandas as pd
 from biometrics.biometrics import get_samples, run_minor_contamination, run_major_contamination
 from biometrics.cli import get_args
 from biometrics.extract import Extract
@@ -93,6 +94,10 @@ def test_extract_sample(self):
         self.assertIsNotNone(samples['test_sample1'].pileup, msg='Sample pileup was not loaded correctly.')
         self.assertEqual(samples['test_sample1'].pileup.shape[0], 15, msg='Did not find pileup for 4 variants. Found: {}.'.format(samples['test_sample1'].pileup))
 
+        self.assertIsNotNone(
+            samples['test_sample1'].region_counts,
+            msg='Sample bed file was not loaded correctly.')
+
 
 class TestLoadData(TestCase):
     """Tests load data by sample name in `biometrics` package."""
@@ -110,7 +115,7 @@ class TestLoadData(TestCase):
             database=os.path.join(CUR_DIR, 'test_data/'),
             vcf=None,
             fafile=None,
-            bed=None,
+            bed=os.path.join(CUR_DIR, 'test_data/test.bed'),
             min_mapping_quality=None,
             min_base_quality=None,
             min_coverage=None,
@@ -149,8 +154,8 @@ class TestLoadDataPickle(TestCase):
         return_value=argparse.Namespace(
             subparser_name='extract',
             input=[
-                os.path.join(CUR_DIR, 'test_data/test_sample1.pk'),
-                os.path.join(CUR_DIR, 'test_data/test_sample2.pk')],
+                os.path.join(CUR_DIR, 'test_data/test_sample1.pickle'),
+                os.path.join(CUR_DIR, 'test_data/test_sample2.pickle')],
             sample_bam=None,
             sample_name=None,
             sample_type=None,
@@ -206,7 +211,7 @@ class TestDownstreamTools(TestCase):
             database=os.path.join(CUR_DIR, 'test_data/'),
             vcf=None,
             fafile=None,
-            bed=None,
+            bed=os.path.join(CUR_DIR, 'test_data/test.bed'),
             min_mapping_quality=None,
             min_base_quality=None,
             min_coverage=None,
@@ -297,3 +302,59 @@ def test_sexmismatch(self):
 
         self.assertEqual(set(results['expected_sex']), set(['M']), msg='Expected all samples to have an expected sex of M.')
         self.assertEqual(set(results['predicted_sex']), set(['M']), msg='Expected all samples to not have a sex mismatch.')
+
+
+class TestNASexMismatch(TestCase):
+    """Check that the predicted sex is NA/NaN when the BED file has no Y chromosome regions."""
+
+    @mock.patch(
+        'argparse.ArgumentParser.parse_args',
+        return_value=argparse.Namespace(
+            subparser_name='extract',
+            input=None,
+            sample_bam=[
+                os.path.join(CUR_DIR, 'test_data/test_sample1_golden.bam'),
+                os.path.join(CUR_DIR, 'test_data/test_sample2_golden.bam')],
+            sample_name=['test_sample1', 'test_sample2'],
+            sample_type=['tumor', 'tumor'],
+            sample_group=['patient1', 'patient1'],
+            sample_sex=['M', 'M'],
+            database=os.path.join(CUR_DIR, 'test_data/'),
+            vcf=os.path.join(CUR_DIR, 'test_data/test.vcf'),
+            fafile=os.path.join(CUR_DIR, 'test_data/ref.fasta'),
+            bed=os.path.join(CUR_DIR, 'test_data/test-noY.bed'),  # fixture holds a single X region and no Y region
+            min_mapping_quality=1,
+            min_base_quality=1,
+            min_coverage=10,
+            minor_threshold=0.002,
+            major_threshold=0.6,
+            discordance_threshold=0.05,
+            coverage_threshold=50,
+            min_homozygous_thresh=0.1,
+            zmin=None,
+            zmax=None,
+            outdir='.',
+            json=None,
+            plot=False,
+            default_genotype=None,
+            overwrite=True,
+            no_db_compare=False,
+            prefix='test',
+            version=False,
+            threads=1))
+    def setUp(self, mock_args):  # mock_args is the mock injected by the patch decorator; it is not used
+        """Store the result of `get_args()` on `self.args` while `ArgumentParser.parse_args` is patched."""
+
+        self.args = get_args()
+
+    def test_sexmismatch_noY(self):
+        """Run extraction and mismatch detection with the no-Y BED file and expect a NaN predicted sex."""
+        extractor = Extract(self.args)
+        samples = get_samples(self.args, extraction_mode=True)
+        samples = extractor.extract(samples)
+
+        sex_mismatch = SexMismatch(self.args.coverage_threshold)
+        results = sex_mismatch.detect_mismatch(samples)
+
+        self.assertTrue(  # only the result row labelled 0 is inspected
+            pd.isna(results.at[0, 'predicted_sex']), msg='Predicted sample sex should have been nan.')
diff --git a/tests/test_data/test-noY.bed b/tests/test_data/test-noY.bed
@@ -0,0 +1 @@
+X	1	200