Skip to content

Commit

Permalink
Correction to report splitting (#420)
Browse files Browse the repository at this point in the history
* resolve issues with pathlib.Path.parent

* delete some more linting exclusions

* restricts pedigree generation to use only the specific sequencing type

* also include sequencing technology as a query parameter

* Bump version: 5.1.1 → 5.1.2
  • Loading branch information
MattWellie authored Jul 17, 2024
1 parent 7e92956 commit 28a0811
Show file tree
Hide file tree
Showing 8 changed files with 29 additions and 24 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 5.1.1
current_version = 5.1.2
commit = True
tag = False

Expand Down
4 changes: 0 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,12 @@ ignore = [
"E731", # Do not assign a lambda expression, use a def
"G004", # Logging statement uses f-string
"PLW0603", # Using the global statement to update `<VAR>` is discouraged
"PT018", # Assertion should be broken down into multiple parts
"Q000", # Single quotes found but double quotes preferred
"S101", # Use of assert detected
"PLR0911", # Too many return statements (> 6)
"PLR0912", # Too many branches (> 12)
"PLR0913", # Too many arguments in function (> 5)
"C901", # method is too complex (> 10 conditions)
"N999", # invalid module name (not lower-case and potholes)
"TCH001", # move application import *** into a type-checking block
"ERA001", # found commented out code
"I001", # Import block is un-sorted or un-formatted (to remove soon)
]

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def read_reqs(filename: str) -> list[str]:
name='talos',
description='Centre for Population Genomics Variant Prioritisation',
long_description=readme,
version='5.1.1',
version='5.1.2',
author='Matthew Welland, CPG',
author_email='matthew.welland@populationgenomics.org.au, cas.simons@populationgenomics.org.au',
package_data={'talos': ['templates/*.jinja', 'reanalysis_global.toml']},
Expand Down
20 changes: 11 additions & 9 deletions src/talos/CreateTalosHTML.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,17 @@ def main(results: str, panelapp: str, output: str, latest: str | None = None, sp

# do something to split the output into separate datasets
# either look for an ID convention, or go with a random split
html_base = Path(output).parent
# originally this used Path(X).parent, but that translates gs:// to gs:/
# gs:/ as a scheme is not recognised as a GCP path, leading to write errors
default_report_name = Path(output).name
html_base = output.rstrip(default_report_name)

for data, report, latest in split_data_into_sub_reports(results, split_samples):
html = HTMLBuilder(results=data, panelapp_path=panelapp)
try:
get_logger().info(f'Attempting to create {report}')
html.write_html(output_filepath=str(html_base / report))
html.write_html(output_filepath=f'{html_base}{report}')

except NoVariantsFoundError:
get_logger().info('No variants in that report, skipping')

Expand All @@ -96,7 +101,7 @@ def main(results: str, panelapp: str, output: str, latest: str | None = None, sp
latest_html = HTMLBuilder(results=date_filtered_object, panelapp_path=panelapp)
try:
get_logger().info(f'Attempting to create {latest_html}')
latest_html.write_html(output_filepath=str(html_base / latest), latest=True)
latest_html.write_html(output_filepath=f'{html_base}{latest}', latest=True)
except NoVariantsFoundError:
get_logger().info('No variants in that latest report, skipping')

Expand Down Expand Up @@ -165,19 +170,18 @@ def __init__(self, results: str | ResultData, panelapp_path: str):
# Optionally read in the labels file
# This file should be a nested dictionary of sample IDs and variant identifiers
# with a list of corresponding label values, e.g.:
# ruff: noqa: ERA001
# {
# "sample1": {
# "1-123456-A-T": ["label1", "label2"],
# "1-123457-A-T": ["label1"]
# },
# }
ext_labels = config_retrieve(['CreateTalosHTML', 'external_labels'], {})
assert isinstance(ext_labels, dict)
self.ext_labels: dict[str, dict] = ext_labels
self.ext_labels: dict[str, dict] = config_retrieve(['CreateTalosHTML', 'external_labels'], {})
assert isinstance(self.ext_labels, dict)

# Read results file, or take it directly
results_dict = read_json_from_path(results, return_model=ResultData) if isinstance(results, str) else results

assert isinstance(results_dict, ResultData)

self.metadata = results_dict.metadata
Expand Down Expand Up @@ -392,8 +396,6 @@ def __init__(
self.panel_ids = metadata.panel_ids
self.panel_names = metadata.panel_names
self.seqr_id = html_builder.seqr.get(name, None)
self.ext_labels = ext_labels
self.html_builder = html_builder

# Ingest variants excluding any on the forbidden gene list
self.variants = [
Expand Down
21 changes: 15 additions & 6 deletions src/talos/GeneratePED.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@
HPO_RE = re.compile(r'HP:[0-9]+')
PARTICIPANT_QUERY = gql(
"""
query MyQuery($project: String!) {
query MyQuery($project: String!, $sequencing_type: String!, $technology: String!) {
project(name: $project) {
pedigree
sequencingGroups {
sequencingGroups(technology: {eq: $technology}, type: {eq: $sequencing_type}) {
id
sample {
participant {
Expand All @@ -39,11 +39,13 @@
)


def get_data_from_metamist(project: str) -> list[list[str]]:
def get_data_from_metamist(project: str, seq_type: str, tech: str) -> list[list[str]]:
"""
Query metamist for the required data
Args:
        project (str): name of the metamist project to query
        seq_type (str): sequencing type to restrict to, e.g. exome/genome
        tech (str): sequencing technology to restrict to, e.g. short-read
Returns:
returns the new Ped contents, ready to be written to a file
Expand All @@ -56,7 +58,7 @@ def get_data_from_metamist(project: str) -> list[list[str]]:
ped_entries: list[list[str]] = []

# first get a lookup of Int IDs to Ext IDs
result = query(PARTICIPANT_QUERY, variables={'project': project})
result = query(PARTICIPANT_QUERY, variables={'project': project, 'sequencing_type': seq_type, 'technology': tech})

# maps External IDs from the Pedigree endpoint to Internal CPG IDs
ext_to_int: dict[str, str] = {}
Expand All @@ -82,8 +84,13 @@ def get_data_from_metamist(project: str) -> list[list[str]]:
entry['individual_id'],
]

# if there are recorded HPOs, extend the row with them
# skip over the rows where we didn't find a linked internal ID
# this will prune the pedigree to remove all the data-only entries in the cohort
assert isinstance(entry['individual_id'], str)
if not ped_row[1].startswith('CPG'):
continue

# if there are recorded HPOs, extend the row with them
if hpos := cpg_to_hpos.get(ext_to_int.get(entry['individual_id'], 'missing')):
ped_row.extend(sorted(hpos))

Expand All @@ -96,9 +103,11 @@ def main():
parser = ArgumentParser(description='Generate a PED file for Talos')
parser.add_argument('dataset', help='The dataset to query for')
parser.add_argument('output', help='The output file')
parser.add_argument('type', help='Sequencing type (exome or genome)')
parser.add_argument('--tech', help='Sequencing technology', default='short-read')
args = parser.parse_args()

new_ped_rows = get_data_from_metamist(args.dataset)
new_ped_rows = get_data_from_metamist(args.dataset, seq_type=args.type, tech=args.tech)

# write a headless TSV file, as an extended PED format
with open(args.output, 'w', encoding='utf-8') as handle:
Expand Down
2 changes: 1 addition & 1 deletion src/talos/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
"""

# Do not edit this file manually
__version__ = '5.1.1'
__version__ = '5.1.2'
1 change: 0 additions & 1 deletion test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
INPUT: str = str(PWD / 'input')
hl.init(default_reference='GRCh38')
environ['TALOS_CONFIG'] = join(INPUT, 'reanalysis_global.toml')
print(environ)

LABELLED = join(INPUT, '1_labelled_variant.vcf.bgz')
Talos_OUTPUT = join(INPUT, 'aip_output_example.json')
Expand Down
1 change: 0 additions & 1 deletion test/model_liftovers/test_PhenotypeMatchedPanels.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ def test_lift_pmp_from_none_json(test_input_models_path):
# check the HPO terms
for sample in lifted.samples.values():
for term in sample.hpo_terms:
print(type(term))
assert isinstance(term, PhenoPacketHpo)
assert isinstance(term.id, str)
assert isinstance(term.label, str)
Expand Down

0 comments on commit 28a0811

Please sign in to comment.