Skip to content

Commit

Permalink
Correction to report splitting (#420)
Browse files Browse the repository at this point in the history
* resolve issues with pathlib.Path.parent

* delete some more linting exclusions

* restricts pedigree generation to use only the specific sequencing type

* also include sequencing technology as a query parameter

* Bump version: 5.1.1 → 5.1.2
  • Loading branch information
MattWellie authored Jul 17, 2024
1 parent 7e92956 commit 28a0811
Show file tree
Hide file tree
Showing 8 changed files with 29 additions and 24 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 5.1.1
current_version = 5.1.2
commit = True
tag = False

Expand Down
4 changes: 0 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,12 @@ ignore = [
"E731", # Do not assign a lambda expression, use a def
"G004", # Logging statement uses f-string
"PLW0603", # Using the global statement to update `<VAR>` is discouraged
"PT018", # Assertion should be broken down into multiple parts
"Q000", # Single quotes found but double quotes preferred
"S101", # Use of assert detected
"PLR0911", # Too many return statements (> 6)
"PLR0912", # Too many branches (> 12)
"PLR0913", # Too many arguments in function (> 5)
"C901", # method is too complex (> 10 conditions)
"N999", # invalid module name (not lower-case and potholes)
"TCH001", # move application import *** into a type-checking block
"ERA001", # found commented out code
"I001", # Import block is un-sorted or un-formatted (to remove soon)
]

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def read_reqs(filename: str) -> list[str]:
name='talos',
description='Centre for Population Genomics Variant Prioritisation',
long_description=readme,
version='5.1.1',
version='5.1.2',
author='Matthew Welland, CPG',
author_email='matthew.welland@populationgenomics.org.au, cas.simons@populationgenomics.org.au',
package_data={'talos': ['templates/*.jinja', 'reanalysis_global.toml']},
Expand Down
20 changes: 11 additions & 9 deletions src/talos/CreateTalosHTML.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,17 @@ def main(results: str, panelapp: str, output: str, latest: str | None = None, sp

# do something to split the output into separate datasets
# either look for an ID convention, or go with a random split
html_base = Path(output).parent
# originally this used Path(X).parent, but that translates gs:// to gs:/
# gs:/ as a scheme is not recognised as a GCP path, leading to write errors
default_report_name = Path(output).name
html_base = output.rstrip(default_report_name)

for data, report, latest in split_data_into_sub_reports(results, split_samples):
html = HTMLBuilder(results=data, panelapp_path=panelapp)
try:
get_logger().info(f'Attempting to create {report}')
html.write_html(output_filepath=str(html_base / report))
html.write_html(output_filepath=f'{html_base}{report}')

except NoVariantsFoundError:
get_logger().info('No variants in that report, skipping')

Expand All @@ -96,7 +101,7 @@ def main(results: str, panelapp: str, output: str, latest: str | None = None, sp
latest_html = HTMLBuilder(results=date_filtered_object, panelapp_path=panelapp)
try:
get_logger().info(f'Attempting to create {latest_html}')
latest_html.write_html(output_filepath=str(html_base / latest), latest=True)
latest_html.write_html(output_filepath=f'{html_base}{latest}', latest=True)
except NoVariantsFoundError:
get_logger().info('No variants in that latest report, skipping')

Expand Down Expand Up @@ -165,19 +170,18 @@ def __init__(self, results: str | ResultData, panelapp_path: str):
# Optionally read in the labels file
# This file should be a nested dictionary of sample IDs and variant identifiers
# with a list of corresponding label values, e.g.:
# ruff: noqa: ERA001
# {
# "sample1": {
# "1-123456-A-T": ["label1", "label2"],
# "1-123457-A-T": ["label1"]
# },
# }
ext_labels = config_retrieve(['CreateTalosHTML', 'external_labels'], {})
assert isinstance(ext_labels, dict)
self.ext_labels: dict[str, dict] = ext_labels
self.ext_labels: dict[str, dict] = config_retrieve(['CreateTalosHTML', 'external_labels'], {})
assert isinstance(self.ext_labels, dict)

# Read results file, or take it directly
results_dict = read_json_from_path(results, return_model=ResultData) if isinstance(results, str) else results

assert isinstance(results_dict, ResultData)

self.metadata = results_dict.metadata
Expand Down Expand Up @@ -392,8 +396,6 @@ def __init__(
self.panel_ids = metadata.panel_ids
self.panel_names = metadata.panel_names
self.seqr_id = html_builder.seqr.get(name, None)
self.ext_labels = ext_labels
self.html_builder = html_builder

# Ingest variants excluding any on the forbidden gene list
self.variants = [
Expand Down
21 changes: 15 additions & 6 deletions src/talos/GeneratePED.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@
HPO_RE = re.compile(r'HP:[0-9]+')
PARTICIPANT_QUERY = gql(
"""
query MyQuery($project: String!) {
query MyQuery($project: String!, $sequencing_type: String!, $technology: String!) {
project(name: $project) {
pedigree
sequencingGroups {
sequencingGroups(technology: {eq: $technology}, type: {eq: $sequencing_type}) {
id
sample {
participant {
Expand All @@ -39,11 +39,13 @@
)


def get_data_from_metamist(project: str) -> list[list[str]]:
def get_data_from_metamist(project: str, seq_type: str, tech: str) -> list[list[str]]:
"""
Query metamist for the required data
Args:
        project (str): name of the metamist project to query
        seq_type (str): sequencing type to restrict to, e.g. exome/genome
        tech (str): sequencing technology to restrict to, e.g. short-read
Returns:
returns the new Ped contents, ready to be written to a file
Expand All @@ -56,7 +58,7 @@ def get_data_from_metamist(project: str) -> list[list[str]]:
ped_entries: list[list[str]] = []

# first get a lookup of Int IDs to Ext IDs
result = query(PARTICIPANT_QUERY, variables={'project': project})
result = query(PARTICIPANT_QUERY, variables={'project': project, 'sequencing_type': seq_type, 'technology': tech})

# maps External IDs from the Pedigree endpoint to Internal CPG IDs
ext_to_int: dict[str, str] = {}
Expand All @@ -82,8 +84,13 @@ def get_data_from_metamist(project: str) -> list[list[str]]:
entry['individual_id'],
]

# if there are recorded HPOs, extend the row with them
# skip over the rows where we didn't find a linked internal ID
# this will prune the pedigree to remove all the data-only entries in the cohort
assert isinstance(entry['individual_id'], str)
if not ped_row[1].startswith('CPG'):
continue

# if there are recorded HPOs, extend the row with them
if hpos := cpg_to_hpos.get(ext_to_int.get(entry['individual_id'], 'missing')):
ped_row.extend(sorted(hpos))

Expand All @@ -96,9 +103,11 @@ def main():
parser = ArgumentParser(description='Generate a PED file for Talos')
parser.add_argument('dataset', help='The dataset to query for')
parser.add_argument('output', help='The output file')
parser.add_argument('type', help='Sequencing type (exome or genome)')
parser.add_argument('--tech', help='Sequencing technology', default='short-read')
args = parser.parse_args()

new_ped_rows = get_data_from_metamist(args.dataset)
new_ped_rows = get_data_from_metamist(args.dataset, seq_type=args.type, tech=args.tech)

# write a headless TSV file, as an extended PED format
with open(args.output, 'w', encoding='utf-8') as handle:
Expand Down
2 changes: 1 addition & 1 deletion src/talos/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
"""

# Do not edit this file manually
__version__ = '5.1.1'
__version__ = '5.1.2'
1 change: 0 additions & 1 deletion test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
INPUT: str = str(PWD / 'input')
hl.init(default_reference='GRCh38')
environ['TALOS_CONFIG'] = join(INPUT, 'reanalysis_global.toml')
print(environ)

LABELLED = join(INPUT, '1_labelled_variant.vcf.bgz')
Talos_OUTPUT = join(INPUT, 'aip_output_example.json')
Expand Down
1 change: 0 additions & 1 deletion test/model_liftovers/test_PhenotypeMatchedPanels.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ def test_lift_pmp_from_none_json(test_input_models_path):
# check the HPO terms
for sample in lifted.samples.values():
for term in sample.hpo_terms:
print(type(term))
assert isinstance(term, PhenoPacketHpo)
assert isinstance(term.id, str)
assert isinstance(term.label, str)
Expand Down

0 comments on commit 28a0811

Please sign in to comment.