Skip to content

Commit

Permalink
Phenotype matching (#426)
Browse files Browse the repository at this point in the history
* Adds explicit phenotype matching step, using gen~phen db

* always do a strict comparison, optionally do semantic comparison

* Bump version: 5.1.4 → 5.2.0

* Co-authored-by: cassimons <cas.simons@populationgenomics.org.au>
  • Loading branch information
MattWellie authored Aug 5, 2024
1 parent fedf508 commit 70e4b79
Show file tree
Hide file tree
Showing 34 changed files with 710 additions and 48 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 5.1.4
current_version = 5.2.0
commit = True
tag = False

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/index_page_builder.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
runs-on: ubuntu-latest

env:
VERSION: 5.1.4
VERSION: 5.2.0

steps:

Expand Down
1 change: 0 additions & 1 deletion helpers/stratify_solved_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from sys import argv

import toml

from metamist.graphql import gql, query

PROJECT_QUERY = gql(
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,5 +55,5 @@ quote-style = "single"
section-order = ["future", "standard-library", "third-party", "hail", "cpg", "first-party", "local-folder"]

[tool.ruff.lint.isort.sections]
cpg = ["metamist"]
cpg = ["metamist", "talos"]
hail = ["hail", "hailtop"]
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,6 @@ peds>=1.2.0
pydantic>=2.5.2
pyspark>=3.5.1
requests>=2.31.0
semsimian>=0.2.15
tabulate>=0.8.9
toml==0.10.2
9 changes: 8 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def read_reqs(filename: str) -> list[str]:
name='talos',
description='Centre for Population Genomics Variant Prioritisation',
long_description=readme,
version='5.1.4',
version='5.2.0',
author='Matthew Welland, CPG',
author_email='matthew.welland@populationgenomics.org.au, cas.simons@populationgenomics.org.au',
package_data={'talos': ['templates/*.jinja', 'example_config.toml']},
Expand Down Expand Up @@ -65,12 +65,19 @@ def read_reqs(filename: str) -> list[str]:
'GeneratePanelData = talos.GeneratePanelData:cli_main',
# query PanelApp for those selected panels
'QueryPanelapp = talos.QueryPanelapp:cli_main',
# use API queries to find the gene symbol for each gene ID
'FindGeneSymbolMap = talos.FindGeneSymbolMap:cli_main',
# # TODO - this thing just doesn't work in its current form. Does not scale.
# # match participant HPOs to gene HPOs for prioritisation
# # 'MatchGenesToPhenotypes = talos.MatchGenesToPhenotypes:cli_main', # noqa: ERA001
# Filter and label a small-variant MatrixTable
'RunHailFiltering = talos.RunHailFiltering:cli_main',
# Filter and label a SV MatrixTable
'RunHailFilteringSV = talos.RunHailFilteringSV:cli_main',
# Run each of the category-labelled variants through MOI filters
'ValidateMOI = talos.ValidateMOI:cli_main',
# catch variants which have strong phenotypic matches
'HPOFlagging = talos.HPOFlagging:cli_main',
# CPG internal (?), publish those results as an HTML report
'CreateTalosHTML = talos.CreateTalosHTML:cli_main',
# CPG internal (?), generate a file for ingestion by Seqr
Expand Down
3 changes: 1 addition & 2 deletions src/talos/BuildReportIndexPage.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,12 @@

import re
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import Any
from functools import lru_cache

import jinja2
from cloudpathlib.anypath import to_anypath

from metamist.graphql import gql, query

from talos.static_values import get_logger
Expand Down
3 changes: 3 additions & 0 deletions src/talos/CreateTalosHTML.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,9 @@ def __init__(
self.genotypes = report_variant.genotypes
self.sample = sample
self.ext_labels = ext_labels
# add the phenotype match date and HPO term id/labels
self.phenotype_match_date = report_variant.date_of_phenotype_match
self.phenotype_matches = report_variant.phenotype_labels

# List of (gene_id, symbol)
self.genes: list[tuple[str, str]] = []
Expand Down
87 changes: 87 additions & 0 deletions src/talos/FindGeneSymbolMap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""
Takes the PanelApp data (the full region of interest)
produce an output of the {gene ID: gene symbol} for all included genes
write this to a file
This has been extracted to a separate script to allow for parallelisation
and to ensure that we can separately repeat this one step in isolation
if we run into API/throttling issues
This may be integrated into the HPO~Phenotype matching script if it runs consistently enough
"""

import asyncio
import json
from argparse import ArgumentParser

from aiohttp import ClientSession

from talos.config import config_retrieve
from talos.models import PanelApp
from talos.utils import chunks, read_json_from_path

# base URL for the Ensembl REST API; HTTPS so that requests are not sent in clear text
ENSEMBL_REST_API = 'https://rest.ensembl.org'


async def match_ensgs_to_symbols(genes: list[str], session: ClientSession) -> dict[str, str]:
    """
    Look up a batch of gene IDs with a single POST to the Ensembl `lookup/id` endpoint.

    Args:
        genes (list[str]): gene IDs, sent together as the 'ids' field of the request body
        session (ClientSession): open aiohttp session used to make the request

    Returns:
        dict of {display_name: ID} - note the result is keyed by the gene symbol, not the ID.
        Response entries with an empty/null record, or without a display_name, are left out.

    Raises:
        aiohttp.ClientResponseError: if the API responds with an error status
    """
    # async with guarantees the response is released, even if the status check raises
    async with session.request(
        method='POST',
        url=f'{ENSEMBL_REST_API}/lookup/id',
        headers={'Accept': 'application/json', 'Content-Type': 'application/json'},
        data=json.dumps({'ids': genes}),
    ) as response:
        response.raise_for_status()
        json_response = await response.json()

    # skip null records and records lacking a display_name: a None key would be written
    # to the output JSON as "null", and every unnamed gene would collide on that one key
    return {
        record['display_name']: ensg
        for ensg, record in json_response.items()
        if record and record.get('display_name')
    }


async def match_symbol_to_ensg(gene_symbol: str, session: ClientSession) -> tuple[str, str]:
    """
    Run a single GET against the Ensembl `lookup/id` endpoint for one identifier.

    NOTE(review): despite the function and parameter names, this queries `lookup/id/<value>`
    and returns the record's display_name, so the argument is presumably expected to be
    an Ensembl ID rather than a symbol - confirm against callers.

    Args:
        gene_symbol (str): the identifier appended to the lookup URL
        session (ClientSession): open aiohttp session used to make the request

    Returns:
        tuple of (the input value, the 'display_name' field of the JSON response)
    """
    lookup_url = f'{ENSEMBL_REST_API}/lookup/id/{gene_symbol}'
    response = await session.request(
        method='GET',
        url=lookup_url,
        headers={'Content-Type': 'application/json'},
    )
    # fail loudly on an error status before attempting to parse the body
    response.raise_for_status()
    payload = await response.json()
    return gene_symbol, payload['display_name']


def cli_main():
    """
    Command-line entry point: parse --panelapp and --out_path (both required), then call main().
    """
    arg_parser = ArgumentParser()
    # both options are mandatory, so register them in one pass
    for flag, description in (
        ('--panelapp', 'Path to the PanelApp results file'),
        ('--out_path', 'where to write the output (.json)'),
    ):
        arg_parser.add_argument(flag, help=description, required=True)
    parsed = arg_parser.parse_args()

    main(parsed.panelapp, out_path=parsed.out_path)


def main(panelapp_path: str, out_path: str):
    """
    Read the PanelApp model, query Ensembl for every gene in it, and write the mapping as JSON.

    Args:
        panelapp_path (str): path to the PanelApp model JSON
        out_path (str): path of the JSON file to write the gene mapping to
    """
    panel_data = read_json_from_path(read_path=panelapp_path, return_model=PanelApp)
    # narrow the type for mypy
    assert isinstance(panel_data, PanelApp)

    gene_ids = [*panel_data.genes.keys()]
    symbol_map = asyncio.run(get_symbols_for_ensgs(gene_ids))

    with open(out_path, 'w') as handle:
        json.dump(symbol_map, handle, indent=4)


async def get_symbols_for_ensgs(genes: list[str]) -> dict[str, str]:
    """
    Query Ensembl for all genes in batches, merging the per-batch results into one dict.

    The batch size is read from config (['FindGeneSymbolMap', 'chunk_size']), with 800
    passed as the fallback value. Batches are awaited one after another over a single
    shared session.

    Args:
        genes (list[str]): all gene IDs to look up

    Returns:
        the combined dict of every batch result, as returned by match_ensgs_to_symbols
    """
    batch_size = config_retrieve(['FindGeneSymbolMap', 'chunk_size'], 800)
    merged: dict[str, str] = {}
    async with ClientSession() as session:
        for batch in chunks(genes, batch_size):
            batch_result = await match_ensgs_to_symbols(batch, session=session)
            merged.update(batch_result)
    return merged


# run the CLI only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    cli_main()
10 changes: 7 additions & 3 deletions src/talos/GeneratePanelData.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,18 @@
from peds import open_ped

from talos.config import config_retrieve
from talos.models import ParticipantHPOPanels, PhenotypeMatchedPanels
from talos.models import ParticipantHPOPanels, PhenoPacketHpo, PhenotypeMatchedPanels
from talos.static_values import get_logger

HPO_RE = re.compile(r'HP:[0-9]+')
MAX_DEPTH = 3

PANELAPP_HARD_CODED_DEFAULT = 'https://panelapp.agha.umccr.org/api/v1/panels'
PANELS_ENDPOINT = config_retrieve(['GeneratePanelData', 'panelapp'], PANELAPP_HARD_CODED_DEFAULT)
try:
PANELS_ENDPOINT = config_retrieve(['GeneratePanelData', 'panelapp'], PANELAPP_HARD_CODED_DEFAULT)
except KeyError:
get_logger(__file__).warning('Config environment variable TALOS_CONFIG not set, falling back to Aussie PanelApp')
PANELS_ENDPOINT = PANELAPP_HARD_CODED_DEFAULT


def get_json_response(url: str) -> dict:
Expand Down Expand Up @@ -102,7 +106,7 @@ def get_participant_hpos(pedigree: str) -> tuple[PhenotypeMatchedPanels, set[str
hpo_dict.samples[internal_id] = ParticipantHPOPanels(
external_id=external_id,
family_id=family_id,
hpo_terms=[{'id': hpo, 'label': ''} for hpo in member_data[1:]],
hpo_terms=[PhenoPacketHpo(id=hpo, label='') for hpo in member_data[1:]],
panels={137},
)

Expand Down
Loading

0 comments on commit 70e4b79

Please sign in to comment.