Skip to content

Commit

Permalink
always do a strict comparison, optionally do semantic comparison
Browse files Browse the repository at this point in the history
  • Loading branch information
MattWellie committed Aug 5, 2024
1 parent 77f5d29 commit c66efdb
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 14 deletions.
17 changes: 7 additions & 10 deletions src/talos/HPOFlagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def annotate_phenotype_matches(result_object: ResultData, gen_phen: dict[str, se
out_path (str): path to write results to
"""

use_strict = config_retrieve(['HPOFlagging', 'strict'], False)
semantic_match = config_retrieve(['HPOFlagging', 'semantic_match'], False)

min_similarity: float = config_retrieve(['HPOFlagging', 'min_similarity'])

Expand All @@ -116,15 +116,12 @@ def annotate_phenotype_matches(result_object: ResultData, gen_phen: dict[str, se
gene_hpos = gen_phen.get(var_gene, set())

# under strict matching we require exact overlapping terms
if use_strict:
hpo_intersection = participant_hpos & gene_hpos
if not hpo_intersection:
continue
for hpo_id in hpo_intersection:
variant.phenotype_labels.add(f'{hpo_id}: {participant_hpos_dict[hpo_id]}')

# under standard matching we check for phenotypic similarity
elif participant_hpos and gene_hpos:
# we always run a strict match
for hpo_id in participant_hpos & gene_hpos:
variant.phenotype_labels.add(f'{hpo_id}: {participant_hpos_dict[hpo_id]}')

# optionally also use semantic matching for phenotypic similarity
if participant_hpos and gene_hpos and semantic_match:
termset_similarity = get_sem_client().termset_pairwise_similarity(participant_hpos, gene_hpos)
# Convert object terms (gene_phenotypes) to lookup dict
object_termset = {
Expand Down
7 changes: 4 additions & 3 deletions src/talos/example_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,10 @@ seqr_project = "e.g. COHORT_project_id"
[HPOFlagging]
# this section relates to phenotype-matching the final variant set

# set this to True to do strict phenotype matching (participant must have an HPO term associated with the Gene)
# set this to True to do a wiggly semantic similarity test between participant and gene HPOs
strict = false
# set this to true to additionally run a semantic term comparison when phenotype matching
# this does an approximate semantic similarity test between participant and gene HPOs, through SemSimian
# regardless of this setting, we always do a strict set intersection on HPO terms
semantic_match = true

# min similarity score when doing a semsimian termset similarity test
min_similarity = 14.0
15 changes: 14 additions & 1 deletion src/talos/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -828,8 +828,21 @@ def phenotype_label_history(results: ResultData):
else:
hist.first_phenotype_tagged = get_granular_date()

# update all the phenotype labels - we might indentify incremental phenotype matches in future
# update all the phenotype labels - we might identify incremental phenotype matches in future
hist.phenotype_labels.update(var.phenotype_labels)
else:
# totally new variant with no historic record - probably not possible, but tolerated here anyway
# reasoning: this always runs after MOI checking, so no completely new variants
# should appear between that step and this one
sample_historic.results.setdefault(sample, {})[var_id] = HistoricSampleVariant(
categories={cat: get_granular_date() for cat in var.categories},
support_vars=var.support_vars,
independent=var.independent,
first_tagged=get_granular_date(),
clinvar_stars=var.var_data.info.get('clinvar_stars'),
phenotype_labels=var.phenotype_labels,
first_phenotype_tagged=get_granular_date(),
)

save_new_historic(results=latest_results)

Expand Down

0 comments on commit c66efdb

Please sign in to comment.