diff --git a/src/talos/HPOFlagging.py b/src/talos/HPOFlagging.py index 0eae2c66..d31ad026 100644 --- a/src/talos/HPOFlagging.py +++ b/src/talos/HPOFlagging.py @@ -103,7 +103,7 @@ def annotate_phenotype_matches(result_object: ResultData, gen_phen: dict[str, se out_path (str): path to write results to """ - use_strict = config_retrieve(['HPOFlagging', 'strict'], False) + semantic_match = config_retrieve(['HPOFlagging', 'semantic_match'], False) min_similarity: float = config_retrieve(['HPOFlagging', 'min_similarity']) @@ -116,15 +116,12 @@ def annotate_phenotype_matches(result_object: ResultData, gen_phen: dict[str, se gene_hpos = gen_phen.get(var_gene, set()) # under strict matching we require exact overlapping terms - if use_strict: - hpo_intersection = participant_hpos & gene_hpos - if not hpo_intersection: - continue - for hpo_id in hpo_intersection: - variant.phenotype_labels.add(f'{hpo_id}: {participant_hpos_dict[hpo_id]}') - - # under standard matching we check for phenotypic similarity - elif participant_hpos and gene_hpos: + # we always run a strict match + for hpo_id in participant_hpos & gene_hpos: + variant.phenotype_labels.add(f'{hpo_id}: {participant_hpos_dict[hpo_id]}') + + # optionally also use semantic matching for phenotypic similarity + if participant_hpos and gene_hpos and semantic_match: termset_similarity = get_sem_client().termset_pairwise_similarity(participant_hpos, gene_hpos) # Convert object terms (gene_phenotypes) to lookup dict object_termset = { diff --git a/src/talos/example_config.toml b/src/talos/example_config.toml index dcd732ea..26608d70 100644 --- a/src/talos/example_config.toml +++ b/src/talos/example_config.toml @@ -72,9 +72,10 @@ seqr_project = "e.g. 
COHORT_project_id" [HPOFlagging] # this section relates to phenotype-matching the final variant set -# set this to True to do strict phenotype matching (participant must have an HPO term associated with the Gene) -# set this to True to do a wiggly semantic similarity test between participant and gene HPOs -strict = false +# set this to true to also run a semantic term comparison when phenotype matching +# this runs an approximate semantic similarity test between participant and gene HPOs, through SemSimian +# the exact set intersection on HPO terms is always run, whether this is true or false +semantic_match = true # min similarity score when doing a semsimian termset similarity test min_similarity = 14.0 diff --git a/src/talos/utils.py b/src/talos/utils.py index ede230bb..766f6da8 100644 --- a/src/talos/utils.py +++ b/src/talos/utils.py @@ -828,8 +828,21 @@ def phenotype_label_history(results: ResultData): else: hist.first_phenotype_tagged = get_granular_date() - # update all the phenotype labels - we might indentify incremental phenotype matches in future + # update all the phenotype labels - we might identify incremental phenotype matches in future hist.phenotype_labels.update(var.phenotype_labels) + else: + # totally new variant, probably not possible, but tolerate here anyway + # reasoning: we're always running this after MOI checking, so we shouldn't + # have completely new variants between there and here + sample_historic.results.setdefault(sample, {})[var_id] = HistoricSampleVariant( + categories={cat: get_granular_date() for cat in var.categories}, + support_vars=var.support_vars, + independent=var.independent, + first_tagged=get_granular_date(), + clinvar_stars=var.var_data.info.get('clinvar_stars'), + phenotype_labels=var.phenotype_labels, + first_phenotype_tagged=get_granular_date(), + ) save_new_historic(results=latest_results)