always do a strict comparison, optionally do semantic comparison

populationgenomics · Aug 5, 2024 · c66efdb · c66efdb
1 parent 77f5d29
commit c66efdb
Show file tree

Hide file tree

Showing 3 changed files with 25 additions and 14 deletions.
diff --git a/src/talos/HPOFlagging.py b/src/talos/HPOFlagging.py
@@ -103,7 +103,7 @@ def annotate_phenotype_matches(result_object: ResultData, gen_phen: dict[str, se
         out_path (str): path to write results to
     """
 
-    use_strict = config_retrieve(['HPOFlagging', 'strict'], False)
+    semantic_match = config_retrieve(['HPOFlagging', 'semantic_match'], False)
 
     min_similarity: float = config_retrieve(['HPOFlagging', 'min_similarity'])
 
@@ -116,15 +116,12 @@ def annotate_phenotype_matches(result_object: ResultData, gen_phen: dict[str, se
             gene_hpos = gen_phen.get(var_gene, set())
 
             # strict matching labels the exact HPO terms shared by the participant and the gene
-            if use_strict:
-                hpo_intersection = participant_hpos & gene_hpos
-                if not hpo_intersection:
-                    continue
-                for hpo_id in hpo_intersection:
-                    variant.phenotype_labels.add(f'{hpo_id}: {participant_hpos_dict[hpo_id]}')
-
-            # under standard matching we check for phenotypic similarity
-            elif participant_hpos and gene_hpos:
+            # we always run a strict match
+            for hpo_id in participant_hpos & gene_hpos:
+                variant.phenotype_labels.add(f'{hpo_id}: {participant_hpos_dict[hpo_id]}')
+
+            # optionally also use semantic matching for phenotypic similarity
+            if participant_hpos and gene_hpos and semantic_match:
                 termset_similarity = get_sem_client().termset_pairwise_similarity(participant_hpos, gene_hpos)
                 # Convert object terms (gene_phenotypes) to lookup dict
                 object_termset = {

diff --git a/src/talos/example_config.toml b/src/talos/example_config.toml
@@ -72,9 +72,10 @@ seqr_project = "e.g. COHORT_project_id"
 [HPOFlagging]
 # this section relates to phenotype-matching the final variant set
 
-# set this to True to do strict phenotype matching (participant must have an HPO term associated with the Gene)
-# set this to True to do a wiggly semantic similarity test between participant and gene HPOs
-strict = false
+# set this to true to also do a semantic term comparison when phenotype matching
+# this does a wiggly semantic similarity test between participant and gene HPOs, through SemSimian
+# a strict set intersection on HPO terms is always done, whether this is true or false
+semantic_match = true
 
 # min similarity score when doing a semsimian termset similarity test
 min_similarity = 14.0
diff --git a/src/talos/utils.py b/src/talos/utils.py
@@ -828,8 +828,21 @@ def phenotype_label_history(results: ResultData):
                 else:
                     hist.first_phenotype_tagged = get_granular_date()
 
-                # update all the phenotype labels - we might indentify incremental phenotype matches in future
+                # update all the phenotype labels - we might identify incremental phenotype matches in future
                 hist.phenotype_labels.update(var.phenotype_labels)
+            else:
+                # totally new variant, probably not possible, but tolerate here anyway
+                # reasoning: we're always running this after MOI checking, so we shouldn't
+                # have completely new variants between there and here
+                sample_historic.results.setdefault(sample, {})[var_id] = HistoricSampleVariant(
+                    categories={cat: get_granular_date() for cat in var.categories},
+                    support_vars=var.support_vars,
+                    independent=var.independent,
+                    first_tagged=get_granular_date(),
+                    clinvar_stars=var.var_data.info.get('clinvar_stars'),
+                    phenotype_labels=var.phenotype_labels,
+                    first_phenotype_tagged=get_granular_date(),
+                )
 
     save_new_historic(results=latest_results)