phenoscape · hlapp · Feb 2, 2021 · Feb 1, 2021
diff --git a/R/semsim.R b/R/semsim.R
@@ -213,6 +213,8 @@ cosine_similarity <- function(subsumer_mat = NA, terms = NULL, ...) {
 #'   greater than any of its superclasses. If a function, it must accept parameter
 #'   `x` as the vector of term IRIs and return a vector of frequencies (_not_
 #'   IC scores) for the terms. The default is to use function [term_freqs()].
+#'   When frequencies are obtained from a function, subsumer terms with zero
+#'   or missing (NA) frequency are omitted from the calculation.
 #' @param wt_args list, named parameters for the function calculating term
 #'   frequencies. Ignored if `wt` is not a function. For the default `wt`
 #'   function [term_freqs()], the main parameters are `as` and `corpus`. 
@@ -248,7 +250,14 @@ resnik_similarity <- function(subsumer_mat = NA, terms = NULL, ...,
   if (missing(wt) || is.function(wt)) {
     wt_args$x <- rownames(subsumer_mat)
     wt <- do.call(wt, wt_args)
-    wt[wt == 0] <- 1
+    # Zero-frequency terms either shouldn't have been subsumers or didn't
+    # yield a count; drop them and NA-frequency terms from the computation.
+    # drop = FALSE keeps subsumer_mat a matrix even if only one row is left.
+    rowsToRemove <- is.na(wt) | wt == 0
+    if (any(rowsToRemove)) {
+      wt <- wt[! rowsToRemove]
+      subsumer_mat <- subsumer_mat[! rowsToRemove, , drop = FALSE]
+    }
     # we assume we got frequencies, turn into IC
     wt <- -log(wt, base = base)
   }

diff --git a/man/similarity.Rd b/man/similarity.Rd
diff --git a/tests/testthat/test-semsim.R b/tests/testthat/test-semsim.R
@@ -84,9 +84,9 @@ test_that("Resnik similarity", {
   termICs <- -log10(term_freqs(phens$id, as = "phenotype", corpus = "taxa"))
   testthat::expect_equivalent(diag(sm.ic), termICs)
 
-  subs.ics <- -log10(term_freqs(rownames(subs.mat),
-                                as = "phenotype", corpus = "taxa"))
-  sm.ic2 <- resnik_similarity(subs.mat, wt = subs.ics)
+  tfreqs <- term_freqs(rownames(subs.mat), as = "phenotype", corpus = "taxa")
+  sm.ic2 <- resnik_similarity(subs.mat[! (is.na(tfreqs) | tfreqs == 0), ],
+                              wt = -log10(tfreqs[! (is.na(tfreqs) | tfreqs == 0)]))
   testthat::expect_equal(sm.ic, sm.ic2)
 })
 
@@ -151,6 +151,9 @@ test_that("profile similarity with Resnik", {
   }))
 
   freqs <- term_freqs(rownames(subs.mat), as = "phenotype", corpus = "taxa")
+  toKeep <- ! (is.na(freqs) | freqs == 0)
+  freqs <- freqs[toKeep]
+  subs.mat <- subs.mat[toKeep,]
   sm <- profile_similarity(resnik_similarity, subs.mat, wt = -log10(freqs),
                            f = phens.f)
   testthat::expect_equal(colnames(sm), levels(phens.f))