diff --git a/.Rbuildignore b/.Rbuildignore index df93d0b..daa6fd9 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -3,7 +3,7 @@ ^\.Rproj\.user$ README.rmd rphenoscape.*.tar.gz -cache +^cache vignettes/cache ^docs$ ^_pkgdown\.yml$ diff --git a/.gitignore b/.gitignore index 6d4aef8..6dafd37 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,7 @@ .Rproj.user/ rphenoscape.Rproj .Rproj.user -cache +cache*/ rphenoscape*.tar.gz *.Rcheck .DS_Store diff --git a/DESCRIPTION b/DESCRIPTION index 210534e..e5e7974 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,7 +30,9 @@ Imports: rlang, methods, stringi, - RNeXML (>= 2.4.0) + RNeXML (>= 2.4.0), + digest, + memoise Suggests: roxygen2, knitr, diff --git a/NAMESPACE b/NAMESPACE index 5b9e79f..fee4322 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ export(add_provenance_record) export(anatomy_ontology_iris) export(as.phenotype) export(bestPairs) +export(cache_serializableMemory) export(chars) export(charstates) export(corpus_size) @@ -53,6 +54,8 @@ export(pk_taxon_detail) export(profile_similarity) export(reduce.ignoringDiag) export(resnik_similarity) +export(restore_cache) +export(serialize_cache) export(subsumer_matrix) export(tanimoto_similarity) export(taxon_ontology_iris) @@ -71,6 +74,7 @@ importFrom(RNeXML,get_taxa) importFrom(RNeXML,meta) importFrom(RNeXML,nexml) importFrom(RNeXML,nexml_read) +importFrom(digest,digest) importFrom(dplyr,"%>%") importFrom(dplyr,all_vars) importFrom(dplyr,filter) @@ -83,6 +87,8 @@ importFrom(httr,POST) importFrom(httr,content) importFrom(jsonlite,fromJSON) importFrom(jsonlite,toJSON) +importFrom(memoise,is.memoised) +importFrom(memoise,memoise) importFrom(methods,is) importFrom(methods,new) importFrom(methods,slot) diff --git a/R/cache.R b/R/cache.R new file mode 100644 index 0000000..0640259 --- /dev/null +++ b/R/cache.R @@ -0,0 +1,211 @@ +#' In-memory cache initialized from file +#' +#' A cache in memory, but initialized from, and serializable to a file on the +#' filesystem. Anything added to the cache and not serialized will last only in +#' the current R session. The persistent file cache can be updated by invoking +#' `cacheObj$serialize()` (where `cacheObj` is the object returned by this +#' function), or a file copy can be created by passing a different path as +#' argument to the invocation. +#' +#' The implementation is based on [memoise::cache_memory()], and is fully +#' compatible as a cache function with [memoise::memoise()] (parameter `cache`). +#' It differs in the following ways: +#' - The object returned (a list) has additional keys `serialize` and `restore`, +#' both functions accepting a path to a file. +#' - The computed hash values (as keys for the cache) remain the same across +#' reinstallation of this package, making it possible to use a persistent +#' cache that comes installed with the package. +#' +#' @param algo The hashing algorithm used for the cache, see +#' \code{\link[digest]{digest}} for available algorithms. +#' @param path the path for the file from which to initially load the cache, or +#' a [call][base::call] that evaluates to a file path. By default this is used +#' as the file to restore from and serialize to when requested. If the file +#' does not exist at initialization, it will be treated as if an empty file +#' had been provided. +#' @param compress logical, whether to use compression when serializing the cache. +#' @return +#' An object (in the form of a list) with the same interface as the objects +#' returned by [memoise::cache_memory()] etc., with functions `serialize()` and +#' `restore()` added. Both accept parameter `path`, which defaults to the value +#' of the `path` parameter above, and serializes / restores the cache to / from +#' the file designated by its value. +#' @references +#' Hadley Wickham, Jim Hester, Kirill Müller and Daniel Cook (2017). +#' memoise: Memoisation of Functions. R package version 1.1.0. +#' https://CRAN.R-project.org/package=memoise +#' @family cache methods +#' @keywords internal +#' @importFrom digest digest +#' @export +cache_serializableMemory <- function(algo = "sha512", path = NULL, compress = FALSE) { + + cache <- NULL + + cache_restore <- function(path = NULL, warnNotExists = TRUE) { + if (missing(path) || is.null(path)) + path <- parent.env(env = environment())$path + if (is.call(path)) path <- eval(path) + + if (is.null(path)) + # no file path set here or when initialized, treat as reset + cache_reset() + else if (file.exists(path)) + # hopefully all is good - file path set and exists + cache <<- readRDS(path) + else { + # file path set but doesn't exist, treat as restoring to empty file + if (warnNotExists) + warning("File '", path, "' to restore cache from does not exist. ", + "Treating as empty cache, i.e., cache reset.", + call. = FALSE) + cache_reset() + } + } + + cache_reset <- function() { + cache <<- new.env(TRUE, emptyenv()) + } + + cache_set <- function(key, value) { + assign(key, value, envir = cache) + } + + cache_get <- function(key) { + get(key, envir = cache, inherits = FALSE) + } + + cache_has_key <- function(key) { + exists(key, envir = cache, inherits = FALSE) + } + + cache_drop_key <- function(key) { + rm(list = key, envir = cache, inherits = FALSE) + } + + cache_serialize <- function(path = NULL) { + if (missing(path) || is.null(path)) + path <- parent.env(env = environment())$path + if (is.call(path)) path <- eval(path) + + if (is.null(path)) + warning("No path set or provided to which to serialize cache", call. = FALSE) + else + saveRDS(cache, file = path, compress = compress) + } + + hash_objects <- function(...) { + args <- list(...) + obj <- args[[1]] + obj <- lapply(obj, function(obj) + if (is.call(obj)) + paste(format(obj), collapse = "\n") + else + obj + ) + args[[1]] <- unlist(obj) + args$algo <- algo + do.call(digest::digest, args) + } + + cache_restore(path, warnNotExists = FALSE) + + list( + digest = hash_objects, + reset = cache_reset, + set = cache_set, + get = cache_get, + has_key = cache_has_key, + drop_key = cache_drop_key, + keys = function() ls(cache), + serialize = cache_serialize, + restore = cache_restore + ) +} + +#' Serialize a memoised function's cache to disk +#' +#' @param f the (memoised) function for which to serialize the cache to file +#' @param ... parameters to be passed on to the `serialize()` method of the cache, +#' normally the file path if different from the one with which the cache +#' was initialized. +#' @family cache methods +#' @importFrom memoise is.memoised +#' @export +serialize_cache <- function(f, ...) { + if (! memoise::is.memoised(f)) stop("`f` is not a memoised function", call. = FALSE) + + env <- environment(f) + if (exists("_cache", env, inherits = FALSE)) { + cache <- get("_cache", env) + if ("serialize" %in% names(cache)) + return(cache$serialize(...)) + else + warning("function cache does not support serializing upon request", call. = FALSE) + } + invisible(NULL) # this is what saveRDS() returns normally +} + +#' Restore a memoised function's cache from a file +#' +#' @param f the (memoised) function for which to restore the cache from file +#' @param ... parameters to be passed on to the `restore()` method of the cache, +#' normally the file path if different from the one with which the cache was +#' initialized. If a file path is provided, by default a warning is issued +#' if it does not exist. This can be suppressed by additionally passing +#' `warnNotExists = FALSE`. +#' @family cache methods +#' @importFrom memoise is.memoised +#' @export +restore_cache <- function(f, ...) { + if (! memoise::is.memoised(f)) stop("`f` is not a memoised function", call. = FALSE) + + env <- environment(f) + if (exists("_cache", env, inherits = FALSE)) { + cache <- get("_cache", env) + if ("restore" %in% names(cache)) + cache$restore(...) + else + warning("function cache does not support restoring from file upon request", call. = FALSE) + } +} + +#' Memoise a function optionally with persistent cache +#' +#' This is a thin layer over [memoise::memoise()], by applying a few defaults. +#' Specifically, by default the cache is in memory, but optionally a name for +#' a persistent cache can be given, in which case the cache is still held (and +#' grown) in memory, but is initialized from a file if it exists. +#' +#' The name of the persistent cache is turned into a file path by prefixing it +#' with the path to the `data-cache/` directory within the installed package, +#' and appending the extension `.rds`. Normally this location will not +#' be writable, i.e., the default persistent file cache will normally come with +#' installation, and stay unchanged. +#' @note +#' This function is a shortcut meant to be used only by the package's developers. +#' The described behavior when providing a persistent cache name can be achieved +#' with files located anywhere by using [memoise::memoise()] and passing +#' [cache_serializableMemory()] for the `cache` parameter. +#' @param f the function to [memoise][memoise::memoise()] +#' @param persistName character, a name for the persistent cache to load. Do not +#' include extension or path. The corresponding file is assumed to be part +#' of the installed package. If there is no such persistent cache file, this +#' is ignored on startup/load. +#' @importFrom memoise memoise +#' @keywords internal +memoise <- function(f, persistName = NULL) { + if (is.null(persistName) || nchar(persistName) == 0) + memoise::memoise(f, cache = memoise::cache_memory(algo = "xxhash64")) + else { + # constructing the actual file path needs to be deferred to when it is + # actually needed, due to Staged Installation compatibility being tested + # in R 3.6.0 and above. See + # https://developer.r-project.org/Blog/public/2019/02/14/staged-install/index.html + cacheFile <- substitute(file.path(system.file(package = "rphenoscape"), + "data-cache", + paste0(persistName, ".rds"))) + memoise::memoise(f, cache = cache_serializableMemory(algo = "xxhash64", + path = cacheFile)) + } +} \ No newline at end of file diff --git a/R/pk_get_IRI.R b/R/pk_get_IRI.R index 218978f..23e4a3c 100644 --- a/R/pk_get_IRI.R +++ b/R/pk_get_IRI.R @@ -219,21 +219,15 @@ semweb_ns <- function(default = NA) { #' @return A character vector #' @export #' @importFrom dplyr filter_at all_vars -anatomy_ontology_iris <- local({ - .iris <- c(); +anatomy_ontology_iris <- memoise( function() { - if (length(.iris) == 0) { - res <- find_term("anatomical structure", - matchBy = c("rdfs:label"), - matchTypes = c("exact", "partial"), - limit = 200) - res <- dplyr::filter_at(res, "label", - all_vars(startsWith(., "anatomical structure"))) - .iris <<- unique(res$isDefinedBy) - } - .iris - } -}) + res <- find_term("anatomical structure", + matchBy = c("rdfs:label"), matchTypes = c("exact", "partial"), + limit = 200) + res <- dplyr::filter_at(res, "label", + all_vars(startsWith(., "anatomical structure"))) + unique(res$isDefinedBy) + }) #' Get IRIs of ontologies with taxonomy terms #' @@ -241,18 +235,12 @@ anatomy_ontology_iris <- local({ #' @return A character vector #' @export #' @importFrom dplyr filter_at all_vars -taxon_ontology_iris <- local({ - .iris <- c(); +taxon_ontology_iris <- memoise( function() { - if (length(.iris) == 0) { - res <- find_term("Vertebrata", - matchBy = c("rdfs:label"), - matchTypes = c("exact")) - .iris <<- unique(res$isDefinedBy) - } - .iris - } -}) + res <- find_term("Vertebrata", + matchBy = c("rdfs:label"), matchTypes = c("exact")) + unique(res$isDefinedBy) + }) ontology_iri <- function(abbr) { ifelse(nchar(abbr) == 0 | diff --git a/R/term-weights.R b/R/term-weights.R index d47929a..42288f9 100644 --- a/R/term-weights.R +++ b/R/term-weights.R @@ -121,29 +121,30 @@ decode_entity_postcomp <- function(x) { }) } -annotations_count <- function(iri, termType, decodeIRI = TRUE, - apiEndpoint = "/taxon/annotations", - ...) { - query <- pkb_args_to_query(...) - query$total <- TRUE - if (termType == "entity" && decodeIRI) { - comps <- decode_entity_postcomp(iri)[[1]] - if ((length(comps$entities) == 1) && any(partOf_iri() %in% comps$rels)) { - query[[termType]] <- comps$entities +annotations_count <- memoise( + function(iri, termType, + decodeIRI = TRUE, apiEndpoint = "/taxon/annotations", ...) { + query <- pkb_args_to_query(...) + query$total <- TRUE + if (termType == "entity" && decodeIRI) { + comps <- decode_entity_postcomp(iri)[[1]] + if ((length(comps$entities) == 1) && any(partOf_iri() %in% comps$rels)) { + query[[termType]] <- comps$entities + } } - } - if (is.null(query[[termType]])) query[[termType]] <- iri - res <- get_json_data(pkb_api(apiEndpoint), query = query) - # if the IRI used for counting is a result of decoding the IRI, _and_ if - # we haven't included parts in the count already - if ((query[[termType]] != iri) && (is.null(query$parts) || ! query$parts)) { - # count with including parts, then subtract entities alone (counted before) - query$parts <- TRUE - res2 <- get_json_data(pkb_api(apiEndpoint), query = query) - res2$total - res$total - } else - res$total -} + if (is.null(query[[termType]])) query[[termType]] <- iri + res <- get_json_data(pkb_api(apiEndpoint), query = query) + # if the IRI used for counting is a result of decoding the IRI, _and_ if + # we haven't included parts in the count already + if ((query[[termType]] != iri) && (is.null(query$parts) || ! query$parts)) { + # count with including parts, then subtract entities alone (counted before) + query$parts <- TRUE + res2 <- get_json_data(pkb_api(apiEndpoint), query = query) + res2$total - res$total + } else + res$total + }, + persistName = "annot-counts") #' Obtain the size of different corpora #' @@ -163,23 +164,18 @@ annotations_count <- function(iri, termType, decodeIRI = TRUE, #' corpus_size("taxa") #' corpus_size("taxon_annotations") #' @export -corpus_size <- local({ - .sizes <- list() +corpus_size <- memoise( function(corpus = c("taxon_annotations", "taxa", "gene_annotations", "genes")) { corpus <- match.arg(corpus) - if (is.null(.sizes[[corpus]])) { - if (corpus == "taxa" || corpus == "genes") { - corpusID <- paste0("http://kb.phenoscape.org/sim/", corpus) - res <- get_json_data(pkb_api("/similarity/corpus_size"), - query = list(corpus_graph = corpusID)) - .sizes[[corpus]] <- res$total - } else if (corpus == "taxon_annotations") { - res <- get_json_data(pkb_api("/taxon/annotations"), list(total = TRUE)) - .sizes[[corpus]] <- res$total - } else { - stop("corpus 'gene_annotations' is currently unsupported", call. = FALSE) - } + if (corpus == "taxa" || corpus == "genes") { + corpusID <- paste0("http://kb.phenoscape.org/sim/", corpus) + res <- get_json_data(pkb_api("/similarity/corpus_size"), + query = list(corpus_graph = corpusID)) + res$total + } else if (corpus == "taxon_annotations") { + res <- get_json_data(pkb_api("/taxon/annotations"), list(total = TRUE)) + res$total + } else { + stop("corpus 'gene_annotations' is currently unsupported", call. = FALSE) } - .sizes[[corpus]] - } -}) + }) diff --git a/man/cache_serializableMemory.Rd b/man/cache_serializableMemory.Rd new file mode 100644 index 0000000..6c30772 --- /dev/null +++ b/man/cache_serializableMemory.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cache.R +\name{cache_serializableMemory} +\alias{cache_serializableMemory} +\title{In-memory cache initialized from file} +\usage{ +cache_serializableMemory(algo = "sha512", path = NULL, + compress = FALSE) +} +\arguments{ +\item{algo}{The hashing algorithm used for the cache, see +\code{\link[digest]{digest}} for available algorithms.} + +\item{path}{the path for the file from which to initially load the cache, or +a \link[base:call]{call} that evaluates to a file path. By default this is used +as the file to restore from and serialize to when requested. If the file +does not exist at initialization, it will be treated as if an empty file +had been provided.} + +\item{compress}{logical, whether to use compression when serializing the cache.} +} +\value{ +An object (in the form of a list) with the same interface as the objects +returned by \code{\link[memoise:cache_memory]{memoise::cache_memory()}} etc., with functions \code{serialize()} and +\code{restore()} added. Both accept parameter \code{path}, which defaults to the value +of the \code{path} parameter above, and serializes / restores the cache to / from +the file designated by its value. +} +\description{ +A cache in memory, but initialized from, and serializable to a file on the +filesystem. Anything added to the cache and not serialized will last only in +the current R session. The persistent file cache can be updated by invoking +\code{cacheObj$serialize()} (where \code{cacheObj} is the object returned by this +function), or a file copy can be created by passing a different path as +argument to the invocation. +} +\details{ +The implementation is based on \code{\link[memoise:cache_memory]{memoise::cache_memory()}}, and is fully +compatible as a cache function with \code{\link[memoise:memoise]{memoise::memoise()}} (parameter \code{cache}). +It differs in the following ways: +\itemize{ +\item The object returned (a list) has additional keys \code{serialize} and \code{restore}, +both functions accepting a path to a file. +\item The computed hash values (as keys for the cache) remain the same across +reinstallation of this package, making it possible to use a persistent +cache that comes installed with the package. +} +} +\references{ +Hadley Wickham, Jim Hester, Kirill Müller and Daniel Cook (2017). +memoise: Memoisation of Functions. R package version 1.1.0. +https://CRAN.R-project.org/package=memoise +} +\seealso{ +Other cache methods: \code{\link{restore_cache}}, + \code{\link{serialize_cache}} +} +\concept{cache methods} +\keyword{internal} diff --git a/man/memoise.Rd b/man/memoise.Rd new file mode 100644 index 0000000..1758b42 --- /dev/null +++ b/man/memoise.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cache.R +\name{memoise} +\alias{memoise} +\title{Memoise a function optionally with persistent cache} +\usage{ +memoise(f, persistName = NULL) +} +\arguments{ +\item{f}{the function to \link[memoise:memoise]{memoise}} + +\item{persistName}{character, a name for the persistent cache to load. Do not +include extension or path. The corresponding file is assumed to be part +of the installed package. If there is no such persistent cache file, this +is ignored on startup/load.} +} +\description{ +This is a thin layer over \code{\link[memoise:memoise]{memoise::memoise()}}, by applying a few defaults. +Specifically, by default the cache is in memory, but optionally a name for +a persistent cache can be given, in which case the cache is still held (and +grown) in memory, but is initialized from a file if it exists. +} +\details{ +The name of the persistent cache is turned into a file path by prefixing it +with the path to the \code{data-cache/} directory within the installed package, +and appending the extension \code{.rds}. Normally this location will not +be writable, i.e., the default persistent file cache will normally come with +installation, and stay unchanged. +} +\note{ +This function is a shortcut meant to be used only by the package's developers. +The described behavior when providing a persistent cache name can be achieved +with files located anywhere by using \code{\link[memoise:memoise]{memoise::memoise()}} and passing +\code{\link[=cache_serializableMemory]{cache_serializableMemory()}} for the \code{cache} parameter. +} +\keyword{internal} diff --git a/man/restore_cache.Rd b/man/restore_cache.Rd new file mode 100644 index 0000000..91f1a7a --- /dev/null +++ b/man/restore_cache.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cache.R +\name{restore_cache} +\alias{restore_cache} +\title{Restore a memoised function's cache from a file} +\usage{ +restore_cache(f, ...) +} +\arguments{ +\item{f}{the (memoised) function for which to restore the cache from file} + +\item{...}{parameters to be passed on to the \code{restore()} method of the cache, +normally the file path if different from the one with which the cache was +initialized. If a file path is provided, by default a warning is issued +if it does not exist. This can be suppressed by additionally passing +\code{warnNotExists = FALSE}.} +} +\description{ +Restore a memoised function's cache from a file +} +\seealso{ +Other cache methods: \code{\link{cache_serializableMemory}}, + \code{\link{serialize_cache}} +} +\concept{cache methods} diff --git a/man/serialize_cache.Rd b/man/serialize_cache.Rd new file mode 100644 index 0000000..f8b8738 --- /dev/null +++ b/man/serialize_cache.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cache.R +\name{serialize_cache} +\alias{serialize_cache} +\title{Serialize a memoised function's cache to disk} +\usage{ +serialize_cache(f, ...) +} +\arguments{ +\item{f}{the (memoised) function for which to serialize the cache to file} + +\item{...}{parameters to be passed on to the \code{serialize()} method of the cache, +normally the file path if different from the one with which the cache +was initialized.} +} +\description{ +Serialize a memoised function's cache to disk +} +\seealso{ +Other cache methods: \code{\link{cache_serializableMemory}}, + \code{\link{restore_cache}} +} +\concept{cache methods}