diff --git a/DESCRIPTION b/DESCRIPTION index cde14bf5..ce17b468 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: arf Title: Adversarial Random Forests -Version: 0.1.3 +Version: 0.2.0 Date: 2023-02-06 Authors@R: c(person(given = "Marvin N.", @@ -34,10 +34,10 @@ Imports: ranger, foreach, truncnorm, - matrixStats + tibble Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.0 Suggests: ggplot2, doParallel, diff --git a/NAMESPACE b/NAMESPACE index 19456ce2..c200186e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(adversarial_rf) +export(expct) export(forde) export(forge) export(lik) @@ -9,8 +10,10 @@ import(ranger) importFrom(foreach,"%do%") importFrom(foreach,"%dopar%") importFrom(foreach,foreach) -importFrom(matrixStats,logSumExp) importFrom(stats,predict) importFrom(stats,runif) +importFrom(tibble,as_tibble) importFrom(truncnorm,dtruncnorm) +importFrom(truncnorm,etruncnorm) +importFrom(truncnorm,ptruncnorm) importFrom(truncnorm,rtruncnorm) diff --git a/NEWS.md b/NEWS.md index c3f7db68..28791455 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ -# arf 0.1.3 -* Speed boost for the adversarial resampling step -* Early stopping option for adversarial training -* alpha parameter for regularizing multinomial distributions in forde -* Unified treatment of colnames with internal semantics (y, obs, tree, leaf) \ No newline at end of file +# arf 0.2.0 +* Vectorized adversarial resampling +* Speed boost for compiling into a probabilistic circuit +* Conditional densities and sampling +* Bayesian solution for invariant continuous data within leaf nodes +* New function for computing (conditional) expectations +* Options for missing data \ No newline at end of file diff --git a/R/adversarial_rf.R b/R/adversarial_rf.R index e5d00843..2f5f6846 100644 --- a/R/adversarial_rf.R +++ b/R/adversarial_rf.R @@ -13,6 +13,7 @@ #' @param max_iters Maximum iterations for the adversarial loop. #' @param early_stop Terminate loop if performance fails to improve from one #' round to the next? +#' @param prune Impose \code{min_node_size} by pruning? #' @param verbose Print discriminator accuracy after each round? #' @param parallel Compute in parallel? Must register backend beforehand, e.g. #' via \code{doParallel}. @@ -24,10 +25,10 @@ #' iteratively, with alternating rounds of generation and discrimination. In #' the first instance, synthetic data is generated via independent bootstraps of #' each feature, and a RF classifier is trained to distinguish between real and -#' synthetic samples. In subsequent rounds, synthetic data is generated -#' separately in each leaf, using splits from the previous forest. This creates -#' increasingly realistic data that satisfies local independence by construction. -#' The algorithm converges when a RF cannot reliably distinguish between the two +#' fake samples. In subsequent rounds, synthetic data is generated separately in +#' each leaf, using splits from the previous forest. This creates increasingly +#' realistic data that satisfies local independence by construction. The +#' algorithm converges when a RF cannot reliably distinguish between the two #' classes, i.e. when OOB accuracy falls below 0.5 + \code{delta}. #' #' ARFs are useful for several unsupervised learning tasks, such as density @@ -36,13 +37,12 @@ #' trees for improved performance (typically on the order of 100-1000 depending #' on sample size). 
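[Editor's note: a minimal usage sketch of the density-estimation workflow recommended in the paragraph above; `num_trees = 100L` follows the 100-1000 guideline and `parallel = FALSE` sidesteps backend registration. Illustrative only, not part of this diff.]

```r
library(arf)
set.seed(42)
a   <- adversarial_rf(iris, num_trees = 100L, parallel = FALSE)
psi <- forde(a, iris, parallel = FALSE)   # compile the probabilistic circuit
head(forge(psi, n_synth = 5))             # draw synthetic samples
```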
#' -#' Integer variables are treated as ordered factors by default. If the ARF is -#' passed to \code{forde}, the estimated distribution for these variables will -#' only have support on observed factor levels (i.e., the output will be a pmf, -#' not a pdf). To override this behavior and assign nonzero density to -#' intermediate values, explicitly recode the features as numeric. +#' Integer variables are recoded with a warning. Default behavior is to convert +#' those with six or more unique values to numeric, while those with up to five +#' unique values are treated as ordered factors. To override this behavior, +#' explicitly recode integer variables to the target type prior to training. #' -#' Note: convergence is not guaranteed in finite samples. The \code{max_iter} +#' Note: convergence is not guaranteed in finite samples. The \code{max_iters} #' argument sets an upper bound on the number of training rounds. Similar #' results may be attained by increasing \code{delta}. Even a single round can #' often give good performance, but data with strong or complex dependencies may @@ -84,57 +84,23 @@ adversarial_rf <- function( delta = 0, max_iters = 10L, early_stop = TRUE, + prune = TRUE, verbose = TRUE, parallel = TRUE, ...) { # To avoid data.table check issues - i <- b <- cnt <- obs <- tree <- leaf <- . <- NULL + i <- b <- cnt <- obs <- tree <- leaf <- N <- . <- NULL # Prep data - x_real <- as.data.frame(x) + x_real <- prep_x(x) n <- nrow(x_real) - if ('y' %in% colnames(x_real)) { - colnames(x_real)[which(colnames(x_real) == 'y')] <- col_rename(x_real, 'y') - } - if ('obs' %in% colnames(x_real)) { - colnames(x_real)[which(colnames(x_real) == 'obs')] <- col_rename(x_real, 'obs') - } - if ('tree' %in% colnames(x_real)) { - colnames(x_real)[which(colnames(x_real) == 'tree')] <- col_rename(x_real, 'tree') - } - if ('leaf' %in% colnames(x_real)) { - colnames(x_real)[which(colnames(x_real) == 'leaf')] <- col_rename(x_real, 'leaf') - } - idx_char <- sapply(x_real, is.character) - if (any(idx_char)) { - x_real[, idx_char] <- as.data.frame( - lapply(x_real[, idx_char, drop = FALSE], as.factor) - ) - } - idx_logical <- sapply(x_real, is.logical) - if (any(idx_logical)) { - x_real[, idx_logical] <- as.data.frame( - lapply(x_real[, idx_logical, drop = FALSE], as.factor) - ) - } - idx_intgr <- sapply(x_real, is.integer) - if (any(idx_intgr)) { - warning('Recoding integer data as ordered factors. To override this behavior, ', - 'explicitly code these variables as numeric.') - for (j in which(idx_intgr)) { - lvls <- sort(unique(x_real[, j])) - x_real[, j] <- factor(x_real[, j], levels = lvls, ordered = TRUE) - } - } + d <- ncol(x_real) factor_cols <- sapply(x_real, is.factor) - if (any(!factor_cols) & min_node_size == 1L) { - warning('Variance is undefined when a leaf contains just a single observation. 
', - 'Consider increasing min_node_size.') - } + lvls <- lapply(x_real[factor_cols], levels) # Fit initial model: sample from marginals, concatenate data, train RF - x_synth <- as.data.frame(lapply(x_real, sample, n, replace = TRUE)) + x_synth <- setDF(lapply(x_real, sample, n, replace = TRUE)) dat <- rbind(data.frame(y = 1L, x_real), data.frame(y = 0L, x_synth)) if (isTRUE(parallel)) { @@ -155,29 +121,40 @@ adversarial_rf <- function( ', Accuracy: ', round(acc0 * 100, 2), '%\n')) } if (acc0 > 0.5 + delta & iters < max_iters) { - sample_by_class <- function(x, n) { - if (is.numeric(x)) { - as.numeric(sample(x, n, replace = TRUE)) - } else { - sample(x, n, replace = TRUE) - } - } converged <- FALSE while (!isTRUE(converged)) { # Adversarial loop begins... # Create synthetic data by sampling from intra-leaf marginals nodeIDs <- stats::predict(rf0, x_real, type = 'terminalNodes')$predictions - tmp <- melt(as.data.table(nodeIDs), measure.vars = 1:num_trees, - variable.name = 'tree', value.name = 'leaf') - tmp[, tree := as.numeric(gsub('V', '', tree))][, obs := rep(1:n, num_trees)] - x_real_dt <- as.data.table(x_real)[, obs := 1:n] - x_real_dt <- merge(x_real_dt, tmp, by = 'obs', sort = FALSE) - tmp[, obs := NULL] + tmp <- data.table('tree' = rep(seq_len(num_trees), each = n), + 'leaf' = as.vector(nodeIDs)) + x_real_dt <- do.call(rbind, lapply(seq_len(num_trees), function(b) { + cbind(x_real, tmp[tree == b]) + })) + x_real_dt[, factor_cols] <- lapply(x_real_dt[, factor_cols, drop = FALSE], as.numeric) tmp <- tmp[sample(.N, n, replace = TRUE)] tmp <- unique(tmp[, cnt := .N, by = .(tree, leaf)]) - draw_from <- merge(tmp, x_real_dt, by = c('tree', 'leaf'), sort = FALSE) - x_synth <- draw_from[, lapply(.SD[, -c('cnt', 'obs')], sample_by_class, .SD[, cnt[1]]), - by = .(tree, leaf)][, c('tree', 'leaf') := NULL] - rm(nodeIDs, tmp, x_real_dt, draw_from) + draw_from <- merge(tmp, x_real_dt, by = c('tree', 'leaf'), + sort = FALSE)[, N := .N, by = .(tree, leaf)] + rm(nodeIDs, tmp, x_real_dt) + draw_params_within <- unique(draw_from, by = c('tree','leaf'))[, .(cnt, N)] + adj_absolut_col <- rep(c(0, draw_params_within[-.N, cumsum(N)]), + times = draw_params_within$cnt) + adj_absolut <- rep(adj_absolut_col, d) + rep(seq(0, d - 1) * nrow(draw_from), each = n) + idx_drawn_within <- ceiling(runif(n * d, 0, rep(draw_params_within$N, draw_params_within$cnt))) + idx_drawn <- idx_drawn_within + adj_absolut + draw_from_stacked <- unlist(draw_from[, -c('tree', 'leaf', 'cnt', 'N')], + use.names = FALSE) + values_drawn_stacked <- data.table('col_id' = rep(seq_len(d), each = n), + 'values' = draw_from_stacked[idx_drawn]) + x_synth <- as.data.table(split(values_drawn_stacked, by = 'col_id', keep.by = FALSE)) + setnames(x_synth, names(x_real)) + if (any(factor_cols)) { + x_synth[, names(which(factor_cols))] <- lapply(names(which(factor_cols)), function(j) { + lvls[[j]][x_synth[[j]]] + }) + } + rm(draw_from, draw_params_within, adj_absolut_col, + adj_absolut, idx_drawn_within, idx_drawn, draw_from_stacked) # Concatenate real and synthetic data dat <- rbind(data.frame(y = 1L, x_real), data.frame(y = 0L, x_synth)) @@ -195,8 +172,8 @@ adversarial_rf <- function( acc0 <- 1 - rf1$prediction.error acc <- c(acc, acc0) iters <- iters + 1L - plateau <- ifelse(isTRUE(early_stop), - acc[iters] <= acc[iters + 1L], FALSE) + plateau <- fifelse(isTRUE(early_stop), + acc[iters] <= acc[iters + 1L], FALSE) if (acc0 <= 0.5 + delta | iters >= max_iters | plateau) { converged <- TRUE } else { @@ -211,32 +188,34 @@ adversarial_rf <- function( } 
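[Editor's note: the vectorized resampling above replaces a per-leaf `sample()` loop with index arithmetic over a stacked column vector. A toy sketch of the indexing trick follows; `grp`, `off`, and `idx` are illustrative names, not package objects.]

```r
library(data.table)
set.seed(1)
draw_from <- data.table(a = 1:5, b = 101:105)      # rows 1-3: leaf 1; rows 4-5: leaf 2
grp <- data.table(N = c(3L, 2L), cnt = c(2L, 1L))  # rows per leaf / draws per leaf
d <- ncol(draw_from); n <- sum(grp$cnt)
off_col <- rep(c(0, cumsum(grp$N)[-nrow(grp)]), times = grp$cnt)        # leaf offsets
off <- rep(off_col, d) + rep(seq(0, d - 1) * nrow(draw_from), each = n) # + column offsets
idx <- ceiling(runif(n * d, 0, rep(grp$N, grp$cnt))) + off
stacked <- unlist(draw_from, use.names = FALSE)    # all columns in one vector
matrix(stacked[idx], nrow = n, ncol = d)           # each cell drawn within its own leaf
```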
  # Prune leaves to ensure min_node_size w.r.t. real data
-  pred <- stats::predict(rf0, x_real, type = 'terminalNodes')$predictions + 1L
-  prune <- function(tree) {
-    out <- rf0$forest$child.nodeIDs[[tree]]
-    leaves <- which(out[[1]] == 0L)
-    to_prune <- leaves[!(leaves %in% which(tabulate(pred[, tree]) >= min_node_size))]
-    while(length(to_prune) > 0) {
-      for (tp in to_prune) {
-        # Find parents
-        parent <- which((out[[1]] + 1L) == tp)
-        if (length(parent) > 0) {
-          # Left child
-          out[[1]][parent] <- out[[2]][parent]
-        } else {
-          # Right child
-          parent <- which((out[[2]] + 1L) == tp)
-          out[[2]][parent] <- out[[1]][parent]
+  if (isTRUE(prune)) {
+    pred <- stats::predict(rf0, x_real, type = 'terminalNodes')$predictions + 1L
+    prune <- function(tree) {
+      out <- rf0$forest$child.nodeIDs[[tree]]
+      leaves <- which(out[[1]] == 0L)
+      to_prune <- leaves[!(leaves %in% which(tabulate(pred[, tree]) >= min_node_size))]
+      while(length(to_prune) > 0) {
+        for (tp in to_prune) {
+          # Find parents
+          parent <- which((out[[1]] + 1L) == tp)
+          if (length(parent) > 0) {
+            # Left child
+            out[[1]][parent] <- out[[2]][parent]
+          } else {
+            # Right child
+            parent <- which((out[[2]] + 1L) == tp)
+            out[[2]][parent] <- out[[1]][parent]
+          }
         }
+        to_prune <- which((out[[1]] + 1L) %in% to_prune)
       }
-      to_prune <- which((out[[1]] + 1L) %in% to_prune)
+      return(out)
+    }
+    if (isTRUE(parallel)) {
+      rf0$forest$child.nodeIDs <- foreach(b = seq_len(num_trees)) %dopar% prune(b)
+    } else {
+      rf0$forest$child.nodeIDs <- foreach(b = seq_len(num_trees)) %do% prune(b)
     }
-    return(out)
-  }
-  if (isTRUE(parallel)) {
-    rf0$forest$child.nodeIDs <- foreach(b = 1:num_trees) %dopar% prune(b)
-  } else {
-    rf0$forest$child.nodeIDs <- foreach(b = 1:num_trees) %do% prune(b)
   }

   # Export
diff --git a/R/expct.R b/R/expct.R
new file mode 100644
index 00000000..1b3128de
--- /dev/null
+++ b/R/expct.R
@@ -0,0 +1,134 @@
+#' Expected Value
+#'
+#' Compute the expectation of some query variable(s), optionally conditioned
+#' on some event(s).
+#'
+#' @param params Circuit parameters learned via \code{\link{forde}}.
+#' @param query Optional character vector of variable names. Estimates will be
+#' computed for each. If \code{NULL}, all variables other than those in
+#' \code{evidence} will be estimated.
+#' @param evidence Optional set of conditioning events. This can take one of
+#' three forms: (1) a partial sample, i.e. a single row of data with some but
+#' not all columns; (2) a data frame of conditioning events, which allows for
+#' inequalities; or (3) a posterior distribution over leaves. See Details.
+#'
+#'
+#' @details
+#' This function computes expected values for any subset of features, optionally
+#' conditioned on some event(s).
+#'
+#'
+#' @return
+#' A one-row data frame with values for all query variables.
+#'
+#'
+#' @references
+#' Watson, D., Blesch, K., Kapar, J., & Wright, M. (2023). Adversarial random
+#' forests for density estimation and generative modeling. In \emph{Proceedings
+#' of the 26th International Conference on Artificial Intelligence and
+#' Statistics}, pp. 5357-5375.
+#'
+#'
+#' @examples
+#' # Train ARF and corresponding circuit
+#' arf <- adversarial_rf(iris)
+#' psi <- forde(arf, iris)
+#'
+#' # What is the expected value of Sepal.Length?
+#' expct(psi, query = "Sepal.Length")
+#'
+#' # What if we condition on Species = "setosa"?
+#' evi <- data.frame(Species = "setosa") +#' expct(psi, query = "Sepal.Length", evidence = evi) +#' +#' # Compute expectations for all features other than Species +#' expct(psi, evidence = evi) +#' +#' +#' @seealso +#' \code{\link{adversarial_rf}}, \code{\link{forde}}, \code{\link{lik}} +#' +#' +#' @export +#' @import data.table +#' @importFrom truncnorm etruncnorm +#' + +expct <- function( + params, + query = NULL, + evidence = NULL) { + + # To avoid data.table check issues + variable <- tree <- f_idx <- cvg <- wt <- V1 <- value <- val <- family <- + mu <- sigma <- obs <- prob <- . <- NULL + + # Prep evidence + conj <- FALSE + if (!is.null(evidence)) { + evidence <- prep_evi(params, evidence) + if (!all(c('f_idx', 'wt') %in% colnames(evidence))) { + conj <- TRUE + } + } + + # Check query + if (is.null(query)) { + if (isTRUE(conj)) { + query <- setdiff(params$meta$variable, evidence$variable) + } else { + query <- params$meta$variable + if (!is.null(evidence)) { + warning('Computing expectations for all variables. To avoid this ', + 'for conditioning variables, consider passing evidence in the ', + 'form of a partial sample or data frame of events.') + } + } + } else if (any(!query %in% params$meta$variable)) { + err <- setdiff(query, params$meta$variable) + stop('Unrecognized feature(s) in query: ', err) + } + factor_cols <- params$meta[variable %in% query, family == 'multinom'] + + # PMF over leaves + if (is.null(evidence)) { + num_trees <- params$forest[, max(tree)] + omega <- params$forest[, .(f_idx, cvg)] + omega[, wt := cvg / num_trees] + omega[, cvg := NULL] + } else if (conj) { + omega <- leaf_posterior(params, evidence) + } else { + omega <- evidence + } + omega <- omega[wt > 0] + + psi_cnt <- psi_cat <- NULL + # Continuous data + if (any(!factor_cols)) { + tmp <- merge(params$cnt[variable %in% query], omega, by = 'f_idx', sort = FALSE) + # tmp[, expct := truncnorm::etruncnorm(min, max, mu, sigma)] + # psi_cnt <- tmp[, crossprod(wt, expct), by = variable] + psi_cnt <- tmp[, crossprod(wt, mu), by = variable] + psi_cnt <- dcast(psi_cnt, . ~ variable, value.var = 'V1')[, . := NULL] + } + + # Categorical data + if (any(factor_cols)) { + tmp <- merge(params$cat[variable %in% query], omega, by = 'f_idx', sort = FALSE) + tmp <- tmp[, crossprod(prob, wt), by = .(variable, val)] + tmp <- tmp[order(match(variable, query[factor_cols]))] + vals <- tmp[tmp[, .I[which.max(V1)], by = variable]$V1]$val + psi_cat <- setDT(lapply(seq_along(vals), function(j) vals[j])) + setnames(psi_cat, query[factor_cols]) + } + + # Clean up, export + out <- cbind(psi_cnt, psi_cat) + out <- post_x(out, params) + return(out) +} + + + + diff --git a/R/forde.R b/R/forde.R index 9e68a710..31618e9e 100644 --- a/R/forde.R +++ b/R/forde.R @@ -11,14 +11,16 @@ #' features. Current options include truncated normal (the default #' \code{family = "truncnorm"}) and uniform (\code{family = "unif"}). See #' Details. +#' @param finite_bounds Impose finite bounds on all continuous variables? #' @param alpha Optional pseudocount for Laplace smoothing of categorical #' features. This avoids zero-mass points when test data fall outside the #' support of training data. Effectively parametrizes a flat Dirichlet prior #' on multinomial likelihoods. #' @param epsilon Optional slack parameter on empirical bounds when -#' \code{family = "unif"}. This avoids zero-density points when test data fall -#' outside the support of training data. The gap between lower and upper -#' bounds is expanded by a factor of \code{1 + epsilon}. 
+#' \code{family = "unif"} or \code{finite_bounds = TRUE}. This avoids +#' zero-density points when test data fall outside the support of training +#' data. The gap between lower and upper bounds is expanded by a factor of +#' \code{1 + epsilon}. #' @param parallel Compute in parallel? Must register backend beforehand, e.g. #' via \code{doParallel}. #' @@ -28,8 +30,8 @@ #' distribution parameters for data within each leaf. The former includes #' coverage (proportion of data falling into the leaf) and split criteria. The #' latter includes proportions for categorical features and mean/variance for -#' continuous features. These values are stored in a \code{data.table}, which -#' can be used as input to various other functions. +#' continuous features. The result is a probabilistic circuit, stored as a +#' \code{data.table}, which can be used for various downstream inference tasks. #' #' Currently, \code{forde} only provides support for a limited number of #' distributional families: truncated normal or uniform for continuous data, @@ -86,14 +88,15 @@ forde <- function( x, oob = FALSE, family = 'truncnorm', + finite_bounds = FALSE, alpha = 0, epsilon = 0, parallel = TRUE) { # To avoid data.table check issues - tree <- n_oob <- cvg <- leaf <- variable <- count <- sd <- value <- y_new <- - obs_new <- tree_new <- leaf_new <- psi_cnt <- psi_cat <- f_idx <- sigma <- - new_min <- new_max <- prob <- val <- val_count <- k <- . <- NULL + tree <- n_oob <- cvg <- leaf <- variable <- count <- sd <- value <- psi_cnt <- + psi_cat <- f_idx <- sigma <- new_min <- new_max <- mid <- sigma0 <- prob <- + val <- val_count <- level <- all_na <- i <- k <- cnt <- . <- NULL # Prelimz if (isTRUE(oob) & !nrow(x) %in% c(arf$num.samples, arf$num.samples/2)) { @@ -102,76 +105,58 @@ forde <- function( if (!family %in% c('truncnorm', 'unif')) { stop('family not recognized.') } + if (alpha < 0) { + stop('alpha must be nonnegative.') + } + if (epsilon < 0) { + stop('epsilon must be nonnegative.') + } # Prep data input_class <- class(x) x <- as.data.frame(x) + inf_flag <- sapply(seq_along(x), function(j) any(is.infinite(x[[j]]))) + if (any(inf_flag)) { + stop('x contains infinite values.') + } n <- nrow(x) d <- ncol(x) colnames_x <- colnames(x) - if ('y' %in% colnames(x)) { - y_new <- col_rename(x, 'y') - colnames(x)[which(colnames(x) == 'y')] <- y_new - } - if ('obs' %in% colnames(x)) { - obs_new <- col_rename(x, 'obs') - colnames(x)[which(colnames(x) == 'obs')] <- obs_new - } - if ('tree' %in% colnames(x)) { - tree_new <- col_rename(x, 'tree') - colnames(x)[which(colnames(x) == 'tree')] <- tree_new - } - if ('leaf' %in% colnames(x)) { - leaf_new <- col_rename(x, 'leaf') - colnames(x)[which(colnames(x) == 'leaf')] <- leaf_new - } classes <- sapply(x, class) - idx_char <- sapply(x, is.character) - if (any(idx_char)) { - x[, idx_char] <- as.data.frame( - lapply(x[, idx_char, drop = FALSE], as.factor) - ) - } - idx_logical <- sapply(x, is.logical) - if (any(idx_logical)) { - x[, idx_logical] <- as.data.frame( - lapply(x[, idx_logical, drop = FALSE], as.factor) - ) - } - idx_integer <- sapply(x, is.integer) - if (any(idx_integer)) { - warning('Recoding integer data as ordered factors. 
To override this behavior, ', - 'explicitly code these variables as numeric.') - for (j in which(idx_integer)) { - lvls <- sort(unique(x[, j])) - x[, j] <- factor(x[, j], levels = lvls, ordered = TRUE) - } - } + x <- suppressWarnings(prep_x(x)) factor_cols <- sapply(x, is.factor) - if (!family %in% c('truncnorm', 'unif')) { - stop('family not recognized.') - } - if (alpha < 0) { - stop('alpha must be nonnegative.') - } - if (epsilon < 0) { - stop('epsilon must be nonnegative.') + lvls <- arf$forest$covariate.levels[factor_cols] + lvl_df <- rbindlist(lapply(seq_along(lvls), function(j) { + melt(as.data.table(lvls[j]), measure.vars = names(lvls)[j], + value.name = 'val')[, level := .I] + })) + names(factor_cols) <- colnames_x + deci <- rep(NA_integer_, d) + if (any(!factor_cols)) { + deci[!factor_cols] <- sapply(which(!factor_cols), function(j) { + if (any(grepl('\\.', x[[j]]))) { + tmp <- x[grepl('\\.', x[[j]]), j] + out <- max(nchar(sub('.*[.]', '', tmp))) + } else { + out <- 0L + } + return(out) + }) } # Compute leaf bounds and coverage num_trees <- arf$num.trees - pred <- stats::predict(arf, x, type = 'terminalNodes')$predictions + 1L bnd_fn <- function(tree) { num_nodes <- length(arf$forest$split.varIDs[[tree]]) lb <- matrix(-Inf, nrow = num_nodes, ncol = d) ub <- matrix(Inf, nrow = num_nodes, ncol = d) - if (family == 'unif') { - for (j in seq_len(d)) { - if (!isTRUE(factor_cols[j])) { - gap <- max(x[[j]]) - min(x[[j]]) - lb[, j] <- min(x[[j]]) - epsilon/2 * gap - ub[, j] <- max(x[[j]]) + epsilon/2 * gap - } + if (family == 'unif' | isTRUE(finite_bounds) & any(!factor_cols)) { + for (j in which(!factor_cols)) { + min_j <- min(x[[j]], na.rm = TRUE) + max_j <- max(x[[j]], na.rm = TRUE) + gap <- max_j - min_j + lb[, j] <- min_j - epsilon / 2 * gap + ub[, j] <- max_j + epsilon / 2 * gap } } for (i in 1:num_nodes) { @@ -189,8 +174,7 @@ forde <- function( } } leaves <- which(arf$forest$child.nodeIDs[[tree]][[1]] == 0L) - colnames(lb) <- arf$forest$independent.variable.names - colnames(ub) <- arf$forest$independent.variable.names + colnames(lb) <- colnames(ub) <- colnames_x merge(melt(data.table(tree = tree, leaf = leaves, lb[leaves, ]), id.vars = c('tree', 'leaf'), value.name = 'min'), melt(data.table(tree = tree, leaf = leaves, ub[leaves, ]), @@ -198,33 +182,34 @@ forde <- function( by = c('tree', 'leaf', 'variable'), sort = FALSE) } if (isTRUE(parallel)) { - bnds <- foreach(tree = 1:num_trees, .combine = rbind) %dopar% bnd_fn(tree) + bnds <- foreach(tree = seq_len(num_trees), .combine = rbind) %dopar% bnd_fn(tree) } else { - bnds <- foreach(tree = 1:num_trees, .combine = rbind) %do% bnd_fn(tree) + bnds <- foreach(tree = seq_len(num_trees), .combine = rbind) %do% bnd_fn(tree) } - # Use only OOB data? 
+  # Compute coverage
+  pred <- stats::predict(arf, x, type = 'terminalNodes')$predictions + 1L
+  keep <- data.table('tree' = rep(seq_len(num_trees), each = n),
+                     'leaf' = as.vector(pred))
   if (isTRUE(oob)) {
-    inbag <- (do.call(cbind, arf$inbag.counts) > 0L)[1:(arf$num.samples/2), ]
-    pred[inbag] <- NA_integer_
-    bnds[, n_oob := sum(!is.na(pred[, tree])), by = tree]
-    bnds[, cvg := sum(pred[, tree] == leaf, na.rm = TRUE) / n_oob, by = .(tree, leaf)]
-    if (any(!factor_cols)) {
-      bnds[cvg == 1 / n_oob, cvg := 0]
-    }
+    keep[, oob := as.vector(sapply(seq_len(num_trees), function(b) {
+      arf$inbag.counts[[b]][seq_len(n)] == 0L
+    }))]
+    keep <- keep[oob == TRUE]
+    # Count OOB observations per tree before collapsing to unique leaves
+    keep[, n_oob := .N, by = tree]
+    keep <- unique(keep[, cnt := .N, by = .(tree, leaf)])
+    keep[, cvg := cnt / n_oob][, c('oob', 'cnt', 'n_oob') := NULL]
   } else {
-    bnds[, cvg := sum(pred[, tree] == leaf) / n, by = .(tree, leaf)]
-    if (any(!factor_cols)) {
-      bnds[cvg == 1 / n, cvg := 0]
-    }
+    keep <- unique(keep[, cnt := .N, by = .(tree, leaf)])
+    keep[, cvg := cnt / n][, cnt := NULL]
   }
-  # No parameters to learn for zero coverage leaves
-  bnds <- bnds[cvg > 0]
+  bnds <- merge(bnds, keep, by = c('tree', 'leaf'), sort = FALSE)
+  rm(keep)

   # Create forest index
   setkey(bnds, tree, leaf)
   bnds[, f_idx := .GRP, by = key(bnds)]

   # Calculate distribution parameters for each variable
-  fams <- ifelse(factor_cols, 'multinom', family)
+  setnames(x, colnames_x)

   # Continuous case
   if (any(!factor_cols)) {
     psi_cnt_fn <- function(tree) {
@@ -236,33 +221,48 @@
       dt <- merge(dt, bnds[, .(tree, leaf, variable, min, max, f_idx)],
                   by = c('tree', 'leaf', 'variable'), sort = FALSE)
       if (family == 'truncnorm') {
-        dt[, c('mu', 'sigma') := .(mean(value), sd(value)), by = .(leaf, variable)]
-        if (dt[sigma == 0, .N] > 0L) {
-          dt[sigma == 0, new_min := ifelse(!is.finite(min), min(value), min), by = variable]
-          dt[sigma == 0, new_max := ifelse(!is.finite(max), max(value), max), by = variable]
-          dt[sigma == 0, value := stats::runif(.N, min = new_min, max = new_max)]
-          dt[sigma == 0, sigma := sd(value), by = .(leaf, variable)]
-          dt[, c('new_min', 'new_max') := NULL]
+        if (any(is.na(dt$value))) {
+          dt[, all_na := all(is.na(value)), by = .(leaf, variable)]
+          if (any(dt[, all_na == TRUE])) {
+            if (any(dt[all_na == TRUE, !is.finite(min)])) {
+              for (j in names(which(!factor_cols))) {
+                dt[all_na == TRUE & !is.finite(min) & variable == j, min := min(x[[j]], na.rm = TRUE)]
+              }
+            }
+            if (any(dt[all_na == TRUE, !is.finite(max)])) {
+              for (j in names(which(!factor_cols))) {
+                dt[all_na == TRUE & !is.finite(max) & variable == j, max := max(x[[j]], na.rm = TRUE)]
+              }
+            }
+            # Impute the leaf midpoint for all-NA leaves
+            dt[all_na == TRUE, value := (min + max) / 2]
+          }
+          dt[, all_na := NULL]
+        }
+        dt[, c('mu', 'sigma') := .(mean(value, na.rm = TRUE), sd(value, na.rm = TRUE)),
+           by = .(leaf, variable)]
+        dt[is.na(sigma), sigma := 0]
+        if (any(dt[, sigma == 0])) {
+          dt[, new_min := fifelse(!is.finite(min), min(value, na.rm = TRUE), min), by = variable]
+          dt[, new_max := fifelse(!is.finite(max), max(value, na.rm = TRUE), max), by = variable]
+          dt[, mid := (new_min + new_max) / 2]
+          dt[, sigma0 := (new_max - mid) / stats::qnorm(0.975)]
+          # This prior places 95% of the density within the bounding box.
+          # In addition, we set the prior degrees of freedom at nu0 = 2.
+          # Since the mode of a chisq is max(df-2, 0), this means that
+          # (1) with a single observation, the posterior reduces to the prior; and
+          # (2) with more invariant observations, the posterior tends toward zero.
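+          # Worked numbers (editorial, illustrative only): with bounds [0, 10],
+          # mid = 5 and sigma0 = 5 / qnorm(0.975) ~ 2.55. A leaf with N invariant
+          # observations then gets sigma = sqrt(2 / N) * sigma0, i.e. ~3.61 at
+          # N = 1 and ~0.36 at N = 100, shrinking at rate 1 / sqrt(N).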
+ dt[sigma == 0, sigma := sqrt(2 / .N * sigma0^2), by = .(variable, leaf)] + dt[, c('new_min', 'new_max', 'mid', 'sigma0') := NULL] } } - dt <- unique(dt[, c('tree', 'leaf', 'value') := NULL]) + return(unique(dt[, c('tree', 'leaf', 'value') := NULL])) } if (isTRUE(parallel)) { - psi_cnt <- foreach(tree = 1:num_trees, .combine = rbind) %dopar% psi_cnt_fn(tree) + psi_cnt <- foreach(tree = seq_len(num_trees), .combine = rbind) %dopar% + psi_cnt_fn(tree) } else { - psi_cnt <- foreach(tree = 1:num_trees, .combine = rbind) %do% psi_cnt_fn(tree) - } - if (!is.null(y_new)) { - psi_cnt[variable == y_new, variable := 'y'] - } - if (!is.null(obs_new)) { - psi_cnt[variable == obs_new, variable := 'obs'] - } - if (!is.null(tree_new)) { - psi_cnt[variable == tree_new, variable := 'tree'] - } - if (!is.null(leaf_new)) { - psi_cnt[variable == leaf_new, variable := 'leaf'] + psi_cnt <- foreach(tree = seq_len(num_trees), .combine = rbind) %do% + psi_cnt_fn(tree) } setkey(psi_cnt, f_idx, variable) setcolorder(psi_cnt, c('f_idx', 'variable')) @@ -276,6 +276,36 @@ forde <- function( } dt <- melt(dt, id.vars = 'leaf', variable.factor = FALSE, value.factor = FALSE, value.name = 'val')[, tree := tree] + if (dt[, any(is.na(val))]) { + dt[, all_na := all(is.na(val)), by = .(leaf, variable)] + dt <- dt[!(is.na(val) & all_na == FALSE)] + if (any(dt[, all_na == TRUE])) { + all_na <- unique(dt[all_na == TRUE]) + dt <- dt[all_na == FALSE] + dt[, all_na := NULL] + all_na <- merge(all_na, bnds[, .(tree, leaf, variable, min, max, f_idx)], + by = c('tree', 'leaf', 'variable'), sort = FALSE) + all_na[!is.finite(min), min := 0.5] + for (j in names(which(factor_cols))) { + all_na[!is.finite(max) & variable == j, max := lvl_df[variable == j, max(level)]] + } + all_na[!grepl('\\.5', min), min := min + 0.5] + all_na[!grepl('\\.5', max), max := max + 0.5] + all_na[, min := min + 0.5][, max := max - 0.5] + all_na <- rbindlist(lapply(seq_len(nrow(all_na)), function(i) { + data.table( + leaf = all_na[i, leaf], variable = all_na[i, variable], + level = all_na[i, seq(min, max)] + ) + })) + all_na <- merge(all_na, lvl_df, by = c('variable', 'level')) + all_na[, level := NULL][, tree := tree] + setcolorder(all_na, colnames(dt)) + dt <- rbind(dt, all_na) + } else { + dt[, all_na := NULL] + } + } dt[, count := .N, by = .(leaf, variable)] dt <- merge(dt, bnds[, .(tree, leaf, variable, min, max, f_idx)], by = c('tree', 'leaf', 'variable'), sort = FALSE) @@ -286,20 +316,14 @@ forde <- function( # Define the range of each variable in each leaf dt <- unique(dt[, val_count := .N, by = .(f_idx, variable, val)]) dt[, k := length(unique(val)), by = variable] - dt[min == -Inf, min := 0.5][max == Inf, max := k + 0.5] - dt[!grepl('.5', min), min := min - 0.5][!grepl('.5', max), max := max + 0.5] + dt[!is.finite(min), min := 0.5][!is.finite(max), max := k + 0.5] + dt[!grepl('\\.5', min), min := min + 0.5][!grepl('\\.5', max), max := max + 0.5] dt[, k := max - min] # Enumerate each possible leaf-variable-value combo tmp <- dt[, seq(min[1] + 0.5, max[1] - 0.5), by = .(f_idx, variable)] - setnames(tmp, 'V1', 'levels') - tmp2 <- rbindlist( - lapply(which(factor_cols), function(j) { - data.table('variable' = colnames(x)[j], - 'val' = arf$forest$covariate.levels[[j]])[, levels := .I] - }) - ) - tmp <- merge(tmp, tmp2, by = c('variable', 'levels'), - sort = FALSE)[, levels := NULL] + setnames(tmp, 'V1', 'level') + tmp <- merge(tmp, lvl_df, by = c('variable', 'level'), + sort = FALSE)[, level := NULL] # Populate count, k tmp <- merge(tmp, unique(dt[, .(f_idx, 
variable, count, k)]), by = c('f_idx', 'variable'), sort = FALSE) @@ -314,31 +338,26 @@ forde <- function( dt[, c('count', 'min', 'max') := NULL] } if (isTRUE(parallel)) { - psi_cat <- foreach(tree = 1:num_trees, .combine = rbind) %dopar% psi_cat_fn(tree) + psi_cat <- foreach(tree = seq_len(num_trees), .combine = rbind) %dopar% + psi_cat_fn(tree) } else { - psi_cat <- foreach(tree = 1:num_trees, .combine = rbind) %do% psi_cat_fn(tree) - } - if (!is.null(y_new)) { - psi_cat[variable == y_new, variable := 'y'] - } - if (!is.null(obs_new)) { - psi_cat[variable == obs_new, variable := 'obs'] - } - if (!is.null(tree_new)) { - psi_cat[variable == tree_new, variable := 'tree'] - } - if (!is.null(leaf_new)) { - psi_cat[variable == leaf_new, variable := 'leaf'] + psi_cat <- foreach(tree = seq_len(num_trees), .combine = rbind) %do% + psi_cat_fn(tree) } + lvl_df[, level := NULL] setkey(psi_cat, f_idx, variable) setcolorder(psi_cat, c('f_idx', 'variable')) } # Add metadata, export psi <- list( - 'cnt' = psi_cnt, 'cat' = psi_cat, + 'cnt' = psi_cnt, + 'cat' = psi_cat, 'forest' = unique(bnds[, .(f_idx, tree, leaf, cvg)]), - 'meta' = data.table(variable = colnames_x, class = classes, family = fams), + 'meta' = data.table('variable' = colnames_x, 'class' = classes, + 'family' = fifelse(factor_cols, 'multinom', family), + 'decimals' = deci), + 'levels' = lvl_df, 'input_class' = input_class ) return(psi) diff --git a/R/forge.R b/R/forge.R index ddf6f652..2ac8705d 100644 --- a/R/forge.R +++ b/R/forge.R @@ -2,18 +2,31 @@ #' #' Uses pre-trained FORDE model to simulate synthetic data. #' -#' @param params Parameters learned via \code{\link{forde}}. +#' @param params Circuit parameters learned via \code{\link{forde}}. #' @param n_synth Number of synthetic samples to generate. -#' @param parallel Compute in parallel? Must register backend beforehand, e.g. -#' via \code{doParallel}. +#' @param evidence Optional set of conditioning events. This can take one of +#' three forms: (1) a partial sample, i.e. a single row of data with +#' some but not all columns; (2) a data frame of conditioning events, +#' which allows for inequalities; or (3) a posterior distribution over leaves. +#' See Details. #' #' @details #' \code{forge} simulates a synthetic dataset of \code{n_synth} samples. First, -#' leaves are sampled in proportion to their coverage. Then, each feature is -#' sampled independently within each leaf according to the probability mass or -#' density function learned by \code{\link{forde}}. This will create realistic +#' leaves are sampled in proportion to either their coverage (if +#' \code{evidence = NULL}) or their posterior probability. Then, each feature is +#' sampled independently within each leaf according to the probability mass or +#' density function learned by \code{\link{forde}}. This will create realistic #' data so long as the adversarial RF used in the previous step satisfies the -#' local independence criterion. See Watson et al. (2022). +#' local independence criterion. See Watson et al. (2023). +#' +#' There are three methods for (optionally) encoding conditioning events via the +#' \code{evidence} argument. The first is to provide a partial sample, where +#' some but not all columns from the training data are present. The second is to +#' provide a data frame with three columns: \code{variable}, \code{relation}, +#' and \code{value}. This supports inequalities via \code{relation}. 
+#' Alternatively, users may directly input a pre-calculated posterior
+#' distribution over leaves, with columns \code{f_idx} and \code{wt}. This may
+#' be preferable for complex constraints. See Examples.
 #'
 #'
 #' @return
@@ -32,6 +45,22 @@
 #' psi <- forde(arf, iris)
 #' x_synth <- forge(psi, n_synth = 100)
 #'
+#' # Condition on Species = "setosa"
+#' evi <- data.frame(Species = "setosa")
+#' x_synth <- forge(psi, n_synth = 100, evidence = evi)
+#'
+#' # Condition on Species = "setosa" and Sepal.Length > 6
+#' evi <- data.frame(variable = c("Species", "Sepal.Length"),
+#'                   relation = c("==", ">"),
+#'                   value = c("setosa", 6))
+#' x_synth <- forge(psi, n_synth = 100, evidence = evi)
+#'
+#' # Or just input some distribution on leaves
+#' # (Weights that do not sum to unity are automatically scaled)
+#' n_leaves <- nrow(psi$forest)
+#' evi <- data.frame(f_idx = psi$forest$f_idx, wt = rexp(n_leaves))
+#' x_synth <- forge(psi, n_synth = 100, evidence = evi)
+#'
 #'
 #' @seealso
 #' \code{\link{adversarial_rf}}, \code{\link{forde}}
@@ -46,24 +75,67 @@
 forge <- function(
     params,
     n_synth,
-    parallel = TRUE) {
+    evidence = NULL) {

   # To avoid data.table check issues
   tree <- cvg <- leaf <- idx <- family <- mu <- sigma <- prob <- dat <-
-    variable <- j <- f_idx <- val <- . <- NULL
+    variable <- relation <- wt <- j <- f_idx <- val <- . <- NULL
+
+  # Prep evidence
+  conj <- FALSE
+  to_sim <- params$meta$variable
+  if (!is.null(evidence)) {
+    evidence <- prep_evi(params, evidence)
+    if (!all(c('f_idx', 'wt') %in% colnames(evidence))) {
+      conj <- TRUE
+      to_sim <- setdiff(params$meta$variable, evidence[relation == '==', variable])
+    }
+  }
+  factor_cols <- params$meta[variable %in% to_sim, family == 'multinom']
+
+  # Prepare the event space
+  if (is.null(evidence)) {
+    num_trees <- params$forest[, max(tree)]
+    omega <- params$forest[, .(f_idx, cvg)]
+    omega[, wt := cvg / num_trees]
+    omega[, cvg := NULL]
+  } else if (isTRUE(conj)) {
+    omega <- leaf_posterior(params, evidence)
+  } else {
+    omega <- evidence
+  }
+  omega <- omega[wt > 0]

-  # Draw random leaves with probability proportional to coverage
-  omega <- params$forest
-  draws <- data.table(
-    'f_idx' = omega[, sample(f_idx, size = n_synth, replace = TRUE, prob = cvg)]
-  )
+  if (nrow(omega) == 1) {
+    # Single eligible leaf: sample() misreads a length-one f_idx as 1:f_idx,
+    # so repeat that leaf n_synth times directly
+    draws <- data.table('f_idx' = rep(omega$f_idx, n_synth))
+  } else {
+    # Draw random leaves with probability proportional to weight
+    draws <- data.table(
+      'f_idx' = omega[, sample(f_idx, size = n_synth, replace = TRUE, prob = wt)]
+    )
+  }
   omega <- merge(draws, omega, sort = FALSE)[, idx := .I]

-  # Simulate data
+  # Simulate continuous data
   synth_cnt <- synth_cat <- NULL
-  if (!is.null(params$cnt)) {
+  if (any(!factor_cols)) {
     fam <- params$meta[family != 'multinom', unique(family)]
-    psi <- merge(omega, params$cnt, by = 'f_idx', sort = FALSE, allow.cartesian = TRUE)
+    psi <- merge(omega, params$cnt[variable %in% to_sim], by = 'f_idx',
+                 sort = FALSE, allow.cartesian = TRUE)
+    if (isTRUE(conj)) {
+      if (any(evidence$relation %in% c('<', '<=', '>', '>='))) {
+        for (k in evidence[, which(grepl('<', relation))]) {
+          j <- evidence$variable[k]
+          value <- as.numeric(evidence$value[k])
+          psi[variable == j & max > value, max := value]
+        }
+        for (k in evidence[, which(grepl('>', relation))]) {
+          j <- evidence$variable[k]
+          value <- as.numeric(evidence$value[k])
+          psi[variable == j & min < value, min := value]
+        }
+      }
+    }
     if (fam == 'truncnorm') {
       psi[, dat := truncnorm::rtruncnorm(.N, a = min, b = max, mean = mu, sd = sigma)]
     } else if (fam == 'unif') {
@@ -71,49 +143,36 @@
     }
synth_cnt <- dcast(psi, idx ~ variable, value.var = 'dat')[, idx := NULL] } - if (!is.null(params$cat)) { - sim_cat <- function(j) { - psi <- merge(omega, params$cat[variable == j], by = 'f_idx', sort = FALSE, - allow.cartesian = TRUE) - psi[prob == 1, dat := val] - psi[prob < 1, dat := sample(val, 1, prob = prob), by = idx] - setnames(data.table(unique(psi[, .(idx, dat)])[, dat]), j) - } - cat_vars <- params$meta[family == 'multinom', variable] - if (isTRUE(parallel)) { - synth_cat <- foreach(j = cat_vars, .combine = cbind) %dopar% sim_cat(j) - } else { - synth_cat <- foreach(j = cat_vars, .combine = cbind) %do% sim_cat(j) + + # Simulate categorical data + if (any(factor_cols)) { + psi <- merge(omega, params$cat[variable %in% to_sim], by = 'f_idx', + sort = FALSE, allow.cartesian = TRUE) + psi[prob == 1, dat := val] + if (isTRUE(conj)) { + if (any(evidence[, relation == '!='])) { + tmp <- evidence[relation == '!=', .(variable, value)] + psi <- merge(psi, tmp, by = 'variable', sort = FALSE) + psi <- psi[val != value] + psi[, value := NULL] + } } + psi[prob < 1, dat := sample(val, 1, prob = prob), by = .(variable, idx)] + psi <- unique(psi[, .(idx, variable, dat)]) + synth_cat <- dcast(psi, idx ~ variable, value.var = 'dat')[, idx := NULL] } - # Clean up, export + # Combine, optionally impose constraint(s) x_synth <- cbind(synth_cnt, synth_cat) - setcolorder(x_synth, params$meta$variable) - setDF(x_synth) - idx_factor <- params$meta[, which(class == 'factor')] - idx_logical <- params$meta[, which(class == 'logical')] - idx_integer <- params$meta[, which(class == 'integer')] - if (sum(idx_factor) > 0L) { - x_synth[, idx_factor] <- as.data.frame( - lapply(x_synth[, idx_factor, drop = FALSE], as.factor) - ) - } - if (sum(idx_logical) > 0L) { - x_synth[, idx_logical] <- as.data.frame( - lapply(x_synth[, idx_logical, drop = FALSE], function(x) {x == 'TRUE'}) - ) - } - if (sum(idx_integer) > 0L) { - x_synth[, idx_integer] <- as.data.frame( - lapply(x_synth[, idx_integer, drop = FALSE], function(x) as.integer(x)) - ) - } - if ('data.table' %in% params$input_class) { - x_synth <- as.data.table(x_synth) - } else if ('matrix' %in% params$input_class) { - x_synth <- as.matrix(x_synth) + if (length(to_sim) != params$meta[, .N]) { + tmp <- evidence[relation == '=='] + add_on <- setDT(lapply(tmp[, .I], function(k) rep(tmp[k, value], n_synth))) + setnames(add_on, tmp[, variable]) + x_synth <- cbind(x_synth, add_on) } + + # Clean up, export + x_synth <- post_x(x_synth, params) return(x_synth) } diff --git a/R/lik.R b/R/lik.R index b78294dd..5bbfacb0 100644 --- a/R/lik.R +++ b/R/lik.R @@ -1,16 +1,24 @@ #' Likelihood Estimation #' -#' Compute the density of input data. +#' Compute the likelihood of input data, optionally conditioned on some event(s). #' -#' @param arf Pre-trained \code{\link{adversarial_rf}}. Alternatively, any -#' object of class \code{ranger}. -#' @param params Parameters learned via \code{\link{forde}}. -#' @param x Input data. Densities will be computed for each sample. +#' @param params Circuit parameters learned via \code{\link{forde}}. +#' @param query Data frame of samples, optionally comprising just a subset of +#' training features. Likelihoods will be computed for each sample. Missing +#' features will be marginalized out. See Details. +#' @param evidence Optional set of conditioning events. This can take one of +#' three forms: (1) a partial sample, i.e. 
a single row of data with some but
+#' not all columns; (2) a data frame of conditioning events, which allows for
+#' inequalities; or (3) a posterior distribution over leaves. See Details.
+#' @param arf Pre-trained \code{\link{adversarial_rf}} or other object of class
+#' \code{ranger}. This is not required but speeds up computation considerably
+#' for total evidence queries. (Ignored for partial evidence queries.)
 #' @param oob Only use out-of-bag leaves for likelihood estimation? If
 #' \code{TRUE}, \code{x} must be the same dataset used to train \code{arf}.
+#' Only applicable for total evidence queries.
 #' @param log Return likelihoods on log scale? Recommended to prevent underflow.
-#' @param batch Batch size. The default is to compute densities for all of
-#' \code{x} in one round, which is always the fastest option if memory allows.
+#' @param batch Batch size. The default is to compute likelihoods for all
+#' queries in one round, which is always the fastest option if memory allows.
 #' However, with large samples or many trees, it can be more memory efficient
 #' to split the data into batches. This has no impact on results.
 #' @param parallel Compute in parallel? Must register backend beforehand, e.g.
@@ -18,11 +26,18 @@
 #'
 #'
 #' @details
-#' This function computes the density of input data according to a FORDE model
-#' using a pre-trained ARF. Each sample's likelihood is a weighted average of
-#' its likelihood in all leaves whose split criteria it satisfies. Intra-leaf
-#' densities are fully factorized, since ARFs satisfy the local independence
-#' criterion by construction. See Watson et al. (2022).
+#' This function computes the likelihood of input data, optionally conditioned
+#' on some event(s). Queries may be partial, i.e. covering some but not all
+#' features, in which case excluded variables will be marginalized out.
+#'
+#' There are three methods for (optionally) encoding conditioning events via the
+#' \code{evidence} argument. The first is to provide a partial sample, where
+#' some but not all columns from the training data are present. The second is to
+#' provide a data frame with three columns: \code{variable}, \code{relation},
+#' and \code{value}. This supports inequalities via \code{relation}.
+#' Alternatively, users may directly input a pre-calculated posterior
+#' distribution over leaves, with columns \code{f_idx} and \code{wt}. This may
+#' be preferable for complex constraints. See Examples.
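[Editor's note: the third evidence form, a posterior over leaves, is not exercised in the \code{lik} examples below. A hedged sketch, reusing leaf coverage as unnormalized weights, which \code{prep_evi} rescales with a warning:]

```r
# Assumes arf and psi from the examples below
evi <- psi$forest[, .(f_idx, wt = cvg)]
lik(psi, query = iris[1, 1:3], evidence = evi)
```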
#' #' #' @return @@ -40,9 +55,26 @@ #' # Estimate average log-likelihood #' arf <- adversarial_rf(iris) #' psi <- forde(arf, iris) -#' ll <- lik(arf, psi, iris, log = TRUE) +#' ll <- lik(psi, iris, arf = arf, log = TRUE) +#' mean(ll) +#' +#' # Identical but slower +#' ll <- lik(psi, iris, log = TRUE) #' mean(ll) #' +#' # Partial evidence query +#' lik(psi, query = iris[1, 1:3]) +#' +#' # Condition on Species = "setosa" +#' evi <- data.frame(Species = "setosa") +#' lik(psi, query = iris[1, 1:3], evidence = evi) +#' +#' # Condition on Species = "setosa" and Petal.Width > 0.3 +#' evi <- data.frame(variable = c("Species", "Petal.Width"), +#' relation = c("==", ">"), +#' value = c("setosa", 0.3)) +#' lik(psi, query = iris[1, 1:3], evidence = evi) +#' #' #' @seealso #' \code{\link{adversarial_rf}}, \code{\link{forge}} @@ -54,60 +86,80 @@ #' @importFrom stats predict #' @importFrom foreach foreach %do% %dopar% #' @importFrom truncnorm dtruncnorm -#' @importFrom matrixStats logSumExp #' lik <- function( - arf, params, - x, - oob = FALSE, + query, + evidence = NULL, + arf = NULL, + oob = FALSE, log = TRUE, batch = NULL, parallel = TRUE) { # To avoid data.table check issues tree <- cvg <- leaf <- variable <- mu <- sigma <- value <- obs <- prob <- - V1 <- family <- fold <- . <- NULL + V1 <- relation <- f_idx <- wt <- val <- family <- fold <- . <- NULL - # Prep data - x <- as.data.frame(x) + # Check query + x <- as.data.frame(query) + colnames_x <- colnames(x) n <- nrow(x) - if ('y' %in% colnames(x)) { - colnames(x)[which(colnames(x) == 'y')] <- col_rename(x, 'y') + d <- ncol(x) + if (d == params$meta[, .N] & is.null(arf)) { + warning('For total evidence queries, it is faster to include the ', + 'pre-trained arf.') } - if ('obs' %in% colnames(x)) { - colnames(x)[which(colnames(x) == 'obs')] <- col_rename(x, 'obs') + if (any(!colnames(x) %in% params$meta$variable)) { + err <- setdiff(colnames(x), params$meta$variable) + stop('Unrecognized feature(s) among colnames: ', err) } - if ('tree' %in% colnames(x)) { - colnames(x)[which(colnames(x) == 'tree')] <- col_rename(x, 'tree') - } - if ('leaf' %in% colnames(x)) { - colnames(x)[which(colnames(x) == 'leaf')] <- col_rename(x, 'leaf') + x <- suppressWarnings(prep_x(x)) + factor_cols <- sapply(x, is.factor) + + # Prep evidence + conj <- FALSE + if (!is.null(evidence)) { + evidence <- prep_evi(params, evidence) + if (!all(c('f_idx', 'wt') %in% colnames(evidence))) { + conj <- TRUE + } } - idx_char <- sapply(x, is.character) - if (any(idx_char)) { - x[, idx_char] <- as.data.frame( - lapply(x[, idx_char, drop = FALSE], as.factor) - ) - } - idx_logical <- sapply(x, is.logical) - if (any(idx_logical)) { - x[, idx_logical] <- as.data.frame( - lapply(x[, idx_logical, drop = FALSE], as.factor) - ) - } - idx_integer <- sapply(x, is.integer) - if (any(idx_integer)) { - for (j in which(idx_integer)) { - lvls <- sort(unique(x[, j])) - x[, j] <- factor(x[, j], levels = lvls, ordered = TRUE) + + # Check ARF + if (d == params$meta[, .N] & !is.null(arf)) { + num_trees <- arf$num.trees + preds <- stats::predict(arf, x, type = 'terminalNodes')$predictions + 1L + preds <- data.table('tree' = rep(seq_len(num_trees), each = n), + 'leaf' = as.vector(preds), + 'obs' = rep(seq_len(n), times = num_trees)) + if (isTRUE(oob)) { + preds <- stats::na.omit(preds) } + preds <- merge(preds, params$forest[, .(tree, leaf, f_idx)], + by = c('tree', 'leaf'), sort = FALSE) + setnames(x, params$meta$variable) + } else { + arf <- NULL + setnames(x, colnames_x) } - factor_cols <- sapply(x, 
is.factor) - pred <- stats::predict(arf, x, type = 'terminalNodes')$predictions + 1L - # Optional batch index + # PMF over leaves + if (is.null(evidence)) { + num_trees <- params$forest[, max(tree)] + omega <- params$forest[, .(f_idx, cvg)] + omega[, wt := cvg / num_trees] + omega[, cvg := NULL] + } else if (isTRUE(conj)) { + omega <- leaf_posterior(params, evidence) + } else { + omega <- evidence + } + omega <- omega[wt > 0] + leaves <- omega[, f_idx] + + # Optional batching if (is.null(batch)) { batch <- n } @@ -115,84 +167,159 @@ lik <- function( if (k < 1) { k <- 1L } - batch_idx <- suppressWarnings(split(1:n, seq_len(k))) + batch_idx <- suppressWarnings(split(seq_len(n), seq_len(k))) - # Likelihood function - num_trees <- arf$num.trees - fams <- params$meta$family - lik_fn <- function(fold) { - params_x_cnt <- params_x_cat <- NULL - # Predictions - preds <- rbindlist(lapply(1:num_trees, function(b) { - data.table(tree = b, leaf = pred[batch_idx[[fold]], b], obs = batch_idx[[fold]]) - })) - if (isTRUE(oob)) { - preds <- preds[!is.na(leaf)] + # Likelihood function + lik_fn <- function(fold, arf) { + + # Prep work + psi_cnt <- psi_cat <- NULL + pure <- all(factor_cols) | all(!factor_cols) + if (is.null(arf) & !isTRUE(pure)) { + omega_tmp <- rbindlist(lapply(batch_idx[[fold]], function(i) { + omega$obs <- i + omega$wt <- NULL + return(omega) + })) } - preds <- merge(preds, params$forest, by = c('tree', 'leaf'), sort = FALSE) - preds[, leaf := NULL] + # Continuous data - if (!is.null(params$cnt)) { - fam <- params$meta[family != 'multinom', unique(family)] - x_long_cnt <- melt( + if (any(!factor_cols)) { + fam <- params$meta[class == 'numeric', unique(family)] + x_long <- melt( data.table(obs = batch_idx[[fold]], x[batch_idx[[fold]], !factor_cols, drop = FALSE]), id.vars = 'obs', variable.factor = FALSE ) - preds_x_cnt <- merge(preds, x_long_cnt, by = 'obs', sort = FALSE, - allow.cartesian = TRUE) - params_x_cnt <- merge(params$cnt, preds_x_cnt, - by = c('f_idx', 'variable'), sort = FALSE) + if (is.null(arf)) { + psi_cnt <- merge(params$cnt[f_idx %in% leaves], x_long, by = 'variable', + sort = FALSE, allow.cartesian = TRUE) + rm(x_long) + } else { + preds_cnt <- merge(preds[f_idx %in% leaves], x_long, by = 'obs', + sort = FALSE, allow.cartesian = TRUE) + rm(x_long) + psi_cnt <- merge(params$cnt[f_idx %in% leaves], preds_cnt, + by = c('f_idx', 'variable'), sort = FALSE) + rm(preds_cnt) + } if (fam == 'truncnorm') { - params_x_cnt[, lik := truncnorm::dtruncnorm(value, a = min, b = max, mean = mu, sd = sigma)] + psi_cnt[, lik := truncnorm::dtruncnorm(value, a = min, b = max, + mean = mu, sd = sigma)] } else if (fam == 'unif') { - params_x_cnt[, lik := stats::dunif(value, min = min, max = max)] + psi_cnt[, lik := stats::dunif(value, min = min, max = max)] + } + psi_cnt[value == min, lik := 0] + psi_cnt[, lik := prod(lik), by = .(f_idx, obs)] + psi_cnt <- unique(psi_cnt[lik > 0, .(f_idx, obs, lik)]) + if (is.null(arf) & !isTRUE(pure)) { + omega_tmp <- merge(omega_tmp, psi_cnt[, .(f_idx, obs)], + by = c('f_idx', 'obs'), sort = FALSE) + leaves <- omega_tmp[, unique(f_idx)] } - params_x_cnt <- params_x_cnt[, .(tree, obs, cvg, lik)] - rm(x_long_cnt, preds_x_cnt) } + # Categorical data - if ('multinom' %in% fams) { - x_long_cat <- melt( - data.table(obs = batch_idx[[fold]], - x[batch_idx[[fold]], factor_cols, drop = FALSE]), + if (any(factor_cols)) { + x_tmp <- x[batch_idx[[fold]], factor_cols, drop = FALSE] + n_tmp <- nrow(x_tmp) + x_long <- melt( + data.table(obs = batch_idx[[fold]], x_tmp), 
id.vars = 'obs', value.name = 'val', variable.factor = FALSE ) - preds_x_cat <- merge(preds, x_long_cat, by = 'obs', sort = FALSE, - allow.cartesian = TRUE) - params_x_cat <- merge(params$cat, preds_x_cat, - by = c('f_idx', 'variable', 'val'), - sort = FALSE, allow.cartesian = TRUE, - all.y = TRUE) - params_x_cat[is.na(prob), prob := 1e-20] - params_x_cat[, lik := prob] - params_x_cat <- params_x_cat[, .(tree, obs, cvg, lik)] - rm(x_long_cat, preds_x_cat) + # Speedups are possible if there are many duplicates + is_unique <- !duplicated(x_tmp) + if (all(is_unique)) { + x_unique <- x_long + colnames(x_unique)[1] <- 's_idx' + } else { + x_unique <- unique(x_tmp) + x_unique <- melt( + data.table(s_idx = seq_len(nrow(x_unique)), x_unique), + id.vars = 's_idx', value.name = 'val', variable.factor = FALSE + ) + s_idx <- integer(length = n_tmp) + s_idx[is_unique] <- seq_len(sum(is_unique)) + for (i in 2:n_tmp) { + if (s_idx[i] == 0L) { + s_idx[i] <- s_idx[i - 1L] + } + } + idx_dt <- data.table(obs = batch_idx[[fold]], s_idx = s_idx) + } + if (is.null(arf)) { + grd <- rbindlist(lapply(which(factor_cols), function(j) { + expand.grid('f_idx' = leaves, 'variable' = colnames(x)[j], + 'val' = x_long[variable == colnames(x)[j], unique(val)], + stringsAsFactors = FALSE) + })) + rm(x_long) + psi_cat <- merge(params$cat[f_idx %in% leaves], grd, + by = c('f_idx', 'variable', 'val'), + sort = FALSE, all.y = TRUE) + rm(grd) + psi_cat[is.na(prob), prob := 0] + psi_cat <- merge(psi_cat, x_unique, by = c('variable', 'val'), + sort = FALSE, allow.cartesian = TRUE) + psi_cat[, lik := prod(prob), by = .(f_idx, s_idx)] + psi_cat <- unique(psi_cat[lik > 0, .(f_idx, s_idx, lik)]) + if (all(is_unique)) { + setnames(psi_cat, 's_idx', 'obs') + } else { + if (!isTRUE(pure)) { + omega_tmp <- merge(idx_dt, omega_tmp, by = 'obs', sort = FALSE) + psi_cat <- merge(psi_cat, omega_tmp, by = c('f_idx', 's_idx'), + sort = FALSE)[, s_idx := NULL] + rm(omega_tmp) + setcolorder(psi_cat, c('f_idx', 'obs', 'lik')) + psi_cnt <- merge(psi_cnt, psi_cat[, .(f_idx, obs)], + by = c('f_idx', 'obs'), sort = FALSE) + } + } + } else { + preds_cat <- merge(preds[f_idx %in% leaves], x_long, by = 'obs', + sort = FALSE, allow.cartesian = TRUE) + rm(x_long) + psi_cat <- merge(params$cat, preds_cat, by = c('f_idx', 'variable', 'val'), + sort = FALSE, allow.cartesian = TRUE, all.y = TRUE) + rm(preds_cat) + psi_cat[is.na(prob), prob := 0] + psi_cat[, lik := prod(prob), by = .(f_idx, obs)] + psi_cat <- unique(psi_cat[lik > 0, .(f_idx, obs, lik)]) + } } - rm(preds) + # Put it together - params_x <- rbind(params_x_cnt, params_x_cat) - rm(params_x_cnt, params_x_cat) - # Compute per-sample likelihoods - lik <- unique(params_x[, sum(log(lik)) + log(cvg), by = .(obs, tree)]) - lik[is.na(V1), V1 := 0] - if (isTRUE(log)) { - out <- lik[, -log(.N) + matrixStats::logSumExp(V1), by = obs] - } else { - out <- lik[, mean(exp(V1)), by = obs] + psi_x <- rbind(psi_cnt, psi_cat) + if (!isTRUE(pure)) { + psi_x <- psi_x[, prod(lik), by = .(f_idx, obs)] + setnames(psi_x, 'V1', 'lik') } - return(out) + return(psi_x) } - if (k == 1L) { - ll <- lik_fn(1L) + if (isTRUE(parallel)) { + out <- foreach(fold = seq_len(k), .combine = rbind) %dopar% lik_fn(fold, arf) } else { - if (isTRUE(parallel)) { - ll <- foreach(fold = 1:k, .combine = rbind) %dopar% lik_fn(fold) - } else { - ll <- foreach(fold = 1:k, .combine = rbind) %do% lik_fn(fold) - } + out <- foreach(fold = seq_len(k), .combine = rbind) %do% lik_fn(fold, arf) + } + + # Compute per-sample likelihoods + out <- merge(out, 
omega, by = 'f_idx', sort = FALSE) + out <- out[, log(crossprod(wt, lik)), by = obs] + setnames(out, 'V1', 'lik') + + # Anybody missing? + zeros <- setdiff(seq_len(n), out[, obs]) + if (length(zeros) > 0L) { + zero_dt <- data.table(obs = zeros, lik = -Inf) + out <- rbind(out, zero_dt) + } + + # Export + if (!isTRUE(log)) { + out[, lik := exp(lik)] } - return(ll[order(obs), V1]) + return(out[order(obs), lik]) } diff --git a/R/utils.R b/R/utils.R index 71117a56..4666996b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -19,4 +19,314 @@ col_rename <- function(df, old_name) { } } return(new_name) -} \ No newline at end of file +} + +#' Preprocess input data +#' +#' This function prepares input data for ARFs. +#' +#' @param x Input data.frame. +#' + +prep_x <- function(x) { + # Reclass all non-numeric features as factors + x <- as.data.frame(x) + idx_char <- sapply(x, is.character) + if (any(idx_char)) { + x[, idx_char] <- lapply(x[, idx_char, drop = FALSE], as.factor) + } + idx_logical <- sapply(x, is.logical) + if (any(idx_logical)) { + x[, idx_logical] <- lapply(x[, idx_logical, drop = FALSE], as.factor) + } + idx_integer <- sapply(x, is.integer) + if (any(idx_integer)) { + # Recoding integers with > 5 levels as numeric + to_numeric <- sapply(seq_len(ncol(x)), function(j) { + idx_integer[j] & length(unique(x[[j]])) > 5 + }) + if (any(to_numeric)) { + warning('Recoding integers with more than 5 unique values as numeric. ', + 'To override this behavior, explicitly code these variables as factors.') + x[, to_numeric] <- lapply(x[, to_numeric, drop = FALSE], as.numeric) + } + to_factor <- sapply(seq_len(ncol(x)), function(j) { + idx_integer[j] & length(unique(x[[j]])) < 6 + }) + if (any(to_factor)) { + warning('Recoding integers with fewer than 6 unique values as ordered factors. ', + 'To override this behavior, explicitly code these variables as numeric.') + x[, to_factor] <- lapply(which(to_factor), function(j) { + lvls <- sort(unique(x[[j]])) + factor(x[[j]], levels = lvls, ordered = TRUE) + }) + } + } + # Rename annoying columns + if ('y' %in% colnames(x)) { + colnames(x)[which(colnames(x) == 'y')] <- col_rename(x, 'y') + } + if ('obs' %in% colnames(x)) { + colnames(x)[which(colnames(x) == 'obs')] <- col_rename(x, 'obs') + } + if ('tree' %in% colnames(x)) { + colnames(x)[which(colnames(x) == 'tree')] <- col_rename(x, 'tree') + } + if ('leaf' %in% colnames(x)) { + colnames(x)[which(colnames(x) == 'leaf')] <- col_rename(x, 'leaf') + } + if ('cnt' %in% colnames(x)) { + colnames(x)[which(colnames(x) == 'cnt')] <- col_rename(x, 'cnt') + } + if ('N' %in% colnames(x)) { + colnames(x)[which(colnames(x) == 'N')] <- col_rename(x, 'N') + } + return(x) +} + +#' Preprocess evidence +#' +#' This function prepares the evidence for computing leaf posteriors. +#' +#' @param params Circuit parameters learned via \code{\link{forde}}. +#' @param evidence Optional set of conditioning events. 
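[Editor's note: a quick illustration of the `prep_x` recoding rules warned about above; `prep_x` is internal (not exported), hence the `:::`. Illustrative only.]

```r
x <- data.frame(few = rep(1:3, 4), many = 1:12, y = rnorm(12))
str(suppressWarnings(arf:::prep_x(x)))
# few  -> ordered factor (3 unique values, i.e. fewer than 6)
# many -> numeric (12 unique values, i.e. more than 5)
# y    -> renamed via col_rename() to dodge the internal 'y' column
```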
+#' +#' @import data.table +#' + +prep_evi <- function(params, evidence) { + + # To avoid data.table check issues + variable <- relation <- N <- n <- family <- wt <- NULL + + # Prep + setDT(evidence) + part <- all(colnames(evidence) %in% params$meta$variable) + conj <- all(c('variable', 'relation', 'value') %in% colnames(evidence)) + post <- all(c('f_idx', 'wt') %in% colnames(evidence)) + if (part + conj + post != 1L) { + stop('evidence must either be a partial sample, a data frame of conjuncts, ', + 'or a posterior distribution over leaves.') + } + if (isTRUE(part)) { + if (!all(colnames(evidence) %in% params$meta$variable)) { + err <- setdiff(colnames(evidence), params$meta$variable) + stop('Unrecognized feature(s) among colnames: ', err) + } + evidence <- suppressWarnings( + melt(evidence, measure.vars = colnames(evidence), variable.factor = FALSE) + ) + evidence[, relation := '=='] + conj <- TRUE + } + if (isTRUE(conj)) { + evi <- merge(params$meta, evidence, by = 'variable', sort = FALSE) + evi[, n := .N, by = variable] + if (any(evi[n > 1L, relation == '=='])) { + culprit <- evi[n > 1L & relation == '==', variable] + stop(paste('Inconsistent conditioning events for the following variable(s):', + culprit)) + } + if (any(evi[, family == 'multinom'])) { + evi_tmp <- evi[family == 'multinom'] + if (any(evi_tmp[, !relation %in% c('==', '!=')])) { + stop('With categorical features, the only valid relations are ', + '"==" or "!=".') + } + } + if (any(evi[, family != 'multinom'])) { + evi_tmp <- evi[family != 'multinom'] + if (any(evi_tmp[, relation == '!='])) { + evidence <- evidence[!(family != 'multinom' & relation == '!=')] + warning('With continuous features, "!=" is not a valid relation. ', + 'This constraint has been removed.') + } + #if (any(evi_tmp[, n > 2L])) { + # inf <- blah + # sup <- blah + #} + } + } + if (isTRUE(post)) { + if (evidence[, sum(wt)] != 1) { + evidence[, wt := wt / sum(wt)] + warning('Posterior weights have been normalized to sum to unity.') + } + } + ### ALSO: Reduce redundant events to most informative condition + ### and check for inconsistencies, e.g. >3 & <2 + + + return(evidence) +} + + +#' Compute leaf posterior +#' +#' This function returns a posterior distribution on leaves, conditional on some +#' evidence. +#' +#' @param params Circuit parameters learned via \code{\link{forde}}. +#' @param evidence Data frame of conditioning event(s). +#' +#' @import data.table +#' @importFrom truncnorm dtruncnorm ptruncnorm +#' + +leaf_posterior <- function(params, evidence) { + + # To avoid data.table check issues + variable <- relation <- value <- prob <- f_idx <- cvg <- wt <- + mu <- sigma <- val <- k <- family <- n <- compl <- lik2 <- . 
+
+
+#' Compute leaf posterior
+#'
+#' This function returns a posterior distribution on leaves, conditional on some
+#' evidence.
+#'
+#' @param params Circuit parameters learned via \code{\link{forde}}.
+#' @param evidence Data frame of conditioning event(s).
+#'
+#' @import data.table
+#' @importFrom truncnorm dtruncnorm ptruncnorm
+#'
+
+leaf_posterior <- function(params, evidence) {
+
+  # To avoid data.table check issues
+  variable <- relation <- value <- prob <- f_idx <- cvg <- wt <-
+    mu <- sigma <- val <- k <- family <- n <- . <- NULL
+
+  # Likelihood per leaf-event combo
+  psi_cnt <- psi_cat <- NULL
+  evidence <- merge(evidence, params$meta, by = 'variable', sort = FALSE)
+
+  # Continuous features
+  if (any(evidence$family != 'multinom')) {
+    evi <- evidence[family != 'multinom']
+    evi[, n := .N, by = variable]
+    psi <- merge(evi, params$cnt, by = 'variable')
+    if (any(evi$relation == '==')) {
+      psi[relation == '==', lik :=
+            truncnorm::dtruncnorm(value, a = min, b = max, mean = mu, sd = sigma)]
+    }
+    if (any(evi$relation != '==')) {
+      psi[relation != '==', lik :=
+            truncnorm::ptruncnorm(value, a = min, b = max, mean = mu, sd = sigma)]
+    }
+    if (any(evi[, n > 1L])) {
+      # Two constraints on one variable define an interval: F(upper) - F(lower)
+      interval_vars <- evi[n > 1L, variable]
+      psi1 <- psi[!variable %in% interval_vars]
+      psi2 <- psi[variable %in% interval_vars]
+      psi2[, lik := rep(psi2[relation %in% c('<', '<='), lik] -
+                          psi2[relation %in% c('>', '>='), lik], 2)]
+      psi <- rbind(psi1, psi2)
+    }
+    if (any(evi[, n == 1 & relation %in% c('>', '>=')])) {
+      # One-sided lower bounds use the survival function: 1 - F(value)
+      psi[n == 1 & relation %in% c('>', '>='), lik := 1 - lik]
+    }
+    psi[value == min, lik := 0]
+    psi_cnt <- unique(psi[, .(f_idx, variable, lik)])
+  }
+
+  # Categorical features
+  psi_eq <- psi_ineq <- NULL
+  if (any(evidence$family == 'multinom')) {
+    evi <- evidence[family == 'multinom']
+    evi[, value := as.character(value)]
+    grd <- rbindlist(lapply(evi[, variable], function(j) {
+      expand.grid('f_idx' = params$forest$f_idx, 'variable' = j,
+                  'val' = params$cat[variable == j, unique(val)],
+                  stringsAsFactors = FALSE)
+    }))
+    psi <- merge(params$cat, grd, by = c('f_idx', 'variable', 'val'),
+                 sort = FALSE, all.y = TRUE)
+    psi[is.na(prob), prob := 0]
+    setnames(psi, 'prob', 'lik')
+    if (any(evi[, relation == '=='])) {
+      evi_tmp <- evi[relation == '==', .(variable, value)]
+      setnames(evi_tmp, 'value', 'val')
+      psi_eq <- merge(psi, evi_tmp, by = c('variable', 'val'), sort = FALSE)
+      psi_eq <- psi_eq[, .(f_idx, variable, lik)]
+    }
+    if (any(evi[, relation == '!='])) {
+      evi_tmp <- evi[relation == '!=', .(variable, value)]
+      psi_ineq <- rbindlist(lapply(evi_tmp[, .I], function(k) {
+        psi[variable == evi_tmp$variable[k] & val != evi_tmp$value[k]]
+      }))
+      psi_ineq[, lik := sum(lik), by = .(f_idx, variable)]
+      psi_ineq <- unique(psi_ineq[, .(f_idx, variable, lik)])
+    }
+    psi_cat <- rbind(psi_eq, psi_ineq)
+  }
+  psi <- rbind(psi_cnt, psi_cat)
+
+  # Weight is proportional to coverage times the product of likelihoods,
+  # computed as a geometric mean in log space for numerical stability; the
+  # power transform below restores the product scale
+  psi <- merge(psi, params$forest[, .(f_idx, cvg)], by = 'f_idx', sort = FALSE)
+  psi[, wt := {
+    if (any(lik == 0)) {
+      0
+    } else {
+      exp(mean(log(c(cvg[1], lik))))
+    }
+  }, by = f_idx]
+
+  # Normalize, export
+  out <- unique(psi[wt > 0, .(f_idx, wt)])
+  if (nrow(out) == 0) {
+    # If all leaves have zero weight, fall back to a uniform distribution
+    warning('All leaves have zero likelihood. This is probably because evidence ',
+            'contains an (almost) impossible combination. For categorical data, ',
+            'consider setting alpha > 0 in forde().')
+    out <- unique(psi[, .(f_idx)])
+    out[, wt := 1]
+  }
+  out[, wt := (wt / max(wt, na.rm = TRUE))^(nrow(evidence) + 1)][wt > 0, wt := wt / sum(wt)]
+  return(out[])
+}
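A standalone arithmetic check of the log-space weighting used above (not package code): the geometric mean of coverage and likelihoods, raised to the number of factors, recovers their product, so relative leaf weights are preserved without underflow.

```r
cvg <- 0.02                 # leaf coverage
lik <- c(0.8, 0.1, 0.5)     # per-event likelihoods in that leaf
direct <- cvg * prod(lik)   # naive product; can underflow with many events
stable <- exp(mean(log(c(cvg, lik))))        # geometric mean in log space
all.equal(stable^(length(lik) + 1), direct)  # TRUE
```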
+
+
+#' Post-process data
+#'
+#' This function prepares output data for forge.
+#'
+#' @param x Input data.frame.
+#' @param params Circuit parameters learned via \code{\link{forde}}.
+#'
+#' @import data.table
+#' @importFrom tibble as_tibble
+#'
+
+post_x <- function(x, params) {
+
+  # To avoid data.table check issues
+  variable <- val <- NULL
+
+  # Order, classify features
+  meta_tmp <- params$meta[variable %in% colnames(x)]
+  setcolorder(x, match(meta_tmp$variable, colnames(x)))
+  setDF(x)
+  idx_numeric <- meta_tmp[, which(class == 'numeric')]
+  idx_factor <- meta_tmp[, which(class == 'factor')]
+  idx_ordered <- meta_tmp[, which(grepl('ordered', class))]
+  idx_logical <- meta_tmp[, which(class == 'logical')]
+  idx_integer <- meta_tmp[, which(class == 'integer')]
+
+  # Recode
+  if (length(idx_numeric) > 0L) {
+    x[, idx_numeric] <- lapply(idx_numeric, function(j) {
+      round(as.numeric(x[[j]]), meta_tmp$decimals[j])
+    })
+  }
+  if (length(idx_factor) > 0L) {
+    x[, idx_factor] <- lapply(idx_factor, function(j) {
+      factor(x[[j]], levels = params$levels[variable == colnames(x)[j], val])
+    })
+  }
+  if (length(idx_ordered) > 0L) {
+    x[, idx_ordered] <- lapply(idx_ordered, function(j) {
+      factor(x[[j]], levels = params$levels[variable == colnames(x)[j], val], ordered = TRUE)
+    })
+  }
+  if (length(idx_logical) > 0L) {
+    x[, idx_logical] <- lapply(x[, idx_logical, drop = FALSE], as.logical)
+  }
+  if (length(idx_integer) > 0L) {
+    x[, idx_integer] <- lapply(idx_integer, function(j) {
+      as.integer(as.character(x[[j]]))
+    })
+  }
+
+  # Export
+  if ('data.table' %in% params$input_class) {
+    setDT(x)
+  } else if ('tbl_df' %in% params$input_class) {
+    x <- tibble::as_tibble(x)
+  } else if ('matrix' %in% params$input_class) {
+    x <- as.matrix(x)
+  }
+  return(x)
+}
+
+
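Downstream, `post_x` is what makes `forge` round-trip the input class; a quick sketch via the public API (tibble in, tibble out):

```r
library(tibble)
arf_mod <- adversarial_rf(as_tibble(iris))
psi <- forde(arf_mod, as_tibble(iris))
class(forge(psi, n_synth = 5))  # "tbl_df" "tbl" "data.frame"
```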
diff --git a/README.md b/README.md
index b7d2dba4..c3a27ede 100644
--- a/README.md
+++ b/README.md
@@ -41,9 +41,16 @@ To generate 100 synthetic samples:
 forge(psi, 100)
 ```
 
+### Conditional expectations
+To estimate the mean of some variable(s), optionally conditioned on some event(s):
+```R
+evi <- data.frame(Species = "setosa")
+expct(psi, query = "Sepal.Length", evidence = evi)
+```
+
 For more detailed examples, see the package [vignette](https://bips-hb.github.io/arf/articles/vignette.html).
 
-## Other distributions
+## Python library
 A Python implementation of ARF, `arfpy`, is available on [PyPI](https://pypi.org/project/arfpy/). For the development version, see [here](https://github.com/bips-hb/arfpy).
 
 ## References
diff --git a/man/adversarial_rf.Rd b/man/adversarial_rf.Rd
index 8ca89ec2..5e210894 100644
--- a/man/adversarial_rf.Rd
+++ b/man/adversarial_rf.Rd
@@ -11,6 +11,7 @@ adversarial_rf(
   delta = 0,
   max_iters = 10L,
   early_stop = TRUE,
+  prune = TRUE,
   verbose = TRUE,
   parallel = TRUE,
   ...
@@ -34,6 +35,8 @@ likelihood estimation. See Details.}
 \item{early_stop}{Terminate loop if performance fails to improve from one
 round to the next?}
 
+\item{prune}{Impose \code{min_node_size} by pruning?}
+
 \item{verbose}{Print discriminator accuracy after each round?}
 
 \item{parallel}{Compute in parallel? Must register backend beforehand, e.g.
@@ -53,10 +56,10 @@ factorized leaves where features are jointly independent. ARFs are trained
 iteratively, with alternating rounds of generation and discrimination. In
 the first instance, synthetic data is generated via independent bootstraps of
 each feature, and a RF classifier is trained to distinguish between real and
-synthetic samples. In subsequent rounds, synthetic data is generated
-separately in each leaf, using splits from the previous forest. This creates
-increasingly realistic data that satisfies local independence by construction.
-The algorithm converges when a RF cannot reliably distinguish between the two
+fake samples. In subsequent rounds, synthetic data is generated separately in
+each leaf, using splits from the previous forest. This creates increasingly
+realistic data that satisfies local independence by construction. The
+algorithm converges when a RF cannot reliably distinguish between the two
 classes, i.e. when OOB accuracy falls below 0.5 + \code{delta}.
 
 ARFs are useful for several unsupervised learning tasks, such as density
@@ -65,13 +68,12 @@ estimation (see \code{\link{forde}}) and data synthesis (see
 trees for improved performance (typically on the order of 100-1000 depending
 on sample size).
 
-Integer variables are treated as ordered factors by default. If the ARF is
-passed to \code{forde}, the estimated distribution for these variables will
-only have support on observed factor levels (i.e., the output will be a pmf,
-not a pdf). To override this behavior and assign nonzero density to
-intermediate values, explicitly recode the features as numeric.
+Integer variables are recoded with a warning. Default behavior is to convert
+those with six or more unique values to numeric, while those with up to five
+unique values are treated as ordered factors. To override this behavior,
+explicitly recode integer variables to the target type prior to training.
 
-Note: convergence is not guaranteed in finite samples. The \code{max_iter}
+Note: convergence is not guaranteed in finite samples. The \code{max_iters}
 argument sets an upper bound on the number of training rounds. Similar
 results may be attained by increasing \code{delta}. Even a single round can
 often give good performance, but data with strong or complex dependencies may
diff --git a/man/expct.Rd b/man/expct.Rd
new file mode 100644
index 00000000..c618499d
--- /dev/null
+++ b/man/expct.Rd
@@ -0,0 +1,57 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/expct.R
+\name{expct}
+\alias{expct}
+\title{Expected Value}
+\usage{
+expct(params, query = NULL, evidence = NULL)
+}
+\arguments{
+\item{params}{Circuit parameters learned via \code{\link{forde}}.}
+
+\item{query}{Optional character vector of variable names. Estimates will be
+computed for each. If \code{NULL}, all variables other than those in
+\code{evidence} will be estimated.}
+
+\item{evidence}{Optional set of conditioning events. This can take one of
+three forms: (1) a partial sample, i.e. a single row of data with some but
+not all columns; (2) a data frame of conditioning events, which allows for
+inequalities; or (3) a posterior distribution over leaves. See Details.}
+}
+\value{
+A one-row data frame with values for all query variables.
+}
+\description{
+Compute the expectation of some query variable(s), optionally conditioned
+on some event(s).
+}
+\details{
+This function computes expected values for any subset of features, optionally
+conditioned on some event(s).
+}
+\examples{
+# Train ARF and corresponding circuit
+arf <- adversarial_rf(iris)
+psi <- forde(arf, iris)
+
+# What is the expected value of Sepal.Length?
+expct(psi, query = "Sepal.Length")
+
+# What if we condition on Species = "setosa"?
+evi <- data.frame(Species = "setosa")
+expct(psi, query = "Sepal.Length", evidence = evi)
+
+# Compute expectations for all features other than Species
+expct(psi, evidence = evi)
+
+}
+\references{
+Watson, D., Blesch, K., Kapar, J., & Wright, M. (2023). Adversarial random
+forests for density estimation and generative modeling. In \emph{Proceedings
+of the 26th International Conference on Artificial Intelligence and
+Statistics}, pp. 5357-5375.
+}
+\seealso{
+\code{\link{adversarial_rf}}, \code{\link{forde}}, \code{\link{lik}}
+}
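A hedged illustration of the integer-recoding note documented above (hypothetical `n_petals` column; names like `cnt` or `N` would be renamed by `prep_x`):

```r
x <- iris
x$n_petals <- sample(1L:3L, nrow(x), replace = TRUE)
arf1 <- adversarial_rf(x)             # warns: n_petals becomes an ordered factor
x$n_petals <- as.numeric(x$n_petals)
arf2 <- adversarial_rf(x)             # override: modeled as continuous instead
```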
diff --git a/man/forde.Rd b/man/forde.Rd
index f0d890b9..d481452e 100644
--- a/man/forde.Rd
+++ b/man/forde.Rd
@@ -9,6 +9,7 @@ forde(
   x,
   oob = FALSE,
   family = "truncnorm",
+  finite_bounds = FALSE,
   alpha = 0,
   epsilon = 0,
   parallel = TRUE
@@ -28,15 +29,18 @@ features. Current options include truncated normal (the default
 \code{family = "truncnorm"}) and uniform (\code{family = "unif"}). See
 Details.}
 
+\item{finite_bounds}{Impose finite bounds on all continuous variables?}
+
 \item{alpha}{Optional pseudocount for Laplace smoothing of categorical
 features. This avoids zero-mass points when test data fall outside the
 support of training data. Effectively parametrizes a flat Dirichlet prior on
 multinomial likelihoods.}
 
 \item{epsilon}{Optional slack parameter on empirical bounds when
-\code{family = "unif"}. This avoids zero-density points when test data fall
-outside the support of training data. The gap between lower and upper
-bounds is expanded by a factor of \code{1 + epsilon}.}
+\code{family = "unif"} or \code{finite_bounds = TRUE}. This avoids
+zero-density points when test data fall outside the support of training
+data. The gap between lower and upper bounds is expanded by a factor of
+\code{1 + epsilon}.}
 
 \item{parallel}{Compute in parallel? Must register backend beforehand, e.g.
 via \code{doParallel}.}
@@ -55,8 +59,8 @@ Uses a pre-trained ARF model to estimate leaf and distribution parameters.
 distribution parameters for data within each leaf. The former includes
 coverage (proportion of data falling into the leaf) and split criteria. The
 latter includes proportions for categorical features and mean/variance for
-continuous features. These values are stored in a \code{data.table}, which
-can be used as input to various other functions.
+continuous features. The result is a probabilistic circuit, stored as a
+\code{data.table}, which can be used for various downstream inference tasks.
 
 Currently, \code{forde} only provides support for a limited number of
 distributional families: truncated normal or uniform for continuous data,
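A short sketch of the new `finite_bounds` argument together with the extended `epsilon` semantics documented above (values are illustrative):

```r
arf_mod <- adversarial_rf(iris)
# Finite bounds on all continuous variables, with 5% slack on each empirical range
psi <- forde(arf_mod, iris, finite_bounds = TRUE, epsilon = 0.05)
# Laplace smoothing for categorical features
psi_smooth <- forde(arf_mod, iris, alpha = 0.01)
```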
diff --git a/man/forge.Rd b/man/forge.Rd
index 58fe2f45..5b1a5df1 100644
--- a/man/forge.Rd
+++ b/man/forge.Rd
@@ -4,15 +4,18 @@
 \alias{forge}
 \title{Forests for Generative Modeling}
 \usage{
-forge(params, n_synth, parallel = TRUE)
+forge(params, n_synth, evidence = NULL)
 }
 \arguments{
-\item{params}{Parameters learned via \code{\link{forde}}.}
+\item{params}{Circuit parameters learned via \code{\link{forde}}.}
 
 \item{n_synth}{Number of synthetic samples to generate.}
 
-\item{parallel}{Compute in parallel? Must register backend beforehand, e.g.
-via \code{doParallel}.}
+\item{evidence}{Optional set of conditioning events. This can take one of
+three forms: (1) a partial sample, i.e. a single row of data with
+some but not all columns; (2) a data frame of conditioning events,
+which allows for inequalities; or (3) a posterior distribution over leaves.
+See Details.}
 }
 \value{
 A dataset of \code{n_synth} synthetic samples.
@@ -22,17 +25,43 @@ Uses pre-trained FORDE model to simulate synthetic data.
 }
 \details{
 \code{forge} simulates a synthetic dataset of \code{n_synth} samples. First,
-leaves are sampled in proportion to their coverage. Then, each feature is
+leaves are sampled in proportion to either their coverage (if
+\code{evidence = NULL}) or their posterior probability. Then, each feature is
 sampled independently within each leaf according to the probability mass or
 density function learned by \code{\link{forde}}. This will create realistic
 data so long as the adversarial RF used in the previous step satisfies the
-local independence criterion. See Watson et al. (2022).
+local independence criterion. See Watson et al. (2023).
+
+There are three methods for (optionally) encoding conditioning events via the
+\code{evidence} argument. The first is to provide a partial sample, where
+some but not all columns from the training data are present. The second is to
+provide a data frame with three columns: \code{variable}, \code{relation},
+and \code{value}. This supports inequalities via \code{relation}.
+Alternatively, users may directly input a pre-calculated posterior
+distribution over leaves, with columns \code{f_idx} and \code{wt}. This may
+be preferable for complex constraints. See Examples.
 }
 \examples{
 arf <- adversarial_rf(iris)
 psi <- forde(arf, iris)
 x_synth <- forge(psi, n_synth = 100)
 
+# Condition on Species = "setosa"
+evi <- data.frame(Species = "setosa")
+x_synth <- forge(psi, n_synth = 100, evidence = evi)
+
+# Condition on Species = "setosa" and Sepal.Length > 6
+evi <- data.frame(variable = c("Species", "Sepal.Length"),
+                  relation = c("==", ">"),
+                  value = c("setosa", 6))
+x_synth <- forge(psi, n_synth = 100, evidence = evi)
+
+# Or just input some distribution on leaves
+# (Weights that do not sum to unity are automatically scaled)
+n_leaves <- nrow(psi$forest)
+evi <- data.frame(f_idx = psi$forest$f_idx, wt = rexp(n_leaves))
+x_synth <- forge(psi, n_synth = 100, evidence = evi)
+
 }
 \references{
diff --git a/man/leaf_posterior.Rd b/man/leaf_posterior.Rd
new file mode 100644
index 00000000..0212f7cb
--- /dev/null
+++ b/man/leaf_posterior.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{leaf_posterior}
+\alias{leaf_posterior}
+\title{Compute leaf posterior}
+\usage{
+leaf_posterior(params, evidence)
+}
+\arguments{
+\item{params}{Circuit parameters learned via \code{\link{forde}}.}
+
+\item{evidence}{Data frame of conditioning event(s).}
+}
+\description{
+This function returns a posterior distribution on leaves, conditional on some
+evidence.
+}
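Since `leaf_posterior` is internal, a sketch of how it can be exercised directly (via `:::`, on evidence prepared by `prep_evi`):

```r
arf_mod <- adversarial_rf(iris)
psi <- forde(arf_mod, iris)
evi <- arf:::prep_evi(psi, data.frame(Species = 'setosa'))
post <- arf:::leaf_posterior(psi, evi)
head(post)  # columns f_idx (leaf index) and wt (posterior weight)
```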
diff --git a/man/lik.Rd b/man/lik.Rd
index 7be4a07c..08b82e9b 100644
--- a/man/lik.Rd
+++ b/man/lik.Rd
@@ -4,23 +4,41 @@
 \alias{lik}
 \title{Likelihood Estimation}
 \usage{
-lik(arf, params, x, oob = FALSE, log = TRUE, batch = NULL, parallel = TRUE)
+lik(
+  params,
+  query,
+  evidence = NULL,
+  arf = NULL,
+  oob = FALSE,
+  log = TRUE,
+  batch = NULL,
+  parallel = TRUE
+)
 }
 \arguments{
-\item{arf}{Pre-trained \code{\link{adversarial_rf}}. Alternatively, any
-object of class \code{ranger}.}
+\item{params}{Circuit parameters learned via \code{\link{forde}}.}
 
-\item{params}{Parameters learned via \code{\link{forde}}.}
+\item{query}{Data frame of samples, optionally comprising just a subset of
+training features. Likelihoods will be computed for each sample. Missing
+features will be marginalized out. See Details.}
 
-\item{x}{Input data. Densities will be computed for each sample.}
+\item{evidence}{Optional set of conditioning events. This can take one of
+three forms: (1) a partial sample, i.e. a single row of data with some but
+not all columns; (2) a data frame of conditioning events, which allows for
+inequalities; or (3) a posterior distribution over leaves. See Details.}
+
+\item{arf}{Pre-trained \code{\link{adversarial_rf}} or other object of class
+\code{ranger}. This is not required but speeds up computation considerably
+for total evidence queries. (Ignored for partial evidence queries.)}
 
 \item{oob}{Only use out-of-bag leaves for likelihood estimation? If
-\code{TRUE}, \code{x} must be the same dataset used to train \code{arf}.}
+\code{TRUE}, \code{query} must be the same dataset used to train \code{arf}.
+Only applicable for total evidence queries.}
 
 \item{log}{Return likelihoods on log scale? Recommended to prevent
 underflow.}
 
 \item{batch}{Batch size. The default is to compute densities for all of
-\code{x} in one round, which is always the fastest option if memory allows.
+the queries in one round, which is always the fastest option if memory allows.
 However, with large samples or many trees, it can be more memory efficient
 to split the data into batches. This has no impact on results.}
 
@@ -31,22 +49,46 @@ via \code{doParallel}.}
 A vector of likelihoods, optionally on the log scale.
 }
 \description{
-Compute the density of input data.
+Compute the likelihood of input data, optionally conditioned on some event(s).
 }
 \details{
-This function computes the density of input data according to a FORDE model
-using a pre-trained ARF. Each sample's likelihood is a weighted average of
-its likelihood in all leaves whose split criteria it satisfies. Intra-leaf
-densities are fully factorized, since ARFs satisfy the local independence
-criterion by construction. See Watson et al. (2022).
+This function computes the likelihood of input data, optionally conditioned
+on some event(s). Queries may be partial, i.e. covering some but not all
+features, in which case excluded variables will be marginalized out.
+
+There are three methods for (optionally) encoding conditioning events via the
+\code{evidence} argument. The first is to provide a partial sample, where
+some but not all columns from the training data are present. The second is to
+provide a data frame with three columns: \code{variable}, \code{relation},
+and \code{value}. This supports inequalities via \code{relation}.
+Alternatively, users may directly input a pre-calculated posterior
+distribution over leaves, with columns \code{f_idx} and \code{wt}. This may
+be preferable for complex constraints. See Examples.
} \examples{ # Estimate average log-likelihood arf <- adversarial_rf(iris) psi <- forde(arf, iris) -ll <- lik(arf, psi, iris, log = TRUE) +ll <- lik(psi, iris, arf = arf, log = TRUE) +mean(ll) + +# Identical but slower +ll <- lik(psi, iris, log = TRUE) mean(ll) +# Partial evidence query +lik(psi, query = iris[1, 1:3]) + +# Condition on Species = "setosa" +evi <- data.frame(Species = "setosa") +lik(psi, query = iris[1, 1:3], evidence = evi) + +# Condition on Species = "setosa" and Petal.Width > 0.3 +evi <- data.frame(variable = c("Species", "Petal.Width"), + relation = c("==", ">"), + value = c("setosa", 0.3)) +lik(psi, query = iris[1, 1:3], evidence = evi) + } \references{ diff --git a/man/post_x.Rd b/man/post_x.Rd new file mode 100644 index 00000000..32c072a9 --- /dev/null +++ b/man/post_x.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{post_x} +\alias{post_x} +\title{Post-process data} +\usage{ +post_x(x, params) +} +\arguments{ +\item{x}{Input data.frame.} + +\item{params}{Circuit parameters learned via \code{\link{forde}}.} +} +\description{ +This function prepares output data for forge. +} diff --git a/man/prep_evi.Rd b/man/prep_evi.Rd new file mode 100644 index 00000000..a9f77a99 --- /dev/null +++ b/man/prep_evi.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{prep_evi} +\alias{prep_evi} +\title{Preprocess evidence} +\usage{ +prep_evi(params, evidence) +} +\arguments{ +\item{params}{Circuit parameters learned via \code{\link{forde}}.} + +\item{evidence}{Optional set of conditioning events.} +} +\description{ +This function prepares the evidence for computing leaf posteriors. +} diff --git a/man/prep_x.Rd b/man/prep_x.Rd new file mode 100644 index 00000000..8a7f1936 --- /dev/null +++ b/man/prep_x.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{prep_x} +\alias{prep_x} +\title{Preprocess input data} +\usage{ +prep_x(x) +} +\arguments{ +\item{x}{Input data.frame.} +} +\description{ +This function prepares input data for ARFs. 
+} diff --git a/tests/testthat/test-arguments.R b/tests/testthat/test-arguments.R new file mode 100644 index 00000000..9108f6db --- /dev/null +++ b/tests/testthat/test-arguments.R @@ -0,0 +1,4 @@ +test_that("FORDE works with alpha>0", { + arf <- adversarial_rf(iris, parallel = FALSE) + expect_silent(forde(arf, iris, parallel = FALSE, alpha = 0.01)) +}) \ No newline at end of file diff --git a/tests/testthat/test-return_types.R b/tests/testthat/test-return_types.R index 72fe02b2..f27c7b4b 100644 --- a/tests/testthat/test-return_types.R +++ b/tests/testthat/test-return_types.R @@ -7,7 +7,7 @@ test_that("FORDE returns correct list object", { arf <- adversarial_rf(iris, num_trees = 2, verbose = FALSE, parallel = FALSE) psi <- forde(arf, iris, parallel = FALSE) expect_type(psi, "list") - expect_named(psi, c("cnt", "cat", "forest", "meta", "input_class")) + expect_named(psi, c("cnt", "cat", "forest", "meta", "levels", "input_class")) expect_s3_class(psi$cnt, "data.table") expect_s3_class(psi$cat, "data.table") expect_s3_class(psi$forest, "data.table") @@ -18,29 +18,32 @@ test_that("FORDE returns correct list object", { test_that("FORDE categories sum to unity", { arf <- adversarial_rf(iris, num_trees = 2, verbose = FALSE, parallel = FALSE) psi <- forde(arf, iris, parallel = FALSE) - expect_true(all(psi$cat[, sum(prob) == 1, by = f_idx]$V1 == 1)) + tmp <- psi$cat[, sum(prob), by = f_idx] + expect_equal(tmp$V1, rep(1, times = tmp[, .N])) }) test_that("Likelihood calculation returns vector of log-likelihoods", { arf <- adversarial_rf(iris, num_trees = 2, verbose = FALSE, parallel = FALSE) psi <- forde(arf, iris, parallel = FALSE) - loglik <- lik(arf, psi, iris, parallel = FALSE) + loglik <- lik(psi, iris, arf = arf, parallel = FALSE) + expect_warning(loglik2 <- lik(psi, iris, parallel = FALSE)) expect_type(loglik, "double") expect_length(loglik, nrow(iris)) expect_true(all(!is.na(loglik))) + expect_equal(loglik, loglik2) }) test_that("FORGE returns data frame when called with data frame", { arf <- adversarial_rf(iris, num_trees = 2, verbose = FALSE, parallel = FALSE) psi <- forde(arf, iris, parallel = FALSE) - x_synth <- forge(psi, n_synth = 20, parallel = FALSE) + x_synth <- forge(psi, n_synth = 20) expect_s3_class(x_synth, "data.frame") }) test_that("FORGE returns data table when called with data table", { arf <- adversarial_rf(data.table::as.data.table(iris), num_trees = 2, verbose = FALSE, parallel = FALSE) psi <- forde(arf, data.table::as.data.table(iris), parallel = FALSE) - x_synth <- forge(psi, n_synth = 20, parallel = FALSE) + x_synth <- forge(psi, n_synth = 20) expect_s3_class(x_synth, "data.table") }) @@ -51,7 +54,7 @@ test_that("FORGE returns matrix when called with matrix", { arf <- adversarial_rf(x, num_trees = 2, verbose = FALSE, parallel = FALSE) psi <- forde(arf, x, parallel = FALSE) - x_synth <- forge(psi, n_synth = 20, parallel = FALSE) + x_synth <- forge(psi, n_synth = 20) expect_type(x_synth, "double") expect_true(is.matrix(x_synth)) }) @@ -65,14 +68,38 @@ test_that("FORGE returns same column types", { logical = (sample(0:1, n, replace = TRUE) == 1)) expect_warning(arf <- adversarial_rf(dat, num_trees = 2, verbose = FALSE, parallel = FALSE)) - expect_warning(psi <- forde(arf, dat, parallel = FALSE)) - x_synth <- forge(psi, n_synth = 20, parallel = FALSE) + psi <- forde(arf, dat, parallel = FALSE) + x_synth <- forge(psi, n_synth = 20) # No NAs expect_true(all(!is.na(x_synth))) # Keeps column types - types <- sapply(dat, typeof) - types_synth <- sapply(x_synth, typeof) - 
expect_equal(types, types_synth) + classes <- sapply(dat, class) + classes_synth <- sapply(x_synth, class) + expect_equal(classes, classes_synth) }) + +# test_that("MAP returns proper column types", { +# n <- 50 +# dat <- data.frame(numeric = rnorm(n), +# integer = sample(1L:5L, n, replace = TRUE), +# character = sample(letters[1:5], n, replace = TRUE), +# factor = factor(sample(letters[1:5], n, replace = TRUE)), +# logical = (sample(0:1, n, replace = TRUE) == 1)) +# +# expect_warning(arf <- adversarial_rf(dat, num_trees = 2, verbose = FALSE, parallel = FALSE)) +# psi <- forde(arf, dat, parallel = FALSE) +# map_out <- map(psi) +# +# # No NAs +# expect_true(all(!is.na(map_out))) +# +# # Keep column types +# types <- sapply(dat, typeof) +# types_map <- sapply(map_out, typeof) +# expect_equal(types, types_map) +# }) + + + diff --git a/vignettes/vignette.Rmd b/vignettes/vignette.Rmd index 62254a97..2dff6eef 100644 --- a/vignettes/vignette.Rmd +++ b/vignettes/vignette.Rmd @@ -30,18 +30,19 @@ library(ggplot2) set.seed(123, "L'Ecuyer-CMRG") # Train ARF -arf <- adversarial_rf(iris) +arf_iris <- adversarial_rf(iris) ``` The printouts can be turned off by setting `verbose = FALSE`. Accuracy is still stored within the `arf` object, so you can evaluate convergence after the fact. The warning appears just once per session. It can be suppressed by setting `parallel = FALSE` or registering a parallel backend (more on this below). ```{r arf2, fig.height=5, fig.width=5} # Train ARF with no printouts -arf <- adversarial_rf(iris, verbose = FALSE) +arf_iris <- adversarial_rf(iris, verbose = FALSE) # Plot accuracy against iterations (model converges when accuracy <= 0.5) -tmp <- data.frame('acc' = arf$acc, 'iter' = seq_len(length(arf$acc))) -ggplot(tmp, aes(iter, acc)) + +tmp <- data.frame('Accuracy' = arf_iris$acc, + 'Iteration' = seq_len(length(arf_iris$acc))) +ggplot(tmp, aes(Iteration, Accuracy)) + geom_point() + geom_path() + geom_hline(yintercept = 0.5, linetype = 'dashed', color = 'red') @@ -49,7 +50,7 @@ ggplot(tmp, aes(iter, acc)) + We find a quick drop in accuracy following the resampling procedure, as desired. If the ARF has converged, then resulting splits should form fully factorized leaves, i.e. subregions of the feature space where variables are locally independent. -ARF convergence is almost surely guaranteed as $n \rightarrow \infty$ (see [Watson et al., 2022](https://arxiv.org/abs/2205.09435), Thm. 1). However, this has no implications for finite sample performance. In practice, we often find that adversarial training completes in just one or two rounds, but this may not hold for some datasets. To avoid infinite loops, users can increase the slack parameter `delta` or set the `max_iters` argument (default = 10). In addition to these failsafes, `adversarial_rf` uses early stopping by default `(early_stop = TRUE)`, which terminates training if factorization does not improve from one round to the next. This is recommended, since discriminator accuracy rarely falls much lower once it has increased. +ARF convergence is asymptotically guaranteed as $n \rightarrow \infty$ (see [Watson et al., 2023](https://proceedings.mlr.press/v206/watson23a.html), Thm. 1). However, this has no implications for finite sample performance. In practice, we often find that adversarial training completes in just one or two rounds, but this may not hold for some datasets. To avoid infinite loops, users can increase the slack parameter `delta` or set the `max_iters` argument (default = 10). 
In addition to these failsafes, `adversarial_rf` uses early stopping by default (`early_stop = TRUE`), which terminates training if factorization does not improve from one round to the next. This is recommended, since discriminator accuracy rarely falls much lower once it has increased.
 
 For density estimation tasks, we recommend increasing the default number of trees. We generally use 100 in our experiments, though this may be suboptimal for some datasets. Likelihood estimates are not very sensitive to this parameter above a certain threshold, but larger models incur extra costs in time and memory. 
 
 We can speed up computations by registering a parallel backend, in which case ARF training is distributed across cores using the `ranger` package. Much like with `ranger`, the default behavior of `adversarial_rf` is to compute in parallel if possible. How exactly this is done varies across operating systems. The following code works on Unix machines.
@@ -72,7 +73,7 @@ In either case, we can now execute in parallel.
 
 ```{r arf3}
 # Rerun ARF, now in parallel and with more trees
-arf <- adversarial_rf(iris, num_trees = 100)
+arf_iris <- adversarial_rf(iris, num_trees = 100)
 ```
 
 The result is an object of class `ranger`, which we can input to downstream functions.
@@ -83,28 +84,28 @@ The next step is to learn the leaf and distribution parameters using forests for
 
 ```{r forde}
 # Compute leaf and distribution parameters
-params <- forde(arf, iris)
+params_iris <- forde(arf_iris, iris)
 ```
 
 Default behavior is to use a truncated normal distribution for continuous data (with boundaries given by the tree's split parameters) and a multinomial distribution for categorical data. We find that this produces stable results in a wide range of settings. You can also use a uniform distribution for continuous features by setting `family = 'unif'`, thereby instantiating a piecewise constant density estimator.
 
 ```{r forde_unif}
 # Recompute with uniform density
-params_unif <- forde(arf, iris, family = 'unif')
+params_unif <- forde(arf_iris, iris, family = 'unif')
 ```
 
 This method tends to perform poorly in practice, and we do not recommend it. The option is implemented primarily for benchmarking purposes. Alternative families, e.g. truncated Poisson or beta distributions, may be useful for certain problems. Future releases will expand the range of options for the `family` argument.
 
-The `alpha` and `epsilon` arguments allow for optional regularization of multinomial and uniform distributions, respectively. These help prevent zero likelihood samples when test data fall outside the support of training data. 
The former is a pseudocount parameter that applies Laplace smoothing within leaves, preventing unobserved values from being assigned zero probability unless splits explicitly rule them out. In other words, we impose a flat Dirichlet prior $\text{Dir}(\alpha)$ and report posterior probabilities rather than maximum likelihood estimates. The latter is a slack parameter on empirical bounds that expands the estimated extrema for continuous features by a factor of $1 + \epsilon$.
+The `alpha` and `epsilon` arguments allow for optional regularization of multinomial and uniform distributions, respectively. These help prevent zero likelihood samples when test data fall outside the support of training data. The former is a pseudocount parameter that applies [Laplace smoothing](https://en.wikipedia.org/wiki/Additive_smoothing) within leaves, preventing unobserved values from being assigned zero probability unless splits explicitly rule them out. In other words, we impose a flat Dirichlet prior and report posterior probabilities rather than maximum likelihood estimates. The latter is a slack parameter on empirical bounds that expands the estimated extrema for continuous features by a factor of $1 + \epsilon$.
 
 Compare the results of our original probability estimates for the `Species` variable with those obtained by adding a pseudocount of $\alpha = 0.1$.
 
 ```{r dirichlet}
 # Recompute with additive smoothing
-params_alpha <- forde(arf, iris, alpha = 0.1)
+params_alpha <- forde(arf_iris, iris, alpha = 0.1)
 
 # Compare results
-head(params$cat)
+head(params_iris$cat)
 head(params_alpha$cat)
 ```
 
@@ -112,47 +113,56 @@ Under Laplace smoothing, extreme probabilities only occur when the splits explic
 
 ```{r unity}
 # Sum probabilities
-summary(params$cat[, sum(prob), by = .(f_idx, variable)]$V1)
+summary(params_iris$cat[, sum(prob), by = .(f_idx, variable)]$V1)
 summary(params_alpha$cat[, sum(prob), by = .(f_idx, variable)]$V1)
 ```
 
-The `forde` function outputs a list of length 5, with entries for (1) continuous features; (2) categorical features; (3) leaf parameters; (4) variable metadata; and (5) data input class.
+The `forde` function outputs a list of length 6, with entries for (1) continuous features; (2) categorical features; (3) leaf parameters; (4) variable metadata; (5) factor levels; and (6) data input class.
 
 ```{r forde2}
-params
+params_iris
 ```
 
 These parameters can be used for a variety of downstream tasks, such as likelihood estimation and data synthesis.
 
 # Likelihood Estimation
 
-To calculate log-likelihoods, we pass `arf` and `params` on to the `lik` function, along with the data whose likelihood we wish to evaluate.
+To calculate log-likelihoods, we pass `params` on to the `lik` function, along with the data whose likelihood we wish to evaluate. For *total evidence* queries (i.e., those spanning all variables and no conditioning events), it is faster to also include `arf` in the function call.
 
 ```{r lik}
 # Compute likelihood under truncated normal and uniform distributions
-ll <- lik(arf, params, iris)
-ll_unif <- lik(arf, params_unif, iris)
+ll <- lik(params_iris, iris, arf = arf_iris)
+ll_unif <- lik(params_unif, iris, arf = arf_iris)
 
 # Compare average negative log-likelihood (lower is better)
 -mean(ll)
 -mean(ll_unif)
 ```
 
-Note that the piecewise constant estimator does considerably worse in this experiment. 
+Note that the piecewise constant estimator does considerably worse in this experiment.
 
-We can compute likelihoods on the probability scale by setting `log = FALSE`, but this may result in numerical underflow. There is also a `batch` argument, which has no impact on results but can be more memory efficient for large datasets. For instance, we could rerun the code above in batches of size 50:
-
-```{r lik2}
-# Compute likelihood in batches of 50
-ll_50 <- lik(arf, params, iris, batch = 50)
+The `lik` function can also be used to compute the likelihood of some *partial state*, i.e. a setting in which some but not all variable values are specified. Let's take a look at the iris dataset:
+```{r iris}
+head(iris)
+```
+Say we want to calculate sample likelihoods using only continuous data. That is, we provide values for the first four variables but exclude the fifth. 
In this case, the model will have to marginalize over `Species`: +```{r iris2, fig.height=5, fig.width=5} +# Compute likelihoods after marginalizing over Species +iris_without_species <- iris[, -5] +ll_cnt <- lik(params_iris, iris_without_species) -# Identical results? -identical(ll, ll_50) +# Compare results +tmp <- data.frame(Total = ll, Partial = ll_cnt) +ggplot(tmp, aes(Total, Partial)) + + geom_point() + + geom_abline(slope = 1, intercept = 0, color = 'red') ``` -In this example, we have used the same data throughout. This may lead to overfitting. With sufficient data, it is preferable to use a training set for `adversarial_rf`, a validation set for `forde`, and a test set for `lik`. Alternatively, we can set the `oob` argument to `TRUE` for either of the latter two functions, in which case computations are performed only on out-of-bag (OOB) data. These are samples that are randomly excluded from a given tree due to the bootstrapping subroutine of the RF classifier. Note that this only works when the dataset `x` passed to `forde` or `lik` is the same one used to train the `arf`. Recall that a sample's probability of being excluded from a single tree is $e^{-1} \approx 0.368$. When using `oob = TRUE`, be sure to include enough trees so that every observation is likely to be OOB at least a few times. +We find that likelihoods are almost identical, with slightly higher likelihood on average for partial samples. This is expected, since they have less variation to model. + +In this example, we have used the same data throughout. This may lead to overfitting. With sufficient data, it is preferable to use a training set for `adversarial_rf`, a validation set for `forde`, and a test set for `lik`. Alternatively, we can set the `oob` argument to `TRUE` for either of the latter two functions, in which case computations are performed only on out-of-bag (OOB) data. These are samples that are randomly excluded from a given tree due to the bootstrapping subroutine of the RF classifier. Note that this only works when the dataset `x` passed to `forde` or `lik` is the same one used to train the `arf`. Recall that a sample's probability of being excluded from a single tree is $\exp(-1) \approx 0.368$. When using `oob = TRUE`, be sure to include enough trees so that every observation is likely to be OOB at least a few times. -# Data synthesis +# Data Synthesis For this experiment, we use the `smiley` simulation from the `mlbench` package, which allows for easy visual assessment. We draw a training set of $n = 1000$ and simulate $1000$ synthetic datapoints. Resulting data are plotted side by side. @@ -164,13 +174,13 @@ x <- data.frame(x$x, x$classes) colnames(x) <- c('X', 'Y', 'Class') # Fit ARF -arf <- adversarial_rf(x, mtry = 2) +arf_smiley <- adversarial_rf(x, mtry = 2) # Estimate parameters -params <- forde(arf, x) +params_smiley <- forde(arf_smiley, x) # Simulate data -synth <- forge(params, n_synth = 1000) +synth <- forge(params_smiley, n_synth = 1000) # Compare structure str(x) @@ -189,70 +199,78 @@ ggplot(df, aes(X, Y, color = Class, shape = Class)) + The general shape is clearly recognizable, even if some stray samples are evident and borders are not always crisp. This can be improved with more training data. -Note that the default behavior of `adversarial_rf` is to treat integers as ordered factors, with a warning. This makes sense for, say, count data with limited support (e.g., number of petals on a flower). However, this is probably not the desired behavior for other integer variables. 
Consider the `diamonds` dataset, where `price` is classed as an integer. - -```{r price, fig.height=5, fig.width=5} -# Check data -head(diamonds) - -# View the distribution -hist(diamonds$price) - -# How many unique prices? -length(unique(diamonds$price)) +# Conditioning + +ARFs can also be used to compute likelihoods and synthesize data under conditioning events that specify values or ranges for input features. For instance, say we want to evaluate the likelihood of samples from the `iris` dataset under the condition that `Species = 'setosa'`. There are several ways to encode evidence, but the simplest is to pass a partial observation. +```{r conditional, fig.height=5, fig.width=5} +# Compute conditional likelihoods +evi <- data.frame(Species = 'setosa') +ll_conditional <- lik(params_iris, query = iris_without_species, evidence = evi) + +# Compare NLL across species (shifting to positive range for visualization) +tmp <- iris +tmp$NLL <- -ll_conditional + max(ll_conditional) + 1 +ggplot(tmp, aes(Species, NLL, fill = Species)) + + geom_boxplot() + + scale_y_log10() + + ylab('Negative Log-Likelihood') + + theme(legend.position = 'none') ``` -This variable should clearly not be treated as a factor with 11602 levels. To make sure we fit a continuous density for price, we re-class the feature as numeric. +As expected, measurements for non-setosa samples appear much less likely under this conditioning event. -```{r price2, fig.height=5, fig.width=5} -# Re-class -diamonds$price <- as.numeric(diamonds$price) +The partial observation method of passing evidence requires users to specify unique feature values for each conditioning variable. A more flexible alternative is to construct a data frame of conditioning events, potentially including inequalities. For example, we may want to calculate the likelihood of samples given that `Species = 'setosa'` and `Petal.Width > 0.3`. -# Take a random subsample of size 2000 -s_idx <- sample(1:nrow(diamonds), 2000) +```{r cond2} +# Data frame of conditioning events +evi <- data.frame(variable = c('Species', 'Petal.Width'), + relation = c('==', '>'), + value = c('setosa', 0.3)) +evi +``` +Each row is treated as an extra condition, so we can define intervals by putting multiple constraints on a single variable. +```{r cond3} +evi <- data.frame(variable = c('Species', 'Petal.Width', 'Petal.Width'), + relation = c('==', '>', '<='), + value = c('setosa', 0.3, 0.5)) +evi +``` +Resulting likelihoods will be computed on the condition that all queries are drawn from setosa flowers with petal width on the interval $(0.3, 0.5]$. + +A final method for passing evidence is to directly compute a posterior distribution on leaves. This could be useful for particularly complex conditioning events for which there is currently no inbuilt interface, such as polynomial constraints or arbitrary propositions in disjunctive normal form. In this case, we just require a data frame with columns for `f_idx` and `wt`. If the latter does not sum to unity, the distribution will be normalized with a warning. +```{r cond4} +# Drawing random weights +evi <- data.frame(f_idx = params_iris$forest$f_idx, + wt = rexp(nrow(params_iris$forest))) +evi$wt <- evi$wt / sum(evi$wt) +head(evi) +``` -# Train ARF -arf <- adversarial_rf(diamonds[s_idx, ]) +Each of these methods can be used for conditional sampling as well. 
-# Estimate parameters -params <- forde(arf, diamonds[s_idx, ]) +```{r smiley2, fig.height=5, fig.width=8} +# Simulate class-conditional data for smiley example +evi <- data.frame(Class = 4) +synth2 <- forge(params_smiley, n_synth = 250, evidence = evi) -# Check distributional families -params$meta +# Put it all together +synth2$Data <- 'Synthetic' +df <- rbind(x, synth2) -# Forge data, check histogram -synth <- forge(params, n_synth = 1000) -hist(synth$price) +# Plot results +ggplot(df, aes(X, Y, color = Class, shape = Class)) + + geom_point() + + facet_wrap(~ Data) ``` -Using `family = 'truncnorm'`, the distribution for `price` will now be modeled with a truncated Gaussian mixture. Though the general outline of the histogram looks about right, we do find some implausible values, e.g. negative prices. This can be overcome by manually setting a hard lower bound. +By conditioning on `Class = 4`, we restrict our sampling to the smile itself, rather than eyes or nose. -```{r lb, fig.height=5, fig.width=5} -# Set price minimum to empirical lower bound -params$cnt[variable == 'price' & min == -Inf, min := min(diamonds$price)] +Computing conditional expectations is similarly straightforward. -# Re-forge, check histogram -synth <- forge(params, n_synth = 1000) -hist(synth$price) +```{r cond6} +expct(params_smiley, evidence = evi) ``` -Alternatively, we could retrain under log transformation. - -```{r lprice, fig.height=5, fig.width=5} -# Transform price variable -tmp <- as.data.table(diamonds[s_idx, ]) -tmp[, price := log(price)] - -# Retrain ARF -arf <- adversarial_rf(tmp) - -# Estimate parameters -params <- forde(arf, tmp) - -# Forge, check histogram -synth <- forge(params, n_synth = 1000) -hist(exp(synth$price)) -``` +These are the average $X, Y$ coordinates for the smile above. -This may be unnecessary with sufficiently large sample sizes. For instance, negative prices are exceedingly rare when training on the complete diamonds dataset ($n = 53940$). However, if natural constraints are known *a priori*, they can easily be incorporated into `params`.