diff --git a/NAMESPACE b/NAMESPACE index 700e7c6..b15e08a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -43,9 +43,11 @@ importFrom(boot,logit) importFrom(dplyr,any_of) importFrom(dplyr,arrange) importFrom(dplyr,cummean) +importFrom(dplyr,distinct) importFrom(dplyr,distinct_at) importFrom(dplyr,enquo) importFrom(dplyr,filter) +importFrom(dplyr,pull) importFrom(dplyr,if_else) importFrom(dplyr,left_join) importFrom(dplyr,mutate) diff --git a/R/methods.R b/R/methods.R index fd3e49d..bb31ae3 100644 --- a/R/methods.R +++ b/R/methods.R @@ -387,6 +387,10 @@ sccomp_estimate.data.frame = function(.data, .count = enquo(.count) .sample_cell_group_pairs_to_exclude = enquo(.sample_cell_group_pairs_to_exclude) + # Check Sample Consistency of Factors + check_sample_consistency_of_factors(.data, formula_composition, !!.sample) + + if( quo_is_null(.count)) res = sccomp_glm_data_frame_raw( .data, diff --git a/R/utilities.R b/R/utilities.R index 4ea96c5..ec1c1ed 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -2866,6 +2866,60 @@ contains_only_valid_chars_for_column <- function(column_names) { sapply(column_names, check_validity) } +#' Check Sample Consistency of Factors +#' +#' This function checks for each sample in the provided data frame if the number of unique +#' covariate values from a specified formula matches the number of samples. It is useful for +#' verifying data consistency before statistical analysis. The function stops and throws an +#' error if inconsistencies are found. +#' +#' @importFrom dplyr select +#' @importFrom dplyr filter +#' @importFrom dplyr mutate +#' @importFrom dplyr pull +#' @importFrom dplyr distinct +#' @importFrom tidyr pivot_longer +#' @importFrom purrr map_lgl +#' +#' @param .data A data frame containing the samples and covariates. +#' @param my_formula A formula specifying the covariates to check, passed as a string. 
#'
#' @param .sample The column of `.data` that identifies the samples. It is
#'   captured with `enquo()`, so it can be given unquoted or injected with `!!`.
#'
#' @importFrom tidyr nest
#'
#' @details The function selects the sample column and the covariates returned
#' by `parse_formula(my_formula)`, pivots the data longer so each row holds one
#' sample-covariate value, nests the data by covariate name, and checks, for
#' each covariate, that the number of distinct sample-value pairs does not
#' exceed the number of distinct samples. Covariate values are converted to
#' character before pivoting so that covariates of different types can be
#' checked together. If there are no covariates, nothing is checked.
#'
#' @return `NULL`, invisibly. The function is called for its side effect: it
#' stops with an error message naming the offending covariates if any
#' inconsistencies are found.
#'
#' @noRd
#' @keywords internal
check_sample_consistency_of_factors = function(.data, my_formula, .sample){

  .sample = enquo(.sample)

  covariate_names = parse_formula(my_formula)

  # Nothing to check for a formula without covariates (e.g. `~ 1`); returning
  # early also avoids asking pivot_longer() to pivot zero columns.
  # NOTE(review): assumes parse_formula() returns a (possibly empty) vector of
  # column names - confirm against its definition.
  if( length(covariate_names) == 0 ) return(invisible(NULL))

  # Check that I have one set of covariates per sample
  any_covariate_not_matching_sample_size =
    .data |>
    select(!!.sample, !!covariate_names) |>

    # Covariates can have different types (e.g. factor and numeric); convert
    # them to character so they can share the single `value` column.
    pivot_longer(-!!.sample, values_transform = list(value = as.character)) |>
    nest(data = -name) |>

    # A covariate is consistent when it contributes no more distinct
    # (sample, value) pairs than there are distinct samples, i.e. every sample
    # carries exactly one value.
    mutate(correct_size = map_lgl(
      data,
      ~ (.x |> distinct(!!.sample, value) |> nrow()) <=
        (.x |> distinct(!!.sample) |> nrow())
    )) |>
    filter(!correct_size)

  if( nrow(any_covariate_not_matching_sample_size) > 0 ) stop(
    sprintf(
      "sccomp says: your \"%s\" factor(s) is(are) mismatched across samples. ",
      any_covariate_not_matching_sample_size |> pull(name) |> paste(collapse = ", ")
    ),
    "For example, sample_bar having more than one value for factor_foo. ",
    "For sample_bar you should have one value for factor_foo, consistent across groups (e.g. cell types)."
  )

  invisible(NULL)

}


#' chatGPT - Intelligently Remove Surrounding Brackets from Each String in a Vector
#'