From 9c655f67e36887d955dbc1dcd0e9c112729ff8be Mon Sep 17 00:00:00 2001 From: tonywu1999 Date: Mon, 4 Mar 2024 09:57:11 -0500 Subject: [PATCH] feat(groupComparison): Enable parallel processing for groupComparison function (#110) * feat(groupComparison): Enable parallel processing for groupComparison function * refactored single core vs multicore group comparison into separate util functions * update documentation regarding groupComparisonWithMultipleCores util function * fix(groupComparison): Remove return statements, define environment() variable, fixed all_proteins_id iterator * docs: Updated docs to mention that parallel processing only works for Linux/Mac OS --- DESCRIPTION | 3 +- NAMESPACE | 4 ++ R/groupComparison.R | 38 ++++++------ R/utils_groupcomparison.R | 65 +++++++++++++++++++++ inst/tinytest/test_groupComparison.R | 20 +++++++ man/MSstatsGroupComparison.Rd | 7 ++- man/dot-groupComparisonWithMultipleCores.Rd | 34 +++++++++++ man/dot-groupComparisonWithSingleCore.Rd | 29 +++++++++ man/groupComparison.Rd | 7 ++- 9 files changed, 187 insertions(+), 20 deletions(-) create mode 100644 inst/tinytest/test_groupComparison.R create mode 100644 man/dot-groupComparisonWithMultipleCores.Rd create mode 100644 man/dot-groupComparisonWithSingleCore.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 1c58e520..5460ebe0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,7 +33,8 @@ Imports: grDevices, graphics, methods, - statmod + statmod, + parallel Suggests: BiocStyle, knitr, diff --git a/NAMESPACE b/NAMESPACE index 6ba86be7..2e3b961b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -83,6 +83,10 @@ importFrom(limma,squeezeVar) importFrom(lme4,lmer) importFrom(marray,maPalette) importFrom(methods,is) +importFrom(parallel,clusterExport) +importFrom(parallel,makeCluster) +importFrom(parallel,parLapply) +importFrom(parallel,stopCluster) importFrom(plotly,add_trace) importFrom(plotly,ggplotly) importFrom(plotly,plot_ly) diff --git a/R/groupComparison.R b/R/groupComparison.R index d310efdf..83923c82 100644 --- a/R/groupComparison.R +++ b/R/groupComparison.R @@ -5,6 +5,9 @@ #' @param save_fitted_models logical, if TRUE, fitted models will be added to #' the output. #' @param log_base base of the logarithm used in dataProcess. +#' @param numberOfCores Number of cores for parallel processing. When > 1, +#' a logfile named `MSstats_groupComparison_log_progress.log` is created to +#' track progress. Only works for Linux & Mac OS. #' @inheritParams .documentFunction #' #' @details @@ -44,7 +47,8 @@ groupComparison = function(contrast.matrix, data, save_fitted_models = TRUE, log_base = 2, use_log_file = TRUE, append = FALSE, - verbose = TRUE, log_file_path = NULL + verbose = TRUE, log_file_path = NULL, + numberOfCores = 1 ) { MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose, log_file_path, @@ -61,7 +65,8 @@ groupComparison = function(contrast.matrix, data, getOption("MSstatsMsg")("INFO", " == Start to test and get inference in whole plot ...") testing_results = MSstatsGroupComparison(split_summarized, contrast_matrix, - save_fitted_models, repeated, samples_info) + save_fitted_models, repeated, samples_info, + numberOfCores) getOption("MSstatsLog")("INFO", "== Comparisons for all proteins are done.") getOption("MSstatsMsg")("INFO", @@ -107,8 +112,12 @@ MSstatsPrepareForGroupComparison = function(summarization_output) { #' @param save_fitted_models if TRUE, fitted models will be included in the output #' @param repeated logical, output of checkRepeatedDesign function #' @param samples_info data.table, output of getSamplesInfo function +#' @param numberOfCores Number of cores for parallel processing. When > 1, +#' a logfile named `MSstats_groupComparison_log_progress.log` is created to +#' track progress. Only works for Linux & Mac OS. #' #' @importFrom utils txtProgressBar setTxtProgressBar +#' @importFrom parallel makeCluster clusterExport parLapply stopCluster #' #' @export #' @@ -130,22 +139,17 @@ MSstatsPrepareForGroupComparison = function(summarization_output) { #' group_comparison[[2]][[3]] # NULL, because we set save_fitted_models to FALSE #' MSstatsGroupComparison = function(summarized_list, contrast_matrix, - save_fitted_models, repeated, samples_info) { - groups = colnames(contrast_matrix) - has_imputed = attr(summarized_list, "has_imputed") - all_proteins_id = seq_along(summarized_list) - test_results = vector("list", length(all_proteins_id)) - pb = txtProgressBar(max = length(all_proteins_id), style = 3) - for (i in all_proteins_id) { - comparison_outputs = MSstatsGroupComparisonSingleProtein( - summarized_list[[i]], contrast_matrix, repeated, - groups, samples_info, save_fitted_models, has_imputed - ) - test_results[[i]] = comparison_outputs - setTxtProgressBar(pb, i) + save_fitted_models, repeated, samples_info, + numberOfCores = 1) { + if (numberOfCores > 1) { + return(.groupComparisonWithMultipleCores(summarized_list, contrast_matrix, + save_fitted_models, repeated, + samples_info, numberOfCores)) + } else { + return(.groupComparisonWithSingleCore(summarized_list, contrast_matrix, + save_fitted_models, repeated, + samples_info)) } - close(pb) - test_results } diff --git a/R/utils_groupcomparison.R b/R/utils_groupcomparison.R index 3326005b..e5f659d3 100644 --- a/R/utils_groupcomparison.R +++ b/R/utils_groupcomparison.R @@ -439,3 +439,68 @@ getSamplesInfo = function(summarization_output) { result } +#' Perform group comparison per protein in parallel +#' @param summarized_list output of MSstatsPrepareForGroupComparison +#' @param contrast_matrix contrast matrix +#' @param save_fitted_models if TRUE, fitted models will be included in the output +#' @param repeated logical, output of checkRepeatedDesign function +#' @param samples_info data.table, output of getSamplesInfo function +#' @param numberOfCores Number of cores for parallel processing. +#' A logfile named `MSstats_groupComparison_log_progress.log` is created to +#' track progress. Only works for Linux & Mac OS. +#' @keywords internal +.groupComparisonWithMultipleCores = function(summarized_list, contrast_matrix, + save_fitted_models, repeated, samples_info, + numberOfCores) { + groups = colnames(contrast_matrix) + has_imputed = attr(summarized_list, "has_imputed") + all_proteins_id = seq_along(summarized_list) + function_environment = environment() + cl = parallel::makeCluster(numberOfCores) + parallel::clusterExport(cl, c("MSstatsGroupComparisonSingleProtein", + "contrast_matrix", "repeated", "groups", + "samples_info", "save_fitted_models", "has_imputed"), + envir = function_environment) + cat(paste0("Number of proteins to process: ", length(all_proteins_id)), + sep = "\n", file = "MSstats_groupComparison_log_progress.log") + test_results = parallel::parLapply(cl, all_proteins_id, function(i) { + if (i %% 100 == 0) { + cat("Finished processing an additional 100 protein comparisons", + sep = "\n", file = "MSstats_groupComparison_log_progress.log", append = TRUE) + } + MSstatsGroupComparisonSingleProtein( + summarized_list[[i]], contrast_matrix, repeated, + groups, samples_info, save_fitted_models, has_imputed + ) + }) + parallel::stopCluster(cl) + test_results +} + +#' Perform group comparison per protein iteratively with a single loop +#' @param summarized_list output of MSstatsPrepareForGroupComparison +#' @param contrast_matrix contrast matrix +#' @param save_fitted_models if TRUE, fitted models will be included in the output +#' @param repeated logical, output of checkRepeatedDesign function +#' @param samples_info data.table, output of getSamplesInfo function +#' @keywords internal +.groupComparisonWithSingleCore = function(summarized_list, contrast_matrix, + save_fitted_models, repeated, + samples_info) { + groups = colnames(contrast_matrix) + has_imputed = attr(summarized_list, "has_imputed") + all_proteins_id = seq_along(summarized_list) + test_results = vector("list", length(all_proteins_id)) + pb = txtProgressBar(max = length(all_proteins_id), style = 3) + for (i in all_proteins_id) { + comparison_outputs = MSstatsGroupComparisonSingleProtein( + summarized_list[[i]], contrast_matrix, repeated, + groups, samples_info, save_fitted_models, has_imputed + ) + test_results[[i]] = comparison_outputs + setTxtProgressBar(pb, i) + } + close(pb) + test_results +} + diff --git a/inst/tinytest/test_groupComparison.R b/inst/tinytest/test_groupComparison.R new file mode 100644 index 00000000..fc3997a3 --- /dev/null +++ b/inst/tinytest/test_groupComparison.R @@ -0,0 +1,20 @@ +# Setup ------------------------------------------------------------------ +QuantData = dataProcess(SRMRawData, use_log_file = FALSE) +comparison = matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1) +row.names(comparison) = "T7-T1" +groups = levels(QuantData$ProteinLevelData$GROUP) +colnames(comparison) = groups[order(as.numeric(groups))] + +# Test groupComparison with default parameters --------------------------- +testResultDefaultComparison = groupComparison(contrast.matrix=comparison, + data=QuantData, + use_log_file = FALSE) + +# Test groupComparison with numberOfCores parameter ---------------------- +testResultParallelComparison = groupComparison(contrast.matrix=comparison, + data=QuantData, + use_log_file = FALSE, + numberOfCores = 2) + +expect_equal(nrow(testResultDefaultComparison$ComparisonResult), + nrow(testResultParallelComparison$ComparisonResult)) \ No newline at end of file diff --git a/man/MSstatsGroupComparison.Rd b/man/MSstatsGroupComparison.Rd index 95e4f1e7..0b48d455 100644 --- a/man/MSstatsGroupComparison.Rd +++ b/man/MSstatsGroupComparison.Rd @@ -9,7 +9,8 @@ MSstatsGroupComparison( contrast_matrix, save_fitted_models, repeated, - samples_info + samples_info, + numberOfCores = 1 ) } \arguments{ @@ -22,6 +23,10 @@ MSstatsGroupComparison( \item{repeated}{logical, output of checkRepeatedDesign function} \item{samples_info}{data.table, output of getSamplesInfo function} + +\item{numberOfCores}{Number of cores for parallel processing. When > 1, +a logfile named `MSstats_groupComparison_log_progress.log` is created to +track progress. Only works for Linux & Mac OS.} } \description{ Group comparison diff --git a/man/dot-groupComparisonWithMultipleCores.Rd b/man/dot-groupComparisonWithMultipleCores.Rd new file mode 100644 index 00000000..f4ae9ea1 --- /dev/null +++ b/man/dot-groupComparisonWithMultipleCores.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils_groupcomparison.R +\name{.groupComparisonWithMultipleCores} +\alias{.groupComparisonWithMultipleCores} +\title{Perform group comparison per protein in parallel} +\usage{ +.groupComparisonWithMultipleCores( + summarized_list, + contrast_matrix, + save_fitted_models, + repeated, + samples_info, + numberOfCores +) +} +\arguments{ +\item{summarized_list}{output of MSstatsPrepareForGroupComparison} + +\item{contrast_matrix}{contrast matrix} + +\item{save_fitted_models}{if TRUE, fitted models will be included in the output} + +\item{repeated}{logical, output of checkRepeatedDesign function} + +\item{samples_info}{data.table, output of getSamplesInfo function} + +\item{numberOfCores}{Number of cores for parallel processing. +A logfile named `MSstats_groupComparison_log_progress.log` is created to +track progress. Only works for Linux & Mac OS.} +} +\description{ +Perform group comparison per protein in parallel +} +\keyword{internal} diff --git a/man/dot-groupComparisonWithSingleCore.Rd b/man/dot-groupComparisonWithSingleCore.Rd new file mode 100644 index 00000000..f3d15e71 --- /dev/null +++ b/man/dot-groupComparisonWithSingleCore.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils_groupcomparison.R +\name{.groupComparisonWithSingleCore} +\alias{.groupComparisonWithSingleCore} +\title{Perform group comparison per protein iteratively with a single loop} +\usage{ +.groupComparisonWithSingleCore( + summarized_list, + contrast_matrix, + save_fitted_models, + repeated, + samples_info +) +} +\arguments{ +\item{summarized_list}{output of MSstatsPrepareForGroupComparison} + +\item{contrast_matrix}{contrast matrix} + +\item{save_fitted_models}{if TRUE, fitted models will be included in the output} + +\item{repeated}{logical, output of checkRepeatedDesign function} + +\item{samples_info}{data.table, output of getSamplesInfo function} +} +\description{ +Perform group comparison per protein iteratively with a single loop +} +\keyword{internal} diff --git a/man/groupComparison.Rd b/man/groupComparison.Rd index 751b7bd2..04fe6ddc 100644 --- a/man/groupComparison.Rd +++ b/man/groupComparison.Rd @@ -12,7 +12,8 @@ groupComparison( use_log_file = TRUE, append = FALSE, verbose = TRUE, - log_file_path = NULL + log_file_path = NULL, + numberOfCores = 1 ) } \arguments{ @@ -38,6 +39,10 @@ to the console.} data processing will be saved. If not provided, such a file will be created automatically. If `append = TRUE`, has to be a valid path to a file.} + +\item{numberOfCores}{Number of cores for parallel processing. When > 1, +a logfile named `MSstats_groupComparison_log_progress.log` is created to +track progress. Only works for Linux & Mac OS.} } \value{ list that consists of three elements: "ComparisonResult" - data.frame with results of statistical testing,