Skip to content

Commit

Permalink
refactor(MSstatsSummarize): Create MSstatsSummarizeWithSingleCore to …
Browse files Browse the repository at this point in the history
…reduce memory and runtime usage (#126)
  • Loading branch information
tonywu1999 authored Jul 26, 2024
1 parent c105272 commit 98a8267
Show file tree
Hide file tree
Showing 7 changed files with 142 additions and 10 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ biocViews: ImmunoOncology, MassSpectrometry, Proteomics, Software, Normalization
LazyData: true
URL: http://msstats.org
BugReports: https://groups.google.com/forum/#!forum/msstats
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
Encoding: UTF-8
NeedsCompilation: no
Packaged: 2017-10-20 02:13:12 UTC; meenachoi
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ export(MSstatsSummarizationOutput)
export(MSstatsSummarize)
export(MSstatsSummarizeSingleLinear)
export(MSstatsSummarizeSingleTMP)
export(MSstatsSummarizeWithSingleCore)
export(MaxQtoMSstatsFormat)
export(OpenMStoMSstatsFormat)
export(OpenSWATHtoMSstatsFormat)
Expand Down
72 changes: 70 additions & 2 deletions R/dataProcess.R
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_sym
num_proteins = length(protein_indices)
function_environment = environment()
cl = parallel::makeCluster(numberOfCores)
getOption("MSstatsLog")("INFO",
"Starting the cluster setup for summarization")
parallel::clusterExport(cl, c("MSstatsSummarizeSingleTMP",
"MSstatsSummarizeSingleLinear",
"input", "impute", "censored_symbol",
Expand Down Expand Up @@ -196,12 +198,69 @@ MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_sym
parallel::stopCluster(cl)
return(summarized_results)
} else {
input_split = split(input, input$PROTEIN)
return(MSstatsSummarize(input_split, method, impute, censored_symbol,
return(MSstatsSummarizeWithSingleCore(input, method, impute, censored_symbol,
remove50missing, equal_variance))
}
}

#' Feature-level data summarization with 1 core
#'
#' Summarizes the proteins found in `input$PROTEIN` one after another in the
#' current R session. Row indices (not the table itself) are split by protein,
#' and each protein's rows are extracted only when that protein is processed.
#' A text progress bar is shown while proteins are being summarized.
#'
#' @inheritParams MSstatsSummarizeWithMultipleCores
#'
#' @importFrom data.table uniqueN
#' @importFrom utils setTxtProgressBar
#'
#' @return list with one element per group of `input$PROTEIN`; each element is
#' the output of `MSstatsSummarizeSingleTMP` (when `method == "TMP"`) or of
#' `MSstatsSummarizeSingleLinear` (any other `method`) for that protein.
#' An empty list is returned when `input` contains no proteins.
#'
#' @export
#'
#' @examples
#' raw = DDARawData
#' method = "TMP"
#' cens = "NA"
#' impute = TRUE
#' MSstatsConvert::MSstatsLogsSettings(FALSE)
#' input = MSstatsPrepareForDataProcess(raw, 2, NULL)
#' input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
#' input = MSstatsMergeFractions(input)
#' input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
#' input = MSstatsSelectFeatures(input, "all")
#' processed = getProcessed(input)
#' input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
#' summarized = MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE)
#' length(summarized) # list of summarization outputs for each protein
#' head(summarized[[1]][[1]]) # run-level summary
#'
MSstatsSummarizeWithSingleCore = function(input, method, impute,
                                          censored_symbol,
                                          remove50missing, equal_variance) {
    # Split row indices instead of the table, so that a single protein's rows
    # are extracted from `input` only when that protein is summarized.
    protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN))
    num_proteins = length(protein_indices)
    summarized_results = vector("list", num_proteins)

    # utils::txtProgressBar() stops with "must have 'max' > 'min'" when there
    # is nothing to iterate over, so return the empty result up front.
    if (num_proteins == 0) {
        return(summarized_results)
    }

    # Choose the per-protein summarizer once; the loop below is then shared
    # by both summarization methods.
    if (method == "TMP") {
        summarize_single = function(single_protein) {
            MSstatsSummarizeSingleTMP(single_protein, impute,
                                      censored_symbol, remove50missing)
        }
    } else {
        summarize_single = function(single_protein) {
            MSstatsSummarizeSingleLinear(single_protein, equal_variance)
        }
    }

    pb = utils::txtProgressBar(min = 0, max = num_proteins, style = 3)
    # Close the progress bar even if summarizing one of the proteins fails.
    on.exit(close(pb), add = TRUE)
    for (protein_id in seq_len(num_proteins)) {
        single_protein = input[protein_indices[[protein_id]], ]
        summarized_results[[protein_id]] = summarize_single(single_protein)
        utils::setTxtProgressBar(pb, protein_id)
    }
    summarized_results
}


#' Feature-level data summarization
#'
Expand Down Expand Up @@ -257,6 +316,15 @@ MSstatsSummarize = function(proteins_list, method, impute, censored_symbol,
}
close(pb)
}
msg_deprecation = paste("FUNCTION DEPRECATION NOTICE: We would like to",
"notify you that the MSstatsSummarize function",
"will undergo a transition process. Starting from release 3.21",
"the MSstatsSummarize function in MSstats will be deprecated",
"in favor of MSstatsSummarizeWithSingleCore.",
"Please take the necessary steps to update your codebase",
"and migrate to MSstatsSummarizeWithSingleCore before",
"release 3.21 to avoid any disruptions to your workflow.")
message(msg_deprecation)
summarized_results
}

Expand Down
5 changes: 2 additions & 3 deletions R/utils_output.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#' Post-processing output from MSstats summarization
#'
#' @param input `data.table` in MSstats format
#' @param summarized output of the `MSstatsSummarize` function
#' @param summarized output of the `MSstatsSummarizeWithSingleCore` function
#' @param processed output of MSstatsSelectFeatures
#' @param method name of the summarization method
#' (`summaryMethod` parameter to `dataProcess`)
Expand Down Expand Up @@ -32,8 +32,7 @@
#' input = MSstatsSelectFeatures(input, "all")
#' processed = getProcessed(input)
#' input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
#' input_split = split(input, input$PROTEIN)
#' summarized = MSstatsSummarize(input_split, method, impute, cens, FALSE, TRUE)
#' summarized = MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE)
#' output = MSstatsSummarizationOutput(input, summarized, processed,
#' method, impute, cens)
#'
Expand Down
2 changes: 1 addition & 1 deletion R/utils_summarization.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
#' Fit Tukey median polish
#' @param input data.table with data for a single protein
#' @param is_labeled logical, if TRUE, data is coming from an SRM experiment
#' @inheritParams MSstatsSummarize
#' @inheritParams MSstatsSummarizeWithSingleCore
#' @return data.table
#' @keywords internal
.runTukey = function(input, is_labeled, censored_symbol, remove50missing) {
Expand Down
5 changes: 2 additions & 3 deletions man/MSstatsSummarizationOutput.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

65 changes: 65 additions & 0 deletions man/MSstatsSummarizeWithSingleCore.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 98a8267

Please sign in to comment.