From 16df86fbc90714c9f8d7051913c345268719d88d Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 31 Oct 2024 16:04:00 -0400 Subject: [PATCH] docs(dataProcess+groupComparison): Add documentation on output tables of dataProcess and groupComparison --- R/dataProcess.R | 40 +++++++++++++++++++++++++++++++++++++++- R/groupComparison.R | 38 ++++++++++++++++++++++++++++++++++++-- man/dataProcess.Rd | 39 +++++++++++++++++++++++++++++++++++++++ man/groupComparison.Rd | 38 ++++++++++++++++++++++++++++++++++++-- 4 files changed, 150 insertions(+), 5 deletions(-) diff --git a/R/dataProcess.R b/R/dataProcess.R index 9415d8e..17b4fd3 100755 --- a/R/dataProcess.R +++ b/R/dataProcess.R @@ -55,7 +55,45 @@ #' @inheritParams .documentFunction #' #' @importFrom utils sessionInfo -#' @importFrom data.table as.data.table +#' @importFrom data.table as.data.table +#' +#' @return A list containing: +#' \describe{ +#' \item{FeatureLevelData}{A data frame with feature-level information after processing. Columns include: +#' \describe{ +#' \item{PROTEIN}{Identifier for the protein associated with the feature.} +#' \item{PEPTIDE}{Identifier for the peptide sequence.} +#' \item{TRANSITION}{Identifier for the transition, typically representing a specific ion pair.} +#' \item{FEATURE}{Unique identifier for the feature, which could be a combination of peptide and transition.} +#' \item{LABEL}{Specifies the isotopic labeling of peptides, notably for SRM-based experiments. 
"L" indicates light-labeled peptides while "H" denotes heavy-labeled peptides.} +#' \item{GROUP}{Experimental group identifier.} +#' \item{RUN}{Identifier for the specific MS run.} +#' \item{SUBJECT}{Subject identifier within the experimental group.} +#' \item{FRACTION}{Fraction identifier if fractionation was performed.} +#' \item{originalRUN}{Original run identifier before any processing.} +#' \item{censored}{Logical indicator of whether the intensity value is considered missing or below limit of detection.} +#' \item{INTENSITY}{Original intensity measurement of the feature in the given run.} +#' \item{ABUNDANCE}{Processed abundance or intensity value after log-transformation and normalization.} +#' \item{newABUNDANCE}{The ABUNDANCE column but includes imputed missing values. It is the column that is used for protein summarization.} +#' \item{predicted}{Predicted intensity values for censored data, typically derived from a statistical model.} +#' } +#' } +#' \item{ProteinLevelData}{A data frame with run-level summarized information for each protein. 
Columns include: +#' \describe{ +#' \item{RUN}{Identifier for the specific MS run.} +#' \item{Protein}{Identifier for the protein.} +#' \item{LogIntensities}{Log-transformed intensities for the protein in each run.} +#' \item{originalRUN}{Original run identifier before any processing.} +#' \item{GROUP}{Experimental group identifier.} +#' \item{SUBJECT}{Subject identifier within the experimental group.} +#' \item{TotalGroupMeasurements}{Total number of feature measurements for the protein in the given group.} +#' \item{NumMeasuredFeatures}{Number of features measured for the protein in the given run.} +#' \item{MissingPercentage}{Percentage of missing feature values for the protein in the given run.} +#' \item{more50missing}{Logical indicator of whether more than 50 percent of the feature values are missing for the protein in the given run.} +#' \item{NumImputedFeature}{Number of features for which values were imputed due to missing or censored data for the protein in the given run.} +#' } +#' } +#' } #' #' @export #' diff --git a/R/groupComparison.R b/R/groupComparison.R index e319cd7..1d9c7d2 100644 --- a/R/groupComparison.R +++ b/R/groupComparison.R @@ -15,8 +15,42 @@ #' The underlying model fitting functions are lm and lmer for the fixed effects model and mixed effects model, respectively. #' The input of this function is the quantitative data from function (dataProcess). #' -#' @return list that consists of three elements: "ComparisonResult" - data.frame with results of statistical testing, -#' "ModelQC" - data.frame with data used to fit models for group comparison and "FittedModel" - list of fitted models. +#' @return A list with the following components: +#' \describe{ +#' \item{ComparisonResult}{A `data.frame` containing the results of the statistical testing for each protein.
The columns include: +#' \describe{ +#' \item{Protein}{The name of the protein for which the comparison is made.} +#' \item{Label}{The label of the comparison, typically derived from the `contrast.matrix`.} +#' \item{log2FC}{The log2 fold change between the conditions being compared. The base of the logarithm is specified by the `log_base` parameter.} +#' \item{SE}{The standard error of the log2 fold change estimate.} +#' \item{Tvalue}{The t-statistic value for the comparison.} +#' \item{DF}{The degrees of freedom associated with the t-statistic.} +#' \item{pvalue}{The p-value for the statistical test of the comparison.} +#' \item{adj.pvalue}{The adjusted p-value using the Benjamini-Hochberg method for controlling the false discovery rate.} +#' \item{issue}{Any issues encountered during the comparison. NA indicates no issues. "oneConditionMissing" occurs when data for one of the conditions being compared is entirely missing for a particular protein.} +#' \item{MissingPercentage}{The percentage of missing features for a given protein across all runs. This column is included only if missing values were imputed.} +#' \item{ImputationPercentage}{The percentage of features that were imputed for a given protein across all runs. This column is included only if missing values were imputed.} +#' } +#' } +#' \item{ModelQC}{A `data.frame` containing quality control data used to fit models for group comparison. 
The columns include: +#' \describe{ +#' \item{RUN}{Identifier for the specific MS run.} +#' \item{Protein}{Identifier for the protein.} +#' \item{ABUNDANCE}{Summarized intensity for the protein in a given run.} +#' \item{originalRUN}{Original run identifier before any processing.} +#' \item{GROUP}{Experimental group identifier.} +#' \item{SUBJECT}{Subject identifier within the experimental group.} +#' \item{TotalGroupMeasurements}{Total number of feature measurements for the protein in the given group.} +#' \item{NumMeasuredFeatures}{Number of features measured for the protein in the given run.} +#' \item{MissingPercentage}{Percentage of missing feature values for the protein in the given run.} +#' \item{more50missing}{Logical indicator of whether more than 50 percent of the feature values are missing for the protein in the given run.} +#' \item{NumImputedFeature}{Number of features for which values were imputed due to missing or censored data for the protein in the given run.} +#' \item{residuals}{Contains the differences between the observed values and the values predicted by the fitted model.} +#' \item{fitted}{The predicted values obtained from the model for a protein measurement for a given run in the dataset.} +#' } +#' } +#' \item{FittedModel}{A list of fitted models for each protein. This is included only if `save_fitted_models` is set to TRUE. Each element of the list corresponds to a protein and contains the fitted model object.} +#' } #' #' @export #' @import lme4 diff --git a/man/dataProcess.Rd b/man/dataProcess.Rd index 8388937..9e8b52f 100644 --- a/man/dataProcess.Rd +++ b/man/dataProcess.Rd @@ -110,6 +110,45 @@ If `append = TRUE`, has to be a valid path to a file.} a logfile named `MSstats_dataProcess_log_progress.log` is created to track progress. Only works for Linux & Mac OS. Default is 1.} } +\value{ +A list containing: +\describe{ + \item{FeatureLevelData}{A data frame with feature-level information after processing.
Columns include: + \describe{ + \item{PROTEIN}{Identifier for the protein associated with the feature.} + \item{PEPTIDE}{Identifier for the peptide sequence.} + \item{TRANSITION}{Identifier for the transition, typically representing a specific ion pair.} + \item{FEATURE}{Unique identifier for the feature, which could be a combination of peptide and transition.} + \item{LABEL}{Specifies the isotopic labeling of peptides, notably for SRM-based experiments. "L" indicates light-labeled peptides while "H" denotes heavy-labeled peptides.} + \item{GROUP}{Experimental group identifier.} + \item{RUN}{Identifier for the specific MS run.} + \item{SUBJECT}{Subject identifier within the experimental group.} + \item{FRACTION}{Fraction identifier if fractionation was performed.} + \item{originalRUN}{Original run identifier before any processing.} + \item{censored}{Logical indicator of whether the intensity value is considered missing or below limit of detection.} + \item{INTENSITY}{Original intensity measurement of the feature in the given run.} + \item{ABUNDANCE}{Processed abundance or intensity value after log-transformation and normalization.} + \item{newABUNDANCE}{The ABUNDANCE column but includes imputed missing values. It is the column that is used for protein summarization.} + \item{predicted}{Predicted intensity values for censored data, typically derived from a statistical model.} + } + } + \item{ProteinLevelData}{A data frame with run-level summarized information for each protein. 
Columns include: + \describe{ + \item{RUN}{Identifier for the specific MS run.} + \item{Protein}{Identifier for the protein.} + \item{LogIntensities}{Log-transformed intensities for the protein in each run.} + \item{originalRUN}{Original run identifier before any processing.} + \item{GROUP}{Experimental group identifier.} + \item{SUBJECT}{Subject identifier within the experimental group.} + \item{TotalGroupMeasurements}{Total number of feature measurements for the protein in the given group.} + \item{NumMeasuredFeatures}{Number of features measured for the protein in the given run.} + \item{MissingPercentage}{Percentage of missing feature values for the protein in the given run.} + \item{more50missing}{Logical indicator of whether more than 50 percent of the feature values are missing for the protein in the given run.} + \item{NumImputedFeature}{Number of features for which values were imputed due to missing or censored data for the protein in the given run.} + } + } +} +} \description{ Process MS data: clean, normalize and summarize before differential analysis } diff --git a/man/groupComparison.Rd b/man/groupComparison.Rd index 0a7aa73..a0ffdfb 100644 --- a/man/groupComparison.Rd +++ b/man/groupComparison.Rd @@ -45,8 +45,42 @@ a logfile named `MSstats_groupComparison_log_progress.log` is created to track progress. Only works for Linux & Mac OS. Default is 1.} } \value{ -list that consists of three elements: "ComparisonResult" - data.frame with results of statistical testing, -"ModelQC" - data.frame with data used to fit models for group comparison and "FittedModel" - list of fitted models. +A list with the following components: +\describe{ + \item{ComparisonResult}{A `data.frame` containing the results of the statistical testing for each protein.
The columns include: + \describe{ + \item{Protein}{The name of the protein for which the comparison is made.} + \item{Label}{The label of the comparison, typically derived from the `contrast.matrix`.} + \item{log2FC}{The log2 fold change between the conditions being compared. The base of the logarithm is specified by the `log_base` parameter.} + \item{SE}{The standard error of the log2 fold change estimate.} + \item{Tvalue}{The t-statistic value for the comparison.} + \item{DF}{The degrees of freedom associated with the t-statistic.} + \item{pvalue}{The p-value for the statistical test of the comparison.} + \item{adj.pvalue}{The adjusted p-value using the Benjamini-Hochberg method for controlling the false discovery rate.} + \item{issue}{Any issues encountered during the comparison. NA indicates no issues. "oneConditionMissing" occurs when data for one of the conditions being compared is entirely missing for a particular protein.} + \item{MissingPercentage}{The percentage of missing features for a given protein across all runs. This column is included only if missing values were imputed.} + \item{ImputationPercentage}{The percentage of features that were imputed for a given protein across all runs. This column is included only if missing values were imputed.} + } + } + \item{ModelQC}{A `data.frame` containing quality control data used to fit models for group comparison. 
The columns include: + \describe{ + \item{RUN}{Identifier for the specific MS run.} + \item{Protein}{Identifier for the protein.} + \item{ABUNDANCE}{Summarized intensity for the protein in a given run.} + \item{originalRUN}{Original run identifier before any processing.} + \item{GROUP}{Experimental group identifier.} + \item{SUBJECT}{Subject identifier within the experimental group.} + \item{TotalGroupMeasurements}{Total number of feature measurements for the protein in the given group.} + \item{NumMeasuredFeatures}{Number of features measured for the protein in the given run.} + \item{MissingPercentage}{Percentage of missing feature values for the protein in the given run.} + \item{more50missing}{Logical indicator of whether more than 50 percent of the feature values are missing for the protein in the given run.} + \item{NumImputedFeature}{Number of features for which values were imputed due to missing or censored data for the protein in the given run.} + \item{residuals}{Contains the differences between the observed values and the values predicted by the fitted model.} + \item{fitted}{The predicted values obtained from the model for a protein measurement for a given run in the dataset.} + } + } + \item{FittedModel}{A list of fitted models for each protein. This is included only if `save_fitted_models` is set to TRUE. Each element of the list corresponds to a protein and contains the fitted model object.} +} } \description{ Whole plot testing