From 16df86fbc90714c9f8d7051913c345268719d88d Mon Sep 17 00:00:00 2001
From: Tony Wu <wu.anthon@northeastern.edu>
Date: Thu, 31 Oct 2024 16:04:00 -0400
Subject: [PATCH] docs(dataProcess+groupComparison): Add documentation on
 output tables of dataProcess and groupComparison

---
 R/dataProcess.R        | 40 +++++++++++++++++++++++++++++++++++++++-
 R/groupComparison.R    | 38 ++++++++++++++++++++++++++++++++++++--
 man/dataProcess.Rd     | 39 +++++++++++++++++++++++++++++++++++++++
 man/groupComparison.Rd | 38 ++++++++++++++++++++++++++++++++++++--
 4 files changed, 150 insertions(+), 5 deletions(-)

diff --git a/R/dataProcess.R b/R/dataProcess.R
index 9415d8e..17b4fd3 100755
--- a/R/dataProcess.R
+++ b/R/dataProcess.R
@@ -55,7 +55,45 @@
 #' @inheritParams .documentFunction
 #' 
 #' @importFrom utils sessionInfo
-#' @importFrom data.table as.data.table 
+#' @importFrom data.table as.data.table
+#' 
+#' @return A list containing:
+#' \describe{
+#'   \item{FeatureLevelData}{A data frame with feature-level information after processing. Columns include:
+#'     \describe{
+#'       \item{PROTEIN}{Identifier for the protein associated with the feature.}
+#'       \item{PEPTIDE}{Identifier for the peptide sequence.}
+#'       \item{TRANSITION}{Identifier for the transition, typically representing a specific ion pair.}
+#'       \item{FEATURE}{Unique identifier for the feature, which could be a combination of peptide and transition.}
+#'       \item{LABEL}{Specifies the isotopic labeling of peptides, notably for SRM-based experiments. "L" indicates light-labeled peptides while "H" denotes heavy-labeled peptides.}
+#'       \item{GROUP}{Experimental group identifier.}
+#'       \item{RUN}{Identifier for the specific MS run.}
+#'       \item{SUBJECT}{Subject identifier within the experimental group.}
+#'       \item{FRACTION}{Fraction identifier if fractionation was performed.}
+#'       \item{originalRUN}{Original run identifier before any processing.}
+#'       \item{censored}{Logical indicator of whether the intensity value is considered missing or below the limit of detection.}
+#'       \item{INTENSITY}{Original intensity measurement of the feature in the given run.}
+#'       \item{ABUNDANCE}{Processed abundance or intensity value after log-transformation and normalization.}
+#'       \item{newABUNDANCE}{Same as the ABUNDANCE column, but with missing values replaced by imputed values. This is the column used for protein summarization.}
+#'       \item{predicted}{Predicted intensity values for censored data, typically derived from a statistical model.}
+#'     }
+#'   }
+#'   \item{ProteinLevelData}{A data frame with run-level summarized information for each protein. Columns include:
+#'     \describe{
+#'       \item{RUN}{Identifier for the specific MS run.}
+#'       \item{Protein}{Identifier for the protein.}
+#'       \item{LogIntensities}{Log-transformed intensities for the protein in each run.}
+#'       \item{originalRUN}{Original run identifier before any processing.}
+#'       \item{GROUP}{Experimental group identifier.}
+#'       \item{SUBJECT}{Subject identifier within the experimental group.}
+#'       \item{TotalGroupMeasurements}{Total number of feature measurements for the protein in the given group.}
+#'       \item{NumMeasuredFeatures}{Number of features measured for the protein in the given run.}
+#'       \item{MissingPercentage}{Percentage of missing feature values for the protein in the given run.}
+#'       \item{more50missing}{Logical indicator of whether more than 50 percent of the feature values are missing for the protein in the given run.}
+#'       \item{NumImputedFeature}{Number of features for which values were imputed due to missing or censored data for the protein in the given run.}
+#'     }
+#'   }
+#' }
 #' 
 #' @export
 #' 
diff --git a/R/groupComparison.R b/R/groupComparison.R
index e319cd7..1d9c7d2 100644
--- a/R/groupComparison.R
+++ b/R/groupComparison.R
@@ -15,8 +15,42 @@
 #' The underlying model fitting functions are lm and lmer for the fixed effects model and mixed effects model, respectively.
 #' The input of this function is the quantitative data from function (dataProcess).
 #'
-#' @return list that consists of three elements: "ComparisonResult" - data.frame with results of statistical testing,
-#' "ModelQC" - data.frame with data used to fit models for group comparison and "FittedModel" - list of fitted models.
+#' @return A list with the following components:
+#' \describe{
+#'   \item{ComparisonResult}{A `data.frame` containing the results of the statistical testing for each protein. The columns include:
+#'     \describe{
+#'       \item{Protein}{The name of the protein for which the comparison is made.}
+#'       \item{Label}{The label of the comparison, typically derived from the `contrast.matrix`.}
+#'       \item{log2FC}{The log fold change between the conditions being compared. The base of the logarithm is specified by the `log_base` parameter.}
+#'       \item{SE}{The standard error of the log fold change estimate.}
+#'       \item{Tvalue}{The t-statistic value for the comparison.}
+#'       \item{DF}{The degrees of freedom associated with the t-statistic.}
+#'       \item{pvalue}{The p-value for the statistical test of the comparison.}
+#'       \item{adj.pvalue}{The adjusted p-value using the Benjamini-Hochberg method for controlling the false discovery rate.}
+#'       \item{issue}{Any issues encountered during the comparison. NA indicates no issues. "oneConditionMissing" occurs when data for one of the conditions being compared is entirely missing for a particular protein.}
+#'       \item{MissingPercentage}{The percentage of missing features for a given protein across all runs. This column is included only if missing values were imputed.}
+#'       \item{ImputationPercentage}{The percentage of features that were imputed for a given protein across all runs. This column is included only if missing values were imputed.}
+#'     }
+#'   }
+#'   \item{ModelQC}{A `data.frame` containing quality control data used to fit models for group comparison. The columns include:
+#'     \describe{
+#'       \item{RUN}{Identifier for the specific MS run.}
+#'       \item{Protein}{Identifier for the protein.}
+#'       \item{ABUNDANCE}{Summarized intensity for the protein in a given run.}
+#'       \item{originalRUN}{Original run identifier before any processing.}
+#'       \item{GROUP}{Experimental group identifier.}
+#'       \item{SUBJECT}{Subject identifier within the experimental group.}
+#'       \item{TotalGroupMeasurements}{Total number of feature measurements for the protein in the given group.}
+#'       \item{NumMeasuredFeatures}{Number of features measured for the protein in the given run.}
+#'       \item{MissingPercentage}{Percentage of missing feature values for the protein in the given run.}
+#'       \item{more50missing}{Logical indicator of whether more than 50 percent of the feature values are missing for the protein in the given run.}
+#'       \item{NumImputedFeature}{Number of features for which values were imputed due to missing or censored data for the protein in the given run.}
+#'       \item{residuals}{The differences between the observed values and the values predicted by the fitted model.}
+#'       \item{fitted}{The values predicted by the fitted model for a protein measurement in a given run of the dataset.}
+#'     }
+#'   }
+#'   \item{FittedModel}{A list of fitted models for each protein. This is included only if `save_fitted_models` is set to TRUE. Each element of the list corresponds to a protein and contains the fitted model object.}
+#' }
 #' 
 #' @export 
 #' @import lme4
diff --git a/man/dataProcess.Rd b/man/dataProcess.Rd
index 8388937..9e8b52f 100644
--- a/man/dataProcess.Rd
+++ b/man/dataProcess.Rd
@@ -110,6 +110,45 @@ If `append = TRUE`, has to be a valid path to a file.}
 a logfile named `MSstats_dataProcess_log_progress.log` is created to 
 track progress. Only works for Linux & Mac OS. Default is 1.}
 }
+\value{
+A list containing:
+\describe{
+  \item{FeatureLevelData}{A data frame with feature-level information after processing. Columns include:
+    \describe{
+      \item{PROTEIN}{Identifier for the protein associated with the feature.}
+      \item{PEPTIDE}{Identifier for the peptide sequence.}
+      \item{TRANSITION}{Identifier for the transition, typically representing a specific ion pair.}
+      \item{FEATURE}{Unique identifier for the feature, which could be a combination of peptide and transition.}
+      \item{LABEL}{Specifies the isotopic labeling of peptides, notably for SRM-based experiments. "L" indicates light-labeled peptides while "H" denotes heavy-labeled peptides.}
+      \item{GROUP}{Experimental group identifier.}
+      \item{RUN}{Identifier for the specific MS run.}
+      \item{SUBJECT}{Subject identifier within the experimental group.}
+      \item{FRACTION}{Fraction identifier if fractionation was performed.}
+      \item{originalRUN}{Original run identifier before any processing.}
+      \item{censored}{Logical indicator of whether the intensity value is considered missing or below the limit of detection.}
+      \item{INTENSITY}{Original intensity measurement of the feature in the given run.}
+      \item{ABUNDANCE}{Processed abundance or intensity value after log-transformation and normalization.}
+      \item{newABUNDANCE}{Same as the ABUNDANCE column, but with missing values replaced by imputed values. This is the column used for protein summarization.}
+      \item{predicted}{Predicted intensity values for censored data, typically derived from a statistical model.}
+    }
+  }
+  \item{ProteinLevelData}{A data frame with run-level summarized information for each protein. Columns include:
+    \describe{
+      \item{RUN}{Identifier for the specific MS run.}
+      \item{Protein}{Identifier for the protein.}
+      \item{LogIntensities}{Log-transformed intensities for the protein in each run.}
+      \item{originalRUN}{Original run identifier before any processing.}
+      \item{GROUP}{Experimental group identifier.}
+      \item{SUBJECT}{Subject identifier within the experimental group.}
+      \item{TotalGroupMeasurements}{Total number of feature measurements for the protein in the given group.}
+      \item{NumMeasuredFeatures}{Number of features measured for the protein in the given run.}
+      \item{MissingPercentage}{Percentage of missing feature values for the protein in the given run.}
+      \item{more50missing}{Logical indicator of whether more than 50 percent of the feature values are missing for the protein in the given run.}
+      \item{NumImputedFeature}{Number of features for which values were imputed due to missing or censored data for the protein in the given run.}
+    }
+  }
+}
+}
 \description{
 Process MS data: clean, normalize and summarize before differential analysis
 }
diff --git a/man/groupComparison.Rd b/man/groupComparison.Rd
index 0a7aa73..a0ffdfb 100644
--- a/man/groupComparison.Rd
+++ b/man/groupComparison.Rd
@@ -45,8 +45,42 @@ a logfile named `MSstats_groupComparison_log_progress.log` is created to
 track progress. Only works for Linux & Mac OS. Default is 1.}
 }
 \value{
-list that consists of three elements: "ComparisonResult" - data.frame with results of statistical testing,
-"ModelQC" - data.frame with data used to fit models for group comparison and "FittedModel" - list of fitted models.
+A list with the following components:
+\describe{
+  \item{ComparisonResult}{A `data.frame` containing the results of the statistical testing for each protein. The columns include:
+    \describe{
+      \item{Protein}{The name of the protein for which the comparison is made.}
+      \item{Label}{The label of the comparison, typically derived from the `contrast.matrix`.}
+      \item{log2FC}{The log fold change between the conditions being compared. The base of the logarithm is specified by the `log_base` parameter.}
+      \item{SE}{The standard error of the log fold change estimate.}
+      \item{Tvalue}{The t-statistic value for the comparison.}
+      \item{DF}{The degrees of freedom associated with the t-statistic.}
+      \item{pvalue}{The p-value for the statistical test of the comparison.}
+      \item{adj.pvalue}{The adjusted p-value using the Benjamini-Hochberg method for controlling the false discovery rate.}
+      \item{issue}{Any issues encountered during the comparison. NA indicates no issues. "oneConditionMissing" occurs when data for one of the conditions being compared is entirely missing for a particular protein.}
+      \item{MissingPercentage}{The percentage of missing features for a given protein across all runs. This column is included only if missing values were imputed.}
+      \item{ImputationPercentage}{The percentage of features that were imputed for a given protein across all runs. This column is included only if missing values were imputed.}
+    }
+  }
+  \item{ModelQC}{A `data.frame` containing quality control data used to fit models for group comparison. The columns include:
+    \describe{
+      \item{RUN}{Identifier for the specific MS run.}
+      \item{Protein}{Identifier for the protein.}
+      \item{ABUNDANCE}{Summarized intensity for the protein in a given run.}
+      \item{originalRUN}{Original run identifier before any processing.}
+      \item{GROUP}{Experimental group identifier.}
+      \item{SUBJECT}{Subject identifier within the experimental group.}
+      \item{TotalGroupMeasurements}{Total number of feature measurements for the protein in the given group.}
+      \item{NumMeasuredFeatures}{Number of features measured for the protein in the given run.}
+      \item{MissingPercentage}{Percentage of missing feature values for the protein in the given run.}
+      \item{more50missing}{Logical indicator of whether more than 50 percent of the feature values are missing for the protein in the given run.}
+      \item{NumImputedFeature}{Number of features for which values were imputed due to missing or censored data for the protein in the given run.}
+      \item{residuals}{The differences between the observed values and the values predicted by the fitted model.}
+      \item{fitted}{The values predicted by the fitted model for a protein measurement in a given run of the dataset.}
+    }
+  }
+  \item{FittedModel}{A list of fitted models for each protein. This is included only if `save_fitted_models` is set to TRUE. Each element of the list corresponds to a protein and contains the fitted model object.}
+}
 }
 \description{
 Whole plot testing