From 9124e3239503be7adc7b7516bed3301e8be8e34d Mon Sep 17 00:00:00 2001
From: Kate Isaac <41767733+kweav@users.noreply.github.com>
Date: Thu, 29 Aug 2024 16:42:30 -0400
Subject: [PATCH 1/4] add filtering

---
 R/04-normalize.R    | 33 ++++++++++++++++++++++++++-------
 R/06-calculate_gi.R |  2 +-
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/R/04-normalize.R b/R/04-normalize.R
index 1f09c05..a1374c4 100644
--- a/R/04-normalize.R
+++ b/R/04-normalize.R
@@ -9,6 +9,9 @@
 #' @param replicates Specifies the column name of the metadata set up in `$metadata$sample_metadata`
 #' that has a factor that represents column that specifies replicates. These replicates will be kept separate
 #' for the late but the early and plasmid others will be averaged and used for normalization.
+#' @param num_ids_wo_annot default is 20; the number of pgRNA IDs to display to console if they don't have corresponding annotation data;
+#' ff there are more IDs without annotation data than this number, the output will be sent to a file rather than the console.
+#' @param rm_ids_wo_annot default is TRUE; whether or not to filter out pgRNA IDs from the input dataset that don't have corresponding annotation data available
 #' @export
 #' @examples \dontrun{
 #'
@@ -31,7 +34,9 @@
 gimap_normalize <- function(.data = NULL,
                             gimap_dataset,
                             timepoints = NULL,
-                            replicates = NULL) {
+                            replicates = NULL,
+                            num_ids_wo_annot = 20,
+                            rm_ids_wo_annot = TRUE) {
   # Code adapted from
   # https://github.com/FredHutch/GI_mapping/blob/main/workflow/scripts/03-filter_and_calculate_LFC.Rmd
 
@@ -83,6 +88,7 @@ gimap_normalize <- function(.data = NULL,
     dataset <- gimap_dataset$transformed_data$log2_cpm
     pg_ids <- gimap_dataset$metadata$pg_ids
   }
+  
   # Doing some reshaping to get one handy data frame
   lfc_df <- dataset %>%
     as.data.frame() %>%
@@ -95,17 +101,30 @@ gimap_normalize <- function(.data = NULL,
                        names_from = c(timepoints, replicates))
 
   
-
+  missing_ids <- setdiff(lfc_df$pg_ids, gimap_dataset$annotation$pgRNA_id)
+  
+  if ((length(missing_ids) > 0) & (length(missing_ids) < num_ids_wo_annot)){
+    message("The following ", length(missing_ids), " IDs were not found in the annotation data: \n", paste0(missing_ids, collapse = ", "))
+  } else {
+    #output to a file
+  }
+  
+  if ((length(missing_ids) > 0) & (rm_ids_wo_annot == TRUE)){
+    lfc_df <- lfc_df %>%
+      filter(!pg_ids %in% missing_ids)
+    message("The input data for the IDs which were not found in the annotation data has been filtered out and will not be included in the analysis output.")
+  } else{
+    message("The input data for the IDs which were not found in the annotation data will be kept throughout the analysis, but any data from the annotation won't be available for them.")
+  }
+  
   # Calculate the means for each construct across the groups
   plasmid_df <- apply(dplyr::select(lfc_df, starts_with("plasmid")), 1, mean)
-
-
+  
   late_vs_plasmid_df <-  lfc_df %>%
     dplyr::mutate_at(dplyr::vars(dplyr::starts_with("late")), ~.x - plasmid_df) %>%
     dplyr::select(pg_ids, dplyr::starts_with("late"))  %>%
     dplyr::left_join(gimap_dataset$annotation, by = c("pg_ids" = "pgRNA_id"))
 
-
   ########################### Perform adjustments #############################
 
   ### Calculate medians
@@ -128,10 +147,10 @@ gimap_normalize <- function(.data = NULL,
       # Then, divide by the median of negative controls (double non-targeting) minus
       # median of positive controls (targeting 1 essential gene).
       # This will effectively set the median of the positive controls (essential genes) to -1.
-      lfc_adj = lfc_adj1 / (median(lfc_adj1[norm_ctrl_flag == "negative_control"]) - median(lfc_adj1[norm_ctrl_flag == "positive_control"]))
+      lfc_adj = lfc_adj1 / (median(lfc_adj1[norm_ctrl_flag == "negative_control"], na.rm = TRUE) - median(lfc_adj1[norm_ctrl_flag == "positive_control"]))
     ) %>%
     ungroup()
-
+  
   # Save this at the construct level
   gimap_dataset$normalized_log_fc <- lfc_df_adj
 
diff --git a/R/06-calculate_gi.R b/R/06-calculate_gi.R
index 209f55c..563e8a0 100644
--- a/R/06-calculate_gi.R
+++ b/R/06-calculate_gi.R
@@ -30,7 +30,6 @@ calc_gi <- function(.data = NULL,
 
   if (!("gimap_dataset" %in% class(gimap_dataset))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")
 
-
   ## calculate expected double-targeting GI score by summing the two mean single-targeting
   ## CRISPR scores for that paralog pair
   gi_calc_df <- gimap_dataset$crispr_score %>%
@@ -39,6 +38,7 @@ calc_gi <- function(.data = NULL,
       target_type == "gene_ctrl" ~ mean_single_target_crispr_1 + mean_double_control_crispr_2,
       target_type == "ctrl_gene" ~ mean_double_control_crispr_1 + mean_single_target_crispr_2
     ))
+  
   # Calculating mean crisprs
   overall_results <- gi_calc_df %>%
     dplyr::group_by(rep, pgRNA_target_double) %>%

From 89dd88fc7d0bb385bc2dd9a772a58de453dedfc2 Mon Sep 17 00:00:00 2001
From: Kate Isaac <41767733+kweav@users.noreply.github.com>
Date: Mon, 23 Sep 2024 15:52:32 -0400
Subject: [PATCH 2/4] write to file

---
 R/03-annotate.R  |  7 +------
 R/04-normalize.R | 17 +++++++++--------
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/R/03-annotate.R b/R/03-annotate.R
index e6fc489..0331b7f 100644
--- a/R/03-annotate.R
+++ b/R/03-annotate.R
@@ -134,12 +134,7 @@ gimap_annotate <- function(.data = NULL,
       )
     )
 
-  if (gimap_dataset$filtered_data$filter_step_run) {
-    keep_for_annotdf <- annotation_df$pgRNA_id %in% unlist(gimap_dataset$filtered_data$metadata_pg_ids)
-    annotation_df <- annotation_df[keep_for_annotdf, ]
-  }
-
-  ################################ STORE IT ####################################
+################################ STORE IT ####################################
 
   if (gimap_dataset$filtered_data$filter_step_run) {
     keep_for_annotdf <- annotation_df$pgRNA_id %in% unlist(gimap_dataset$filtered_data$metadata_pg_ids)
diff --git a/R/04-normalize.R b/R/04-normalize.R
index a1374c4..d78d24b 100644
--- a/R/04-normalize.R
+++ b/R/04-normalize.R
@@ -88,7 +88,7 @@ gimap_normalize <- function(.data = NULL,
     dataset <- gimap_dataset$transformed_data$log2_cpm
     pg_ids <- gimap_dataset$metadata$pg_ids
   }
-  
+
   # Doing some reshaping to get one handy data frame
   lfc_df <- dataset %>%
     as.data.frame() %>%
@@ -100,15 +100,16 @@ gimap_normalize <- function(.data = NULL,
     tidyr::pivot_wider(values_from = "log2_cpm",
                        names_from = c(timepoints, replicates))
 
-  
+
   missing_ids <- setdiff(lfc_df$pg_ids, gimap_dataset$annotation$pgRNA_id)
-  
+
   if ((length(missing_ids) > 0) & (length(missing_ids) < num_ids_wo_annot)){
     message("The following ", length(missing_ids), " IDs were not found in the annotation data: \n", paste0(missing_ids, collapse = ", "))
   } else {
-    #output to a file
+    missing_ids_file <- file.path("missing_ids_file.csv")
+    readr::write_csv(missing_ids, missing_ids_file)
   }
-  
+
   if ((length(missing_ids) > 0) & (rm_ids_wo_annot == TRUE)){
     lfc_df <- lfc_df %>%
       filter(!pg_ids %in% missing_ids)
@@ -116,10 +117,10 @@ gimap_normalize <- function(.data = NULL,
   } else{
     message("The input data for the IDs which were not found in the annotation data will be kept throughout the analysis, but any data from the annotation won't be available for them.")
   }
-  
+
   # Calculate the means for each construct across the groups
   plasmid_df <- apply(dplyr::select(lfc_df, starts_with("plasmid")), 1, mean)
-  
+
   late_vs_plasmid_df <-  lfc_df %>%
     dplyr::mutate_at(dplyr::vars(dplyr::starts_with("late")), ~.x - plasmid_df) %>%
     dplyr::select(pg_ids, dplyr::starts_with("late"))  %>%
@@ -150,7 +151,7 @@ gimap_normalize <- function(.data = NULL,
       lfc_adj = lfc_adj1 / (median(lfc_adj1[norm_ctrl_flag == "negative_control"], na.rm = TRUE) - median(lfc_adj1[norm_ctrl_flag == "positive_control"]))
     ) %>%
     ungroup()
-  
+
   # Save this at the construct level
   gimap_dataset$normalized_log_fc <- lfc_df_adj
 

From f25c407eabab989df9f844deb51cbbc462e19991 Mon Sep 17 00:00:00 2001
From: Kate Isaac <41767733+kweav@users.noreply.github.com>
Date: Mon, 23 Sep 2024 15:54:47 -0400
Subject: [PATCH 3/4] from devtools document

---
 NAMESPACE              |  7 +++++++
 man/calc_crispr.Rd     |  2 +-
 man/calc_gi.Rd         |  2 +-
 man/gimap_annotate.Rd  |  6 +++---
 man/gimap_normalize.Rd | 11 +++++++++--
 man/setup_data.Rd      |  9 +--------
 6 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index a74fd5d..5e97e8e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -7,8 +7,15 @@ export(get_example_data)
 export(gimap_annotate)
 export(gimap_filter)
 export(gimap_normalize)
+export(qc_cdf)
+export(qc_constructs_countzero_bar)
+export(qc_cor_heatmap)
+export(qc_plasmid_histogram)
+export(qc_sample_hist)
+export(qc_variance_hist)
 export(run_qc)
 export(setup_data)
+export(supported_cell_lines)
 import(dplyr)
 import(ggplot2)
 import(kableExtra)
diff --git a/man/calc_crispr.Rd b/man/calc_crispr.Rd
index 11e8650..41e4c5d 100644
--- a/man/calc_crispr.Rd
+++ b/man/calc_crispr.Rd
@@ -26,7 +26,7 @@ run_qc(gimap_dataset)
 
 gimap_dataset <- gimap_dataset \%>\%
   gimap_filter() \%>\%
-  gimap_annotate() \%>\%
+  gimap_annotate(cell_line = "HELA") \%>\%
   gimap_normalize(
     timepoints = "day",
     replicates = "rep"
diff --git a/man/calc_gi.Rd b/man/calc_gi.Rd
index bdf6809..0804afd 100644
--- a/man/calc_gi.Rd
+++ b/man/calc_gi.Rd
@@ -23,7 +23,7 @@ Create results table that has CRISPR scores, Wilcoxon rank-sum test and t tests.
 
   gimap_dataset <- gimap_dataset \%>\%
     gimap_filter() \%>\%
-    gimap_annotate() \%>\%
+    gimap_annotate(cell_line = "HELA") \%>\%
     gimap_normalize(
       timepoints = "day",
       replicates = "rep"
diff --git a/man/gimap_annotate.Rd b/man/gimap_annotate.Rd
index 828d508..6c5eff1 100644
--- a/man/gimap_annotate.Rd
+++ b/man/gimap_annotate.Rd
@@ -7,7 +7,7 @@
 gimap_annotate(
   .data = NULL,
   gimap_dataset,
-  cell_line = "HELA",
+  cell_line,
   control_genes = NULL,
   cn_annotate = TRUE,
   annotation_file = NULL
@@ -18,7 +18,7 @@ gimap_annotate(
 
 \item{gimap_dataset}{A special dataset structure that is setup using the `setup_data()` function.}
 
-\item{cell_line}{which cell line are you using? Default is "HELA"}
+\item{cell_line}{which cell line are you using? (e.g., HELA, PC9, etc.). Required argument}
 
 \item{control_genes}{A vector of gene symbols (e.g. AAMP) that should be labeled as control genes. These will be used for log fold change calculations. If no list is given then DepMap Public 23Q4 Achilles_common_essentials.csv is used https://depmap.org/portal/download/all/}
 
@@ -39,7 +39,7 @@ run_qc(gimap_dataset)
 
 gimap_dataset <- gimap_dataset \%>\%
   gimap_filter() \%>\%
-  gimap_annotate()
+  gimap_annotate(cell_line = "HELA")
 
 # To see anotations
 gimap_dataset$annotation
diff --git a/man/gimap_normalize.Rd b/man/gimap_normalize.Rd
index 571a89a..e768283 100644
--- a/man/gimap_normalize.Rd
+++ b/man/gimap_normalize.Rd
@@ -8,7 +8,9 @@ gimap_normalize(
   .data = NULL,
   gimap_dataset,
   timepoints = NULL,
-  replicates = NULL
+  replicates = NULL,
+  num_ids_wo_annot = 20,
+  rm_ids_wo_annot = TRUE
 )
 }
 \arguments{
@@ -24,6 +26,11 @@ The late timepoints will be the focus for the calculations. The column used for
 \item{replicates}{Specifies the column name of the metadata set up in `$metadata$sample_metadata`
 that has a factor that represents column that specifies replicates. These replicates will be kept separate
 for the late but the early and plasmid others will be averaged and used for normalization.}
+
+\item{num_ids_wo_annot}{default is 20; the number of pgRNA IDs to display to console if they don't have corresponding annotation data;
+ff there are more IDs without annotation data than this number, the output will be sent to a file rather than the console.}
+
+\item{rm_ids_wo_annot}{default is TRUE; whether or not to filter out pgRNA IDs from the input dataset that don't have corresponding annotation data available}
 }
 \description{
 This calculates the log fold change for a gimap dataset based on the annotation and metadata provided.
@@ -38,7 +45,7 @@ run_qc(gimap_dataset)
 
 gimap_dataset <- gimap_dataset \%>\%
   gimap_filter() \%>\%
-  gimap_annotate() \%>\%
+  gimap_annotate(cell_line = "HELA") \%>\%
   gimap_normalize(
     timepoints = "day",
     replicates = "rep"
diff --git a/man/setup_data.Rd b/man/setup_data.Rd
index b154841..4188492 100644
--- a/man/setup_data.Rd
+++ b/man/setup_data.Rd
@@ -4,20 +4,13 @@
 \alias{setup_data}
 \title{Making a new gimap dataset}
 \usage{
-setup_data(
-  counts = NULL,
-  pg_ids = NULL,
-  pg_metadata = NULL,
-  sample_metadata = NULL
-)
+setup_data(counts = NULL, pg_ids = NULL, sample_metadata = NULL)
 }
 \arguments{
 \item{counts}{a matrix of data that contains the counts where rows are each paired_guide target and columns are each sample}
 
 \item{pg_ids}{the pgRNA IDs: metadata associated with the pgRNA constructs that correspond to the rows of the counts data}
 
-\item{pg_metadata}{construct metadata}
-
 \item{sample_metadata}{metadata associated with the samples of the dataset that correspond to the columns of the counts data.
 Should include a column that has replicate information as well as a column that contains timepoint information respectively (this will be used for log fold calculations). These columns should be factors.}
 }

From fa738ba46e19540a4d2f3116291e53b349734486 Mon Sep 17 00:00:00 2001
From: Candace Savonen <cansav09@gmail.com>
Date: Mon, 30 Sep 2024 13:48:15 -0400
Subject: [PATCH 4/4] Fix a minor issue with data writing

---
 R/04-normalize.R | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/R/04-normalize.R b/R/04-normalize.R
index d78d24b..d544cb0 100644
--- a/R/04-normalize.R
+++ b/R/04-normalize.R
@@ -101,16 +101,18 @@ gimap_normalize <- function(.data = NULL,
                        names_from = c(timepoints, replicates))
 
 
-  missing_ids <- setdiff(lfc_df$pg_ids, gimap_dataset$annotation$pgRNA_id)
+  missing_ids <- data.frame(
+    missing_ids = setdiff(lfc_df$pg_ids, gimap_dataset$annotation$pgRNA_id)
+  )
 
-  if ((length(missing_ids) > 0) & (length(missing_ids) < num_ids_wo_annot)){
-    message("The following ", length(missing_ids), " IDs were not found in the annotation data: \n", paste0(missing_ids, collapse = ", "))
+  if ((nrow(missing_ids) > 0) & (nrow(missing_ids) < num_ids_wo_annot)){
+    message("The following ", nrow(missing_ids), " IDs were not found in the annotation data: \n", paste0(missing_ids, collapse = ", "))
   } else {
     missing_ids_file <- file.path("missing_ids_file.csv")
     readr::write_csv(missing_ids, missing_ids_file)
   }
 
-  if ((length(missing_ids) > 0) & (rm_ids_wo_annot == TRUE)){
+  if ((nrow(missing_ids) > 0) & (rm_ids_wo_annot == TRUE)){
     lfc_df <- lfc_df %>%
       filter(!pg_ids %in% missing_ids)
     message("The input data for the IDs which were not found in the annotation data has been filtered out and will not be included in the analysis output.")