Merge branch 'main' into cansavvy/gi_calc

FredHutch · Jul 1, 2024 · a2ca5d5 · a2ca5d5
2 parents 1c52383 + 6f837ee
commit a2ca5d5
Show file tree

Hide file tree

Showing 13 changed files with 409 additions and 64 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -5,10 +5,16 @@ Version: 0.1.0
 Authors@R: c(
     person("Candace", "Savonen", ,
     c("cansav09@gmail.com","csavonen@fredhutch.org"),
-    role = c("aut", "cre")),
+    role = c("aut", "cre")
+    ),
     person("Howard", "Baek", ,
     c("howardbaek@fredhutch.org","howardbaek.fh@gmail.com"),
-    role = c("aut"))
+    role = c("aut")
+    ), 
+    person("Kate", "Isaac", ,
+    c("kisaac@fredhutch.org"),
+    role = c("aut")
+    )
     )
 Description: This package allows a pipeline to be built for analyzing genetic interactions of paired guide CRISPR screens. It is based off of original research from the Alice Berger lab at Fred Hutchinson Cancer Center
 License: GPL-3
@@ -25,6 +31,8 @@ Imports:
     magrittr, 
     kableExtra, 
     pheatmap,
+    purrr,
+    janitor,
 Suggests:
     testthat (>= 3.0.0),
     roxygen2,

diff --git a/NAMESPACE b/NAMESPACE
@@ -10,8 +10,16 @@ export(run_qc)
 export(setup_data)
 import(ggplot2)
 import(kableExtra)
+importFrom(dplyr,across)
+importFrom(dplyr,if_any)
+importFrom(dplyr,mutate)
 importFrom(ggplot2,ggplot)
 importFrom(ggplot2,labs)
+importFrom(janitor,clean_names)
 importFrom(magrittr,"%>%")
 importFrom(pheatmap,pheatmap)
+importFrom(purrr,map)
+importFrom(purrr,reduce)
 importFrom(tidyr,pivot_longer)
+importFrom(tidyr,pivot_wider)
+importFrom(tidyr,unite)
diff --git a/R/00-setup_data.R b/R/00-setup_data.R
@@ -13,7 +13,7 @@
 #'   dplyr::select(c("Day00_RepA", "Day05_RepA", "Day22_RepA", "Day22_RepB", "Day22_RepC")) %>%
 #'   as.matrix()
 #'
-#' gimap_dataset <- setup_data(counts = example_counts_data)
+#' gimap_dataset <- setup_data(counts = example_counts)
 #'
 #' # You can see what an example dataset looks like by pulling the example gimap_dataset:
 #' gimap_dataset <- get_example_data("gimap")
@@ -22,7 +22,7 @@ setup_data <- function(counts = NULL,
                        pg_ids = NULL,
                        pg_metadata = NULL,
                        sample_metadata = NULL) {
-  new_data <- gimap_data <- list(
+  new_data <- list(
     raw_counts = NULL,
     counts_per_sample = NULL,
     transformed_data = list(

diff --git a/R/01-qc.R b/R/01-qc.R
@@ -4,6 +4,9 @@
 #' @param plots_dir default is `./qc_plots`; directory to save plots created with this function, if it doesn't exist already it will be created
 #' @param overwrite default is FALSE; whether to overwrite the QC Report file
 #' @param output_file default is `QC_Report`; name of the output QC report file
+#' @param filter_zerocount_target_col default is NULL; Which sample column(s) should be used to check for counts of 0? If NULL and not specified, downstream analysis will select all sample columns
+#' @param filter_plasmid_target_col default is NULL; Which sample columns(s) should be used to look at log2 CPM expression for plasmid pgRNA constructs? If NULL and not specified, downstream analysis will select the first sample column only
+#' @param filter_replicates_target_col default is NULL; Which sample columns are replicates whose variation you'd like to analyze; If NULL, the last 3 sample columns are used
 #' @param ... additional parameters are sent to `rmarkdown::render()`
 #' @returns a QC report saved locally
 #' @export
@@ -19,6 +22,9 @@ run_qc <- function(gimap_dataset,
                    output_file = "./gimap_QC_Report.Rmd",
                    plots_dir = "./qc_plots",
                    overwrite = FALSE,
+                   filter_zerocount_target_col = NULL,
+                   filter_plasmid_target_col = NULL,
+                   filter_replicates_target_col = NULL,
                    ...) {
   if (!("gimap_dataset" %in% class(gimap_dataset))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")
 
@@ -56,7 +62,10 @@ run_qc <- function(gimap_dataset,
   rmarkdown::render(output_file,
     params = list(
       dataset = gimap_dataset,
-      plots_dir = plots_dir
+      plots_dir = plots_dir,
+      filter_zerocount_target_col = filter_zerocount_target_col,
+      filter_plasmid_target_col = filter_zerocount_target_col,
+      filter_replicates_target_col = filter_zerocount_target_col
     ),
     ...
   )

diff --git a/R/02-filter.R b/R/02-filter.R
@@ -19,6 +19,8 @@
 #' # To see filtered data
 #' gimap_dataset$filtered_data
 #' }
+#'
+
 gimap_filter <- function(.data = NULL,
                          gimap_dataset,
                          filter_type = "both") {
@@ -30,3 +32,118 @@ gimap_filter <- function(.data = NULL,
 
   return(gimap_dataset)
 }
+
+# Possible filters
+
+#' Create a filter for pgRNAs which have a raw count of 0 for any sample/time point
+#' @description This function flags and reports which and how many pgRNAs have a raw count of 0 for any sample/time point
+#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the raw count data
+#' @param filter_zerocount_target_col default is NULL; Which sample column(s) should be used to check for counts of 0? If NULL and not specified, downstream analysis will select all sample columns
+#' @importFrom magrittr %>%
+#' @importFrom dplyr mutate
+#' @importFrom purrr reduce map
+#' @return a named list with the filter `filter` specifying which pgRNA have a count zero for at least one sample/time point and a report df `reportdf` for the number and percent of pgRNA which have a count zero for at least one sample/time point
+#' @examples \dontrun{
+#'   gimap_dataset <- get_example_data("gimap")
+#'   qc_filter_zerocounts(gimap_dataset)
+#'   
+#'   #or to specify a different column (or set of columns to select)
+#'   qc_filter_zerocount(gimap_dataset, filter_zerocount_target_col = c(1,2))
+#' }
+#'
+
+qc_filter_zerocounts <- function(gimap_dataset, filter_zerocount_target_col = NULL){
+
+  if (is.null(filter_zerocount_target_col)) {filter_zerocount_target_col <- c(1:ncol(gimap_dataset$raw_counts))}
+
+  if (!all(filter_zerocount_target_col %in% 1:ncol(gimap_dataset$raw_counts))) {
+    stop("The columns selected do not exist. `filter_zerocount_target_col` needs to correspond to the index of the columns in `gimap_dataset$raw_counts` that you need to filter by") 
+   }
+
+  counts_filter <- data.frame(gimap_dataset$raw_counts[,filter_zerocount_target_col]) %>% map(~.x %in% c(0)) %>% reduce(`|`)
+
+  zerocount_df <- data.frame("RawCount0" = c(FALSE, TRUE), n = c(sum(!counts_filter), sum(counts_filter))) %>%
+    mutate(percent = round(((n/sum(n))*100),2))
+
+  return(list(filter = counts_filter, reportdf = zerocount_df))
+
+}
+
+#' Create a filter for pgRNAs which have a low log2 CPM value for the plasmid/Day 0 sample/time point
+#' @description This function flags and reports which and how many pgRNAs have low log2 CPM values for the plasmid/Day 0 sample/time point. If more than one column is specified as the plasmid sample, 
+#' we pool all the replicate samples to find the lower outlier and flag constructs for which any plasmid replicate has a log2 CPM value below the cutoff
+#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the log2 CPM transformed data
+#' @param filter_plasmid_target_col default is NULL, and if NULL, will select the first column only; this parameter specifically should be used to specify the plasmid column(s) that will be selected
+#' @importFrom magrittr %>%
+#' @importFrom dplyr mutate across if_any
+#' @importFrom tidyr pivot_wider pivot_longer 
+#' @importFrom janitor clean_names
+#' @return a named list with the filter `plasmid_filter` specifying which pgRNAs have low plasmid log2 CPM (column of interest is `plasmid_cpm_filter`) and a report df `plasmid_filter_report` for the number and percent of pgRNA which have a low plasmid log2 CPM
+#' @examples \dontrun{
+#'   gimap_dataset <- get_example_data("gimap")
+#' 
+#'   qc_filter_plasmid(gimap_dataset)
+#'   
+#'   #or to specify a cutoff value to be used in the filter rather than the lower outlier default
+#'   qc_filter_plasmid(gimap_dataset, cutoff=2)
+#'   
+#'   #or to specify a different column (or set of columns to select)
+#'   qc_filter_plasmid(gimap_dataset, filter_plasmid_target_col = 1:2)
+#'
+#'   # or to specify a cutoff value that will be used in the filter rather than the lower outlier default as well as to specify a different column (or set of columns) to select
+#'   qc_filter_plasmid(gimap_dataset, cutoff=1.75, filter_plasmid_target_col=c(1,2))
+#' 
+#' }
+#'
+
+qc_filter_plasmid <- function(gimap_dataset, cutoff = NULL, filter_plasmid_target_col = NULL){
+
+  if (is.null(filter_plasmid_target_col)) {filter_plasmid_target_col <- c(1)}
+
+  if (!all(filter_plasmid_target_col %in% 1:ncol(gimap_dataset$transformed_data$log2_cpm))) {
+    stop("The columns selected do not exist. `filter_plasmid_target_col` needs to correspond to the index of the columns in `gimap_dataset$transformed_data$log2_cpm` that you need to filter by") 
+  }
+
+  plasmid_data <- data.frame(gimap_dataset$transformed_data$log2_cpm[, filter_plasmid_target_col]) %>% `colnames<-`(rep(c("plasmid_log2_cpm"), length(filter_plasmid_target_col))) %>% clean_names()
+
+  if (length(filter_plasmid_target_col >1)){ #if more than one column was selected, collapse all of the columns into the same vector using pivot_longer to store in a df with the name of the rep and number for row/construct
+    plasmid_data <- plasmid_data %>% mutate(construct = rownames(plasmid_data)) %>%
+      pivot_longer(starts_with("plasmid_log2_cpm"), 
+                   values_to = "plasmid_log2_cpm", 
+                   names_to = "rep")
+  }
+
+  if (is.null(cutoff)) {
+    # if cutoff is null, use lower outlier 
+    quantile_info <- quantile(plasmid_data$plasmid_log2_cpm)
+
+    cutoff <- quantile_info["25%"] - (1.5 * (quantile_info["75%"] - quantile_info["25%"])) #later step make a function for this in utils since it's used more than once
+  }
+
+  if (length(filter_plasmid_target_col >1)){ #if more than one column was selected, take collapsed/pooled data and compare it to the cutoff
+                                            #then pivot_wider so that the constructs are in the same row and we can use if_any to report if any of the replicates were flagged by the cutoff
+                                            #return just that summary column (reporting if any are TRUE) as the filter
+    plasmid_data <- plasmid_data %>% 
+      mutate(filterFlag = plasmid_log2_cpm < cutoff) %>%
+      pivot_wider(id_cols = construct, names_from = rep, values_from = filterFlag)
+    plasmid_cpm_filter <- plasmid_data %>% 
+      mutate(plasmid_cpm_filter=  if_any(.cols = starts_with('plasmid_log2_cpm'))) %>%
+      select(plasmid_cpm_filter)
+
+  } else {
+
+    plasmid_cpm_filter <- as.data.frame(plasmid_data$plasmid_log2_cpm < cutoff) %>%`colnames<-`("plasmid_cpm_filter")
+
+  }
+
+
+  plasmid_filter_df <- data.frame("Plasmid_log2cpmBelowCutoff" = c(FALSE, TRUE), n = c(sum(!plasmid_cpm_filter), sum(plasmid_cpm_filter))) %>%
+    mutate(percent = round(((n / sum(n)) * 100), 2)) #later step make a function for this in utils since it's used more than once
+
+  return(list(
+    plasmid_filter = plasmid_cpm_filter,
+    plasmid_filter_report = plasmid_filter_df
+  ))
+
+}
+