Skip to content

Commit

Permalink
Merge branch 'main' into cansavvy/gi_calc
Browse files Browse the repository at this point in the history
  • Loading branch information
cansavvy authored Jul 1, 2024
2 parents 1c52383 + 6f837ee commit a2ca5d5
Show file tree
Hide file tree
Showing 13 changed files with 409 additions and 64 deletions.
12 changes: 10 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,16 @@ Version: 0.1.0
Authors@R: c(
person("Candace", "Savonen", ,
c("cansav09@gmail.com","csavonen@fredhutch.org"),
role = c("aut", "cre")),
role = c("aut", "cre")
),
person("Howard", "Baek", ,
c("howardbaek@fredhutch.org","howardbaek.fh@gmail.com"),
role = c("aut"))
role = c("aut")
),
person("Kate", "Isaac", ,
c("kisaac@fredhutch.org"),
role = c("aut")
)
)
Description: This package allows a pipeline to be built for analyzing genetic interactions of paired guide CRISPR screens. It is based off of original research from the Alice Berger lab at Fred Hutchinson Cancer Center
License: GPL-3
Expand All @@ -25,6 +31,8 @@ Imports:
magrittr,
kableExtra,
pheatmap,
purrr,
janitor,
Suggests:
testthat (>= 3.0.0),
roxygen2,
Expand Down
8 changes: 8 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,16 @@ export(run_qc)
export(setup_data)
import(ggplot2)
import(kableExtra)
importFrom(dplyr,across)
importFrom(dplyr,if_any)
importFrom(dplyr,mutate)
importFrom(ggplot2,ggplot)
importFrom(ggplot2,labs)
importFrom(janitor,clean_names)
importFrom(magrittr,"%>%")
importFrom(pheatmap,pheatmap)
importFrom(purrr,map)
importFrom(purrr,reduce)
importFrom(tidyr,pivot_longer)
importFrom(tidyr,pivot_wider)
importFrom(tidyr,unite)
4 changes: 2 additions & 2 deletions R/00-setup_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#' dplyr::select(c("Day00_RepA", "Day05_RepA", "Day22_RepA", "Day22_RepB", "Day22_RepC")) %>%
#' as.matrix()
#'
#' gimap_dataset <- setup_data(counts = example_counts_data)
#' gimap_dataset <- setup_data(counts = example_counts)
#'
#' # You can see what an example dataset looks like by pulling the example gimap_dataset:
#' gimap_dataset <- get_example_data("gimap")
Expand All @@ -22,7 +22,7 @@ setup_data <- function(counts = NULL,
pg_ids = NULL,
pg_metadata = NULL,
sample_metadata = NULL) {
new_data <- gimap_data <- list(
new_data <- list(
raw_counts = NULL,
counts_per_sample = NULL,
transformed_data = list(
Expand Down
11 changes: 10 additions & 1 deletion R/01-qc.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
#' @param plots_dir default is `./qc_plots`; directory to save plots created with this function, if it doesn't exist already it will be created
#' @param overwrite default is FALSE; whether to overwrite the QC Report file
#' @param output_file default is `QC_Report`; name of the output QC report file
#' @param filter_zerocount_target_col default is NULL; Which sample column(s) should be used to check for counts of 0? If NULL and not specified, downstream analysis will select all sample columns
#' @param filter_plasmid_target_col default is NULL; Which sample columns(s) should be used to look at log2 CPM expression for plasmid pgRNA constructs? If NULL and not specified, downstream analysis will select the first sample column only
#' @param filter_replicates_target_col default is NULL; Which sample columns are replicates whose variation you'd like to analyze; If NULL, the last 3 sample columns are used
#' @param ... additional parameters are sent to `rmarkdown::render()`
#' @returns a QC report saved locally
#' @export
Expand All @@ -19,6 +22,9 @@ run_qc <- function(gimap_dataset,
output_file = "./gimap_QC_Report.Rmd",
plots_dir = "./qc_plots",
overwrite = FALSE,
filter_zerocount_target_col = NULL,
filter_plasmid_target_col = NULL,
filter_replicates_target_col = NULL,
...) {
if (!("gimap_dataset" %in% class(gimap_dataset))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")

Expand Down Expand Up @@ -56,7 +62,10 @@ run_qc <- function(gimap_dataset,
rmarkdown::render(output_file,
params = list(
dataset = gimap_dataset,
plots_dir = plots_dir
plots_dir = plots_dir,
filter_zerocount_target_col = filter_zerocount_target_col,
filter_plasmid_target_col = filter_zerocount_target_col,
filter_replicates_target_col = filter_zerocount_target_col
),
...
)
Expand Down
117 changes: 117 additions & 0 deletions R/02-filter.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#' # To see filtered data
#' gimap_dataset$filtered_data
#' }
#'

gimap_filter <- function(.data = NULL,
gimap_dataset,
filter_type = "both") {
Expand All @@ -30,3 +32,118 @@ gimap_filter <- function(.data = NULL,

return(gimap_dataset)
}

# Possible filters

#' Create a filter for pgRNAs which have a raw count of 0 for any sample/time point
#' @description This function flags and reports which and how many pgRNAs have a raw count of 0 for any sample/time point
#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the raw count data
#' @param filter_zerocount_target_col default is NULL; Which sample column(s) should be used to check for counts of 0? If NULL and not specified, downstream analysis will select all sample columns
#' @importFrom magrittr %>%
#' @importFrom dplyr mutate
#' @importFrom purrr reduce map
#' @return a named list with the filter `filter` specifying which pgRNA have a count zero for at least one sample/time point and a report df `reportdf` for the number and percent of pgRNA which have a count zero for at least one sample/time point
#' @examples \dontrun{
#' gimap_dataset <- get_example_data("gimap")
#' qc_filter_zerocounts(gimap_dataset)
#'
#' #or to specify a different column (or set of columns to select)
#' qc_filter_zerocount(gimap_dataset, filter_zerocount_target_col = c(1,2))
#' }
#'

qc_filter_zerocounts <- function(gimap_dataset, filter_zerocount_target_col = NULL){

if (is.null(filter_zerocount_target_col)) {filter_zerocount_target_col <- c(1:ncol(gimap_dataset$raw_counts))}

if (!all(filter_zerocount_target_col %in% 1:ncol(gimap_dataset$raw_counts))) {
stop("The columns selected do not exist. `filter_zerocount_target_col` needs to correspond to the index of the columns in `gimap_dataset$raw_counts` that you need to filter by")
}

counts_filter <- data.frame(gimap_dataset$raw_counts[,filter_zerocount_target_col]) %>% map(~.x %in% c(0)) %>% reduce(`|`)

zerocount_df <- data.frame("RawCount0" = c(FALSE, TRUE), n = c(sum(!counts_filter), sum(counts_filter))) %>%
mutate(percent = round(((n/sum(n))*100),2))

return(list(filter = counts_filter, reportdf = zerocount_df))

}

#' Create a filter for pgRNAs which have a low log2 CPM value for the plasmid/Day 0 sample/time point
#' @description This function flags and reports which and how many pgRNAs have low log2 CPM values for the plasmid/Day 0 sample/time point. If more than one column is specified as the plasmid sample,
#' we pool all the replicate samples to find the lower outlier and flag constructs for which any plasmid replicate has a log2 CPM value below the cutoff
#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the log2 CPM transformed data
#' @param filter_plasmid_target_col default is NULL, and if NULL, will select the first column only; this parameter specifically should be used to specify the plasmid column(s) that will be selected
#' @importFrom magrittr %>%
#' @importFrom dplyr mutate across if_any
#' @importFrom tidyr pivot_wider pivot_longer
#' @importFrom janitor clean_names
#' @return a named list with the filter `plasmid_filter` specifying which pgRNAs have low plasmid log2 CPM (column of interest is `plasmid_cpm_filter`) and a report df `plasmid_filter_report` for the number and percent of pgRNA which have a low plasmid log2 CPM
#' @examples \dontrun{
#' gimap_dataset <- get_example_data("gimap")
#'
#' qc_filter_plasmid(gimap_dataset)
#'
#' #or to specify a cutoff value to be used in the filter rather than the lower outlier default
#' qc_filter_plasmid(gimap_dataset, cutoff=2)
#'
#' #or to specify a different column (or set of columns to select)
#' qc_filter_plasmid(gimap_dataset, filter_plasmid_target_col = 1:2)
#'
#' # or to specify a cutoff value that will be used in the filter rather than the lower outlier default as well as to specify a different column (or set of columns) to select
#' qc_filter_plasmid(gimap_dataset, cutoff=1.75, filter_plasmid_target_col=c(1,2))
#'
#' }
#'

qc_filter_plasmid <- function(gimap_dataset, cutoff = NULL, filter_plasmid_target_col = NULL){

if (is.null(filter_plasmid_target_col)) {filter_plasmid_target_col <- c(1)}

if (!all(filter_plasmid_target_col %in% 1:ncol(gimap_dataset$transformed_data$log2_cpm))) {
stop("The columns selected do not exist. `filter_plasmid_target_col` needs to correspond to the index of the columns in `gimap_dataset$transformed_data$log2_cpm` that you need to filter by")
}

plasmid_data <- data.frame(gimap_dataset$transformed_data$log2_cpm[, filter_plasmid_target_col]) %>% `colnames<-`(rep(c("plasmid_log2_cpm"), length(filter_plasmid_target_col))) %>% clean_names()

if (length(filter_plasmid_target_col >1)){ #if more than one column was selected, collapse all of the columns into the same vector using pivot_longer to store in a df with the name of the rep and number for row/construct
plasmid_data <- plasmid_data %>% mutate(construct = rownames(plasmid_data)) %>%
pivot_longer(starts_with("plasmid_log2_cpm"),
values_to = "plasmid_log2_cpm",
names_to = "rep")
}

if (is.null(cutoff)) {
# if cutoff is null, use lower outlier
quantile_info <- quantile(plasmid_data$plasmid_log2_cpm)

cutoff <- quantile_info["25%"] - (1.5 * (quantile_info["75%"] - quantile_info["25%"])) #later step make a function for this in utils since it's used more than once
}

if (length(filter_plasmid_target_col >1)){ #if more than one column was selected, take collapsed/pooled data and compare it to the cutoff
#then pivot_wider so that the constructs are in the same row and we can use if_any to report if any of the replicates were flagged by the cutoff
#return just that summary column (reporting if any are TRUE) as the filter
plasmid_data <- plasmid_data %>%
mutate(filterFlag = plasmid_log2_cpm < cutoff) %>%
pivot_wider(id_cols = construct, names_from = rep, values_from = filterFlag)
plasmid_cpm_filter <- plasmid_data %>%
mutate(plasmid_cpm_filter= if_any(.cols = starts_with('plasmid_log2_cpm'))) %>%
select(plasmid_cpm_filter)

} else {

plasmid_cpm_filter <- as.data.frame(plasmid_data$plasmid_log2_cpm < cutoff) %>%`colnames<-`("plasmid_cpm_filter")

}


plasmid_filter_df <- data.frame("Plasmid_log2cpmBelowCutoff" = c(FALSE, TRUE), n = c(sum(!plasmid_cpm_filter), sum(plasmid_cpm_filter))) %>%
mutate(percent = round(((n / sum(n)) * 100), 2)) #later step make a function for this in utils since it's used more than once

return(list(
plasmid_filter = plasmid_cpm_filter,
plasmid_filter_report = plasmid_filter_df
))

}

Loading

0 comments on commit a2ca5d5

Please sign in to comment.