diff --git a/DESCRIPTION b/DESCRIPTION index 65ae402..b28b5ba 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -5,10 +5,16 @@ Version: 0.1.0 Authors@R: c( person("Candace", "Savonen", , c("cansav09@gmail.com","csavonen@fredhutch.org"), - role = c("aut", "cre")), + role = c("aut", "cre") + ), person("Howard", "Baek", , c("howardbaek@fredhutch.org","howardbaek.fh@gmail.com"), - role = c("aut")) + role = c("aut") + ), + person("Kate", "Isaac", , + c("kisaac@fredhutch.org"), + role = c("aut") + ) ) Description: This package allows a pipeline to be built for analyzing genetic interactions of paired guide CRISPR screens. It is based off of original research from the Alice Berger lab at Fred Hutchinson Cancer Center License: GPL-3 @@ -25,6 +31,7 @@ Imports: magrittr, kableExtra, pheatmap, + purrr, Suggests: testthat (>= 3.0.0), roxygen2, diff --git a/NAMESPACE b/NAMESPACE index 7ff14c7..5afa03e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,8 +10,12 @@ export(run_qc) export(setup_data) import(ggplot2) import(kableExtra) +importFrom(dplyr,mutate) importFrom(ggplot2,ggplot) importFrom(ggplot2,labs) importFrom(magrittr,"%>%") importFrom(pheatmap,pheatmap) +importFrom(purrr,map) +importFrom(purrr,reduce) importFrom(tidyr,pivot_longer) +importFrom(tidyr,unite) diff --git a/R/02-filter.R b/R/02-filter.R index 68109d4..7ab4ecf 100644 --- a/R/02-filter.R +++ b/R/02-filter.R @@ -2,7 +2,7 @@ #' @description This is a function here's where we describe what it does #' @param .data Data can be piped in with %>% or |> from function to function. But the data must still be a gimap_dataset #' @param gimap_dataset A special dataset structure that is setup using the `setup_data()` function. -#' @param filter_type Can be one of the following: `zero_count_only`, `low_plasmid_cpm_only` or `rep_variation`, `zero_in_last_time_point` or a vector that includes multiple of these filters. +#' @param filter_type Can be one of the following: `zero_count_only`, `low_plasmid_cpm_only` or `rep_variation`, `zero_in_last_time_point` or a vector that includes multiple of these filters. #' You should decide on the appropriate filter based on the results of your QC report. #' @returns a filtered version of the gimap_dataset returned in the $filtered_data section #' @export @@ -20,6 +20,8 @@ #' gimap_dataset$filtered_data #' #' } +#' + gimap_filter <- function(.data = NULL, gimap_dataset, filter_type = "both") { @@ -32,3 +34,28 @@ gimap_filter <- function(.data = NULL, return(gimap_dataset) } + +# Possible filters + +#' Create a filter for pgRNAs which have a raw count of 0 for any sample/time point +#' @description This function flags and reports which and how many pgRNAs have a raw count of 0 for any sample/time point +#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data +#' @importFrom magrittr %>% +#' @importFrom dplyr mutate +#' @importFrom purrr reduce map +#' @return a named list with the filter `filter` specifying which pgRNA have a count zero for at least one sample/time point and a report df `reportdf` for the number and percent of pgRNA which have a count zero for at least one sample/time point +#' @examples \dontrun{ +#' +#' } +#' + +qc_filter_zerocounts <- function(gimap_dataset){ + + counts_filter <- data.frame(gimap_dataset$raw_counts) %>% map(~.x %in% c(0)) %>% reduce(`|`) + + zerocount_df <- data.frame("RawCount0" = c(FALSE, TRUE), n = c(sum(!counts_filter), sum(counts_filter))) %>% + mutate(percent = round(((n/sum(n))*100),2)) + + return(list(filter = counts_filter, reportdf = zerocount_df)) + +} diff --git a/R/plots-qc.R b/R/plots-qc.R index 9f774c6..709fafd 100644 --- a/R/plots-qc.R +++ b/R/plots-qc.R @@ -3,7 +3,7 @@ #' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data #' @param qc_obj The object that has the qc stuff stored #' @param wide_ar aspect ratio, default is 0.75 -#' @importFrom tidyr pivot_longer +#' @importFrom tidyr pivot_longer unite #' @importFrom ggplot2 ggplot labs #' @return counts_cdf a ggplot #' @examples \dontrun{ @@ -64,6 +64,80 @@ qc_sample_hist <- function(gimap_dataset, wide_ar = 0.75) { return(sample_cpm_histogram) } +#' Create a histogram for the variance within replicates for each pgRNA +#' @description This function uses pivot_longer to rearrange the data for plotting, finds the variance for each pgRNA construct (using row number as a proxy) and then plots a histogram of these variances +#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data +#' @param wide_ar aspect ratio, default is 0.75 +#' @importFrom tidyr pivot_longer +#' @importFrom magrittr %>% +#' @import ggplot2 +#' @return a ggplot histogram +#' @examples \dontrun{ +#' gimap_dataset <- get_example_data("gimap") +#' qc_variance_hist(gimap_dataset) +#' } +#' + +qc_variance_hist <- function(gimap_dataset, wide_ar = 0.75){ + + return( + gimap_dataset$transformed_data$log2_cpm[,3:5] %>% + as.data.frame() %>% + mutate(row = row_number()) %>% + tidyr::pivot_longer(-row) %>% + group_by(row) %>% + summarize(var = var(value)) %>% + ggplot(aes(x=var)) + + geom_histogram(binwidth = 0.1) + + theme(panel.background = element_blank(), + panel.grid = element_blank(), + aspect.ratio = wide_ar) + + xlab("variance") + + ylab("pgRNA construct count") + ) + +} + +#' Create a bar graph that shows the number of replicates with a zero count for pgRNA constructs flagged by the zero count filter +#' @description A short description... +#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data +#' @param wide_ar aspect ratio, default is 0.75 +#' @importFrom tidyr pivot_longer +#' @importFrom magrittr %>% +#' @import ggplot2 +#' @return a ggplot barplot +#' @examples \dontrun{ +#' gimap_dataset <- get_example_data("gimap") +#' qc_constructs_countzero_bar(gimap_dataset) +#' } +#' + +qc_constructs_countzero_bar <- function(gimap_dataset, wide_ar = 0.75){ + + qc_filter_output <- qc_filter_zerocounts(gimap_dataset) + + return( + example_counts[qc_filter_output$filter, c(3:5)] %>% + as.data.frame() %>% + mutate(row = row_number()) %>% + tidyr::pivot_longer(tidyr::unite(gimap_dataset$metadata$sample_metadata[c(3:5), c("day", "rep")], "colName")$colName, + values_to = "counts") %>% + group_by(row) %>% + summarize(numzero = sum(counts == 0), + max_diff = max(counts) - min(counts), + sec_diff = min(counts[counts > 0]) - min(counts) + ) %>% + group_by(numzero) %>% + summarize(count = n()) %>% + ggplot(aes(x=numzero, y = count)) + + geom_bar(stat = "identity") + + theme_classic() + + ylab("Number of pgRNAs") + + xlab("Number of replicates with a zero") + + geom_text(aes(label = count, group = numzero), vjust = -0.5, size=2) + ) +} + #' Create a correlation heatmap for the pgRNA CPMs #' @description This function uses the `cor` function to find correlations between the sample CPM's and then plots a heatmap of these #' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data @@ -75,7 +149,7 @@ qc_sample_hist <- function(gimap_dataset, wide_ar = 0.75) { #' #' } #' -qc_cor_heatmap <- function(gimap_dataset, ...) { +qc_cor_heatmap <- function(gimap_dataset) { cpm_cor <- gimap_dataset$transformed_data$cpm %>% cor() %>% round(2) %>% @@ -87,8 +161,7 @@ qc_cor_heatmap <- function(gimap_dataset, ...) { cellwidth = 20, cellheight = 20, treeheight_row = 20, - treeheight_col = 20, - ... + treeheight_col = 20 ) return(sample_cor_heatmap) diff --git a/inst/rmd/gimapQCTemplate.Rmd b/inst/rmd/gimapQCTemplate.Rmd index 16ec49f..873b7a8 100644 --- a/inst/rmd/gimapQCTemplate.Rmd +++ b/inst/rmd/gimapQCTemplate.Rmd @@ -53,6 +53,21 @@ qc_sample_hist(gimap_dataset) qc_cor_heatmap(gimap_dataset) ``` +## Variance within replicates + +```{r} +qc_variance_hist(gimap_dataset) +``` + +# Applying potential filters + +## Replicates with a pgRNA count of 0 + +```{r} +qc_constructs_countzero_bar(gimap_dataset) +``` + +# Session Info ```{r} sessionInfo() diff --git a/man/gimap_filter.Rd b/man/gimap_filter.Rd index be927e7..8ffea49 100644 --- a/man/gimap_filter.Rd +++ b/man/gimap_filter.Rd @@ -11,7 +11,7 @@ gimap_filter(.data = NULL, gimap_dataset, filter_type = "both") \item{gimap_dataset}{A special dataset structure that is setup using the `setup_data()` function.} -\item{filter_type}{Can be one of the following: `zero_count_only`, `low_plasmid_cpm_only` or `rep_variation`, `zero_in_last_time_point` or a vector that includes multiple of these filters. +\item{filter_type}{Can be one of the following: `zero_count_only`, `low_plasmid_cpm_only` or `rep_variation`, `zero_in_last_time_point` or a vector that includes multiple of these filters. You should decide on the appropriate filter based on the results of your QC report.} } \value{ @@ -35,4 +35,5 @@ gimap_dataset <- gimap_filter(gimap_dataset) gimap_dataset$filtered_data } + } diff --git a/man/qc_constructs_countzero_bar.Rd b/man/qc_constructs_countzero_bar.Rd new file mode 100644 index 0000000..968655f --- /dev/null +++ b/man/qc_constructs_countzero_bar.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plots-qc.R +\name{qc_constructs_countzero_bar} +\alias{qc_constructs_countzero_bar} +\title{Create a bar graph that shows the number of replicates with a zero count for pgRNA constructs flagged by the zero count filter} +\usage{ +qc_constructs_countzero_bar(gimap_dataset, wide_ar = 0.75) +} +\arguments{ +\item{gimap_dataset}{The special gimap_dataset from the `setup_data` function which contains the transformed data} + +\item{wide_ar}{aspect ratio, default is 0.75} +} +\value{ +a ggplot barplot +} +\description{ +A short description... +} +\examples{ +\dontrun{ +gimap_dataset <- get_example_data("gimap") +qc_constructs_countzero_bar(gimap_dataset) +} + +} diff --git a/man/qc_cor_heatmap.Rd b/man/qc_cor_heatmap.Rd index 1e2f5fc..cd80204 100644 --- a/man/qc_cor_heatmap.Rd +++ b/man/qc_cor_heatmap.Rd @@ -4,7 +4,7 @@ \alias{qc_cor_heatmap} \title{Create a correlation heatmap for the pgRNA CPMs} \usage{ -qc_cor_heatmap(gimap_dataset, ...) +qc_cor_heatmap(gimap_dataset) } \arguments{ \item{gimap_dataset}{The special gimap_dataset from the `setup_data` function which contains the transformed data} diff --git a/man/qc_filter_zerocounts.Rd b/man/qc_filter_zerocounts.Rd new file mode 100644 index 0000000..4b2935c --- /dev/null +++ b/man/qc_filter_zerocounts.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/02-filter.R +\name{qc_filter_zerocounts} +\alias{qc_filter_zerocounts} +\title{Create a filter for pgRNAs which have a raw count of 0 for any sample/time point} +\usage{ +qc_filter_zerocounts(gimap_dataset) +} +\arguments{ +\item{gimap_dataset}{The special gimap_dataset from the `setup_data` function which contains the transformed data} +} +\value{ +a named list with the filter `filter` specifying which pgRNA have a count zero for at least one sample/time point and a report df `reportdf` for the number and percent of pgRNA which have a count zero for at least one sample/time point +} +\description{ +This function flags and reports which and how many pgRNAs have a raw count of 0 for any sample/time point +} +\examples{ +\dontrun{ + +} + +} diff --git a/man/qc_variance_hist.Rd b/man/qc_variance_hist.Rd new file mode 100644 index 0000000..97823f2 --- /dev/null +++ b/man/qc_variance_hist.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plots-qc.R +\name{qc_variance_hist} +\alias{qc_variance_hist} +\title{Create a histogram for the variance within replicates for each pgRNA} +\usage{ +qc_variance_hist(gimap_dataset, wide_ar = 0.75) +} +\arguments{ +\item{gimap_dataset}{The special gimap_dataset from the `setup_data` function which contains the transformed data} + +\item{wide_ar}{aspect ratio, default is 0.75} +} +\value{ +a ggplot histogram +} +\description{ +This function uses pivot_longer to rearrange the data for plotting, finds the variance for each pgRNA construct (using row number as a proxy) and then plots a histogram of these variances +} +\examples{ +\dontrun{ +gimap_dataset <- get_example_data("gimap") +qc_variance_hist(gimap_dataset) +} + +} diff --git a/vignettes/getting_started.Rmd b/vignettes/getting_started.Rmd index bf7985b..d3d34c1 100644 --- a/vignettes/getting_started.Rmd +++ b/vignettes/getting_started.Rmd @@ -133,12 +133,20 @@ str(gimap_dataset) The first step is running some quality checks on our data. The `run_qc()` function will create a report we can look at to assess this. -The report includes several visualizations: +The report includes several visualizations of raw/unfiltered data: -- distribution of `count_norm` values for each sample. The goal of determining this distribution is to identify pgRNA counts that are low at the start of the screen - before any type of phenotypic or growth selection is occurring, either in the T0 or plasmid sample. These low abundance pgRNAs should be removed from the analysis. +- distribution of normalized counts for each sample. The goal of determining this distribution is to identify pgRNA counts that are low at the start of the screen - before any type of phenotypic or growth selection is occurring, either in the T0 or plasmid sample. These low abundance pgRNAs should be removed from the analysis. [the goal section here doesn't fit my expectations/understanding of this plot.] - histogram of `log2cpm` values for each individual sample: this helps users identify samples that do not have a normal distribution of reads and inform the upcoming filtering steps. - sample correlation heatmap: generates a heatmap using cpm values for each replicate/sample. This heatmap gives an overview on how similar samples are. Replicates should correlate well, and cluster together, while each timepoint sample should be different from the T0. This analysis will also allow users to identify potential sample swaps, if correlation scores between replicates is poor. -- log2cpm of plasmid counts. Any reads falling outside of the 1.5*IQR threshold will be filtered [TODO: Candace/Kate/Howard or is this being used as an argument in the code?] +- A histogram that shows the variance within replicates for each pgRNA construct. For each pgRNA construct, the variance among the 3 replicates is found and a distribution is constructed by looking at these variances together. + +This report also includes several visualizations after filters are applied: + +There is a filter that flags pgRNA constructs where any of the time points have a count of zero. + - We include a bar plot that shows the number of pgRNA constructs which have counts of zero in either 0, 1, 2, or 3 replicates. + +There is a filter that flags pgRNA constructs that have low log2 CPM counts for the day 0 or plasmid time point. + - We include a histogram of the log2cpm of plasmid counts. Any reads falling outside of the 1.5*IQR threshold will be filtered [TODO: Candace/Kate/Howard or is this being used as an argument in the code?; note this graph isn't currently in the report] ```{r} run_qc(gimap_dataset,