From 4d8db45a27fc46843a54c15b6d0e37f7788447d1 Mon Sep 17 00:00:00 2001 From: Jonathan Griffiths <7976085+jonathangriffiths@users.noreply.github.com> Date: Fri, 12 Jan 2024 11:28:09 +0000 Subject: [PATCH 1/3] Initial implementation for Csparse conversion --- DESCRIPTION | 4 ++-- R/BPSATACData.R | 6 ++++-- R/EmbryoAtlasData.R | 6 ++++-- R/TChimeraData.R | 6 ++++-- R/Tal1ChimeraData.R | 6 ++++-- R/WTChimeraData.R | 6 ++++-- R/getData.R | 26 +++++++++++++++++++++----- man/BPSATACData.Rd | 5 ++++- man/EmbryoAtlasData.Rd | 6 +++++- man/TChimeraData.Rd | 9 ++++++++- man/Tal1ChimeraData.Rd | 9 ++++++++- man/WTChimeraData.Rd | 9 ++++++++- 12 files changed, 76 insertions(+), 22 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ee3f5f9..a1fa349 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: MouseGastrulationData Title: Single-Cell -omics Data across Mouse Gastrulation and Early Organogenesis -Version: 1.17.0 +Version: 1.17.1 Authors@R: c( person("Jonathan", "Griffiths", email = "jonathan.griffiths.94@gmail.com", role = c("aut", "cre")), person("Aaron", "Lun", email = "infinite.monkeys.with.keyboards@gmail.com", role = "aut")) @@ -31,4 +31,4 @@ Encoding: UTF-8 biocViews: ExperimentData, ExpressionData, SequencingData, RNASeqData, SingleCellData, ExperimentHub, Mus_musculus_Data URL: https://github.com/MarioniLab/MouseGastrulationData BugReports: https://github.com/MarioniLab/MouseGastrulationData/issues -RoxygenNote: 7.2.1 +RoxygenNote: 7.3.0 diff --git a/R/BPSATACData.R b/R/BPSATACData.R index 2ff7373..c4d7401 100644 --- a/R/BPSATACData.R +++ b/R/BPSATACData.R @@ -4,6 +4,8 @@ #' #' @param type String specifying the type of data to obtain, see Details. #' Default behaviour is to return processed data. +#' @param Csparse.assays Logical indicating whether to convert assay matrices into the column major format that is more performant with contemporary software packages. +#' Default behaviour is to perform the conversion. #' #' @return #' If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing the processed data. @@ -81,8 +83,8 @@ #' @importFrom BiocGenerics sizeFactors #' @importClassesFrom S4Vectors DataFrame #' @importFrom methods as -BPSATACData <- function(type=c("processed", "raw")) { +BPSATACData <- function(type=c("processed", "raw"), Csparse.assays=TRUE) { type <- match.arg(type) versions <- list(base="1.6.0") - .getRNAseqData("BPS_atac", type, versions, samples=1, sample.options=as.character(1), sample.err="1") + .getRNAseqData("BPS_atac", type, versions, samples=1, sample.options=as.character(1), sample.err="1", makeCsparse=Csparse.assays) } diff --git a/R/EmbryoAtlasData.R b/R/EmbryoAtlasData.R index 0b2c811..3df3597 100644 --- a/R/EmbryoAtlasData.R +++ b/R/EmbryoAtlasData.R @@ -7,6 +7,8 @@ #' @param samples Integer or character vector specifying the samples for which data (processed or raw) should be obtained. #' If \code{NULL} (default), data are returned for all (36) samples. #' @param get.spliced Logical indicating whether to also download the spliced/unspliced/ambiguously spliced count matrices. +#' @param Csparse.assays Logical indicating whether to convert assay matrices into the column major format that is more performant with contemporary software packages. +#' Default behaviour is to perform the conversion. #' #' @return #' If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing processed data from selected samples. @@ -76,7 +78,7 @@ #' @importFrom BiocGenerics sizeFactors #' @importClassesFrom S4Vectors DataFrame #' @importFrom methods as -EmbryoAtlasData <- function(type=c("processed", "raw"), samples=NULL, get.spliced=FALSE) { +EmbryoAtlasData <- function(type=c("processed", "raw"), samples=NULL, get.spliced=FALSE, Csparse.assays=TRUE) { type <- match.arg(type) versions <- list(base="1.0.0") extra_a <- NULL @@ -93,5 +95,5 @@ EmbryoAtlasData <- function(type=c("processed", "raw"), samples=NULL, get.splice "counts-unspliced"="1.4.0", "counts-ambig"="1.4.0")) } - .getRNAseqData("atlas", type, versions, samples, sample.options=as.character(c(1:10, 12:37)), sample.err="1:10 or 12:37", extra_assays = extra_a) + .getRNAseqData("atlas", type, versions, samples, sample.options=as.character(c(1:10, 12:37)), sample.err="1:10 or 12:37", extra_assays = extra_a, makeCsparse=Csparse.assays) } diff --git a/R/TChimeraData.R b/R/TChimeraData.R index e6b4cef..bd130c8 100644 --- a/R/TChimeraData.R +++ b/R/TChimeraData.R @@ -6,6 +6,8 @@ #' Default behaviour is to return processed data. #' @param samples Integer or character vector specifying the samples for which data (processed or raw) should be obtained. #' If \code{NULL} (default), data are returned for all QC-passing (fourteen) samples. +#' @param Csparse.assays Logical indicating whether to convert assay matrices into the column major format that is more performant with contemporary software packages. +#' Default behaviour is to perform the conversion. #' #' @return #' If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing processed data from selected samples @@ -86,10 +88,10 @@ #' @importFrom BiocGenerics sizeFactors #' @importClassesFrom S4Vectors DataFrame #' @importFrom methods as -TChimeraData <- function(type=c("processed", "raw"), samples=c(1:2, 5:16)) { +TChimeraData <- function(type=c("processed", "raw"), samples=c(1:2, 5:16), Csparse.assays=TRUE) { if(any(3:4 %in% samples)) warning("You are downloading the QC-fail samples 3 and/or 4.") type <- match.arg(type) versions <- list(base="1.4.0") - .getRNAseqData("t-chimera", type, versions, samples, sample.options=as.character(seq_len(16)), sample.err="1:16") + .getRNAseqData("t-chimera", type, versions, samples, sample.options=as.character(seq_len(16)), sample.err="1:16", makeCsparse=Csparse.assays) } diff --git a/R/Tal1ChimeraData.R b/R/Tal1ChimeraData.R index 09a8e85..0c1cc53 100644 --- a/R/Tal1ChimeraData.R +++ b/R/Tal1ChimeraData.R @@ -6,6 +6,8 @@ #' Default behaviour is to return processed data. #' @param samples Integer or character vector specifying the samples for which data (processed or raw) should be obtained. #' If \code{NULL} (default), data are returned for all (four) samples. +#' @param Csparse.assays Logical indicating whether to convert assay matrices into the column major format that is more performant with contemporary software packages. +#' Default behaviour is to perform the conversion. #' #' @return #' If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing processed data from selected samples. @@ -67,8 +69,8 @@ #' @importFrom BiocGenerics sizeFactors #' @importClassesFrom S4Vectors DataFrame #' @importFrom methods as -Tal1ChimeraData <- function(type=c("processed", "raw"), samples=NULL) { +Tal1ChimeraData <- function(type=c("processed", "raw"), samples=NULL, Csparse.assays=TRUE) { type <- match.arg(type) versions <- list(base="1.0.0") - .getRNAseqData("tal1-chimera", type, versions, samples, sample.options=as.character(seq_len(4)), sample.err="1:4") + .getRNAseqData("tal1-chimera", type, versions, samples, sample.options=as.character(seq_len(4)), sample.err="1:4", makeCsparse=Csparse.assays) } diff --git a/R/WTChimeraData.R b/R/WTChimeraData.R index b34e22f..e148ea4 100644 --- a/R/WTChimeraData.R +++ b/R/WTChimeraData.R @@ -6,6 +6,8 @@ #' Default behaviour is to return processed data. #' @param samples Integer or character vector specifying the samples for which data (processed or raw) should be obtained. #' If \code{NULL} (default), data are returned for all (ten) samples. +#' @param Csparse.assays Logical indicating whether to convert assay matrices into the column major format that is more performant with contemporary software packages. +#' Default behaviour is to perform the conversion. #' #' @return #' If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing processed data from selected samples @@ -76,8 +78,8 @@ #' @importFrom BiocGenerics sizeFactors #' @importClassesFrom S4Vectors DataFrame #' @importFrom methods as -WTChimeraData <- function(type=c("processed", "raw"), samples=NULL) { +WTChimeraData <- function(type=c("processed", "raw"), samples=NULL, Csparse.assays=TRUE) { type <- match.arg(type) versions <- list(base="1.0.0") - .getRNAseqData("wt-chimera", type, versions, samples, sample.options=as.character(seq_len(10)), sample.err="1:10") + .getRNAseqData("wt-chimera", type, versions, samples, sample.options=as.character(seq_len(10)), sample.err="1:10", makeCsparse=Csparse.assays) } diff --git a/R/getData.R b/R/getData.R index 07290e1..23688bc 100644 --- a/R/getData.R +++ b/R/getData.R @@ -28,7 +28,8 @@ names, object.type=c("SingleCellExperiment", "SpatialExperiment"), return.list=FALSE, - ensemblise=TRUE + ensemblise=TRUE, + makeCsparse=FALSE ){ object.type <- match.arg(object.type) hub <- ExperimentHub() @@ -45,7 +46,8 @@ if(return.list){ out <- lapply(samples, function(x){ .getData(dataset, version, x, - sample.options, sample.err, names, object.type, return.list=FALSE)}) + sample.options, sample.err, names, object.type, return.list=FALSE, + ensemblise=ensemblise, makeCsparse=makeCsparse)}) names(out) <- samples return(out) } @@ -119,13 +121,16 @@ if("cell" %in% names(colData(sce))){ colnames(sce) <- colData(sce)$cell } + if(makeCsparse){ + sce <- .makeCsparse(sce) + } return(sce) } #### # Simpler interfaces for specific data types #### -.getRNAseqData <- function(dataset, type, version, samples, sample.options, sample.err, extra_assays=NULL, ens_rownames=TRUE){ +.getRNAseqData <- function(dataset, type, version, samples, sample.options, sample.err, extra_assays=NULL, ens_rownames=TRUE, makeCsparse=FALSE){ if(type == "processed"){ return( .getData( dataset, @@ -141,7 +146,8 @@ dimred="reduced-dims" ), object.type="SingleCellExperiment", - ensemblise=ens_rownames + ensemblise=ens_rownames, + makeCsparse=makeCsparse )) } else if (type == "raw"){ return( .getData( @@ -156,7 +162,8 @@ ), object.type="SingleCellExperiment", return.list=TRUE, - ensemblise=ens_rownames + ensemblise=ens_rownames, + makeCsparse=makeCsparse )) } } @@ -210,3 +217,12 @@ opt } } + +.makeCsparse <- function(sce){ + for(an in assayNames(sce)){ + if(is(assay(sce, an), "TsparseMatrix")){ + assay(sce, an) <- as(assay(sce, an), "CsparseMatrix") + } + } + return(sce) +} diff --git a/man/BPSATACData.Rd b/man/BPSATACData.Rd index 23416e3..77f2867 100644 --- a/man/BPSATACData.Rd +++ b/man/BPSATACData.Rd @@ -4,11 +4,14 @@ \alias{BPSATACData} \title{E8.25 snATAC-seq data} \usage{ -BPSATACData(type = c("processed", "raw")) +BPSATACData(type = c("processed", "raw"), Csparse.assays = TRUE) } \arguments{ \item{type}{String specifying the type of data to obtain, see Details. Default behaviour is to return processed data.} + +\item{Csparse.assays}{Logical indicating whether to convert assay matrices into the column major format that is more performant with contemporary software packages. +Default behaviour is to perform the conversion.} } \value{ If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing the processed data. diff --git a/man/EmbryoAtlasData.Rd b/man/EmbryoAtlasData.Rd index e2fb958..fcb25ee 100644 --- a/man/EmbryoAtlasData.Rd +++ b/man/EmbryoAtlasData.Rd @@ -7,7 +7,8 @@ EmbryoAtlasData( type = c("processed", "raw"), samples = NULL, - get.spliced = FALSE + get.spliced = FALSE, + Csparse.assays = TRUE ) } \arguments{ @@ -18,6 +19,9 @@ Default behaviour is to return processed data.} If \code{NULL} (default), data are returned for all (36) samples.} \item{get.spliced}{Logical indicating whether to also download the spliced/unspliced/ambiguously spliced count matrices.} + +\item{Csparse.assays}{Logical indicating whether to convert assay matrices into the column major format that is more performant with contemporary software packages. +Default behaviour is to perform the conversion.} } \value{ If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing processed data from selected samples. diff --git a/man/TChimeraData.Rd b/man/TChimeraData.Rd index bb047e2..e3d6a44 100644 --- a/man/TChimeraData.Rd +++ b/man/TChimeraData.Rd @@ -4,7 +4,11 @@ \alias{TChimeraData} \title{T chimera data} \usage{ -TChimeraData(type = c("processed", "raw"), samples = c(1:2, 5:16)) +TChimeraData( + type = c("processed", "raw"), + samples = c(1:2, 5:16), + Csparse.assays = TRUE +) } \arguments{ \item{type}{String specifying the type of data to obtain, see Details. @@ -12,6 +16,9 @@ Default behaviour is to return processed data.} \item{samples}{Integer or character vector specifying the samples for which data (processed or raw) should be obtained. If \code{NULL} (default), data are returned for all QC-passing (fourteen) samples.} + +\item{Csparse.assays}{Logical indicating whether to convert assay matrices into the column major format that is more performant with contemporary software packages. +Default behaviour is to perform the conversion.} } \value{ If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing processed data from selected samples diff --git a/man/Tal1ChimeraData.Rd b/man/Tal1ChimeraData.Rd index 686667b..6028044 100644 --- a/man/Tal1ChimeraData.Rd +++ b/man/Tal1ChimeraData.Rd @@ -4,7 +4,11 @@ \alias{Tal1ChimeraData} \title{Tal1 chimera data} \usage{ -Tal1ChimeraData(type = c("processed", "raw"), samples = NULL) +Tal1ChimeraData( + type = c("processed", "raw"), + samples = NULL, + Csparse.assays = TRUE +) } \arguments{ \item{type}{String specifying the type of data to obtain, see Details. @@ -12,6 +16,9 @@ Default behaviour is to return processed data.} \item{samples}{Integer or character vector specifying the samples for which data (processed or raw) should be obtained. If \code{NULL} (default), data are returned for all (four) samples.} + +\item{Csparse.assays}{Logical indicating whether to convert assay matrices into the column major format that is more performant with contemporary software packages. +Default behaviour is to perform the conversion.} } \value{ If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing processed data from selected samples. diff --git a/man/WTChimeraData.Rd b/man/WTChimeraData.Rd index dba3cc6..9df35a0 100644 --- a/man/WTChimeraData.Rd +++ b/man/WTChimeraData.Rd @@ -4,7 +4,11 @@ \alias{WTChimeraData} \title{WT chimera data} \usage{ -WTChimeraData(type = c("processed", "raw"), samples = NULL) +WTChimeraData( + type = c("processed", "raw"), + samples = NULL, + Csparse.assays = TRUE +) } \arguments{ \item{type}{String specifying the type of data to obtain, see Details. @@ -12,6 +16,9 @@ Default behaviour is to return processed data.} \item{samples}{Integer or character vector specifying the samples for which data (processed or raw) should be obtained. If \code{NULL} (default), data are returned for all (ten) samples.} + +\item{Csparse.assays}{Logical indicating whether to convert assay matrices into the column major format that is more performant with contemporary software packages. +Default behaviour is to perform the conversion.} } \value{ If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing processed data from selected samples From 88c2c86979d676e0891295f40df5fdff9a58fa94 Mon Sep 17 00:00:00 2001 From: Jonathan Griffiths <7976085+jonathangriffiths@users.noreply.github.com> Date: Fri, 12 Jan 2024 13:05:45 +0000 Subject: [PATCH 2/3] Add test --- tests/testthat.R | 3 +++ tests/testthat/test-Csparse.R | 10 ++++++++++ 2 files changed, 13 insertions(+) create mode 100644 tests/testthat.R create mode 100644 tests/testthat/test-Csparse.R diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..ecb13e2 --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,3 @@ +library(testthat) +library(MouseGastrulationData) +test_check("MouseGastrulationData") diff --git a/tests/testthat/test-Csparse.R b/tests/testthat/test-Csparse.R new file mode 100644 index 0000000..2d5f9b2 --- /dev/null +++ b/tests/testthat/test-Csparse.R @@ -0,0 +1,10 @@ +# This tests the conversion from triplet to column major matrix styles. +# library(testthat); library(MouseGastrulationData); source("test-Csparse.R") + +test_that("EmbryoAtlasData function for sample 1, with and without csparse conversion, gives equal counts assay", { + data_without_csparse <- EmbryoAtlasData(samples = 1, Csparse.assays = FALSE) + data_with_csparse <- EmbryoAtlasData(samples = 1, Csparse.assays = TRUE) + + expect_equal(assay(data_without_csparse, "counts"), + as(assay(data_with_csparse, "counts"), "TsparseMatrix")) +}) From 6fe1b49c35ca6479b747b51bfd7848e15f805740 Mon Sep 17 00:00:00 2001 From: Jonathan Griffiths <7976085+jonathangriffiths@users.noreply.github.com> Date: Fri, 12 Jan 2024 14:49:04 +0000 Subject: [PATCH 3/3] add missing testthat dep --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index a1fa349..b659194 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,7 +22,8 @@ Imports: Suggests: BiocStyle, knitr, - rmarkdown + rmarkdown, + testthat VignetteBuilder: knitr License: GPL-3