From 4d8db45a27fc46843a54c15b6d0e37f7788447d1 Mon Sep 17 00:00:00 2001
From: Jonathan Griffiths <7976085+jonathangriffiths@users.noreply.github.com>
Date: Fri, 12 Jan 2024 11:28:09 +0000
Subject: [PATCH 1/3] Initial implementation for Csparse conversion

---
 DESCRIPTION            |  4 ++--
 R/BPSATACData.R        |  6 ++++--
 R/EmbryoAtlasData.R    |  6 ++++--
 R/TChimeraData.R       |  6 ++++--
 R/Tal1ChimeraData.R    |  6 ++++--
 R/WTChimeraData.R      |  6 ++++--
 R/getData.R            | 26 +++++++++++++++++++++-----
 man/BPSATACData.Rd     |  5 ++++-
 man/EmbryoAtlasData.Rd |  6 +++++-
 man/TChimeraData.Rd    |  9 ++++++++-
 man/Tal1ChimeraData.Rd |  9 ++++++++-
 man/WTChimeraData.Rd   |  9 ++++++++-
 12 files changed, 76 insertions(+), 22 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index ee3f5f9..a1fa349 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: MouseGastrulationData
 Title: Single-Cell -omics Data across Mouse Gastrulation and Early Organogenesis
-Version: 1.17.0
+Version: 1.17.1
 Authors@R: c(
     person("Jonathan", "Griffiths", email = "jonathan.griffiths.94@gmail.com", role = c("aut", "cre")),
     person("Aaron", "Lun", email = "infinite.monkeys.with.keyboards@gmail.com", role = "aut"))
@@ -31,4 +31,4 @@ Encoding: UTF-8
 biocViews: ExperimentData, ExpressionData, SequencingData, RNASeqData, SingleCellData, ExperimentHub, Mus_musculus_Data
 URL: https://github.com/MarioniLab/MouseGastrulationData
 BugReports: https://github.com/MarioniLab/MouseGastrulationData/issues
-RoxygenNote: 7.2.1
+RoxygenNote: 7.3.0
diff --git a/R/BPSATACData.R b/R/BPSATACData.R
index 2ff7373..c4d7401 100644
--- a/R/BPSATACData.R
+++ b/R/BPSATACData.R
@@ -4,6 +4,8 @@
 #'
 #' @param type String specifying the type of data to obtain, see Details.
 #' Default behaviour is to return processed data.
+#' @param Csparse.assays Logical indicating whether to convert triplet-format sparse assay matrices (\code{TsparseMatrix}) into the compressed column-oriented format (\code{CsparseMatrix}), which is more performant with contemporary software packages.
+#' Default behaviour is to perform the conversion.
 #' 
 #' @return 
 #' If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing the processed data.
@@ -81,8 +83,8 @@
 #' @importFrom BiocGenerics sizeFactors
 #' @importClassesFrom S4Vectors DataFrame
 #' @importFrom methods as
-BPSATACData <- function(type=c("processed", "raw")) {
+BPSATACData <- function(type=c("processed", "raw"), Csparse.assays=TRUE) {
     type <- match.arg(type)
     versions <- list(base="1.6.0")
-    .getRNAseqData("BPS_atac", type, versions, samples=1, sample.options=as.character(1), sample.err="1")
+    .getRNAseqData("BPS_atac", type, versions, samples=1, sample.options=as.character(1), sample.err="1", makeCsparse=Csparse.assays)
 }
diff --git a/R/EmbryoAtlasData.R b/R/EmbryoAtlasData.R
index 0b2c811..3df3597 100644
--- a/R/EmbryoAtlasData.R
+++ b/R/EmbryoAtlasData.R
@@ -7,6 +7,8 @@
 #' @param samples Integer or character vector specifying the samples for which data (processed or raw) should be obtained.
 #' If \code{NULL} (default), data are returned for all (36) samples.
 #' @param get.spliced Logical indicating whether to also download the spliced/unspliced/ambiguously spliced count matrices.
+#' @param Csparse.assays Logical indicating whether to convert triplet-format sparse assay matrices (\code{TsparseMatrix}) into the compressed column-oriented format (\code{CsparseMatrix}), which is more performant with contemporary software packages.
+#' Default behaviour is to perform the conversion.
 #' 
 #' @return 
 #' If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing processed data from selected samples.
@@ -76,7 +78,7 @@
 #' @importFrom BiocGenerics sizeFactors
 #' @importClassesFrom S4Vectors DataFrame
 #' @importFrom methods as
-EmbryoAtlasData <- function(type=c("processed", "raw"), samples=NULL, get.spliced=FALSE) {
+EmbryoAtlasData <- function(type=c("processed", "raw"), samples=NULL, get.spliced=FALSE, Csparse.assays=TRUE) {
     type <- match.arg(type)
     versions <- list(base="1.0.0")
     extra_a <- NULL
@@ -93,5 +95,5 @@ EmbryoAtlasData <- function(type=c("processed", "raw"), samples=NULL, get.splice
             "counts-unspliced"="1.4.0",
             "counts-ambig"="1.4.0"))
     }
-    .getRNAseqData("atlas", type, versions, samples, sample.options=as.character(c(1:10, 12:37)), sample.err="1:10 or 12:37", extra_assays = extra_a)
+    .getRNAseqData("atlas", type, versions, samples, sample.options=as.character(c(1:10, 12:37)), sample.err="1:10 or 12:37", extra_assays = extra_a, makeCsparse=Csparse.assays)
 }
diff --git a/R/TChimeraData.R b/R/TChimeraData.R
index e6b4cef..bd130c8 100644
--- a/R/TChimeraData.R
+++ b/R/TChimeraData.R
@@ -6,6 +6,8 @@
 #' Default behaviour is to return processed data.
 #' @param samples Integer or character vector specifying the samples for which data (processed or raw) should be obtained.
 #' If \code{NULL} (default), data are returned for all QC-passing (fourteen) samples.
+#' @param Csparse.assays Logical indicating whether to convert triplet-format sparse assay matrices (\code{TsparseMatrix}) into the compressed column-oriented format (\code{CsparseMatrix}), which is more performant with contemporary software packages.
+#' Default behaviour is to perform the conversion.
 #'
 #' @return 
 #' If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing processed data from selected samples
@@ -86,10 +88,10 @@
 #' @importFrom BiocGenerics sizeFactors
 #' @importClassesFrom S4Vectors DataFrame
 #' @importFrom methods as
-TChimeraData <- function(type=c("processed", "raw"), samples=c(1:2, 5:16)) {
+TChimeraData <- function(type=c("processed", "raw"), samples=c(1:2, 5:16), Csparse.assays=TRUE) {
     if(any(3:4 %in% samples))
         warning("You are downloading the QC-fail samples 3 and/or 4.")
     type <- match.arg(type)
     versions <- list(base="1.4.0")
-    .getRNAseqData("t-chimera", type, versions, samples, sample.options=as.character(seq_len(16)), sample.err="1:16")
+    .getRNAseqData("t-chimera", type, versions, samples, sample.options=as.character(seq_len(16)), sample.err="1:16", makeCsparse=Csparse.assays)
 }
diff --git a/R/Tal1ChimeraData.R b/R/Tal1ChimeraData.R
index 09a8e85..0c1cc53 100644
--- a/R/Tal1ChimeraData.R
+++ b/R/Tal1ChimeraData.R
@@ -6,6 +6,8 @@
 #' Default behaviour is to return processed data.
 #' @param samples Integer or character vector specifying the samples for which data (processed or raw) should be obtained.
 #' If \code{NULL} (default), data are returned for all (four) samples.
+#' @param Csparse.assays Logical indicating whether to convert triplet-format sparse assay matrices (\code{TsparseMatrix}) into the compressed column-oriented format (\code{CsparseMatrix}), which is more performant with contemporary software packages.
+#' Default behaviour is to perform the conversion.
 #'
 #' @return 
 #' If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing processed data from selected samples.
@@ -67,8 +69,8 @@
 #' @importFrom BiocGenerics sizeFactors
 #' @importClassesFrom S4Vectors DataFrame
 #' @importFrom methods as
-Tal1ChimeraData <- function(type=c("processed", "raw"), samples=NULL) {
+Tal1ChimeraData <- function(type=c("processed", "raw"), samples=NULL, Csparse.assays=TRUE) {
     type <- match.arg(type)
     versions <- list(base="1.0.0")
-    .getRNAseqData("tal1-chimera", type, versions, samples, sample.options=as.character(seq_len(4)), sample.err="1:4")
+    .getRNAseqData("tal1-chimera", type, versions, samples, sample.options=as.character(seq_len(4)), sample.err="1:4", makeCsparse=Csparse.assays)
 }
diff --git a/R/WTChimeraData.R b/R/WTChimeraData.R
index b34e22f..e148ea4 100644
--- a/R/WTChimeraData.R
+++ b/R/WTChimeraData.R
@@ -6,6 +6,8 @@
 #' Default behaviour is to return processed data.
 #' @param samples Integer or character vector specifying the samples for which data (processed or raw) should be obtained.
 #' If \code{NULL} (default), data are returned for all (ten) samples.
+#' @param Csparse.assays Logical indicating whether to convert triplet-format sparse assay matrices (\code{TsparseMatrix}) into the compressed column-oriented format (\code{CsparseMatrix}), which is more performant with contemporary software packages.
+#' Default behaviour is to perform the conversion.
 #'
 #' @return 
 #' If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing processed data from selected samples
@@ -76,8 +78,8 @@
 #' @importFrom BiocGenerics sizeFactors
 #' @importClassesFrom S4Vectors DataFrame
 #' @importFrom methods as
-WTChimeraData <- function(type=c("processed", "raw"), samples=NULL) {
+WTChimeraData <- function(type=c("processed", "raw"), samples=NULL, Csparse.assays=TRUE) {
     type <- match.arg(type)
     versions <- list(base="1.0.0")
-    .getRNAseqData("wt-chimera", type, versions, samples, sample.options=as.character(seq_len(10)), sample.err="1:10")
+    .getRNAseqData("wt-chimera", type, versions, samples, sample.options=as.character(seq_len(10)), sample.err="1:10", makeCsparse=Csparse.assays)
 }
diff --git a/R/getData.R b/R/getData.R
index 07290e1..23688bc 100644
--- a/R/getData.R
+++ b/R/getData.R
@@ -28,7 +28,8 @@
     names,
     object.type=c("SingleCellExperiment", "SpatialExperiment"),
     return.list=FALSE,
-    ensemblise=TRUE
+    ensemblise=TRUE,
+    makeCsparse=FALSE
 ){
     object.type <- match.arg(object.type)
     hub <- ExperimentHub()
@@ -45,7 +46,8 @@
 
     if(return.list){
         out <- lapply(samples, function(x){ .getData(dataset, version, x,
-            sample.options, sample.err, names, object.type, return.list=FALSE)})
+            sample.options, sample.err, names, object.type, return.list=FALSE,
+            ensemblise=ensemblise, makeCsparse=makeCsparse)})
         names(out) <- samples
         return(out)
     }
@@ -119,13 +121,16 @@
     if("cell" %in% names(colData(sce))){
         colnames(sce) <- colData(sce)$cell
     }
+    if(makeCsparse){
+        sce <- .makeCsparse(sce)
+    }
     return(sce)
 }
 
 ####
 # Simpler interfaces for specific data types
 ####
-.getRNAseqData <- function(dataset, type, version, samples, sample.options, sample.err, extra_assays=NULL, ens_rownames=TRUE){
+.getRNAseqData <- function(dataset, type, version, samples, sample.options, sample.err, extra_assays=NULL, ens_rownames=TRUE, makeCsparse=FALSE){
     if(type == "processed"){ return(
         .getData(
             dataset,
@@ -141,7 +146,8 @@
                 dimred="reduced-dims"
             ),
             object.type="SingleCellExperiment",
-            ensemblise=ens_rownames
+            ensemblise=ens_rownames,
+            makeCsparse=makeCsparse
         ))
     } else if (type == "raw"){ return(
         .getData(
@@ -156,7 +162,8 @@
             ),
             object.type="SingleCellExperiment",
             return.list=TRUE,
-            ensemblise=ens_rownames
+            ensemblise=ens_rownames,
+            makeCsparse=makeCsparse
         ))
     }
 }
@@ -210,3 +217,12 @@
         opt
     }
 }
+
+# Convert triplet-format (TsparseMatrix) assays of sce to CsparseMatrix; all other assays are left untouched.
+.makeCsparse <- function(sce){
+    for(an in assayNames(sce)){
+        mat <- assay(sce, an) # extract once: the same matrix is both tested and converted
+        if(is(mat, "TsparseMatrix")) assay(sce, an) <- as(mat, "CsparseMatrix")
+    }
+    return(sce)
+}
diff --git a/man/BPSATACData.Rd b/man/BPSATACData.Rd
index 23416e3..77f2867 100644
--- a/man/BPSATACData.Rd
+++ b/man/BPSATACData.Rd
@@ -4,11 +4,14 @@
 \alias{BPSATACData}
 \title{E8.25 snATAC-seq data}
 \usage{
-BPSATACData(type = c("processed", "raw"))
+BPSATACData(type = c("processed", "raw"), Csparse.assays = TRUE)
 }
 \arguments{
 \item{type}{String specifying the type of data to obtain, see Details.
 Default behaviour is to return processed data.}
+
+\item{Csparse.assays}{Logical indicating whether to convert triplet-format sparse assay matrices (\code{TsparseMatrix}) into the compressed column-oriented format (\code{CsparseMatrix}), which is more performant with contemporary software packages.
+Default behaviour is to perform the conversion.}
 }
 \value{
 If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing the processed data.
diff --git a/man/EmbryoAtlasData.Rd b/man/EmbryoAtlasData.Rd
index e2fb958..fcb25ee 100644
--- a/man/EmbryoAtlasData.Rd
+++ b/man/EmbryoAtlasData.Rd
@@ -7,7 +7,8 @@
 EmbryoAtlasData(
   type = c("processed", "raw"),
   samples = NULL,
-  get.spliced = FALSE
+  get.spliced = FALSE,
+  Csparse.assays = TRUE
 )
 }
 \arguments{
@@ -18,6 +19,9 @@ Default behaviour is to return processed data.}
 If \code{NULL} (default), data are returned for all (36) samples.}
 
 \item{get.spliced}{Logical indicating whether to also download the spliced/unspliced/ambiguously spliced count matrices.}
+
+\item{Csparse.assays}{Logical indicating whether to convert triplet-format sparse assay matrices (\code{TsparseMatrix}) into the compressed column-oriented format (\code{CsparseMatrix}), which is more performant with contemporary software packages.
+Default behaviour is to perform the conversion.}
 }
 \value{
 If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing processed data from selected samples.
diff --git a/man/TChimeraData.Rd b/man/TChimeraData.Rd
index bb047e2..e3d6a44 100644
--- a/man/TChimeraData.Rd
+++ b/man/TChimeraData.Rd
@@ -4,7 +4,11 @@
 \alias{TChimeraData}
 \title{T chimera data}
 \usage{
-TChimeraData(type = c("processed", "raw"), samples = c(1:2, 5:16))
+TChimeraData(
+  type = c("processed", "raw"),
+  samples = c(1:2, 5:16),
+  Csparse.assays = TRUE
+)
 }
 \arguments{
 \item{type}{String specifying the type of data to obtain, see Details.
@@ -12,6 +16,9 @@ Default behaviour is to return processed data.}
 
 \item{samples}{Integer or character vector specifying the samples for which data (processed or raw) should be obtained.
 If \code{NULL} (default), data are returned for all QC-passing (fourteen) samples.}
+
+\item{Csparse.assays}{Logical indicating whether to convert triplet-format sparse assay matrices (\code{TsparseMatrix}) into the compressed column-oriented format (\code{CsparseMatrix}), which is more performant with contemporary software packages.
+Default behaviour is to perform the conversion.}
 }
 \value{
 If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing processed data from selected samples
diff --git a/man/Tal1ChimeraData.Rd b/man/Tal1ChimeraData.Rd
index 686667b..6028044 100644
--- a/man/Tal1ChimeraData.Rd
+++ b/man/Tal1ChimeraData.Rd
@@ -4,7 +4,11 @@
 \alias{Tal1ChimeraData}
 \title{Tal1 chimera data}
 \usage{
-Tal1ChimeraData(type = c("processed", "raw"), samples = NULL)
+Tal1ChimeraData(
+  type = c("processed", "raw"),
+  samples = NULL,
+  Csparse.assays = TRUE
+)
 }
 \arguments{
 \item{type}{String specifying the type of data to obtain, see Details.
@@ -12,6 +16,9 @@ Default behaviour is to return processed data.}
 
 \item{samples}{Integer or character vector specifying the samples for which data (processed or raw) should be obtained.
 If \code{NULL} (default), data are returned for all (four) samples.}
+
+\item{Csparse.assays}{Logical indicating whether to convert triplet-format sparse assay matrices (\code{TsparseMatrix}) into the compressed column-oriented format (\code{CsparseMatrix}), which is more performant with contemporary software packages.
+Default behaviour is to perform the conversion.}
 }
 \value{
 If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing processed data from selected samples.
diff --git a/man/WTChimeraData.Rd b/man/WTChimeraData.Rd
index dba3cc6..9df35a0 100644
--- a/man/WTChimeraData.Rd
+++ b/man/WTChimeraData.Rd
@@ -4,7 +4,11 @@
 \alias{WTChimeraData}
 \title{WT chimera data}
 \usage{
-WTChimeraData(type = c("processed", "raw"), samples = NULL)
+WTChimeraData(
+  type = c("processed", "raw"),
+  samples = NULL,
+  Csparse.assays = TRUE
+)
 }
 \arguments{
 \item{type}{String specifying the type of data to obtain, see Details.
@@ -12,6 +16,9 @@ Default behaviour is to return processed data.}
 
 \item{samples}{Integer or character vector specifying the samples for which data (processed or raw) should be obtained.
 If \code{NULL} (default), data are returned for all (ten) samples.}
+
+\item{Csparse.assays}{Logical indicating whether to convert triplet-format sparse assay matrices (\code{TsparseMatrix}) into the compressed column-oriented format (\code{CsparseMatrix}), which is more performant with contemporary software packages.
+Default behaviour is to perform the conversion.}
 }
 \value{
 If \code{type="processed"}, a \linkS4class{SingleCellExperiment} is returned containing processed data from selected samples

From 88c2c86979d676e0891295f40df5fdff9a58fa94 Mon Sep 17 00:00:00 2001
From: Jonathan Griffiths <7976085+jonathangriffiths@users.noreply.github.com>
Date: Fri, 12 Jan 2024 13:05:45 +0000
Subject: [PATCH 2/3] Add test

---
 tests/testthat.R              |  3 +++
 tests/testthat/test-Csparse.R | 10 ++++++++++
 2 files changed, 13 insertions(+)
 create mode 100644 tests/testthat.R
 create mode 100644 tests/testthat/test-Csparse.R

diff --git a/tests/testthat.R b/tests/testthat.R
new file mode 100644
index 0000000..ecb13e2
--- /dev/null
+++ b/tests/testthat.R
@@ -0,0 +1,3 @@
+library(testthat)
+library(MouseGastrulationData)
+test_check("MouseGastrulationData")
diff --git a/tests/testthat/test-Csparse.R b/tests/testthat/test-Csparse.R
new file mode 100644
index 0000000..2d5f9b2
--- /dev/null
+++ b/tests/testthat/test-Csparse.R
@@ -0,0 +1,10 @@
+# This tests the conversion of assays from triplet (TsparseMatrix) to compressed column (CsparseMatrix) storage.
+# library(testthat); library(MouseGastrulationData); source("test-Csparse.R")
+test_that("EmbryoAtlasData function for sample 1, with and without csparse conversion, gives equal counts assay", {
+    data_without_csparse <- EmbryoAtlasData(samples = 1, Csparse.assays = FALSE)
+    data_with_csparse <- EmbryoAtlasData(samples = 1, Csparse.assays = TRUE)
+    # Equal values alone would also pass if the conversion were silently skipped, so check the class too.
+    expect_s4_class(assay(data_with_csparse, "counts"), "CsparseMatrix")
+    expect_equal(assay(data_without_csparse, "counts"),
+        as(assay(data_with_csparse, "counts"), "TsparseMatrix"))
+})

From 6fe1b49c35ca6479b747b51bfd7848e15f805740 Mon Sep 17 00:00:00 2001
From: Jonathan Griffiths <7976085+jonathangriffiths@users.noreply.github.com>
Date: Fri, 12 Jan 2024 14:49:04 +0000
Subject: [PATCH 3/3] add missing testthat dep

---
 DESCRIPTION | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index a1fa349..b659194 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -22,7 +22,8 @@ Imports:
 Suggests: 
     BiocStyle, 
     knitr, 
-    rmarkdown
+    rmarkdown,
+    testthat
 VignetteBuilder: 
     knitr
 License: GPL-3