Merge pull request #301 from nf-core/81-call-empty-droplets
Fix issue 81, "call empty droplets"
fmalmeida authored Mar 18, 2024
2 parents d554870 + 2406600 commit 1043441
Showing 37 changed files with 682 additions and 174 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Update template to v2.13.1 ([#309](https://github.com/nf-core/scrnaseq/pull/309))
- Update to kallisto|bustools v0.28.2 ([#294](https://github.com/nf-core/scrnaseq/pull/294))
- Fix cellrangerarc matrix conversions and protocol selection ([#300](https://github.com/nf-core/scrnaseq/pull/300))
- Add new emptydrops calling module ([#301](https://github.com/nf-core/scrnaseq/pull/301))

## v2.5.1

52 changes: 52 additions & 0 deletions bin/emptydrops_cell_calling.R
@@ -0,0 +1,52 @@
#!/usr/bin/env Rscript
library("DropletUtils")
library("Matrix")

args <- commandArgs(trailingOnly=TRUE)

fn_mtx <- args[1]
fn_barcodes <- args[2]
fn_genes <- args[3]
outdir <- args[4]
aligner <- args[5]

# Read matrix/barcodes/genes
genes <- read.table(fn_genes, sep = '\t')
barcodes <- read.table(fn_barcodes, sep = '\t')
mtx <- readMM(fn_mtx)

get_name <- function(file) {
name <- as.character(basename(file))
name <- gsub('\\.gz$', '', name)
return(name)
}

# transpose matrices when required
# based on the code of 'mtx_to_seurat.R', only data from kallisto and alevin require transposition
print("Only kallisto and alevin have transposed matrices.")
if (aligner %in% c("kallisto", "alevin")) {
is_transposed <- TRUE
mtx <- t(mtx)
} else {
is_transposed <- FALSE
}


# Call empty drops
e.out <- emptyDrops(mtx)
is.cell <- e.out$FDR <= 0.01 # FDR is NA for barcodes emptyDrops() never tests; which() below drops them

# Slice matrix and barcodes
mtx_filtered <- mtx[, which(is.cell), drop = FALSE]
barcodes_filtered <- barcodes[which(is.cell), ]

# If the matrix was transposed earlier, transpose it back
if (is_transposed) {
mtx_filtered <- t(mtx_filtered)
print('Transposing back matrix.')
}

# Write output
writeMM(mtx_filtered, file.path(outdir, get_name(fn_mtx)))
write.table(barcodes_filtered, file = file.path(outdir, get_name(fn_barcodes)), col.names = FALSE, row.names = FALSE, sep = '\t', quote = FALSE)
write.table(genes, file = file.path(outdir, get_name(fn_genes)), col.names = FALSE, row.names = FALSE, sep = '\t', quote = FALSE)
42 changes: 30 additions & 12 deletions bin/mtx_to_h5ad.py
@@ -32,9 +32,13 @@ def _mtx_to_adata(
aligner: str,
):
adata = sc.read_mtx(mtx_file)
if (
aligner == "star"
): # for some reason star matrix comes transposed and doesn't fit when values are appended directly
# for some reason the star matrix comes transposed and doesn't fit when values are appended directly.
# the same is true for cellranger files (this path is only used when running with the custom emptydrops_filtered files;
# otherwise the cellranger .h5 files are used)
if aligner in [
"cellranger",
"star",
]:
adata = adata.transpose()

adata.obs_names = pd.read_csv(barcode_file, header=None, sep="\t")[0].values
@@ -57,22 +61,36 @@ def input_to_adata(
if verbose and (txp2gene or star_index):
print("Reading in {}".format(input_data))

if aligner == "cellranger":
#
# open main data
#
if aligner == "cellranger" and input_data.lower().endswith('.h5'):
adata = _10x_h5_to_adata(input_data, sample)
else:
adata = _mtx_to_adata(input_data, barcode_file, feature_file, sample, aligner)

#
# open gene information
#
if verbose and (txp2gene or star_index):
print("Reading in {}".format(txp2gene))

if txp2gene:
t2g = pd.read_table(txp2gene, header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2])
elif star_index:
t2g = pd.read_table(
f"{star_index}/geneInfo.tab", header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1]
)

if txp2gene or star_index:
if aligner == "cellranger" and not input_data.lower().endswith('.h5'):
#
# for the cellranger workflow we do not have a txp2gene file, so, when using this manual conversion path for empty drops,
# we need to take this information directly from the features.tsv file.
# by not using the .h5 file for conversion, we lose the two extra columns: feature_types and genome
#
t2g = pd.read_table(feature_file, header=None, names=["gene_id", "gene_symbol", "feature_types"], usecols=[0, 1, 2])
else:
if txp2gene:
t2g = pd.read_table(txp2gene, header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2])
elif star_index:
t2g = pd.read_table(
f"{star_index}/geneInfo.tab", header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1]
)

if txp2gene or star_index or (aligner == "cellranger" and not input_data.lower().endswith('.h5')):
t2g = t2g.drop_duplicates(subset="gene_id").set_index("gene_id")
adata.var["gene_symbol"] = t2g["gene_symbol"]

35 changes: 26 additions & 9 deletions bin/mtx_to_seurat.R
@@ -3,23 +3,40 @@ library(Seurat)

args <- commandArgs(trailingOnly=TRUE)

mtx_file <- args[1]
barcode_file <- args[2]
feature_file <- args[3]
out.file <- args[4]
aligner <- args[5]
mtx_file <- args[1]
barcode_file <- args[2]
feature_file <- args[3]
out.file <- args[4]
aligner <- args[5]
is_emptydrops <- args[6]

if (is_emptydrops == "--is_emptydrops") {
is_emptydrops <- TRUE
} else {
is_emptydrops <- FALSE
}

if(aligner %in% c("kallisto", "alevin")) {
if (aligner %in% c( "kallisto", "alevin" )) {
print("1")
# for kallisto and alevin, the features file contains only one column and matrix needs to be transposed
expression.matrix <- ReadMtx(
mtx = mtx_file, features = feature_file, cells = barcode_file, feature.column = 1, mtx.transpose = TRUE
)
} else {
expression.matrix <- ReadMtx(
mtx = mtx_file, features = feature_file, cells = barcode_file
)
if (aligner %in% c( "cellranger", "star" ) && is_emptydrops) {
print("2")
expression.matrix <- ReadMtx(
mtx = mtx_file, features = feature_file, cells = barcode_file, feature.column = 1
)
} else {
print("3")
expression.matrix <- ReadMtx(
mtx = mtx_file, features = feature_file, cells = barcode_file
)
}
}


seurat.object <- CreateSeuratObject(counts = expression.matrix)

dir.create(basename(dirname(out.file)), showWarnings = FALSE)
18 changes: 17 additions & 1 deletion conf/modules.config
@@ -29,6 +29,7 @@ process {
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: CUSTOM_DUMPSOFTWAREVERSIONS {
publishDir = [
path: { "${params.outdir}/pipeline_info" },
@@ -46,6 +47,20 @@ process {
]
}

if (!params.skip_emptydrops) {
withName: EMPTYDROPS_CELL_CALLING {
publishDir = [
path: { "${params.outdir}/${params.aligner}" },
mode: params.publish_dir_mode,
saveAs: { filename ->
if ( params.aligner == 'cellranger' ) "count/${meta.id}/${filename}"
else if ( params.aligner == 'kallisto' ) "${meta.id}.count/${filename}"
else "${meta.id}/${filename}"
}
]
}
}

withName: 'MTX_TO_H5AD|CONCAT_H5AD|MTX_TO_SEURAT' {
publishDir = [
path: { "${params.outdir}/${params.aligner}/mtx_conversions" },
@@ -205,11 +220,12 @@ if (params.aligner == 'kallisto') {
]
}
withName: KALLISTOBUSTOOLS_COUNT {
def kb_filter = (params.kb_filter) ? '--filter' : ''
publishDir = [
path: { "${params.outdir}/${params.aligner}" },
mode: params.publish_dir_mode
]
ext.args = "--workflow ${params.kb_workflow}"
ext.args = "--workflow ${params.kb_workflow} ${kb_filter}"
}
}
}
3 changes: 2 additions & 1 deletion conf/test.config
@@ -20,7 +20,8 @@ params {
max_time = '6.h'

// Input data
input = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/samplesheet-2-0.csv'
input = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/samplesheet-2-0.csv'
skip_emptydrops = true // module does not work on small dataset

// Genome references
fasta = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/reference/GRCm38.p6.genome.chr19.fa'
26 changes: 26 additions & 0 deletions docs/output.md
@@ -19,6 +19,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [Cellranger](#cellranger)
- [Cellranger ARC](#cellranger-arc)
- [UniverSC](#universc)
- [Custom emptydrops filter](#custom-emptydrops-filter)
- [Other output data](#other-output-data)
- [MultiQC](#multiqc)
- [Pipeline information](#pipeline-information)
@@ -128,6 +129,16 @@ Battenberg, K., Kelly, S.T., Ras, R.A., Hetherington, N.A., Hayashi, K., and Min

- Contains the mapped BAM files, filtered and unfiltered HDF5 matrices and output metrics created by the open-source implementation of Cell Ranger run via UniverSC

## Custom emptydrops filter

The pipeline also includes a module that performs empty-drops calling and filtering with a custom script built on the `bioconductor-dropletutils` library available on `bioconda`. The process is straightforward: the module takes a raw/unfiltered matrix file, performs empty-drops calling and filtering on it, and writes a new, filtered matrix file.

> Users can turn it off with `--skip_emptydrops`.

**Output directory: `results/${params.aligner}/emptydrops_filtered`**

- Contains the empty-drops-filtered matrices generated by the `bioconductor-dropletutils` custom script
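
For reference, the core of what the bundled `bin/emptydrops_cell_calling.R` script does is the `emptyDrops()` call from `DropletUtils`. The sketch below mirrors that logic under simplified assumptions; the file names are illustrative, and the FDR threshold of 0.01 matches the script:

```r
library(DropletUtils)
library(Matrix)

# Illustrative input names; the pipeline passes the aligner's raw matrix/barcodes files
mtx      <- readMM("matrix.mtx")                   # genes x cells
barcodes <- read.table("barcodes.tsv", sep = "\t")

# Test each barcode against the ambient RNA profile and keep likely cells
e.out   <- emptyDrops(mtx)
is.cell <- e.out$FDR <= 0.01   # FDR is NA for barcodes emptyDrops() never tests; which() drops them

mtx_filtered      <- mtx[, which(is.cell), drop = FALSE]
barcodes_filtered <- barcodes[which(is.cell), ]
```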

## Other output data

**Output directory: `results/reference_genome`**
@@ -143,6 +154,21 @@ Battenberg, K., Kelly, S.T., Ras, R.A., Hetherington, N.A., Hayashi, K., and Min
- `*_matrix.h5ad`
  - `.mtx` files converted to [AnnData](https://anndata.readthedocs.io/en/latest/) in `.h5ad` format, using the [scanpy package](https://scanpy.readthedocs.io/en/stable/).
  - One per sample, plus a single file with all samples concatenated together, `combined_matrix.h5ad`
- `*_matrix.rds`
  - `.mtx` files converted to the R-native RDS format, using the [Seurat package](https://github.com/satijalab/seurat)
  - One per sample

Because the pipeline produces matrices both directly from the aligners and from the custom empty-drops filtering module, the conversion modules were modified to distinguish raw/filtered matrices coming from the aligners themselves from matrices filtered by the custom empty-drops module. To avoid confusion, the converted files carry suffixes that record which input each file was generated from.

The conversion modules therefore generate files following the syntax **`*_{raw,filtered,custom_emptydrops_filter}_matrix.{h5ad,rds}`**, with the following meanings:

| suffix | meaning |
| :----------------------- | :--------------------------------------------------------------------------------------------------------------------------------------- |
| raw | Conversion of the raw/unprocessed matrix generated by the tool. It is also used for tools that generate only one matrix, such as alevin. |
| filtered | Conversion of the filtered/processed matrix generated by the tool |
| custom_emptydrops_filter | Conversion of the matrix that was generated by the new custom empty drops filter module |

> Some aligners, such as `alevin`, do not produce both raw and filtered matrices; when an aligner provides only one output, it is treated with the `raw` suffix. Other aligners, such as `kallisto`, have options that control whether both raw and filtered matrices or only one of them is produced. Be aware of this when using the tools.
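
As an illustration of this naming scheme, the per-sample RDS outputs can be loaded in R as sketched below (`SAMPLE` is a placeholder for one of your sample IDs, not a literal file name):

```r
library(Seurat)

# Placeholder file names following the *_{raw,filtered,custom_emptydrops_filter}_matrix.rds pattern
raw        <- readRDS("SAMPLE_raw_matrix.rds")
emptydrops <- readRDS("SAMPLE_custom_emptydrops_filter_matrix.rds")

# The emptyDrops-filtered object is expected to contain fewer cells than the raw one
ncol(raw)
ncol(emptydrops)
```
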
## MultiQC

19 changes: 12 additions & 7 deletions main.nf
@@ -17,20 +17,19 @@ nextflow.enable.dsl = 2
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

include { SCRNASEQ } from './workflows/scrnaseq'
include { SCRNASEQ } from './workflows/scrnaseq'
include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_scrnaseq_pipeline'
include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_scrnaseq_pipeline'

include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_scrnaseq_pipeline'

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
GENOME PARAMETER VALUES
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

params.fasta = getGenomeAttribute('fasta')
params.gtf = getGenomeAttribute('gtf')
// we cannot modify params here; we must load the files instead
ch_genome_fasta = params.genome ? file( getGenomeAttribute('fasta'), checkIfExists: true ) : []
ch_gtf = params.genome ? file( getGenomeAttribute('gtf'), checkIfExists: true ) : []

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -45,14 +44,18 @@ workflow NFCORE_SCRNASEQ {

take:
samplesheet // channel: samplesheet read in from --input
ch_genome_fasta
ch_gtf

main:

//
// WORKFLOW: Run pipeline
//
SCRNASEQ (
samplesheet
samplesheet,
ch_genome_fasta,
ch_gtf
)

emit:
@@ -86,7 +89,9 @@ workflow {
// WORKFLOW: Run main workflow
//
NFCORE_SCRNASEQ (
PIPELINE_INITIALISATION.out.samplesheet
PIPELINE_INITIALISATION.out.samplesheet,
ch_genome_fasta,
ch_gtf
)

//
4 changes: 2 additions & 2 deletions modules.json
@@ -7,7 +7,7 @@
"nf-core": {
"cellranger/count": {
"branch": "master",
"git_sha": "a2dfd9a0b2e192243695711c723d652959de39fc",
"git_sha": "92ca535c5a8c0fe89eb71e649ee536bd355ce4fc",
"installed_by": ["modules"]
},
"cellranger/mkgtf": {
@@ -52,7 +52,7 @@
},
"kallistobustools/count": {
"branch": "master",
"git_sha": "de8215983defba48cd81961d620a9e844f11c7e7",
"git_sha": "9d3e489286eead7dfe1010fd324904d8b698eca7",
"installed_by": ["modules"]
},
"kallistobustools/ref": {
4 changes: 2 additions & 2 deletions modules/local/concat_h5ad.nf
@@ -7,7 +7,7 @@ process CONCAT_H5AD {
'biocontainers/scanpy:1.7.2--pyhdfd78af_0' }"

input:
path h5ad
tuple val(input_type), path(h5ad)
path samplesheet

output:
@@ -20,7 +20,7 @@ process CONCAT_H5AD {
"""
concat_h5ad.py \\
--input $samplesheet \\
--out combined_matrix.h5ad \\
--out combined_${input_type}_matrix.h5ad \\
--suffix "_matrix.h5ad"
"""
