nf-core · fmalmeida · Mar 18, 2024 · Jan 31, 2024 · Feb 1, 2024 · Feb 2, 2024
diff --git a/bin/emptydrops_cell_calling.R b/bin/emptydrops_cell_calling.R
@@ -0,0 +1,49 @@
+#!/usr/bin/env Rscript
+library("DropletUtils")
+library("Matrix")
+
+args <- commandArgs(trailingOnly=TRUE)
+
+fn_mtx      <- args[1]
+fn_barcodes <- args[2]
+fn_genes    <- args[3]
+outdir      <- args[4]
+aligner     <- args[5]  # parsed but not used anywhere in this script
+
+# Read matrix/barcodes/genes. Quote and comment parsing are off so names with ', " or # stay intact
+genes    <- read.table(fn_genes,sep='\t',quote='',comment.char='')
+barcodes <- read.table(fn_barcodes,sep='\t',quote='',comment.char='')
+mtx      <- readMM(fn_mtx)
+
+get_name <- function(file) {
+    # Output file name: base name of `file` with a single trailing '.gz' removed, if present
+    name <- as.character(basename(file))
+    sub('\\.gz$', '', name)
+}
+
+# Barcodes must lie along the columns; when the barcode count differs from ncol(mtx),
+# flip the matrix and remember it so the original orientation can be restored on output
+needs_flip <- nrow(barcodes) != ncol(mtx)
+if (needs_flip) {
+    mtx <- t(mtx)
+    print('Matrix was tranposed.')
+}
+
+# Call empty drops
+droplet_stats <- emptyDrops(mtx)
+keep_idx <- which(droplet_stats$FDR <= 0.01)  # which() also discards NA FDR values
+
+# Keep only the called cells, in both the matrix columns and the barcode table
+mtx_filtered <- mtx[, keep_idx, drop = FALSE]
+barcodes_filtered <- barcodes[keep_idx, ]
+
+# Undo the earlier flip so the output has the same orientation as the input
+if (needs_flip) {
+    mtx_filtered <- t(mtx_filtered)
+    print('Transposing back matrix.')
+}
+
+# Write filtered matrix and barcodes plus the unchanged gene table into outdir
+writeMM(mtx_filtered, file.path(outdir, get_name(fn_mtx)))
+write.table(barcodes_filtered, file = file.path(outdir, get_name(fn_barcodes)), col.names = FALSE, row.names = FALSE, sep = '\t', quote = FALSE)
+write.table(genes, file = file.path(outdir, get_name(fn_genes)), col.names = FALSE, row.names = FALSE, sep = '\t', quote = FALSE)
diff --git a/bin/mtx_to_h5ad.py b/bin/mtx_to_h5ad.py
@@ -57,22 +57,36 @@ def input_to_adata(
     if verbose and (txp2gene or star_index):
         print("Reading in {}".format(input_data))
 
-    if aligner == "cellranger":
+    #
+    # open main data
+    #
+    if aligner == "cellranger" and input_data.lower().endswith('.h5'):
         adata = _10x_h5_to_adata(input_data, sample)
     else:
         adata = _mtx_to_adata(input_data, barcode_file, feature_file, sample, aligner)
 
+    #
+    # open gene information
+    #
     if verbose and (txp2gene or star_index):
         print("Reading in {}".format(txp2gene))
 
-    if txp2gene:
-        t2g = pd.read_table(txp2gene, header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2])
-    elif star_index:
-        t2g = pd.read_table(
-            f"{star_index}/geneInfo.tab", header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1]
-        )
-
-    if txp2gene or star_index:
+    if aligner == "cellranger" and not input_data.lower().endswith('.h5'):
+        #
+        # The cellranger workflow has no txp2gene file, so when converting the emptydrops-filtered .mtx output
+        # the gene information is read directly from the features.tsv file.
+        # Because the .h5 file is not used for this conversion, we lose two columns of information: feature_types and genome.
+        #
+        t2g = pd.read_table(feature_file, header=None, names=["gene_id", "gene_symbol", "feature_types"], usecols=[0, 1, 2])
+    else:
+        if txp2gene:
+            t2g = pd.read_table(txp2gene, header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2])
+        elif star_index:
+            t2g = pd.read_table(
+                f"{star_index}/geneInfo.tab", header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1]
+            )
+
+    if txp2gene or star_index or (aligner == "cellranger" and not input_data.lower().endswith('.h5')):
         t2g = t2g.drop_duplicates(subset="gene_id").set_index("gene_id")
         adata.var["gene_symbol"] = t2g["gene_symbol"]
 

diff --git a/bin/mtx_to_seurat.R b/bin/mtx_to_seurat.R
@@ -3,23 +3,40 @@ library(Seurat)
 
 args <- commandArgs(trailingOnly=TRUE)
 
-mtx_file     <- args[1]
-barcode_file <- args[2]
-feature_file <- args[3]
-out.file     <- args[4]
-aligner      <- args[5]
+mtx_file      <- args[1]
+barcode_file  <- args[2]
+feature_file  <- args[3]
+out.file      <- args[4]
+aligner       <- args[5]
+
+# Optional sixth argument: the literal flag "--is_emptydrops".
+# When it is omitted, args[6] is NA and `if (NA == "...")` aborts with
+# "missing value where TRUE/FALSE needed". identical() always yields a
+# single TRUE or FALSE: the flag gives TRUE, anything else (including a
+# missing argument) gives FALSE.
+is_emptydrops <- identical(args[6], "--is_emptydrops")
 
-if(aligner %in% c("kallisto", "alevin")) {
+if (aligner %in% c("kallisto", "alevin")) {
+    message("Reading matrix: single-column features, transposed (kallisto/alevin)")
     # for kallisto and alevin, the features file contains only one column and matrix needs to be transposed
     expression.matrix <- ReadMtx(
         mtx = mtx_file, features = feature_file, cells = barcode_file, feature.column = 1, mtx.transpose = TRUE
     )
 } else {
-    expression.matrix <- ReadMtx(
-        mtx = mtx_file, features = feature_file, cells = barcode_file
-    )
+    if (aligner %in% c("cellranger", "star") && is_emptydrops) {
+        message("Reading matrix: emptydrops-filtered output, features from column 1")
+        expression.matrix <- ReadMtx(
+            mtx = mtx_file, features = feature_file, cells = barcode_file, feature.column = 1
+        )
+    } else {
+        message("Reading matrix: ReadMtx defaults")
+        expression.matrix <- ReadMtx(
+            mtx = mtx_file, features = feature_file, cells = barcode_file
+        )
+    }
 }
 
+
 seurat.object <- CreateSeuratObject(counts = expression.matrix)
 
 dir.create(basename(dirname(out.file)), showWarnings = FALSE)

diff --git a/conf/modules.config b/conf/modules.config
@@ -28,6 +28,7 @@ process {
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
     }
+
     withName: CUSTOM_DUMPSOFTWAREVERSIONS {
         publishDir = [
             path: { "${params.outdir}/pipeline_info" },
@@ -45,6 +46,20 @@ process {
         ]
     }
 
+    if (!params.skip_emptydrops) {
+        withName: EMPTYDROPS_CELL_CALLING { // publish under <outdir>/<aligner>/ in an aligner-specific per-sample sub-folder
+            publishDir = [
+                path: { "${params.outdir}/${params.aligner}" },
+                mode: params.publish_dir_mode,
+                saveAs: { filename ->
+                    params.aligner == 'cellranger' ? "count/${meta.id}/${filename}" :
+                    params.aligner == 'kallisto'   ? "${meta.id}.count/${filename}" :
+                    "${meta.id}/${filename}"
+                }
+            ]
+        }
+    }
+
     withName: 'MTX_TO_H5AD|CONCAT_H5AD|MTX_TO_SEURAT' {
         publishDir = [
             path: { "${params.outdir}/${params.aligner}/mtx_conversions" },
@@ -204,11 +219,12 @@ if (params.aligner == 'kallisto') {
             ]
         }
         withName: KALLISTOBUSTOOLS_COUNT {
+            def kb_filter = (params.kb_filter) ? '--filter' : '' // append '--filter' to ext.args only when params.kb_filter is truthy
             publishDir = [
                 path: { "${params.outdir}/${params.aligner}" },
                 mode: params.publish_dir_mode
             ]
-            ext.args = "--workflow ${params.kb_workflow}"
+            ext.args = "--workflow ${params.kb_workflow} ${kb_filter}"
         }
     }
 }
diff --git a/conf/test.config b/conf/test.config
@@ -20,7 +20,8 @@ params {
     max_time   = '6.h'
 
     // Input data
-    input        = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/samplesheet-2-0.csv'
+    input           = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/samplesheet-2-0.csv'
+    skip_emptydrops = true // module does not work on small dataset
 
     // Genome references
     fasta        = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/reference/GRCm38.p6.genome.chr19.fa'

diff --git a/modules/local/concat_h5ad.nf b/modules/local/concat_h5ad.nf
@@ -7,7 +7,7 @@ process CONCAT_H5AD {
         'biocontainers/scanpy:1.7.2--pyhdfd78af_0' }"
 
     input:
-    path h5ad
+    tuple val(input_type), path(h5ad)
     path samplesheet
 
     output:
@@ -20,7 +20,7 @@ process CONCAT_H5AD {
     """
     concat_h5ad.py \\
         --input $samplesheet \\
-        --out combined_matrix.h5ad \\
+        --out combined_${input_type}_matrix.h5ad \\
         --suffix "_matrix.h5ad"
     """
 

diff --git a/modules/local/emptydrops.nf b/modules/local/emptydrops.nf
@@ -0,0 +1,82 @@
+process EMPTYDROPS_CELL_CALLING {
+    tag "$meta.id"
+    label 'process_medium'
+    // Runs emptydrops_cell_calling.R on one sample's raw count matrix and emits the emptydrops_filtered/ directory
+    conda "bioconda::bioconductor-dropletutils"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/bioconductor-dropletutils:1.18.0--r42hf17093f_1' :
+        'quay.io/biocontainers/bioconductor-dropletutils:1.18.0--r42hf17093f_1' }"
+
+    input:
+    // Inputs from the cellranger nf-core module do not come in a single sample dir:
+    // for each sample, the sub-folders and files are staged directly as an array.
+    tuple val(meta), path(inputs)
+
+    output:
+    tuple val(meta), path("emptydrops_filtered"), emit: filtered_matrices
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    if (params.aligner == "cellranger") {
+
+        matrix   = "raw_feature_bc_matrix/matrix.mtx.gz"
+        barcodes = "raw_feature_bc_matrix/barcodes.tsv.gz"
+        features = "raw_feature_bc_matrix/features.tsv.gz"
+
+    } else if (params.aligner == "kallisto") {
+
+        matrix     = "counts_unfiltered/*.mtx"
+        barcodes   = "counts_unfiltered/*.barcodes.txt"
+        features   = "counts_unfiltered/*.genes.txt"
+
+    } else if (params.aligner == "alevin") {
+
+        matrix   = "*_alevin_results/af_quant/alevin/quants_mat.mtx"
+        barcodes = "*_alevin_results/af_quant/alevin/quants_mat_rows.txt"
+        features = "*_alevin_results/af_quant/alevin/quants_mat_cols.txt"
+
+    } else if (params.aligner == 'star') {
+
+        matrix   = "raw/matrix.mtx.gz"
+        barcodes = "raw/barcodes.tsv.gz"
+        features = "raw/features.tsv.gz"
+
+    }
+
+    // Run the cell-calling script on the raw matrix (once per splice type for non-standard kallisto
+    // workflows); the filtered matrix, barcodes and features are written to emptydrops_filtered/.
+    // The trailing `0` argument is not read by emptydrops_cell_calling.R.
+    if (params.aligner == 'kallisto' && params.kb_workflow != 'standard')
+    """
+    mkdir emptydrops_filtered/
+    # convert file types
+    for splice_type in spliced unspliced ; do
+        emptydrops_cell_calling.R \\
+            *count/counts_unfiltered/\${splice_type}.mtx \\
+            *count/counts_unfiltered/\${splice_type}.barcodes.txt \\
+            *count/counts_unfiltered/\${splice_type}.genes.txt \\
+            emptydrops_filtered \\
+            ${params.aligner} \\
+            0
+    done
+    """
+
+    else
+    """
+    mkdir emptydrops_filtered/
+    emptydrops_cell_calling.R \\
+        $matrix \\
+        $barcodes \\
+        $features \\
+        emptydrops_filtered \\
+        ${params.aligner} \\
+        0
+    """
+
+    stub:
+    """
+    mkdir -p emptydrops_filtered && touch emptydrops_filtered/empty_file
+    """
+}
diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf
@@ -15,33 +15,59 @@ process MTX_TO_H5AD {
     path star_index
 
     output:
-    path "${meta.id}/*h5ad", emit: h5ad
-    path "${meta.id}/*", emit: counts
-    path  "versions.yml", emit: versions
+    tuple val(input_type), path("${meta.id}/*h5ad") , emit: h5ad
+    path  "versions.yml"                            , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
-    // def file paths for aligners (except cellranger)
-    if (params.aligner == 'kallisto') {
-        mtx_matrix   = "*count/counts_unfiltered/*.mtx"
-        barcodes_tsv = "*count/counts_unfiltered/*.barcodes.txt"
-        features_tsv = "*count/counts_unfiltered/*.genes.txt"
+    // check input type of inputs
+    input_type = (inputs.toUriString().contains('unfiltered') || inputs.toUriString().contains('raw')) ? 'raw' : 'filtered'
+    if ( params.aligner == 'alevin' ) { input_type = 'raw' } // alevin has its own filtering methods and mostly output a single mtx, raw here means, the base tool output
+    if (inputs.toUriString().contains('emptydrops')) { input_type = 'custom_emptydrops_filter' }
+
+    // Define file paths for the aligners. Cellranger output is normally converted from its .h5 files.
+    // However, the emptydrops call always generates .mtx files, so cellranger 'emptydrops' results need their own paths.
+    if (params.aligner in [ 'cellranger', 'cellrangerarc' ] && input_type == 'custom_emptydrops_filter') {
+
+        aligner      = 'cellranger'
+        txp2gene     = ''
+        star_index   = ''
+        mtx_matrix   = "emptydrops_filtered/matrix.mtx"
+        barcodes_tsv = "emptydrops_filtered/barcodes.tsv"
+        features_tsv = "emptydrops_filtered/features.tsv"
+
+    } else if (params.aligner == 'kallisto') {
+
+        kb_pattern   = (input_type == 'raw') ? 'un' : ''
+        mtx_dir      = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : "counts_${kb_pattern}filtered"
+        mtx_matrix   = "${mtx_dir}/*.mtx"
+        barcodes_tsv = "${mtx_dir}/*.barcodes.txt"
+        features_tsv = "${mtx_dir}/*.genes.txt"
+
     } else if (params.aligner == 'alevin') {
-        mtx_matrix   = "*_alevin_results/af_quant/alevin/quants_mat.mtx"
-        barcodes_tsv = "*_alevin_results/af_quant/alevin/quants_mat_rows.txt"
-        features_tsv = "*_alevin_results/af_quant/alevin/quants_mat_cols.txt"
+
+        // alevin does not have filtered/unfiltered results
+        mtx_dir      = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : '*_alevin_results/af_quant/alevin'
+        mtx_matrix   = "${mtx_dir}/quants_mat.mtx"
+        barcodes_tsv = "${mtx_dir}/quants_mat_rows.txt"
+        features_tsv = "${mtx_dir}/quants_mat_cols.txt"
+
     } else if (params.aligner == 'star') {
-        mtx_matrix   = "*.Solo.out/Gene*/filtered/matrix.mtx.gz"
-        barcodes_tsv = "*.Solo.out/Gene*/filtered/barcodes.tsv.gz"
-        features_tsv = "*.Solo.out/Gene*/filtered/features.tsv.gz"
+
+        mtx_dir      = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : "${input_type}"
+        suffix       = (input_type == 'custom_emptydrops_filter') ? '' : '.gz'
+        mtx_matrix   = "${mtx_dir}/matrix.mtx${suffix}"
+        barcodes_tsv = "${mtx_dir}/barcodes.tsv${suffix}"
+        features_tsv = "${mtx_dir}/features.tsv${suffix}"
+
     }
 
     //
     // run script
     //
-    if (params.aligner in [ 'cellranger', 'cellrangerarc' ])
+    if (params.aligner in [ 'cellranger', 'cellrangerarc' ] && input_type != 'custom_emptydrops_filter')
     """
     # convert file types
     mtx_to_h5ad.py \\
@@ -79,7 +105,7 @@ process MTX_TO_H5AD {
         --feature $features_tsv \\
         --txp2gene ${txp2gene} \\
         --star_index ${star_index} \\
-        --out ${meta.id}/${meta.id}_matrix.h5ad
+        --out ${meta.id}/${meta.id}_${input_type}_matrix.h5ad
     """
 
     stub: