From 6d818948e0b32642fa33571a875f64b65c12a3f5 Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Wed, 31 Jan 2024 11:42:31 +0100 Subject: [PATCH 01/30] minimum inclusion for module tested only with kallisto aligner (both with and without automated kallisto filtering with bustools --filter parameter) --- bin/emptydrops_cell_calling.R | 49 +++++++++++ bin/mtx_to_h5ad.py | 32 ++++++-- bin/mtx_to_seurat.R | 35 ++++++-- conf/modules.config | 18 +++- conf/test.config | 3 +- modules/local/concat_h5ad.nf | 4 +- modules/local/emptydrops.nf | 82 +++++++++++++++++++ modules/local/mtx_to_h5ad.nf | 57 +++++++++---- modules/local/mtx_to_seurat.nf | 60 ++++++++++---- .../nf-core/kallistobustools/count/main.nf | 8 +- nextflow.config | 16 ++-- subworkflows/local/kallisto_bustools.nf | 3 +- subworkflows/local/mtx_conversion.nf | 4 +- workflows/scrnaseq.nf | 43 +++++++--- 14 files changed, 337 insertions(+), 77 deletions(-) create mode 100755 bin/emptydrops_cell_calling.R create mode 100644 modules/local/emptydrops.nf diff --git a/bin/emptydrops_cell_calling.R b/bin/emptydrops_cell_calling.R new file mode 100755 index 00000000..989c2b34 --- /dev/null +++ b/bin/emptydrops_cell_calling.R @@ -0,0 +1,49 @@ +#!/usr/bin/env Rscript +library("DropletUtils") +library("Matrix") + +args <- commandArgs(trailingOnly=TRUE) + +fn_mtx <- args[1] +fn_barcodes <- args[2] +fn_genes <- args[3] +outdir <- args[4] +aligner <- args[5] + +# Read matrix/barcodes/genes +genes <- read.table(fn_genes,sep='\t') +barcodes <- read.table(fn_barcodes,sep='\t') +mtx <- readMM(fn_mtx) + +get_name <- function(file) { + name <- as.character(basename(file)) + name <- gsub('\\.gz', '', name) + return(name) +} + +# Check if barcodes are in columns, if not, transpose mtx +is_transposed<-FALSE +if (dim(barcodes)[1]!=dim(mtx)[2]){ + mtx<-t(mtx) + is_transposed<-TRUE + print('Matrix was tranposed.') +} + +# Call empty drops +e.out <- emptyDrops(mtx) +is.cell <- e.out$FDR <= 0.01 + +# Slice matrix and barcodes +mtx_filtered <-mtx[,which(is.cell),drop=FALSE] +barcodes_filtered<-barcodes[which(is.cell),] + +# If matrix was transposed early, need to transpose back +if (is_transposed){ + mtx_filtered<-t(mtx_filtered) + print('Transposing back matrix.') +} + +# Write output +writeMM(mtx_filtered,file.path(outdir,get_name(fn_mtx))) +write.table(barcodes_filtered,file=file.path(outdir,get_name(fn_barcodes)),col.names=FALSE,row.names=FALSE,sep='\t',quote=FALSE) +write.table(genes,file=file.path(outdir,get_name(fn_genes)),col.names=FALSE,row.names=FALSE,sep='\t',quote=FALSE) diff --git a/bin/mtx_to_h5ad.py b/bin/mtx_to_h5ad.py index 3282122d..cdcc0c6c 100755 --- a/bin/mtx_to_h5ad.py +++ b/bin/mtx_to_h5ad.py @@ -57,22 +57,36 @@ def input_to_adata( if verbose and (txp2gene or star_index): print("Reading in {}".format(input_data)) - if aligner == "cellranger": + # + # open main data + # + if aligner == "cellranger" and input_data.lower().endswith('.h5'): adata = _10x_h5_to_adata(input_data, sample) else: adata = _mtx_to_adata(input_data, barcode_file, feature_file, sample, aligner) + # + # open gene information + # if verbose and (txp2gene or star_index): print("Reading in {}".format(txp2gene)) - if txp2gene: - t2g = pd.read_table(txp2gene, header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2]) - elif star_index: - t2g = pd.read_table( - f"{star_index}/geneInfo.tab", header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1] - ) - - if txp2gene or star_index: + if aligner == "cellranger" and not input_data.lower().endswith('.h5'): + # + # for cellranger workflow, we do not have a txp2gene file, so, when using this normal/manual function for empty drops + # we need to provide this information coming directly from the features.tsv file + # by not using the .h5 file for conversion, we loose the two col information: feature_types and genome + # + t2g = pd.read_table(feature_file, header=None, names=["gene_id", "gene_symbol", "feature_types"], usecols=[0, 1, 2]) + else: + if txp2gene: + t2g = pd.read_table(txp2gene, header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2]) + elif star_index: + t2g = pd.read_table( + f"{star_index}/geneInfo.tab", header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1] + ) + + if txp2gene or star_index or (aligner == "cellranger" and not input_data.lower().endswith('.h5')): t2g = t2g.drop_duplicates(subset="gene_id").set_index("gene_id") adata.var["gene_symbol"] = t2g["gene_symbol"] diff --git a/bin/mtx_to_seurat.R b/bin/mtx_to_seurat.R index f2680838..99ce2f73 100755 --- a/bin/mtx_to_seurat.R +++ b/bin/mtx_to_seurat.R @@ -3,23 +3,40 @@ library(Seurat) args <- commandArgs(trailingOnly=TRUE) -mtx_file <- args[1] -barcode_file <- args[2] -feature_file <- args[3] -out.file <- args[4] -aligner <- args[5] +mtx_file <- args[1] +barcode_file <- args[2] +feature_file <- args[3] +out.file <- args[4] +aligner <- args[5] +is_emptydrops <- args[6] + +if (is_emptydrops == "--is_emptydrops") { + is_emptydrops <- TRUE +} else{ + is_emptydrops <- FALSE +} -if(aligner %in% c("kallisto", "alevin")) { +if (aligner %in% c( "kallisto", "alevin" )) { + print("1") # for kallisto and alevin, the features file contains only one column and matrix needs to be transposed expression.matrix <- ReadMtx( mtx = mtx_file, features = feature_file, cells = barcode_file, feature.column = 1, mtx.transpose = TRUE ) } else { - expression.matrix <- ReadMtx( - mtx = mtx_file, features = feature_file, cells = barcode_file - ) + if (aligner %in% c( "cellranger", "star" ) && is_emptydrops) { + print("2") + expression.matrix <- ReadMtx( + mtx = mtx_file, features = feature_file, cells = barcode_file, feature.column = 1 + ) + } else{ + print("3") + expression.matrix <- ReadMtx( + mtx = mtx_file, features = feature_file, cells = barcode_file + ) + } } + seurat.object <- CreateSeuratObject(counts = expression.matrix) dir.create(basename(dirname(out.file)), showWarnings = FALSE) diff --git a/conf/modules.config b/conf/modules.config index 5813926a..a84247e1 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -28,6 +28,7 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, @@ -45,6 +46,20 @@ process { ] } + if (!params.skip_emptydrops) { + withName: EMPTYDROPS_CELL_CALLING { + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> + if ( params.aligner == 'cellranger' ) "count/${meta.id}/${filename}" + else if ( params.aligner == 'kallisto' ) "${meta.id}.count/${filename}" + else "${meta.id}/${filename}" + } + ] + } + } + withName: 'MTX_TO_H5AD|CONCAT_H5AD|MTX_TO_SEURAT' { publishDir = [ path: { "${params.outdir}/${params.aligner}/mtx_conversions" }, @@ -204,11 +219,12 @@ if (params.aligner == 'kallisto') { ] } withName: KALLISTOBUSTOOLS_COUNT { + def kb_filter = (params.kb_filter) ? '--filter' : '' publishDir = [ path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode ] - ext.args = "--workflow ${params.kb_workflow}" + ext.args = "--workflow ${params.kb_workflow} ${kb_filter}" } } } diff --git a/conf/test.config b/conf/test.config index 45ee54c8..08ab1b69 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,7 +20,8 @@ params { max_time = '6.h' // Input data - input = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/samplesheet-2-0.csv' + input = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/samplesheet-2-0.csv' + skip_emptydrops = true // module does not work on small dataset // Genome references fasta = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/reference/GRCm38.p6.genome.chr19.fa' diff --git a/modules/local/concat_h5ad.nf b/modules/local/concat_h5ad.nf index 96920f9e..cd08cbbe 100644 --- a/modules/local/concat_h5ad.nf +++ b/modules/local/concat_h5ad.nf @@ -7,7 +7,7 @@ process CONCAT_H5AD { 'biocontainers/scanpy:1.7.2--pyhdfd78af_0' }" input: - path h5ad + tuple val(input_type), path(h5ad) path samplesheet output: @@ -20,7 +20,7 @@ process CONCAT_H5AD { """ concat_h5ad.py \\ --input $samplesheet \\ - --out combined_matrix.h5ad \\ + --out combined_${input_type}_matrix.h5ad \\ --suffix "_matrix.h5ad" """ diff --git a/modules/local/emptydrops.nf b/modules/local/emptydrops.nf new file mode 100644 index 00000000..0bc03026 --- /dev/null +++ b/modules/local/emptydrops.nf @@ -0,0 +1,82 @@ +process EMPTYDROPS_CELL_CALLING { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::bioconductor-dropletutils" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-dropletutils:1.18.0--r42hf17093f_1' : + 'quay.io/biocontainers/bioconductor-dropletutils:1.18.0--r42hf17093f_1' }" + + input: + // inputs from cellranger nf-core module does not come in a single sample dir + // for each sample, the sub-folders and files come directly in array. + tuple val(meta), path(inputs) + + output: + tuple val(meta), path("emptydrops_filtered"), emit: filtered_matrices + + when: + task.ext.when == null || task.ext.when + + script: + if (params.aligner == "cellranger") { + + matrix = "raw_feature_bc_matrix/matrix.mtx.gz" + barcodes = "raw_feature_bc_matrix/barcodes.tsv.gz" + features = "raw_feature_bc_matrix/features.tsv.gz" + + } else if (params.aligner == "kallisto") { + + matrix = "counts_unfiltered/*.mtx" + barcodes = "counts_unfiltered/*.barcodes.txt" + features = "counts_unfiltered/*.genes.txt" + + } else if (params.aligner == "alevin") { + + matrix = "*_alevin_results/af_quant/alevin/quants_mat.mtx" + barcodes = "*_alevin_results/af_quant/alevin/quants_mat_rows.txt" + features = "*_alevin_results/af_quant/alevin/quants_mat_cols.txt" + + } else if (params.aligner == 'star') { + + matrix = "*.Solo.out/Gene*/raw/matrix.mtx.gz" + barcodes = "*.Solo.out/Gene*/raw/barcodes.tsv.gz" + features = "*.Solo.out/Gene*/raw/features.tsv.gz" + + } + + // + // run script + // + if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') + """ + mkdir emptydrops_filtered/ + # convert file types + for splice_type in spliced unspliced ; do + emptydrops_cell_calling.R \\ + *count/counts_unfiltered/\${splice_type}.mtx \\ + *count/counts_unfiltered/\${splice_type}.barcodes.txt \\ + *count/counts_unfiltered/\${splice_type}.genes.txt \\ + emptydrops_filtered \\ + ${params.aligner} \\ + 0 + done + """ + + else + """ + mkdir emptydrops_filtered/ + emptydrops_cell_calling.R \\ + $matrix \\ + $barcodes \\ + $features \\ + emptydrops_filtered \\ + ${params.aligner} \\ + 0 + """ + + stub: + """ + touch emptydrops_filtered/* + """ +} diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf index 84d98608..de5e01e8 100644 --- a/modules/local/mtx_to_h5ad.nf +++ b/modules/local/mtx_to_h5ad.nf @@ -15,33 +15,58 @@ process MTX_TO_H5AD { path star_index output: - path "${meta.id}/*h5ad", emit: h5ad - path "${meta.id}/*", emit: counts - path "versions.yml", emit: versions + tuple val(input_type), path("${meta.id}/*h5ad") , emit: h5ad + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: - // def file paths for aligners (except cellranger) - if (params.aligner == 'kallisto') { - mtx_matrix = "*count/counts_unfiltered/*.mtx" - barcodes_tsv = "*count/counts_unfiltered/*.barcodes.txt" - features_tsv = "*count/counts_unfiltered/*.genes.txt" + // check input type of inputs + input_type = (inputs.toUriString().contains('unfiltered') || inputs.toUriString().contains('raw')) ? 'raw' : 'filtered' + if (inputs.toUriString().contains('emptydrops')) { input_type = 'emptydrops' } + + // def file paths for aligners. Cellranger is normally converted with the .h5 files + // However, the emptydrops call, always generate .mtx files, thus, cellranger 'emptydrops' required a parsing + if (params.aligner in [ 'cellranger', 'cellrangerarc' ] && input_type == 'emptydrops') { + + aligner = 'cellranger' + txp2gene = '' + star_index = '' + mtx_matrix = "emptydrops_filtered/matrix.mtx" + barcodes_tsv = "emptydrops_filtered/barcodes.tsv" + features_tsv = "emptydrops_filtered/features.tsv" + + } else if (params.aligner == 'kallisto') { + + kb_pattern = (input_type == 'raw') ? 'un' : '' + mtx_dir = (input_type == 'emptydrops') ? 'emptydrops_filtered' : "counts_${kb_pattern}filtered" + mtx_matrix = "${mtx_dir}/*.mtx" + barcodes_tsv = "${mtx_dir}/*.barcodes.txt" + features_tsv = "${mtx_dir}/*.genes.txt" + } else if (params.aligner == 'alevin') { - mtx_matrix = "*_alevin_results/af_quant/alevin/quants_mat.mtx" - barcodes_tsv = "*_alevin_results/af_quant/alevin/quants_mat_rows.txt" - features_tsv = "*_alevin_results/af_quant/alevin/quants_mat_cols.txt" + + // alevin does not have filtered/unfiltered results + mtx_dir = (input_type == 'emptydrops') ? 'emptydrops_filtered' : '*_alevin_results/af_quant/alevin' + mtx_matrix = "${mtx_dir}/quants_mat.mtx" + barcodes_tsv = "${mtx_dir}/quants_mat_rows.txt" + features_tsv = "${mtx_dir}/quants_mat_cols.txt" + } else if (params.aligner == 'star') { - mtx_matrix = "*.Solo.out/Gene*/filtered/matrix.mtx.gz" - barcodes_tsv = "*.Solo.out/Gene*/filtered/barcodes.tsv.gz" - features_tsv = "*.Solo.out/Gene*/filtered/features.tsv.gz" + + mtx_dir = (input_type == 'emptydrops') ? 'emptydrops_filtered' : "*.Solo.out/Gene*/${input_type}" + suffix = (input_type == 'emptydrops') ? '' : '.gz' + mtx_matrix = "${mtx_dir}/matrix.mtx${suffix}" + barcodes_tsv = "${mtx_dir}/barcodes.tsv${suffix}" + features_tsv = "${mtx_dir}/features.tsv${suffix}" + } // // run script // - if (params.aligner in [ 'cellranger', 'cellrangerarc' ]) + if (params.aligner in [ 'cellranger', 'cellrangerarc' ] && input_type != 'emptydrops') """ # convert file types mtx_to_h5ad.py \\ @@ -79,7 +104,7 @@ process MTX_TO_H5AD { --feature $features_tsv \\ --txp2gene ${txp2gene} \\ --star_index ${star_index} \\ - --out ${meta.id}/${meta.id}_matrix.h5ad + --out ${meta.id}/${meta.id}_${input_type}_matrix.h5ad """ stub: diff --git a/modules/local/mtx_to_seurat.nf b/modules/local/mtx_to_seurat.nf index d83575a4..212b2cbf 100644 --- a/modules/local/mtx_to_seurat.nf +++ b/modules/local/mtx_to_seurat.nf @@ -19,23 +19,49 @@ process MTX_TO_SEURAT { script: def aligner = params.aligner + + // check input type of inputs + def input_type = (inputs.toUriString().contains('unfiltered') || inputs.toUriString().contains('raw')) ? 'raw' : 'filtered' + if (inputs.toUriString().contains('emptydrops')) { input_type = 'emptydrops' } + def is_emptydrops = (input_type == 'emptydrops') ? '--is_emptydrops' : '0' + + // def file paths for aligners. Cellranger is normally converted with the .h5 files + // However, the emptydrops call, always generate .mtx files, thus, cellranger 'emptydrops' required a parsing if (params.aligner in [ 'cellranger', 'cellrangerarc' ]) { - matrix = "matrix.mtx.gz" - barcodes = "barcodes.tsv.gz" - features = "features.tsv.gz" - } else if (params.aligner == "kallisto") { - matrix = "*count/counts_unfiltered/*.mtx" - barcodes = "*count/counts_unfiltered/*.barcodes.txt" - features = "*count/counts_unfiltered/*.genes.txt" + + mtx_dir = (input_type == 'emptydrops') ? 'emptydrops_filtered' : "${input_type}_feature_bc_matrix" + matrix = "${mtx_dir}/matrix.mtx*" + barcodes = "${mtx_dir}/barcodes.tsv*" + features = "${mtx_dir}/features.tsv*" + + } else if (params.aligner == 'kallisto') { + + kb_pattern = (input_type == 'raw') ? 'un' : '' + mtx_dir = (input_type == 'emptydrops') ? 'emptydrops_filtered' : "counts_${kb_pattern}filtered" + matrix = "${mtx_dir}/*.mtx" + barcodes = "${mtx_dir}/*.barcodes.txt" + features = "${mtx_dir}/*.genes.txt" + } else if (params.aligner == "alevin") { - matrix = "*_alevin_results/af_quant/alevin/quants_mat.mtx" - barcodes = "*_alevin_results/af_quant/alevin/quants_mat_rows.txt" - features = "*_alevin_results/af_quant/alevin/quants_mat_cols.txt" + + mtx_dir = (input_type == 'emptydrops') ? 'emptydrops_filtered' : '*_alevin_results/af_quant/alevin' + matrix = "${mtx_dir}/quants_mat.mtx" + barcodes = "${mtx_dir}/quants_mat_rows.txt" + features = "${mtx_dir}/quants_mat_cols.txt" + } else if (params.aligner == 'star') { - matrix = "*.Solo.out/Gene*/filtered/matrix.mtx.gz" - barcodes = "*.Solo.out/Gene*/filtered/barcodes.tsv.gz" - features = "*.Solo.out/Gene*/filtered/features.tsv.gz" + + mtx_dir = (input_type == 'emptydrops') ? 'emptydrops_filtered' : "*.Solo.out/Gene*/${input_type}" + suffix = (input_type == 'emptydrops') ? '' : '.gz' + matrix = "${mtx_dir}/matrix.mtx${suffix}" + barcodes = "${mtx_dir}/barcodes.tsv${suffix}" + features = "${mtx_dir}/features.tsv${suffix}" + } + + // + // run script + // """ mkdir ${meta.id} """ @@ -49,7 +75,8 @@ process MTX_TO_SEURAT { *count/counts_unfiltered/\${input_type}.barcodes.txt \\ *count/counts_unfiltered/\${input_type}.genes.txt \\ ${meta.id}/${meta.id}_\${input_type}_matrix.rds \\ - ${aligner} + ${aligner} \\ + ${is_emptydrops} done """ @@ -59,8 +86,9 @@ process MTX_TO_SEURAT { $matrix \\ $barcodes \\ $features \\ - ${meta.id}/${meta.id}_matrix.rds \\ - ${aligner} + ${meta.id}/${meta.id}_${input_type}_matrix.rds \\ + ${aligner} \\ + ${is_emptydrops} """ stub: diff --git a/modules/nf-core/kallistobustools/count/main.nf b/modules/nf-core/kallistobustools/count/main.nf index 036bb35d..f29af40b 100644 --- a/modules/nf-core/kallistobustools/count/main.nf +++ b/modules/nf-core/kallistobustools/count/main.nf @@ -16,9 +16,11 @@ process KALLISTOBUSTOOLS_COUNT { val technology output: - tuple val(meta), path ("*.count"), emit: count - path "versions.yml" , emit: versions - path "*.count/*/*.mtx" , emit: matrix //Ensure that kallisto finished and produced outputs + tuple val(meta), path ("*.count") , emit: count + tuple val(meta), path ("*.count/counts_unfiltered"), emit: raw_counts // TODO: Add to nf-coew/modules before merging PR + tuple val(meta), path ("*.count/counts_filtered") , emit: filtered_counts, optional: true // TODO: Add to nf-coew/modules before merging PR + path "versions.yml" , emit: versions + path "*.count/*/*.mtx" , emit: matrix //Ensure that kallisto finished and produced outputs when: task.ext.when == null || task.ext.when diff --git a/nextflow.config b/nextflow.config index 7af20f8c..bf3133fa 100644 --- a/nextflow.config +++ b/nextflow.config @@ -26,10 +26,11 @@ params { txp2gene = null salmon_index = null - // kallist bustools parameters + // kallisto bustools parameters kallisto_gene_map = null kallisto_index = null kb_workflow = "standard" + kb_filter = false // STARsolo parameters star_index = null @@ -38,15 +39,18 @@ params { star_feature = "Gene" // Cellranger parameters - cellranger_index = null + cellranger_index = null // Cellranger ARC parameters - motifs = null - cellrangerarc_config = null + motifs = null + cellrangerarc_config = null cellrangerarc_reference = null - // UniverSC paramaters - universc_index = null + // UniverSC parameters + universc_index = null + + // Emptydrops parameters + skip_emptydrops = false // Template Boilerplate options skip_multiqc = false diff --git a/subworkflows/local/kallisto_bustools.nf b/subworkflows/local/kallisto_bustools.nf index 3210e47a..3b4b6b82 100644 --- a/subworkflows/local/kallisto_bustools.nf +++ b/subworkflows/local/kallisto_bustools.nf @@ -66,7 +66,8 @@ workflow KALLISTO_BUSTOOLS { emit: ch_versions counts = KALLISTOBUSTOOLS_COUNT.out.count + raw_counts = KALLISTOBUSTOOLS_COUNT.out.raw_counts + filtered_counts = KALLISTOBUSTOOLS_COUNT.out.filtered_counts txp2gene = txp2gene.collect() - } diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf index 958da400..8d1caee4 100644 --- a/subworkflows/local/mtx_conversion.nf +++ b/subworkflows/local/mtx_conversion.nf @@ -35,7 +35,7 @@ workflow MTX_CONVERSION { // Concat sample-specific h5ad in one // CONCAT_H5AD ( - MTX_TO_H5AD.out.h5ad.collect(), // gather all sample-specific files + MTX_TO_H5AD.out.h5ad.groupTuple(), // gather all sample-specific files / per type samplesheet ) @@ -51,6 +51,6 @@ workflow MTX_CONVERSION { emit: ch_versions - counts = MTX_TO_H5AD.out.counts + // counts = MTX_TO_H5AD.out.counts was this ever used? } diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 25740a8e..596ce419 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -37,16 +37,17 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' -include { FASTQC_CHECK } from '../subworkflows/local/fastqc' -include { KALLISTO_BUSTOOLS } from '../subworkflows/local/kallisto_bustools' -include { SCRNASEQ_ALEVIN } from '../subworkflows/local/alevin' -include { STARSOLO } from '../subworkflows/local/starsolo' -include { CELLRANGER_ALIGN } from "../subworkflows/local/align_cellranger" -include { CELLRANGERARC_ALIGN } from "../subworkflows/local/align_cellrangerarc" -include { UNIVERSC_ALIGN } from "../subworkflows/local/align_universc" -include { MTX_CONVERSION } from "../subworkflows/local/mtx_conversion" -include { GTF_GENE_FILTER } from '../modules/local/gtf_gene_filter' +include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { FASTQC_CHECK } from '../subworkflows/local/fastqc' +include { KALLISTO_BUSTOOLS } from '../subworkflows/local/kallisto_bustools' +include { SCRNASEQ_ALEVIN } from '../subworkflows/local/alevin' +include { STARSOLO } from '../subworkflows/local/starsolo' +include { CELLRANGER_ALIGN } from "../subworkflows/local/align_cellranger" +include { CELLRANGERARC_ALIGN } from "../subworkflows/local/align_cellrangerarc" +include { UNIVERSC_ALIGN } from "../subworkflows/local/align_universc" +include { MTX_CONVERSION } from "../subworkflows/local/mtx_conversion" +include { GTF_GENE_FILTER } from '../modules/local/gtf_gene_filter' +include { EMPTYDROPS_CELL_CALLING } from '../modules/local/emptydrops' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -149,7 +150,7 @@ workflow SCRNASEQ { ch_fastq ) ch_versions = ch_versions.mix(KALLISTO_BUSTOOLS.out.ch_versions) - ch_mtx_matrices = ch_mtx_matrices.mix(KALLISTO_BUSTOOLS.out.counts) + ch_mtx_matrices = ch_mtx_matrices.mix(KALLISTO_BUSTOOLS.out.raw_counts, KALLISTO_BUSTOOLS.out.filtered_counts) ch_txp2gene = KALLISTO_BUSTOOLS.out.txp2gene } @@ -232,6 +233,26 @@ workflow SCRNASEQ { ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGERARC_ALIGN.out.cellranger_arc_out) } + // Run emptydrops calling module + if ( !params.skip_emptydrops ) { + // emptydrops should only run on the raw matrices thus, filter-out the filtered results + // of the aligners that can produce it + if ( params.aligner == "cellranger" ) { + ch_mtx_matrices_for_emptydrops = + ch_mtx_matrices.map { meta, mtx_files -> + [ meta, mtx_files.findAll { it.toString().contains("raw_feature_bc_matrix") } ] + } + .filter { meta, mtx_files -> mtx_files } + } else if (params.aligner == 'kallisto') { + ch_mtx_matrices_for_emptydrops = + ch_mtx_matrices.filter { meta, mtx_files -> mtx_files.toUriString().contains("counts_unfiltered") } + } else { + ch_mtx_matrices_for_emptydrops = ch_mtx_matrices + } + EMPTYDROPS_CELL_CALLING( ch_mtx_matrices_for_emptydrops ) + ch_mtx_matrices = ch_mtx_matrices.mix( EMPTYDROPS_CELL_CALLING.out.filtered_matrices ) + } + // Run mtx to h5ad conversion subworkflow MTX_CONVERSION ( ch_mtx_matrices, From afe4904eb6faf381ceebe5fd3c21476210fc6d1f Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Thu, 1 Feb 2024 12:19:45 +0100 Subject: [PATCH 02/30] update input_type labels --- modules/local/mtx_to_h5ad.nf | 15 ++++++++------- modules/local/mtx_to_seurat.nf | 20 ++++++++++++-------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf index de5e01e8..08bb18a2 100644 --- a/modules/local/mtx_to_h5ad.nf +++ b/modules/local/mtx_to_h5ad.nf @@ -24,11 +24,12 @@ process MTX_TO_H5AD { script: // check input type of inputs input_type = (inputs.toUriString().contains('unfiltered') || inputs.toUriString().contains('raw')) ? 'raw' : 'filtered' - if (inputs.toUriString().contains('emptydrops')) { input_type = 'emptydrops' } + if ( params.aligner == 'alevin' ) { input_type = 'raw' } // alevin has its own filtering methods and mostly output a single mtx, raw here means, the base tool output + if (inputs.toUriString().contains('emptydrops')) { input_type = 'custom_emptydrops_filter' } // def file paths for aligners. Cellranger is normally converted with the .h5 files // However, the emptydrops call, always generate .mtx files, thus, cellranger 'emptydrops' required a parsing - if (params.aligner in [ 'cellranger', 'cellrangerarc' ] && input_type == 'emptydrops') { + if (params.aligner in [ 'cellranger', 'cellrangerarc' ] && input_type == 'custom_emptydrops_filter') { aligner = 'cellranger' txp2gene = '' @@ -40,7 +41,7 @@ process MTX_TO_H5AD { } else if (params.aligner == 'kallisto') { kb_pattern = (input_type == 'raw') ? 'un' : '' - mtx_dir = (input_type == 'emptydrops') ? 'emptydrops_filtered' : "counts_${kb_pattern}filtered" + mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : "counts_${kb_pattern}filtered" mtx_matrix = "${mtx_dir}/*.mtx" barcodes_tsv = "${mtx_dir}/*.barcodes.txt" features_tsv = "${mtx_dir}/*.genes.txt" @@ -48,15 +49,15 @@ process MTX_TO_H5AD { } else if (params.aligner == 'alevin') { // alevin does not have filtered/unfiltered results - mtx_dir = (input_type == 'emptydrops') ? 'emptydrops_filtered' : '*_alevin_results/af_quant/alevin' + mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : '*_alevin_results/af_quant/alevin' mtx_matrix = "${mtx_dir}/quants_mat.mtx" barcodes_tsv = "${mtx_dir}/quants_mat_rows.txt" features_tsv = "${mtx_dir}/quants_mat_cols.txt" } else if (params.aligner == 'star') { - mtx_dir = (input_type == 'emptydrops') ? 'emptydrops_filtered' : "*.Solo.out/Gene*/${input_type}" - suffix = (input_type == 'emptydrops') ? '' : '.gz' + mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : "*.Solo.out/Gene*/${input_type}" + suffix = (input_type == 'custom_emptydrops_filter') ? '' : '.gz' mtx_matrix = "${mtx_dir}/matrix.mtx${suffix}" barcodes_tsv = "${mtx_dir}/barcodes.tsv${suffix}" features_tsv = "${mtx_dir}/features.tsv${suffix}" @@ -66,7 +67,7 @@ process MTX_TO_H5AD { // // run script // - if (params.aligner in [ 'cellranger', 'cellrangerarc' ] && input_type != 'emptydrops') + if (params.aligner in [ 'cellranger', 'cellrangerarc' ] && input_type != 'custom_emptydrops_filter') """ # convert file types mtx_to_h5ad.py \\ diff --git a/modules/local/mtx_to_seurat.nf b/modules/local/mtx_to_seurat.nf index 212b2cbf..d95770d2 100644 --- a/modules/local/mtx_to_seurat.nf +++ b/modules/local/mtx_to_seurat.nf @@ -21,15 +21,19 @@ process MTX_TO_SEURAT { def aligner = params.aligner // check input type of inputs - def input_type = (inputs.toUriString().contains('unfiltered') || inputs.toUriString().contains('raw')) ? 'raw' : 'filtered' - if (inputs.toUriString().contains('emptydrops')) { input_type = 'emptydrops' } - def is_emptydrops = (input_type == 'emptydrops') ? '--is_emptydrops' : '0' + def is_emptydrops = '0' + input_type = (inputs.toUriString().contains('unfiltered') || inputs.toUriString().contains('raw')) ? 'raw' : 'filtered' + if ( params.aligner == 'alevin' ) { input_type = 'raw' } // alevin has its own filtering methods and mostly output a single mtx, raw here means, the base tool output + if (inputs.toUriString().contains('emptydrops')) { + input_type = 'custom_emptydrops_filter' + is_emptydrops = '--is_emptydrops' + } // def file paths for aligners. Cellranger is normally converted with the .h5 files // However, the emptydrops call, always generate .mtx files, thus, cellranger 'emptydrops' required a parsing if (params.aligner in [ 'cellranger', 'cellrangerarc' ]) { - mtx_dir = (input_type == 'emptydrops') ? 'emptydrops_filtered' : "${input_type}_feature_bc_matrix" + mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : "${input_type}_feature_bc_matrix" matrix = "${mtx_dir}/matrix.mtx*" barcodes = "${mtx_dir}/barcodes.tsv*" features = "${mtx_dir}/features.tsv*" @@ -37,22 +41,22 @@ process MTX_TO_SEURAT { } else if (params.aligner == 'kallisto') { kb_pattern = (input_type == 'raw') ? 'un' : '' - mtx_dir = (input_type == 'emptydrops') ? 'emptydrops_filtered' : "counts_${kb_pattern}filtered" + mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : "counts_${kb_pattern}filtered" matrix = "${mtx_dir}/*.mtx" barcodes = "${mtx_dir}/*.barcodes.txt" features = "${mtx_dir}/*.genes.txt" } else if (params.aligner == "alevin") { - mtx_dir = (input_type == 'emptydrops') ? 'emptydrops_filtered' : '*_alevin_results/af_quant/alevin' + mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : '*_alevin_results/af_quant/alevin' matrix = "${mtx_dir}/quants_mat.mtx" barcodes = "${mtx_dir}/quants_mat_rows.txt" features = "${mtx_dir}/quants_mat_cols.txt" } else if (params.aligner == 'star') { - mtx_dir = (input_type == 'emptydrops') ? 'emptydrops_filtered' : "*.Solo.out/Gene*/${input_type}" - suffix = (input_type == 'emptydrops') ? '' : '.gz' + mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : "*.Solo.out/Gene*/${input_type}" + suffix = (input_type == 'custom_emptydrops_filter') ? '' : '.gz' matrix = "${mtx_dir}/matrix.mtx${suffix}" barcodes = "${mtx_dir}/barcodes.tsv${suffix}" features = "${mtx_dir}/features.tsv${suffix}" From d03d81f5643219f70af23678e59fe2a6751ce8fc Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Fri, 2 Feb 2024 11:23:08 +0100 Subject: [PATCH 03/30] fixed workflow conversions to work with star align results --- modules/local/emptydrops.nf | 6 +++--- modules/local/mtx_to_h5ad.nf | 2 +- modules/local/mtx_to_seurat.nf | 2 +- modules/local/star_align.nf | 14 ++++++++------ subworkflows/local/starsolo.nf | 2 ++ workflows/scrnaseq.nf | 22 +++++++++++----------- 6 files changed, 26 insertions(+), 22 deletions(-) diff --git a/modules/local/emptydrops.nf b/modules/local/emptydrops.nf index 0bc03026..215b6da3 100644 --- a/modules/local/emptydrops.nf +++ b/modules/local/emptydrops.nf @@ -39,9 +39,9 @@ process EMPTYDROPS_CELL_CALLING { } else if (params.aligner == 'star') { - matrix = "*.Solo.out/Gene*/raw/matrix.mtx.gz" - barcodes = "*.Solo.out/Gene*/raw/barcodes.tsv.gz" - features = "*.Solo.out/Gene*/raw/features.tsv.gz" + matrix = "raw/matrix.mtx.gz" + barcodes = "raw/barcodes.tsv.gz" + features = "raw/features.tsv.gz" } diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf index 08bb18a2..7afc12fa 100644 --- a/modules/local/mtx_to_h5ad.nf +++ b/modules/local/mtx_to_h5ad.nf @@ -56,7 +56,7 @@ process MTX_TO_H5AD { } else if (params.aligner == 'star') { - mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : "*.Solo.out/Gene*/${input_type}" + mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : "${input_type}" suffix = (input_type == 'custom_emptydrops_filter') ? '' : '.gz' mtx_matrix = "${mtx_dir}/matrix.mtx${suffix}" barcodes_tsv = "${mtx_dir}/barcodes.tsv${suffix}" diff --git a/modules/local/mtx_to_seurat.nf b/modules/local/mtx_to_seurat.nf index d95770d2..8e991c24 100644 --- a/modules/local/mtx_to_seurat.nf +++ b/modules/local/mtx_to_seurat.nf @@ -55,7 +55,7 @@ process MTX_TO_SEURAT { } else if (params.aligner == 'star') { - mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : "*.Solo.out/Gene*/${input_type}" + mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : "${input_type}" suffix = (input_type == 'custom_emptydrops_filter') ? '' : '.gz' matrix = "${mtx_dir}/matrix.mtx${suffix}" barcodes = "${mtx_dir}/barcodes.tsv${suffix}" diff --git a/modules/local/star_align.nf b/modules/local/star_align.nf index a7dfab3d..4b3df1e1 100644 --- a/modules/local/star_align.nf +++ b/modules/local/star_align.nf @@ -21,12 +21,14 @@ process STAR_ALIGN { val other_10x_parameters output: - tuple val(meta), path('*d.out.bam') , emit: bam - tuple val(meta), path('*.Solo.out') , emit: counts - tuple val(meta), path('*Log.final.out') , emit: log_final - tuple val(meta), path('*Log.out') , emit: log_out - tuple val(meta), path('*Log.progress.out'), emit: log_progress - path "versions.yml" , emit: versions + tuple val(meta), path('*d.out.bam') , emit: bam + tuple val(meta), path('*.Solo.out') , emit: counts + tuple val(meta), path ("*.Solo.out/Gene*/raw") , emit: raw_counts + tuple val(meta), path ("*.Solo.out/Gene*/filtered"), emit: filtered_counts + tuple val(meta), path('*Log.final.out') , emit: log_final + tuple val(meta), path('*Log.out') , emit: log_out + tuple val(meta), path('*Log.progress.out') , emit: log_progress + path "versions.yml" , emit: versions tuple val(meta), path('*sortedByCoord.out.bam') , optional:true, emit: bam_sorted tuple val(meta), path('*toTranscriptome.out.bam'), optional:true, emit: bam_transcript diff --git a/subworkflows/local/starsolo.nf b/subworkflows/local/starsolo.nf index 47b2e757..9056d899 100644 --- a/subworkflows/local/starsolo.nf +++ b/subworkflows/local/starsolo.nf @@ -60,5 +60,7 @@ workflow STARSOLO { star_index = star_index.map{ meta, index -> index} star_result = STAR_ALIGN.out.tab star_counts = STAR_ALIGN.out.counts + raw_counts = STAR_ALIGN.out.raw_counts + filtered_counts = STAR_ALIGN.out.filtered_counts for_multiqc = STAR_ALIGN.out.log_final } diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 596ce419..71abb7a2 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -184,7 +184,7 @@ workflow SCRNASEQ { protocol_config.get('extra_args', ""), ) ch_versions = ch_versions.mix(STARSOLO.out.ch_versions) - ch_mtx_matrices = ch_mtx_matrices.mix(STARSOLO.out.star_counts) + ch_mtx_matrices = ch_mtx_matrices.mix(STARSOLO.out.raw_counts, STARSOLO.out.filtered_counts) ch_star_index = STARSOLO.out.star_index ch_multiqc_star = STARSOLO.out.for_multiqc } @@ -235,17 +235,17 @@ workflow SCRNASEQ { // Run emptydrops calling module if ( !params.skip_emptydrops ) { - // emptydrops should only run on the raw matrices thus, filter-out the filtered results - // of the aligners that can produce it - if ( params.aligner == "cellranger" ) { - ch_mtx_matrices_for_emptydrops = - ch_mtx_matrices.map { meta, mtx_files -> - [ meta, mtx_files.findAll { it.toString().contains("raw_feature_bc_matrix") } ] - } - .filter { meta, mtx_files -> mtx_files } - } else if (params.aligner == 'kallisto') { + + // + // emptydrops should only run on the raw matrices thus, filter-out the filtered result of the aligners that can produce it + // + if ( params.aligner in [ 'cellranger', 'cellrangerarc', 'kallisto', 'star' ] ) { ch_mtx_matrices_for_emptydrops = - ch_mtx_matrices.filter { meta, mtx_files -> mtx_files.toUriString().contains("counts_unfiltered") } + ch_mtx_matrices.filter { meta, mtx_files -> + mtx_files.toString().contains("raw_feature_bc_matrix") || // cellranger + mtx_files.toUriString().contains("counts_unfiltered") || // kallisto + mtx_files.toUriString().contains("raw") // star + } } else { ch_mtx_matrices_for_emptydrops = ch_mtx_matrices } From 796afba1e5df0ca273fcadb1bac2a04cf5a48a51 Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Mon, 5 Feb 2024 10:02:27 +0100 Subject: [PATCH 04/30] include modifications for working with cellranger --- bin/mtx_to_h5ad.py | 10 +++++++--- modules/local/emptydrops.nf | 6 +++--- modules/local/mtx_to_h5ad.nf | 11 +++++++---- modules/local/mtx_to_seurat.nf | 16 ++++++++++------ modules/nf-core/cellranger/count/main.nf | 6 ++++-- subworkflows/local/align_cellranger.nf | 5 +++-- subworkflows/local/mtx_conversion.nf | 8 -------- workflows/scrnaseq.nf | 8 ++++---- 8 files changed, 38 insertions(+), 32 deletions(-) diff --git a/bin/mtx_to_h5ad.py b/bin/mtx_to_h5ad.py index cdcc0c6c..2f5dc9ba 100755 --- a/bin/mtx_to_h5ad.py +++ b/bin/mtx_to_h5ad.py @@ -32,9 +32,13 @@ def _mtx_to_adata( aligner: str, ): adata = sc.read_mtx(mtx_file) - if ( - aligner == "star" - ): # for some reason star matrix comes transposed and doesn't fit when values are appended directly + # for some reason star matrix comes transposed and doesn't fit when values are appended directly + # also true for cellranger files ( this is only used when running with the custom emptydrops_filtered files ) + # otherwise, it uses the cellranger .h5 files + if aligner in [ + "cellranger", + "star", + ]: adata = adata.transpose() adata.obs_names = pd.read_csv(barcode_file, header=None, sep="\t")[0].values diff --git a/modules/local/emptydrops.nf b/modules/local/emptydrops.nf index 215b6da3..ca42e171 100644 --- a/modules/local/emptydrops.nf +++ b/modules/local/emptydrops.nf @@ -21,9 +21,9 @@ process EMPTYDROPS_CELL_CALLING { script: if (params.aligner == "cellranger") { - matrix = "raw_feature_bc_matrix/matrix.mtx.gz" - barcodes = "raw_feature_bc_matrix/barcodes.tsv.gz" - features = "raw_feature_bc_matrix/features.tsv.gz" + matrix = "matrix.mtx.gz" + barcodes = "barcodes.tsv.gz" + features = "features.tsv.gz" } else if (params.aligner == "kallisto") { diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf index 7afc12fa..10b617df 100644 --- a/modules/local/mtx_to_h5ad.nf +++ b/modules/local/mtx_to_h5ad.nf @@ -22,10 +22,13 @@ process MTX_TO_H5AD { task.ext.when == null || task.ext.when script: + // Get a file to check input type. Some aligners bring arrays instead of a single file. + def input_to_check = (inputs instanceof String) ? inputs : inputs[0] + // check input type of inputs - input_type = (inputs.toUriString().contains('unfiltered') || inputs.toUriString().contains('raw')) ? 'raw' : 'filtered' + input_type = (input_to_check.toUriString().contains('unfiltered') || input_to_check.toUriString().contains('raw')) ? 'raw' : 'filtered' if ( params.aligner == 'alevin' ) { input_type = 'raw' } // alevin has its own filtering methods and mostly output a single mtx, raw here means, the base tool output - if (inputs.toUriString().contains('emptydrops')) { input_type = 'custom_emptydrops_filter' } + if (input_to_check.toUriString().contains('emptydrops')) { input_type = 'custom_emptydrops_filter' } // def file paths for aligners. Cellranger is normally converted with the .h5 files // However, the emptydrops call, always generate .mtx files, thus, cellranger 'emptydrops' required a parsing @@ -72,9 +75,9 @@ process MTX_TO_H5AD { # convert file types mtx_to_h5ad.py \\ --aligner cellranger \\ - --input filtered_feature_bc_matrix.h5 \\ + --input ${input_type}_feature_bc_matrix.h5 \\ --sample ${meta.id} \\ - --out ${meta.id}/${meta.id}_matrix.h5ad + --out ${meta.id}/${meta.id}_${input_type}_matrix.h5ad """ else if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') diff --git a/modules/local/mtx_to_seurat.nf b/modules/local/mtx_to_seurat.nf index 8e991c24..626e0b97 100644 --- a/modules/local/mtx_to_seurat.nf +++ b/modules/local/mtx_to_seurat.nf @@ -20,11 +20,15 @@ process MTX_TO_SEURAT { script: def aligner = params.aligner + + // Get a file to check input type. Some aligners bring arrays instead of a single file. + def input_to_check = (inputs instanceof String) ? inputs : inputs[0] + // check input type of inputs def is_emptydrops = '0' - input_type = (inputs.toUriString().contains('unfiltered') || inputs.toUriString().contains('raw')) ? 'raw' : 'filtered' + input_type = (input_to_check.toUriString().contains('unfiltered') || input_to_check.toUriString().contains('raw')) ? 'raw' : 'filtered' if ( params.aligner == 'alevin' ) { input_type = 'raw' } // alevin has its own filtering methods and mostly output a single mtx, raw here means, the base tool output - if (inputs.toUriString().contains('emptydrops')) { + if (input_to_check.toUriString().contains('emptydrops')) { input_type = 'custom_emptydrops_filter' is_emptydrops = '--is_emptydrops' } @@ -33,10 +37,10 @@ process MTX_TO_SEURAT { // However, the emptydrops call, always generate .mtx files, thus, cellranger 'emptydrops' required a parsing if (params.aligner in [ 'cellranger', 'cellrangerarc' ]) { - mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : "${input_type}_feature_bc_matrix" - matrix = "${mtx_dir}/matrix.mtx*" - barcodes = "${mtx_dir}/barcodes.tsv*" - features = "${mtx_dir}/features.tsv*" + mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered/' : '' + matrix = "${mtx_dir}matrix.mtx*" + barcodes = "${mtx_dir}barcodes.tsv*" + features = "${mtx_dir}features.tsv*" } else if (params.aligner == 'kallisto') { diff --git a/modules/nf-core/cellranger/count/main.nf b/modules/nf-core/cellranger/count/main.nf index d7a191fc..3ccf34f7 100644 --- a/modules/nf-core/cellranger/count/main.nf +++ b/modules/nf-core/cellranger/count/main.nf @@ -9,8 +9,10 @@ process CELLRANGER_COUNT { path reference output: - tuple val(meta), path("**/outs/**"), emit: outs - path "versions.yml" , emit: versions + tuple val(meta), path("**/outs/**") , emit: outs + tuple val(meta), path("**/outs/filtered_feature_bc_matrix**"), emit: filtered // TODO: Add to nf-coew/modules before merging PR + tuple val(meta), path("**/outs/raw_feature_bc_matrix**") , emit: raw // TODO: Add to nf-coew/modules before merging PR + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/subworkflows/local/align_cellranger.nf b/subworkflows/local/align_cellranger.nf index bfdd533e..39e54675 100644 --- a/subworkflows/local/align_cellranger.nf +++ b/subworkflows/local/align_cellranger.nf @@ -42,6 +42,7 @@ workflow CELLRANGER_ALIGN { emit: ch_versions - cellranger_out = CELLRANGER_COUNT.out.outs - star_index = cellranger_index + cellranger_out = CELLRANGER_COUNT.out.outs + cellranger_matrices = CELLRANGER_COUNT.out.filtered.mix( CELLRANGER_COUNT.out.raw ) + star_index = cellranger_index } diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf index 8d1caee4..b1aff9e9 100644 --- a/subworkflows/local/mtx_conversion.nf +++ b/subworkflows/local/mtx_conversion.nf @@ -14,14 +14,6 @@ workflow MTX_CONVERSION { main: ch_versions = Channel.empty() - // Cellranger module output contains too many files which cause path collisions, we filter to the ones we need. - if (params.aligner in [ 'cellranger', 'cellrangerarc' ]) { - mtx_matrices = mtx_matrices.map { meta, mtx_files -> - [ meta, mtx_files.findAll { it.toString().contains("filtered_feature_bc_matrix") } ] - } - .filter { meta, mtx_files -> mtx_files } // Remove any that are missing the relevant files - } - // // Convert matrix to h5ad // diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 71abb7a2..b030a323 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -199,7 +199,7 @@ workflow SCRNASEQ { protocol_config['protocol'] ) ch_versions = ch_versions.mix(CELLRANGER_ALIGN.out.ch_versions) - ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_ALIGN.out.cellranger_out) + ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_ALIGN.out.cellranger_matrices) ch_star_index = CELLRANGER_ALIGN.out.star_index ch_multiqc_cellranger = CELLRANGER_ALIGN.out.cellranger_out.map{ meta, outs -> outs.findAll{ it -> it.name == "web_summary.html"} @@ -242,9 +242,9 @@ workflow SCRNASEQ { if ( params.aligner in [ 'cellranger', 'cellrangerarc', 'kallisto', 'star' ] ) { ch_mtx_matrices_for_emptydrops = ch_mtx_matrices.filter { meta, mtx_files -> - mtx_files.toString().contains("raw_feature_bc_matrix") || // cellranger - mtx_files.toUriString().contains("counts_unfiltered") || // kallisto - mtx_files.toUriString().contains("raw") // star + mtx_files.toString().contains("raw_feature_bc_matrix") || // cellranger + mtx_files.toString().contains("counts_unfiltered") || // kallisto + mtx_files.toString().contains("raw") // star } } else { ch_mtx_matrices_for_emptydrops = ch_mtx_matrices From c9c38ea26f9d2e6bb932b013818ad3adeeb3456d Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Wed, 7 Feb 2024 10:49:55 +0100 Subject: [PATCH 05/30] fix the path of matrices when running non-standard kallisto workflow (tested with lamanno) --- modules/local/emptydrops.nf | 6 +++--- modules/local/mtx_to_h5ad.nf | 8 ++++---- modules/local/mtx_to_seurat.nf | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/modules/local/emptydrops.nf b/modules/local/emptydrops.nf index ca42e171..fb0d9791 100644 --- a/modules/local/emptydrops.nf +++ b/modules/local/emptydrops.nf @@ -54,9 +54,9 @@ process EMPTYDROPS_CELL_CALLING { # convert file types for splice_type in spliced unspliced ; do emptydrops_cell_calling.R \\ - *count/counts_unfiltered/\${splice_type}.mtx \\ - *count/counts_unfiltered/\${splice_type}.barcodes.txt \\ - *count/counts_unfiltered/\${splice_type}.genes.txt \\ + counts_unfiltered/\${splice_type}.mtx \\ + counts_unfiltered/\${splice_type}.barcodes.txt \\ + counts_unfiltered/\${splice_type}.genes.txt \\ emptydrops_filtered \\ ${params.aligner} \\ 0 diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf index 10b617df..4cb84c88 100644 --- a/modules/local/mtx_to_h5ad.nf +++ b/modules/local/mtx_to_h5ad.nf @@ -80,16 +80,16 @@ process MTX_TO_H5AD { --out ${meta.id}/${meta.id}_${input_type}_matrix.h5ad """ - else if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') + else if (params.aligner == 'kallisto' && params.kb_workflow != 'standard' && input_type != 'custom_emptydrops_filter') """ # convert file types for input_type in spliced unspliced ; do mtx_to_h5ad.py \\ --aligner ${params.aligner} \\ --sample ${meta.id} \\ - --input *count/counts_unfiltered/\${input_type}.mtx \\ - --barcode *count/counts_unfiltered/\${input_type}.barcodes.txt \\ - --feature *count/counts_unfiltered/\${input_type}.genes.txt \\ + --input ${mtx_dir}/\${input_type}.mtx \\ + --barcode ${mtx_dir}/\${input_type}.barcodes.txt \\ + --feature ${mtx_dir}/\${input_type}.genes.txt \\ --txp2gene ${txp2gene} \\ --star_index ${star_index} \\ --out ${meta.id}/${meta.id}_\${input_type}_matrix.h5ad ; diff --git a/modules/local/mtx_to_seurat.nf b/modules/local/mtx_to_seurat.nf index 626e0b97..b5042426 100644 --- a/modules/local/mtx_to_seurat.nf +++ b/modules/local/mtx_to_seurat.nf @@ -74,14 +74,14 @@ process MTX_TO_SEURAT { mkdir ${meta.id} """ - if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') + if (params.aligner == 'kallisto' && params.kb_workflow != 'standard' && input_type != 'custom_emptydrops_filter') """ # convert file types for input_type in spliced unspliced ; do mtx_to_seurat.R \\ - *count/counts_unfiltered/\${input_type}.mtx \\ - *count/counts_unfiltered/\${input_type}.barcodes.txt \\ - *count/counts_unfiltered/\${input_type}.genes.txt \\ + ${mtx_dir}/\${input_type}.mtx \\ + ${mtx_dir}/\${input_type}.barcodes.txt \\ + ${mtx_dir}/\${input_type}.genes.txt \\ ${meta.id}/${meta.id}_\${input_type}_matrix.rds \\ ${aligner} \\ ${is_emptydrops} From a9ee3f1f984303b5c22c9a269481d7ce35bd123e Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Wed, 7 Feb 2024 11:20:52 +0100 Subject: [PATCH 06/30] fix spliced/unspliced empty_drops conversion --- modules/local/mtx_to_h5ad.nf | 2 +- modules/local/mtx_to_seurat.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf index 4cb84c88..1b32af58 100644 --- a/modules/local/mtx_to_h5ad.nf +++ b/modules/local/mtx_to_h5ad.nf @@ -80,7 +80,7 @@ process MTX_TO_H5AD { --out ${meta.id}/${meta.id}_${input_type}_matrix.h5ad """ - else if (params.aligner == 'kallisto' && params.kb_workflow != 'standard' && input_type != 'custom_emptydrops_filter') + else if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') """ # convert file types for input_type in spliced unspliced ; do diff --git a/modules/local/mtx_to_seurat.nf b/modules/local/mtx_to_seurat.nf index b5042426..caadcc36 100644 --- a/modules/local/mtx_to_seurat.nf +++ b/modules/local/mtx_to_seurat.nf @@ -74,7 +74,7 @@ process MTX_TO_SEURAT { mkdir ${meta.id} """ - if (params.aligner == 'kallisto' && params.kb_workflow != 'standard' && input_type != 'custom_emptydrops_filter') + if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') """ # convert file types for input_type in spliced unspliced ; do From 0cb2a2f3de5a46974945c52c828965d115549a59 Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Wed, 7 Feb 2024 11:21:13 +0100 Subject: [PATCH 07/30] solve number of list levels when having spliced / unspliced --- subworkflows/local/mtx_conversion.nf | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf index b1aff9e9..55541f18 100644 --- a/subworkflows/local/mtx_conversion.nf +++ b/subworkflows/local/mtx_conversion.nf @@ -26,8 +26,15 @@ workflow MTX_CONVERSION { // // Concat sample-specific h5ad in one // + ch_concat_h5ad_input = MTX_TO_H5AD.out.h5ad.groupTuple() // gather all sample-specific files / per type + if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') { + // when having spliced / unspliced matrices, the collected tuple has two levels ( [[mtx_1, mtx_2]] ) + // which nextflow break because it is not a valid 'path' thus, we have to remove one level + // making it as [ mtx_1, mtx_2 ] + ch_concat_h5ad_input = ch_concat_h5ad_input.map{ type, matrices -> [ type, matrices.flatten().toList() ] } + } CONCAT_H5AD ( - MTX_TO_H5AD.out.h5ad.groupTuple(), // gather all sample-specific files / per type + ch_concat_h5ad_input, samplesheet ) From 6458e2d8aedb6dddfa49accc813c3c76617a6649 Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Wed, 14 Feb 2024 14:59:57 +0100 Subject: [PATCH 08/30] update shared nf-test config --- tests/nextflow.config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/nextflow.config b/tests/nextflow.config index 66e56d29..7efd4642 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -22,6 +22,9 @@ params { gtf = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/reference/gencode.vM19.annotation.chr19.gtf' protocol = '10XV2' + // small dataset does not have sufficient data for emptydrops module + skip_emptydrops = true + validationSchemaIgnoreParams = 'genomes' } From c5cc1ca7571c888692b13cb181e508a789399f7d Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Wed, 14 Feb 2024 15:10:34 +0100 Subject: [PATCH 09/30] update alevin file names --- tests/main_pipeline_alevin.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/main_pipeline_alevin.test b/tests/main_pipeline_alevin.test index 303a625b..6bdbaba5 100644 --- a/tests/main_pipeline_alevin.test +++ b/tests/main_pipeline_alevin.test @@ -43,8 +43,8 @@ nextflow_pipeline { // // Check if files were produced // - {assert new File( "${outputDir}/results_alevin/alevin/mtx_conversions/Sample_X/Sample_X_matrix.h5ad" ).exists()}, - {assert new File( "${outputDir}/results_alevin/alevin/mtx_conversions/Sample_Y/Sample_Y_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_alevin/alevin/mtx_conversions/Sample_X/Sample_X_raw_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_alevin/alevin/mtx_conversions/Sample_Y/Sample_Y_raw_matrix.h5ad" ).exists()}, // // Check if files are the same From c5393992afe04cde364c549d300d409aebc8c814 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 14 Feb 2024 14:57:34 +0000 Subject: [PATCH 10/30] update alevin tests to also include the .rds files --- tests/main_pipeline_alevin.test | 4 +++- tests/main_pipeline_alevin.test.snap | 10 ++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/main_pipeline_alevin.test b/tests/main_pipeline_alevin.test index 6bdbaba5..32be1eb3 100644 --- a/tests/main_pipeline_alevin.test +++ b/tests/main_pipeline_alevin.test @@ -56,7 +56,9 @@ nextflow_pipeline { path( "${outputDir}/results_alevin/alevin/Sample_X_alevin_results/af_quant/alevin/quants_mat_rows.txt" ), path( "${outputDir}/results_alevin/alevin/Sample_Y_alevin_results/af_quant/alevin/quants_mat_cols.txt" ), path( "${outputDir}/results_alevin/alevin/Sample_Y_alevin_results/af_quant/alevin/quants_mat.mtx" ), - path( "${outputDir}/results_alevin/alevin/Sample_Y_alevin_results/af_quant/alevin/quants_mat_rows.txt" ) + path( "${outputDir}/results_alevin/alevin/Sample_Y_alevin_results/af_quant/alevin/quants_mat_rows.txt" ), + path( "${outputDir}/results_alevin/alevin/mtx_conversions/Sample_X/Sample_X_raw_matrix.rds" ), + path( "${outputDir}/results_alevin/alevin/mtx_conversions/Sample_Y/Sample_Y_raw_matrix.rds" ) ).match()} ) // end of assertAll() diff --git a/tests/main_pipeline_alevin.test.snap b/tests/main_pipeline_alevin.test.snap index 9d194012..b7648793 100644 --- a/tests/main_pipeline_alevin.test.snap +++ b/tests/main_pipeline_alevin.test.snap @@ -25,8 +25,14 @@ "quants_mat_rows.txt:md5,6227df5a13127b71c71fb18cd8574857", "quants_mat_cols.txt:md5,e9868982c17a330392e38c2a5933cf97", "quants_mat.mtx:md5,54cd12666016adce94c025b2e07f4b02", - "quants_mat_rows.txt:md5,6b458a7777260ba90eccbe7919df934b" + "quants_mat_rows.txt:md5,6b458a7777260ba90eccbe7919df934b", + "Sample_X_raw_matrix.rds:md5,ad35ee66bf2fc3d5d4656c19a7e64e2b", + "Sample_Y_raw_matrix.rds:md5,baf584142205b1d42bb6fdab1f22a06a" ], - "timestamp": "2024-01-19T10:28:35.652763852" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-14T14:49:46.831540515" } } \ No newline at end of file From 2d7c90e7dd8038bb158d7b75620eea3bfc5e0221 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 14 Feb 2024 15:45:04 +0000 Subject: [PATCH 11/30] update the number of tasks that are executed, and include raw/filtered in the testings (cellranger) --- tests/main_pipeline_cellranger.test | 16 ++++++++++------ tests/main_pipeline_cellranger.test.snap | 16 +++++++++++----- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/tests/main_pipeline_cellranger.test b/tests/main_pipeline_cellranger.test index 996c0f76..30723ba3 100644 --- a/tests/main_pipeline_cellranger.test +++ b/tests/main_pipeline_cellranger.test @@ -30,12 +30,12 @@ nextflow_pipeline { {assert workflow.success}, // How many tasks were executed? - {assert workflow.trace.tasks().size() == 15}, + {assert workflow.trace.tasks().size() == 20}, // How many results were produced? {assert path("${outputDir}/results_cellranger").list().size() == 4}, {assert path("${outputDir}/results_cellranger/cellranger").list().size() == 4}, - {assert path("${outputDir}/results_cellranger/cellranger/mtx_conversions").list().size() == 4}, + {assert path("${outputDir}/results_cellranger/cellranger/mtx_conversions").list().size() == 5}, {assert path("${outputDir}/results_cellranger/cellranger/count").list().size() == 3}, {assert path("${outputDir}/results_cellranger/fastqc").list().size() == 12}, {assert path("${outputDir}/results_cellranger/multiqc").list().size() == 3}, @@ -43,8 +43,10 @@ nextflow_pipeline { // // Check if files were produced // - {assert new File( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_X/Sample_X_matrix.h5ad" ).exists()}, - {assert new File( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_Y/Sample_Y_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_X/Sample_X_raw_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_Y/Sample_Y_raw_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_X/Sample_X_filtered_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_Y/Sample_Y_filtered_matrix.h5ad" ).exists()}, // // Check if files are the same @@ -63,8 +65,10 @@ nextflow_pipeline { path( "${outputDir}/results_cellranger/cellranger/count/Sample_Y/outs/raw_feature_bc_matrix/barcodes.tsv.gz" ), path( "${outputDir}/results_cellranger/cellranger/count/Sample_Y/outs/raw_feature_bc_matrix/features.tsv.gz" ), path( "${outputDir}/results_cellranger/cellranger/count/Sample_Y/outs/raw_feature_bc_matrix/matrix.mtx.gz" ), - path( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_X/Sample_X_matrix.rds" ), - path( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_Y/Sample_Y_matrix.rds" ) + path( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_X/Sample_X_raw_matrix.rds" ), + path( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_Y/Sample_Y_raw_matrix.rds" ), + path( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_X/Sample_X_filtered_matrix.rds" ), + path( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_Y/Sample_Y_filtered_matrix.rds" ) ).match()} ) // end of assertAll() diff --git a/tests/main_pipeline_cellranger.test.snap b/tests/main_pipeline_cellranger.test.snap index 5f88b954..4aab7554 100644 --- a/tests/main_pipeline_cellranger.test.snap +++ b/tests/main_pipeline_cellranger.test.snap @@ -14,8 +14,8 @@ "errorMessage": "", "trace": { "tasksFailed": 0, - "tasksCount": 15, - "tasksSucceeded": 15 + "tasksCount": 20, + "tasksSucceeded": 20 }, "name": "workflow", "success": true @@ -32,9 +32,15 @@ "barcodes.tsv.gz:md5,081f72b5252ccaf5ffd535ffbd235c4c", "features.tsv.gz:md5,99e453cb1443a3e43e99405184e51a5e", "matrix.mtx.gz:md5,a4db04e43e650accc96361a287126a6b", - "Sample_X_matrix.rds:md5,f9191ba575a3ab79ada4807715f18573", - "Sample_Y_matrix.rds:md5,7be3f7b29d668dcf7e951b9f4d371a5e" + "Sample_X_raw_matrix.rds:md5,306a5477ace4d43d851b8389fdfeaf1f", + "Sample_Y_raw_matrix.rds:md5,74b31532da4cae5a8197d690021d77fc", + "Sample_X_filtered_matrix.rds:md5,f9191ba575a3ab79ada4807715f18573", + "Sample_Y_filtered_matrix.rds:md5,7be3f7b29d668dcf7e951b9f4d371a5e" ], - "timestamp": "2024-01-22T15:19:20.134275449" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-14T15:10:31.638485641" } } \ No newline at end of file From c72c8f203de4d12cd8ec5bfefe20d117d8b66b39 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 14 Feb 2024 16:23:14 +0000 Subject: [PATCH 12/30] fix naming of generated files (kallisto) --- tests/main_pipeline_kallisto.test | 8 ++++---- tests/main_pipeline_kallisto.test.snap | 10 +++++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/main_pipeline_kallisto.test b/tests/main_pipeline_kallisto.test index 98ede8c4..5fa0ead3 100644 --- a/tests/main_pipeline_kallisto.test +++ b/tests/main_pipeline_kallisto.test @@ -44,8 +44,8 @@ nextflow_pipeline { // // Check if files were produced // - {assert new File( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_X/Sample_X_matrix.h5ad" ).exists()}, - {assert new File( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_Y/Sample_Y_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_X/Sample_X_raw_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_Y/Sample_Y_raw_matrix.h5ad" ).exists()}, // // Check if files are the same @@ -58,8 +58,8 @@ nextflow_pipeline { path( "${outputDir}/results_kallisto/kallisto/Sample_Y.count/counts_unfiltered/cells_x_genes.barcodes.txt" ), path( "${outputDir}/results_kallisto/kallisto/Sample_Y.count/counts_unfiltered/cells_x_genes.genes.txt" ), path( "${outputDir}/results_kallisto/kallisto/Sample_Y.count/counts_unfiltered/cells_x_genes.mtx" ), - path( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_X/Sample_X_matrix.rds" ), - path( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_Y/Sample_Y_matrix.rds" ) + path( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_X/Sample_X_raw_matrix.rds" ), + path( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_Y/Sample_Y_raw_matrix.rds" ) ).match()} ) // end of assertAll() diff --git a/tests/main_pipeline_kallisto.test.snap b/tests/main_pipeline_kallisto.test.snap index b0fb50bb..eda17cdc 100644 --- a/tests/main_pipeline_kallisto.test.snap +++ b/tests/main_pipeline_kallisto.test.snap @@ -26,9 +26,13 @@ "cells_x_genes.barcodes.txt:md5,488437e1f5477243697efb93366e5676", "cells_x_genes.genes.txt:md5,acd9d00120f52031974b2add3e7521b6", "cells_x_genes.mtx:md5,af90e05b404490f6cb133ab7f62949f8", - "Sample_X_matrix.rds:md5,f0e43f69403f4b2e7704065421592ad0", - "Sample_Y_matrix.rds:md5,61809156e64dbdaf254cbc1c3456588e" + "Sample_X_raw_matrix.rds:md5,f0e43f69403f4b2e7704065421592ad0", + "Sample_Y_raw_matrix.rds:md5,61809156e64dbdaf254cbc1c3456588e" ], - "timestamp": "2024-01-23T12:19:47.921508953" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-14T16:04:36.951736171" } } \ No newline at end of file From 5d1d78336316ffa8a30eddd7bbb256d02b99c93d Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 14 Feb 2024 16:38:36 +0000 Subject: [PATCH 13/30] update the amount of tasks and generated file names raw/filtered (star) --- tests/main_pipeline_star.test | 18 ++++++++++-------- tests/main_pipeline_star.test.snap | 16 +++++++++++----- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/tests/main_pipeline_star.test b/tests/main_pipeline_star.test index 6616ea87..f7afdfc9 100644 --- a/tests/main_pipeline_star.test +++ b/tests/main_pipeline_star.test @@ -30,20 +30,22 @@ nextflow_pipeline { {assert workflow.success}, // How many tasks were executed? - {assert workflow.trace.tasks().size() == 14}, + {assert workflow.trace.tasks().size() == 19}, // How many results were produced? {assert path("${outputDir}/results_star").list().size() == 4}, {assert path("${outputDir}/results_star/star").list().size() == 3}, - {assert path("${outputDir}/results_star/star/mtx_conversions").list().size() == 4}, + {assert path("${outputDir}/results_star/star/mtx_conversions").list().size() == 5}, {assert path("${outputDir}/results_star/fastqc").list().size() == 12}, {assert path("${outputDir}/results_star/multiqc").list().size() == 3}, // // Check if files were produced // - {assert new File( "${outputDir}/results_star/star/mtx_conversions/Sample_X/Sample_X_matrix.h5ad" ).exists()}, - {assert new File( "${outputDir}/results_star/star/mtx_conversions/Sample_Y/Sample_Y_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_star/star/mtx_conversions/Sample_X/Sample_X_raw_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_star/star/mtx_conversions/Sample_Y/Sample_Y_raw_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_star/star/mtx_conversions/Sample_X/Sample_X_filtered_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_star/star/mtx_conversions/Sample_Y/Sample_Y_filtered_matrix.h5ad" ).exists()}, // // Check if files are the same @@ -55,15 +57,15 @@ nextflow_pipeline { path( "${outputDir}/results_star/star/Sample_X/Sample_X.Solo.out/Gene/filtered/matrix.mtx.gz" ), path( "${outputDir}/results_star/star/Sample_X/Sample_X.Solo.out/Gene/filtered/features.tsv.gz" ), path( "${outputDir}/results_star/star/Sample_X/Sample_X.Solo.out/Gene/filtered/barcodes.tsv.gz" ), - // path( "${outputDir}/results_star/star/mtx_conversions/Sample_X/Sample_X_matrix.h5ad" ), // does not match - path( "${outputDir}/results_star/star/mtx_conversions/Sample_X/Sample_X_matrix.rds" ), path( "${outputDir}/results_star/star/Sample_Y/Sample_Y.SJ.out.tab" ), path( "${outputDir}/results_star/star/Sample_Y/Sample_Y.Solo.out/Barcodes.stats" ), path( "${outputDir}/results_star/star/Sample_Y/Sample_Y.Solo.out/Gene/filtered/matrix.mtx.gz" ), path( "${outputDir}/results_star/star/Sample_Y/Sample_Y.Solo.out/Gene/filtered/features.tsv.gz" ), path( "${outputDir}/results_star/star/Sample_Y/Sample_Y.Solo.out/Gene/filtered/barcodes.tsv.gz" ), - // path( "${outputDir}/results_star/star/mtx_conversions/Sample_Y/Sample_Y_matrix.h5ad" ), // does not match - path( "${outputDir}/results_star/star/mtx_conversions/Sample_Y/Sample_Y_matrix.rds" ), + path( "${outputDir}/results_star/star/mtx_conversions/Sample_X/Sample_X_raw_matrix.rds" ), + path( "${outputDir}/results_star/star/mtx_conversions/Sample_Y/Sample_Y_raw_matrix.rds" ), + path( "${outputDir}/results_star/star/mtx_conversions/Sample_X/Sample_X_filtered_matrix.rds" ), + path( "${outputDir}/results_star/star/mtx_conversions/Sample_Y/Sample_Y_filtered_matrix.rds" ), ).match()} ) // end of assertAll() diff --git a/tests/main_pipeline_star.test.snap b/tests/main_pipeline_star.test.snap index ef086f60..0d45f3b4 100644 --- a/tests/main_pipeline_star.test.snap +++ b/tests/main_pipeline_star.test.snap @@ -14,8 +14,8 @@ "errorMessage": "", "trace": { "tasksFailed": 0, - "tasksCount": 14, - "tasksSucceeded": 14 + "tasksCount": 19, + "tasksSucceeded": 19 }, "name": "workflow", "success": true @@ -25,14 +25,20 @@ "matrix.mtx.gz:md5,6a923393343aa1a69b0cf1bd998c9285", "features.tsv.gz:md5,99e453cb1443a3e43e99405184e51a5e", "barcodes.tsv.gz:md5,9a7dacaa1779ea43c1507a947fe6992a", - "Sample_X_matrix.rds:md5,aa2d36dd8507aba864347c88e4ce0d27", "Sample_Y.SJ.out.tab:md5,98bd31104a860cf80119dc30d938d163", "Barcodes.stats:md5,2dbf1ae426c1afd97903ee001f0db5ce", "matrix.mtx.gz:md5,0ae080bd0002e350531a5816e159345e", "features.tsv.gz:md5,99e453cb1443a3e43e99405184e51a5e", "barcodes.tsv.gz:md5,9b695b0b91bcb146ec9c4688ca10a690", - "Sample_Y_matrix.rds:md5,d459af8f99258bcc88b80b2f7c58e911" + "Sample_X_raw_matrix.rds:md5,31604db3e7846acc8d9a60b1a171ce78", + "Sample_Y_raw_matrix.rds:md5,1a52c823e91acce2b29621c8c99c8c72", + "Sample_X_filtered_matrix.rds:md5,aa2d36dd8507aba864347c88e4ce0d27", + "Sample_Y_filtered_matrix.rds:md5,d459af8f99258bcc88b80b2f7c58e911" ], - "timestamp": "2024-01-19T15:46:22.470527538" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-14T16:30:25.7971791" } } \ No newline at end of file From 4095c3d68cc393c5a043efe798b130d7331750ad Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Wed, 14 Feb 2024 16:39:04 +0000 Subject: [PATCH 14/30] update gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index a9ba2c52..9dfcadb3 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ log/ reports/ testme.sh .nf-test/ +.nf-test.log From 6981c44acae7f71950cf5f76b8de6c84603572e8 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Thu, 15 Feb 2024 07:27:56 +0000 Subject: [PATCH 15/30] add new params to schema --- nextflow_schema.json | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/nextflow_schema.json b/nextflow_schema.json index a53cc55e..f9d2fdd0 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -84,6 +84,10 @@ "skip_fastqc": { "type": "boolean", "description": "Skip FastQC" + }, + "skip_emptydrops": { + "type": "boolean", + "description": "Skip custom empty drops filter module" } } }, @@ -219,6 +223,11 @@ "description": "Type of workflow. Use `lamanno` for RNA velocity based on La Manno et al. 2018 logic. Use `nucleus` for RNA velocity on single-nucleus RNA-seq reads. Use `kite` for feature barcoding. Use `kite: 10xFB` for 10x Genomics Feature Barcoding technology. (default: standard)", "fa_icon": "fas fa-fish", "enum": ["standard", "lamanno", "nucleus", "kite", "kite: 10xFB"] + }, + "kb_filter": { + "type": "boolean", + "fa_icon": "fas fa-fish", + "description": "Activate Kallisto/BUStools filtering algorithm" } } }, From e350399798fe1b4656bea08ccf111a27b8e0757e Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Fri, 8 Mar 2024 13:37:43 +0100 Subject: [PATCH 16/30] Update bin/emptydrops_cell_calling.R gsub .gz only in end of string Co-authored-by: Gregor Sturm --- bin/emptydrops_cell_calling.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/emptydrops_cell_calling.R b/bin/emptydrops_cell_calling.R index 989c2b34..2b4ee5e7 100755 --- a/bin/emptydrops_cell_calling.R +++ b/bin/emptydrops_cell_calling.R @@ -17,7 +17,7 @@ mtx <- readMM(fn_mtx) get_name <- function(file) { name <- as.character(basename(file)) - name <- gsub('\\.gz', '', name) + name <- gsub('\\.gz$', '', name) return(name) } From 7daa3107220be3df249c8a99715d900725fdf0cf Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Mon, 11 Mar 2024 13:31:18 +0100 Subject: [PATCH 17/30] fixing stub snippet --- modules/local/emptydrops.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/local/emptydrops.nf b/modules/local/emptydrops.nf index fb0d9791..675819f7 100644 --- a/modules/local/emptydrops.nf +++ b/modules/local/emptydrops.nf @@ -77,6 +77,7 @@ process EMPTYDROPS_CELL_CALLING { stub: """ - touch emptydrops_filtered/* + mkdir emptydrops_filtered + touch emptydrops_filtered/empty_file """ } From c24a05d226643d0303dd957b6627f86a168333f8 Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Mon, 11 Mar 2024 13:39:23 +0100 Subject: [PATCH 18/30] fix transposition snippet in new module --- bin/emptydrops_cell_calling.R | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/bin/emptydrops_cell_calling.R b/bin/emptydrops_cell_calling.R index 2b4ee5e7..bebdcc33 100755 --- a/bin/emptydrops_cell_calling.R +++ b/bin/emptydrops_cell_calling.R @@ -21,14 +21,16 @@ get_name <- function(file) { return(name) } -# Check if barcodes are in columns, if not, transpose mtx -is_transposed<-FALSE -if (dim(barcodes)[1]!=dim(mtx)[2]){ - mtx<-t(mtx) - is_transposed<-TRUE - print('Matrix was tranposed.') +# transpose matrices when required +# based on code of 'mtx_to_seurat.R', only the data from kallisto and alevin would require transposition +print("Only kallisto and alevin have transposed matrices.") +if (aligner %in% c( "kallisto", "alevin" )) { + is_transposed <- TRUE +} else { + is_transposed <- FALSE } + # Call empty drops e.out <- emptyDrops(mtx) is.cell <- e.out$FDR <= 0.01 From 492598b11f30249a5b2f3ed29c2aa4840f17b7c2 Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Tue, 12 Mar 2024 09:02:51 +0100 Subject: [PATCH 19/30] fixed problem on loading fasta>f from params.genome --- main.nf | 19 ++++++++++++------- workflows/scrnaseq.nf | 37 ++++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/main.nf b/main.nf index 3471e5b0..7d8ba356 100644 --- a/main.nf +++ b/main.nf @@ -17,10 +17,9 @@ nextflow.enable.dsl = 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { SCRNASEQ } from './workflows/scrnaseq' +include { SCRNASEQ } from './workflows/scrnaseq' include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_scrnaseq_pipeline' include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_scrnaseq_pipeline' - include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_scrnaseq_pipeline' /* @@ -28,9 +27,9 @@ include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_scrn GENOME PARAMETER VALUES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ - -params.fasta = getGenomeAttribute('fasta') -params.gtf = getGenomeAttribute('gtf') +// we cannot modify params. here, we must load the files +ch_genome_fasta = params.genome ? file( getGenomeAttribute('fasta'), checkIfExists: true ) : [] +ch_gtf = params.genome ? file( getGenomeAttribute('gtf'), checkIfExists: true ) : [] /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -45,6 +44,8 @@ workflow NFCORE_SCRNASEQ { take: samplesheet // channel: samplesheet read in from --input + ch_genome_fasta + ch_gtf main: @@ -52,7 +53,9 @@ workflow NFCORE_SCRNASEQ { // WORKFLOW: Run pipeline // SCRNASEQ ( - samplesheet + samplesheet, + ch_genome_fasta, + ch_gtf ) emit: @@ -86,7 +89,9 @@ workflow { // WORKFLOW: Run main workflow // NFCORE_SCRNASEQ ( - PIPELINE_INITIALISATION.out.samplesheet + PIPELINE_INITIALISATION.out.samplesheet, + ch_genome_fasta, + ch_gtf ) // diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index b3a695f0..7171c970 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -1,18 +1,17 @@ -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { FASTQC_CHECK } from '../subworkflows/local/fastqc' -include { KALLISTO_BUSTOOLS } from '../subworkflows/local/kallisto_bustools' -include { SCRNASEQ_ALEVIN } from '../subworkflows/local/alevin' -include { STARSOLO } from '../subworkflows/local/starsolo' -include { CELLRANGER_ALIGN } from "../subworkflows/local/align_cellranger" -include { CELLRANGERARC_ALIGN } from "../subworkflows/local/align_cellrangerarc" -include { UNIVERSC_ALIGN } from "../subworkflows/local/align_universc" -include { MTX_CONVERSION } from "../subworkflows/local/mtx_conversion" -include { GTF_GENE_FILTER } from '../modules/local/gtf_gene_filter' -include { EMPTYDROPS_CELL_CALLING } from '../modules/local/emptydrops' - -include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_scrnaseq_pipeline' +include { MULTIQC } from '../modules/nf-core/multiqc/main' +include { FASTQC_CHECK } from '../subworkflows/local/fastqc' +include { KALLISTO_BUSTOOLS } from '../subworkflows/local/kallisto_bustools' +include { SCRNASEQ_ALEVIN } from '../subworkflows/local/alevin' +include { STARSOLO } from '../subworkflows/local/starsolo' +include { CELLRANGER_ALIGN } from "../subworkflows/local/align_cellranger" +include { CELLRANGERARC_ALIGN } from "../subworkflows/local/align_cellrangerarc" +include { UNIVERSC_ALIGN } from "../subworkflows/local/align_universc" +include { MTX_CONVERSION } from "../subworkflows/local/mtx_conversion" +include { GTF_GENE_FILTER } from '../modules/local/gtf_gene_filter' +include { EMPTYDROPS_CELL_CALLING } from '../modules/local/emptydrops' +include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_scrnaseq_pipeline' include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' @@ -20,6 +19,8 @@ workflow SCRNASEQ { take: ch_fastq + ch_genome_fasta + ch_gtf main: @@ -28,9 +29,11 @@ workflow SCRNASEQ { error "Only cellranger supports `protocol = 'auto'`. Please specify the protocol manually!" } + // overwrite fasta and gtf if user provide a custom one + ch_genome_fasta = Channel.value(params.fasta ? file(params.fasta) : ch_genome_fasta) + ch_gtf = Channel.value(params.gtf ? file(params.gtf) : ch_gtf) + // general input and params - ch_genome_fasta = Channel.value(params.fasta ? file(params.fasta) : []) - ch_gtf = params.gtf ? file(params.gtf) : [] ch_transcript_fasta = params.transcript_fasta ? file(params.transcript_fasta): [] ch_motifs = params.motifs ? file(params.motifs) : [] ch_cellrangerarc_config = params.cellrangerarc_config ? file(params.cellrangerarc_config) : [] From 62076557aacdd2d293a2adf66093af7caa74beb1 Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Tue, 12 Mar 2024 14:34:07 +0100 Subject: [PATCH 20/30] fixed transposition --- bin/emptydrops_cell_calling.R | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/emptydrops_cell_calling.R b/bin/emptydrops_cell_calling.R index bebdcc33..23a45267 100755 --- a/bin/emptydrops_cell_calling.R +++ b/bin/emptydrops_cell_calling.R @@ -26,6 +26,7 @@ get_name <- function(file) { print("Only kallisto and alevin have transposed matrices.") if (aligner %in% c( "kallisto", "alevin" )) { is_transposed <- TRUE + mtx<-t(mtx) } else { is_transposed <- FALSE } From 8c5702ba767d278aab1cbb4d7adb0852ea8000ff Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Tue, 12 Mar 2024 14:34:13 +0100 Subject: [PATCH 21/30] fixed file used --- modules/local/emptydrops.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/emptydrops.nf b/modules/local/emptydrops.nf index 675819f7..ee9424f3 100644 --- a/modules/local/emptydrops.nf +++ b/modules/local/emptydrops.nf @@ -29,7 +29,7 @@ process EMPTYDROPS_CELL_CALLING { matrix = "counts_unfiltered/*.mtx" barcodes = "counts_unfiltered/*.barcodes.txt" - features = "counts_unfiltered/*.genes.txt" + features = "counts_unfiltered/*.genes.names.txt" } else if (params.aligner == "alevin") { From 3adeb65e0cd3bdac5e3db9fd1d50bf54ee5abea2 Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Wed, 13 Mar 2024 10:48:14 +0100 Subject: [PATCH 22/30] updating documentation --- docs/output.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/docs/output.md b/docs/output.md index 7e9f0cd8..aac47219 100644 --- a/docs/output.md +++ b/docs/output.md @@ -19,6 +19,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Cellranger](#cellranger) - [Cellranger ARC](#cellranger-arc) - [UniverSC](#universc) + - [Custom emptydrops filter](#custom-emptydrops-filter) - [Other output data](#other-output-data) - [MultiQC](#multiqc) - [Pipeline information](#pipeline-information) @@ -128,6 +129,16 @@ Battenberg, K., Kelly, S.T., Ras, R.A., Hetherington, N.A., Hayashi, K., and Min - Contains the mapped BAM files, filtered and unfiltered HDF5 matrices and output metrics created by the open-source implementation of Cell Ranger run via UniverSC +## Custom emptydrops filter + +The pipeline also possess a module to perform empty-drops calling and filtering with a custom-made script that uses a library called `bioconductor-dropletutils` that is available in `bioconda`. The process is simple, it takes a raw/unfiltered matrix file, and performs the empty-drops calling and filtering on it, generating another matrix file. + +> Users can turn it of with `--skip_emptydrops`. + +**Output directory: `results/${params.aligner}/emptydrops_filtered`** + +- Contains the empty-drops filtered matrices results generated by the `bioconductor-dropletutils` custom script + ## Other output data **Output directory: `results/reference_genome`** @@ -143,6 +154,21 @@ Battenberg, K., Kelly, S.T., Ras, R.A., Hetherington, N.A., Hayashi, K., and Min - `*_matrix.h5ad` - `.mtx` files converted to [AnnData](https://anndata.readthedocs.io/en/latest/) in `.h5ad` format, using [scanpy package](https://scanpy.readthedocs.io/en/stable/). - One per sample and a single one with all samples concatenated together `combined_matrix.h5ad` +- `*_matrix.rds` + - `.mtx` files converted to R native data format, rds, using the [Seurat package](https://github.com/satijalab/seurat) + - One per sample + +Because the pipeline has both the data directly from the aligners, and from the custom empty-drops filtering module the conversion modules were modified to understand the difference between raw/filtered from the aligners itself and filtered from the custom empty-drops module. So, to try to avoid confusion by the user, we added "suffixes" to the generated converted files so that we have provenance from what input it came from. + +So, the conversion modules generate data with the following syntax: **`*_{raw,filtered,custom_emptydrops_filter}_matrix.{h5ad,rds}`**. With the following meanings: + +| suffix | meaning | +| :----- | :------ | +| raw | Conversion of the raw/unprocessed matrix generated by the tool. It is also used for tools that generate only one matrix, such as alevin. | +| filtered | Conversion of the filtered/processed matrix generated by the tool | +| custom_emptydrops_filter | Conversion of the matrix that was generated by the new custom empty drops filter module | + +> Some aligners, like `alevin` do not produce both raw&filtered matrices. When aligners give only one output, they are treated with the `raw` suffix. Some aligners may have an option to give both raw&filtered and only one, like `kallisto`. Be aware when using the tools. ## MultiQC From 48dd996a65391afb8842f2f51f9f0ed7ec89b2f7 Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Wed, 13 Mar 2024 12:18:28 +0100 Subject: [PATCH 23/30] remove unused parameter --- nextflow.config | 1 - nextflow_schema.json | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index a0d4f37b..8fefe42c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -27,7 +27,6 @@ params { salmon_index = null // kallisto bustools parameters - kallisto_gene_map = null kallisto_index = null kb_workflow = "standard" kb_t1c = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 6b2353bb..b799d78c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -168,7 +168,7 @@ "txp2gene": { "type": "string", "description": "Path to transcript to gene mapping file. This allows the specification of a transcript to gene mapping file for Salmon Alevin and AlevinQC.", - "help_text": "> This is not the same as the `kallisto_gene_map` parameter down below and is only used by the Salmon Alevin workflow.", + "help_text": "> This is only used by the Salmon Alevin workflow.", "fa_icon": "fas fa-map-marked-alt", "format": "file-path", "exists": true From 9cd8edd7796f62a412afbcbdbe8c44e12197473e Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Wed, 13 Mar 2024 13:24:16 +0100 Subject: [PATCH 24/30] adjust modules to handle kallisto outputs form non-standard (lamanno & nac) workflows --- modules/local/emptydrops.nf | 26 ++++++++++++++++++++++---- modules/local/mtx_to_h5ad.nf | 26 ++++++++++++++++++++++---- modules/local/mtx_to_seurat.nf | 26 ++++++++++++++++++++++---- 3 files changed, 66 insertions(+), 12 deletions(-) diff --git a/modules/local/emptydrops.nf b/modules/local/emptydrops.nf index ee9424f3..907295ab 100644 --- a/modules/local/emptydrops.nf +++ b/modules/local/emptydrops.nf @@ -31,6 +31,24 @@ process EMPTYDROPS_CELL_CALLING { barcodes = "counts_unfiltered/*.barcodes.txt" features = "counts_unfiltered/*.genes.names.txt" + // kallisto allows the following workflows: ["standard", "lamanno", "nac"] + // lamanno creates "spliced" and "unspliced" + // nac creates "nascent", "ambiguous" "mature" + // also, lamanno produces a barcodes and genes file for both spliced and unspliced + // while nac keep only one for all the different .mtx files produced + kb_non_standard_files = "" + if (params.kb_workflow == "lamanno") { + kb_non_standard_files = "spliced unspliced" + matrix = "counts_unfiltered/\${input_type}.mtx" + barcodes = "counts_unfiltered/\${input_type}.barcodes.txt" + features = "counts_unfiltered/\${input_type}.genes.txt" + } + if (params.kb_workflow == "nac") { + kb_non_standard_files = "nascent ambiguous mature" + matrix = "counts_unfiltered/*\${input_type}.mtx" + features = "counts_unfiltered/*.genes.txt" + } // barcodes tsv has same pattern as standard workflow + } else if (params.aligner == "alevin") { matrix = "*_alevin_results/af_quant/alevin/quants_mat.mtx" @@ -52,11 +70,11 @@ process EMPTYDROPS_CELL_CALLING { """ mkdir emptydrops_filtered/ # convert file types - for splice_type in spliced unspliced ; do + for input_type in ${kb_non_standard_files} ; do emptydrops_cell_calling.R \\ - counts_unfiltered/\${splice_type}.mtx \\ - counts_unfiltered/\${splice_type}.barcodes.txt \\ - counts_unfiltered/\${splice_type}.genes.txt \\ + ${matrix} \\ + ${barcodes} \\ + ${features} \\ emptydrops_filtered \\ ${params.aligner} \\ 0 diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf index 3b672645..c603683c 100644 --- a/modules/local/mtx_to_h5ad.nf +++ b/modules/local/mtx_to_h5ad.nf @@ -49,6 +49,24 @@ process MTX_TO_H5AD { barcodes_tsv = "${mtx_dir}/*.barcodes.txt" features_tsv = "${mtx_dir}/*.genes.names.txt" + // kallisto allows the following workflows: ["standard", "lamanno", "nac"] + // lamanno creates "spliced" and "unspliced" + // nac creates "nascent", "ambiguous" "mature" + // also, lamanno produces a barcodes and genes file for both spliced and unspliced + // while nac keep only one for all the different .mtx files produced + kb_non_standard_files = "" + if (params.kb_workflow == "lamanno") { + kb_non_standard_files = "spliced unspliced" + matrix = "${mtx_dir}/\${input_type}.mtx" + barcodes_tsv = "${mtx_dir}/\${input_type}.barcodes.txt" + features_tsv = "${mtx_dir}/\${input_type}.genes.txt" + } + if (params.kb_workflow == "nac") { + kb_non_standard_files = "nascent ambiguous mature" + matrix = "${mtx_dir}/*\${input_type}.mtx" + features_tsv = "${mtx_dir}/*.genes.txt" + } // barcodes tsv has same pattern as standard workflow + } else if (params.aligner == 'alevin') { // alevin does not have filtered/unfiltered results @@ -83,13 +101,13 @@ process MTX_TO_H5AD { else if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') """ # convert file types - for input_type in nascent ambiguous mature ; do + for input_type in ${kb_non_standard_files} ; do mtx_to_h5ad.py \\ --aligner ${params.aligner} \\ --sample ${meta.id} \\ - --input ${mtx_dir}/\${input_type}.mtx \\ - --barcode ${mtx_dir}/\${input_type}.barcodes.txt \\ - --feature ${mtx_dir}/\${input_type}.genes.names.txt \\ + --input ${matrix} \\ + --barcode ${barcodes_tsv} \\ + --feature ${features_tsv} \\ --txp2gene ${txp2gene} \\ --star_index ${star_index} \\ --out ${meta.id}/${meta.id}_\${input_type}_matrix.h5ad ; diff --git a/modules/local/mtx_to_seurat.nf b/modules/local/mtx_to_seurat.nf index 4944686f..3c072c28 100644 --- a/modules/local/mtx_to_seurat.nf +++ b/modules/local/mtx_to_seurat.nf @@ -50,6 +50,24 @@ process MTX_TO_SEURAT { barcodes = "${mtx_dir}/*.barcodes.txt" features = "${mtx_dir}/*.genes.names.txt" + // kallisto allows the following workflows: ["standard", "lamanno", "nac"] + // lamanno creates "spliced" and "unspliced" + // nac creates "nascent", "ambiguous" "mature" + // also, lamanno produces a barcodes and genes file for both spliced and unspliced + // while nac keep only one for all the different .mtx files produced + kb_non_standard_files = "" + if (params.kb_workflow == "lamanno") { + kb_non_standard_files = "spliced unspliced" + matrix = "${mtx_dir}/\${input_type}.mtx" + barcodes = "${mtx_dir}/\${input_type}.barcodes.txt" + features = "${mtx_dir}/\${input_type}.genes.txt" + } + if (params.kb_workflow == "nac") { + kb_non_standard_files = "nascent ambiguous mature" + matrix = "${mtx_dir}/*\${input_type}.mtx" + features = "${mtx_dir}/*.genes.txt" + } // barcodes tsv has same pattern as standard workflow + } else if (params.aligner == "alevin") { mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : '*_alevin_results/af_quant/alevin' @@ -77,11 +95,11 @@ process MTX_TO_SEURAT { if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') """ # convert file types - for input_type in nascent ambiguous mature ; do + for input_type in ${kb_non_standard_files} ; do mtx_to_seurat.R \\ - ${mtx_dir}/\${input_type}.mtx \\ - ${mtx_dir}/\${input_type}.barcodes.txt \\ - ${mtx_dir}/\${input_type}.genes.names.txt \\ + ${matrix} \\ + ${barcodes} \\ + ${features} \\ ${meta.id}/${meta.id}_\${input_type}_matrix.rds \\ ${aligner} \\ ${is_emptydrops} From d77eda82960050fdaa0b31095662e4bd79ebcdc4 Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Wed, 13 Mar 2024 13:57:02 +0100 Subject: [PATCH 25/30] when running kallisto non-standard workflow store emptydrops in subdirs to avoid file collision --- modules/local/emptydrops.nf | 4 ++-- modules/local/mtx_to_h5ad.nf | 1 + modules/local/mtx_to_seurat.nf | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/local/emptydrops.nf b/modules/local/emptydrops.nf index 907295ab..e0b77435 100644 --- a/modules/local/emptydrops.nf +++ b/modules/local/emptydrops.nf @@ -68,14 +68,14 @@ process EMPTYDROPS_CELL_CALLING { // if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') """ - mkdir emptydrops_filtered/ # convert file types for input_type in ${kb_non_standard_files} ; do + mkdir -p emptydrops_filtered/\${input_type} emptydrops_cell_calling.R \\ ${matrix} \\ ${barcodes} \\ ${features} \\ - emptydrops_filtered \\ + emptydrops_filtered/\${input_type} \\ ${params.aligner} \\ 0 done diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf index c603683c..ba8a807e 100644 --- a/modules/local/mtx_to_h5ad.nf +++ b/modules/local/mtx_to_h5ad.nf @@ -45,6 +45,7 @@ process MTX_TO_H5AD { kb_pattern = (input_type == 'raw') ? 'un' : '' mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : "counts_${kb_pattern}filtered" + if ((input_type == 'custom_emptydrops_filter') && (params.kb_workflow != 'standard')) { mtx_dir = 'emptydrops_filtered/\${input_type}' } // dir has subdirs for non-standard workflows mtx_matrix = "${mtx_dir}/*.mtx" barcodes_tsv = "${mtx_dir}/*.barcodes.txt" features_tsv = "${mtx_dir}/*.genes.names.txt" diff --git a/modules/local/mtx_to_seurat.nf b/modules/local/mtx_to_seurat.nf index 3c072c28..3ba636ff 100644 --- a/modules/local/mtx_to_seurat.nf +++ b/modules/local/mtx_to_seurat.nf @@ -46,6 +46,7 @@ process MTX_TO_SEURAT { kb_pattern = (input_type == 'raw') ? 'un' : '' mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : "counts_${kb_pattern}filtered" + if ((input_type == 'custom_emptydrops_filter') && (params.kb_workflow != 'standard')) { mtx_dir = 'emptydrops_filtered/\${input_type}' } // dir has subdirs for non-standard workflows matrix = "${mtx_dir}/*.mtx" barcodes = "${mtx_dir}/*.barcodes.txt" features = "${mtx_dir}/*.genes.names.txt" From c460f28f596836dc3038f077e84616c561768df3 Mon Sep 17 00:00:00 2001 From: fmalmeida Date: Mon, 18 Mar 2024 14:07:41 +0100 Subject: [PATCH 26/30] update modules to get them from nf-core/modules --- modules.json | 78 ++++++++++++++----- .../nf-core/cellranger/count/environment.yml | 5 -- modules/nf-core/cellranger/count/main.nf | 8 +- modules/nf-core/cellranger/count/meta.yml | 8 ++ .../cellranger/count/tests/main.nf.test.snap | 52 ++++++++++++- .../nf-core/kallistobustools/count/main.nf | 4 +- .../nf-core/kallistobustools/count/meta.yml | 8 ++ .../count/tests/main.nf.test.snap | 30 ++++++- 8 files changed, 159 insertions(+), 34 deletions(-) delete mode 100644 modules/nf-core/cellranger/count/environment.yml diff --git a/modules.json b/modules.json index 32171104..227393dd 100644 --- a/modules.json +++ b/modules.json @@ -7,78 +7,108 @@ "nf-core": { "cellranger/count": { "branch": "master", - "git_sha": "a2dfd9a0b2e192243695711c723d652959de39fc", - "installed_by": ["modules"] + "git_sha": "92ca535c5a8c0fe89eb71e649ee536bd355ce4fc", + "installed_by": [ + "modules" + ] }, "cellranger/mkgtf": { "branch": "master", "git_sha": "575e1bc54b083fb15e7dd8b5fcc40bea60e8ce83", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "cellranger/mkref": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "cellrangerarc/count": { "branch": "master", "git_sha": "18e53e27cfeca5dbbfbeee675c05438dec68245f", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "cellrangerarc/mkgtf": { "branch": "master", "git_sha": "575e1bc54b083fb15e7dd8b5fcc40bea60e8ce83", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "cellrangerarc/mkref": { "branch": "master", "git_sha": "4196b1b2e7ce265892f3979eabf7a9ddc030702f", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fastqc": { "branch": "master", "git_sha": "f4ae1d942bd50c5c0b9bd2de1393ce38315ba57c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "gffread": { "branch": "master", "git_sha": "b1b959609bda44341120aed1766329909f54b8d0", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "gunzip": { "branch": "master", "git_sha": "3a5fef109d113b4997c9822198664ca5f2716208", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "kallistobustools/count": { "branch": "master", - "git_sha": "de8215983defba48cd81961d620a9e844f11c7e7", - "installed_by": ["modules"] + "git_sha": "9d3e489286eead7dfe1010fd324904d8b698eca7", + "installed_by": [ + "modules" + ] }, "kallistobustools/ref": { "branch": "master", "git_sha": "de8215983defba48cd81961d620a9e844f11c7e7", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "multiqc": { "branch": "master", "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "star/genomegenerate": { "branch": "master", "git_sha": "a21faa6a3481af92a343a10926f59c189a2c16c9", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "universc": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "unzip": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } }, @@ -87,20 +117,26 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfvalidation_plugin": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] } } } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/cellranger/count/environment.yml b/modules/nf-core/cellranger/count/environment.yml deleted file mode 100644 index 662f747d..00000000 --- a/modules/nf-core/cellranger/count/environment.yml +++ /dev/null @@ -1,5 +0,0 @@ -name: cellranger_count -channels: - - conda-forge - - bioconda - - defaults diff --git a/modules/nf-core/cellranger/count/main.nf b/modules/nf-core/cellranger/count/main.nf index bbbe2359..1811d745 100644 --- a/modules/nf-core/cellranger/count/main.nf +++ b/modules/nf-core/cellranger/count/main.nf @@ -10,8 +10,8 @@ process CELLRANGER_COUNT { output: tuple val(meta), path("**/outs/**") , emit: outs - tuple val(meta), path("**/outs/filtered_feature_bc_matrix**"), emit: filtered // TODO: Add to nf-coew/modules before merging PR - tuple val(meta), path("**/outs/raw_feature_bc_matrix**") , emit: raw // TODO: Add to nf-coew/modules before merging PR + tuple val(meta), path("**/outs/filtered_feature_bc_matrix**"), emit: filtered + tuple val(meta), path("**/outs/raw_feature_bc_matrix**") , emit: raw path "versions.yml" , emit: versions when: @@ -34,7 +34,11 @@ process CELLRANGER_COUNT { def prefix = task.ext.prefix ?: "${meta.id}" """ mkdir -p "${prefix}/outs/" + mkdir -p "${prefix}/outs/filtered_feature_bc_matrix" + mkdir -p "${prefix}/outs/raw_feature_bc_matrix" echo "$prefix" > ${prefix}/outs/fake_file.txt + echo "$prefix" > ${prefix}/outs/filtered_feature_bc_matrix/fake_file.txt + echo "$prefix" > ${prefix}/outs/raw_feature_bc_matrix/fake_file.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/cellranger/count/meta.yml b/modules/nf-core/cellranger/count/meta.yml index 1f1768a8..56244976 100644 --- a/modules/nf-core/cellranger/count/meta.yml +++ b/modules/nf-core/cellranger/count/meta.yml @@ -40,6 +40,14 @@ output: type: file description: Files containing the outputs of Cell Ranger, see official 10X Genomics documentation for a complete list pattern: "${meta.id}/outs/*" + - filtered: + type: file + description: Files containing the filtered outputs of Cell Ranger. + pattern: "**/outs/filtered_feature_bc_matrix**" + - raw: + type: file + description: Files containing the raw outputs of Cell Ranger. + pattern: "**/outs/raw_feature_bc_matrix**" - versions: type: file description: File containing software version diff --git a/modules/nf-core/cellranger/count/tests/main.nf.test.snap b/modules/nf-core/cellranger/count/tests/main.nf.test.snap index 7eafafd0..edfb304b 100644 --- a/modules/nf-core/cellranger/count/tests/main.nf.test.snap +++ b/modules/nf-core/cellranger/count/tests/main.nf.test.snap @@ -33,13 +33,61 @@ "single_end": false, "strandedness": "auto" }, - "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d" + [ + "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d", + "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d", + "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d" + ] ] ], "1": [ + [ + { + "id": "test_10x", + "single_end": false, + "strandedness": "auto" + }, + "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d" + ] + ], + "2": [ + [ + { + "id": "test_10x", + "single_end": false, + "strandedness": "auto" + }, + "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d" + ] + ], + "3": [ "versions.yml:md5,30cee1a9146b01c48d9b1db6bbe813b6" ], + "filtered": [ + [ + { + "id": "test_10x", + "single_end": false, + "strandedness": "auto" + }, + "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d" + ] + ], "outs": [ + [ + { + "id": "test_10x", + "single_end": false, + "strandedness": "auto" + }, + [ + "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d", + "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d", + "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d" + ] + ] + ], + "raw": [ [ { "id": "test_10x", @@ -58,6 +106,6 @@ "nf-test": "0.8.4", "nextflow": "23.10.1" }, - "timestamp": "2024-03-05T17:16:12.322822411" + "timestamp": "2024-03-18T11:41:17.258523741" } } \ No newline at end of file diff --git a/modules/nf-core/kallistobustools/count/main.nf b/modules/nf-core/kallistobustools/count/main.nf index 1693c786..1efda00a 100644 --- a/modules/nf-core/kallistobustools/count/main.nf +++ b/modules/nf-core/kallistobustools/count/main.nf @@ -18,8 +18,8 @@ process KALLISTOBUSTOOLS_COUNT { output: tuple val(meta), path ("*.count") , emit: count - tuple val(meta), path ("*.count/counts_unfiltered"), emit: raw_counts // TODO: Add to nf-coew/modules before merging PR - tuple val(meta), path ("*.count/counts_filtered") , emit: filtered_counts, optional: true // TODO: Add to nf-coew/modules before merging PR + tuple val(meta), path ("*.count/counts_unfiltered"), emit: raw_counts + tuple val(meta), path ("*.count/counts_filtered") , emit: filtered_counts, optional: true path "versions.yml" , emit: versions path "*.count/*/*.mtx" , emit: matrix //Ensure that kallisto finished and produced outputs diff --git a/modules/nf-core/kallistobustools/count/meta.yml b/modules/nf-core/kallistobustools/count/meta.yml index 55d5dc6c..d491dffa 100644 --- a/modules/nf-core/kallistobustools/count/meta.yml +++ b/modules/nf-core/kallistobustools/count/meta.yml @@ -58,6 +58,14 @@ output: type: file description: kb count output folder pattern: "*.{count}" + - raw_counts: + type: file + description: kb raw counts output folder + pattern: "*.{count}/counts_unfiltered" + - filtered_counts: + type: file + description: kb filtered counts output folder + pattern: "*.{count}/counts_filtered" - versions: type: file description: File containing software versions diff --git a/modules/nf-core/kallistobustools/count/tests/main.nf.test.snap b/modules/nf-core/kallistobustools/count/tests/main.nf.test.snap index 3378c3c1..6f6b3183 100644 --- a/modules/nf-core/kallistobustools/count/tests/main.nf.test.snap +++ b/modules/nf-core/kallistobustools/count/tests/main.nf.test.snap @@ -15,9 +15,22 @@ ] ], "1": [ - "versions.yml:md5,6ec06270afe0a7572c41567160d927d9" + [ + { + "id": "test" + }, + [ + "cells_x_genes.mtx:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] ], "2": [ + + ], + "3": [ + "versions.yml:md5,6ec06270afe0a7572c41567160d927d9" + ], + "4": [ "cells_x_genes.mtx:md5,d41d8cd98f00b204e9800998ecf8427e" ], "count": [ @@ -31,10 +44,23 @@ ] ] ] + ], + "filtered_counts": [ + ], "matrix": [ "cells_x_genes.mtx:md5,d41d8cd98f00b204e9800998ecf8427e" ], + "raw_counts": [ + [ + { + "id": "test" + }, + [ + "cells_x_genes.mtx:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], "versions": [ "versions.yml:md5,6ec06270afe0a7572c41567160d927d9" ] @@ -44,7 +70,7 @@ "nf-test": "0.8.4", "nextflow": "23.10.1" }, - "timestamp": "2024-03-01T15:48:45.775904562" + "timestamp": "2024-03-18T11:38:48.980939376" }, "genome.fasta + genome.gtf + '10X3' + 'standard'": { "content": [ From 1c1f1ac67231658d81b1e079df12d686ce087bc5 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Mon, 18 Mar 2024 13:48:09 +0000 Subject: [PATCH 27/30] prettier fix --- docs/output.md | 12 ++++---- modules.json | 74 +++++++++++++------------------------------------- 2 files changed, 25 insertions(+), 61 deletions(-) diff --git a/docs/output.md b/docs/output.md index aac47219..cff2d442 100644 --- a/docs/output.md +++ b/docs/output.md @@ -158,15 +158,15 @@ The pipeline also possess a module to perform empty-drops calling and filtering - `.mtx` files converted to R native data format, rds, using the [Seurat package](https://github.com/satijalab/seurat) - One per sample -Because the pipeline has both the data directly from the aligners, and from the custom empty-drops filtering module the conversion modules were modified to understand the difference between raw/filtered from the aligners itself and filtered from the custom empty-drops module. So, to try to avoid confusion by the user, we added "suffixes" to the generated converted files so that we have provenance from what input it came from. +Because the pipeline has both the data directly from the aligners, and from the custom empty-drops filtering module the conversion modules were modified to understand the difference between raw/filtered from the aligners itself and filtered from the custom empty-drops module. So, to try to avoid confusion by the user, we added "suffixes" to the generated converted files so that we have provenance from what input it came from. So, the conversion modules generate data with the following syntax: **`*_{raw,filtered,custom_emptydrops_filter}_matrix.{h5ad,rds}`**. With the following meanings: -| suffix | meaning | -| :----- | :------ | -| raw | Conversion of the raw/unprocessed matrix generated by the tool. It is also used for tools that generate only one matrix, such as alevin. | -| filtered | Conversion of the filtered/processed matrix generated by the tool | -| custom_emptydrops_filter | Conversion of the matrix that was generated by the new custom empty drops filter module | +| suffix | meaning | +| :----------------------- | :--------------------------------------------------------------------------------------------------------------------------------------- | +| raw | Conversion of the raw/unprocessed matrix generated by the tool. It is also used for tools that generate only one matrix, such as alevin. | +| filtered | Conversion of the filtered/processed matrix generated by the tool | +| custom_emptydrops_filter | Conversion of the matrix that was generated by the new custom empty drops filter module | > Some aligners, like `alevin` do not produce both raw&filtered matrices. When aligners give only one output, they are treated with the `raw` suffix. Some aligners may have an option to give both raw&filtered and only one, like `kallisto`. Be aware when using the tools. diff --git a/modules.json b/modules.json index 227393dd..5d8614a7 100644 --- a/modules.json +++ b/modules.json @@ -8,107 +8,77 @@ "cellranger/count": { "branch": "master", "git_sha": "92ca535c5a8c0fe89eb71e649ee536bd355ce4fc", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "cellranger/mkgtf": { "branch": "master", "git_sha": "575e1bc54b083fb15e7dd8b5fcc40bea60e8ce83", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "cellranger/mkref": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "cellrangerarc/count": { "branch": "master", "git_sha": "18e53e27cfeca5dbbfbeee675c05438dec68245f", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "cellrangerarc/mkgtf": { "branch": "master", "git_sha": "575e1bc54b083fb15e7dd8b5fcc40bea60e8ce83", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "cellrangerarc/mkref": { "branch": "master", "git_sha": "4196b1b2e7ce265892f3979eabf7a9ddc030702f", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "fastqc": { "branch": "master", "git_sha": "f4ae1d942bd50c5c0b9bd2de1393ce38315ba57c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "gffread": { "branch": "master", "git_sha": "b1b959609bda44341120aed1766329909f54b8d0", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "gunzip": { "branch": "master", "git_sha": "3a5fef109d113b4997c9822198664ca5f2716208", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "kallistobustools/count": { "branch": "master", "git_sha": "9d3e489286eead7dfe1010fd324904d8b698eca7", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "kallistobustools/ref": { "branch": "master", "git_sha": "de8215983defba48cd81961d620a9e844f11c7e7", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "multiqc": { "branch": "master", "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "star/genomegenerate": { "branch": "master", "git_sha": "a21faa6a3481af92a343a10926f59c189a2c16c9", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "universc": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "unzip": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] } } }, @@ -117,26 +87,20 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfvalidation_plugin": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] } } } } } -} \ No newline at end of file +} From 6351f76b50a8a528c11de6f2599cbc933d65265f Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Mon, 18 Mar 2024 14:52:00 +0000 Subject: [PATCH 28/30] small update as file names changed --- tests/main_pipeline_kallisto.test.snap | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/main_pipeline_kallisto.test.snap b/tests/main_pipeline_kallisto.test.snap index 1eb15749..f9b9c96a 100644 --- a/tests/main_pipeline_kallisto.test.snap +++ b/tests/main_pipeline_kallisto.test.snap @@ -3,13 +3,13 @@ "content": [ { "stderr": [ - + ], "errorReport": "", "exitStatus": 0, "failed": false, "stdout": [ - + ], "errorMessage": "", "trace": { @@ -26,9 +26,13 @@ "cells_x_genes.barcodes.txt:md5,a8cf7ea4b2d075296a94bf066a64b7a4", "cells_x_genes.genes.txt:md5,acd9d00120f52031974b2add3e7521b6", "cells_x_genes.mtx:md5,abd83de117204d0a77df3c92d00cc025", - "Sample_X_matrix.rds:md5,0938f4189b7a7fd1030abfcee798741c", - "Sample_Y_matrix.rds:md5,93c12abe283ab37c5f37e5cd3cb25302" + "Sample_X_raw_matrix.rds:md5,0938f4189b7a7fd1030abfcee798741c", + "Sample_Y_raw_matrix.rds:md5,93c12abe283ab37c5f37e5cd3cb25302" ], - "timestamp": "2024-02-27T12:19:47.921508953" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T14:51:42.040931572" } -} +} \ No newline at end of file From 9734aced56cffa65643668c8759ac2960d83a8c0 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Mon, 18 Mar 2024 14:55:58 +0000 Subject: [PATCH 29/30] add ending line --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 35378d07..bc675aba 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,4 @@ log/ reports/ testme.sh .nf-test* -.vscode \ No newline at end of file +.vscode From 240660022119c0c1b7af9ac198e824355e5d79d8 Mon Sep 17 00:00:00 2001 From: Felipe Marques de Almeida Date: Mon, 18 Mar 2024 16:04:43 +0000 Subject: [PATCH 30/30] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51953318..775841d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Update template to v2.13.1 ([#309](https://github.com/nf-core/scrnaseq/pull/309)) - Update to kallisto|bustools v0.28.2 ([#294](https://github.com/nf-core/scrnaseq/pull/294)) - Fix cellrangerarc matrix conversions and protocol selection ([#300](https://github.com/nf-core/scrnaseq/pull/300)) +- Add new emptydrops calling module ([#301](https://github.com/nf-core/scrnaseq/pull/301)) ## v2.5.1