Merge branch 'fix-mirdeep2_and_contamination_filter' into dev

nf-core · Oct 27, 2023 · ebf9b8e · ebf9b8e
2 parents 12b09d3 + b59a73f
commit ebf9b8e
Show file tree

Hide file tree

Showing 16 changed files with 105 additions and 69 deletions.
diff --git a/README.md b/README.md
@@ -27,29 +27,38 @@ You can find numerous talks on the nf-core events page from various topics inclu
 
 ## Pipeline summary
 
-1. Raw read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
-2. Adapter trimming ([`Trim Galore!`](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/))
-   1. Insert Size calculation
-   2. Collapse reads ([`seqcluster`](https://seqcluster.readthedocs.io/mirna_annotation.html#processing-of-reads))
-3. Contamination filtering ([`Bowtie2`](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml))
-4. Alignment against miRBase mature miRNA ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml))
-5. Alignment against miRBase hairpin
-   1. Unaligned reads from step 3 ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml))
-   2. Collapsed reads from step 2.2 ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml))
-6. Post-alignment processing of miRBase hairpin
-   1. Basic statistics from step 3 and step 4.1 ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/))
-   2. Analysis on miRBase, or MirGeneDB hairpin counts ([`edgeR`](https://bioconductor.org/packages/release/bioc/html/edgeR.html))
-      - TMM normalization and a table of top expression hairpin
-      - MDS plot clustering samples
-      - Heatmap of sample similarities
-   3. miRNA and isomiR annotation from step 4.1 ([`mirtop`](https://github.com/miRTop/mirtop))
-7. Alignment against host reference genome ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml))
-   1. Post-alignment processing of alignment against host reference genome ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/))
-8. Novel miRNAs and known miRNAs discovery ([`MiRDeep2`](https://www.mdc-berlin.de/content/mirdeep2-documentation))
-   1. Mapping against reference genome with the mapper module
-   2. Known and novel miRNA discovery with the mirdeep2 module
-9. miRNA quality control ([`mirtrace`](https://github.com/friedlanderlab/mirtrace))
-10. Present QC for raw read, alignment, and expression results ([`MultiQC`](http://multiqc.info/))
+1. Quality check and triming
+    1. Raw read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
+    2. Adapter trimming ([`fastp`](https://github.com/OpenGene/fastp))
+    3. Trim read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
+2. miRNA QC ([`miRTrace`](https://github.com/friedlanderlab/mirtrace))
+3. Contamination filtering ([`Bowtie2`](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml)) (Optional)
+    1. rRNA filtration
+    2. tRNA filtration
+    3. cDNA filtration
+    4. ncRNA filtration
+    5. piRNA filtration
+    6. Others filtration
+4. miRNA quantification
+    - EdgeR
+        1. Reads alignment against miRBase mature miRNA ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml))
+        2. Post-alignment processing of alignment against Mature miRNA ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/))
+        3. Unmapped reads (from reads vs mature miRNA) alignment against miRBase hairpin
+        4. Post-alignment processing of alignment against Hairpin ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/))
+        5. Analysis on miRBase, or MirGeneDB hairpin counts ([`edgeR`](https://bioconductor.org/packages/release/bioc/html/edgeR.html))
+            - TMM normalization and a table of top expression hairpin
+            - MDS plot clustering samples
+            - Heatmap of sample similarities
+    - Mirtop quantification
+        1. Read collapsing  ([`seqcluster`](https://github.com/lpantano/seqcluster))
+        2. miRNA and isomiR annotation ([`mirtop`](https://github.com/miRTop/mirtop))
+5. Genome Quantification (Optional)
+    1. Reads alignment against host reference genome ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml))
+    2. Post-alignment processing of alignment against host reference genome ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/))
+6. Novel miRNAs and known miRNAs discovery ([`MiRDeep2`](https://www.mdc-berlin.de/content/mirdeep2-documentation)) (Optional)
+    1. Mapping against reference genome with the mapper module
+    2. Known and novel miRNA discovery with the mirdeep2 module
+7. Present QC for raw read, alignment, and expression results ([`MultiQC`](http://multiqc.info/))
 
 ## Usage
 

diff --git a/conf/modules.config b/conf/modules.config
@@ -80,21 +80,13 @@ process {
 process {
     withName: 'MIRTRACE_RUN' {
         publishDir = [
-            path: { "${params.outdir}/mirtrace/${meta.id}" },
+            path: { "${params.outdir}/mirtrace" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
     }
 }
 
-if (!(params.skip_fastqc)) {
-    process {
-        withName: '.*:FASTQC_FASTP:FASTQC_.*' {
-            ext.args = '--quiet'
-        }
-    }
-}
-
 if (!params.skip_fastp) {
     process {
         withName: 'FASTP' {
@@ -130,6 +122,14 @@ if (!params.skip_fastp) {
 
     if (!params.skip_fastqc) {
         process {
+            withName: '.*:.*:FASTQC_FASTP:FASTQC_RAW' {
+                ext.args = '--quiet'
+                publishDir = [
+                    path: { "${params.outdir}/fastqc/raw" },
+                    mode: params.publish_dir_mode,
+                    saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+                ]
+            }
             withName: '.*:.*:FASTQC_FASTP:FASTQC_TRIM' {
                 ext.args = '--quiet'
                 publishDir = [
@@ -238,7 +238,14 @@ if (!params.skip_mirdeep) {
     process {
         withName: 'MIRDEEP2_MAPPER' {
             publishDir = [
-                path: { "${params.outdir}/mirdeep" },
+                path: { "${params.outdir}/mirdeep2/mapper" },
+                mode: params.publish_dir_mode,
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            ]
+        }
+        withName: 'MIRDEEP2_RUN' {
+            publishDir = [
+                path: { "${params.outdir}/mirdeep2/run" },
                 mode: params.publish_dir_mode,
                 saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
             ]

diff --git a/conf/test.config b/conf/test.config
@@ -23,8 +23,8 @@ params {
 
     input            = 'https://github.com/nf-core/test-datasets/raw/smrnaseq/samplesheet/v2.0/samplesheet.csv'
     fasta            = 'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/genome.fa'
-    mature           = 'https://mirbase.org/download/CURRENT/mature.fa'
-    hairpin          = 'https://mirbase.org/download/CURRENT/hairpin.fa'
+    mature           = 'https://mirbase.org/download/mature.fa'
+    hairpin          = 'https://mirbase.org/download/hairpin.fa'
     mirna_gtf        = 'https://mirbase.org/download/hsa.gff3'
     mirtrace_species = 'hsa'
     protocol         = 'illumina'

diff --git a/docs/usage.md b/docs/usage.md
@@ -27,10 +27,10 @@ It should point to the 3-letter species name used by [miRBase](https://www.mirba
 Different parameters can be set for the two supported databases. By default `miRBase` will be used with the parameters below.
 
 - `mirna_gtf`: If not supplied by the user, then `mirna_gtf` will point to the latest GFF3 file in miRbase: `https://mirbase.org/download/CURRENT/genomes/${params.mirtrace_species}.gff3`
-- `mature`: points to the FASTA file of mature miRNA sequences. `https://mirbase.org/download/CURRENT/mature.fa`
-- `hairpin`: points to the FASTA file of precursor miRNA sequences. `https://mirbase.org/download/CURRENT/hairpin.fa`
+- `mature`: points to the FASTA file of mature miRNA sequences. `https://mirbase.org/download/mature.fa`
+- `hairpin`: points to the FASTA file of precursor miRNA sequences. `https://mirbase.org/download/hairpin.fa`
 
-If MirGeneDB should be used instead it needs to be specified using `--mirgenedb` and use the parameters below .
+If MirGeneDB should be used instead it needs to be specified using `--mirgenedb` and use the parameters below.
 
 - `mirgenedb_gff`: The data can not be downloaded automatically (URLs are created with short term tokens in it), thus the user needs to supply the gff file for either his species, or all species downloaded from `https://mirgenedb.org/download`. The total set will automatically be subsetted to the species specified with `--mirgenedb_species`.
 - `mirgenedb_mature`: points to the FASTA file of mature miRNA sequences. Download from `https://mirgenedb.org/download`.

diff --git a/modules/local/blat_mirna.nf b/modules/local/blat_mirna.nf
@@ -53,7 +53,7 @@ process BLAT_MIRNA {
         blat -out=blast8 $mirna $contaminants /dev/stdout | awk 'BEGIN{FS="\t"}{if(\$11 < 1e-5)print \$1;}' | uniq > mirnahit.txt
         awk 'BEGIN { while((getline<"mirnahit.txt")>0) l[">"\$1]=1 } /^>/ {x = l[\$1]} {if(!x) print }' $contaminants  > filtered.fa
 
-cat <<-END_VERSIONS > versions.yml
+        cat <<-END_VERSIONS > versions.yml
         "${task.process}":
             blat: \$(echo \$(blat) | grep Standalone | awk '{ if (match(\$0,/[0-9]*[0-9]/,m)) print m[0] }')
         END_VERSIONS

diff --git a/modules/local/bowtie_map_contaminants.nf b/modules/local/bowtie_map_contaminants.nf
@@ -21,17 +21,20 @@ process BOWTIE_MAP_CONTAMINANTS {
     task.ext.when == null || task.ext.when
 
     script:
+    def args = task.ext.args ?: ""
+
     """
-    INDEX=`find -L ./ -name "*.3.ebwt" | sed 's/.3.ebwt//'`
+    INDEX=`find -L ./ -name "*.rev.1.bt2" | sed "s/\\.rev.1.bt2\$//"`
     bowtie2 \\
+        -x \$INDEX \\
+        -U ${reads} \\
         --threads ${task.cpus} \\
+        --un ${meta.id}.${contaminant_type}.filter.unmapped.contaminant.fastq \\
         --very-sensitive-local \\
         -k 1 \\
-        -x \$INDEX \\
-        --un ${meta.id}.${contaminant_type}.filter.unmapped.contaminant.fastq \\
-        ${reads} \\
+        -S ${meta.id}.filter.contaminant.sam \\
         ${args} \\
-        -S ${meta.id}.filter.contaminant.sam > ${meta.id}.contaminant_bowtie.log 2>&1
+        > ${meta.id}.contaminant_bowtie.log 2>&1
 
     # extracting number of reads from bowtie logs
     awk -v type=${contaminant_type} 'BEGIN{tot=0} {if(NR==4 || NR == 5){tot += \$1}} END {print "\\""type"\\": "tot }' ${meta.id}.contaminant_bowtie.log | tr -d , > filtered.${meta.id}_${contaminant_type}.stats

diff --git a/modules/local/mirdeep2_prepare.nf b/modules/local/mirdeep2_prepare.nf
@@ -23,7 +23,7 @@ process MIRDEEP2_PIGZ {
     pigz -f -d -p $task.cpus $reads
 
     cat <<-END_VERSIONS > versions.yml
-    ${task.process}":
+    "${task.process}":
         pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
     END_VERSIONS
     """

diff --git a/modules/local/mirdeep2_run.nf b/modules/local/mirdeep2_run.nf
@@ -35,7 +35,7 @@ process MIRDEEP2_RUN {
         -z _${reads.simpleName}
 
     cat <<-END_VERSIONS > versions.yml
-    ${task.process}":
+    "${task.process}":
         mirdeep2: \$(echo "$VERSION")
     END_VERSIONS
     """

diff --git a/modules/local/mirtop_quant.nf b/modules/local/mirtop_quant.nf
@@ -34,7 +34,7 @@ process MIRTOP_QUANT {
     mv mirtop/stats/mirtop_stats.log mirtop/stats/full_mirtop_stats.log
 
     cat <<-END_VERSIONS > versions.yml
-    ${task.process}":
+    "${task.process}":
         mirtop: \$(echo \$(mirtop --version 2>&1) | sed 's/^.*mirtop //')
     END_VERSIONS
     """

diff --git a/modules/local/mirtrace.nf b/modules/local/mirtrace.nf
@@ -43,7 +43,7 @@ process MIRTRACE_RUN {
         --force
 
     cat <<-END_VERSIONS > versions.yml
-    ${task.process}":
+    "${task.process}":
         mirtrace: \$(echo \$(mirtrace -v 2>&1))
     END_VERSIONS
     """

diff --git a/modules/local/parse_fasta_mirna.nf b/modules/local/parse_fasta_mirna.nf
@@ -34,7 +34,7 @@ process PARSE_FASTA_MIRNA {
     seqkit seq --rna2dna \${FASTA}_sps.fa > \${FASTA}_igenome.fa
 
     cat <<-END_VERSIONS > versions.yml
-    ${task.process}":
+    "${task.process}":
         seqkit: \$(echo \$(seqkit 2>&1) | sed 's/^.*Version: //; s/ .*\$//')
     END_VERSIONS
     """

diff --git a/modules/local/seqcluster_collapse.nf b/modules/local/seqcluster_collapse.nf
@@ -25,7 +25,7 @@ process SEQCLUSTER_SEQUENCES {
     mv collapsed/*.fastq.gz final/.
 
     cat <<-END_VERSIONS > versions.yml
-    ${task.process}":
+    "${task.process}":
         seqcluster: \$(echo \$(seqcluster --version 2>&1) | sed 's/^.*seqcluster //')
     END_VERSIONS
     """

diff --git a/nextflow.config b/nextflow.config
@@ -21,8 +21,8 @@ params {
     igenomes_base               = 's3://ngi-igenomes/igenomes'
     igenomes_ignore             = false
     mirna_gtf                   = null
-    mature                      = "https://mirbase.org/download/CURRENT/mature.fa"
-    hairpin                     = "https://mirbase.org/download/CURRENT/hairpin.fa"
+    mature                      = "https://mirbase.org/download/mature.fa"
+    hairpin                     = "https://mirbase.org/download/hairpin.fa"
     mirgenedb                   = false
     mirgenedb_mature            = null
     mirgenedb_hairpin           = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -104,7 +104,7 @@
                     "description": "Path to FASTA file with mature miRNAs.",
                     "fa_icon": "fas fa-wheelchair",
                     "help_text": "Typically this will be the `mature.fa` file from miRBase. Can be given either as a plain text `.fa` file or a compressed `.gz` file.\n\nDefaults to the current miRBase release URL, from which the file will be downloaded.",
-                    "default": "https://mirbase.org/download/CURRENT/mature.fa"
+                    "default": "https://mirbase.org/download/mature.fa"
                 },
                 "mirgenedb_mature": {
                     "type": "string",
@@ -116,7 +116,7 @@
                     "description": "Path to FASTA file with miRNAs precursors.",
                     "fa_icon": "fab fa-cuttlefish",
                     "help_text": "Typically this will be the `mature.fa` file from miRBase. Can be given either as a plain text `.fa` file or a compressed `.gz` file.\n\nDefaults to the current miRBase release URL, from which the file will be downloaded.",
-                    "default": "https://mirbase.org/download/CURRENT/hairpin.fa"
+                    "default": "https://mirbase.org/download/hairpin.fa"
                 },
                 "mirgenedb_hairpin": {
                     "type": "string",

diff --git a/subworkflows/local/mirna_quant.nf b/subworkflows/local/mirna_quant.nf
@@ -33,24 +33,17 @@ workflow MIRNA_QUANT {
     main:
     ch_versions = Channel.empty()
 
+
+
     PARSE_MATURE ( mature ).parsed_fasta.set { mirna_parsed }
     ch_versions = ch_versions.mix(PARSE_MATURE.out.versions)
 
     FORMAT_MATURE ( mirna_parsed )
     ch_versions = ch_versions.mix(FORMAT_MATURE.out.versions)
 
-    PARSE_HAIRPIN ( hairpin ).parsed_fasta.set { hairpin_parsed }
-    ch_versions = ch_versions.mix(PARSE_HAIRPIN.out.versions)
-
-    FORMAT_HAIRPIN ( hairpin_parsed )
-    ch_versions = ch_versions.mix(FORMAT_HAIRPIN.out.versions)
-
     INDEX_MATURE ( FORMAT_MATURE.out.formatted_fasta ).index.set { mature_bowtie }
     ch_versions = ch_versions.mix(INDEX_MATURE.out.versions)
 
-    INDEX_HAIRPIN ( FORMAT_HAIRPIN.out.formatted_fasta ).index.set { hairpin_bowtie }
-    ch_versions = ch_versions.mix(INDEX_HAIRPIN.out.versions)
-
     reads
         .map { add_suffix(it, "mature") }
         .dump (tag:'msux')
@@ -64,15 +57,28 @@ workflow MIRNA_QUANT {
         .dump (tag:'hsux')
         .set { reads_hairpin }
 
-    BOWTIE_MAP_HAIRPIN ( reads_hairpin, hairpin_bowtie.collect() )
-    ch_versions = ch_versions.mix(BOWTIE_MAP_HAIRPIN.out.versions)
-
     BAM_STATS_MATURE ( BOWTIE_MAP_MATURE.out.bam, FORMAT_MATURE.out.formatted_fasta )
     ch_versions = ch_versions.mix(BAM_STATS_MATURE.out.versions)
 
+
+
+    PARSE_HAIRPIN ( hairpin ).parsed_fasta.set { hairpin_parsed }
+    ch_versions = ch_versions.mix(PARSE_HAIRPIN.out.versions)
+
+    FORMAT_HAIRPIN ( hairpin_parsed )
+    ch_versions = ch_versions.mix(FORMAT_HAIRPIN.out.versions)
+
+    INDEX_HAIRPIN ( FORMAT_HAIRPIN.out.formatted_fasta ).index.set { hairpin_bowtie }
+    ch_versions = ch_versions.mix(INDEX_HAIRPIN.out.versions)
+
+    BOWTIE_MAP_HAIRPIN ( reads_hairpin, hairpin_bowtie.collect() )
+    ch_versions = ch_versions.mix(BOWTIE_MAP_HAIRPIN.out.versions)
+
     BAM_STATS_HAIRPIN ( BOWTIE_MAP_HAIRPIN.out.bam, FORMAT_HAIRPIN.out.formatted_fasta )
     ch_versions = ch_versions.mix(BAM_STATS_HAIRPIN.out.versions)
 
+
+
     BAM_STATS_MATURE.out.idxstats.collect{it[1]}
         .mix(BAM_STATS_HAIRPIN.out.idxstats.collect{it[1]})
         .dump(tag:'edger')
@@ -81,6 +87,8 @@ workflow MIRNA_QUANT {
         .set { edger_input }
     EDGER_QC ( edger_input )
 
+
+
     reads
         .map { add_suffix(it, "seqcluster") }
         .dump (tag:'ssux')
@@ -92,6 +100,9 @@ workflow MIRNA_QUANT {
     BOWTIE_MAP_SEQCLUSTER ( reads_collapsed, hairpin_bowtie.collect() )
     ch_versions = ch_versions.mix(BOWTIE_MAP_SEQCLUSTER.out.versions)
 
+
+
+
     ch_mirtop_logs = Channel.empty()
     if (params.mirtrace_species){
         MIRTOP_QUANT ( BOWTIE_MAP_SEQCLUSTER.out.bam.collect{it[1]}, FORMAT_HAIRPIN.out.formatted_fasta.collect{it[1]}, gtf )
@@ -106,6 +117,8 @@ workflow MIRNA_QUANT {
         .dump (tag:'gsux')
         .set { reads_genome }
 
+
+
     emit:
     fasta_mature        = FORMAT_MATURE.out.formatted_fasta
     fasta_hairpin       = FORMAT_HAIRPIN.out.formatted_fasta

diff --git a/workflows/smrnaseq.nf b/workflows/smrnaseq.nf
@@ -195,13 +195,16 @@ workflow SMRNASEQ {
         genome_stats = GENOME_QUANT.out.stats
         ch_versions = ch_versions.mix(GENOME_QUANT.out.versions)
 
+        hairpin_clean = MIRNA_QUANT.out.fasta_hairpin.map { it -> it[1] }
+        mature_clean  = MIRNA_QUANT.out.fasta_mature.map { it -> it[1] }
+
         if (!params.skip_mirdeep) {
             MIRDEEP2 (
                 FASTQC_FASTP.out.reads,
                 GENOME_QUANT.out.fasta,
                 GENOME_QUANT.out.index.collect(),
-                MIRNA_QUANT.out.fasta_hairpin,
-                MIRNA_QUANT.out.fasta_mature
+                hairpin_clean,
+                mature_clean
             )
             ch_versions = ch_versions.mix(MIRDEEP2.out.versions)
         }
@@ -227,7 +230,8 @@ workflow SMRNASEQ {
         ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
         ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
         ch_multiqc_files = ch_multiqc_files.mix(FASTQC_FASTP.out.fastqc_raw_zip.collect{it[1]}.ifEmpty([]))
-        ch_multiqc_files = ch_multiqc_files.mix(FASTQC_FASTP.out.trim_json.collect{it[1]}.ifEmpty([]))
+        ch_multiqc_files = ch_multiqc_files.mix(FASTQC_FASTP.out.fastqc_trim_zip.collect{it[1]}.ifEmpty([]))
+        // ch_multiqc_files = ch_multiqc_files.mix(FASTQC_FASTP.out.trim_json.collect{it[1]}.ifEmpty([]))
         ch_multiqc_files = ch_multiqc_files.mix(contamination_stats.collect().ifEmpty([]))
         ch_multiqc_files = ch_multiqc_files.mix(genome_stats.collect({it[1]}).ifEmpty([]))
         ch_multiqc_files = ch_multiqc_files.mix(MIRNA_QUANT.out.mature_stats.collect({it[1]}).ifEmpty([]))