diff --git a/README.md b/README.md index 7c010c00..4290fc7b 100644 --- a/README.md +++ b/README.md @@ -27,29 +27,38 @@ You can find numerous talks on the nf-core events page from various topics inclu ## Pipeline summary -1. Raw read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Adapter trimming ([`Trim Galore!`](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/)) - 1. Insert Size calculation - 2. Collapse reads ([`seqcluster`](https://seqcluster.readthedocs.io/mirna_annotation.html#processing-of-reads)) -3. Contamination filtering ([`Bowtie2`](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml)) -4. Alignment against miRBase mature miRNA ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) -5. Alignment against miRBase hairpin - 1. Unaligned reads from step 3 ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) - 2. Collapsed reads from step 2.2 ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) -6. Post-alignment processing of miRBase hairpin - 1. Basic statistics from step 3 and step 4.1 ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/)) - 2. Analysis on miRBase, or MirGeneDB hairpin counts ([`edgeR`](https://bioconductor.org/packages/release/bioc/html/edgeR.html)) - - TMM normalization and a table of top expression hairpin - - MDS plot clustering samples - - Heatmap of sample similarities - 3. miRNA and isomiR annotation from step 4.1 ([`mirtop`](https://github.com/miRTop/mirtop)) -7. Alignment against host reference genome ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) - 1. Post-alignment processing of alignment against host reference genome ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/)) -8. Novel miRNAs and known miRNAs discovery ([`MiRDeep2`](https://www.mdc-berlin.de/content/mirdeep2-documentation)) - 1. Mapping against reference genome with the mapper module - 2. Known and novel miRNA discovery with the mirdeep2 module -9. miRNA quality control ([`mirtrace`](https://github.com/friedlanderlab/mirtrace)) -10. Present QC for raw read, alignment, and expression results ([`MultiQC`](http://multiqc.info/)) +1. Quality check and triming + 1. Raw read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) + 2. Adapter trimming ([`fastp`](https://github.com/OpenGene/fastp)) + 3. Trim read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) +2. miRNA QC ([`miRTrace`](https://github.com/friedlanderlab/mirtrace)) +3. Contamination filtering ([`Bowtie2`](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml)) (Optional) + 1. rRNA filtration + 2. tRNA filtration + 3. cDNA filtration + 4. ncRNA filtration + 5. piRNA filtration + 6. Others filtration +4. miRNA quantification + - EdgeR + 1. Reads alignment against miRBase mature miRNA ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) + 2. Post-alignment processing of alignment against Mature miRNA ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/)) + 3. Unmapped reads (from reads vs mature miRNA) alignment against miRBase hairpin + 4. Post-alignment processing of alignment against Hairpin ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/)) + 5. Analysis on miRBase, or MirGeneDB hairpin counts ([`edgeR`](https://bioconductor.org/packages/release/bioc/html/edgeR.html)) + - TMM normalization and a table of top expression hairpin + - MDS plot clustering samples + - Heatmap of sample similarities + - Mirtop quantification + 1. Read collapsing ([`seqcluster`](https://github.com/lpantano/seqcluster)) + 2. miRNA and isomiR annotation ([`mirtop`](https://github.com/miRTop/mirtop)) +5. Genome Quantification (Optional) + 1. Reads alignment against host reference genome ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) + 2. Post-alignment processing of alignment against host reference genome ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/)) +6. Novel miRNAs and known miRNAs discovery ([`MiRDeep2`](https://www.mdc-berlin.de/content/mirdeep2-documentation)) (Optional) + 1. Mapping against reference genome with the mapper module + 2. Known and novel miRNA discovery with the mirdeep2 module +7. Present QC for raw read, alignment, and expression results ([`MultiQC`](http://multiqc.info/)) ## Usage diff --git a/conf/modules.config b/conf/modules.config index b06d0055..3a94442f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -80,21 +80,13 @@ process { process { withName: 'MIRTRACE_RUN' { publishDir = [ - path: { "${params.outdir}/mirtrace/${meta.id}" }, + path: { "${params.outdir}/mirtrace" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } } -if (!(params.skip_fastqc)) { - process { - withName: '.*:FASTQC_FASTP:FASTQC_.*' { - ext.args = '--quiet' - } - } -} - if (!params.skip_fastp) { process { withName: 'FASTP' { @@ -130,6 +122,14 @@ if (!params.skip_fastp) { if (!params.skip_fastqc) { process { + withName: '.*:.*:FASTQC_FASTP:FASTQC_RAW' { + ext.args = '--quiet' + publishDir = [ + path: { "${params.outdir}/fastqc/raw" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } withName: '.*:.*:FASTQC_FASTP:FASTQC_TRIM' { ext.args = '--quiet' publishDir = [ @@ -238,7 +238,14 @@ if (!params.skip_mirdeep) { process { withName: 'MIRDEEP2_MAPPER' { publishDir = [ - path: { "${params.outdir}/mirdeep" }, + path: { "${params.outdir}/mirdeep2/mapper" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'MIRDEEP2_RUN' { + publishDir = [ + path: { "${params.outdir}/mirdeep2/run" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] diff --git a/conf/test.config b/conf/test.config index 1a81afee..450ef11d 100644 --- a/conf/test.config +++ b/conf/test.config @@ -23,8 +23,8 @@ params { input = 'https://github.com/nf-core/test-datasets/raw/smrnaseq/samplesheet/v2.0/samplesheet.csv' fasta = 'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/genome.fa' - mature = 'https://mirbase.org/download/CURRENT/mature.fa' - hairpin = 'https://mirbase.org/download/CURRENT/hairpin.fa' + mature = 'https://mirbase.org/download/mature.fa' + hairpin = 'https://mirbase.org/download/hairpin.fa' mirna_gtf = 'https://mirbase.org/download/hsa.gff3' mirtrace_species = 'hsa' protocol = 'illumina' diff --git a/docs/usage.md b/docs/usage.md index 2884df74..f659727b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -27,10 +27,10 @@ It should point to the 3-letter species name used by [miRBase](https://www.mirba Different parameters can be set for the two supported databases. By default `miRBase` will be used with the parameters below. - `mirna_gtf`: If not supplied by the user, then `mirna_gtf` will point to the latest GFF3 file in miRbase: `https://mirbase.org/download/CURRENT/genomes/${params.mirtrace_species}.gff3` -- `mature`: points to the FASTA file of mature miRNA sequences. `https://mirbase.org/download/CURRENT/mature.fa` -- `hairpin`: points to the FASTA file of precursor miRNA sequences. `https://mirbase.org/download/CURRENT/hairpin.fa` +- `mature`: points to the FASTA file of mature miRNA sequences. `https://mirbase.org/download/mature.fa` +- `hairpin`: points to the FASTA file of precursor miRNA sequences. `https://mirbase.org/download/hairpin.fa` -If MirGeneDB should be used instead it needs to be specified using `--mirgenedb` and use the parameters below . +If MirGeneDB should be used instead it needs to be specified using `--mirgenedb` and use the parameters below. - `mirgenedb_gff`: The data can not be downloaded automatically (URLs are created with short term tokens in it), thus the user needs to supply the gff file for either his species, or all species downloaded from `https://mirgenedb.org/download`. The total set will automatically be subsetted to the species specified with `--mirgenedb_species`. - `mirgenedb_mature`: points to the FASTA file of mature miRNA sequences. Download from `https://mirgenedb.org/download`. diff --git a/modules/local/blat_mirna.nf b/modules/local/blat_mirna.nf index 7f8a2324..aa0d3d51 100644 --- a/modules/local/blat_mirna.nf +++ b/modules/local/blat_mirna.nf @@ -53,7 +53,7 @@ process BLAT_MIRNA { blat -out=blast8 $mirna $contaminants /dev/stdout | awk 'BEGIN{FS="\t"}{if(\$11 < 1e-5)print \$1;}' | uniq > mirnahit.txt awk 'BEGIN { while((getline<"mirnahit.txt")>0) l[">"\$1]=1 } /^>/ {x = l[\$1]} {if(!x) print }' $contaminants > filtered.fa -cat <<-END_VERSIONS > versions.yml + cat <<-END_VERSIONS > versions.yml "${task.process}": blat: \$(echo \$(blat) | grep Standalone | awk '{ if (match(\$0,/[0-9]*[0-9]/,m)) print m[0] }') END_VERSIONS diff --git a/modules/local/bowtie_map_contaminants.nf b/modules/local/bowtie_map_contaminants.nf index d10f13b5..d744b1fd 100644 --- a/modules/local/bowtie_map_contaminants.nf +++ b/modules/local/bowtie_map_contaminants.nf @@ -21,17 +21,20 @@ process BOWTIE_MAP_CONTAMINANTS { task.ext.when == null || task.ext.when script: + def args = task.ext.args ?: "" + """ - INDEX=`find -L ./ -name "*.3.ebwt" | sed 's/.3.ebwt//'` + INDEX=`find -L ./ -name "*.rev.1.bt2" | sed "s/\\.rev.1.bt2\$//"` bowtie2 \\ + -x \$INDEX \\ + -U ${reads} \\ --threads ${task.cpus} \\ + --un ${meta.id}.${contaminant_type}.filter.unmapped.contaminant.fastq \\ --very-sensitive-local \\ -k 1 \\ - -x \$INDEX \\ - --un ${meta.id}.${contaminant_type}.filter.unmapped.contaminant.fastq \\ - ${reads} \\ + -S ${meta.id}.filter.contaminant.sam \\ ${args} \\ - -S ${meta.id}.filter.contaminant.sam > ${meta.id}.contaminant_bowtie.log 2>&1 + > ${meta.id}.contaminant_bowtie.log 2>&1 # extracting number of reads from bowtie logs awk -v type=${contaminant_type} 'BEGIN{tot=0} {if(NR==4 || NR == 5){tot += \$1}} END {print "\\""type"\\": "tot }' ${meta.id}.contaminant_bowtie.log | tr -d , > filtered.${meta.id}_${contaminant_type}.stats diff --git a/modules/local/mirdeep2_prepare.nf b/modules/local/mirdeep2_prepare.nf index 7e2f2437..124b5b63 100644 --- a/modules/local/mirdeep2_prepare.nf +++ b/modules/local/mirdeep2_prepare.nf @@ -23,7 +23,7 @@ process MIRDEEP2_PIGZ { pigz -f -d -p $task.cpus $reads cat <<-END_VERSIONS > versions.yml - ${task.process}": + "${task.process}": pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) END_VERSIONS """ diff --git a/modules/local/mirdeep2_run.nf b/modules/local/mirdeep2_run.nf index 26635a4f..a7c47ac6 100644 --- a/modules/local/mirdeep2_run.nf +++ b/modules/local/mirdeep2_run.nf @@ -35,7 +35,7 @@ process MIRDEEP2_RUN { -z _${reads.simpleName} cat <<-END_VERSIONS > versions.yml - ${task.process}": + "${task.process}": mirdeep2: \$(echo "$VERSION") END_VERSIONS """ diff --git a/modules/local/mirtop_quant.nf b/modules/local/mirtop_quant.nf index e97d6a09..ab38c93d 100644 --- a/modules/local/mirtop_quant.nf +++ b/modules/local/mirtop_quant.nf @@ -34,7 +34,7 @@ process MIRTOP_QUANT { mv mirtop/stats/mirtop_stats.log mirtop/stats/full_mirtop_stats.log cat <<-END_VERSIONS > versions.yml - ${task.process}": + "${task.process}": mirtop: \$(echo \$(mirtop --version 2>&1) | sed 's/^.*mirtop //') END_VERSIONS """ diff --git a/modules/local/mirtrace.nf b/modules/local/mirtrace.nf index f576ebc0..95989293 100644 --- a/modules/local/mirtrace.nf +++ b/modules/local/mirtrace.nf @@ -43,7 +43,7 @@ process MIRTRACE_RUN { --force cat <<-END_VERSIONS > versions.yml - ${task.process}": + "${task.process}": mirtrace: \$(echo \$(mirtrace -v 2>&1)) END_VERSIONS """ diff --git a/modules/local/parse_fasta_mirna.nf b/modules/local/parse_fasta_mirna.nf index a0bbc75e..ad63401e 100644 --- a/modules/local/parse_fasta_mirna.nf +++ b/modules/local/parse_fasta_mirna.nf @@ -34,7 +34,7 @@ process PARSE_FASTA_MIRNA { seqkit seq --rna2dna \${FASTA}_sps.fa > \${FASTA}_igenome.fa cat <<-END_VERSIONS > versions.yml - ${task.process}": + "${task.process}": seqkit: \$(echo \$(seqkit 2>&1) | sed 's/^.*Version: //; s/ .*\$//') END_VERSIONS """ diff --git a/modules/local/seqcluster_collapse.nf b/modules/local/seqcluster_collapse.nf index 39f6ce85..82470e5a 100644 --- a/modules/local/seqcluster_collapse.nf +++ b/modules/local/seqcluster_collapse.nf @@ -25,7 +25,7 @@ process SEQCLUSTER_SEQUENCES { mv collapsed/*.fastq.gz final/. cat <<-END_VERSIONS > versions.yml - ${task.process}": + "${task.process}": seqcluster: \$(echo \$(seqcluster --version 2>&1) | sed 's/^.*seqcluster //') END_VERSIONS """ diff --git a/nextflow.config b/nextflow.config index 757e2f91..cbf59426 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,8 +21,8 @@ params { igenomes_base = 's3://ngi-igenomes/igenomes' igenomes_ignore = false mirna_gtf = null - mature = "https://mirbase.org/download/CURRENT/mature.fa" - hairpin = "https://mirbase.org/download/CURRENT/hairpin.fa" + mature = "https://mirbase.org/download/mature.fa" + hairpin = "https://mirbase.org/download/hairpin.fa" mirgenedb = false mirgenedb_mature = null mirgenedb_hairpin = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 40360e49..05461595 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -104,7 +104,7 @@ "description": "Path to FASTA file with mature miRNAs.", "fa_icon": "fas fa-wheelchair", "help_text": "Typically this will be the `mature.fa` file from miRBase. Can be given either as a plain text `.fa` file or a compressed `.gz` file.\n\nDefaults to the current miRBase release URL, from which the file will be downloaded.", - "default": "https://mirbase.org/download/CURRENT/mature.fa" + "default": "https://mirbase.org/download/mature.fa" }, "mirgenedb_mature": { "type": "string", @@ -116,7 +116,7 @@ "description": "Path to FASTA file with miRNAs precursors.", "fa_icon": "fab fa-cuttlefish", "help_text": "Typically this will be the `mature.fa` file from miRBase. Can be given either as a plain text `.fa` file or a compressed `.gz` file.\n\nDefaults to the current miRBase release URL, from which the file will be downloaded.", - "default": "https://mirbase.org/download/CURRENT/hairpin.fa" + "default": "https://mirbase.org/download/hairpin.fa" }, "mirgenedb_hairpin": { "type": "string", diff --git a/subworkflows/local/mirna_quant.nf b/subworkflows/local/mirna_quant.nf index dfa16ab4..e26839bc 100644 --- a/subworkflows/local/mirna_quant.nf +++ b/subworkflows/local/mirna_quant.nf @@ -33,24 +33,17 @@ workflow MIRNA_QUANT { main: ch_versions = Channel.empty() + + PARSE_MATURE ( mature ).parsed_fasta.set { mirna_parsed } ch_versions = ch_versions.mix(PARSE_MATURE.out.versions) FORMAT_MATURE ( mirna_parsed ) ch_versions = ch_versions.mix(FORMAT_MATURE.out.versions) - PARSE_HAIRPIN ( hairpin ).parsed_fasta.set { hairpin_parsed } - ch_versions = ch_versions.mix(PARSE_HAIRPIN.out.versions) - - FORMAT_HAIRPIN ( hairpin_parsed ) - ch_versions = ch_versions.mix(FORMAT_HAIRPIN.out.versions) - INDEX_MATURE ( FORMAT_MATURE.out.formatted_fasta ).index.set { mature_bowtie } ch_versions = ch_versions.mix(INDEX_MATURE.out.versions) - INDEX_HAIRPIN ( FORMAT_HAIRPIN.out.formatted_fasta ).index.set { hairpin_bowtie } - ch_versions = ch_versions.mix(INDEX_HAIRPIN.out.versions) - reads .map { add_suffix(it, "mature") } .dump (tag:'msux') @@ -64,15 +57,28 @@ workflow MIRNA_QUANT { .dump (tag:'hsux') .set { reads_hairpin } - BOWTIE_MAP_HAIRPIN ( reads_hairpin, hairpin_bowtie.collect() ) - ch_versions = ch_versions.mix(BOWTIE_MAP_HAIRPIN.out.versions) - BAM_STATS_MATURE ( BOWTIE_MAP_MATURE.out.bam, FORMAT_MATURE.out.formatted_fasta ) ch_versions = ch_versions.mix(BAM_STATS_MATURE.out.versions) + + + PARSE_HAIRPIN ( hairpin ).parsed_fasta.set { hairpin_parsed } + ch_versions = ch_versions.mix(PARSE_HAIRPIN.out.versions) + + FORMAT_HAIRPIN ( hairpin_parsed ) + ch_versions = ch_versions.mix(FORMAT_HAIRPIN.out.versions) + + INDEX_HAIRPIN ( FORMAT_HAIRPIN.out.formatted_fasta ).index.set { hairpin_bowtie } + ch_versions = ch_versions.mix(INDEX_HAIRPIN.out.versions) + + BOWTIE_MAP_HAIRPIN ( reads_hairpin, hairpin_bowtie.collect() ) + ch_versions = ch_versions.mix(BOWTIE_MAP_HAIRPIN.out.versions) + BAM_STATS_HAIRPIN ( BOWTIE_MAP_HAIRPIN.out.bam, FORMAT_HAIRPIN.out.formatted_fasta ) ch_versions = ch_versions.mix(BAM_STATS_HAIRPIN.out.versions) + + BAM_STATS_MATURE.out.idxstats.collect{it[1]} .mix(BAM_STATS_HAIRPIN.out.idxstats.collect{it[1]}) .dump(tag:'edger') @@ -81,6 +87,8 @@ workflow MIRNA_QUANT { .set { edger_input } EDGER_QC ( edger_input ) + + reads .map { add_suffix(it, "seqcluster") } .dump (tag:'ssux') @@ -92,6 +100,9 @@ workflow MIRNA_QUANT { BOWTIE_MAP_SEQCLUSTER ( reads_collapsed, hairpin_bowtie.collect() ) ch_versions = ch_versions.mix(BOWTIE_MAP_SEQCLUSTER.out.versions) + + + ch_mirtop_logs = Channel.empty() if (params.mirtrace_species){ MIRTOP_QUANT ( BOWTIE_MAP_SEQCLUSTER.out.bam.collect{it[1]}, FORMAT_HAIRPIN.out.formatted_fasta.collect{it[1]}, gtf ) @@ -106,6 +117,8 @@ workflow MIRNA_QUANT { .dump (tag:'gsux') .set { reads_genome } + + emit: fasta_mature = FORMAT_MATURE.out.formatted_fasta fasta_hairpin = FORMAT_HAIRPIN.out.formatted_fasta diff --git a/workflows/smrnaseq.nf b/workflows/smrnaseq.nf index b6c2d3a2..0dbb80be 100644 --- a/workflows/smrnaseq.nf +++ b/workflows/smrnaseq.nf @@ -195,13 +195,16 @@ workflow SMRNASEQ { genome_stats = GENOME_QUANT.out.stats ch_versions = ch_versions.mix(GENOME_QUANT.out.versions) + hairpin_clean = MIRNA_QUANT.out.fasta_hairpin.map { it -> it[1] } + mature_clean = MIRNA_QUANT.out.fasta_mature.map { it -> it[1] } + if (!params.skip_mirdeep) { MIRDEEP2 ( FASTQC_FASTP.out.reads, GENOME_QUANT.out.fasta, GENOME_QUANT.out.index.collect(), - MIRNA_QUANT.out.fasta_hairpin, - MIRNA_QUANT.out.fasta_mature + hairpin_clean, + mature_clean ) ch_versions = ch_versions.mix(MIRDEEP2.out.versions) } @@ -227,7 +230,8 @@ workflow SMRNASEQ { ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(FASTQC_FASTP.out.fastqc_raw_zip.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC_FASTP.out.trim_json.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC_FASTP.out.fastqc_trim_zip.collect{it[1]}.ifEmpty([])) + // ch_multiqc_files = ch_multiqc_files.mix(FASTQC_FASTP.out.trim_json.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(contamination_stats.collect().ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(genome_stats.collect({it[1]}).ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(MIRNA_QUANT.out.mature_stats.collect({it[1]}).ifEmpty([]))