From 8bf71a8dd8b738629f6b010bd9dd3a78fe689581 Mon Sep 17 00:00:00 2001 From: Daniel-VM Date: Tue, 17 Sep 2024 15:10:44 +0200 Subject: [PATCH 1/6] fixed input channel to minimap2_align --- workflows/bacass.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 2846600..8c1221e 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -302,8 +302,8 @@ workflow BACASS { .set { ch_for_polish } MINIMAP2_POLISH ( - ch_for_polish.map { meta, sr, lr, fasta -> tuple(meta, lr) }, - ch_for_polish.map { meta, sr, lr, fasta -> fasta }, + ch_for_polish.map { meta, sr, lr, fasta -> tuple(meta, lr) }, + ch_for_polish.map { meta, sr, lr, fasta -> tuple(meta, fasta) }, true, false, false From 663a7706f5a72942bf9fc18436c4f15f3bbbb3c6 Mon Sep 17 00:00:00 2001 From: Daniel-VM Date: Wed, 18 Sep 2024 16:38:11 +0200 Subject: [PATCH 2/6] refactor longread polishing and update medaka --- modules/local/medaka/main.nf | 6 ++- workflows/bacass.nf | 97 ++++++++++++++++++++---------------- 2 files changed, 59 insertions(+), 44 deletions(-) diff --git a/modules/local/medaka/main.nf b/modules/local/medaka/main.nf index 7cc540b..656f88a 100644 --- a/modules/local/medaka/main.nf +++ b/modules/local/medaka/main.nf @@ -8,7 +8,7 @@ process MEDAKA { 'biocontainers/medaka:1.4.3--py38h130def0_0' }" input: - tuple val(meta), file(longreads), file(assembly) + tuple val(meta), path(longreads), path(assembly) output: tuple val(meta), path('*_polished_genome.fa') , emit: assembly @@ -33,9 +33,11 @@ process MEDAKA { medaka_consensus $args \ -i ${ reads_bgzip_out ?: longreads } \ -d ${ assembly_bgzip_out ?: assembly } \ - -o "${prefix}_polished_genome.fa" \ + -o "${prefix}_out" \ -t $task.cpus + mv ${prefix}_out/* . 
+ mv consensus.fasta ${prefix}_polished_genome.fa cat <<-END_VERSIONS > versions.yml "${task.process}": medaka: \$( medaka --version 2>&1 | sed 's/medaka //g' ) diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 8c1221e..e339689 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -294,51 +294,64 @@ workflow BACASS { } // - // MODULE: Nanopolish, polishes assembly using FAST5 files - should take either miniasm, canu, or unicycler consensus sequence + // SUBWORKFLOW: Long reads polishing. Uses medaka or Nanopolish (this last requires Fast5 files available in input samplesheet). // - if ( !params.skip_polish && params.assembly_type == 'long' && params.polish_method != 'medaka' ) { + if ( (params.assembly_type != 'short' && !params.skip_polish) || ( params.assembly_type != 'short' && params.polish_method) ){ + // Set channel for polishing long reads ch_for_assembly .join( ch_assembly ) - .set { ch_for_polish } - - MINIMAP2_POLISH ( - ch_for_polish.map { meta, sr, lr, fasta -> tuple(meta, lr) }, - ch_for_polish.map { meta, sr, lr, fasta -> tuple(meta, fasta) }, - true, - false, - false - ) - ch_versions = ch_versions.mix(MINIMAP2_POLISH.out.versions) - - SAMTOOLS_INDEX ( - MINIMAP2_POLISH.out.bam.dump(tag: 'samtools_sort') - ) - ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions) - - ch_for_polish // tuple val(meta), val(reads), file(longreads), file(assembly) - .join( MINIMAP2_POLISH.out.bam ) // tuple val(meta), file(bam) - .join( SAMTOOLS_INDEX.out.bai ) // tuple val(meta), file(bai) - .join( ch_fast5 ) // tuple val(meta), file(fast5) - .set { ch_for_nanopolish } // tuple val(meta), val(reads), file(longreads), file(assembly), file(bam), file(bai), file(fast5) - - // TODO: 'nanopolish index' couldn't be tested. No fast5 provided in test datasets. 
- NANOPOLISH ( - ch_for_nanopolish.dump(tag: 'into_nanopolish') - ) - ch_versions = ch_versions.mix(NANOPOLISH.out.versions) - } - - // - // MODULE: Medaka, polishes assembly - should take either miniasm, canu, or unicycler consensus sequence - // - if ( !params.skip_polish && params.assembly_type == 'long' && params.polish_method == 'medaka' ) { - ch_for_assembly - .join( ch_assembly ) - .map { meta, sr, lr, assembly -> tuple(meta, lr, assembly) } - .set { ch_for_medaka } - - MEDAKA ( ch_for_medaka.dump(tag: 'into_medaka') ) - ch_versions = ch_versions.mix(MEDAKA.out.versions) + .set { ch_polish_long } // channel: [ val(meta), path(sr), path(lr), path(fasta) ] + if (params.polish_method == 'medaka'){ + // + // MODULE: Medaka, polishes assembly - should take either miniasm, canu, or unicycler consensus sequence + // + ch_polish_long + .map { meta, sr, lr, fasta -> tuple(meta, lr, fasta) } + .set { ch_for_medaka } + MEDAKA ( ch_for_medaka.dump(tag: 'into_medaka') ) + ch_assembly = ch_assembly.mix( MEDAKA.out.assembly ) + ch_versions = ch_versions.mix(MEDAKA.out.versions) + } else if (params.polish_method == 'nanopolish') { + // + // MODULE: Nanopolish, polishes assembly using FAST5 files + // + if (!ch_fast5){ + log.error "ERROR: FAST5 files are required for Nanopolish but none were provided. Please supply FAST5 files or choose another polishing method. 
Available options are: medaka, nanopolish" + } else { + // + // MODULE: Minimap2 polish + // + MINIMAP2_POLISH ( + ch_polish_long.map { meta, sr, lr, fasta -> tuple(meta, lr) }, + ch_polish_long.map { meta, sr, lr, fasta -> tuple(meta, fasta) }, + true, + false, + false + ) + ch_versions = ch_versions.mix(MINIMAP2_POLISH.out.versions) + // + // MODULE: Samtools index + // + SAMTOOLS_INDEX ( + MINIMAP2_POLISH.out.bam.dump(tag: 'samtools_sort') + ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions) + // + // MODULE: Nanopolish + // + ch_polish_long // tuple val(meta), val(reads), file(longreads), file(assembly) + .join( MINIMAP2_POLISH.out.bam ) // tuple val(meta), file(bam) + .join( SAMTOOLS_INDEX.out.bai ) // tuple val(meta), file(bai) + .join( ch_fast5 ) // tuple val(meta), file(fast5) + .set { ch_for_nanopolish } // tuple val(meta), val(reads), file(longreads), file(assembly), file(bam), file(bai), file(fast5) + // TODO: 'nanopolish index' couldn't be tested. No fast5 provided in test datasets. 
+ NANOPOLISH ( + ch_for_nanopolish.dump(tag: 'into_nanopolish') + ) + ch_assembly = ch_assembly.mix( NANOPOLISH.out.assembly ) + ch_versions = ch_versions.mix( NANOPOLISH.out.versions ) + } + } } // From 92bcfdecee9a2de7be0675d316e21a082d600cbf Mon Sep 17 00:00:00 2001 From: Daniel-VM Date: Wed, 18 Sep 2024 16:38:47 +0200 Subject: [PATCH 3/6] add condition to run miniasm on long/hybrid mode only --- workflows/bacass.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/bacass.nf b/workflows/bacass.nf index e339689..4b2b0ed 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -240,7 +240,7 @@ workflow BACASS { // // MODULE: Miniasm, genome assembly, long reads // - if ( params.assembler == 'miniasm' ) { + if ( params.assembly_type != 'short' && params.assembler == 'miniasm' ) { MINIMAP2_ALIGN ( ch_for_assembly.map{ meta,sr,lr -> tuple(meta,lr) }, [[:],[]], @@ -280,6 +280,8 @@ workflow BACASS { ) ch_assembly = ch_assembly.mix( RACON.out.improved_assembly.dump(tag: 'miniasm') ) ch_versions = ch_versions.mix( RACON.out.versions ) + } else if (params.assembly_type == 'short' && params.assembler == 'miniasm') { + exit("Selected assembler ${params.assembler} cannot run on short reads") } // From fc6c0b91bbfbe067bd6317e1b13c2272052e446b Mon Sep 17 00:00:00 2001 From: Daniel-VM Date: Fri, 20 Sep 2024 16:20:35 +0200 Subject: [PATCH 4/6] Refined polishing step by 1-removing short reads; 2- selecting the polished genome for downstream analysis --- workflows/bacass.nf | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 4b2b0ed..9f70e78 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -302,16 +302,14 @@ workflow BACASS { // Set channel for polishing long reads ch_for_assembly .join( ch_assembly ) - .set { ch_polish_long } // channel: [ val(meta), path(sr), path(lr), path(fasta) ] + .map { meta, sr, lr, fasta -> tuple(meta, lr, fasta) } + .set { 
ch_polish_long } // channel: [ val(meta), path(lr), path(fasta) ] if (params.polish_method == 'medaka'){ // // MODULE: Medaka, polishes assembly - should take either miniasm, canu, or unicycler consensus sequence // - ch_polish_long - .map { meta, sr, lr, fasta -> tuple(meta, lr, fasta) } - .set { ch_for_medaka } - MEDAKA ( ch_for_medaka.dump(tag: 'into_medaka') ) - ch_assembly = ch_assembly.mix( MEDAKA.out.assembly ) + MEDAKA ( ch_polish_long ) + ch_assembly = MEDAKA.out.assembly ch_versions = ch_versions.mix(MEDAKA.out.versions) } else if (params.polish_method == 'nanopolish') { // @@ -324,8 +322,8 @@ workflow BACASS { // MODULE: Minimap2 polish // MINIMAP2_POLISH ( - ch_polish_long.map { meta, sr, lr, fasta -> tuple(meta, lr) }, - ch_polish_long.map { meta, sr, lr, fasta -> tuple(meta, fasta) }, + ch_polish_long.map { meta, lr, fasta -> tuple(meta, lr) }, + ch_polish_long.map { meta, lr, fasta -> tuple(meta, fasta) }, true, false, false @@ -350,7 +348,7 @@ workflow BACASS { NANOPOLISH ( ch_for_nanopolish.dump(tag: 'into_nanopolish') ) - ch_assembly = ch_assembly.mix( NANOPOLISH.out.assembly ) + ch_assembly = NANOPOLISH.out.assembly ch_versions = ch_versions.mix( NANOPOLISH.out.versions ) } } From c5c32d5593a8878ab75cc9e911177b3c56fb337f Mon Sep 17 00:00:00 2001 From: Daniel-VM Date: Fri, 20 Sep 2024 16:21:33 +0200 Subject: [PATCH 5/6] restricted the polishing step to long reads mode --- workflows/bacass.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 9f70e78..17c3573 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -298,7 +298,7 @@ workflow BACASS { // // SUBWORKFLOW: Long reads polishing. Uses medaka or Nanopolish (this last requires Fast5 files available in input samplesheet). 
// - if ( (params.assembly_type != 'short' && !params.skip_polish) || ( params.assembly_type != 'short' && params.polish_method) ){ + if ( params.assembly_type == 'long' && !params.skip_polish ){ // Set channel for polishing long reads ch_for_assembly .join( ch_assembly ) From 0917bda1c8cd13ec6a16905a57ca55d0b57e2083 Mon Sep 17 00:00:00 2001 From: Daniel-VM Date: Mon, 23 Sep 2024 09:15:40 +0200 Subject: [PATCH 6/6] update changelog in #169 --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ab4eeef..923df4d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Changed` +- [#169](https://github.com/nf-core/bacass/pull/169) Refactored long-reads polishing step. - [#167](https://github.com/nf-core/bacass/pull/167) Remove params.save_merged as merged reads are not used in downstream analysis. - [#159](https://github.com/nf-core/bacass/pull/159) Updated Kmerfinder module and increased memory. - [#150](https://github.com/nf-core/bacass/pull/150) Replace local unicycler module with nf-core module + bump version. @@ -17,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Fixed` +- [#169](https://github.com/nf-core/bacass/pull/169) Fixed long reads polishing input channel. - [#168](https://github.com/nf-core/bacass/pull/168) Fix wrong metadata in canu input channel. - [#163](https://github.com/nf-core/bacass/pull/163) Fixed `params.save_merged` to properly save merged files. - [#160](https://github.com/nf-core/bacass/pull/160) Fixed memory issues in KmerFinder, fixed handling of no species detected, and fixed handling of empty fasta files in the prokka/bakkta channel.