diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 9eb73e01..73441108 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -18,7 +18,7 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/baca - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/bacass/tree/master/.github/CONTRIBUTING.md) - [ ] If necessary, also make a PR on the nf-core/bacass _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core lint`). -- [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir <OUTDIR>`). +- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir <OUTDIR>`). - [ ] Usage Documentation in `docs/usage.md` is updated. - [ ] Output Documentation in `docs/output.md` is updated. - [ ] `CHANGELOG.md` is updated. diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index fc13318e..74def70e 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -8,12 +8,12 @@ on: types: [published] workflow_dispatch: jobs: - run-tower: + run-platform: name: Run AWS full tests if: github.repository == 'nf-core/bacass' runs-on: ubuntu-latest steps: - - name: Launch workflow via tower + - name: Launch workflow via Seqera Platform uses: seqeralabs/action-tower-launch@v2 # Add full size test data (but still relatively small datasets for few samples) # on the `test_full.config` test runs with only one set of parameters @@ -32,7 +32,7 @@ jobs: - uses: actions/upload-artifact@v4 with: - name: Tower debug log file + name: Seqera Platform debug log file path: | - tower_action_*.log - tower_action_*.json + seqera_platform_action_*.log + seqera_platform_action_*.json diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index 2f94a1e0..89b090f9 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -5,13 +5,13 @@ name: nf-core AWS test on: workflow_dispatch: jobs: - run-tower: + run-platform: name: Run AWS tests if: github.repository == 'nf-core/bacass' runs-on: ubuntu-latest steps: - # Launch workflow using Tower CLI tool action - - name: Launch workflow via tower + # Launch workflow using Seqera Platform CLI tool action + - name: Launch workflow via Seqera Platform uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} @@ -27,7 +27,7 @@ jobs: - uses: actions/upload-artifact@v4 with: - name: Tower debug log file + name: Seqera Platform debug log file path: | - tower_action_*.log - tower_action_*.json + seqera_platform_action_*.log + seqera_platform_action_*.json diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c26c0d2b..c85cdae5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,10 +28,10 @@ jobs: - "latest-everything" steps: - name: Check out pipeline code - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - name: Install Nextflow - uses: nf-core/setup-nextflow@v1 + uses: nf-core/setup-nextflow@v2 with: version: "${{ matrix.NXF_VER }}" diff --git a/.github/workflows/download_pipeline.yml b/.github/workflows/download_pipeline.yml index 2e869b99..2d20d644 100644 --- a/.github/workflows/download_pipeline.yml +++ b/.github/workflows/download_pipeline.yml 
@@ -32,9 +32,12 @@ jobs: - name: Install Nextflow uses: nf-core/setup-nextflow@v2 - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - name: Disk space cleanup + uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 + + - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: "3.11" + python-version: "3.12" architecture: "x64" - uses: eWaterCycle/setup-singularity@931d4e31109e875b13309ae1d07c70ca8fbc8537 # v7 with: diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml index 4d48858f..0802d635 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix-linting.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest steps: # Use the @nf-core-bot token to check out so we can push later - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 with: token: ${{ secrets.nf_core_bot_auth_token }} @@ -32,9 +32,9 @@ jobs: GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} # Install and run pre-commit - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: 3.11 + python-version: "3.12" - name: Install pre-commit run: pip install pre-commit diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 073e1876..1fcafe88 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -14,13 +14,12 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - - name: Set up Python 3.11 - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - name: Set up Python 3.12 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: 3.11 - cache: "pip" + python-version: "3.12" - name: Install pre-commit run: pip install pre-commit @@ -32,14 +31,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - name: Install Nextflow - uses: nf-core/setup-nextflow@v1 + uses: nf-core/setup-nextflow@v2 - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: "3.11" + python-version: "3.12" architecture: "x64" - name: Install dependencies @@ -60,7 +59,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4 + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index b706875f..40acc23f 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@f6b0bace624032e30a85a8fd9c1a7f8f611f5737 # v3 + uses: dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3 with: workflow: linting.yml workflow_conclusion: completed diff --git 
a/.github/workflows/release-announcements.yml b/.github/workflows/release-announcements.yml index 5ada136c..03ecfcf7 100644 --- a/.github/workflows/release-announcements.yml +++ b/.github/workflows/release-announcements.yml @@ -31,7 +31,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: python-version: "3.10" - name: Install dependencies diff --git a/.nf-core.yml b/.nf-core.yml index a8ab3441..216c3138 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -8,3 +8,4 @@ lint: nextflow_config: - config_defaults: - params.dfast_config +nf_core_version: "2.14.1" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index af57081f..4dc0f1dc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,6 +3,9 @@ repos: rev: "v3.1.0" hooks: - id: prettier + additional_dependencies: + - prettier@3.2.5 + - repo: https://github.com/editorconfig-checker/editorconfig-checker.python rev: "2.7.3" hooks: diff --git a/CHANGELOG.md b/CHANGELOG.md index d69e8489..206091d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,34 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v2.2.0 nf-core/bacass: "Aqua Platinum Zebrafish" 2024/03/27 +## v2.3.0 nf-core/bacass: "Navy Iron Oyster" 2024/06/12 + +### `Changed` + +- [#135](https://github.com/nf-core/bacass/pull/135) Replaced the nf-core MultiQC module with a custom MultiQC module. + +### `Added` + +- [#135](https://github.com/nf-core/bacass/pull/135) Implementation of the KmerFinder subworkflow, custom Quast, and custom MultiQC reports: + + - Added KmerFinder subworkflow for read quality control, purity assessment, and sample grouping based on reference genome estimation. + - Enhanced Quast assembly QC to run both general and reference genome-based analyses when KmerFinder is invoked. + - Implemented custom MultiQC module with multiqc_config.yml files for different assembly modes (short, long, hybrid). + - Generated custom MultiQC HTML report consolidating metrics from KmerFinder, Quast, and other relevant sources. + +- [#133](https://github.com/nf-core/bacass/pull/133) Update nf-core/bacass to the new nf-core 2.14.1 `TEMPLATE`. + +### `Fixed` + +- [#134](https://github.com/nf-core/bacass/pull/134) - Fixed the sample names reported for Prokka/Bakta in the MultiQC report. + +- [#125](https://github.com/nf-core/bacass/pull/125) - Fixed conflicting settings in the `save_trimmed_fail` parameter. 
+ +### `Dependencies` + +### `Deprecated` + +## v2.2.0 nf-core/bacass: "Aqua Platinum Zebrafish" 2024/03/30 ### `Changed` diff --git a/README.md b/README.md index f94c4c2d..ab496764 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) -[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/nf-core/bacass) +[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/bacass) [![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23bacass-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/bacass)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) @@ -29,11 +29,12 @@ On release, automated continuous integration tests run the pipeline on a full-si ### Short Read Assembly -This pipeline is primarily for bacterial assembly of next-generation sequencing reads. It can be used to quality trim your reads using [FastP](https://github.com/OpenGene/fastp) and performs basic sequencing QC using [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Afterwards, the pipeline performs read assembly using [Unicycler](https://github.com/rrwick/Unicycler). Contamination of the assembly is checked using [Kraken2](https://ccb.jhu.edu/software/kraken2/) to verify sample purity. +This pipeline is primarily for bacterial assembly of next-generation sequencing reads. It can be used to quality trim your reads using [FastP](https://github.com/OpenGene/fastp) and performs basic sequencing QC using [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Afterwards, the pipeline performs read assembly using [Unicycler](https://github.com/rrwick/Unicycler). Contamination of the assembly is checked using [Kraken2](https://ccb.jhu.edu/software/kraken2/) and [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/) to verify sample purity. ### Long Read Assembly -For users that only have Nanopore data, the pipeline quality trims these using [PoreChop](https://github.com/rrwick/Porechop) and assesses basic sequencing QC utilizing [NanoPlot](https://github.com/wdecoster/NanoPlot) and [PycoQC](https://github.com/a-slide/pycoQC). +For users that only have Nanopore data, the pipeline quality trims these using [PoreChop](https://github.com/rrwick/Porechop) and assesses basic sequencing QC utilizing [NanoPlot](https://github.com/wdecoster/NanoPlot) and [PycoQC](https://github.com/a-slide/pycoQC). 
Contamination of the assembly is checked using [Kraken2](https://ccb.jhu.edu/software/kraken2/) and [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/) to verify sample purity. + The pipeline can then perform long read assembly utilizing [Unicycler](https://github.com/rrwick/Unicycler), [Miniasm](https://github.com/lh3/miniasm) in combination with [Racon](https://github.com/isovic/racon), [Canu](https://github.com/marbl/canu) or [Flye](https://github.com/fenderglass/Flye) by using the [Dragonflye](https://github.com/rpetit3/dragonflye)(\*) pipeline. Long-read assemblies can be polished using [Medaka](https://github.com/nanoporetech/medaka) or [NanoPolish](https://github.com/jts/nanopolish) with Fast5 files. > [!NOTE] @@ -47,6 +48,11 @@ For users specifying both short read and long read (NanoPore) data, the pipeline In all cases, the assembly is assessed using [QUAST](http://bioinf.spbau.ru/quast). The resulting bacterial assembly is furthermore annotated using [Prokka](https://github.com/tseemann/prokka), [Bakta](https://github.com/oschwengers/bakta) or [DFAST](https://github.com/nigyta/dfast_core). +If Kmerfinder is invoked, the pipeline will group samples according to the [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/)-estimated reference genomes. Afterwards, two QUAST steps will be carried out: an initial ('general') [QUAST](http://bioinf.spbau.ru/quast) of all samples without reference genomes, and subsequently, a 'by reference genome' [QUAST](http://bioinf.spbau.ru/quast) to aggregate samples with their reference genomes. + +> [!NOTE] +> This scenario is only supported when [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/) analysis is performed. + ## Usage > [!NOTE] diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 15ee2a1d..cae0cb10 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,7 +1,7 @@ report_comment: > - This report has been generated by the nf-core/bacass + This report has been generated by the nf-core/bacass analysis pipeline. For information about how to interpret these results, please see the - documentation. + documentation. report_section_order: "nf-core-bacass-methods-description": order: -1000 @@ -13,3 +13,6 @@ report_section_order: export_plots: true disable_version_detection: true +use_filename_as_sample_name: + - prokka + - bakta diff --git a/assets/multiqc_config_hybrid.yml b/assets/multiqc_config_hybrid.yml new file mode 100644 index 00000000..4c036265 --- /dev/null +++ b/assets/multiqc_config_hybrid.yml @@ -0,0 +1,166 @@ +report_comment: > + This report has been generated by the nf-core/bacass + analysis pipeline. For information about how to interpret these results, please see the + documentation. + +data_format: "yaml" + +max_table_rows: 10000 + +run_modules: + - custom_content + - fastqc + - fastp + - nanostat + - porechop + - pycoqc + - kraken2 + - quast + - prokka + - bakta + +exclude_modules: + - general_stats + +module_order: + - fastqc: + name: "PREPROCESS: FastQC (raw reads)" + info: "This section of the report shows FastQC results for the raw reads before adapter trimming." + path_filters: + - "./fastqc/*.zip" + - fastp: + name: "PREPROCESS: fastp (adapter trimming)" + info: "This section of the report shows fastp results for reads after adapter and quality trimming." 
+ path_filters: + - "./fastp/*.json" + - nanostat: + name: "PREPROCESS: Nanoplot" + info: "This section of the report shows Nanoplot results for nanopore sequencing data." + path_filters: + - "./nanoplot/*.txt" + - porechop: + name: "PREPROCESS: Porechop" + info: "This section of the report shows Porechop results for reads after adapter trimming." + path_filters: + - "./porechop/*.log" + - pycoqc: + name: "PREPROCESS: PycoQC" + info: "This section of the report shows PycoQC results for quality control of long-read sequencing data." + path_filters: + - "./pycoqc/*.txt" + - kraken2: + name: "CONTAMINATION ANALYSIS: Kraken 2" + info: "This section of the report shows Kraken 2 classification results for reads after adapter trimming with fastp." + path_filters: + - ".*kraken2_*/*report.txt" + - quast: + name: "ASSEMBLY: Quast" + info: "This section of the report shows Quast QC results for assembled genomes with Unicycler." + path_filters: + - "./quast/*/report.tsv" + - prokka: + name: "ANNOTATION: Prokka" + info: "This section of the report shows Prokka annotation results for reads after adapter trimming and quality trimming." + path_filters: + - "./prokka/*.txt" + - bakta: + name: "ANNOTATION: Bakta" + info: "This section of the report shows Bakta mapping and annotation results for reads after adapter trimming." + path_filters: + - "./bakta/*.txt" + +report_section_order: + fastqc: + after: general_stats + fastp: + after: general_stats + nanostat: + after: general_stats + porechop: + before: nanostat + kraken2: + after: general_stats + quast: + after: general_stats + prokka: + before: nf-core-bacass-methods-description + bakta: + before: nf-core-bacass-methods-description + nf-core-bacass-methods-description: + order: -1000 + software_versions: + order: -1001 + nf-core-bacass-summary: + order: -1002 + +custom_data: + summary_assembly_metrics: + section_name: "De novo assembly metrics (short & long reads)" + description: "generated by nf-core/bacass" + plot_type: "table" + headers: + "Sample": + description: "Input sample names" + format: "{:,.0f}" + "# Input short reads": + description: "Total number of input reads in raw fastq files" + format: "{:,.0f}" + "# Trimmed short reads (fastp)": + description: "Total number of reads remaining after adapter/quality trimming with fastp" + format: "{:,.0f}" + "# Input long reads": + description: "Total number of input reads in raw fastq files" + format: "{:,.0f}" + "# Median long reads length": + description: "Median read length (bp)" + format: "{:,.0f}" + "# Median long reads quality": + description: "Median read quality (Phred scale)" + format: "{:,.0f}" + "# Contigs (hybrid assembly)": + description: "Total number of contigs calculated by QUAST" + format: "{:,.0f}" + "# Largest contig (hybrid assembly)": + description: "Size of largest contig calculated by QUAST" + format: "{:,.0f}" + "# N50 (hybrid assembly)": + description: "N50 metric for de novo assembly as calculated by QUAST" + format: "{:,.0f}" + "# % Genome fraction (hybrid assembly)": + description: "% genome fraction calculated by QUAST" + format: "{:,.2f}" + "# Best hit (Kmerfinder)": + description: "Species name of the best hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Best hit assembly ID (Kmerfinder)": + description: "Assembly ID of the best hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Best hit query coverage (Kmerfinder)": + description: "Query coverage value of the best hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Best hit 
depth (Kmerfinder)": + description: "Depth of the best hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Second hit (Kmerfinder)": + description: "Specie name of the second hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Second hit assembly ID (Kmerfinder)": + description: "Assembly ID of the second hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Second hit query coverage (Kmerfinder)": + description: "Query coverage value of the second hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + "# Second hit depth (Kmerfinder)": + description: "Depth of the second hit from Kmerfinder (using short reads)" + format: "{:,.0f}" + +export_plots: true + +# # Customise the module search patterns to speed up execution time +# # - Skip module sub-tools that we are not interested in +# # - Replace file-content searching with filename pattern searching +# # - Don't add anything that is the same as the MultiQC default +# # See https://multiqc.info/docs/#optimise-file-search-patterns for details +sp: + fastp: + fn: "*.fastp.json" diff --git a/assets/multiqc_config_long.yml b/assets/multiqc_config_long.yml new file mode 100644 index 00000000..51795ec6 --- /dev/null +++ b/assets/multiqc_config_long.yml @@ -0,0 +1,140 @@ +report_comment: > + This report has been generated by the nf-core/bacass + analysis pipeline. For information about how to interpret these results, please see the + documentation. + +data_format: "yaml" + +max_table_rows: 10000 + +run_modules: + - custom_content + - nanostat + - porechop + - pycoqc + - kraken2 + - quast + - prokka + - bakta + +exclude_modules: + - general_stats + +module_order: + - nanostat: + name: "PREPROCESS: Nanoplot" + info: "This section of the report shows Nanoplot results for nanopore sequencing data." + path_filters: + - "./nanoplot/*.txt" + - porechop: + name: "PREPROCESS: Porechop" + info: "This section of the report shows Porechop results for reads after adapter trimming." + path_filters: + - "./porechop/*.log" + - pycoqc: + name: "PREPROCESS: PycoQC" + info: "This section of the report shows PycoQC results for quality control of long-read sequencing data." + path_filters: + - "./pycoqc/*.txt" + - kraken2: + name: "CONTAMINATION ANALYSIS: Kraken 2" + info: "This section of the report shows Kraken 2 classification results for reads after adapter trimming with fastp." + path_filters: + - ".*kraken2_*/*report.txt" + - quast: + name: "ASSEMBLY: Quast" + info: "This section of the report shows Quast QC results for assembled genomes with Unicycler." + path_filters: + - "./quast/*/report.tsv" + - prokka: + name: "ANNOTATION: Prokka" + info: "This section of the report shows Prokka annotation results for reads after adapter trimming and quality trimming." + path_filters: + - "./prokka/*.txt" + - bakta: + name: "ANNOTATION: Bakta" + info: "This section of the report shows Bakta mapping and annotation results for reads after adapter trimming." 
+ path_filters: + - "./bakta/*.txt" + +report_section_order: + nanostat: + after: general_stats + porechop: + before: nanostat + kraken2: + after: general_stats + quast: + after: general_stats + prokka: + before: nf-core-bacass-methods-description + bakta: + before: nf-core-bacass-methods-description + nf-core-bacass-methods-description: + order: -1000 + software_versions: + order: -1001 + nf-core-bacass-summary: + order: -1002 + +custom_data: + summary_assembly_metrics: + section_name: "De novo assembly metrics (long-reads)" + description: "generated by nf-core/bacass" + plot_type: "table" + headers: + "Sample": + description: "Input sample names" + format: "{:,.0f}" + "# Input reads": + description: "Total number of input reads in raw fastq files" + format: "{:,.0f}" + "# Median read length": + description: "Median read length (bp)" + format: "{:,.0f}" + "# Median read quality": + description: "Median read quality (Phred scale)" + format: "{:,.0f}" + "# Contigs": + description: "Total number of contigs calculated by QUAST" + format: "{:,.0f}" + "# Largest contig": + description: "Size of largest contig calculated by QUAST" + format: "{:,.0f}" + "# N50": + description: "N50 metric for de novo assembly as calculated by QUAST" + format: "{:,.0f}" + "# % Genome fraction": + description: "% genome fraction calculated by QUAST" + format: "{:,.2f}" + "# Best hit (Kmerfinder)": + description: "Species name of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit assembly ID (Kmerfinder)": + description: "Assembly ID of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit query coverage (Kmerfinder)": + description: "Query coverage value of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit depth (Kmerfinder)": + description: "Depth of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit (Kmerfinder)": + description: "Species name of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit assembly ID (Kmerfinder)": + description: "Assembly ID of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit query coverage (Kmerfinder)": + description: "Query coverage value of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit depth (Kmerfinder)": + description: "Depth of the second hit from Kmerfinder" + format: "{:,.0f}" + +export_plots: true +# # Customise the module search patterns to speed up execution time +# # - Skip module sub-tools that we are not interested in +# # - Replace file-content searching with filename pattern searching +# # - Don't add anything that is the same as the MultiQC default +# # See https://multiqc.info/docs/#optimise-file-search-patterns for details diff --git a/assets/multiqc_config_short.yml b/assets/multiqc_config_short.yml new file mode 100644 index 00000000..2ce2eca6 --- /dev/null +++ b/assets/multiqc_config_short.yml @@ -0,0 +1,135 @@ +report_comment: > + This report has been generated by the nf-core/bacass + analysis pipeline. For information about how to interpret these results, please see the + documentation. + +data_format: "yaml" + +max_table_rows: 10000 + +run_modules: + - custom_content + - fastqc + - fastp + - kraken2 + - quast + - prokka + - bakta + +exclude_modules: + - general_stats + +module_order: + - fastqc: + name: "PREPROCESS: FastQC (raw reads)" + info: "This section of the report shows FastQC results for the raw reads before adapter trimming." 
+ path_filters: + - "./fastqc/*.zip" + - fastp: + name: "PREPROCESS: fastp (adapter trimming)" + info: "This section of the report shows fastp results for reads after adapter and quality trimming." + path_filters: + - "./fastp/*.json" + - kraken2: + name: "CONTAMINATION ANALYSIS: Kraken 2" + info: "This section of the report shows Kraken 2 classification results for reads after adapter trimming with fastp." + path_filters: + - ".*kraken2_*/*report.txt" + - quast: + name: "ASSEMBLY: Quast" + info: "This section of the report shows Quast QC results for assembled genomes with Unicycler." + path_filters: + - "./quast/*/report.tsv" + - prokka: + name: "ANNOTATION: Prokka" + info: "This section of the report shows Prokka annotation results for reads after adapter trimming and quality trimming." + path_filters: + - "./prokka/*.txt" + - bakta: + name: "ANNOTATION: Bakta" + info: "This section of the report shows Bakta mapping and annotation results for reads after adapter trimming." + path_filters: + - "./bakta/*.txt" + +report_section_order: + fastqc: + after: general_stats + fastp: + after: general_stats + kraken2: + after: general_stats + quast: + after: general_stats + prokka: + before: nf-core-bacass-methods-description + bakta: + before: nf-core-bacass-methods-description + nf-core-bacass-methods-description: + order: -1000 + software_versions: + order: -1001 + nf-core-bacass-summary: + order: -1002 + +custom_data: + summary_assembly_metrics: + section_name: "De novo assembly metrics (short-reads)" + description: "generated by nf-core/bacass" + plot_type: "table" + headers: + "Sample": + description: "Input sample names" + format: "{:,.0f}" + "# Input reads": + description: "Total number of input reads in raw fastq files" + format: "{:,.0f}" + "# Trimmed reads (fastp)": + description: "Total number of reads remaining after adapter/quality trimming with fastp" + format: "{:,.0f}" + "# Contigs": + description: "Total number of contigs calculated by QUAST" + format: "{:,.0f}" + "# Largest contig": + description: "Size of largest contig calculated by QUAST" + format: "{:,.0f}" + "# N50": + description: "N50 metric for de novo assembly as calculated by QUAST" + format: "{:,.0f}" + "# % Genome fraction": + description: "% genome fraction calculated by QUAST" + format: "{:,.2f}" + "# Best hit (Kmerfinder)": + description: "Species name of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit assembly ID (Kmerfinder)": + description: "Assembly ID of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit query coverage (Kmerfinder)": + description: "Query coverage value of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Best hit depth (Kmerfinder)": + description: "Depth of the best hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit (Kmerfinder)": + description: "Species name of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit assembly ID (Kmerfinder)": + description: "Assembly ID of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit query coverage (Kmerfinder)": + description: "Query coverage value of the second hit from Kmerfinder" + format: "{:,.0f}" + "# Second hit depth (Kmerfinder)": + description: "Depth of the second hit from Kmerfinder" + format: "{:,.0f}" + +export_plots: true + +# # Customise the module search patterns to speed up execution time +# # - Skip module sub-tools that we are not interested in +# # - Replace file-content searching with filename pattern searching +# # - Don't add anything that is the same as the 
MultiQC default +# # See https://multiqc.info/docs/#optimise-file-search-patterns for details +sp: + fastp: + fn: "*.fastp.json" diff --git a/bin/csv_to_yaml.py b/bin/csv_to_yaml.py new file mode 100755 index 00000000..48c66cc0 --- /dev/null +++ b/bin/csv_to_yaml.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +""" +Author: Daniel VM +Email: da.valle@ciberisciii.es +Date: 2024/01/20 + +MIT License + +© 2024 Daniel VM + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND. +""" + +import sys +import argparse +import csv +import yaml + + +def parse_args(args=None): + Description = "Create a yaml file from a csv input file, grouping samples as keys and the remaining fields as their value pair." + + Epilog = "Example usage: python csv_to_yaml.py -i myfile.csv -k 'sample_name' -o converted_file" + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument( + "-i", "--input", type=str, dest="CSV_FILE", help="Input file in CSV format." + ) + + parser.add_argument( + "-k", + "--key_field", + type=str, + dest="KEY_FIELD", + help="Name of the key/column grouping field in the input csv.", + ) + + parser.add_argument( + "-op", + "--output_prefix", + type=str, + default="output_file", + dest="OUT_PREFIX", + help="Output file name", + ) + return parser.parse_args(args) + + +def parse_csv(csv_file): + with open(csv_file, "r") as c: + csv_reader = csv.DictReader(c) + data = [row for row in csv_reader] + return data + + +def create_yaml(data, key, output_prefix): + yaml_data = { + entry[key]: {k: v for k, v in entry.items() if k != key} for entry in data + } + with open(output_prefix + ".yaml", "w") as yaml_file: + yaml.dump(yaml_data, yaml_file, default_flow_style=False) + + +def main(args=None): + args = parse_args(args) + file_list = parse_csv(args.CSV_FILE) + + create_yaml(data=file_list, key=args.KEY_FIELD, output_prefix=args.OUT_PREFIX) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/download_reference.py b/bin/download_reference.py new file mode 100755 index 00000000..88e89364 --- /dev/null +++ b/bin/download_reference.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python +""" +============================================================= +HEADER +============================================================= +INSTITUTION: BU-ISCIII +AUTHOR: Guillermo J. Gorines Cordero +EDITED BY: Daniel VM +VERSION: 0.1 +CREATED: Early 2022 +REVISED: 18-2-2022 +EDITED: 14-11-2023, 20-05-2024 +DESCRIPTION: + Given a file with the kmerfinder results and frequencies (probably + created by find_common_reference.py), and the NCBI assembly sheet, + download the top-reference genome, gff and protein files from + the NCBI ftp. 
+ +INPUT: + -FILE: file containing the ranking of references from kmerfinder created by the script find_common_reference.py + -REFERENCE: file with the NCBI reference list + -OUTDIR: name of the output dir + +OUTPUT: + - *_fna.gz: file with the top-reference genome + - *_gff.gz: file with the top-reference gff + - *_protein.gz: file with the top-reference proteins + +USAGE: + python download_reference.py + -file [FILE] + -reference [REFERENCE] + -out_dir [OUTDIR] + +REQUIREMENTS: + -Python >= 3.6 + -Python requests + +DISCLAIMER: + This script has been designed for the assembly pipeline of BU-ISCIII. + Feel free to use it at will, however we don't guarantee its success + outside its purpose. +================================================================ +END_OF_HEADER +================================================================ +""" + +import sys +import argparse +import os + +# import wget +import requests + + +# TODO: Generate report +def parse_args(args=None): + Description = "Download the reference files \ + (fna, faa, gff) from the reference NCBI file." + Epilog = """Usage example: \ + python download_reference.py \ + -file [FILE] \ + -reference [REFERENCE] \ + -out_dir [OUTDIR]""" + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument( + "-file", help="File containing the ranking of references from kmerfinder." + ) + parser.add_argument( + "-reference", + help="File containing the paths to bacterial references. See example in: https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt", + ) + parser.add_argument("-out_dir", help="Output directory.") + + return parser.parse_args(args) + + +def download_references(file, reference, out_dir): + """ + Downloads the top reference from the NCBI database + """ + + reference_ends = ["_genomic.fna.gz", "_protein.faa.gz", "_genomic.gff.gz"] + + # extract the most common reference from file + with open(file) as infile: + infile = infile.readlines() + infile = [ + item.replace("\n", "").split("\t") + for item in infile + if not item.startswith("#") + ] + top_reference = infile[0][0] + + with open(str(top_reference) + ".winner", "w") as topref: + topref.write(top_reference) + + # create the outdir (do nothing if already there) + try: + os.mkdir(out_dir) + except FileExistsError: + pass + + # open the reference list and find the top reference + with open(reference) as inref: + inref = inref.readlines() + inref = [ + item.replace("\n", "").split("\t") + for item in inref + if not item.startswith("#") + ] + + # Initialize an empty list to store the URLs + dir_url = [] + + # Iterate over each row in the inref + for row in inref: + # Construct the ref_query using assembly_accession and asm_name + assembly_accession = row[0] + asm_name = row[15] + ref_query = f"{assembly_accession}_{asm_name}" + + # Check if ref_query matches the search value + if ref_query == top_reference: + # make url # Append the 20th element of the row to the URL list: + assembly_url = row[19] + "/" + ref_query + dir_url.append(assembly_url) + + if len(dir_url) == 0: + print( + "No assemblies corresponding to the top reference: ", + top_reference, + " were found", + ) + sys.exit(1) + + dir_url = str(dir_url[0]) + + # get url and reference file + for r_end in reference_ends: + out_file = out_dir + "/" + top_reference + r_end + file_url = dir_url + r_end + print(file_url) + + # wget.download(file_url, out_file) + response = requests.get(file_url, stream=True) + with open(out_file, "wb") as out: + for chunk in response.iter_content(chunk_size=8192): + 
out.write(chunk) + + return + + +def main(args=None): + args = parse_args(args) + download_references(args.file, args.reference, args.out_dir) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/find_common_reference.py b/bin/find_common_reference.py new file mode 100755 index 00000000..e26aaf53 --- /dev/null +++ b/bin/find_common_reference.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python +""" +============================================================= +HEADER +============================================================= +INSTITUTION: BU-ISCIII +AUTHOR: Guillermo J. Gorines Cordero +MAIL: guillermo.gorines@urjc.es +VERSION: 0.1 +CREATED: Early 2022 +REVISED: 18-2-2022 +DESCRIPTION: + Given a directory with kmerfinder results, summarize them + in an outfile named by the user. + +INPUT: + -DIRECTORY: directory containing all kmerfinder results. + -OUTFILE: Name of the file to write the whole results in. + +OUTPUT: + -OUTFILE: file containing the kmerfinder results. + +USAGE: + python find_common_reference.py -d [DIRECTORY] -o [OUTFILE] +REQUIREMENTS: + -Python >= 3.6 + +DISCLAIMER: This script has been designed for the assembly pipeline of BU-ISCIII. + Feel free to use it at will, however we don't guarantee its success + outside its purpose. + +================================================================ +END_OF_HEADER +================================================================ +""" +import os +import sys +import errno +import argparse + + +def parse_args(args=None): + """ + Parse the args given to argparser + """ + Description = "Fetch kmerfinder result files and get the most used reference." + Epilog = """Example usage: python find_common_reference.py -d [DIRECTORY] -o [OUTFILE]""" + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("-d", help="Input directory.") + parser.add_argument("-o", help="Output file.") + return parser.parse_args(args) + + +def group_references(kmer_result_dir, out_file): + """ + Unifies the kmerfinder results, and counts their occurrences + """ + reference_assembly = {} + + # for file in dir + for k_file in os.listdir(kmer_result_dir): + # open file + with open(os.path.join(kmer_result_dir, k_file), "r") as fh: + file_lines = fh.readlines() + + # remove heading + try: + heading = file_lines[0].split("\t") + first_line = file_lines[1].split("\t") + + # where is the assembly in the header? 
+ # find reference according to index + index_assembly = heading.index("# Assembly") + reference = first_line[index_assembly] + + # add it to the dict if not there + if reference not in reference_assembly: + index_description = heading.index("Description") + reference_assembly[reference] = [0, first_line[index_description]] + # sum 1 for another occurrence + reference_assembly[reference][0] += 1 + except IndexError: + pass + + # sort it (more occurrences first in file) + order_reference = dict( + sorted(reference_assembly.items(), key=lambda x: x[1][0], reverse=True) + ) + + # write it + with open(out_file, "w") as f_out: + for key, value in order_reference.items(): + f_out.write(key + "\t" + str(value[0]) + "\t" + value[1] + "\n") + return + + +def main(args=None): + args = parse_args(args) + group_references(args.d, args.o) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/kmerfinder_summary.py b/bin/kmerfinder_summary.py new file mode 100755 index 00000000..27e97de5 --- /dev/null +++ b/bin/kmerfinder_summary.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python3 +""" +Author: Daniel VM +Email: da.valle@ciberisciii.es +Date: 2024/01/20 + +MIT License + +© 2024 Daniel VM + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND. +""" + +import argparse +import sys +import re +import csv +import pickle +import os + + +################# +### FUNCTIONS ### +################# + + +def check_arg(args=None): + """ + Description: + Function to collect arguments from the command line using argparse + Input: + args # command line arguments + Constant: + None + Variables + parser + Return + parser.parse_args() # Parsed arguments + """ + + parser = argparse.ArgumentParser( + prog="07-kmerfinder.py", + formatter_class=argparse.RawDescriptionHelpFormatter, + description="07-kmerfinder.py creates a csv file from results.txt file", # FIXME + ) + + parser.add_argument( + "--path", + "-p", + required=True, + help="Insert path of results.txt file like /home/user/Service_folder/ANALYSIS/07-kmerfinder", # FIXME + ) + + parser.add_argument( + "--output_bn", "-b", required=True, help="The output in binary file" + ) + + parser.add_argument( + "--output_csv", "-c", required=True, help="The output in csv file" + ) + + # Example: python3 parse_kmerfinder.py -p /home/s.gonzalez/07-kmerfinder -b p_dic.dicke -c p_kmer.csv + + return parser.parse_args() + + +################# +### FUNCTIONS ### +################# + + +def kmerfinder_dictionary(file_txt): + """ + Description: + Function to extract the relevant part of result.txt file + Input: + result.txt file + Return: + dictionary + """ + + step = "07-kmerfinder_" # FIXME + + num_lines = sum(1 for line in open(file_txt)) + hits = num_lines - 1 # to count the total number of hits + lookupfile = open(file_txt, "r") + lines = lookupfile.readlines() + parameters = lines[0].strip().split("\t") + if num_lines > 1: + values_best_hit = lines[1].strip().split("\t") + if num_lines > 2: + values_second_hit = lines[2].strip().split("\t") + + kmer_dict = {} + + for i in range(len(parameters)): + if num_lines > 1: + kmer_dict[step + 
"best_hit_" + parameters[i]] = values_best_hit[i] + else: + kmer_dict[step + "best_hit_" + parameters[i]] = "" + + kmer_dict.update(Total_hits_07_kmerfinder=hits) + + if num_lines > 2: + + kmer_dict[step + "second_hit_" + parameters[i]] = values_second_hit[i] + + else: + + kmer_dict[step + "second_hit_" + parameters[i]] = "" + + return kmer_dict + + +################# +### FUNCTIONS ### +################# + + +def dictionary2bn(dictionary, binary_file): + """ + + Description: + Function to create a binary file from a dictionary + Input: + dictionary + Return: + binary file + """ + + pickle_out = open(binary_file, "wb") + pickle.dump(dictionary, pickle_out) + pickle_out.close() + + return + + +################# +### FUNCTIONS ### +################# + + +def dictionary2csv(dictionary, csv_file): + """ + + Description: + Function to create a csv from a dictionary + Input: + dictionary + Return: + csv file + + """ + + header = sorted(set(i for b in map(dict.keys, dictionary.values()) for i in b)) + with open(csv_file, "w", newline="") as f: + write = csv.writer(f) + write.writerow(["sample_name", *header]) + for a, b in dictionary.items(): + write.writerow([a] + [b.get(i, "") for i in header]) + return + + +################### +### MAIN SCRIPT ### +################### + + +if __name__ == "__main__": + + # Variables + version = "07-kmerfinder.py v 0.1.0." # Script version # FIXME + arguments = check_arg(sys.argv[1:]) + + # Create sample_id_list + path = arguments.path + sample_list = [] + tmp = os.listdir(path) + for item in tmp: + if os.path.isdir(os.path.join(path, item)): + if item != "logs": + sample_name = item.replace("_results.txt", "") + sample_list.append(sample_name) + else: + sample_name = item.replace("_results.txt", "") + sample_list.append(sample_name) + + print("sample_list done") + + # Create a dictionary + kmer_all = {} + + for sample in sample_list: + file_name = os.path.join(path, sample + "_results.txt") + kmer_all[sample] = kmerfinder_dictionary(file_name) + + print("kmerfinder_dictionary done") + # print (kmer_all) + + # Save the dicctionary to binary file + + dictionary2bn(kmer_all, arguments.output_bn) + + print("kmerfinder_dictionary_bn done") + + # Convert the dictionary to csv file + + dictionary2csv(kmer_all, arguments.output_csv) + + print("kmerfinder_dictionary_csv done") diff --git a/bin/multiqc_to_custom_csv.py b/bin/multiqc_to_custom_csv.py new file mode 100755 index 00000000..47b7dcc8 --- /dev/null +++ b/bin/multiqc_to_custom_csv.py @@ -0,0 +1,331 @@ +#!/usr/bin/env python +# Sourced and Edited from nf-core/viralrecon: +# https://github.com/nf-core/viralrecon/blob/3731dd3a32a67a2648ea22c2bd980c224abdaee2/bin/multiqc_to_custom_csv.py +import os +import sys +import errno +import argparse +import yaml + + +def parse_args(args=None): + Description = "Create custom spreadsheet for pertinent MultiQC metrics generated by the nf-core/viralrecon pipeline." + Epilog = "Example usage: python multiqc_to_custom_tsv.py" + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument( + "-md", + "--multiqc_data_dir", + type=str, + dest="MULTIQC_DATA_DIR", + default="multiqc_data", + help="Full path to directory containing YAML files for each module, as generated by MultiQC. 
(default: 'multiqc_data').", + ) + parser.add_argument( + "-t", + "--assembly_type", + type=str, + dest="ASSEMBLY_TYPE", + default="short", + help="String defining the assembly mode for genome de novo assembly (options: short, long, hybrid).", + ) + parser.add_argument( + "-op", + "--out_prefix", + type=str, + dest="OUT_PREFIX", + default="summary", + help="Full path to output prefix (default: 'summary').", + ) + return parser.parse_args(args) + + +def make_dir(path): + if not len(path) == 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + + +# Find key in dictionary created from YAML file recursively +# From https://stackoverflow.com/a/37626981 +def find_tag(d, tag): + if tag in d: + yield d[tag] + for k, v in d.items(): + if isinstance(v, dict): + for i in find_tag(v, tag): + yield i + + +def yaml_fields_to_dict( + yaml_file, append_dict={}, field_mapping_list=[], valid_sample_list=[] +): + integer_fields = [ + "# contigs", + "# contigs (>= 5000 bp)", + "Largest contig", + ] + if os.path.exists(yaml_file): + with open(yaml_file) as f: + yaml_dict = yaml.safe_load(f) + for k in yaml_dict.keys(): + key = k + include_sample = True + if len(valid_sample_list) != 0 and key not in valid_sample_list: + include_sample = False + if include_sample: + if key not in append_dict: + append_dict[key] = {} + if field_mapping_list != []: + for i, j in field_mapping_list: + val = list(find_tag(yaml_dict[k], j[0])) + ## Fix for Cutadapt reporting reads/pairs as separate values + if j[0] == "r_written" and len(val) == 0: + val = [ + list(find_tag(yaml_dict[k], "pairs_written"))[0] * 2 + ] + if len(val) != 0: + val = val[0] + if len(j) == 2: + val = list(find_tag(val, j[1]))[0] + if j[0] in integer_fields: + val = int(val) + if i not in append_dict[key]: + append_dict[key][i] = val + else: + print( + "WARNING: {} key already exists in dictionary so will be overwritten. YAML file {}.".format( + i, yaml_file + ) + ) + else: + append_dict[key] = yaml_dict[k] + else: + print("WARNING: File does not exist: {}".format(yaml_file)) + if len(valid_sample_list) != 0: + for key in valid_sample_list: + if key not in append_dict: + append_dict[key] = {} + if field_mapping_list != []: + for i, j in field_mapping_list: + if i not in append_dict[key]: + append_dict[key][i] = "NA" + else: + print( + "WARNING: {} key already exists in dictionary so will be overwritten. 
YAML file {}.".format( + i, yaml_file + ) + ) + else: + append_dict[key] = "NA" + return append_dict + + +def metrics_dict_to_file( + file_field_list, multiqc_data_dir, out_file, valid_sample_list=[] +): + metrics_dict = {} + field_list = [] + for yaml_file, mapping_list in file_field_list: + yaml_file = os.path.join(multiqc_data_dir, yaml_file) + metrics_dict = yaml_fields_to_dict( + yaml_file=yaml_file, + append_dict=metrics_dict, + field_mapping_list=mapping_list, + valid_sample_list=valid_sample_list, + ) + field_list += [x[0] for x in mapping_list] + + if metrics_dict != {}: + make_dir(os.path.dirname(out_file)) + fout = open(out_file, "w") + header = ["Sample"] + field_list + fout.write("{}\n".format(",".join(header))) + for k in sorted(metrics_dict.keys()): + row_list = [k] + for field in field_list: + if field in metrics_dict[k]: + if metrics_dict[k][field]: + row_list.append(str(metrics_dict[k][field]).replace(",", ";")) + else: + row_list.append("NA") + else: + row_list.append("NA") + fout.write("{}\n".format(",".join(row_list))) + fout.close() + return metrics_dict + + +def main(args=None): + args = parse_args(args) + + ## File names for MultiQC YAML along with fields to fetch from each file + illumina_assembly_files = [ + ( + "multiqc_fastp.yaml", + [ + ("# Input reads", ["before_filtering", "total_reads"]), + ("# Trimmed reads (fastp)", ["after_filtering", "total_reads"]), + ], + ), + ( + "multiqc_quast.yaml", + [ + ("# Contigs", ["# contigs"]), + ("# Largest contig", ["Largest contig"]), + ("# N50", ["N50"]), + ("# % Genome fraction", ["Genome fraction (%)"]), + ], + ), + ( + "multiqc_kmerfinder.yaml", + [ + ("# Best hit (Kmerfinder)", ["07-kmerfinder_best_hit_Species"]), + ( + "# Best hit assembly ID (Kmerfinder)", + ["07-kmerfinder_best_hit_# Assembly"], + ), + ( + "# Best hit query coverage (Kmerfinder)", + ["07-kmerfinder_best_hit_Query_Coverage"], + ), + ("# Best hit depth (Kmerfinder)", ["07-kmerfinder_best_hit_Depth"]), + ("# Second hit (Kmerfinder)", ["07-kmerfinder_second_hit_Species"]), + ( + "# Second hit assembly ID (Kmerfinder)", + ["07-kmerfinder_second_hit_# Assembly"], + ), + ( + "# Second hit query coverage (Kmerfinder)", + ["07-kmerfinder_second_hit_Query_Coverage"], + ), + ("# Second hit depth (Kmerfinder)", ["07-kmerfinder_second_hit_Depth"]), + ], + ), + ] + + nanopore_assembly_files = [ + ( + "multiqc_nanostat.yaml", + [ + ("# Input reads", ["Number of reads_fastq"]), + ("# Median read length", ["Median read length_fastq"]), + ("# Median read quality", ["Median read quality_fastq"]), + ], + ), + ( + "multiqc_quast.yaml", + [ + ("# Contigs", ["# contigs"]), + ("# Largest contig", ["Largest contig"]), + ("# N50", ["N50"]), + ("# % Genome fraction", ["Genome fraction (%)"]), + ], + ), + ( + "multiqc_kmerfinder.yaml", + [ + ("# Best hit (Kmerfinder)", ["07-kmerfinder_best_hit_Species"]), + ( + "# Best hit assembly ID (Kmerfinder)", + ["07-kmerfinder_best_hit_# Assembly"], + ), + ( + "# Best hit query coverage (Kmerfinder)", + ["07-kmerfinder_best_hit_Query_Coverage"], + ), + ("# Best hit depth (Kmerfinder)", ["07-kmerfinder_best_hit_Depth"]), + ("# Second hit (Kmerfinder)", ["07-kmerfinder_second_hit_Species"]), + ( + "# Second hit assembly ID (Kmerfinder)", + ["07-kmerfinder_second_hit_# Assembly"], + ), + ( + "# Second hit query coverage (Kmerfinder)", + ["07-kmerfinder_second_hit_Query_Coverage"], + ), + ("# Second hit depth (Kmerfinder)", ["07-kmerfinder_second_hit_Depth"]), + ], + ), + ] + + hybrid_assembly_files = [ + ( + "multiqc_fastp.yaml", + [ + 
("# Input short reads", ["before_filtering", "total_reads"]), + ("# Trimmed short reads (fastp)", ["after_filtering", "total_reads"]), + ], + ), + ( + "multiqc_nanostat.yaml", + [ + ("# Input long reads", ["Number of reads_fastq"]), + ("# Median long reads lenght", ["Median read length_fastq"]), + ("# Median long reads quality", ["Median read quality_fastq"]), + ], + ), + ( + "multiqc_quast.yaml", + [ + ("# Contigs (hybrid assembly)", ["# contigs"]), + ("# Largest contig (hybrid assembly)", ["Largest contig"]), + ("# N50 (hybrid assembly)", ["N50"]), + ("# % Genome fraction (hybrid assembly)", ["Genome fraction (%)"]), + ], + ), + ( + "multiqc_kmerfinder.yaml", + [ + ("# Best hit (Kmerfinder)", ["07-kmerfinder_best_hit_Species"]), + ( + "# Best hit assembly ID (Kmerfinder)", + ["07-kmerfinder_best_hit_# Assembly"], + ), + ( + "# Best hit query coverage (Kmerfinder)", + ["07-kmerfinder_best_hit_Query_Coverage"], + ), + ("# Best hit depth (Kmerfinder)", ["07-kmerfinder_best_hit_Depth"]), + ("# Second hit (Kmerfinder)", ["07-kmerfinder_second_hit_Species"]), + ( + "# Second hit assembly ID (Kmerfinder)", + ["07-kmerfinder_second_hit_# Assembly"], + ), + ( + "# Second hit query coverage (Kmerfinder)", + ["07-kmerfinder_second_hit_Query_Coverage"], + ), + ("# Second hit depth (Kmerfinder)", ["07-kmerfinder_second_hit_Depth"]), + ], + ), + ] + + ## Write de novo assembly metrics to file + if args.ASSEMBLY_TYPE == "short": + metrics_dict_to_file( + file_field_list=illumina_assembly_files, + multiqc_data_dir=args.MULTIQC_DATA_DIR, + out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", + valid_sample_list=[], + ) + elif args.ASSEMBLY_TYPE == "long": + metrics_dict_to_file( + file_field_list=nanopore_assembly_files, + multiqc_data_dir=args.MULTIQC_DATA_DIR, + out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", + valid_sample_list=[], + ) + elif args.ASSEMBLY_TYPE == "hybrid": + metrics_dict_to_file( + file_field_list=hybrid_assembly_files, + multiqc_data_dir=args.MULTIQC_DATA_DIR, + out_file=args.OUT_PREFIX + "_assembly_metrics_mqc.csv", + valid_sample_list=[], + ) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/conf/base.config b/conf/base.config index 69fd3dcb..57237c40 100644 --- a/conf/base.config +++ b/conf/base.config @@ -57,7 +57,4 @@ process { errorStrategy = 'retry' maxRetries = 3 } - withName:CUSTOM_DUMPSOFTWAREVERSIONS { - cache = false - } } diff --git a/conf/modules.config b/conf/modules.config index c1b4f8da..78bd92bb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,9 +35,17 @@ process { ext.args = '' ext.prefix = { "${meta.id}.porechop" } publishDir = [ - path: { "${params.outdir}/trimming/longreads" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + [ + path: { "${params.outdir}/trimming/longreads" }, + pattern: "*.fastq.gz", + mode: params.publish_dir_mode, + enabled: params.save_trimmed + ], + [ + path: { "${params.outdir}/trimming/longreads" }, + pattern: "*.log", + mode: params.publish_dir_mode, + ] ] } @@ -176,12 +184,21 @@ process { ] } - withName: 'QUAST' { + withName: 'QUAST|QUAST_BYREFSEQID' { ext.args = '' publishDir = [ path: { "${params.outdir}/QUAST" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + saveAs: { filename -> + if (filename.equals('versions.yml') || filename.endsWith('.tsv')){ + null + } else if (filename.startsWith('GCF')){ + "runs_per_reference/${filename}" + } + else { + "${filename}" + } + } ] } @@ -204,8 +221,8 @@ process { ] } - withName: 'MULTIQC' { - ext.args = '' + withName: 'MULTIQC_CUSTOM' { + ext.args = '-k yaml' publishDir = [ path: { "${params.outdir}/multiqc" }, mode: params.publish_dir_mode, @@ -229,13 +246,14 @@ if (!params.skip_fastqc) { if (!params.skip_fastp) { process { withName: '.*:.*:FASTQ_TRIM_FASTP_FASTQC:FASTP' { - ext.args = '' + ext.args = params.fastp_args ? params.fastp_args : '' publishDir = [ [ path: { "${params.outdir}/trimming/shortreads" }, mode: params.publish_dir_mode, pattern: "*.fastp.fastq.gz", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: params.save_trimmed ], [ path: { "${params.outdir}/trimming/shortreads/json_html" }, @@ -270,6 +288,29 @@ if (!params.skip_fastp) { } } } +if (!params.skip_kmerfinder) { + process { + withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER' { + ext.args = '' + publishDir = [ + path: { "${params.outdir}/Kmerfinder/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.{txt,json}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER_SUMMARY' { + ext.args = '' + publishDir = [ + path: { "${params.outdir}/Kmerfinder" }, + mode: params.publish_dir_mode, + pattern: "*.csv", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } +} if (params.annotation_tool == 'bakta') { if (params.baktadb_download == true) { diff --git a/conf/test.config b/conf/test.config index 165f51e7..0b422f03 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,7 +20,7 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_short.tsv' + input = params.pipelines_testdata_base_path + 'bacass/bacass_short.tsv' // some extra args to speed tests up unicycler_args = "--no_correct --no_pilon" @@ -28,4 +28,5 @@ params { assembly_type = 'short' skip_pycoqc = true skip_kraken2 = true + skip_kmerfinder = true } diff --git a/conf/test_dfast.config b/conf/test_dfast.config index 3fa0e950..9edc4a7e 100644 --- a/conf/test_dfast.config +++ b/conf/test_dfast.config @@ -20,7 +20,7 @@ params { max_time = 6.h // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_short.tsv' + input = params.pipelines_testdata_base_path + 'bacass/bacass_short.tsv' // some extra args to speed tests up unicycler_args = "--no_correct --no_pilon" @@ -28,4 +28,5 @@ params { assembly_type = 'short' skip_pycoqc = true skip_kraken2 = true + skip_kmerfinder = true } diff --git a/conf/test_full.config b/conf/test_full.config index b89f4c4b..f51aa6d5 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -15,6 +15,8 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_full.tsv' - kraken2db = 'https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20210517.tar.gz' + input = params.pipelines_testdata_base_path + 'bacass/bacass_full.tsv' + kraken2db = 'https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20210517.tar.gz' + kmerfinderdb = 
    'https://zenodo.org/records/10458361/files/20190108_kmerfinder_stable_dirs.tar.gz'
    +    ncbi_assembly_metadata  = 'https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt'
     }
    diff --git a/conf/test_hybrid.config b/conf/test_hybrid.config
    index b3a560ae..a524de07 100644
    --- a/conf/test_hybrid.config
    +++ b/conf/test_hybrid.config
    @@ -20,10 +20,11 @@ params {
        max_time   = 6.h
    
        // Input data
    -    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_hybrid.tsv'
    +    input = params.pipelines_testdata_base_path + 'bacass/bacass_hybrid.tsv'
    
        // some extra args to speed tests up
    -    assembly_type = 'hybrid'
    -    prokka_args = " --fast"
    +    assembly_type   = 'hybrid'
    +    prokka_args     = " --fast"
        skip_kraken2 = true
    +    skip_kmerfinder = true
     }
    diff --git a/conf/test_hybrid_dragonflye.config b/conf/test_hybrid_dragonflye.config
    index c163d945..9cf34364 100644
    --- a/conf/test_hybrid_dragonflye.config
    +++ b/conf/test_hybrid_dragonflye.config
    @@ -20,11 +20,12 @@ params {
        max_time   = 6.h
    
        // Input data
    -    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_hybrid_dragonflye.tsv'
    +    input = params.pipelines_testdata_base_path + 'bacass/bacass_hybrid_dragonflye.tsv'
    
        // some extra args to speed tests up
        assembly_type = 'hybrid'
        assembler = 'dragonflye'
        prokka_args = " --fast"
        skip_kraken2 = true
    +    skip_kmerfinder = true
     }
    diff --git a/conf/test_long.config b/conf/test_long.config
    index 43eea8ad..3cc3a8b6 100644
    --- a/conf/test_long.config
    +++ b/conf/test_long.config
    @@ -20,11 +20,12 @@ params {
        max_time   = 6.h
    
        // Input data
    -    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_long_miniasm.tsv'
    +    input = params.pipelines_testdata_base_path + 'bacass/bacass_long_miniasm.tsv'
    
        // some extra args to speed tests up
        prokka_args = " --fast"
        assembly_type = 'long'
        skip_polish = true
        skip_kraken2 = true
    +    skip_kmerfinder = true
     }
    diff --git a/conf/test_long_dragonflye.config b/conf/test_long_dragonflye.config
    index 6007cff7..38301d47 100644
    --- a/conf/test_long_dragonflye.config
    +++ b/conf/test_long_dragonflye.config
    @@ -15,7 +15,7 @@ params {
        config_profile_description = 'Minimal test dataset to check pipeline function'
    
        // Input data
    -    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_long_miniasm.tsv'
    +    input = params.pipelines_testdata_base_path + 'bacass/bacass_long_miniasm.tsv'
    
        // some extra args to speed tests up
        prokka_args = " --fast"
    @@ -23,4 +23,5 @@
        assembler = 'dragonflye'
        skip_kraken2 = true
        skip_polish = true
    +    skip_kmerfinder = true
     }
    diff --git a/conf/test_long_miniasm.config b/conf/test_long_miniasm.config
    index 8ae2d833..d6b5874e 100644
    --- a/conf/test_long_miniasm.config
    +++ b/conf/test_long_miniasm.config
    @@ -20,11 +20,12 @@ params {
        max_time   = 6.h
    
        // Input data
    -    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_long_miniasm.tsv'
    +    input = params.pipelines_testdata_base_path + 'bacass/bacass_long_miniasm.tsv'
    
        // some extra args to speed tests up
        prokka_args = " --fast"
        assembly_type = 'long'
        assembler = 'miniasm'
        kraken2db = "https://genome-idx.s3.amazonaws.com/kraken/16S_Greengenes13.5_20200326.tgz"
    +    skip_kmerfinder = true
     }
    diff --git a/docs/output.md b/docs/output.md
    index a83dcd6e..5d0f0dac 100644
    --- a/docs/output.md
    +++ b/docs/output.md
    @@ -118,6 +118,20 @@ Exemplary Kraken2 report screenshot:
    
    +## Reads QC and Sample purity
    +
    +The pipeline includes a dedicated step for short- and long-read QC as well as contamination analysis using 
    
    [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/), which helps assess the quality and purity of each sample.
    +
    +
    
    +Output files
    +
    +- `Kmerfinder/{ID}/`
    +  - `*_results.txt`: Kmerfinder report table containing read QC results and taxonomic information.
    +- `Kmerfinder/`
    +  - `kmerfinder_summary.csv`: A CSV file containing the most relevant results of all samples analyzed with Kmerfinder.
    +
    +
    
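    +
    +For a quick command-line check of sample purity, the summary CSV can be inspected directly. The snippet below is a minimal sketch and not part of the pipeline; apart from `sample_name` (the key used by `csv_to_yaml.py`), the column names are assumptions based on the fields referenced by `bin/multiqc_to_custom_csv.py` and may need adjusting to the actual header:
    +
    +```python
    +import csv
    +
    +# Print the Kmerfinder best-hit species per sample and flag samples where a
    +# second species hit was reported (a hint of possible contamination).
    +with open("kmerfinder_summary.csv") as handle:
    +    for row in csv.DictReader(handle):
    +        sample     = row.get("sample_name", "unknown")
    +        best_hit   = row.get("07-kmerfinder_best_hit_Species", "NA")
    +        second_hit = row.get("07-kmerfinder_second_hit_Species", "")
    +        flag = "  <-- check purity" if second_hit not in ("", "NA") else ""
    +        print(f"{sample}\t{best_hit}{flag}")
    +```
    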
+ ## Assembly Output Trimmed reads are assembled with [Unicycler](https://github.com/rrwick/Unicycler) in `short` or `hybrid` assembly modes. For long-read assembly, there are also `canu` and `miniasm` available. @@ -180,9 +194,10 @@ The assembly QC is performed with [QUAST](http://quast.sourceforge.net/quast) fo
    Output files
    
    -- `QUAST`
    -  - `report.tsv`: QUAST's report in text format
    -- `QUAST/report`
    -  - `icarus.html`: QUAST's contig browser as HTML
    -  - `report.html`: QUAST assembly QC as HTML report
    -  - `report.pdf`: QUAST assembly QC as pdf
    +- `QUAST/report/`
    +  - `icarus.html`: QUAST's contig browser as HTML
    +  - `report.html`: QUAST assembly QC as HTML report
    +  - `report.pdf`: QUAST assembly QC as pdf
    @@ -240,6 +255,7 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ
    
     - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser.
     - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline.
     - `multiqc_plots/`: directory containing static images from the report in various formats.
    +  - `summary_assembly_metrics_mqc.csv`: custom table containing the most relevant assembly QC metrics.
    
diff --git a/docs/usage.md b/docs/usage.md index e26ef5a0..6034a4f0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -141,6 +141,8 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) - `apptainer` - A generic configuration profile to be used with [Apptainer](https://apptainer.org/) +- `wave` + - A generic configuration profile to enable [Wave](https://seqera.io/wave/) containers. Use together with one of the above (requires Nextflow ` 24.03.0-edge` or later). - `conda` - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer. diff --git a/modules.json b/modules.json index 1a559c15..6dc814ab 100644 --- a/modules.json +++ b/modules.json @@ -33,7 +33,7 @@ }, "fastqc": { "branch": "master", - "git_sha": "f4ae1d942bd50c5c0b9bd2de1393ce38315ba57c", + "git_sha": "285a50500f9e02578d90b3ce6382ea3c30216acd", "installed_by": ["fastq_trim_fastp_fastqc"] }, "gunzip": { @@ -56,11 +56,6 @@ "git_sha": "2c2d1cf80866dbd6dd0ea5d61ddd59533a72d41e", "installed_by": ["modules"] }, - "multiqc": { - "branch": "master", - "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", - "installed_by": ["modules"] - }, "nanoplot": { "branch": "master", "git_sha": "a31407dfaf0cb0d04768d5cb439fc6f4523a6981", @@ -118,7 +113,7 @@ }, "utils_nfcore_pipeline": { "branch": "master", - "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", + "git_sha": "92de218a329bfc9a9033116eb5f65fd270e72ba3", "installed_by": ["subworkflows"] }, "utils_nfvalidation_plugin": { diff --git a/modules/local/find_download_reference.nf b/modules/local/find_download_reference.nf new file mode 100644 index 00000000..87f73664 --- /dev/null +++ b/modules/local/find_download_reference.nf @@ -0,0 +1,40 @@ +process FIND_DOWNLOAD_REFERENCE { + tag "${task.process}" + label 'process_medium' + + conda "conda-forge::requests=2.26.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/requests:2.26.0' : + 'biocontainers/requests:2.26.0' }" + + input: + tuple val(refmeta), path(reports, stageAs: 'reports/*') + path(ncbi_metadata_db) + + output: + tuple val(refmeta), path("*.fna.gz") , emit: fna + tuple val(refmeta), path("*.gff.gz") , emit: gff + tuple val(refmeta), path("*.faa.gz") , emit: faa + tuple val(refmeta), path("references_found.tsv") , emit: references_tsv + tuple val(refmeta), path("*.winner") , emit: winner + path "versions.yml" , emit: versions + + script: + """ + ## Find the common reference genome + find_common_reference.py \\ + -d reports/ \\ + -o references_found.tsv + + ## Download the winner reference genome from the ncbi database + download_reference.py \\ + -file references_found.tsv \\ + -reference $ncbi_metadata_db \\ + -out_dir . 
+ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """ +} diff --git a/modules/local/kmerfinder.nf b/modules/local/kmerfinder.nf new file mode 100644 index 00000000..cca5f359 --- /dev/null +++ b/modules/local/kmerfinder.nf @@ -0,0 +1,39 @@ +process KMERFINDER { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::kmerfinder=3.0.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kmerfinder:3.0.2--hdfd78af_0' : + 'biocontainers/kmerfinder:3.0.2--hdfd78af_0' }" + + input: + tuple val(meta), path(reads), path(kmerfinder_db) + + output: + tuple val(meta), path("*_results.txt") , emit: report + tuple val(meta), path("*_data.json") , emit: json + path "versions.yml" , emit: versions + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def in_reads = reads[0] && reads[1] ? "${reads[0]} ${reads[1]}" : "${reads}" + // WARNING: Ensure to update software version in this line if you modify the container/environment. + def kmerfinder_version = "3.0.2" + """ + kmerfinder.py \\ + --infile $in_reads \\ + --output_folder . \\ + --db_path ${kmerfinder_db}/bacteria.ATG \\ + -tax ${kmerfinder_db}/bacteria.name \\ + -x + + mv results.txt ${prefix}_results.txt + mv data.json ${prefix}_data.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kmerfinder: \$(echo "${kmerfinder_version}") + END_VERSIONS + """ +} diff --git a/modules/local/kmerfinder_summary.nf b/modules/local/kmerfinder_summary.nf new file mode 100644 index 00000000..8e5fe45f --- /dev/null +++ b/modules/local/kmerfinder_summary.nf @@ -0,0 +1,31 @@ +process KMERFINDER_SUMMARY { + tag "kmerfinder_summary" + label 'process_low' + + conda "bioconda::multiqc=1.19" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" + + input: + path(report, stageAs: 'reports/*') + + output: + path "*.csv" , emit: summary + path "*.yaml" , emit: yaml + path "versions.yml" , emit: versions + + script: + """ + ## summarizing kmerfinder results + kmerfinder_summary.py --path reports/ --output_bn kmerfinder.bn --output_csv kmerfinder_summary.csv + + ## Create a yaml file from csv + csv_to_yaml.py -i kmerfinder_summary.csv -k 'sample_name' -op kmerfinder_summary + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | awk '{print \$2}') + END_VERSIONS + """ +} diff --git a/modules/local/multiqc_custom.nf b/modules/local/multiqc_custom.nf new file mode 100644 index 00000000..5c0cc9f5 --- /dev/null +++ b/modules/local/multiqc_custom.nf @@ -0,0 +1,63 @@ +process MULTIQC_CUSTOM { + label 'process_medium' + + conda "bioconda::multiqc=1.19" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" + + input: + path 'multiqc_config.yaml' + path multiqc_custom_config + path multiqc_logo + path workflow_summary + path methods_description + path software_versions + path ('fastqc/*') + path ('fastp/*') + path ('nanoplot/*') + path ('porechop/*') + path ('pycoqc/*') + path ('kraken2_short/*') + path ('kraken2_long/*') + path ('quast/*') + path ('prokka/*') + path ('bakta/*') + path ('extra/*') + + output: + path "*multiqc_report.html" , emit: report + path "*_data" , emit: data + path "*_assembly_metrics_mqc.csv" , optional:true, emit: csv_assembly + path "*_plots" , optional:true, emit: plots + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def custom_config = multiqc_custom_config ? "--config $multiqc_custom_config" : '' + """ + ## Run MultiQC once to parse tool logs + multiqc -f $args $custom_config . + + ## Collect additional files to be included in the report + if [ -d extra/ ]; then + cp extra/* multiqc_data/ + fi + + ## Create multiqc custom data + multiqc_to_custom_csv.py --assembly_type $params.assembly_type + + ## Avoid the custom Multiqc table when the kmerfinder process is not invoked. + if grep ">skip_kmerfinder<" workflow_summary_mqc.yaml; then + rm *_assembly_metrics_mqc.csv + fi + + ## Run multiqc a second time + multiqc -f $args $custom_config . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 9e19a74c..d79f1c86 100644 --- a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -25,6 +25,11 @@ process FASTQC { def old_new_pairs = reads instanceof Path || reads.size() == 1 ? [[ reads, "${prefix}.${reads.extension}" ]] : reads.withIndex().collect { entry, index -> [ entry, "${prefix}_${index + 1}.${entry.extension}" ] } def rename_to = old_new_pairs*.join(' ').join(' ') def renamed_files = old_new_pairs.collect{ old_name, new_name -> new_name }.join(' ') + + def memory_in_mb = MemoryUnit.of("${task.memory}").toUnit('MB') + // FastQC memory value allowed range (100 - 10000) + def fastqc_memory = memory_in_mb > 10000 ? 10000 : (memory_in_mb < 100 ? 100 : memory_in_mb) + """ printf "%s %s\\n" $rename_to | while read old_name new_name; do [ -f "\${new_name}" ] || ln -s \$old_name \$new_name @@ -33,6 +38,7 @@ process FASTQC { fastqc \\ $args \\ --threads $task.cpus \\ + --memory $fastqc_memory \\ $renamed_files cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml deleted file mode 100644 index ca39fb67..00000000 --- a/modules/nf-core/multiqc/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: multiqc -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::multiqc=1.21 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf deleted file mode 100644 index 47ac352f..00000000 --- a/modules/nf-core/multiqc/main.nf +++ /dev/null @@ -1,55 +0,0 @@ -process MULTIQC { - label 'process_single' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.21--pyhdfd78af_0' : - 'biocontainers/multiqc:1.21--pyhdfd78af_0' }" - - input: - path multiqc_files, stageAs: "?/*" - path(multiqc_config) - path(extra_multiqc_config) - path(multiqc_logo) - - output: - path "*multiqc_report.html", emit: report - path "*_data" , emit: data - path "*_plots" , optional:true, emit: plots - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def config = multiqc_config ? "--config $multiqc_config" : '' - def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' - def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : '' - """ - multiqc \\ - --force \\ - $args \\ - $config \\ - $extra_config \\ - $logo \\ - . - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ - - stub: - """ - mkdir multiqc_data - touch multiqc_plots - touch multiqc_report.html - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml deleted file mode 100644 index 45a9bc35..00000000 --- a/modules/nf-core/multiqc/meta.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: multiqc -description: Aggregate results from bioinformatics analyses across many samples into a single report -keywords: - - QC - - bioinformatics tools - - Beautiful stand-alone HTML report -tools: - - multiqc: - description: | - MultiQC searches a given directory for analysis logs and compiles a HTML report. - It's a general use tool, perfect for summarising the output from numerous bioinformatics tools. - homepage: https://multiqc.info/ - documentation: https://multiqc.info/docs/ - licence: ["GPL-3.0-or-later"] -input: - - multiqc_files: - type: file - description: | - List of reports / files recognised by MultiQC, for example the html and zip output of FastQC - - multiqc_config: - type: file - description: Optional config yml for MultiQC - pattern: "*.{yml,yaml}" - - extra_multiqc_config: - type: file - description: Second optional config yml for MultiQC. Will override common sections in multiqc_config. 
- pattern: "*.{yml,yaml}" - - multiqc_logo: - type: file - description: Optional logo file for MultiQC - pattern: "*.{png}" -output: - - report: - type: file - description: MultiQC report file - pattern: "multiqc_report.html" - - data: - type: directory - description: MultiQC data dir - pattern: "multiqc_data" - - plots: - type: file - description: Plots created by MultiQC - pattern: "*_data" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@abhi18av" - - "@bunop" - - "@drpatelh" - - "@jfy133" -maintainers: - - "@abhi18av" - - "@bunop" - - "@drpatelh" - - "@jfy133" diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test deleted file mode 100644 index f1c4242e..00000000 --- a/modules/nf-core/multiqc/tests/main.nf.test +++ /dev/null @@ -1,84 +0,0 @@ -nextflow_process { - - name "Test Process MULTIQC" - script "../main.nf" - process "MULTIQC" - - tag "modules" - tag "modules_nfcore" - tag "multiqc" - - test("sarscov2 single-end [fastqc]") { - - when { - process { - """ - input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) - input[1] = [] - input[2] = [] - input[3] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, - { assert process.out.data[0] ==~ ".*/multiqc_data" }, - { assert snapshot(process.out.versions).match("multiqc_versions_single") } - ) - } - - } - - test("sarscov2 single-end [fastqc] [config]") { - - when { - process { - """ - input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) - input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) - input[2] = [] - input[3] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, - { assert process.out.data[0] ==~ ".*/multiqc_data" }, - { assert snapshot(process.out.versions).match("multiqc_versions_config") } - ) - } - } - - test("sarscov2 single-end [fastqc] - stub") { - - options "-stub" - - when { - process { - """ - input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) - input[1] = [] - input[2] = [] - input[3] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out.report.collect { file(it).getName() } + - process.out.data.collect { file(it).getName() } + - process.out.plots.collect { file(it).getName() } + - process.out.versions ).match("multiqc_stub") } - ) - } - - } -} diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap deleted file mode 100644 index bfebd802..00000000 --- a/modules/nf-core/multiqc/tests/main.nf.test.snap +++ /dev/null @@ -1,41 +0,0 @@ -{ - "multiqc_versions_single": { - "content": [ - [ - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-02-29T08:48:55.657331" - }, - "multiqc_stub": { - "content": [ - [ - "multiqc_report.html", - "multiqc_data", - "multiqc_plots", - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": 
"2024-02-29T08:49:49.071937" - }, - "multiqc_versions_config": { - "content": [ - [ - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-02-29T08:49:25.457567" - } -} \ No newline at end of file diff --git a/modules/nf-core/multiqc/tests/tags.yml b/modules/nf-core/multiqc/tests/tags.yml deleted file mode 100644 index bea6c0d3..00000000 --- a/modules/nf-core/multiqc/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -multiqc: - - modules/nf-core/multiqc/** diff --git a/modules/nf-core/racon/main.nf b/modules/nf-core/racon/main.nf index de29e355..3b45dc5d 100644 --- a/modules/nf-core/racon/main.nf +++ b/modules/nf-core/racon/main.nf @@ -11,7 +11,7 @@ process RACON { tuple val(meta), path(reads), path(assembly), path(paf) output: - tuple val(meta), path('*_assembly_consensus.fasta.gz') , emit: improved_assembly + tuple val(meta), path('*.consensus.fasta.gz') , emit: improved_assembly path "versions.yml" , emit: versions when: @@ -26,9 +26,9 @@ process RACON { "${paf}" \\ $args \\ "${assembly}" > \\ - ${prefix}_assembly_consensus.fasta + ${prefix}.consensus.fasta - gzip -n ${prefix}_assembly_consensus.fasta + gzip -n ${prefix}.consensus.fasta cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/racon/racon.diff b/modules/nf-core/racon/racon.diff new file mode 100644 index 00000000..c6e8d118 --- /dev/null +++ b/modules/nf-core/racon/racon.diff @@ -0,0 +1,26 @@ +Changes in module 'nf-core/racon' +--- modules/nf-core/racon/main.nf ++++ modules/nf-core/racon/main.nf +@@ -11,7 +11,7 @@ + tuple val(meta), path(reads), path(assembly), path(paf) + + output: +- tuple val(meta), path('*_assembly_consensus.fasta.gz') , emit: improved_assembly ++ tuple val(meta), path('*.consensus.fasta.gz') , emit: improved_assembly + path "versions.yml" , emit: versions + + when: +@@ -26,9 +26,9 @@ + "${paf}" \\ + $args \\ + "${assembly}" > \\ +- ${prefix}_assembly_consensus.fasta ++ ${prefix}.consensus.fasta + +- gzip -n ${prefix}_assembly_consensus.fasta ++ gzip -n ${prefix}.consensus.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + +************************************************************ diff --git a/nextflow.config b/nextflow.config index 09198a1c..03038c32 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,11 +13,17 @@ params { input = null // QC and trimming options + fastp_args = "" + save_trimmed = false save_trimmed_fail = false save_merged = false // Contamination_screening - kraken2db = "" + kraken2db = '' + kmerfinderdb = '' + reference_fasta = '' + reference_gff = '' + ncbi_assembly_metadata = '' // Assembly parameters assembler = 'unicycler' // Allowed: ['unicycler', 'canu', 'miniasm', 'dragonflye'] @@ -42,6 +48,7 @@ params { skip_fastqc = false skip_fastp = false skip_kraken2 = false + skip_kmerfinder = false skip_pycoqc = false skip_annotation = false skip_polish = false @@ -66,6 +73,8 @@ params { validate_params = true schema_ignore_params = 'modules,igenomes_base' version = false + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/' + // Config options config_profile_name = null @@ -100,103 +109,109 @@ try { } // Load nf-core/bacass custom profiles from different institutions. -// Warning: Uncomment only if a pipeline-specific institutional config already exists on nf-core/configs! 
-// try { -// includeConfig "${params.custom_config_base}/pipeline/bacass.config" -// } catch (Exception e) { -// System.err.println("WARNING: Could not load nf-core/config/bacass profiles: ${params.custom_config_base}/pipeline/bacass.config") -// } +try { + includeConfig "${params.custom_config_base}/pipeline/bacass.config" +} catch (Exception e) { + System.err.println("WARNING: Could not load nf-core/config/bacass profiles: ${params.custom_config_base}/pipeline/bacass.config") +} profiles { debug { - dumpHashes = true - process.beforeScript = 'echo $HOSTNAME' - cleanup = false + dumpHashes = true + process.beforeScript = 'echo $HOSTNAME' + cleanup = false nextflow.enable.configProcessNamesValidation = true } conda { - conda.enabled = true - docker.enabled = false - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - channels = ['conda-forge', 'bioconda', 'defaults'] - apptainer.enabled = false + conda.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + conda.channels = ['conda-forge', 'bioconda', 'defaults'] + apptainer.enabled = false } mamba { - conda.enabled = true - conda.useMamba = true - docker.enabled = false - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false + conda.enabled = true + conda.useMamba = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false } docker { - docker.enabled = true - conda.enabled = false - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false - docker.runOptions = '-u $(id -u):$(id -g)' + docker.enabled = true + conda.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + docker.runOptions = '-u $(id -u):$(id -g)' } arm { - docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' + docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' } singularity { - singularity.enabled = true - singularity.autoMounts = true - conda.enabled = false - docker.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false + singularity.enabled = true + singularity.autoMounts = true + conda.enabled = false + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false } podman { - podman.enabled = true - conda.enabled = false - docker.enabled = false - singularity.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false + podman.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false } shifter { - shifter.enabled = true - conda.enabled = false - docker.enabled = false - singularity.enabled = false - podman.enabled = false - charliecloud.enabled = false - apptainer.enabled = false + shifter.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + apptainer.enabled = false } charliecloud { - charliecloud.enabled = true - 
    conda.enabled = false
    -        docker.enabled = false
    -        singularity.enabled = false
    -        podman.enabled = false
    -        shifter.enabled = false
    -        apptainer.enabled = false
    +        charliecloud.enabled   = true
    +        conda.enabled          = false
    +        docker.enabled         = false
    +        singularity.enabled    = false
    +        podman.enabled         = false
    +        shifter.enabled        = false
    +        apptainer.enabled      = false
        }
        apptainer {
    -        apptainer.enabled = true
    -        apptainer.autoMounts = true
    -        conda.enabled = false
    -        docker.enabled = false
    -        singularity.enabled = false
    -        podman.enabled = false
    -        shifter.enabled = false
    -        charliecloud.enabled = false
    +        apptainer.enabled      = true
    +        apptainer.autoMounts   = true
    +        conda.enabled          = false
    +        docker.enabled         = false
    +        singularity.enabled    = false
    +        podman.enabled         = false
    +        shifter.enabled        = false
    +        charliecloud.enabled   = false
    +    }
    +    wave {
    +        apptainer.ociAutoPull   = true
    +        singularity.ociAutoPull = true
    +        wave.enabled            = true
    +        wave.freeze             = true
    +        wave.strategy           = 'conda,container'
        }
        gitpod {
    -        executor.name = 'local'
    -        executor.cpus = 4
    -        executor.memory = 8.GB
    +        executor.name          = 'local'
    +        executor.cpus          = 4
    +        executor.memory        = 8.GB
        }
        test                 { includeConfig 'conf/test.config'                 }
        test_dfast           { includeConfig 'conf/test_dfast.config'           }
    @@ -263,7 +278,7 @@ manifest {
        description     = """Simple bacterial assembly and annotation"""
        mainScript      = 'main.nf'
        nextflowVersion = '!>=23.04.0'
    -    version         = '2.2.0'
    +    version         = '2.3.0'
        doi             = '10.5281/zenodo.2669428'
     }
    diff --git a/nextflow_schema.json b/nextflow_schema.json
    index 0b1d176b..8eec0004 100644
    --- a/nextflow_schema.json
    +++ b/nextflow_schema.json
    @@ -42,9 +42,16 @@
                "description": "Parameters for QC and trim short-reads",
                "default": "",
                "properties": {
    +                "fastp_args": {
    +                    "type": "string",
    +                    "description": "This can be used to pass arguments to [Fastp](https://github.com/OpenGene/fastp)"
    +                },
    +                "save_trimmed": {
    +                    "type": "boolean",
    +                    "description": "save trimmed files"
    +                },
                    "save_trimmed_fail": {
                        "type": "boolean",
    -                    "enum": ["true", "false"],
                        "description": "save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz`"
                    },
                    "save_merged": {
    @@ -73,6 +80,22 @@
                        "fa_icon": "fab fa-gitkraken",
                        "help_text": "See [Kraken2 homepage](https://benlangmead.github.io/aws-indexes/k2) for download\nlinks. Minikraken2 8GB is a reasonable choice, since we run Kraken here mainly just to check for\nsample purity.",
                        "description": "Path to Kraken2 database."
    +                },
    +                "kmerfinderdb": {
    +                    "type": "string",
    +                    "description": "Path to the Kmerfinder bacteria database. For more details, see [Kmerfinder Databases](https://bitbucket.org/genomicepidemiology/kmerfinder_db/src/master/). You can also download a precomputed Kmerfinder database (dated 2019/01/08) from https://zenodo.org/records/10458361/files/20190108_kmerfinder_stable_dirs.tar.gz"
    +                },
    +                "reference_fasta": {
    +                    "type": "string",
    +                    "description": "Reference FASTA file."
    +                },
    +                "reference_gff": {
    +                    "type": "string",
    +                    "description": "Reference GFF file."
    +                },
    +                "ncbi_assembly_metadata": {
    +                    "type": "string",
    +                    "description": "Master file (*.txt) containing a summary of assemblies available in GenBank or RefSeq. See: https://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt"
    +                }
                }
            },
    @@ -190,6 +213,11 @@
                    "fa_icon": "fas fa-forward",
                    "description": "Skip running Kraken2 classifier on reads."
    
    },
    +            "skip_kmerfinder": {
    +                "type": "boolean",
    +                "description": "Skip contamination analysis with [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/)",
    +                "fa_icon": "fas fa-forward"
    +            },
                "skip_annotation": {
                    "type": "boolean",
                    "fa_icon": "fas fa-forward",
    @@ -207,7 +235,8 @@
                },
                "skip_multiqc": {
                    "type": "boolean",
    -                "description": "Skip MultiQC"
    +                "description": "Skip MultiQC",
    +                "fa_icon": "fas fa-forward"
                }
            }
        },
    @@ -413,6 +442,13 @@
                "default": "modules,igenomes_base",
                "hidden": true,
                "description": "A comma separated string of inputs the schema validation should ignore"
    +        },
    +        "pipelines_testdata_base_path": {
    +            "type": "string",
    +            "fa_icon": "far fa-check-circle",
    +            "description": "Base URL or local path to location of pipeline test dataset files",
    +            "default": "https://raw.githubusercontent.com/nf-core/test-datasets/",
    +            "hidden": true
            }
        }
     }
    diff --git a/pyproject.toml b/pyproject.toml
    deleted file mode 100644
    index 56110621..00000000
    --- a/pyproject.toml
    +++ /dev/null
    @@ -1,15 +0,0 @@
    -# Config file for Python. Mostly used to configure linting of bin/*.py with Ruff.
    -# Should be kept the same as nf-core/tools to avoid fighting with template synchronisation.
    -[tool.ruff]
    -line-length = 120
    -target-version = "py38"
    -cache-dir = "~/.cache/ruff"
    -
    -[tool.ruff.lint]
    -select = ["I", "E1", "E4", "E7", "E9", "F", "UP", "N"]
    -
    -[tool.ruff.lint.isort]
    -known-first-party = ["nf_core"]
    -
    -[tool.ruff.lint.per-file-ignores]
    -"__init__.py" = ["E402", "F401"]
    diff --git a/subworkflows/local/kmerfinder_subworkflow.nf b/subworkflows/local/kmerfinder_subworkflow.nf
    new file mode 100644
    index 00000000..ef777ebc
    --- /dev/null
    +++ b/subworkflows/local/kmerfinder_subworkflow.nf
    @@ -0,0 +1,83 @@
    +//
    +// Kmerfinder subworkflow for species identification & QC
    +//
    +include { UNTAR                   } from '../../modules/nf-core/untar/main'
    +include { KMERFINDER              } from '../../modules/local/kmerfinder'
    +include { KMERFINDER_SUMMARY      } from '../../modules/local/kmerfinder_summary'
    +include { FIND_DOWNLOAD_REFERENCE } from '../../modules/local/find_download_reference'
    +
    +workflow KMERFINDER_SUBWORKFLOW {
    +    take:
    +    reads       // channel: [ meta, reads ]
    +    consensus   // channel: [ meta, consensus ]
    +
    +    main:
    +    ch_versions = Channel.empty()
    +
    +    // Prepare the Kmerfinder database
    +    ch_kmerfinderdb           = file(params.kmerfinderdb, checkIfExists: true)
    +    ch_ncbi_assembly_metadata = file(params.ncbi_assembly_metadata, checkIfExists: true)
    +
    +    if ( ch_kmerfinderdb.name.endsWith('.gz') ) {
    +        UNTAR ( [[ id: ch_kmerfinderdb.getSimpleName() ], ch_kmerfinderdb] )
    +        ch_kmerfinderdb_untar = UNTAR.out.untar.map{ meta, file -> file }
    +        ch_versions = ch_versions.mix(UNTAR.out.versions)
    +    } else {
    +        ch_kmerfinderdb_untar = Channel.from(params.kmerfinderdb)
    +    }
    +
    +    // MODULE: Kmerfinder, QC for sample purity. Identifies the reference species and reference genome assembly for each sample.
    +    reads
    +        .combine(ch_kmerfinderdb_untar)
    +        .map{ meta, reads, db -> tuple(meta, reads, db) }
    +        .set{ ch_to_kmerfinder }
    +
    +    KMERFINDER (
    +        ch_to_kmerfinder
    +    )
    +    ch_kmerfinder_report = KMERFINDER.out.report
    +    ch_kmerfinder_json   = KMERFINDER.out.json
    +    ch_versions          = ch_versions.mix(KMERFINDER.out.versions)
    +
    +    // MODULE: Kmerfinder summary report. Generates a CSV report collecting the references of all samples.
    
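    +    // Note: the per-sample report files are gathered into a single list (via
    +    // collect() below), so the summary is generated once for the whole run.
    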
+ KMERFINDER_SUMMARY ( + ch_kmerfinder_report.map{ meta, report -> report }.collect() + ) + ch_summary_yaml = KMERFINDER_SUMMARY.out.yaml + ch_versions = ch_versions.mix(KMERFINDER_SUMMARY.out.versions) + + // SUBWORKFLOW: Create a channel to organize assemblies and reports based on the identified Kmerfinder reference. + ch_kmerfinder_json + .join(ch_kmerfinder_report, by:0) + .join(consensus, by:0) + .map{ + meta, report_json, report_txt, fasta -> + specie = report_json.splitJson(path:"kmerfinder.results.species_hits").value.get(0)["Species"] + return tuple(specie, meta, report_txt, fasta) + } + .groupTuple(by:0) // Group by the "Species" field + .set { ch_reports_byreference } + + // SUBWORKFLOW: For each species target, this subworkflow collects reference genome assemblies ('GCF*') and subsequently downloads the best matching reference assembly. + FIND_DOWNLOAD_REFERENCE ( + ch_reports_byreference.map{ specie, meta, report_txt, fasta-> tuple(specie, report_txt) }, + ch_ncbi_assembly_metadata + ) + ch_versions = ch_versions.mix(FIND_DOWNLOAD_REFERENCE.out.versions) + + // Organize sample assemblies into channels based on their corresponding reference files. + ch_reports_byreference + .join(FIND_DOWNLOAD_REFERENCE.out.fna) + .join(FIND_DOWNLOAD_REFERENCE.out.gff) + .join(FIND_DOWNLOAD_REFERENCE.out.winner) + .map { + specie, meta, report_txt, fasta, fna, gff, winner_id -> + return tuple([id: winner_id.getBaseName()], meta, fasta, fna, gff) + } + .set { ch_consensus_byrefseq } + + emit: + versions = ch_versions // channel: [ path(versions.yml) ] + summary_yaml = ch_summary_yaml // channel: [ path(kmerfinder_summary.yml) ] + consensus_byrefseq = ch_consensus_byrefseq // channel: [ refmeta, meta, fasta, fna, gff ] +} diff --git a/subworkflows/local/utils_nfcore_bacass_pipeline/main.nf b/subworkflows/local/utils_nfcore_bacass_pipeline/main.nf index 3d9063c4..b707cbcb 100644 --- a/subworkflows/local/utils_nfcore_bacass_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_bacass_pipeline/main.nf @@ -75,7 +75,7 @@ workflow PIPELINE_INITIALISATION { // // Custom validation for pipeline parameters // - //validateInputParameters() + validateInputParameters() // // Create channel from input file provided through params.input @@ -140,6 +140,10 @@ workflow PIPELINE_COMPLETION { imNotification(summary_params, hook_url) } } + + workflow.onError { + log.error "Pipeline failed. 
    Please refer to troubleshooting docs: https://nf-co.re/docs/usage/troubleshooting"
    +    }
     }
    
    /*
    @@ -152,6 +156,26 @@ workflow PIPELINE_COMPLETION {
     //
     def validateInputParameters() {
        // Add functions here for parameters validation
    +    // Check Kraken2 dependencies
    +    if (!params.skip_kraken2 && !params.kraken2db) {
    +        def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
    +            "  Kraken2 database not provided.\n" +
    +            "  Please specify the '--kraken2db' parameter to provide the necessary database.\n" +
    +            "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    +        error(error_string)
    +    }
    +
    +    // Check Kmerfinder dependencies
    +    if (!params.skip_kmerfinder) {
    +        if (!params.kmerfinderdb || !params.ncbi_assembly_metadata) {
    +            def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
    +                "  Kmerfinder database and NCBI assembly metadata not provided.\n" +
    +                "  Please specify the '--kmerfinderdb' and '--ncbi_assembly_metadata' parameters.\n" +
    +                "  Both are required to run Kmerfinder.\n" +
    +                "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    +            error(error_string)
    +        }
    +    }
     }
    
     //
    @@ -180,7 +204,8 @@ def toolCitationText() {
            "ProeChop (Wick RR et al. 2017)",
            "Nanoplot (Wouter De Coster and Rosa Rademakers 2023)",
            "PycoQC (Adrien Leger & Tommaso Leonardi 2019)",
    -        "Kreken2 (Derrick E. Wood et al. 2019)",
    +        "Kraken2 (Derrick E. Wood et al. 2019)",
    +        "Kmerfinder (Larsen et al. 2014)",
            "Unicycler (Ryan R Wick et al. 2017)",
            "Minimap & Miniasm (Heng Li 2016)",
            "Dragonflye (Robert A Petit III )",
    @@ -208,6 +233,7 @@ def toolBibliographyText() {
            "
  • Wouter De Coster, Rosa Rademakers, NanoPack2: population-scale evaluation of long-read sequencing data, Bioinformatics, Volume 39, Issue 5, May 2023, btad311, https://doi.org/10.1093/bioinformatics/btad311
  • ",
            "
  • Leger et al., (2019). pycoQC, interactive quality control for Oxford Nanopore Sequencing. Journal of Open Source Software, 4(34), 1236, https://doi.org/10.21105/joss.01236
  • ",
            "
  • Wood, D.E., Lu, J. & Langmead, B. Improved metagenomic analysis with Kraken 2. Genome Biol 20, 257 (2019). https://doi.org/10.1186/s13059-019-1891-0
  • ",
    +        "
  • Benchmarking of Methods for Genomic Taxonomy. Larsen MV, Cosentino S, Lukjancenko O, Saputra D, Rasmussen S, Hasman H, Sicheritz-Pontén T, Aarestrup FM, Ussery DW, Lund O. J Clin Microbiol. 2014 Feb 26.
  • ",
            "
  • Wick RR, Judd LM, Gorrie CL, Holt KE. Unicycler: Resolving bacterial genome assemblies from short and long sequencing reads. PLoS Comput Biol. 2017 Jun 8;13(6):e1005595. doi: 10.1371/journal.pcbi.1005595.
  • ",
            "
  • Heng Li, Minimap and miniasm: fast mapping and de novo assembly for noisy long sequences, Bioinformatics, Volume 32, Issue 14, July 2016, Pages 2103–2110, https://doi.org/10.1093/bioinformatics/btw152
  • ",
            "
  • Petit III, R. A. dragonflye: assemble bacterial isolate genomes from Nanopore reads (Version 1.1.2). https://github.com/rpetit3/dragonflye
  • ",
    @@ -232,8 +258,16 @@ def methodsDescriptionText(mqc_methods_yaml) {
        meta["manifest_map"] = workflow.manifest.toMap()
    
        // Pipeline DOI
    -    meta["doi_text"] = meta.manifest_map.doi ? "(doi: ${meta.manifest_map.doi})" : ""
    -    meta["nodoi_text"] = meta.manifest_map.doi ? "": "
  • If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used.
  • "
    +    if (meta.manifest_map.doi) {
    +        // Using a loop to handle multiple DOIs
    +        // Removing `https://doi.org/` to handle pipelines using DOIs vs DOI resolvers
    +        // Removing ` ` since the manifest.doi is a string and not a proper list
    +        def temp_doi_ref = ""
    +        String[] manifest_doi = meta.manifest_map.doi.tokenize(",")
    +        for (String doi_ref: manifest_doi) temp_doi_ref += "(doi: ${doi_ref.replace("https://doi.org/", "").replace(" ", "")}), "
    +        meta["doi_text"] = temp_doi_ref.substring(0, temp_doi_ref.length() - 2)
    +    } else meta["doi_text"] = ""
    +    meta["nodoi_text"] = meta.manifest_map.doi ? "" : "
  • If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used.
  • "
    
        // Tool references
        meta["tool_citations"] = ""
    diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf
    index a8b55d6f..14558c39 100644
    --- a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf
    +++ b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf
    @@ -65,9 +65,15 @@ def checkProfileProvided(nextflow_cli_args) {
     // Citation string for pipeline
     //
     def workflowCitation() {
    +    def temp_doi_ref = ""
    +    String[] manifest_doi = workflow.manifest.doi.tokenize(",")
    +    // Using a loop to handle multiple DOIs
    +    // Removing `https://doi.org/` to handle pipelines using DOIs vs DOI resolvers
    +    // Removing ` ` since the manifest.doi is a string and not a proper list
    +    for (String doi_ref: manifest_doi) temp_doi_ref += "  https://doi.org/${doi_ref.replace('https://doi.org/', '').replace(' ', '')}\n"
        return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" +
            "* The pipeline\n" +
    -        "  ${workflow.manifest.doi}\n\n" +
    +        temp_doi_ref + "\n" +
            "* The nf-core framework\n" +
            "  https://doi.org/10.1038/s41587-020-0439-x\n\n" +
            "* Software dependencies\n" +
    diff --git a/workflows/bacass.nf b/workflows/bacass.nf
    index ca644cc5..3ed2de6e 100644
    --- a/workflows/bacass.nf
    +++ b/workflows/bacass.nf
    @@ -8,14 +8,14 @@
     def checkPathParamList = [ params.input, params.multiqc_config, params.kraken2db, params.dfast_config ]
     for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
    
    -// Check krakendb
    -if(! params.skip_kraken2){
    -    if(params.kraken2db){
    -        kraken2db = file(params.kraken2db)
    -    } else {
    -        exit 1, "Missing Kraken2 DB arg"
    -    }
    -}
    +/*
    +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    +    CONFIG FILES
    +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    +*/
    +
    +// Place config files here
    +
    
     /*
     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    @@ -26,12 +26,13 @@ if(! 
    
    params.skip_kraken2){
     //
     // MODULE: Local to the pipeline
     //
    -include { PYCOQC                 } from '../modules/local/pycoqc/main'
    -include { UNICYCLER              } from '../modules/local/unicycler/main'
    -include { NANOPOLISH             } from '../modules/local/nanopolish/main'
    -include { MEDAKA                 } from '../modules/local/medaka/main'
    -include { KRAKEN2_DB_PREPARATION } from '../modules/local/kraken2_db_preparation/main'
    -include { DFAST                  } from '../modules/local/dfast/main'
    +include { PYCOQC                 } from '../modules/local/pycoqc'
    +include { UNICYCLER              } from '../modules/local/unicycler'
    +include { NANOPOLISH             } from '../modules/local/nanopolish'
    +include { MEDAKA                 } from '../modules/local/medaka'
    +include { KRAKEN2_DB_PREPARATION } from '../modules/local/kraken2_db_preparation'
    +include { DFAST                  } from '../modules/local/dfast'
    +include { MULTIQC_CUSTOM         } from '../modules/local/multiqc_custom'
    
     //
     // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
    @@ -60,14 +61,15 @@ include { SAMTOOLS_INDEX                } from '../modules/nf-core/samto
     include { KRAKEN2_KRAKEN2 as KRAKEN2      } from '../modules/nf-core/kraken2/kraken2/main'
     include { KRAKEN2_KRAKEN2 as KRAKEN2_LONG } from '../modules/nf-core/kraken2/kraken2/main'
     include { QUAST                           } from '../modules/nf-core/quast/main'
    +include { QUAST as QUAST_BYREFSEQID       } from '../modules/nf-core/quast/main'
     include { GUNZIP                          } from '../modules/nf-core/gunzip/main'
     include { PROKKA                          } from '../modules/nf-core/prokka/main'
    -include { MULTIQC                         } from '../modules/nf-core/multiqc/main'
    
     //
     // SUBWORKFLOWS: Consisting of a mix of local and nf-core/modules
     //
     include { FASTQ_TRIM_FASTP_FASTQC } from '../subworkflows/nf-core/fastq_trim_fastp_fastqc/main'
    +include { KMERFINDER_SUBWORKFLOW  } from '../subworkflows/local/kmerfinder_subworkflow'
     include { BAKTA_DBDOWNLOAD_RUN    } from '../subworkflows/local/bakta_dbdownload_run'
     include { paramsSummaryMap        } from 'plugin/nf-validation'
     include { paramsSummaryMultiqc    } from '../subworkflows/nf-core/utils_nfcore_pipeline'
    @@ -237,6 +239,7 @@ workflow BACASS {
    
        //
        // MODULE: Miniasm, genome assembly, long reads
    +    //
        if ( params.assembler == 'miniasm' ) {
            MINIMAP2_ALIGN (
                ch_for_assembly.map{ meta,sr,lr -> tuple(meta,lr) },
    @@ -315,7 +318,7 @@
            ch_for_polish    // tuple val(meta), val(reads), file(longreads), file(assembly)
                .join( MINIMAP2_POLISH.out.bam )  // tuple val(meta), file(bam)
                .join( SAMTOOLS_INDEX.out.bai )   // tuple val(meta), file(bai)
    -            .join( ch_fast5 )                // tuple val(meta), file(fast5)
    +            .join( ch_fast5 )                 // tuple val(meta), file(fast5)
                .set { ch_for_nanopolish }    // tuple val(meta), val(reads), file(longreads), file(assembly), file(bam), file(bai), file(fast5)
    
            // TODO: 'nanopolish index' couldn't be tested. No fast5 provided in test datasets.
    @@ -345,7 +348,7 @@
        ch_kraken_long_multiqc  = Channel.empty()
        if ( !params.skip_kraken2 ) {
            KRAKEN2_DB_PREPARATION (
    -            kraken2db
    +            params.kraken2db
            )
            ch_versions = ch_versions.mix(KRAKEN2_DB_PREPARATION.out.versions)
            KRAKEN2 (
    @@ -374,21 +377,69 @@
            ch_versions = ch_versions.mix(KRAKEN2_LONG.out.versions)
        }
    
    +    //
    +    // SUBWORKFLOW: Kmerfinder, QC for sample purity.
    +    //
    +    // Runs Kmerfinder on the reads and groups samples by the reference genome it identifies (via KMERFINDER_SUBWORKFLOW).
    
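    +    // Flow, for orientation: trimmed reads -> KMERFINDER per sample -> group
    +    // samples by best-hit species -> FIND_DOWNLOAD_REFERENCE fetches the best
    +    // matching RefSeq assembly -> QUAST runs per reference (QUAST_BYREFSEQID).
    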
    +
    +    ch_kmerfinder_multiqc = Channel.empty()
    +    if (!params.skip_kmerfinder) {
    +        // Set the Kmerfinder input channel based on assembly type
    +        if( params.assembly_type == 'short' || params.assembly_type == 'hybrid' ) {
    +            ch_for_kmerfinder = FASTQ_TRIM_FASTP_FASTQC.out.reads
    +        } else if ( params.assembly_type == 'long' ) {
    +            ch_for_kmerfinder = PORECHOP_PORECHOP.out.reads
    +        }
    +        // Run the Kmerfinder subworkflow
    +        KMERFINDER_SUBWORKFLOW (
    +            ch_for_kmerfinder,
    +            ch_assembly
    +        )
    +        ch_kmerfinder_multiqc = KMERFINDER_SUBWORKFLOW.out.summary_yaml
    +        ch_consensus_byrefseq = KMERFINDER_SUBWORKFLOW.out.consensus_byrefseq
    +        ch_versions           = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions)
    +
    +        // Set up the channel to run QUAST per RefSeq reference genome identified with Kmerfinder.
    +        ch_consensus_byrefseq
    +            .map {
    +                refmeta, meta, consensus, ref_fna, ref_gff ->
    +                    return tuple(refmeta, consensus.flatten(), ref_fna, ref_gff)
    +            }
    +            .set { ch_to_quast_byrefseq }
    +    }
    +
     //
     // MODULE: QUAST, assembly QC
     //
        ch_assembly
    -        .collect{ it[1] }
    -        .map { consensus_collect -> tuple([id: "report"], consensus_collect) }
    -        .set { ch_to_quast }
    -
    -    QUAST (
    -        ch_to_quast,
    -        [[:],[]],
    -        [[:],[]]
    -    )
    -    ch_quast_multiqc = QUAST.out.tsv
    -    ch_versions = ch_versions.mix(QUAST.out.versions)
    +        .collect{it[1]}
    +        .map{ consensus -> tuple([id:'report'], consensus) }
    +        .set{ ch_to_quast }
    +
    +    if(params.skip_kmerfinder){
    +        QUAST(
    +            ch_to_quast,
    +            params.reference_fasta ?: [[:],[]],
    +            params.reference_gff ?: [[:],[]]
    +        )
    +        ch_quast_multiqc = QUAST.out.results
    +    } else if (!params.skip_kmerfinder) {
    +        // QUAST runs twice when Kmerfinder is enabled.
    +        // This approach allows QUAST to calculate relevant metrics such as genome fraction against the identified reference genome.
    +        QUAST(
    +            ch_to_quast,
    +            [[:],[]],
    +            [[:],[]]
    +        )
    +        QUAST_BYREFSEQID(
    +            ch_to_quast_byrefseq.map{ refmeta, consensus, ref_fasta, ref_gff -> tuple( refmeta, consensus)},
    +            ch_to_quast_byrefseq.map{ refmeta, consensus, ref_fasta, ref_gff -> tuple( refmeta, ref_fasta)},
    +            ch_to_quast_byrefseq.map{ refmeta, consensus, ref_fasta, ref_gff -> tuple( refmeta, ref_gff)}
    +        )
    +        ch_quast_multiqc = QUAST_BYREFSEQID.out.results
    +        ch_versions = ch_versions.mix(QUAST_BYREFSEQID.out.versions)
    +    }
    +    ch_versions = ch_versions.mix(QUAST.out.versions)
    
        // Check assemblies that require further processing for gene annotation
        ch_assembly
    @@ -413,7 +464,7 @@
            [],
            []
        )
    -    ch_prokka_txt_multiqc = PROKKA.out.txt.collect()
    +    ch_prokka_txt_multiqc = PROKKA.out.txt.map{ meta, prokka_txt -> [ prokka_txt ]}
        ch_versions = ch_versions.mix(PROKKA.out.versions)
     }
    
    @@ -432,7 +483,7 @@
            params.baktadb,
            params.baktadb_download
        )
    -    ch_bakta_txt_multiqc = BAKTA_DBDOWNLOAD_RUN.out.bakta_txt_multiqc.collect()
    +    ch_bakta_txt_multiqc = BAKTA_DBDOWNLOAD_RUN.out.bakta_txt_multiqc.map{ meta, bakta_txt -> [ bakta_txt ]}
        ch_versions = ch_versions.mix(BAKTA_DBDOWNLOAD_RUN.out.versions)
     }
     //
    @@ -451,46 +502,47 @@
        // Collate and save software versions
        //
        softwareVersionsToYAML(ch_versions)
    -        .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_pipeline_software_mqc_versions.yml', sort: true, newLine: true)
    -        .set { ch_collated_versions }
    +        .collectFile(
    +            storeDir: "${params.outdir}/pipeline_info",
    +            name: 'nf_core_pipeline_software_mqc_versions.yml',
    +            sort: true,
    +            newLine: true
    +        ).set { ch_collated_versions }
    
        //
        // MODULE: MultiQC
        //
    -    ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true)
    +    ch_multiqc_config = 
    
!params.skip_kmerfinder && params.assembly_type ? Channel.fromPath("$projectDir/assets/multiqc_config_${params.assembly_type}.yml", checkIfExists: true) : Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath(params.multiqc_logo, checkIfExists: true) : Channel.empty() summary_params = paramsSummaryMap(workflow, parameters_schema: "nextflow_schema.json") ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params)) - ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - - ch_methods_description = Channel.value(methodsDescriptionText(ch_multiqc_custom_methods_description)) - ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) - ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml', sort: false)) - ch_multiqc_files = ch_multiqc_files.mix(ch_fastqc_raw_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_fastqc_trim_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_trim_json_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_kraken_short_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_kraken_long_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_quast_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_prokka_txt_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_bakta_txt_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_nanoplot_txt_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_porechop_log_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_pycoqc_multiqc.collect{it[1]}.ifEmpty([])) - - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config, - ch_multiqc_custom_config.collect().ifEmpty([]), - ch_multiqc_logo.collect().ifEmpty([]) + ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
Channel.fromPath(params.multiqc_methods_description, checkIfExists: true) : Channel.fromPath("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + + MULTIQC_CUSTOM ( + ch_multiqc_config.ifEmpty([]), + ch_multiqc_custom_config.ifEmpty([]), + ch_multiqc_logo.ifEmpty([]), + ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'), + ch_multiqc_custom_methods_description.ifEmpty([]), + ch_collated_versions.ifEmpty([]), + ch_fastqc_raw_multiqc.collect{it[1]}.ifEmpty([]), + ch_trim_json_multiqc.collect{it[1]}.ifEmpty([]), + ch_nanoplot_txt_multiqc.collect{it[1]}.ifEmpty([]), + ch_porechop_log_multiqc.collect{it[1]}.ifEmpty([]), + ch_pycoqc_multiqc.collect{it[1]}.ifEmpty([]), + ch_kraken_short_multiqc.collect{it[1]}.ifEmpty([]), + ch_kraken_long_multiqc.collect{it[1]}.ifEmpty([]), + ch_quast_multiqc.collect{it[1]}.ifEmpty([]), + ch_prokka_txt_multiqc.collect().ifEmpty([]), + ch_bakta_txt_multiqc.collect().ifEmpty([]), + ch_kmerfinder_multiqc.collectFile(name: 'multiqc_kmerfinder.yaml').ifEmpty([]), ) - multiqc_report = MULTIQC.out.report.toList() + multiqc_report = MULTIQC_CUSTOM.out.report.toList() emit: - multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html - versions = ch_versions // channel: [ path(versions.yml) ] + multiqc_report = MULTIQC_CUSTOM.out.report.toList() // channel: /path/to/multiqc_report.html + versions = ch_versions // channel: [ path(versions.yml) ] } /*