diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 00000000..2b78eccc --- /dev/null +++ b/.github/CONTRIBUTING.md @@ -0,0 +1,47 @@ +# nf-core/smrnaseq: Contributing Guidelines + +Hi there! Many thanks for taking an interest in improving nf-core/smrnaseq. + +We try to manage the required tasks for nf-core/smrnaseq using GitHub issues, you probably came to this page when creating one. Please use the pre-filled template to save time. + +However, don't be put off by this template - other more general issues and suggestions are welcome! Contributions to the code are even more welcome ;) + +> If you need help using or modifying nf-core/smrnaseq then the best place to ask is on the pipeline channel on [Slack](https://nf-core-invite.herokuapp.com/). + + + +## Contribution workflow +If you'd like to write some code for nf-core/smrnaseq, the standard workflow +is as follows: + +1. Check that there isn't already an issue about your idea in the + [nf-core/smrnaseq issues](https://github.com/nf-core/smrnaseq/issues) to avoid + duplicating work. + * If there isn't one already, please create one so that others know you're working on this +2. Fork the [nf-core/smrnaseq repository](https://github.com/nf-core/smrnaseq) to your GitHub account +3. Make the necessary changes / additions within your forked repository +4. Submit a Pull Request against the `dev` branch and wait for the code to be reviewed and merged. + +If you're not used to this workflow with git, you can start with some [basic docs from GitHub](https://help.github.com/articles/fork-a-repo/) or even their [excellent interactive tutorial](https://try.github.io/). + + +## Tests +When you create a pull request with changes, [Travis CI](https://travis-ci.org/) will run automatic tests. +Typically, pull-requests are only fully reviewed when these tests are passing, though of course we can help out before then. 
+ +There are typically two types of tests that run: + +### Lint Tests +nf-core has a [set of guidelines](http://nf-co.re/guidelines) which all pipelines must adhere to. +To enforce these and ensure that all pipelines stay in sync, we have developed a helper tool which runs checks on the pipeline code. This is in the [nf-core/tools repository](https://github.com/nf-core/tools) and once installed can be run locally with the `nf-core lint <pipeline-directory>` command. + +If any failures or warnings are encountered, please follow the listed URL for more documentation. + +### Pipeline Tests +Each nf-core pipeline should be set up with a minimal set of test-data. +Travis CI then runs the pipeline on this data to ensure that it exits successfully. +If there are any failures then the automated tests fail. +These tests are run both with the latest available version of Nextflow and also the minimum required version that is stated in the pipeline code. + +## Getting help +For further information/help, please consult the [nf-core/smrnaseq documentation](https://github.com/nf-core/smrnaseq#documentation) and don't hesitate to get in touch on the pipeline channel on [Slack](https://nf-core-invite.herokuapp.com/). diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..73f22bdd --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,31 @@ +Hi there! + +Thanks for telling us about a problem with the pipeline. Please delete this text and anything that's not relevant from the template below: + +#### Describe the bug +A clear and concise description of what the bug is. + +#### Steps to reproduce +Steps to reproduce the behaviour: +1. Command line: `nextflow run ...` +2. See error: _Please provide your error message_ + +#### Expected behaviour +A clear and concise description of what you expected to happen. + +#### System: + - Hardware: [e.g. HPC, Desktop, Cloud...] + - Executor: [e.g. slurm, local, awsbatch...] 
+ - OS: [e.g. CentOS Linux, macOS, Linux Mint...] + - Version: [e.g. 7, 10.13.6, 18.3...] + +#### Nextflow Installation: + - Version: [e.g. 0.31.0] + +#### Container engine: + - Engine: [e.g. Conda, Docker or Singularity] + - Version: [e.g. 1.0.0] + - Image tag: [e.g. nfcore/smrnaseq:1.0.0] + +#### Additional context +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 00000000..1f025b77 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,16 @@ +Hi there! + +Thanks for suggesting a new feature for the pipeline! Please delete this text and anything that's not relevant from the template below: + +#### Is your feature request related to a problem? Please describe. +A clear and concise description of what the problem is. +Ex. I'm always frustrated when [...] + +#### Describe the solution you'd like +A clear and concise description of what you want to happen. + +#### Describe alternatives you've considered +A clear and concise description of any alternative solutions or features you've considered. + +#### Additional context +Add any other context about the feature request here. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 00000000..6e48fa56 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,15 @@ +Many thanks for contributing to nf-core/smrnaseq! + +Please fill in the appropriate checklist below (delete whatever is not relevant). These are the most common things requested on pull requests (PRs). + +## PR checklist + - [ ] This comment contains a description of changes (with reason) + - [ ] If you've fixed a bug or added code that should be tested, add tests! 
+ - [ ] If necessary, also make a PR on the [nf-core/smrnaseq branch on the nf-core/test-datasets repo]( https://github.com/nf-core/test-datasets/pull/new/nf-core/smrnaseq) + - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). + - [ ] Make sure your code lints (`nf-core lint .`). + - [ ] Documentation in `docs` is updated + - [ ] `CHANGELOG.md` is updated + - [ ] `README.md` is updated + +**Learn more about contributing:** https://github.com/nf-core/smrnaseq/tree/master/.github/CONTRIBUTING.md diff --git a/.github/markdownlint.yml b/.github/markdownlint.yml new file mode 100644 index 00000000..e052a635 --- /dev/null +++ b/.github/markdownlint.yml @@ -0,0 +1,9 @@ +# Markdownlint configuration file +default: true, +line-length: false +no-multiple-blanks: 0 +blanks-around-headers: false +blanks-around-lists: false +header-increment: false +no-duplicate-header: + siblings_only: true diff --git a/.gitignore b/.gitignore index 46f69e41..5b54e3e6 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ data/ results/ .DS_Store tests/test_data +*.pyc diff --git a/.travis.yml b/.travis.yml index 35e3d9d3..287bb2ba 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,38 +1,42 @@ sudo: required -language: java +language: python jdk: openjdk8 -services: - - docker -python: - - "2.7" +services: docker +python: '3.6' cache: pip matrix: fast_finish: true before_install: + # PRs to master are only ok if coming from dev branch + - '[ $TRAVIS_PULL_REQUEST = "false" ] || [ $TRAVIS_BRANCH != "master" ] || ([ $TRAVIS_PULL_REQUEST_SLUG = $TRAVIS_REPO_SLUG ] && [ $TRAVIS_PULL_REQUEST_BRANCH = "dev" ])' # Pull the docker image first so the test doesn't wait for this - - docker pull nfcore/smrnaseq:latest + - docker pull nfcore/smrnaseq:dev + # Fake the tag locally so that the pipeline runs properly + # Looks weird when this is :dev to :dev, but makes sense when testing code for a release (:dev to :1.0.1) + - docker tag nfcore/smrnaseq:dev nfcore/smrnaseq:1.0.0 install: 
# Install Nextflow - - mkdir /tmp/nextflow - - cd /tmp/nextflow + - mkdir /tmp/nextflow && cd /tmp/nextflow - wget -qO- get.nextflow.io | bash - sudo ln -s /tmp/nextflow/nextflow /usr/local/bin/nextflow # Install nf-core/tools - - git clone https://github.com/nf-core/tools.git /tmp/nf-core-tools - - cd /tmp/nf-core-tools - - pip install --user -e . + - pip install --upgrade pip + - pip install nf-core # Reset - - mkdir ${TRAVIS_BUILD_DIR}/tests - - cd ${TRAVIS_BUILD_DIR}/tests + - mkdir ${TRAVIS_BUILD_DIR}/tests && cd ${TRAVIS_BUILD_DIR}/tests + # Install markdownlint-cli + - sudo apt-get install npm && npm install -g markdownlint-cli env: - - NXF_VER=0.30.2 - - '' + - NXF_VER='0.32.0' # Specify a minimum NF version that should be tested and work + - NXF_VER='' # Plus: get the latest NF version and check that it works script: # Lint the pipeline code - nf-core lint ${TRAVIS_BUILD_DIR} - # Run the pipeline + # Lint the documentation + - markdownlint ${TRAVIS_BUILD_DIR} -c ${TRAVIS_BUILD_DIR}/.github/markdownlint.yml + # Run the pipeline with the test profile - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker diff --git a/CHANGELOG.md b/CHANGELOG.md index 85ec625c..ac0e756c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,43 @@ -# NGI-smRNAseq +# nf-core/smrnaseq: Changelog -## [0.1dev](https://github.com/SciLifeLab/NGI-smRNAseq/releases/tag/0.1dev) - 2018-05-14 -* Change bowtie parameters from `bowtie -n x -l 15 -k 10 --best (x=0 for mature and x=1 for hairpin)` into `bowtie -k 1 -m 1 --best --strata`. 
+## v1.0.0 - 2019-09-19 +* Add figures to output documentation +* Add samtools stats for genome alignments +* Add seqkit and remove razers +* Add mirtop and razers tools +* Adapt code and docs to [nf-core](http://nf-co.re/) template +* Update tools and Dockerfile/Singularity to match current template + +#### Dependency Updates +* openjdk 8.0.144 -> 11.0.1 +* fastqc 0.11.7 -> 0.11.8 +* trim-galore 0.5.0 -> 0.6.2 +* bioconductor-edger 3.20.7 -> 3.26.0 +* bioconductor-limma 3.34.9 -> 3.40.0 +* conda-forge::r-data.table 1.11.4 -> 1.12.2 +* conda-forge::r-gplots 3.0.1 -> 3.0.1.1 +* conda-forge::r-r.methodss3 1.7.1 -> 1.7.1 +* htseq 0.9.1 -> 0.11.2 +* r-markdown 0.9 +* Added mirtop 0.4.18a +* Removed razers3 3.5.3 +* Added seqkit 0.10.1-1 +* Added seqcluster 1.2.5 +* conda-forge::r-base=3.5.1 -> 3.6.1 +* conda-forge::r-statmod=1.4.30 -> 1.4.32 +* conda-forge::r-markdown=0.9 -> 1.0 +* trim-galore=0.6.2 -> 0.6.3 +* mirtop=0.4.18a -> 0.4.22 +* bioconductor-edger=3.26.0 -> 3.26.5 +* bioconductor-limma=3.40.0 -> 3.40.2 + +## [1.0](https://github.com/nf-core/smrnaseq/releases/tag/1.0) - 2019-01-10 +* Add "protocol" with pre-defined settings +* Add miRTrace in the pipeline. +* Software updates: multiqc 1.6 to 1.7. + +## [1.0](https://github.com/nf-core/smrnaseq/releases/tag/1.0) - 2018-08-06 +* Switch from SciLifeLab/NGI-smRNAseq to nf-core/smrnaseq. +* Use Bowtie 1 instead of Bowtie 2 for the alignment to host reference genome. +* Add option for sequencing centre in BAM file. +* Software updates: trim-galore 0.4.5 to 0.5.0; samtools 1.8 to 1.9; multiqc 1.5 to 1.6. 
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..09226d0d --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,46 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 
+ +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team on [Slack](https://nf-core-invite.herokuapp.com/). The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/4/ diff --git a/Dockerfile b/Dockerfile index fa25d24a..66b720e1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ FROM nfcore/base -MAINTAINER Alexander Peltzer -LABEL authors="alex.peltzer@gmail.com" \ - description="Docker image containing all requirements for nf-core/smrnaseq pipeline" +LABEL authors="Alexander Peltzer " \ + description="Docker image containing all requirements for nf-core/smrnaseq pipeline" COPY environment.yml / -RUN conda env update -n root -f /environment.yml && conda clean -a \ No newline at end of file +RUN conda env create -f /environment.yml && conda clean -a +ENV PATH /opt/conda/envs/nf-core-smrnaseq-1.0.0/bin:$PATH diff --git a/LICENSE.md b/LICENSE similarity index 94% rename from LICENSE.md rename to LICENSE index 8fa6f918..9c60c99c 100644 --- a/LICENSE.md +++ b/LICENSE @@ -1,6 +1,6 @@ -The MIT License (MIT) +MIT License -Copyright (c) 2016 chuan-wang +Copyright (c) Phil Ewels Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index d0d77430..f65853d6 100644 --- a/README.md +++ b/README.md @@ -1,226 +1,55 @@ # ![nf-core/smrnaseq](docs/images/smrnaseq_logo.png) -[![Build Status](https://travis-ci.org/nf-core/smrnaseq.svg?branch=master)](https://travis-ci.org/nf-core/smrnaseq) -[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A50.30.2-brightgreen.svg)](https://www.nextflow.io/) -[![Gitter](https://img.shields.io/badge/gitter-%20join%20chat%20%E2%86%92-4fb99a.svg)](https://gitter.im/nf-core/Lobby) +[![Build Status](https://travis-ci.com/nf-core/smrnaseq.svg?branch=master)](https://travis-ci.com/nf-core/smrnaseq) 
+[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A50.32.0-brightgreen.svg)](https://www.nextflow.io/) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](http://bioconda.github.io/) -[![Docker Container available](https://img.shields.io/docker/automated/nfcore/smrnaseq.svg)](https://hub.docker.com/r/nfcore/smrnaseq/) -[![https://www.singularity-hub.org/static/img/hosted-singularity--hub-%23e32929.svg](https://www.singularity-hub.org/static/img/hosted-singularity--hub-%23e32929.svg)](https://singularity-hub.org/collections/1250) - - ----- - -# UNDER DEVELOPMENT! -This pipeline has recently been moved to nf-core and is still under heavy development. It does not yet meet all of the requirements for nf-core pipelines. - -Use with caution! - ----- - -**nf-core/smrnaseq** is a bioinformatics best-practice analysis pipeline used for small RNA sequencing data at the [National Genomics Infastructure](https://ngisweden.scilifelab.se/) -at [SciLifeLab Stockholm](https://www.scilifelab.se/platforms/ngi/), Sweden. - -The pipeline uses [Nextflow](https://www.nextflow.io), a bioinformatics workflow tool. It pre-processes raw data from FastQ inputs, aligns the reads and performs extensive quality-control on the results. - -This pipeline is primarily used with a SLURM cluster on the Swedish [UPPMAX systems](https://www.uppmax.uu.se). However, the pipeline should be able to run on any system that Nextflow supports. We have done some limited testing using Docker and AWS, and the pipeline comes with some configuration for these systems. See the [installation docs](docs/installation.md) for more information. - -## Installation -### NextFlow installation -See https://github.com/SciLifeLab/NGI-NextflowDocs for instructions on how to install and configure -Nextflow. 
- -### Pipeline installation -This pipeline itself needs no installation - NextFlow will automatically fetch it from GitHub when run if -`nf-core/smrnaseq` is specified as the pipeline name. - -If you prefer, you can download the files yourself from GitHub and run them directly: -``` -git clone https://github.com/nf-core/smrnaseq.git -nextflow run nf-core/smrnaseq/main.nf -``` - -### Installation of the 'ngi_visualizations' module -This module needs to be installed locally in order to visualize the statistics from Bowtie2 alignment. -``` -pip install -U git+https://github.com/NationalGenomicsInfrastructure/ngi_visualizations.git -``` -Note that for ngi_visualizations, python packages HTSeq and pysam are required. - -### Installation of the NGI plugin for the'MultiQC' module -``` -pip install git+https://github.com/ewels/MultiQC_NGI.git -``` - -## Configuration -By default, the pipeline is configured to run on the Swedish UPPMAX cluster (milou / irma). - -You will need to specify your UPPMAX project ID when running a pipeline. To do this, use -the command line flag `--project `. - -To avoid having to specify this every time you run Nextflow, you can add it to your -personal Nextflow config file instead. Add this line to `~/.nextflow/config`: - -```nextflow -params.project = 'project_ID' -``` - -The pipeline will exit with an error message if you try to run it pipeline with the default -UPPMAX config profile and don't set project. - - -### Running on other clusters -It is entirely possible to run this pipeline on other clusters, though you will need to set up -your own config file so that the script knows where to find your reference files and how your -cluster works. - -Copy the contents of [`conf/uppmax.config`](conf/uppmax.config) to your own config file somewhere -and then reference it with `-c` when running the pipeline. - -If you think that there are other people using the pipeline who would benefit from your configuration -(eg. 
other common cluster setups), please let us know. It should be easy to create a new config file -in `conf` and reference this as a named profile in [`nextflow.config`](nextflow.config). Then these -configuration options can be used by specifying `-profile ` when running the pipeline. - - -## Running the pipeline -The typical command for running the pipeline is as follows: - -``` -nextflow run nf-core/smrnaseq --reads '*.fastq.gz' -``` - -**NOTE! Paired-end data is NOT supported by this pipeline!** -For paired-end data, use Read 1 only. For instance: - -``` -nextflow run nf-core/smrnaseq --reads '*.R1.fastq.gz' -``` - -Note that the pipeline will create files in your working directory: -```bash -work # Directory containing the nextflow working files -results # Finished results for each sample, one directory per pipeline step -.nextflow_log # Log file from Nextflow -# Other nextflow hidden files, eg. history of pipeline runs and old logs. -``` - -## Mandatory parameters -### `--reads` -Location of the input FastQ files: -``` - --reads 'path/to/data/*.fastq.gz' -``` -**NOTE! Must be enclosed in quotes!** -If left unspecified, the pipeline will assume that the data is in a directory called `data` in the working directory. - -### `--genome` -The reference genome to use of the analysis, needs to be one of the genome specified in the config file. -The human `GRCh37` genome is used by default. 
-``` ---genome 'GRCh37' -``` - -### Supported genomes - -| Parameter | Latin Name | Common Name | -| :------------ |:-------------------------------- |:------------------ | -| AGPv3 | *Zea mays* | Maize | -| BDGP6 | *Drosophila melanogaster* | Fruit fly | -| CanFam3.1 | *Canis familiaris* | Dog | -| CHIMP2.1.4 | *Pan troglodytes* | Chimpanze | -| EquCab2 | *Equus caballus* | Horse | -| Galgal4 | *Gallus gallus* | Chicken | -| Gm01 | *Glycine max* | Soybean | -| GRCh37 | *Homo sapiens* | Human | -| GRCm38 | *Mus musculus* | Mouse | -| GRCz10 | *Danio rerio* | Zebrafish | -| IRGSP-1.0 | *Oryza sativa japonica* | Rice | -| Mmul_1 | *Macaca mulatta* | Macaque | -| Rnor_6.0 | *Rattus norvegicus* | Rat | -| Sbi1 | *Sorghum bicolor* | Great millet | -| Sscrofa10.2 | *Sus scrofa* | Pig | -| TAIR10 | *Arabidopsis thaliana* | Thale cress | -| UMD3.1 | *Bos taurus* | Cow | -| WBcel235 | *Caenorhabditis elegans* | Nematode | - -**NOTE! With the option --genome 'ALL', the entire dataset of mature miRNAs and hairpins in miRBase will be used as reference regardless of species. Meanwhile the alignment against host reference genome will be skipped.** - - -## Other command line parameters -### `--outdir` -The output directory where the results will be saved. - -### `--email` -Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to speicfy this on the command line for every run. - -### `--plaintext_email` -Set to receive plain-text e-mails instead of HTML formatted. - -### `-name` -Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. - -This is used in the MultiQC report (if not default) and in the summary HTML / e-mail (always). - -**NB:** Single hyphen (core Nextflow option) - -### `-resume` -Specify this when restarting a pipeline. 
Nextflow will used cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously. - -You can also supply a run name to resume a specific run: `-resume [run-name]`. Use the `nextflow log` command to show previous run names. - -**NB:** Single hyphen (core Nextflow option) - -### `-c` -Specify the path to a specific config file (this is a core NextFlow command). Useful if using different UPPMAX -projects or different sets of reference genomes. **NOTE! One hyphen only (core Nextflow parameter).** - -**NB:** Single hyphen (core Nextflow option) - -Note - you can use this to override defaults. For example, we run on UPPMAX but don't want to use the MultiQC -environment module as is the default. So we specify a config file using `-c` that contains the following: - -```nextflow -process.$multiqc.module = [] -``` - -### `--bt2index` -If you prefer, you can specify the full path to your reference genome when you run the pipeline: -``` ---bt2index [path to Bowtie2 index] -``` - -### `--rlocation` -Some steps in the pipeline run R with required modules. By default, the pipeline will install -these modules to `~/R/nxtflow_libs/` if not present. You can specify what path to use with this -command line flag. - -### Trimming options -`--length [int]`: Discard reads that became shorter than length [int] because of either quality or adapter trimming. Default: 18 -`--clip_R1 [int]`: Instructs Trim Galore to remove bp from the 5' end of read 1 -`--three_prime_clip_R1 [int]`: Instructs Trim Galore to remove bp from the 3' end of read 1 AFTER adapter/quality trimming has been performed - -### `--saveReference` -Supply this parameter to save any generated reference genome files to your results folder. These can then be used for future pipeline runs, reducing processing times. - -### `--multiqc_config` -If you would like to supply a custom config file to MultiQC, you can specify a path with `--multiqc_config`. 
This is used instead of the config file specific to the pipeline. - -### `--clusterOptions` -Submit arbitrary SLURM options (UPPMAX profile only). For instance, you could use `--clusterOptions '-p devcore'` -to run on the development node (though won't work with default process time requests). - -## Stand-alone scripts -The `bin` directory contains some scripts used by the pipeline which may also be run manually: - -* `edgeR_miRBase.r` - * R script using for processing reads counts of mature miRNAs and miRNA precursors (hairpins). - +[![Docker](https://img.shields.io/docker/automated/nfcore/smrnaseq.svg)](https://hub.docker.com/r/nfcore/smrnaseq) + +## Introduction +**nf-core/smrnaseq** is a bioinformatics best-practice analysis pipeline used for small RNA sequencing data. + +The pipeline is built using Nextflow, a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker containers making installation trivial and results highly reproducible. + +### Pipeline summary + + +1. Raw read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) +2. Adapter trimming ([`Trim Galore!`](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/)) + 1. Insert Size calculation + 2. Collapse reads ([`seqcluster`](https://seqcluster.readthedocs.io/mirna_annotation.html#processing-of-reads)) +3. Alignment against miRBase mature miRNA ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) +4. Alignment against miRBase hairpin + 1. Unaligned reads from step 3 ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) + 2. Collapsed reads from step 2.2 ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) +5. Post-alignment processing of miRBase hairpin + 1. Basic statistics from step 3 and step 4.1 ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/)) + 2. 
Analysis on miRBase hairpin counts ([`edgeR`](https://bioconductor.org/packages/release/bioc/html/edgeR.html)) + * TMM normalization and a table of the top expressed hairpins + * MDS plot clustering samples + * Heatmap of sample similarities + 3. miRNA and isomiR annotation from step 4.1 ([`mirtop`](https://github.com/miRTop/mirtop)) +6. Alignment against host reference genome ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) + 1. Post-alignment processing of alignment against host reference genome ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/)) +7. miRNA quality control ([`mirtrace`](https://github.com/friedlanderlab/mirtrace)) +8. Present QC for raw reads, alignment, and expression results ([`MultiQC`](http://multiqc.info/)) + + +### Documentation +The nf-core/smrnaseq pipeline comes with documentation about the pipeline, found in the `docs/` directory: + +1. [Installation](https://nf-co.re/usage/installation) +2. Pipeline configuration + * [Local installation](https://nf-co.re/usage/local_installation) + * [Adding your own system config](https://nf-co.re/usage/adding_own_config) + * [Reference genomes](https://nf-co.re/usage/reference_genomes) +3. [Running the pipeline](docs/usage.md) +4. [Output and how to interpret the results](docs/output.md) +5. [Troubleshooting](https://nf-co.re/usage/troubleshooting) ## Credits -These scripts were written for use at the [National Genomics Infrastructure](https://portal.scilifelab.se/genomics/) -at [SciLifeLab](http://www.scilifelab.se/) in Stockholm, Sweden. - -Written by Phil Ewels (@ewels), Chuan Wang (@chuan-wang) and Rickard Hammarén (@Hammarn) +nf-core/smrnaseq was originally written for use at the [National Genomics Infrastructure](https://portal.scilifelab.se/genomics/) at [SciLifeLab](http://www.scilifelab.se/) in Stockholm, Sweden, by Phil Ewels (@ewels), Chuan Wang (@chuan-wang) and Rickard Hammarén (@Hammarn). Updated by Lorena Pantano (@lpantano) from MIT. -

+## Citation +You can cite the `nf-core` pre-print as follows: +Ewels PA, Peltzer A, Fillinger S, Alneberg JA, Patel H, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. **nf-core: Community curated bioinformatics pipelines**. *bioRxiv*. 2019. p. 610741. [doi: 10.1101/610741](https://www.biorxiv.org/content/10.1101/610741v1). diff --git a/Singularity b/Singularity deleted file mode 100644 index b1ea318e..00000000 --- a/Singularity +++ /dev/null @@ -1,14 +0,0 @@ -From:nfcore/base -Bootstrap:docker - -%labels - MAINTAINER Alexander Peltzer - DESCRIPTION Container image containing all requirements for the nf-core/smrnaseq pipeline - VERSION 0.2dev - -%files - environment.yml / - -%post - /opt/conda/bin/conda env update -n root -f /environment.yml - /opt/conda/bin/conda clean -a diff --git a/assets/NGI_logo.png b/assets/NGI_logo.png deleted file mode 100644 index 988c0302..00000000 Binary files a/assets/NGI_logo.png and /dev/null differ diff --git a/assets/SciLifeLab_logo.png b/assets/SciLifeLab_logo.png deleted file mode 100644 index c821e78b..00000000 Binary files a/assets/SciLifeLab_logo.png and /dev/null differ diff --git a/assets/email_template.html b/assets/email_template.html index 51615d2c..ee75a803 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -5,24 +5,16 @@ - + nf-core/smrnaseq Pipeline Report
- - -

nf-core/smrnaseq: smRNA-Seq Best Practice v${version}

+

nf-core/smrnaseq v${version}

Run Name: $runName

-<% if (success){ - out << """ -
- nf-core/smrnaseq execution completed successfully! -
- """ -} else { +<% if (!success){ out << """

nf-core/smrnaseq execution completed unsuccessfully!

@@ -31,6 +23,12 @@

nf-core/smrnaseq execution completed u
${errorReport}

""" +} else { + out << """ +
+ nf-core/smrnaseq execution completed successfully! +
+ """ } %> @@ -45,14 +43,8 @@

Pipeline Configuration:

-

nf-core/smrnaseq is a bioinformatics best-practice analysis pipeline used for small RNA sequencing data at the National Genomics Infrastructure at SciLifeLab Stockholm, Sweden.

-

The pipeline uses Nextflow, a bioinformatics workflow tool. It pre-processes raw data from FastQ inputs, aligns the reads and performs extensive quality-control on the results.

-

For more information, please see the pipeline homepage: https://github.com/nf-core/smrnaseq

- -
- - - +

nf-core/smrnaseq

+

https://github.com/nf-core/smrnaseq

diff --git a/assets/email_template.txt b/assets/email_template.txt index 306a27c6..1e22721d 100644 --- a/assets/email_template.txt +++ b/assets/email_template.txt @@ -1,5 +1,5 @@ ======================================== - nf-core/smrnaseq: small RNA-Seq Best Practice v${version} + nf-core/smrnaseq v${version} ======================================== Run Name: $runName @@ -16,6 +16,7 @@ ${errorReport} """ } %> + The workflow was completed at $dateComplete (duration: $duration) The command used to launch the workflow was as follows: diff --git a/conf/multiqc_config.yaml b/assets/multiqc_config.yaml similarity index 61% rename from conf/multiqc_config.yaml rename to assets/multiqc_config.yaml index e50ef53e..3232c446 100644 --- a/conf/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -1,10 +1,12 @@ -extra_fn_clean_exts: - - _R1 - - _R2 report_comment: > This report has been generated by the nf-core/smrnaseq analysis pipeline. For information about how to interpret these results, please see the - documentation. + documentation. 
report_section_order: software_versions: order: -1000 + nf-core-smrnaseq-summary: + order: -1100 +sp: + cutadapt: + contents: 'Cutadapt version' \ No newline at end of file diff --git a/assets/sendmail_template.txt b/assets/sendmail_template.txt index 99cb2973..2d671220 100644 --- a/assets/sendmail_template.txt +++ b/assets/sendmail_template.txt @@ -1,20 +1,25 @@ To: $email Subject: $subject Mime-Version: 1.0 -Content-Type: multipart/related;boundary="ngimimeboundary" +Content-Type: multipart/related;boundary="nfcoremimeboundary" ---ngimimeboundary +--nfcoremimeboundary Content-Type: text/html; charset=utf-8 $email_html ---ngimimeboundary -Content-Type: image/png;name="smrnaseq_logo.png" +<% +if (mqcFile){ +def mqcFileObj = new File("$mqcFile") +if (mqcFileObj.length() < mqcMaxSize){ +out << """ +--nfcoremimeboundary +Content-Type: text/html; name=\"multiqc_report\" Content-Transfer-Encoding: base64 -Content-ID: -Content-Disposition: inline; filename="smrnaseq_logo.png" +Content-ID: +Content-Disposition: attachment; filename=\"${mqcFileObj.getName()}\" -<% out << new File("$baseDir/assets/smrnaseq_logo.png"). +${mqcFileObj. bytes. encodeBase64(). toString(). @@ -23,40 +28,9 @@ Content-Disposition: inline; filename="smrnaseq_logo.png" collate( 76 )*. collect { it.join() }. flatten(). - join( '\n' ) %> + join( '\n' )} +""" +}} +%> ---ngimimeboundary -Content-Type: image/png;name="SciLifeLab_logo.png" -Content-Transfer-Encoding: base64 -Content-ID: -Content-Disposition: inline; filename="SciLifeLab_logo.png" - -<% out << new File("$baseDir/assets/SciLifeLab_logo.png"). - bytes. - encodeBase64(). - toString(). - tokenize( '\n' )*. - toList()*. - collate( 76 )*. - collect { it.join() }. - flatten(). - join( '\n' ) %> - ---ngimimeboundary -Content-Type: image/png;name="NGI_logo.png" -Content-Transfer-Encoding: base64 -Content-ID: -Content-Disposition: inline; filename="NGI_logo.png" - -<% out << new File("$baseDir/assets/NGI_logo.png"). - bytes. - encodeBase64(). 
- toString(). - tokenize( '\n' )*. - toList()*. - collate( 76 )*. - collect { it.join() }. - flatten(). - join( '\n' ) %> - ---ngimimeboundary-- +--nfcoremimeboundary-- diff --git a/bin/collapse_mirtop.r b/bin/collapse_mirtop.r new file mode 100755 index 00000000..b42f9a36 --- /dev/null +++ b/bin/collapse_mirtop.r @@ -0,0 +1,13 @@ +#!/usr/bin/env Rscript + +# Command line arguments +args = commandArgs(trailingOnly=TRUE) + +input <- as.character(args[1:length(args)]) + +library(data.table) + +df = read.delim(input[1], sep = "\t") +counts = as.data.table(df[!duplicated(df[["UID"]]),c(3, 13:ncol(df))]) +mirna = counts[, lapply(.SD, sum), by = miRNA] +write.table(mirna, file.path(dirname(input[1]), "mirna.tsv"), quote=FALSE, sep="\t", row.names=FALSE) \ No newline at end of file diff --git a/bin/edgeR_miRBase.r b/bin/edgeR_miRBase.r index f9765593..4fa12c25 100755 --- a/bin/edgeR_miRBase.r +++ b/bin/edgeR_miRBase.r @@ -40,12 +40,12 @@ if (!require("methods")) { # Put mature and hairpin count files in separated file lists filelist<-list() -filelist[[1]]<-input[grep(".mature.count",input)] -filelist[[2]]<-input[grep(".hairpin.count",input)] +filelist[[1]]<-input[grep(".mature.stats",input)] +filelist[[2]]<-input[grep(".hairpin.stats",input)] names(filelist)<-c("mature","hairpin") +print(filelist) for (i in 1:2) { - header<-names(filelist)[i] # Prepare the combined data frame with gene ID as rownames and sample ID as colname @@ -57,8 +57,8 @@ for (i in 1:2) { temp <- fread(filelist[[i]][1],header=FALSE, select=c(1)) rownames(data)<-temp$V1 rownames(unmapped)<-temp$V1 - colnames(data)<-gsub(".count","",basename(filelist[[i]])) - colnames(unmapped)<-gsub(".count","",basename(filelist[[i]])) + colnames(data)<-gsub(".stats","",basename(filelist[[i]])) + colnames(unmapped)<-gsub(".stats","",basename(filelist[[i]])) data<-data[rownames(data)!="*",] unmapped<-unmapped[rownames(unmapped)=="*",] @@ -90,35 +90,35 @@ for (i in 1:2) { pdf(paste(header,"_edgeR_MDS_plot.pdf",sep="")) 
MDSdata <- plotMDS(dataNorm) dev.off() - } - # Print distance matrix to file - write.table(MDSdata$distance.matrix, paste(header,"_edgeR_MDS_distance_matrix.txt",sep=""), quote=FALSE, sep="\t") + # Print distance matrix to file + write.table(MDSdata$distance.matrix, paste(header,"_edgeR_MDS_distance_matrix.txt",sep=""), quote=FALSE, sep="\t") - # Print plot x,y co-ordinates to file - MDSxy = MDSdata$cmdscale.out - colnames(MDSxy) = c(paste(MDSdata$axislabel, '1'), paste(MDSdata$axislabel, '2')) + # Print plot x,y co-ordinates to file + MDSxy = MDSdata$cmdscale.out + colnames(MDSxy) = c(paste(MDSdata$axislabel, '1'), paste(MDSdata$axislabel, '2')) - write.table(MDSxy, paste(header,"_edgeR_MDS_plot_coordinates.txt",sep=""), quote=FALSE, sep="\t") + write.table(MDSxy, paste(header,"_edgeR_MDS_plot_coordinates.txt",sep=""), quote=FALSE, sep="\t") - # Get the log counts per million values - logcpm <- cpm(dataNorm, prior.count=2, log=TRUE) + # Get the log counts per million values + logcpm <- cpm(dataNorm, prior.count=2, log=TRUE) - # Calculate the euclidean distances between samples - dists = dist(t(logcpm)) + # Calculate the euclidean distances between samples + dists = dist(t(logcpm)) - # Plot a heatmap of correlations - pdf(paste(header,"_log2CPM_sample_distances_heatmap.pdf",sep="")) - hmap <- heatmap.2(as.matrix(dists),main="Sample Correlations", key.title="Distance", trace="none",dendrogram="row", margin=c(9, 9)) - dev.off() + # Plot a heatmap of correlations + pdf(paste(header,"_log2CPM_sample_distances_heatmap.pdf",sep="")) + hmap <- heatmap.2(as.matrix(dists),main="Sample Correlations", key.title="Distance", trace="none",dendrogram="row", margin=c(9, 9)) + dev.off() - # Plot the heatmap dendrogram - pdf(paste(header,"_log2CPM_sample_distances_dendrogram.pdf",sep="")) - plot(hmap$rowDendrogram, main="Sample Dendrogram") - dev.off() + # Plot the heatmap dendrogram + pdf(paste(header,"_log2CPM_sample_distances_dendrogram.pdf",sep="")) + plot(hmap$rowDendrogram, 
main="Sample Dendrogram") + dev.off() - # Write clustered distance values to file - write.table(hmap$carpet, paste(header,"_log2CPM_sample_distances.txt",sep=""), quote=FALSE, sep="\t") + # Write clustered distance values to file + write.table(hmap$carpet, paste(header,"_log2CPM_sample_distances.txt",sep=""), quote=FALSE, sep="\t") + } } file.create("corr.done") diff --git a/bin/markdown_to_html.r b/bin/markdown_to_html.r new file mode 100755 index 00000000..abe13350 --- /dev/null +++ b/bin/markdown_to_html.r @@ -0,0 +1,51 @@ +#!/usr/bin/env Rscript + +# Command line argument processing +args = commandArgs(trailingOnly=TRUE) +if (length(args) < 2) { + stop("Usage: markdown_to_html.r ", call.=FALSE) +} +markdown_fn <- args[1] +output_fn <- args[2] + +# Load / install packages +if (!require("markdown")) { + install.packages("markdown", dependencies=TRUE, repos='http://cloud.r-project.org/') + library("markdown") +} + +base_css_fn <- getOption("markdown.HTML.stylesheet") +base_css <- readChar(base_css_fn, file.info(base_css_fn)$size) +custom_css <- paste(base_css, " +body { + padding: 3em; + margin-right: 350px; + max-width: 100%; +} +#toc { + position: fixed; + right: 20px; + width: 300px; + padding-top: 20px; + overflow: scroll; + height: calc(100% - 3em - 20px); +} +#toc_header { + font-size: 1.8em; + font-weight: bold; +} +#toc > ul { + padding-left: 0; + list-style-type: none; +} +#toc > ul ul { padding-left: 20px; } +#toc > ul > li > a { display: none; } +img { max-width: 800px; } +") + +markdownToHTML( + file = markdown_fn, + output = output_fn, + stylesheet = custom_css, + options = c('toc', 'base64_images', 'highlight_code') +) diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 62a3b1e4..17d5a2f8 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -4,27 +4,32 @@ import re regexes = { - 'nf-core/smrnaseq': ['v_nfcore_smrnaseq.txt', r"(\S+)"], + 'nf-core/smrnaseq': ['v_pipeline.txt', r"(\S+)"], 
+ 'R': ['v_R.txt', r"R version (\S+)"], 'Nextflow': ['v_nextflow.txt', r"(\S+)"], 'FastQC': ['v_fastqc.txt', r"FastQC v(\S+)"], 'Trim Galore!': ['v_trim_galore.txt', r"version (\S+)"], 'Bowtie': ['v_bowtie.txt', r"version (\S+)"], - 'Bowtie 2': ['v_bowtie2.txt', r"version (\S+)"], 'Samtools': ['v_samtools.txt', r"samtools (\S+)"], + 'Htseq': ['v_htseq.txt', r"version (\S+)"], 'FASTX': ['v_fastx.txt', r"Toolkit (\S+)"], + 'miRTrace': ['v_mirtrace.txt', r"mirtrace, version (\S+)"], 'MultiQC': ['v_multiqc.txt', r"multiqc, version (\S+)"], } results = OrderedDict() results['nf-core/smrnaseq'] = 'N/A' results['Nextflow'] = 'N/A' +results['R'] = 'N/A' results['FastQC'] = 'N/A' results['Trim Galore!'] = 'N/A' results['Bowtie'] = 'N/A' -results['Bowtie 2'] = 'N/A' results['Samtools'] = 'N/A' +results['Htseq'] = 'N/A' results['FASTX'] = 'N/A' +results['miRTrace'] = 'N/A' results['MultiQC'] = 'N/A' + # Search each file using its regex for k, v in regexes.items(): with open(v[0]) as x: @@ -33,6 +38,11 @@ if match: results[k] = "v{}".format(match.group(1)) +# Remove software set to false in results +for k in results: + if not results[k]: + del(results[k]) + # Dump to YAML print (''' id: 'software_versions' @@ -44,5 +54,10 @@
''') for k,v in results.items(): - print("
{}
{}
".format(k,v)) + print("
{}
{}
".format(k,v)) print ("
") + +# Write out regexes as csv file: +with open('software_versions.csv', 'w') as f: + for k,v in results.items(): + f.write("{}\t{}\n".format(k,v)) diff --git a/conf/awsbatch.config b/conf/awsbatch.config new file mode 100644 index 00000000..14af5866 --- /dev/null +++ b/conf/awsbatch.config @@ -0,0 +1,18 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running on AWS batch + * ------------------------------------------------- + * Base config needed for running with -profile awsbatch + */ +params { + config_profile_name = 'AWSBATCH' + config_profile_description = 'AWSBATCH Cloud Profile' + config_profile_contact = 'Alexander Peltzer (@apeltzer)' + config_profile_url = 'https://aws.amazon.com/de/batch/' +} + +aws.region = params.awsregion +process.executor = 'awsbatch' +process.queue = params.awsqueue +executor.awscli = '/home/ec2-user/miniconda/bin/aws' +params.tracedir = './' diff --git a/conf/base.config b/conf/base.config index a11abd71..c7ae2f9c 100644 --- a/conf/base.config +++ b/conf/base.config @@ -1,6 +1,6 @@ /* * ------------------------------------------------- - * Nextflow base config file + * nf-core/smrnaseq Nextflow base config file * ------------------------------------------------- * A 'blank slate' config file, appropriate for general * use on most high performace compute environments. @@ -11,67 +11,30 @@ process { - container = params.container - cpus = { check_max( 1 * task.attempt, 'cpus' ) } memory = { check_max( 8.GB * task.attempt, 'memory' ) } time = { check_max( 2.h * task.attempt, 'time' ) } - errorStrategy = { task.exitStatus in [143,137] ? 'retry' : 'terminate' } - maxRetries = 3 + errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 
'retry' : 'finish' } + maxRetries = 1 maxErrors = '-1' // Resource requirements - $makeBowtieIndex { - cpus = { check_max( 4 * task.attempt, 'cpus' ) } - memory = { check_max( 32.GB * task.attempt, 'memory' ) } - time = { check_max( 8.h * task.attempt, 'time' ) } - } - $trim_galore { + withLabel:process_low { cpus = { check_max( 2 * task.attempt, 'cpus' ) } memory = { check_max( 16.GB * task.attempt, 'memory' ) } time = { check_max( 8.h * task.attempt, 'time' ) } } - $insertsize { - cpus = { check_max( 2 * task.attempt, 'cpus' ) } - memory = { check_max( 16.GB * task.attempt, 'memory' ) } - time = { check_max( 8.h * task.attempt, 'time' ) } - } - $bowtie_miRBase_mature { - cpus = { check_max( 4 * task.attempt, 'cpus' ) } - memory = { check_max( 32.GB * task.attempt, 'memory' ) } - time = { check_max( 8.h * task.attempt, 'time' ) } - } - $bowtie_miRBase_hairpin { + withLabel:process_medium { cpus = { check_max( 4 * task.attempt, 'cpus' ) } memory = { check_max( 32.GB * task.attempt, 'memory' ) } time = { check_max( 8.h * task.attempt, 'time' ) } } - $miRBasePostAlignment { - cpus = { check_max( 4 * task.attempt, 'cpus' ) } - memory = { check_max( 32.GB * task.attempt, 'memory' ) } - time = { check_max( 8.h * task.attempt, 'time' ) } - } - $edgeR_miRBase { - cpus = { check_max( 2 * task.attempt, 'cpus' ) } - memory = { check_max( 16.GB * task.attempt, 'memory' ) } - time = { check_max( 2.h * task.attempt, 'time' ) } - } - $bowtie2 { + withLabel:process_high { cpus = { check_max( 8 * task.attempt, 'cpus' ) } memory = { check_max( 64.GB * task.attempt, 'memory' ) } time = { check_max( 8.h * task.attempt, 'time' ) } } - $bowtie2_unmapped { - cpus = { check_max( 4 * task.attempt, 'cpus' ) } - memory = { check_max( 32.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } - } - $ngi_visualizations { - cpus = { check_max( 2 * task.attempt, 'cpus' ) } - memory = { check_max( 16.GB * task.attempt, 'memory' ) } - time = { check_max( 2.h * 
task.attempt, 'time' ) } - } } params { diff --git a/conf/igenomes.config b/conf/igenomes.config index 8e29eef4..690fe048 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -14,145 +14,145 @@ mature = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/SmallRNA/mature.fa" hairpin = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/SmallRNA/hairpin.fa" bowtie = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BowtieIndex/genome" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/genome" fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta" gtf = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf" + mirtrace_species = "hsa" } 'GRCm38' { mature = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/SmallRNA/mature.fa" hairpin = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/SmallRNA/hairpin.fa" bowtie = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BowtieIndex/genome" - bowtie2 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/Bowtie2Index/genome" fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta" gtf = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.gtf" + mirtrace_species = "mmu" } 'TAIR10' { mature = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/SmallRNA/mature.fa" hairpin = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/SmallRNA/hairpin.fa" bowtie = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BowtieIndex/genome" - bowtie2 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/Bowtie2Index/genome" fasta = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta" gtf = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.gtf" + mirtrace_species 
= "ath" } 'UMD3.1' { mature = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/SmallRNA/mature.fa" hairpin = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/SmallRNA/hairpin.fa" bowtie = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BowtieIndex/genome" - bowtie2 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/Bowtie2Index/genome" fasta = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/WholeGenomeFasta" gtf = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.gtf" + mirtrace_species = "bta" } 'WBcel235' { mature = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/SmallRNA/mature.fa" hairpin = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/SmallRNA/hairpin.fa" bowtie = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BowtieIndex/genome" - bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/Bowtie2Index/genome" fasta = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta" gtf = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.gtf" + mirtrace_species = "cel" } 'CanFam3.1' { mature = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/SmallRNA/mature.fa" hairpin = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/SmallRNA/hairpin.fa" bowtie = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BowtieIndex/genome" - bowtie2 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/Bowtie2Index/genome" fasta = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/WholeGenomeFasta" gtf = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.gtf" + mirtrace_species = "cfa" } 'GRCz10' { mature = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/SmallRNA/mature.fa" hairpin = 
"${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/SmallRNA/hairpin.fa" bowtie = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BowtieIndex/genome" - bowtie2 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/Bowtie2Index/genome" fasta = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/WholeGenomeFasta" gtf = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.gtf" + mirtrace_species = "dre" } 'BDGP6' { mature = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/SmallRNA/mature.fa" hairpin = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/SmallRNA/hairpin.fa" bowtie = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BowtieIndex/genome" - bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/Bowtie2Index/genome" fasta = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/WholeGenomeFasta" gtf = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.gtf" + mirtrace_species = "dme" } 'EquCab2' { mature = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/SmallRNA/mature.fa" hairpin = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/SmallRNA/hairpin.fa" bowtie = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BowtieIndex/genome" - bowtie2 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/Bowtie2Index/genome" fasta = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/WholeGenomeFasta" gtf = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.gtf" + mirtrace_species = "eca" } 'Galgal4' { mature = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/SmallRNA/mature.fa" hairpin = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/SmallRNA/hairpin.fa" bowtie = 
"${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BowtieIndex/genome" - bowtie2 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/Bowtie2Index/genome" fasta = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/WholeGenomeFasta" gtf = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.gtf" + mirtrace_species = "gga" } 'Gm01' { mature = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/SmallRNA/mature.fa" hairpin = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/SmallRNA/hairpin.fa" bowtie = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BowtieIndex/genome" - bowtie2 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/Bowtie2Index/genome" fasta = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/WholeGenomeFasta" gtf = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.gtf" + mirtrace_species = "gma" } 'Mmul_1' { mature = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/SmallRNA/mature.fa" hairpin = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/SmallRNA/hairpin.fa" bowtie = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BowtieIndex/genome" - bowtie2 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/Bowtie2Index/genome" fasta = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/WholeGenomeFasta" gtf = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.gtf" + mirtrace_species = "mml" } 'IRGSP-1.0' { mature = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/SmallRNA/mature.fa" hairpin = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/SmallRNA/hairpin.fa" bowtie = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BowtieIndex/genome" - bowtie2 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/Bowtie2Index/genome" 
fasta = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/WholeGenomeFasta" gtf = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.gtf" + mirtrace_species = "osa" } 'CHIMP2.1.4' { mature = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/SmallRNA/mature.fa" hairpin = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/SmallRNA/hairpin.fa" bowtie = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BowtieIndex/genome" - bowtie2 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/Bowtie2Index/genome" fasta = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/WholeGenomeFasta" gtf = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.gtf" + mirtrace_species = "ptr" } 'Rnor_6.0' { mature = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/SmallRNA/mature.fa" hairpin = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/SmallRNA/hairpin.fa" bowtie = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BowtieIndex/genome" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/Bowtie2Index/genome" fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/WholeGenomeFasta" gtf = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.gtf" + mirtrace_species = "rno" } 'Sbi1' { mature = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/SmallRNA/mature.fa" hairpin = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/SmallRNA/hairpin.fa" bowtie = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BowtieIndex/genome" - bowtie2 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/Bowtie2Index/genome" fasta = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/WholeGenomeFasta" gtf = 
"${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.gtf" + mirtrace_species = "sbi" } 'Sscrofa10.2' { mature = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/SmallRNA/mature.fa" hairpin = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/SmallRNA/hairpin.fa" bowtie = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BowtieIndex/genome" - bowtie2 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/Bowtie2Index/genome" fasta = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/WholeGenomeFasta" gtf = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.gtf" + mirtrace_species = "ssc" } 'AGPv3' { mature = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/SmallRNA/mature.fa" hairpin = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/SmallRNA/hairpin.fa" bowtie = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BowtieIndex/genome" - bowtie2 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/Bowtie2Index/genome" fasta = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/WholeGenomeFasta" gtf = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.gtf" + mirtrace_species = "zma" } } } diff --git a/conf/test.config b/conf/test.config index 1983e176..3c962ec3 100644 --- a/conf/test.config +++ b/conf/test.config @@ -8,6 +8,9 @@ */ params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + // Limit resources so that this can run on Travis max_cpus = 2 max_memory = 6.GB max_time = 48.h @@ -18,15 +21,17 @@ params { 'https://github.com/nf-core/test-datasets/raw/smrnaseq/testdata/sample_3.fastq.gz' ] // Genome references - bt2indices = [ - 'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/genome.1.bt2', - 'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/genome.2.bt2', - 
'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/genome.3.bt2', - 'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/genome.4.bt2', - 'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/genome.rev.1.bt2', - 'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/genome.rev.1.bt2' + bt_indices = [ + 'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/genome.1.ebwt', + 'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/genome.2.ebwt', + 'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/genome.3.ebwt', + 'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/genome.4.ebwt', + 'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/genome.rev.1.ebwt', + 'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/genome.rev.1.ebwt' ] gtf = 'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/genes.gtf' mature = 'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/mature.fa' hairpin = 'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/hairpin.fa' + mirna_gtf = 'https://github.com/nf-core/test-datasets/raw/smrnaseq/reference/hsa.gff3' + mirtrace_species = "hsa" } diff --git a/conf/uppmax.config b/conf/uppmax.config deleted file mode 100644 index 5ae66c3c..00000000 --- a/conf/uppmax.config +++ /dev/null @@ -1,27 +0,0 @@ -/* - * ------------------------------------------------- - * Nextflow config file for UPPMAX (milou / irma) - * ------------------------------------------------- - * Imported under the default 'standard' Nextflow - * profile in nextflow.config - */ - - singularity { - enabled = true - } - -process { - executor = 'slurm' - clusterOptions = { "-A $params.project ${params.clusterOptions ?: ''}" } -} - -params { - clusterOptions = false - // Max resources requested by a normal node on milou. 
If you need more memory, run on a fat node using: - // --clusterOptions "-C mem512GB" --max_memory "512GB" - max_memory = 128.GB - max_cpus = 16 - max_time = 240.h - // illumina iGenomes reference file paths on UPPMAX - igenomes_base = '/sw/data/uppnex/igenomes/' -} diff --git a/docs/README.md b/docs/README.md index 2b9ad38e..aa3edcd8 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,17 +1,12 @@ -# nfcore/smrnaseq Documentation +# nf-core/smrnaseq: Documentation -The nfcore/smrnaseq documentation is split into the following files: +The nf-core/smrnaseq documentation is split into the following files: - -1. [Installation](installation.md) +1. [Installation](https://nf-co.re/usage/installation) 2. Pipeline configuration - * [Local installation](configuration/local.md) - * [Amazon Web Services (aws)](configuration/aws.md) - * [Swedish UPPMAX clusters](configuration/uppmax.md) - * [Swedish cs3e Hebbe cluster](configuration/c3se.md) - * [Tübingen QBiC](configuration/qbic.md) - * [CCGA Kiel](configuration/ccga.md) - * [Adding your own system](configuration/adding_your_own.md) + * [Local installation](https://nf-co.re/usage/local_installation) + * [Adding your own system config](https://nf-co.re/usage/adding_own_config) + * [Reference genomes](https://nf-co.re/usage/reference_genomes) 3. [Running the pipeline](usage.md) 4. [Output and how to interpret the results](output.md) -5. [Troubleshooting](troubleshooting.md) +5. 
[Troubleshooting](https://nf-co.re/usage/troubleshooting) diff --git a/docs/images/Example_MDS_plot.png b/docs/images/Example_MDS_plot.png index 326629f4..d4d83aab 100644 Binary files a/docs/images/Example_MDS_plot.png and b/docs/images/Example_MDS_plot.png differ diff --git a/docs/images/Example_heatmap.png b/docs/images/Example_heatmap.png index 2f029af2..46b0d144 100644 Binary files a/docs/images/Example_heatmap.png and b/docs/images/Example_heatmap.png differ diff --git a/docs/images/NGI-Visualizations_example1.png b/docs/images/NGI-Visualizations_example1.png deleted file mode 100644 index 17fee315..00000000 Binary files a/docs/images/NGI-Visualizations_example1.png and /dev/null differ diff --git a/docs/images/NGI-Visualizations_example2.png b/docs/images/NGI-Visualizations_example2.png deleted file mode 100644 index 466ef369..00000000 Binary files a/docs/images/NGI-Visualizations_example2.png and /dev/null differ diff --git a/docs/images/NGI-final-small.png b/docs/images/NGI-final-small.png deleted file mode 100644 index ff3a24a4..00000000 Binary files a/docs/images/NGI-final-small.png and /dev/null differ diff --git a/docs/images/SciLifeLab_logo.png b/docs/images/SciLifeLab_logo.png deleted file mode 100644 index c821e78b..00000000 Binary files a/docs/images/SciLifeLab_logo.png and /dev/null differ diff --git a/docs/images/cutadapt_plot.png b/docs/images/cutadapt_plot.png new file mode 100644 index 00000000..8b4c978a Binary files /dev/null and b/docs/images/cutadapt_plot.png differ diff --git a/docs/images/mirtrace_plot.png b/docs/images/mirtrace_plot.png new file mode 100644 index 00000000..03c62cd0 Binary files /dev/null and b/docs/images/mirtrace_plot.png differ diff --git a/docs/images/samtools_alignment_plot.png b/docs/images/samtools_alignment_plot.png new file mode 100644 index 00000000..f7ac077d Binary files /dev/null and b/docs/images/samtools_alignment_plot.png differ diff --git a/docs/output.md b/docs/output.md index 061c100c..79757fff 100644 --- 
a/docs/output.md +++ b/docs/output.md @@ -1,10 +1,10 @@ # nf-core/smrnaseq Output -**nf-core/smrnaseq** is a bioinformatics best-practice analysis pipeline used for small RNA sequencing data analysis at the [National Genomics Infastructure](https://ngisweden.scilifelab.se/) at [SciLifeLab Stockholm](https://www.scilifelab.se/platforms/ngi/), Sweden. +**nf-core/smrnaseq** is a bioinformatics best-practice analysis pipeline used for small RNA sequencing data analysis. -This document describes the output produced by the pipeline. +This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. -## Pipeline overview: +## Pipeline overview The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: @@ -13,8 +13,9 @@ and processes data using the following steps: * [Bowtie](#bowtie) - alignment against mature miRNAs and miRNA precursors (hairpins) * [SAMtools](#samtools) - alignment result processing and feature counting * [edgeR](#edger) - normalization, MDS plot and sample pairwise distance heatmap -* [Bowtie2](#bowtie2) - alignment against reference genome for QC purpose -* [NGI-Visualizations](#ngi_visualizations) - summary of biotypes based on Bowtie2 alignment results +* [Bowtie](#bowtie) - alignment against reference genome for QC purpose +* [mirtop](#mirtop) - miRNA and isomiR annotation +* [miRTrace](#mirtrace) - a comprehensive tool for QC purpose * [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline ## FastQC @@ -48,7 +49,11 @@ Contains FastQ files with quality and adapter trimmed reads for each sample, alo * `sample_trimmed_fastqc.zip` * FastQC report for trimmed reads -## Bowtie +This is an example of the output we can get: + +![cutadapt](images/cutadapt_plot.png) + +## Bowtie - miRNAs [Bowtie](http://bowtie-bio.sourceforge.net/index.shtml) is used for mapping adapter trimmed 
reads against the mature miRNAs and miRNA precursors (hairpins) in [miRBase](http://www.mirbase.org/). **Output directory: `results/bowtie`** @@ -67,19 +72,21 @@ Contains FastQ files with quality and adapter trimmed reads for each sample, alo **Output directory: `results/bowtie`** -* `miRBase_mature/sample.mature.count` - * Raw mapped read counts of mature miRNAs +* `miRBase_mature/sample.mature.stats|idxstats|flagstat` + * Raw mapped read counts and stats of mature miRNAs * `miRBase_mature/sample.mature.sorted.bam` * The sorted BAM file of alignment against mature miRNAs * `miRBase_mature/sample.mature.sorted.bam.bai` * The index file of alignment against mature miRNAs -* `miRBase_hairpin/sample.hairpin.count` - * Raw mapped read counts of miRNA precursors (hairpins) +* `miRBase_hairpin/sample.hairpin.stats|idxstats|flagstat` + * Raw mapped read counts and stats of miRNA precursors (hairpins) * `miRBase_hairpin/sample.hairpin.sorted.bam` * The sorted BAM file of alignment against miRNA precursors (hairpins) * `miRBase_hairpin/sample.hairpin.sorted.bam.bai` * The index file of alignment against miRNA precursors (hairpins) +![samtools](images/samtools_alignment_plot.png) + ## edgeR [edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html) is an R package used for differential expression analysis of RNA-seq expression profiles. @@ -99,51 +106,61 @@ Contains FastQ files with quality and adapter trimmed reads for each sample, alo **Example**: MDS plot of 10 samples based on their expression profiles of mature miRNAs. Here we can see that samples cluster based on different sample types and library preparation kits. ![edgeR](images/Example_MDS_plot.png) + **Example**: Heatmap of tumor and normal samples based on the top differentially expressed mature miRNAs. 
![edgeR](images/Example_heatmap.png) -## Bowtie2 -[Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) is used for mapping adapter trimmed reads against the reference genome for quality control purposes. +## Bowtie - QC +[Bowtie](http://bowtie-bio.sourceforge.net/index.shtml) is used for mapping adapter trimmed reads against the reference genome for quality control purposes. + +**Output directory: `results/bowtie_ref`** + +* `sample.genome.bam` + * The aligned BAM file against reference genome +* `sample.genome.stats|idxstats|flagstat` + * Raw mapped read counts and stats of the alignment against the reference genome -**Output directory: `results/bowtie2`** +## mirtop +[mirtop](https://github.com/miRTop/mirtop) is used to parse the BAM files from `bowtie` alignment, and produce a [mirgff3](https://github.com/miRTop/mirGFF3) file with information about miRNAs and isomiRs. -* `sample.bowite2.bam` - * The aligned BAM file of alignment against reference genome +**Output directory: `results/mirtop`** -## NGI-Visualizations -[NGI-Visualizations](https://github.com/NationalGenomicsInfrastructure/ngi_visualizations) takes the aligned BAM file against reference genome as input, and counts the overlaps with different biotype flags within a GTF annotation file. +* `mirtop.gff`: [mirgff3](https://github.com/miRTop/mirGFF3) file +* `mirtop.tsv`: tabular version of the previous file for easy integration with downstream analysis. +* `mirtop_rawData.tsv`: File compatible with [isomiRs](http://lpantano.github.io/isomiRs/reference/IsomirDataSeqFromMirtop.html) Bioconductor package to perform isomiRs analysis. +* `mirna.tsv`: tabular file with miRNA counts after summarizing unique isomiRs for each miRNA -**Output directory: `results/bowtie2/ngi_visualizations`** +## miRTrace +[miRTrace](https://github.com/friedlanderlab/mirtrace) is a quality control tool specifically for small RNA sequencing data (smRNA-Seq).
Each sample is characterized by profiling sequencing quality, read length, sequencing depth and miRNA complexity and also the amounts of miRNAs versus undesirable sequences (derived from tRNAs, rRNAs and sequencing artifacts). -* `sample.bowite2_biotypeCounts.pdf/png` - * Summary of annotation categories of aligned reads -* `sample.bowite2_biotypeCounts_log.pdf/png` - * Summary of annotation categories of aligned reads in logarithm scale -* `sample.bowite2_biotypeLengths.pdf/png` - * Stacked bar plot of annotations of aligned reads with different read lengths -* `sample.bowite2_biotypeLengthPercentages.pdf/png` - * Stacked bar plot of annotation percentage of aligned reads with different read lengths +**Output directory: `results/miRTrace`** -**Example**: Summary of annotation categories of aligned reads -![NGI-Visualizations](images/NGI-Visualizations_example1.png) +* `mirtrace-report.html` + * An interactive HTML report summarizing all output statistics from miRTrace +* `mirtrace-results.json` + * A JSON file with all output statistics from miRTrace +* `mirtrace-stats-*.tsv` + * Tab-separated statistics files +* `qc_passed_reads.all.collapsed` + * FASTA file per sample with sequence reads that passed QC in miRTrace +* `qc_passed_reads.rnatype_unknown.collapsed` + * FASTA file per sample with unknown reads in the RNA type analysis + +Refer to the [tool manual](https://github.com/friedlanderlab/mirtrace/blob/master/release-bundle-includes/manual.pdf) for detailed specifications about output files. Here is an example of the RNA types plot that you will see: + +![mirtrace](images/mirtrace_plot.png) -**Example**: Stacked bar plot of annotations of aligned reads with different read lengths -![NGI-Visualizations](images/NGI-Visualizations_example2.png) ## MultiQC [MultiQC](http://multiqc.info) is a visualisation tool that generates a single HTML report summarising all samples in your project. 
Most of the pipeline QC results are visualised in the report and further statistics are available in within the report data directory. -**Output directory: `results/MultiQC`** +The pipeline has special steps which allow the software versions used to be reported in the MultiQC output for future traceability. + +**Output directory: `results/multiqc`** -* `multiqc_report.html` +* `Project_multiqc_report.html` * MultiQC report - a standalone HTML file that can be viewed in your web browser -* `multiqc_data/` +* `Project_multiqc_data/` * Directory containing parsed statistics from the different tools used in the pipeline -For more information about how to use MultiQC reports, see http://multiqc.info - ------------------------------------------------------------------------------------------ - -

- -

+For more information about how to use MultiQC reports, see [http://multiqc.info](http://multiqc.info) diff --git a/docs/usage.md b/docs/usage.md index e69de29b..384bcd90 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -0,0 +1,387 @@ +# nf-core/smrnaseq: Usage + +## Table of contents + + + +* [Table of contents](#table-of-contents) +* [Introduction](#introduction) +* [Running the pipeline](#running-the-pipeline) + * [Updating the pipeline](#updating-the-pipeline) + * [Reproducibility](#reproducibility) +* [Main Arguments](#main-arguments) + * [`-profile`](#-profile) + * [`--reads`](#--reads) + * [`--protocol`](#--protocol) +* [Reference genomes](#reference-genomes) + * [`--genome` (using iGenomes)](#--genome-using-igenomes) + * [Supported genomes](#supported-genomes) + * [`--saveReference`](#--savereference) + * [`--fasta`](#--fasta) + * [`--igenomesIgnore`](#--igenomesignore) + * [`--mature`](#--mature) + * [`--hairpin`](#--hairpin) + * [`--bt_index`](#--bt_index) +* [Trimming options](#trimming-options) + * [`--min_length [int]`](#--min_length-int) + * [`--clip_R1 [int]`](#--clip_r1-int) + * [`--three_prime_clip_R1 [int]`](#--three_prime_clip_r1-int) + * [`--three_prime_adapter [sequence]`](#--three_prime_adapter-sequence) +* [Skipping QC steps](#skipping-qc-steps) + * [`--skipQC`](#--skipqc) + * [`--skipFastqc`](#--skipfastqc) + * [`--skipMultiqc`](#--skipmultiqc) +* [Job resources](#job-resources) + * [Automatic resubmission](#automatic-resubmission) + * [Custom resource requests](#custom-resource-requests) +* [AWS Batch specific parameters](#aws-batch-specific-parameters) + * [`--awsqueue`](#--awsqueue) + * [`--awsregion`](#--awsregion) +* [Other command line parameters](#other-command-line-parameters) + * [`--outdir`](#--outdir) + * [`--email`](#--email) + * [`-name`](#-name) + * [`--seq_center`](#--seq_center) + * [`-resume`](#-resume) + * [`-c`](#-c) + * [`--rlocation`](#--rlocation) +* [Stand-alone scripts](#stand-alone-scripts) + * 
[`--custom_config_version`](#--custom_config_version) + * [`--custom_config_base`](#--custom_config_base) + * [`--max_memory`](#--max_memory) + * [`--max_time`](#--max_time) + * [`--max_cpus`](#--max_cpus) + * [`--plaintext_email`](#--plaintext_email) + * [`--monochrome_logs`](#--monochrome_logs) + * [`--multiqc_config`](#--multiqc_config) + + + +## Introduction +Nextflow handles job submissions on SLURM or other environments, and supervises running the jobs. Thus the Nextflow process must run until the pipeline is finished. We recommend that you run the process in the background through `screen` / `tmux` or a similar tool. Alternatively you can run nextflow within a cluster job submitted to your job scheduler. + +It is recommended to limit the Nextflow Java virtual machine's memory. We recommend adding the following line to your environment (typically in `~/.bashrc` or `~/.bash_profile`): + +```bash +NXF_OPTS='-Xms1g -Xmx4g' +``` + +## Running the pipeline +The typical command for running the pipeline is as follows: + +```bash +nextflow run nf-core/smrnaseq --reads '*.fastq.gz' -profile docker +``` + +This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. + +Note that the pipeline will create the following files in your working directory: + +```bash +work # Directory containing the nextflow working files +results # Finished results (configurable, see below) +.nextflow.log # Log file from Nextflow +# Other nextflow hidden files, eg. history of pipeline runs and old logs. +``` + +### Updating the pipeline +When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since.
To be sure that you're running the latest version of the pipeline, regularly update the cached version of the pipeline: + +```bash +nextflow pull nf-core/smrnaseq +``` + +### Reproducibility +It's a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. + +First, go to the [nf-core/smrnaseq releases page](https://github.com/nf-core/smrnaseq/releases) and find the latest version number - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. + +This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. + +## Main Arguments +### `-profile` +Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important! + +If `-profile` is not specified at all the pipeline will be run locally and expects all software to be installed and available on the `PATH`. + +* `awsbatch` + * A generic configuration profile to be used with AWS Batch.
+* `conda` + * A generic configuration profile to be used with [conda](https://conda.io/docs/) + * Pulls most software from [Bioconda](https://bioconda.github.io/) +* `docker` + * A generic configuration profile to be used with [Docker](http://docker.com/) + * Pulls software from dockerhub: [`nfcore/smrnaseq`](http://hub.docker.com/r/nfcore/smrnaseq/) +* `singularity` + * A generic configuration profile to be used with [Singularity](http://singularity.lbl.gov/) + * Pulls software from DockerHub: [`nfcore/smrnaseq`](http://hub.docker.com/r/nfcore/smrnaseq/) +* `test` + * A profile with a complete configuration for automated testing + * Includes links to test data so needs no other parameters + +### `--reads` +Location of the input FastQ files: + +```bash + --reads 'path/to/data/*.fastq.gz' +``` + +Please note the following requirements: + +1. The path must be enclosed in quotes +2. The path must have at least one `*` wildcard character + +### `--protocol` +Protocol for constructing smRNA-seq libraries. Note that trimming parameters and 3' adapter sequence are pre-defined with a specified protocol. +Default: "illumina" + +```bash +--protocol [one protocol listed in the table below] +``` + +| Protocol | Library Prep Kit | Trimming Parameter | 3' Adapter Sequence | +| :------------ | :-------------------------------------- | :----------------------------------- | :------------------- | +| illumina | Illumina TruSeq Small RNA | clip_R1 = 0; three_prime_clip_R1 = 0 | TGGAATTCTCGGGTGCCAAGG | +| nextflex | BIOO SCIENTIFIC NEXTFLEX Small RNA-Seq | clip_R1 = 4; three_prime_clip_R1 = 4 | TGGAATTCTCGGGTGCCAAGG | +| qiaseq | QIAGEN QIAseq miRNA | clip_R1 = 0; three_prime_clip_R1 = 0 | AACTGTAGGCACCATCAAT | +| cats | Diagenode CATS Small RNA-seq | clip_R1 = 3; three_prime_clip_R1 = 0 | GATCGGAAGAGCACACGTCTG | + +## Reference genomes + +The pipeline config files come bundled with paths to the illumina iGenomes reference index files. 
If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource. + +### `--genome` (using iGenomes) + +The reference genome to use for the analysis. It needs to be one of the genomes specified in the config file. +The human `GRCh37` genome is used by default. + +```bash +--genome 'GRCh37' +``` + +### Supported genomes + +| Parameter | Latin Name | Common Name | +| :------------ |:-------------------------------- |:------------------ | +| AGPv3 | *Zea mays* | Maize | +| BDGP6 | *Drosophila melanogaster* | Fruit fly | +| CanFam3.1 | *Canis familiaris* | Dog | +| CHIMP2.1.4 | *Pan troglodytes* | Chimpanzee | +| EquCab2 | *Equus caballus* | Horse | +| Galgal4 | *Gallus gallus* | Chicken | +| Gm01 | *Glycine max* | Soybean | +| GRCh37 | *Homo sapiens* | Human | +| GRCm38 | *Mus musculus* | Mouse | +| GRCz10 | *Danio rerio* | Zebrafish | +| IRGSP-1.0 | *Oryza sativa japonica* | Rice | +| Mmul_1 | *Macaca mulatta* | Macaque | +| Rnor_6.0 | *Rattus norvegicus* | Rat | +| Sbi1 | *Sorghum bicolor* | Great millet | +| Sscrofa10.2 | *Sus scrofa* | Pig | +| TAIR10 | *Arabidopsis thaliana* | Thale cress | +| UMD3.1 | *Bos taurus* | Cow | +| WBcel235 | *Caenorhabditis elegans* | Nematode | + +There are 31 different species supported in the iGenomes references. To run the pipeline, you must specify which to use with the `--genome` flag. + +You can find the keys to specify the genomes in the [iGenomes config file](../conf/igenomes.config). Common genomes that are supported are: + +* Human + * `--genome GRCh37` +* Mouse + * `--genome GRCm38` +* _Drosophila_ + * `--genome BDGP6` +* _S. cerevisiae_ + * `--genome 'R64-1-1'` + +> There are numerous others - check the config file for more. + +Note that you can use the same configuration setup to save sets of reference files for your own use, even if they are not part of the iGenomes resource.
See the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for instructions on where to save such a file. + +The syntax for this reference configuration is as follows: + +```nextflow +params { + genomes { + 'GRCh37' { + fasta = '' // Used if no star index given + mature = '' // mature.fa + hairpin = '' // hairpin.fa + bowtie = '' // index + gtf = '' + mirtrace_species = "sps" // species code according to miRBase + + } + // Any number of additional genomes, key is used with --genome + } +} +``` + +### `--saveReference` +Supply this parameter to save any generated reference genome files to your results folder. These can then be used for future pipeline runs, reducing processing times. + +### `--fasta` +If you prefer, you can specify the full path to your reference genome when you run the pipeline: + +```bash +--fasta '[path to Fasta reference]' +``` + +### `--igenomesIgnore` +Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`. + +### `--mature` +If you prefer, you can specify the full path to the FASTA file of mature miRNAs when you run the pipeline: + +```bash +--mature [path to the FASTA file of mature miRNAs] +``` + +### `--hairpin` +If you prefer, you can specify the full path to the FASTA file of miRNA precursors when you run the pipeline: + +```bash +--hairpin [path to the FASTA file of miRNA precursors] +``` + +### `--bt_index` +If you prefer, you can specify the full path to the Bowtie 1 index of your reference genome when you run the pipeline: + +```bash +--bt_index [path to Bowtie 1 index] +``` + +## Trimming options +### `--min_length [int]` +Discard reads that became shorter than length [int] because of either quality or adapter trimming.
Default: 18 +### `--clip_R1 [int]` +Instructs Trim Galore to remove bp from the 5' end of read 1 +### `--three_prime_clip_R1 [int]` +Instructs Trim Galore to remove bp from the 3' end of read 1 AFTER adapter/quality trimming has been performed +### `--three_prime_adapter [sequence]` +Instructs Trim Galore to remove 3' adapters which are typically used in smRNA-seq library preparation + +## Skipping QC steps +### `--skipQC` +Skip all QC steps aside from MultiQC + +### `--skipFastqc` +Skip FastQC + +### `--skipMultiqc` +Skip MultiQC + + +## Job resources +### Automatic resubmission +Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with an error code of `143` (exceeded requested resources) it will automatically resubmit with higher requests (2 x original, then 3 x original). If it still fails after three times then the pipeline is stopped. + +### Custom resource requests +Wherever process-specific requirements are set in the pipeline, the default value can be changed by creating a custom config file. See the files hosted at [`nf-core/configs`](https://github.com/nf-core/configs/tree/master/conf) for examples. + +If you are likely to be running `nf-core` pipelines regularly it may be a good idea to request that your custom config file is uploaded to the `nf-core/configs` git repository. Before you do this please can you test that the config file works with your pipeline of choice using the `-c` parameter (see definition below). You can then create a pull request to the `nf-core/configs` repository with the addition of your config file, associated documentation file (see examples in [`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) to include your custom profile. 
+ +If you have any questions or issues please send us a message on [Slack](https://nf-core-invite.herokuapp.com/). + +## AWS Batch specific parameters +Running the pipeline on AWS Batch requires a couple of specific parameters to be set according to your AWS Batch configuration. Please use the `-profile awsbatch` option and then specify all of the following parameters. +### `--awsqueue` +The JobQueue that you intend to use on AWS Batch. +### `--awsregion` +The AWS region to run your job in. Default is set to `eu-west-1` but can be adjusted to your needs. + +Please make sure to also set the `-w/--work-dir` and `--outdir` parameters to an S3 storage bucket of your choice - you'll get an error message notifying you if you didn't. + +## Other command line parameters +### `--outdir` +The output directory where the results will be saved. + +### `--email` +Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run. + +### `-name` +Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. + +This is used in the MultiQC report (if not default) and in the summary HTML / e-mail (always). + +**NB:** Single hyphen (core Nextflow option) + +### `--seq_center` +Text about sequencing center which will be added in the header of output bam files. + +### `-resume` +Specify this when restarting a pipeline. Nextflow will use cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously. + +You can also supply a run name to resume a specific run: `-resume [run-name]`. Use the `nextflow log` command to show previous run names. + +**NB:** Single hyphen (core Nextflow option) + +### `-c` +Specify the path to a specific config file (this is a core Nextflow command).
Useful if using different UPPMAX +projects or different sets of reference genomes. **NOTE! One hyphen only (core Nextflow parameter).** + +**NB:** Single hyphen (core Nextflow option) + +Note - you can use this to override defaults. For example, we run on UPPMAX but don't want to use the MultiQC +environment module as is the default. So we specify a config file using `-c` that contains the following: + +```nextflow +process.$multiqc.module = [] +``` + +## Stand-alone scripts +The `bin` directory contains some scripts used by the pipeline which may also be run manually: + +* `edgeR_miRBase.r` + * R script used for processing read counts of mature miRNAs and miRNA precursors (hairpins). + + +### `--custom_config_version` +Provide git commit id for custom Institutional configs hosted at `nf-core/configs`. This was implemented for reproducibility purposes. Default is set to `master`. + +```bash +## Download and use config file with following git commit id +--custom_config_version d52db660777c4bf36546ddb188ec530c3ada1b96 +``` + +### `--custom_config_base` +If you're running offline, nextflow will not be able to fetch the institutional config files +from the internet. If you don't need them, then this is not a problem. If you do need them, +you should download the files from the repo and tell nextflow where to find them with the +`custom_config_base` option. For example: + +```bash +## Download and unzip the config files +cd /path/to/my/configs +wget https://github.com/nf-core/configs/archive/master.zip +unzip master.zip + +## Run the pipeline +cd /path/to/my/data +nextflow run /path/to/pipeline/ --custom_config_base /path/to/my/configs/configs-master/ +``` + +> Note that the nf-core/tools helper package has a `download` command to download all required pipeline +> files + singularity containers + institutional configs in one go for you, to make this process easier. + +### `--max_memory` +Use to set a top-limit for the default memory requirement for each process.
+Should be a string in the format integer-unit. eg. `--max_memory '8.GB'` + +### `--max_time` +Use to set a top-limit for the default time requirement for each process. +Should be a string in the format integer-unit. eg. `--max_time '2.h'` + +### `--max_cpus` +Use to set a top-limit for the default CPU requirement for each process. +Should be an integer. eg. `--max_cpus 1` + +### `--plaintext_email` +Set to receive plain-text e-mails instead of HTML formatted. + +### `--monochrome_logs` +Set to disable colourful command line output and live life in monochrome. + +### `--multiqc_config` +Specify a path to a custom MultiQC configuration file. diff --git a/environment.yml b/environment.yml index ee8b82a9..753f672b 100644 --- a/environment.yml +++ b/environment.yml @@ -1,27 +1,32 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -name: nfcore-smrnaseq-0.2dev +name: nf-core-smrnaseq-1.0.0 channels: - - bioconda - conda-forge + - bioconda - defaults dependencies: - - conda-forge::openjdk=8.0.144 # Needed for FastQC - conda build hangs without this - - fastqc=0.11.7 - - trim-galore=0.4.5 - - samtools=1.8 - - bowtie=1.2.2 - - bowtie2=2.3.4.1 - - multiqc=1.5 - - bioconductor-edger=3.20.7 - - bioconductor-limma=3.34.9 - - conda-forge::r-statmod=1.4.30 - - conda-forge::r-data.table=1.11.4 - - conda-forge::r-gplots=3.0.1 + ## conda-forge packages + - conda-forge::r-base=3.6.1 + - conda-forge::openjdk=11.0.1 # Needed for FastQC - conda build hangs without this + - conda-forge::r-statmod=1.4.32 + - conda-forge::r-data.table=1.12.2 + - conda-forge::r-gplots=3.0.1.1 - conda-forge::r-r.methodss3=1.7.1 - - htseq=0.9.1 + - conda-forge::r-markdown=1.0 + - conda-forge::matplotlib=3.0.3 # Current 3.1.0 build incompatible with multiqc=1.7 + + ## bioconda packages + - fastqc=0.11.8 + - trim-galore=0.6.3 + - samtools=1.9 + - bowtie=1.2.2 + - multiqc=1.7 + - mirtop=0.4.22 + - seqcluster=1.2.5 + -
htseq=0.11.2 - fastx_toolkit=0.0.14 - # Install NGI Visualizations - # TODO: Replace dependency / conda-ise somehow - - pip: - - "--editable=git+https://github.com/NationalGenomicsInfrastructure/ngi_visualizations.git#egg=ngi_visualizations" + - seqkit=0.10.1 + - mirtrace=1.0.0 + - bioconductor-edger=3.26.5 + - bioconductor-limma=3.40.2 \ No newline at end of file diff --git a/main.nf b/main.nf index 245b0878..f400b181 100644 --- a/main.nf +++ b/main.nf @@ -1,75 +1,61 @@ #!/usr/bin/env nextflow - /* ======================================================================================== - S M A L L R N A - S E Q B E S T P R A C T I C E + nf-core/smrnaseq ======================================================================================== - Small-RNA-Seq Best Practice Analysis Pipeline. Started May 2016. + nf-core/smrnaseq Analysis Pipeline. #### Homepage / Documentation https://github.com/nf-core/smrnaseq - #### Authors - Phil Ewels - Chuan Wang - Rickard Hammarén ----------------------------------------------------------------------------------------- ----------------------------------------------------------------------------------------- - Pipeline overview: - - 1: FastQC for raw sequencing reads quality control - - 2: Trim Galore! 
for adapter trimming - - 3.1: Bowtie 1 alignment against miRBase mature miRNA - - 3.2: Post-alignment processing of miRBase mature miRNA counts - - 3.3: edgeR analysis on miRBase mature miRNA counts - - TMM normalization and a table of top expression mature miRNA - - MDS plot clustering samples - - Heatmap of sample similarities - - 4.1: Bowtie 1 alignment against miRBase hairpin for the unaligned reads in step 3 - - 4.2: Post-alignment processing of miRBase hairpin counts - - 4.3: edgeR analysis on miRBase hairpin counts - - TMM normalization and a table of top expression hairpin - - MDS plot clustering samples - - Heatmap of sample similarities - - 5.1: Bowtie 2 alignment against host reference genome - - 5.2: Post-alignment processing of Bowtie 2 - - 6: NGI-Visualization of Bowtie 2 alignment statistics - - 7: MultiQC ---------------------------------------------------------------------------------------- */ def helpMessage() { + log.info nfcoreHeader() log.info""" - ========================================= - nf-core/smrnaseq : smRNA-Seq Best Practice v${params.version} - ========================================= + Usage: The typical command for running the pipeline is as follows: - nextflow run nf-core/smrnaseq --reads '*.fastq.gz' --genome GRCh37 + nextflow run nf-core/smrnaseq --reads '*.fastq.gz' --genome GRCh37 -profile docker Mandatory arguments: --reads Path to input data (must be surrounded with quotes). - NOTE! Paired-end data is NOT supported by this pipeline! For paired-end data, use Read 1 only. - --genome Name of iGenomes reference - NOTE! With the option --genome 'ALL', the entire dataset of mature miRNAs and hairpins - in miRBase will be used as reference regardless of species. Meanwhile the alignment against - host reference genome will be skipped. + NOTE! Paired-end data is NOT supported by this pipeline! For paired-end data, use Read 1 only + --genome Name of iGenomes reference. 
Not needed if --mature, --hairpin, --mirtrace_species are provided. + --protocol Library preparation protocol. Default: "illumina". Can be set as "illumina", "nextflex", "qiaseq" or "cats" + References - --saveReference Save the generated reference files the the Results directory. + --saveReference Save the generated reference files the the Results directory + --mature Path to the FASTA file of mature miRNAs + --hairpin Path to the FASTA file of miRNA precursors + --mirna_gtf GFF/GTF file with coordinates positions of precursor and miRNAs. See: ftp://mirbase.org/pub/mirbase/CURRENT/genomes/hsa.gff3 + --bt_index Path to the bowtie 1 index files of the host reference genome. Optional. + --mirtrace_species Species for miRTrace. Pre-defined when '--genome' is specified. (hsa, mmu ...) Trimming options - --length [int] Discard reads that became shorter than length [int] because of either quality or adapter trimming. Default: 18 + --three_prime_adapter 3’ Adapter to trim. Default: None + --min_length [int] Discard reads that became shorter than length [int] because of either quality or adapter trimming. Default: 18 --clip_R1 [int] Instructs Trim Galore to remove bp from the 5' end of read 1 --three_prime_clip_R1 [int] Instructs Trim Galore to remove bp from the 3' end of read 1 AFTER adapter/quality trimming has been performed - Other options: + QC + --skipQC Skip all QC steps aside from MultiQC + --skipFastqc Skip FastQC + --skipMultiqc Skip MultiQC + + Other options --outdir The output directory where the results will be saved --email Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits - --clusterOptions Extra SLURM options, used in conjunction with Uppmax.config - -name Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. 
- --skip_qc Skip all QC steps aside from MultiQC - --skip_fastqc Skip FastQC - --skip_multiqc Skip MultiQC + -name Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic + --seq_center Text about sequencing center which will be added in the header of output bam files + --maxMultiqcEmailFileSize Theshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) + + AWSBatch options: + --awsqueue The AWSBatch JobQueue that needs to be set when running on AWSBatch + --awsregion The AWS Region for your AWS Batch job to run on + """.stripIndent() } @@ -77,40 +63,89 @@ def helpMessage() { * SET UP CONFIGURATION VARIABLES */ -// Show help emssage -params.help = false +// Show help message if (params.help){ helpMessage() exit 0 } + +// Check if genome exists in the config file +if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { + exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" +} + +// Genome options +params.bt_index = params.genome ? params.genomes[ params.genome ].bowtie ?: false : false +params.bt_indices = null +params.mature = params.genome ? params.genomes[ params.genome ].mature ?: false : false +params.hairpin = params.genome ? params.genomes[ params.genome ].hairpin ?: false : false +params.mirtrace_species = params.genome ? 
params.genomes[ params.genome ].mirtrace_species ?: false : false + + +// Define regular variables so that they can be overwritten +clip_R1 = params.clip_R1 +three_prime_clip_R1 = params.three_prime_clip_R1 +three_prime_adapter = params.three_prime_adapter +protocol = params.protocol +// Presets +if (params.protocol == "illumina"){ + clip_R1 = 0 + three_prime_clip_R1 = 0 + three_prime_adapter = "TGGAATTCTCGGGTGCCAAGG" +} else if (params.protocol == "nextflex"){ + clip_R1 = 4 + three_prime_clip_R1 = 4 + three_prime_adapter = "TGGAATTCTCGGGTGCCAAGG" +} else if (params.protocol == "qiaseq"){ + clip_R1 = 0 + three_prime_clip_R1 = 0 + three_prime_adapter = "AACTGTAGGCACCATCAAT" +} else if (params.protocol == "cats"){ + clip_R1 = 3 + three_prime_clip_R1 = 0 + // three_prime_adapter = "GATCGGAAGAGCACACGTCTG" + three_prime_adapter = "AAAAAAAA" +} else { + //custom protocol + clip_R1 = params.clip_R1 + three_prime_clip_R1 = params.three_prime_clip_R1 + three_prime_adapter = params.three_prime_adapter + protocol = params.protocol +} + +if (!params.mirna_gtf && params.mirtrace_species){ + mirna_gtf = file("ftp://mirbase.org/pub/mirbase/CURRENT/genomes/${params.mirtrace_species}.gff3", checkIfExists: true) +}else if (params.mirna_gtf) { + mirna_gtf = file(params.mirna_gtf, checkIfExists: true) +}else{ + mirna_gtf = false +} + // Validate inputs if( !params.mature || !params.hairpin ){ exit 1, "Missing mature / hairpin reference indexes! Is --genome specified?" 
} -if( params.mature ){ - mature = file(params.mature) - if( !mature.exists() ) exit 1, "Mature file not found: ${params.mature}" -} -if( params.hairpin ){ - hairpin = file(params.hairpin) - if( !hairpin.exists() ) exit 1, "Hairpin file not found: ${params.hairpin}" -} -if( params.gtf ){ - gtf = file(params.gtf) - if( !gtf.exists() ) exit 1, "GTF file not found: ${params.gtf}" + +if (params.mature) { mature = file(params.mature, checkIfExists: true) } else { exit 1, "Mature file not found: ${params.mature}" } + +if (params.hairpin) { hairpin = file(params.hairpin, checkIfExists: true) } else { exit 1, "Hairpin file not found: ${params.hairpin}" } + +if (params.gtf) { gtf = file(params.gtf, checkIfExists: true) } + +if( params.bt_index ){ + bt_indices = Channel + .fromPath("${params.bt_index}*", checkIfExists: true) + .ifEmpty { exit 1, "Bowtie1 index directory not found: ${bt_dir}" } +} else if( params.bt_indices ){ + bt_indices = Channel.from(params.readPaths).map{ file(it) }.toList() } -if( params.bt2index ){ - bt2_index = file("${params.bt2index}.fa") - bt2_indices = Channel.fromPath( "${params.bt2index}*.bt2" ).toList() - if( !bt2_index.exists() ) exit 1, "Reference genome Bowtie 2 not found: ${params.bt2index}" -} else if( params.bt2indices ){ - bt2_indices = Channel.from(params.readPaths).map{ file(it) }.toList() +if( !params.bt_index) { + log.info "No GTF / Bowtie 1 index supplied - host reference genome analysis will be skipped." } -if( !params.gtf || !params.bt2index) { - log.info "No GTF / Bowtie2 index supplied - host reference genome analysis will be skipped." +if( !params.mirtrace_species ){ + exit 1, "Reference species for miRTrace is not defined." } -multiqc_config = file(params.multiqc_config) // Has the run name been specified by the user? 
// this has the bonus effect of catching both -name and --name @@ -119,6 +154,20 @@ if( !(workflow.runName ==~ /[a-z]+_[a-z]+/) ){ custom_runName = workflow.runName } +if( workflow.profile == 'awsbatch') { + // AWSBatch sanity checking + if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" + // Check outdir paths to be S3 buckets if running on AWSBatch + // related: https://github.com/nextflow-io/nextflow/issues/813 + if (!params.outdir.startsWith('s3:')) exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" + // Prevent trace files to be stored on S3 since S3 does not support rolling files. + if (workflow.tracedir.startsWith('s3:')) exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles." +} + +// Stage config files +ch_multiqc_config = Channel.fromPath(params.multiqc_config, checkIfExists: true) +ch_output_docs = Channel.fromPath("$baseDir/docs/output.md", checkIfExists: true) + /* * Create a channel for input read files */ @@ -127,81 +176,115 @@ if(params.readPaths){ .from(params.readPaths) .map { file(it) } .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } - .into { raw_reads_fastqc; raw_reads_trimgalore } + .into { raw_reads_fastqc; raw_reads_trimgalore; raw_reads_mirtrace } } else { Channel .fromPath( params.reads ) .ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}" } - .into { raw_reads_fastqc; raw_reads_trimgalore } + .into { raw_reads_fastqc; raw_reads_trimgalore; raw_reads_mirtrace } } // Header log info -log.info """======================================================= - ,--./,-. 
- ___ __ __ __ ___ /,-._.--~\' - |\\ | |__ __ / ` / \\ |__) |__ } { - | \\| | \\__, \\__/ | \\ |___ \\`-._,-`-, - `._,._,\' - -nf-core/smrnaseq : Small RNA-Seq Best Practice v${params.version} -=======================================================""" +log.info nfcoreHeader() def summary = [:] +if(workflow.revision) summary['Pipeline Release'] = workflow.revision summary['Run Name'] = custom_runName ?: workflow.runName summary['Reads'] = params.reads summary['Genome'] = params.genome -summary['Trim min length'] = params.length -summary["Trim 5' R1"] = params.clip_R1 -summary["Trim 3' R1"] = params.three_prime_clip_R1 +summary['Min Trimmed Length'] = params.min_length +summary["Trim 5' R1"] = clip_R1 +summary["Trim 3' R1"] = three_prime_clip_R1 summary['miRBase mature'] = params.mature summary['miRBase hairpin'] = params.hairpin -if(params.bt2index) summary['Bowtie2 Index'] = params.bt2index -if(params.gtf) summary['GTF Annotation'] = params.gtf +if(params.bt_index) summary['Bowtie Index for Ref'] = params.bt_index summary['Save Reference'] = params.saveReference ? 'Yes' : 'No' +summary['Protocol'] = params.protocol +summary['miRTrace species'] = params.mirtrace_species +summary["3' adapter"] = three_prime_adapter summary['Output dir'] = params.outdir +summary['Launch dir'] = workflow.launchDir summary['Working dir'] = workflow.workDir summary['Current home'] = "$HOME" summary['Current user'] = "$USER" summary['Current path'] = "$PWD" summary['Script dir'] = workflow.projectDir -summary['Config Profile'] = (workflow.profile == 'standard' ? 'UPPMAX' : workflow.profile) -if(params.project) summary['UPPMAX Project'] = params.project -if(params.email) summary['E-mail Address'] = params.email -log.info summary.collect { k,v -> "${k.padRight(15)}: $v" }.join("\n") -log.info "===========================================" - -// Check that Nextflow version is up to date enough -// try / throw / catch works for NF versions < 0.25 when this was implemented -try { - if( ! 
nextflow.version.matches(">= $params.nf_required_version") ){ - throw GroovyException('Nextflow version too old') - } -} catch (all) { - log.error "====================================================\n" + - " Nextflow version $params.nf_required_version required! You are running v$workflow.nextflow.version.\n" + - " Pipeline execution will continue, but things may break.\n" + - " Please run `nextflow self-update` to update Nextflow.\n" + - "============================================================" -} -// Show a big error message if we're running on the base config and an uppmax cluster -if( workflow.profile == 'standard'){ - if ( "hostname".execute().text.contains('.uppmax.uu.se') ) { - log.error "====================================================\n" + - " WARNING! You are running with the default 'standard'\n" + - " pipeline config profile, which runs on the head node\n" + - " and assumes all software is on the PATH.\n" + - " ALL JOBS ARE RUNNING LOCALLY and stuff will probably break.\n" + - " Please use `-profile uppmax` to run on UPPMAX clusters.\n" + - "============================================================" - } +summary['Config Profile'] = (workflow.profile == 'standard' ? 
'UPPMAX' : workflow.profile) +summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" +if(workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" +summary['Script dir'] = workflow.projectDir +summary['User'] = workflow.userName +if(workflow.profile == 'awsbatch'){ + summary['AWS Region'] = params.awsregion + summary['AWS Queue'] = params.awsqueue } +summary['Config Profile'] = workflow.profile +if(params.config_profile_description) summary['Config Description'] = params.config_profile_description +if(params.config_profile_contact) summary['Config Contact'] = params.config_profile_contact +if(params.config_profile_url) summary['Config URL'] = params.config_profile_url +if(params.email) { + summary['E-mail Address'] = params.email + summary['MultiQC maxsize'] = params.maxMultiqcEmailFileSize +} +log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n") +log.info "\033[2m----------------------------------------------------\033[0m" + +// Check the hostnames against configured profiles +checkHostname() + +def create_workflow_summary(summary) { + def yaml_file = workDir.resolve('workflow_summary_mqc.yaml') + yaml_file.text = """ + id: 'nf-core-smrnaseq-summary' + description: " - this information is collected when the pipeline is started." + section_name: 'nf-core/smrnaseq Workflow Summary' + section_href: 'https://github.com/nf-core/smrnaseq' + plot_type: 'html' + data: | +
+${summary.collect { k,v -> "
$k
${v ?: 'N/A'}
" }.join("\n")} +
+ """.stripIndent() + return yaml_file +} + +/* +* Parse software version numbers +*/ +process get_software_versions { + publishDir "${params.outdir}/pipeline_info", mode: 'copy', + saveAs: {filename -> + if (filename.indexOf(".csv") > 0) filename + else null + } + + output: + file 'software_versions_mqc.yaml' into software_versions_yaml + file "software_versions.csv" + + script: + """ + echo $workflow.manifest.version > v_pipeline.txt + echo $workflow.nextflow.version > v_nextflow.txt + echo \$(R --version 2>&1) > v_R.txt + fastqc --version > v_fastqc.txt + trim_galore --version > v_trim_galore.txt + bowtie --version > v_bowtie.txt + samtools --version > v_samtools.txt + htseq-count -h > v_htseq.txt + fasta_formatter -h > v_fastx.txt + mirtrace --version > v_mirtrace.txt + multiqc --version > v_multiqc.txt + scrape_software_versions.py > software_versions_mqc.yaml + """ +} /* * PREPROCESSING - Build Bowtie index for mature and hairpin */ -process makeBowtieIndex { - +process make_bowtie_index { + label 'process_medium' publishDir path: { params.saveReference ? "${params.outdir}/bowtie/reference" : params.outdir }, saveAs: { params.saveReference ? 
it : null }, mode: 'copy' @@ -212,28 +295,34 @@ process makeBowtieIndex { output: file 'mature_idx.*' into mature_index file 'hairpin_idx.*' into hairpin_index + file 'hairpin_idx.fa' into hairpin_mirtop script: """ - fasta_formatter -w 0 -i $mature -o mature_igenome.fa - fasta_nucleotide_changer -d -i mature_igenome.fa -o mature_idx.fa + seqkit grep -r --pattern \".*${params.mirtrace_species}-.*\" $mature > mature_sps.fa + seqkit seq --rna2dna mature_sps.fa > mature_igenome.fa + fasta_formatter -w 0 -i mature_igenome.fa -o mature_idx.fa + # fasta_nucleotide_changer -d -i mature_igenome.fa -o mature_idx.fa bowtie-build mature_idx.fa mature_idx - fasta_formatter -w 0 -i $hairpin -o hairpin_igenome.fa - fasta_nucleotide_changer -d -i hairpin_igenome.fa -o hairpin_idx.fa + + seqkit grep -r --pattern \".*${params.mirtrace_species}-.*\" $hairpin > hairpin_sps.fa + seqkit seq --rna2dna hairpin_sps.fa > hairpin_igenome.fa + # fasta_nucleotide_changer -d -i hairpin_igenome.fa -o hairpin_idx.fa + fasta_formatter -w 0 -i hairpin_igenome.fa -o hairpin_idx.fa bowtie-build hairpin_idx.fa hairpin_idx """ } - /* * STEP 1 - FastQC */ process fastqc { + label 'process_low' tag "$reads" publishDir "${params.outdir}/fastqc", mode: 'copy' when: - !params.skip_qc && !params.skip_fastqc + !params.skipQC && !params.skipFastqc input: file reads from raw_reads_fastqc @@ -252,6 +341,7 @@ process fastqc { * STEP 2 - Trim Galore! 
*/ process trim_galore { + label 'process_low' tag "$reads" publishDir "${params.outdir}/trim_galore", mode: 'copy' @@ -259,16 +349,17 @@ process trim_galore { file reads from raw_reads_trimgalore output: - file '*.gz' into trimmed_reads_bowtie, trimmed_reads_bowtie2, trimmed_reads_insertsize + file '*.gz' into trimmed_reads_bowtie, trimmed_reads_collapse, trimmed_reads_bowtie_ref, trimmed_reads_insertsize file '*trimming_report.txt' into trimgalore_results file "*_fastqc.{zip,html}" into trimgalore_fastqc_reports script: - tg_length = "--length ${params.length}" - c_r1 = params.clip_R1 > 0 ? "--clip_R1 ${params.clip_R1}" : '' - tpc_r1 = params.three_prime_clip_R1 > 0 ? "--three_prime_clip_R1 ${params.three_prime_clip_R1}" : '' + tg_length = "--length ${params.min_length}" + c_r1 = clip_R1 > 0 ? "--clip_R1 ${clip_R1}" : '' + tpc_r1 = three_prime_clip_R1 > 0 ? "--three_prime_clip_R1 ${three_prime_clip_R1}" : '' + tpa = (protocol == "qiaseq" | protocol == "cats") ? "--adapter ${three_prime_adapter}" : '--small_rna' """ - trim_galore --small_rna $tg_length $c_r1 $tpc_r1 --gzip $reads --fastqc + trim_galore --adapter ${three_prime_adapter} $tg_length $c_r1 $tpc_r1 --max_length 40 --gzip $reads --fastqc """ } @@ -276,8 +367,8 @@ process trim_galore { /* * STEP 2.1 - Insertsize */ - process insertsize { + label 'process_low' tag "$reads" publishDir "${params.outdir}/trim_galore/insertsize", mode: 'copy' @@ -294,11 +385,33 @@ process insertsize { """ } +/* + * STEP 2.2 - Collapse + */ +process collapse { + label 'process_medium' + tag "$reads" + + input: + file reads from trimmed_reads_collapse + + output: + file 'collapsed/*.fastq' into collapsed_fasta + + script: + prefix = reads.toString() - '_trimmed.fq.gz' + """ + seqcluster collapse -f $reads -m 1 --min_size 15 -o collapsed + mv collapsed/${prefix}_trimmed_trimmed.fastq collapsed/${prefix}.fastq + """ +} + /* * STEP 3 - Bowtie miRBase mature miRNA */ process bowtie_miRBase_mature { + label 'process_medium' tag 
"$reads" publishDir "${params.outdir}/bowtie/miRBase_mature", mode: 'copy', pattern: '*.mature_unmapped.fq.gz' @@ -313,20 +426,20 @@ process bowtie_miRBase_mature { script: index_base = index.toString().tokenize(' ')[0].tokenize('.')[0] prefix = reads.toString() - ~/(.R1)?(_R1)?(_trimmed)?(\.fq)?(\.fastq)?(\.gz)?$/ + seq_center = params.seq_center ? "--sam-RG ID:${prefix} --sam-RG 'CN:${params.seq_center}'" : '' """ bowtie \\ $index_base \\ -q <(zcat $reads) \\ - -p 2 \\ + -p ${task.cpus} \\ -t \\ - -k 1 \\ - -m 1 \\ + -k 50 \\ --best \\ --strata \\ -e 99999 \\ --chunkmbs 2048 \\ --un ${prefix}.mature_unmapped.fq \\ - -S \\ + -S $seq_center \\ | samtools view -bS - > ${prefix}.mature.bam gzip ${prefix}.mature_unmapped.fq @@ -337,6 +450,7 @@ process bowtie_miRBase_mature { * STEP 4 - Bowtie against miRBase hairpin */ process bowtie_miRBase_hairpin { + label 'process_medium' tag "$reads" publishDir "${params.outdir}/bowtie/miRBase_hairpin", mode: 'copy', pattern: '*.hairpin_unmapped.fq.gz' @@ -345,42 +459,77 @@ process bowtie_miRBase_hairpin { file index from hairpin_index output: - file '*.hairpin.bam' into miRBase_hairpin_bam + file '*.hairpin.bam' into miRBase_hairpin_bam, miRBase_hairpin_bam_mirtop file '*.hairpin_unmapped.fq.gz' into hairpin_unmapped_reads script: index_base = index.toString().tokenize(' ')[0].tokenize('.')[0] prefix = reads.toString() - '.mature_unmapped.fq.gz' + seq_center = params.seq_center ? 
"--sam-RG ID:${prefix} --sam-RG 'CN:${params.seq_center}'" : '' """ bowtie \\ $index_base \\ - -p 2 \\ + -p ${task.cpus} \\ -t \\ - -k 1 \\ - -m 1 \\ + -a \\ --best \\ --strata \\ -e 99999 \\ --chunkmbs 2048 \\ -q <(zcat $reads) \\ --un ${prefix}.hairpin_unmapped.fq \\ - -S \\ + -S $seq_center \\ | samtools view -bS - > ${prefix}.hairpin.bam gzip ${prefix}.hairpin_unmapped.fq """ } +/* + * STEP 4.1 - Bowtie against miRBase hairpin with collapsed reads + */ +process bowtie_miRBase_hairpin_collapsed { + label 'process_medium' + tag "$reads" + + input: + file reads from collapsed_fasta + file index from hairpin_index + + output: + file '*.bam' into miRBase_hairpin_collapse_bam + + script: + index_base = index.toString().tokenize(' ')[0].tokenize('.')[0] + prefix = reads.baseName + seq_center = params.seq_center ? "--sam-RG ID:${prefix} --sam-RG 'CN:${params.seq_center}'" : '' + """ + bowtie \\ + $index_base \\ + -p ${task.cpus} \\ + -t \\ + -k 50 \\ + -a \\ + --best \\ + --strata \\ + -e 99999 \\ + --chunkmbs 2048 \\ + -q <(cat $reads) \\ + -S $seq_center \\ + | samtools view -bS - > ${prefix}.bam + """ +} /* - * STEP 5 - Post-alignment processing for miRBase mature and hairpin + * STEP 5.1 - Post-alignment processing for miRBase mature and hairpin */ def wrap_mature_and_hairpin = { file -> if ( file.contains("mature") ) return "miRBase_mature/$file" if ( file.contains("hairpin") ) return "miRBase_hairpin/$file" } -process miRBasePostAlignment { +process mirna_post_alignment { + label 'process_medium' tag "$input" publishDir "${params.outdir}/bowtie", mode: 'copy', saveAs: wrap_mature_and_hairpin @@ -388,7 +537,8 @@ process miRBasePostAlignment { file input from miRBase_mature_bam.mix(miRBase_hairpin_bam) output: - file "${input.baseName}.count" into miRBase_counts + file "${input.baseName}.stats" into miRBase_counts + file "*.{flagstat,idxstats,stats}" into ch_sort_bam_flagstat_mqc file "${input.baseName}.sorted.bam" into miRBase_bam file 
"${input.baseName}.sorted.bam.bai" into miRBase_bai @@ -396,15 +546,18 @@ process miRBasePostAlignment { """ samtools sort ${input.baseName}.bam -o ${input.baseName}.sorted.bam samtools index ${input.baseName}.sorted.bam - samtools idxstats ${input.baseName}.sorted.bam > ${input.baseName}.count + samtools idxstats ${input.baseName}.sorted.bam > ${input.baseName}.stats + samtools flagstat ${input.baseName}.sorted.bam > ${input.baseName}.sorted.bam.flagstat + samtools stats ${input.baseName}.sorted.bam > ${input.baseName}.sorted.bam.stats """ } - /* - * STEP 6 - edgeR miRBase feature counts processing + * STEP 5.2 - edgeR miRBase feature counts processing */ -process edgeR_miRBase { +process edgeR_mirna { + label 'process_low' + label 'process_ignore' publishDir "${params.outdir}/edgeR", mode: 'copy', saveAs: wrap_mature_and_hairpin input: @@ -419,114 +572,164 @@ process edgeR_miRBase { """ } +/* + * STEP 5.3 - miRNA format conversion to mirGFF3 + */ +process mirtop_bam_hairpin { + label 'process_medium' + tag "$input" + publishDir "${params.outdir}", mode: 'copy' + + when: + mirna_gtf + + input: + file input from miRBase_hairpin_collapse_bam.collect() + file hairpin from hairpin_mirtop + file gtf from mirna_gtf + + output: + file "mirtop/mirtop.gff" into mirtop_gff + file "mirtop/mirtop.tsv" into mirtop_tsv + file "mirtop/mirna.tsv" into mirna_tsv + file "mirtop/mirtop_rawData.tsv" into isomir_tsv + + script: + """ + mirtop gff --hairpin $hairpin --gtf $gtf -o mirtop --sps $params.mirtrace_species $input + mirtop counts --hairpin $hairpin --gtf $gtf -o mirtop --sps $params.mirtrace_species --add-extra --gff mirtop/mirtop.gff + mirtop export --format isomir --hairpin $hairpin --gtf $gtf --sps $params.mirtrace_species -o mirtop mirtop/mirtop.gff + collapse_mirtop.r mirtop/mirtop.tsv + """ +} + /* - * STEP 7.1 and 7.2 IF A GENOME SPECIFIED ONLY! + * STEP 6.1 and 6.2 IF A GENOME SPECIFIED ONLY! 
*/ -if( params.gtf && params.bt2index) { +if( params.bt_index ) { /* - * STEP 7.1 - Bowtie 2 against reference genome + * STEP 6.1 - Bowtie 1 against reference genome */ - process bowtie2 { + process bowtie_ref { + label 'process_high' tag "$reads" - publishDir "${params.outdir}/bowtie2", mode: 'copy' + publishDir "${params.outdir}/bowtie_ref", mode: 'copy' input: - file reads from trimmed_reads_bowtie2 - file bt2_indices + file reads from trimmed_reads_bowtie_ref + file indices from bt_indices.collect() output: - file '*.bowtie2.bam' into bowtie2_bam, bowtie2_bam_for_unmapped + file '*.genome.bam' into bowtie_bam, bowtie_bam_for_unmapped script: - index_base = bt2_indices[0].toString() - ~/\.\d+\.bt2/ + index_base = indices[0].toString() - ~/.rev.\d.ebwt?/ - ~/.\d.ebwt?/ prefix = reads.toString() - ~/(.R1)?(_R1)?(_trimmed)?(\.fq)?(\.fastq)?(\.gz)?$/ + seq_center = params.seq_center ? "--sam-RG ID:${prefix} --sam-RG 'CN:${params.seq_center}'" : '' """ - bowtie2 \\ - -x $index_base \\ - -U $reads \\ - -k 10 \\ - --very-sensitive \\ - -p 8 \\ + bowtie \\ + $index_base \\ + -q <(zcat $reads) \\ + -p ${task.cpus} \\ -t \\ - | samtools view -bT $index_base - > ${prefix}.bowtie2.bam + -k 50 \\ + --best \\ + --strata \\ + -e 99999 \\ + --chunkmbs 2048 \\ + -S $seq_center \\ + | samtools view -bS - > ${prefix}.genome.bam """ } - /* - * STEP 7.2 - Bowtie 2 Statistics about unmapped reads against ref genome - */ + process genome_post_alignment { + label 'process_low' + tag "$input" + publishDir "${params.outdir}/bowtie_ref", mode: 'copy' - process bowtie2_unmapped { - tag "${input_files[0].baseName}" - publishDir "${params.outdir}/bowtie2/unmapped", mode: 'copy' - - input: - file input_files from bowtie2_bam_for_unmapped.toSortedList() + input: + file input from bowtie_bam output: - file 'unmapped_refgenome.txt' into bowtie2_unmapped + file "*.{flagstat,idxstats,stats}" into ch_genome_bam_flagstat_mqc script: """ - for i in $input_files - do - printf "\${i}\t" - samtools 
view -c -f0x4 \${i} - done > unmapped_refgenome.txt + samtools sort ${input.baseName}.bam -o ${input.baseName}.sorted.bam + samtools index ${input.baseName}.sorted.bam + samtools idxstats ${input.baseName}.sorted.bam > ${input.baseName}.stats + samtools flagstat ${input.baseName}.sorted.bam > ${input.baseName}.sorted.bam.flagstat + samtools stats ${input.baseName}.sorted.bam > ${input.baseName}.sorted.bam.stats """ } - /* - * STEP 7.3 - NGI-Visualizations of Bowtie 2 alignment statistics + * STEP 6.2 - Statistics about unmapped reads against ref genome */ - process ngi_visualizations { - tag "$bowtie2_bam" - publishDir "${params.outdir}/bowtie2/ngi_visualizations", mode: 'copy' + + process bowtie_unmapped { + label 'process_ignore' + label 'process_medium' + tag "${input_files[0].baseName}" + publishDir "${params.outdir}/bowtie_ref/unmapped", mode: 'copy' input: - file gtf from gtf - file bowtie2_bam + file input_files from bowtie_bam_for_unmapped.toSortedList() output: - file '*.{png,pdf}' into bowtie2_ngi_visualizations + file 'unmapped_refgenome.txt' into bowtie_unmapped script: - // Note! ngi_visualizations needs to be installed! - // See https://github.com/NationalGenomicsInfrastructure/ngi_visualizations """ - #!/usr/bin/env python - from ngi_visualizations.biotypes import count_biotypes - count_biotypes.main('$gtf','$bowtie2_bam') + for i in $input_files + do + printf "\${i}\t" + samtools view -c -f0x4 \${i} + done > unmapped_refgenome.txt """ } +} else{ + ch_genome_bam_flagstat_mqc = Channel.empty() } + /* - * Parse software version numbers + * STEP 7 - miRTrace */ -process get_software_versions { - - output: - file 'software_versions_mqc.yaml' into software_versions_yaml +process mirtrace { + tag "$reads" + publishDir "${params.outdir}/miRTrace", mode: 'copy' + + input: + file reads from raw_reads_mirtrace.collect() + + output: + file '*mirtrace' into mirtrace_results + + script: + primer = (protocol=="cats") ? 
" " : " --adapter $three_prime_adapter " + """ + for i in $reads + do + path=\$(realpath \${i}) + prefix=\$(echo \${i} | sed -e "s/.gz//" -e "s/.fastq//" -e "s/.fq//" -e "s/_val_1//" -e "s/_trimmed//" -e "s/_R1//" -e "s/.R1//") + echo \$path","\$prefix + done > mirtrace_config + + mirtrace qc \\ + --species $params.mirtrace_species \\ + $primer \\ + --protocol $protocol \\ + --config mirtrace_config \\ + --write-fasta \\ + --output-dir mirtrace \\ + --force + """ + } - script: - """ - echo "$params.version" > v_nfcore_smrnaseq.txt - echo "$workflow.nextflow.version" > v_nextflow.txt - fastqc --version > v_fastqc.txt - trim_galore --version > v_trim_galore.txt - bowtie --version > v_bowtie.txt - bowtie2 --version > v_bowtie2.txt - samtools --version > v_samtools.txt - fasta_formatter -h > v_fastx.txt - multiqc --version > v_multiqc.txt - scrape_software_versions.py > software_versions_mqc.yaml - """ -} /* * STEP 8 - MultiQC @@ -535,25 +738,52 @@ process multiqc { publishDir "${params.outdir}/MultiQC", mode: 'copy' when: - !params.skip_qc && !params.skip_multiqc + !params.skipQC && !params.skipMultiqc input: - file ('fastqc/*') from fastqc_results.toList() - file ('trim_galore/*') from trimgalore_results.toList() - file ('software_versions/*') from software_versions_yaml.toList() + file multiqc_config from ch_multiqc_config + file ('fastqc/*') from fastqc_results.collect() + file ('trim_galore/*') from trimgalore_results.collect() + file ('mirtrace/*') from mirtrace_results.collect() + file ('samtools/*') from ch_sort_bam_flagstat_mqc.collect() + file ('samtools_genome/*') from ch_genome_bam_flagstat_mqc.collect().ifEmpty([]) + file ('software_versions/*') from software_versions_yaml.collect() + file workflow_summary from create_workflow_summary(summary) output: - file '*multiqc_report.html' into multiqc_html - file '*multiqc_data' into multiqc_data + file "*multiqc_report.html" into multiqc_report + file "*_data" script: rtitle = custom_runName ? 
"--title \"$custom_runName\"" : '' rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' """ - multiqc -f $rtitle $rfilename --config $multiqc_config . + multiqc . -f $rtitle $rfilename --config $multiqc_config -m bowtie1 -m samtools -m cutadapt -m fastqc -m custom_content """ } + +/* + * STEP 9 - Output Description HTML + */ +process output_documentation { + publishDir "${params.outdir}/pipeline_info", mode: 'copy' + + input: + file output_docs from ch_output_docs + + output: + file "results_description.html" + + script: + """ + markdown_to_html.r $output_docs results_description.html + """ +} + + + + /* * Completion e-mail notification */ @@ -565,7 +795,7 @@ workflow.onComplete { subject = "[nf-core/smrnaseq] FAILED: $workflow.runName" } def email_fields = [:] - email_fields['version'] = params.version + email_fields['version'] = workflow.manifest.version email_fields['runName'] = custom_runName ?: workflow.runName email_fields['success'] = workflow.success email_fields['dateComplete'] = workflow.complete @@ -578,15 +808,29 @@ workflow.onComplete { email_fields['summary'] = summary email_fields['summary']['Date Started'] = workflow.start email_fields['summary']['Date Completed'] = workflow.complete - email_fields['summary']['Nextflow Version'] = workflow.nextflow.version - email_fields['summary']['Nextflow Build'] = workflow.nextflow.build - email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp email_fields['summary']['Pipeline script file path'] = workflow.scriptFile email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId if(workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository if(workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId if(workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision if(workflow.container) 
email_fields['summary']['Docker image'] = workflow.container + email_fields['summary']['Nextflow Version'] = workflow.nextflow.version + email_fields['summary']['Nextflow Build'] = workflow.nextflow.build + email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp + + // On success try attach the multiqc report + def mqc_report = null + try { + if (workflow.success) { + mqc_report = multiqc_report.getVal() + if (mqc_report.getClass() == ArrayList){ + log.warn "[nf-core/smrnaseq] Found multiple reports from process 'multiqc', will use only one" + mqc_report = mqc_report[0] + } + } + } catch (all) { + log.warn "[nf-core/smrnaseq] Could not attach MultiQC report to summary email" + } // Render the TXT template def engine = new groovy.text.GStringTemplateEngine() @@ -600,7 +844,7 @@ workflow.onComplete { def email_html = html_template.toString() // Render the sendmail template - def smail_fields = [ email: params.email, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$baseDir" ] + def smail_fields = [ email: params.email, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$baseDir", mqcFile: mqc_report, mqcMaxSize: params.maxMultiqcEmailFileSize.toBytes() ] def sf = new File("$baseDir/assets/sendmail_template.txt") def sendmail_template = engine.createTemplate(sf).make(smail_fields) def sendmail_html = sendmail_template.toString() @@ -621,14 +865,10 @@ workflow.onComplete { // Switch the embedded MIME images with base64 encoded src smrnaseqlogo = new File("$baseDir/assets/smrnaseq_logo.png").bytes.encodeBase64().toString() - scilifelablogo = new File("$baseDir/assets/SciLifeLab_logo.png").bytes.encodeBase64().toString() - ngilogo = new File("$baseDir/assets/NGI_logo.png").bytes.encodeBase64().toString() email_html = email_html.replaceAll(~/cid:smrnaseqlogo/, "data:image/png;base64,$smrnaseqlogo") - email_html = email_html.replaceAll(~/cid:scilifelablogo/, "data:image/png;base64,$scilifelablogo") 
- email_html = email_html.replaceAll(~/cid:ngilogo/, "data:image/png;base64,$ngilogo") // Write summary e-mail HTML to a file - def output_d = new File( "${params.outdir}/Documentation/" ) + def output_d = new File( "${params.outdir}/pipeline_info/" ) if( !output_d.exists() ) { output_d.mkdirs() } @@ -637,6 +877,66 @@ workflow.onComplete { def output_tf = new File( output_d, "pipeline_report.txt" ) output_tf.withWriter { w -> w << email_txt } - log.info "[nf-core/smrnaseq] Pipeline Complete" + c_reset = params.monochrome_logs ? '' : "\033[0m"; + c_purple = params.monochrome_logs ? '' : "\033[0;35m"; + c_green = params.monochrome_logs ? '' : "\033[0;32m"; + c_red = params.monochrome_logs ? '' : "\033[0;31m"; + + if (workflow.stats.ignoredCount > 0 && workflow.success) { + log.info "${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}" + log.info "${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}" + log.info "${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}" + } + + if(workflow.success){ + log.info "${c_purple}[nf-core/smrnaseq]${c_green} Pipeline completed successfully${c_reset}" + } else { + checkHostname() + log.info "${c_purple}[nf-core/smrnaseq]${c_red} Pipeline completed with errors${c_reset}" + } + +} + +def nfcoreHeader(){ + // Log colors ANSI codes + c_reset = params.monochrome_logs ? '' : "\033[0m"; + c_dim = params.monochrome_logs ? '' : "\033[2m"; + c_black = params.monochrome_logs ? '' : "\033[0;30m"; + c_green = params.monochrome_logs ? '' : "\033[0;32m"; + c_yellow = params.monochrome_logs ? '' : "\033[0;33m"; + c_blue = params.monochrome_logs ? '' : "\033[0;34m"; + c_purple = params.monochrome_logs ? '' : "\033[0;35m"; + c_cyan = params.monochrome_logs ? '' : "\033[0;36m"; + c_white = params.monochrome_logs ? 
'' : "\033[0;37m"; + return """ ${c_dim}----------------------------------------------------${c_reset} + ${c_green},--.${c_black}/${c_green},-.${c_reset} + ${c_blue} ___ __ __ __ ___ ${c_green}/,-._.--~\'${c_reset} + ${c_blue} |\\ | |__ __ / ` / \\ |__) |__ ${c_yellow}} {${c_reset} + ${c_blue} | \\| | \\__, \\__/ | \\ |___ ${c_green}\\`-._,-`-,${c_reset} + ${c_green}`._,._,\'${c_reset} + ${c_purple} nf-core/smrnaseq v${workflow.manifest.version}${c_reset} + ${c_dim}----------------------------------------------------${c_reset} + """.stripIndent() +} + +def checkHostname(){ + def c_reset = params.monochrome_logs ? '' : "\033[0m" + def c_white = params.monochrome_logs ? '' : "\033[0;37m" + def c_red = params.monochrome_logs ? '' : "\033[1;91m" + def c_yellow_bold = params.monochrome_logs ? '' : "\033[1;93m" + if(params.hostnames){ + def hostname = "hostname".execute().text.trim() + params.hostnames.each { prof, hnames -> + hnames.each { hname -> + if(hostname.contains(hname) && !workflow.profile.contains(prof)){ + log.error "====================================================\n" + + " ${c_red}WARNING!${c_reset} You are running with `-profile $workflow.profile`\n" + + " but your machine hostname is ${c_white}'$hostname'${c_reset}\n" + + " ${c_yellow_bold}It's highly recommended that you use `-profile $prof${c_reset}`\n" + + "============================================================" + } + } + } + } } diff --git a/nextflow.config b/nextflow.config index 85c2cb9b..fc8f88b7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -3,92 +3,105 @@ * nf-core/smrnaseq Nextflow config file * ------------------------------------------------- * Default config options for all environments. - * Cluster-specific config options should be saved - * in the conf folder and imported under a profile - * name here. 
*/ +// Global default params, used in configs params { - version = '0.2dev' // Pipeline version - nf_required_version = '0.30.2' // Minimum version of nextflow required - container = 'nfcore/smrnaseq:latest' // Container slug. Stable releases should specify release tag!! + // Workflow flags + reads = "data/*.fastq.gz" + outdir = './results' + protocol = 'illumina' + genome = false + singleEnd = true + clip_R1 = 0 + three_prime_clip_R1 = 0 + three_prime_adapter = "TGGAATTCTCGGGTGCCAAGG" + min_length = 17 + skipQC = false + skipFastqc = false + skipMultiqc = false + saveReference = true + seq_center = "" - // Pipeline options - params.name = false - params.project = false - params.genome = false - params.gtf = params.genome ? params.genomes[ params.genome ].gtf ?: false : false - params.bt2index = params.genome ? params.genomes[ params.genome ].bowtie2 ?: false : false - params.bt2indices = null - params.mature = params.genome ? params.genomes[ params.genome ].mature ?: false : false - params.hairpin = params.genome ? 
params.genomes[ params.genome ].hairpin ?: false : false - params.saveReference = false - params.reads = "data/*.fastq.gz" - params.readPaths = null - params.outdir = './results' - multiqc_config = "$baseDir/conf/multiqc_config.yaml" - params.email = false - params.plaintext_email = false - params.skip_qc = false - params.skip_fastqc = false - params.skip_multiqc = false - // Custom trimming options - params.length = 18 - params.clip_R1 = 0 - params.three_prime_clip_R1 = 0 + // Boilerplate options + name = false + multiqc_config = "$baseDir/assets/multiqc_config.yaml" + email = false + maxMultiqcEmailFileSize = 25.MB + plaintext_email = false + monochrome_logs = false + help = false + igenomes_base = "./iGenomes" + tracedir = "${params.outdir}/pipeline_info" + awsqueue = false + awsregion = 'eu-west-1' + igenomesIgnore = false + custom_config_version = 'master' + custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" + hostnames = false + config_profile_description = false + config_profile_contact = false + config_profile_url = false } -profiles { +// Container slug. Stable releases should specify release tag! 
+// Developmental code should specify :dev +process.container = 'nfcore/smrnaseq:1.0.0' - standard { - includeConfig 'conf/base.config' - includeConfig 'conf/igenomes.config' - } +// Load base.config by default for all pipelines +includeConfig 'conf/base.config' + +// Load nf-core custom profiles from different Institutions +try { + includeConfig "${params.custom_config_base}/nfcore_custom.config" +} catch (Exception e) { + System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") +} + +profiles { + awsbatch { includeConfig 'conf/awsbatch.config' } conda { process.conda = "$baseDir/environment.yml" } + debug { process.beforeScript = 'echo $HOSTNAME' } docker { docker.enabled = true } singularity { singularity.enabled = true } - uppmax { - includeConfig 'conf/base.config' - includeConfig 'conf/uppmax.config' - includeConfig 'conf/igenomes.config' - } - test { - includeConfig 'conf/base.config' - includeConfig 'conf/test.config' - } - none { - // Don't load any config (for use with custom home configs) - } - + test { includeConfig 'conf/test.config' } } +// Load igenomes.config if required +if(!params.igenomesIgnore){ + includeConfig 'conf/igenomes.config' +} // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] timeline { enabled = true - file = "${params.outdir}/pipeline_info/smRNAseq_timeline.html" + file = "${params.tracedir}/execution_timeline.html" } report { enabled = true - file = "${params.outdir}/pipeline_info/smRNAseq_report.html" + file = "${params.tracedir}/execution_report.html" } trace { enabled = true - file = "${params.outdir}/pipeline_info/smRNAseq_trace.txt" + file = "${params.tracedir}/execution_trace.txt" } dag { enabled = true - file = "${params.outdir}/pipeline_info/smRNAseq_dag.svg" + file = "${params.tracedir}/pipeline_dag.svg" } manifest { + name = 'nf-core/smrnaseq' + author = 'Phil Ewels , Chuan Wang , Rickard Hammarén , 
Lorena Pantano ' homePage = 'https://github.com/nf-core/smrnaseq' - description = 'Nextflow small RNA sequencing analysis pipeline.' + description = 'Small RNA-Seq Best Practice Analysis Pipeline.' mainScript = 'main.nf' + nextflowVersion = '>=0.32.0' + version = '1.0.0' } // Function to ensure that resource requirements don't go beyond