diff --git a/README.md b/README.md
index c506cead..eb7d02df 100644
--- a/README.md
+++ b/README.md
@@ -23,12 +23,31 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool
-1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
-2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
-3. Trim reads ([`FASTP`](https://github.com/OpenGene/fastp))
-4. Align reads to the genome ([`STAR`](https://github.com/alexdobin/STAR))
-5. Alignment QC ([`Picard CollectRnaSeqMetrics`](https://broadinstitute.github.io/picard/))
-6. Transcript quantification ([`Salmon`](https://salmon.readthedocs.io/en/latest/))
+1. Trim reads ([`FASTP`](https://github.com/OpenGene/fastp))
+2. Transcript quantification ([`Salmon`](https://salmon.readthedocs.io/en/latest/))
+3. Align reads to the genome ([`STAR`](https://github.com/alexdobin/STAR))
+4. Output junction tracks
+5. Output bigwig ([`UCSC wigToBigWig`](https://genome.ucsc.edu/goldenPath/help/bigWig.html))
+6. Choice to subsample overrepresented regions ([`Samtools`](https://github.com/samtools/samtools/))
+7. Choice to downsample number of reads ([`Samtools`](https://github.com/samtools/samtools/))
+8. Detection of aberrant expression ([`DROP`](https://github.com/gagneurlab/drop/))
+9. Detection of aberrant splicing ([`DROP`](https://github.com/gagneurlab/drop/))
+10. Filter aberrant expression and aberrant splicing results
+11. Guided transcript assembly ([`StringTie`](https://ccb.jhu.edu/software/stringtie/))
+12. Filtering results of guided transcript assembly ([`GffCompare`](https://github.com/gpertea/gffcompare))
+13. To call SNVs, either path a or path b can be followed. Path a runs by default.
+    a. Call SNVs
+        1. ([`BCFtools Mpileups`](https://samtools.github.io/bcftools/bcftools.html#mpileup))
+    b. Call SNVs
+        1. Split cigar reads ([`SplitN Cigar Reads`](https://gatk.broadinstitute.org/hc/en-us/articles/360036858811-SplitNCigarReads))
+        2. Haplotype caller ([`Haplotype Caller`](https://gatk.broadinstitute.org/hc/en-us/articles/360037225632-HaplotypeCaller))
+        3. Variant filtration ([`Variant Filtration`](https://gatk.broadinstitute.org/hc/en-us/articles/360037434691-VariantFiltration))
+        4. BCFtools statistics ([`BCFtools stats`](https://samtools.github.io/bcftools/bcftools.html#stats))
+14. Allele Specific Read Counter ([`ASEReadCounter`](https://gatk.broadinstitute.org/hc/en-us/articles/360037428291-ASEReadCounter))
+15. Assess allelic imbalance ([`BootstrapAnn`](https://github.com/J35P312/BootstrapAnn#bootstrapann))
+16. Annotation ([`VEP`](https://github.com/Ensembl/ensembl-vep))
+17. Alignment QC ([`Picard CollectRnaSeqMetrics`](https://broadinstitute.github.io/picard/))
+18. Present QCs ([`MultiQC`](http://multiqc.info/))
## Usage
@@ -74,11 +93,11 @@ For more details about the output files and reports, please refer to the [output
## Credits
-tomte was originally written by Clinical Genomics Stockholm.
+genomic-medicine-sweden/tomte was written by Clinical Genomics Stockholm, Sweden, with major contributions from [Lucía Peña-Pérez](https://github.com/Lucpen), [Anders Jemt](https://github.com/jemten), and [Jesper Eisfeldt](https://github.com/J35P312).
-We thank the following people for their extensive assistance in the development of this pipeline:
+Additional contributors were [Ramprasad Neethiraj](https://github.com/ramprasadn), [Esmee ten Berk de Boer](https://github.com/Esmeetbdb), [Vadym Ivanchuk](https://github.com/ivadym), and [Mei Wu](https://github.com/projectoriented).
-
+We thank the nf-core community for their extensive assistance in the development of this pipeline.
## Contributions and Support
@@ -88,10 +107,7 @@ For further information or help, don't hesitate to get in touch by opening an [i
## Citations
-
-
-
-
+If you use genomic-medicine-sweden/tomte for your analysis, please cite it using the following doi: [XX.XXXX/zenodo.XXXXXXX](https://doi.org/XX.XXXX/zenodo.XXXXXXX)
An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.
diff --git a/bin/drop_filter_results.py b/bin/drop_filter_results.py
index 02121a1a..64e4b95b 100755
--- a/bin/drop_filter_results.py
+++ b/bin/drop_filter_results.py
@@ -76,6 +76,9 @@ def filter_outrider_results(
df_family_annotated_aberrant_expression_top_hits = annotate_with_hgnc(
df_family_aberrant_expression_top_hits, out_drop_gene_name
)
+ df_family_annotated_aberrant_expression_top_hits.to_csv(
+ "OUTRIDER_provided_samples_top_hits.tsv", sep="\t", index=False, header=True
+ )
filter_by_gene_panel(df_family_annotated_aberrant_expression_top_hits, gene_panel, "OUTRIDER")
diff --git a/conf/modules.config b/conf/modules.config
index d664519e..10d5b68e 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -318,7 +318,7 @@ process {
process {
withName: '.*ANALYSE_TRANSCRIPTS:DROP_SAMPLE_ANNOT' {
- ext.when = {params.run_drop_ae|params.run_drop_as}
+ ext.when = {params.run_drop_ae_switch|params.run_drop_as_switch}
publishDir = [
path: { "${params.outdir}/analyse_transcripts/drop" },
mode: params.publish_dir_mode,
@@ -327,7 +327,7 @@ process {
}
withName: '.*ANALYSE_TRANSCRIPTS:DROP_CONFIG_RUN_AE' {
- ext.when = {params.run_drop_ae}
+ ext.when = {params.run_drop_ae_switch}
publishDir = [
path: { "${params.outdir}/analyse_transcripts/drop/AE" },
mode: params.publish_dir_mode,
@@ -336,7 +336,7 @@ process {
}
withName: '.*ANALYSE_TRANSCRIPTS:DROP_CONFIG_RUN_AS' {
- ext.when = {params.run_drop_as}
+ ext.when = {params.run_drop_as_switch}
publishDir = [
path: { "${params.outdir}/analyse_transcripts/drop/AS" },
mode: params.publish_dir_mode,
@@ -345,7 +345,7 @@ process {
}
withName: '.*ANALYSE_TRANSCRIPTS:DROP_FILTER_RESULTS' {
- ext.when = {params.run_drop_ae | params.run_drop_as}
+ ext.when = {params.run_drop_ae_switch|params.run_drop_as_switch}
publishDir = [
path: { "${params.outdir}/analyse_transcripts/drop" },
mode: params.publish_dir_mode,
diff --git a/docs/README.md b/docs/README.md
index 09864d9d..9d76d9e1 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -6,7 +6,5 @@ The tomte documentation is split into the following pages:
- An overview of how the pipeline works, how to run it and a description of all of the different command-line flags.
- [Output](output.md)
- An overview of the different results produced by the pipeline and how to interpret them.
-- [Parameters](parameters.md)
- - An overview of the different pipeline parameters.
You can find a lot more documentation about installing, configuring and running nf-core and nf-core based pipelines on the website: [https://nf-co.re](https://nf-co.re)
diff --git a/docs/images/tomte_pipeline_metromap.png b/docs/images/tomte_pipeline_metromap.png
index b9772928..b114f831 100644
Binary files a/docs/images/tomte_pipeline_metromap.png and b/docs/images/tomte_pipeline_metromap.png differ
diff --git a/docs/output.md b/docs/output.md
index 2ba3e815..589cb62b 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -6,58 +6,221 @@ This document describes the output produced by the pipeline. Most of the plots a
The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.
-
-
## Pipeline overview
The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
-- [FastQC](#fastqc) - Raw read QC
-- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
-- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
-- [FASTP](#FASTP) - Trim reads
-- [STAR](#STAR) - Align reads to the genome
+- [`Trimming`](#trimming)
+ - [`FASTP`](#fastp) trims reads
+- [`Transcript quantification`](#transcript-quantification)
+ - [`Salmon`](#salmon) quantifies transcripts
+- [`Alignment`](#alignment)
+ - [`STAR`](#star) aligns reads to the genome
+- [`Tracks`](#tracks)
+ - [`Tracks`](#tracks-1) outputs tracks
+- [`Transcript analysis`](#transcript-analysis)
+ - [`DROP`](#drop) aberrant expression and aberrant splicing discovery
+ - [`StringTie`](#stringtie) guided transcript assembly
+ - [`GffCompare`](#gffcompare) annotation of guided transcript assembly
+- [`Variant Calling`](#variant-calling)
+ - [`BCFtools Mpileups`](#mpileups) single nucleotide variation calling
+ - [`GATK best practices SNV Calling`](#gatk-best-practices-snv-calling)
+- [`Allele specific variant Calling`](#allele-specific-variant-calling)
+ - [`ASEReadCounter`](#asereadcounter) allele Specific Read Counter
+ - [`BootstrapAnn`](#bootstrapann) assesses allelic imbalance
+- [`Variant annotation`](#variant-annotation)
+ - [`VEP`](#vep) annotation
+- [`Pipeline information and QCs`](#pipeline-information-and-qcs)
+ - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
+ - [`Picard CollectRnaSeqMetrics`](#picard-collectrnaseqmetrics) alignment QC
+ - [`MultiQC`](#multiqc) presents QCs
+
+### Trimming
+
+#### FASTP
-### FastQC
+[FASTP](https://github.com/OpenGene/fastp) is a fastq preprocessing tool that gives general quality metrics about your sequenced reads and trims adapters from them. For further reading and documentation see the [FASTP documentation](https://github.com/OpenGene/fastp).
Output files
-- `fastqc/`
- - `*_fastqc.html`: FastQC report containing quality metrics.
- - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images.
+- `trimming/`
+ - `*.fastp.html`: a report consisting of a standalone HTML file that can be viewed in your web browser.
+ - `*.fastp.log`: run log.
+ - `*.fastp.json`: a report containing the same information as the html as a json file.
+ - `*.fastp.fastq.gz`: gzip compressed trimmed reads.
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
+### Transcript quantification
-![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png)
+#### Salmon
-![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png)
+[`Salmon`](https://salmon.readthedocs.io/en/latest/) quantifies reads.
-![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png)
+
+Output files
+
+- `alignment/sample`
+ - `quant.sf`: quantification file.
+ - `quant.genes.sf`: quantification file per gene.
+ - `logs/salmon_quant.log`: log file.
+ - `cmd_info.json`: main command line parameters with which Salmon was run.
+
+
-:::note
-The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality.
-:::
+### Alignment
-### MultiQC
+#### STAR
+
+[`STAR`](https://github.com/alexdobin/STAR) aligns reads to the genome reference. For further reading and documentation see the [STAR manual](https://physiology.med.cornell.edu/faculty/skrabanek/lab/angsd/lecture_notes/STARmanual.pdf).
Output files
-- `multiqc/`
- - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser.
- - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline.
- - `multiqc_plots/`: directory containing static images from the report in various formats.
+- `alignment/`
+ - `*.SJ.out.tab`: the high confidence collapsed junctions.
+ - `*.ReadsPerGene.out.tab`: read count per gene.
+ - `*.Log.progress.out`: run progress statistics report updated every minute.
+ - `*.Log.out`: log file containing run details.
+ - `*.Log.final.out`: a summary of the mapping statistics. It is calculated individually per read and then averaged.
+ - `*.Aligned.out.bam`: Aligned reads.
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory.
+### Tracks
-Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see .
+#### Tracks
+
+Outputs both junction tracks and bigwig files. The bigwig files are generated with [`UCSC wigToBigWig`](https://genome.ucsc.edu/goldenPath/help/bigWig.html).
+
+
+Output files
+
+- `ucsc/`
+ - `*.bw`: track in bigwig format.
+ - `*_junction.bed`: junction bed.
+ - `*_bed.gz`: bed file with sample data.
+ - `*_bed.gz.tbi`: index for bed file with sample data.
+
+
+
+### Transcript analysis
+
+#### DROP
+
+[`DROP`](https://github.com/gagneurlab/drop/) is a pipeline that detects aberrant expression, aberrant splicing, and monoallelic expression. For the time being, only the aberrant expression and aberrant splicing modules are run. Afterwards, another script is run to filter the results.
+
+
+Output files
+
+- `analyse_transcripts/drop`
+ - `OUTRIDER_provided_samples_top_hits.tsv`: provides at least the top 20 most significant events reported by OUTRIDER in each sample.
+ - `OUTRIDER_provided_samples_top_hits_filtered.tsv`: filters OUTRIDER_provided_samples_top_hits according to genes provided by gene_panel_clinical_filter.
+ - `FRASER_provided_samples_top_hits.tsv`: provides the aberrant spliced events reported by FRASER.
+ - `FRASER_provided_samples_top_hits_filtered.tsv`: filters FRASER_provided_samples_top_hits according to genes provided by gene_panel_clinical_filter.
+
+
+
+#### StringTie
+
+[`StringTie`](https://ccb.jhu.edu/software/stringtie/) will perform guided transcript assembly.
+
+
+Output files
+
+- `analyse_transcripts`
+ - `*.coverage.gtf`: coverage on the sample.
+ - `*.gene.abundance.txt`: gene abundance on the sample.
+ - `*.transcripts.gtf`: transcripts assembled on the sample
+
+
+
+#### GffCompare
+
+[`GffCompare`](https://github.com/gpertea/gffcompare) annotates stringtie results with the reference transcripts, marking each assembled transcript as either normal or aberrant.
+
+
+Output files
+
+- `analyse_transcripts`
+ - `*.stats`: data summary and accuracy estimation.
+ - `*.annotated.gtf`: annotated gtf file.
+ - `*.tracking`: transcripts assembled on the sample
+ - `*.transcripts.gtf.refmap`: list for each reference transcript what query transcript partially or fully matches it.
+ - `*.transcripts.gtf.tmap`: list the most similar reference transcript to each query transcript.
+
+
+
+### Variant Calling
+
+#### Mpileups
-### Pipeline information
+[`BCFtools Mpileups`](https://samtools.github.io/bcftools/bcftools.html#mpileup) SNV calling. Default SNV caller.
+
+
+Output files
+
+- `call variants`
+ - `*.vcf.gz`: file in vcf format containing variants found in the patient.
+ - `*.vcf.gz.tbi`: index for .vcf.gz file.
+ - `*.bcftools_stats.txt`: stats on non-reference allele frequency, depth distribution, stats by quality and per-sample counts, singleton stats, etc.
+
+
+
+#### GATK best practices SNV Calling
+
+[`GATK best practices SNV Calling`](https://gatk.broadinstitute.org/hc/en-us/articles/360035531192-RNAseq-short-variant-discovery-SNPs-Indels-) is only activated by setting the parameter `variant_caller`
+to "gatk". It involves several steps: [`SplitN Cigar Reads`](https://gatk.broadinstitute.org/hc/en-us/articles/360036858811-SplitNCigarReads), [`Haplotype Caller`](https://gatk.broadinstitute.org/hc/en-us/articles/360037225632-HaplotypeCaller), [`Variant Filtration`](https://gatk.broadinstitute.org/hc/en-us/articles/360037434691-VariantFiltration) and [`BCFtools stats`](https://samtools.github.io/bcftools/bcftools.html#stats).
+
+
+Output files
+
+- `call variants`
+ - `*.vcf.gz`: file in vcf format containing variants found in the patient.
+ - `*.vcf.gz.tbi`: index for .vcf.gz file.
+ - `*.bcftools_stats.txt`: stats on non-reference allele frequency, depth distribution, stats by quality and per-sample counts, singleton stats, etc.
+
+
+
+### Allele specific variant calling
+
+#### ASEReadCounter
+
+[`ASEReadCounter`](https://gatk.broadinstitute.org/hc/en-us/articles/360037428291-ASEReadCounter) allele Specific Read Counter.
+
+#### BootstrapAnn
+
+[`BootstrapAnn`](https://github.com/J35P312/BootstrapAnn#bootstrapann) detects expression imbalance between alleles.
+
+
+Output files
+
+- `bootstrapann`
+ - `*ase.vcf`: annotated vcf where allelic imbalance is marked
+
+
+
+### Variant annotation
+
+#### VEP
+
+[`VEP`](https://github.com/Ensembl/ensembl-vep) annotates vcfs.
+
+
+Output files
+
+- `annotate_vep`
+ - `*ase_vep.vcf.gz`: annotated vcf
+ - `*ase_vep.vcf.gz.tbi`: index for annotated vcf
+
+
+
+### Pipeline information and QCs
+
+#### Pipeline information
+
+[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage.
Output files
@@ -70,36 +233,30 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ
-[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage.
+#### Picard CollectRnaSeqMetrics
-### FASTP
+[`Picard CollectRnaSeqMetrics`](https://broadinstitute.github.io/picard/) alignment QC
Output files
-- `trimming/`
- - `*.fastp.html`: a report consisting on a standalone HTML file that can be viewed in your web browser.
- - `*.fastp.log`: run log.
- - `*.fastp.json`: a report containing the same information as the html as a json file.
- - `*.fastp.fastq.gz`: gzip compressed trimmed reads.
+- `bam_qc/`
+ - `*rna_metrics`: metrics describing the distribution of the bases within the transcripts.
-[FASTP](https://github.com/OpenGene/fastp) is a fastq preprocessing tool that gives general quality metrics about your sequenced reads and trims adapters from them. For further reading and documentation see the [FASTP documentation](https://github.com/OpenGene/fastp).
+#### MultiQC
-### STAR
+[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory.
+
+Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.
Output files
-- `alignment/`
- - `*.SJ.out.tab`: the high confidence collapssed junctions.
- - `*.ReadsPerGene.out.tab`: read count per gene.
- - `*.Log.progress.out`: run progress statistics report updated every minute.
- - `*.Log.out`: log file containing run details.
- - `*.Log.final.out`: a summary of the mapping statistics. It is calculated indivisually per read and then averaged.
- - `*.Aligned.out.bam`: Aligned reads.
+- `multiqc/`
+ - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser.
+ - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline.
+ - `multiqc_plots/`: directory containing static images from the report in various formats.
-
-[STAR](https://github.com/alexdobin/STAR) aligns reads to the genome reference. For further reading and documentation see the [STAR manual](https://physiology.med.cornell.edu/faculty/skrabanek/lab/angsd/lecture_notes/STARmanual.pdf).
diff --git a/docs/parameters.md b/docs/parameters.md
deleted file mode 100644
index ba4b9cce..00000000
--- a/docs/parameters.md
+++ /dev/null
@@ -1,100 +0,0 @@
-# tomte pipeline parameters
-
-Pipeline to analyse RNAseq from raredisease patients
-
-## Input/output options
-
-Define where the pipeline should find input data and save output data.
-
-`--input` `string` containing path to comma-separated file containing information about the samples in the experiment [Help](https://github.com/genomic-medicine-sweden/tomte/blob/master/docs/usage.md#samplesheet-input).
-
-`--outdir` `string` with the output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.
-
-`--email` `string` with email address for completion summary. Set this parameter to your e-mail address to get a summary e-mail
-with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.
-
-`--multiqc_title` `string` with MultiQC report title. Printed as page header, used for filename if not otherwise specified.
-
-## Reference genome options
-
-Reference genome related files and options required for the workflow.
-
-`--genome` `string` with name of iGenomes reference. If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. [Further details](https://nf-co.re/usage/reference_genomes).
-
-`--fasta` `string` with path to FASTA genome file. This parameter is _mandatory_ if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.
-
-`--fasta_fai` `string` with path to FASTA genome index file. If none provided, will be generated automatically from the FASTA.
-
-`--gtf` `string` with path to GTF annotation file. This parameter is _mandatory_ if `--genome` is not specified.
-
-`--igenomes_base` `string` with directory / URL base for iGenomes references. Default `s3://ngi-igenomes/igenomes`.
-
-`--igenomes_ignore` `boolean` avoids loading the iGenomes reference config. Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`.
-
-`--save_reference` `boolean` to indicate if indices/references generated by the pipeline should be saved in the required results directory.
-
-`--sequence_dict` `string` path to genome dictionary file.
-
-`--star_index` `string` path to directory or tar.gz archive for pre-built STAR index.
-
-## Institutional config options
-
-Parameters used to describe centralised config profiles. These should not be edited.
-
-`--custom_config_version` `string` git commit id for Institutional configs.
-
-`--custom_config_base` `string` base directory for Institutional configs. If you're running offline, Nextflow will not be
-able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter https://raw.githubusercontent.com/nf-core/configs/master.
-
-`--config_profile_name` `string` Institutional config name.
-
-`--config_profile_description` `string` Institutional config description.
-
-`--config_profile_contact` `string` Institutional config contact information.
-
-`--config_profile_url` `string` Institutional config URL link.
-
-## Max job request options
-
-Set the top limit for requested resources for any single job.
-
-`--max_cpus` `integer` Maximum number of CPUs that can be requested for any single job. Use to set an upper-limit for the CPU
-requirement for each process. Should be an integer e.g. `--max_cpus 1`. Default `16`.
-
-`--max_memory` `string` Maximum amount of memory that can be requested for any single job. Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`. Default `128.GB`.
-
-`--max_time` `string` Maximum amount of time that can be requested for any single job. Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`. Default `240.h`.
-
-## Generic options
-
-Less common options for the pipeline, typically set in a config file.
-
-`--help` `boolean` Display help text.
-
-`--version` `boolean` Display version and exit.
-
-`--publish_dir_mode` `string` Method used to save pipeline results to output directory. The Nextflow `publishDir` option
-specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.
-
-`--email_on_fail` `string` Email address for completion summary, only when pipeline fails. An email address to send a summary email to when the pipeline is completed - ONLY sent if the pipeline does not exit successfully.
-
-`--plaintext_email` `boolean` Send plain-text email instead of HTML.
-
-`--max_multiqc_email_size` `string` File size limit when attaching MultiQC reports to summary emails. Default `25.MB`.
-
-`--monochrome_logs` `boolean` Do not use coloured log outputs.
-
-`--hook_url` `string` Incoming hook URL for messaging service. Incoming hook URL for messaging service. Currently, MS Teams
-and Slack are supported.
-
-`--multiqc_config` `string` Custom config file to supply to MultiQC.
-
-`--multiqc_logo` `string` Custom logo file to supply to MultiQC. File name must also be set in the MultiQC config file.
-
-`--multiqc_methods_description` `string` Custom MultiQC yaml file containing HTML including a methods description.
-
-`--tracedir` `string` Directory to keep pipeline Nextflow logs and reports. Default `${params.outdir}/pipeline_info`.
-
-`--validate_params` `boolean` Boolean whether to validate parameters against the schema at runtime. Default `True`.
-
-`--show_hidden_params` Show all params when using `--help`, including those that are hidden in the schema that are not shown by default with `--help`. Specifying this option will tell the pipeline to show all parameters.
diff --git a/docs/usage.md b/docs/usage.md
index 157d5acf..ddabdb21 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -1,60 +1,234 @@
-# tomte: Usage
+# genomic-medicine-sweden/tomte: Usage
## :warning: Please read this documentation on github website: [tomte usage](https://github.com/genomic-medicine-sweden/tomte)
> _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._
+Table of contents:
+
+- [genomic-medicine-sweden/tomte: Usage](#genomic-medicine-swedentomte-usage)
+ - [Introduction](#introduction)
+ - [Prerequisites](#prerequisites)
+ - [Run genomic-medicine-sweden/tomte with test data](#run-genomic-medicine-swedentomte-with-test-data)
+ - [Updating the pipeline](#updating-the-pipeline)
+ - [Run genomic-medicine-sweden/tomte with your data](#run-genomic-medicine-swedentomte-with-your-data)
+ - [Samplesheet](#samplesheet)
+ - [Reference files and parameters](#reference-files-and-parameters)
+ - [Alignment and pseudo quantification](#1-alignment)
+ - [Region subsampling](#2-subsample-region)
+ - [Variant calling](#3-variant-calling---snv)
+ - [SNV annotation](#4-snv-annotation-ensembl-vep)
+ - [DROP](#5-drop)
+ - [Preparing DROP input](#preparing-input-for-drop)
+ - [Run the pipeline](#run-the-pipeline)
+ - [Direct input in CLI](#direct-input-in-cli)
+ - [Import from a config file (recommended)](#import-from-a-config-file-recommended)
+- [Best practices](#best-practices)
+- [Core Nextflow arguments](#core-nextflow-arguments)
+ - [`-profile`](#-profile)
+ - [`-resume`](#-resume)
+ - [`-c`](#-c)
+- [Custom configuration](#custom-configuration)
+ - [Changing resources](#changing-resources)
+ - [Custom Containers](#custom-containers)
+ - [Custom Tool Arguments](#custom-tool-arguments)
+ - [nf-core/configs](#nf-coreconfigs)
+ - [Azure Resource Requests](#azure-resource-requests)
+ - [Running in the background](#running-in-the-background)
+ - [Nextflow memory requirements](#nextflow-memory-requirements)
+ - [Running the pipeline without Internet access](#running-the-pipeline-without-internet-access)
+
## Introduction
-
+**tomte** is a bioinformatics best-practice analysis pipeline for analysing RNAseq data from patients with rare diseases.
+
+The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!
-## Samplesheet input
+## Prerequisites
-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
+1. Install Nextflow (>=22.10.1) using the instructions [here](https://nextflow.io/docs/latest/getstarted.html#installation).
+2. Install one of the following technologies for full pipeline reproducibility: Docker, Singularity, Podman, Shifter or Charliecloud.
+ > Almost all nf-core pipelines give you the option to use conda as well. However, some tools used in the tomte pipeline do not have a conda package so we do not support conda at the moment.
+
+### Updating the pipeline
+
+When you run the pipeline with `nextflow run genomic-medicine-sweden/tomte`, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, regularly update the cached version of the pipeline:
```bash
---input '[path to samplesheet file]'
+nextflow pull genomic-medicine-sweden/tomte
```
-### Multiple runs of the same sample
+## Run genomic-medicine-sweden/tomte with test data
-The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes:
+Before running the pipeline with your data, we recommend running it with the test dataset available in the test_data folder provided with the pipeline and [here](https://github.com/nf-core/test-datasets/tree/raredisease). You do not need to download any of the data as part of it came directly with the pipeline and the other part will be fetched automatically for you when you use the test profile.
-```console
-case,sample,fastq_1,fastq_2,strandedness
-CASE_1,CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz,reverse
-CASE_1,CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz,reverse
-CASE_1,CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz,reverse
-```
+Run the following command, where YOURPROFILE is the package manager you installed on your machine. For example, `-profile test,docker` or `-profile test,singularity`:
-### Full samplesheet
+```
+nextflow run genomic-medicine-sweden/tomte \
+ -revision dev -profile test,<YOURPROFILE> \
+ --outdir <OUTDIR>
+```
-The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 5 columns to match those defined in the table below.
+> Check [nf-core/configs](https://github.com/nf-core/configs/tree/master/conf) to see if a custom config file to run nf-core pipelines already exists for your institute. If so, you can simply use `-profile test,<institute>` in your command. This enables the appropriate package manager and sets the appropriate execution settings for your machine.
+> NB: The order of profiles is important! They are loaded in sequence, so later profiles can overwrite earlier profiles.
-A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice.
+Running the command creates the following files in your working directory:
-```console
-case,sample,fastq_1,fastq_2,strandedness
-CASE_1,CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz,reverse
-CASE_1,CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz,reverse
-CASE_1,CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz,reverse
-CASE_1,TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz,,reverse
-CASE_1,TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz,,reverse
-CASE_1,TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,,reverse
-CASE_1,TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,,reverse
```
+work # Directory containing the Nextflow working files
+<OUTDIR> # Finished results in specified location (defined with --outdir)
+.nextflow_log # Log file from Nextflow
+# Other Nextflow hidden files, like history of pipeline logs.
+```
+
+The test profile runs the pipeline with a case containing three samples, but if you would like to test the pipeline with one sample, use `-profile test_one_sample,<YOURPROFILE>`.
+
+> Note that the default cpu and memory configurations used in tomte are written keeping the test profile (and its dataset, which is tiny) in mind. You should override these values in configs to get it to work on larger datasets. Check the section [Custom configuration](#custom-configuration) below to know more about how to configure resources for your platform.
+
+## Run genomic-medicine-sweden/tomte with your data
+
+Running the pipeline involves three steps:
+
+1. Prepare a samplesheet
+2. Gather all required references
+3. Supply samplesheet and references, and run the command
-| Column | Description |
+#### Samplesheet
+
+A samplesheet is used to pass the information about the sample(s), such as the path to the FASTQ files and other metadata (sex, phenotype, etc.), to the pipeline in csv format.
+
+genomic-medicine-sweden/tomte requires the information given below.
+
+| Fields | Description |
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `case` | Custom name for the case. Not in use currently |
+| `case` | Case ID for the analysis, used when generating a family VCF. |
| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
-| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
-| `strandedness` | Library strandedness. Allowed values: "unstranded", "forward", "reverse" |
+| `fastq_1` | Absolute path to FASTQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
+| `fastq_2` | Absolute path to FASTQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
+| `strandedness` | Sample strandedness. Allowed values: "unstranded", "forward", "reverse". |
+
+It is also possible to include multiple runs of the same sample in a samplesheet. For example, when you have re-sequenced the same sample more than once to increase sequencing depth. In that case, the `sample` identifiers in the samplesheet have to be the same. The pipeline will align the raw read/read-pairs independently before merging the alignments belonging to the same sample. Below is an example for a trio with the proband sequenced across two lanes:
+
+| case | sample | fastq_1 | fastq_2 | strandedness |
+| ----- | ------------ | -------------------------------- | -------------------------------- | ------------ |
+| fam_1 | CONTROL_REP1 | AEG588A1_S1_L002_R1_001.fastq.gz | AEG588A1_S1_L002_R2_001.fastq.gz | reverse |
+| fam_1 | CONTROL_REP2 | AEG588A2_S1_L003_R1_001.fastq.gz | AEG588A2_S1_L003_R2_001.fastq.gz | reverse |
+| fam_1 | PATIENT_1 | AEG588A3_S1_L001_R1_001.fastq.gz | AEG588A3_S1_L001_R2_001.fastq.gz | reverse |
+| fam_1 | PATIENT_1 | AEG588A3_S1_L002_R1_001.fastq.gz | AEG588A3_S1_L002_R2_001.fastq.gz | reverse |
+
+If you would like to see more examples of what a typical samplesheet looks like for a duo, follow this link: [sample_sheet](https://github.com/genomic-medicine-sweden/tomte/blob/master/test_data/samplesheet_chr21.csv).
+
+#### Reference files and parameters
+
+In genomic-medicine-sweden/tomte, references can be supplied using parameters.
+
+Note that the pipeline is modular in architecture. It offers you the flexibility to choose between different tools. For example, you can call SNVs either with BCFtools or with GATK. You also have the option to turn off sections of the pipeline if you do not want to run them. For example, the DROP aberrant expression module can be turned off by setting `--run_drop_ae_switch false`. This flexibility means that in any given analysis run, some of the tools included in the pipeline will not be executed. So the pipeline is written in a way that can account for these differences while working with reference parameters. If a tool is not going to be executed during the course of a run, parameters used only by that tool need not be provided. For example, if you are not running DROP aberrant splicing, you do not need to provide `--reference_drop_splice_folder`.
+
+genomic-medicine-sweden/tomte consists of several tools used for various purposes. For convenience, we have grouped those tools under the following categories:
+
+1. Alignment and pseudo quantification (STAR & Salmon)
+2. Subsample_region (Samtools)
+3. Variant calling - SNV (BCFtools or GATK's HaplotypeCaller)
+4. SNV annotation (ensembl VEP)
+5. DROP
+
+> We have only listed the groups that require at least one input from the user. For example, the pipeline also runs WigToBigWig, but it does not require any input other than the bam files passed by the pipeline. Hence, it is not mentioned in the list above. To know more about the tools used in the pipeline check the [README](../README.md).
+
+The mandatory and optional parameters for each category are tabulated below.
+
+> Alignment, QC stats, repeat expansions, SNV variant calling and ensembl VEP are run by default. Hence, the mandatory parameters used by those features will always have to be provided to the pipeline.
+
+##### 1. Alignment
+
+| Mandatory | Optional |
+| --------- | ------------------------------ |
+| fasta | fasta_fai<sup>1</sup> |
+| gtf | sequence_dict<sup>1</sup> |
+| | salmon_index<sup>1</sup> |
+| | star_index<sup>1</sup> |
+| | transcript_fasta<sup>1</sup> |
+| | genome<sup>2</sup> |
+| | platform<sup>3</sup> |
+| | min_trimmed_length<sup>4</sup> |
+| | star_two_pass_mode<sup>5</sup> |
+
+<sup>1</sup> If the parameter is not provided by the user, it will be generated from the fasta and gtf files.<br />
+<sup>2</sup> If it is not provided by the user, the default value is GRCh38.<br />
+<sup>3</sup> If it is not provided by the user, the default value is illumina.<br />
+<sup>4</sup> If it is not provided by the user, the default value is 40.<br />
+<sup>5</sup> If it is not provided by the user, the default value is Basic.
+
+##### 2. Subsample region
+
+| Mandatory | Optional |
+| ------------- | ------------------------------------ |
+| subsample_bed | subsample_region_switch<sup>1</sup> |
+| | seed_frac<sup>2</sup> |
+
+<sup>1</sup> If it is not provided by the user, the default value is true.<br />
+<sup>2</sup> If it is not provided by the user, the default value is 0.001.
+
+##### 3. Variant calling - SNV
+
+| Mandatory | Optional |
+| --------- | -------------------------------- |
+| | variant_caller<sup>1</sup> |
+| | bcftools_caller_mode<sup>2</sup> |
+
+<sup>1</sup> If it is not provided by the user, the default value is bcftools.<br />
+<sup>2</sup> If it is not provided by the user, the default value is multiallelic.
+
+##### 4. SNV annotation (ensembl VEP)
+
+| Mandatory | Optional |
+| --------- | ----------------------------- |
+| vep_cache | vep_cache_version<sup>1</sup> |
+| | vep_filters |
+
+<sup>1</sup> For the time being, only 107 is supported.
+
+##### 5. DROP
+
+DROP - aberrant expression
+
+| Mandatory | Optional |
+| ------------------------------------- | --------------------------------- |
+| reference_drop_annot_file<sup>1</sup> | run_drop_ae_switch<sup>2</sup> |
+| reference_drop_count_file | drop_group_samples_ae<sup>3</sup> |
+| | drop_padjcutoff_ae<sup>4</sup> |
+| | drop_zscorecutoff<sup>5</sup> |
+| | gene_panel_clinical_filter |
+| | downsample_switch<sup>6</sup> |
+| | num_reads<sup>7</sup> |
+
+<sup>1</sup> To get more information on how to format it, see below.<br />
+<sup>2</sup> If it is not provided by the user, the default value is true.<br />
+<sup>3</sup> If it is not provided by the user, the default value is outrider.<br />
+<sup>4</sup> If it is not provided by the user, the default value is 0.05.<br />
+<sup>5</sup> If it is not provided by the user, the default value is 0.<br />
+<sup>6</sup> If it is not provided by the user, the default value is true.<br />
+<sup>7</sup> If it is not provided by the user, the default value is 120000000.
+
+DROP - aberrant splicing
-An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
+| Mandatory | Optional |
+| ------------------------------------- | --------------------------------- |
+| reference_drop_annot_file<sup>1</sup> | run_drop_as_switch<sup>2</sup> |
+| reference_drop_splice_folder | drop_group_samples_as<sup>3</sup> |
+| | drop_padjcutoff_as<sup>4</sup> |
+| | gene_panel_clinical_filter |
+| | downsample_switch<sup>5</sup> |
+| | num_reads<sup>6</sup> |
-## Preparing input for DROP
+<sup>1</sup> To get more information on how to format it, see below.<br />
+<sup>2</sup> If it is not provided by the user, the default value is true.<br />
+<sup>3</sup> If it is not provided by the user, the default value is fraser.<br />
+<sup>4</sup> If it is not provided by the user, the default value is 0.1.<br />
+<sup>5</sup> If it is not provided by the user, the default value is true.<br />
+<sup>6</sup> If it is not provided by the user, the default value is 120000000.
+
+##### Preparing input for DROP
If you want to run [DROP](https://github.com/gagneurlab/drop) aberrant expression or aberrant splicing you have to provide reference counts, splice counts and a sample sheet. The sample sheet should contain the columns as those in the [test sample annotation](../test_data/drop_data/sampleAnnotation.tsv), you do not need to include the samples you are running through the pipeline in the sample sheet.
@@ -94,7 +268,7 @@ Do not use `-c ` to specify parameters as this will result in errors. Cust
The above pipeline run specified with a params file in yaml format:
```bash
-nextflow run nf-core/tomte -profile docker -params-file params.yaml
+nextflow run genomic-medicine-sweden/tomte -profile docker -params-file params.yaml
```
with `params.yaml` containing:
diff --git a/modules/local/drop_filter_results.nf b/modules/local/drop_filter_results.nf
index 5ff57776..619c57c3 100644
--- a/modules/local/drop_filter_results.nf
+++ b/modules/local/drop_filter_results.nf
@@ -17,11 +17,11 @@ process DROP_FILTER_RESULTS {
path out_drop_as_tsv_in
output:
- path('OUTRIDER_provided_sample_top20.tsv') , optional: true, emit: ae_out_unfiltered
- path('OUTRIDER_provided_sample_top20_filtered.tsv'), optional: true, emit: ae_out_filtered
- path('FRASER_provided_sample.tsv') , optional: true, emit: as_out_unfiltered
- path('FRASER_provided_sample_filtered.tsv') , optional: true, emit: as_out_filtered
- path "versions.yml" , emit: versions
+ path('OUTRIDER_provided_samples_top_hits.tsv') , optional: true, emit: ae_out_unfiltered
+ path('OUTRIDER_provided_samples_top_hits_filtered.tsv'), optional: true, emit: ae_out_filtered
+ path('FRASER_provided_samples_top_hits.tsv') , optional: true, emit: as_out_unfiltered
+ path('FRASER_provided_samples_top_hits_filtered.tsv') , optional: true, emit: as_out_filtered
+ path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
diff --git a/nextflow.config b/nextflow.config
index 4cd43aed..578e2b37 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -27,15 +27,15 @@ params {
star_two_pass_mode = 'Basic'
subsample_region_switch = true
downsample_switch = true
- num_reads = 80000000
+ num_reads = 120000000
seed_frac = 0.001
save_mapped_as_cram = true
// Variant calling
variant_caller = 'bcftools'
bcftools_caller_mode = 'multiallelic'
- run_drop_ae = true
- run_drop_as = true
+ run_drop_ae_switch = true
+ run_drop_as_switch = true
drop_group_samples_ae = 'outrider'
drop_group_samples_as = 'fraser'
drop_padjcutoff_ae = 0.05
diff --git a/nextflow_schema.json b/nextflow_schema.json
index b17fccc0..b8cfd4c9 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -148,14 +148,14 @@
"vep_cache_version": {
"type": "integer",
"default": 107,
- "description": "Specifies which analysis type for the pipeline- either 'wgs','wes','mito'. This changes resources consumed and tools used.",
- "fa_icon": "fas fa-book",
+ "description": "Specifies version of vep cache to use.",
+ "fa_icon": "fas fa-folder-open",
"enum": [107]
},
"vep_filters": {
"type": "string",
"format": "path",
- "fa_icon": "fas fa-chart-bar",
+ "fa_icon": "fas fa-file-csv",
"description": "File containing HGNC_IDs of interest on separate lines.",
"hidden": true
}
@@ -220,7 +220,7 @@
},
"num_reads": {
"type": "integer",
- "default": 80000000,
+ "default": 120000000,
"description": "Number of reads to downsample RNAseq sample to",
"fa_icon": "fas fa-list-ol"
}
@@ -249,13 +249,13 @@
"enum": ["consensus", "multiallelic"],
"help_text": "Bcftools call can eitherbe run in multiallelic mode or in consensus mode. In consensus mode a p-value threshold of 0.01 is applied."
},
- "run_drop_ae": {
+ "run_drop_ae_switch": {
"type": "boolean",
"default": true,
"description": "Should DROP Aberrant Expression module be run?",
"fa_icon": "fas fa-toggle-off"
},
- "run_drop_as": {
+ "run_drop_as_switch": {
"type": "boolean",
"default": true,
"description": "Should DROP Aberrant Splicing module be run?",