From bfce097bdc6dde32515f8f6650754927c59c61a2 Mon Sep 17 00:00:00 2001 From: hoelzer Date: Thu, 1 Feb 2024 13:20:57 +0100 Subject: [PATCH 1/8] add pig fasta and gtf --- README.md | 6 ++++-- main.nf | 5 +++-- modules/annotationGet.nf | 6 ++++++ modules/referenceGet.nf | 10 ++++++++++ 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 3bb2393..8adbaa4 100644 --- a/README.md +++ b/README.md @@ -194,7 +194,7 @@ nextflow pull hoelzer-lab/rnaflow -r nextflow run hoelzer-lab/rnaflow --reads input.csv --autodownload hsa --pathway hsa --max_cores 6 --cores 2 ``` -with `--autodownload ` [build-in species](#build-in-species), or define your own genome reference and annotation files in CSV files: +with `--autodownload ` [build-in species](#build-in-species), or define your own genome reference and annotation files in CSV files: ```bash nextflow run hoelzer-lab/rnaflow --reads input.csv --genome fastas.csv --annotation gtfs.csv --max_cores 6 --cores 2 @@ -258,10 +258,11 @@ You can add a [build-in species](#build-in-species) to your defined genomes and We provide a small set of build-in species for which the genome and annotation files are automatically downloaded from [Ensembl](https://www.ensembl.org/index.html) with `--autodownload xxx`. Please let us know, we can easily add other species. 
-| Species | three-letter shortcut | Genome | Annotation | +| Species | three-letter shortcut | Annotation | Genome | | ------------ | --------------------- | ----------------------------------- | --------------------------------------------- | | Homo sapiens | `hsa` * | Homo_sapiens.GRCh38.98 | Homo_sapiens.GRCh38.dna.primary_assembly | | Mus musculus | `mmu` * | Mus_musculus.GRCm38.99 | Mus_musculus.GRCm38.dna.primary_assembly | +| Sus scrofa | `ssc` * | Sus_scrofa.Sscrofa11.1.111 | Sus_scrofa.Sscrofa11.1.dna.toplevel | | Mesocricetus auratus | `mau` * | Mesocricetus_auratus.MesAur1.0.100 | Mesocricetus_auratus.MesAur1.0.dna.toplevel | | Escherichia coli | `eco` | Escherichia_coli_k_12.ASM80076v1.45 | Escherichia_coli_k_12.ASM80076v1.dna.toplevel | @@ -518,6 +519,7 @@ Input: - hsa [Ensembl: Homo_sapiens.GRCh38.dna.primary_assembly | Homo_sapiens.GRCh38.98] - eco [Ensembl: Escherichia_coli_k_12.ASM80076v1.dna.toplevel | Escherichia_coli_k_12.ASM80076v1.45] - mmu [Ensembl: Mus_musculus.GRCm38.dna.primary_assembly | Mus_musculus.GRCm38.99.gtf] + - ssc [Ensembl: Sus_scrofa.Sscrofa11.1.dna.toplevel | Sus_scrofa.Sscrofa11.1.111 ] - mau [Ensembl: Mesocricetus_auratus.MesAur1.0.dna.toplevel | Mesocricetus_auratus.MesAur1.0.100] --species Specifies the species identifier for downstream path analysis. (DEPRECATED) If `--include_species` is set, reference genome and annotation are added and automatically downloaded. 
[default: ] diff --git a/main.nf b/main.nf index 7edeccf..4f49d59 100755 --- a/main.nf +++ b/main.nf @@ -85,8 +85,8 @@ if (params.nanopore) { } -Set species = ['hsa', 'eco', 'mmu', 'mau'] -Set autodownload = ['hsa', 'eco', 'mmu', 'mau'] +Set species = ['hsa', 'eco', 'mmu', 'mau', 'ssc'] +Set autodownload = ['hsa', 'eco', 'mmu', 'mau', 'ssc'] Set pathway = ['hsa', 'mmu', 'mau'] if ( params.profile ) { exit 1, "--profile is WRONG use -profile" } @@ -924,6 +924,7 @@ def helpMSG() { - hsa [Ensembl: Homo_sapiens.GRCh38.dna.primary_assembly | Homo_sapiens.GRCh38.98] - eco [Ensembl: Escherichia_coli_k_12.ASM80076v1.dna.toplevel | Escherichia_coli_k_12.ASM80076v1.45] - mmu [Ensembl: Mus_musculus.GRCm38.dna.primary_assembly | Mus_musculus.GRCm38.99.gtf] + - ssc [Ensembl: Sus_scrofa.Sscrofa11.1.dna.toplevel | Sus_scrofa.Sscrofa11.1.111 ] - mau [Ensembl: Mesocricetus_auratus.MesAur1.0.dna.toplevel | Mesocricetus_auratus.MesAur1.0.100]${c_reset} ${c_dim}--species Specifies the species identifier for downstream path analysis. (DEPRECATED) If `--include_species` is set, reference genome and annotation are added and automatically downloaded. 
[default: $params.species] diff --git a/modules/annotationGet.nf b/modules/annotationGet.nf index 832d3bf..a424162 100644 --- a/modules/annotationGet.nf +++ b/modules/annotationGet.nf @@ -27,6 +27,12 @@ process annotationGet { gunzip -f Mus_musculus.GRCm38.99.gtf.gz mv Mus_musculus.GRCm38.99.gtf ${species}.gtf """ + else if (species == 'ssc') + """ + wget ftp://ftp.ensembl.org/pub/release-111/gtf/sus_scrofa/Sus_scrofa.Sscrofa11.1.111.gtf.gz + gunzip -f Sus_scrofa.Sscrofa11.1.111.gtf.gz + mv Sus_scrofa.Sscrofa11.1.111.gtf.gz ${species}.gtf + """ else if (species == 'eco') """ wget ftp://ftp.ensemblgenomes.org/pub/release-45/bacteria//gtf/bacteria_90_collection/escherichia_coli_k_12/Escherichia_coli_k_12.ASM80076v1.45.gtf.gz diff --git a/modules/referenceGet.nf b/modules/referenceGet.nf index 572fa9d..4717415 100644 --- a/modules/referenceGet.nf +++ b/modules/referenceGet.nf @@ -27,6 +27,16 @@ process referenceGet { gunzip -f Mus_musculus.GRCm38.dna.primary_assembly.fa.gz mv Mus_musculus.GRCm38.dna.primary_assembly.fa ${species}.fa """ + else if (species == 'ssc') + """ + # Primary assembly contains all toplevel sequence regions excluding haplotypes and patches. + # This file is best used for performing sequence similarity searches where patch and haplotype + # sequences would confuse analysis. If the primary assembly file is not present, that + # indicates that there are no haplotype/patch regions, and the 'toplevel' file is equivalent. 
+ wget ftp://ftp.ensembl.org/pub/release-111/fasta/sus_scrofa/dna/Sus_scrofa.Sscrofa11.1.dna.toplevel.fa.gz + gunzip -f Sus_scrofa.Sscrofa11.1.dna.toplevel.fa.gz + mv Sus_scrofa.Sscrofa11.1.dna.toplevel.fa ${species}.fa + """ else if (species == 'eco') """ wget ftp://ftp.ensemblgenomes.org/pub/release-45/bacteria//fasta/bacteria_90_collection/escherichia_coli_k_12/dna/Escherichia_coli_k_12.ASM80076v1.dna.toplevel.fa.gz From 90375ccdcbeaed01262d5114ffdbedafa8b44c2d Mon Sep 17 00:00:00 2001 From: hoelzer Date: Fri, 2 Feb 2024 16:58:55 +0100 Subject: [PATCH 2/8] add pig fasta and gtf --- modules/annotationGet.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/annotationGet.nf b/modules/annotationGet.nf index a424162..ef87193 100644 --- a/modules/annotationGet.nf +++ b/modules/annotationGet.nf @@ -31,7 +31,7 @@ process annotationGet { """ wget ftp://ftp.ensembl.org/pub/release-111/gtf/sus_scrofa/Sus_scrofa.Sscrofa11.1.111.gtf.gz gunzip -f Sus_scrofa.Sscrofa11.1.111.gtf.gz - mv Sus_scrofa.Sscrofa11.1.111.gtf.gz ${species}.gtf + mv Sus_scrofa.Sscrofa11.1.111.gtf ${species}.gtf """ else if (species == 'eco') """ From 28ebeb61bc41f3e4443d7505cbb92020ae7274a7 Mon Sep 17 00:00:00 2001 From: hoelzer Date: Mon, 5 Feb 2024 21:04:36 +0100 Subject: [PATCH 3/8] activate piano and webgestalt pathway analysis for Sus scrofa (ssc) --- README.md | 7 ++++--- bin/piano.R | 2 ++ bin/webgestalt.R | 2 ++ main.nf | 3 ++- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 8adbaa4..f2b8959 100644 --- a/README.md +++ b/README.md @@ -204,7 +204,7 @@ Genomes and annotations from `--autodownload`, `--genome` and `--annotation` are By default, all possible comparisons are performed. Use `--deg` to change this. -`--pathway ` performs downstream pathway analysis. Available are WebGestalt set enrichment analysis (GSEA) for `hsa`, piano GSEA with different settings and consensus scoring for `hsa`, `mmu` and `mau`. 
+`--pathway ` performs downstream pathway analysis. Available are WebGestalt set enrichment analysis (GSEA) for `hsa` and `mmu`, piano GSEA with different settings and consensus scoring for `hsa`, `mmu`, `mau`, and `ssc`. ### Input files @@ -314,7 +314,7 @@ Nextflow will need access to the working directory where temporary calculations --strand # strandness for counting with featureCounts: 0 (unstranded), 1 (stranded) and 2 (reversely stranded) [default 0] --tpm # threshold for TPM (transcripts per million) filter [default 1] --deg # a CSV file following the pattern: conditionX,conditionY ---pathway # perform different downstream pathway analysis for the species hsa|mmu|mau +--pathway # perform different downstream pathway analysis for the species hsa|mmu|mau|ssc --feature_id_type # ID type for downstream analysis [default: ensembl_gene_id] ``` @@ -469,7 +469,7 @@ We provide `DESeq2` normalized, regularized log (rlog), variance stabilized (vsd For each comparison (specified with `--deg` or, per default, all possible pairwise comparisons in one direction), a new folder `X_vs_Y` is created. This also describes the direction of the comparison, e.g., the log2FoldChange describes the change of a gene A under condition Y with respect to the gene under condition X. For example, a log2FoldChange of +2 for gene A would tell you that this gene is 2-fold upregulated when we compare condition X vs. condition Y. The gene A is higher expressed in samples belonging to condition X. -Downstream analysis (`--pathway xxx`) are currently provided for some species: GSEA consensus scoring with `piano` for *Homo sapiens* (`hsa`), *Mus musculus* (`mmu`) and *Mesocricetus auratus* (`mau`); and `WebGestalt` GSEA for *Homo sapiens* and *Mus musculus*. 
+Downstream analysis (`--pathway xxx`) are currently provided for some species: GSEA consensus scoring with `piano` for *Homo sapiens* (`hsa`), *Mus musculus* (`mmu`), *Mesocricetus auratus* (`mau`), and *Sus scofa* (`ssc`); and `WebGestalt` GSEA for *Homo sapiens* and *Mus musculus*. ## Working offline @@ -554,6 +554,7 @@ DEG analysis options: - hsa | Homo sapiens - mmu | Mus musculus - mau | Mesocricetus auratus + - ssc | Sus scrofa --feature_id_type ID type for downstream analysis [default: ensembl_gene_id] Transcriptome assembly options: diff --git a/bin/piano.R b/bin/piano.R index bfafee6..04494c5 100644 --- a/bin/piano.R +++ b/bin/piano.R @@ -26,6 +26,8 @@ try.biomart <- try( biomart.ensembl <- useMart('ensembl', dataset='mmusculus_gene_ensembl') } else if (species == 'hsa') { biomart.ensembl <- useMart('ensembl', dataset='hsapiens_gene_ensembl') + } else if (species == 'ssc') { + biomart.ensembl <- useMart('ensembl', dataset='sscrofa_gene_ensembl') } else if (species == 'mau') { biomart.ensembl <- useMart('ensembl', dataset='mauratus_gene_ensembl') } else { diff --git a/bin/webgestalt.R b/bin/webgestalt.R index 1971819..6c14fc6 100644 --- a/bin/webgestalt.R +++ b/bin/webgestalt.R @@ -18,6 +18,8 @@ if ( species == 'hsa' ){ organism <- "hsapiens" } else if (species == 'mmu') { organism <- "mmusculus" +} else if (species == 'ssc') { +organism <- "sscrofa" } else { organism <- NA } diff --git a/main.nf b/main.nf index 4f49d59..a4f358b 100755 --- a/main.nf +++ b/main.nf @@ -87,7 +87,7 @@ if (params.nanopore) { Set species = ['hsa', 'eco', 'mmu', 'mau', 'ssc'] Set autodownload = ['hsa', 'eco', 'mmu', 'mau', 'ssc'] -Set pathway = ['hsa', 'mmu', 'mau'] +Set pathway = ['hsa', 'mmu', 'mau', 'ssc'] if ( params.profile ) { exit 1, "--profile is WRONG use -profile" } @@ -961,6 +961,7 @@ def helpMSG() { ${c_dim}Currently supported are: - hsa | Homo sapiens - mmu | Mus musculus + - ssc | Sus scrofa - mau | Mesocricetus auratus${c_reset} --feature_id_type ID type for 
downstream analysis [default: $params.feature_id_type] From dad23fdd8d41850f29c5c34cacf009415d1913a0 Mon Sep 17 00:00:00 2001 From: hoelzer Date: Mon, 5 Feb 2024 21:06:13 +0100 Subject: [PATCH 4/8] Add ssc to README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f2b8959..55dfc0a 100644 --- a/README.md +++ b/README.md @@ -204,7 +204,7 @@ Genomes and annotations from `--autodownload`, `--genome` and `--annotation` are By default, all possible comparisons are performed. Use `--deg` to change this. -`--pathway ` performs downstream pathway analysis. Available are WebGestalt set enrichment analysis (GSEA) for `hsa` and `mmu`, piano GSEA with different settings and consensus scoring for `hsa`, `mmu`, `mau`, and `ssc`. +`--pathway ` performs downstream pathway analysis. Available are WebGestalt set enrichment analysis (GSEA) for `hsa`, `mmu` and `ssc`, piano GSEA with different settings and consensus scoring for `hsa`, `mmu`, `mau`, and `ssc`. ### Input files @@ -469,7 +469,7 @@ We provide `DESeq2` normalized, regularized log (rlog), variance stabilized (vsd For each comparison (specified with `--deg` or, per default, all possible pairwise comparisons in one direction), a new folder `X_vs_Y` is created. This also describes the direction of the comparison, e.g., the log2FoldChange describes the change of a gene A under condition Y with respect to the gene under condition X. For example, a log2FoldChange of +2 for gene A would tell you that this gene is 2-fold upregulated when we compare condition X vs. condition Y. The gene A is higher expressed in samples belonging to condition X. -Downstream analysis (`--pathway xxx`) are currently provided for some species: GSEA consensus scoring with `piano` for *Homo sapiens* (`hsa`), *Mus musculus* (`mmu`), *Mesocricetus auratus* (`mau`), and *Sus scofa* (`ssc`); and `WebGestalt` GSEA for *Homo sapiens* and *Mus musculus*. 
+Downstream analysis (`--pathway xxx`) are currently provided for some species: GSEA consensus scoring with `piano` for *Homo sapiens* (`hsa`), *Mus musculus* (`mmu`), *Mesocricetus auratus* (`mau`), and *Sus scrofa* (`ssc`); and `WebGestalt` GSEA for *Homo sapiens*, *Mus musculus*, and *Sus scrofa*. ## Working offline From 38a22c95e5bdc7adda1341a64c29f49fa41ab958 Mon Sep 17 00:00:00 2001 From: hoelzer Date: Tue, 6 Feb 2024 13:10:04 +0100 Subject: [PATCH 5/8] error code webgestalt --- bin/webgestalt.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/webgestalt.R b/bin/webgestalt.R index 6c14fc6..adc53f7 100644 --- a/bin/webgestalt.R +++ b/bin/webgestalt.R @@ -44,5 +44,5 @@ if (! is.na(organism)) { print(paste('SKIPPING: WebGestaltR. Feature ID', id_type, 'not supported.')) } } else { - print("Unknown organism, only organisms 'hsapiens' and 'mmusculus' are supported by default. Exiting.") + print("Unknown organism, only organisms 'hsapiens', 'mmusculus', and 'sscrofa' are supported by default. Exiting.") } \ No newline at end of file From d160cb2876822bea31e6813225ffb412585f3a55 Mon Sep 17 00:00:00 2001 From: hoelzer Date: Thu, 22 Feb 2024 12:31:54 +0100 Subject: [PATCH 6/8] fix help for input example --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index a4f358b..1f120f8 100755 --- a/main.nf +++ b/main.nf @@ -916,7 +916,7 @@ def helpMSG() { ${c_dim}Genomes and annotations from --autodownload, --genome and --annotation are concatenated.${c_reset} ${c_yellow}Input:${c_reset} - ${c_green}--reads${c_reset} A CSV file following the pattern: Sample,R,Condition,Source for single-end or Sample,R1,R2,Condition,Source for paired-end + ${c_green}--reads${c_reset} A CSV file following the pattern: Sample,R1,R2,Condition,Source,Strandedness. For single-end data, leave column R2 empty. 
${c_dim}(check terminal output if correctly assigned) Per default, all possible comparisons of conditions in one direction are made. Use --deg to change.${c_reset} ${c_green}--autodownload${c_reset} Specifies the species identifier for automated download [default: $params.autodownload] From 9922ca213c17930340135d4b48a69114c8f65bc9 Mon Sep 17 00:00:00 2001 From: hoelzer Date: Thu, 22 Feb 2024 15:59:53 +0100 Subject: [PATCH 7/8] fix readme, the help was actually correct for read inputa --- README.md | 16 ++++++++-------- main.nf | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 55dfc0a..450a7dd 100644 --- a/README.md +++ b/README.md @@ -210,16 +210,16 @@ By default, all possible comparisons are performed. Use `--deg` to change this. #### Read files (required) -Specify your read files in `FASTQ` format with `--reads input.csv`. The file `input.csv` has to look like this for single-end reads (just leave R2 empty): +Specify your read files in `FASTQ` format with `--reads input.csv`. 
The file `input.csv` has to look like this for single-end reads: ```csv -Sample,R1,R2,Condition,Source,Strandedness -mock_rep1,/path/to/reads/mock1.fastq.gz,,mock,,0 -mock_rep2,/path/to/reads/mock2.fastq.gz,,mock,,0 -mock_rep3,/path/to/reads/mock3.fastq.gz,,mock,,0 -treated_rep1,/path/to/reads/treat1.fastq.gz,,treated,,0 -treated_rep2,/path/to/reads/treat2.fastq.gz,,treated,,0 -treated_rep3,/path/to/reads/treat3.fastq.gz,,treated,,0 +Sample,R,Condition,Source,Strandedness +mock_rep1,/path/to/reads/mock1.fastq.gz,mock,,0 +mock_rep2,/path/to/reads/mock2.fastq.gz,mock,,0 +mock_rep3,/path/to/reads/mock3.fastq.gz,mock,,0 +treated_rep1,/path/to/reads/treat1.fastq.gz,treated,,0 +treated_rep2,/path/to/reads/treat2.fastq.gz,treated,,0 +treated_rep3,/path/to/reads/treat3.fastq.gz,treated,,0 ``` and for paired-end reads, like this: diff --git a/main.nf b/main.nf index 1f120f8..a4f358b 100755 --- a/main.nf +++ b/main.nf @@ -916,7 +916,7 @@ def helpMSG() { ${c_dim}Genomes and annotations from --autodownload, --genome and --annotation are concatenated.${c_reset} ${c_yellow}Input:${c_reset} - ${c_green}--reads${c_reset} A CSV file following the pattern: Sample,R1,R2,Condition,Source,Strandedness. For single-end data, leave column R2 empty. + ${c_green}--reads${c_reset} A CSV file following the pattern: Sample,R,Condition,Source for single-end or Sample,R1,R2,Condition,Source for paired-end ${c_dim}(check terminal output if correctly assigned) Per default, all possible comparisons of conditions in one direction are made. Use --deg to change.${c_reset} ${c_green}--autodownload${c_reset} Specifies the species identifier for automated download [default: $params.autodownload] From 203b76dd40cc9449fd339d799b9d82f9a99a392f Mon Sep 17 00:00:00 2001 From: hoelzer Date: Mon, 26 Feb 2024 21:43:37 +0100 Subject: [PATCH 8/8] I was right, fixed the old --help mssg for read input format to match the code and README! 
--- README.md | 14 +++++++------- main.nf | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 450a7dd..e28f836 100644 --- a/README.md +++ b/README.md @@ -213,13 +213,13 @@ By default, all possible comparisons are performed. Use `--deg` to change this. Specify your read files in `FASTQ` format with `--reads input.csv`. The file `input.csv` has to look like this for single-end reads: ```csv -Sample,R,Condition,Source,Strandedness -mock_rep1,/path/to/reads/mock1.fastq.gz,mock,,0 -mock_rep2,/path/to/reads/mock2.fastq.gz,mock,,0 -mock_rep3,/path/to/reads/mock3.fastq.gz,mock,,0 -treated_rep1,/path/to/reads/treat1.fastq.gz,treated,,0 -treated_rep2,/path/to/reads/treat2.fastq.gz,treated,,0 -treated_rep3,/path/to/reads/treat3.fastq.gz,treated,,0 +Sample,R1,R2,Condition,Source,Strandedness +mock_rep1,/path/to/reads/mock1.fastq.gz,,mock,,0 +mock_rep2,/path/to/reads/mock2.fastq.gz,,mock,,0 +mock_rep3,/path/to/reads/mock3.fastq.gz,,mock,,0 +treated_rep1,/path/to/reads/treat1.fastq.gz,,treated,,0 +treated_rep2,/path/to/reads/treat2.fastq.gz,,treated,,0 +treated_rep3,/path/to/reads/treat3.fastq.gz,,treated,,0 ``` and for paired-end reads, like this: diff --git a/main.nf b/main.nf index a4f358b..045ff4e 100755 --- a/main.nf +++ b/main.nf @@ -916,7 +916,7 @@ def helpMSG() { ${c_dim}Genomes and annotations from --autodownload, --genome and --annotation are concatenated.${c_reset} ${c_yellow}Input:${c_reset} - ${c_green}--reads${c_reset} A CSV file following the pattern: Sample,R,Condition,Source for single-end or Sample,R1,R2,Condition,Source for paired-end + ${c_green}--reads${c_reset} A CSV file following the pattern: Sample,R1,R2,Condition,Source,Strandedness (for single-end leave 'R2' column empty) ${c_dim}(check terminal output if correctly assigned) Per default, all possible comparisons of conditions in one direction are made. 
Use --deg to change.${c_reset} ${c_green}--autodownload${c_reset} Specifies the species identifier for automated download [default: $params.autodownload]