diff --git a/bin/drop_config.py b/bin/drop_config.py
index a5eae1d4..09abd544 100755
--- a/bin/drop_config.py
+++ b/bin/drop_config.py
@@ -85,6 +85,7 @@ def update_config(
     genome: Path,
     gtf: Path,
     genome_assembly: str,
+    drop_group_samples: str,
     padjcutoff: float,
     zscorecutoff: float,
     drop_module: str,
@@ -109,10 +110,12 @@ def update_config(
     # Export counts
     if drop_module == "AE":
         config_copy["aberrantExpression"]["run"] = ["true"]
+        config_copy["aberrantExpression"]["groups"] = [drop_group_samples]
         config_copy["aberrantExpression"]["padjCutoff"] = padjcutoff
         config_copy["aberrantExpression"]["zScoreCutoff"] = zscorecutoff
     elif drop_module == "AS":
         config_copy["aberrantSplicing"]["run"] = ["true"]
+        config_copy["aberrantSplicing"]["groups"] = [drop_group_samples]
         config_copy["aberrantSplicing"]["padjCutoff"] = padjcutoff
     elif drop_module == "MAE":
         config_copy["mae"]["run"] = ["true"]
@@ -156,6 +159,12 @@ def parse_args(argv=None):
         choices=["hg19", "hs37d5", "hg38", "GRCh38"],
         required=True,
     )
+    parser.add_argument(
+        "--drop_group_samples",
+        type=str,
+        help="Specify the DROP group to analyse",
+        required=True,
+    )
     parser.add_argument(
         "--padjcutoff",
         type=float,
@@ -190,6 +199,7 @@ def main():
         genome=args.genome_fasta,
         gtf=args.gtf,
         genome_assembly=args.genome_assembly,
+        drop_group_samples=args.drop_group_samples,
         padjcutoff=args.padjcutoff,
         zscorecutoff=args.zscorecutoff,
         drop_module=args.drop_module,
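For reference, the behaviour these hunks add boils down to attaching the requested group to whichever module is switched on. A minimal sketch; `config_copy` here is a plain dict standing in for the parsed DROP config.yaml:

```python
# Minimal sketch of the added branch; config_copy stands in for the parsed
# DROP config.yaml (a plain dict here, which is an assumption of the sketch).
def set_group(config_copy: dict, drop_module: str, drop_group_samples: str) -> dict:
    if drop_module == "AE":
        config_copy["aberrantExpression"]["groups"] = [drop_group_samples]
    elif drop_module == "AS":
        config_copy["aberrantSplicing"]["groups"] = [drop_group_samples]
    return config_copy

cfg = {"aberrantExpression": {}, "aberrantSplicing": {}}
print(set_group(cfg, "AE", "outrider"))
# {'aberrantExpression': {'groups': ['outrider']}, 'aberrantSplicing': {}}
```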
diff --git a/bin/drop_counts.py b/bin/drop_counts.py
deleted file mode 100755
index c2593b03..00000000
--- a/bin/drop_counts.py
+++ /dev/null
@@ -1,136 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import re
-from collections import OrderedDict
-from pathlib import Path
-from pandas import read_csv, DataFrame
-from typing import Set, Dict
-
-SCRIPT_VERSION = "v1.0"
-TRANSLATOR = {
-    "unstranded": 1,
-    "forward": 2,
-    "reverse": 3,
-}
-
-STANDARD_CHROMOSOMES = [
-    "chr1",
-    "chr2",
-    "chr3",
-    "chr4",
-    "chr5",
-    "chr6",
-    "chr7",
-    "chr8",
-    "chr9",
-    "chr10",
-    "chr11",
-    "chr12",
-    "chr13",
-    "chr14",
-    "chr15",
-    "chr16",
-    "chr17",
-    "chr18",
-    "chr19",
-    "chr20",
-    "chr21",
-    "chr22",
-    "chrX",
-    "chrY",
-    "chrM",
-]
-
-
-def get_non_std_genes(gtf: Path) -> Set[str]:
-    """Create list of genes not belonging to chr1-21 or chrM"""
-    gene_id_regex = re.compile('gene_id "(.+?)"')
-    genes_to_exclude = set()
-    with open(gtf, "r") as gtf_file:
-        for line in gtf_file:
-            if line.startswith("#"):
-                continue
-            if line.split()[0] in STANDARD_CHROMOSOMES:
-                continue
-            gene_id = re.search(gene_id_regex, line)
-            genes_to_exclude.add(gene_id.group(1))
-    return genes_to_exclude
-
-
-def read_star_gene_counts(sample: str, star: Path, strandedness: str) -> Dict:
-    """Read gene count file(s) from STAR output to return sample_ids."""
-    sample_ids = {}
-    gene_ids = {}
-    with open(star) as in_tab:
-        for line in in_tab:
-            if not line.startswith("N_"):
-                split_line = line.split()
-                gene_id = split_line[0]
-                strand = TRANSLATOR[strandedness]
-                counts = int(split_line[strand])
-                gene_ids[gene_id] = counts
-    gene_ids = OrderedDict(sorted(gene_ids.items()))
-    sample_ids[sample] = gene_ids
-    return sample_ids
-
-
-def get_counts_from_dict(gene_ids_dict: dict) -> DataFrame:
-    """Transform gene ids dict into count_table"""
-    one_sample = next(iter(gene_ids_dict))
-    gene_list = list(gene_ids_dict[one_sample].keys())
-    genes = {gene: [gene_ids_dict[sample][gene] for sample in gene_ids_dict] for gene in gene_list}
-    count_table: DataFrame = DataFrame.from_dict(genes, orient="index", columns=gene_ids_dict.keys())
-    count_table.index.name = "geneID"
-    return count_table
-
-
-def write_tsv_from_dict(gene_ids_dict: dict, outfile: str, ref_count_file: str, genes_to_exclude: Set[str]) -> None:
-    """Transform dictionary into tsv friendly."""
-    count_table = get_counts_from_dict(gene_ids_dict)
-    if ref_count_file:
-        if ref_count_file.endswith(".gz"):
-            ref_table = read_csv(ref_count_file, compression="gzip", sep="\t", header=0, index_col=0)
-        else:
-            ref_table = read_csv(ref_count_file, sep="\t", header=0, index_col=0)
-        count_table = count_table.combine_first(ref_table)
-    count_table.drop(genes_to_exclude, inplace=True)
-    count_table.to_csv(outfile, compression="gzip", sep="\t", header=True)
-
-
-def parse_args(argv=None):
-    """Define and immediately parse command line arguments."""
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.MetavarTypeHelpFormatter,
-        description="""Generate collated gene counts from each STAR output.""",
-    )
-    parser.add_argument("--star", type=str, nargs="+", help="*ReadsPerGene.out.tab from STAR", required=True)
-    parser.add_argument("--samples", type=str, nargs="+", help="corresponding sample name", required=True)
-    parser.add_argument("--strandedness", type=str, nargs="+", help="strandedness of RNA", required=True)
-    parser.add_argument("--output", type=str, help="output tsv file name", required=True)
-    parser.add_argument("--gtf", type=str, help="Transcript annotation file in gtf format", required=True)
-    parser.add_argument("--ref_count_file", type=str, help="Optional reference count set", required=True)
-    parser.add_argument("--version", action="version", version=SCRIPT_VERSION)
-    return parser.parse_args(argv)
-
-
-def main():
-    """Coordinate argument parsing and program execution."""
-    args = parse_args()
-    master_dict = {}
-    for index, sample_id in enumerate(args.samples):
-        master_dict.update(
-            read_star_gene_counts(sample=sample_id, star=args.star[index], strandedness=args.strandedness[index])
-        )
-
-    genes_to_exclude: Set[str] = get_non_std_genes(gtf=args.gtf)
-    write_tsv_from_dict(
-        gene_ids_dict=master_dict,
-        outfile=args.output,
-        ref_count_file=args.ref_count_file,
-        genes_to_exclude=genes_to_exclude,
-    )
-
-
-if __name__ == "__main__":
-    main()
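The one non-obvious convention in the deleted script is how strandedness selected a column of STAR's ReadsPerGene.out.tab. A sketch of that lookup, kept for reference since DROP now does its own counting from the BAMs:

```python
# Column convention from the removed script: ReadsPerGene.out.tab holds
# gene_id, unstranded, forward and reverse counts in columns 0-3.
TRANSLATOR = {"unstranded": 1, "forward": 2, "reverse": 3}

def gene_count(line: str, strandedness: str) -> tuple:
    fields = line.split()
    return fields[0], int(fields[TRANSLATOR[strandedness]])

print(gene_count("ENSG00000157764\t120\t70\t50", "reverse"))
# ('ENSG00000157764', 50)
```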
diff --git a/bin/drop_sample_annot.py b/bin/drop_sample_annot.py
index 8858f422..31f0602f 100755
--- a/bin/drop_sample_annot.py
+++ b/bin/drop_sample_annot.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3

 import argparse
-from pathlib import Path
 import csv
 from pandas import read_csv, DataFrame, concat
+import os

 SCRIPT_VERSION = "v1.0"
 SAMPLE_ANNOTATION_COLUMNS = [
@@ -25,7 +25,7 @@
 def write_sample_annotation_to_tsv(
-    bam: str, samples: str, strandedness: str, single_end: str, gtf: str, count_file: str, out_file: str
+    bam: str, samples: str, strandedness: str, single_end: str, drop_group_sample: str, out_file: str
 ):
     """Write the Sample Annotation tsv file."""
     with open(out_file, "w") as tsv_file:
@@ -34,9 +34,9 @@ def write_sample_annotation_to_tsv(
         for index, id in enumerate(samples):
             sa_dict: dict = {}.fromkeys(SAMPLE_ANNOTATION_COLUMNS, "NA")
             sa_dict["RNA_ID"] = id
-            sa_dict["DROP_GROUP"] = "outrider,fraser"
-            sa_dict["GENE_COUNTS_FILE"] = count_file
-            sa_dict["GENE_ANNOTATION"] = Path(gtf).stem
+            sa_dict["DROP_GROUP"] = drop_group_sample
+            sa_dict["GENE_COUNTS_FILE"] = "NA"
+            sa_dict["GENE_ANNOTATION"] = "NA"
             sa_dict["STRAND"] = strandedness[index]
             sa_dict["PAIRED_END"] = is_paired_end(single_end[index])
             sa_dict["RNA_BAM_FILE"] = bam[index]
@@ -50,14 +50,16 @@ def is_paired_end(single_end: str) -> bool:
     return False


-def write_final_annot_to_tsv(count_file: str, ref_annot: str, out_file: str):
+def write_final_annot_to_tsv(ref_count_file: str, ref_annot: str, out_file: str):
    """
    Concatenates the Sample Annotation produced by SampleAnnotation with the one
    provided for the reference samples, checking for duplicate sample IDs
    """
    df_samples: DataFrame = read_csv("drop_annotation_given_samples.tsv", sep="\t")
    df_reference: DataFrame = read_csv(ref_annot, sep="\t")
-    df_reference["GENE_COUNTS_FILE"] = count_file
+    df_reference["GENE_COUNTS_FILE"] = ref_count_file
+    df_reference["SPLICE_COUNTS_DIR"] = df_reference["SPLICE_COUNTS_DIR"].str.rstrip("/").apply(os.path.basename)
+    df_reference["DROP_GROUP"] = df_reference["DROP_GROUP"].str.replace(" ", "")
    df_samples["COUNT_OVERLAPS"] = df_reference["COUNT_OVERLAPS"].iloc[0]
    df_samples["COUNT_MODE"] = df_reference["COUNT_MODE"].iloc[0]
    df_samples["HPO_TERMS"] = df_reference["HPO_TERMS"].iloc[0]
@@ -74,15 +76,15 @@ def parse_args(argv=None):
        formatter_class=argparse.MetavarTypeHelpFormatter,
        description="""Generate DROP sample annotation for patients.""",
    )
-    parser.add_argument("--bam", type=str, nargs="+", help="bam files for the patient", required=True)
+    parser.add_argument("--bam", type=str, nargs="+", help="bam files for the analyzed samples", required=True)
    parser.add_argument("--samples", type=str, nargs="+", help="corresponding sample name", required=True)
    parser.add_argument("--strandedness", type=str, nargs="+", help="strandedness of RNA", required=True)
    parser.add_argument("--single_end", type=str, nargs="+", help="is the sample paired end?", required=True)
-    parser.add_argument("--gtf", type=str, help="Transcript annotation file in gtf format", required=True)
    parser.add_argument(
-        "--count_file", type=str, help="A tsv file of gene counts for all processed samples.", required=True
+        "--ref_count_file", type=str, help="A tsv file of gene counts for reference samples.", required=True
    )
    parser.add_argument("--ref_annot", type=str, help="Path to reference annotation tsv", required=True)
+    parser.add_argument("--drop_group_sample", type=str, help="DROP group of the analyzed samples", required=True)
    parser.add_argument("--output", type=str, help="Path to save to", required=True)
    parser.add_argument("--version", action="version", version=SCRIPT_VERSION)
    return parser.parse_args(argv)
@@ -96,11 +98,10 @@ def main():
        samples=args.samples,
        strandedness=args.strandedness,
        single_end=args.single_end,
-        gtf=args.gtf,
-        count_file=args.count_file,
+        drop_group_sample=args.drop_group_sample,
        out_file="drop_annotation_given_samples.tsv",
    )
-    write_final_annot_to_tsv(count_file=args.count_file, ref_annot=args.ref_annot, out_file=args.output)
+    write_final_annot_to_tsv(ref_count_file=args.ref_count_file, ref_annot=args.ref_annot, out_file=args.output)


 if __name__ == "__main__":
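The two added reference-table normalisations are easy to get subtly wrong; a standalone sketch of their effect, with made-up reference values (the `rstrip("/")` matters because `os.path.basename("a/b/")` returns an empty string):

```python
import os
import pandas as pd

# Standalone rendering of the two lines added above, with invented values.
df_reference = pd.DataFrame({
    "SPLICE_COUNTS_DIR": ["/refs/drop/splice_counts/"],
    "DROP_GROUP": ["outrider, fraser"],
})
df_reference["SPLICE_COUNTS_DIR"] = df_reference["SPLICE_COUNTS_DIR"].str.rstrip("/").apply(os.path.basename)
df_reference["DROP_GROUP"] = df_reference["DROP_GROUP"].str.replace(" ", "", regex=False)
print(df_reference.iloc[0].to_dict())
# {'SPLICE_COUNTS_DIR': 'splice_counts', 'DROP_GROUP': 'outrider,fraser'}
```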
diff --git a/conf/modules.config b/conf/modules.config
index 37b5e7af..ad221d39 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -181,6 +181,7 @@ process {
    withName: '.*PREPARE_REFERENCES:GFFREAD' {
        ext.when = { (!params.transcript_fasta) }
+        ext.args = { '-w' }
        publishDir = [
            path: { "${params.outdir}/references" },
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
@@ -316,15 +317,6 @@ process {
 //
 process {

-    withName: '.*ANALYSE_TRANSCRIPTS:DROP_COUNTS' {
-        ext.when = {params.run_drop_ae}
-        publishDir = [
-            path: { "${params.outdir}/analyse_transcripts/drop/AE" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-        ]
-    }
-
    withName: '.*ANALYSE_TRANSCRIPTS:DROP_ANNOT' {
        ext.when = {params.run_drop_ae|params.run_drop_as}
        publishDir = [
@@ -473,13 +465,11 @@ process {
    withName: '.*ANNOTATE_SNV:ENSEMBLVEP' {
-        ext.prefix = { "${vcf.simpleName}_rohann_vcfanno_filter_vep" }
+        ext.prefix = { "${vcf.simpleName}_vep" }
        ext.args = [
-            '--dir_plugins vep_cache/Plugins',
-            '--plugin LoFtool,vep_cache/LoFtool_scores.txt',
-            '--plugin pLI,vep_cache/pLI_values_107.txt',
-            '--plugin SpliceAI,snv=vep_cache/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz,indel=vep_cache/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz',
-            '--plugin MaxEntScan,vep_cache/fordownload,SWA,NCSS',
+            '--dir_plugins cache/Plugins',
+            '--plugin LoFtool,cache/Plugins/LoFtool_scores.txt',
+            '--plugin pLI,cache/Plugins/pLI_values_107.txt',
            '--distance 5000',
            '--buffer_size 20000',
            '--format vcf --max_sv_size 248956422',
diff --git a/conf/test.config b/conf/test.config
index efe42266..c7a9bcba 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -32,7 +32,7 @@ params {
    star_two_pass_mode = 'None'
    subsample_bed = "${projectDir}/test_data/subsample.bed"
    seed_frac = 0.001
-    num_reads = 10000
+    num_reads = 20000

    // VEP
    vep_cache = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vep_cache_and_plugins.tar.gz"
diff --git a/modules/local/drop_config_runAE.nf b/modules/local/drop_config_runAE.nf
index 3c1465d9..26d39fe0 100644
--- a/modules/local/drop_config_runAE.nf
+++ b/modules/local/drop_config_runAE.nf
@@ -13,8 +13,11 @@ process DROP_CONFIG_RUN_AE {
    tuple val(meta), path(fasta), path(fai)
    path gtf
    path sample_annotation
-    path gene_counts
+    tuple path(bam), path(bai)
+    path ref_drop_count_file
+    path ref_splice_folder
    val(genome)
+    val(drop_group_samples_ae)
    val(drop_padjcutoff_ae)
    val(drop_zScoreCutoff)
@@ -30,6 +33,7 @@ process DROP_CONFIG_RUN_AE {
    script:
    def genome_assembly = "${genome}".contains("h37") ? "hg19" : "${genome}"
+    def drop_group = "${drop_group_samples_ae}".replace(" ","")
    def zscorecutoff = drop_zScoreCutoff ? "--zscorecutoff ${drop_zScoreCutoff}" : ''

    """
@@ -42,6 +46,7 @@ process DROP_CONFIG_RUN_AE {
        --gtf ${gtf} \\
        --drop_module AE \\
        --genome_assembly $genome_assembly \\
+        --drop_group_samples $drop_group \\
        --padjcutoff ${drop_padjcutoff_ae} \\
        $zscorecutoff \\
        --output config.yaml
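The two `def` lines in this module are plain string normalisations. In Python terms (a sketch of the same logic, not the module code):

```python
# Python rendering of the two def lines above.
def genome_assembly(genome: str) -> str:
    # hs37d5-style genome names are mapped to DROP's "hg19" label
    return "hg19" if "h37" in genome else genome

def drop_group(groups: str) -> str:
    # DROP group names must not contain stray spaces
    return groups.replace(" ", "")

print(genome_assembly("hs37d5"), genome_assembly("GRCh38"))  # hg19 GRCh38
print(drop_group("outrider, fraser"))                        # outrider,fraser
```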
"hg19" : "${genome}" + def drop_group = "${drop_group_samples_as}".replace(" ","") """ TMPDIR=\$PWD @@ -39,6 +42,7 @@ process DROP_CONFIG_RUN_AS { --gtf ${gtf}\\ --drop_module AS \\ --genome_assembly $genome_assembly \\ + --drop_group_samples $drop_group \\ --padjcutoff ${drop_padjcutoff_as} \\ --output config.yaml diff --git a/modules/local/drop_counts.nf b/modules/local/drop_counts.nf deleted file mode 100644 index b28a33fc..00000000 --- a/modules/local/drop_counts.nf +++ /dev/null @@ -1,52 +0,0 @@ -process DROP_COUNTS { - tag "DROP_counts" - label 'process_low' - - // Exit if running this module with -profile conda / -profile mamba - if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - exit 1, "Local DROP module does not support Conda. Please use Docker / Singularity / Podman instead." - } - - container "docker.io/clinicalgenomics/drop:1.3.3" - - input: - path(counts) - val(samples) - path(gtf) - path(reference_count_file) - - output: - path('processed_geneCounts.tsv.gz'), emit: processed_gene_counts - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def ids = "${samples.id}".replace("[","").replace("]","").replace(",","") - def strandedness = "${samples.strandedness}".replace("[","").replace("]","").replace(",","") - """ - $baseDir/bin/drop_counts.py \\ - --star ${counts} \\ - --samples $ids \\ - --strandedness $strandedness \\ - --ref_count_file ${reference_count_file} \\ - --output processed_geneCounts.tsv.gz \\ - --gtf ${gtf} \\ - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - drop_counts: \$(\$baseDir/bin/drop_counts.py --version ) - END_VERSIONS - """ - - stub: - """ - touch processed_geneCounts.tsv.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - drop_counts: \$(\$baseDir/bin/drop_counts.py --version ) - END_VERSIONS - """ -} diff --git a/modules/local/drop_sample_annot.nf b/modules/local/drop_sample_annot.nf index fda4abd5..248d8b19 100644 --- a/modules/local/drop_sample_annot.nf +++ b/modules/local/drop_sample_annot.nf @@ -12,9 +12,10 @@ process DROP_SAMPLE_ANNOT { input: path(bam) val(samples) - path(processed_gene_counts) + path(ref_gene_counts) path(ref_annot) - path(gtf) + val(drop_group_samples_ae) + val(drop_group_samples_as) output: path('sample_annotation.tsv'), emit: drop_annot @@ -27,15 +28,16 @@ process DROP_SAMPLE_ANNOT { def ids = "${samples.id}".replace("[","").replace("]","").replace(",","") def strandedness = "${samples.strandedness}".replace("[","").replace("]","").replace(",","") def single_end = "${samples.single_end}".replace("[","").replace("]","").replace(",","") + def drop_group = "${drop_group_samples_ae},${drop_group_samples_as}".replace(" ","").replace("[","").replace("]","") """ $baseDir/bin/drop_sample_annot.py \\ --bam ${bam} \\ --samples $ids \\ --strandedness $strandedness \\ --single_end $single_end \\ - --gtf ${gtf} \\ - --count_file ${processed_gene_counts} \\ + --ref_count_file ${ref_gene_counts} \\ --ref_annot ${ref_annot} \\ + --drop_group_sample $drop_group \\ --output sample_annotation.tsv cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/gffread.nf b/modules/local/gffread.nf index 66546a04..98863efa 100644 --- a/modules/local/gffread.nf +++ b/modules/local/gffread.nf @@ -12,8 +12,8 @@ process GFFREAD { tuple val(meta2), path(fasta), path(fai) output: - path "*${prefix}.gtf" , emit: gtf, optional: true - path "*${prefix}.fa" , emit: tr_fasta, optional: true + path "*_gffread_output.gtf", emit: gtf, optional: true + 
path "*_gffread_output.fa" , emit: tr_fasta, optional: true path "versions.yml" , emit: versions when: @@ -22,14 +22,13 @@ process GFFREAD { script: def args = task.ext.args ?: '' def genome = fasta ? "-g ${fasta}" : '' - def prefix = task.ext.prefix ?: "${meta.id}_gffread" - def outp = args.contains("-w ") ? "" : "-o ${prefix}.gtf" + def prefix = task.ext.prefix ?: "${meta.id}" + def outp = args.contains("-w") ? "-w ${prefix}_gffread_output.fa" : "-o ${prefix}_gffread_output.gtf" """ gffread \\ $gff \\ $genome \\ - $args \\ $outp cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -40,8 +39,8 @@ process GFFREAD { stub: def prefix = task.ext.prefix ?: "${meta.id}_gffread" """ - touch ${prefix}.gtf - touch ${prefix}.fa + touch ${prefix}_gffread_output.gtf + touch ${prefix}_gffread_output.fa cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 9886581c..02f9e422 100644 --- a/nextflow.config +++ b/nextflow.config @@ -37,6 +37,8 @@ params { bcftools_caller_mode = 'multiallelic' run_drop_ae = true run_drop_as = true + drop_group_samples_ae = 'outrider' + drop_group_samples_as = 'fraser' drop_padjcutoff_ae = 0.05 drop_padjcutoff_as = 0.1 drop_zscorecutoff = 0 diff --git a/nextflow_schema.json b/nextflow_schema.json index f29bc040..b17fccc0 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -261,6 +261,18 @@ "description": "Should DROP Aberrant Splicing module be run?", "fa_icon": "fas fa-toggle-off" }, + "drop_group_samples_ae": { + "type": "string", + "default": "outrider", + "description": "DROP group to run when AE only one allowed. Make sure it matches your reference annotation file.", + "fa_icon": "fas fa-list-ol" + }, + "drop_group_samples_as": { + "type": "string", + "default": "fraser", + "description": "DROP group to run when AS only one allowed. 
diff --git a/nextflow.config b/nextflow.config
index 9886581c..02f9e422 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -37,6 +37,8 @@ params {
    bcftools_caller_mode = 'multiallelic'
    run_drop_ae = true
    run_drop_as = true
+    drop_group_samples_ae = 'outrider'
+    drop_group_samples_as = 'fraser'
    drop_padjcutoff_ae = 0.05
    drop_padjcutoff_as = 0.1
    drop_zscorecutoff = 0
diff --git a/nextflow_schema.json b/nextflow_schema.json
index f29bc040..b17fccc0 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -261,6 +261,18 @@
            "description": "Should DROP Aberrant Splicing module be run?",
            "fa_icon": "fas fa-toggle-off"
        },
+        "drop_group_samples_ae": {
+            "type": "string",
+            "default": "outrider",
+            "description": "DROP group to use for aberrant expression; only one group is allowed. Make sure it matches your reference annotation file.",
+            "fa_icon": "fas fa-list-ol"
+        },
+        "drop_group_samples_as": {
+            "type": "string",
+            "default": "fraser",
+            "description": "DROP group to use for aberrant splicing; only one group is allowed. Make sure it matches your reference annotation file.",
+            "fa_icon": "fas fa-list-ol"
+        },
        "drop_padjcutoff_ae": {
            "type": "number",
            "default": 0.05,
diff --git a/subworkflows/local/alignment.nf b/subworkflows/local/alignment.nf
index 8c41b26d..b0fca701 100644
--- a/subworkflows/local/alignment.nf
+++ b/subworkflows/local/alignment.nf
@@ -43,21 +43,23 @@ workflow ALIGNMENT {
        RNA_SUBSAMPLE_REGION( STAR_ALIGN.out.bam, subsample_bed, seed_frac)
        ch_bam_bai = ch_bam_bai.mix(RNA_SUBSAMPLE_REGION.out.bam_bai)
        if (!downsample_switch) {
-            ch_bam_bai_out = ch_bam_bai.mix(RNA_SUBSAMPLE_REGION.out.bam_bai)
+            ch_bam_bai_out = RNA_SUBSAMPLE_REGION.out.bam_bai
+        } else {
+            RNA_DOWNSAMPLE( ch_bam_bai, num_reads)
+            ch_bam_bai_out = RNA_DOWNSAMPLE.out.bam_bai
        }
    } else {
        ch_bam_bai = ch_bam_bai.mix(STAR_ALIGN.out.bam.join(SAMTOOLS_INDEX.out.bai))
        if (!downsample_switch) {
            ch_bam_bai_out = STAR_ALIGN.out.bam.join(SAMTOOLS_INDEX.out.bai)
+        } else {
+            RNA_DOWNSAMPLE( ch_bam_bai, num_reads)
+            ch_bam_bai_out = RNA_DOWNSAMPLE.out.bam_bai
        }
    }

    SAMTOOLS_VIEW( STAR_ALIGN.out.bam.join(SAMTOOLS_INDEX.out.bai), ch_genome_fasta, [] )

-    if (downsample_switch) {
-        RNA_DOWNSAMPLE( ch_bam_bai, num_reads)
-        ch_bam_bai_out = ch_bam_bai.mix(RNA_DOWNSAMPLE.out.bam_bai)
-    }

    SALMON_QUANT( FASTP.out.reads, salmon_index, gtf, [], false, 'A')
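The refactor replaces the trailing `if (downsample_switch)` block with explicit if/else branches, so `ch_bam_bai_out` is assigned exactly once per configuration instead of being mixed with earlier channels. The four cases it now covers, as a sketch (names stand in for the Nextflow channels):

```python
# Decision table for the refactored subworkflow; one assignment per case.
def bam_bai_out(subsample: bool, downsample: bool) -> str:
    if subsample:
        return "RNA_DOWNSAMPLE(subsampled)" if downsample else "RNA_SUBSAMPLE_REGION"
    return "RNA_DOWNSAMPLE(aligned)" if downsample else "STAR_ALIGN + index"

for sub in (True, False):
    for down in (True, False):
        print(f"subsample={sub} downsample={down} -> {bam_bai_out(sub, down)}")
```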
diff --git a/subworkflows/local/analyse_transcripts.nf b/subworkflows/local/analyse_transcripts.nf
index 2e997c8f..10c9927a 100644
--- a/subworkflows/local/analyse_transcripts.nf
+++ b/subworkflows/local/analyse_transcripts.nf
@@ -4,7 +4,6 @@

 include { STRINGTIE_STRINGTIE } from '../../modules/nf-core/stringtie/stringtie/main'
 include { GFFCOMPARE } from '../../modules/nf-core/gffcompare/main'
-include { DROP_COUNTS } from '../../modules/local/drop_counts'
 include { DROP_SAMPLE_ANNOT } from '../../modules/local/drop_sample_annot'
 include { DROP_CONFIG_RUN_AE } from '../../modules/local/drop_config_runAE'
 include { DROP_CONFIG_RUN_AS } from '../../modules/local/drop_config_runAS'
@@ -13,6 +12,7 @@ workflow ANALYSE_TRANSCRIPTS {

    take:
    ch_bam_bai                // channel (mandatory): [ val(meta), [ path(bam) ],[ path(bai) ] ]
+    ch_bam_ds_bai             // channel (mandatory): [ val(meta), [ path(bam) ],[ path(bai) ] ]
    ch_gtf                    // channel (mandatory): [ path(gtf) ]
    ch_fasta_fai_meta         // channel (mandatory): [ val(meta), [ path(fasta), path(fai) ]
    gene_counts               // channel [val(meta), path(tsv)]
@@ -20,6 +20,8 @@ workflow ANALYSE_TRANSCRIPTS {
    ch_ref_drop_annot_file    // channel [ path(tsv) ]
    ch_ref_drop_splice_folder // channel [ path(folder) ]
    genome                    // channel [val(genome)]
+    drop_group_samples_ae     // channel [val(drop_group_samples_ae)]
+    drop_group_samples_as     // channel [val(drop_group_samples_as)]
    drop_padjcutoff_ae        // channel [val(drop_padjcutoff_ae)]
    drop_padjcutoff_as        // channel [val(drop_padjcutoff_as)]
    drop_zscorecutoff         // channel [val(drop_zscorecutoff)]
@@ -30,47 +32,45 @@ workflow ANALYSE_TRANSCRIPTS {

    // DROP
    // Generates count files for samples and merges them with reference count file
-    star_count = gene_counts.map{ meta, cnt_file -> cnt_file }.collect()
-    star_samples = gene_counts.map{ meta, cnt_file -> meta }.collect()
-    DROP_COUNTS(
-        star_count,
-        star_samples,
-        ch_gtf,
-        ch_ref_drop_count_file
-    )
-    ch_drop_counts = DROP_COUNTS.out.processed_gene_counts.collect()

    // Generates sample annotation
-    ch_bam_files = ch_bam_bai.collect{it[1]}
+    star_samples = gene_counts.map{ meta, cnt_file -> meta }.collect()
+    ch_bam_files = ch_bam_ds_bai.collect{it[1]}
    DROP_SAMPLE_ANNOT(
        ch_bam_files,
        star_samples,
-        ch_drop_counts,
+        ch_ref_drop_count_file,
        ch_ref_drop_annot_file,
-        ch_gtf
+        drop_group_samples_ae,
+        drop_group_samples_as
    )

    // Generates config file and runs Aberrant expression module
+    ch_bai_files = ch_bam_ds_bai.collect{ it[2] }.toList()
+    ch_bam_bai_files = ch_bam_files.toList().combine(ch_bai_files)
    DROP_CONFIG_RUN_AE(
        ch_fasta_fai_meta,
        ch_gtf,
        DROP_SAMPLE_ANNOT.out.drop_annot,
-        ch_drop_counts,
+        ch_bam_bai_files,
+        ch_ref_drop_count_file,
+        ch_ref_drop_splice_folder,
        genome,
+        drop_group_samples_ae,
        drop_padjcutoff_ae,
        drop_zscorecutoff
    )

    // Generates config file and runs Aberrant splicing module
-    ch_bai_files = ch_bam_bai.collect{it[2]}.toList()
-    ch_bam_bai_files=ch_bam_files.toList().combine(ch_bai_files)
    DROP_CONFIG_RUN_AS(
        ch_fasta_fai_meta,
        ch_gtf,
        DROP_SAMPLE_ANNOT.out.drop_annot,
        ch_bam_bai_files,
+        ch_ref_drop_count_file,
        ch_ref_drop_splice_folder,
        genome,
+        drop_group_samples_as,
        drop_padjcutoff_as
    )
@@ -90,7 +90,7 @@ workflow ANALYSE_TRANSCRIPTS {
    )

    // Stringtie
-    ch_bam = ch_bam_bai.map{ meta, bam, bai -> [meta, [bam]]}
+    ch_bam = ch_bam_bai.map{ meta, bam, bai -> [meta, [bam]] }
    STRINGTIE_STRINGTIE(
        ch_bam,
        ch_gtf
@@ -103,7 +103,6 @@ workflow ANALYSE_TRANSCRIPTS {
        ch_gtf.map{ gtf -> [ [id:gtf.simpleName], gtf ] }
    )

-    ch_versions = ch_versions.mix(DROP_COUNTS.out.versions)
    ch_versions = ch_versions.mix(DROP_SAMPLE_ANNOT.out.versions)
    ch_versions = ch_versions.mix(DROP_CONFIG_RUN_AE.out.versions)
    ch_versions = ch_versions.mix(DROP_CONFIG_RUN_AS.out.versions)
@@ -116,7 +115,6 @@ workflow ANALYSE_TRANSCRIPTS {
    coverage_gtf = STRINGTIE_STRINGTIE.out.coverage_gtf // channel: [ val(meta), [ path(coverage_gtf) ] ]
    annotated_gtf = GFFCOMPARE.out.annotated_gtf        // channel: [ val(meta), [ path(annotated_gtf) ] ]
    stats_gtf = GFFCOMPARE.out.stats                    // channel: [ val(meta), [ path(stats) ] ]
-    processed_gene_counts = DROP_COUNTS.out.processed_gene_counts // channel: [ path(tsv) ]
    annotation_drop = DROP_SAMPLE_ANNOT.out.drop_annot  // channel: [ path(sample_annotation.tsv) ]
    config_drop_ae = DROP_CONFIG_RUN_AE.out.config_drop // channel: [ path(confg_file.yml) ]
    drop_ae_out = DROP_CONFIG_RUN_AE.out.drop_ae_out    // channel: [ path(drop_output_AE) ]
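The bam/bai pairing is now built once and fed to both DROP config modules: `collect{ it[1] }` and `collect{ it[2] }` pull the bam and bai paths out of the `[meta, bam, bai]` tuples, and `toList().combine(...)` yields a single `[bams, bais]` pair. In Python terms (a sketch with invented sample names):

```python
# Sketch of the channel reshaping: [meta, bam, bai] tuples become one
# ([all bams], [all bais]) pair consumed by both DROP config modules.
ch_bam_ds_bai = [
    ({"id": "sampleA"}, "sampleA.bam", "sampleA.bam.bai"),
    ({"id": "sampleB"}, "sampleB.bam", "sampleB.bam.bai"),
]
bams = [t[1] for t in ch_bam_ds_bai]  # collect{ it[1] }
bais = [t[2] for t in ch_bam_ds_bai]  # collect{ it[2] }
print((bams, bais))
# (['sampleA.bam', 'sampleB.bam'], ['sampleA.bam.bai', 'sampleB.bam.bai'])
```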
diff --git a/subworkflows/local/prepare_references.nf b/subworkflows/local/prepare_references.nf
index 7706c9bb..05097380 100644
--- a/subworkflows/local/prepare_references.nf
+++ b/subworkflows/local/prepare_references.nf
@@ -31,8 +31,10 @@ workflow PREPARE_REFERENCES {
    main:
    ch_versions = Channel.empty()

-    GUNZIP_FASTA(fasta)
-    ch_fasta = GUNZIP_FASTA.out.gunzip ? GUNZIP_FASTA.out.gunzip.collect() : fasta
+    fasta_meta = Channel.fromPath(fasta).map { it -> [ [id:it[0] ], it] }.collect()
+    GUNZIP_FASTA(fasta_meta)
+    ch_fasta = fasta.endsWith(".gz") ? GUNZIP_FASTA.out.gunzip.collect() : fasta_meta
+

    // If no genome indices, create it
    SAMTOOLS_FAIDX_GENOME(ch_fasta,[[],[]])
@@ -41,38 +43,35 @@ workflow PREPARE_REFERENCES {
    BUILD_DICT(ch_fasta)
    ch_dict = BUILD_DICT.out.dict.collect()

-    gtf_meta=channel.of(gtf).map{it -> [[id:it[0]], it]}.collect()
+    gtf_meta = Channel.fromPath(gtf).map{ it -> [ [id:it[0]], it ] }.collect()
    GUNZIP_GTF(gtf_meta)
-    ch_gtf_no_meta = GUNZIP_GTF.out.gunzip ? GUNZIP_GTF.out.gunzip.map{ meta, gtf -> [gtf] }.collect() : Channel.fromPath(gtf)
+    ch_gtf_no_meta = gtf.endsWith(".gz") ? GUNZIP_GTF.out.gunzip.map{ meta, gtf -> [gtf] }.collect() : Channel.fromPath(gtf)

    // Get chrom sizes
    GET_CHROM_SIZES( ch_fai )

-    ch_fasta_no_meta = ch_fasta.map{ meta, fasta -> [ fasta ] }
+    ch_fasta_no_meta = ch_fasta.map{ meta, fasta -> [ fasta ] }
    ch_star = star_index ? Channel.fromPath(star_index).collect() : Channel.empty()
    BUILD_STAR_GENOME (ch_fasta_no_meta, ch_gtf_no_meta)
    UNTAR_STAR_INDEX( ch_star.map { it -> [[:], it] } )
    ch_star_index = (!star_index) ? BUILD_STAR_GENOME.out.index.collect() :
-        (star_index.endsWith(".gz") ? UNTAR_STAR_INDEX.out.untar.map { it[1] }.collect() : star_index)
-
+        (star_index.endsWith(".gz") ? UNTAR_STAR_INDEX.out.untar.map { it[1] } : star_index)

    // Convert gtf to refflat for picard
    GTF_TO_REFFLAT(ch_gtf_no_meta)

    // Get rRNA transcripts and convert to interval_list format
    GET_RRNA_TRANSCRIPTS(ch_gtf_no_meta)
-    BEDTOINTERVALLIST( GET_RRNA_TRANSCRIPTS.out.bed.map {it -> [ [id:it.name], it ]}, ch_dict )
+    BEDTOINTERVALLIST( GET_RRNA_TRANSCRIPTS.out.bed.map { it -> [ [id:it.name], it ] }, ch_dict )

    UNTAR_VEP_CACHE (ch_vep_cache)

    // Preparing transcript fasta
    ch_fasta_fai = ch_fasta.join(ch_fai).collect()
-    ch_tr_fasta = transcript_fasta ? Channel.fromPath(transcript_fasta).map {it -> [[id:it[0].simpleName], it]}.collect() : Channel.empty()
-    GFFREAD(ch_gtf_no_meta.map{it -> [[id:it[0].simpleName], it]},ch_fasta_fai)
-    GUNZIP_TRFASTA(ch_tr_fasta)
-    transcript_fasta_no_meta = (!transcript_fasta) ? GFFREAD.out.tr_fasta :
-        (transcript_fasta.endsWith(".gz") ? GUNZIP_TRFASTA.out.gunzip.map{ meta, fasta -> [ fasta ] } : ch_tr_fasta.map{ meta, fasta -> [ fasta ] })
+    GFFREAD(ch_gtf_no_meta.map{ it -> [ [id:it[0].simpleName], it ] },ch_fasta_fai)
+    transcript_fasta_no_meta = (!transcript_fasta) ? GFFREAD.out.tr_fasta.collect() :
+        (transcript_fasta.endsWith(".gz") ? GUNZIP_TRFASTA.out.gunzip.collect().map{ meta, fasta -> [ fasta ] } : transcript_fasta)

    // Setting up Salmon index
    ch_salmon = salmon_index ? Channel.fromPath(salmon_index).collect() : Channel.empty()
@@ -80,7 +79,7 @@ workflow PREPARE_REFERENCES {
    SALMON_INDEX(ch_fasta_no_meta, transcript_fasta_no_meta)
    ch_salmon_index = (!salmon_index) ? SALMON_INDEX.out.index.collect() :
-        (salmon_index.endsWith(".gz") ? UNTAR_SALMON_INDEX.out.untar.map { it[1] }.collect() : salmon_index)
+        (salmon_index.endsWith(".gz") ? UNTAR_SALMON_INDEX.out.untar.map { it[1] } : salmon_index)

    ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions)
    ch_versions = ch_versions.mix(SAMTOOLS_FAIDX_GENOME.out.versions)
@@ -108,6 +107,6 @@ workflow PREPARE_REFERENCES {
    refflat = GTF_TO_REFFLAT.out.refflat.collect()
    rrna_bed = GET_RRNA_TRANSCRIPTS.out.bed.collect()
    interval_list = BEDTOINTERVALLIST.out.interval_list.map{ meta, interv -> [interv] }.collect()
-    vep_resources = UNTAR_VEP_CACHE.out.untar.map{meta, files -> [files]}.collect() // channel: [ path(cache) ]
+    vep_resources = UNTAR_VEP_CACHE.out.untar.map{ meta, files -> [files] }.collect() // channel: [ path(cache) ]
    versions = ch_versions
}
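A pattern recurs through this subworkflow: the gunzip/untar processes always run, but the downstream channel is now chosen from the parameter's filename rather than from the process output. The selection rule, as a sketch:

```python
# The rule used above for fasta, gtf, star_index and salmon_index: take the
# unpacked output only when the supplied path ends in .gz, otherwise pass
# the original reference through untouched.
def pick(reference: str, unpacked: str) -> str:
    return unpacked if reference.endswith(".gz") else reference

print(pick("genome.fa.gz", "genome.fa"))  # genome.fa  (unpacked output)
print(pick("genome.fa", "genome.fa"))     # genome.fa  (original reference)
```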
diff --git a/test_data/drop_data/sampleAnnotation.tsv b/test_data/drop_data/sampleAnnotation.tsv
index 01ce87d3..9f2cfe0b 100644
--- a/test_data/drop_data/sampleAnnotation.tsv
+++ b/test_data/drop_data/sampleAnnotation.tsv
@@ -4,7 +4,7 @@ R23611	NA	NA	NA	outrider, fraser	TRUE	IntersectionStrict	TRUE	reverse	drop_data
 R97979	NA	NA	NA	outrider, fraser	TRUE	IntersectionStrict	TRUE	reverse	drop_data	NA	drop_data/geneCounts.tsv	grch37_chr21	NA
 R73084	NA	NA	NA	outrider, fraser	TRUE	IntersectionStrict	TRUE	reverse	drop_data	NA	drop_data/geneCounts.tsv	grch37_chr21	NA
 R34104	NA	NA	NA	outrider, fraser	TRUE	IntersectionStrict	TRUE	reverse	drop_data	NA	drop_data/geneCounts.tsv	grch37_chr21	NA
-R18431	NA	NA	NA	outrider, fraser	TRUE	IntersectionStrict	TRUE	reverse	drop_data	NA	drop_data/geneCounts.tsv	grch37_chr21	NA
+R18431	NA	NA	NA	fraser	TRUE	IntersectionStrict	TRUE	reverse	drop_data	NA	drop_data/geneCounts.tsv	grch37_chr21	NA
 R83610	NA	NA	NA	outrider, fraser	TRUE	IntersectionStrict	TRUE	reverse	drop_data	NA	drop_data/geneCounts.tsv	grch37_chr21	NA
 R0014	NA	NA	NA	outrider, fraser	TRUE	IntersectionStrict	TRUE	reverse	drop_data	NA	drop_data/geneCounts.tsv	grch37_chr21	NA
 R17262	NA	NA	NA	outrider, fraser	TRUE	IntersectionStrict	TRUE	reverse	drop_data	NA	drop_data/geneCounts.tsv	grch37_chr21	NA
diff --git a/workflows/tomte.nf b/workflows/tomte.nf
index 7a88a17d..b0efea15 100644
--- a/workflows/tomte.nf
+++ b/workflows/tomte.nf
@@ -111,7 +111,6 @@ workflow TOMTE {
        exit 1, 'Input samplesheet not specified!'
    }

-    fasta = Channel.fromPath(params.fasta).map { it -> [[id:it[0].simpleName], it] }.collect()
    ch_vep_cache_unprocessed = params.vep_cache ? Channel.fromPath(params.vep_cache).map { it -> [[id:'vep_cache'], it] }.collect()
        : Channel.value([[],[]])
    ch_vep_filters = params.vep_filters ? Channel.fromPath(params.vep_filters).collect()
@@ -128,7 +127,7 @@ workflow TOMTE {

    PREPARE_REFERENCES(
-        fasta,
+        params.fasta,
        fai,
        params.star_index,
        params.gtf,
@@ -195,6 +194,7 @@ workflow TOMTE {
    // Analyse transcripts
    ANALYSE_TRANSCRIPTS(
        ch_alignment.bam_bai,
+        ch_alignment.bam_ds_bai,
        ch_references.gtf,
        ch_references.fasta_fai_meta,
        ch_alignment.gene_counts,
@@ -202,6 +202,8 @@ workflow TOMTE {
        ch_ref_drop_annot_file,
        ch_ref_drop_splice_folder,
        params.genome,
+        params.drop_group_samples_ae,
+        params.drop_group_samples_as,
        params.drop_padjcutoff_ae,
        params.drop_padjcutoff_as,
        params.drop_zscorecutoff,
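The sampleAnnotation.tsv tweak (R18431 now in `fraser` only) exercises the new per-module groups end to end: DROP selects samples by membership of the group named in the config, so that row should take part in the aberrant-splicing run but not the aberrant-expression one. A sketch of that membership test, in the spirit of DROP's sample selection rather than its implementation, using the test-data values:

```python
# Group-membership check in the spirit of DROP's sample selection (a sketch,
# not DROP's code), using DROP_GROUP values from the edited test data.
annotation = {"R23611": "outrider,fraser", "R18431": "fraser"}

def in_group(drop_group: str, group: str) -> bool:
    return group in drop_group.split(",")

print([s for s, g in annotation.items() if in_group(g, "outrider")])
# ['R23611']
print([s for s, g in annotation.items() if in_group(g, "fraser")])
# ['R23611', 'R18431']
```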