From f725565a64649ec9daeb6b46a96313e54f9a2f13 Mon Sep 17 00:00:00 2001 From: "lucia.pena.perez@scilifelab.se" Date: Fri, 6 Oct 2023 10:37:55 +0200 Subject: [PATCH 01/10] feat added drop_group as a parameter --- bin/drop_config.py | 10 ++++++ bin/drop_sample_annot.py | 27 ++++++++------- modules/local/drop_config_runAE.nf | 7 +++- modules/local/drop_config_runAS.nf | 4 +++ modules/local/drop_sample_annot.nf | 10 +++--- nextflow.config | 2 ++ nextflow_schema.json | 12 +++++++ subworkflows/local/alignment.nf | 12 ++++--- subworkflows/local/analyse_transcripts.nf | 42 ++++++++++++++--------- workflows/tomte.nf | 7 ++-- 10 files changed, 93 insertions(+), 40 deletions(-) diff --git a/bin/drop_config.py b/bin/drop_config.py index a5eae1d4..09abd544 100755 --- a/bin/drop_config.py +++ b/bin/drop_config.py @@ -85,6 +85,7 @@ def update_config( genome: Path, gtf: Path, genome_assembly: str, + drop_group_samples: str, padjcutoff: float, zscorecutoff: float, drop_module: str, @@ -109,10 +110,12 @@ def update_config( # Export counts if drop_module == "AE": config_copy["aberrantExpression"]["run"] = ["true"] + config_copy["aberrantExpression"]["groups"] = [drop_group_samples] config_copy["aberrantExpression"]["padjCutoff"] = padjcutoff config_copy["aberrantExpression"]["zScoreCutoff"] = zscorecutoff elif drop_module == "AS": config_copy["aberrantSplicing"]["run"] = ["true"] + config_copy["aberrantSplicing"]["groups"] = [drop_group_samples] config_copy["aberrantSplicing"]["padjCutoff"] = padjcutoff elif drop_module == "MAE": config_copy["mae"]["run"] = ["true"] @@ -156,6 +159,12 @@ def parse_args(argv=None): choices=["hg19", "hs37d5", "hg38", "GRCh38"], required=True, ) + parser.add_argument( + "--drop_group_samples", + type=str, + help="Specify drop group to analyse", + required=True, + ) parser.add_argument( "--padjcutoff", type=float, @@ -190,6 +199,7 @@ def main(): genome=args.genome_fasta, gtf=args.gtf, genome_assembly=args.genome_assembly, + drop_group_samples=args.drop_group_samples, padjcutoff=args.padjcutoff, zscorecutoff=args.zscorecutoff, drop_module=args.drop_module, diff --git a/bin/drop_sample_annot.py b/bin/drop_sample_annot.py index 8858f422..72e5ac4e 100755 --- a/bin/drop_sample_annot.py +++ b/bin/drop_sample_annot.py @@ -4,6 +4,7 @@ from pathlib import Path import csv from pandas import read_csv, DataFrame, concat +import os SCRIPT_VERSION = "v1.0" SAMPLE_ANNOTATION_COLUMNS = [ @@ -25,7 +26,7 @@ def write_sample_annotation_to_tsv( - bam: str, samples: str, strandedness: str, single_end: str, gtf: str, count_file: str, out_file: str + bam: str, samples: str, strandedness: str, single_end: str, drop_group_sample:str, out_file: str ): """Write the Sample Annotation tsv file.""" with open(out_file, "w") as tsv_file: @@ -34,9 +35,9 @@ def write_sample_annotation_to_tsv( for index, id in enumerate(samples): sa_dict: dict = {}.fromkeys(SAMPLE_ANNOTATION_COLUMNS, "NA") sa_dict["RNA_ID"] = id - sa_dict["DROP_GROUP"] = "outrider,fraser" - sa_dict["GENE_COUNTS_FILE"] = count_file - sa_dict["GENE_ANNOTATION"] = Path(gtf).stem + sa_dict["DROP_GROUP"] = drop_group_sample + sa_dict["GENE_COUNTS_FILE"] = "NA" + sa_dict["GENE_ANNOTATION"] = "NA" sa_dict["STRAND"] = strandedness[index] sa_dict["PAIRED_END"] = is_paired_end(single_end[index]) sa_dict["RNA_BAM_FILE"] = bam[index] @@ -50,14 +51,17 @@ def is_paired_end(single_end: str) -> bool: return False -def write_final_annot_to_tsv(count_file: str, ref_annot: str, out_file: str): +def write_final_annot_to_tsv(ref_count_file: str, ref_annot: str, 
out_file: str): """ Concatenates the Sample Annotation produced by SampleAnnotation with the one provided for the reference samples, checking for duplicate sample IDs """ df_samples: DataFrame = read_csv("drop_annotation_given_samples.tsv", sep="\t") df_reference: DataFrame = read_csv(ref_annot, sep="\t") - df_reference["GENE_COUNTS_FILE"] = count_file + df_reference["GENE_COUNTS_FILE"] = ref_count_file + df_reference["SPLICE_COUNTS_DIR"] = df_reference["SPLICE_COUNTS_DIR"].str.rstrip('/') + df_reference["SPLICE_COUNTS_DIR"] = df_reference["SPLICE_COUNTS_DIR"].apply(os.path.basename) + df_reference["DROP_GROUP"] = df_reference["DROP_GROUP"].str.replace(" ", "") df_samples["COUNT_OVERLAPS"] = df_reference["COUNT_OVERLAPS"].iloc[0] df_samples["COUNT_MODE"] = df_reference["COUNT_MODE"].iloc[0] df_samples["HPO_TERMS"] = df_reference["HPO_TERMS"].iloc[0] @@ -74,15 +78,15 @@ def parse_args(argv=None): formatter_class=argparse.MetavarTypeHelpFormatter, description="""Generate DROP sample annotation for patients.""", ) - parser.add_argument("--bam", type=str, nargs="+", help="bam files for the patient", required=True) + parser.add_argument("--bam", type=str, nargs="+", help="bam files for the analyzed samples", required=True) parser.add_argument("--samples", type=str, nargs="+", help="corresponding sample name", required=True) parser.add_argument("--strandedness", type=str, nargs="+", help="strandedness of RNA", required=True) parser.add_argument("--single_end", type=str, nargs="+", help="is the sample paired end?", required=True) - parser.add_argument("--gtf", type=str, help="Transcript annotation file in gtf format", required=True) parser.add_argument( - "--count_file", type=str, help="A tsv file of gene counts for all processed samples.", required=True + "--ref_count_file", type=str, help="A tsv file of gene counts for reference samples.", required=True ) parser.add_argument("--ref_annot", type=str, help="Path to reference annotation tsv", required=True) + parser.add_argument("--drop_group_sample", type=str, help="Drop group of analyzed samples", required=True) parser.add_argument("--output", type=str, help="Path to save to", required=True) parser.add_argument("--version", action="version", version=SCRIPT_VERSION) return parser.parse_args(argv) @@ -96,11 +100,10 @@ def main(): samples=args.samples, strandedness=args.strandedness, single_end=args.single_end, - gtf=args.gtf, - count_file=args.count_file, + drop_group_sample=args.drop_group_sample, out_file="drop_annotation_given_samples.tsv", ) - write_final_annot_to_tsv(count_file=args.count_file, ref_annot=args.ref_annot, out_file=args.output) + write_final_annot_to_tsv(ref_count_file=args.ref_count_file, ref_annot=args.ref_annot, out_file=args.output) if __name__ == "__main__": diff --git a/modules/local/drop_config_runAE.nf b/modules/local/drop_config_runAE.nf index 3c1465d9..26d39fe0 100644 --- a/modules/local/drop_config_runAE.nf +++ b/modules/local/drop_config_runAE.nf @@ -13,8 +13,11 @@ process DROP_CONFIG_RUN_AE { tuple val(meta), path(fasta), path(fai) path gtf path sample_annotation - path gene_counts + tuple path(bam), path(bai) + path ref_drop_count_file + path ref_splice_folder val(genome) + val(drop_group_samples_ae) val(drop_padjcutoff_ae) val(drop_zScoreCutoff) @@ -30,6 +33,7 @@ process DROP_CONFIG_RUN_AE { script: def genome_assembly = "${genome}".contains("h37") ? "hg19" : "${genome}" + def drop_group = "${drop_group_samples_ae}".replace(" ","") def zscorecutoff = drop_zScoreCutoff ? 
"--zscorecutoff ${drop_zScoreCutoff}" : '' """ @@ -42,6 +46,7 @@ process DROP_CONFIG_RUN_AE { --gtf ${gtf} \\ --drop_module AE \\ --genome_assembly $genome_assembly \\ + --drop_group_samples $drop_group \\ --padjcutoff ${drop_padjcutoff_ae} \\ $zscorecutoff \\ --output config.yaml diff --git a/modules/local/drop_config_runAS.nf b/modules/local/drop_config_runAS.nf index 5a3c0a2b..8fa7349f 100644 --- a/modules/local/drop_config_runAS.nf +++ b/modules/local/drop_config_runAS.nf @@ -14,8 +14,10 @@ process DROP_CONFIG_RUN_AS { path gtf path sample_annotation tuple path(bam), path(bai) + path ref_drop_count_file path ref_splice_folder val(genome) + val(drop_group_samples_as) val(drop_padjcutoff_as) output: @@ -29,6 +31,7 @@ process DROP_CONFIG_RUN_AS { script: def genome_assembly = "${genome}".contains("h37") ? "hg19" : "${genome}" + def drop_group = "${drop_group_samples_as}".replace(" ","") """ TMPDIR=\$PWD @@ -39,6 +42,7 @@ process DROP_CONFIG_RUN_AS { --gtf ${gtf}\\ --drop_module AS \\ --genome_assembly $genome_assembly \\ + --drop_group_samples $drop_group \\ --padjcutoff ${drop_padjcutoff_as} \\ --output config.yaml diff --git a/modules/local/drop_sample_annot.nf b/modules/local/drop_sample_annot.nf index fda4abd5..248d8b19 100644 --- a/modules/local/drop_sample_annot.nf +++ b/modules/local/drop_sample_annot.nf @@ -12,9 +12,10 @@ process DROP_SAMPLE_ANNOT { input: path(bam) val(samples) - path(processed_gene_counts) + path(ref_gene_counts) path(ref_annot) - path(gtf) + val(drop_group_samples_ae) + val(drop_group_samples_as) output: path('sample_annotation.tsv'), emit: drop_annot @@ -27,15 +28,16 @@ process DROP_SAMPLE_ANNOT { def ids = "${samples.id}".replace("[","").replace("]","").replace(",","") def strandedness = "${samples.strandedness}".replace("[","").replace("]","").replace(",","") def single_end = "${samples.single_end}".replace("[","").replace("]","").replace(",","") + def drop_group = "${drop_group_samples_ae},${drop_group_samples_as}".replace(" ","").replace("[","").replace("]","") """ $baseDir/bin/drop_sample_annot.py \\ --bam ${bam} \\ --samples $ids \\ --strandedness $strandedness \\ --single_end $single_end \\ - --gtf ${gtf} \\ - --count_file ${processed_gene_counts} \\ + --ref_count_file ${ref_gene_counts} \\ --ref_annot ${ref_annot} \\ + --drop_group_sample $drop_group \\ --output sample_annotation.tsv cat <<-END_VERSIONS > versions.yml diff --git a/nextflow.config b/nextflow.config index 5070f54d..c12a33b4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -37,6 +37,8 @@ params { bcftools_caller_mode = 'multiallelic' run_drop_ae = true run_drop_as = true + drop_group_samples_ae = 'outrider' + drop_group_samples_as = 'fraser' drop_padjcutoff_ae = 0.05 drop_padjcutoff_as = 0.1 drop_zscorecutoff = 0 diff --git a/nextflow_schema.json b/nextflow_schema.json index cc0d6dc1..f2b1f804 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -269,6 +269,18 @@ "description": "Should DROP Aberrant Splicing module be run?", "fa_icon": "fas fa-toggle-off" }, + "drop_group_samples_ae": { + "type": "string", + "default": "outrider", + "description": "DROP group to run when AE only one allowed. Make sure it matches your reference annotation file.", + "fa_icon": "fas fa-list-ol" + }, + "drop_group_samples_as": { + "type": "string", + "default": "fraser", + "description": "DROP group to run when AS only one allowed. 
Make sure it matches your reference annotation file.", + "fa_icon": "fas fa-list-ol" + }, "drop_padjcutoff_ae": { "type": "number", "default": 0.05, diff --git a/subworkflows/local/alignment.nf b/subworkflows/local/alignment.nf index 8c41b26d..b0fca701 100644 --- a/subworkflows/local/alignment.nf +++ b/subworkflows/local/alignment.nf @@ -43,21 +43,23 @@ workflow ALIGNMENT { RNA_SUBSAMPLE_REGION( STAR_ALIGN.out.bam, subsample_bed, seed_frac) ch_bam_bai = ch_bam_bai.mix(RNA_SUBSAMPLE_REGION.out.bam_bai) if (!downsample_switch) { - ch_bam_bai_out = ch_bam_bai.mix(RNA_SUBSAMPLE_REGION.out.bam_bai) + ch_bam_bai_out = RNA_SUBSAMPLE_REGION.out.bam_bai + } else { + RNA_DOWNSAMPLE( ch_bam_bai, num_reads) + ch_bam_bai_out = RNA_DOWNSAMPLE.out.bam_bai } } else { ch_bam_bai = ch_bam_bai.mix(STAR_ALIGN.out.bam.join(SAMTOOLS_INDEX.out.bai)) if (!downsample_switch) { ch_bam_bai_out = STAR_ALIGN.out.bam.join(SAMTOOLS_INDEX.out.bai) + } else { + RNA_DOWNSAMPLE( ch_bam_bai, num_reads) + ch_bam_bai_out = RNA_DOWNSAMPLE.out.bam_bai } } SAMTOOLS_VIEW( STAR_ALIGN.out.bam.join(SAMTOOLS_INDEX.out.bai), ch_genome_fasta, [] ) - if (downsample_switch) { - RNA_DOWNSAMPLE( ch_bam_bai, num_reads) - ch_bam_bai_out = ch_bam_bai.mix(RNA_DOWNSAMPLE.out.bam_bai) - } SALMON_QUANT( FASTP.out.reads, salmon_index, gtf, [], false, 'A') diff --git a/subworkflows/local/analyse_transcripts.nf b/subworkflows/local/analyse_transcripts.nf index 2e997c8f..9a7f50f2 100644 --- a/subworkflows/local/analyse_transcripts.nf +++ b/subworkflows/local/analyse_transcripts.nf @@ -13,6 +13,7 @@ workflow ANALYSE_TRANSCRIPTS { take: ch_bam_bai // channel (mandatory): [ val(meta), [ path(bam) ],[ path(bai) ] ] + ch_bam_ds_bai // channel (mandatory): [ val(meta), [ path(bam) ],[ path(bai) ] ] ch_gtf // channel (mandatory): [ path(gtf) ] ch_fasta_fai_meta // channel (mandatory): [ val(meta), [ path(fasta), path(fai) ] gene_counts // channel [val(meta), path(tsv)] @@ -20,6 +21,8 @@ workflow ANALYSE_TRANSCRIPTS { ch_ref_drop_annot_file // channel [ path(tsv) ] ch_ref_drop_splice_folder // channel [ path(folder) ] genome // channel [val(genome)] + drop_group_samples_ae // channel [val(drop_group_samples_ae)] + drop_group_samples_as // channel [val(drop_group_samples_as)] drop_padjcutoff_ae // channel [val(drop_padjcutoff_ae)] drop_padjcutoff_as // channel [val(drop_padjcutoff_as)] drop_zscorecutoff // channel [val(drop_zscorecutoff)] @@ -30,47 +33,54 @@ workflow ANALYSE_TRANSCRIPTS { // DROP // Generates count files for samples and merges them with reference count file - star_count = gene_counts.map{ meta, cnt_file -> cnt_file }.collect() + //star_count = gene_counts.map{ meta, cnt_file -> cnt_file }.collect() star_samples = gene_counts.map{ meta, cnt_file -> meta }.collect() - DROP_COUNTS( - star_count, - star_samples, - ch_gtf, - ch_ref_drop_count_file - ) - ch_drop_counts = DROP_COUNTS.out.processed_gene_counts.collect() + //DROP_COUNTS( + // star_count, + // star_samples, + // ch_gtf, + // ch_ref_drop_count_file + //) + //ch_drop_counts = DROP_COUNTS.out.processed_gene_counts.collect() // Generates sample annotation - ch_bam_files = ch_bam_bai.collect{it[1]} + ch_bam_files = ch_bam_ds_bai.collect{it[1]} DROP_SAMPLE_ANNOT( ch_bam_files, star_samples, - ch_drop_counts, + ch_ref_drop_count_file, ch_ref_drop_annot_file, - ch_gtf + drop_group_samples_ae, + drop_group_samples_as + // ch_gtf ) // Generates config file and runs Aberrant expression module + ch_bai_files = ch_bam_ds_bai.collect{it[2]}.toList() + 
ch_bam_bai_files=ch_bam_files.toList().combine(ch_bai_files) DROP_CONFIG_RUN_AE( ch_fasta_fai_meta, ch_gtf, DROP_SAMPLE_ANNOT.out.drop_annot, - ch_drop_counts, + ch_bam_bai_files, + ch_ref_drop_count_file, + ch_ref_drop_splice_folder, genome, + drop_group_samples_ae, drop_padjcutoff_ae, drop_zscorecutoff ) // Generates config file and runs Aberrant splicing module - ch_bai_files = ch_bam_bai.collect{it[2]}.toList() - ch_bam_bai_files=ch_bam_files.toList().combine(ch_bai_files) DROP_CONFIG_RUN_AS( ch_fasta_fai_meta, ch_gtf, DROP_SAMPLE_ANNOT.out.drop_annot, ch_bam_bai_files, + ch_ref_drop_count_file, ch_ref_drop_splice_folder, genome, + drop_group_samples_as, drop_padjcutoff_as ) @@ -103,7 +113,7 @@ workflow ANALYSE_TRANSCRIPTS { ch_gtf.map{ gtf -> [ [id:gtf.simpleName], gtf ] } ) - ch_versions = ch_versions.mix(DROP_COUNTS.out.versions) + //ch_versions = ch_versions.mix(DROP_COUNTS.out.versions) ch_versions = ch_versions.mix(DROP_SAMPLE_ANNOT.out.versions) ch_versions = ch_versions.mix(DROP_CONFIG_RUN_AE.out.versions) ch_versions = ch_versions.mix(DROP_CONFIG_RUN_AS.out.versions) @@ -116,7 +126,7 @@ workflow ANALYSE_TRANSCRIPTS { coverage_gtf = STRINGTIE_STRINGTIE.out.coverage_gtf // channel: [ val(meta), [ path(coverage_gtf) ] ] annotated_gtf = GFFCOMPARE.out.annotated_gtf // channel: [ val(meta), [ path(annotated_gtf) ] ] stats_gtf = GFFCOMPARE.out.stats // channel: [ val(meta), [ path(stats) ] ] - processed_gene_counts = DROP_COUNTS.out.processed_gene_counts // channel: [ path(tsv) ] + //processed_gene_counts = DROP_COUNTS.out.processed_gene_counts // channel: [ path(tsv) ] annotation_drop = DROP_SAMPLE_ANNOT.out.drop_annot // channel: [ path(sample_annotation.tsv) ] config_drop_ae = DROP_CONFIG_RUN_AE.out.config_drop // channel: [ path(confg_file.yml) ] drop_ae_out = DROP_CONFIG_RUN_AE.out.drop_ae_out // channel: [ path(drop_output_AE) ] diff --git a/workflows/tomte.nf b/workflows/tomte.nf index 7f28c658..49163e51 100644 --- a/workflows/tomte.nf +++ b/workflows/tomte.nf @@ -111,7 +111,7 @@ workflow TOMTE { exit 1, 'Input samplesheet not specified!' } - fasta = Channel.fromPath(params.fasta).map { it -> [[id:it[0].simpleName], it] }.collect() + //fasta = Channel.fromPath(params.fasta).map { it -> [[id:it[0].simpleName], it] }.collect() ch_vep_cache_unprocessed = params.vep_cache ? Channel.fromPath(params.vep_cache).map { it -> [[id:'vep_cache'], it] }.collect() : Channel.value([[],[]]) ch_vep_filters = params.vep_filters ? 
Channel.fromPath(params.vep_filters).collect() @@ -128,7 +128,7 @@ workflow TOMTE { : Channel.empty() PREPARE_REFERENCES( - fasta, + params.fasta, fai, params.star_index, params.gtf, @@ -185,6 +185,7 @@ workflow TOMTE { // Analyse transcripts ANALYSE_TRANSCRIPTS( ch_alignment.bam_bai, + ch_alignment.bam_ds_bai, ch_references.gtf, ch_references.fasta_fai_meta, ch_alignment.gene_counts, @@ -192,6 +193,8 @@ workflow TOMTE { ch_ref_drop_annot_file, ch_ref_drop_splice_folder, params.genome, + params.drop_group_samples_ae, + params.drop_group_samples_as, params.drop_padjcutoff_ae, params.drop_padjcutoff_as, params.drop_zscorecutoff, From 277413cb8e7a48ce4ade1243f5a3bab8f19b24d0 Mon Sep 17 00:00:00 2001 From: "lucia.pena.perez@scilifelab.se" Date: Fri, 6 Oct 2023 10:38:44 +0200 Subject: [PATCH 02/10] fix run black --- bin/drop_sample_annot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/drop_sample_annot.py b/bin/drop_sample_annot.py index 72e5ac4e..c4a4f9c0 100755 --- a/bin/drop_sample_annot.py +++ b/bin/drop_sample_annot.py @@ -26,7 +26,7 @@ def write_sample_annotation_to_tsv( - bam: str, samples: str, strandedness: str, single_end: str, drop_group_sample:str, out_file: str + bam: str, samples: str, strandedness: str, single_end: str, drop_group_sample: str, out_file: str ): """Write the Sample Annotation tsv file.""" with open(out_file, "w") as tsv_file: @@ -59,7 +59,7 @@ def write_final_annot_to_tsv(ref_count_file: str, ref_annot: str, out_file: str) df_samples: DataFrame = read_csv("drop_annotation_given_samples.tsv", sep="\t") df_reference: DataFrame = read_csv(ref_annot, sep="\t") df_reference["GENE_COUNTS_FILE"] = ref_count_file - df_reference["SPLICE_COUNTS_DIR"] = df_reference["SPLICE_COUNTS_DIR"].str.rstrip('/') + df_reference["SPLICE_COUNTS_DIR"] = df_reference["SPLICE_COUNTS_DIR"].str.rstrip("/") df_reference["SPLICE_COUNTS_DIR"] = df_reference["SPLICE_COUNTS_DIR"].apply(os.path.basename) df_reference["DROP_GROUP"] = df_reference["DROP_GROUP"].str.replace(" ", "") df_samples["COUNT_OVERLAPS"] = df_reference["COUNT_OVERLAPS"].iloc[0] From 28be7da75e2d8a3a8726f53e1b6d9cff52383828 Mon Sep 17 00:00:00 2001 From: "lucia.pena.perez@scilifelab.se" Date: Mon, 16 Oct 2023 15:19:56 +0200 Subject: [PATCH 03/10] fix vep --- conf/modules.config | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 37b5e7af..958d7e29 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -181,6 +181,7 @@ process { withName: '.*PREPARE_REFERENCES:GFFREAD' { ext.when = { (!params.transcript_fasta) } + ext.args = { '-w' } publishDir = [ path: { "${params.outdir}/references" }, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, @@ -475,11 +476,11 @@ process { withName: '.*ANNOTATE_SNV:ENSEMBLVEP' { ext.prefix = { "${vcf.simpleName}_rohann_vcfanno_filter_vep" } ext.args = [ - '--dir_plugins vep_cache/Plugins', - '--plugin LoFtool,vep_cache/LoFtool_scores.txt', - '--plugin pLI,vep_cache/pLI_values_107.txt', - '--plugin SpliceAI,snv=vep_cache/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz,indel=vep_cache/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz', - '--plugin MaxEntScan,vep_cache/fordownload,SWA,NCSS', + '--dir_plugins cache/Plugins', + '--plugin LoFtool,cache/Plugins/LoFtool_scores.txt', + '--plugin pLI,cache/Plugins/pLI_values_107.txt', + '--plugin SpliceAI,snv=cache/Plugins/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz,indel=cache/Plugins/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz', + '--plugin MaxEntScan,cache/Plugins/fordownload,SWA,NCSS', '--distance 5000', '--buffer_size 20000', '--format vcf --max_sv_size 248956422', From 1f97ca5e9987ad37e658a1e4eae45dea5615726e Mon Sep 17 00:00:00 2001 From: "lucia.pena.perez@scilifelab.se" Date: Tue, 17 Oct 2023 10:01:30 +0200 Subject: [PATCH 04/10] fix gffread --- modules/local/gffread.nf | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/modules/local/gffread.nf b/modules/local/gffread.nf index 66546a04..98863efa 100644 --- a/modules/local/gffread.nf +++ b/modules/local/gffread.nf @@ -12,8 +12,8 @@ process GFFREAD { tuple val(meta2), path(fasta), path(fai) output: - path "*${prefix}.gtf" , emit: gtf, optional: true - path "*${prefix}.fa" , emit: tr_fasta, optional: true + path "*_gffread_output.gtf", emit: gtf, optional: true + path "*_gffread_output.fa" , emit: tr_fasta, optional: true path "versions.yml" , emit: versions when: @@ -22,14 +22,13 @@ process GFFREAD { script: def args = task.ext.args ?: '' def genome = fasta ? "-g ${fasta}" : '' - def prefix = task.ext.prefix ?: "${meta.id}_gffread" - def outp = args.contains("-w ") ? "" : "-o ${prefix}.gtf" + def prefix = task.ext.prefix ?: "${meta.id}" + def outp = args.contains("-w") ? 
"-w ${prefix}_gffread_output.fa" : "-o ${prefix}_gffread_output.gtf" """ gffread \\ $gff \\ $genome \\ - $args \\ $outp cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -40,8 +39,8 @@ process GFFREAD { stub: def prefix = task.ext.prefix ?: "${meta.id}_gffread" """ - touch ${prefix}.gtf - touch ${prefix}.fa + touch ${prefix}_gffread_output.gtf + touch ${prefix}_gffread_output.fa cat <<-END_VERSIONS > versions.yml "${task.process}": From 24dfb0fb7a1c03f4907d0f06bec866abb5ac1d84 Mon Sep 17 00:00:00 2001 From: "lucia.pena.perez@scilifelab.se" Date: Tue, 24 Oct 2023 14:21:44 +0200 Subject: [PATCH 05/10] fix prepare references --- bin/drop_counts.py | 136 ---------------------- modules/local/drop_counts.nf | 52 --------- subworkflows/local/analyse_transcripts.nf | 12 +- subworkflows/local/prepare_references.nf | 25 ++-- 4 files changed, 14 insertions(+), 211 deletions(-) delete mode 100755 bin/drop_counts.py delete mode 100644 modules/local/drop_counts.nf diff --git a/bin/drop_counts.py b/bin/drop_counts.py deleted file mode 100755 index c2593b03..00000000 --- a/bin/drop_counts.py +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import re -from collections import OrderedDict -from pathlib import Path -from pandas import read_csv, DataFrame -from typing import Set, Dict - -SCRIPT_VERSION = "v1.0" -TRANSLATOR = { - "unstranded": 1, - "forward": 2, - "reverse": 3, -} - -STANDARD_CHROMOSOMES = [ - "chr1", - "chr2", - "chr3", - "chr4", - "chr5", - "chr6", - "chr7", - "chr8", - "chr9", - "chr10", - "chr11", - "chr12", - "chr13", - "chr14", - "chr15", - "chr16", - "chr17", - "chr18", - "chr19", - "chr20", - "chr21", - "chr22", - "chrX", - "chrY", - "chrM", -] - - -def get_non_std_genes(gtf: Path) -> Set[str]: - """Create list of genes not belonging to chr1-21 or chrM""" - gene_id_regex = re.compile('gene_id "(.+?)"') - genes_to_exclude = set() - with open(gtf, "r") as gtf_file: - for line in gtf_file: - if line.startswith("#"): - continue - if line.split()[0] in STANDARD_CHROMOSOMES: - continue - gene_id = re.search(gene_id_regex, line) - genes_to_exclude.add(gene_id.group(1)) - return genes_to_exclude - - -def read_star_gene_counts(sample: str, star: Path, strandedness: str) -> Dict: - """Read gene count file(s) from STAR output to return sample_ids.""" - sample_ids = {} - gene_ids = {} - with open(star) as in_tab: - for line in in_tab: - if not line.startswith("N_"): - split_line = line.split() - gene_id = split_line[0] - strand = TRANSLATOR[strandedness] - counts = int(split_line[strand]) - gene_ids[gene_id] = counts - gene_ids = OrderedDict(sorted(gene_ids.items())) - sample_ids[sample] = gene_ids - return sample_ids - - -def get_counts_from_dict(gene_ids_dict: dict) -> DataFrame: - """Transform gene ids dict into count_table""" - one_sample = next(iter(gene_ids_dict)) - gene_list = list(gene_ids_dict[one_sample].keys()) - genes = {gene: [gene_ids_dict[sample][gene] for sample in gene_ids_dict] for gene in gene_list} - count_table: DataFrame = DataFrame.from_dict(genes, orient="index", columns=gene_ids_dict.keys()) - count_table.index.name = "geneID" - return count_table - - -def write_tsv_from_dict(gene_ids_dict: dict, outfile: str, ref_count_file: str, genes_to_exclude: Set[str]) -> None: - """Transform dictionary into tsv friendly.""" - count_table = get_counts_from_dict(gene_ids_dict) - if ref_count_file: - if ref_count_file.endswith(".gz"): - ref_table = read_csv(ref_count_file, compression="gzip", sep="\t", header=0, index_col=0) - else: - ref_table = 
read_csv(ref_count_file, sep="\t", header=0, index_col=0) - count_table = count_table.combine_first(ref_table) - count_table.drop(genes_to_exclude, inplace=True) - count_table.to_csv(outfile, compression="gzip", sep="\t", header=True) - - -def parse_args(argv=None): - """Define and immediately parse command line arguments.""" - parser = argparse.ArgumentParser( - formatter_class=argparse.MetavarTypeHelpFormatter, - description="""Generate collated gene counts from each STAR output.""", - ) - parser.add_argument("--star", type=str, nargs="+", help="*ReadsPerGene.out.tab from STAR", required=True) - parser.add_argument("--samples", type=str, nargs="+", help="corresponding sample name", required=True) - parser.add_argument("--strandedness", type=str, nargs="+", help="strandedness of RNA", required=True) - parser.add_argument("--output", type=str, help="output tsv file name", required=True) - parser.add_argument("--gtf", type=str, help="Transcript annotation file in gtf format", required=True) - parser.add_argument("--ref_count_file", type=str, help="Optional reference count set", required=True) - parser.add_argument("--version", action="version", version=SCRIPT_VERSION) - return parser.parse_args(argv) - - -def main(): - """Coordinate argument parsing and program execution.""" - args = parse_args() - master_dict = {} - for index, sample_id in enumerate(args.samples): - master_dict.update( - read_star_gene_counts(sample=sample_id, star=args.star[index], strandedness=args.strandedness[index]) - ) - - genes_to_exclude: Set[str] = get_non_std_genes(gtf=args.gtf) - write_tsv_from_dict( - gene_ids_dict=master_dict, - outfile=args.output, - ref_count_file=args.ref_count_file, - genes_to_exclude=genes_to_exclude, - ) - - -if __name__ == "__main__": - main() diff --git a/modules/local/drop_counts.nf b/modules/local/drop_counts.nf deleted file mode 100644 index b28a33fc..00000000 --- a/modules/local/drop_counts.nf +++ /dev/null @@ -1,52 +0,0 @@ -process DROP_COUNTS { - tag "DROP_counts" - label 'process_low' - - // Exit if running this module with -profile conda / -profile mamba - if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - exit 1, "Local DROP module does not support Conda. Please use Docker / Singularity / Podman instead." 
- } - - container "docker.io/clinicalgenomics/drop:1.3.3" - - input: - path(counts) - val(samples) - path(gtf) - path(reference_count_file) - - output: - path('processed_geneCounts.tsv.gz'), emit: processed_gene_counts - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def ids = "${samples.id}".replace("[","").replace("]","").replace(",","") - def strandedness = "${samples.strandedness}".replace("[","").replace("]","").replace(",","") - """ - $baseDir/bin/drop_counts.py \\ - --star ${counts} \\ - --samples $ids \\ - --strandedness $strandedness \\ - --ref_count_file ${reference_count_file} \\ - --output processed_geneCounts.tsv.gz \\ - --gtf ${gtf} \\ - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - drop_counts: \$(\$baseDir/bin/drop_counts.py --version ) - END_VERSIONS - """ - - stub: - """ - touch processed_geneCounts.tsv.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - drop_counts: \$(\$baseDir/bin/drop_counts.py --version ) - END_VERSIONS - """ -} diff --git a/subworkflows/local/analyse_transcripts.nf b/subworkflows/local/analyse_transcripts.nf index 9a7f50f2..6e9e1b56 100644 --- a/subworkflows/local/analyse_transcripts.nf +++ b/subworkflows/local/analyse_transcripts.nf @@ -33,17 +33,9 @@ workflow ANALYSE_TRANSCRIPTS { // DROP // Generates count files for samples and merges them with reference count file - //star_count = gene_counts.map{ meta, cnt_file -> cnt_file }.collect() - star_samples = gene_counts.map{ meta, cnt_file -> meta }.collect() - //DROP_COUNTS( - // star_count, - // star_samples, - // ch_gtf, - // ch_ref_drop_count_file - //) - //ch_drop_counts = DROP_COUNTS.out.processed_gene_counts.collect() // Generates sample annotation + star_samples = gene_counts.map{ meta, cnt_file -> meta }.collect() ch_bam_files = ch_bam_ds_bai.collect{it[1]} DROP_SAMPLE_ANNOT( ch_bam_files, @@ -52,7 +44,6 @@ workflow ANALYSE_TRANSCRIPTS { ch_ref_drop_annot_file, drop_group_samples_ae, drop_group_samples_as - // ch_gtf ) // Generates config file and runs Aberrant expression module @@ -126,7 +117,6 @@ workflow ANALYSE_TRANSCRIPTS { coverage_gtf = STRINGTIE_STRINGTIE.out.coverage_gtf // channel: [ val(meta), [ path(coverage_gtf) ] ] annotated_gtf = GFFCOMPARE.out.annotated_gtf // channel: [ val(meta), [ path(annotated_gtf) ] ] stats_gtf = GFFCOMPARE.out.stats // channel: [ val(meta), [ path(stats) ] ] - //processed_gene_counts = DROP_COUNTS.out.processed_gene_counts // channel: [ path(tsv) ] annotation_drop = DROP_SAMPLE_ANNOT.out.drop_annot // channel: [ path(sample_annotation.tsv) ] config_drop_ae = DROP_CONFIG_RUN_AE.out.config_drop // channel: [ path(confg_file.yml) ] drop_ae_out = DROP_CONFIG_RUN_AE.out.drop_ae_out // channel: [ path(drop_output_AE) ] diff --git a/subworkflows/local/prepare_references.nf b/subworkflows/local/prepare_references.nf index 7706c9bb..f158d240 100644 --- a/subworkflows/local/prepare_references.nf +++ b/subworkflows/local/prepare_references.nf @@ -31,8 +31,10 @@ workflow PREPARE_REFERENCES { main: ch_versions = Channel.empty() - GUNZIP_FASTA(fasta) - ch_fasta = GUNZIP_FASTA.out.gunzip ? GUNZIP_FASTA.out.gunzip.collect() : fasta + fasta_meta=Channel.fromPath(fasta).map { it -> [[id:it[0]], it]}.collect() + GUNZIP_FASTA(fasta_meta) + ch_fasta = fasta.endsWith(".gz") ? 
GUNZIP_FASTA.out.gunzip.collect() : fasta_meta + // If no genome indices, create it SAMTOOLS_FAIDX_GENOME(ch_fasta,[[],[]]) @@ -41,21 +43,21 @@ workflow PREPARE_REFERENCES { BUILD_DICT(ch_fasta) ch_dict = BUILD_DICT.out.dict.collect() - gtf_meta=channel.of(gtf).map{it -> [[id:it[0]], it]}.collect() + gtf_meta=Channel.fromPath(gtf).map{it -> [[id:it[0]], it]}.collect() GUNZIP_GTF(gtf_meta) - ch_gtf_no_meta = GUNZIP_GTF.out.gunzip ? GUNZIP_GTF.out.gunzip.map{ meta, gtf -> [gtf] }.collect() : Channel.fromPath(gtf) + ch_gtf_no_meta = gtf.endsWith(".gz") ? GUNZIP_GTF.out.gunzip.map{ meta, gtf -> [gtf] }.collect() : Channel.fromPath(gtf) // Get chrom sizes GET_CHROM_SIZES( ch_fai ) - ch_fasta_no_meta = ch_fasta.map{ meta, fasta -> [ fasta ] } + ch_fasta_no_meta = ch_fasta.map{ meta, fasta -> [ fasta ] } ch_star = star_index ? Channel.fromPath(star_index).collect() : Channel.empty() BUILD_STAR_GENOME (ch_fasta_no_meta, ch_gtf_no_meta) UNTAR_STAR_INDEX( ch_star.map { it -> [[:], it] } ) ch_star_index = (!star_index) ? BUILD_STAR_GENOME.out.index.collect() : - (star_index.endsWith(".gz") ? UNTAR_STAR_INDEX.out.untar.map { it[1] }.collect() : star_index) - + (star_index.endsWith(".gz") ? UNTAR_STAR_INDEX.out.untar.map { it[1] } : star_index) + // Convert gtf to refflat for picard GTF_TO_REFFLAT(ch_gtf_no_meta) @@ -68,11 +70,10 @@ workflow PREPARE_REFERENCES { // Preparing transcript fasta ch_fasta_fai = ch_fasta.join(ch_fai).collect() - ch_tr_fasta = transcript_fasta ? Channel.fromPath(transcript_fasta).map {it -> [[id:it[0].simpleName], it]}.collect() : Channel.empty() GFFREAD(ch_gtf_no_meta.map{it -> [[id:it[0].simpleName], it]},ch_fasta_fai) - GUNZIP_TRFASTA(ch_tr_fasta) - transcript_fasta_no_meta = (!transcript_fasta) ? GFFREAD.out.tr_fasta : - (transcript_fasta.endsWith(".gz") ? GUNZIP_TRFASTA.out.gunzip.map{ meta, fasta -> [ fasta ] } : ch_tr_fasta.map{ meta, fasta -> [ fasta ] }) + tr=GFFREAD.out.tr_fasta.collect() + transcript_fasta_no_meta = (!transcript_fasta) ? GFFREAD.out.tr_fasta.collect() : + (transcript_fasta.endsWith(".gz") ? GUNZIP_TRFASTA.out.gunzip.collect().map{ meta, fasta -> [ fasta ] } : transcript_fasta) // Setting up Salmon index ch_salmon = salmon_index ? Channel.fromPath(salmon_index).collect() : Channel.empty() @@ -80,7 +81,7 @@ workflow PREPARE_REFERENCES { SALMON_INDEX(ch_fasta_no_meta, transcript_fasta_no_meta) ch_salmon_index = (!salmon_index) ? SALMON_INDEX.out.index.collect() : - (salmon_index.endsWith(".gz") ? UNTAR_SALMON_INDEX.out.untar.map { it[1] }.collect() : salmon_index) + (salmon_index.endsWith(".gz") ? 
UNTAR_SALMON_INDEX.out.untar.map { it[1] } : salmon_index) ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions) ch_versions = ch_versions.mix(SAMTOOLS_FAIDX_GENOME.out.versions) From 69a9fb4ab629780548e5d5776c9285fd02b0e569 Mon Sep 17 00:00:00 2001 From: "lucia.pena.perez@scilifelab.se" Date: Tue, 24 Oct 2023 14:25:25 +0200 Subject: [PATCH 06/10] fix analyse transcripts --- subworkflows/local/analyse_transcripts.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/subworkflows/local/analyse_transcripts.nf b/subworkflows/local/analyse_transcripts.nf index 6e9e1b56..32b44779 100644 --- a/subworkflows/local/analyse_transcripts.nf +++ b/subworkflows/local/analyse_transcripts.nf @@ -4,7 +4,6 @@ include { STRINGTIE_STRINGTIE } from '../../modules/nf-core/stringtie/stringtie/main' include { GFFCOMPARE } from '../../modules/nf-core/gffcompare/main' - include { DROP_COUNTS } from '../../modules/local/drop_counts' include { DROP_SAMPLE_ANNOT } from '../../modules/local/drop_sample_annot' include { DROP_CONFIG_RUN_AE } from '../../modules/local/drop_config_runAE' include { DROP_CONFIG_RUN_AS } from '../../modules/local/drop_config_runAS' From d319990f2b0e0902451bd229dbe2af7009693029 Mon Sep 17 00:00:00 2001 From: "lucia.pena.perez@scilifelab.se" Date: Tue, 24 Oct 2023 14:32:00 +0200 Subject: [PATCH 07/10] fix modules.config --- conf/modules.config | 9 --------- 1 file changed, 9 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 958d7e29..9f0d7566 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -317,15 +317,6 @@ process { // process { - withName: '.*ANALYSE_TRANSCRIPTS:DROP_COUNTS' { - ext.when = {params.run_drop_ae} - publishDir = [ - path: { "${params.outdir}/analyse_transcripts/drop/AE" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, - ] - } - withName: '.*ANALYSE_TRANSCRIPTS:DROP_ANNOT' { ext.when = {params.run_drop_ae|params.run_drop_as} publishDir = [ From a1b1c8dcaf44498e2298fd5ac42a259b7459263d Mon Sep 17 00:00:00 2001 From: "lucia.pena.perez@scilifelab.se" Date: Tue, 24 Oct 2023 16:01:06 +0200 Subject: [PATCH 08/10] fix drop --- test_data/drop_data/sampleAnnotation.tsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_data/drop_data/sampleAnnotation.tsv b/test_data/drop_data/sampleAnnotation.tsv index 01ce87d3..9f2cfe0b 100644 --- a/test_data/drop_data/sampleAnnotation.tsv +++ b/test_data/drop_data/sampleAnnotation.tsv @@ -4,7 +4,7 @@ R23611 NA NA NA outrider, fraser TRUE IntersectionStrict TRUE reverse drop_data R97979 NA NA NA outrider, fraser TRUE IntersectionStrict TRUE reverse drop_data NA drop_data/geneCounts.tsv grch37_chr21 NA R73084 NA NA NA outrider, fraser TRUE IntersectionStrict TRUE reverse drop_data NA drop_data/geneCounts.tsv grch37_chr21 NA R34104 NA NA NA outrider, fraser TRUE IntersectionStrict TRUE reverse drop_data NA drop_data/geneCounts.tsv grch37_chr21 NA -R18431 NA NA NA outrider, fraser TRUE IntersectionStrict TRUE reverse drop_data NA drop_data/geneCounts.tsv grch37_chr21 NA +R18431 NA NA NA fraser TRUE IntersectionStrict TRUE reverse drop_data NA drop_data/geneCounts.tsv grch37_chr21 NA R83610 NA NA NA outrider, fraser TRUE IntersectionStrict TRUE reverse drop_data NA drop_data/geneCounts.tsv grch37_chr21 NA R0014 NA NA NA outrider, fraser TRUE IntersectionStrict TRUE reverse drop_data NA drop_data/geneCounts.tsv grch37_chr21 NA R17262 NA NA NA outrider, fraser TRUE IntersectionStrict TRUE reverse drop_data NA drop_data/geneCounts.tsv grch37_chr21 NA From c4b0983a8b1e5f9ba907e7511f571b2ed72b8e88 Mon Sep 17 00:00:00 2001 From: "lucia.pena.perez@scilifelab.se" Date: Tue, 24 Oct 2023 17:46:04 +0200 Subject: [PATCH 09/10] fix drop --- conf/test.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index efe42266..c7a9bcba 100644 --- a/conf/test.config +++ b/conf/test.config @@ -32,7 +32,7 @@ params { star_two_pass_mode = 'None' subsample_bed = "${projectDir}/test_data/subsample.bed" seed_frac = 0.001 - num_reads = 10000 + num_reads = 20000 // VEP vep_cache = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vep_cache_and_plugins.tar.gz" From 71a93cd30291391b49a702f6ddbe810f69083410 Mon Sep 17 00:00:00 2001 From: "lucia.pena.perez@scilifelab.se" Date: Wed, 25 Oct 2023 15:20:13 +0200 Subject: [PATCH 10/10] fix reviewer suggestions --- bin/drop_sample_annot.py | 4 +--- conf/modules.config | 4 +--- subworkflows/local/analyse_transcripts.nf | 9 ++++----- subworkflows/local/prepare_references.nf | 12 +++++------- workflows/tomte.nf | 1 - 5 files changed, 11 insertions(+), 19 deletions(-) diff --git a/bin/drop_sample_annot.py b/bin/drop_sample_annot.py index c4a4f9c0..31f0602f 100755 --- a/bin/drop_sample_annot.py +++ b/bin/drop_sample_annot.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import argparse -from pathlib import Path import csv from pandas import read_csv, DataFrame, concat import os @@ -59,8 +58,7 @@ def write_final_annot_to_tsv(ref_count_file: str, ref_annot: str, out_file: str) df_samples: DataFrame = read_csv("drop_annotation_given_samples.tsv", sep="\t") df_reference: DataFrame = read_csv(ref_annot, sep="\t") df_reference["GENE_COUNTS_FILE"] = ref_count_file - df_reference["SPLICE_COUNTS_DIR"] = df_reference["SPLICE_COUNTS_DIR"].str.rstrip("/") - 
df_reference["SPLICE_COUNTS_DIR"] = df_reference["SPLICE_COUNTS_DIR"].apply(os.path.basename) + df_reference["SPLICE_COUNTS_DIR"] = df_reference["SPLICE_COUNTS_DIR"].str.rstrip("/").apply(os.path.basename) df_reference["DROP_GROUP"] = df_reference["DROP_GROUP"].str.replace(" ", "") df_samples["COUNT_OVERLAPS"] = df_reference["COUNT_OVERLAPS"].iloc[0] df_samples["COUNT_MODE"] = df_reference["COUNT_MODE"].iloc[0] diff --git a/conf/modules.config b/conf/modules.config index 9f0d7566..ad221d39 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -465,13 +465,11 @@ process { process { withName: '.*ANNOTATE_SNV:ENSEMBLVEP' { - ext.prefix = { "${vcf.simpleName}_rohann_vcfanno_filter_vep" } + ext.prefix = { "${vcf.simpleName}_vep" } ext.args = [ '--dir_plugins cache/Plugins', '--plugin LoFtool,cache/Plugins/LoFtool_scores.txt', '--plugin pLI,cache/Plugins/pLI_values_107.txt', - '--plugin SpliceAI,snv=cache/Plugins/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz,indel=cache/Plugins/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz', - '--plugin MaxEntScan,cache/Plugins/fordownload,SWA,NCSS', '--distance 5000', '--buffer_size 20000', '--format vcf --max_sv_size 248956422', diff --git a/subworkflows/local/analyse_transcripts.nf b/subworkflows/local/analyse_transcripts.nf index 32b44779..10c9927a 100644 --- a/subworkflows/local/analyse_transcripts.nf +++ b/subworkflows/local/analyse_transcripts.nf @@ -34,7 +34,7 @@ workflow ANALYSE_TRANSCRIPTS { // Generates count files for samples and merges them with reference count file // Generates sample annotation - star_samples = gene_counts.map{ meta, cnt_file -> meta }.collect() + star_samples = gene_counts.map{ meta, cnt_file -> meta }.collect() ch_bam_files = ch_bam_ds_bai.collect{it[1]} DROP_SAMPLE_ANNOT( ch_bam_files, @@ -46,8 +46,8 @@ workflow ANALYSE_TRANSCRIPTS { ) // Generates config file and runs Aberrant expression module - ch_bai_files = ch_bam_ds_bai.collect{it[2]}.toList() - ch_bam_bai_files=ch_bam_files.toList().combine(ch_bai_files) + ch_bai_files = ch_bam_ds_bai.collect{ it[2] }.toList() + ch_bam_bai_files = ch_bam_files.toList().combine(ch_bai_files) DROP_CONFIG_RUN_AE( ch_fasta_fai_meta, ch_gtf, @@ -90,7 +90,7 @@ workflow ANALYSE_TRANSCRIPTS { ) // Stringtie - ch_bam = ch_bam_bai.map{ meta, bam, bai -> [meta, [bam]]} + ch_bam = ch_bam_bai.map{ meta, bam, bai -> [meta, [bam]] } STRINGTIE_STRINGTIE( ch_bam, ch_gtf @@ -103,7 +103,6 @@ workflow ANALYSE_TRANSCRIPTS { ch_gtf.map{ gtf -> [ [id:gtf.simpleName], gtf ] } ) - //ch_versions = ch_versions.mix(DROP_COUNTS.out.versions) ch_versions = ch_versions.mix(DROP_SAMPLE_ANNOT.out.versions) ch_versions = ch_versions.mix(DROP_CONFIG_RUN_AE.out.versions) ch_versions = ch_versions.mix(DROP_CONFIG_RUN_AS.out.versions) diff --git a/subworkflows/local/prepare_references.nf b/subworkflows/local/prepare_references.nf index f158d240..05097380 100644 --- a/subworkflows/local/prepare_references.nf +++ b/subworkflows/local/prepare_references.nf @@ -31,7 +31,7 @@ workflow PREPARE_REFERENCES { main: ch_versions = Channel.empty() - fasta_meta=Channel.fromPath(fasta).map { it -> [[id:it[0]], it]}.collect() + fasta_meta = Channel.fromPath(fasta).map { it -> [ [id:it[0] ], it] }.collect() GUNZIP_FASTA(fasta_meta) ch_fasta = fasta.endsWith(".gz") ? 
GUNZIP_FASTA.out.gunzip.collect() : fasta_meta @@ -43,7 +43,7 @@ workflow PREPARE_REFERENCES { BUILD_DICT(ch_fasta) ch_dict = BUILD_DICT.out.dict.collect() - gtf_meta=Channel.fromPath(gtf).map{it -> [[id:it[0]], it]}.collect() + gtf_meta = Channel.fromPath(gtf).map{ it -> [ [id:it[0]], it ] }.collect() GUNZIP_GTF(gtf_meta) ch_gtf_no_meta = gtf.endsWith(".gz") ? GUNZIP_GTF.out.gunzip.map{ meta, gtf -> [gtf] }.collect() : Channel.fromPath(gtf) @@ -57,21 +57,19 @@ workflow PREPARE_REFERENCES { UNTAR_STAR_INDEX( ch_star.map { it -> [[:], it] } ) ch_star_index = (!star_index) ? BUILD_STAR_GENOME.out.index.collect() : (star_index.endsWith(".gz") ? UNTAR_STAR_INDEX.out.untar.map { it[1] } : star_index) - // Convert gtf to refflat for picard GTF_TO_REFFLAT(ch_gtf_no_meta) // Get rRNA transcripts and convert to interval_list format GET_RRNA_TRANSCRIPTS(ch_gtf_no_meta) - BEDTOINTERVALLIST( GET_RRNA_TRANSCRIPTS.out.bed.map {it -> [ [id:it.name], it ]}, ch_dict ) + BEDTOINTERVALLIST( GET_RRNA_TRANSCRIPTS.out.bed.map { it -> [ [id:it.name], it ] }, ch_dict ) UNTAR_VEP_CACHE (ch_vep_cache) // Preparing transcript fasta ch_fasta_fai = ch_fasta.join(ch_fai).collect() - GFFREAD(ch_gtf_no_meta.map{it -> [[id:it[0].simpleName], it]},ch_fasta_fai) - tr=GFFREAD.out.tr_fasta.collect() + GFFREAD(ch_gtf_no_meta.map{ it -> [ [id:it[0].simpleName], it ] },ch_fasta_fai) transcript_fasta_no_meta = (!transcript_fasta) ? GFFREAD.out.tr_fasta.collect() : (transcript_fasta.endsWith(".gz") ? GUNZIP_TRFASTA.out.gunzip.collect().map{ meta, fasta -> [ fasta ] } : transcript_fasta) @@ -109,6 +107,6 @@ workflow PREPARE_REFERENCES { refflat = GTF_TO_REFFLAT.out.refflat.collect() rrna_bed = GET_RRNA_TRANSCRIPTS.out.bed.collect() interval_list = BEDTOINTERVALLIST.out.interval_list.map{ meta, interv -> [interv] }.collect() - vep_resources = UNTAR_VEP_CACHE.out.untar.map{meta, files -> [files]}.collect() // channel: [ path(cache) ] + vep_resources = UNTAR_VEP_CACHE.out.untar.map{ meta, files -> [files] }.collect() // channel: [ path(cache) ] versions = ch_versions } diff --git a/workflows/tomte.nf b/workflows/tomte.nf index 28ca5197..b0efea15 100644 --- a/workflows/tomte.nf +++ b/workflows/tomte.nf @@ -111,7 +111,6 @@ workflow TOMTE { exit 1, 'Input samplesheet not specified!' } - //fasta = Channel.fromPath(params.fasta).map { it -> [[id:it[0].simpleName], it] }.collect() ch_vep_cache_unprocessed = params.vep_cache ? Channel.fromPath(params.vep_cache).map { it -> [[id:'vep_cache'], it] }.collect() : Channel.value([[],[]]) ch_vep_filters = params.vep_filters ? Channel.fromPath(params.vep_filters).collect()
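---

A note on the reference-annotation cleanup in this series: patches 01 and 10 both touch the normalisation in bin/drop_sample_annot.py, and the intent is easier to see outside a diff. Below is a minimal, self-contained Python sketch of those two transformations; the input row and path are invented for illustration (RNA_ID R23611 is borrowed from the test data), and only the two pandas operations mirror the patched code.

    import os
    from pandas import DataFrame

    # Hypothetical reference-annotation row; the path is made up for illustration.
    df_reference = DataFrame(
        {
            "RNA_ID": ["R23611"],
            "SPLICE_COUNTS_DIR": ["/references/drop_data/splice_counts/"],
            "DROP_GROUP": ["outrider, fraser"],
        }
    )

    # The trailing "/" is stripped first so that basename() returns the last
    # path component ("splice_counts") rather than an empty string.
    df_reference["SPLICE_COUNTS_DIR"] = (
        df_reference["SPLICE_COUNTS_DIR"].str.rstrip("/").apply(os.path.basename)
    )

    # Spaces are stripped from DROP_GROUP, matching the patched code, so that
    # "outrider, fraser" becomes the comma-separated list "outrider,fraser".
    df_reference["DROP_GROUP"] = df_reference["DROP_GROUP"].str.replace(" ", "")

    print(df_reference.iloc[0].to_dict())
    # {'RNA_ID': 'R23611', 'SPLICE_COUNTS_DIR': 'splice_counts', 'DROP_GROUP': 'outrider,fraser'}

Reducing SPLICE_COUNTS_DIR to its basename presumably reflects that Nextflow stages the reference splice-count folder into the task work directory, where only the folder name (not the original absolute path) is valid.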