genomic-medicine-sweden · jemten · Oct 26, 2023 · Oct 6, 2023 · Oct 6, 2023 · Oct 10, 2023
@@ -85,6 +85,7 @@ def update_config(
     genome: Path,
     gtf: Path,
     genome_assembly: str,
+    drop_group_samples: str,
     padjcutoff: float,
     zscorecutoff: float,
     drop_module: str,
@@ -109,10 +110,12 @@ def update_config(
     # Export counts
     if drop_module == "AE":
         config_copy["aberrantExpression"]["run"] = ["true"]
+        config_copy["aberrantExpression"]["groups"] = [drop_group_samples]
         config_copy["aberrantExpression"]["padjCutoff"] = padjcutoff
         config_copy["aberrantExpression"]["zScoreCutoff"] = zscorecutoff
     elif drop_module == "AS":
         config_copy["aberrantSplicing"]["run"] = ["true"]
+        config_copy["aberrantSplicing"]["groups"] = [drop_group_samples]
         config_copy["aberrantSplicing"]["padjCutoff"] = padjcutoff
     elif drop_module == "MAE":
         config_copy["mae"]["run"] = ["true"]
@@ -156,6 +159,12 @@ def parse_args(argv=None):
         choices=["hg19", "hs37d5", "hg38", "GRCh38"],
         required=True,
     )
+    parser.add_argument(
+        "--drop_group_samples",
+        type=str,
+        help="Specify drop group to analyse",
+        required=True,
+    )
     parser.add_argument(
         "--padjcutoff",
         type=float,
@@ -190,6 +199,7 @@ def main():
         genome=args.genome_fasta,
         gtf=args.gtf,
         genome_assembly=args.genome_assembly,
+        drop_group_samples=args.drop_group_samples,
         padjcutoff=args.padjcutoff,
         zscorecutoff=args.zscorecutoff,
         drop_module=args.drop_module,

@@ -4,6 +4,7 @@
 from pathlib import Path
 import csv
 from pandas import read_csv, DataFrame, concat
+import os
 
 SCRIPT_VERSION = "v1.0"
 SAMPLE_ANNOTATION_COLUMNS = [
@@ -25,7 +26,7 @@
 
 
 def write_sample_annotation_to_tsv(
-    bam: str, samples: str, strandedness: str, single_end: str, gtf: str, count_file: str, out_file: str
+    bam: str, samples: str, strandedness: str, single_end: str, drop_group_sample: str, out_file: str
 ):
     """Write the Sample Annotation tsv file."""
     with open(out_file, "w") as tsv_file:
@@ -34,9 +35,9 @@ def write_sample_annotation_to_tsv(
         for index, id in enumerate(samples):
             sa_dict: dict = {}.fromkeys(SAMPLE_ANNOTATION_COLUMNS, "NA")
             sa_dict["RNA_ID"] = id
-            sa_dict["DROP_GROUP"] = "outrider,fraser"
-            sa_dict["GENE_COUNTS_FILE"] = count_file
-            sa_dict["GENE_ANNOTATION"] = Path(gtf).stem
+            sa_dict["DROP_GROUP"] = drop_group_sample
+            sa_dict["GENE_COUNTS_FILE"] = "NA"
+            sa_dict["GENE_ANNOTATION"] = "NA"
             sa_dict["STRAND"] = strandedness[index]
             sa_dict["PAIRED_END"] = is_paired_end(single_end[index])
             sa_dict["RNA_BAM_FILE"] = bam[index]
@@ -50,14 +51,17 @@ def is_paired_end(single_end: str) -> bool:
     return False
 
 
-def write_final_annot_to_tsv(count_file: str, ref_annot: str, out_file: str):
+def write_final_annot_to_tsv(ref_count_file: str, ref_annot: str, out_file: str):
     """
     Concatenates the Sample Annotation produced by SampleAnnotation with the one
     provided for the reference samples, checking for duplicate sample IDs
     """
     df_samples: DataFrame = read_csv("drop_annotation_given_samples.tsv", sep="\t")
     df_reference: DataFrame = read_csv(ref_annot, sep="\t")
-    df_reference["GENE_COUNTS_FILE"] = count_file
+    df_reference["GENE_COUNTS_FILE"] = ref_count_file
+    df_reference["SPLICE_COUNTS_DIR"] = df_reference["SPLICE_COUNTS_DIR"].str.rstrip("/")
+    df_reference["SPLICE_COUNTS_DIR"] = df_reference["SPLICE_COUNTS_DIR"].apply(os.path.basename)
+    df_reference["DROP_GROUP"] = df_reference["DROP_GROUP"].str.replace(" ", "")
     df_samples["COUNT_OVERLAPS"] = df_reference["COUNT_OVERLAPS"].iloc[0]
     df_samples["COUNT_MODE"] = df_reference["COUNT_MODE"].iloc[0]
     df_samples["HPO_TERMS"] = df_reference["HPO_TERMS"].iloc[0]
@@ -74,15 +78,15 @@ def parse_args(argv=None):
         formatter_class=argparse.MetavarTypeHelpFormatter,
         description="""Generate DROP sample annotation for patients.""",
     )
-    parser.add_argument("--bam", type=str, nargs="+", help="bam files for the patient", required=True)
+    parser.add_argument("--bam", type=str, nargs="+", help="bam files for the analyzed samples", required=True)
     parser.add_argument("--samples", type=str, nargs="+", help="corresponding sample name", required=True)
     parser.add_argument("--strandedness", type=str, nargs="+", help="strandedness of RNA", required=True)
     parser.add_argument("--single_end", type=str, nargs="+", help="is the sample paired end?", required=True)
-    parser.add_argument("--gtf", type=str, help="Transcript annotation file in gtf format", required=True)
     parser.add_argument(
-        "--count_file", type=str, help="A tsv file of gene counts for all processed samples.", required=True
+        "--ref_count_file", type=str, help="A tsv file of gene counts for reference samples.", required=True
     )
     parser.add_argument("--ref_annot", type=str, help="Path to reference annotation tsv", required=True)
+    parser.add_argument("--drop_group_sample", type=str, help="Drop group of analyzed samples", required=True)
     parser.add_argument("--output", type=str, help="Path to save to", required=True)
     parser.add_argument("--version", action="version", version=SCRIPT_VERSION)
     return parser.parse_args(argv)
@@ -96,11 +100,10 @@ def main():
         samples=args.samples,
         strandedness=args.strandedness,
         single_end=args.single_end,
-        gtf=args.gtf,
-        count_file=args.count_file,
+        drop_group_sample=args.drop_group_sample,
         out_file="drop_annotation_given_samples.tsv",
     )
-    write_final_annot_to_tsv(count_file=args.count_file, ref_annot=args.ref_annot, out_file=args.output)
+    write_final_annot_to_tsv(ref_count_file=args.ref_count_file, ref_annot=args.ref_annot, out_file=args.output)
 
 
 if __name__ == "__main__":

@@ -181,6 +181,7 @@ process {
 
     withName: '.*PREPARE_REFERENCES:GFFREAD' {
         ext.when = { (!params.transcript_fasta) }
+        ext.args = { '-w' }
         publishDir = [
             path: { "${params.outdir}/references" },
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
@@ -316,15 +317,6 @@ process {
 //
 
 process {
-    withName: '.*ANALYSE_TRANSCRIPTS:DROP_COUNTS' {
-        ext.when = {params.run_drop_ae}
-        publishDir = [
-            path: { "${params.outdir}/analyse_transcripts/drop/AE" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-        ]
-    }
-
     withName: '.*ANALYSE_TRANSCRIPTS:DROP_ANNOT' {
         ext.when = {params.run_drop_ae|params.run_drop_as}
         publishDir = [
@@ -475,11 +467,11 @@ process {
     withName: '.*ANNOTATE_SNV:ENSEMBLVEP' {
         ext.prefix = { "${vcf.simpleName}_rohann_vcfanno_filter_vep" }
         ext.args   = [
-            '--dir_plugins vep_cache/Plugins',
-            '--plugin LoFtool,vep_cache/LoFtool_scores.txt',
-            '--plugin pLI,vep_cache/pLI_values_107.txt',
-            '--plugin SpliceAI,snv=vep_cache/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz,indel=vep_cache/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz',
-            '--plugin MaxEntScan,vep_cache/fordownload,SWA,NCSS',
+            '--dir_plugins cache/Plugins',
+            '--plugin LoFtool,cache/Plugins/LoFtool_scores.txt',
+            '--plugin pLI,cache/Plugins/pLI_values_107.txt',
+            '--plugin SpliceAI,snv=cache/Plugins/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz,indel=cache/Plugins/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz',
+            '--plugin MaxEntScan,cache/Plugins/fordownload,SWA,NCSS',
             '--distance 5000',
             '--buffer_size 20000',
             '--format vcf --max_sv_size 248956422',

@@ -32,7 +32,7 @@ params {
     star_two_pass_mode   = 'None'
     subsample_bed        = "${projectDir}/test_data/subsample.bed"
     seed_frac            = 0.001
-    num_reads            = 10000
+    num_reads            = 20000
 
     // VEP
     vep_cache            = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vep_cache_and_plugins.tar.gz"

@@ -13,8 +13,11 @@ process DROP_CONFIG_RUN_AE {
     tuple val(meta), path(fasta), path(fai)
     path gtf
     path sample_annotation
-    path gene_counts
+    tuple path(bam), path(bai)
+    path ref_drop_count_file
+    path ref_splice_folder
     val(genome)
+    val(drop_group_samples_ae)
     val(drop_padjcutoff_ae)
     val(drop_zScoreCutoff)
 
@@ -30,6 +33,7 @@ process DROP_CONFIG_RUN_AE {
 
     script:
     def genome_assembly = "${genome}".contains("h37") ? "hg19" : "${genome}"
+    def drop_group = "${drop_group_samples_ae}".replace(" ","")
     def zscorecutoff = drop_zScoreCutoff ? "--zscorecutoff ${drop_zScoreCutoff}" : ''
 
     """
@@ -42,6 +46,7 @@ process DROP_CONFIG_RUN_AE {
         --gtf ${gtf} \\
         --drop_module AE \\
         --genome_assembly $genome_assembly \\
+        --drop_group_samples $drop_group \\
         --padjcutoff ${drop_padjcutoff_ae} \\
         $zscorecutoff \\
         --output config.yaml

@@ -14,8 +14,10 @@ process DROP_CONFIG_RUN_AS {
     path gtf
     path sample_annotation
     tuple path(bam), path(bai)
+    path ref_drop_count_file
     path ref_splice_folder
     val(genome)
+    val(drop_group_samples_as)
     val(drop_padjcutoff_as)
 
     output:
@@ -29,6 +31,7 @@ process DROP_CONFIG_RUN_AS {
 
     script:
     def genome_assembly = "${genome}".contains("h37") ? "hg19" : "${genome}"
+    def drop_group = "${drop_group_samples_as}".replace(" ","")
     """
     TMPDIR=\$PWD
 
@@ -39,6 +42,7 @@ process DROP_CONFIG_RUN_AS {
         --gtf ${gtf}\\
         --drop_module AS \\
         --genome_assembly $genome_assembly \\
+        --drop_group_samples $drop_group \\
         --padjcutoff ${drop_padjcutoff_as} \\
         --output config.yaml