genomic-medicine-sweden · Lucpen · Oct 11, 2024 · Jul 30, 2024 · Jul 30, 2024 · Aug 21, 2024
@@ -1,5 +1,5 @@
 lint:
-  files_exist: false
+  files_exist:
     - assets/nf-core-tomte_logo_light.png
     - docs/images/nf-core-tomte_logo_light.png
     - docs/images/nf-core-tomte_logo_dark.png
@@ -8,7 +8,7 @@ lint:
     - docs/images/tomte_pipeline_metromap.png
     - conf/modules.config
     - .github/ISSUE_TEMPLATE/config.yml
-  files_unchanged: false
+  files_unchanged:
     - assets/sendmail_template.txt
     - .github/CONTRIBUTING.md
     - .github/ISSUE_TEMPLATE/bug_report.yml

@@ -3,23 +3,41 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## 2.2.1 - Scrooge [2024-08-28]
+## X.X.X - [XXXX-XX-XX]
+
+### `Added`
+
+- Functionality to create DROP databases and to add samples to existing ones [#147](https://github.com/genomic-medicine-sweden/tomte/pull/147)
 
 ### `Fixed`
 
-- After an update, MultiQC was not outputing data for RnaSeqMetrics so an earlier version will be used [#156](https://github.com/genomic-medicine-sweden/tomte/pull/156)
+### `Parameters`
 
 ### `Changed`
 
-- Downgraded multiqc version [#156](https://github.com/genomic-medicine-sweden/tomte/pull/156)
 - Updated modules ensemblvep/filtervep, ensemblvep/vep [#159](https://github.com/genomic-medicine-sweden/tomte/pull/159)
 - Updated gencode version from 37 to 46 [#159](https://github.com/genomic-medicine-sweden/tomte/pull/159)
+- Updated modules using drop drop_config_runAE, drop_config_runAS, drop_sample_annot, and drop_filter_results [#147](https://github.com/genomic-medicine-sweden/tomte/pull/147)
 
 | Tool                 | Old version | New version |
 | -------------------- | ----------- | ----------- |
-| multiqc              | 1.24.1      | 1.21        |
 | ensemblvep/filtervep | 110         | 112         |
 | ensemblvep/vep       | 110         | 112         |
+| DROP                 | 1.3.3       | 1.4.0       |
+
+## 2.2.1 - Scrooge [2024-08-28]
+
+### `Fixed`
+
+- After an update, MultiQC was not outputing data for RnaSeqMetrics so an earlier version will be used [#156](https://github.com/genomic-medicine-sweden/tomte/pull/156)
+
+### `Changed`
+
+- Downgraded multiqc version [#156](https://github.com/genomic-medicine-sweden/tomte/pull/156)
+
+| Tool    | Old version | New version |
+| ------- | ----------- | ----------- |
+| multiqc | 1.24.1      | 1.21        |
 
 ## 2.2.0 - TioDeNadal [2024-08-27]
 

@@ -6,7 +6,7 @@
 from typing import Dict, Any
 from copy import deepcopy
 
-SCRIPT_VERSION = "v1.0"
+SCRIPT_VERSION = "v2.1"
 CONFIG_YAML = {
     "projectTitle": "DROP: Detection of RNA Outliers Pipeline",
     "root": None,
@@ -86,6 +86,7 @@ def update_config(
     gtf: Path,
     genome_assembly: str,
     drop_group_samples: str,
+    drop_other_group_samples: str,
     padjcutoff: float,
     zscorecutoff: float,
     drop_module: str,
@@ -99,13 +100,14 @@ def update_config(
     config_copy: Dict[str, Any] = deepcopy(CONFIG_YAML)
 
     config_copy["genome"] = genome_name
-    config_copy["root"] = "output"
+    config_copy["root"] = str(Path.cwd() / "output")
     config_copy["htmlOutputPath"] = "output/html"
     config_copy["sampleAnnotation"] = "sample_annotation.tsv"
     config_copy["geneAnnotation"][gtf_without_ext] = str(gtf)
     config_copy["geneAnnotation"].pop("gtf", None)
     config_copy["exportCounts"]["geneAnnotations"] = [gtf_without_ext]
     config_copy["genomeAssembly"] = genome_assembly
+    config_copy["exportCounts"]["excludeGroups"] = [drop_other_group_samples]
 
     # Export counts
     if drop_module == "AE":
@@ -165,6 +167,12 @@ def parse_args(argv=None):
         help="Specify drop group to analyse",
         required=True,
     )
+    parser.add_argument(
+        "--drop_other_group_samples",
+        type=str,
+        help="Specify the drop group to exclude in exportCounts",
+        required=True,
+    )
     parser.add_argument(
         "--padjcutoff",
         type=float,
@@ -200,6 +208,7 @@ def main():
         gtf=args.gtf,
         genome_assembly=args.genome_assembly,
         drop_group_samples=args.drop_group_samples,
+        drop_other_group_samples=args.drop_other_group_samples,
         padjcutoff=args.padjcutoff,
         zscorecutoff=args.zscorecutoff,
         drop_module=args.drop_module,

@@ -2,10 +2,10 @@
 
 import argparse
 import csv
-from pandas import read_csv, DataFrame, concat
+from pandas import read_csv, DataFrame, concat, isna
 import os
 
-SCRIPT_VERSION = "v1.0"
+SCRIPT_VERSION = "v1.1"
 SAMPLE_ANNOTATION_COLUMNS = [
     "RNA_ID",
     "RNA_BAM_FILE",
@@ -25,19 +25,24 @@
 
 
 def write_sample_annotation_to_tsv(
-    bam: str, samples: str, strandedness: str, single_end: str, drop_group_sample: str, out_file: str
+    bam: str,
+    samples: str,
+    strandedness: str,
+    single_end: str,
+    drop_group_sample: str,
+    out_file: str,
 ):
     """Write the Sample Annotation tsv file."""
     with open(out_file, "w") as tsv_file:
-        writer = csv.DictWriter(tsv_file, fieldnames=SAMPLE_ANNOTATION_COLUMNS, delimiter="\t")
+        writer = csv.DictWriter(
+            tsv_file, fieldnames=SAMPLE_ANNOTATION_COLUMNS, delimiter="\t"
+        )
         writer.writeheader()
         for index, id in enumerate(samples):
             sa_dict: dict = {}.fromkeys(SAMPLE_ANNOTATION_COLUMNS, "NA")
             sa_dict["RNA_ID"] = id
             sa_dict["DROP_GROUP"] = drop_group_sample
-            sa_dict["GENE_COUNTS_FILE"] = "NA"
-            sa_dict["GENE_ANNOTATION"] = "NA"
-            sa_dict["STRAND"] = strandedness[index]
+            sa_dict["STRAND"] = is_stranded(strandedness[index])
             sa_dict["PAIRED_END"] = is_paired_end(single_end[index])
             sa_dict["RNA_BAM_FILE"] = bam[index]
             writer.writerow(sa_dict)
@@ -50,24 +55,74 @@ def is_paired_end(single_end: str) -> bool:
     return False
 
 
+def is_stranded(strandedness: str) -> str:
+    """Logical funciton to determine sample strandness"""
+    if strandedness.lower() == "reverse":
+        return "reverse"
+    elif strandedness.lower() == "forward":
+        return "yes"
+    else:
+        return "no"
+
+
+def count_mode(sample_count_mode: str) -> str:
+    """Logical function to determine if count mode is given or default "IntersectionStrict" should be used"""
+    if isna(sample_count_mode) or sample_count_mode == "" or sample_count_mode == "NA":
+        return "IntersectionStrict"
+    else:
+        return sample_count_mode
+
+
+def count_overlaps(sample_count_overlap: str) -> str:
+    """Logical funciton to determine if count overlap is given or default "TRUE" should be used"""
+    if (
+        isna(sample_count_overlap)
+        or sample_count_overlap == ""
+        or sample_count_overlap == "NA"
+    ):
+        return True
+    else:
+        return sample_count_overlap
+
+
 def write_final_annot_to_tsv(ref_count_file: str, ref_annot: str, out_file: str):
     """
     Concatenates the Sample Annotation produced by SampleAnnotation with the one
-    provided for the reference samples, checking for duplicate sample IDs
+    provided for the reference samples, if one is provided, checking for duplicate sample IDs
     """
     df_samples: DataFrame = read_csv("drop_annotation_given_samples.tsv", sep="\t")
-    df_reference: DataFrame = read_csv(ref_annot, sep="\t")
-    df_reference["GENE_COUNTS_FILE"] = ref_count_file
-    df_reference["SPLICE_COUNTS_DIR"] = df_reference["SPLICE_COUNTS_DIR"].str.rstrip("/").apply(os.path.basename)
-    df_reference["DROP_GROUP"] = df_reference["DROP_GROUP"].str.replace(" ", "")
-    df_samples["COUNT_OVERLAPS"] = df_reference["COUNT_OVERLAPS"].iloc[0]
-    df_samples["COUNT_MODE"] = df_reference["COUNT_MODE"].iloc[0]
-    df_samples["HPO_TERMS"] = df_reference["HPO_TERMS"].iloc[0]
-    for id in df_samples["RNA_ID"]:
-        df_reference = df_reference[df_reference["RNA_ID"].str.contains(id) == False]
-    df: DataFrame = concat([df_samples, df_reference]).reset_index(drop=True)
-    df.fillna("NA", inplace=True)
-    df.to_csv(out_file, index=False, sep="\t")
+    if ref_annot == "None" or ref_count_file == "None":
+        print(
+            "No reference samples were provided by the user see usage of --ref_count_file and --ref_annot if you want to provide reference samples"
+        )
+        if df_samples.shape[0] < 50:
+            print(
+                "At least 30 samples are required for Aberrant Splicing and 50 for Aberrant expression"
+            )
+            print(f"Only {df_samples.shape[0]} samples were provided by the user")
+        df_samples.fillna("NA", inplace=True)
+        df_samples["COUNT_MODE"] = "IntersectionStrict"
+        df_samples["COUNT_OVERLAPS"] = True
+        df_samples.to_csv(out_file, index=False, sep="\t")
+    else:
+        df_reference: DataFrame = read_csv(ref_annot, sep="\t")
+        df_reference["GENE_COUNTS_FILE"] = ref_count_file
+        df_reference["SPLICE_COUNTS_DIR"] = (
+            df_reference["SPLICE_COUNTS_DIR"].str.rstrip("/").apply(os.path.basename)
+        )
+        df_reference["DROP_GROUP"] = df_reference["DROP_GROUP"].str.replace(" ", "")
+        df_samples["COUNT_OVERLAPS"] = count_overlaps(
+            df_reference["COUNT_OVERLAPS"].iloc[0]
+        )
+        df_samples["COUNT_MODE"] = count_mode(df_reference["COUNT_MODE"].iloc[0])
+        df_samples["HPO_TERMS"] = df_reference["HPO_TERMS"].iloc[0]
+        for id in df_samples["RNA_ID"]:
+            df_reference = df_reference[
+                df_reference["RNA_ID"].str.contains(id) == False
+            ]
+        df: DataFrame = concat([df_samples, df_reference]).reset_index(drop=True)
+        df.fillna("NA", inplace=True)
+        df.to_csv(out_file, index=False, sep="\t")
 
 
 def parse_args(argv=None):
@@ -76,15 +131,51 @@ def parse_args(argv=None):
         formatter_class=argparse.MetavarTypeHelpFormatter,
         description="""Generate DROP sample annotation for patients.""",
     )
-    parser.add_argument("--bam", type=str, nargs="+", help="bam files for the analyzed samples", required=True)
-    parser.add_argument("--samples", type=str, nargs="+", help="corresponding sample name", required=True)
-    parser.add_argument("--strandedness", type=str, nargs="+", help="strandedness of RNA", required=True)
-    parser.add_argument("--single_end", type=str, nargs="+", help="is the sample paired end?", required=True)
     parser.add_argument(
-        "--ref_count_file", type=str, help="A tsv file of gene counts for reference samples.", required=True
+        "--bam",
+        type=str,
+        nargs="+",
+        help="bam files for the analyzed samples",
+        required=True,
+    )
+    parser.add_argument(
+        "--samples",
+        type=str,
+        nargs="+",
+        help="corresponding sample name",
+        required=True,
+    )
+    parser.add_argument(
+        "--strandedness", type=str, nargs="+", help="strandedness of RNA", required=True
+    )
+    parser.add_argument(
+        "--single_end",
+        type=str,
+        nargs="+",
+        help="is the sample paired end?",
+        required=True,
+    )
+    parser.add_argument(
+        "--ref_count_file",
+        type=str,
+        default="None",
+        help="A tsv file of gene counts for reference samples.",
+        required=False,
+    )
+    parser.add_argument(
+        "--ref_annot",
+        type=str,
+        default="None",
+        help="Path to reference annotation tsv",
+        required=False,
+    )
+    parser.add_argument(
+        "--drop_group_sample",
+        type=str,
+        default="None",
+        help="Drop group of analyzed samples",
+        required=False,
     )
-    parser.add_argument("--ref_annot", type=str, help="Path to reference annotation tsv", required=True)
-    parser.add_argument("--drop_group_sample", type=str, help="Drop group of analyzed samples", required=True)
     parser.add_argument("--output", type=str, help="Path to save to", required=True)
     parser.add_argument("--version", action="version", version=SCRIPT_VERSION)
     return parser.parse_args(argv)
@@ -101,7 +192,12 @@ def main():
         drop_group_sample=args.drop_group_sample,
         out_file="drop_annotation_given_samples.tsv",
     )
-    write_final_annot_to_tsv(ref_count_file=args.ref_count_file, ref_annot=args.ref_annot, out_file=args.output)
+
+    write_final_annot_to_tsv(
+        ref_count_file=args.ref_count_file,
+        ref_annot=args.ref_annot,
+        out_file=args.output,
+    )
 
 
 if __name__ == "__main__":

@@ -62,4 +62,17 @@ process {
         errorStrategy = 'retry'
         maxRetries    = 2
     }
+    if (!params.skip_export_counts_drop) {
+        withLabel:process_drop {
+            cpus   = { check_max( 36    * task.attempt, 'cpus'    ) }
+            memory = { check_max( 144.GB              , 'memory'  ) }
+            time   = { check_max( 48.h  * task.attempt, 'time'    ) }
+            }
+    } else {
+        withLabel:process_drop {
+            cpus   = { check_max( 12    * task.attempt, 'cpus'    ) }
+            memory = { check_max( 72.GB * task.attempt, 'memory'  ) }
+            time   = { check_max( 16.h  * task.attempt, 'time'    ) }
+            }
+    }
 }