genomic-medicine-sweden · Lucpen · Aug 7, 2023 · Jul 28, 2023 · Jul 28, 2023 · Aug 5, 2023
@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+
+import argparse
+from pathlib import Path
+import yaml
+
+
+def read_config():
+    return yaml.safe_load(
+        """
+        projectTitle: "DROP: Detection of RNA Outliers Pipeline"
+        root:             # root directory of all intermediate output
+        htmlOutputPath:   # path for HTML rendered reports
+        indexWithFolderName: true # whether the root base name should be part of the index name
+
+        hpoFile: null  # if null, downloads it from webserver
+        sampleAnnotation: # path to sample annotation (see documenation on how to create it)
+
+        geneAnnotation:
+            gtf: null
+            # multiple annotations with custom names are possible
+            # <annotation_name> : <path to gencode v29 annotation>
+            # v37:  /path/to/gencode29.gtf.gz # example
+
+        genomeAssembly: hg19  # either hg19/hs37d5 or hg38/GRCh38
+        exportCounts:
+            # specify which gene annotations to include and which
+            # groups to exclude when exporting counts
+            geneAnnotations:
+                - gtf
+            excludeGroups:
+                - null
+        genome: # path to genome sequence in fasta format.
+            # You can define multiple reference genomes in yaml format, ncbi: path_to_ncbi, ucsc: path_to_ucsc
+            # the keywords that define the path should be in the GENOME column of the SA table
+
+        exportCounts:
+            # specify which gene annotations to include and which
+            # groups to exclude when exporting counts
+            geneAnnotations:
+                - gtf
+            excludeGroups:
+                - null
+
+        aberrantExpression:
+            run: true
+            groups:
+                - outrider
+            fpkmCutoff: 1
+            implementation: autoencoder
+            padjCutoff: 0.05
+            zScoreCutoff: 0
+            genesToTest: null
+            maxTestedDimensionProportion: 3
+            yieldSize: 2000000
+
+        aberrantSplicing:
+            run: false
+            groups:
+                - outrider
+            recount: false
+            longRead: false
+            keepNonStandardChrs: true
+            filter: true
+            minExpressionInOneSample: 20
+            quantileMinExpression: 10
+            minDeltaPsi: 0.05
+            implementation: PCA
+            padjCutoff: 0.1
+            maxTestedDimensionProportion: 6
+            genesToTest: null
+            FRASER_version: "FRASER2"
+            deltaPsiCutoff : 0.1
+            quantileForFiltering: 0.75
+
+        mae:
+            run: false
+            groups:
+                - outrider
+            gatkIgnoreHeaderCheck: true
+            padjCutoff: .05
+            allelicRatioCutoff: 0.8
+            addAF: true
+            maxAF: .001
+            maxVarFreqCohort: .05
+            # VCF-BAM matching
+            qcVcf: null
+            qcGroups: mae
+            dnaRnaMatchCutoff: 0.85
+
+        rnaVariantCalling:
+            run: false
+            groups:
+                - batch_0
+            highQualityVCFs:
+                - Data/Mills_and_1000G_gold_standard.indels.hg19.sites.chrPrefix.vcf.gz
+                - Data/1000G_phase1.snps.high_confidence.hg19.sites.chrPrefix.vcf.gz
+            dbSNP: Data/00-All.vcf.gz
+            repeat_mask: Data/hg19_repeatMasker_sorted.chrPrefix.bed
+            createSingleVCF: true
+            addAF: true
+            maxAF: 0.001
+            maxVarFreqCohort: 0.05
+            hcArgs: ""
+            minAlt: 3
+            yieldSize: 100000
+
+        tools:
+            gatkCmd: gatk
+            bcftoolsCmd: bcftools
+            samtoolsCmd: samtools
+        """
+    )
+
+
+def update_config(yaml_object, genome, gtf):
+    gtf_name = Path(gtf).name
+    gtf_without_ext = Path(gtf).stem
+    genome_name = Path(genome).name
+
+    yaml_object["genome"] = genome_name
+    yaml_object["root"] = "output"
+    yaml_object["htmlOutputPath"] = "output/html"
+    yaml_object["sampleAnnotation"] = "sample_annotation.tsv"
+    yaml_object["geneAnnotation"][gtf_without_ext] = gtf_name
+    yaml_object["geneAnnotation"].pop("gtf", None)
+
+    ## Export counts
+    yaml_object["exportCounts"]["geneAnnotations"] = [gtf_without_ext]
+
+    ## Expression module
+    yaml_object["aberrantExpression"]["groups"] = ["outrider"]
+
+    ## Splicing module
+
+    ## MAE module
+    return yaml_object
+
+
+def write_yaml(out_path, yaml_object):
+    with open(out_path, "w") as outfile:
+        yaml.dump(yaml_object, outfile, sort_keys=False)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.MetavarTypeHelpFormatter,
+        description="""Generate config file for DROP.""",
+    )
+
+    parser.add_argument(
+        "--genome_fasta",
+        type=str,
+        help="Specify genome fasta base name",
+    )
+
+    parser.add_argument(
+        "--gtf",
+        type=str,
+        help="Specify gtf file name",
+    )
+
+    parser.add_argument(
+        "--output",
+        type=str,
+        help="Specify output file",
+    )
+
+    args = parser.parse_args()
+
+    yaml_object = read_config()
+    master_config = update_config(yaml_object=yaml_object, genome=args.genome_fasta, gtf=args.gtf)
+    write_yaml(out_path=args.output, yaml_object=master_config)
@@ -25,9 +25,8 @@ class SampleAnnotation:
         "GENOME",
     ]
 
-    def __init__(self, cnts_file, out_file, gtf_file, ref_cnts_file=False):
+    def __init__(self, cnts_file, out_file, gtf_file):
         """Create SampleAnnotation given the parameters"""
-        self.ref_cnts_file = ref_cnts_file
         self.cnts_file = cnts_file
         self.out_file = out_file
         self.gtf_file = gtf_file
@@ -42,13 +41,6 @@ def parse_header(self):
 
         sample_cnt_file = {sample: os.path.basename(self.cnts_file) for sample in samples}
 
-        if self.ref_cnts_file:
-            with open(self.ref_cnts_file) as file_object:
-                ref_samples = file_object.readline().split()[1:]
-
-            samples += ref_samples
-            sample_cnt_file.update({ref_sample: os.path.basename(self.ref_cnts_file) for ref_sample in ref_samples})
-
         return samples, sample_cnt_file
 
     def write_table(self):
@@ -81,12 +73,6 @@ def write_table(self):
         required=True,
     )
 
-    parser.add_argument(
-        "--ref_count_file",
-        type=str,
-        help="A tsv file of gene counts. Used as background",
-    )
-
     parser.add_argument(
         "--output",
         type=str,
@@ -103,4 +89,4 @@ def write_table(self):
 
     args = parser.parse_args()
 
-    SampleAnnotation(args.count_file, args.output, args.gtf, args.ref_count_file).write_table()
+    SampleAnnotation(args.count_file, args.output, args.gtf).write_table()
@@ -307,7 +307,24 @@ process {
 //
 
 process {
-    withName: '.*ANALYSE_TRANSCRIPTS:GENERATE_COUNTS_DROP' {
+    withName: '.*ANALYSE_TRANSCRIPTS:DROP_COUNTS' {
+        publishDir = [
+            path: { "${params.outdir}/analyse_transcripts" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        ]
+    }
+
+    withName: '.*ANALYSE_TRANSCRIPTS:DROP_ANNOTATION' {
+        ext.when = {!params.drop_annot_file}
+        publishDir = [
+            path: { "${params.outdir}/analyse_transcripts" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        ]
+    }
+
+    withName: '.*ANALYSE_TRANSCRIPTS:DROP_CONFIG_RUN_AE' {
         publishDir = [
             path: { "${params.outdir}/analyse_transcripts" },
             mode: params.publish_dir_mode,

@@ -0,0 +1,57 @@
+process DROP_CONFIG_RUN_AE {
+    tag "DROP_CONFIG_RUN_AE"
+    label 'process_high'
+
+    // Exit if running this module with -profile conda / -profile mamba
+    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
+        exit 1, "Local DROP module does not support Conda. Please use Docker / Singularity / Podman instead."
+    }
+
+    container "docker.io/clinicalgenomics/drop:1.3.3"
+
+    input:
+    tuple val(meta), path(fasta), path(fai)
+    path gtf
+    path sample_annotation
+    path gene_counts
+
+    output:
+    path('config.yaml'), emit: config_drop
+    path('output')     , emit: drop_ae_out
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    """
+    TMPDIR=\$PWD
+
+    drop init
+
+    $baseDir/bin/drop_config.py \\
+        --genome_fasta $fasta \\
+        --gtf $gtf \\
+        --output config.yaml
+
+    snakemake aberrantExpression --cores ${task.cpus} --rerun-triggers mtime
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        drop_config: v1.0
+        drop: v\$(echo \$(drop --version) |  sed -n 's/drop, version //p')
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    touch config.yaml
+    mkdir output
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        drop_config: v1.0
+        drop: v\$(echo \$(drop --version) |  sed -n 's/drop, version //p')
+    END_VERSIONS
+    """
+}
@@ -1,11 +1,11 @@
-process GENERATE_COUNTS_DROP {
+process DROP_COUNTS {
     tag "DROP_counts"
     label 'process_low'
 
-    conda "bioconda::pydamage=0.70"
+    conda "bioconda::drop=1.3.3"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/pydamage:0.70--pyhdfd78af_0' :
-        'biocontainers/pydamage:0.70--pyhdfd78af_0' }"
+        'https://depot.galaxyproject.org/singularity/drop:1.3.3--pyhdfd78af_0' :
+        'biocontainers/drop:1.3.3--pyhdfd78af_0' }"
 
     input:
     path(counts)
@@ -25,7 +25,7 @@ process GENERATE_COUNTS_DROP {
     def strandedness = samples ? "--strandedness ${samples.strandedness}" : ""
     def input_samples = samples ? "${samples.id}" : ""
     """
-    $baseDir/bin/generate_gene_counts.py \\
+    $baseDir/bin/drop_counts.py \\
         --star ${counts} \\
         --sample $input_samples \\
         $strandedness \\
@@ -35,7 +35,7 @@ process GENERATE_COUNTS_DROP {
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        generate_gene_counts: v1.0
+        drop_counts: v1.0
     END_VERSIONS
     """
 
@@ -45,7 +45,7 @@ process GENERATE_COUNTS_DROP {
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        generate_gene_counts: v1.0
+        drop_counts: v1.0
     END_VERSIONS
     """
 }
@@ -1,16 +1,15 @@
-process GENERATE_ANNOTATION_DROP {
+process DROP_ANNOTATION {
     tag "DROP_annot"
     label 'process_low'
 
-    conda "bioconda::pydamage=0.70"
+    conda "bioconda::drop=1.3.3"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/pydamage:0.70--pyhdfd78af_0' :
-        'biocontainers/pydamage:0.70--pyhdfd78af_0' }"
+        'https://depot.galaxyproject.org/singularity/drop:1.3.3--pyhdfd78af_0' :
+        'biocontainers/drop:1.3.3--pyhdfd78af_0' }"
 
     input:
     path(processed_gene_counts)
     path(gtf)
-    path(reference_count_file)
 
     output:
     path('sample_annotation.tsv'), emit: sample_annotation_drop
@@ -20,19 +19,17 @@ process GENERATE_ANNOTATION_DROP {
     task.ext.when == null || task.ext.when
 
     script:
-    def ref_counts = reference_count_file ? "--ref_count_file $reference_count_file" : ""
     def gtf_name   = gtf ? gtf.getBaseName() : ""
 
     """
-    generate_drop_sample_annot.py \\
+    drop_sample_annot.py \\
         --count_file $processed_gene_counts \\
         --gtf $gtf_name \\
-        $ref_counts \\
         --output sample_annotation.tsv
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        generate_drop_sample_annot: v1.0
+        drop_sample_annot: v1.0
     END_VERSIONS
     """
 
@@ -42,7 +39,7 @@ process GENERATE_ANNOTATION_DROP {
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        generate_drop_sample_annot: v1.0
+        drop_sample_annot: v1.0
     END_VERSIONS
     """
 }