Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat add drop AE #34

Merged
merged 9 commits into from
Aug 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 173 additions & 0 deletions bin/drop_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
#!/usr/bin/env python3

import argparse
from pathlib import Path
import yaml


def read_config():
return yaml.safe_load(
"""
projectTitle: "DROP: Detection of RNA Outliers Pipeline"
root: # root directory of all intermediate output
htmlOutputPath: # path for HTML rendered reports
indexWithFolderName: true # whether the root base name should be part of the index name

hpoFile: null # if null, downloads it from webserver
sampleAnnotation: # path to sample annotation (see documenation on how to create it)

geneAnnotation:
gtf: null
# multiple annotations with custom names are possible
# <annotation_name> : <path to gencode v29 annotation>
# v37: /path/to/gencode29.gtf.gz # example

genomeAssembly: hg19 # either hg19/hs37d5 or hg38/GRCh38
exportCounts:
# specify which gene annotations to include and which
# groups to exclude when exporting counts
geneAnnotations:
- gtf
excludeGroups:
- null
genome: # path to genome sequence in fasta format.
# You can define multiple reference genomes in yaml format, ncbi: path_to_ncbi, ucsc: path_to_ucsc
# the keywords that define the path should be in the GENOME column of the SA table

exportCounts:
# specify which gene annotations to include and which
# groups to exclude when exporting counts
geneAnnotations:
- gtf
excludeGroups:
- null

aberrantExpression:
run: true
groups:
- outrider
fpkmCutoff: 1
implementation: autoencoder
padjCutoff: 0.05
zScoreCutoff: 0
genesToTest: null
maxTestedDimensionProportion: 3
yieldSize: 2000000

aberrantSplicing:
run: false
groups:
- outrider
recount: false
longRead: false
keepNonStandardChrs: true
filter: true
minExpressionInOneSample: 20
quantileMinExpression: 10
minDeltaPsi: 0.05
implementation: PCA
padjCutoff: 0.1
maxTestedDimensionProportion: 6
genesToTest: null
FRASER_version: "FRASER2"
deltaPsiCutoff : 0.1
quantileForFiltering: 0.75

mae:
run: false
groups:
- outrider
gatkIgnoreHeaderCheck: true
padjCutoff: .05
allelicRatioCutoff: 0.8
addAF: true
maxAF: .001
maxVarFreqCohort: .05
# VCF-BAM matching
qcVcf: null
qcGroups: mae
dnaRnaMatchCutoff: 0.85

rnaVariantCalling:
run: false
groups:
- batch_0
highQualityVCFs:
- Data/Mills_and_1000G_gold_standard.indels.hg19.sites.chrPrefix.vcf.gz
- Data/1000G_phase1.snps.high_confidence.hg19.sites.chrPrefix.vcf.gz
dbSNP: Data/00-All.vcf.gz
repeat_mask: Data/hg19_repeatMasker_sorted.chrPrefix.bed
createSingleVCF: true
addAF: true
maxAF: 0.001
maxVarFreqCohort: 0.05
hcArgs: ""
minAlt: 3
yieldSize: 100000

tools:
gatkCmd: gatk
bcftoolsCmd: bcftools
samtoolsCmd: samtools
"""
)


def update_config(yaml_object, genome, gtf):
gtf_name = Path(gtf).name
gtf_without_ext = Path(gtf).stem
genome_name = Path(genome).name

yaml_object["genome"] = genome_name
yaml_object["root"] = "output"
yaml_object["htmlOutputPath"] = "output/html"
yaml_object["sampleAnnotation"] = "sample_annotation.tsv"
yaml_object["geneAnnotation"][gtf_without_ext] = gtf_name
yaml_object["geneAnnotation"].pop("gtf", None)

## Export counts
yaml_object["exportCounts"]["geneAnnotations"] = [gtf_without_ext]

## Expression module
yaml_object["aberrantExpression"]["groups"] = ["outrider"]

## Splicing module

## MAE module
return yaml_object


def write_yaml(out_path, yaml_object):
with open(out_path, "w") as outfile:
yaml.dump(yaml_object, outfile, sort_keys=False)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
formatter_class=argparse.MetavarTypeHelpFormatter,
description="""Generate config file for DROP.""",
)

parser.add_argument(
"--genome_fasta",
type=str,
help="Specify genome fasta base name",
)

parser.add_argument(
"--gtf",
type=str,
help="Specify gtf file name",
)

parser.add_argument(
"--output",
type=str,
help="Specify output file",
)

args = parser.parse_args()

yaml_object = read_config()
master_config = update_config(yaml_object=yaml_object, genome=args.genome_fasta, gtf=args.gtf)
write_yaml(out_path=args.output, yaml_object=master_config)
File renamed without changes.
18 changes: 2 additions & 16 deletions bin/generate_drop_sample_annot.py → bin/drop_sample_annot.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,8 @@ class SampleAnnotation:
"GENOME",
]

def __init__(self, cnts_file, out_file, gtf_file, ref_cnts_file=False):
def __init__(self, cnts_file, out_file, gtf_file):
"""Create SampleAnnotation given the parameters"""
self.ref_cnts_file = ref_cnts_file
self.cnts_file = cnts_file
self.out_file = out_file
self.gtf_file = gtf_file
Expand All @@ -42,13 +41,6 @@ def parse_header(self):

sample_cnt_file = {sample: os.path.basename(self.cnts_file) for sample in samples}

if self.ref_cnts_file:
with open(self.ref_cnts_file) as file_object:
ref_samples = file_object.readline().split()[1:]

samples += ref_samples
sample_cnt_file.update({ref_sample: os.path.basename(self.ref_cnts_file) for ref_sample in ref_samples})

return samples, sample_cnt_file

def write_table(self):
Expand Down Expand Up @@ -81,12 +73,6 @@ def write_table(self):
required=True,
)

parser.add_argument(
"--ref_count_file",
type=str,
help="A tsv file of gene counts. Used as background",
)

parser.add_argument(
"--output",
type=str,
Expand All @@ -103,4 +89,4 @@ def write_table(self):

args = parser.parse_args()

SampleAnnotation(args.count_file, args.output, args.gtf, args.ref_count_file).write_table()
SampleAnnotation(args.count_file, args.output, args.gtf).write_table()
19 changes: 18 additions & 1 deletion conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,24 @@ process {
//

process {
withName: '.*ANALYSE_TRANSCRIPTS:GENERATE_COUNTS_DROP' {
withName: '.*ANALYSE_TRANSCRIPTS:DROP_COUNTS' {
publishDir = [
path: { "${params.outdir}/analyse_transcripts" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
]
}

withName: '.*ANALYSE_TRANSCRIPTS:DROP_ANNOTATION' {
ext.when = {!params.drop_annot_file}
publishDir = [
path: { "${params.outdir}/analyse_transcripts" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
]
}

withName: '.*ANALYSE_TRANSCRIPTS:DROP_CONFIG_RUN_AE' {
publishDir = [
path: { "${params.outdir}/analyse_transcripts" },
mode: params.publish_dir_mode,
Expand Down
57 changes: 57 additions & 0 deletions modules/local/drop_config_runAE.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
process DROP_CONFIG_RUN_AE {
tag "DROP_CONFIG_RUN_AE"
label 'process_high'

// Exit if running this module with -profile conda / -profile mamba
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "Local DROP module does not support Conda. Please use Docker / Singularity / Podman instead."
}

container "docker.io/clinicalgenomics/drop:1.3.3"

input:
tuple val(meta), path(fasta), path(fai)
path gtf
path sample_annotation
path gene_counts

output:
path('config.yaml'), emit: config_drop
path('output') , emit: drop_ae_out
path "versions.yml", emit: versions

when:
task.ext.when == null || task.ext.when

script:
"""
TMPDIR=\$PWD

drop init

$baseDir/bin/drop_config.py \\
--genome_fasta $fasta \\
--gtf $gtf \\
--output config.yaml

snakemake aberrantExpression --cores ${task.cpus} --rerun-triggers mtime

cat <<-END_VERSIONS > versions.yml
"${task.process}":
drop_config: v1.0
drop: v\$(echo \$(drop --version) | sed -n 's/drop, version //p')
END_VERSIONS
"""

stub:
"""
touch config.yaml
mkdir output

cat <<-END_VERSIONS > versions.yml
"${task.process}":
drop_config: v1.0
drop: v\$(echo \$(drop --version) | sed -n 's/drop, version //p')
END_VERSIONS
"""
}
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
process GENERATE_COUNTS_DROP {
process DROP_COUNTS {
tag "DROP_counts"
label 'process_low'

conda "bioconda::pydamage=0.70"
conda "bioconda::drop=1.3.3"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/pydamage:0.70--pyhdfd78af_0' :
'biocontainers/pydamage:0.70--pyhdfd78af_0' }"
'https://depot.galaxyproject.org/singularity/drop:1.3.3--pyhdfd78af_0' :
'biocontainers/drop:1.3.3--pyhdfd78af_0' }"

input:
path(counts)
Expand All @@ -25,7 +25,7 @@ process GENERATE_COUNTS_DROP {
def strandedness = samples ? "--strandedness ${samples.strandedness}" : ""
def input_samples = samples ? "${samples.id}" : ""
"""
$baseDir/bin/generate_gene_counts.py \\
$baseDir/bin/drop_counts.py \\
--star ${counts} \\
--sample $input_samples \\
$strandedness \\
Expand All @@ -35,7 +35,7 @@ process GENERATE_COUNTS_DROP {

cat <<-END_VERSIONS > versions.yml
"${task.process}":
generate_gene_counts: v1.0
drop_counts: v1.0
END_VERSIONS
"""

Expand All @@ -45,7 +45,7 @@ process GENERATE_COUNTS_DROP {

cat <<-END_VERSIONS > versions.yml
"${task.process}":
generate_gene_counts: v1.0
drop_counts: v1.0
END_VERSIONS
"""
}
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
process GENERATE_ANNOTATION_DROP {
process DROP_ANNOTATION {
tag "DROP_annot"
label 'process_low'

conda "bioconda::pydamage=0.70"
conda "bioconda::drop=1.3.3"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/pydamage:0.70--pyhdfd78af_0' :
'biocontainers/pydamage:0.70--pyhdfd78af_0' }"
'https://depot.galaxyproject.org/singularity/drop:1.3.3--pyhdfd78af_0' :
'biocontainers/drop:1.3.3--pyhdfd78af_0' }"

input:
path(processed_gene_counts)
path(gtf)
path(reference_count_file)

output:
path('sample_annotation.tsv'), emit: sample_annotation_drop
Expand All @@ -20,19 +19,17 @@ process GENERATE_ANNOTATION_DROP {
task.ext.when == null || task.ext.when

script:
def ref_counts = reference_count_file ? "--ref_count_file $reference_count_file" : ""
def gtf_name = gtf ? gtf.getBaseName() : ""

"""
generate_drop_sample_annot.py \\
drop_sample_annot.py \\
--count_file $processed_gene_counts \\
--gtf $gtf_name \\
$ref_counts \\
--output sample_annotation.tsv

cat <<-END_VERSIONS > versions.yml
"${task.process}":
generate_drop_sample_annot: v1.0
drop_sample_annot: v1.0
END_VERSIONS
"""

Expand All @@ -42,7 +39,7 @@ process GENERATE_ANNOTATION_DROP {

cat <<-END_VERSIONS > versions.yml
"${task.process}":
generate_drop_sample_annot: v1.0
drop_sample_annot: v1.0
END_VERSIONS
"""
}
Loading