Skip to content

Commit

Permalink
add rules for variants and qc, minor changes
Browse files Browse the repository at this point in the history
  • Loading branch information
dohmjan committed May 7, 2021
1 parent 20b5437 commit 1ac66a0
Showing 1 changed file with 103 additions and 56 deletions.
159 changes: 103 additions & 56 deletions snakefile.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,24 +29,28 @@
# 1. obtain things from config file
# 2. make dummy samples and put them to tests/sample_data/reads
# 3. align overall appearance
READS_DIR = 'tests/sample_data/reads'
REFERENCE_FASTA = 'tests/sample_data/NC_045512.2.fasta'
KRAKEN_DB = 'tests/databases/kraken_db'
KRONA_DB = 'tests/databases/krona_db'
SAMPLE_SHEET_CSV = 'tests/sample_sheet.csv'

OUTPUT_DIR = 'output'
OUTPUT_DIR = os.path.join(os.getcwd(), OUTPUT_DIR)
READS_DIR = os.path.join(os.getcwd(), 'tests/sample_data/reads')
REFERENCE_FASTA = os.path.join(os.getcwd(), 'tests/sample_data/NC_045512.2.fasta')
AMPLICONS_BED = os.path.join(os.getcwd(), 'tests/sample_data/nCoV-2019_NCref.bed')
KRAKEN_DB = os.path.join(os.getcwd(), 'tests/databases/kraken_db')
KRONA_DB = os.path.join(os.getcwd(), 'tests/databases/krona_db')
SIGMUT_DB = os.path.join(os.getcwd(), 'tests/databases/sigmut_db')
VEP_DB = os.path.join(os.getcwd(), 'tests/databases/vep_db')
SAMPLE_SHEET_CSV = os.path.join(os.getcwd(), 'tests/sample_sheet.csv')

OUTPUT_DIR = os.path.join(os.getcwd(), 'output')

TRIMMED_READS_DIR = os.path.join(OUTPUT_DIR, 'trimmed_reads')
LOG_DIR = os.path.join(OUTPUT_DIR, 'logs')
FASTQC_DIR = os.path.join(OUTPUT_DIR, 'fastqc')
MULTIQC_DIR = os.path.join(OUTPUT_DIR, 'multiqc')
MAPPED_READS_DIR = os.path.join(OUTPUT_DIR, 'mapped_reads')
VARIANTS_DIR = os.path.join(OUTPUT_DIR, 'variants')
KRAKEN_DIR = os.path.join(OUTPUT_DIR, 'kraken')
COVERAGE_DIR = os.path.join(OUTPUT_DIR, 'coverage')
REPORT_DIR = os.path.join(OUTPUT_DIR, 'report')

SCRIPTS_DIR = 'scripts'
SCRIPTS_DIR = os.path.join(os.getcwd(), 'scripts')


# Load sample sheet
Expand All @@ -63,7 +67,7 @@
'final_reports': {
'description': "Produce a comprehensive report. This is the default target.",
'files': (
expand(os.path.join(VARIANTS_DIR, '{sample}_snv.csv'), sample=SAMPLES) +
expand(os.path.join(REPORT_DIR, '{sample}_qc_report.html'), sample=SAMPLES) +
expand(os.path.join(REPORT_DIR, '{sample}_variant_report.html'), sample=SAMPLES) +
expand(os.path.join(REPORT_DIR, '{sample}_kraken_report.html'), sample=SAMPLES)
)
Expand All @@ -74,22 +78,27 @@
expand(os.path.join(VARIANTS_DIR, '{sample}_snv.csv'), sample=SAMPLES)
)
},
'variant_report': {
'description': "Make variants report.",
'variant_reports': {
'description': "Make variants reports.",
'files': (
expand(os.path.join(REPORT_DIR, '{sample}_variant_report.html'), sample=SAMPLES)
)
},
'qc_reports': {
'description': "Make QC reports.",
'files': (
expand(os.path.join(REPORT_DIR, '{sample}_qc_report.html'), sample=SAMPLES)
)
},
'kraken_reports': {
'description': "Make Kraken reports.",
'files': (
expand(os.path.join(REPORT_DIR, '{sample}_kraken_report.html'), sample=SAMPLES) +
expand(os.path.join(REPORT_DIR, '{sample}_krona_report.html'), sample=SAMPLES)
)
}

}
selected_targets = ['kraken_reports']
selected_targets = ['variant_reports']
OUTPUT_FILES = list(chain.from_iterable([targets[name]['files'] for name in selected_targets]))


Expand Down Expand Up @@ -130,9 +139,10 @@
r1 = os.path.join(TRIMMED_READS_DIR, "{sample}_1.fastq"),
r2 = os.path.join(TRIMMED_READS_DIR, "{sample}_2.fastq")
params:
len_cutoff = int(150 * 0.8) # read length * pct_cutoff
len_cutoff = int(150 * 0.8), # read length * pct_cutoff
output = os.path.join(TRIMMED_READS_DIR, "{sample}")
log: os.path.join(LOG_DIR, 'prinseq_{sample}.log')
shell: "prinseq-lite.pl -fastq {input.r1} -fastq2 {input.r2} -ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10 -out_good {TRIMMED_READS_DIR}/{wildcards.sample} -out_bad null -min_len {params.len_cutoff} >> {log} 2>&1"
shell: "prinseq-lite.pl -fastq {input.r1} -fastq2 {input.r2} -ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10 -out_good {params.output} -out_bad null -min_len {params.len_cutoff} >> {log} 2>&1"


rule bwa_index:
Expand All @@ -154,38 +164,20 @@
shell: "bwa mem -t {params.threads} {input.ref} {input.fastq} > {output} 2>> {log} 3>&2"


# TODO: also get unaligned .bam --> so far tmp.bam is used
# same rule to get unaligned .bam
# probably something like below, but needs to be checked for our purpose (kraken reports)
# "samtools view -bh -F 2 {input} > {output} 2>> {log} 3>&2"
rule samtools_filter_aligned:
input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.sam')
output: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.bam')
log: os.path.join(LOG_DIR, 'samtools_filter_aligned_{sample}.log')
shell: "samtools view -bh -f 2 -F 2048 {input} > {output} 2>> {log} 3>&2"


# TODO: check command to get unaligned ones
rule samtools_filter_unaligned:
input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.sam')
output: os.path.join(MAPPED_READS_DIR, '{sample}_unaligned.bam')
log: os.path.join(LOG_DIR, 'samtools_filter_unaligned_{sample}.log')
shell: "samtools view -bh -F 2 {input} > {output} 2>> {log} 3>&2"

# probably not needed --> directly converted during filtering step (-b)
# rule samtools_sam2bam_A:
# input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.sam')
# output: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.bam')
# log: os.path.join(LOG_DIR, 'samtools_sam2bam_{sample}.log')
# shell: "samtools view -bS {input} > {output} >> {log} 2>&1"


# # TODO: merge rule A and B and possibly? use unaligned instead of tmp
# rule samtools_sam2bam_B:
# input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.sam')
# output: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.bam')
# log: os.path.join(LOG_DIR, 'samtools_sam2bam_B_{sample}.log')
# shell: "samtools view -bS {input} > {output} >> {log} 2>&1"


rule samtools_sort:
input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.bam')
Expand All @@ -205,15 +197,14 @@
input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bam')
output: os.path.join(FASTQC_DIR, '{sample}_aligned_sorted_fastqc.zip')
log: os.path.join(LOG_DIR, 'fastqc_{sample}.log')
params:
out_dir = os.path.join(FASTQC_DIR)
shell: "fastqc -o {params.out_dir} -f bam {input} >> {log} 2>&1"
shell: "fastqc -o {FASTQC_DIR} -f bam {input} >> {log} 2>&1"


# TODO: check if --call-indels is needed?
rule lofreq:
input:
aligned_bam = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bam'),
aligned_bai = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bai'),
ref = REFERENCE_FASTA
output: os.path.join(VARIANTS_DIR, '{sample}_snv.vcf')
log: os.path.join(LOG_DIR, 'lofreq_{sample}.log')
Expand All @@ -223,26 +214,40 @@
rule vcf2csv:
input: os.path.join(VARIANTS_DIR, '{sample}_snv.vcf')
output: os.path.join(VARIANTS_DIR, '{sample}_snv.csv')
params:
script = os.path.join(SCRIPTS_DIR, 'vcfTocsv.py')
log: os.path.join(LOG_DIR, 'vcf2csv_{sample}.log')
shell: "python {SCRIPTS_DIR}/vcfTocsv.py {input} >> {log} 2>&1"
shell: "python {params.script} {input} >> {log} 2>&1"


rule vep:
input: os.path.join(VARIANTS_DIR, '{sample}_snv.vcf')
output: os.path.join(VARIANTS_DIR, '{sample}_vep_sarscov2.txt')
params:
species = "sars_cov_2"
log: os.path.join(LOG_DIR, 'vep_{sample}.log')
shell: "vep --verbose --offline --dir_cache {VEP_DB} --DB_VERSION 101 --appris --biotype --buffer_size 5000 --check_existing --distance 5000 --mane --protein --species {params.species} --symbol --transcript_version --tsl --input_file {input} --output_file {output} >> {log} 2>&1"


# TODO: add vep rule
# rule vep:
# input:
# output:
# log:
# shell: ""
rule parse_vep:
input: os.path.join(VARIANTS_DIR, '{sample}_vep_sarscov2.txt')
output: os.path.join(VARIANTS_DIR, '{sample}_vep_sarscov2_parsed.txt')
params:
script = os.path.join(SCRIPTS_DIR, 'parse_vep.py')
log: os.path.join(LOG_DIR, 'parse_vep_{sample}.log')
shell: "python {params.script} {VARIANTS_DIR} {input} {output} >> {log} 2>&1"


# TODO: fill in blanks, why sample_dir? where are sigmuts coming from?
# rule variant_report:
# input:
# vep_txt = os.path.join(VARIANTS_DIR, '{sample}_vep.txt'),
# snv_csv = os.path.join(VARIANTS_DIR, '{sample}_snv.csv')
# output: os.path.join(REPORT_DIR, '{sample}_variant_report.html')
# log: os.path.join(LOG_DIR, 'vcf2csv_{sample}.log')
# shell: "Rscript {SCRIPTS_DIR}/run_variant_report.R --reportFile={SCRIPTS_DIR}/variantreport_p_sample.rmd --vep_txt_file={input.vep_txt} --snv_csv_file={input.snv_csv} --location_sigmuts=??? --sample_dir=??? --sample_name={wildcards.sample} >> {log} 2>&1"
rule variant_report:
input:
vep_txt = os.path.join(VARIANTS_DIR, '{sample}_vep_sarscov2_parsed.txt'),
snv_csv = os.path.join(VARIANTS_DIR, '{sample}_snv.csv')
output: os.path.join(REPORT_DIR, '{sample}_variant_report.html')
params:
run_report = os.path.join(SCRIPTS_DIR, 'run_variant_report.R'),
report_rmd = os.path.join(SCRIPTS_DIR, 'variant_report.Rmd')
log: os.path.join(LOG_DIR, 'variant_report_{sample}.log')
shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --vep_txt_file={input.vep_txt} --snv_csv_file={input.snv_csv} --location_sigmuts={SIGMUT_DB} --sample_dir={VARIANTS_DIR} --sample_name={wildcards.sample} --outFile={output} >> {log} 2>&1"


rule bam2fastq:
Expand All @@ -266,10 +271,9 @@
output: os.path.join(REPORT_DIR, '{sample}_kraken_report.html')
params:
run_report = os.path.join(SCRIPTS_DIR, 'run_kraken_report.R'),
report_rmd = os.path.join(SCRIPTS_DIR, 'kraken_report.Rmd'),
kraken_output = os.path.join(KRAKEN_DIR, '{sample}_classified_unaligned_reads.txt')
report_rmd = os.path.join(SCRIPTS_DIR, 'kraken_report.Rmd')
log: os.path.join(LOG_DIR, 'kraken_report_{sample}.log')
shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --kraken_output={params.kraken_output} --sample_name={wildcards.sample} --output_file={output} >> {log} 2>&1"
shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --kraken_output={input} --sample_name={wildcards.sample} --output_file={output} >> {log} 2>&1"


rule krona_report:
Expand All @@ -279,3 +283,46 @@
output: os.path.join(REPORT_DIR, '{sample}_krona_report.html')
log: os.path.join(LOG_DIR, 'krona_report_{sample}.log')
shell: "ktImportTaxonomy -m 3 -t 5 {input.kraken_output} -tax {input.database} -o {output} >> {log} 2>&1"


# TODO: does it have to be the 'unsorted' bam file? If so, we have to do indexing on those, too
rule samtools_bedcov:
input:
amplicons_bed = AMPLICONS_BED,
aligned_bam = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bam'),
aligned_bai = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bai')
output: os.path.join(COVERAGE_DIR, '{sample}_amplicons.csv')
log: os.path.join(LOG_DIR, 'samtools_bedcov_{sample}.log')
shell:"samtools bedcov {input.amplicons_bed} {input.aligned_bam} > {output} 2>> {log} 3>&2"


# TODO: does it have to be the 'unsorted' bam file? If so, we have to do indexing on those, too
rule samtools_coverage:
input:
aligned_bam = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bam'),
aligned_bai = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bai')
output: os.path.join(COVERAGE_DIR, '{sample}_coverage.csv')
log: os.path.join(LOG_DIR, 'samtools_coverage_{sample}.log')
shell: "samtools coverage {input.aligned_bam} > {output} 2>> {log} 3>&2"


rule get_qc_table:
input:
coverage_csv = os.path.join(COVERAGE_DIR, '{sample}_coverage.csv'),
amplicon_csv = os.path.join(COVERAGE_DIR, '{sample}_amplicons.csv')
output: os.path.join(COVERAGE_DIR, '{sample}_merged_covs.csv')
params:
script = os.path.join(SCRIPTS_DIR, 'get_qc_table.py')
log: os.path.join(LOG_DIR, 'get_qc_table_{sample}.log')
shell: "python {params.script} {input.coverage_csv} {input.amplicon_csv} {output} >> {log} 2>&1"


# TODO: fix Error: unexpected end of input
# rule qc_report:
# input: os.path.join(COVERAGE_DIR, '{sample}_merged_covs.csv')
# output: os.path.join(REPORT_DIR, '{sample}_qc_report.html')
# params:
# run_report = os.path.join(SCRIPTS_DIR, 'run_qc_report.R'),
# report_rmd = os.path.join(SCRIPTS_DIR, 'qc_report.Rmd')
# log: os.path.join(LOG_DIR, 'qc_report_{sample}.log')
# shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --coverage_file={input} --sample_name={wildcards.sample} --outFile={output} >> {log} 2>&1"

0 comments on commit 1ac66a0

Please sign in to comment.