add rules for variants and qc, minor changes

BIMSBbioinfo · May 7, 2021 · 1ac66a0 · 1ac66a0
1 parent 20b5437
commit 1ac66a0
Showing 1 changed file with 103 additions and 56 deletions.
diff --git a/snakefile.py b/snakefile.py
@@ -29,24 +29,28 @@
 # 1. obtain things from config file
 # 2. make dummy samples and put them to tests/sample_data/reads
 # 3. align overall appearance
-READS_DIR = 'tests/sample_data/reads'
-REFERENCE_FASTA = 'tests/sample_data/NC_045512.2.fasta'
-KRAKEN_DB = 'tests/databases/kraken_db'
-KRONA_DB = 'tests/databases/krona_db'
-SAMPLE_SHEET_CSV = 'tests/sample_sheet.csv'
-
-OUTPUT_DIR = 'output'
-OUTPUT_DIR = os.path.join(os.getcwd(), OUTPUT_DIR)
+READS_DIR = os.path.join(os.getcwd(), 'tests/sample_data/reads')
+REFERENCE_FASTA = os.path.join(os.getcwd(), 'tests/sample_data/NC_045512.2.fasta')
+AMPLICONS_BED = os.path.join(os.getcwd(), 'tests/sample_data/nCoV-2019_NCref.bed')
+KRAKEN_DB = os.path.join(os.getcwd(), 'tests/databases/kraken_db')
+KRONA_DB = os.path.join(os.getcwd(), 'tests/databases/krona_db')
+SIGMUT_DB = os.path.join(os.getcwd(), 'tests/databases/sigmut_db')
+VEP_DB = os.path.join(os.getcwd(), 'tests/databases/vep_db')
+SAMPLE_SHEET_CSV = os.path.join(os.getcwd(), 'tests/sample_sheet.csv')
+
+OUTPUT_DIR = os.path.join(os.getcwd(), 'output')
+
 TRIMMED_READS_DIR = os.path.join(OUTPUT_DIR, 'trimmed_reads')
 LOG_DIR = os.path.join(OUTPUT_DIR, 'logs')
 FASTQC_DIR = os.path.join(OUTPUT_DIR, 'fastqc')
 MULTIQC_DIR = os.path.join(OUTPUT_DIR, 'multiqc')
 MAPPED_READS_DIR = os.path.join(OUTPUT_DIR, 'mapped_reads')
 VARIANTS_DIR = os.path.join(OUTPUT_DIR, 'variants')
 KRAKEN_DIR = os.path.join(OUTPUT_DIR, 'kraken')
+COVERAGE_DIR = os.path.join(OUTPUT_DIR, 'coverage')
 REPORT_DIR = os.path.join(OUTPUT_DIR, 'report')
 
-SCRIPTS_DIR = 'scripts'
+SCRIPTS_DIR = os.path.join(os.getcwd(), 'scripts')
 
 
 # Load sample sheet
@@ -63,7 +67,7 @@
     'final_reports': {
         'description': "Produce a comprehensive report. This is the default target.",
         'files': (
-            expand(os.path.join(VARIANTS_DIR, '{sample}_snv.csv'), sample=SAMPLES) + 
+            expand(os.path.join(REPORT_DIR, '{sample}_qc_report.html'), sample=SAMPLES) + 
             expand(os.path.join(REPORT_DIR, '{sample}_variant_report.html'), sample=SAMPLES) + 
             expand(os.path.join(REPORT_DIR, '{sample}_kraken_report.html'), sample=SAMPLES)
         )
@@ -74,22 +78,27 @@
             expand(os.path.join(VARIANTS_DIR, '{sample}_snv.csv'), sample=SAMPLES)
         )
     },
-    'variant_report': {
-        'description': "Make variants report.",
+    'variant_reports': {
+        'description': "Make variants reports.",
         'files': (
             expand(os.path.join(REPORT_DIR, '{sample}_variant_report.html'), sample=SAMPLES)
         )
     },
+    'qc_reports': {
+        'description': "Make QC reports.",
+        'files': (
+            expand(os.path.join(REPORT_DIR, '{sample}_qc_report.html'), sample=SAMPLES)
+        )
+    },
     'kraken_reports': {
         'description': "Make Kraken reports.",
         'files': (
             expand(os.path.join(REPORT_DIR, '{sample}_kraken_report.html'), sample=SAMPLES) +
             expand(os.path.join(REPORT_DIR, '{sample}_krona_report.html'), sample=SAMPLES)
         )
     }
-
 }
-selected_targets = ['kraken_reports']
+selected_targets = ['variant_reports']
 OUTPUT_FILES = list(chain.from_iterable([targets[name]['files'] for name in selected_targets]))
 
 
@@ -130,9 +139,10 @@
         r1 = os.path.join(TRIMMED_READS_DIR, "{sample}_1.fastq"),
         r2 = os.path.join(TRIMMED_READS_DIR, "{sample}_2.fastq")
     params:
-        len_cutoff = int(150 * 0.8) # read length * pct_cutoff
+        len_cutoff = int(150 * 0.8), # read length * pct_cutoff
+        output = os.path.join(TRIMMED_READS_DIR, "{sample}")
     log: os.path.join(LOG_DIR, 'prinseq_{sample}.log')
-    shell: "prinseq-lite.pl -fastq {input.r1} -fastq2 {input.r2} -ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10 -out_good {TRIMMED_READS_DIR}/{wildcards.sample} -out_bad null -min_len {params.len_cutoff} >> {log} 2>&1"
+    shell: "prinseq-lite.pl -fastq {input.r1} -fastq2 {input.r2} -ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10 -out_good {params.output} -out_bad null -min_len {params.len_cutoff} >> {log} 2>&1"
 
 
 rule bwa_index:
@@ -154,38 +164,20 @@
     shell: "bwa mem -t {params.threads} {input.ref} {input.fastq} > {output} 2>> {log} 3>&2"
 
 
-# TODO: also get unaligned .bam --> so far tmp.bam is used
-# same rule to get unaligned .bam
-# probably something like below, but needs to be checked for our purpose (kraken reports)
-# "samtools view -bh -F 2 {input} > {output} 2>> {log} 3>&2"
 rule samtools_filter_aligned:
     input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.sam')
     output: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.bam')
     log: os.path.join(LOG_DIR, 'samtools_filter_aligned_{sample}.log')
     shell: "samtools view -bh -f 2 -F 2048 {input} > {output} 2>> {log} 3>&2"
 
 
+# TODO: check command to get unaligned ones
 rule samtools_filter_unaligned:
     input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.sam')
     output: os.path.join(MAPPED_READS_DIR, '{sample}_unaligned.bam')
     log: os.path.join(LOG_DIR, 'samtools_filter_unaligned_{sample}.log')
     shell: "samtools view -bh -F 2 {input} > {output} 2>> {log} 3>&2"
 
-# probably not needed --> directly converted during filtering step (-b)
-# rule samtools_sam2bam_A:
-#     input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.sam')
-#     output: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.bam')
-#     log: os.path.join(LOG_DIR, 'samtools_sam2bam_{sample}.log')
-#     shell: "samtools view -bS {input} > {output} >> {log} 2>&1"
-
-
-# # TODO: merge rule A and B and possibly? use unaligned instead of tmp
-# rule samtools_sam2bam_B:
-#     input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.sam')
-#     output: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.bam')
-#     log: os.path.join(LOG_DIR, 'samtools_sam2bam_B_{sample}.log')
-#     shell: "samtools view -bS {input} > {output} >> {log} 2>&1"
-
 
 rule samtools_sort:
     input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.bam')
@@ -205,15 +197,14 @@
     input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bam')
     output: os.path.join(FASTQC_DIR, '{sample}_aligned_sorted_fastqc.zip')
     log: os.path.join(LOG_DIR, 'fastqc_{sample}.log')
-    params:
-        out_dir = os.path.join(FASTQC_DIR)
-    shell: "fastqc -o {params.out_dir} -f bam {input} >> {log} 2>&1"
+    shell: "fastqc -o {FASTQC_DIR} -f bam {input} >> {log} 2>&1"
 
 
 # TODO: check if --call-indels is needed?
 rule lofreq:
     input:
         aligned_bam = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bam'),
+        aligned_bai = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bai'),
         ref = REFERENCE_FASTA
     output: os.path.join(VARIANTS_DIR, '{sample}_snv.vcf')
     log: os.path.join(LOG_DIR, 'lofreq_{sample}.log')
@@ -223,26 +214,40 @@
 rule vcf2csv:
     input: os.path.join(VARIANTS_DIR, '{sample}_snv.vcf')
     output: os.path.join(VARIANTS_DIR, '{sample}_snv.csv')
+    params:
+        script = os.path.join(SCRIPTS_DIR, 'vcfTocsv.py')
     log: os.path.join(LOG_DIR, 'vcf2csv_{sample}.log')
-    shell: "python {SCRIPTS_DIR}/vcfTocsv.py {input} >> {log} 2>&1"
+    shell: "python {params.script} {input} >> {log} 2>&1"
+
+
+rule vep:
+    input: os.path.join(VARIANTS_DIR, '{sample}_snv.vcf')
+    output: os.path.join(VARIANTS_DIR, '{sample}_vep_sarscov2.txt')
+    params:
+        species = "sars_cov_2"
+    log: os.path.join(LOG_DIR, 'vep_{sample}.log')
+    shell: "vep --verbose --offline --dir_cache {VEP_DB} --DB_VERSION 101 --appris --biotype --buffer_size 5000 --check_existing --distance 5000 --mane --protein --species {params.species} --symbol --transcript_version --tsl --input_file {input} --output_file {output} >> {log} 2>&1"
 
 
-# TODO: add vep rule
-# rule vep:
-#     input: 
-#     output: 
-#     log: 
-#     shell: ""
+rule parse_vep:
+    input: os.path.join(VARIANTS_DIR, '{sample}_vep_sarscov2.txt')
+    output: os.path.join(VARIANTS_DIR, '{sample}_vep_sarscov2_parsed.txt')
+    params:
+        script = os.path.join(SCRIPTS_DIR, 'parse_vep.py')
+    log: os.path.join(LOG_DIR, 'parse_vep_{sample}.log')
+    shell: "python {params.script} {VARIANTS_DIR} {input} {output} >> {log} 2>&1"
 
 
-# TODO: fill in blanks, why sample_dir? where are sigmuts coming from?
-# rule variant_report:
-#     input:
-#         vep_txt = os.path.join(VARIANTS_DIR, '{sample}_vep.txt'),
-#         snv_csv = os.path.join(VARIANTS_DIR, '{sample}_snv.csv')
-#     output: os.path.join(REPORT_DIR, '{sample}_variant_report.html')
-#     log: os.path.join(LOG_DIR, 'vcf2csv_{sample}.log')
-#     shell: "Rscript {SCRIPTS_DIR}/run_variant_report.R --reportFile={SCRIPTS_DIR}/variantreport_p_sample.rmd --vep_txt_file={input.vep_txt} --snv_csv_file={input.snv_csv} --location_sigmuts=??? --sample_dir=??? --sample_name={wildcards.sample} >> {log} 2>&1"
+rule variant_report:
+    input:
+        vep_txt = os.path.join(VARIANTS_DIR, '{sample}_vep_sarscov2_parsed.txt'),
+        snv_csv = os.path.join(VARIANTS_DIR, '{sample}_snv.csv')
+    output: os.path.join(REPORT_DIR, '{sample}_variant_report.html')
+    params:
+        run_report = os.path.join(SCRIPTS_DIR, 'run_variant_report.R'),
+        report_rmd = os.path.join(SCRIPTS_DIR, 'variant_report.Rmd')
+    log: os.path.join(LOG_DIR, 'variant_report_{sample}.log')
+    shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --vep_txt_file={input.vep_txt} --snv_csv_file={input.snv_csv} --location_sigmuts={SIGMUT_DB} --sample_dir={VARIANTS_DIR} --sample_name={wildcards.sample} --outFile={output} >> {log} 2>&1"
 
 
 rule bam2fastq:
@@ -266,10 +271,9 @@
     output: os.path.join(REPORT_DIR, '{sample}_kraken_report.html')
     params:
         run_report = os.path.join(SCRIPTS_DIR, 'run_kraken_report.R'),
-        report_rmd = os.path.join(SCRIPTS_DIR, 'kraken_report.Rmd'),
-        kraken_output = os.path.join(KRAKEN_DIR, '{sample}_classified_unaligned_reads.txt')
+        report_rmd = os.path.join(SCRIPTS_DIR, 'kraken_report.Rmd')
     log: os.path.join(LOG_DIR, 'kraken_report_{sample}.log')
-    shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --kraken_output={params.kraken_output} --sample_name={wildcards.sample} --output_file={output} >> {log} 2>&1"
+    shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --kraken_output={input} --sample_name={wildcards.sample} --output_file={output} >> {log} 2>&1"
 
 
 rule krona_report:
@@ -279,3 +283,46 @@
     output: os.path.join(REPORT_DIR, '{sample}_krona_report.html')
     log: os.path.join(LOG_DIR, 'krona_report_{sample}.log')
     shell: "ktImportTaxonomy -m 3 -t 5 {input.kraken_output} -tax {input.database} -o {output} >> {log} 2>&1"
+
+
+# TODO: does it have to be the 'unsorted' bam file? If so, we have to do indexing on those, too
+rule samtools_bedcov:
+    input:
+        amplicons_bed = AMPLICONS_BED,
+        aligned_bam = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bam'),
+        aligned_bai = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bai')
+    output: os.path.join(COVERAGE_DIR, '{sample}_amplicons.csv')
+    log: os.path.join(LOG_DIR, 'samtools_bedcov_{sample}.log')
+    shell:"samtools bedcov {input.amplicons_bed} {input.aligned_bam} > {output} 2>> {log} 3>&2"
+
+
+# TODO: does it have to be the 'unsorted' bam file? If so, we have to do indexing on those, too
+rule samtools_coverage:
+    input:
+        aligned_bam = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bam'),
+        aligned_bai = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bai')
+    output: os.path.join(COVERAGE_DIR, '{sample}_coverage.csv')
+    log: os.path.join(LOG_DIR, 'samtools_coverage_{sample}.log')
+    shell: "samtools coverage {input.aligned_bam} > {output} 2>> {log} 3>&2"
+
+
+rule get_qc_table:
+    input:
+        coverage_csv = os.path.join(COVERAGE_DIR, '{sample}_coverage.csv'),
+        amplicon_csv = os.path.join(COVERAGE_DIR, '{sample}_amplicons.csv')
+    output: os.path.join(COVERAGE_DIR, '{sample}_merged_covs.csv')
+    params:
+        script = os.path.join(SCRIPTS_DIR, 'get_qc_table.py')
+    log: os.path.join(LOG_DIR, 'get_qc_table_{sample}.log')
+    shell: "python {params.script} {input.coverage_csv} {input.amplicon_csv} {output} >> {log} 2>&1"
+
+
+# TODO: fix Error: unexpected end of input
+# rule qc_report:
+#     input: os.path.join(COVERAGE_DIR, '{sample}_merged_covs.csv')
+#     output: os.path.join(REPORT_DIR, '{sample}_qc_report.html')
+#     params:
+#         run_report = os.path.join(SCRIPTS_DIR, 'run_qc_report.R'),
+#         report_rmd = os.path.join(SCRIPTS_DIR, 'qc_report.Rmd')
+#     log: os.path.join(LOG_DIR, 'qc_report_{sample}.log')
+#     shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --coverage_file={input} --sample_name={wildcards.sample} --outFile={output} >> {log} 2>&1"