From a6b0b08186d27c64955adb2aa2e641e6ab13624d Mon Sep 17 00:00:00 2001
From: Dohmen <jan.dohmen@googlemail.com>
Date: Thu, 6 May 2021 17:38:34 +0200
Subject: [PATCH 1/4] add kraken and krona reports

---
 scripts/kraken_report.Rmd   | 58 +++++++++++++++++++++++++
 scripts/run_kraken_report.R | 75 ++++++++++++++++++++++++++++++++
 snakefile.py                | 87 +++++++++++++++++++++++++------------
 3 files changed, 193 insertions(+), 27 deletions(-)
 create mode 100644 scripts/kraken_report.Rmd
 create mode 100644 scripts/run_kraken_report.R

diff --git a/scripts/kraken_report.Rmd b/scripts/kraken_report.Rmd
new file mode 100644
index 00000000..7ea1adb8
--- /dev/null
+++ b/scripts/kraken_report.Rmd
@@ -0,0 +1,58 @@
+---
+title: "Kraken2-report"
+#author: "mfaxel"
+date: '`r format(as.POSIXct(if ("" != Sys.getenv("SOURCE_DATE_EPOCH")) { as.numeric(Sys.getenv("SOURCE_DATE_EPOCH")) } else { Sys.time() }, origin="1970-01-01"), "%Y-%m-%d %H:%M:%S")`'
+#output: html_document
+params:
+  #sample_dir: '../../tests/sample_data/' # must be kraken REPORT file
+  #sample_name: 'tmp_unaligned_krakenReport'
+  sample_name: ""
+  kraken_output: ""
+
+---
+
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
+knitr::opts_knit$set(root.dir = params$workdir)
+library(DT)
+```
+
+
+
+
+```{r, echo=FALSE}
+#input 
+sampleName <- params$sample_name
+#sampleDir <- params$sample_dir
+#krakenReportFile <- paste0(params$sample_dir, params$sample_name)
+# krakenReportFile <- paste0(params$sample_dir, params$sample_name, ".txt")
+krakenReportFile <- kraken_output
+```
+
+
+## Kraken2 report on unaligned reads from the Sars-Cov2 enriched samples
+
+```{r,echo=F}
+print(paste( "Sample: ",sampleName))
+```
+
+
+The Sars-Cov2 enriched wastewater samples are aligned to the Virus genome. The unaligned reads, that are left are aligned with Kraken2 against the [database PlusPFP](https://benlangmead.github.io/aws-indexes/k2). This report is an overview over the species found.  
+
+A table with all the species found: 
+
+* Column explanation:   
+  + Percentage of fragments covered by the clade rooted at this taxon  
+  + Number of fragments covered by the clade rooted at this taxon  
+  + Number of fragments assigned directly to this taxon  
+  + A rank code, indicating (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies.     Taxa that are not at any of these 10 ranks have a rank code that is formed by using the rank code of the closest ancestor rank         with a number indicating the distance from that rank. E.g., "G2" is a rank code indicating a taxon is between genus and species         and the grandparent taxon is at the genus rank.  
+  + NCBI taxonomic ID number  
+  + Indented scientific name  
+
+```{r, echo=FALSE}
+table<- read.table(krakenReportFile,sep = "\t")
+header_table <- c("percent","num_fragments","num_fragments_taxon","rank","NCBI_ID","scientific_name")
+colnames(table)<-header_table
+datatable(table)
+```
diff --git a/scripts/run_kraken_report.R b/scripts/run_kraken_report.R
new file mode 100644
index 00000000..742855b4
--- /dev/null
+++ b/scripts/run_kraken_report.R
@@ -0,0 +1,75 @@
+# Title  : run Kraken2 report
+# author: mfaxel
+# Based on the runDeseqReport.R from Bora Uyar
+# Created on: 2.05.21
+
+#library(rmarkdown)
+
+#function to run report
+runReport <- function(reportFile, 
+                      sample_name, 
+                      kraken_output,
+                      output_file,
+                      selfContained = TRUE,
+                      quiet = FALSE) {
+  
+#   outFile <- paste0(output_dir,"/",sample_name,"kraken_report.html")
+  
+  # htmlwidgets::setWidgetIdSeed(1234)
+  rmarkdown::render(
+    input = reportFile, 
+    # output_dir = sample_dir,
+    #  intermediates_dir = file.path(workdir, prefix),
+    clean = TRUE,
+    output_file = output_file,
+    output_format = rmarkdown::html_document(
+      code_folding = 'hide', 
+      depth = 2,
+      toc = TRUE,
+      toc_float = TRUE,
+      theme = 'lumen',
+      number_sections = TRUE
+    ),
+    output_options = list(self_contained = selfContained),
+    params = list(kraken_output = kraken_output,
+                  sample_name = sample_name
+                  ),
+    quiet = quiet
+  )
+  
+#   if(dir.exists(file.path(sample_dir, sample_name))) {
+#     unlink(file.path(sample_dir, sample_name), recursive = TRUE)
+#   }
+  
+}
+
+#1. Collect arguments
+args <- commandArgs(TRUE)
+
+cat("arguments:", args,"\n")
+
+## Parse arguments (we expect the form --arg=value)
+#parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
+parseArgs <- function(x) {
+  myArgs <- unlist(strsplit(x, "--"))
+  myArgs <- myArgs[myArgs != '']
+  #when no values are provided for the argument
+  myArgs <- gsub(pattern = "=$", replacement = "= ", x = myArgs)
+  myArgs <- as.data.frame(do.call(rbind, strsplit(myArgs, "=")))
+  myArgs$V2 <- gsub(' ', '', myArgs$V2)
+  return(myArgs)
+}
+
+argsDF <- parseArgs(args)
+argsL <- as.list(as.character(argsDF$V2))
+names(argsL) <- argsDF$V1
+
+reportFile <- argsL$reportFile
+sample_name <- argsL$sample_name
+kraken_output <- argsL$kraken_output
+output_file <- argsL$output_file
+
+runReport(reportFile = reportFile,
+          sample_name = sample_name,
+          kraken_output = kraken_output,
+          output_file = output_file)
\ No newline at end of file
diff --git a/snakefile.py b/snakefile.py
index a9bc8e73..66a279df 100644
--- a/snakefile.py
+++ b/snakefile.py
@@ -28,12 +28,15 @@
 # TODO: 
 # 1. obtain things from config file
 # 2. make dummy samples and put them to tests/sample_data/reads
+# 3. align overall appearance
 READS_DIR = 'tests/sample_data/reads'
 REFERENCE_FASTA = 'tests/sample_data/NC_045512.2.fasta'
-KRAKEN_DB = 'tests/sample_data/kraken_db.db' # download first, where?
+KRAKEN_DB = 'tests/databases/kraken_db'
+KRONA_DB = 'tests/databases/krona_db'
 SAMPLE_SHEET_CSV = 'tests/sample_sheet.csv'
 
 OUTPUT_DIR = 'output'
+OUTPUT_DIR = os.path.join(os.getcwd(), OUTPUT_DIR)
 TRIMMED_READS_DIR = os.path.join(OUTPUT_DIR, 'trimmed_reads')
 LOG_DIR = os.path.join(OUTPUT_DIR, 'logs')
 FASTQC_DIR = os.path.join(OUTPUT_DIR, 'fastqc')
@@ -70,9 +73,23 @@
         'files': (
             expand(os.path.join(VARIANTS_DIR, '{sample}_snv.csv'), sample=SAMPLES)
         )
+    },
+    'variant_report': {
+        'description': "Make variants report.",
+        'files': (
+            expand(os.path.join(REPORT_DIR, '{sample}_variant_report.html'), sample=SAMPLES)
+        )
+    },
+    'kraken_reports': {
+        'description': "Make Kraken reports.",
+        'files': (
+            expand(os.path.join(REPORT_DIR, '{sample}_kraken_report.html'), sample=SAMPLES) +
+            expand(os.path.join(REPORT_DIR, '{sample}_krona_report.html'), sample=SAMPLES)
+        )
     }
+
 }
-selected_targets = ['lofreq']
+selected_targets = ['kraken_reports']
 OUTPUT_FILES = list(chain.from_iterable([targets[name]['files'] for name in selected_targets]))
 
 
@@ -141,12 +158,19 @@
 # same rule to get unaligned .bam
 # probably something like below, but needs to be checked for our purpose (kraken reports)
 # "samtools view -bh -F 2 {input} > {output} 2>> {log} 3>&2"
-rule samtools_filter:
+rule samtools_filter_aligned:
     input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.sam')
     output: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.bam')
-    log: os.path.join(LOG_DIR, 'samtools_filter_{sample}.log')
+    log: os.path.join(LOG_DIR, 'samtools_filter_aligned_{sample}.log')
     shell: "samtools view -bh -f 2 -F 2048 {input} > {output} 2>> {log} 3>&2"
 
+
+rule samtools_filter_unaligned:
+    input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.sam')
+    output: os.path.join(MAPPED_READS_DIR, '{sample}_unaligned.bam')
+    log: os.path.join(LOG_DIR, 'samtools_filter_unaligned_{sample}.log')
+    shell: "samtools view -bh -F 2 {input} > {output} 2>> {log} 3>&2"
+
 # probably not needed --> directly converted during filtering step (-b)
 # rule samtools_sam2bam_A:
 #     input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.sam')
@@ -221,28 +245,37 @@
 #     shell: "Rscript {SCRIPTS_DIR}/run_variant_report.R --reportFile={SCRIPTS_DIR}/variantreport_p_sample.rmd --vep_txt_file={input.vep_txt} --snv_csv_file={input.snv_csv} --location_sigmuts=??? --sample_dir=??? --sample_name={wildcards.sample} >> {log} 2>&1"
 
 
-# TODO: check command/output as in miro board R1 & R2 is mentioned
-# rule bam2fastq:
-#     input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.bam')
-#     output: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.fastq')
-#     log: os.path.join(LOG_DIR, 'bam2fastq_{sample}.log')
-#     shell: "samtools fastq {input} > {output} >> {log} 2>&1"
+rule bam2fastq:
+    input: os.path.join(MAPPED_READS_DIR, '{sample}_unaligned.bam')
+    output: os.path.join(MAPPED_READS_DIR, '{sample}_unaligned.fastq')
+    log: os.path.join(LOG_DIR, 'bam2fastq_{sample}.log')
+    shell: "samtools fastq {input} > {output} 2>> {log} 3>&2"
 
 
-# TODO: check command as in miro board there where R1 & R2 inputs
-# rule kraken:
-#     input:
-#         unaligned_fastq = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.fastq'),
-#         database = KRAKEN_DB
-#     output: os.path.join(KRAKEN_DIR, '{sample}_classified_unaligned_reads.txt')
-#     log: os.path.join(LOG_DIR, 'kraken_{sample}.log')
-#     shell: "kraken2 --output {output} --db {input.database} --paired {input.unaligned_fastq} >> {log} 2>&1"
-
-
-# TODO: adapt command and possibly input due to 'taxonomy_folder'
-# What is meant by taxonomy_folder??
-# rule kraken_report:
-#     input: os.path.join(KRAKEN_DIR, '{sample}_classified_unaligned_reads.txt')
-#     output: os.path.join(REPORT_DIR, '{sample}_kraken_report.html')
-#     log: os.path.join(LOG_DIR, 'kraken_report_{sample}.log')
-#     shell: "ktImportTaxonomy -m 3 -t 5 {input} -tax [taxonomy folder]??? -o {output} >> {log} 2>&1"
+rule kraken:
+    input:
+        unaligned_fastq = os.path.join(MAPPED_READS_DIR, '{sample}_unaligned.fastq'),
+        database = KRAKEN_DB
+    output: os.path.join(KRAKEN_DIR, '{sample}_classified_unaligned_reads.txt')
+    log: os.path.join(LOG_DIR, 'kraken_{sample}.log')
+    shell: "kraken2 --report {output} --db {input.database} {input.unaligned_fastq} >> {log} 2>&1"
+
+
+rule kraken_report:
+    input: os.path.join(KRAKEN_DIR, '{sample}_classified_unaligned_reads.txt')
+    output: os.path.join(REPORT_DIR, '{sample}_kraken_report.html')
+    params:
+        run_report = os.path.join(SCRIPTS_DIR, 'run_kraken_report.R'),
+        report_rmd = os.path.join(SCRIPTS_DIR, 'kraken_report.Rmd'),
+        kraken_output = os.path.join(KRAKEN_DIR, '{sample}_classified_unaligned_reads.txt')
+    log: os.path.join(LOG_DIR, 'kraken_report_{sample}.log')
+    shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --kraken_output={params.kraken_output} --sample_name={wildcards.sample} --output_file={output} >> {log} 2>&1"
+
+
+rule krona_report:
+    input: 
+        kraken_output = os.path.join(KRAKEN_DIR, '{sample}_classified_unaligned_reads.txt'),
+        database = KRONA_DB
+    output: os.path.join(REPORT_DIR, '{sample}_krona_report.html')
+    log: os.path.join(LOG_DIR, 'krona_report_{sample}.log')
+    shell: "ktImportTaxonomy -m 3 -t 5 {input.kraken_output} -tax {input.database} -o {output} >> {log} 2>&1"

From 20b5437ee1796d42886a07456640f07a82bd971b Mon Sep 17 00:00:00 2001
From: Dohmen <jan.dohmen@googlemail.com>
Date: Fri, 7 May 2021 15:54:20 +0200
Subject: [PATCH 2/4] add input bed file

---
 tests/sample_data/nCoV-2019_NCref.bed | 218 ++++++++++++++++++++++++++
 1 file changed, 218 insertions(+)
 create mode 100644 tests/sample_data/nCoV-2019_NCref.bed

diff --git a/tests/sample_data/nCoV-2019_NCref.bed b/tests/sample_data/nCoV-2019_NCref.bed
new file mode 100644
index 00000000..c2264dac
--- /dev/null
+++ b/tests/sample_data/nCoV-2019_NCref.bed
@@ -0,0 +1,218 @@
+NC_045512.2	30	54	nCoV-2019_1_LEFT	nCoV-2019_1	+
+NC_045512.2	385	410	nCoV-2019_1_RIGHT	nCoV-2019_1	-
+NC_045512.2	320	342	nCoV-2019_2_LEFT	nCoV-2019_2	+
+NC_045512.2	704	726	nCoV-2019_2_RIGHT	nCoV-2019_2	-
+NC_045512.2	642	664	nCoV-2019_3_LEFT	nCoV-2019_1	+
+NC_045512.2	1004	1028	nCoV-2019_3_RIGHT	nCoV-2019_1	-
+NC_045512.2	943	965	nCoV-2019_4_LEFT	nCoV-2019_2	+
+NC_045512.2	1312	1337	nCoV-2019_4_RIGHT	nCoV-2019_2	-
+NC_045512.2	1242	1264	nCoV-2019_5_LEFT	nCoV-2019_1	+
+NC_045512.2	1623	1651	nCoV-2019_5_RIGHT	nCoV-2019_1	-
+NC_045512.2	1573	1595	nCoV-2019_6_LEFT	nCoV-2019_2	+
+NC_045512.2	1942	1964	nCoV-2019_6_RIGHT	nCoV-2019_2	-
+NC_045512.2	1875	1897	nCoV-2019_7_LEFT	nCoV-2019_1	+
+NC_045512.2	1868	1890	nCoV-2019_7_LEFT_alt0	nCoV-2019_1	+
+NC_045512.2	2247	2269	nCoV-2019_7_RIGHT	nCoV-2019_1	-
+NC_045512.2	2242	2264	nCoV-2019_7_RIGHT_alt5	nCoV-2019_1	-
+NC_045512.2	2181	2205	nCoV-2019_8_LEFT	nCoV-2019_2	+
+NC_045512.2	2568	2592	nCoV-2019_8_RIGHT	nCoV-2019_2	-
+NC_045512.2	2505	2529	nCoV-2019_9_LEFT	nCoV-2019_1	+
+NC_045512.2	2504	2528	nCoV-2019_9_LEFT_alt4	nCoV-2019_1	+
+NC_045512.2	2882	2904	nCoV-2019_9_RIGHT	nCoV-2019_1	-
+NC_045512.2	2880	2902	nCoV-2019_9_RIGHT_alt2	nCoV-2019_1	-
+NC_045512.2	2826	2850	nCoV-2019_10_LEFT	nCoV-2019_2	+
+NC_045512.2	3183	3210	nCoV-2019_10_RIGHT	nCoV-2019_2	-
+NC_045512.2	3144	3166	nCoV-2019_11_LEFT	nCoV-2019_1	+
+NC_045512.2	3507	3531	nCoV-2019_11_RIGHT	nCoV-2019_1	-
+NC_045512.2	3460	3482	nCoV-2019_12_LEFT	nCoV-2019_2	+
+NC_045512.2	3826	3853	nCoV-2019_12_RIGHT	nCoV-2019_2	-
+NC_045512.2	3771	3795	nCoV-2019_13_LEFT	nCoV-2019_1	+
+NC_045512.2	4142	4164	nCoV-2019_13_RIGHT	nCoV-2019_1	-
+NC_045512.2	4054	4077	nCoV-2019_14_LEFT	nCoV-2019_2	+
+NC_045512.2	4044	4068	nCoV-2019_14_LEFT_alt4	nCoV-2019_2	+
+NC_045512.2	4428	4450	nCoV-2019_14_RIGHT	nCoV-2019_2	-
+NC_045512.2	4402	4424	nCoV-2019_14_RIGHT_alt2	nCoV-2019_2	-
+NC_045512.2	4294	4321	nCoV-2019_15_LEFT	nCoV-2019_1	+
+NC_045512.2	4296	4322	nCoV-2019_15_LEFT_alt1	nCoV-2019_1	+
+NC_045512.2	4674	4696	nCoV-2019_15_RIGHT	nCoV-2019_1	-
+NC_045512.2	4666	4689	nCoV-2019_15_RIGHT_alt3	nCoV-2019_1	-
+NC_045512.2	4636	4658	nCoV-2019_16_LEFT	nCoV-2019_2	+
+NC_045512.2	4995	5017	nCoV-2019_16_RIGHT	nCoV-2019_2	-
+NC_045512.2	4939	4966	nCoV-2019_17_LEFT	nCoV-2019_1	+
+NC_045512.2	5296	5321	nCoV-2019_17_RIGHT	nCoV-2019_1	-
+NC_045512.2	5230	5259	nCoV-2019_18_LEFT	nCoV-2019_2	+
+NC_045512.2	5257	5287	nCoV-2019_18_LEFT_alt2	nCoV-2019_2	+
+NC_045512.2	5620	5644	nCoV-2019_18_RIGHT	nCoV-2019_2	-
+NC_045512.2	5620	5643	nCoV-2019_18_RIGHT_alt1	nCoV-2019_2	-
+NC_045512.2	5563	5586	nCoV-2019_19_LEFT	nCoV-2019_1	+
+NC_045512.2	5932	5957	nCoV-2019_19_RIGHT	nCoV-2019_1	-
+NC_045512.2	5867	5894	nCoV-2019_20_LEFT	nCoV-2019_2	+
+NC_045512.2	6247	6272	nCoV-2019_20_RIGHT	nCoV-2019_2	-
+NC_045512.2	6167	6196	nCoV-2019_21_LEFT	nCoV-2019_1	+
+NC_045512.2	6168	6197	nCoV-2019_21_LEFT_alt2	nCoV-2019_1	+
+NC_045512.2	6528	6550	nCoV-2019_21_RIGHT	nCoV-2019_1	-
+NC_045512.2	6526	6548	nCoV-2019_21_RIGHT_alt0	nCoV-2019_1	-
+NC_045512.2	6466	6495	nCoV-2019_22_LEFT	nCoV-2019_2	+
+NC_045512.2	6846	6873	nCoV-2019_22_RIGHT	nCoV-2019_2	-
+NC_045512.2	6718	6745	nCoV-2019_23_LEFT	nCoV-2019_1	+
+NC_045512.2	7092	7117	nCoV-2019_23_RIGHT	nCoV-2019_1	-
+NC_045512.2	7035	7058	nCoV-2019_24_LEFT	nCoV-2019_2	+
+NC_045512.2	7389	7415	nCoV-2019_24_RIGHT	nCoV-2019_2	-
+NC_045512.2	7305	7332	nCoV-2019_25_LEFT	nCoV-2019_1	+
+NC_045512.2	7671	7694	nCoV-2019_25_RIGHT	nCoV-2019_1	-
+NC_045512.2	7626	7651	nCoV-2019_26_LEFT	nCoV-2019_2	+
+NC_045512.2	7997	8019	nCoV-2019_26_RIGHT	nCoV-2019_2	-
+NC_045512.2	7943	7968	nCoV-2019_27_LEFT	nCoV-2019_1	+
+NC_045512.2	8319	8341	nCoV-2019_27_RIGHT	nCoV-2019_1	-
+NC_045512.2	8249	8275	nCoV-2019_28_LEFT	nCoV-2019_2	+
+NC_045512.2	8635	8661	nCoV-2019_28_RIGHT	nCoV-2019_2	-
+NC_045512.2	8595	8619	nCoV-2019_29_LEFT	nCoV-2019_1	+
+NC_045512.2	8954	8983	nCoV-2019_29_RIGHT	nCoV-2019_1	-
+NC_045512.2	8888	8913	nCoV-2019_30_LEFT	nCoV-2019_2	+
+NC_045512.2	9245	9271	nCoV-2019_30_RIGHT	nCoV-2019_2	-
+NC_045512.2	9204	9226	nCoV-2019_31_LEFT	nCoV-2019_1	+
+NC_045512.2	9557	9585	nCoV-2019_31_RIGHT	nCoV-2019_1	-
+NC_045512.2	9477	9502	nCoV-2019_32_LEFT	nCoV-2019_2	+
+NC_045512.2	9834	9858	nCoV-2019_32_RIGHT	nCoV-2019_2	-
+NC_045512.2	9784	9806	nCoV-2019_33_LEFT	nCoV-2019_1	+
+NC_045512.2	10146	10171	nCoV-2019_33_RIGHT	nCoV-2019_1	-
+NC_045512.2	10076	10099	nCoV-2019_34_LEFT	nCoV-2019_2	+
+NC_045512.2	10437	10459	nCoV-2019_34_RIGHT	nCoV-2019_2	-
+NC_045512.2	10362	10384	nCoV-2019_35_LEFT	nCoV-2019_1	+
+NC_045512.2	10737	10763	nCoV-2019_35_RIGHT	nCoV-2019_1	-
+NC_045512.2	10666	10688	nCoV-2019_36_LEFT	nCoV-2019_2	+
+NC_045512.2	11048	11074	nCoV-2019_36_RIGHT	nCoV-2019_2	-
+NC_045512.2	10999	11022	nCoV-2019_37_LEFT	nCoV-2019_1	+
+NC_045512.2	11372	11394	nCoV-2019_37_RIGHT	nCoV-2019_1	-
+NC_045512.2	11306	11331	nCoV-2019_38_LEFT	nCoV-2019_2	+
+NC_045512.2	11668	11693	nCoV-2019_38_RIGHT	nCoV-2019_2	-
+NC_045512.2	11555	11584	nCoV-2019_39_LEFT	nCoV-2019_1	+
+NC_045512.2	11927	11949	nCoV-2019_39_RIGHT	nCoV-2019_1	-
+NC_045512.2	11863	11889	nCoV-2019_40_LEFT	nCoV-2019_2	+
+NC_045512.2	12234	12256	nCoV-2019_40_RIGHT	nCoV-2019_2	-
+NC_045512.2	12110	12133	nCoV-2019_41_LEFT	nCoV-2019_1	+
+NC_045512.2	12465	12490	nCoV-2019_41_RIGHT	nCoV-2019_1	-
+NC_045512.2	12417	12439	nCoV-2019_42_LEFT	nCoV-2019_2	+
+NC_045512.2	12779	12802	nCoV-2019_42_RIGHT	nCoV-2019_2	-
+NC_045512.2	12710	12732	nCoV-2019_43_LEFT	nCoV-2019_1	+
+NC_045512.2	13074	13096	nCoV-2019_43_RIGHT	nCoV-2019_1	-
+NC_045512.2	13005	13027	nCoV-2019_44_LEFT	nCoV-2019_2	+
+NC_045512.2	13007	13029	nCoV-2019_44_LEFT_alt3	nCoV-2019_2	+
+NC_045512.2	13378	13400	nCoV-2019_44_RIGHT	nCoV-2019_2	-
+NC_045512.2	13363	13385	nCoV-2019_44_RIGHT_alt0	nCoV-2019_2	-
+NC_045512.2	13319	13344	nCoV-2019_45_LEFT	nCoV-2019_1	+
+NC_045512.2	13307	13336	nCoV-2019_45_LEFT_alt2	nCoV-2019_1	+
+NC_045512.2	13669	13699	nCoV-2019_45_RIGHT	nCoV-2019_1	-
+NC_045512.2	13660	13689	nCoV-2019_45_RIGHT_alt7	nCoV-2019_1	-
+NC_045512.2	13599	13621	nCoV-2019_46_LEFT	nCoV-2019_2	+
+NC_045512.2	13602	13625	nCoV-2019_46_LEFT_alt1	nCoV-2019_2	+
+NC_045512.2	13962	13984	nCoV-2019_46_RIGHT	nCoV-2019_2	-
+NC_045512.2	13961	13984	nCoV-2019_46_RIGHT_alt2	nCoV-2019_2	-
+NC_045512.2	13918	13946	nCoV-2019_47_LEFT	nCoV-2019_1	+
+NC_045512.2	14271	14299	nCoV-2019_47_RIGHT	nCoV-2019_1	-
+NC_045512.2	14207	14232	nCoV-2019_48_LEFT	nCoV-2019_2	+
+NC_045512.2	14579	14601	nCoV-2019_48_RIGHT	nCoV-2019_2	-
+NC_045512.2	14545	14570	nCoV-2019_49_LEFT	nCoV-2019_1	+
+NC_045512.2	14898	14926	nCoV-2019_49_RIGHT	nCoV-2019_1	-
+NC_045512.2	14865	14895	nCoV-2019_50_LEFT	nCoV-2019_2	+
+NC_045512.2	15224	15246	nCoV-2019_50_RIGHT	nCoV-2019_2	-
+NC_045512.2	15171	15193	nCoV-2019_51_LEFT	nCoV-2019_1	+
+NC_045512.2	15538	15560	nCoV-2019_51_RIGHT	nCoV-2019_1	-
+NC_045512.2	15481	15503	nCoV-2019_52_LEFT	nCoV-2019_2	+
+NC_045512.2	15861	15886	nCoV-2019_52_RIGHT	nCoV-2019_2	-
+NC_045512.2	15827	15851	nCoV-2019_53_LEFT	nCoV-2019_1	+
+NC_045512.2	16186	16209	nCoV-2019_53_RIGHT	nCoV-2019_1	-
+NC_045512.2	16118	16144	nCoV-2019_54_LEFT	nCoV-2019_2	+
+NC_045512.2	16485	16510	nCoV-2019_54_RIGHT	nCoV-2019_2	-
+NC_045512.2	16416	16444	nCoV-2019_55_LEFT	nCoV-2019_1	+
+NC_045512.2	16804	16833	nCoV-2019_55_RIGHT	nCoV-2019_1	-
+NC_045512.2	16748	16770	nCoV-2019_56_LEFT	nCoV-2019_2	+
+NC_045512.2	17130	17152	nCoV-2019_56_RIGHT	nCoV-2019_2	-
+NC_045512.2	17065	17087	nCoV-2019_57_LEFT	nCoV-2019_1	+
+NC_045512.2	17430	17452	nCoV-2019_57_RIGHT	nCoV-2019_1	-
+NC_045512.2	17381	17406	nCoV-2019_58_LEFT	nCoV-2019_2	+
+NC_045512.2	17738	17761	nCoV-2019_58_RIGHT	nCoV-2019_2	-
+NC_045512.2	17674	17697	nCoV-2019_59_LEFT	nCoV-2019_1	+
+NC_045512.2	18036	18062	nCoV-2019_59_RIGHT	nCoV-2019_1	-
+NC_045512.2	17966	17993	nCoV-2019_60_LEFT	nCoV-2019_2	+
+NC_045512.2	18324	18348	nCoV-2019_60_RIGHT	nCoV-2019_2	-
+NC_045512.2	18253	18275	nCoV-2019_61_LEFT	nCoV-2019_1	+
+NC_045512.2	18650	18672	nCoV-2019_61_RIGHT	nCoV-2019_1	-
+NC_045512.2	18596	18618	nCoV-2019_62_LEFT	nCoV-2019_2	+
+NC_045512.2	18957	18979	nCoV-2019_62_RIGHT	nCoV-2019_2	-
+NC_045512.2	18896	18918	nCoV-2019_63_LEFT	nCoV-2019_1	+
+NC_045512.2	19275	19297	nCoV-2019_63_RIGHT	nCoV-2019_1	-
+NC_045512.2	19204	19232	nCoV-2019_64_LEFT	nCoV-2019_2	+
+NC_045512.2	19591	19616	nCoV-2019_64_RIGHT	nCoV-2019_2	-
+NC_045512.2	19548	19570	nCoV-2019_65_LEFT	nCoV-2019_1	+
+NC_045512.2	19911	19939	nCoV-2019_65_RIGHT	nCoV-2019_1	-
+NC_045512.2	19844	19866	nCoV-2019_66_LEFT	nCoV-2019_2	+
+NC_045512.2	20231	20255	nCoV-2019_66_RIGHT	nCoV-2019_2	-
+NC_045512.2	20172	20200	nCoV-2019_67_LEFT	nCoV-2019_1	+
+NC_045512.2	20542	20572	nCoV-2019_67_RIGHT	nCoV-2019_1	-
+NC_045512.2	20472	20496	nCoV-2019_68_LEFT	nCoV-2019_2	+
+NC_045512.2	20867	20890	nCoV-2019_68_RIGHT	nCoV-2019_2	-
+NC_045512.2	20786	20813	nCoV-2019_69_LEFT	nCoV-2019_1	+
+NC_045512.2	21146	21169	nCoV-2019_69_RIGHT	nCoV-2019_1	-
+NC_045512.2	21075	21104	nCoV-2019_70_LEFT	nCoV-2019_2	+
+NC_045512.2	21427	21455	nCoV-2019_70_RIGHT	nCoV-2019_2	-
+NC_045512.2	21357	21386	nCoV-2019_71_LEFT	nCoV-2019_1	+
+NC_045512.2	21716	21743	nCoV-2019_71_RIGHT	nCoV-2019_1	-
+NC_045512.2	21658	21682	nCoV-2019_72_LEFT	nCoV-2019_2	+
+NC_045512.2	22013	22038	nCoV-2019_72_RIGHT	nCoV-2019_2	-
+NC_045512.2	21961	21990	nCoV-2019_73_LEFT	nCoV-2019_1	+
+NC_045512.2	22324	22346	nCoV-2019_73_RIGHT	nCoV-2019_1	-
+NC_045512.2	22262	22290	nCoV-2019_74_LEFT	nCoV-2019_2	+
+NC_045512.2	22626	22650	nCoV-2019_74_RIGHT	nCoV-2019_2	-
+NC_045512.2	22516	22542	nCoV-2019_75_LEFT	nCoV-2019_1	+
+NC_045512.2	22877	22903	nCoV-2019_75_RIGHT	nCoV-2019_1	-
+NC_045512.2	22797	22819	nCoV-2019_76_LEFT	nCoV-2019_2	+
+NC_045512.2	22798	22821	nCoV-2019_76_LEFT_alt3	nCoV-2019_2	+
+NC_045512.2	23192	23214	nCoV-2019_76_RIGHT	nCoV-2019_2	-
+NC_045512.2	23189	23212	nCoV-2019_76_RIGHT_alt0	nCoV-2019_2	-
+NC_045512.2	23122	23144	nCoV-2019_77_LEFT	nCoV-2019_1	+
+NC_045512.2	23500	23522	nCoV-2019_77_RIGHT	nCoV-2019_1	-
+NC_045512.2	23443	23466	nCoV-2019_78_LEFT	nCoV-2019_2	+
+NC_045512.2	23822	23847	nCoV-2019_78_RIGHT	nCoV-2019_2	-
+NC_045512.2	23789	23812	nCoV-2019_79_LEFT	nCoV-2019_1	+
+NC_045512.2	24145	24169	nCoV-2019_79_RIGHT	nCoV-2019_1	-
+NC_045512.2	24078	24100	nCoV-2019_80_LEFT	nCoV-2019_2	+
+NC_045512.2	24443	24467	nCoV-2019_80_RIGHT	nCoV-2019_2	-
+NC_045512.2	24391	24416	nCoV-2019_81_LEFT	nCoV-2019_1	+
+NC_045512.2	24765	24789	nCoV-2019_81_RIGHT	nCoV-2019_1	-
+NC_045512.2	24696	24721	nCoV-2019_82_LEFT	nCoV-2019_2	+
+NC_045512.2	25052	25076	nCoV-2019_82_RIGHT	nCoV-2019_2	-
+NC_045512.2	24978	25003	nCoV-2019_83_LEFT	nCoV-2019_1	+
+NC_045512.2	25347	25369	nCoV-2019_83_RIGHT	nCoV-2019_1	-
+NC_045512.2	25279	25301	nCoV-2019_84_LEFT	nCoV-2019_2	+
+NC_045512.2	25646	25673	nCoV-2019_84_RIGHT	nCoV-2019_2	-
+NC_045512.2	25601	25623	nCoV-2019_85_LEFT	nCoV-2019_1	+
+NC_045512.2	25969	25994	nCoV-2019_85_RIGHT	nCoV-2019_1	-
+NC_045512.2	25902	25924	nCoV-2019_86_LEFT	nCoV-2019_2	+
+NC_045512.2	26290	26315	nCoV-2019_86_RIGHT	nCoV-2019_2	-
+NC_045512.2	26197	26219	nCoV-2019_87_LEFT	nCoV-2019_1	+
+NC_045512.2	26566	26590	nCoV-2019_87_RIGHT	nCoV-2019_1	-
+NC_045512.2	26520	26542	nCoV-2019_88_LEFT	nCoV-2019_2	+
+NC_045512.2	26890	26913	nCoV-2019_88_RIGHT	nCoV-2019_2	-
+NC_045512.2	26835	26857	nCoV-2019_89_LEFT	nCoV-2019_1	+
+NC_045512.2	26838	26860	nCoV-2019_89_LEFT_alt2	nCoV-2019_1	+
+NC_045512.2	27202	27227	nCoV-2019_89_RIGHT	nCoV-2019_1	-
+NC_045512.2	27190	27215	nCoV-2019_89_RIGHT_alt4	nCoV-2019_1	-
+NC_045512.2	27141	27164	nCoV-2019_90_LEFT	nCoV-2019_2	+
+NC_045512.2	27511	27533	nCoV-2019_90_RIGHT	nCoV-2019_2	-
+NC_045512.2	27446	27471	nCoV-2019_91_LEFT	nCoV-2019_1	+
+NC_045512.2	27825	27854	nCoV-2019_91_RIGHT	nCoV-2019_1	-
+NC_045512.2	27784	27808	nCoV-2019_92_LEFT	nCoV-2019_2	+
+NC_045512.2	28145	28172	nCoV-2019_92_RIGHT	nCoV-2019_2	-
+NC_045512.2	28081	28104	nCoV-2019_93_LEFT	nCoV-2019_1	+
+NC_045512.2	28442	28464	nCoV-2019_93_RIGHT	nCoV-2019_1	-
+NC_045512.2	28394	28416	nCoV-2019_94_LEFT	nCoV-2019_2	+
+NC_045512.2	28756	28779	nCoV-2019_94_RIGHT	nCoV-2019_2	-
+NC_045512.2	28677	28699	nCoV-2019_95_LEFT	nCoV-2019_1	+
+NC_045512.2	29041	29063	nCoV-2019_95_RIGHT	nCoV-2019_1	-
+NC_045512.2	28985	29007	nCoV-2019_96_LEFT	nCoV-2019_2	+
+NC_045512.2	29356	29378	nCoV-2019_96_RIGHT	nCoV-2019_2	-
+NC_045512.2	29288	29316	nCoV-2019_97_LEFT	nCoV-2019_1	+
+NC_045512.2	29665	29693	nCoV-2019_97_RIGHT	nCoV-2019_1	-
+NC_045512.2	29486	29510	nCoV-2019_98_LEFT	nCoV-2019_2	+
+NC_045512.2	29836	29866	nCoV-2019_98_RIGHT	nCoV-2019_2	-

From 1ac66a0be696627fb3193af14620d017585a94e8 Mon Sep 17 00:00:00 2001
From: Dohmen <jan.dohmen@googlemail.com>
Date: Fri, 7 May 2021 15:55:45 +0200
Subject: [PATCH 3/4] add rules for variants and qc, minor changes

---
 snakefile.py | 159 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 103 insertions(+), 56 deletions(-)

diff --git a/snakefile.py b/snakefile.py
index 66a279df..39e753f6 100644
--- a/snakefile.py
+++ b/snakefile.py
@@ -29,14 +29,17 @@
 # 1. obtain things from config file
 # 2. make dummy samples and put them to tests/sample_data/reads
 # 3. align overall appearance
-READS_DIR = 'tests/sample_data/reads'
-REFERENCE_FASTA = 'tests/sample_data/NC_045512.2.fasta'
-KRAKEN_DB = 'tests/databases/kraken_db'
-KRONA_DB = 'tests/databases/krona_db'
-SAMPLE_SHEET_CSV = 'tests/sample_sheet.csv'
-
-OUTPUT_DIR = 'output'
-OUTPUT_DIR = os.path.join(os.getcwd(), OUTPUT_DIR)
+READS_DIR = os.path.join(os.getcwd(), 'tests/sample_data/reads')
+REFERENCE_FASTA = os.path.join(os.getcwd(), 'tests/sample_data/NC_045512.2.fasta')
+AMPLICONS_BED = os.path.join(os.getcwd(), 'tests/sample_data/nCoV-2019_NCref.bed')
+KRAKEN_DB = os.path.join(os.getcwd(), 'tests/databases/kraken_db')
+KRONA_DB = os.path.join(os.getcwd(), 'tests/databases/krona_db')
+SIGMUT_DB = os.path.join(os.getcwd(), 'tests/databases/sigmut_db')
+VEP_DB = os.path.join(os.getcwd(), 'tests/databases/vep_db')
+SAMPLE_SHEET_CSV = os.path.join(os.getcwd(), 'tests/sample_sheet.csv')
+
+OUTPUT_DIR = os.path.join(os.getcwd(), 'output')
+
 TRIMMED_READS_DIR = os.path.join(OUTPUT_DIR, 'trimmed_reads')
 LOG_DIR = os.path.join(OUTPUT_DIR, 'logs')
 FASTQC_DIR = os.path.join(OUTPUT_DIR, 'fastqc')
@@ -44,9 +47,10 @@
 MAPPED_READS_DIR = os.path.join(OUTPUT_DIR, 'mapped_reads')
 VARIANTS_DIR = os.path.join(OUTPUT_DIR, 'variants')
 KRAKEN_DIR = os.path.join(OUTPUT_DIR, 'kraken')
+COVERAGE_DIR = os.path.join(OUTPUT_DIR, 'coverage')
 REPORT_DIR = os.path.join(OUTPUT_DIR, 'report')
 
-SCRIPTS_DIR = 'scripts'
+SCRIPTS_DIR = os.path.join(os.getcwd(), 'scripts')
 
 
 # Load sample sheet
@@ -63,7 +67,7 @@
     'final_reports': {
         'description': "Produce a comprehensive report. This is the default target.",
         'files': (
-            expand(os.path.join(VARIANTS_DIR, '{sample}_snv.csv'), sample=SAMPLES) + 
+            expand(os.path.join(REPORT_DIR, '{sample}_qc_report.html'), sample=SAMPLES) + 
             expand(os.path.join(REPORT_DIR, '{sample}_variant_report.html'), sample=SAMPLES) + 
             expand(os.path.join(REPORT_DIR, '{sample}_kraken_report.html'), sample=SAMPLES)
         )
@@ -74,12 +78,18 @@
             expand(os.path.join(VARIANTS_DIR, '{sample}_snv.csv'), sample=SAMPLES)
         )
     },
-    'variant_report': {
-        'description': "Make variants report.",
+    'variant_reports': {
+        'description': "Make variants reports.",
         'files': (
             expand(os.path.join(REPORT_DIR, '{sample}_variant_report.html'), sample=SAMPLES)
         )
     },
+    'qc_reports': {
+        'description': "Make QC reports.",
+        'files': (
+            expand(os.path.join(REPORT_DIR, '{sample}_qc_report.html'), sample=SAMPLES)
+        )
+    },
     'kraken_reports': {
         'description': "Make Kraken reports.",
         'files': (
@@ -87,9 +97,8 @@
             expand(os.path.join(REPORT_DIR, '{sample}_krona_report.html'), sample=SAMPLES)
         )
     }
-
 }
-selected_targets = ['kraken_reports']
+selected_targets = ['variant_reports']
 OUTPUT_FILES = list(chain.from_iterable([targets[name]['files'] for name in selected_targets]))
 
 
@@ -130,9 +139,10 @@
         r1 = os.path.join(TRIMMED_READS_DIR, "{sample}_1.fastq"),
         r2 = os.path.join(TRIMMED_READS_DIR, "{sample}_2.fastq")
     params:
-        len_cutoff = int(150 * 0.8) # read length * pct_cutoff
+        len_cutoff = int(150 * 0.8), # read length * pct_cutoff
+        output = os.path.join(TRIMMED_READS_DIR, "{sample}")
     log: os.path.join(LOG_DIR, 'prinseq_{sample}.log')
-    shell: "prinseq-lite.pl -fastq {input.r1} -fastq2 {input.r2} -ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10 -out_good {TRIMMED_READS_DIR}/{wildcards.sample} -out_bad null -min_len {params.len_cutoff} >> {log} 2>&1"
+    shell: "prinseq-lite.pl -fastq {input.r1} -fastq2 {input.r2} -ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10 -out_good {params.output} -out_bad null -min_len {params.len_cutoff} >> {log} 2>&1"
 
 
 rule bwa_index:
@@ -154,10 +164,6 @@
     shell: "bwa mem -t {params.threads} {input.ref} {input.fastq} > {output} 2>> {log} 3>&2"
 
 
-# TODO: also get unaligned .bam --> so far tmp.bam is used
-# same rule to get unaligned .bam
-# probably something like below, but needs to be checked for our purpose (kraken reports)
-# "samtools view -bh -F 2 {input} > {output} 2>> {log} 3>&2"
 rule samtools_filter_aligned:
     input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.sam')
     output: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.bam')
@@ -165,27 +171,13 @@
     shell: "samtools view -bh -f 2 -F 2048 {input} > {output} 2>> {log} 3>&2"
 
 
+# TODO: check command to get unaligned ones
 rule samtools_filter_unaligned:
     input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.sam')
     output: os.path.join(MAPPED_READS_DIR, '{sample}_unaligned.bam')
     log: os.path.join(LOG_DIR, 'samtools_filter_unaligned_{sample}.log')
     shell: "samtools view -bh -F 2 {input} > {output} 2>> {log} 3>&2"
 
-# probably not needed --> directly converted during filtering step (-b)
-# rule samtools_sam2bam_A:
-#     input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.sam')
-#     output: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.bam')
-#     log: os.path.join(LOG_DIR, 'samtools_sam2bam_{sample}.log')
-#     shell: "samtools view -bS {input} > {output} >> {log} 2>&1"
-
-
-# # TODO: merge rule A and B and possibly? use unaligned instead of tmp
-# rule samtools_sam2bam_B:
-#     input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.sam')
-#     output: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.bam')
-#     log: os.path.join(LOG_DIR, 'samtools_sam2bam_B_{sample}.log')
-#     shell: "samtools view -bS {input} > {output} >> {log} 2>&1"
-
 
 rule samtools_sort:
     input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.bam')
@@ -205,15 +197,14 @@
     input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bam')
     output: os.path.join(FASTQC_DIR, '{sample}_aligned_sorted_fastqc.zip')
     log: os.path.join(LOG_DIR, 'fastqc_{sample}.log')
-    params:
-        out_dir = os.path.join(FASTQC_DIR)
-    shell: "fastqc -o {params.out_dir} -f bam {input} >> {log} 2>&1"
+    shell: "fastqc -o {FASTQC_DIR} -f bam {input} >> {log} 2>&1"
 
 
 # TODO: check if --call-indels is needed?
 rule lofreq:
     input:
         aligned_bam = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bam'),
+        aligned_bai = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bai'),
         ref = REFERENCE_FASTA
     output: os.path.join(VARIANTS_DIR, '{sample}_snv.vcf')
     log: os.path.join(LOG_DIR, 'lofreq_{sample}.log')
@@ -223,26 +214,40 @@
 rule vcf2csv:
     input: os.path.join(VARIANTS_DIR, '{sample}_snv.vcf')
     output: os.path.join(VARIANTS_DIR, '{sample}_snv.csv')
+    params:
+        script = os.path.join(SCRIPTS_DIR, 'vcfTocsv.py')
     log: os.path.join(LOG_DIR, 'vcf2csv_{sample}.log')
-    shell: "python {SCRIPTS_DIR}/vcfTocsv.py {input} >> {log} 2>&1"
+    shell: "python {params.script} {input} >> {log} 2>&1"
+
+
+rule vep:
+    input: os.path.join(VARIANTS_DIR, '{sample}_snv.vcf')
+    output: os.path.join(VARIANTS_DIR, '{sample}_vep_sarscov2.txt')
+    params:
+        species = "sars_cov_2"
+    log: os.path.join(LOG_DIR, 'vep_{sample}.log')
+    shell: "vep --verbose --offline --dir_cache {VEP_DB} --DB_VERSION 101 --appris --biotype --buffer_size 5000 --check_existing --distance 5000 --mane --protein --species {params.species} --symbol --transcript_version --tsl --input_file {input} --output_file {output} >> {log} 2>&1"
 
 
-# TODO: add vep rule
-# rule vep:
-#     input: 
-#     output: 
-#     log: 
-#     shell: ""
+rule parse_vep:
+    input: os.path.join(VARIANTS_DIR, '{sample}_vep_sarscov2.txt')
+    output: os.path.join(VARIANTS_DIR, '{sample}_vep_sarscov2_parsed.txt')
+    params:
+        script = os.path.join(SCRIPTS_DIR, 'parse_vep.py')
+    log: os.path.join(LOG_DIR, 'parse_vep_{sample}.log')
+    shell: "python {params.script} {VARIANTS_DIR} {input} {output} >> {log} 2>&1"
 
 
-# TODO: fill in blanks, why sample_dir? where are sigmuts coming from?
-# rule variant_report:
-#     input:
-#         vep_txt = os.path.join(VARIANTS_DIR, '{sample}_vep.txt'),
-#         snv_csv = os.path.join(VARIANTS_DIR, '{sample}_snv.csv')
-#     output: os.path.join(REPORT_DIR, '{sample}_variant_report.html')
-#     log: os.path.join(LOG_DIR, 'vcf2csv_{sample}.log')
-#     shell: "Rscript {SCRIPTS_DIR}/run_variant_report.R --reportFile={SCRIPTS_DIR}/variantreport_p_sample.rmd --vep_txt_file={input.vep_txt} --snv_csv_file={input.snv_csv} --location_sigmuts=??? --sample_dir=??? --sample_name={wildcards.sample} >> {log} 2>&1"
+rule variant_report:
+    input:
+        vep_txt = os.path.join(VARIANTS_DIR, '{sample}_vep_sarscov2_parsed.txt'),
+        snv_csv = os.path.join(VARIANTS_DIR, '{sample}_snv.csv')
+    output: os.path.join(REPORT_DIR, '{sample}_variant_report.html')
+    params:
+        run_report = os.path.join(SCRIPTS_DIR, 'run_variant_report.R'),
+        report_rmd = os.path.join(SCRIPTS_DIR, 'variant_report.Rmd')
+    log: os.path.join(LOG_DIR, 'variant_report_{sample}.log')
+    shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --vep_txt_file={input.vep_txt} --snv_csv_file={input.snv_csv} --location_sigmuts={SIGMUT_DB} --sample_dir={VARIANTS_DIR} --sample_name={wildcards.sample} --outFile={output} >> {log} 2>&1"
 
 
 rule bam2fastq:
@@ -266,10 +271,9 @@
     output: os.path.join(REPORT_DIR, '{sample}_kraken_report.html')
     params:
         run_report = os.path.join(SCRIPTS_DIR, 'run_kraken_report.R'),
-        report_rmd = os.path.join(SCRIPTS_DIR, 'kraken_report.Rmd'),
-        kraken_output = os.path.join(KRAKEN_DIR, '{sample}_classified_unaligned_reads.txt')
+        report_rmd = os.path.join(SCRIPTS_DIR, 'kraken_report.Rmd')
     log: os.path.join(LOG_DIR, 'kraken_report_{sample}.log')
-    shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --kraken_output={params.kraken_output} --sample_name={wildcards.sample} --output_file={output} >> {log} 2>&1"
+    shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --kraken_output={input} --sample_name={wildcards.sample} --output_file={output} >> {log} 2>&1"
 
 
 rule krona_report:
@@ -279,3 +283,46 @@
     output: os.path.join(REPORT_DIR, '{sample}_krona_report.html')
     log: os.path.join(LOG_DIR, 'krona_report_{sample}.log')
     shell: "ktImportTaxonomy -m 3 -t 5 {input.kraken_output} -tax {input.database} -o {output} >> {log} 2>&1"
+
+
+# TODO: does it have to be the 'unsorted' bam file? If so, we have to do indexing on those, too
+rule samtools_bedcov:
+    input:
+        amplicons_bed = AMPLICONS_BED,
+        aligned_bam = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bam'),
+        aligned_bai = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bai')
+    output: os.path.join(COVERAGE_DIR, '{sample}_amplicons.csv')
+    log: os.path.join(LOG_DIR, 'samtools_bedcov_{sample}.log')
+    shell:"samtools bedcov {input.amplicons_bed} {input.aligned_bam} > {output} 2>> {log} 3>&2"
+
+
+# TODO: does it have to be the 'unsorted' bam file? If so, we have to do indexing on those, too
+rule samtools_coverage:
+    input:
+        aligned_bam = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bam'),
+        aligned_bai = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bai')
+    output: os.path.join(COVERAGE_DIR, '{sample}_coverage.csv')
+    log: os.path.join(LOG_DIR, 'samtools_coverage_{sample}.log')
+    shell: "samtools coverage {input.aligned_bam} > {output} 2>> {log} 3>&2"
+
+
+rule get_qc_table:
+    input:
+        coverage_csv = os.path.join(COVERAGE_DIR, '{sample}_coverage.csv'),
+        amplicon_csv = os.path.join(COVERAGE_DIR, '{sample}_amplicons.csv')
+    output: os.path.join(COVERAGE_DIR, '{sample}_merged_covs.csv')
+    params:
+        script = os.path.join(SCRIPTS_DIR, 'get_qc_table.py')
+    log: os.path.join(LOG_DIR, 'get_qc_table_{sample}.log')
+    shell: "python {params.script} {input.coverage_csv} {input.amplicon_csv} {output} >> {log} 2>&1"
+        
+
+# TODO: fix Error: unexpected end of input
+# rule qc_report:
+#     input: os.path.join(COVERAGE_DIR, '{sample}_merged_covs.csv')
+#     output: os.path.join(REPORT_DIR, '{sample}_qc_report.html')
+#     params:
+#         run_report = os.path.join(SCRIPTS_DIR, 'run_qc_report.R'),
+#         report_rmd = os.path.join(SCRIPTS_DIR, 'qc_report.Rmd')
+#     log: os.path.join(LOG_DIR, 'qc_report_{sample}.log')
+#     shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --coverage_file={input} --sample_name={wildcards.sample} --outFile={output} >> {log} 2>&1"
\ No newline at end of file

From 4390e17f3487c7e1bc270b548e5f028e35ebd06f Mon Sep 17 00:00:00 2001
From: Dohmen <jan.dohmen@googlemail.com>
Date: Fri, 7 May 2021 15:53:35 +0200
Subject: [PATCH 4/4] focus on relevant scripts, add sigmut input

---
 .../{kraken_report.Rmd => Kraken2-report.Rmd} |   0
 scripts/ensemble_vep.py                       | 103 -------------
 scripts/get_qc_table.py                       |  86 +++++++++++
 scripts/parse_vep.py                          |  50 ++++++
 scripts/prep_sample_space_210320.py           |  21 ---
 scripts/qc_report_per_sample.rmd              |  89 +++++++++++
 ...n_kraken_report.R => run_Kraken2-report.R} |   0
 scripts/run_QC_report.R                       |  81 ++++++++++
 scripts/run_variant_report.R                  |   9 +-
 scripts/variantreport_p_sample.rmd            | 143 +++++++-----------
 snakefile.py                                  |  12 +-
 ...akinfo_mutation_report_data_2021-03-06.tsv |  22 +++
 ...akinfo_mutation_report_data_2021-03-06.tsv |   9 ++
 ...akinfo_mutation_report_data_2021-03-06.tsv |   9 ++
 ...akinfo_mutation_report_data_2021-03-06.tsv |  11 ++
 ...akinfo_mutation_report_data_2021-03-06.tsv |  11 ++
 ...akinfo_mutation_report_data_2021-03-06.tsv |  23 +++
 17 files changed, 458 insertions(+), 221 deletions(-)
 rename scripts/{kraken_report.Rmd => Kraken2-report.Rmd} (100%)
 delete mode 100644 scripts/ensemble_vep.py
 create mode 100644 scripts/get_qc_table.py
 create mode 100644 scripts/parse_vep.py
 delete mode 100644 scripts/prep_sample_space_210320.py
 create mode 100644 scripts/qc_report_per_sample.rmd
 rename scripts/{run_kraken_report.R => run_Kraken2-report.R} (100%)
 create mode 100644 scripts/run_QC_report.R
 create mode 100644 tests/databases/sigmut_db/B117_outbreakinfo_mutation_report_data_2021-03-06.tsv
 create mode 100644 tests/databases/sigmut_db/B1351_outbreakinfo_mutation_report_data_2021-03-06.tsv
 create mode 100644 tests/databases/sigmut_db/B1427_outbreakinfo_mutation_report_data_2021-03-06.tsv
 create mode 100644 tests/databases/sigmut_db/B1429_outbreakinfo_mutation_report_data_2021-03-06.tsv
 create mode 100644 tests/databases/sigmut_db/B1526_outbreakinfo_mutation_report_data_2021-03-06.tsv
 create mode 100644 tests/databases/sigmut_db/P1_outbreakinfo_mutation_report_data_2021-03-06.tsv

diff --git a/scripts/kraken_report.Rmd b/scripts/Kraken2-report.Rmd
similarity index 100%
rename from scripts/kraken_report.Rmd
rename to scripts/Kraken2-report.Rmd
diff --git a/scripts/ensemble_vep.py b/scripts/ensemble_vep.py
deleted file mode 100644
index ec271766..00000000
--- a/scripts/ensemble_vep.py
+++ /dev/null
@@ -1,103 +0,0 @@
-
-# 2021/04/28
-# Vic-Fabienne Schumann
-# dependencies: ensemble-vep
-
-# this script contains the functions and commands to run the guix ensemble-vep for sars-cov-2 and parse it's output
-# so it can directly used for the variant reports
-
-# check if database is downloaded and located in ~/.vep
-# TODO: or does this will be handled by snakemake?
-# NOTE: Input file should be given with the full path, otherwise I can't guarantee that it works.^^
-# NOTE: parse_vep_cli_output() only works if the vep_report exists already. 
-#       But in theory the other functions are executed earlier anyways
-
-# !!!!
-# prepare vep data base by downloading the sars-cov-2 assembly and annotation: 
-# bash: "wget ftp://ftp.ensemblgenomes.org/pub/viruses/variation/indexed_vep_cache/sars_cov_2_vep_101_ASM985889v3.tar.gz"
-# bash: "mv sars_cov_2*.tar.gz ~/.vep && cd ~/.vep && tar xf sars_cov_2*.tar.gz"
-
-import os
-import sys
-import re
-import subprocess
-
-def check_vep_dir(vep_report_txt): 
-    
-    vep_dir = os.path.dirname(vep_report_txt)
-    if not os.path.isdir(vep_dir): 
-        
-        # print("mkdir -v {}".format(vep_dir))
-       subprocess.call("mkdir -v {}".format(vep_dir), shell=True)
-
-def run_vep(vep_report, snv_file):
-    
-     if not os.path.isfile(vep_report):
-        
-        # print("vep --verbose --offline --DB_VERSION 101 --appris --biotype --buffer_size 5000 --check_existing "
-        #                 "--distance 5000 --mane --protein --species sars_cov_2 --symbol --transcript_version --tsl "
-        #                 "--input_file {} --output_file {}".format(snv_file,vep_report))
-        
-        subprocess.call("vep --verbose --offline --DB_VERSION 101 --appris --biotype --buffer_size 5000 --check_existing "
-                        "--distance 5000 --mane --protein --species sars_cov_2 --symbol --transcript_version --tsl "
-                        "--input_file {} --output_file {}".format(snv_file,vep_report_txt))
-    
-
-def prepare_vep_output(vep_report_txt):
-    
-    ''' checks if the first line of the vep output is the correct headerline, will parse the file
-    accordingly if not '''
-    
-    filename = os.path.basename(vep_report_txt)
-    
-    with open(vep_report_txt, 'r') as vep_report:
-        # check if it starts with "Uploaded_variation" in the first line 
-        if not re.match("^Uploaded_variation", vep_report.readline()):
-            print("some parsing to do")
-            parse_vep_cli_output(vep_report_txt)
-            
-            # remove old file and rename new file with that name
-            # print(f"rm -v {vep_report_txt} && mv -v tmp.txt {filename}")
-            subprocess.call(f"rm {vep_report_txt} && mv tmp.txt {filename}")
-        else:
-            print("fine to go")
-            
-def parse_vep_cli_output(vep_report):
-    
-    ''' takes the txt output of ensemble vep cli as input and removes all info lines and lines starting with # so it can be 
-    read in as a table with correct header by R '''
-    
-    vep_dir = os.path.dirname(os.path.abspath(vep_report))
-    print(f"operating in dir: {vep_dir}")
-    header_line = None
-    
-    with open(vep_report, 'r') as file:
-        with open(os.path.join(vep_dir,"tmp.txt"), 'w+') as clean_file:
-            for num, line in enumerate(file,1):
-                if re.match("#Uploaded_variation",line):
-                    header_line = num
-                    clean_file.write(line[1:])
-                    break
-            if header_line is not None:
-                for num, line in enumerate(file,header_line):
-                    clean_file.write(line)  
-            else:
-                raise Exception("file has a format or does not has to be parsed anymore")
-
-def main():
-    
-    vep_report_txt = sys.argv[1]
-    snv_file = sys.argv[2]
-    
-    # check if there is a vep dir in the sample dir, or make one
-    check_vep_dir(vep_report_txt)
-     
-    # check if there is a vep file, if not run ensemble vep
-    run_vep(vep_report_txt,snv_file)
-   
-    # read file and check if it looks ok or not
-    prepare_vep_output(vep_report_txt)
-    
-if __name__ == '__main__':
-  main()
-
diff --git a/scripts/get_qc_table.py b/scripts/get_qc_table.py
new file mode 100644
index 00000000..038b8d9b
--- /dev/null
+++ b/scripts/get_qc_table.py
@@ -0,0 +1,86 @@
+import sys
+import re
+import csv
+
+
+def parsing_cov(coverage_file):
+    with open(coverage_file, "r") as cov_file:
+        reader = csv.reader(cov_file, delimiter="\t")
+
+        for row in reader:
+            for element in row:
+                if not re.match("#", element):
+                    aligned_reads = row[3]
+                    total_cov = row[5]
+                    mean_depth = row[6]
+        return (aligned_reads, total_cov, mean_depth)
+
+
+def parsing_amplicon(amplicon_file):
+    with open(amplicon_file, "r") as amp_file:
+        reader = csv.reader(amp_file, delimiter="\t")
+
+        num_covered_amplicons = 0
+        covered_amplicon = 0
+        drop_out_amplicons = []
+        full_drop_out = []
+        half_covered = []
+
+        for row in reader:
+
+            amplicon = row[3].split("_")[1]
+
+            if amplicon != str(covered_amplicon):
+                covered_amplicon = amplicon
+                num_covered_amplicons += 1
+
+            if int(row[6]) == 0:
+                drop_out_amplicons.append(amplicon)
+
+        if len(drop_out_amplicons) != 0:
+            for ampli in drop_out_amplicons:
+                cnt = drop_out_amplicons.count(ampli)
+                if cnt == 2:
+                    full_drop_out.append(ampli)
+                if cnt == 1:
+                    half_covered.append(ampli)
+
+    full_drop_out = list(set(full_drop_out))
+    fully_covered_amplicons = int(num_covered_amplicons) - len(drop_out_amplicons)
+
+    return (fully_covered_amplicons, half_covered, full_drop_out)
+
+
+def make_report_input_csv(coverage_file, amplicon_file, output_file):
+    coverage_info = parsing_cov(coverage_file)
+    amplicon_info = parsing_amplicon(amplicon_file)
+    print(coverage_info)
+    print(amplicon_info)
+
+    header = [
+        "Total number of amplicons fully covered",
+        "Amplicons partially covered",
+        "Drop-out amplicons",
+        "Total number aligned reads",
+        "Coverage",
+        "Mean depth",
+    ]
+    fields = [
+        amplicon_info[0],
+        amplicon_info[1],
+        amplicon_info[2],
+        coverage_info[0],
+        coverage_info[1],
+        coverage_info[2],
+    ]
+
+    with open(output_file, "a") as f:
+        writer = csv.writer(f, delimiter="\t")
+        writer.writerow(header)
+        writer.writerow(fields)
+
+if __name__ == '__main__':
+    coverage_file = sys.argv[1]
+    amplicon_file = sys.argv[2]
+    output = sys.argv[3]
+    make_report_input_csv(coverage_file, amplicon_file, output)
\ No newline at end of file
diff --git a/scripts/parse_vep.py b/scripts/parse_vep.py
new file mode 100644
index 00000000..f45d69f4
--- /dev/null
+++ b/scripts/parse_vep.py
@@ -0,0 +1,50 @@
+import re
+import os
+import csv
+import sys
+
+def parse_vep(vep_dir, input, output):
+    header_line = None
+
+    with open(input, 'r') as file:
+        if header_line:
+            raise Exception("file has not the correct format or does not has to be parsed anymore")
+
+        with open(os.path.join(vep_dir, "tmp.txt"), 'w+') as clean_file:
+            for num, line in enumerate(file, 1):
+                if re.match("#Uploaded_variation", line):
+                    header_line = num
+                    clean_file.write(line[1:])
+                    break
+            for num, line in enumerate(file, header_line):
+                clean_file.write(line)
+
+    with open(os.path.join(vep_dir,'tmp.txt'), 'r') as file:
+        with open(output, 'w') as output:
+            reader = csv.reader(file, delimiter="\t")
+            writer = csv.writer(output, lineterminator='\n')
+
+            # make new column SYMBOL for gene names 
+            # fixme: This is redundant with the making of the header line above, but works for now
+            gene_column = []
+            header = next(reader)
+            header.append("SYMBOL")
+            gene_column.append(header)
+
+            for row in reader:  # fixme: I'm not sure if the following construct is ugly or not
+                # get gene name 
+                extra = row[13].split(";")
+                for element in extra:
+                    if re.search(r"SYMBOL=", element):
+                        gene = element.split("=")[1:][0]
+                        row.append(gene)
+                        gene_column.append(row)
+
+            writer.writerows(gene_column)
+    # TODO: add step to clean "tmp.txt" 
+
+if __name__ == '__main__':
+    vep_dir = sys.argv[1]
+    input = sys.argv[2]
+    output = sys.argv[3]
+    parse_vep(vep_dir, input, output)
\ No newline at end of file
diff --git a/scripts/prep_sample_space_210320.py b/scripts/prep_sample_space_210320.py
deleted file mode 100644
index 5921dbbd..00000000
--- a/scripts/prep_sample_space_210320.py
+++ /dev/null
@@ -1,21 +0,0 @@
-## 21/04/20 - writing a sample space script 
-
-import subprocess
-import os
-
-datadir = "/local/Projects/AAkalin_pathogenomics/data/wastewater_all"
-sampledir = "/home/vschuma/pathogenomics/vpipe/akalinlab_pathogenomics/samples/wastewater_all"
-
-#########
-for file in datadir:
-    samplename = os.path.basename(file).split("_R")[0]
-    filename_clean = file.split("_001")
-    filename_clean = ''.join(filename_clean)
-
-    # make dir
-    dir = ''.join([samplename, '/', 'raw_data'])
-    subprocess.call('mkdir -vp {}'.format(dir), shell=True)
-
-    # copy files from local to sample dirs
-    clean_file_sample_dir = ''.join([dir, '/', os.path.basename(filename_clean)])
-    subprocess.call('cp -v {} {}'.format(file, clean_file_sample_dir), shell=True)
diff --git a/scripts/qc_report_per_sample.rmd b/scripts/qc_report_per_sample.rmd
new file mode 100644
index 00000000..a9fd4745
--- /dev/null
+++ b/scripts/qc_report_per_sample.rmd
@@ -0,0 +1,89 @@
+---
+title: "SARS-CoV-2 QC report"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
+date: '`r format(as.POSIXct(if ("" != Sys.getenv("SOURCE_DATE_EPOCH")) { as.numeric(Sys.getenv("SOURCE_DATE_EPOCH")) } else { Sys.time() }, origin="1970-01-01"), "%Y-%m-%d %H:%M:%S")`'
+params:
+    coverage_file: ''
+    sample_name: ''
+theme: darkly
+---
+
+```{r setup, include=FALSE, message = FALSE, warning = FALSE}
+knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
+knitr::opts_knit$set(root.dir = params$workdir)
+library(DT)
+library(ggplot2)
+library(dplyr)
+library(tidyr)
+library(qpcR)
+library(stringr)
+library(magrittr)
+library(openssl)
+```
+# Input Settings
+```{r InputSettings, echo = FALSE, include = FALSE}
+coverage_file <- params$coverage_file
+sample_name <- params$sample_name
+``` 
+```{r printInputSettings, echo = FALSE, include = FALSE}
+# coverage_file <- '/home/vfs/PycharmProjects/Akalinlab_pathogenomics/PiGx_WW/tests/coverage/Test_merged_covs.csv'
+# sample_name <- "Test"
+``` 
+### Sample: `r sample_name` 
+```{r reading data, include = FALSE}
+coverage.df <- read.table(coverage_file,sep = "\t", header = T) 
+coverage.df <- na_if(coverage.df, "[]")
+coverage.df <- gsub('\\[','',coverage.df)
+coverage.df <- gsub('\\]','',coverage.df)
+# fixme: this can be sure done in a nicer and more compact way
+```
+## Amplicon coverage  
+The information about the amplicon primer was taken from the [ARTIC bioinformatics platform](https://github.com/joshquick/artic-ncov2019/blob/master/primer_schemes/nCoV-2019/V3/nCoV-2019.bed).
+The reference name "MN908947.3" was changed to "NC_045512.2". Both are describing the same reference genome for 
+[SARS-CoV-2](https://www.ncbi.nlm.nih.gov/nuccore/1798174254). 
+ 
+```{r, message=F, warning=F, echo = FALSE}
+
+AmpliconCoverage <- data.frame("Number of amplicon fully covered" = paste0(coverage.df[1],' out of 98'), 
+                  "Anmplicons partially covered" = coverage.df[2],
+                  "Drop-out amplicons" = coverage.df[3]) # maybe more clear title stating which coverage
+
+DT::datatable(t(AmpliconCoverage), options = list(searching = FALSE, paging = FALSE))
+
+```
+
+## Coverage
+
+```{r, message=F, warning=F, echo = FALSE}
+
+Coverage <- data.frame("Total coverage" = coverage.df[4], 
+                  "Aligned reads" = paste(round(as.integer(coverage.df[5]),1),"%"),
+                  "Mean depth" = round(as.integer(coverage.df[6]))) # maybe more clear title stating which coverage
+
+DT::datatable(t(Coverage), options = list(searching = FALSE, paging = FALSE))
+
+```
+## FastQC and Trimming
+
+Download: [fastqc report of unprocessed data](/path/to/fastqc.html)  
+Download: [fastqc report of pre-processes data](/path/to/fastqc.html) 
+
+
+
+The reads were pre-processed as followed:  
+
+|      |     |  
+|:-----|:----|  
+|trim length cutoff prinseq |0.2|
+
+## Download Stats csv
+```{r, include = F}
+ 
+write.csv2(coverage.df, paste0(dirname(coverage_file),"/report_download_coverage.csv")) # not sure about the funct of the dataframe here, but it works so I leave it for now
+
+readLines(coverage_file) %>% 
+  paste0(collapse="\n") %>% 
+  openssl::base64_encode() -> coverage
+```
+Coverage statistics:  
+[Download Coverage_stats.csv](`r sprintf('data:text/csv;base64,%s', coverage)`)
+
diff --git a/scripts/run_kraken_report.R b/scripts/run_Kraken2-report.R
similarity index 100%
rename from scripts/run_kraken_report.R
rename to scripts/run_Kraken2-report.R
diff --git a/scripts/run_QC_report.R b/scripts/run_QC_report.R
new file mode 100644
index 00000000..a143444e
--- /dev/null
+++ b/scripts/run_QC_report.R
@@ -0,0 +1,81 @@
+# Title     : runVariantReport_SarsCov2
+# Objective : run variant reports
+# Created by: vfs
+# Based on the runDeseqReport.R from Bora Uyar
+# Created on: 29.04.21
+
+# files should be given with full pathes?
+# ! signature mutation sources are hardcoded
+
+runReport <- function(reportFile,
+                      coverage_file,
+                      sample_name,
+                      outFile,
+                      # logo,
+                      selfContained = TRUE, 
+                      quiet = FALSE) {
+  
+  outFile <- outFile
+  
+ # htmlwidgets::setWidgetIdSeed(1234)
+  rmarkdown::render(
+    input = reportFile,
+  #  intermediates_dir = file.path(workdir, prefix),
+    clean = TRUE,
+    output_file = outFile,
+    output_format = rmarkdown::html_document(
+      code_folding = 'hide', 
+      depth = 2,
+      toc = TRUE,
+      toc_float = TRUE,
+      theme = 'lumen',
+      number_sections = TRUE
+    ),
+    output_options = list(self_contained = selfContained),
+    params = list(coverage_file = coverage_file,
+                  outFile = outFile,
+                  sample_name = sample_name
+               #  logo = logo, 
+                   ),
+    quiet = quiet
+  )
+  
+#   if(dir.exists(file.path(sample_dir, sample_name))) {
+#   unlink(file.path(sample_dir, sample_name), recursive = TRUE)
+#   }
+# }
+
+#1. Collect arguments
+args <- commandArgs(TRUE)
+
+cat("arguments:", args,"\n")
+
+## Parse arguments (we expect the form --arg=value)
+#parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
+parseArgs <- function(x) {
+  myArgs <- unlist(strsplit(x, "--"))
+  myArgs <- myArgs[myArgs != '']
+  #when no values are provided for the argument
+  myArgs <- gsub(pattern = "=$", replacement = "= ", x = myArgs)
+  myArgs <- as.data.frame(do.call(rbind, strsplit(myArgs, "=")))
+  myArgs$V2 <- gsub(' ', '', myArgs$V2)
+  return(myArgs)
+}
+
+#print(class(myArgs <- unlist(strsplit(args, "--"))))
+
+argsDF <- parseArgs(args)
+argsL <- as.list(as.character(argsDF$V2))
+names(argsL) <- argsDF$V1
+
+reportFile <- argsL$reportFile
+coverage_file <- argsL$coverage_file
+outFile <- argsL$outFile
+sample_name <- argsL$sample_name
+
+runReport(reportFile = reportFile,
+          coverage_file = coverage_file,
+          sample_name = sample_name,
+          outFile = outFile,
+          # logo = logo,
+          selfContained = selfContained)
\ No newline at end of file
diff --git a/scripts/run_variant_report.R b/scripts/run_variant_report.R
index fa36b2d7..a5713141 100644
--- a/scripts/run_variant_report.R
+++ b/scripts/run_variant_report.R
@@ -13,16 +13,18 @@ runReport <- function(reportFile,
                       location_sigmuts,
                       sample_dir,
                       sample_name,
+                      outFile,
                       # logo,
                       selfContained = TRUE, 
                       quiet = FALSE) {
   
-  outFile <- paste0(sample_dir,'/variant_report_',sample_name,'.html')
+  # outFile <- outFile
+  output_dir <- dirname(outFile)
   
  # htmlwidgets::setWidgetIdSeed(1234)
   rmarkdown::render(
     input = reportFile, 
-    output_dir = sample_dir,
+    output_dir = output_dir,
   #  intermediates_dir = file.path(workdir, prefix),
     clean = TRUE,
     output_file = outFile,
@@ -39,6 +41,7 @@ runReport <- function(reportFile,
                   snv_csv_file = snv_csv_file,
                   location_sigmuts = location_sigmuts,
                   sample_dir = sample_dir,
+                  outFile = outFile,
                   sample_name = sample_name
                #  logo = logo, 
                    ),
@@ -78,6 +81,7 @@ vep_txt_file <- argsL$vep_txt_file
 snv_csv_file <- argsL$snv_csv_file
 location_sigmuts <- argsL$location_sigmuts
 sample_dir <- argsL$sample_dir
+outFile <- argsL$outFile
 sample_name <- argsL$sample_name
 
 runReport(reportFile = reportFile,
@@ -86,5 +90,6 @@ runReport(reportFile = reportFile,
           location_sigmuts = location_sigmuts, 
           sample_dir = sample_dir,
           sample_name = sample_name,
+          outFile = outFile,
           # logo = logo,
           selfContained = selfContained)
\ No newline at end of file
diff --git a/scripts/variantreport_p_sample.rmd b/scripts/variantreport_p_sample.rmd
index 4233c000..1f76420f 100644
--- a/scripts/variantreport_p_sample.rmd
+++ b/scripts/variantreport_p_sample.rmd
@@ -8,8 +8,9 @@ params:
   location_sigmuts: ''
   sample_dir: ''
   sample_name: ''
+  outFile: ''
 ---
-```{r setup, include=FALSE}
+```{r setup, include=FALSE, messages = FALSE, warnings = FALSE}
 knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
 knitr::opts_knit$set(root.dir = params$workdir)
 library(DT)
@@ -28,14 +29,16 @@ snvfile <- params$snv_csv_file
 sampledir <- params$sample_dir
 location_sigmuts <- params$location_sigmuts
 sample_name <- params$sample_name
+outFile <- params$outFile
 ``` 
 ### Sample: `r sample_name` 
 
 ```{r test_files, include = FALSE}
 # location_sigmuts <- '/mnt/beast/pathogenomics/vpipe/akalinlab_pathogenomics/vep_mutation_matching/signature_mutations'
-# sampledir <- '/mnt/beast/pathogenomics/vpipe/akalinlab_pathogenomics/samples/wastewater_clock_time/WW_210325_KW2_22_24'
-# vepfile <- '/mnt/beast/pathogenomics/vpipe/akalinlab_pathogenomics/samples/wastewater_clock_time/WW_210325_KW2_22_24/vep_reports/vep_sarscov2_WW_210325_KW2_22_24.txt' 
-# snvfile <- '/mnt/beast/pathogenomics/vpipe/akalinlab_pathogenomics/samples/wastewater_clock_time/WW_210325_KW2_22_24/variants/SNVs/snvs.csv'
+# sampledir <- '/mnt/beast/pathogenomics/vpipe/akalinlab_pathogenomics/samples/wastewater_all/Test'
+# # vepfile <- '/mnt/beast/pathogenomics/pigx_sarscov2_ww/output/vep/Test_vep_sarscov2_reportinput.txt' 
+# vepfile <- '/mnt/beast/pathogenomics/pigx_sarscov2_ww/output/vep/report_ready.txt' 
+# snvfile <- '/mnt/beast/pathogenomics/vpipe/akalinlab_pathogenomics/samples/wastewater_all/Test/variants/SNVs/snvs.csv'
 ```
 This report shows the variant analysis of SARS-CoV-2 RNA-seq samples from wastewater samples. Variants are 
 identified by SNV calling. Translated protein mutations were matched against lists of signature mutations for known
@@ -65,11 +68,10 @@ parse_snv_csv <- function (snvfile,...){ # allele frequency from v-pipe vcf
 
 
 ```{r function_get_protein_mut, message = F, include = FALSE}
-
 get_protein_mut <- function (vepfile){ # parse together from vep output "Protein_position" and "Amino_acid"
   
   # reading in whole vep txt output
-    vepfile.df <- read.table(vepfile, sep = '\t', header = T) # you should include in the vep script to parse out the # 
+    vepfile.df <- read.table(vepfile, sep = ',', header = T) # you should include in the vep script to parse out the # 
       #in the beginning of the line or include that step here. 
     
     # parsing snv and protmut location
@@ -84,6 +86,7 @@ get_protein_mut <- function (vepfile){ # parse together from vep output "Protein
     locations <- locations %>% filter(!grepl("^-", prot_mut_loc))
     # specific B117 mutations: 21990-21993, 21764-21770, maybe also 3675-3677, 69-70 - all there
     
+    
     detectable_deletions <- function (x, colnames) { 
       
       if (x['gene_mut_loc.2'] != x['gene_mut_loc.3'] & str_detect(x['prot_mut_loc'],"-")){
@@ -116,9 +119,13 @@ get_protein_mut <- function (vepfile){ # parse together from vep output "Protein
                                 Conseq == "inframe_deletion" &
                                 str_detect(deletions.df$prot_mut_loc,"-"))
     colnames <- colnames(deletions.df)
-    deletions <- dplyr::bind_rows(apply(deletions.df,1, detectable_deletions, colnames = colnames))
-    locations <- dplyr::bind_rows(locations,deletions)
-    #locations <- dplyr::bind_rows(locations,deletions.df)
+    
+    # if there is a deletion the snv would span a couple of positions, if there is not such a spanning region there are no deletions
+    # ! 06/05/2021 Vic - I think, I don't know how robust this is, but it will work for the sig mutations we have so far
+    if (!(any(is.na(locations[,'gene_mut_loc.3'])))) {
+        deletions <- dplyr::bind_rows(apply(deletions.df,1, detectable_deletions, colnames = colnames))
+        locations <- dplyr::bind_rows(locations,deletions)
+    }
   
   # substitute "nothing" at alternative-aa-column (for deletions) with "-"
   locations$AAs.2[is.na(locations$AAs.2)] <- "-"
@@ -191,13 +198,10 @@ get_protein_mut <- function (vepfile){ # parse together from vep output "Protein
 
 
 ```{r processa_and_match_snvs_to_signature_mutations, message=F, warning=F, include = FALSE}
-
 # reading in signature data
 # it's probably not really necessary to add the pase0 column but I like to have a full overview
 # but maybe this part could be done a bt nicer more compact with e.g. looping through the file directories?
-
 ################
-
 b117 <- read.table(paste0(location_sigmuts,"/B117_outbreakinfo_mutation_report_data_2021-03-06.tsv"),
                 sep="\t",header=T)
 # modifiyng deletions, subsitute "nothing" with a - e.g: D789 -> D789-
@@ -205,57 +209,45 @@ b117$alt_aa <- na_if(b117$alt_aa, '')
 b117$alt_aa[is.na(b117$alt_aa)] <- "-"
 #pasting column with correct prot mutation notation
 b117$AA_mut <- paste0(b117$ref_aa, b117$codon_num, b117$alt_aa)
-
 #####################
-
 b1351 <- read.table(paste0(location_sigmuts,"/B1351_outbreakinfo_mutation_report_data_2021-03-06.tsv"),
                 sep="\t",header=T)
 b1351$alt_aa <- na_if(b1351$alt_aa, '')
 b1351$alt_aa[is.na(b1351$alt_aa)] <- "-"
 b1351$AA_mut <- paste0(b1351$ref_aa, b1351$codon_num, b1351$alt_aa)
-
 b1427 <- read.table(paste0(location_sigmuts,"/B1427_outbreakinfo_mutation_report_data_2021-03-06.tsv"),
                 sep="\t",header=T)
 b1427$alt_aa <- na_if(b1427$alt_aa, '')
 b1427$alt_aa[is.na(b1427$alt_aa)] <- "-"
 b1427$AA_mut <- paste0(b1427$ref_aa, b1427$codon_num, b1427$alt_aa)
-
 b1429 <- read.table(paste0(location_sigmuts,"/B1429_outbreakinfo_mutation_report_data_2021-03-06.tsv"),
                 sep="\t",header=T)
 b1429$alt_aa <- na_if(b1429$alt_aa, '')
 b1429$alt_aa[is.na(b1429$alt_aa)] <- "-"
 b1429$AA_mut <- paste0(b1429$ref_aa, b1429$codon_num, b1429$alt_aa)
-
 b1526 <- read.table(paste0(location_sigmuts,"/B1526_outbreakinfo_mutation_report_data_2021-03-06.tsv"),
                 sep="\t",header=T)
 b1526$alt_aa <- na_if(b1526$alt_aa, '')
 b1526$alt_aa[is.na(b1526$alt_aa)] <- "-"
 b1526$AA_mut <- paste0(b1526$ref_aa, b1526$codon_num, b1526$alt_aa)
-
 p1 <- read.table(paste0(location_sigmuts,"/P1_outbreakinfo_mutation_report_data_2021-03-06.tsv"),
                 sep="\t",header=T)
 p1$alt_aa <- na_if(p1$alt_aa, '')
 p1$alt_aa[is.na(p1$alt_aa)] <- "-"
 p1$AA_mut <- paste0(p1$ref_aa, p1$codon_num, p1$alt_aa)
-
 ################
-
-vepfile.df <- read.table(vepfile,sep = "\t", header = T) #TODO - in na's umwandeln
+vepfile.df <- read.table(vepfile,sep = ",", header = T) #TODO - in na's umwandeln
 vepfile.df <- na_if(vepfile.df, "-")
-
 sig_mutations.df <- as.data.frame(qpcR:::cbind.na(b117=b117$AA_mut, b1351=b1351$AA_mut,
                                                   b1427=b1427$AA_mut, b1429=b1429$AA_mut,
                                                   b1526=b1526$AA_mut, p1=p1$AA_mut))
 sig_mutations.df <- sig_mutations.df %>%
                           tidyr::pivot_longer(everything()) %>%
                             tidyr::drop_na(.,value)
-
 variant_protein_mut <- get_protein_mut(vepfile)
 variant_protein_mut <- dplyr::left_join(variant_protein_mut, sig_mutations.df, by = c('AA_mut'='value'))
-
 match.df <- variant_protein_mut %>%
                     filter(!is.na(name))
-
 nomatch.df <- variant_protein_mut %>%
                         filter(is.na(name))
 ```
@@ -266,17 +258,13 @@ nomatch.df <- variant_protein_mut %>%
 lofreq.info <- as_data_frame(parse_snv_csv(snvfile))
 # vep.info <- write the part that returns the variant_protein_mut into a function 
 vep.info <- variant_protein_mut
-
 # add freqency and coverage information to protein mutations
 #variant_protein_mut <- dplyr::left_join(variant_protein_mut, sig_mutations.df, by = c('AA_mut'='value'))
 complete.df <- dplyr::left_join(lofreq.info, vep.info, by = c('gene_pos'='gene_mut_loc.2'), copy = T)
-
 match.df <- complete.df %>%
                     filter(!is.na(name))
-
 nomatch.df <- complete.df %>%
                         filter(is.na(name))
-
 ```
 ## Mutations matching variants of concern
 
@@ -289,14 +277,12 @@ Table 2 shows all other mutations without any match.
 cat("\n Table 1: SNVs from sample matching signature mutations")
 ```
 ```{r, message=F, warning=F, echo = FALSE, results = 'asis'}
-
 SignatureTable <-  data.frame("SNV" = match.df$gene_mut, 
                   "gene" = match.df$genes,
                   "protein_mutation" = match.df$AA_mut,
                   "variant" = match.df$name, # maybe more nice writing of it e.g B1.1.7            
                   "frequency" = match.df$freq, # maybe more clear name stating which frequency
                   "coverage" =  match.df$cov) # maybe more clear title stating which coverage 
-
 DT::datatable(SignatureTable,
               extensions = c('FixedColumns', 'Scroller'),
               options = list(fixedColumns = TRUE, 
@@ -315,7 +301,6 @@ NoSignatureTable <-  data.frame("SNV" = nomatch.df$gene_mut,
                   "protein_mutation" = nomatch.df$AA_mut,
                   "frequency" = nomatch.df$freq, # maybe more clear name stating which frequency
                   "coverage" =  nomatch.df$cov) # maybe more clear title stating which coverage
-
 DT::datatable(NoSignatureTable,
               extensions = c('FixedColumns', 'Scroller'),
               options = list(fixedColumns = TRUE, 
@@ -330,44 +315,35 @@ DT::datatable(NoSignatureTable,
 
 ```{r deconvolution, message=F, warning=F, include = FALSE}
 # markdown text:
-## Deconvolution
-### Deconvolution  
+# # Deconvolution
+# ## Deconvolution  
 # Deconvolution is done by using robust regression. The resulting coefficients are normalized to get the percentage of 
 # each present variant. 
-
+# 
 # prepare unique signature mutation dataframe
-
+# 
 # sig_mutations_unique.df <- sig_mutations.df 
 # unique_stuff <- unique(sig_mutations.df$value)
 # sig_mutations_unique.df <- subset(sig_mutations.df, !duplicated(value))
 # sig_mutations_unique.df <- sig_mutations_unique.df %>%
 #                                 tidyr::pivot_wider(values_fn = unlist)
-
+# 
 # 1. count occurences of mutation
-
 ```
 ```{r deconvolution_prep, message=F, warning=F, include = FALSE}
-
 # filter match table for unique mutations
-match_unique.df <- dplyr::distinct(match.df[,-13])
-
+match_unique.df <- unique(dplyr::distinct(match.df[,-13]))
 # get prot.mutations only and add "WT" 
 muts_debug <- c("WT",match_unique.df$AA_mut)
-
 # create an empty data frame
 # !!!!!!!!
 # in theory this df has to have a column for every variant in the beginning -> can we have a "variant-sheet" in the future?
 # there have to be a check if two or more columns are the same and either have to be removed or merged
 # in case we decide for only using unique signature mutations we'll probably don't have this problem anymore 
 # I think I'll go for the "we only look for unique mutations anyway" way.
-
-
-
  msig_debug <- data.frame(muts_debug,WT=0,b117=0,b1351=0,b1427=0,b1429=0,b1526=0,p1=0)
 # !! dirty hack !!
-
 # make sigmuts per variant tables new with parsed mutations
-
 msig_debug[msig_debug$muts %in% b117$AA_mut,"b117"]=1
 msig_debug[msig_debug$muts %in% b1351$AA_mut,"b1351"]=1
 msig_debug[msig_debug$muts %in% b1351$AA_mut,"b1427"]=1
@@ -375,20 +351,15 @@ msig_debug[msig_debug$muts %in% b1351$AA_mut,"b1429"]=1
 msig_debug[msig_debug$muts %in% b1351$AA_mut,"b1526"]=1
 msig_debug[msig_debug$muts %in% p1$AA_mut,"p1"]=1
 msig_debug[1,2] <- 1 # put the WT signature, ideally there should be more than 1 WT signature # ??
-
-
 ```
 ```{r DIRTY HACK, include = FALSE}
 msig_debug <- data.frame(muts_debug,WT=0,b117=0,b1351=0,p1=0)
 # !! dirty hack !!
-
 # make sigmuts per variant tables new with parsed mutations
-
 msig_debug[msig_debug$muts %in% b117$AA_mut,"b117"]=1
 msig_debug[msig_debug$muts %in% b1351$AA_mut,"b1351"]=1
 msig_debug[msig_debug$muts %in% p1$AA_mut,"p1"]=1
 msig_debug[1,2] <- 1 # put the WT signature, ideally there should be more than 1 WT signature # ??
-
 ```
 ```{r, message=F, warning=F, include = FALSE}
 deconv <- function (bulk,sig){
@@ -406,41 +377,37 @@ deconv <- function (bulk,sig){
   
   as.vector(rlm_coefficients)
 }
-
 ```
-```{r deconvolution, message=F, warning=F, include = FALSE}
-
-bulk_debug <- as.numeric(c(1, match_unique.df$freq))
-
-sig_debug <- msig_debug
-
-deconv_debug <- deconv(bulk_debug,sig_debug[,-1])
-
+```{r deconvolution_debug, message=F, warning=F, include = FALSE}
+# bulk_debug <- as.numeric(c(1, match_unique.df$freq))
+# 
+# sig_debug <- msig_debug
+# 
+# # deconv_debug <- deconv(bulk_debug,sig_debug[,-1])
 ```
 ```{r plot, message=F, warning=F, echo = FALSE, include = F}
-
-cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
-
-# work in progress...only to show how it theoretically can look like in the report
-variants <- colnames(msig_debug[,-1])
-df <- as.data.frame(rbind(as.numeric(deconv_debug)))
-colnames(df) <- variants
-df <- df %>% 
-    tidyr::pivot_longer(everything())
-
-ggplot(data = df, aes(name,value, 
-                              fill = name)) +
-            geom_bar(stat="identity") +
-            labs(title = "Proportion of variants of concern inferred from deconvolution",
-                 x = "Variants",
-                 y = "Percentage") +
-            scale_x_discrete(limits = variants) +
-            scale_y_continuous(limits = c(0,1), breaks = seq(0,1,0.1)) +
-            scale_colour_manual(values=cbbPalette) + 
-            geom_text(aes(label=sprintf("%0.3f", round(value, digits = 3))), 
-                          position=position_dodge(width=0.9), 
-                          vjust=-0.25) + 
-            scale_fill_discrete(name = "Variants")
+# cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
+# 
+# # work in progress...only to show how it theoretically can look like in the report
+# variants <- colnames(msig_debug[,-1])
+# df <- as.data.frame(rbind(as.numeric(deconv_debug)))
+# colnames(df) <- variants
+# df <- df %>% 
+#     tidyr::pivot_longer(everything())
+# 
+# ggplot(data = df, aes(name,value, 
+#                               fill = name)) +
+#             geom_bar(stat="identity") +
+#             labs(title = "Proportion of variants of concern inferred from deconvolution",
+#                  x = "Variants",
+#                  y = "Percentage") +
+#             scale_x_discrete(limits = variants) +
+#             scale_y_continuous(limits = c(0,1), breaks = seq(0,1,0.1)) +
+#             scale_colour_manual(values=cbbPalette) + 
+#             geom_text(aes(label=sprintf("%0.3f", round(value, digits = 3))), 
+#                           position=position_dodge(width=0.9), 
+#                           vjust=-0.25) + 
+#             scale_fill_discrete(name = "Variants")
 ```
 
 
@@ -468,7 +435,6 @@ ggplot(data = df, aes(name,value,
 
 ```{r, include = F}
 write.csv2(mtcars, "./file.csv") # not sure about the funct of the dataframe here, but it works so I leave it for now
-
 library(magrittr)
 readLines(snvfile) %>% 
   paste0(collapse="\n") %>% 
@@ -476,16 +442,15 @@ readLines(snvfile) %>%
 ```
 Called SNVs:  
 [Download SNVs.csv](`r sprintf('data:text/csv;base64,%s', snvs)`)
-
 ```{r, include = F}
 # can possible be fused together with above. But rigth now it's safer like this. 
 write.csv2(vepfile.df, "./file.csv") # not sure about the funct of the dataframe here, but it works so I leave it for now
-
 readLines(vepfile) %>% 
   paste0(collapse="\n") %>% 
   openssl::base64_encode() -> vep
 ```
 Variant effect prediction (*Ensemble VEP output*):  
 [Download VEP_out.csv](`r sprintf('data:text/csv;base64,%s', vep)`)
-
-
+```{r}
+# rlm_model = suppressWarnings(MASS::rlm(sig_debug,bulk_debug, maxit = 100)) 
+```
\ No newline at end of file
diff --git a/snakefile.py b/snakefile.py
index 39e753f6..4ff4b220 100644
--- a/snakefile.py
+++ b/snakefile.py
@@ -98,7 +98,7 @@
         )
     }
 }
-selected_targets = ['variant_reports']
+selected_targets = ['kraken_reports']
 OUTPUT_FILES = list(chain.from_iterable([targets[name]['files'] for name in selected_targets]))
 
 
@@ -245,7 +245,7 @@
     output: os.path.join(REPORT_DIR, '{sample}_variant_report.html')
     params:
         run_report = os.path.join(SCRIPTS_DIR, 'run_variant_report.R'),
-        report_rmd = os.path.join(SCRIPTS_DIR, 'variant_report.Rmd')
+        report_rmd = os.path.join(SCRIPTS_DIR, 'variantreport_p_sample.rmd')
     log: os.path.join(LOG_DIR, 'variant_report_{sample}.log')
     shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --vep_txt_file={input.vep_txt} --snv_csv_file={input.snv_csv} --location_sigmuts={SIGMUT_DB} --sample_dir={VARIANTS_DIR} --sample_name={wildcards.sample} --outFile={output} >> {log} 2>&1"
 
@@ -270,8 +270,8 @@
     input: os.path.join(KRAKEN_DIR, '{sample}_classified_unaligned_reads.txt')
     output: os.path.join(REPORT_DIR, '{sample}_kraken_report.html')
     params:
-        run_report = os.path.join(SCRIPTS_DIR, 'run_kraken_report.R'),
-        report_rmd = os.path.join(SCRIPTS_DIR, 'kraken_report.Rmd')
+        run_report = os.path.join(SCRIPTS_DIR, 'run_Kraken2-report.R'),
+        report_rmd = os.path.join(SCRIPTS_DIR, 'Kraken2-report.Rmd')
     log: os.path.join(LOG_DIR, 'kraken_report_{sample}.log')
     shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --kraken_output={input} --sample_name={wildcards.sample} --output_file={output} >> {log} 2>&1"
 
@@ -322,7 +322,7 @@
 #     input: os.path.join(COVERAGE_DIR, '{sample}_merged_covs.csv')
 #     output: os.path.join(REPORT_DIR, '{sample}_qc_report.html')
 #     params:
-#         run_report = os.path.join(SCRIPTS_DIR, 'run_qc_report.R'),
-#         report_rmd = os.path.join(SCRIPTS_DIR, 'qc_report.Rmd')
+#         run_report = os.path.join(SCRIPTS_DIR, 'run_QC_report.R'),
+#         report_rmd = os.path.join(SCRIPTS_DIR, 'qc_report_per_sample.rmd')
 #     log: os.path.join(LOG_DIR, 'qc_report_{sample}.log')
 #     shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --coverage_file={input} --sample_name={wildcards.sample} --outFile={output} >> {log} 2>&1"
\ No newline at end of file
diff --git a/tests/databases/sigmut_db/B117_outbreakinfo_mutation_report_data_2021-03-06.tsv b/tests/databases/sigmut_db/B117_outbreakinfo_mutation_report_data_2021-03-06.tsv
new file mode 100644
index 00000000..27072be0
--- /dev/null
+++ b/tests/databases/sigmut_db/B117_outbreakinfo_mutation_report_data_2021-03-06.tsv
@@ -0,0 +1,22 @@
+mutation	type	gene	ref_codon	pos	ref_aa	codon_num	alt_aa	source	change_length_nt
+"ORF1a:T1001I"	"substitution"	"ORF1a"	"ACT"	3266	"T"	1001	"I"		
+"ORF1a:A1708D"	"substitution"	"ORF1a"	"GCT"	5387	"A"	1708	"D"		
+"ORF1a:I2230T"	"substitution"	"ORF1a"	"ATA"	6954	"I"	2230	"T"		
+"ORF1a:S3675"	"deletion"	"ORF1a"	"TGGTTTTAA"	11288	"S"	3675	"NA"		9
+"ORF1a:G3676"	"deletion"	"ORF1a"	"TGGTTTTAA"	11288	"G"	3676	"NA"		9
+"ORF1a:F3677"	"deletion"	"ORF1a"	"TGGTTTTAA"	11288	"F"	3677	"NA"		9
+"S:H69"	"deletion"	"S"	"CATGTC"	21765	"H"	69	"NA"		6
+"S:V70"	"deletion"	"S"	"CATGTC"	21765	"V"	70	"NA"		6
+"S:Y144"	"deletion"	"S"	"TAT"	21990	"Y"	144	"NA"		3
+"S:N501Y"	"substitution"	"S"	"AAT"	23063	"N"	501	"Y"		
+"S:A570D"	"substitution"	"S"	"GCT"	23271	"A"	570	"D"		
+"S:D614G"	"substitution"	"S"	"GAT"	23402	"D"	614	"G"		
+"S:P681H"	"substitution"	"S"	"CCT"	23604	"P"	681	"H"		
+"S:T716I"	"substitution"	"S"	"ACA"	23708	"T"	716	"I"		
+"S:S982A"	"substitution"	"S"	"TCA"	24506	"S"	982	"A"		
+"S:D1118H"	"substitution"	"S"	"GAC"	24914	"D"	1118	"H"		
+"ORF8:Q27*"	"substitution"	"ORF8"	"CAA"	27971	"Q"	27	"*"		
+"ORF8:R52I"	"substitution"	"ORF8"	"AGA"	28047	"R"	52	"I"		
+"ORF8:Y73C"	"substitution"	"ORF8"	"TAC"	28111	"Y"	73	"C"		
+"N:D3L"	"substitution"	"N"	"GAT"	28280	"D"	3	"L"		
+"N:S235F"	"substitution"	"N"	"TCT"	28977	"S"	235	"F"		
diff --git a/tests/databases/sigmut_db/B1351_outbreakinfo_mutation_report_data_2021-03-06.tsv b/tests/databases/sigmut_db/B1351_outbreakinfo_mutation_report_data_2021-03-06.tsv
new file mode 100644
index 00000000..0bb468e3
--- /dev/null
+++ b/tests/databases/sigmut_db/B1351_outbreakinfo_mutation_report_data_2021-03-06.tsv
@@ -0,0 +1,9 @@
+type	is_synonymous	mutation	gene	ref_aa	codon_num	alt_aa	source
+"substitution"	false	"ORF1ab:K1655N"	"ORF1ab"	"K"	1655	"N"	
+"substitution"	false	"E:P71L"	"E"	"P"	71	"L"	
+"substitution"	false	"N:T205I"	"N"	"T"	205	"I"	
+"substitution"	false	"S:K417N"	"S"	"K"	417	"N"	
+"substitution"	false	"S:E484K"	"S"	"E"	484	"K"	
+"substitution"	false	"S:N501Y"	"S"	"N"	501	"Y"	
+"substitution"	false	"S:D614G"	"S"	"D"	614	"G"	
+"substitution"	false	"S:A701V"	"S"	"A"	701	"V"	
diff --git a/tests/databases/sigmut_db/B1427_outbreakinfo_mutation_report_data_2021-03-06.tsv b/tests/databases/sigmut_db/B1427_outbreakinfo_mutation_report_data_2021-03-06.tsv
new file mode 100644
index 00000000..3ef24b8b
--- /dev/null
+++ b/tests/databases/sigmut_db/B1427_outbreakinfo_mutation_report_data_2021-03-06.tsv
@@ -0,0 +1,9 @@
+mutation	gene	ref_aa	alt_aa	codon_num	type	source
+"S:D614G"	"S"	"D"	"G"	614	"substitution"	
+"ORF1a:T265I"	"ORF1a"	"T"	"I"	265	"substitution"	
+"ORF1b:D1183Y"	"ORF1b"	"D"	"Y"	1183	"substitution"	
+"N:T205I"	"N"	"T"	"I"	205	"substitution"	
+"S:L452R"	"S"	"L"	"R"	452	"substitution"	
+"ORF1a:S3158T"	"ORF1a"	"S"	"T"	3158	"substitution"	
+"ORF1b:P976L"	"ORF1b"	"P"	"L"	976	"substitution"	
+"ORF3a:Q57H"	"ORF3a"	"Q"	"H"	57	"substitution"	
diff --git a/tests/databases/sigmut_db/B1429_outbreakinfo_mutation_report_data_2021-03-06.tsv b/tests/databases/sigmut_db/B1429_outbreakinfo_mutation_report_data_2021-03-06.tsv
new file mode 100644
index 00000000..0ac83d62
--- /dev/null
+++ b/tests/databases/sigmut_db/B1429_outbreakinfo_mutation_report_data_2021-03-06.tsv
@@ -0,0 +1,11 @@
+mutation	type	gene	ref_codon	pos	ref_aa	codon_num	alt_aa	source
+"ORF1a:T265I"	"substitution"	"ORF1a"	"ACC"	1059	"T"	265	"I"	
+"ORF1a:I4205V"	"substitution"	"ORF1a"	"ATC"	12879	"I"	4205	"V"	
+"ORF1b:P314L"	"substitution"	"ORF1b"	"CCT"	14407	"P"	314	"L"	
+"ORF1b:D1183Y"	"substitution"	"ORF1b"	"GAT"	17013	"D"	1183	"Y"	
+"S:S13I"	"substitution"	"S"	"AGT"	21599	"S"	13	"I"	
+"S:W152C"	"substitution"	"S"	"TGG"	22017	"W"	152	"C"	
+"S:L452R"	"substitution"	"S"	"CTG"	22916	"L"	452	"R"	
+"S:D614G"	"substitution"	"S"	"GAT"	23402	"D"	614	"G"	
+"ORF3a:Q57H"	"substitution"	"ORF3a"	"CAG"	25562	"Q"	57	"H"	
+"N:T205I"	"substitution"	"N"	"ACT"	28887	"T"	205	"I"	
diff --git a/tests/databases/sigmut_db/B1526_outbreakinfo_mutation_report_data_2021-03-06.tsv b/tests/databases/sigmut_db/B1526_outbreakinfo_mutation_report_data_2021-03-06.tsv
new file mode 100644
index 00000000..69c1b62b
--- /dev/null
+++ b/tests/databases/sigmut_db/B1526_outbreakinfo_mutation_report_data_2021-03-06.tsv
@@ -0,0 +1,11 @@
+mutation	gene	ref_aa	alt_aa	codon_num	type	source
+"ORF1b:Q1011H"	"ORF1b"	"Q"	"H"	1011	"substitution"	
+"S:D253G"	"S"	"D"	"G"	253	"substitution"	
+"S:D614G"	"S"	"D"	"G"	614	"substitution"	
+"ORF1b:P314L"	"ORF1b"	"P"	"L"	314	"substitution"	
+"ORF1a:L3201P"	"ORF1a"	"L"	"P"	3201	"substitution"	
+"ORF1a:T265I"	"ORF1a"	"T"	"I"	265	"substitution"	
+"ORF3a:Q57H"	"ORF3a"	"Q"	"H"	57	"substitution"	
+"S:T95I"	"S"	"T"	"I"	95	"substitution"	
+"ORF3a:P42L"	"ORF3a"	"P"	"L"	42	"substitution"	
+"ORF8:T11I"	"ORF8"	"T"	"I"	11	"substitution"	
diff --git a/tests/databases/sigmut_db/P1_outbreakinfo_mutation_report_data_2021-03-06.tsv b/tests/databases/sigmut_db/P1_outbreakinfo_mutation_report_data_2021-03-06.tsv
new file mode 100644
index 00000000..bc975bd0
--- /dev/null
+++ b/tests/databases/sigmut_db/P1_outbreakinfo_mutation_report_data_2021-03-06.tsv
@@ -0,0 +1,23 @@
+type	is_synonymous	mutation	gene	ref_aa	codon_num	alt_aa	source	ref_codon	pos	change_length_nt
+"substitution"	false	"N:P80R"	"N"	"P"	80	"R"				
+"substitution"	false	"ORF8:E92K"	"ORF8"	"E"	92	"K"				
+"substitution"	false	"ORF3a:G174C"	"ORF3a"	"G"	174	"C"				
+"substitution"	false	"S:L18F"	"S"	"L"	18	"F"				
+"substitution"	false	"S:T20N"	"S"	"T"	20	"N"				
+"substitution"	false	"S:P26S"	"S"	"P"	26	"S"				
+"substitution"	false	"S:D138Y"	"S"	"D"	138	"Y"				
+"substitution"	false	"S:R190S"	"S"	"R"	190	"S"				
+"substitution"	false	"S:K417T"	"S"	"K"	417	"T"				
+"substitution"	false	"S:E484K"	"S"	"E"	484	"K"				
+"substitution"	false	"S:N501Y"	"S"	"N"	501	"Y"				
+"substitution"	false	"S:D614G"	"S"	"D"	614	"G"				
+"substitution"	false	"S:H655Y"	"S"	"H"	655	"Y"				
+"substitution"	false	"S:T1027I"	"S"	"T"	1027	"I"				
+"substitution"	false	"ORF1ab:E5662D"	"ORF1ab"	"E"	5662	"D"				
+"substitution"	false	"ORF1ab:K1795Q"	"ORF1ab"	"K"	1795	"Q"				
+"substitution"	false	"ORF1ab:S1188L"	"ORF1ab"	"S"	1188	"L"				
+"substitution"	false	"ORF1ab:I760T"	"ORF1ab"	"I"	760	"T"				
+"deletion"		"ORF1a:S3675"	"ORF1a"	"S"	3675	"NA"		"TGGTTTTAA"	11288	9
+"deletion"		"ORF1a:G3676"	"ORF1a"	"G"	3676	"NA"		"TGGTTTTAA"	11288	9
+"deletion"		"ORF1a:F3677"	"ORF1a"	"F"	3677	"NA"		"TGGTTTTAA"	11288	9
+"substitution"	false	"ORF1ab:F681L"	"ORF1ab"	"F"	681	"L"