Merge pull request #5 from BIMSBbioinfo/develop-snakemake
Develop snakemake
dohmjan authored May 11, 2021
2 parents b6b70d9 + 4390e17 commit d1a514c
Showing 18 changed files with 961 additions and 293 deletions.
58 changes: 58 additions & 0 deletions scripts/Kraken2-report.Rmd
@@ -0,0 +1,58 @@
---
title: "Kraken2-report"
#author: "mfaxel"
date: '`r format(as.POSIXct(if ("" != Sys.getenv("SOURCE_DATE_EPOCH")) { as.numeric(Sys.getenv("SOURCE_DATE_EPOCH")) } else { Sys.time() }, origin="1970-01-01"), "%Y-%m-%d %H:%M:%S")`'
#output: html_document
params:
#sample_dir: '../../tests/sample_data/' # must be kraken REPORT file
#sample_name: 'tmp_unaligned_krakenReport'
sample_name: ""
kraken_output: ""

---


```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
knitr::opts_knit$set(root.dir = params$workdir)
library(DT)
```




```{r, echo=FALSE}
#input
sampleName <- params$sample_name
#sampleDir <- params$sample_dir
#krakenReportFile <- paste0(params$sample_dir, params$sample_name)
# krakenReportFile <- paste0(params$sample_dir, params$sample_name, ".txt")
krakenReportFile <- params$kraken_output
```


## Kraken2 report on unaligned reads from the SARS-CoV-2 enriched samples

```{r,echo=F}
print(paste("Sample:", sampleName))
```


The SARS-CoV-2 enriched wastewater samples are aligned to the virus genome. The reads that remain unaligned are then classified with Kraken2 against the [PlusPFP database](https://benlangmead.github.io/aws-indexes/k2). This report gives an overview of the species found among these reads.
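
For orientation, here is a minimal sketch of what this classification step could look like when driven from Python. The database path, file names and thread count are hypothetical and only standard `kraken2` options are used; the pipeline's own rules define the real inputs and outputs.

```python
import subprocess

# Hypothetical paths; adjust to the pipeline's actual layout.
kraken_db = "/path/to/k2_pluspfp"           # pre-built PlusPFP index
unaligned_reads = "sample_unaligned.fastq"  # reads left over after alignment to the virus genome
report_file = "sample_kraken_report.txt"    # six-column report read by this Rmd

subprocess.run(
    ["kraken2",
     "--db", kraken_db,
     "--threads", "4",
     "--report", report_file,
     "--output", "sample_kraken_output.txt",
     unaligned_reads],
    check=True,
)
```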

The table below lists all the species found.

* Column explanation:
+ Percentage of fragments covered by the clade rooted at this taxon
+ Number of fragments covered by the clade rooted at this taxon
+ Number of fragments assigned directly to this taxon
+ A rank code, indicating (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. Taxa that are not at any of these 10 ranks have a rank code that is formed by using the rank code of the closest ancestor rank with a number indicating the distance from that rank. E.g., "G2" is a rank code indicating a taxon is between genus and species and the grandparent taxon is at the genus rank.
+ NCBI taxonomic ID number
+ Indented scientific name
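
The R chunk below loads the report with these six columns. Purely as an illustration, an equivalent in Python could look like the sketch here; the file name is hypothetical and `pandas` is assumed to be available.

```python
import pandas as pd

# Kraken2 reports have no header line, so the six columns described above are named explicitly.
columns = ["percent", "num_fragments", "num_fragments_taxon", "rank", "NCBI_ID", "scientific_name"]
report = pd.read_csv("sample_kraken_report.txt", sep="\t", names=columns)

# e.g. keep only species-level assignments, sorted by abundance
species = report[report["rank"] == "S"].sort_values("percent", ascending=False)
print(species.head())
```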

```{r, echo=FALSE}
kraken_table <- read.table(krakenReportFile, sep = "\t")
colnames(kraken_table) <- c("percent", "num_fragments", "num_fragments_taxon", "rank", "NCBI_ID", "scientific_name")
datatable(kraken_table)
```
103 changes: 0 additions & 103 deletions scripts/ensemble_vep.py

This file was deleted.

86 changes: 86 additions & 0 deletions scripts/get_qc_table.py
@@ -0,0 +1,86 @@
import sys
import re
import csv


def parsing_cov(coverage_file):
    """Extract the number of aligned reads, total coverage and mean depth from a tab-separated coverage table."""
    with open(coverage_file, "r") as cov_file:
        reader = csv.reader(cov_file, delimiter="\t")

        for row in reader:
            # skip empty rows and comment/header rows starting with "#"
            if not row or row[0].startswith("#"):
                continue
            aligned_reads = row[3]
            total_cov = row[5]
            mean_depth = row[6]
            return (aligned_reads, total_cov, mean_depth)


def parsing_amplicon(amplicon_file):
    """Count covered amplicons and collect partially covered and fully dropped-out amplicons."""
    with open(amplicon_file, "r") as amp_file:
        reader = csv.reader(amp_file, delimiter="\t")

num_covered_amplicons = 0
covered_amplicon = 0
drop_out_amplicons = []
full_drop_out = []
half_covered = []

for row in reader:

            # the amplicon number is the field between the first and second underscore of the name column
            amplicon = row[3].split("_")[1]

if amplicon != str(covered_amplicon):
covered_amplicon = amplicon
num_covered_amplicons += 1

if int(row[6]) == 0:
drop_out_amplicons.append(amplicon)

        if len(drop_out_amplicons) != 0:
            # an amplicon listed twice had zero coverage in both of its rows (full drop-out);
            # an amplicon listed once was only partially covered
            for ampli in drop_out_amplicons:
                cnt = drop_out_amplicons.count(ampli)
                if cnt == 2:
                    full_drop_out.append(ampli)
                if cnt == 1:
                    half_covered.append(ampli)

full_drop_out = list(set(full_drop_out))
fully_covered_amplicons = int(num_covered_amplicons) - len(drop_out_amplicons)

return (fully_covered_amplicons, half_covered, full_drop_out)


def make_report_input_csv(coverage_file, amplicon_file, output_file):
coverage_info = parsing_cov(coverage_file)
amplicon_info = parsing_amplicon(amplicon_file)
print(coverage_info)
print(amplicon_info)

header = [
"Total number of amplicons fully covered",
"Amplicons partially covered",
"Drop-out amplicons",
"Total number aligned reads",
"Coverage",
"Mean depth",
]
fields = [
amplicon_info[0],
amplicon_info[1],
amplicon_info[2],
coverage_info[0],
coverage_info[1],
coverage_info[2],
]

with open(output_file, "a") as f:
writer = csv.writer(f, delimiter="\t")
writer.writerow(header)
writer.writerow(fields)

if __name__ == '__main__':
coverage_file = sys.argv[1]
amplicon_file = sys.argv[2]
output = sys.argv[3]
make_report_input_csv(coverage_file, amplicon_file, output)
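
A sketch of how this script could be invoked; the file names are hypothetical and simply follow the three positional arguments handled in the `__main__` block above.

```python
import subprocess

# Equivalent to: python scripts/get_qc_table.py <coverage> <amplicon_coverage> <output>
subprocess.run(
    ["python", "scripts/get_qc_table.py",
     "sample_coverage.tsv",       # tab-separated coverage table (columns 4, 6 and 7 are used)
     "sample_amplicon_cov.tsv",   # per-amplicon coverage table (name in column 4, coverage in column 7)
     "sample_qc_table.tsv"],      # output; a header row and a value row are appended
    check=True,
)
```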
50 changes: 50 additions & 0 deletions scripts/parse_vep.py
@@ -0,0 +1,50 @@
import re
import os
import csv
import sys

def parse_vep(vep_dir, input, output):
    header_line = None

    with open(input, 'r') as file:
        with open(os.path.join(vep_dir, "tmp.txt"), 'w+') as clean_file:
            # copy everything from the "#Uploaded_variation" header onwards,
            # stripping the leading "#" so that the header becomes a regular column row
            for num, line in enumerate(file, 1):
                if re.match("#Uploaded_variation", line):
                    header_line = num
                    clean_file.write(line[1:])
                    break
            if header_line is None:
                raise Exception("file does not have the expected VEP format or has already been parsed")
            for num, line in enumerate(file, header_line):
                clean_file.write(line)

    with open(os.path.join(vep_dir, 'tmp.txt'), 'r') as file:
        with open(output, 'w') as out_file:
            reader = csv.reader(file, delimiter="\t")
            writer = csv.writer(out_file, lineterminator='\n')

# make new column SYMBOL for gene names
# fixme: This is redundant with the making of the header line above, but works for now
gene_column = []
header = next(reader)
header.append("SYMBOL")
gene_column.append(header)

for row in reader: # fixme: I'm not sure if the following construct is ugly or not
# get gene name
extra = row[13].split(";")
for element in extra:
if re.search(r"SYMBOL=", element):
gene = element.split("=")[1:][0]
row.append(gene)
gene_column.append(row)

writer.writerows(gene_column)
# TODO: add step to clean "tmp.txt"

if __name__ == '__main__':
vep_dir = sys.argv[1]
input = sys.argv[2]
output = sys.argv[3]
parse_vep(vep_dir, input, output)
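
Similarly, a minimal sketch of calling the VEP parser directly from Python; it assumes `scripts/` is on the import path, and the directory and file names are hypothetical stand-ins for the three positional arguments.

```python
# Hypothetical direct call; the real pipeline passes these paths via the command line.
from parse_vep import parse_vep

parse_vep(
    vep_dir="output/vep",                # where the intermediate tmp.txt is written
    input="output/vep/sample_vep.txt",   # raw VEP output containing the "#Uploaded_variation" header
    output="output/vep/sample_vep.csv",  # comma-separated result with an added SYMBOL column
)
```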
21 changes: 0 additions & 21 deletions scripts/prep_sample_space_210320.py

This file was deleted.
