Merge pull request #5 from BIMSBbioinfo/develop-snakemake
Develop snakemake
dohmjan authored May 11, 2021
2 parents b6b70d9 + 4390e17 commit d1a514c
Showing 18 changed files with 961 additions and 293 deletions.
58 changes: 58 additions & 0 deletions scripts/Kraken2-report.Rmd
@@ -0,0 +1,58 @@
---
title: "Kraken2-report"
#author: "mfaxel"
date: '`r format(as.POSIXct(if ("" != Sys.getenv("SOURCE_DATE_EPOCH")) { as.numeric(Sys.getenv("SOURCE_DATE_EPOCH")) } else { Sys.time() }, origin="1970-01-01"), "%Y-%m-%d %H:%M:%S")`'
#output: html_document
params:
#sample_dir: '../../tests/sample_data/' # must be kraken REPORT file
#sample_name: 'tmp_unaligned_krakenReport'
sample_name: ""
kraken_output: ""

---


```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
knitr::opts_knit$set(root.dir = params$workdir)
library(DT)
```




```{r, echo=FALSE}
#input
sampleName <- params$sample_name
#sampleDir <- params$sample_dir
#krakenReportFile <- paste0(params$sample_dir, params$sample_name)
# krakenReportFile <- paste0(params$sample_dir, params$sample_name, ".txt")
krakenReportFile <- params$kraken_output
```


## Kraken2 report on unaligned reads from the SARS-CoV-2 enriched samples

```{r,echo=F}
print(paste("Sample:", sampleName))
```


The SARS-CoV-2 enriched wastewater samples are aligned to the virus genome. The reads that remain unaligned are then classified with Kraken2 against the [PlusPFP database](https://benlangmead.github.io/aws-indexes/k2). This report gives an overview of the species found among these reads.
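
For orientation, here is a minimal sketch of what this classification step could look like when driven from Python. The database path, file names and thread count are hypothetical and only standard `kraken2` options are used; the pipeline's own rules define the real inputs and outputs.

```python
import subprocess

# Hypothetical paths; adjust to the pipeline's actual layout.
kraken_db = "/path/to/k2_pluspfp"           # pre-built PlusPFP index
unaligned_reads = "sample_unaligned.fastq"  # reads left over after alignment to the virus genome
report_file = "sample_kraken_report.txt"    # six-column report read by this Rmd

subprocess.run(
    ["kraken2",
     "--db", kraken_db,
     "--threads", "4",
     "--report", report_file,
     "--output", "sample_kraken_output.txt",
     unaligned_reads],
    check=True,
)
```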

The table below lists all the species found.

* Column explanation:
+ Percentage of fragments covered by the clade rooted at this taxon
+ Number of fragments covered by the clade rooted at this taxon
+ Number of fragments assigned directly to this taxon
+ A rank code, indicating (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. Taxa that are not at any of these 10 ranks have a rank code that is formed by using the rank code of the closest ancestor rank with a number indicating the distance from that rank. E.g., "G2" is a rank code indicating a taxon is between genus and species and the grandparent taxon is at the genus rank.
+ NCBI taxonomic ID number
+ Indented scientific name
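
The R chunk below loads the report with these six columns. Purely as an illustration, an equivalent in Python could look like the sketch here; the file name is hypothetical and `pandas` is assumed to be available.

```python
import pandas as pd

# Kraken2 reports have no header line, so the six columns described above are named explicitly.
columns = ["percent", "num_fragments", "num_fragments_taxon", "rank", "NCBI_ID", "scientific_name"]
report = pd.read_csv("sample_kraken_report.txt", sep="\t", names=columns)

# e.g. keep only species-level assignments, sorted by abundance
species = report[report["rank"] == "S"].sort_values("percent", ascending=False)
print(species.head())
```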

```{r, echo=FALSE}
kraken_table <- read.table(krakenReportFile, sep = "\t")
colnames(kraken_table) <- c("percent", "num_fragments", "num_fragments_taxon", "rank", "NCBI_ID", "scientific_name")
datatable(kraken_table)
```
103 changes: 0 additions & 103 deletions scripts/ensemble_vep.py

This file was deleted.

86 changes: 86 additions & 0 deletions scripts/get_qc_table.py
@@ -0,0 +1,86 @@
import sys
import re
import csv


def parsing_cov(coverage_file):
    """Extract the number of aligned reads, total coverage and mean depth from a tab-separated coverage table."""
    with open(coverage_file, "r") as cov_file:
        reader = csv.reader(cov_file, delimiter="\t")

        for row in reader:
            # skip empty rows and comment/header rows starting with "#"
            if not row or row[0].startswith("#"):
                continue
            aligned_reads = row[3]
            total_cov = row[5]
            mean_depth = row[6]
            return (aligned_reads, total_cov, mean_depth)


def parsing_amplicon(amplicon_file):
    """Count covered amplicons and collect partially covered and fully dropped-out amplicons."""
    with open(amplicon_file, "r") as amp_file:
        reader = csv.reader(amp_file, delimiter="\t")

num_covered_amplicons = 0
covered_amplicon = 0
drop_out_amplicons = []
full_drop_out = []
half_covered = []

for row in reader:

            # the amplicon number is the field between the first and second underscore of the name column
            amplicon = row[3].split("_")[1]

if amplicon != str(covered_amplicon):
covered_amplicon = amplicon
num_covered_amplicons += 1

if int(row[6]) == 0:
drop_out_amplicons.append(amplicon)

        if len(drop_out_amplicons) != 0:
            # an amplicon listed twice had zero coverage in both of its rows (full drop-out);
            # an amplicon listed once was only partially covered
            for ampli in drop_out_amplicons:
                cnt = drop_out_amplicons.count(ampli)
                if cnt == 2:
                    full_drop_out.append(ampli)
                if cnt == 1:
                    half_covered.append(ampli)

full_drop_out = list(set(full_drop_out))
fully_covered_amplicons = int(num_covered_amplicons) - len(drop_out_amplicons)

return (fully_covered_amplicons, half_covered, full_drop_out)


def make_report_input_csv(coverage_file, amplicon_file, output_file):
coverage_info = parsing_cov(coverage_file)
amplicon_info = parsing_amplicon(amplicon_file)
print(coverage_info)
print(amplicon_info)

header = [
"Total number of amplicons fully covered",
"Amplicons partially covered",
"Drop-out amplicons",
"Total number aligned reads",
"Coverage",
"Mean depth",
]
fields = [
amplicon_info[0],
amplicon_info[1],
amplicon_info[2],
coverage_info[0],
coverage_info[1],
coverage_info[2],
]

with open(output_file, "a") as f:
writer = csv.writer(f, delimiter="\t")
writer.writerow(header)
writer.writerow(fields)

if __name__ == '__main__':
coverage_file = sys.argv[1]
amplicon_file = sys.argv[2]
output = sys.argv[3]
make_report_input_csv(coverage_file, amplicon_file, output)
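
A sketch of how this script could be invoked; the file names are hypothetical and simply follow the three positional arguments handled in the `__main__` block above.

```python
import subprocess

# Equivalent to: python scripts/get_qc_table.py <coverage> <amplicon_coverage> <output>
subprocess.run(
    ["python", "scripts/get_qc_table.py",
     "sample_coverage.tsv",       # tab-separated coverage table (columns 4, 6 and 7 are used)
     "sample_amplicon_cov.tsv",   # per-amplicon coverage table (name in column 4, coverage in column 7)
     "sample_qc_table.tsv"],      # output; a header row and a value row are appended
    check=True,
)
```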
50 changes: 50 additions & 0 deletions scripts/parse_vep.py
@@ -0,0 +1,50 @@
import re
import os
import csv
import sys

def parse_vep(vep_dir, input, output):
    header_line = None

    with open(input, 'r') as file:
        with open(os.path.join(vep_dir, "tmp.txt"), 'w+') as clean_file:
            # copy everything from the "#Uploaded_variation" header onwards,
            # stripping the leading "#" so that the header becomes a regular column row
            for num, line in enumerate(file, 1):
                if re.match("#Uploaded_variation", line):
                    header_line = num
                    clean_file.write(line[1:])
                    break
            if header_line is None:
                raise Exception("file does not have the expected VEP format or has already been parsed")
            for num, line in enumerate(file, header_line):
                clean_file.write(line)

    with open(os.path.join(vep_dir, 'tmp.txt'), 'r') as file:
        with open(output, 'w') as out_file:
            reader = csv.reader(file, delimiter="\t")
            writer = csv.writer(out_file, lineterminator='\n')

# make new column SYMBOL for gene names
# fixme: This is redundant with the making of the header line above, but works for now
gene_column = []
header = next(reader)
header.append("SYMBOL")
gene_column.append(header)

for row in reader: # fixme: I'm not sure if the following construct is ugly or not
# get gene name
extra = row[13].split(";")
for element in extra:
if re.search(r"SYMBOL=", element):
gene = element.split("=")[1:][0]
row.append(gene)
gene_column.append(row)

writer.writerows(gene_column)
# TODO: add step to clean "tmp.txt"

if __name__ == '__main__':
vep_dir = sys.argv[1]
input = sys.argv[2]
output = sys.argv[3]
parse_vep(vep_dir, input, output)
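
Similarly, a minimal sketch of calling the VEP parser directly from Python; it assumes `scripts/` is on the import path, and the directory and file names are hypothetical stand-ins for the three positional arguments.

```python
# Hypothetical direct call; the real pipeline passes these paths via the command line.
from parse_vep import parse_vep

parse_vep(
    vep_dir="output/vep",                # where the intermediate tmp.txt is written
    input="output/vep/sample_vep.txt",   # raw VEP output containing the "#Uploaded_variation" header
    output="output/vep/sample_vep.csv",  # comma-separated result with an added SYMBOL column
)
```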
21 changes: 0 additions & 21 deletions scripts/prep_sample_space_210320.py

This file was deleted.
