-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5 from BIMSBbioinfo/develop-snakemake
Develop snakemake
- Loading branch information
Showing
18 changed files
with
961 additions
and
293 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
--- | ||
title: "Kraken2-report" | ||
#author: "mfaxel" | ||
date: '`r format(as.POSIXct(if ("" != Sys.getenv("SOURCE_DATE_EPOCH")) { as.numeric(Sys.getenv("SOURCE_DATE_EPOCH")) } else { Sys.time() }, origin="1970-01-01"), "%Y-%m-%d %H:%M:%S")`' | ||
#output: html_document | ||
params: | ||
#sample_dir: '../../tests/sample_data/' # must be kraken REPORT file | ||
#sample_name: 'tmp_unaligned_krakenReport' | ||
sample_name: "" | ||
kraken_output: "" | ||
|
||
--- | ||
|
||
|
||
```{r setup, include=FALSE}
library(DT)  # provides datatable() for the interactive species table
# Default chunk behaviour: show code, suppress messages and warnings.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
# NOTE(review): params$workdir is not declared in the YAML params block,
# so root.dir evaluates to NULL here -- confirm whether a `workdir`
# parameter should be added to the front matter.
knitr::opts_knit$set(root.dir = params$workdir)
```
|
||
|
||
|
||
|
||
```{r, echo=FALSE}
# Inputs supplied via the YAML params block.
sampleName <- params$sample_name
# Fix: `kraken_output` was referenced as a bare variable, which is not
# defined anywhere in this document; the value lives in the params list.
krakenReportFile <- params$kraken_output
```
|
||
|
||
## Kraken2 report on unaligned reads from the Sars-Cov2 enriched samples | ||
|
||
```{r, echo=FALSE}
# Show which sample this report was rendered for.
print(paste("Sample: ", sampleName))
```
|
||
|
||
The SARS-CoV-2-enriched wastewater samples are aligned to the virus genome. The reads that remain unaligned are then classified with Kraken2 against the [PlusPFP database](https://benlangmead.github.io/aws-indexes/k2). This report gives an overview of the species found.
|
||
A table with all the species found: | ||
|
||
* Column explanation: | ||
+ Percentage of fragments covered by the clade rooted at this taxon | ||
+ Number of fragments covered by the clade rooted at this taxon | ||
+ Number of fragments assigned directly to this taxon | ||
+ A rank code, indicating (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. Taxa that are not at any of these 10 ranks have a rank code that is formed by using the rank code of the closest ancestor rank with a number indicating the distance from that rank. E.g., "G2" is a rank code indicating a taxon is between genus and species and the grandparent taxon is at the genus rank. | ||
+ NCBI taxonomic ID number | ||
+ Indented scientific name | ||
|
||
```{r, echo=FALSE}
# Read the Kraken2 report: tab-separated, no header row.
# Fix: the variable was called `table`, shadowing base::table.
# `quote` and `comment.char` are disabled so that taxon names containing
# quote characters are kept intact and no line is dropped on a '#'
# (read.table's defaults are quote = "\"'" and comment.char = "#").
kraken_table <- read.table(krakenReportFile, sep = "\t",
                           quote = "", comment.char = "")
colnames(kraken_table) <- c("percent", "num_fragments",
                            "num_fragments_taxon", "rank",
                            "NCBI_ID", "scientific_name")
datatable(kraken_table)
```
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
import sys | ||
import re | ||
import csv | ||
|
||
|
||
def parsing_cov(coverage_file):
    """Extract alignment statistics from a ``samtools coverage``-style file.

    Parameters
    ----------
    coverage_file : str
        Path to a tab-separated coverage file. Lines whose first field
        starts with ``#`` are treated as header lines and skipped.

    Returns
    -------
    tuple[str, str, str] | None
        ``(aligned_reads, total_cov, mean_depth)`` taken from columns
        4, 6 and 7 of the first data row, or ``None`` when the file
        contains no data rows.
    """
    with open(coverage_file, "r") as cov_file:
        reader = csv.reader(cov_file, delimiter="\t")
        for row in reader:
            # Fix: the previous element-wise '#' check returned the header
            # row's column names, because non-first header fields do not
            # start with '#'. Only the first field marks a header line.
            if not row or row[0].startswith("#"):
                continue
            aligned_reads = row[3]
            total_cov = row[5]
            mean_depth = row[6]
            return (aligned_reads, total_cov, mean_depth)
    return None
|
||
|
||
def parsing_amplicon(amplicon_file):
    """Summarise per-amplicon coverage from a BED-like primer coverage table.

    Each row's name field (column 4) is expected to look like
    ``<scheme>_<ampliconNumber>_<suffix>``; the amplicon number is the
    grouping key.  Column 7 holds the coverage value for that primer row.

    Parameters
    ----------
    amplicon_file : str
        Path to the tab-separated amplicon coverage file.

    Returns
    -------
    tuple
        ``(fully_covered_amplicons, half_covered, full_drop_out)`` where
        ``half_covered`` lists amplicons with exactly one zero-coverage
        row and ``full_drop_out`` those with exactly two.
    """
    total_amplicons = 0
    previous_amplicon = 0
    zero_cov = []  # amplicon ids, one entry per zero-coverage row

    with open(amplicon_file, "r") as handle:
        for fields in csv.reader(handle, delimiter="\t"):
            amplicon_id = fields[3].split("_")[1]

            # Rows for the same amplicon are adjacent; a change of id
            # means a new amplicon was encountered.
            if amplicon_id != str(previous_amplicon):
                previous_amplicon = amplicon_id
                total_amplicons += 1

            if int(fields[6]) == 0:
                zero_cov.append(amplicon_id)

    half_covered = [amp for amp in zero_cov if zero_cov.count(amp) == 1]
    full_drop_out = list({amp for amp in zero_cov if zero_cov.count(amp) == 2})

    fully_covered_amplicons = total_amplicons - len(zero_cov)
    return (fully_covered_amplicons, half_covered, full_drop_out)
|
||
|
||
def make_report_input_csv(coverage_file, amplicon_file, output_file):
    """Combine coverage and amplicon summaries into one TSV report input.

    Parameters
    ----------
    coverage_file : str
        Coverage file consumed by :func:`parsing_cov`.
    amplicon_file : str
        Amplicon coverage file consumed by :func:`parsing_amplicon`.
    output_file : str
        Path of the TSV file to write; a header row and a value row are
        appended.

    Notes
    -----
    Fix: stray debug ``print()`` calls were removed so the script no
    longer pollutes stdout when run from a workflow.
    """
    coverage_info = parsing_cov(coverage_file)
    amplicon_info = parsing_amplicon(amplicon_file)

    header = [
        "Total number of amplicons fully covered",
        "Amplicons partially covered",
        "Drop-out amplicons",
        "Total number aligned reads",
        "Coverage",
        "Mean depth",
    ]
    fields = [
        amplicon_info[0],
        amplicon_info[1],
        amplicon_info[2],
        coverage_info[0],
        coverage_info[1],
        coverage_info[2],
    ]

    # NOTE(review): append mode means rerunning adds duplicate header/value
    # pairs to the same file; confirm downstream tooling expects exactly one
    # pair before switching to "w".  newline="" is the csv-module convention.
    with open(output_file, "a", newline="") as f:
        writer = csv.writer(f, delimiter="\t")
        writer.writerow(header)
        writer.writerow(fields)
|
||
if __name__ == '__main__':
    # CLI usage: script.py <coverage_file> <amplicon_file> <output_file>
    cov_path, amplicon_path, out_path = sys.argv[1], sys.argv[2], sys.argv[3]
    make_report_input_csv(cov_path, amplicon_path, out_path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import re | ||
import os | ||
import csv | ||
import sys | ||
|
||
def parse_vep(vep_dir, input, output):
    """Convert a VEP text report into a CSV with an extra SYMBOL column.

    A VEP report starts with ``##`` comment lines followed by a column
    header line beginning with ``#Uploaded_variation``.  Everything before
    that header is dropped; the header itself is kept without the leading
    ``#``.  The ``Extra`` column (14th field) is scanned for a
    ``SYMBOL=<gene>`` entry, which is appended as a new ``SYMBOL`` column.

    Parameters
    ----------
    vep_dir : str
        Directory used for the intermediate ``tmp.txt`` file (removed on
        completion).
    input : str
        Path to the raw VEP output file.
    output : str
        Path of the CSV file to write.

    Raises
    ------
    Exception
        If no ``#Uploaded_variation`` header line is found in *input*.
    """
    tmp_path = os.path.join(vep_dir, "tmp.txt")
    header_found = False

    # Strip the '##' preamble: copy everything from the header line on.
    with open(input, 'r') as raw_file, open(tmp_path, 'w') as clean_file:
        for line in raw_file:
            if not header_found:
                if re.match("#Uploaded_variation", line):
                    header_found = True
                    clean_file.write(line[1:])  # drop the leading '#'
                continue
            clean_file.write(line)

    # Fix: this validation used to run *before* the scan (while the flag
    # was still None), making it dead code; now it actually fires.
    if not header_found:
        raise Exception("file has not the correct format or does not has to be parsed anymore")

    # Fix: the output file handle no longer shadows the `output` parameter.
    with open(tmp_path, 'r') as table_file, open(output, 'w') as out_file:
        reader = csv.reader(table_file, delimiter="\t")
        writer = csv.writer(out_file, lineterminator='\n')

        header = next(reader)
        header.append("SYMBOL")
        out_rows = [header]

        for row in reader:
            # Fix: `gene` is reset for every row; previously a row without
            # a SYMBOL= entry reused the previous row's gene (or raised
            # NameError on the first such row).
            gene = ""
            for element in row[13].split(";"):
                if re.search(r"SYMBOL=", element):
                    gene = element.split("=", 1)[1]
            row.append(gene)
            out_rows.append(row)

        writer.writerows(out_rows)

    # Clean up the intermediate file (previously a TODO).
    os.remove(tmp_path)
|
||
if __name__ == '__main__':
    # CLI usage: script.py <vep_dir> <input_vep_file> <output_csv>
    args = sys.argv
    parse_vep(args[1], args[2], args[3])
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.