-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
focus on relevant scripts, add sigmut input
- Loading branch information
Showing
17 changed files
with
458 additions
and
221 deletions.
There are no files selected for viewing
File renamed without changes.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
import sys | ||
import re | ||
import csv | ||
|
||
|
||
def parsing_cov(coverage_file):
    """Parse a samtools-coverage style TSV and return summary statistics.

    Parameters
    ----------
    coverage_file : path to a tab-separated coverage file whose data rows
        carry aligned reads in column 3, total coverage in column 5 and
        mean depth in column 6 (0-based) — samtools-coverage column layout.

    Returns
    -------
    (aligned_reads, total_cov, mean_depth) as strings, taken from the first
    data row; None if the file contains no data rows.
    """
    with open(coverage_file, "r") as cov_file:
        reader = csv.reader(cov_file, delimiter="\t")
        for row in reader:
            # Bug fix: the original tested EVERY cell against '^#' and
            # returned as soon as one cell did not match — so a '#'-prefixed
            # header row (whose later fields lack '#') produced header text
            # instead of data. Only the first cell marks a comment row.
            if not row or row[0].startswith("#"):
                continue
            aligned_reads = row[3]
            total_cov = row[5]
            mean_depth = row[6]
            return (aligned_reads, total_cov, mean_depth)
def parsing_amplicon(amplicon_file):
    """Summarise amplicon drop-outs from a per-primer coverage TSV.

    Each row describes one primer; column 3 holds a name like
    ``scheme_<ampliconId>_LEFT`` and column 6 the read depth. An amplicon
    with both primers at zero depth is a full drop-out; with exactly one
    primer at zero depth it is only half covered.

    Returns
    -------
    (fully_covered_amplicons, half_covered, full_drop_out) where the last
    two are lists of amplicon id strings.
    """
    total_amplicons = 0
    current_id = 0
    zero_depth_ids = []

    with open(amplicon_file, "r") as handle:
        for record in csv.reader(handle, delimiter="\t"):
            amp_id = record[3].split("_")[1]

            # A change in amplicon id marks the start of the next amplicon
            # (rows are assumed grouped by amplicon).
            if amp_id != str(current_id):
                current_id = amp_id
                total_amplicons += 1

            if int(record[6]) == 0:
                zero_depth_ids.append(amp_id)

        half_covered = [a for a in zero_depth_ids if zero_depth_ids.count(a) == 1]
        full_drop_out = list({a for a in zero_depth_ids if zero_depth_ids.count(a) == 2})

        # NOTE(review): this subtracts the number of zero-depth *rows*, so a
        # full drop-out amplicon is subtracted twice — kept as-is to preserve
        # the original behaviour; confirm whether that is intended.
        fully_covered = int(total_amplicons) - len(zero_depth_ids)

        return (fully_covered, half_covered, full_drop_out)
def make_report_input_csv(coverage_file, amplicon_file, output_file):
    """Combine coverage and amplicon summaries into one TSV report section.

    Appends a header row and a value row to ``output_file`` (append mode,
    so repeated calls accumulate sections in the same file).
    """
    coverage_info = parsing_cov(coverage_file)
    amplicon_info = parsing_amplicon(amplicon_file)
    # Echo both summaries to stdout (useful in pipeline logs).
    print(coverage_info)
    print(amplicon_info)

    header = [
        "Total number of amplicons fully covered",
        "Amplicons partially covered",
        "Drop-out amplicons",
        "Total number aligned reads",
        "Coverage",
        "Mean depth",
    ]
    # Amplicon stats first, then coverage stats — matching the header order.
    fields = list(amplicon_info) + list(coverage_info)

    with open(output_file, "a") as report:
        tsv_writer = csv.writer(report, delimiter="\t")
        tsv_writer.writerow(header)
        tsv_writer.writerow(fields)
if __name__ == '__main__':
    # CLI usage: <coverage_file> <amplicon_file> <output_file>
    make_report_input_csv(sys.argv[1], sys.argv[2], sys.argv[3])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import re | ||
import os | ||
import csv | ||
import sys | ||
|
||
def parse_vep(vep_dir, input, output):
    """Convert a VEP (Variant Effect Predictor) text output into a CSV.

    A VEP file starts with ``##`` comment lines followed by a
    ``#Uploaded_variation`` header line. Everything from the header onward
    is copied to a temporary file (with the leading ``#`` stripped), read
    back as TSV, and written to ``output`` as CSV with an extra SYMBOL
    column holding the gene name extracted from the Extra field.

    Parameters
    ----------
    vep_dir : directory used for the intermediate ``tmp.txt`` file
    input : path of the raw VEP output file to parse
    output : path of the CSV file to write

    Raises
    ------
    Exception if no ``#Uploaded_variation`` header line is found.
    """
    header_line = None
    tmp_path = os.path.join(vep_dir, "tmp.txt")

    with open(input, 'r') as file:
        with open(tmp_path, 'w+') as clean_file:
            # Copy from the header line onward, dropping its leading '#'
            # so it can be consumed as a regular TSV row below.
            for num, line in enumerate(file, 1):
                if re.match("#Uploaded_variation", line):
                    header_line = num
                    clean_file.write(line[1:])
                    break
            # Bug fix: the original tested `header_line` immediately after
            # initialising it to None, so the format check could never fire.
            # Validate after scanning for the header instead.
            if header_line is None:
                raise Exception("file has not the correct format or does not has to be parsed anymore")
            # The file iterator resumes right after the header line.
            for line in file:
                clean_file.write(line)

    with open(tmp_path, 'r') as file:
        with open(output, 'w') as out_handle:
            reader = csv.reader(file, delimiter="\t")
            writer = csv.writer(out_handle, lineterminator='\n')

            # Build output rows with a new SYMBOL column for gene names.
            rows_out = []
            header = next(reader)
            header.append("SYMBOL")
            rows_out.append(header)

            for row in reader:
                # The Extra column (index 13) is a ';'-separated list of
                # KEY=VALUE pairs; pull out SYMBOL (gene name) if present.
                extra = row[13].split(";")
                for element in extra:
                    if re.search(r"SYMBOL=", element):
                        gene = element.split("=")[1:][0]
                        row.append(gene)
                rows_out.append(row)

            writer.writerows(rows_out)

    # Remove the intermediate file (resolves the old TODO).
    os.remove(tmp_path)
if __name__ == '__main__':
    # CLI usage: <vep_dir> <input_vep_file> <output_csv>
    parse_vep(sys.argv[1], sys.argv[2], sys.argv[3])
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
---
title: "SARS-CoV-2 QC report"
date: '`r format(as.POSIXct(if ("" != Sys.getenv("SOURCE_DATE_EPOCH")) { as.numeric(Sys.getenv("SOURCE_DATE_EPOCH")) } else { Sys.time() }, origin="1970-01-01"), "%Y-%m-%d %H:%M:%S")`'
params:
  coverage_file: ''
  sample_name: ''
theme: darkly
---

```{r setup, include=FALSE, message = FALSE, warning = FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
# NOTE(review): params$workdir is not declared in the YAML params above —
# confirm it is supplied by the caller or add it to params.
knitr::opts_knit$set(root.dir = params$workdir)
library(DT)
library(ggplot2)
library(dplyr)
library(tidyr)
library(qpcR)
library(stringr)
library(magrittr)
library(openssl)
```

# Input Settings
```{r InputSettings, echo = FALSE, include = FALSE}
coverage_file <- params$coverage_file
sample_name <- params$sample_name
```

```{r printInputSettings, echo = FALSE, include = FALSE}
# coverage_file <- '/home/vfs/PycharmProjects/Akalinlab_pathogenomics/PiGx_WW/tests/coverage/Test_merged_covs.csv'
# sample_name <- "Test"
```

### Sample: `r sample_name`

```{r reading data, include = FALSE}
coverage.df <- read.table(coverage_file, sep = "\t", header = T)
coverage.df <- na_if(coverage.df, "[]")
# Strip the Python-list brackets surrounding multi-value fields.
coverage.df <- gsub('\\[', '', coverage.df)
coverage.df <- gsub('\\]', '', coverage.df)
# fixme: this can be sure done in a nicer and more compact way
```

## Amplicon coverage
The information about the amplicon primer was taken from the [ARTIC bioinformatics platform](https://github.com/joshquick/artic-ncov2019/blob/master/primer_schemes/nCoV-2019/V3/nCoV-2019.bed).
The reference name "MN908947.3" was changed to "NC_045512.2". Both are describing the same reference genome for
[SARS-CoV-2](https://www.ncbi.nlm.nih.gov/nuccore/1798174254).

```{r, message=F, warning=F, echo = FALSE}
AmpliconCoverage <- data.frame("Number of amplicon fully covered" = paste0(coverage.df[1], ' out of 98'),
                               "Amplicons partially covered" = coverage.df[2],
                               "Drop-out amplicons" = coverage.df[3])
DT::datatable(t(AmpliconCoverage), options = list(searching = FALSE, paging = FALSE))
```

## Coverage

```{r, message=F, warning=F, echo = FALSE}
# Column order in the report input is: aligned reads [4], coverage % [5],
# mean depth [6] — the labels below were previously swapped.
Coverage <- data.frame("Aligned reads" = coverage.df[4],
                       "Total coverage" = paste(round(as.integer(coverage.df[5]), 1), "%"),
                       "Mean depth" = round(as.integer(coverage.df[6])))
DT::datatable(t(Coverage), options = list(searching = FALSE, paging = FALSE))
```

## FastQC and Trimming

Download: [fastqc report of unprocessed data](/path/to/fastqc.html)
Download: [fastqc report of pre-processed data](/path/to/fastqc.html)

The reads were pre-processed as followed:

| | |
|:-----|:----|
|trim length cutoff prinseq |0.2|

## Download Stats csv
```{r, include = F}
write.csv2(coverage.df, paste0(dirname(coverage_file), "/report_download_coverage.csv"))
# Embed the coverage csv as a base64 data URI so the report is self-contained.
readLines(coverage_file) %>%
  paste0(collapse="\n") %>%
  openssl::base64_encode() -> coverage
```
Coverage statistics:
[Download Coverage_stats.csv](`r sprintf('data:text/csv;base64,%s', coverage)`)
File renamed without changes.
Oops, something went wrong.