Skip to content

Commit

Permalink
focus on relevant scripts, add sigmut input
Browse files Browse the repository at this point in the history
  • Loading branch information
dohmjan committed May 7, 2021
1 parent 1ac66a0 commit 4390e17
Show file tree
Hide file tree
Showing 17 changed files with 458 additions and 221 deletions.
File renamed without changes.
103 changes: 0 additions & 103 deletions scripts/ensemble_vep.py

This file was deleted.

86 changes: 86 additions & 0 deletions scripts/get_qc_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import sys
import re
import csv


def parsing_cov(coverage_file):
    """Extract alignment statistics from a samtools-coverage style TSV.

    Skips comment/header rows (first field starting with "#") and returns a
    tuple of strings ``(aligned_reads, total_cov, mean_depth)`` taken from
    columns 3, 5 and 6 of the first data row.  Returns ``None`` implicitly
    when the file contains no data row.
    """
    # assumes samtools-coverage column order (numreads, covbases, coverage,
    # meandepth in cols 3-6) -- TODO confirm against the producing rule
    with open(coverage_file, "r") as cov_file:
        reader = csv.reader(cov_file, delimiter="\t")

        for row in reader:
            # The original tested every field for a leading "#", so a
            # "#rname ..." header row returned its own column titles as data.
            # Only the first field decides whether a row is a comment/header.
            if not row or row[0].startswith("#"):
                continue
            return (row[3], row[5], row[6])


def parsing_amplicon(amplicon_file):
    """Summarise per-amplicon coverage from a BED-like TSV.

    The amplicon number is taken from the second "_"-separated token of
    column 3 (e.g. "nCoV_7_pool1" -> "7"); column 6 holds the read count.
    An amplicon with zero coverage in both of its two rows is a full
    drop-out; zero coverage in exactly one row means partially covered.

    Returns a tuple ``(fully_covered_count, half_covered, full_drop_out)``
    where the last two are lists of amplicon numbers (strings).
    """
    with open(amplicon_file, "r") as handle:
        records = list(csv.reader(handle, delimiter="\t"))

    distinct_count = 0
    previous = 0
    zero_cov = []

    for record in records:
        name = record[3].split("_")[1]

        # Count each amplicon once even though it appears on two rows
        # (the rows for one amplicon are adjacent in the file).
        if name != str(previous):
            previous = name
            distinct_count += 1

        if int(record[6]) == 0:
            zero_cov.append(name)

    full_drop_out = []
    half_covered = []
    for name in zero_cov:
        occurrences = zero_cov.count(name)
        if occurrences == 2:
            full_drop_out.append(name)
        if occurrences == 1:
            half_covered.append(name)

    full_drop_out = list(set(full_drop_out))
    fully_covered = distinct_count - len(zero_cov)

    return (fully_covered, half_covered, full_drop_out)


def make_report_input_csv(coverage_file, amplicon_file, output_file):
    """Combine coverage and amplicon QC metrics into one small TSV table.

    Writes a header row followed by a single value row to *output_file*.

    Parameters
    ----------
    coverage_file : TSV consumed by :func:`parsing_cov`.
    amplicon_file : TSV consumed by :func:`parsing_amplicon`.
    output_file : path of the TSV table to write (consumed by the R report).
    """
    # Removed leftover debug print() calls from the original.
    aligned_reads, total_cov, mean_depth = parsing_cov(coverage_file)
    fully_covered, half_covered, drop_outs = parsing_amplicon(amplicon_file)

    header = [
        "Total number of amplicons fully covered",
        "Amplicons partially covered",
        "Drop-out amplicons",
        "Total number aligned reads",
        "Coverage",
        "Mean depth",
    ]
    fields = [
        fully_covered,
        half_covered,
        drop_outs,
        aligned_reads,
        total_cov,
        mean_depth,
    ]

    # Append mode kept from the original, so reruns add extra rows.
    # NOTE(review): if one table per file is intended, this should be "w"
    # -- confirm against the workflow that consumes this file.
    with open(output_file, "a") as f:
        writer = csv.writer(f, delimiter="\t")
        writer.writerow(header)
        writer.writerow(fields)

if __name__ == "__main__":
    # CLI: get_qc_table.py <coverage_file> <amplicon_file> <output_file>
    make_report_input_csv(sys.argv[1], sys.argv[2], sys.argv[3])
50 changes: 50 additions & 0 deletions scripts/parse_vep.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import re
import os
import csv
import sys

def parse_vep(vep_dir, input, output):
    """Convert a VEP text output into a CSV with an extra SYMBOL column.

    Parameters
    ----------
    vep_dir : directory used for the intermediate "tmp.txt" file.
    input : path to the raw VEP output ("##" preamble lines followed by a
        "#Uploaded_variation" header line and tab-separated records).
    output : path of the CSV file to write.

    Raises
    ------
    Exception
        If no "#Uploaded_variation" header line is found (the file is not a
        VEP output, or was already parsed).

    Note: the parameter names ``input``/``output`` shadow builtins but are
    kept for backward compatibility with existing callers.
    """
    tmp_path = os.path.join(vep_dir, "tmp.txt")

    # Pass 1: drop the "##" preamble and strip the leading "#" off the
    # header line so csv.reader sees a plain table.
    header_found = False
    with open(input, "r") as raw_file:
        with open(tmp_path, "w") as clean_file:
            for line in raw_file:
                if header_found:
                    clean_file.write(line)
                elif re.match("#Uploaded_variation", line):
                    header_found = True
                    clean_file.write(line[1:])  # drop the leading '#'
    if not header_found:
        # The original guard for this condition was dead code (it tested
        # header_line before the scan ever ran); raise only after scanning.
        raise Exception(
            "file has not the correct format or does not has to be parsed anymore"
        )

    # Pass 2: append a SYMBOL column extracted from the Extra field
    # (column 14 of the default VEP tabular output).
    with open(tmp_path, "r") as tsv_file:
        with open(output, "w") as out_file:
            reader = csv.reader(tsv_file, delimiter="\t")
            writer = csv.writer(out_file, lineterminator="\n")

            header = next(reader)
            header.append("SYMBOL")
            rows = [header]

            for row in reader:
                # Extra is a ";"-separated key=value list, e.g.
                # "IMPACT=LOW;SYMBOL=ORF1ab". startswith() replaces the
                # original re.search so keys merely containing "SYMBOL="
                # cannot match by accident.
                for element in row[13].split(";"):
                    if element.startswith("SYMBOL="):
                        row.append(element.split("=", 1)[1])
                rows.append(row)

            writer.writerows(rows)

    os.remove(tmp_path)  # resolves the original "clean tmp.txt" TODO

if __name__ == "__main__":
    # CLI: parse_vep.py <vep_dir> <input_file> <output_file>
    parse_vep(sys.argv[1], sys.argv[2], sys.argv[3])
21 changes: 0 additions & 21 deletions scripts/prep_sample_space_210320.py

This file was deleted.

89 changes: 89 additions & 0 deletions scripts/qc_report_per_sample.rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
---
title: "SARS-CoV-2 QC report"
date: '`r format(as.POSIXct(if ("" != Sys.getenv("SOURCE_DATE_EPOCH")) { as.numeric(Sys.getenv("SOURCE_DATE_EPOCH")) } else { Sys.time() }, origin="1970-01-01"), "%Y-%m-%d %H:%M:%S")`'
params:
coverage_file: ''
sample_name: ''
theme: darkly
---

```{r setup, include=FALSE, message = FALSE, warning = FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
knitr::opts_knit$set(root.dir = params$workdir)
library(DT)
library(ggplot2)
library(dplyr)
library(tidyr)
library(qpcR)
library(stringr)
library(magrittr)
library(openssl)
```
# Input Settings
```{r InputSettings, echo = FALSE, include = FALSE}
coverage_file <- params$coverage_file
sample_name <- params$sample_name
```
```{r printInputSettings, echo = FALSE, include = FALSE}
# coverage_file <- '/home/vfs/PycharmProjects/Akalinlab_pathogenomics/PiGx_WW/tests/coverage/Test_merged_covs.csv'
# sample_name <- "Test"
```
### Sample: `r sample_name`
```{r reading data, include = FALSE}
coverage.df <- read.table(coverage_file,sep = "\t", header = T)
coverage.df <- na_if(coverage.df, "[]")
coverage.df <- gsub('\\[','',coverage.df)
coverage.df <- gsub('\\]','',coverage.df)
# fixme: this can be sure done in a nicer and more compact way
```
## Amplicon coverage
The information about the amplicon primer was taken from the [ARTIC bioinformatics platform](https://github.com/joshquick/artic-ncov2019/blob/master/primer_schemes/nCoV-2019/V3/nCoV-2019.bed).
The reference name "MN908947.3" was changed to "NC_045512.2". Both names describe the same reference genome for
[SARS-CoV-2](https://www.ncbi.nlm.nih.gov/nuccore/1798174254).

```{r, message=F, warning=F, echo = FALSE}
# Amplicon coverage summary; column labels are user-facing in the report.
# Fixed the label typo "Anmplicons" -> "Amplicons".
AmpliconCoverage <- data.frame("Number of amplicon fully covered" = paste0(coverage.df[1],' out of 98'),
                               "Amplicons partially covered" = coverage.df[2],
                               "Drop-out amplicons" = coverage.df[3]) # maybe more clear title stating which coverage
DT::datatable(t(AmpliconCoverage), options = list(searching = FALSE, paging = FALSE))
```

## Coverage

```{r, message=F, warning=F, echo = FALSE}
# Columns 4-6 of the QC table are (aligned reads, coverage %, mean depth) in
# that order -- see get_qc_table.py. The original chunk had "Total coverage"
# and "Aligned reads" reading each other's column; swapped back here.
# as.numeric (not as.integer) keeps the decimal part of the percentage
# so round(..., 1) has an effect.
Coverage <- data.frame("Total coverage" = paste(round(as.numeric(coverage.df[5]), 1), "%"),
                       "Aligned reads" = coverage.df[4],
                       "Mean depth" = round(as.numeric(coverage.df[6]))) # maybe more clear title stating which coverage
DT::datatable(t(Coverage), options = list(searching = FALSE, paging = FALSE))
```
## FastQC and Trimming

Download: [fastqc report of unprocessed data](/path/to/fastqc.html)
Download: [fastqc report of pre-processed data](/path/to/fastqc.html)



The reads were pre-processed as follows:

| | |
|:-----|:----|
|trim length cutoff prinseq |0.2|

## Download Stats csv
```{r, include = F}
write.csv2(coverage.df, paste0(dirname(coverage_file),"/report_download_coverage.csv")) # not sure about the funct of the dataframe here, but it works so I leave it for now
readLines(coverage_file) %>%
paste0(collapse="\n") %>%
openssl::base64_encode() -> coverage
```
Coverage statistics:
[Download Coverage_stats.csv](`r sprintf('data:text/csv;base64,%s', coverage)`)

File renamed without changes.
Loading

0 comments on commit 4390e17

Please sign in to comment.