From a6b0b08186d27c64955adb2aa2e641e6ab13624d Mon Sep 17 00:00:00 2001 From: Dohmen Date: Thu, 6 May 2021 17:38:34 +0200 Subject: [PATCH 1/4] add kraken and krona reports --- scripts/kraken_report.Rmd | 58 +++++++++++++++++++++++++ scripts/run_kraken_report.R | 75 ++++++++++++++++++++++++++++++++ snakefile.py | 87 +++++++++++++++++++++++++------------ 3 files changed, 193 insertions(+), 27 deletions(-) create mode 100644 scripts/kraken_report.Rmd create mode 100644 scripts/run_kraken_report.R diff --git a/scripts/kraken_report.Rmd b/scripts/kraken_report.Rmd new file mode 100644 index 00000000..7ea1adb8 --- /dev/null +++ b/scripts/kraken_report.Rmd @@ -0,0 +1,58 @@ +--- +title: "Kraken2-report" +#author: "mfaxel" +date: '`r format(as.POSIXct(if ("" != Sys.getenv("SOURCE_DATE_EPOCH")) { as.numeric(Sys.getenv("SOURCE_DATE_EPOCH")) } else { Sys.time() }, origin="1970-01-01"), "%Y-%m-%d %H:%M:%S")`' +#output: html_document +params: + #sample_dir: '../../tests/sample_data/' # must be kraken REPORT file + #sample_name: 'tmp_unaligned_krakenReport' + sample_name: "" + kraken_output: "" + +--- + + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) +knitr::opts_knit$set(root.dir = params$workdir) +library(DT) +``` + + + + +```{r, echo=FALSE} +#input +sampleName <- params$sample_name +#sampleDir <- params$sample_dir +#krakenReportFile <- paste0(params$sample_dir, params$sample_name) +# krakenReportFile <- paste0(params$sample_dir, params$sample_name, ".txt") +krakenReportFile <- kraken_output +``` + + +## Kraken2 report on unaligned reads from the Sars-Cov2 enriched samples + +```{r,echo=F} +print(paste( "Sample: ",sampleName)) +``` + + +The Sars-Cov2 enriched wastewater samples are aligned to the Virus genome. The unaligned reads, that are left are aligned with Kraken2 against the [database PlusPFP](https://benlangmead.github.io/aws-indexes/k2). This report is an overview over the species found. + +A table with all the species found: + +* Column explanation: + + Percentage of fragments covered by the clade rooted at this taxon + + Number of fragments covered by the clade rooted at this taxon + + Number of fragments assigned directly to this taxon + + A rank code, indicating (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. Taxa that are not at any of these 10 ranks have a rank code that is formed by using the rank code of the closest ancestor rank with a number indicating the distance from that rank. E.g., "G2" is a rank code indicating a taxon is between genus and species and the grandparent taxon is at the genus rank. + + NCBI taxonomic ID number + + Indented scientific name + +```{r, echo=FALSE} +table<- read.table(krakenReportFile,sep = "\t") +header_table <- c("percent","num_fragments","num_fragments_taxon","rank","NCBI_ID","scientific_name") +colnames(table)<-header_table +datatable(table) +``` diff --git a/scripts/run_kraken_report.R b/scripts/run_kraken_report.R new file mode 100644 index 00000000..742855b4 --- /dev/null +++ b/scripts/run_kraken_report.R @@ -0,0 +1,75 @@ +# Title : run Kraken2 report +# author: mfaxel +# Based on the runDeseqReport.R from Bora Uyar +# Created on: 2.05.21 + +#library(rmarkdown) + +#function to run report +runReport <- function(reportFile, + sample_name, + kraken_output, + output_file, + selfContained = TRUE, + quiet = FALSE) { + +# outFile <- paste0(output_dir,"/",sample_name,"kraken_report.html") + + # htmlwidgets::setWidgetIdSeed(1234) + rmarkdown::render( + input = reportFile, + # output_dir = sample_dir, + # intermediates_dir = file.path(workdir, prefix), + clean = TRUE, + output_file = output_file, + output_format = rmarkdown::html_document( + code_folding = 'hide', + depth = 2, + toc = TRUE, + toc_float = TRUE, + theme = 'lumen', + number_sections = TRUE + ), + output_options = list(self_contained = selfContained), + params = list(kraken_output = kraken_output, + sample_name = sample_name + ), + quiet = quiet + ) + +# if(dir.exists(file.path(sample_dir, sample_name))) { +# unlink(file.path(sample_dir, sample_name), recursive = TRUE) +# } + +} + +#1. Collect arguments +args <- commandArgs(TRUE) + +cat("arguments:", args,"\n") + +## Parse arguments (we expect the form --arg=value) +#parseArgs <- function(x) strsplit(sub("^--", "", x), "=") +parseArgs <- function(x) { + myArgs <- unlist(strsplit(x, "--")) + myArgs <- myArgs[myArgs != ''] + #when no values are provided for the argument + myArgs <- gsub(pattern = "=$", replacement = "= ", x = myArgs) + myArgs <- as.data.frame(do.call(rbind, strsplit(myArgs, "="))) + myArgs$V2 <- gsub(' ', '', myArgs$V2) + return(myArgs) +} + +argsDF <- parseArgs(args) +argsL <- as.list(as.character(argsDF$V2)) +names(argsL) <- argsDF$V1 + +reportFile <- argsL$reportFile +sample_name <- argsL$sample_name +kraken_output <- argsL$kraken_output +output_file <- argsL$output_file + +runReport(reportFile = reportFile, + sample_name = sample_name, + kraken_output = kraken_output, + output_file = output_file) \ No newline at end of file diff --git a/snakefile.py b/snakefile.py index a9bc8e73..66a279df 100644 --- a/snakefile.py +++ b/snakefile.py @@ -28,12 +28,15 @@ # TODO: # 1. obtain things from config file # 2. make dummy samples and put them to tests/sample_data/reads +# 3. align overall appearance READS_DIR = 'tests/sample_data/reads' REFERENCE_FASTA = 'tests/sample_data/NC_045512.2.fasta' -KRAKEN_DB = 'tests/sample_data/kraken_db.db' # download first, where? +KRAKEN_DB = 'tests/databases/kraken_db' +KRONA_DB = 'tests/databases/krona_db' SAMPLE_SHEET_CSV = 'tests/sample_sheet.csv' OUTPUT_DIR = 'output' +OUTPUT_DIR = os.path.join(os.getcwd(), OUTPUT_DIR) TRIMMED_READS_DIR = os.path.join(OUTPUT_DIR, 'trimmed_reads') LOG_DIR = os.path.join(OUTPUT_DIR, 'logs') FASTQC_DIR = os.path.join(OUTPUT_DIR, 'fastqc') @@ -70,9 +73,23 @@ 'files': ( expand(os.path.join(VARIANTS_DIR, '{sample}_snv.csv'), sample=SAMPLES) ) + }, + 'variant_report': { + 'description': "Make variants report.", + 'files': ( + expand(os.path.join(REPORT_DIR, '{sample}_variant_report.html'), sample=SAMPLES) + ) + }, + 'kraken_reports': { + 'description': "Make Kraken reports.", + 'files': ( + expand(os.path.join(REPORT_DIR, '{sample}_kraken_report.html'), sample=SAMPLES) + + expand(os.path.join(REPORT_DIR, '{sample}_krona_report.html'), sample=SAMPLES) + ) } + } -selected_targets = ['lofreq'] +selected_targets = ['kraken_reports'] OUTPUT_FILES = list(chain.from_iterable([targets[name]['files'] for name in selected_targets])) @@ -141,12 +158,19 @@ # same rule to get unaligned .bam # probably something like below, but needs to be checked for our purpose (kraken reports) # "samtools view -bh -F 2 {input} > {output} 2>> {log} 3>&2" -rule samtools_filter: +rule samtools_filter_aligned: input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.sam') output: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.bam') - log: os.path.join(LOG_DIR, 'samtools_filter_{sample}.log') + log: os.path.join(LOG_DIR, 'samtools_filter_aligned_{sample}.log') shell: "samtools view -bh -f 2 -F 2048 {input} > {output} 2>> {log} 3>&2" + +rule samtools_filter_unaligned: + input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.sam') + output: os.path.join(MAPPED_READS_DIR, '{sample}_unaligned.bam') + log: os.path.join(LOG_DIR, 'samtools_filter_unaligned_{sample}.log') + shell: "samtools view -bh -F 2 {input} > {output} 2>> {log} 3>&2" + # probably not needed --> directly converted during filtering step (-b) # rule samtools_sam2bam_A: # input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.sam') @@ -221,28 +245,37 @@ # shell: "Rscript {SCRIPTS_DIR}/run_variant_report.R --reportFile={SCRIPTS_DIR}/variantreport_p_sample.rmd --vep_txt_file={input.vep_txt} --snv_csv_file={input.snv_csv} --location_sigmuts=??? --sample_dir=??? --sample_name={wildcards.sample} >> {log} 2>&1" -# TODO: check command/output as in miro board R1 & R2 is mentioned -# rule bam2fastq: -# input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.bam') -# output: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.fastq') -# log: os.path.join(LOG_DIR, 'bam2fastq_{sample}.log') -# shell: "samtools fastq {input} > {output} >> {log} 2>&1" +rule bam2fastq: + input: os.path.join(MAPPED_READS_DIR, '{sample}_unaligned.bam') + output: os.path.join(MAPPED_READS_DIR, '{sample}_unaligned.fastq') + log: os.path.join(LOG_DIR, 'bam2fastq_{sample}.log') + shell: "samtools fastq {input} > {output} 2>> {log} 3>&2" -# TODO: check command as in miro board there where R1 & R2 inputs -# rule kraken: -# input: -# unaligned_fastq = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.fastq'), -# database = KRAKEN_DB -# output: os.path.join(KRAKEN_DIR, '{sample}_classified_unaligned_reads.txt') -# log: os.path.join(LOG_DIR, 'kraken_{sample}.log') -# shell: "kraken2 --output {output} --db {input.database} --paired {input.unaligned_fastq} >> {log} 2>&1" - - -# TODO: adapt command and possibly input due to 'taxonomy_folder' -# What is meant by taxonomy_folder?? -# rule kraken_report: -# input: os.path.join(KRAKEN_DIR, '{sample}_classified_unaligned_reads.txt') -# output: os.path.join(REPORT_DIR, '{sample}_kraken_report.html') -# log: os.path.join(LOG_DIR, 'kraken_report_{sample}.log') -# shell: "ktImportTaxonomy -m 3 -t 5 {input} -tax [taxonomy folder]??? -o {output} >> {log} 2>&1" +rule kraken: + input: + unaligned_fastq = os.path.join(MAPPED_READS_DIR, '{sample}_unaligned.fastq'), + database = KRAKEN_DB + output: os.path.join(KRAKEN_DIR, '{sample}_classified_unaligned_reads.txt') + log: os.path.join(LOG_DIR, 'kraken_{sample}.log') + shell: "kraken2 --report {output} --db {input.database} {input.unaligned_fastq} >> {log} 2>&1" + + +rule kraken_report: + input: os.path.join(KRAKEN_DIR, '{sample}_classified_unaligned_reads.txt') + output: os.path.join(REPORT_DIR, '{sample}_kraken_report.html') + params: + run_report = os.path.join(SCRIPTS_DIR, 'run_kraken_report.R'), + report_rmd = os.path.join(SCRIPTS_DIR, 'kraken_report.Rmd'), + kraken_output = os.path.join(KRAKEN_DIR, '{sample}_classified_unaligned_reads.txt') + log: os.path.join(LOG_DIR, 'kraken_report_{sample}.log') + shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --kraken_output={params.kraken_output} --sample_name={wildcards.sample} --output_file={output} >> {log} 2>&1" + + +rule krona_report: + input: + kraken_output = os.path.join(KRAKEN_DIR, '{sample}_classified_unaligned_reads.txt'), + database = KRONA_DB + output: os.path.join(REPORT_DIR, '{sample}_krona_report.html') + log: os.path.join(LOG_DIR, 'krona_report_{sample}.log') + shell: "ktImportTaxonomy -m 3 -t 5 {input.kraken_output} -tax {input.database} -o {output} >> {log} 2>&1" From 20b5437ee1796d42886a07456640f07a82bd971b Mon Sep 17 00:00:00 2001 From: Dohmen Date: Fri, 7 May 2021 15:54:20 +0200 Subject: [PATCH 2/4] add input bed file --- tests/sample_data/nCoV-2019_NCref.bed | 218 ++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 tests/sample_data/nCoV-2019_NCref.bed diff --git a/tests/sample_data/nCoV-2019_NCref.bed b/tests/sample_data/nCoV-2019_NCref.bed new file mode 100644 index 00000000..c2264dac --- /dev/null +++ b/tests/sample_data/nCoV-2019_NCref.bed @@ -0,0 +1,218 @@ +NC_045512.2 30 54 nCoV-2019_1_LEFT nCoV-2019_1 + +NC_045512.2 385 410 nCoV-2019_1_RIGHT nCoV-2019_1 - +NC_045512.2 320 342 nCoV-2019_2_LEFT nCoV-2019_2 + +NC_045512.2 704 726 nCoV-2019_2_RIGHT nCoV-2019_2 - +NC_045512.2 642 664 nCoV-2019_3_LEFT nCoV-2019_1 + +NC_045512.2 1004 1028 nCoV-2019_3_RIGHT nCoV-2019_1 - +NC_045512.2 943 965 nCoV-2019_4_LEFT nCoV-2019_2 + +NC_045512.2 1312 1337 nCoV-2019_4_RIGHT nCoV-2019_2 - +NC_045512.2 1242 1264 nCoV-2019_5_LEFT nCoV-2019_1 + +NC_045512.2 1623 1651 nCoV-2019_5_RIGHT nCoV-2019_1 - +NC_045512.2 1573 1595 nCoV-2019_6_LEFT nCoV-2019_2 + +NC_045512.2 1942 1964 nCoV-2019_6_RIGHT nCoV-2019_2 - +NC_045512.2 1875 1897 nCoV-2019_7_LEFT nCoV-2019_1 + +NC_045512.2 1868 1890 nCoV-2019_7_LEFT_alt0 nCoV-2019_1 + +NC_045512.2 2247 2269 nCoV-2019_7_RIGHT nCoV-2019_1 - +NC_045512.2 2242 2264 nCoV-2019_7_RIGHT_alt5 nCoV-2019_1 - +NC_045512.2 2181 2205 nCoV-2019_8_LEFT nCoV-2019_2 + +NC_045512.2 2568 2592 nCoV-2019_8_RIGHT nCoV-2019_2 - +NC_045512.2 2505 2529 nCoV-2019_9_LEFT nCoV-2019_1 + +NC_045512.2 2504 2528 nCoV-2019_9_LEFT_alt4 nCoV-2019_1 + +NC_045512.2 2882 2904 nCoV-2019_9_RIGHT nCoV-2019_1 - +NC_045512.2 2880 2902 nCoV-2019_9_RIGHT_alt2 nCoV-2019_1 - +NC_045512.2 2826 2850 nCoV-2019_10_LEFT nCoV-2019_2 + +NC_045512.2 3183 3210 nCoV-2019_10_RIGHT nCoV-2019_2 - +NC_045512.2 3144 3166 nCoV-2019_11_LEFT nCoV-2019_1 + +NC_045512.2 3507 3531 nCoV-2019_11_RIGHT nCoV-2019_1 - +NC_045512.2 3460 3482 nCoV-2019_12_LEFT nCoV-2019_2 + +NC_045512.2 3826 3853 nCoV-2019_12_RIGHT nCoV-2019_2 - +NC_045512.2 3771 3795 nCoV-2019_13_LEFT nCoV-2019_1 + +NC_045512.2 4142 4164 nCoV-2019_13_RIGHT nCoV-2019_1 - +NC_045512.2 4054 4077 nCoV-2019_14_LEFT nCoV-2019_2 + +NC_045512.2 4044 4068 nCoV-2019_14_LEFT_alt4 nCoV-2019_2 + +NC_045512.2 4428 4450 nCoV-2019_14_RIGHT nCoV-2019_2 - +NC_045512.2 4402 4424 nCoV-2019_14_RIGHT_alt2 nCoV-2019_2 - +NC_045512.2 4294 4321 nCoV-2019_15_LEFT nCoV-2019_1 + +NC_045512.2 4296 4322 nCoV-2019_15_LEFT_alt1 nCoV-2019_1 + +NC_045512.2 4674 4696 nCoV-2019_15_RIGHT nCoV-2019_1 - +NC_045512.2 4666 4689 nCoV-2019_15_RIGHT_alt3 nCoV-2019_1 - +NC_045512.2 4636 4658 nCoV-2019_16_LEFT nCoV-2019_2 + +NC_045512.2 4995 5017 nCoV-2019_16_RIGHT nCoV-2019_2 - +NC_045512.2 4939 4966 nCoV-2019_17_LEFT nCoV-2019_1 + +NC_045512.2 5296 5321 nCoV-2019_17_RIGHT nCoV-2019_1 - +NC_045512.2 5230 5259 nCoV-2019_18_LEFT nCoV-2019_2 + +NC_045512.2 5257 5287 nCoV-2019_18_LEFT_alt2 nCoV-2019_2 + +NC_045512.2 5620 5644 nCoV-2019_18_RIGHT nCoV-2019_2 - +NC_045512.2 5620 5643 nCoV-2019_18_RIGHT_alt1 nCoV-2019_2 - +NC_045512.2 5563 5586 nCoV-2019_19_LEFT nCoV-2019_1 + +NC_045512.2 5932 5957 nCoV-2019_19_RIGHT nCoV-2019_1 - +NC_045512.2 5867 5894 nCoV-2019_20_LEFT nCoV-2019_2 + +NC_045512.2 6247 6272 nCoV-2019_20_RIGHT nCoV-2019_2 - +NC_045512.2 6167 6196 nCoV-2019_21_LEFT nCoV-2019_1 + +NC_045512.2 6168 6197 nCoV-2019_21_LEFT_alt2 nCoV-2019_1 + +NC_045512.2 6528 6550 nCoV-2019_21_RIGHT nCoV-2019_1 - +NC_045512.2 6526 6548 nCoV-2019_21_RIGHT_alt0 nCoV-2019_1 - +NC_045512.2 6466 6495 nCoV-2019_22_LEFT nCoV-2019_2 + +NC_045512.2 6846 6873 nCoV-2019_22_RIGHT nCoV-2019_2 - +NC_045512.2 6718 6745 nCoV-2019_23_LEFT nCoV-2019_1 + +NC_045512.2 7092 7117 nCoV-2019_23_RIGHT nCoV-2019_1 - +NC_045512.2 7035 7058 nCoV-2019_24_LEFT nCoV-2019_2 + +NC_045512.2 7389 7415 nCoV-2019_24_RIGHT nCoV-2019_2 - +NC_045512.2 7305 7332 nCoV-2019_25_LEFT nCoV-2019_1 + +NC_045512.2 7671 7694 nCoV-2019_25_RIGHT nCoV-2019_1 - +NC_045512.2 7626 7651 nCoV-2019_26_LEFT nCoV-2019_2 + +NC_045512.2 7997 8019 nCoV-2019_26_RIGHT nCoV-2019_2 - +NC_045512.2 7943 7968 nCoV-2019_27_LEFT nCoV-2019_1 + +NC_045512.2 8319 8341 nCoV-2019_27_RIGHT nCoV-2019_1 - +NC_045512.2 8249 8275 nCoV-2019_28_LEFT nCoV-2019_2 + +NC_045512.2 8635 8661 nCoV-2019_28_RIGHT nCoV-2019_2 - +NC_045512.2 8595 8619 nCoV-2019_29_LEFT nCoV-2019_1 + +NC_045512.2 8954 8983 nCoV-2019_29_RIGHT nCoV-2019_1 - +NC_045512.2 8888 8913 nCoV-2019_30_LEFT nCoV-2019_2 + +NC_045512.2 9245 9271 nCoV-2019_30_RIGHT nCoV-2019_2 - +NC_045512.2 9204 9226 nCoV-2019_31_LEFT nCoV-2019_1 + +NC_045512.2 9557 9585 nCoV-2019_31_RIGHT nCoV-2019_1 - +NC_045512.2 9477 9502 nCoV-2019_32_LEFT nCoV-2019_2 + +NC_045512.2 9834 9858 nCoV-2019_32_RIGHT nCoV-2019_2 - +NC_045512.2 9784 9806 nCoV-2019_33_LEFT nCoV-2019_1 + +NC_045512.2 10146 10171 nCoV-2019_33_RIGHT nCoV-2019_1 - +NC_045512.2 10076 10099 nCoV-2019_34_LEFT nCoV-2019_2 + +NC_045512.2 10437 10459 nCoV-2019_34_RIGHT nCoV-2019_2 - +NC_045512.2 10362 10384 nCoV-2019_35_LEFT nCoV-2019_1 + +NC_045512.2 10737 10763 nCoV-2019_35_RIGHT nCoV-2019_1 - +NC_045512.2 10666 10688 nCoV-2019_36_LEFT nCoV-2019_2 + +NC_045512.2 11048 11074 nCoV-2019_36_RIGHT nCoV-2019_2 - +NC_045512.2 10999 11022 nCoV-2019_37_LEFT nCoV-2019_1 + +NC_045512.2 11372 11394 nCoV-2019_37_RIGHT nCoV-2019_1 - +NC_045512.2 11306 11331 nCoV-2019_38_LEFT nCoV-2019_2 + +NC_045512.2 11668 11693 nCoV-2019_38_RIGHT nCoV-2019_2 - +NC_045512.2 11555 11584 nCoV-2019_39_LEFT nCoV-2019_1 + +NC_045512.2 11927 11949 nCoV-2019_39_RIGHT nCoV-2019_1 - +NC_045512.2 11863 11889 nCoV-2019_40_LEFT nCoV-2019_2 + +NC_045512.2 12234 12256 nCoV-2019_40_RIGHT nCoV-2019_2 - +NC_045512.2 12110 12133 nCoV-2019_41_LEFT nCoV-2019_1 + +NC_045512.2 12465 12490 nCoV-2019_41_RIGHT nCoV-2019_1 - +NC_045512.2 12417 12439 nCoV-2019_42_LEFT nCoV-2019_2 + +NC_045512.2 12779 12802 nCoV-2019_42_RIGHT nCoV-2019_2 - +NC_045512.2 12710 12732 nCoV-2019_43_LEFT nCoV-2019_1 + +NC_045512.2 13074 13096 nCoV-2019_43_RIGHT nCoV-2019_1 - +NC_045512.2 13005 13027 nCoV-2019_44_LEFT nCoV-2019_2 + +NC_045512.2 13007 13029 nCoV-2019_44_LEFT_alt3 nCoV-2019_2 + +NC_045512.2 13378 13400 nCoV-2019_44_RIGHT nCoV-2019_2 - +NC_045512.2 13363 13385 nCoV-2019_44_RIGHT_alt0 nCoV-2019_2 - +NC_045512.2 13319 13344 nCoV-2019_45_LEFT nCoV-2019_1 + +NC_045512.2 13307 13336 nCoV-2019_45_LEFT_alt2 nCoV-2019_1 + +NC_045512.2 13669 13699 nCoV-2019_45_RIGHT nCoV-2019_1 - +NC_045512.2 13660 13689 nCoV-2019_45_RIGHT_alt7 nCoV-2019_1 - +NC_045512.2 13599 13621 nCoV-2019_46_LEFT nCoV-2019_2 + +NC_045512.2 13602 13625 nCoV-2019_46_LEFT_alt1 nCoV-2019_2 + +NC_045512.2 13962 13984 nCoV-2019_46_RIGHT nCoV-2019_2 - +NC_045512.2 13961 13984 nCoV-2019_46_RIGHT_alt2 nCoV-2019_2 - +NC_045512.2 13918 13946 nCoV-2019_47_LEFT nCoV-2019_1 + +NC_045512.2 14271 14299 nCoV-2019_47_RIGHT nCoV-2019_1 - +NC_045512.2 14207 14232 nCoV-2019_48_LEFT nCoV-2019_2 + +NC_045512.2 14579 14601 nCoV-2019_48_RIGHT nCoV-2019_2 - +NC_045512.2 14545 14570 nCoV-2019_49_LEFT nCoV-2019_1 + +NC_045512.2 14898 14926 nCoV-2019_49_RIGHT nCoV-2019_1 - +NC_045512.2 14865 14895 nCoV-2019_50_LEFT nCoV-2019_2 + +NC_045512.2 15224 15246 nCoV-2019_50_RIGHT nCoV-2019_2 - +NC_045512.2 15171 15193 nCoV-2019_51_LEFT nCoV-2019_1 + +NC_045512.2 15538 15560 nCoV-2019_51_RIGHT nCoV-2019_1 - +NC_045512.2 15481 15503 nCoV-2019_52_LEFT nCoV-2019_2 + +NC_045512.2 15861 15886 nCoV-2019_52_RIGHT nCoV-2019_2 - +NC_045512.2 15827 15851 nCoV-2019_53_LEFT nCoV-2019_1 + +NC_045512.2 16186 16209 nCoV-2019_53_RIGHT nCoV-2019_1 - +NC_045512.2 16118 16144 nCoV-2019_54_LEFT nCoV-2019_2 + +NC_045512.2 16485 16510 nCoV-2019_54_RIGHT nCoV-2019_2 - +NC_045512.2 16416 16444 nCoV-2019_55_LEFT nCoV-2019_1 + +NC_045512.2 16804 16833 nCoV-2019_55_RIGHT nCoV-2019_1 - +NC_045512.2 16748 16770 nCoV-2019_56_LEFT nCoV-2019_2 + +NC_045512.2 17130 17152 nCoV-2019_56_RIGHT nCoV-2019_2 - +NC_045512.2 17065 17087 nCoV-2019_57_LEFT nCoV-2019_1 + +NC_045512.2 17430 17452 nCoV-2019_57_RIGHT nCoV-2019_1 - +NC_045512.2 17381 17406 nCoV-2019_58_LEFT nCoV-2019_2 + +NC_045512.2 17738 17761 nCoV-2019_58_RIGHT nCoV-2019_2 - +NC_045512.2 17674 17697 nCoV-2019_59_LEFT nCoV-2019_1 + +NC_045512.2 18036 18062 nCoV-2019_59_RIGHT nCoV-2019_1 - +NC_045512.2 17966 17993 nCoV-2019_60_LEFT nCoV-2019_2 + +NC_045512.2 18324 18348 nCoV-2019_60_RIGHT nCoV-2019_2 - +NC_045512.2 18253 18275 nCoV-2019_61_LEFT nCoV-2019_1 + +NC_045512.2 18650 18672 nCoV-2019_61_RIGHT nCoV-2019_1 - +NC_045512.2 18596 18618 nCoV-2019_62_LEFT nCoV-2019_2 + +NC_045512.2 18957 18979 nCoV-2019_62_RIGHT nCoV-2019_2 - +NC_045512.2 18896 18918 nCoV-2019_63_LEFT nCoV-2019_1 + +NC_045512.2 19275 19297 nCoV-2019_63_RIGHT nCoV-2019_1 - +NC_045512.2 19204 19232 nCoV-2019_64_LEFT nCoV-2019_2 + +NC_045512.2 19591 19616 nCoV-2019_64_RIGHT nCoV-2019_2 - +NC_045512.2 19548 19570 nCoV-2019_65_LEFT nCoV-2019_1 + +NC_045512.2 19911 19939 nCoV-2019_65_RIGHT nCoV-2019_1 - +NC_045512.2 19844 19866 nCoV-2019_66_LEFT nCoV-2019_2 + +NC_045512.2 20231 20255 nCoV-2019_66_RIGHT nCoV-2019_2 - +NC_045512.2 20172 20200 nCoV-2019_67_LEFT nCoV-2019_1 + +NC_045512.2 20542 20572 nCoV-2019_67_RIGHT nCoV-2019_1 - +NC_045512.2 20472 20496 nCoV-2019_68_LEFT nCoV-2019_2 + +NC_045512.2 20867 20890 nCoV-2019_68_RIGHT nCoV-2019_2 - +NC_045512.2 20786 20813 nCoV-2019_69_LEFT nCoV-2019_1 + +NC_045512.2 21146 21169 nCoV-2019_69_RIGHT nCoV-2019_1 - +NC_045512.2 21075 21104 nCoV-2019_70_LEFT nCoV-2019_2 + +NC_045512.2 21427 21455 nCoV-2019_70_RIGHT nCoV-2019_2 - +NC_045512.2 21357 21386 nCoV-2019_71_LEFT nCoV-2019_1 + +NC_045512.2 21716 21743 nCoV-2019_71_RIGHT nCoV-2019_1 - +NC_045512.2 21658 21682 nCoV-2019_72_LEFT nCoV-2019_2 + +NC_045512.2 22013 22038 nCoV-2019_72_RIGHT nCoV-2019_2 - +NC_045512.2 21961 21990 nCoV-2019_73_LEFT nCoV-2019_1 + +NC_045512.2 22324 22346 nCoV-2019_73_RIGHT nCoV-2019_1 - +NC_045512.2 22262 22290 nCoV-2019_74_LEFT nCoV-2019_2 + +NC_045512.2 22626 22650 nCoV-2019_74_RIGHT nCoV-2019_2 - +NC_045512.2 22516 22542 nCoV-2019_75_LEFT nCoV-2019_1 + +NC_045512.2 22877 22903 nCoV-2019_75_RIGHT nCoV-2019_1 - +NC_045512.2 22797 22819 nCoV-2019_76_LEFT nCoV-2019_2 + +NC_045512.2 22798 22821 nCoV-2019_76_LEFT_alt3 nCoV-2019_2 + +NC_045512.2 23192 23214 nCoV-2019_76_RIGHT nCoV-2019_2 - +NC_045512.2 23189 23212 nCoV-2019_76_RIGHT_alt0 nCoV-2019_2 - +NC_045512.2 23122 23144 nCoV-2019_77_LEFT nCoV-2019_1 + +NC_045512.2 23500 23522 nCoV-2019_77_RIGHT nCoV-2019_1 - +NC_045512.2 23443 23466 nCoV-2019_78_LEFT nCoV-2019_2 + +NC_045512.2 23822 23847 nCoV-2019_78_RIGHT nCoV-2019_2 - +NC_045512.2 23789 23812 nCoV-2019_79_LEFT nCoV-2019_1 + +NC_045512.2 24145 24169 nCoV-2019_79_RIGHT nCoV-2019_1 - +NC_045512.2 24078 24100 nCoV-2019_80_LEFT nCoV-2019_2 + +NC_045512.2 24443 24467 nCoV-2019_80_RIGHT nCoV-2019_2 - +NC_045512.2 24391 24416 nCoV-2019_81_LEFT nCoV-2019_1 + +NC_045512.2 24765 24789 nCoV-2019_81_RIGHT nCoV-2019_1 - +NC_045512.2 24696 24721 nCoV-2019_82_LEFT nCoV-2019_2 + +NC_045512.2 25052 25076 nCoV-2019_82_RIGHT nCoV-2019_2 - +NC_045512.2 24978 25003 nCoV-2019_83_LEFT nCoV-2019_1 + +NC_045512.2 25347 25369 nCoV-2019_83_RIGHT nCoV-2019_1 - +NC_045512.2 25279 25301 nCoV-2019_84_LEFT nCoV-2019_2 + +NC_045512.2 25646 25673 nCoV-2019_84_RIGHT nCoV-2019_2 - +NC_045512.2 25601 25623 nCoV-2019_85_LEFT nCoV-2019_1 + +NC_045512.2 25969 25994 nCoV-2019_85_RIGHT nCoV-2019_1 - +NC_045512.2 25902 25924 nCoV-2019_86_LEFT nCoV-2019_2 + +NC_045512.2 26290 26315 nCoV-2019_86_RIGHT nCoV-2019_2 - +NC_045512.2 26197 26219 nCoV-2019_87_LEFT nCoV-2019_1 + +NC_045512.2 26566 26590 nCoV-2019_87_RIGHT nCoV-2019_1 - +NC_045512.2 26520 26542 nCoV-2019_88_LEFT nCoV-2019_2 + +NC_045512.2 26890 26913 nCoV-2019_88_RIGHT nCoV-2019_2 - +NC_045512.2 26835 26857 nCoV-2019_89_LEFT nCoV-2019_1 + +NC_045512.2 26838 26860 nCoV-2019_89_LEFT_alt2 nCoV-2019_1 + +NC_045512.2 27202 27227 nCoV-2019_89_RIGHT nCoV-2019_1 - +NC_045512.2 27190 27215 nCoV-2019_89_RIGHT_alt4 nCoV-2019_1 - +NC_045512.2 27141 27164 nCoV-2019_90_LEFT nCoV-2019_2 + +NC_045512.2 27511 27533 nCoV-2019_90_RIGHT nCoV-2019_2 - +NC_045512.2 27446 27471 nCoV-2019_91_LEFT nCoV-2019_1 + +NC_045512.2 27825 27854 nCoV-2019_91_RIGHT nCoV-2019_1 - +NC_045512.2 27784 27808 nCoV-2019_92_LEFT nCoV-2019_2 + +NC_045512.2 28145 28172 nCoV-2019_92_RIGHT nCoV-2019_2 - +NC_045512.2 28081 28104 nCoV-2019_93_LEFT nCoV-2019_1 + +NC_045512.2 28442 28464 nCoV-2019_93_RIGHT nCoV-2019_1 - +NC_045512.2 28394 28416 nCoV-2019_94_LEFT nCoV-2019_2 + +NC_045512.2 28756 28779 nCoV-2019_94_RIGHT nCoV-2019_2 - +NC_045512.2 28677 28699 nCoV-2019_95_LEFT nCoV-2019_1 + +NC_045512.2 29041 29063 nCoV-2019_95_RIGHT nCoV-2019_1 - +NC_045512.2 28985 29007 nCoV-2019_96_LEFT nCoV-2019_2 + +NC_045512.2 29356 29378 nCoV-2019_96_RIGHT nCoV-2019_2 - +NC_045512.2 29288 29316 nCoV-2019_97_LEFT nCoV-2019_1 + +NC_045512.2 29665 29693 nCoV-2019_97_RIGHT nCoV-2019_1 - +NC_045512.2 29486 29510 nCoV-2019_98_LEFT nCoV-2019_2 + +NC_045512.2 29836 29866 nCoV-2019_98_RIGHT nCoV-2019_2 - From 1ac66a0be696627fb3193af14620d017585a94e8 Mon Sep 17 00:00:00 2001 From: Dohmen Date: Fri, 7 May 2021 15:55:45 +0200 Subject: [PATCH 3/4] add rules for variants and qc, minor changes --- snakefile.py | 159 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 103 insertions(+), 56 deletions(-) diff --git a/snakefile.py b/snakefile.py index 66a279df..39e753f6 100644 --- a/snakefile.py +++ b/snakefile.py @@ -29,14 +29,17 @@ # 1. obtain things from config file # 2. make dummy samples and put them to tests/sample_data/reads # 3. align overall appearance -READS_DIR = 'tests/sample_data/reads' -REFERENCE_FASTA = 'tests/sample_data/NC_045512.2.fasta' -KRAKEN_DB = 'tests/databases/kraken_db' -KRONA_DB = 'tests/databases/krona_db' -SAMPLE_SHEET_CSV = 'tests/sample_sheet.csv' - -OUTPUT_DIR = 'output' -OUTPUT_DIR = os.path.join(os.getcwd(), OUTPUT_DIR) +READS_DIR = os.path.join(os.getcwd(), 'tests/sample_data/reads') +REFERENCE_FASTA = os.path.join(os.getcwd(), 'tests/sample_data/NC_045512.2.fasta') +AMPLICONS_BED = os.path.join(os.getcwd(), 'tests/sample_data/nCoV-2019_NCref.bed') +KRAKEN_DB = os.path.join(os.getcwd(), 'tests/databases/kraken_db') +KRONA_DB = os.path.join(os.getcwd(), 'tests/databases/krona_db') +SIGMUT_DB = os.path.join(os.getcwd(), 'tests/databases/sigmut_db') +VEP_DB = os.path.join(os.getcwd(), 'tests/databases/vep_db') +SAMPLE_SHEET_CSV = os.path.join(os.getcwd(), 'tests/sample_sheet.csv') + +OUTPUT_DIR = os.path.join(os.getcwd(), 'output') + TRIMMED_READS_DIR = os.path.join(OUTPUT_DIR, 'trimmed_reads') LOG_DIR = os.path.join(OUTPUT_DIR, 'logs') FASTQC_DIR = os.path.join(OUTPUT_DIR, 'fastqc') @@ -44,9 +47,10 @@ MAPPED_READS_DIR = os.path.join(OUTPUT_DIR, 'mapped_reads') VARIANTS_DIR = os.path.join(OUTPUT_DIR, 'variants') KRAKEN_DIR = os.path.join(OUTPUT_DIR, 'kraken') +COVERAGE_DIR = os.path.join(OUTPUT_DIR, 'coverage') REPORT_DIR = os.path.join(OUTPUT_DIR, 'report') -SCRIPTS_DIR = 'scripts' +SCRIPTS_DIR = os.path.join(os.getcwd(), 'scripts') # Load sample sheet @@ -63,7 +67,7 @@ 'final_reports': { 'description': "Produce a comprehensive report. This is the default target.", 'files': ( - expand(os.path.join(VARIANTS_DIR, '{sample}_snv.csv'), sample=SAMPLES) + + expand(os.path.join(REPORT_DIR, '{sample}_qc_report.html'), sample=SAMPLES) + expand(os.path.join(REPORT_DIR, '{sample}_variant_report.html'), sample=SAMPLES) + expand(os.path.join(REPORT_DIR, '{sample}_kraken_report.html'), sample=SAMPLES) ) @@ -74,12 +78,18 @@ expand(os.path.join(VARIANTS_DIR, '{sample}_snv.csv'), sample=SAMPLES) ) }, - 'variant_report': { - 'description': "Make variants report.", + 'variant_reports': { + 'description': "Make variants reports.", 'files': ( expand(os.path.join(REPORT_DIR, '{sample}_variant_report.html'), sample=SAMPLES) ) }, + 'qc_reports': { + 'description': "Make QC reports.", + 'files': ( + expand(os.path.join(REPORT_DIR, '{sample}_qc_report.html'), sample=SAMPLES) + ) + }, 'kraken_reports': { 'description': "Make Kraken reports.", 'files': ( @@ -87,9 +97,8 @@ expand(os.path.join(REPORT_DIR, '{sample}_krona_report.html'), sample=SAMPLES) ) } - } -selected_targets = ['kraken_reports'] +selected_targets = ['variant_reports'] OUTPUT_FILES = list(chain.from_iterable([targets[name]['files'] for name in selected_targets])) @@ -130,9 +139,10 @@ r1 = os.path.join(TRIMMED_READS_DIR, "{sample}_1.fastq"), r2 = os.path.join(TRIMMED_READS_DIR, "{sample}_2.fastq") params: - len_cutoff = int(150 * 0.8) # read length * pct_cutoff + len_cutoff = int(150 * 0.8), # read length * pct_cutoff + output = os.path.join(TRIMMED_READS_DIR, "{sample}") log: os.path.join(LOG_DIR, 'prinseq_{sample}.log') - shell: "prinseq-lite.pl -fastq {input.r1} -fastq2 {input.r2} -ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10 -out_good {TRIMMED_READS_DIR}/{wildcards.sample} -out_bad null -min_len {params.len_cutoff} >> {log} 2>&1" + shell: "prinseq-lite.pl -fastq {input.r1} -fastq2 {input.r2} -ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10 -out_good {params.output} -out_bad null -min_len {params.len_cutoff} >> {log} 2>&1" rule bwa_index: @@ -154,10 +164,6 @@ shell: "bwa mem -t {params.threads} {input.ref} {input.fastq} > {output} 2>> {log} 3>&2" -# TODO: also get unaligned .bam --> so far tmp.bam is used -# same rule to get unaligned .bam -# probably something like below, but needs to be checked for our purpose (kraken reports) -# "samtools view -bh -F 2 {input} > {output} 2>> {log} 3>&2" rule samtools_filter_aligned: input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.sam') output: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.bam') @@ -165,27 +171,13 @@ shell: "samtools view -bh -f 2 -F 2048 {input} > {output} 2>> {log} 3>&2" +# TODO: check command to get unaligned ones rule samtools_filter_unaligned: input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.sam') output: os.path.join(MAPPED_READS_DIR, '{sample}_unaligned.bam') log: os.path.join(LOG_DIR, 'samtools_filter_unaligned_{sample}.log') shell: "samtools view -bh -F 2 {input} > {output} 2>> {log} 3>&2" -# probably not needed --> directly converted during filtering step (-b) -# rule samtools_sam2bam_A: -# input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.sam') -# output: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.bam') -# log: os.path.join(LOG_DIR, 'samtools_sam2bam_{sample}.log') -# shell: "samtools view -bS {input} > {output} >> {log} 2>&1" - - -# # TODO: merge rule A and B and possibly? use unaligned instead of tmp -# rule samtools_sam2bam_B: -# input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.sam') -# output: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_tmp.bam') -# log: os.path.join(LOG_DIR, 'samtools_sam2bam_B_{sample}.log') -# shell: "samtools view -bS {input} > {output} >> {log} 2>&1" - rule samtools_sort: input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned.bam') @@ -205,15 +197,14 @@ input: os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bam') output: os.path.join(FASTQC_DIR, '{sample}_aligned_sorted_fastqc.zip') log: os.path.join(LOG_DIR, 'fastqc_{sample}.log') - params: - out_dir = os.path.join(FASTQC_DIR) - shell: "fastqc -o {params.out_dir} -f bam {input} >> {log} 2>&1" + shell: "fastqc -o {FASTQC_DIR} -f bam {input} >> {log} 2>&1" # TODO: check if --call-indels is needed? rule lofreq: input: aligned_bam = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bam'), + aligned_bai = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bai'), ref = REFERENCE_FASTA output: os.path.join(VARIANTS_DIR, '{sample}_snv.vcf') log: os.path.join(LOG_DIR, 'lofreq_{sample}.log') @@ -223,26 +214,40 @@ rule vcf2csv: input: os.path.join(VARIANTS_DIR, '{sample}_snv.vcf') output: os.path.join(VARIANTS_DIR, '{sample}_snv.csv') + params: + script = os.path.join(SCRIPTS_DIR, 'vcfTocsv.py') log: os.path.join(LOG_DIR, 'vcf2csv_{sample}.log') - shell: "python {SCRIPTS_DIR}/vcfTocsv.py {input} >> {log} 2>&1" + shell: "python {params.script} {input} >> {log} 2>&1" + + +rule vep: + input: os.path.join(VARIANTS_DIR, '{sample}_snv.vcf') + output: os.path.join(VARIANTS_DIR, '{sample}_vep_sarscov2.txt') + params: + species = "sars_cov_2" + log: os.path.join(LOG_DIR, 'vep_{sample}.log') + shell: "vep --verbose --offline --dir_cache {VEP_DB} --DB_VERSION 101 --appris --biotype --buffer_size 5000 --check_existing --distance 5000 --mane --protein --species {params.species} --symbol --transcript_version --tsl --input_file {input} --output_file {output} >> {log} 2>&1" -# TODO: add vep rule -# rule vep: -# input: -# output: -# log: -# shell: "" +rule parse_vep: + input: os.path.join(VARIANTS_DIR, '{sample}_vep_sarscov2.txt') + output: os.path.join(VARIANTS_DIR, '{sample}_vep_sarscov2_parsed.txt') + params: + script = os.path.join(SCRIPTS_DIR, 'parse_vep.py') + log: os.path.join(LOG_DIR, 'parse_vep_{sample}.log') + shell: "python {params.script} {VARIANTS_DIR} {input} {output} >> {log} 2>&1" -# TODO: fill in blanks, why sample_dir? where are sigmuts coming from? -# rule variant_report: -# input: -# vep_txt = os.path.join(VARIANTS_DIR, '{sample}_vep.txt'), -# snv_csv = os.path.join(VARIANTS_DIR, '{sample}_snv.csv') -# output: os.path.join(REPORT_DIR, '{sample}_variant_report.html') -# log: os.path.join(LOG_DIR, 'vcf2csv_{sample}.log') -# shell: "Rscript {SCRIPTS_DIR}/run_variant_report.R --reportFile={SCRIPTS_DIR}/variantreport_p_sample.rmd --vep_txt_file={input.vep_txt} --snv_csv_file={input.snv_csv} --location_sigmuts=??? --sample_dir=??? --sample_name={wildcards.sample} >> {log} 2>&1" +rule variant_report: + input: + vep_txt = os.path.join(VARIANTS_DIR, '{sample}_vep_sarscov2_parsed.txt'), + snv_csv = os.path.join(VARIANTS_DIR, '{sample}_snv.csv') + output: os.path.join(REPORT_DIR, '{sample}_variant_report.html') + params: + run_report = os.path.join(SCRIPTS_DIR, 'run_variant_report.R'), + report_rmd = os.path.join(SCRIPTS_DIR, 'variant_report.Rmd') + log: os.path.join(LOG_DIR, 'variant_report_{sample}.log') + shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --vep_txt_file={input.vep_txt} --snv_csv_file={input.snv_csv} --location_sigmuts={SIGMUT_DB} --sample_dir={VARIANTS_DIR} --sample_name={wildcards.sample} --outFile={output} >> {log} 2>&1" rule bam2fastq: @@ -266,10 +271,9 @@ output: os.path.join(REPORT_DIR, '{sample}_kraken_report.html') params: run_report = os.path.join(SCRIPTS_DIR, 'run_kraken_report.R'), - report_rmd = os.path.join(SCRIPTS_DIR, 'kraken_report.Rmd'), - kraken_output = os.path.join(KRAKEN_DIR, '{sample}_classified_unaligned_reads.txt') + report_rmd = os.path.join(SCRIPTS_DIR, 'kraken_report.Rmd') log: os.path.join(LOG_DIR, 'kraken_report_{sample}.log') - shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --kraken_output={params.kraken_output} --sample_name={wildcards.sample} --output_file={output} >> {log} 2>&1" + shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --kraken_output={input} --sample_name={wildcards.sample} --output_file={output} >> {log} 2>&1" rule krona_report: @@ -279,3 +283,46 @@ output: os.path.join(REPORT_DIR, '{sample}_krona_report.html') log: os.path.join(LOG_DIR, 'krona_report_{sample}.log') shell: "ktImportTaxonomy -m 3 -t 5 {input.kraken_output} -tax {input.database} -o {output} >> {log} 2>&1" + + +# TODO: does it have to be the 'unsorted' bam file? If so, we have to do indexing on those, too +rule samtools_bedcov: + input: + amplicons_bed = AMPLICONS_BED, + aligned_bam = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bam'), + aligned_bai = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bai') + output: os.path.join(COVERAGE_DIR, '{sample}_amplicons.csv') + log: os.path.join(LOG_DIR, 'samtools_bedcov_{sample}.log') + shell:"samtools bedcov {input.amplicons_bed} {input.aligned_bam} > {output} 2>> {log} 3>&2" + + +# TODO: does it have to be the 'unsorted' bam file? If so, we have to do indexing on those, too +rule samtools_coverage: + input: + aligned_bam = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bam'), + aligned_bai = os.path.join(MAPPED_READS_DIR, '{sample}_aligned_sorted.bai') + output: os.path.join(COVERAGE_DIR, '{sample}_coverage.csv') + log: os.path.join(LOG_DIR, 'samtools_coverage_{sample}.log') + shell: "samtools coverage {input.aligned_bam} > {output} 2>> {log} 3>&2" + + +rule get_qc_table: + input: + coverage_csv = os.path.join(COVERAGE_DIR, '{sample}_coverage.csv'), + amplicon_csv = os.path.join(COVERAGE_DIR, '{sample}_amplicons.csv') + output: os.path.join(COVERAGE_DIR, '{sample}_merged_covs.csv') + params: + script = os.path.join(SCRIPTS_DIR, 'get_qc_table.py') + log: os.path.join(LOG_DIR, 'get_qc_table_{sample}.log') + shell: "python {params.script} {input.coverage_csv} {input.amplicon_csv} {output} >> {log} 2>&1" + + +# TODO: fix Error: unexpected end of input +# rule qc_report: +# input: os.path.join(COVERAGE_DIR, '{sample}_merged_covs.csv') +# output: os.path.join(REPORT_DIR, '{sample}_qc_report.html') +# params: +# run_report = os.path.join(SCRIPTS_DIR, 'run_qc_report.R'), +# report_rmd = os.path.join(SCRIPTS_DIR, 'qc_report.Rmd') +# log: os.path.join(LOG_DIR, 'qc_report_{sample}.log') +# shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --coverage_file={input} --sample_name={wildcards.sample} --outFile={output} >> {log} 2>&1" \ No newline at end of file From 4390e17f3487c7e1bc270b548e5f028e35ebd06f Mon Sep 17 00:00:00 2001 From: Dohmen Date: Fri, 7 May 2021 15:53:35 +0200 Subject: [PATCH 4/4] focus on relevant scripts, add sigmut input --- .../{kraken_report.Rmd => Kraken2-report.Rmd} | 0 scripts/ensemble_vep.py | 103 ------------- scripts/get_qc_table.py | 86 +++++++++++ scripts/parse_vep.py | 50 ++++++ scripts/prep_sample_space_210320.py | 21 --- scripts/qc_report_per_sample.rmd | 89 +++++++++++ ...n_kraken_report.R => run_Kraken2-report.R} | 0 scripts/run_QC_report.R | 81 ++++++++++ scripts/run_variant_report.R | 9 +- scripts/variantreport_p_sample.rmd | 143 +++++++----------- snakefile.py | 12 +- ...akinfo_mutation_report_data_2021-03-06.tsv | 22 +++ ...akinfo_mutation_report_data_2021-03-06.tsv | 9 ++ ...akinfo_mutation_report_data_2021-03-06.tsv | 9 ++ ...akinfo_mutation_report_data_2021-03-06.tsv | 11 ++ ...akinfo_mutation_report_data_2021-03-06.tsv | 11 ++ ...akinfo_mutation_report_data_2021-03-06.tsv | 23 +++ 17 files changed, 458 insertions(+), 221 deletions(-) rename scripts/{kraken_report.Rmd => Kraken2-report.Rmd} (100%) delete mode 100644 scripts/ensemble_vep.py create mode 100644 scripts/get_qc_table.py create mode 100644 scripts/parse_vep.py delete mode 100644 scripts/prep_sample_space_210320.py create mode 100644 scripts/qc_report_per_sample.rmd rename scripts/{run_kraken_report.R => run_Kraken2-report.R} (100%) create mode 100644 scripts/run_QC_report.R create mode 100644 tests/databases/sigmut_db/B117_outbreakinfo_mutation_report_data_2021-03-06.tsv create mode 100644 tests/databases/sigmut_db/B1351_outbreakinfo_mutation_report_data_2021-03-06.tsv create mode 100644 tests/databases/sigmut_db/B1427_outbreakinfo_mutation_report_data_2021-03-06.tsv create mode 100644 tests/databases/sigmut_db/B1429_outbreakinfo_mutation_report_data_2021-03-06.tsv create mode 100644 tests/databases/sigmut_db/B1526_outbreakinfo_mutation_report_data_2021-03-06.tsv create mode 100644 tests/databases/sigmut_db/P1_outbreakinfo_mutation_report_data_2021-03-06.tsv diff --git a/scripts/kraken_report.Rmd b/scripts/Kraken2-report.Rmd similarity index 100% rename from scripts/kraken_report.Rmd rename to scripts/Kraken2-report.Rmd diff --git a/scripts/ensemble_vep.py b/scripts/ensemble_vep.py deleted file mode 100644 index ec271766..00000000 --- a/scripts/ensemble_vep.py +++ /dev/null @@ -1,103 +0,0 @@ - -# 2021/04/28 -# Vic-Fabienne Schumann -# dependencies: ensemble-vep - -# this script contains the functions and commands to run the guix ensemble-vep for sars-cov-2 and parse it's output -# so it can directly used for the variant reports - -# check if database is downloaded and located in ~/.vep -# TODO: or does this will be handled by snakemake? -# NOTE: Input file should be given with the full path, otherwise I can't guarantee that it works.^^ -# NOTE: parse_vep_cli_output() only works if the vep_report exists already. -# But in theory the other functions are executed earlier anyways - -# !!!! -# prepare vep data base by downloading the sars-cov-2 assembly and annotation: -# bash: "wget ftp://ftp.ensemblgenomes.org/pub/viruses/variation/indexed_vep_cache/sars_cov_2_vep_101_ASM985889v3.tar.gz" -# bash: "mv sars_cov_2*.tar.gz ~/.vep && cd ~/.vep && tar xf sars_cov_2*.tar.gz" - -import os -import sys -import re -import subprocess - -def check_vep_dir(vep_report_txt): - - vep_dir = os.path.dirname(vep_report_txt) - if not os.path.isdir(vep_dir): - - # print("mkdir -v {}".format(vep_dir)) - subprocess.call("mkdir -v {}".format(vep_dir), shell=True) - -def run_vep(vep_report, snv_file): - - if not os.path.isfile(vep_report): - - # print("vep --verbose --offline --DB_VERSION 101 --appris --biotype --buffer_size 5000 --check_existing " - # "--distance 5000 --mane --protein --species sars_cov_2 --symbol --transcript_version --tsl " - # "--input_file {} --output_file {}".format(snv_file,vep_report)) - - subprocess.call("vep --verbose --offline --DB_VERSION 101 --appris --biotype --buffer_size 5000 --check_existing " - "--distance 5000 --mane --protein --species sars_cov_2 --symbol --transcript_version --tsl " - "--input_file {} --output_file {}".format(snv_file,vep_report_txt)) - - -def prepare_vep_output(vep_report_txt): - - ''' checks if the first line of the vep output is the correct headerline, will parse the file - accordingly if not ''' - - filename = os.path.basename(vep_report_txt) - - with open(vep_report_txt, 'r') as vep_report: - # check if it starts with "Uploaded_variation" in the first line - if not re.match("^Uploaded_variation", vep_report.readline()): - print("some parsing to do") - parse_vep_cli_output(vep_report_txt) - - # remove old file and rename new file with that name - # print(f"rm -v {vep_report_txt} && mv -v tmp.txt {filename}") - subprocess.call(f"rm {vep_report_txt} && mv tmp.txt {filename}") - else: - print("fine to go") - -def parse_vep_cli_output(vep_report): - - ''' takes the txt output of ensemble vep cli as input and removes all info lines and lines starting with # so it can be - read in as a table with correct header by R ''' - - vep_dir = os.path.dirname(os.path.abspath(vep_report)) - print(f"operating in dir: {vep_dir}") - header_line = None - - with open(vep_report, 'r') as file: - with open(os.path.join(vep_dir,"tmp.txt"), 'w+') as clean_file: - for num, line in enumerate(file,1): - if re.match("#Uploaded_variation",line): - header_line = num - clean_file.write(line[1:]) - break - if header_line is not None: - for num, line in enumerate(file,header_line): - clean_file.write(line) - else: - raise Exception("file has a format or does not has to be parsed anymore") - -def main(): - - vep_report_txt = sys.argv[1] - snv_file = sys.argv[2] - - # check if there is a vep dir in the sample dir, or make one - check_vep_dir(vep_report_txt) - - # check if there is a vep file, if not run ensemble vep - run_vep(vep_report_txt,snv_file) - - # read file and check if it looks ok or not - prepare_vep_output(vep_report_txt) - -if __name__ == '__main__': - main() - diff --git a/scripts/get_qc_table.py b/scripts/get_qc_table.py new file mode 100644 index 00000000..038b8d9b --- /dev/null +++ b/scripts/get_qc_table.py @@ -0,0 +1,86 @@ +import sys +import re +import csv + + +def parsing_cov(coverage_file): + with open(coverage_file, "r") as cov_file: + reader = csv.reader(cov_file, delimiter="\t") + + for row in reader: + for element in row: + if not re.match("#", element): + aligned_reads = row[3] + total_cov = row[5] + mean_depth = row[6] + return (aligned_reads, total_cov, mean_depth) + + +def parsing_amplicon(amplicon_file): + with open(amplicon_file, "r") as amp_file: + reader = csv.reader(amp_file, delimiter="\t") + + num_covered_amplicons = 0 + covered_amplicon = 0 + drop_out_amplicons = [] + full_drop_out = [] + half_covered = [] + + for row in reader: + + amplicon = row[3].split("_")[1] + + if amplicon != str(covered_amplicon): + covered_amplicon = amplicon + num_covered_amplicons += 1 + + if int(row[6]) == 0: + drop_out_amplicons.append(amplicon) + + if len(drop_out_amplicons) != 0: + for ampli in drop_out_amplicons: + cnt = drop_out_amplicons.count(ampli) + if cnt == 2: + full_drop_out.append(ampli) + if cnt == 1: + half_covered.append(ampli) + + full_drop_out = list(set(full_drop_out)) + fully_covered_amplicons = int(num_covered_amplicons) - len(drop_out_amplicons) + + return (fully_covered_amplicons, half_covered, full_drop_out) + + +def make_report_input_csv(coverage_file, amplicon_file, output_file): + coverage_info = parsing_cov(coverage_file) + amplicon_info = parsing_amplicon(amplicon_file) + print(coverage_info) + print(amplicon_info) + + header = [ + "Total number of amplicons fully covered", + "Amplicons partially covered", + "Drop-out amplicons", + "Total number aligned reads", + "Coverage", + "Mean depth", + ] + fields = [ + amplicon_info[0], + amplicon_info[1], + amplicon_info[2], + coverage_info[0], + coverage_info[1], + coverage_info[2], + ] + + with open(output_file, "a") as f: + writer = csv.writer(f, delimiter="\t") + writer.writerow(header) + writer.writerow(fields) + +if __name__ == '__main__': + coverage_file = sys.argv[1] + amplicon_file = sys.argv[2] + output = sys.argv[3] + make_report_input_csv(coverage_file, amplicon_file, output) \ No newline at end of file diff --git a/scripts/parse_vep.py b/scripts/parse_vep.py new file mode 100644 index 00000000..f45d69f4 --- /dev/null +++ b/scripts/parse_vep.py @@ -0,0 +1,50 @@ +import re +import os +import csv +import sys + +def parse_vep(vep_dir, input, output): + header_line = None + + with open(input, 'r') as file: + if header_line: + raise Exception("file has not the correct format or does not has to be parsed anymore") + + with open(os.path.join(vep_dir, "tmp.txt"), 'w+') as clean_file: + for num, line in enumerate(file, 1): + if re.match("#Uploaded_variation", line): + header_line = num + clean_file.write(line[1:]) + break + for num, line in enumerate(file, header_line): + clean_file.write(line) + + with open(os.path.join(vep_dir,'tmp.txt'), 'r') as file: + with open(output, 'w') as output: + reader = csv.reader(file, delimiter="\t") + writer = csv.writer(output, lineterminator='\n') + + # make new column SYMBOL for gene names + # fixme: This is redundant with the making of the header line above, but works for now + gene_column = [] + header = next(reader) + header.append("SYMBOL") + gene_column.append(header) + + for row in reader: # fixme: I'm not sure if the following construct is ugly or not + # get gene name + extra = row[13].split(";") + for element in extra: + if re.search(r"SYMBOL=", element): + gene = element.split("=")[1:][0] + row.append(gene) + gene_column.append(row) + + writer.writerows(gene_column) + # TODO: add step to clean "tmp.txt" + +if __name__ == '__main__': + vep_dir = sys.argv[1] + input = sys.argv[2] + output = sys.argv[3] + parse_vep(vep_dir, input, output) \ No newline at end of file diff --git a/scripts/prep_sample_space_210320.py b/scripts/prep_sample_space_210320.py deleted file mode 100644 index 5921dbbd..00000000 --- a/scripts/prep_sample_space_210320.py +++ /dev/null @@ -1,21 +0,0 @@ -## 21/04/20 - writing a sample space script - -import subprocess -import os - -datadir = "/local/Projects/AAkalin_pathogenomics/data/wastewater_all" -sampledir = "/home/vschuma/pathogenomics/vpipe/akalinlab_pathogenomics/samples/wastewater_all" - -######### -for file in datadir: - samplename = os.path.basename(file).split("_R")[0] - filename_clean = file.split("_001") - filename_clean = ''.join(filename_clean) - - # make dir - dir = ''.join([samplename, '/', 'raw_data']) - subprocess.call('mkdir -vp {}'.format(dir), shell=True) - - # copy files from local to sample dirs - clean_file_sample_dir = ''.join([dir, '/', os.path.basename(filename_clean)]) - subprocess.call('cp -v {} {}'.format(file, clean_file_sample_dir), shell=True) diff --git a/scripts/qc_report_per_sample.rmd b/scripts/qc_report_per_sample.rmd new file mode 100644 index 00000000..a9fd4745 --- /dev/null +++ b/scripts/qc_report_per_sample.rmd @@ -0,0 +1,89 @@ +--- +title: "SARS-CoV-2 QC report" +date: '`r format(as.POSIXct(if ("" != Sys.getenv("SOURCE_DATE_EPOCH")) { as.numeric(Sys.getenv("SOURCE_DATE_EPOCH")) } else { Sys.time() }, origin="1970-01-01"), "%Y-%m-%d %H:%M:%S")`' +params: + coverage_file: '' + sample_name: '' +theme: darkly +--- + +```{r setup, include=FALSE, message = FALSE, warning = FALSE} +knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) +knitr::opts_knit$set(root.dir = params$workdir) +library(DT) +library(ggplot2) +library(dplyr) +library(tidyr) +library(qpcR) +library(stringr) +library(magrittr) +library(openssl) +``` +# Input Settings +```{r InputSettings, echo = FALSE, include = FALSE} +coverage_file <- params$coverage_file +sample_name <- params$sample_name +``` +```{r printInputSettings, echo = FALSE, include = FALSE} +# coverage_file <- '/home/vfs/PycharmProjects/Akalinlab_pathogenomics/PiGx_WW/tests/coverage/Test_merged_covs.csv' +# sample_name <- "Test" +``` +### Sample: `r sample_name` +```{r reading data, include = FALSE} +coverage.df <- read.table(coverage_file,sep = "\t", header = T) +coverage.df <- na_if(coverage.df, "[]") +coverage.df <- gsub('\\[','',coverage.df) +coverage.df <- gsub('\\]','',coverage.df) +# fixme: this can be sure done in a nicer and more compact way +``` +## Amplicon coverage +The information about the amplicon primer was taken from the [ARTIC bioinformatics platform](https://github.com/joshquick/artic-ncov2019/blob/master/primer_schemes/nCoV-2019/V3/nCoV-2019.bed). +The reference name "MN908947.3" was changed to "NC_045512.2". Both are describing the same reference genome for +[SARS-CoV-2](https://www.ncbi.nlm.nih.gov/nuccore/1798174254). + +```{r, message=F, warning=F, echo = FALSE} + +AmpliconCoverage <- data.frame("Number of amplicon fully covered" = paste0(coverage.df[1],' out of 98'), + "Anmplicons partially covered" = coverage.df[2], + "Drop-out amplicons" = coverage.df[3]) # maybe more clear title stating which coverage + +DT::datatable(t(AmpliconCoverage), options = list(searching = FALSE, paging = FALSE)) + +``` + +## Coverage + +```{r, message=F, warning=F, echo = FALSE} + +Coverage <- data.frame("Total coverage" = coverage.df[4], + "Aligned reads" = paste(round(as.integer(coverage.df[5]),1),"%"), + "Mean depth" = round(as.integer(coverage.df[6]))) # maybe more clear title stating which coverage + +DT::datatable(t(Coverage), options = list(searching = FALSE, paging = FALSE)) + +``` +## FastQC and Trimming + +Download: [fastqc report of unprocessed data](/path/to/fastqc.html) +Download: [fastqc report of pre-processes data](/path/to/fastqc.html) + + + +The reads were pre-processed as followed: + +| | | +|:-----|:----| +|trim length cutoff prinseq |0.2| + +## Download Stats csv +```{r, include = F} + +write.csv2(coverage.df, paste0(dirname(coverage_file),"/report_download_coverage.csv")) # not sure about the funct of the dataframe here, but it works so I leave it for now + +readLines(coverage_file) %>% + paste0(collapse="\n") %>% + openssl::base64_encode() -> coverage +``` +Coverage statistics: +[Download Coverage_stats.csv](`r sprintf('data:text/csv;base64,%s', coverage)`) + diff --git a/scripts/run_kraken_report.R b/scripts/run_Kraken2-report.R similarity index 100% rename from scripts/run_kraken_report.R rename to scripts/run_Kraken2-report.R diff --git a/scripts/run_QC_report.R b/scripts/run_QC_report.R new file mode 100644 index 00000000..a143444e --- /dev/null +++ b/scripts/run_QC_report.R @@ -0,0 +1,81 @@ +# Title : runVariantReport_SarsCov2 +# Objective : run variant reports +# Created by: vfs +# Based on the runDeseqReport.R from Bora Uyar +# Created on: 29.04.21 + +# files should be given with full pathes? +# ! signature mutation sources are hardcoded + +runReport <- function(reportFile, + coverage_file, + sample_name, + outFile, + # logo, + selfContained = TRUE, + quiet = FALSE) { + + outFile <- outFile + + # htmlwidgets::setWidgetIdSeed(1234) + rmarkdown::render( + input = reportFile, + # intermediates_dir = file.path(workdir, prefix), + clean = TRUE, + output_file = outFile, + output_format = rmarkdown::html_document( + code_folding = 'hide', + depth = 2, + toc = TRUE, + toc_float = TRUE, + theme = 'lumen', + number_sections = TRUE + ), + output_options = list(self_contained = selfContained), + params = list(coverage_file = coverage_file, + outFile = outFile, + sample_name = sample_name + # logo = logo, + ), + quiet = quiet + ) + +# if(dir.exists(file.path(sample_dir, sample_name))) { +# unlink(file.path(sample_dir, sample_name), recursive = TRUE) +# } +# } + +#1. Collect arguments +args <- commandArgs(TRUE) + +cat("arguments:", args,"\n") + +## Parse arguments (we expect the form --arg=value) +#parseArgs <- function(x) strsplit(sub("^--", "", x), "=") +parseArgs <- function(x) { + myArgs <- unlist(strsplit(x, "--")) + myArgs <- myArgs[myArgs != ''] + #when no values are provided for the argument + myArgs <- gsub(pattern = "=$", replacement = "= ", x = myArgs) + myArgs <- as.data.frame(do.call(rbind, strsplit(myArgs, "="))) + myArgs$V2 <- gsub(' ', '', myArgs$V2) + return(myArgs) +} + +#print(class(myArgs <- unlist(strsplit(args, "--")))) + +argsDF <- parseArgs(args) +argsL <- as.list(as.character(argsDF$V2)) +names(argsL) <- argsDF$V1 + +reportFile <- argsL$reportFile +coverage_file <- argsL$coverage_file +outFile <- argsL$outFile +sample_name <- argsL$sample_name + +runReport(reportFile = reportFile, + coverage_file = coverage_file, + sample_name = sample_name, + outFile = outFile, + # logo = logo, + selfContained = selfContained) \ No newline at end of file diff --git a/scripts/run_variant_report.R b/scripts/run_variant_report.R index fa36b2d7..a5713141 100644 --- a/scripts/run_variant_report.R +++ b/scripts/run_variant_report.R @@ -13,16 +13,18 @@ runReport <- function(reportFile, location_sigmuts, sample_dir, sample_name, + outFile, # logo, selfContained = TRUE, quiet = FALSE) { - outFile <- paste0(sample_dir,'/variant_report_',sample_name,'.html') + # outFile <- outFile + output_dir <- dirname(outFile) # htmlwidgets::setWidgetIdSeed(1234) rmarkdown::render( input = reportFile, - output_dir = sample_dir, + output_dir = output_dir, # intermediates_dir = file.path(workdir, prefix), clean = TRUE, output_file = outFile, @@ -39,6 +41,7 @@ runReport <- function(reportFile, snv_csv_file = snv_csv_file, location_sigmuts = location_sigmuts, sample_dir = sample_dir, + outFile = outFile, sample_name = sample_name # logo = logo, ), @@ -78,6 +81,7 @@ vep_txt_file <- argsL$vep_txt_file snv_csv_file <- argsL$snv_csv_file location_sigmuts <- argsL$location_sigmuts sample_dir <- argsL$sample_dir +outFile <- argsL$outFile sample_name <- argsL$sample_name runReport(reportFile = reportFile, @@ -86,5 +90,6 @@ runReport(reportFile = reportFile, location_sigmuts = location_sigmuts, sample_dir = sample_dir, sample_name = sample_name, + outFile = outFile, # logo = logo, selfContained = selfContained) \ No newline at end of file diff --git a/scripts/variantreport_p_sample.rmd b/scripts/variantreport_p_sample.rmd index 4233c000..1f76420f 100644 --- a/scripts/variantreport_p_sample.rmd +++ b/scripts/variantreport_p_sample.rmd @@ -8,8 +8,9 @@ params: location_sigmuts: '' sample_dir: '' sample_name: '' + outFile: '' --- -```{r setup, include=FALSE} +```{r setup, include=FALSE, messages = FALSE, warnings = FALSE} knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) knitr::opts_knit$set(root.dir = params$workdir) library(DT) @@ -28,14 +29,16 @@ snvfile <- params$snv_csv_file sampledir <- params$sample_dir location_sigmuts <- params$location_sigmuts sample_name <- params$sample_name +outFile <- params$outFile ``` ### Sample: `r sample_name` ```{r test_files, include = FALSE} # location_sigmuts <- '/mnt/beast/pathogenomics/vpipe/akalinlab_pathogenomics/vep_mutation_matching/signature_mutations' -# sampledir <- '/mnt/beast/pathogenomics/vpipe/akalinlab_pathogenomics/samples/wastewater_clock_time/WW_210325_KW2_22_24' -# vepfile <- '/mnt/beast/pathogenomics/vpipe/akalinlab_pathogenomics/samples/wastewater_clock_time/WW_210325_KW2_22_24/vep_reports/vep_sarscov2_WW_210325_KW2_22_24.txt' -# snvfile <- '/mnt/beast/pathogenomics/vpipe/akalinlab_pathogenomics/samples/wastewater_clock_time/WW_210325_KW2_22_24/variants/SNVs/snvs.csv' +# sampledir <- '/mnt/beast/pathogenomics/vpipe/akalinlab_pathogenomics/samples/wastewater_all/Test' +# # vepfile <- '/mnt/beast/pathogenomics/pigx_sarscov2_ww/output/vep/Test_vep_sarscov2_reportinput.txt' +# vepfile <- '/mnt/beast/pathogenomics/pigx_sarscov2_ww/output/vep/report_ready.txt' +# snvfile <- '/mnt/beast/pathogenomics/vpipe/akalinlab_pathogenomics/samples/wastewater_all/Test/variants/SNVs/snvs.csv' ``` This report shows the variant analysis of SARS-CoV-2 RNA-seq samples from wastewater samples. Variants are identified by SNV calling. Translated protein mutations were matched against lists of signature mutations for known @@ -65,11 +68,10 @@ parse_snv_csv <- function (snvfile,...){ # allele frequency from v-pipe vcf ```{r function_get_protein_mut, message = F, include = FALSE} - get_protein_mut <- function (vepfile){ # parse together from vep output "Protein_position" and "Amino_acid" # reading in whole vep txt output - vepfile.df <- read.table(vepfile, sep = '\t', header = T) # you should include in the vep script to parse out the # + vepfile.df <- read.table(vepfile, sep = ',', header = T) # you should include in the vep script to parse out the # #in the beginning of the line or include that step here. # parsing snv and protmut location @@ -84,6 +86,7 @@ get_protein_mut <- function (vepfile){ # parse together from vep output "Protein locations <- locations %>% filter(!grepl("^-", prot_mut_loc)) # specific B117 mutations: 21990-21993, 21764-21770, maybe also 3675-3677, 69-70 - all there + detectable_deletions <- function (x, colnames) { if (x['gene_mut_loc.2'] != x['gene_mut_loc.3'] & str_detect(x['prot_mut_loc'],"-")){ @@ -116,9 +119,13 @@ get_protein_mut <- function (vepfile){ # parse together from vep output "Protein Conseq == "inframe_deletion" & str_detect(deletions.df$prot_mut_loc,"-")) colnames <- colnames(deletions.df) - deletions <- dplyr::bind_rows(apply(deletions.df,1, detectable_deletions, colnames = colnames)) - locations <- dplyr::bind_rows(locations,deletions) - #locations <- dplyr::bind_rows(locations,deletions.df) + + # if there is a deletion the snv would span a couple of positions, if there is not such a spanning region there are no deletions + # ! 06/05/2021 Vic - I think, I don't know how robust this is, but it will work for the sig mutations we have so far + if (!(any(is.na(locations[,'gene_mut_loc.3'])))) { + deletions <- dplyr::bind_rows(apply(deletions.df,1, detectable_deletions, colnames = colnames)) + locations <- dplyr::bind_rows(locations,deletions) + } # substitute "nothing" at alternative-aa-column (for deletions) with "-" locations$AAs.2[is.na(locations$AAs.2)] <- "-" @@ -191,13 +198,10 @@ get_protein_mut <- function (vepfile){ # parse together from vep output "Protein ```{r processa_and_match_snvs_to_signature_mutations, message=F, warning=F, include = FALSE} - # reading in signature data # it's probably not really necessary to add the pase0 column but I like to have a full overview # but maybe this part could be done a bt nicer more compact with e.g. looping through the file directories? - ################ - b117 <- read.table(paste0(location_sigmuts,"/B117_outbreakinfo_mutation_report_data_2021-03-06.tsv"), sep="\t",header=T) # modifiyng deletions, subsitute "nothing" with a - e.g: D789 -> D789- @@ -205,57 +209,45 @@ b117$alt_aa <- na_if(b117$alt_aa, '') b117$alt_aa[is.na(b117$alt_aa)] <- "-" #pasting column with correct prot mutation notation b117$AA_mut <- paste0(b117$ref_aa, b117$codon_num, b117$alt_aa) - ##################### - b1351 <- read.table(paste0(location_sigmuts,"/B1351_outbreakinfo_mutation_report_data_2021-03-06.tsv"), sep="\t",header=T) b1351$alt_aa <- na_if(b1351$alt_aa, '') b1351$alt_aa[is.na(b1351$alt_aa)] <- "-" b1351$AA_mut <- paste0(b1351$ref_aa, b1351$codon_num, b1351$alt_aa) - b1427 <- read.table(paste0(location_sigmuts,"/B1427_outbreakinfo_mutation_report_data_2021-03-06.tsv"), sep="\t",header=T) b1427$alt_aa <- na_if(b1427$alt_aa, '') b1427$alt_aa[is.na(b1427$alt_aa)] <- "-" b1427$AA_mut <- paste0(b1427$ref_aa, b1427$codon_num, b1427$alt_aa) - b1429 <- read.table(paste0(location_sigmuts,"/B1429_outbreakinfo_mutation_report_data_2021-03-06.tsv"), sep="\t",header=T) b1429$alt_aa <- na_if(b1429$alt_aa, '') b1429$alt_aa[is.na(b1429$alt_aa)] <- "-" b1429$AA_mut <- paste0(b1429$ref_aa, b1429$codon_num, b1429$alt_aa) - b1526 <- read.table(paste0(location_sigmuts,"/B1526_outbreakinfo_mutation_report_data_2021-03-06.tsv"), sep="\t",header=T) b1526$alt_aa <- na_if(b1526$alt_aa, '') b1526$alt_aa[is.na(b1526$alt_aa)] <- "-" b1526$AA_mut <- paste0(b1526$ref_aa, b1526$codon_num, b1526$alt_aa) - p1 <- read.table(paste0(location_sigmuts,"/P1_outbreakinfo_mutation_report_data_2021-03-06.tsv"), sep="\t",header=T) p1$alt_aa <- na_if(p1$alt_aa, '') p1$alt_aa[is.na(p1$alt_aa)] <- "-" p1$AA_mut <- paste0(p1$ref_aa, p1$codon_num, p1$alt_aa) - ################ - -vepfile.df <- read.table(vepfile,sep = "\t", header = T) #TODO - in na's umwandeln +vepfile.df <- read.table(vepfile,sep = ",", header = T) #TODO - in na's umwandeln vepfile.df <- na_if(vepfile.df, "-") - sig_mutations.df <- as.data.frame(qpcR:::cbind.na(b117=b117$AA_mut, b1351=b1351$AA_mut, b1427=b1427$AA_mut, b1429=b1429$AA_mut, b1526=b1526$AA_mut, p1=p1$AA_mut)) sig_mutations.df <- sig_mutations.df %>% tidyr::pivot_longer(everything()) %>% tidyr::drop_na(.,value) - variant_protein_mut <- get_protein_mut(vepfile) variant_protein_mut <- dplyr::left_join(variant_protein_mut, sig_mutations.df, by = c('AA_mut'='value')) - match.df <- variant_protein_mut %>% filter(!is.na(name)) - nomatch.df <- variant_protein_mut %>% filter(is.na(name)) ``` @@ -266,17 +258,13 @@ nomatch.df <- variant_protein_mut %>% lofreq.info <- as_data_frame(parse_snv_csv(snvfile)) # vep.info <- write the part that returns the variant_protein_mut into a function vep.info <- variant_protein_mut - # add freqency and coverage information to protein mutations #variant_protein_mut <- dplyr::left_join(variant_protein_mut, sig_mutations.df, by = c('AA_mut'='value')) complete.df <- dplyr::left_join(lofreq.info, vep.info, by = c('gene_pos'='gene_mut_loc.2'), copy = T) - match.df <- complete.df %>% filter(!is.na(name)) - nomatch.df <- complete.df %>% filter(is.na(name)) - ``` ## Mutations matching variants of concern @@ -289,14 +277,12 @@ Table 2 shows all other mutations without any match. cat("\n Table 1: SNVs from sample matching signature mutations") ``` ```{r, message=F, warning=F, echo = FALSE, results = 'asis'} - SignatureTable <- data.frame("SNV" = match.df$gene_mut, "gene" = match.df$genes, "protein_mutation" = match.df$AA_mut, "variant" = match.df$name, # maybe more nice writing of it e.g B1.1.7 "frequency" = match.df$freq, # maybe more clear name stating which frequency "coverage" = match.df$cov) # maybe more clear title stating which coverage - DT::datatable(SignatureTable, extensions = c('FixedColumns', 'Scroller'), options = list(fixedColumns = TRUE, @@ -315,7 +301,6 @@ NoSignatureTable <- data.frame("SNV" = nomatch.df$gene_mut, "protein_mutation" = nomatch.df$AA_mut, "frequency" = nomatch.df$freq, # maybe more clear name stating which frequency "coverage" = nomatch.df$cov) # maybe more clear title stating which coverage - DT::datatable(NoSignatureTable, extensions = c('FixedColumns', 'Scroller'), options = list(fixedColumns = TRUE, @@ -330,44 +315,35 @@ DT::datatable(NoSignatureTable, ```{r deconvolution, message=F, warning=F, include = FALSE} # markdown text: -## Deconvolution -### Deconvolution +# # Deconvolution +# ## Deconvolution # Deconvolution is done by using robust regression. The resulting coefficients are normalized to get the percentage of # each present variant. - +# # prepare unique signature mutation dataframe - +# # sig_mutations_unique.df <- sig_mutations.df # unique_stuff <- unique(sig_mutations.df$value) # sig_mutations_unique.df <- subset(sig_mutations.df, !duplicated(value)) # sig_mutations_unique.df <- sig_mutations_unique.df %>% # tidyr::pivot_wider(values_fn = unlist) - +# # 1. count occurences of mutation - ``` ```{r deconvolution_prep, message=F, warning=F, include = FALSE} - # filter match table for unique mutations -match_unique.df <- dplyr::distinct(match.df[,-13]) - +match_unique.df <- unique(dplyr::distinct(match.df[,-13])) # get prot.mutations only and add "WT" muts_debug <- c("WT",match_unique.df$AA_mut) - # create an empty data frame # !!!!!!!! # in theory this df has to have a column for every variant in the beginning -> can we have a "variant-sheet" in the future? # there have to be a check if two or more columns are the same and either have to be removed or merged # in case we decide for only using unique signature mutations we'll probably don't have this problem anymore # I think I'll go for the "we only look for unique mutations anyway" way. - - - msig_debug <- data.frame(muts_debug,WT=0,b117=0,b1351=0,b1427=0,b1429=0,b1526=0,p1=0) # !! dirty hack !! - # make sigmuts per variant tables new with parsed mutations - msig_debug[msig_debug$muts %in% b117$AA_mut,"b117"]=1 msig_debug[msig_debug$muts %in% b1351$AA_mut,"b1351"]=1 msig_debug[msig_debug$muts %in% b1351$AA_mut,"b1427"]=1 @@ -375,20 +351,15 @@ msig_debug[msig_debug$muts %in% b1351$AA_mut,"b1429"]=1 msig_debug[msig_debug$muts %in% b1351$AA_mut,"b1526"]=1 msig_debug[msig_debug$muts %in% p1$AA_mut,"p1"]=1 msig_debug[1,2] <- 1 # put the WT signature, ideally there should be more than 1 WT signature # ?? - - ``` ```{r DIRTY HACK, include = FALSE} msig_debug <- data.frame(muts_debug,WT=0,b117=0,b1351=0,p1=0) # !! dirty hack !! - # make sigmuts per variant tables new with parsed mutations - msig_debug[msig_debug$muts %in% b117$AA_mut,"b117"]=1 msig_debug[msig_debug$muts %in% b1351$AA_mut,"b1351"]=1 msig_debug[msig_debug$muts %in% p1$AA_mut,"p1"]=1 msig_debug[1,2] <- 1 # put the WT signature, ideally there should be more than 1 WT signature # ?? - ``` ```{r, message=F, warning=F, include = FALSE} deconv <- function (bulk,sig){ @@ -406,41 +377,37 @@ deconv <- function (bulk,sig){ as.vector(rlm_coefficients) } - ``` -```{r deconvolution, message=F, warning=F, include = FALSE} - -bulk_debug <- as.numeric(c(1, match_unique.df$freq)) - -sig_debug <- msig_debug - -deconv_debug <- deconv(bulk_debug,sig_debug[,-1]) - +```{r deconvolution_debug, message=F, warning=F, include = FALSE} +# bulk_debug <- as.numeric(c(1, match_unique.df$freq)) +# +# sig_debug <- msig_debug +# +# # deconv_debug <- deconv(bulk_debug,sig_debug[,-1]) ``` ```{r plot, message=F, warning=F, echo = FALSE, include = F} - -cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") - -# work in progress...only to show how it theoretically can look like in the report -variants <- colnames(msig_debug[,-1]) -df <- as.data.frame(rbind(as.numeric(deconv_debug))) -colnames(df) <- variants -df <- df %>% - tidyr::pivot_longer(everything()) - -ggplot(data = df, aes(name,value, - fill = name)) + - geom_bar(stat="identity") + - labs(title = "Proportion of variants of concern inferred from deconvolution", - x = "Variants", - y = "Percentage") + - scale_x_discrete(limits = variants) + - scale_y_continuous(limits = c(0,1), breaks = seq(0,1,0.1)) + - scale_colour_manual(values=cbbPalette) + - geom_text(aes(label=sprintf("%0.3f", round(value, digits = 3))), - position=position_dodge(width=0.9), - vjust=-0.25) + - scale_fill_discrete(name = "Variants") +# cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") +# +# # work in progress...only to show how it theoretically can look like in the report +# variants <- colnames(msig_debug[,-1]) +# df <- as.data.frame(rbind(as.numeric(deconv_debug))) +# colnames(df) <- variants +# df <- df %>% +# tidyr::pivot_longer(everything()) +# +# ggplot(data = df, aes(name,value, +# fill = name)) + +# geom_bar(stat="identity") + +# labs(title = "Proportion of variants of concern inferred from deconvolution", +# x = "Variants", +# y = "Percentage") + +# scale_x_discrete(limits = variants) + +# scale_y_continuous(limits = c(0,1), breaks = seq(0,1,0.1)) + +# scale_colour_manual(values=cbbPalette) + +# geom_text(aes(label=sprintf("%0.3f", round(value, digits = 3))), +# position=position_dodge(width=0.9), +# vjust=-0.25) + +# scale_fill_discrete(name = "Variants") ``` @@ -468,7 +435,6 @@ ggplot(data = df, aes(name,value, ```{r, include = F} write.csv2(mtcars, "./file.csv") # not sure about the funct of the dataframe here, but it works so I leave it for now - library(magrittr) readLines(snvfile) %>% paste0(collapse="\n") %>% @@ -476,16 +442,15 @@ readLines(snvfile) %>% ``` Called SNVs: [Download SNVs.csv](`r sprintf('data:text/csv;base64,%s', snvs)`) - ```{r, include = F} # can possible be fused together with above. But rigth now it's safer like this. write.csv2(vepfile.df, "./file.csv") # not sure about the funct of the dataframe here, but it works so I leave it for now - readLines(vepfile) %>% paste0(collapse="\n") %>% openssl::base64_encode() -> vep ``` Variant effect prediction (*Ensemble VEP output*): [Download VEP_out.csv](`r sprintf('data:text/csv;base64,%s', vep)`) - - +```{r} +# rlm_model = suppressWarnings(MASS::rlm(sig_debug,bulk_debug, maxit = 100)) +``` \ No newline at end of file diff --git a/snakefile.py b/snakefile.py index 39e753f6..4ff4b220 100644 --- a/snakefile.py +++ b/snakefile.py @@ -98,7 +98,7 @@ ) } } -selected_targets = ['variant_reports'] +selected_targets = ['kraken_reports'] OUTPUT_FILES = list(chain.from_iterable([targets[name]['files'] for name in selected_targets])) @@ -245,7 +245,7 @@ output: os.path.join(REPORT_DIR, '{sample}_variant_report.html') params: run_report = os.path.join(SCRIPTS_DIR, 'run_variant_report.R'), - report_rmd = os.path.join(SCRIPTS_DIR, 'variant_report.Rmd') + report_rmd = os.path.join(SCRIPTS_DIR, 'variantreport_p_sample.rmd') log: os.path.join(LOG_DIR, 'variant_report_{sample}.log') shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --vep_txt_file={input.vep_txt} --snv_csv_file={input.snv_csv} --location_sigmuts={SIGMUT_DB} --sample_dir={VARIANTS_DIR} --sample_name={wildcards.sample} --outFile={output} >> {log} 2>&1" @@ -270,8 +270,8 @@ input: os.path.join(KRAKEN_DIR, '{sample}_classified_unaligned_reads.txt') output: os.path.join(REPORT_DIR, '{sample}_kraken_report.html') params: - run_report = os.path.join(SCRIPTS_DIR, 'run_kraken_report.R'), - report_rmd = os.path.join(SCRIPTS_DIR, 'kraken_report.Rmd') + run_report = os.path.join(SCRIPTS_DIR, 'run_Kraken2-report.R'), + report_rmd = os.path.join(SCRIPTS_DIR, 'Kraken2-report.Rmd') log: os.path.join(LOG_DIR, 'kraken_report_{sample}.log') shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --kraken_output={input} --sample_name={wildcards.sample} --output_file={output} >> {log} 2>&1" @@ -322,7 +322,7 @@ # input: os.path.join(COVERAGE_DIR, '{sample}_merged_covs.csv') # output: os.path.join(REPORT_DIR, '{sample}_qc_report.html') # params: -# run_report = os.path.join(SCRIPTS_DIR, 'run_qc_report.R'), -# report_rmd = os.path.join(SCRIPTS_DIR, 'qc_report.Rmd') +# run_report = os.path.join(SCRIPTS_DIR, 'run_QC_report.R'), +# report_rmd = os.path.join(SCRIPTS_DIR, 'qc_report_per_sample.rmd') # log: os.path.join(LOG_DIR, 'qc_report_{sample}.log') # shell: "Rscript {params.run_report} --reportFile={params.report_rmd} --coverage_file={input} --sample_name={wildcards.sample} --outFile={output} >> {log} 2>&1" \ No newline at end of file diff --git a/tests/databases/sigmut_db/B117_outbreakinfo_mutation_report_data_2021-03-06.tsv b/tests/databases/sigmut_db/B117_outbreakinfo_mutation_report_data_2021-03-06.tsv new file mode 100644 index 00000000..27072be0 --- /dev/null +++ b/tests/databases/sigmut_db/B117_outbreakinfo_mutation_report_data_2021-03-06.tsv @@ -0,0 +1,22 @@ +mutation type gene ref_codon pos ref_aa codon_num alt_aa source change_length_nt +"ORF1a:T1001I" "substitution" "ORF1a" "ACT" 3266 "T" 1001 "I" +"ORF1a:A1708D" "substitution" "ORF1a" "GCT" 5387 "A" 1708 "D" +"ORF1a:I2230T" "substitution" "ORF1a" "ATA" 6954 "I" 2230 "T" +"ORF1a:S3675" "deletion" "ORF1a" "TGGTTTTAA" 11288 "S" 3675 "NA" 9 +"ORF1a:G3676" "deletion" "ORF1a" "TGGTTTTAA" 11288 "G" 3676 "NA" 9 +"ORF1a:F3677" "deletion" "ORF1a" "TGGTTTTAA" 11288 "F" 3677 "NA" 9 +"S:H69" "deletion" "S" "CATGTC" 21765 "H" 69 "NA" 6 +"S:V70" "deletion" "S" "CATGTC" 21765 "V" 70 "NA" 6 +"S:Y144" "deletion" "S" "TAT" 21990 "Y" 144 "NA" 3 +"S:N501Y" "substitution" "S" "AAT" 23063 "N" 501 "Y" +"S:A570D" "substitution" "S" "GCT" 23271 "A" 570 "D" +"S:D614G" "substitution" "S" "GAT" 23402 "D" 614 "G" +"S:P681H" "substitution" "S" "CCT" 23604 "P" 681 "H" +"S:T716I" "substitution" "S" "ACA" 23708 "T" 716 "I" +"S:S982A" "substitution" "S" "TCA" 24506 "S" 982 "A" +"S:D1118H" "substitution" "S" "GAC" 24914 "D" 1118 "H" +"ORF8:Q27*" "substitution" "ORF8" "CAA" 27971 "Q" 27 "*" +"ORF8:R52I" "substitution" "ORF8" "AGA" 28047 "R" 52 "I" +"ORF8:Y73C" "substitution" "ORF8" "TAC" 28111 "Y" 73 "C" +"N:D3L" "substitution" "N" "GAT" 28280 "D" 3 "L" +"N:S235F" "substitution" "N" "TCT" 28977 "S" 235 "F" diff --git a/tests/databases/sigmut_db/B1351_outbreakinfo_mutation_report_data_2021-03-06.tsv b/tests/databases/sigmut_db/B1351_outbreakinfo_mutation_report_data_2021-03-06.tsv new file mode 100644 index 00000000..0bb468e3 --- /dev/null +++ b/tests/databases/sigmut_db/B1351_outbreakinfo_mutation_report_data_2021-03-06.tsv @@ -0,0 +1,9 @@ +type is_synonymous mutation gene ref_aa codon_num alt_aa source +"substitution" false "ORF1ab:K1655N" "ORF1ab" "K" 1655 "N" +"substitution" false "E:P71L" "E" "P" 71 "L" +"substitution" false "N:T205I" "N" "T" 205 "I" +"substitution" false "S:K417N" "S" "K" 417 "N" +"substitution" false "S:E484K" "S" "E" 484 "K" +"substitution" false "S:N501Y" "S" "N" 501 "Y" +"substitution" false "S:D614G" "S" "D" 614 "G" +"substitution" false "S:A701V" "S" "A" 701 "V" diff --git a/tests/databases/sigmut_db/B1427_outbreakinfo_mutation_report_data_2021-03-06.tsv b/tests/databases/sigmut_db/B1427_outbreakinfo_mutation_report_data_2021-03-06.tsv new file mode 100644 index 00000000..3ef24b8b --- /dev/null +++ b/tests/databases/sigmut_db/B1427_outbreakinfo_mutation_report_data_2021-03-06.tsv @@ -0,0 +1,9 @@ +mutation gene ref_aa alt_aa codon_num type source +"S:D614G" "S" "D" "G" 614 "substitution" +"ORF1a:T265I" "ORF1a" "T" "I" 265 "substitution" +"ORF1b:D1183Y" "ORF1b" "D" "Y" 1183 "substitution" +"N:T205I" "N" "T" "I" 205 "substitution" +"S:L452R" "S" "L" "R" 452 "substitution" +"ORF1a:S3158T" "ORF1a" "S" "T" 3158 "substitution" +"ORF1b:P976L" "ORF1b" "P" "L" 976 "substitution" +"ORF3a:Q57H" "ORF3a" "Q" "H" 57 "substitution" diff --git a/tests/databases/sigmut_db/B1429_outbreakinfo_mutation_report_data_2021-03-06.tsv b/tests/databases/sigmut_db/B1429_outbreakinfo_mutation_report_data_2021-03-06.tsv new file mode 100644 index 00000000..0ac83d62 --- /dev/null +++ b/tests/databases/sigmut_db/B1429_outbreakinfo_mutation_report_data_2021-03-06.tsv @@ -0,0 +1,11 @@ +mutation type gene ref_codon pos ref_aa codon_num alt_aa source +"ORF1a:T265I" "substitution" "ORF1a" "ACC" 1059 "T" 265 "I" +"ORF1a:I4205V" "substitution" "ORF1a" "ATC" 12879 "I" 4205 "V" +"ORF1b:P314L" "substitution" "ORF1b" "CCT" 14407 "P" 314 "L" +"ORF1b:D1183Y" "substitution" "ORF1b" "GAT" 17013 "D" 1183 "Y" +"S:S13I" "substitution" "S" "AGT" 21599 "S" 13 "I" +"S:W152C" "substitution" "S" "TGG" 22017 "W" 152 "C" +"S:L452R" "substitution" "S" "CTG" 22916 "L" 452 "R" +"S:D614G" "substitution" "S" "GAT" 23402 "D" 614 "G" +"ORF3a:Q57H" "substitution" "ORF3a" "CAG" 25562 "Q" 57 "H" +"N:T205I" "substitution" "N" "ACT" 28887 "T" 205 "I" diff --git a/tests/databases/sigmut_db/B1526_outbreakinfo_mutation_report_data_2021-03-06.tsv b/tests/databases/sigmut_db/B1526_outbreakinfo_mutation_report_data_2021-03-06.tsv new file mode 100644 index 00000000..69c1b62b --- /dev/null +++ b/tests/databases/sigmut_db/B1526_outbreakinfo_mutation_report_data_2021-03-06.tsv @@ -0,0 +1,11 @@ +mutation gene ref_aa alt_aa codon_num type source +"ORF1b:Q1011H" "ORF1b" "Q" "H" 1011 "substitution" +"S:D253G" "S" "D" "G" 253 "substitution" +"S:D614G" "S" "D" "G" 614 "substitution" +"ORF1b:P314L" "ORF1b" "P" "L" 314 "substitution" +"ORF1a:L3201P" "ORF1a" "L" "P" 3201 "substitution" +"ORF1a:T265I" "ORF1a" "T" "I" 265 "substitution" +"ORF3a:Q57H" "ORF3a" "Q" "H" 57 "substitution" +"S:T95I" "S" "T" "I" 95 "substitution" +"ORF3a:P42L" "ORF3a" "P" "L" 42 "substitution" +"ORF8:T11I" "ORF8" "T" "I" 11 "substitution" diff --git a/tests/databases/sigmut_db/P1_outbreakinfo_mutation_report_data_2021-03-06.tsv b/tests/databases/sigmut_db/P1_outbreakinfo_mutation_report_data_2021-03-06.tsv new file mode 100644 index 00000000..bc975bd0 --- /dev/null +++ b/tests/databases/sigmut_db/P1_outbreakinfo_mutation_report_data_2021-03-06.tsv @@ -0,0 +1,23 @@ +type is_synonymous mutation gene ref_aa codon_num alt_aa source ref_codon pos change_length_nt +"substitution" false "N:P80R" "N" "P" 80 "R" +"substitution" false "ORF8:E92K" "ORF8" "E" 92 "K" +"substitution" false "ORF3a:G174C" "ORF3a" "G" 174 "C" +"substitution" false "S:L18F" "S" "L" 18 "F" +"substitution" false "S:T20N" "S" "T" 20 "N" +"substitution" false "S:P26S" "S" "P" 26 "S" +"substitution" false "S:D138Y" "S" "D" 138 "Y" +"substitution" false "S:R190S" "S" "R" 190 "S" +"substitution" false "S:K417T" "S" "K" 417 "T" +"substitution" false "S:E484K" "S" "E" 484 "K" +"substitution" false "S:N501Y" "S" "N" 501 "Y" +"substitution" false "S:D614G" "S" "D" 614 "G" +"substitution" false "S:H655Y" "S" "H" 655 "Y" +"substitution" false "S:T1027I" "S" "T" 1027 "I" +"substitution" false "ORF1ab:E5662D" "ORF1ab" "E" 5662 "D" +"substitution" false "ORF1ab:K1795Q" "ORF1ab" "K" 1795 "Q" +"substitution" false "ORF1ab:S1188L" "ORF1ab" "S" 1188 "L" +"substitution" false "ORF1ab:I760T" "ORF1ab" "I" 760 "T" +"deletion" "ORF1a:S3675" "ORF1a" "S" 3675 "NA" "TGGTTTTAA" 11288 9 +"deletion" "ORF1a:G3676" "ORF1a" "G" 3676 "NA" "TGGTTTTAA" 11288 9 +"deletion" "ORF1a:F3677" "ORF1a" "F" 3677 "NA" "TGGTTTTAA" 11288 9 +"substitution" false "ORF1ab:F681L" "ORF1ab" "F" 681 "L"