# Columns that together identify one variant call. They are used to write the
# per-patient IMPACT call summary and to de-duplicate calls, both within one
# patient and across the whole cohort.
.variant_key_columns <- c(
  "Hugo_Symbol",
  "Chromosome",
  "Start_Position",
  "End_Position",
  "Variant_Classification",
  "HGVSp_Short",
  "Reference_Allele",
  "Tumor_Seq_Allele2"
)

# Columns kept from each plasma (duplex) MAF listed in `master.ref$maf_path`.
# Columns that a given MAF does not have are skipped by `one_of()` (with a
# warning).
.duplex_maf_columns <- c(
  "Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome",
  "Start_Position", "End_Position", "Strand", "Variant_Classification",
  "Variant_Type", "Reference_Allele", "Tumor_Seq_Allele1",
  "Tumor_Seq_Allele2", "dbSNP_RS", "dbSNP_Val_Status",
  "Tumor_Sample_Barcode", "caller_Norm_Sample_Barcode",
  "Match_Norm_Seq_Allele1", "Match_Norm_Seq_Allele2",
  "Tumor_Validation_Allele1", "Tumor_Validation_Allele2",
  "Match_Norm_Validation_Allele1", "Match_Norm_Validation_Allele2",
  "Verification_Status", "Validation_Status", "Mutation_Status",
  "Sequencing_Phase", "Sequence_Source", "Validation_Method", "Score",
  "BAM_File", "Sequencer", "Tumor_Sample_UUID", "Matched_Norm_Sample_UUID",
  "HGVSc", "HGVSp", "HGVSp_Short", "Transcript_ID", "Exon_Number",
  "caller_t_depth", "caller_t_ref_count", "caller_t_alt_count",
  "caller_n_depth", "caller_n_ref_count", "caller_n_alt_count",
  "all_effects", "Allele", "Gene", "Feature", "Feature_type", "Consequence",
  "cDNA_position", "CDS_position", "Protein_position", "Amino_acids",
  "Codons", "Existing_variation", "ALLELE_NUM", "DISTANCE", "STRAND_VEP",
  "SYMBOL", "SYMBOL_SOURCE", "HGNC_ID", "BIOTYPE", "CANONICAL", "CCDS",
  "ENSP", "SWISSPROT", "TREMBL", "UNIPARC", "RefSeq", "SIFT", "PolyPhen",
  "EXON", "INTRON", "DOMAINS", "AF", "AFR_AF", "AMR_AF", "ASN_AF", "EAS_AF",
  "EUR_AF", "SAS_AF", "AA_AF", "EA_AF", "CLIN_SIG", "SOMATIC", "PUBMED",
  "MOTIF_NAME", "MOTIF_POS", "HIGH_INF_POS", "MOTIF_SCORE_CHANGE", "IMPACT",
  "PICK", "VARIANT_CLASS", "TSL", "HGVS_OFFSET", "PHENO", "MINIMISED",
  "ExAC_AF", "ExAC_AF_AFR", "ExAC_AF_AMR", "ExAC_AF_EAS", "ExAC_AF_FIN",
  "ExAC_AF_NFE", "ExAC_AF_OTH", "ExAC_AF_SAS", "GENE_PHENO", "FILTER",
  "flanking_bps", "variant_id", "variant_qual", "ExAC_AF_Adj",
  "ExAC_AC_AN_Adj", "ExAC_AC_AN", "ExAC_AC_AN_AFR", "ExAC_AC_AN_AMR",
  "ExAC_AC_AN_EAS", "ExAC_AC_AN_FIN", "ExAC_AC_AN_NFE", "ExAC_AC_AN_OTH",
  "ExAC_AC_AN_SAS", "ExAC_FILTER", "gnomAD_AF", "gnomAD_AFR_AF",
  "gnomAD_AMR_AF", "gnomAD_ASJ_AF", "gnomAD_EAS_AF", "gnomAD_FIN_AF",
  "gnomAD_NFE_AF", "gnomAD_OTH_AF", "gnomAD_SAS_AF", "CallMethod", "VCF_POS",
  "VCF_REF", "VCF_ALT", "hotspot_whitelist", "Status",
  "D_t_alt_count_fragment", "D_t_ref_count_fragment", "D_t_vaf_fragment",
  "SD_t_alt_count_fragment", "SD_t_ref_count_fragment", "SD_t_vaf_fragment",
  "Matched_Norm_Sample_Barcode", "Matched_Norm_Bamfile",
  "n_alt_count_fragment", "n_ref_count_fragment", "n_vaf_fragment"
)

# Build the "a/b" sub-directory used in the mirror BAM paths from the first
# two characters of each BAM id. Returns character(0) for empty input.
.mirror_bam_sub_dir <- function(bam.ids) {
  vapply(
    strsplit(substr(bam.ids, 1, 2), ""),
    function(chars) paste0(chars, collapse = "/"),
    character(1)
  )
}

# Emit a message listing the ids in `wanted.ids` that are absent from
# `known.ids`; stays silent when nothing is missing.
.report_missing_ids <- function(prefix, wanted.ids, known.ids) {
  missing.ids <- setdiff(wanted.ids, known.ids)
  if (length(missing.ids) > 0) {
    message(paste0(prefix, paste0(missing.ids, collapse = " ,")))
  }
  invisible(missing.ids)
}

# Extract the job id from the "Job <id> is submitted ..." line printed by
# bsub. The id is kept as a character string: a numeric id such as 100000
# would be pasted back into later commands as "1e+05".
.parse_bsub_job_id <- function(bsub.output) {
  job.lines <- grep("^Job <[0-9]+>", bsub.output, value = TRUE)
  if (length(job.lines) == 0) {
    return(NA_character_)
  }
  sub("^Job <([0-9]+)>.*$", "\\1", job.lines[1])
}

# Poll `bjobs -l` until the job reports "Done successfully". Stops with an
# error when the id is unusable or the report shows the job can no longer
# succeed, instead of polling forever.
# NOTE(review): the failure patterns are the strings LSF is expected to print
# for exited jobs and unsatisfiable dependencies -- confirm on the cluster.
.wait_for_lsf_job <- function(job.id, poll.seconds = 120) {
  if (length(job.id) != 1 || is.na(job.id)) {
    stop("Could not determine the LSF job id to wait for", call. = FALSE)
  }
  repeat {
    job.report <- system(paste0("bjobs -l ", job.id), intern = TRUE)
    if (any(grepl("Done successfully", job.report))) {
      return(invisible(TRUE))
    }
    if (any(grepl("Exited with exit code|Exited by|never satisfied", job.report))) {
      stop(
        "LSF job ", job.id, " did not finish successfully; inspect `bjobs -l ",
        job.id, "`",
        call. = FALSE
      )
    }
    Sys.sleep(poll.seconds)
  }
}

# Read one plasma MAF (once), keep the known columns and, when the MAF has a
# `Status` column, keep only rows whose Status is empty or NA.
.read_duplex_calls <- function(maf.path) {
  maf <- fread(maf.path)
  calls <- maf %>% select(one_of(.duplex_maf_columns))
  if ("Status" %in% names(maf)) {
    calls <- subset(calls, (Status == "") | (is.na(Status)))
  }
  calls
}

# Build the sample sheet rows for the DMP IMPACT (IM/IH) and DMP ACCESS (XS)
# samples of one patient from the two key files (V1 = sample id, V2 = BAM id).
# Returns NULL when the patient has neither kind of sample.
.build_dmp_sample_sheet <- function(dmp_id,
                                    patient.id,
                                    DMP.key,
                                    access.key,
                                    mirror.bam.dir,
                                    mirror.access.bam.dir) {
  strip_bam_mode <- function(bam.ids) {
    gsub("-standard|-unfilter|-simplex|-duplex", "", bam.ids)
  }

  # IMPACT samples: IM entries first, then IH entries (original ordering).
  impact.rows <- rbind(
    DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)],
    DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]
  )
  impact.sheet <- NULL
  if (nrow(impact.rows) > 0) {
    impact.sheet <- data.frame(
      Sample_Barcode = as.character(impact.rows$V1),
      standard_bam = paste0(
        mirror.bam.dir,
        "/",
        .mirror_bam_sub_dir(impact.rows$V2),
        "/",
        impact.rows$V2,
        ".bam"
      ),
      duplex_bam = NA,
      simplex_bam = NA,
      stringsAsFactors = FALSE
    ) %>%
      mutate(
        cmo_patient_id = patient.id,
        Sample_Type = ifelse(
          grepl("-T", Sample_Barcode),
          "DMP_Tumor",
          "DMP_Normal"
        ),
        dmp_patient_id = dmp_id
      )
  }

  # ACCESS samples. As before, normal XS samples are only considered when the
  # patient has at least one tumor XS sample.
  tumor.xs <- access.key[grepl(paste0(dmp_id, "-T..-XS."), V1)]
  normal.xs <- access.key[grepl(paste0(dmp_id, "-N..-XS."), V1)]
  access.sheet <- NULL
  if (nrow(tumor.xs) > 0) {
    tumor.bam.ids <- strip_bam_mode(tumor.xs$V2)
    tumor.bam.prefix <- paste0(
      mirror.access.bam.dir,
      "/",
      .mirror_bam_sub_dir(tumor.bam.ids),
      "/",
      tumor.bam.ids
    )
    # unique(): the key file can list the same sample once per BAM flavour.
    access.sheet <- unique(
      data.frame(
        Sample_Barcode = as.character(tumor.xs$V1),
        standard_bam = NA,
        duplex_bam = paste0(tumor.bam.prefix, "-duplex.bam"),
        simplex_bam = paste0(tumor.bam.prefix, "-simplex.bam"),
        stringsAsFactors = FALSE
      ) %>%
        mutate(
          cmo_patient_id = patient.id,
          Sample_Type = ifelse(
            grepl("-T", Sample_Barcode),
            "duplex",
            "unfilterednormal"
          ),
          dmp_patient_id = dmp_id
        )
    )
    # Only build the normal sheet when there are normal XS samples: paste0()
    # on zero-length ids still returns one string, which made data.frame()
    # fail with "differing number of rows" for tumor-only patients.
    if (nrow(normal.xs) > 0) {
      normal.bam.ids <- strip_bam_mode(normal.xs$V2)
      access.normal.sheet <- unique(
        data.frame(
          Sample_Barcode = as.character(normal.xs$V1),
          standard_bam = paste0(
            mirror.access.bam.dir,
            "/",
            .mirror_bam_sub_dir(normal.bam.ids),
            "/",
            normal.bam.ids,
            "-unfilter.bam"
          ),
          duplex_bam = NA,
          simplex_bam = NA,
          stringsAsFactors = FALSE
        ) %>%
          mutate(
            cmo_patient_id = patient.id,
            Sample_Type = ifelse(
              grepl("-N", Sample_Barcode),
              "unfilterednormal",
              "duplex"
            ),
            dmp_patient_id = dmp_id
          )
      )
      access.sheet <- bind_rows(access.sheet, access.normal.sheet)
    }
  }

  if (!is.null(impact.sheet) && !is.null(access.sheet)) {
    print("DMP IMPACT and DMP ACCESS samples are available")
    bind_rows(impact.sheet, access.sheet)
  } else if (!is.null(access.sheet)) {
    print("DMP IMPACT samples are NOT available and DMP ACCESS samples are available")
    access.sheet
  } else if (!is.null(impact.sheet)) {
    print("DMP IMPACT samples are available and DMP ACCESS samples are NOT available")
    impact.sheet
  } else {
    print("No DMP IMPACT samples or DMP ACCESS samples are available")
    NULL
  }
}

#' Compile read counts for every unique call of every patient
#'
#' For each `cmo_patient_id` in `master.ref` this function writes a sample
#' sheet (plasma, paired normal, DMP IMPACT and DMP ACCESS samples), collects
#' the unique calls from the plasma MAFs and the DMP MAF, submits an LSF job
#' that tags hotspots and an LSF job that genotypes all of the patient's BAMs
#' with `genotype_variants`. Afterwards the unique calls of the whole cohort
#' are genotyped in the pooled normal BAMs; the function blocks until that
#' last job is done.
#'
#' @param master.ref Table with one row per plasma sample. The columns read
#'   here are `cmo_patient_id`, `dmp_patient_id`, `cmo_sample_id_plasma`,
#'   `cmo_sample_id_normal`, `bam_path_plasma_duplex`,
#'   `bam_path_plasma_simplex`, `bam_path_normal`, `paired` and `maf_path`.
#' @param results.dir Output directory; created when missing.
#' @param project.ID Value passed to `bsub -P` for every submitted job.
#' @param pooled.bam.dir Directory holding the pooled normal BAMs.
#' @param fasta.path Reference FASTA passed to `genotype_variants -r`.
#' @param genotyper.path Genotyper executable passed to `genotype_variants -g`.
#' @param dmp.dir Directory containing `data_mutations_extended.txt`.
#' @param mirror.bam.dir Root directory of the DMP IMPACT mirror BAMs.
#' @param mirror.access.bam.dir Root directory of the DMP ACCESS mirror BAMs.
#' @param dmp.key.path Key file mapping DMP IMPACT sample ids to BAM ids.
#' @param access.key.path Key file mapping DMP ACCESS sample ids to BAM ids.
#' @return Called for its side effects (files and LSF jobs); returns the
#'   final status string invisibly.
#' @export
compile_reads_all <- function(master.ref,
                              results.dir,
                              project.ID,
                              pooled.bam.dir = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/",
                              fasta.path = "/juno/work/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta",
                              genotyper.path = "/work/access/production/resources/tools/GetBaseCountsMultiSample/current/GetBaseCountsMultiSample",
                              dmp.dir = "/juno/work/access/production/resources/cbioportal/current/msk_solid_heme",
                              mirror.bam.dir = "/juno/res/dmpcollab/dmpshare/share/irb12_245",
                              mirror.access.bam.dir = "/juno/res/dmpcollab/dmpshare/share/access_12_245/",
                              dmp.key.path = "/juno/res/dmpcollab/dmprequest/12-245/key.txt",
                              access.key.path = "/juno/res/dmpcollab/dmprequest/ACCESS-12-245/key.txt") {
  # The data.table `[` syntax below needs a data.table, whatever was passed.
  master.ref <- as.data.table(master.ref)

  # setting up directory ----------------------------------------------------
  dir.create(results.dir, showWarnings = FALSE, recursive = TRUE)
  # make tmp directory in output directory
  dir.create(paste0(results.dir, "/tmp"), showWarnings = FALSE)

  # checking virtualenv -----------------------------------------------------
  if (!nzchar(Sys.which("genotype_variants"))) {
    stop(
      "needs to run \nsource /home/accessbot/miniconda3/bin/activate && conda activate genotype-variants-0.3.0"
    )
  }

  # data from DMP -----------------------------------------------------------
  dmp.patient.ids <-
    unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id)
  DMP.key <- fread(dmp.key.path)
  .report_missing_ids(
    "These DMP IDs are not found in DMP key file: ",
    dmp.patient.ids,
    gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1)
  )

  # data from DMP ACCESS ----------------------------------------------------
  access.key <-
    as.data.table(read.csv(access.key.path, header = FALSE, sep = ","))
  .report_missing_ids(
    "These DMP IDs are not found in DMP ACCESS key file: ",
    dmp.patient.ids,
    gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)
  )

  DMP.maf <-
    fread(paste0(dmp.dir, "/data_mutations_extended.txt")) %>%
    filter(Mutation_Status != "GERMLINE") %>%
    data.table()
  DMP.RET.maf <-
    DMP.maf[grepl(paste0(dmp.patient.ids, collapse = "|"), Tumor_Sample_Barcode), ]

  # Pooled normal samples ---------------------------------------------------
  # Anchored pattern: the previous ".bam" also matched index files such as
  # "sample.bam.bai", which would then have been genotyped as BAMs.
  pooled.bams <-
    list.files(pooled.bam.dir, pattern = "\\.bam$", full.names = TRUE)

  # For each patient --------------------------------------------------------
  patient.ids <- unique(master.ref$cmo_patient_id)
  print("Compiling reads per patient")
  all.fillout.id <- lapply(patient.ids, function(patient.id) {
    print(patient.id)
    patient.dir <- paste0(results.dir, "/", patient.id)
    unique.calls.maf <-
      paste0(patient.dir, "/", patient.id, "_all_unique_calls.maf")
    dir.create(patient.dir, showWarnings = FALSE)
    patient.rows <- master.ref[cmo_patient_id == patient.id]

    # sample sheet -- DMP tumor, DMP normal, plasma, plasma normal ----------
    # A patient is expected to have at most one DMP id. Blank/NA ids mean
    # "no DMP samples"; several distinct ids used to break the scalar `if`.
    dmp_id <- unique(patient.rows$dmp_patient_id)
    dmp_id <- dmp_id[!is.na(dmp_id) & dmp_id != ""]
    if (length(dmp_id) > 1) {
      warning(
        "Patient ", patient.id, " has several DMP ids (",
        paste0(dmp_id, collapse = ", "), "); using the first one",
        call. = FALSE
      )
      dmp_id <- dmp_id[1]
    }
    dmp.sample.sheet <- NULL
    if (length(dmp_id) == 1) {
      dmp.sample.sheet <- .build_dmp_sample_sheet(
        dmp_id,
        patient.id,
        DMP.key,
        access.key,
        mirror.bam.dir,
        mirror.access.bam.dir
      )
    }

    # plasma bams -- duplex and simplex bam
    plasma.sheet <- patient.rows[, .(
      Sample_Barcode = as.character(cmo_sample_id_plasma),
      standard_bam = NA,
      duplex_bam = bam_path_plasma_duplex,
      simplex_bam = bam_path_plasma_simplex,
      cmo_patient_id,
      Sample_Type = "duplex",
      dmp_patient_id
    )]
    # buffy coat -- standard bam only
    normal.sheet <- patient.rows[paired == "Paired", .(
      Sample_Barcode = as.character(cmo_sample_id_normal),
      standard_bam = bam_path_normal,
      duplex_bam = NA,
      simplex_bam = NA,
      cmo_patient_id,
      Sample_Type = "unfilterednormal",
      dmp_patient_id
    )]
    sample.sheet <- merge(
      plasma.sheet,
      rbind(unique(normal.sheet), dmp.sample.sheet),
      all = TRUE
    )
    # Drop rows without a barcode (NA or ''). This must be `&`: with `|`
    # every empty-string barcode passed the filter.
    sample.sheet <-
      sample.sheet[!is.na(Sample_Barcode) & Sample_Barcode != ""]
    write.table(
      sample.sheet,
      paste0(patient.dir, "/", patient.id, "_sample_sheet.tsv"),
      sep = "\t",
      quote = FALSE,
      row.names = FALSE
    )

    # piece together all unique calls ---------------------------------------
    # fill = TRUE tolerates MAFs that carry different subsets of columns.
    duplex.calls <- rbindlist(
      lapply(patient.rows$maf_path, .read_duplex_calls),
      use.names = TRUE,
      fill = TRUE
    )
    impact.calls <-
      DMP.RET.maf[Tumor_Sample_Barcode %in% sample.sheet$Sample_Barcode]
    write.table(
      impact.calls[, .variant_key_columns, with = FALSE],
      paste0(patient.dir, "/", patient.id, "_impact_calls.maf"),
      sep = "\t",
      quote = FALSE,
      row.names = FALSE
    )
    # combining plasma and impact calls on the columns both sources share
    shared.columns <-
      intersect(colnames(duplex.calls), colnames(DMP.RET.maf))
    all.calls <- rbind(
      duplex.calls[, shared.columns, with = FALSE],
      impact.calls[, shared.columns, with = FALSE]
    )
    # keep the first occurrence of every event, reset the count columns and
    # drop silent, RP11- and intronic calls
    all.calls <-
      all.calls[!duplicated(all.calls[, .variant_key_columns, with = FALSE]), ] %>%
      mutate(
        t_ref_count = 0,
        t_alt_count = 0,
        n_ref_count = 0,
        n_alt_count = 0,
        Matched_Norm_Sample_Barcode = NA
      ) %>%
      filter(
        Variant_Classification != "Silent" &
          !grepl("RP11-", Hugo_Symbol) &
          !grepl("Intron", Variant_Classification)
      )
    write.table(
      all.calls,
      unique.calls.maf,
      sep = "\t",
      quote = FALSE,
      row.names = FALSE
    )

    # tagging hotspots (fire-and-forget LSF job)
    system(
      paste0(
        'bsub -R "rusage[mem=4]" -cwd ',
        patient.dir,
        "/ -oo hotspot.o -eo hotspot.e -W 00:59 ",
        " -P ",
        project.ID,
        " -J ",
        patient.id,
        "_tag_hotspot ",
        " python /work/access/production/workflows/access_workflows/v1/pipeline_2.0.0/ACCESS-Pipeline/cwl_tools/hotspots/tag_hotspots.py ",
        " -m ",
        unique.calls.maf,
        " -itxt /work/access/production/resources/msk-access/current/regions_of_interest/current/hotspot-list-union-v1-v2_with_TERT.txt ",
        " -o ",
        patient.dir,
        "/",
        patient.id,
        "_all_unique_calls_hotspots.maf",
        " -outdir ",
        patient.dir,
        "/",
        patient.id
      )
    )

    # genotype all bams in this patient directory ---------------------------
    # plasma duplex & simplex, plasma normal, DMP samples
    write.table(
      sample.sheet[, .(
        sample_id = Sample_Barcode,
        maf = unique.calls.maf,
        standard_bam,
        duplex_bam,
        simplex_bam
      )],
      paste0(patient.dir, "/", patient.id, "_genotype_metadata.tsv"),
      sep = "\t",
      quote = FALSE,
      row.names = FALSE
    )
    bsub.output <- system(
      paste0(
        "bsub -cwd ",
        patient.dir,
        ' -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ',
        " -P ",
        project.ID,
        " -J ",
        patient.id,
        "_genotype_variants ",
        " genotype_variants small_variants multiple-samples -i ",
        patient.dir,
        "/",
        patient.id,
        "_genotype_metadata.tsv",
        " -r ",
        fasta.path,
        " -g ",
        genotyper.path,
        " -v DEBUG "
      ),
      intern = TRUE
    )
    job.id <- .parse_bsub_job_id(bsub.output)
    if (is.na(job.id)) {
      # Without an id the pooled job below could never depend on this one.
      stop(
        "Could not submit the genotyping job for patient ", patient.id,
        call. = FALSE
      )
    }
    job.id
  })

  # Get base count multi sample in pooled normal ----------------------------
  # all unique calls in the entire cohort
  print("Compiling reads in pooled samples")
  pooled.dir <- paste0(results.dir, "/pooled")
  dir.create(pooled.dir, showWarnings = FALSE)
  all.all.unique.mafs <- rbindlist(
    lapply(patient.ids, function(patient.id) {
      fread(
        paste0(results.dir, "/", patient.id, "/", patient.id, "_all_unique_calls.maf")
      )
    }),
    use.names = TRUE,
    fill = TRUE
  )
  all.all.unique.mafs <-
    all.all.unique.mafs[!duplicated(all.all.unique.mafs[, .variant_key_columns, with = FALSE]), ]
  write.table(
    all.all.unique.mafs,
    paste0(pooled.dir, "/all_all_unique.maf"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )

  write.table(
    data.frame(
      sample_id = gsub("^.*./|.bam", "", pooled.bams),
      maf = paste0(pooled.dir, "/all_all_unique.maf"),
      standard_bam = pooled.bams,
      duplex_bam = "",
      simplex_bam = ""
    ),
    paste0(pooled.dir, "/pooled_metadata.tsv"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )

  # The pooled job only starts once every per-patient genotyping job is done.
  pooled.bsub.output <- system(
    paste0(
      "bsub -cwd ",
      results.dir,
      '/pooled -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ',
      " -w ",
      ' \"',
      paste0(paste0("done(", unlist(all.fillout.id), ")"), collapse = "&&"),
      '\" ',
      " -P ",
      project.ID,
      " -J pooled_genotype_variants ",
      " genotype_variants small_variants multiple-samples -i ",
      results.dir,
      "/pooled/pooled_metadata.tsv",
      " -r ",
      fasta.path,
      " -g ",
      genotyper.path,
      " -v DEBUG "
    ),
    intern = TRUE
  )
  .wait_for_lsf_job(.parse_bsub_job_id(pooled.bsub.output))
  print("Compile reads done!")
}

# Executable -----------------------------------------------------------------
# Minimal columns for input mafs
#
# Hugo_Symbol,Chromosome,Start_Position,End_Position,Tumor_Sample_Barcode,Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2,D_t_alt_count_fragment
#
# NOTE(review): this section runs whenever the file is evaluated in a
# non-interactive session, not only when it is launched with Rscript --
# confirm that is intended for a file that also carries an @export tag.

suppressPackageStartupMessages({
  library(data.table)
  library(tidyr)
  library(stringr)
  library(dplyr)
  library(argparse)
})

if (!interactive()) {
  parser <- ArgumentParser()
  parser$add_argument("-m", "--masterref", type = "character", help = "File path to master reference file")
  parser$add_argument("-o", "--resultsdir", type = "character", help = "Output directory")
  parser$add_argument(
    "-pid",
    "--projectid",
    type = "character",
    default = "",
    help = "Project ID for submitted jobs involved in this run"
  )
  parser$add_argument(
    "-pb",
    "--pooledbamdir",
    type = "character",
    default = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/",
    help = "Directory for all pooled bams [default]"
  )
  parser$add_argument(
    "-fa",
    "--fastapath",
    type = "character",
    default = "/juno/work/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta",
    help = "Reference fasta path [default]"
  )
  parser$add_argument(
    "-gt",
    "--genotyperpath",
    type = "character",
    default = "/work/access/production/resources/tools/GetBaseCountsMultiSample/current/GetBaseCountsMultiSample",
    help = "Genotyper executable path [default]"
  )
  parser$add_argument(
    "-dmp",
    "--dmpdir",
    type = "character",
    default = "/juno/work/access/production/resources/cbioportal/current/msk_solid_heme",
    help = "Directory of clinical DMP repository [default]"
  )
  parser$add_argument(
    "-mb",
    "--mirrorbamdir",
    type = "character",
    default = "/juno/res/dmpcollab/dmpshare/share/irb12_245",
    help = "Mirror BAM file directory [default]"
  )
  parser$add_argument(
    "-mab",
    "--mirroraccessbamdir",
    type = "character",
    default = "/juno/res/dmpcollab/dmpshare/share/access_12_245",
    help = "Mirror BAM file directory for MSK-ACCESS [default]"
  )
  parser$add_argument(
    "-dmpk",
    "--dmpkeypath",
    type = "character",
    default = "/juno/res/dmpcollab/dmprequest/12-245/key.txt",
    help = "DMP mirror BAM key file [default]"
  )
  parser$add_argument(
    "-dmpak",
    "--dmpaccesskeypath",
    type = "character",
    default = "/juno/res/dmpcollab/dmprequest/ACCESS-12-245/key.txt",
    help = "DMP mirror BAM key file for MSK-ACCESS [default]"
  )
  args <- parser$parse_args()

  master.ref <- args$masterref
  results.dir <- args$resultsdir
  project.ID <- args$projectid
  pooled.bam.dir <- args$pooledbamdir
  fasta.path <- args$fastapath
  genotyper.path <- args$genotyperpath
  dmp.dir <- args$dmpdir
  mirror.bam.dir <- args$mirrorbamdir
  mirror.access.bam.dir <- args$mirroraccessbamdir
  dmp.key.path <- args$dmpkeypath
  access.key.path <- args$dmpaccesskeypath

  # Neither option has a default; fail here with a readable message rather
  # than deep inside fread()/dir.create().
  if (is.null(master.ref) || is.null(results.dir)) {
    stop("Both --masterref and --resultsdir must be provided", call. = FALSE)
  }

  # No project id given: use a random 10-digit string instead.
  if (project.ID == "") {
    project.ID <-
      paste0(sample(c(0:9), size = 10, replace = TRUE), collapse = "")
  }

  print(paste0("Input parameters for run ", project.ID))
  print(master.ref)
  print(results.dir)
  print(pooled.bam.dir)
  print(fasta.path)
  print(genotyper.path)
  print(dmp.dir)
  print(mirror.bam.dir)
  print(mirror.access.bam.dir)
  print(dmp.key.path)
  print(access.key.path)
  # Warnings are suppressed as before (e.g. the one_of() warnings about
  # columns that a MAF does not have).
  suppressWarnings(
    compile_reads_all(
      fread(master.ref),
      results.dir,
      project.ID,
      pooled.bam.dir,
      fasta.path,
      genotyper.path,
      dmp.dir,
      mirror.bam.dir,
      mirror.access.bam.dir,
      dmp.key.path,
      access.key.path
    )
  )
  print("compile reads function finished")
}
+ +## Example command + +### Explicitly specifying files on command line + +```bash +python csv_to_maf.py -i /path/to/Test1.csv -i /path/to/Test2.csv -i /path/to/Test3.csv +``` + +### Specifying files in a text FileOfFiles + +```bash +python csv_to_maf.py -l /path/to/FileOfFiles.txt +``` + +where **FileOfFiles.txt** + +```bash +> cat FileOfFiles.txt +/path/to/Test1.csv +/path/to/Test2.csv +/path/to/Test3.csv +``` + +### Keeping normal samples identified using "normal" string, by default they are filtered + +```bash +python csv_to_maf.py -n -i /path/to/Test1.csv -i /path/to/Test2.csv -i /path/to/Test3.csv +# OR +python csv_to_maf.py -n -l /path/to/FileOfFiles.txt +``` + +## Usage + +```bash +> python csv_to_maf.py --help +Usage: csv_to_maf.py [OPTIONS] + + Tool does the following operations: + + A. Read one or more files from the inputs + + B. Removes unwanted columns, modifying the column headers depending on the + requirements + + C. Massaging the data frame to make it compatible with MAF format + + D. Write the data frame to a file in MAF format and Excel format + + Requirement: pandas; openpyxl; typing; typer; + +Options: + -l, --list PATH File of files, List of CSV files to be + converted to maf, one per line, no header, + CSV file generated by Rscript filter_calls.R + [default: ] + + -i, --csv FILE File to convert from csv to maf. CSV file + generated by Rscript filter_calls.R, Can be + given multiple times [default: ] + + -n, --normal / -N, --keep-normal + Keep samples tagged as normal [default: + False] + + -p, --prefix TEXT Prefix of the output MAF and EXCEL file + [default: csv_to_maf_output] + + --install-completion Install completion for the current shell. + --show-completion Show completion for the current shell, to + copy it or customize the installation. + + --help Show this message and exit. 
from pathlib import Path
from typing import List, Optional

import pandas as pd
import typer

# Columns of the filter_calls.R CSV that describe the variant itself; every
# other column is melted into one row per sample.
_ID_COLUMNS = [
    "Hugo_Symbol",
    "Chromosome",
    "Start_Position",
    "End_Position",
    "Variant_Classification",
    "HGVSp_Short",
    "Reference_Allele",
    "Tumor_Seq_Allele2",
    "ExAC_AF",
    "Hotspot",
    "DMP",
    "CH",
    "call_confidence",
]

# MAF columns that are written as empty strings.
_BLANK_MAF_COLUMNS = [
    "Strand",
    "Consequence",
    "dbSNP_RS",
    "dbSNP_Val_Status",
    "Match_Norm_Seq_Allele1",
    "Match_Norm_Seq_Allele2",
    "Tumor_Validation_Allele1",
    "Tumor_Validation_Allele2",
    "Match_Norm_Validation_Allele1",
    "Match_Norm_Validation_Allele2",
    "Verification_Status",
    "Validation_Status",
    "Mutation_Status",
    "Sequencing_Phase",
    "Sequence_Source",
    "Validation_Method",
    "Score",
    "BAM_File",
    "Sequencer",
    "n_ref_count",
    "n_alt_count",
    "HGVSc",
    "HGVSp",
    "Transcript_ID",
    "RefSeq",
    "Protein_position",
    "Codons",
]

# Final column order of the output. Columns listed here that were never
# filled (Variant_Type, Matched_Norm_Sample_Barcode) are created empty by
# reindex().
_MAF_COLUMNS = [
    "Hugo_Symbol",
    "Entrez_Gene_Id",
    "Center",
    "NCBI_Build",
    "Chromosome",
    "Start_Position",
    "End_Position",
    "Strand",
    "Consequence",
    "Variant_Classification",
    "Variant_Type",
    "Reference_Allele",
    "Tumor_Seq_Allele1",
    "Tumor_Seq_Allele2",
    "dbSNP_RS",
    "dbSNP_Val_Status",
    "Tumor_Sample_Barcode",
    "Matched_Norm_Sample_Barcode",
    "Match_Norm_Seq_Allele1",
    "Match_Norm_Seq_Allele2",
    "Tumor_Validation_Allele1",
    "Tumor_Validation_Allele2",
    "Match_Norm_Validation_Allele1",
    "Match_Norm_Validation_Allele2",
    "Verification_Status",
    "Validation_Status",
    "Mutation_Status",
    "Sequencing_Phase",
    "Sequence_Source",
    "Validation_Method",
    "Score",
    "BAM_File",
    "Sequencer",
    "t_depth",
    "t_ref_count",
    "t_alt_count",
    "t_alt_freq",
    "n_ref_count",
    "n_alt_count",
    "HGVSc",
    "HGVSp",
    "HGVSp_Short",
    "Transcript_ID",
    "RefSeq",
    "Protein_position",
    "Codons",
    "Hotspot",
    "DMP",
    "CH",
    "call_confidence",
    "ExAC_AF",
]


def _read_file_of_files(list_of_files):
    """Return the non-blank, stripped lines of the file of files."""
    # The handle is closed on exit; blank lines are skipped because each one
    # would otherwise be reported as a missing CSV file and abort the run.
    with open(list_of_files, "r") as handle:
        return [line.strip() for line in handle if line.strip()]


def _csv_to_maf_frame(csv_df, keep_normal):
    """Convert one filter_calls.R table to MAF layout.

    Returns None when no row is left after removing the calls whose
    call_confidence contains "drop" (case-insensitive).
    """
    # Remove helper columns; "normal" sample columns go too unless requested.
    # regex=False: the names are literal text ("." is not a wildcard).
    unwanted_tokens = ["__duplex.called", "duplex_support_num"]
    if not keep_normal:
        unwanted_tokens.append("normal")
    for token in unwanted_tokens:
        csv_df = csv_df.loc[:, ~csv_df.columns.str.contains(token, regex=False)]

    dropped = (
        csv_df["call_confidence"]
        .astype(str)
        .str.lower()
        .str.contains("drop", na=False)
    )
    csv_df = csv_df[~dropped]
    if csv_df.empty:
        return None

    # One row per (variant, sample column).
    maf_df = csv_df.melt(
        id_vars=_ID_COLUMNS,
        var_name="Tumor_Sample_Barcode",
        value_name="Evidence",
    )
    # The sample name is the part of the column header before "___".
    maf_df["Tumor_Sample_Barcode"] = (
        maf_df["Tumor_Sample_Barcode"].str.split("___", n=1).str.get(0)
    )
    maf_df["Chromosome"] = maf_df["Chromosome"].astype(str)

    # Evidence is split at the first "/" into alt count and the rest; the
    # depth is what precedes the first "(" of the rest. reindex() guarantees
    # both pieces exist even when no value contains "/".
    evidence = (
        maf_df["Evidence"].str.split("/", n=1, expand=True).reindex(columns=[0, 1])
    )
    maf_df["t_alt_count"] = pd.to_numeric(evidence[0], errors="coerce")
    maf_df["t_depth"] = pd.to_numeric(
        evidence[1].astype(object).str.split("(", n=1).str.get(0),
        errors="coerce",
    )
    maf_df["t_ref_count"] = maf_df["t_depth"] - maf_df["t_alt_count"]
    maf_df["t_alt_freq"] = (maf_df["t_alt_count"] / maf_df["t_depth"]).round(4)
    maf_df = maf_df.drop(columns=["Evidence"])

    # Constant and placeholder MAF columns.
    maf_df["Entrez_Gene_Id"] = 0
    maf_df["Center"] = "mskcc.org"
    maf_df["NCBI_Build"] = "GRCh37"
    maf_df["Tumor_Seq_Allele1"] = maf_df["Reference_Allele"]
    for column in _BLANK_MAF_COLUMNS:
        maf_df[column] = ""
    return maf_df.reindex(columns=_MAF_COLUMNS)


# NOTE(review): the flag pair "--normal/--keep-normal" looks like an on/off
# pair, which would make "--keep-normal" switch keeping OFF despite its name.
# Left unchanged to keep the command line stable -- confirm with typer docs.
def main(
    list_of_files: Path = typer.Option(
        "",
        "--list",
        "-l",
        help="File of files, List of CSV files to be converted to maf, one per line, no header, CSV file generated by Rscript filter_calls.R",
    ),
    csv: Optional[List[Path]] = typer.Option(
        "",
        "--csv",
        "-i",
        exists=True,
        file_okay=True,
        dir_okay=False,
        writable=False,
        readable=True,
        resolve_path=True,
        help="File to convert from csv to maf. CSV file generated by Rscript filter_calls.R, Can be given multiple times",
    ),
    normal: bool = typer.Option(
        False,
        "--normal/--keep-normal",
        "-n/-N",
        help="Keep samples tagged as normal",
    ),
    output_file_prefix: str = typer.Option(
        "csv_to_maf_output",
        "--prefix",
        "-p",
        help="Prefix of the output MAF and EXCEL file",
    ),
):

    """
    Tool does the following operations:

    A. Read one or more files from the inputs

    B. Removes unwanted columns, modifying the column headers depending on the requirements

    C. Massaging the data frame to make it compatible with MAF format

    D. Write the data frame to a file in MAF format and Excel format

    Requirement:
    pandas; openpyxl; typing; typer;

    """
    # The "" default may reach us as "" or as a Path (which is always
    # truthy), so also require that the value names an existing file.
    has_file_of_files = bool(list_of_files) and Path(list_of_files).is_file()
    if not has_file_of_files:
        typer.secho(
            "File are not provided as file of files.", fg=typer.colors.BRIGHT_YELLOW
        )
        if not csv:
            typer.secho(
                "File were not provided via command line as well",
                fg=typer.colors.BRIGHT_RED,
            )
            raise typer.Abort()

    # Files given with -i take precedence; the file of files is only read
    # when no -i was given.
    if not csv:
        csv = _read_file_of_files(list_of_files)

    # Collect the converted frames and concatenate once at the end
    # (DataFrame.append no longer exists in pandas 2).
    converted_frames = []
    for csv_file in csv:
        if not Path(csv_file).is_file():
            typer.secho(f"{csv_file} file does not exists", fg=typer.colors.BRIGHT_RED)
            raise typer.Abort()
        typer.secho(f"Reading: {csv_file}", fg=typer.colors.BRIGHT_GREEN)
        csv_df = pd.read_csv(csv_file, sep=",", low_memory=False)
        maf_df = _csv_to_maf_frame(csv_df, keep_normal=normal)
        if maf_df is not None:
            converted_frames.append(maf_df)

    # pd.concat() rejects an empty list; keep writing empty outputs then.
    if converted_frames:
        final_df = pd.concat(converted_frames, ignore_index=True)
    else:
        final_df = pd.DataFrame()

    # write final_df to tsv and excel
    typer.secho(
        f"Done processing the CSV file writing output to {output_file_prefix} in txt and excel format",
        fg=typer.colors.GREEN,
    )
    final_df.to_csv(f"{output_file_prefix}.maf", index=False, sep="\t")
    final_df.to_excel(f"{output_file_prefix}.xlsx", index=False)


if __name__ == "__main__":
    typer.run(main)
Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer t_depth t_ref_count t_alt_count t_alt_freq n_ref_count n_alt_count HGVSc HGVSp HGVSp_Short Transcript_ID RefSeq Protein_position Codons Hotspot DMP CH call_confidence ExAC_AF +ANKRD11 0 mskcc.org GRCh37 16 89347285 89347285 Missense_Mutation T T C test 0 0 0 p.K1889E Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453136 Missense_Mutation A A T test 1488 1359 129 0.0867 p.V600E Hotspot Signed out Yes High +CDKN2A 0 mskcc.org GRCh37 9 21971120 21971121 Nonsense_Mutation GG GG AA test 1153 1124 29 0.0252 p.R80* Hotspot Signed out No High +CREBBP 0 mskcc.org GRCh37 16 3831298 3831298 Missense_Mutation G G A test 0 0 0 p.P528L Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31383227 31383227 Missense_Mutation G G A test 0 0 0 p.R380Q Signed out No High +EZH1 0 mskcc.org GRCh37 17 40855800 40855800 Nonsense_Mutation G G A test 0 0 0 p.R686* Signed out No High +KIT 0 mskcc.org GRCh37 4 55570044 55570044 Missense_Mutation C C T test 0 0 0 p.T304I Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32166850 32166850 Missense_Mutation C C T test 1 1 0 0 p.G1463E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9520177 9520177 Missense_Mutation G G A test 0 0 0 p.P698S Signed out No High +PIK3R1 0 mskcc.org GRCh37 5 67593308 67593308 Missense_Mutation A A G test 0 0 0 p.Y685C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41420106 41420106 Missense_Mutation C C T test 0 0 0 p.G72E Signed out No High +RAD51B 0 mskcc.org GRCh37 14 68352692 68352692 Missense_Mutation G G A test 0 0 0 p.E187K Signed out No High +SPEN 0 mskcc.org 
GRCh37 1 16199514 16199514 Missense_Mutation C C T test 0 0 0 p.S96F Signed out No High +SRSF2 0 mskcc.org GRCh37 17 74732283 74732283 Missense_Mutation G G A test 0 0 0 p.P209L Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 999 980 19 0.019 Signed out No Low +ANKRD11 0 mskcc.org GRCh37 16 89347285 89347285 Missense_Mutation T T C test 0 0 0 p.K1889E Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453136 Missense_Mutation A A T test 2086 2081 5 0.0024 p.V600E Hotspot Signed out Yes High +CDKN2A 0 mskcc.org GRCh37 9 21971120 21971121 Nonsense_Mutation GG GG AA test 2143 2140 3 0.0014 p.R80* Hotspot Signed out No High +CREBBP 0 mskcc.org GRCh37 16 3831298 3831298 Missense_Mutation G G A test 0 0 0 p.P528L Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31383227 31383227 Missense_Mutation G G A test 0 0 0 p.R380Q Signed out No High +EZH1 0 mskcc.org GRCh37 17 40855800 40855800 Nonsense_Mutation G G A test 1 1 0 0 p.R686* Signed out No High +KIT 0 mskcc.org GRCh37 4 55570044 55570044 Missense_Mutation C C T test 0 0 0 p.T304I Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32166850 32166850 Missense_Mutation C C T test 0 0 0 p.G1463E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9520177 9520177 Missense_Mutation G G A test 0 0 0 p.P698S Signed out No High +PIK3R1 0 mskcc.org GRCh37 5 67593308 67593308 Missense_Mutation A A G test 0 0 0 p.Y685C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41420106 41420106 Missense_Mutation C C T test 0 0 0 p.G72E Signed out No High +RAD51B 0 mskcc.org GRCh37 14 68352692 68352692 Missense_Mutation G G A test 0 0 0 p.E187K Signed out No High +SPEN 0 mskcc.org GRCh37 1 16199514 16199514 Missense_Mutation C C T test 0 0 0 p.S96F Signed out No High +SRSF2 0 mskcc.org GRCh37 17 74732283 74732283 Missense_Mutation G G A test 0 0 0 p.P209L Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 1743 1743 0 0 Signed out No Low +ANKRD11 0 mskcc.org GRCh37 16 89347285 
89347285 Missense_Mutation T T C test 0 0 0 p.K1889E Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453136 Missense_Mutation A A T test 2238 2238 0 0 p.V600E Hotspot Signed out Yes High +CDKN2A 0 mskcc.org GRCh37 9 21971120 21971121 Nonsense_Mutation GG GG AA test 2243 2243 0 0 p.R80* Hotspot Signed out No High +CREBBP 0 mskcc.org GRCh37 16 3831298 3831298 Missense_Mutation G G A test 0 0 0 p.P528L Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31383227 31383227 Missense_Mutation G G A test 0 0 0 p.R380Q Signed out No High +EZH1 0 mskcc.org GRCh37 17 40855800 40855800 Nonsense_Mutation G G A test 0 0 0 p.R686* Signed out No High +KIT 0 mskcc.org GRCh37 4 55570044 55570044 Missense_Mutation C C T test 1 1 0 0 p.T304I Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32166850 32166850 Missense_Mutation C C T test 0 0 0 p.G1463E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9520177 9520177 Missense_Mutation G G A test 0 0 0 p.P698S Signed out No High +PIK3R1 0 mskcc.org GRCh37 5 67593308 67593308 Missense_Mutation A A G test 0 0 0 p.Y685C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41420106 41420106 Missense_Mutation C C T test 0 0 0 p.G72E Signed out No High +RAD51B 0 mskcc.org GRCh37 14 68352692 68352692 Missense_Mutation G G A test 0 0 0 p.E187K Signed out No High +SPEN 0 mskcc.org GRCh37 1 16199514 16199514 Missense_Mutation C C T test 0 0 0 p.S96F Signed out No High +SRSF2 0 mskcc.org GRCh37 17 74732283 74732283 Missense_Mutation G G A test 0 0 0 p.P209L Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 1869 1869 0 0 Signed out No Low +ANKRD11 0 mskcc.org GRCh37 16 89347285 89347285 Missense_Mutation T T C test 0 0 0 p.K1889E Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453136 Missense_Mutation A A T test 485 485 0 0 p.V600E Hotspot Signed out Yes High +CDKN2A 0 mskcc.org GRCh37 9 21971120 21971121 Nonsense_Mutation GG GG AA test 839 839 0 0 p.R80* Hotspot Signed out No High +CREBBP 0 mskcc.org GRCh37 
16 3831298 3831298 Missense_Mutation G G A test 0 0 0 p.P528L Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31383227 31383227 Missense_Mutation G G A test 0 0 0 p.R380Q Signed out No High +EZH1 0 mskcc.org GRCh37 17 40855800 40855800 Nonsense_Mutation G G A test 0 0 0 p.R686* Signed out No High +KIT 0 mskcc.org GRCh37 4 55570044 55570044 Missense_Mutation C C T test 0 0 0 p.T304I Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32166850 32166850 Missense_Mutation C C T test 0 0 0 p.G1463E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9520177 9520177 Missense_Mutation G G A test 0 0 0 p.P698S Signed out No High +PIK3R1 0 mskcc.org GRCh37 5 67593308 67593308 Missense_Mutation A A G test 0 0 0 p.Y685C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41420106 41420106 Missense_Mutation C C T test 0 0 0 p.G72E Signed out No High +RAD51B 0 mskcc.org GRCh37 14 68352692 68352692 Missense_Mutation G G A test 0 0 0 p.E187K Signed out No High +SPEN 0 mskcc.org GRCh37 1 16199514 16199514 Missense_Mutation C C T test 0 0 0 p.S96F Signed out No High +SRSF2 0 mskcc.org GRCh37 17 74732283 74732283 Missense_Mutation G G A test 0 0 0 p.P209L Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 316 316 0 0 Signed out No Low +ANKRD11 0 mskcc.org GRCh37 16 89347285 89347285 Missense_Mutation T T C test 835 721 114 0.1365 p.K1889E Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453136 Missense_Mutation A A T test 661 485 176 0.2663 p.V600E Hotspot Signed out Yes High +CDKN2A 0 mskcc.org GRCh37 9 21971120 21971121 Nonsense_Mutation GG GG AA test 482 395 87 0.1805 p.R80* Hotspot Signed out No High +CREBBP 0 mskcc.org GRCh37 16 3831298 3831298 Missense_Mutation G G A test 525 465 60 0.1143 p.P528L Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31383227 31383227 Missense_Mutation G G A test 710 613 97 0.1366 p.R380Q Signed out No High +EZH1 0 mskcc.org GRCh37 17 40855800 40855800 Nonsense_Mutation G G A test 860 769 91 0.1058 p.R686* Signed out No 
High +KIT 0 mskcc.org GRCh37 4 55570044 55570044 Missense_Mutation C C T test 463 385 78 0.1685 p.T304I Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32166850 32166850 Missense_Mutation C C T test 1149 1015 134 0.1166 p.G1463E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9520177 9520177 Missense_Mutation G G A test 765 673 92 0.1203 p.P698S Signed out No High +PIK3R1 0 mskcc.org GRCh37 5 67593308 67593308 Missense_Mutation A A G test 512 450 62 0.1211 p.Y685C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41420106 41420106 Missense_Mutation C C T test 351 309 42 0.1197 p.G72E Signed out No High +RAD51B 0 mskcc.org GRCh37 14 68352692 68352692 Missense_Mutation G G A test 482 422 60 0.1245 p.E187K Signed out No High +SPEN 0 mskcc.org GRCh37 1 16199514 16199514 Missense_Mutation C C T test 749 670 79 0.1055 p.S96F Signed out No High +SRSF2 0 mskcc.org GRCh37 17 74732283 74732283 Missense_Mutation G G A test 459 398 61 0.1329 p.P209L Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 382 282 100 0.2618 Signed out No Low +ARID1B 0 mskcc.org GRCh37 6 157528952 157528952 Missense_Mutation C C T test 0 0 0 p.S2226L Signed out No High +ARID2 0 mskcc.org GRCh37 12 46245843 46245843 Nonsense_Mutation C C T test 0 0 0 p.Q1313* Signed out No High +AXL 0 mskcc.org GRCh37 19 41745117 41745117 Missense_Mutation G G A test 0 0 0 p.G395R Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453137 Missense_Mutation AC AC TT test 842 632 210 0.2494 p.V600K Hotspot Signed out No High +BRCA2 0 mskcc.org GRCh37 13 32911493 32911493 Missense_Mutation T T C test 875 623 252 0.288 p.S1001P Signed out No High +BTK 0 mskcc.org GRCh37 X 100613674 100613674 Missense_Mutation C C T test 0 0 0 p.G302E Signed out No High +CARD11 0 mskcc.org GRCh37 7 2952942 2952942 Missense_Mutation C C T test 0 0 0 p.E1000K Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31384996 31384996 Missense_Mutation C C T test 0 0 0 p.R461C Signed out No High +EPHA7 0 mskcc.org 
GRCh37 6 93965572 93965572 Missense_Mutation C C T test 0 0 0 p.D786N Signed out No High +ERCC2 0 mskcc.org GRCh37 19 45868119 45868119 Missense_Mutation G G A test 43 35 8 0.186 p.P191S Signed out No High 9.42E-06 +ESR1 0 mskcc.org GRCh37 6 152129330 152129330 Missense_Mutation G G A test 1 1 0 0 p.G95R Signed out No High +FAT1 0 mskcc.org GRCh37 4 187629805 187629805 Missense_Mutation G G A test 0 0 0 p.P393S Signed out No High +HGF 0 mskcc.org GRCh37 7 81346589 81346589 Missense_Mutation C C T test 0 0 0 p.G455E Signed out No High +HIST3H3 0 mskcc.org GRCh37 1 228612807 228612807 Missense_Mutation C C T test 15 13 2 0.1333 p.E74K Signed out No High +HLA-A 0 mskcc.org GRCh37 6 29911302 29911302 Missense_Mutation G G A test 0 0 0 p.E201K Signed out No High +IKZF1 0 mskcc.org GRCh37 7 50367234 50367234 Missense_Mutation G G A test 0 0 0 p.G14E Signed out No High +IL7R 0 mskcc.org GRCh37 5 35876230 35876230 Missense_Mutation G G A test 0 0 0 p.G341E Signed out No High +KLF4 0 mskcc.org GRCh37 9 110249881 110249881 Missense_Mutation G G A test 0 0 0 p.P265L Signed out No High +KMT2C 0 mskcc.org GRCh37 7 151845361 151845362 Missense_Mutation CC CC TT test 0 0 0 p.G4551S Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Frame_Shift_Del G G - test 0 0 0 p.E110Kfs*15 Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Missense_Mutation G G A test 0 0 0 p.E110K Signed out No High +MAP2K1 0 mskcc.org GRCh37 15 66729162 66729163 Missense_Mutation CC CC TT test 1129 870 259 0.2294 p.P124L Hotspot Signed out No High +MET 0 mskcc.org GRCh37 7 116415001 116415001 Missense_Mutation C C T test 760 601 159 0.2092 p.S1032F Signed out No High +MGA 0 mskcc.org GRCh37 15 41961969 41961969 Missense_Mutation C C T test 0 0 0 p.R293C Signed out No High +MITF 0 mskcc.org GRCh37 3 69788756 69788757 Missense_Mutation CC CC TT test 0 0 0 p.S3F Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32183063 32183063 Missense_Mutation C C T test 0 0 0 p.G654E Signed out No 
High +NTRK2 0 mskcc.org GRCh37 9 87285689 87285689 Missense_Mutation G G A test 0 0 0 p.G9E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9546694 9546694 Missense_Mutation C C T test 1 0 1 1 p.G443E Signed out No High +PIK3CG 0 mskcc.org GRCh37 7 106509612 106509612 Missense_Mutation C C T test 0 0 0 p.P536S Signed out No High +PPARG 0 mskcc.org GRCh37 3 12393114 12393114 Missense_Mutation C C T test 0 0 0 p.S8F Signed out No High +PRDM14 0 mskcc.org GRCh37 8 70970993 70970993 Missense_Mutation G G A test 0 0 0 p.P423L Signed out No High +PTEN 0 mskcc.org GRCh37 10 89717705 89717705 Missense_Mutation C C T test 571 425 146 0.2557 p.P244S Signed out No High +PTPRD 0 mskcc.org GRCh37 9 8484270 8484270 Missense_Mutation G G A test 0 0 0 p.R1088C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40980844 40980844 Missense_Mutation C C T test 0 0 0 p.E548K Hotspot Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41419877 41419878 Missense_Mutation CT CT TA test 0 0 0 p.K148I Signed out No High +ROS1 0 mskcc.org GRCh37 6 117710750 117710750 Missense_Mutation C C T test 0 0 0 p.D508N Signed out No High +RTEL1 0 mskcc.org GRCh37 20 62293236 62293236 Missense_Mutation C C A test 0 0 0 p.A112D Signed out No High +SYK 0 mskcc.org GRCh37 9 93624541 93624541 Missense_Mutation G G A test 0 0 0 p.G211E Signed out No High +SYK 0 mskcc.org GRCh37 9 93637113 93637113 Missense_Mutation G G A test 0 0 0 p.G388D Signed out No High +TEK 0 mskcc.org GRCh37 9 27173277 27173277 Missense_Mutation G G A test 0 0 0 p.G273E Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295229 5'Flank GG GG AA test 594 539 55 0.0926 Signed out No Low +TP53 0 mskcc.org GRCh37 17 7577539 7577539 Missense_Mutation G G A test 691 494 197 0.2851 p.R248W Hotspot Signed out Yes High +ARID1B 0 mskcc.org GRCh37 6 157528952 157528952 Missense_Mutation C C T test 0 0 0 p.S2226L Signed out No High +ARID2 0 mskcc.org GRCh37 12 46245843 46245843 Nonsense_Mutation C C T test 0 0 0 p.Q1313* Signed out No High +AXL 0 
mskcc.org GRCh37 19 41745117 41745117 Missense_Mutation G G A test 0 0 0 p.G395R Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453137 Missense_Mutation AC AC TT test 647 542 105 0.1623 p.V600K Hotspot Signed out No High +BRCA2 0 mskcc.org GRCh37 13 32911493 32911493 Missense_Mutation T T C test 649 544 105 0.1618 p.S1001P Signed out No High +BTK 0 mskcc.org GRCh37 X 100613674 100613674 Missense_Mutation C C T test 0 0 0 p.G302E Signed out No High +CARD11 0 mskcc.org GRCh37 7 2952942 2952942 Missense_Mutation C C T test 0 0 0 p.E1000K Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31384996 31384996 Missense_Mutation C C T test 0 0 0 p.R461C Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93965572 93965572 Missense_Mutation C C T test 0 0 0 p.D786N Signed out No High +ERCC2 0 mskcc.org GRCh37 19 45868119 45868119 Missense_Mutation G G A test 33 22 11 0.3333 p.P191S Signed out No High 9.42E-06 +ESR1 0 mskcc.org GRCh37 6 152129330 152129330 Missense_Mutation G G A test 1 1 0 0 p.G95R Signed out No High +FAT1 0 mskcc.org GRCh37 4 187629805 187629805 Missense_Mutation G G A test 0 0 0 p.P393S Signed out No High +HGF 0 mskcc.org GRCh37 7 81346589 81346589 Missense_Mutation C C T test 0 0 0 p.G455E Signed out No High +HIST3H3 0 mskcc.org GRCh37 1 228612807 228612807 Missense_Mutation C C T test 11 11 0 0 p.E74K Signed out No High +HLA-A 0 mskcc.org GRCh37 6 29911302 29911302 Missense_Mutation G G A test 0 0 0 p.E201K Signed out No High +IKZF1 0 mskcc.org GRCh37 7 50367234 50367234 Missense_Mutation G G A test 0 0 0 p.G14E Signed out No High +IL7R 0 mskcc.org GRCh37 5 35876230 35876230 Missense_Mutation G G A test 1 1 0 0 p.G341E Signed out No High +KLF4 0 mskcc.org GRCh37 9 110249881 110249881 Missense_Mutation G G A test 1 1 0 0 p.P265L Signed out No High +KMT2C 0 mskcc.org GRCh37 7 151845361 151845362 Missense_Mutation CC CC TT test 0 0 0 p.G4551S Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Frame_Shift_Del G G - test 0 0 0 p.E110Kfs*15 
Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Missense_Mutation G G A test 0 0 0 p.E110K Signed out No High +MAP2K1 0 mskcc.org GRCh37 15 66729162 66729163 Missense_Mutation CC CC TT test 851 732 119 0.1398 p.P124L Hotspot Signed out No High +MET 0 mskcc.org GRCh37 7 116415001 116415001 Missense_Mutation C C T test 564 479 85 0.1507 p.S1032F Signed out No High +MGA 0 mskcc.org GRCh37 15 41961969 41961969 Missense_Mutation C C T test 0 0 0 p.R293C Signed out No High +MITF 0 mskcc.org GRCh37 3 69788756 69788757 Missense_Mutation CC CC TT test 0 0 0 p.S3F Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32183063 32183063 Missense_Mutation C C T test 0 0 0 p.G654E Signed out No High +NTRK2 0 mskcc.org GRCh37 9 87285689 87285689 Missense_Mutation G G A test 0 0 0 p.G9E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9546694 9546694 Missense_Mutation C C T test 0 0 0 p.G443E Signed out No High +PIK3CG 0 mskcc.org GRCh37 7 106509612 106509612 Missense_Mutation C C T test 0 0 0 p.P536S Signed out No High +PPARG 0 mskcc.org GRCh37 3 12393114 12393114 Missense_Mutation C C T test 0 0 0 p.S8F Signed out No High +PRDM14 0 mskcc.org GRCh37 8 70970993 70970993 Missense_Mutation G G A test 0 0 0 p.P423L Signed out No High +PTEN 0 mskcc.org GRCh37 10 89717705 89717705 Missense_Mutation C C T test 441 389 52 0.1179 p.P244S Signed out No High +PTPRD 0 mskcc.org GRCh37 9 8484270 8484270 Missense_Mutation G G A test 0 0 0 p.R1088C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40980844 40980844 Missense_Mutation C C T test 0 0 0 p.E548K Hotspot Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41419877 41419878 Missense_Mutation CT CT TA test 0 0 0 p.K148I Signed out No High +ROS1 0 mskcc.org GRCh37 6 117710750 117710750 Missense_Mutation C C T test 0 0 0 p.D508N Signed out No High +RTEL1 0 mskcc.org GRCh37 20 62293236 62293236 Missense_Mutation C C A test 0 0 0 p.A112D Signed out No High +SYK 0 mskcc.org GRCh37 9 93624541 93624541 Missense_Mutation G G A test 0 0 0 
p.G211E Signed out No High +SYK 0 mskcc.org GRCh37 9 93637113 93637113 Missense_Mutation G G A test 0 0 0 p.G388D Signed out No High +TEK 0 mskcc.org GRCh37 9 27173277 27173277 Missense_Mutation G G A test 0 0 0 p.G273E Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295229 5'Flank GG GG AA test 517 468 49 0.0948 Signed out No Low +TP53 0 mskcc.org GRCh37 17 7577539 7577539 Missense_Mutation G G A test 574 484 90 0.1568 p.R248W Hotspot Signed out Yes High +ARID1B 0 mskcc.org GRCh37 6 157528952 157528952 Missense_Mutation C C T test 0 0 0 p.S2226L Signed out No High +ARID2 0 mskcc.org GRCh37 12 46245843 46245843 Nonsense_Mutation C C T test 0 0 0 p.Q1313* Signed out No High +AXL 0 mskcc.org GRCh37 19 41745117 41745117 Missense_Mutation G G A test 0 0 0 p.G395R Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453137 Missense_Mutation AC AC TT test 2557 2544 13 0.0051 p.V600K Hotspot Signed out No High +BRCA2 0 mskcc.org GRCh37 13 32911493 32911493 Missense_Mutation T T C test 3170 3148 22 0.0069 p.S1001P Signed out No High +BTK 0 mskcc.org GRCh37 X 100613674 100613674 Missense_Mutation C C T test 0 0 0 p.G302E Signed out No High +CARD11 0 mskcc.org GRCh37 7 2952942 2952942 Missense_Mutation C C T test 0 0 0 p.E1000K Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31384996 31384996 Missense_Mutation C C T test 0 0 0 p.R461C Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93965572 93965572 Missense_Mutation C C T test 0 0 0 p.D786N Signed out No High +ERCC2 0 mskcc.org GRCh37 19 45868119 45868119 Missense_Mutation G G A test 58 58 0 0 p.P191S Signed out No High 9.42E-06 +ESR1 0 mskcc.org GRCh37 6 152129330 152129330 Missense_Mutation G G A test 1 1 0 0 p.G95R Signed out No High +FAT1 0 mskcc.org GRCh37 4 187629805 187629805 Missense_Mutation G G A test 0 0 0 p.P393S Signed out No High +HGF 0 mskcc.org GRCh37 7 81346589 81346589 Missense_Mutation C C T test 0 0 0 p.G455E Signed out No High +HIST3H3 0 mskcc.org GRCh37 1 228612807 228612807 
Missense_Mutation C C T test 17 17 0 0 p.E74K Signed out No High +HLA-A 0 mskcc.org GRCh37 6 29911302 29911302 Missense_Mutation G G A test 0 0 0 p.E201K Signed out No High +IKZF1 0 mskcc.org GRCh37 7 50367234 50367234 Missense_Mutation G G A test 0 0 0 p.G14E Signed out No High +IL7R 0 mskcc.org GRCh37 5 35876230 35876230 Missense_Mutation G G A test 0 0 0 p.G341E Signed out No High +KLF4 0 mskcc.org GRCh37 9 110249881 110249881 Missense_Mutation G G A test 0 0 0 p.P265L Signed out No High +KMT2C 0 mskcc.org GRCh37 7 151845361 151845362 Missense_Mutation CC CC TT test 0 0 0 p.G4551S Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Frame_Shift_Del G G - test 0 0 0 p.E110Kfs*15 Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Missense_Mutation G G A test 0 0 0 p.E110K Signed out No High +MAP2K1 0 mskcc.org GRCh37 15 66729162 66729163 Missense_Mutation CC CC TT test 3871 3850 21 0.0054 p.P124L Hotspot Signed out No High +MET 0 mskcc.org GRCh37 7 116415001 116415001 Missense_Mutation C C T test 1521 1513 8 0.0053 p.S1032F Signed out No High +MGA 0 mskcc.org GRCh37 15 41961969 41961969 Missense_Mutation C C T test 0 0 0 p.R293C Signed out No High +MITF 0 mskcc.org GRCh37 3 69788756 69788757 Missense_Mutation CC CC TT test 0 0 0 p.S3F Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32183063 32183063 Missense_Mutation C C T test 0 0 0 p.G654E Signed out No High +NTRK2 0 mskcc.org GRCh37 9 87285689 87285689 Missense_Mutation G G A test 0 0 0 p.G9E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9546694 9546694 Missense_Mutation C C T test 0 0 0 p.G443E Signed out No High +PIK3CG 0 mskcc.org GRCh37 7 106509612 106509612 Missense_Mutation C C T test 0 0 0 p.P536S Signed out No High +PPARG 0 mskcc.org GRCh37 3 12393114 12393114 Missense_Mutation C C T test 0 0 0 p.S8F Signed out No High +PRDM14 0 mskcc.org GRCh37 8 70970993 70970993 Missense_Mutation G G A test 0 0 0 p.P423L Signed out No High +PTEN 0 mskcc.org GRCh37 10 89717705 89717705 
Missense_Mutation C C T test 2270 2265 5 0.0022 p.P244S Signed out No High +PTPRD 0 mskcc.org GRCh37 9 8484270 8484270 Missense_Mutation G G A test 0 0 0 p.R1088C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40980844 40980844 Missense_Mutation C C T test 0 0 0 p.E548K Hotspot Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41419877 41419878 Missense_Mutation CT CT TA test 0 0 0 p.K148I Signed out No High +ROS1 0 mskcc.org GRCh37 6 117710750 117710750 Missense_Mutation C C T test 0 0 0 p.D508N Signed out No High +RTEL1 0 mskcc.org GRCh37 20 62293236 62293236 Missense_Mutation C C A test 0 0 0 p.A112D Signed out No High +SYK 0 mskcc.org GRCh37 9 93624541 93624541 Missense_Mutation G G A test 0 0 0 p.G211E Signed out No High +SYK 0 mskcc.org GRCh37 9 93637113 93637113 Missense_Mutation G G A test 0 0 0 p.G388D Signed out No High +TEK 0 mskcc.org GRCh37 9 27173277 27173277 Missense_Mutation G G A test 0 0 0 p.G273E Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295229 5'Flank GG GG AA test 2258 2254 4 0.0018 Signed out No Low +TP53 0 mskcc.org GRCh37 17 7577539 7577539 Missense_Mutation G G A test 2211 2203 8 0.0036 p.R248W Hotspot Signed out Yes High +ARID1B 0 mskcc.org GRCh37 6 157528952 157528952 Missense_Mutation C C T test 0 0 0 p.S2226L Signed out No High +ARID2 0 mskcc.org GRCh37 12 46245843 46245843 Nonsense_Mutation C C T test 0 0 0 p.Q1313* Signed out No High +AXL 0 mskcc.org GRCh37 19 41745117 41745117 Missense_Mutation G G A test 1 1 0 0 p.G395R Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453137 Missense_Mutation AC AC TT test 2582 2581 1 0.0004 p.V600K Hotspot Signed out No High +BRCA2 0 mskcc.org GRCh37 13 32911493 32911493 Missense_Mutation T T C test 3018 3018 0 0 p.S1001P Signed out No High +BTK 0 mskcc.org GRCh37 X 100613674 100613674 Missense_Mutation C C T test 0 0 0 p.G302E Signed out No High +CARD11 0 mskcc.org GRCh37 7 2952942 2952942 Missense_Mutation C C T test 0 0 0 p.E1000K Signed out No High +DNMT3B 0 mskcc.org 
GRCh37 20 31384996 31384996 Missense_Mutation C C T test 0 0 0 p.R461C Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93965572 93965572 Missense_Mutation C C T test 0 0 0 p.D786N Signed out No High +ERCC2 0 mskcc.org GRCh37 19 45868119 45868119 Missense_Mutation G G A test 63 63 0 0 p.P191S Signed out No High 9.42E-06 +ESR1 0 mskcc.org GRCh37 6 152129330 152129330 Missense_Mutation G G A test 0 0 0 p.G95R Signed out No High +FAT1 0 mskcc.org GRCh37 4 187629805 187629805 Missense_Mutation G G A test 0 0 0 p.P393S Signed out No High +HGF 0 mskcc.org GRCh37 7 81346589 81346589 Missense_Mutation C C T test 0 0 0 p.G455E Signed out No High +HIST3H3 0 mskcc.org GRCh37 1 228612807 228612807 Missense_Mutation C C T test 28 28 0 0 p.E74K Signed out No High +HLA-A 0 mskcc.org GRCh37 6 29911302 29911302 Missense_Mutation G G A test 0 0 0 p.E201K Signed out No High +IKZF1 0 mskcc.org GRCh37 7 50367234 50367234 Missense_Mutation G G A test 0 0 0 p.G14E Signed out No High +IL7R 0 mskcc.org GRCh37 5 35876230 35876230 Missense_Mutation G G A test 0 0 0 p.G341E Signed out No High +KLF4 0 mskcc.org GRCh37 9 110249881 110249881 Missense_Mutation G G A test 0 0 0 p.P265L Signed out No High +KMT2C 0 mskcc.org GRCh37 7 151845361 151845362 Missense_Mutation CC CC TT test 0 0 0 p.G4551S Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Frame_Shift_Del G G - test 0 0 0 p.E110Kfs*15 Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Missense_Mutation G G A test 0 0 0 p.E110K Signed out No High +MAP2K1 0 mskcc.org GRCh37 15 66729162 66729163 Missense_Mutation CC CC TT test 3838 3838 0 0 p.P124L Hotspot Signed out No High +MET 0 mskcc.org GRCh37 7 116415001 116415001 Missense_Mutation C C T test 1790 1790 0 0 p.S1032F Signed out No High +MGA 0 mskcc.org GRCh37 15 41961969 41961969 Missense_Mutation C C T test 0 0 0 p.R293C Signed out No High +MITF 0 mskcc.org GRCh37 3 69788756 69788757 Missense_Mutation CC CC TT test 0 0 0 p.S3F Signed out No High +NOTCH4 0 mskcc.org 
GRCh37 6 32183063 32183063 Missense_Mutation C C T test 0 0 0 p.G654E Signed out No High +NTRK2 0 mskcc.org GRCh37 9 87285689 87285689 Missense_Mutation G G A test 0 0 0 p.G9E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9546694 9546694 Missense_Mutation C C T test 0 0 0 p.G443E Signed out No High +PIK3CG 0 mskcc.org GRCh37 7 106509612 106509612 Missense_Mutation C C T test 0 0 0 p.P536S Signed out No High +PPARG 0 mskcc.org GRCh37 3 12393114 12393114 Missense_Mutation C C T test 0 0 0 p.S8F Signed out No High +PRDM14 0 mskcc.org GRCh37 8 70970993 70970993 Missense_Mutation G G A test 0 0 0 p.P423L Signed out No High +PTEN 0 mskcc.org GRCh37 10 89717705 89717705 Missense_Mutation C C T test 2233 2233 0 0 p.P244S Signed out No High +PTPRD 0 mskcc.org GRCh37 9 8484270 8484270 Missense_Mutation G G A test 0 0 0 p.R1088C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40980844 40980844 Missense_Mutation C C T test 0 0 0 p.E548K Hotspot Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41419877 41419878 Missense_Mutation CT CT TA test 0 0 0 p.K148I Signed out No High +ROS1 0 mskcc.org GRCh37 6 117710750 117710750 Missense_Mutation C C T test 0 0 0 p.D508N Signed out No High +RTEL1 0 mskcc.org GRCh37 20 62293236 62293236 Missense_Mutation C C A test 0 0 0 p.A112D Signed out No High +SYK 0 mskcc.org GRCh37 9 93624541 93624541 Missense_Mutation G G A test 0 0 0 p.G211E Signed out No High +SYK 0 mskcc.org GRCh37 9 93637113 93637113 Missense_Mutation G G A test 0 0 0 p.G388D Signed out No High +TEK 0 mskcc.org GRCh37 9 27173277 27173277 Missense_Mutation G G A test 0 0 0 p.G273E Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295229 5'Flank GG GG AA test 2365 2365 0 0 Signed out No Low +TP53 0 mskcc.org GRCh37 17 7577539 7577539 Missense_Mutation G G A test 2107 2106 1 0.0005 p.R248W Hotspot Signed out Yes High +ARID1B 0 mskcc.org GRCh37 6 157528952 157528952 Missense_Mutation C C T test 0 0 0 p.S2226L Signed out No High +ARID2 0 mskcc.org GRCh37 12 46245843 46245843 
Nonsense_Mutation C C T test 0 0 0 p.Q1313* Signed out No High +AXL 0 mskcc.org GRCh37 19 41745117 41745117 Missense_Mutation G G A test 0 0 0 p.G395R Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453137 Missense_Mutation AC AC TT test 458 458 0 0 p.V600K Hotspot Signed out No High +BRCA2 0 mskcc.org GRCh37 13 32911493 32911493 Missense_Mutation T T C test 547 547 0 0 p.S1001P Signed out No High +BTK 0 mskcc.org GRCh37 X 100613674 100613674 Missense_Mutation C C T test 0 0 0 p.G302E Signed out No High +CARD11 0 mskcc.org GRCh37 7 2952942 2952942 Missense_Mutation C C T test 0 0 0 p.E1000K Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31384996 31384996 Missense_Mutation C C T test 0 0 0 p.R461C Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93965572 93965572 Missense_Mutation C C T test 0 0 0 p.D786N Signed out No High +ERCC2 0 mskcc.org GRCh37 19 45868119 45868119 Missense_Mutation G G A test 35 35 0 0 p.P191S Signed out No High 9.42E-06 +ESR1 0 mskcc.org GRCh37 6 152129330 152129330 Missense_Mutation G G A test 0 0 0 p.G95R Signed out No High +FAT1 0 mskcc.org GRCh37 4 187629805 187629805 Missense_Mutation G G A test 0 0 0 p.P393S Signed out No High +HGF 0 mskcc.org GRCh37 7 81346589 81346589 Missense_Mutation C C T test 0 0 0 p.G455E Signed out No High +HIST3H3 0 mskcc.org GRCh37 1 228612807 228612807 Missense_Mutation C C T test 5 5 0 0 p.E74K Signed out No High +HLA-A 0 mskcc.org GRCh37 6 29911302 29911302 Missense_Mutation G G A test 0 0 0 p.E201K Signed out No High +IKZF1 0 mskcc.org GRCh37 7 50367234 50367234 Missense_Mutation G G A test 0 0 0 p.G14E Signed out No High +IL7R 0 mskcc.org GRCh37 5 35876230 35876230 Missense_Mutation G G A test 0 0 0 p.G341E Signed out No High +KLF4 0 mskcc.org GRCh37 9 110249881 110249881 Missense_Mutation G G A test 0 0 0 p.P265L Signed out No High +KMT2C 0 mskcc.org GRCh37 7 151845361 151845362 Missense_Mutation CC CC TT test 0 0 0 p.G4551S Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 
Frame_Shift_Del G G - test 0 0 0 p.E110Kfs*15 Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Missense_Mutation G G A test 0 0 0 p.E110K Signed out No High +MAP2K1 0 mskcc.org GRCh37 15 66729162 66729163 Missense_Mutation CC CC TT test 815 815 0 0 p.P124L Hotspot Signed out No High +MET 0 mskcc.org GRCh37 7 116415001 116415001 Missense_Mutation C C T test 518 518 0 0 p.S1032F Signed out No High +MGA 0 mskcc.org GRCh37 15 41961969 41961969 Missense_Mutation C C T test 0 0 0 p.R293C Signed out No High +MITF 0 mskcc.org GRCh37 3 69788756 69788757 Missense_Mutation CC CC TT test 0 0 0 p.S3F Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32183063 32183063 Missense_Mutation C C T test 0 0 0 p.G654E Signed out No High +NTRK2 0 mskcc.org GRCh37 9 87285689 87285689 Missense_Mutation G G A test 0 0 0 p.G9E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9546694 9546694 Missense_Mutation C C T test 0 0 0 p.G443E Signed out No High +PIK3CG 0 mskcc.org GRCh37 7 106509612 106509612 Missense_Mutation C C T test 0 0 0 p.P536S Signed out No High +PPARG 0 mskcc.org GRCh37 3 12393114 12393114 Missense_Mutation C C T test 0 0 0 p.S8F Signed out No High +PRDM14 0 mskcc.org GRCh37 8 70970993 70970993 Missense_Mutation G G A test 0 0 0 p.P423L Signed out No High +PTEN 0 mskcc.org GRCh37 10 89717705 89717705 Missense_Mutation C C T test 497 497 0 0 p.P244S Signed out No High +PTPRD 0 mskcc.org GRCh37 9 8484270 8484270 Missense_Mutation G G A test 0 0 0 p.R1088C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40980844 40980844 Missense_Mutation C C T test 0 0 0 p.E548K Hotspot Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41419877 41419878 Missense_Mutation CT CT TA test 0 0 0 p.K148I Signed out No High +ROS1 0 mskcc.org GRCh37 6 117710750 117710750 Missense_Mutation C C T test 0 0 0 p.D508N Signed out No High +RTEL1 0 mskcc.org GRCh37 20 62293236 62293236 Missense_Mutation C C A test 0 0 0 p.A112D Signed out No High +SYK 0 mskcc.org GRCh37 9 93624541 93624541 
Missense_Mutation G G A test 0 0 0 p.G211E Signed out No High +SYK 0 mskcc.org GRCh37 9 93637113 93637113 Missense_Mutation G G A test 0 0 0 p.G388D Signed out No High +TEK 0 mskcc.org GRCh37 9 27173277 27173277 Missense_Mutation G G A test 0 0 0 p.G273E Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295229 5'Flank GG GG AA test 330 330 0 0 Signed out No Low +TP53 0 mskcc.org GRCh37 17 7577539 7577539 Missense_Mutation G G A test 466 466 0 0 p.R248W Hotspot Signed out Yes High +ARID1B 0 mskcc.org GRCh37 6 157528952 157528952 Missense_Mutation C C T test 1007 538 469 0.4657 p.S2226L Signed out No High +ARID2 0 mskcc.org GRCh37 12 46245843 46245843 Nonsense_Mutation C C T test 537 45 492 0.9162 p.Q1313* Signed out No High +AXL 0 mskcc.org GRCh37 19 41745117 41745117 Missense_Mutation G G A test 1183 647 536 0.4531 p.G395R Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453137 Missense_Mutation AC AC TT test 1217 636 581 0.4774 p.V600K Hotspot Signed out No High +BRCA2 0 mskcc.org GRCh37 13 32911493 32911493 Missense_Mutation T T C test 1414 836 578 0.4088 p.S1001P Signed out No High +BTK 0 mskcc.org GRCh37 X 100613674 100613674 Missense_Mutation C C T test 471 40 431 0.9151 p.G302E Signed out No High +CARD11 0 mskcc.org GRCh37 7 2952942 2952942 Missense_Mutation C C T test 803 422 381 0.4745 p.E1000K Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31384996 31384996 Missense_Mutation C C T test 934 500 434 0.4647 p.R461C Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93965572 93965572 Missense_Mutation C C T test 964 516 448 0.4647 p.D786N Signed out No High +ERCC2 0 mskcc.org GRCh37 19 45868119 45868119 Missense_Mutation G G A test 1598 813 785 0.4912 p.P191S Signed out No High 9.42E-06 +ESR1 0 mskcc.org GRCh37 6 152129330 152129330 Missense_Mutation G G A test 699 415 284 0.4063 p.G95R Signed out No High +FAT1 0 mskcc.org GRCh37 4 187629805 187629805 Missense_Mutation G G A test 1031 65 966 0.937 p.P393S Signed out No High +HGF 0 mskcc.org GRCh37 
7 81346589 81346589 Missense_Mutation C C T test 1046 572 474 0.4532 p.G455E Signed out No High +HIST3H3 0 mskcc.org GRCh37 1 228612807 228612807 Missense_Mutation C C T test 1156 599 557 0.4818 p.E74K Signed out No High +HLA-A 0 mskcc.org GRCh37 6 29911302 29911302 Missense_Mutation G G A test 712 587 125 0.1756 p.E201K Signed out No High +IKZF1 0 mskcc.org GRCh37 7 50367234 50367234 Missense_Mutation G G A test 447 272 175 0.3915 p.G14E Signed out No High +IL7R 0 mskcc.org GRCh37 5 35876230 35876230 Missense_Mutation G G A test 811 446 365 0.4501 p.G341E Signed out No High +KLF4 0 mskcc.org GRCh37 9 110249881 110249881 Missense_Mutation G G A test 858 422 436 0.5082 p.P265L Signed out No High +KMT2C 0 mskcc.org GRCh37 7 151845361 151845362 Missense_Mutation CC CC TT test 1216 745 471 0.3873 p.G4551S Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Frame_Shift_Del G G - test 590 556 34 0.0576 p.E110Kfs*15 Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Missense_Mutation G G A test 590 96 494 0.8373 p.E110K Signed out No High +MAP2K1 0 mskcc.org GRCh37 15 66729162 66729163 Missense_Mutation CC CC TT test 1581 927 654 0.4137 p.P124L Hotspot Signed out No High +MET 0 mskcc.org GRCh37 7 116415001 116415001 Missense_Mutation C C T test 1904 1036 868 0.4559 p.S1032F Signed out No High +MGA 0 mskcc.org GRCh37 15 41961969 41961969 Missense_Mutation C C T test 1442 782 660 0.4577 p.R293C Signed out No High +MITF 0 mskcc.org GRCh37 3 69788756 69788757 Missense_Mutation CC CC TT test 890 469 421 0.473 p.S3F Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32183063 32183063 Missense_Mutation C C T test 1512 768 744 0.4921 p.G654E Signed out No High +NTRK2 0 mskcc.org GRCh37 9 87285689 87285689 Missense_Mutation G G A test 866 481 385 0.4446 p.G9E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9546694 9546694 Missense_Mutation C C T test 481 236 245 0.5094 p.G443E Signed out No High +PIK3CG 0 mskcc.org GRCh37 7 106509612 106509612 Missense_Mutation 
C C T test 1103 553 550 0.4986 p.P536S Signed out No High +PPARG 0 mskcc.org GRCh37 3 12393114 12393114 Missense_Mutation C C T test 1319 741 578 0.4382 p.S8F Signed out No High +PRDM14 0 mskcc.org GRCh37 8 70970993 70970993 Missense_Mutation G G A test 829 110 719 0.8673 p.P423L Signed out No High +PTEN 0 mskcc.org GRCh37 10 89717705 89717705 Missense_Mutation C C T test 1345 721 624 0.4639 p.P244S Signed out No High +PTPRD 0 mskcc.org GRCh37 9 8484270 8484270 Missense_Mutation G G A test 1128 515 613 0.5434 p.R1088C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40980844 40980844 Missense_Mutation C C T test 846 463 383 0.4527 p.E548K Hotspot Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41419877 41419878 Missense_Mutation CT CT TA test 687 403 284 0.4134 p.K148I Signed out No High +ROS1 0 mskcc.org GRCh37 6 117710750 117710750 Missense_Mutation C C T test 707 382 325 0.4597 p.D508N Signed out No High +RTEL1 0 mskcc.org GRCh37 20 62293236 62293236 Missense_Mutation C C A test 1552 832 720 0.4639 p.A112D Signed out No High +SYK 0 mskcc.org GRCh37 9 93624541 93624541 Missense_Mutation G G A test 685 369 316 0.4613 p.G211E Signed out No High +SYK 0 mskcc.org GRCh37 9 93637113 93637113 Missense_Mutation G G A test 836 434 402 0.4809 p.G388D Signed out No High +TEK 0 mskcc.org GRCh37 9 27173277 27173277 Missense_Mutation G G A test 1692 899 793 0.4687 p.G273E Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295229 5'Flank GG GG AA test 277 126 151 0.5451 Signed out No Low +TP53 0 mskcc.org GRCh37 17 7577539 7577539 Missense_Mutation G G A test 737 89 648 0.8792 p.R248W Hotspot Signed out Yes High +AGO2 0 mskcc.org GRCh37 8 141554401 141554401 Missense_Mutation G G A test 0 0 0 p.P584S Signed out No High +ARID1B 0 mskcc.org GRCh37 6 157528705 157528705 Missense_Mutation C C G test 0 0 0 p.L2144V Signed out No High +DIS3 0 mskcc.org GRCh37 13 73350187 73350187 Missense_Mutation G G A test 0 0 0 p.P233L Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93979282 
93979282 Missense_Mutation G G A test 0 0 0 p.R516W Signed out No High +GNAS 0 mskcc.org GRCh37 20 57485864 57485864 Missense_Mutation C C T test 0 0 0 p.R389C Signed out No High +INPPL1 0 mskcc.org GRCh37 11 71943744 71943744 Missense_Mutation C C T test 0 0 0 p.S596F Signed out No High +PTPRS 0 mskcc.org GRCh37 19 5231320 5231321 In_Frame_Ins - - CATCCTCGT test 0 0 0 p.D716_D718dup Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40727065 40727065 Missense_Mutation G G T test 0 0 0 p.A1300D Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 1552 1552 0 0 Signed out No Low +AGO2 0 mskcc.org GRCh37 8 141554401 141554401 Missense_Mutation G G A test 0 0 0 p.P584S Signed out No High +ARID1B 0 mskcc.org GRCh37 6 157528705 157528705 Missense_Mutation C C G test 0 0 0 p.L2144V Signed out No High +DIS3 0 mskcc.org GRCh37 13 73350187 73350187 Missense_Mutation G G A test 0 0 0 p.P233L Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93979282 93979282 Missense_Mutation G G A test 1 1 0 0 p.R516W Signed out No High +GNAS 0 mskcc.org GRCh37 20 57485864 57485864 Missense_Mutation C C T test 0 0 0 p.R389C Signed out No High +INPPL1 0 mskcc.org GRCh37 11 71943744 71943744 Missense_Mutation C C T test 0 0 0 p.S596F Signed out No High +PTPRS 0 mskcc.org GRCh37 19 5231320 5231321 In_Frame_Ins - - CATCCTCGT test 0 0 0 p.D716_D718dup Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40727065 40727065 Missense_Mutation G G T test 0 0 0 p.A1300D Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 1503 1503 0 0 Signed out No Low +AGO2 0 mskcc.org GRCh37 8 141554401 141554401 Missense_Mutation G G A test 0 0 0 p.P584S Signed out No High +ARID1B 0 mskcc.org GRCh37 6 157528705 157528705 Missense_Mutation C C G test 0 0 0 p.L2144V Signed out No High +DIS3 0 mskcc.org GRCh37 13 73350187 73350187 Missense_Mutation G G A test 0 0 0 p.P233L Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93979282 93979282 Missense_Mutation G G A test 0 0 0 
p.R516W Signed out No High +GNAS 0 mskcc.org GRCh37 20 57485864 57485864 Missense_Mutation C C T test 0 0 0 p.R389C Signed out No High +INPPL1 0 mskcc.org GRCh37 11 71943744 71943744 Missense_Mutation C C T test 0 0 0 p.S596F Signed out No High +PTPRS 0 mskcc.org GRCh37 19 5231320 5231321 In_Frame_Ins - - CATCCTCGT test 0 0 0 p.D716_D718dup Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40727065 40727065 Missense_Mutation G G T test 0 0 0 p.A1300D Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 1360 1360 0 0 Signed out No Low +AGO2 0 mskcc.org GRCh37 8 141554401 141554401 Missense_Mutation G G A test 0 0 0 p.P584S Signed out No High +ARID1B 0 mskcc.org GRCh37 6 157528705 157528705 Missense_Mutation C C G test 0 0 0 p.L2144V Signed out No High +DIS3 0 mskcc.org GRCh37 13 73350187 73350187 Missense_Mutation G G A test 0 0 0 p.P233L Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93979282 93979282 Missense_Mutation G G A test 0 0 0 p.R516W Signed out No High +GNAS 0 mskcc.org GRCh37 20 57485864 57485864 Missense_Mutation C C T test 0 0 0 p.R389C Signed out No High +INPPL1 0 mskcc.org GRCh37 11 71943744 71943744 Missense_Mutation C C T test 0 0 0 p.S596F Signed out No High +PTPRS 0 mskcc.org GRCh37 19 5231320 5231321 In_Frame_Ins - - CATCCTCGT test 0 0 0 p.D716_D718dup Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40727065 40727065 Missense_Mutation G G T test 0 0 0 p.A1300D Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 1649 1649 0 0 Signed out No Low +AGO2 0 mskcc.org GRCh37 8 141554401 141554401 Missense_Mutation G G A test 0 0 0 p.P584S Signed out No High +ARID1B 0 mskcc.org GRCh37 6 157528705 157528705 Missense_Mutation C C G test 0 0 0 p.L2144V Signed out No High +DIS3 0 mskcc.org GRCh37 13 73350187 73350187 Missense_Mutation G G A test 0 0 0 p.P233L Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93979282 93979282 Missense_Mutation G G A test 0 0 0 p.R516W Signed out No High +GNAS 0 mskcc.org 
GRCh37 20 57485864 57485864 Missense_Mutation C C T test 0 0 0 p.R389C Signed out No High +INPPL1 0 mskcc.org GRCh37 11 71943744 71943744 Missense_Mutation C C T test 0 0 0 p.S596F Signed out No High +PTPRS 0 mskcc.org GRCh37 19 5231320 5231321 In_Frame_Ins - - CATCCTCGT test 0 0 0 p.D716_D718dup Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40727065 40727065 Missense_Mutation G G T test 0 0 0 p.A1300D Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 202 202 0 0 Signed out No Low +AGO2 0 mskcc.org GRCh37 8 141554401 141554401 Missense_Mutation G G A test 1339 1014 325 0.2427 p.P584S Signed out No High +ARID1B 0 mskcc.org GRCh37 6 157528705 157528705 Missense_Mutation C C G test 589 472 117 0.1986 p.L2144V Signed out No High +DIS3 0 mskcc.org GRCh37 13 73350187 73350187 Missense_Mutation G G A test 500 416 84 0.168 p.P233L Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93979282 93979282 Missense_Mutation G G A test 496 285 211 0.4254 p.R516W Signed out No High +GNAS 0 mskcc.org GRCh37 20 57485864 57485864 Missense_Mutation C C T test 881 522 359 0.4075 p.R389C Signed out No High +INPPL1 0 mskcc.org GRCh37 11 71943744 71943744 Missense_Mutation C C T test 2712 2281 431 0.1589 p.S596F Signed out No High +PTPRS 0 mskcc.org GRCh37 19 5231320 5231321 In_Frame_Ins - - CATCCTCGT test 554 448 106 0.1913 p.D716_D718dup Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40727065 40727065 Missense_Mutation G G T test 703 598 105 0.1494 p.A1300D Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 439 338 101 0.2301 Signed out No Low \ No newline at end of file diff --git a/python/convert_csv_to_maf/example_output.xlsx b/python/convert_csv_to_maf/example_output.xlsx new file mode 100644 index 0000000..3e88d89 Binary files /dev/null and b/python/convert_csv_to_maf/example_output.xlsx differ diff --git a/python/convert_csv_to_maf/fof.txt b/python/convert_csv_to_maf/fof.txt new file mode 100644 index 0000000..d90329c --- 
/dev/null +++ b/python/convert_csv_to_maf/fof.txt @@ -0,0 +1,19 @@ +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-2AVE7W_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-2CJKAC_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-4PX38M_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-5DUJR8_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-5KCFV3_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-5PLA6N_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-70H905_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-84KMCA_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-8W2E8L_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-AME7C6_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-DDK2LJ_SNV_table.csv 
+/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-DFJ7RT_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-KPNF34_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-PXVUM9_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-R9MPAU_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-VUEN2P_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-WJPT69_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-XFV0RE_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-Y5K7R2_SNV_table.csv diff --git a/python/convert_csv_to_maf/requirements.txt b/python/convert_csv_to_maf/requirements.txt new file mode 100644 index 0000000..849b3a2 --- /dev/null +++ b/python/convert_csv_to_maf/requirements.txt @@ -0,0 +1,4 @@ +typer==0.3.2 +openpyxl==3.0.9 +typing_extensions==3.10.0.0 +pandas==1.2.5 diff --git a/python/convert_dates_to_days/convert_dates_to_days.py b/python/convert_dates_to_days/convert_dates_to_days.py index 25d84fd..c498beb 100644 --- a/python/convert_dates_to_days/convert_dates_to_days.py +++ b/python/convert_dates_to_days/convert_dates_to_days.py @@ -5,17 +5,26 @@ import arrow from datetime import datetime + def validate_date(date_string): - 
date_format = ['MM/DD/YY','M/D/YY','MM/D/YY','M/DD/YY','MM/DD/YYYY','YYYY/MM/DD'] + date_format = [ + "MM/DD/YY", + "M/D/YY", + "MM/D/YY", + "M/DD/YY", + "MM/DD/YYYY", + "YYYY/MM/DD", + "YYYY-MM-DD", + ] for fmt in date_format: try: - date_obj = arrow.get(date_string, fmt).date() - return date_obj + return arrow.get(date_string, fmt).date() except ValueError: pass except: print("Something else went wrong") - raise ValueError('no valid date format found') + raise ValueError("no valid date format found") + def main( input: Path = typer.Option( @@ -34,76 +43,116 @@ def main( "C1D1", "--timepoint1", "-t1", - help="Column name which has timpoint information to use the baseline date, first preference", - ), + help="timepoint name which in the timepoint column to use for the baseline date, first preference", + ), timepoint_label_for_baseline_second: str = typer.Option( "", "--timepoint2", "-t2", - help="Column name which has timpoint information to use the baseline date, second preference", - ), - + help="timepoint name which in the timepoint column to use for the baseline date, second preference", + ), + timepoint_label_for_baseline_third: str = typer.Option( + "", + "--timepoint3", + "-t3", + help="timepoint name which in the timepoint column to use for the baseline date, third preference", + ), output_file: str = typer.Option( - "output.txt", - "--output", + "output.txt", + "--output", "-o", help="Name of the output file", - ), ): - - ''' + + """ Tool to do the following operations: A. Reads meta data file, and based on the timepoint information given convert them to days for a samples belonging to a given patient_id B. 
Supports following date formats: 'MM/DD/YY','M/D/YY','MM/D/YY','M/DD/YY','MM/DD/YYYY','YYYY/MM/DD' - + Requirement: pandas; typer; arrow - ''' + """ - #Read input file - i_df = pd.read_csv(input,sep='\t',comment='#',low_memory=False) - #group by cmo_patient_id - grouped = i_df.groupby('cmo_patient_id') + # Read input file + i_df = pd.read_csv(input, sep="\t", comment="#", low_memory=False) + # group by cmo_patient_id + grouped = i_df.groupby("cmo_patient_id") keys = grouped.groups.keys() df_list = [] - #tarverse via cmo_patient_id to get associated samples + # tarverse via cmo_patient_id to get associated samples for i in keys: t_df = pd.DataFrame() t_df = grouped.get_group(i) baseline_date = None + # Get the baseline date - try: - baseline_date = t_df.loc[t_df['timepoint'] == timepoint_label_for_baseline_first, 'collection_date'].iloc[0] + if len(t_df) > 1: + try: + baseline_date = t_df.loc[ + t_df["timepoint"] == timepoint_label_for_baseline_first, + "collection_date", + ].iloc[0] + baseline_date = validate_date(baseline_date) + except IndexError: + print( + i, + "patient does not have first preference timepoint:", + timepoint_label_for_baseline_first, + ) + print( + "We will try to use second timepoint if available to use as baseline\n" + ) + if timepoint_label_for_baseline_second: + try: + baseline_date = t_df.loc[ + t_df["timepoint"] == timepoint_label_for_baseline_second, + "collection_date", + ].iloc[0] + baseline_date = validate_date(baseline_date) + except IndexError as e: + print( + i, + "patient does not have second preference timepoint:", + timepoint_label_for_baseline_second, + "\n", + ) + print(e) + if timepoint_label_for_baseline_third: + try: + baseline_date = t_df.loc[ + t_df["timepoint"] + == timepoint_label_for_baseline_third, + "collection_date", + ].iloc[0] + baseline_date = validate_date(baseline_date) + except IndexError as e: + print( + i, + "patient does not have third preference timepoint:", + timepoint_label_for_baseline_third, + "\n", + 
) + print(e) + exit(1) + else: + baseline_date = str(t_df["collection_date"]) baseline_date = validate_date(baseline_date) - except IndexError: - print(i ,"patient does not have first preference timepoint:", timepoint_label_for_baseline_first) - print ("We will try to use second timepoint if available to use as baseline\n") - if timepoint_label_for_baseline_second: - try: - baseline_date = t_df.loc[t_df['timepoint'] == timepoint_label_for_baseline_second, 'collection_date'].iloc[0] - baseline_date = validate_date(baseline_date) - except IndexError as e: - print(i ,"patient does not have second preference timepoint:", timepoint_label_for_baseline_second,"\n") - print(e) - except: - print("Something else went wrong") - except: - print("Something else went wrong") - #convert to days + # convert to days days_list = [] - for a, b in zip(t_df['collection_date'], t_df['timepoint']): + for a, b in zip(t_df["collection_date"], t_df["timepoint"]): fmt_date = validate_date(a) delta = fmt_date - baseline_date days_list.append(delta.days) - #make list of modified dataframes + # make list of modified dataframes t_df_copy = t_df.copy(deep=True) - t_df_copy['collection_in_days'] = days_list + t_df_copy["collection_in_days"] = days_list df_list.append(t_df_copy) - #merge and write the dataframe - results = pd.concat(df_list, axis=0, join='outer') - results.to_csv(output_file, sep='\t', index=False) + # merge and write the dataframe + results = pd.concat(df_list, axis=0, join="outer") + results.to_csv(output_file, sep="\t", index=False) + if __name__ == "__main__": - typer.run(main) \ No newline at end of file + typer.run(main) diff --git a/python/get_cbioportal_variants/README.md b/python/get_cbioportal_variants/README.md index 0aa5a83..78a7252 100644 --- a/python/get_cbioportal_variants/README.md +++ b/python/get_cbioportal_variants/README.md @@ -13,7 +13,7 @@ Tool to do the following operations: ### Example command ```bash -python get_cbioportal_variants.py --id "Test1" --id 
"Test2" --id "Test3" +python get_cbioportal_variants.py --sid "Test1" --sid "Test2" --sid "Test3" ``` ```bash @@ -43,7 +43,7 @@ Options: 'Tumor_Sample_Barcode' column. Header of this file is 'sample_id' [default: ] - --id TEXT Identifiers to search for in the + --sid TEXT Identifiers to search for in the 'Tumor_Sample_Barcode' column. Can be given multiple times [default: ] @@ -58,4 +58,4 @@ Options: customize the installation. --help Show this message and exit. -``` \ No newline at end of file +``` diff --git a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index aa35486..3ed6ddc 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -4,7 +4,6 @@ import typer import pandas as pd - def main( maf: Path = typer.Option( "/work/access/production/resources/cbioportal/current/msk_solid_heme/data_mutations_extended.txt", @@ -22,13 +21,12 @@ def main( "", "--ids", "-i", - help="List of ids to search for in the \'Tumor_Sample_Barcode\' column. Header of this file is \'sample_id\'", + help="List of ids to search for in the 'Tumor_Sample_Barcode' column. Header of this file is 'sample_id'", ), - id: Optional[List[str]] = typer.Option( + sid: Optional[List[str]] = typer.Option( "", - help="Identifiers to search for in the \'Tumor_Sample_Barcode\' column. Can be given multiple times", - ), - + help="Identifiers to search for in the 'Tumor_Sample_Barcode' column. Can be given multiple times", + ), bed: Path = typer.Option( "/work/access/production/resources/msk-access/current/regions_of_interest/current/MSK-ACCESS-v1_0-probe-A.sorted.bed", "--bed", @@ -42,50 +40,62 @@ def main( help="BED file to find overlapping variants", ), output_file: str = typer.Option( - "output.maf", - "--name", + "output.maf", + "--name", "-n", help="Name of the output file", - ), ): - - ''' + + """ Tool to do the following operations: - A. 
Get subset of variants based on Tumor_Sample_Barcode in MAF file + A. Get subset of variants based on Tumor_Sample_Barcode in MAF file B. Mark the variants as overlapping with BED file as covered [yes/no], by appending "covered" column to the subset MAF - + Requirement: pandas; typing; typer; bed_lookup(https://github.com/msk-access/python_bed_lookup) - ''' + """ if not ids: typer.echo("Identifiers were not provided in a text file") - if not id: + if not sid: typer.echo("Identifiers were not provided via command line as well") raise typer.Abort() - #Read maf files - maf_df = pd.read_csv(maf,sep='\t', skiprows=1,low_memory=False) + # Read maf files + skip = get_row(maf) + maf_df = pd.read_csv(maf, sep="\t", skiprows=skip, low_memory=False) # Read Identifiers - if not id: + if not sid: file = open(ids) - id = file.read().splitlines()[1:] + sid = file.read().splitlines()[1:] file.close() - #filter for ids - ns=set(id) - pattern = "|".join([r'\b{}\b'.format(i) for i in ns]) - result = maf_df[maf_df['Tumor_Sample_Barcode'].str.contains(pattern, regex=True)] + # filter for ids + ns = set(sid) + pattern = "|".join([r"\b{}\b".format(i) for i in ns]) + result = maf_df[maf_df["Tumor_Sample_Barcode"].str.contains(pattern, regex=True)] results_covered = result.copy(deep=True) + results_covered["Chromosome"] = results_covered["Chromosome"].apply(str) # Read bed file b = BedFile(bed.as_posix()) # Our chromosome column is 'Chromosome' and position column is 'Start_Position'. 
- results_covered['covered'] = b.lookup_df(results_covered, 'Chromosome', 'Start_Position') - results_covered.loc[results_covered['covered'].notnull(),'covered'] = 'yes' - results_covered.loc[results_covered['covered'].notna(),'covered'] = 'yes' - results_covered.loc[results_covered['covered'].isnull(),'covered'] = 'no' - results_covered.loc[results_covered['covered'].isna(),'covered'] = 'no' - results_covered.drop_duplicates().to_csv(output_file, sep='\t', index=False) + results_covered["covered"] = b.lookup_df( + results_covered, "Chromosome", "Start_Position" + ) + results_covered.loc[results_covered["covered"].notnull(), "covered"] = "yes" + results_covered.loc[results_covered["covered"].notna(), "covered"] = "yes" + results_covered.loc[results_covered["covered"].isnull(), "covered"] = "no" + results_covered.loc[results_covered["covered"].isna(), "covered"] = "no" + results_covered.drop_duplicates().to_csv(output_file, sep="\t", index=False) + + +# preprocessing +def get_row(file): + skipped = [] + with open(file, "r") as csv_file: + skipped.extend(i for i, line in enumerate(csv_file) if line.startswith("#")) + return skipped + if __name__ == "__main__": - typer.run(main) \ No newline at end of file + typer.run(main) diff --git a/python/get_cbioportal_variants/requirements.txt b/python/get_cbioportal_variants/requirements.txt new file mode 100644 index 0000000..a3d8c94 --- /dev/null +++ b/python/get_cbioportal_variants/requirements.txt @@ -0,0 +1,5 @@ +typer==0.3.2 +openpyxl==3.0.9 +typing_extensions==3.10.0.0 +pandas==1.2.5 +bed_lookup diff --git a/reports/create_report.R b/reports/create_report.R index 8a46dcc..630daec 100644 --- a/reports/create_report.R +++ b/reports/create_report.R @@ -4,7 +4,6 @@ library(knitr) library(rmarkdown) library(argparse) - parser <- ArgumentParser() parser$add_argument("-t", "--template", required=T, help="Path to Rmarkdown template file.") @@ -16,9 +15,11 @@ parser$add_argument("-m", "--metadata", required=T, help="Path to file 
containin parser$add_argument("-d", "--dmp-id", help="DMP patient ID (optional).") parser$add_argument("-ds", "--dmp-sample-id", help="DMP sample ID (optional).") parser$add_argument("-dm", "--dmp-maf", help="Path to DMP MAF file (optional).") -parser$add_argument("-o", "--output", help="Output file") +parser$add_argument("-o", "--output", help="Output file with .html extension") +parser$add_argument( + "-md", "--keep-rmarkdown", help="Dont make tmp file for markdown, keep it in the same directory", action="store_true") parser$add_argument( - "-ca", "--combine-access", help="Don't splite VAF plots by clonality.", action="store_true") + "-ca", "--combine-access", help="Don't split VAF plots by clonality.", action="store_true") parser$add_argument( "-pi", "--plot-impact", help="Also plot VAFs from IMPACT samples.", action="store_true") @@ -47,6 +48,12 @@ input_text <- knitr::knit_expand( tmp <- tempfile(fileext = ".Rmd") cat(input_text, file = tmp) +if (args$keep_rmarkdown){ + rmd_name <- gsub(".html",".Rmd", args$output) + output_cwd <- normalizePath(dirname(args$output)) + output_rmd_path <- paste(output_cwd,"/",rmd_name, sep='') + file.copy(tmp,output_rmd_path) +} rmarkdown::render( tmp, output_format = "html_document", diff --git a/reports/template_days.Rmd b/reports/template_days.Rmd index 9cd3dd3..91bff1a 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -1,4 +1,5 @@ --- +title: "{{PATIENT_ID}}" output: html_document: df_print: paged @@ -6,6 +7,7 @@ output: ```{r global_options, include=FALSE} knitr::opts_chunk$set(echo=FALSE, warning=FALSE, message=FALSE, fig.width=10, fig.height=6) +library(knitr) library(data.table) library(tidyr) library(dplyr) @@ -19,17 +21,26 @@ library(RColorBrewer) theme_set(theme_bw()) show_text <- TRUE +``` -dmp_id <- "{{DMP_ID}}" -has_dmp <- F -if (dmp_id != "") { - has_dmp = T +```{r echo=FALSE, eval=show_text} +if ("{{DMP_ID}}" != "") { + asis_output("### DMP Patient ID: {{DMP_ID}} \n") } +``` +```{r 
echo=FALSE, eval=show_text} +if ("{{DMP_SAMPLE_ID}}" != "") { + asis_output("### DMP Sample ID: {{DMP_SAMPLE_ID}} \n") +} ``` ```{r echo=FALSE} - +dmp_id <- "{{DMP_ID}}" +has_dmp <- F +if (dmp_id != "") { + has_dmp = T +} if (dmp_id != "") { page_title = "{{PATIENT_ID}} ({{DMP_ID}})" } else { @@ -37,10 +48,6 @@ if (dmp_id != "") { } ``` ---- -title: "`r page_title`" ---- - ```{r echo=FALSE, eval=show_text} if ("{{TUMOR_TYPE}}" != "") { asis_output("### {{TUMOR_TYPE}} \n") @@ -167,7 +174,7 @@ cna_plot<-function(cna, xlimits, xbreaks){ } print_table<-function(table){ - datatable(table, rownames=FALSE, escape=FALSE, options=list(scrollX=T, autoWidth = TRUE)) + datatable(table, class='cell-border stripe compact', filter = 'top', rownames=FALSE, escape=FALSE, extensions = 'Buttons', options=list(scrollX=T, autoWidth = TRUE, dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel', 'pdf', 'print'))) } ``` @@ -376,9 +383,6 @@ subplot(fig1,fig2,nrows=2,shareX=TRUE, heights=c(0.2,0.7), which_layout=1) ```{r adjustedvaf-linear, fig.height=4, eval=has_dmp} sample="{{PATIENT_ID}}" -filename = paste(sample,"_clonal.csv") -path = getwd() -file_path = file.path(path,filename) clonal <- subset(final, final$clonality=="CLONAL") if (nrow(clonal)>0) { @@ -387,7 +391,6 @@ if (nrow(clonal)>0) { clonal$adjustedvaf <- clonal$vaf*clonal$ncn / (clonal$expected_alt_copies + (clonal$ncn - clonal$tcn)*clonal$vaf) clonal$adjustedvaf <- round(clonal$adjustedvaf,4) clonaltoplot <- clonal - if (length(unique(clonal$VarName))>1) { clonal_mean <- data.frame( @@ -404,11 +407,17 @@ if (nrow(clonal)>0) { clonaltoplot$vaf<-round(clonaltoplot$vaf,4) clonaltoplot$adjustedvaf<-round(clonaltoplot$adjustedvaf,4) } - write.csv(clonaltoplot, file_path, sep = "\t", quote = F, row.names = F) fig2<-vaf_plot(clonaltoplot, xlimits, xbreaks, xlabels, varcolors, yaccuracy=0.01, log=FALSE, cnadjusted = TRUE) subplot(fig1,fig2,nrows=2,shareX=TRUE, heights=c(0.2,0.8)) } ``` +```{r write_clonal, eval=has_dmp} 
+if(nrow(clonal)>0){ + sample="{{PATIENT_ID}}" + filename = paste(sample,"_clonal_adjvaf.csv",sep='') + fwrite(clonal, file=paste(getwd(),filename,sep='/')) +} +``` ```{asis echo=has_dmp} ### Log @@ -433,7 +442,8 @@ if (nrow(clonal) > 0) { ### Description We adjust the variant allele fractions to account for the copy number alterations of the segments they are in. \ -Since it is not easy to call copy number changes from ACCESS data, here we rely on the copy number alterations called by FACETS in the IMPACT sample `r toString(impact_sample_id)`.\ +Since it is not easy to call copy number changes from ACCESS data, here we rely on the copy number alterations called by FACETS in the IMPACT sample.\ + *Note: This assumes that there are no changes to copy numbers of these segments between the IMPACT and ACCESS samples.* \