From 237af31b026961f01dd9aff70d83f57492cb59f7 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Tue, 21 Jun 2022 13:17:56 -0400 Subject: [PATCH 001/126] Update get_cbioportal_variants.py --- python/get_cbioportal_variants/get_cbioportal_variants.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index aa35486..1c382aa 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -66,7 +66,7 @@ def main( raise typer.Abort() #Read maf files - maf_df = pd.read_csv(maf,sep='\t', skiprows=1,low_memory=False) + maf_df = pd.read_csv(maf,sep='\t', comment="#",low_memory=False, header=True) # Read Identifiers if not id: file = open(ids) @@ -77,6 +77,7 @@ def main( pattern = "|".join([r'\b{}\b'.format(i) for i in ns]) result = maf_df[maf_df['Tumor_Sample_Barcode'].str.contains(pattern, regex=True)] results_covered = result.copy(deep=True) + results_covered['Chromosome'].apply(str) # Read bed file b = BedFile(bed.as_posix()) # Our chromosome column is 'Chromosome' and position column is 'Start_Position'. 
From bab8864569982795e6ee52b5ab39d6b9cbee8e7f Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Tue, 21 Jun 2022 13:20:59 -0400 Subject: [PATCH 002/126] Update get_cbioportal_variants.py --- python/get_cbioportal_variants/get_cbioportal_variants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index 1c382aa..3f5de36 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -77,7 +77,7 @@ def main( pattern = "|".join([r'\b{}\b'.format(i) for i in ns]) result = maf_df[maf_df['Tumor_Sample_Barcode'].str.contains(pattern, regex=True)] results_covered = result.copy(deep=True) - results_covered['Chromosome'].apply(str) + results_covered = results_covered['Chromosome'].apply(str) # Read bed file b = BedFile(bed.as_posix()) # Our chromosome column is 'Chromosome' and position column is 'Start_Position'. 
From d00c2bce864a3529b41d1431ee9fc3a8a8fcafc0 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Tue, 21 Jun 2022 13:23:44 -0400 Subject: [PATCH 003/126] Update get_cbioportal_variants.py --- python/get_cbioportal_variants/get_cbioportal_variants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index 3f5de36..e69dae5 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -66,7 +66,7 @@ def main( raise typer.Abort() #Read maf files - maf_df = pd.read_csv(maf,sep='\t', comment="#",low_memory=False, header=True) + maf_df = pd.read_csv(maf,sep='\t', comment="#",low_memory=False) # Read Identifiers if not id: file = open(ids) From 1cc47568e5badd96b0390cfd86494dda0219cb3e Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Tue, 21 Jun 2022 13:35:01 -0400 Subject: [PATCH 004/126] Update get_cbioportal_variants.py --- python/get_cbioportal_variants/get_cbioportal_variants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index e69dae5..27e0395 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -66,7 +66,7 @@ def main( raise typer.Abort() #Read maf files - maf_df = pd.read_csv(maf,sep='\t', comment="#",low_memory=False) + maf_df = pd.read_csv(maf, sep='\t', comment="#", low_memory=False) # Read Identifiers if not id: file = open(ids) From f770c7bf98baa521c1518e2d7e28c14b18008b00 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Tue, 21 Jun 2022 13:37:50 -0400 Subject: [PATCH 005/126] Update get_cbioportal_variants.py --- python/get_cbioportal_variants/get_cbioportal_variants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index 27e0395..d502155 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -66,7 +66,7 @@ def main( raise typer.Abort() #Read maf files - maf_df = pd.read_csv(maf, sep='\t', comment="#", low_memory=False) + maf_df = pd.read_csv(maf, sep='\t', comment='#', low_memory=False) # Read Identifiers if not id: file = open(ids) From 6931bf081b305c369c5d0ee3f18d0022920e46f9 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Tue, 21 Jun 2022 13:40:36 -0400 Subject: [PATCH 006/126] Update get_cbioportal_variants.py --- python/get_cbioportal_variants/get_cbioportal_variants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index d502155..8a22d05 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -66,7 +66,7 @@ def main( raise typer.Abort() #Read maf files - maf_df = pd.read_csv(maf, sep='\t', comment='#', low_memory=False) + maf_df = pd.read_csv(maf, sep='\t', comment='#', low_memory=False, header='infer') # Read Identifiers if not id: file = open(ids) From 17e88f4ddc28f31f4db7dccba0a956110b384d06 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Tue, 21 Jun 2022 13:44:58 -0400 Subject: [PATCH 007/126] Update get_cbioportal_variants.py --- .../get_cbioportal_variants.py | 67 +++++++++++-------- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index 8a22d05..7194738 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -3,7 +3,7 @@ from bed_lookup 
import BedFile import typer import pandas as pd - +import csv def main( maf: Path = typer.Option( @@ -22,13 +22,12 @@ def main( "", "--ids", "-i", - help="List of ids to search for in the \'Tumor_Sample_Barcode\' column. Header of this file is \'sample_id\'", + help="List of ids to search for in the 'Tumor_Sample_Barcode' column. Header of this file is 'sample_id'", ), id: Optional[List[str]] = typer.Option( "", - help="Identifiers to search for in the \'Tumor_Sample_Barcode\' column. Can be given multiple times", - ), - + help="Identifiers to search for in the 'Tumor_Sample_Barcode' column. Can be given multiple times", + ), bed: Path = typer.Option( "/work/access/production/resources/msk-access/current/regions_of_interest/current/MSK-ACCESS-v1_0-probe-A.sorted.bed", "--bed", @@ -42,51 +41,65 @@ def main( help="BED file to find overlapping variants", ), output_file: str = typer.Option( - "output.maf", - "--name", + "output.maf", + "--name", "-n", help="Name of the output file", - ), ): - - ''' + + """ Tool to do the following operations: - A. Get subset of variants based on Tumor_Sample_Barcode in MAF file + A. Get subset of variants based on Tumor_Sample_Barcode in MAF file B. 
Mark the variants as overlapping with BED file as covered [yes/no], by appending "covered" column to the subset MAF - + Requirement: pandas; typing; typer; bed_lookup(https://github.com/msk-access/python_bed_lookup) - ''' + """ if not ids: typer.echo("Identifiers were not provided in a text file") if not id: typer.echo("Identifiers were not provided via command line as well") raise typer.Abort() - #Read maf files - maf_df = pd.read_csv(maf, sep='\t', comment='#', low_memory=False, header='infer') + # Read maf files + skip = get_row(maf) + maf_df = pd.read_csv(maf, sep="\t", skiprows=skip, low_memory=False) # Read Identifiers if not id: file = open(ids) id = file.read().splitlines()[1:] file.close() - #filter for ids - ns=set(id) - pattern = "|".join([r'\b{}\b'.format(i) for i in ns]) - result = maf_df[maf_df['Tumor_Sample_Barcode'].str.contains(pattern, regex=True)] + # filter for ids + ns = set(id) + pattern = "|".join([r"\b{}\b".format(i) for i in ns]) + result = maf_df[maf_df["Tumor_Sample_Barcode"].str.contains(pattern, regex=True)] results_covered = result.copy(deep=True) - results_covered = results_covered['Chromosome'].apply(str) + results_covered = results_covered["Chromosome"].apply(str) # Read bed file b = BedFile(bed.as_posix()) # Our chromosome column is 'Chromosome' and position column is 'Start_Position'. 
- results_covered['covered'] = b.lookup_df(results_covered, 'Chromosome', 'Start_Position') - results_covered.loc[results_covered['covered'].notnull(),'covered'] = 'yes' - results_covered.loc[results_covered['covered'].notna(),'covered'] = 'yes' - results_covered.loc[results_covered['covered'].isnull(),'covered'] = 'no' - results_covered.loc[results_covered['covered'].isna(),'covered'] = 'no' - results_covered.drop_duplicates().to_csv(output_file, sep='\t', index=False) + results_covered["covered"] = b.lookup_df( + results_covered, "Chromosome", "Start_Position" + ) + results_covered.loc[results_covered["covered"].notnull(), "covered"] = "yes" + results_covered.loc[results_covered["covered"].notna(), "covered"] = "yes" + results_covered.loc[results_covered["covered"].isnull(), "covered"] = "no" + results_covered.loc[results_covered["covered"].isna(), "covered"] = "no" + results_covered.drop_duplicates().to_csv(output_file, sep="\t", index=False) + + +# preprocessing +def get_row(file): + skipped = [] + with open(file, "r") as csvfile: + reader = csv.reader(csvfile) + for i, row in enumerate(reader): + if row[0].strip()[:2] == "#": + skipped.append(i) + return skipped + if __name__ == "__main__": - typer.run(main) \ No newline at end of file + typer.run(main) From 5fda4d001511a7c72d983552ac8c467bc14472d6 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Tue, 21 Jun 2022 13:47:30 -0400 Subject: [PATCH 008/126] Update get_cbioportal_variants.py --- python/get_cbioportal_variants/get_cbioportal_variants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index 7194738..efbbf39 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -94,7 +94,7 @@ def main( def get_row(file): skipped = [] with open(file, "r") as csvfile: - reader = csv.reader(csvfile) + 
reader = csv.reader(csvfile, delimiter='\t') for i, row in enumerate(reader): if row[0].strip()[:2] == "#": skipped.append(i) From e7d26958914d79217f24bf93551a3a783358d8b9 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Tue, 21 Jun 2022 13:50:44 -0400 Subject: [PATCH 009/126] Update get_cbioportal_variants.py --- python/get_cbioportal_variants/get_cbioportal_variants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index efbbf39..c7134a3 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -65,7 +65,7 @@ def main( # Read maf files skip = get_row(maf) - maf_df = pd.read_csv(maf, sep="\t", skiprows=skip, low_memory=False) + maf_df = pd.read_csv(maf, sep="\t", skiprows=1, low_memory=False) # Read Identifiers if not id: file = open(ids) @@ -76,7 +76,7 @@ def main( pattern = "|".join([r"\b{}\b".format(i) for i in ns]) result = maf_df[maf_df["Tumor_Sample_Barcode"].str.contains(pattern, regex=True)] results_covered = result.copy(deep=True) - results_covered = results_covered["Chromosome"].apply(str) + #results_covered = results_covered["Chromosome"].apply(str) # Read bed file b = BedFile(bed.as_posix()) # Our chromosome column is 'Chromosome' and position column is 'Start_Position'. 
From 545570d8091dc07d4858dd6de376751f56013e49 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Tue, 21 Jun 2022 13:51:08 -0400 Subject: [PATCH 010/126] Update get_cbioportal_variants.py --- python/get_cbioportal_variants/get_cbioportal_variants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index c7134a3..63cca1e 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -76,7 +76,7 @@ def main( pattern = "|".join([r"\b{}\b".format(i) for i in ns]) result = maf_df[maf_df["Tumor_Sample_Barcode"].str.contains(pattern, regex=True)] results_covered = result.copy(deep=True) - #results_covered = results_covered["Chromosome"].apply(str) + results_covered = results_covered["Chromosome"].apply(str) # Read bed file b = BedFile(bed.as_posix()) # Our chromosome column is 'Chromosome' and position column is 'Start_Position'. 
From 75b3b63e3374b3730b768db7a5e35d69b201e828 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Tue, 21 Jun 2022 13:52:05 -0400 Subject: [PATCH 011/126] Update get_cbioportal_variants.py --- python/get_cbioportal_variants/get_cbioportal_variants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index 63cca1e..e37af12 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -65,7 +65,7 @@ def main( # Read maf files skip = get_row(maf) - maf_df = pd.read_csv(maf, sep="\t", skiprows=1, low_memory=False) + maf_df = pd.read_csv(maf, sep="\t", skiprows=skip, low_memory=False) # Read Identifiers if not id: file = open(ids) @@ -76,7 +76,7 @@ def main( pattern = "|".join([r"\b{}\b".format(i) for i in ns]) result = maf_df[maf_df["Tumor_Sample_Barcode"].str.contains(pattern, regex=True)] results_covered = result.copy(deep=True) - results_covered = results_covered["Chromosome"].apply(str) + results_covered["Chromosome"] = results_covered["Chromosome"].apply(str) # Read bed file b = BedFile(bed.as_posix()) # Our chromosome column is 'Chromosome' and position column is 'Start_Position'. 
From 31cbd1771de3fad05aefd8360aca68c0da29c12c Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Tue, 21 Jun 2022 13:55:50 -0400 Subject: [PATCH 012/126] Update get_cbioportal_variants.py --- python/get_cbioportal_variants/get_cbioportal_variants.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index e37af12..04a4ea9 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -5,6 +5,7 @@ import pandas as pd import csv + def main( maf: Path = typer.Option( "/work/access/production/resources/cbioportal/current/msk_solid_heme/data_mutations_extended.txt", @@ -94,7 +95,7 @@ def main( def get_row(file): skipped = [] with open(file, "r") as csvfile: - reader = csv.reader(csvfile, delimiter='\t') + reader = csv.reader(csvfile, delimiter="\t") for i, row in enumerate(reader): if row[0].strip()[:2] == "#": skipped.append(i) From c5e063d0b01192cd1fa96670e86d849bab202931 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Thu, 30 Jun 2022 12:03:05 -0400 Subject: [PATCH 013/126] Adding script to convert csv to MAF --- fof.txt | 19 + python/convert_csv_to_maf/README.md | 89 ++++ python/convert_csv_to_maf/csv_to_maf.py | 142 +++++++ .../convert_csv_to_maf/example_output.maf.txt | 382 ++++++++++++++++++ python/convert_csv_to_maf/example_output.xlsx | Bin 0 -> 42445 bytes 5 files changed, 632 insertions(+) create mode 100644 fof.txt create mode 100644 python/convert_csv_to_maf/README.md create mode 100644 python/convert_csv_to_maf/csv_to_maf.py create mode 100644 python/convert_csv_to_maf/example_output.maf.txt create mode 100644 python/convert_csv_to_maf/example_output.xlsx diff --git a/fof.txt b/fof.txt new file mode 100644 index 0000000..d90329c --- /dev/null +++ b/fof.txt @@ -0,0 +1,19 @@ 
+/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-2AVE7W_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-2CJKAC_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-4PX38M_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-5DUJR8_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-5KCFV3_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-5PLA6N_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-70H905_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-84KMCA_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-8W2E8L_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-AME7C6_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-DDK2LJ_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-DFJ7RT_SNV_table.csv 
+/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-KPNF34_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-PXVUM9_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-R9MPAU_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-VUEN2P_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-WJPT69_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-XFV0RE_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-Y5K7R2_SNV_table.csv diff --git a/python/convert_csv_to_maf/README.md b/python/convert_csv_to_maf/README.md new file mode 100644 index 0000000..41a9c80 --- /dev/null +++ b/python/convert_csv_to_maf/README.md @@ -0,0 +1,89 @@ +# Convert output of Rscript (filter_calls.R) CSV file to MAF + +Tool does the following operations: + +* Read one or more files from the inputs +* Removes unwanted columns, modifying the column headers depending on the + requirements +* Massaging the data frame to make it compatible with MAF format +* Write the data frame to a file in MAF format and Excel format + +## Requirements + +* pandas +* openpyxl +* typing +* typer + +## Example command + +### Explicitly specifying files on command line + +```bash +python csv_to_maf.py -i /path/to/Test1.csv -i /path/to/Test2.csv -i /path/to/Test3.csv +``` + +### Specifying files in a text 
FileOfFiles + +```bash +python csv_to_maf.py -l /path/to/FileOfFiles.txt +``` + +where **FileOfFiles.txt** +```bash +> cat FileOfFiles.txt +/path/to/Test1.csv +/path/to/Test2.csv +/path/to/Test3.csv +``` + +### Keeping normal samples identified using "normal" string, by default they are filtered + +```bash +python csv_to_maf.py -n -i /path/to/Test1.csv -i /path/to/Test2.csv -i /path/to/Test3.csv +# OR +python csv_to_maf.py -n -l /path/to/FileOfFiles.txt +``` + +## Usage + +```bash +> python csv_to_maf.py --help +Usage: csv_to_maf.py [OPTIONS] + + Tool does the following operations: + + A. Read one or more files from the inputs + + B. Removes unwanted columns, modifying the column headers depending on the + requirements + + C. Massaging the data frame to make it compatible with MAF format + + D. Write the data frame to a file in MAF format and Excel format + + Requirement: pandas; openpyxl; typing; typer; + +Options: + -l, --list PATH File of files, List of CSV files to be + converted to maf, one per line, no header, + CSV file generated by Rscript filter_calls.R + [default: ] + + -i, --csv FILE File to convert from csv to maf. CSV file + generated by Rscript filter_calls.R, Can be + given multiple times [default: ] + + -n, --normal / -N, --keep-normal + Keep samples tagged as normal [default: + False] + + -p, --prefix TEXT Prefix of the output MAF and EXCEL file + [default: csv_to_maf_output] + + --install-completion Install completion for the current shell. + --show-completion Show completion for the current shell, to + copy it or customize the installation. + + --help Show this message and exit. 
+``` diff --git a/python/convert_csv_to_maf/csv_to_maf.py b/python/convert_csv_to_maf/csv_to_maf.py new file mode 100644 index 0000000..64034db --- /dev/null +++ b/python/convert_csv_to_maf/csv_to_maf.py @@ -0,0 +1,142 @@ +from pathlib import Path +from typing import List, Optional +import typer +import pandas as pd + + +def main( + list_of_files: Path = typer.Option( + "", + "--list", + "-l", + help="File of files, List of CSV files to be converted to maf, one per line, no header, CSV file generated by Rscript filter_calls.R", + ), + csv: Optional[List[Path]] = typer.Option( + "", + "--csv", + "-i", + exists=True, + file_okay=True, + dir_okay=False, + writable=False, + readable=True, + resolve_path=True, + help="File to convert from csv to maf. CSV file generated by Rscript filter_calls.R, Can be given multiple times", + ), + normal: bool = typer.Option( + False, + "--normal/--keep-normal", + "-n/-N", + help="Keep samples tagged as normal", + ), + output_file_prefix: str = typer.Option( + "csv_to_maf_output", + "--prefix", + "-p", + help="Prefix of the output MAF and EXCEL file", + ), +): + + """ + Tool does the following operations: + + A. Read one or more files from the inputs + + B. Removes unwanted columns, modifying the column headers depending on the requirements + + C. Massaging the data frame to make it compatible with MAF format + + D. 
Write the data frame to a file in MAF format and Excel format + + Requirement: + pandas; openpyxl; typing; typer; + + """ + if not list_of_files: + typer.secho("File are not provided as file of files.", fg=typer.colors.BRIGHT_YELLOW) + if not csv: + typer.secho("File were not provided via command line as well", fg=typer.colors.BRIGHT_RED) + raise typer.Abort() + + # Read file of files + if not csv: + csv = [line.strip() for line in open(list_of_files, 'r')] + #print(csv) + final_df = pd.DataFrame() + for csv_file in csv: + if Path(csv_file).is_file(): + # Read csv file + typer.secho(f"Reading: {csv_file}", fg=typer.colors.BRIGHT_GREEN) + csv_df = pd.read_csv(csv_file, sep=",", low_memory=False) + # filter csv of "duplex.called columns" + csv_df = csv_df.loc[:, ~csv_df.columns.str.contains('__duplex.called')] + # filter csv of "duplex_support_num columns" + csv_df = csv_df.loc[:, ~csv_df.columns.str.contains('duplex_support_num')] + # filter csv of "normal" samples if normal is not wanted + if(not normal): + csv_df = csv_df.loc[:, ~csv_df.columns.str.contains('normal')] + # filter rows that have call_confidence == "Drop" + csv_df = csv_df[csv_df['call_confidence'].astype(str).str.lower().str.contains("drop",na=False) == False] + # melt the data frame + melt_csv_df = csv_df.melt(id_vars =['Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','HGVSp_Short','Reference_Allele','Tumor_Seq_Allele2','ExAC_AF','Hotspot','DMP','CH','call_confidence'], var_name ='Tumor_Sample_Barcode', value_name ='Evidence') + #fix tumor_sample_barcode + melt_csv_df['Tumor_Sample_Barcode'] = melt_csv_df.Tumor_Sample_Barcode.str.split('___', 1).str.get(0) + # convert Chromosome to string + melt_csv_df['Chromosome'] = melt_csv_df['Chromosome'].astype(str) + # split Evidence columns into multiple columns + melt_csv_df[['t_alt_count', 't_depth']] = melt_csv_df['Evidence'].str.split('/', 1, expand=True) + # convert t_alt_count to to_numeric + 
melt_csv_df['t_alt_count'] = melt_csv_df['t_alt_count'].apply(pd.to_numeric, errors='coerce') + #remove variant frequency information + melt_csv_df['t_depth'] = melt_csv_df.t_depth.str.split('(', 1).str.get(0) + # convert t_depth to to_numeric + melt_csv_df['t_depth'] = melt_csv_df['t_depth'].apply(pd.to_numeric, errors='coerce') + #calculate t_ref_count + melt_csv_df = melt_csv_df.assign(t_ref_count=melt_csv_df['t_depth'] - melt_csv_df['t_alt_count']) + #calculate t_alt_freq + melt_csv_df = melt_csv_df.assign(t_alt_freq=(melt_csv_df['t_alt_count'] / melt_csv_df['t_depth']).round(4)) + #drop Evidence columns + melt_csv_df.drop(columns=['Evidence'], inplace=True) + # add additional columns + melt_csv_df['Entrez_Gene_Id'] = 0 + melt_csv_df['Center'] = 'mskcc.org' + melt_csv_df['NCBI_Build'] = 'GRCh37' + melt_csv_df['Tumor_Seq_Allele1'] = melt_csv_df['Reference_Allele'] + melt_csv_df['Strand'] = '' + melt_csv_df['Consequence'] = '' + melt_csv_df['dbSNP_RS'] = '' + melt_csv_df['dbSNP_Val_Status'] = '' + melt_csv_df['Match_Norm_Seq_Allele1'] = '' + melt_csv_df['Match_Norm_Seq_Allele2'] = '' + melt_csv_df['Tumor_Validation_Allele1'] = '' + melt_csv_df['Tumor_Validation_Allele2'] = '' + melt_csv_df['Match_Norm_Validation_Allele1'] = '' + melt_csv_df['Match_Norm_Validation_Allele2'] = '' + melt_csv_df['Verification_Status'] = '' + melt_csv_df['Validation_Status'] = '' + melt_csv_df['Mutation_Status'] = '' + melt_csv_df['Sequencing_Phase'] = '' + melt_csv_df['Sequence_Source'] = '' + melt_csv_df['Validation_Method'] = '' + melt_csv_df['Score'] = '' + melt_csv_df['BAM_File'] = '' + melt_csv_df['Sequencer'] = '' + melt_csv_df['n_ref_count'] = '' + melt_csv_df['n_alt_count'] = '' + melt_csv_df['HGVSc'] = '' + melt_csv_df['HGVSp'] = '' + melt_csv_df['Transcript_ID'] = '' + melt_csv_df['RefSeq'] = '' + melt_csv_df['Protein_position'] = '' + melt_csv_df['Codons'] = '' + melt_csv_df = melt_csv_df.reindex(columns = 
['Hugo_Symbol','Entrez_Gene_Id','Center','NCBI_Build','Chromosome','Start_Position','End_Position','Strand','Consequence','Variant_Classification','Variant_Type','Reference_Allele','Tumor_Seq_Allele1','Tumor_Seq_Allele2','dbSNP_RS','dbSNP_Val_Status','Tumor_Sample_Barcode','Matched_Norm_Sample_Barcode','Match_Norm_Seq_Allele1','Match_Norm_Seq_Allele2','Tumor_Validation_Allele1','Tumor_Validation_Allele2','Match_Norm_Validation_Allele1','Match_Norm_Validation_Allele2','Verification_Status','Validation_Status','Mutation_Status','Sequencing_Phase','Sequence_Source','Validation_Method','Score','BAM_File','Sequencer','t_depth','t_ref_count','t_alt_count','t_alt_freq','n_ref_count','n_alt_count','HGVSc','HGVSp','HGVSp_Short','Transcript_ID','RefSeq','Protein_position','Codons','Hotspot','DMP','CH','call_confidence','ExAC_AF']) + final_df = final_df.append(melt_csv_df, ignore_index=True) + else: + typer.secho(f"{csv_file} file does not exists", fg=typer.colors.BRIGHT_RED) + raise typer.Abort() + #write final_df to tsv + typer.secho(f"Done processing the CSV file writing output to {output_file_prefix} in txt and excel format", fg=typer.colors.GREEN) + final_df.to_csv(f"{output_file_prefix}.maf", index=False, sep='\t') + final_df.to_excel(f"{output_file_prefix}.xlsx", index=False) +if __name__ == "__main__": + typer.run(main) diff --git a/python/convert_csv_to_maf/example_output.maf.txt b/python/convert_csv_to_maf/example_output.maf.txt new file mode 100644 index 0000000..6a84795 --- /dev/null +++ b/python/convert_csv_to_maf/example_output.maf.txt @@ -0,0 +1,382 @@ +Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Consequence Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 
Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer t_depth t_ref_count t_alt_count t_alt_freq n_ref_count n_alt_count HGVSc HGVSp HGVSp_Short Transcript_ID RefSeq Protein_position Codons Hotspot DMP CH call_confidence ExAC_AF +ANKRD11 0 mskcc.org GRCh37 16 89347285 89347285 Missense_Mutation T T C test 0 0 0 p.K1889E Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453136 Missense_Mutation A A T test 1488 1359 129 0.0867 p.V600E Hotspot Signed out Yes High +CDKN2A 0 mskcc.org GRCh37 9 21971120 21971121 Nonsense_Mutation GG GG AA test 1153 1124 29 0.0252 p.R80* Hotspot Signed out No High +CREBBP 0 mskcc.org GRCh37 16 3831298 3831298 Missense_Mutation G G A test 0 0 0 p.P528L Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31383227 31383227 Missense_Mutation G G A test 0 0 0 p.R380Q Signed out No High +EZH1 0 mskcc.org GRCh37 17 40855800 40855800 Nonsense_Mutation G G A test 0 0 0 p.R686* Signed out No High +KIT 0 mskcc.org GRCh37 4 55570044 55570044 Missense_Mutation C C T test 0 0 0 p.T304I Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32166850 32166850 Missense_Mutation C C T test 1 1 0 0 p.G1463E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9520177 9520177 Missense_Mutation G G A test 0 0 0 p.P698S Signed out No High +PIK3R1 0 mskcc.org GRCh37 5 67593308 67593308 Missense_Mutation A A G test 0 0 0 p.Y685C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41420106 41420106 Missense_Mutation C C T test 0 0 0 p.G72E Signed out No High +RAD51B 0 mskcc.org GRCh37 14 68352692 68352692 Missense_Mutation G G A test 0 0 0 p.E187K Signed out No High +SPEN 0 mskcc.org GRCh37 1 16199514 16199514 Missense_Mutation C C T test 0 0 0 p.S96F Signed out No High +SRSF2 0 mskcc.org GRCh37 17 74732283 74732283 Missense_Mutation G G A test 0 0 0 p.P209L Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 999 980 19 0.019 Signed out No 
Low +ANKRD11 0 mskcc.org GRCh37 16 89347285 89347285 Missense_Mutation T T C test 0 0 0 p.K1889E Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453136 Missense_Mutation A A T test 2086 2081 5 0.0024 p.V600E Hotspot Signed out Yes High +CDKN2A 0 mskcc.org GRCh37 9 21971120 21971121 Nonsense_Mutation GG GG AA test 2143 2140 3 0.0014 p.R80* Hotspot Signed out No High +CREBBP 0 mskcc.org GRCh37 16 3831298 3831298 Missense_Mutation G G A test 0 0 0 p.P528L Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31383227 31383227 Missense_Mutation G G A test 0 0 0 p.R380Q Signed out No High +EZH1 0 mskcc.org GRCh37 17 40855800 40855800 Nonsense_Mutation G G A test 1 1 0 0 p.R686* Signed out No High +KIT 0 mskcc.org GRCh37 4 55570044 55570044 Missense_Mutation C C T test 0 0 0 p.T304I Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32166850 32166850 Missense_Mutation C C T test 0 0 0 p.G1463E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9520177 9520177 Missense_Mutation G G A test 0 0 0 p.P698S Signed out No High +PIK3R1 0 mskcc.org GRCh37 5 67593308 67593308 Missense_Mutation A A G test 0 0 0 p.Y685C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41420106 41420106 Missense_Mutation C C T test 0 0 0 p.G72E Signed out No High +RAD51B 0 mskcc.org GRCh37 14 68352692 68352692 Missense_Mutation G G A test 0 0 0 p.E187K Signed out No High +SPEN 0 mskcc.org GRCh37 1 16199514 16199514 Missense_Mutation C C T test 0 0 0 p.S96F Signed out No High +SRSF2 0 mskcc.org GRCh37 17 74732283 74732283 Missense_Mutation G G A test 0 0 0 p.P209L Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 1743 1743 0 0 Signed out No Low +ANKRD11 0 mskcc.org GRCh37 16 89347285 89347285 Missense_Mutation T T C test 0 0 0 p.K1889E Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453136 Missense_Mutation A A T test 2238 2238 0 0 p.V600E Hotspot Signed out Yes High +CDKN2A 0 mskcc.org GRCh37 9 21971120 21971121 Nonsense_Mutation GG GG AA test 2243 2243 0 0 
p.R80* Hotspot Signed out No High +CREBBP 0 mskcc.org GRCh37 16 3831298 3831298 Missense_Mutation G G A test 0 0 0 p.P528L Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31383227 31383227 Missense_Mutation G G A test 0 0 0 p.R380Q Signed out No High +EZH1 0 mskcc.org GRCh37 17 40855800 40855800 Nonsense_Mutation G G A test 0 0 0 p.R686* Signed out No High +KIT 0 mskcc.org GRCh37 4 55570044 55570044 Missense_Mutation C C T test 1 1 0 0 p.T304I Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32166850 32166850 Missense_Mutation C C T test 0 0 0 p.G1463E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9520177 9520177 Missense_Mutation G G A test 0 0 0 p.P698S Signed out No High +PIK3R1 0 mskcc.org GRCh37 5 67593308 67593308 Missense_Mutation A A G test 0 0 0 p.Y685C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41420106 41420106 Missense_Mutation C C T test 0 0 0 p.G72E Signed out No High +RAD51B 0 mskcc.org GRCh37 14 68352692 68352692 Missense_Mutation G G A test 0 0 0 p.E187K Signed out No High +SPEN 0 mskcc.org GRCh37 1 16199514 16199514 Missense_Mutation C C T test 0 0 0 p.S96F Signed out No High +SRSF2 0 mskcc.org GRCh37 17 74732283 74732283 Missense_Mutation G G A test 0 0 0 p.P209L Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 1869 1869 0 0 Signed out No Low +ANKRD11 0 mskcc.org GRCh37 16 89347285 89347285 Missense_Mutation T T C test 0 0 0 p.K1889E Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453136 Missense_Mutation A A T test 485 485 0 0 p.V600E Hotspot Signed out Yes High +CDKN2A 0 mskcc.org GRCh37 9 21971120 21971121 Nonsense_Mutation GG GG AA test 839 839 0 0 p.R80* Hotspot Signed out No High +CREBBP 0 mskcc.org GRCh37 16 3831298 3831298 Missense_Mutation G G A test 0 0 0 p.P528L Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31383227 31383227 Missense_Mutation G G A test 0 0 0 p.R380Q Signed out No High +EZH1 0 mskcc.org GRCh37 17 40855800 40855800 Nonsense_Mutation G G A test 0 0 0 p.R686* Signed out 
No High +KIT 0 mskcc.org GRCh37 4 55570044 55570044 Missense_Mutation C C T test 0 0 0 p.T304I Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32166850 32166850 Missense_Mutation C C T test 0 0 0 p.G1463E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9520177 9520177 Missense_Mutation G G A test 0 0 0 p.P698S Signed out No High +PIK3R1 0 mskcc.org GRCh37 5 67593308 67593308 Missense_Mutation A A G test 0 0 0 p.Y685C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41420106 41420106 Missense_Mutation C C T test 0 0 0 p.G72E Signed out No High +RAD51B 0 mskcc.org GRCh37 14 68352692 68352692 Missense_Mutation G G A test 0 0 0 p.E187K Signed out No High +SPEN 0 mskcc.org GRCh37 1 16199514 16199514 Missense_Mutation C C T test 0 0 0 p.S96F Signed out No High +SRSF2 0 mskcc.org GRCh37 17 74732283 74732283 Missense_Mutation G G A test 0 0 0 p.P209L Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 316 316 0 0 Signed out No Low +ANKRD11 0 mskcc.org GRCh37 16 89347285 89347285 Missense_Mutation T T C test 835 721 114 0.1365 p.K1889E Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453136 Missense_Mutation A A T test 661 485 176 0.2663 p.V600E Hotspot Signed out Yes High +CDKN2A 0 mskcc.org GRCh37 9 21971120 21971121 Nonsense_Mutation GG GG AA test 482 395 87 0.1805 p.R80* Hotspot Signed out No High +CREBBP 0 mskcc.org GRCh37 16 3831298 3831298 Missense_Mutation G G A test 525 465 60 0.1143 p.P528L Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31383227 31383227 Missense_Mutation G G A test 710 613 97 0.1366 p.R380Q Signed out No High +EZH1 0 mskcc.org GRCh37 17 40855800 40855800 Nonsense_Mutation G G A test 860 769 91 0.1058 p.R686* Signed out No High +KIT 0 mskcc.org GRCh37 4 55570044 55570044 Missense_Mutation C C T test 463 385 78 0.1685 p.T304I Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32166850 32166850 Missense_Mutation C C T test 1149 1015 134 0.1166 p.G1463E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9520177 9520177 
Missense_Mutation G G A test 765 673 92 0.1203 p.P698S Signed out No High +PIK3R1 0 mskcc.org GRCh37 5 67593308 67593308 Missense_Mutation A A G test 512 450 62 0.1211 p.Y685C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41420106 41420106 Missense_Mutation C C T test 351 309 42 0.1197 p.G72E Signed out No High +RAD51B 0 mskcc.org GRCh37 14 68352692 68352692 Missense_Mutation G G A test 482 422 60 0.1245 p.E187K Signed out No High +SPEN 0 mskcc.org GRCh37 1 16199514 16199514 Missense_Mutation C C T test 749 670 79 0.1055 p.S96F Signed out No High +SRSF2 0 mskcc.org GRCh37 17 74732283 74732283 Missense_Mutation G G A test 459 398 61 0.1329 p.P209L Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 382 282 100 0.2618 Signed out No Low +ARID1B 0 mskcc.org GRCh37 6 157528952 157528952 Missense_Mutation C C T test 0 0 0 p.S2226L Signed out No High +ARID2 0 mskcc.org GRCh37 12 46245843 46245843 Nonsense_Mutation C C T test 0 0 0 p.Q1313* Signed out No High +AXL 0 mskcc.org GRCh37 19 41745117 41745117 Missense_Mutation G G A test 0 0 0 p.G395R Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453137 Missense_Mutation AC AC TT test 842 632 210 0.2494 p.V600K Hotspot Signed out No High +BRCA2 0 mskcc.org GRCh37 13 32911493 32911493 Missense_Mutation T T C test 875 623 252 0.288 p.S1001P Signed out No High +BTK 0 mskcc.org GRCh37 X 100613674 100613674 Missense_Mutation C C T test 0 0 0 p.G302E Signed out No High +CARD11 0 mskcc.org GRCh37 7 2952942 2952942 Missense_Mutation C C T test 0 0 0 p.E1000K Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31384996 31384996 Missense_Mutation C C T test 0 0 0 p.R461C Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93965572 93965572 Missense_Mutation C C T test 0 0 0 p.D786N Signed out No High +ERCC2 0 mskcc.org GRCh37 19 45868119 45868119 Missense_Mutation G G A test 43 35 8 0.186 p.P191S Signed out No High 9.42E-06 +ESR1 0 mskcc.org GRCh37 6 152129330 152129330 Missense_Mutation G G A test 1 1 
0 0 p.G95R Signed out No High +FAT1 0 mskcc.org GRCh37 4 187629805 187629805 Missense_Mutation G G A test 0 0 0 p.P393S Signed out No High +HGF 0 mskcc.org GRCh37 7 81346589 81346589 Missense_Mutation C C T test 0 0 0 p.G455E Signed out No High +HIST3H3 0 mskcc.org GRCh37 1 228612807 228612807 Missense_Mutation C C T test 15 13 2 0.1333 p.E74K Signed out No High +HLA-A 0 mskcc.org GRCh37 6 29911302 29911302 Missense_Mutation G G A test 0 0 0 p.E201K Signed out No High +IKZF1 0 mskcc.org GRCh37 7 50367234 50367234 Missense_Mutation G G A test 0 0 0 p.G14E Signed out No High +IL7R 0 mskcc.org GRCh37 5 35876230 35876230 Missense_Mutation G G A test 0 0 0 p.G341E Signed out No High +KLF4 0 mskcc.org GRCh37 9 110249881 110249881 Missense_Mutation G G A test 0 0 0 p.P265L Signed out No High +KMT2C 0 mskcc.org GRCh37 7 151845361 151845362 Missense_Mutation CC CC TT test 0 0 0 p.G4551S Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Frame_Shift_Del G G - test 0 0 0 p.E110Kfs*15 Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Missense_Mutation G G A test 0 0 0 p.E110K Signed out No High +MAP2K1 0 mskcc.org GRCh37 15 66729162 66729163 Missense_Mutation CC CC TT test 1129 870 259 0.2294 p.P124L Hotspot Signed out No High +MET 0 mskcc.org GRCh37 7 116415001 116415001 Missense_Mutation C C T test 760 601 159 0.2092 p.S1032F Signed out No High +MGA 0 mskcc.org GRCh37 15 41961969 41961969 Missense_Mutation C C T test 0 0 0 p.R293C Signed out No High +MITF 0 mskcc.org GRCh37 3 69788756 69788757 Missense_Mutation CC CC TT test 0 0 0 p.S3F Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32183063 32183063 Missense_Mutation C C T test 0 0 0 p.G654E Signed out No High +NTRK2 0 mskcc.org GRCh37 9 87285689 87285689 Missense_Mutation G G A test 0 0 0 p.G9E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9546694 9546694 Missense_Mutation C C T test 1 0 1 1 p.G443E Signed out No High +PIK3CG 0 mskcc.org GRCh37 7 106509612 106509612 Missense_Mutation C C T test 0 
0 0 p.P536S Signed out No High +PPARG 0 mskcc.org GRCh37 3 12393114 12393114 Missense_Mutation C C T test 0 0 0 p.S8F Signed out No High +PRDM14 0 mskcc.org GRCh37 8 70970993 70970993 Missense_Mutation G G A test 0 0 0 p.P423L Signed out No High +PTEN 0 mskcc.org GRCh37 10 89717705 89717705 Missense_Mutation C C T test 571 425 146 0.2557 p.P244S Signed out No High +PTPRD 0 mskcc.org GRCh37 9 8484270 8484270 Missense_Mutation G G A test 0 0 0 p.R1088C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40980844 40980844 Missense_Mutation C C T test 0 0 0 p.E548K Hotspot Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41419877 41419878 Missense_Mutation CT CT TA test 0 0 0 p.K148I Signed out No High +ROS1 0 mskcc.org GRCh37 6 117710750 117710750 Missense_Mutation C C T test 0 0 0 p.D508N Signed out No High +RTEL1 0 mskcc.org GRCh37 20 62293236 62293236 Missense_Mutation C C A test 0 0 0 p.A112D Signed out No High +SYK 0 mskcc.org GRCh37 9 93624541 93624541 Missense_Mutation G G A test 0 0 0 p.G211E Signed out No High +SYK 0 mskcc.org GRCh37 9 93637113 93637113 Missense_Mutation G G A test 0 0 0 p.G388D Signed out No High +TEK 0 mskcc.org GRCh37 9 27173277 27173277 Missense_Mutation G G A test 0 0 0 p.G273E Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295229 5'Flank GG GG AA test 594 539 55 0.0926 Signed out No Low +TP53 0 mskcc.org GRCh37 17 7577539 7577539 Missense_Mutation G G A test 691 494 197 0.2851 p.R248W Hotspot Signed out Yes High +ARID1B 0 mskcc.org GRCh37 6 157528952 157528952 Missense_Mutation C C T test 0 0 0 p.S2226L Signed out No High +ARID2 0 mskcc.org GRCh37 12 46245843 46245843 Nonsense_Mutation C C T test 0 0 0 p.Q1313* Signed out No High +AXL 0 mskcc.org GRCh37 19 41745117 41745117 Missense_Mutation G G A test 0 0 0 p.G395R Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453137 Missense_Mutation AC AC TT test 647 542 105 0.1623 p.V600K Hotspot Signed out No High +BRCA2 0 mskcc.org GRCh37 13 32911493 32911493 Missense_Mutation T T 
C test 649 544 105 0.1618 p.S1001P Signed out No High +BTK 0 mskcc.org GRCh37 X 100613674 100613674 Missense_Mutation C C T test 0 0 0 p.G302E Signed out No High +CARD11 0 mskcc.org GRCh37 7 2952942 2952942 Missense_Mutation C C T test 0 0 0 p.E1000K Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31384996 31384996 Missense_Mutation C C T test 0 0 0 p.R461C Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93965572 93965572 Missense_Mutation C C T test 0 0 0 p.D786N Signed out No High +ERCC2 0 mskcc.org GRCh37 19 45868119 45868119 Missense_Mutation G G A test 33 22 11 0.3333 p.P191S Signed out No High 9.42E-06 +ESR1 0 mskcc.org GRCh37 6 152129330 152129330 Missense_Mutation G G A test 1 1 0 0 p.G95R Signed out No High +FAT1 0 mskcc.org GRCh37 4 187629805 187629805 Missense_Mutation G G A test 0 0 0 p.P393S Signed out No High +HGF 0 mskcc.org GRCh37 7 81346589 81346589 Missense_Mutation C C T test 0 0 0 p.G455E Signed out No High +HIST3H3 0 mskcc.org GRCh37 1 228612807 228612807 Missense_Mutation C C T test 11 11 0 0 p.E74K Signed out No High +HLA-A 0 mskcc.org GRCh37 6 29911302 29911302 Missense_Mutation G G A test 0 0 0 p.E201K Signed out No High +IKZF1 0 mskcc.org GRCh37 7 50367234 50367234 Missense_Mutation G G A test 0 0 0 p.G14E Signed out No High +IL7R 0 mskcc.org GRCh37 5 35876230 35876230 Missense_Mutation G G A test 1 1 0 0 p.G341E Signed out No High +KLF4 0 mskcc.org GRCh37 9 110249881 110249881 Missense_Mutation G G A test 1 1 0 0 p.P265L Signed out No High +KMT2C 0 mskcc.org GRCh37 7 151845361 151845362 Missense_Mutation CC CC TT test 0 0 0 p.G4551S Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Frame_Shift_Del G G - test 0 0 0 p.E110Kfs*15 Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Missense_Mutation G G A test 0 0 0 p.E110K Signed out No High +MAP2K1 0 mskcc.org GRCh37 15 66729162 66729163 Missense_Mutation CC CC TT test 851 732 119 0.1398 p.P124L Hotspot Signed out No High +MET 0 mskcc.org GRCh37 7 116415001 116415001 
Missense_Mutation C C T test 564 479 85 0.1507 p.S1032F Signed out No High +MGA 0 mskcc.org GRCh37 15 41961969 41961969 Missense_Mutation C C T test 0 0 0 p.R293C Signed out No High +MITF 0 mskcc.org GRCh37 3 69788756 69788757 Missense_Mutation CC CC TT test 0 0 0 p.S3F Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32183063 32183063 Missense_Mutation C C T test 0 0 0 p.G654E Signed out No High +NTRK2 0 mskcc.org GRCh37 9 87285689 87285689 Missense_Mutation G G A test 0 0 0 p.G9E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9546694 9546694 Missense_Mutation C C T test 0 0 0 p.G443E Signed out No High +PIK3CG 0 mskcc.org GRCh37 7 106509612 106509612 Missense_Mutation C C T test 0 0 0 p.P536S Signed out No High +PPARG 0 mskcc.org GRCh37 3 12393114 12393114 Missense_Mutation C C T test 0 0 0 p.S8F Signed out No High +PRDM14 0 mskcc.org GRCh37 8 70970993 70970993 Missense_Mutation G G A test 0 0 0 p.P423L Signed out No High +PTEN 0 mskcc.org GRCh37 10 89717705 89717705 Missense_Mutation C C T test 441 389 52 0.1179 p.P244S Signed out No High +PTPRD 0 mskcc.org GRCh37 9 8484270 8484270 Missense_Mutation G G A test 0 0 0 p.R1088C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40980844 40980844 Missense_Mutation C C T test 0 0 0 p.E548K Hotspot Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41419877 41419878 Missense_Mutation CT CT TA test 0 0 0 p.K148I Signed out No High +ROS1 0 mskcc.org GRCh37 6 117710750 117710750 Missense_Mutation C C T test 0 0 0 p.D508N Signed out No High +RTEL1 0 mskcc.org GRCh37 20 62293236 62293236 Missense_Mutation C C A test 0 0 0 p.A112D Signed out No High +SYK 0 mskcc.org GRCh37 9 93624541 93624541 Missense_Mutation G G A test 0 0 0 p.G211E Signed out No High +SYK 0 mskcc.org GRCh37 9 93637113 93637113 Missense_Mutation G G A test 0 0 0 p.G388D Signed out No High +TEK 0 mskcc.org GRCh37 9 27173277 27173277 Missense_Mutation G G A test 0 0 0 p.G273E Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295229 5'Flank GG GG AA test 517 
468 49 0.0948 Signed out No Low +TP53 0 mskcc.org GRCh37 17 7577539 7577539 Missense_Mutation G G A test 574 484 90 0.1568 p.R248W Hotspot Signed out Yes High +ARID1B 0 mskcc.org GRCh37 6 157528952 157528952 Missense_Mutation C C T test 0 0 0 p.S2226L Signed out No High +ARID2 0 mskcc.org GRCh37 12 46245843 46245843 Nonsense_Mutation C C T test 0 0 0 p.Q1313* Signed out No High +AXL 0 mskcc.org GRCh37 19 41745117 41745117 Missense_Mutation G G A test 0 0 0 p.G395R Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453137 Missense_Mutation AC AC TT test 2557 2544 13 0.0051 p.V600K Hotspot Signed out No High +BRCA2 0 mskcc.org GRCh37 13 32911493 32911493 Missense_Mutation T T C test 3170 3148 22 0.0069 p.S1001P Signed out No High +BTK 0 mskcc.org GRCh37 X 100613674 100613674 Missense_Mutation C C T test 0 0 0 p.G302E Signed out No High +CARD11 0 mskcc.org GRCh37 7 2952942 2952942 Missense_Mutation C C T test 0 0 0 p.E1000K Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31384996 31384996 Missense_Mutation C C T test 0 0 0 p.R461C Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93965572 93965572 Missense_Mutation C C T test 0 0 0 p.D786N Signed out No High +ERCC2 0 mskcc.org GRCh37 19 45868119 45868119 Missense_Mutation G G A test 58 58 0 0 p.P191S Signed out No High 9.42E-06 +ESR1 0 mskcc.org GRCh37 6 152129330 152129330 Missense_Mutation G G A test 1 1 0 0 p.G95R Signed out No High +FAT1 0 mskcc.org GRCh37 4 187629805 187629805 Missense_Mutation G G A test 0 0 0 p.P393S Signed out No High +HGF 0 mskcc.org GRCh37 7 81346589 81346589 Missense_Mutation C C T test 0 0 0 p.G455E Signed out No High +HIST3H3 0 mskcc.org GRCh37 1 228612807 228612807 Missense_Mutation C C T test 17 17 0 0 p.E74K Signed out No High +HLA-A 0 mskcc.org GRCh37 6 29911302 29911302 Missense_Mutation G G A test 0 0 0 p.E201K Signed out No High +IKZF1 0 mskcc.org GRCh37 7 50367234 50367234 Missense_Mutation G G A test 0 0 0 p.G14E Signed out No High +IL7R 0 mskcc.org GRCh37 5 35876230 
35876230 Missense_Mutation G G A test 0 0 0 p.G341E Signed out No High +KLF4 0 mskcc.org GRCh37 9 110249881 110249881 Missense_Mutation G G A test 0 0 0 p.P265L Signed out No High +KMT2C 0 mskcc.org GRCh37 7 151845361 151845362 Missense_Mutation CC CC TT test 0 0 0 p.G4551S Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Frame_Shift_Del G G - test 0 0 0 p.E110Kfs*15 Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Missense_Mutation G G A test 0 0 0 p.E110K Signed out No High +MAP2K1 0 mskcc.org GRCh37 15 66729162 66729163 Missense_Mutation CC CC TT test 3871 3850 21 0.0054 p.P124L Hotspot Signed out No High +MET 0 mskcc.org GRCh37 7 116415001 116415001 Missense_Mutation C C T test 1521 1513 8 0.0053 p.S1032F Signed out No High +MGA 0 mskcc.org GRCh37 15 41961969 41961969 Missense_Mutation C C T test 0 0 0 p.R293C Signed out No High +MITF 0 mskcc.org GRCh37 3 69788756 69788757 Missense_Mutation CC CC TT test 0 0 0 p.S3F Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32183063 32183063 Missense_Mutation C C T test 0 0 0 p.G654E Signed out No High +NTRK2 0 mskcc.org GRCh37 9 87285689 87285689 Missense_Mutation G G A test 0 0 0 p.G9E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9546694 9546694 Missense_Mutation C C T test 0 0 0 p.G443E Signed out No High +PIK3CG 0 mskcc.org GRCh37 7 106509612 106509612 Missense_Mutation C C T test 0 0 0 p.P536S Signed out No High +PPARG 0 mskcc.org GRCh37 3 12393114 12393114 Missense_Mutation C C T test 0 0 0 p.S8F Signed out No High +PRDM14 0 mskcc.org GRCh37 8 70970993 70970993 Missense_Mutation G G A test 0 0 0 p.P423L Signed out No High +PTEN 0 mskcc.org GRCh37 10 89717705 89717705 Missense_Mutation C C T test 2270 2265 5 0.0022 p.P244S Signed out No High +PTPRD 0 mskcc.org GRCh37 9 8484270 8484270 Missense_Mutation G G A test 0 0 0 p.R1088C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40980844 40980844 Missense_Mutation C C T test 0 0 0 p.E548K Hotspot Signed out No High +PTPRT 0 mskcc.org GRCh37 
20 41419877 41419878 Missense_Mutation CT CT TA test 0 0 0 p.K148I Signed out No High +ROS1 0 mskcc.org GRCh37 6 117710750 117710750 Missense_Mutation C C T test 0 0 0 p.D508N Signed out No High +RTEL1 0 mskcc.org GRCh37 20 62293236 62293236 Missense_Mutation C C A test 0 0 0 p.A112D Signed out No High +SYK 0 mskcc.org GRCh37 9 93624541 93624541 Missense_Mutation G G A test 0 0 0 p.G211E Signed out No High +SYK 0 mskcc.org GRCh37 9 93637113 93637113 Missense_Mutation G G A test 0 0 0 p.G388D Signed out No High +TEK 0 mskcc.org GRCh37 9 27173277 27173277 Missense_Mutation G G A test 0 0 0 p.G273E Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295229 5'Flank GG GG AA test 2258 2254 4 0.0018 Signed out No Low +TP53 0 mskcc.org GRCh37 17 7577539 7577539 Missense_Mutation G G A test 2211 2203 8 0.0036 p.R248W Hotspot Signed out Yes High +ARID1B 0 mskcc.org GRCh37 6 157528952 157528952 Missense_Mutation C C T test 0 0 0 p.S2226L Signed out No High +ARID2 0 mskcc.org GRCh37 12 46245843 46245843 Nonsense_Mutation C C T test 0 0 0 p.Q1313* Signed out No High +AXL 0 mskcc.org GRCh37 19 41745117 41745117 Missense_Mutation G G A test 1 1 0 0 p.G395R Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453137 Missense_Mutation AC AC TT test 2582 2581 1 0.0004 p.V600K Hotspot Signed out No High +BRCA2 0 mskcc.org GRCh37 13 32911493 32911493 Missense_Mutation T T C test 3018 3018 0 0 p.S1001P Signed out No High +BTK 0 mskcc.org GRCh37 X 100613674 100613674 Missense_Mutation C C T test 0 0 0 p.G302E Signed out No High +CARD11 0 mskcc.org GRCh37 7 2952942 2952942 Missense_Mutation C C T test 0 0 0 p.E1000K Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31384996 31384996 Missense_Mutation C C T test 0 0 0 p.R461C Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93965572 93965572 Missense_Mutation C C T test 0 0 0 p.D786N Signed out No High +ERCC2 0 mskcc.org GRCh37 19 45868119 45868119 Missense_Mutation G G A test 63 63 0 0 p.P191S Signed out No High 9.42E-06 +ESR1 0 
mskcc.org GRCh37 6 152129330 152129330 Missense_Mutation G G A test 0 0 0 p.G95R Signed out No High +FAT1 0 mskcc.org GRCh37 4 187629805 187629805 Missense_Mutation G G A test 0 0 0 p.P393S Signed out No High +HGF 0 mskcc.org GRCh37 7 81346589 81346589 Missense_Mutation C C T test 0 0 0 p.G455E Signed out No High +HIST3H3 0 mskcc.org GRCh37 1 228612807 228612807 Missense_Mutation C C T test 28 28 0 0 p.E74K Signed out No High +HLA-A 0 mskcc.org GRCh37 6 29911302 29911302 Missense_Mutation G G A test 0 0 0 p.E201K Signed out No High +IKZF1 0 mskcc.org GRCh37 7 50367234 50367234 Missense_Mutation G G A test 0 0 0 p.G14E Signed out No High +IL7R 0 mskcc.org GRCh37 5 35876230 35876230 Missense_Mutation G G A test 0 0 0 p.G341E Signed out No High +KLF4 0 mskcc.org GRCh37 9 110249881 110249881 Missense_Mutation G G A test 0 0 0 p.P265L Signed out No High +KMT2C 0 mskcc.org GRCh37 7 151845361 151845362 Missense_Mutation CC CC TT test 0 0 0 p.G4551S Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Frame_Shift_Del G G - test 0 0 0 p.E110Kfs*15 Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Missense_Mutation G G A test 0 0 0 p.E110K Signed out No High +MAP2K1 0 mskcc.org GRCh37 15 66729162 66729163 Missense_Mutation CC CC TT test 3838 3838 0 0 p.P124L Hotspot Signed out No High +MET 0 mskcc.org GRCh37 7 116415001 116415001 Missense_Mutation C C T test 1790 1790 0 0 p.S1032F Signed out No High +MGA 0 mskcc.org GRCh37 15 41961969 41961969 Missense_Mutation C C T test 0 0 0 p.R293C Signed out No High +MITF 0 mskcc.org GRCh37 3 69788756 69788757 Missense_Mutation CC CC TT test 0 0 0 p.S3F Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32183063 32183063 Missense_Mutation C C T test 0 0 0 p.G654E Signed out No High +NTRK2 0 mskcc.org GRCh37 9 87285689 87285689 Missense_Mutation G G A test 0 0 0 p.G9E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9546694 9546694 Missense_Mutation C C T test 0 0 0 p.G443E Signed out No High +PIK3CG 0 mskcc.org GRCh37 
7 106509612 106509612 Missense_Mutation C C T test 0 0 0 p.P536S Signed out No High +PPARG 0 mskcc.org GRCh37 3 12393114 12393114 Missense_Mutation C C T test 0 0 0 p.S8F Signed out No High +PRDM14 0 mskcc.org GRCh37 8 70970993 70970993 Missense_Mutation G G A test 0 0 0 p.P423L Signed out No High +PTEN 0 mskcc.org GRCh37 10 89717705 89717705 Missense_Mutation C C T test 2233 2233 0 0 p.P244S Signed out No High +PTPRD 0 mskcc.org GRCh37 9 8484270 8484270 Missense_Mutation G G A test 0 0 0 p.R1088C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40980844 40980844 Missense_Mutation C C T test 0 0 0 p.E548K Hotspot Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41419877 41419878 Missense_Mutation CT CT TA test 0 0 0 p.K148I Signed out No High +ROS1 0 mskcc.org GRCh37 6 117710750 117710750 Missense_Mutation C C T test 0 0 0 p.D508N Signed out No High +RTEL1 0 mskcc.org GRCh37 20 62293236 62293236 Missense_Mutation C C A test 0 0 0 p.A112D Signed out No High +SYK 0 mskcc.org GRCh37 9 93624541 93624541 Missense_Mutation G G A test 0 0 0 p.G211E Signed out No High +SYK 0 mskcc.org GRCh37 9 93637113 93637113 Missense_Mutation G G A test 0 0 0 p.G388D Signed out No High +TEK 0 mskcc.org GRCh37 9 27173277 27173277 Missense_Mutation G G A test 0 0 0 p.G273E Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295229 5'Flank GG GG AA test 2365 2365 0 0 Signed out No Low +TP53 0 mskcc.org GRCh37 17 7577539 7577539 Missense_Mutation G G A test 2107 2106 1 0.0005 p.R248W Hotspot Signed out Yes High +ARID1B 0 mskcc.org GRCh37 6 157528952 157528952 Missense_Mutation C C T test 0 0 0 p.S2226L Signed out No High +ARID2 0 mskcc.org GRCh37 12 46245843 46245843 Nonsense_Mutation C C T test 0 0 0 p.Q1313* Signed out No High +AXL 0 mskcc.org GRCh37 19 41745117 41745117 Missense_Mutation G G A test 0 0 0 p.G395R Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453137 Missense_Mutation AC AC TT test 458 458 0 0 p.V600K Hotspot Signed out No High +BRCA2 0 mskcc.org GRCh37 13 
32911493 32911493 Missense_Mutation T T C test 547 547 0 0 p.S1001P Signed out No High +BTK 0 mskcc.org GRCh37 X 100613674 100613674 Missense_Mutation C C T test 0 0 0 p.G302E Signed out No High +CARD11 0 mskcc.org GRCh37 7 2952942 2952942 Missense_Mutation C C T test 0 0 0 p.E1000K Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31384996 31384996 Missense_Mutation C C T test 0 0 0 p.R461C Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93965572 93965572 Missense_Mutation C C T test 0 0 0 p.D786N Signed out No High +ERCC2 0 mskcc.org GRCh37 19 45868119 45868119 Missense_Mutation G G A test 35 35 0 0 p.P191S Signed out No High 9.42E-06 +ESR1 0 mskcc.org GRCh37 6 152129330 152129330 Missense_Mutation G G A test 0 0 0 p.G95R Signed out No High +FAT1 0 mskcc.org GRCh37 4 187629805 187629805 Missense_Mutation G G A test 0 0 0 p.P393S Signed out No High +HGF 0 mskcc.org GRCh37 7 81346589 81346589 Missense_Mutation C C T test 0 0 0 p.G455E Signed out No High +HIST3H3 0 mskcc.org GRCh37 1 228612807 228612807 Missense_Mutation C C T test 5 5 0 0 p.E74K Signed out No High +HLA-A 0 mskcc.org GRCh37 6 29911302 29911302 Missense_Mutation G G A test 0 0 0 p.E201K Signed out No High +IKZF1 0 mskcc.org GRCh37 7 50367234 50367234 Missense_Mutation G G A test 0 0 0 p.G14E Signed out No High +IL7R 0 mskcc.org GRCh37 5 35876230 35876230 Missense_Mutation G G A test 0 0 0 p.G341E Signed out No High +KLF4 0 mskcc.org GRCh37 9 110249881 110249881 Missense_Mutation G G A test 0 0 0 p.P265L Signed out No High +KMT2C 0 mskcc.org GRCh37 7 151845361 151845362 Missense_Mutation CC CC TT test 0 0 0 p.G4551S Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Frame_Shift_Del G G - test 0 0 0 p.E110Kfs*15 Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Missense_Mutation G G A test 0 0 0 p.E110K Signed out No High +MAP2K1 0 mskcc.org GRCh37 15 66729162 66729163 Missense_Mutation CC CC TT test 815 815 0 0 p.P124L Hotspot Signed out No High +MET 0 mskcc.org GRCh37 7 
116415001 116415001 Missense_Mutation C C T test 518 518 0 0 p.S1032F Signed out No High +MGA 0 mskcc.org GRCh37 15 41961969 41961969 Missense_Mutation C C T test 0 0 0 p.R293C Signed out No High +MITF 0 mskcc.org GRCh37 3 69788756 69788757 Missense_Mutation CC CC TT test 0 0 0 p.S3F Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32183063 32183063 Missense_Mutation C C T test 0 0 0 p.G654E Signed out No High +NTRK2 0 mskcc.org GRCh37 9 87285689 87285689 Missense_Mutation G G A test 0 0 0 p.G9E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9546694 9546694 Missense_Mutation C C T test 0 0 0 p.G443E Signed out No High +PIK3CG 0 mskcc.org GRCh37 7 106509612 106509612 Missense_Mutation C C T test 0 0 0 p.P536S Signed out No High +PPARG 0 mskcc.org GRCh37 3 12393114 12393114 Missense_Mutation C C T test 0 0 0 p.S8F Signed out No High +PRDM14 0 mskcc.org GRCh37 8 70970993 70970993 Missense_Mutation G G A test 0 0 0 p.P423L Signed out No High +PTEN 0 mskcc.org GRCh37 10 89717705 89717705 Missense_Mutation C C T test 497 497 0 0 p.P244S Signed out No High +PTPRD 0 mskcc.org GRCh37 9 8484270 8484270 Missense_Mutation G G A test 0 0 0 p.R1088C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40980844 40980844 Missense_Mutation C C T test 0 0 0 p.E548K Hotspot Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41419877 41419878 Missense_Mutation CT CT TA test 0 0 0 p.K148I Signed out No High +ROS1 0 mskcc.org GRCh37 6 117710750 117710750 Missense_Mutation C C T test 0 0 0 p.D508N Signed out No High +RTEL1 0 mskcc.org GRCh37 20 62293236 62293236 Missense_Mutation C C A test 0 0 0 p.A112D Signed out No High +SYK 0 mskcc.org GRCh37 9 93624541 93624541 Missense_Mutation G G A test 0 0 0 p.G211E Signed out No High +SYK 0 mskcc.org GRCh37 9 93637113 93637113 Missense_Mutation G G A test 0 0 0 p.G388D Signed out No High +TEK 0 mskcc.org GRCh37 9 27173277 27173277 Missense_Mutation G G A test 0 0 0 p.G273E Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295229 5'Flank GG GG AA 
test 330 330 0 0 Signed out No Low +TP53 0 mskcc.org GRCh37 17 7577539 7577539 Missense_Mutation G G A test 466 466 0 0 p.R248W Hotspot Signed out Yes High +ARID1B 0 mskcc.org GRCh37 6 157528952 157528952 Missense_Mutation C C T test 1007 538 469 0.4657 p.S2226L Signed out No High +ARID2 0 mskcc.org GRCh37 12 46245843 46245843 Nonsense_Mutation C C T test 537 45 492 0.9162 p.Q1313* Signed out No High +AXL 0 mskcc.org GRCh37 19 41745117 41745117 Missense_Mutation G G A test 1183 647 536 0.4531 p.G395R Signed out No High +BRAF 0 mskcc.org GRCh37 7 140453136 140453137 Missense_Mutation AC AC TT test 1217 636 581 0.4774 p.V600K Hotspot Signed out No High +BRCA2 0 mskcc.org GRCh37 13 32911493 32911493 Missense_Mutation T T C test 1414 836 578 0.4088 p.S1001P Signed out No High +BTK 0 mskcc.org GRCh37 X 100613674 100613674 Missense_Mutation C C T test 471 40 431 0.9151 p.G302E Signed out No High +CARD11 0 mskcc.org GRCh37 7 2952942 2952942 Missense_Mutation C C T test 803 422 381 0.4745 p.E1000K Signed out No High +DNMT3B 0 mskcc.org GRCh37 20 31384996 31384996 Missense_Mutation C C T test 934 500 434 0.4647 p.R461C Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93965572 93965572 Missense_Mutation C C T test 964 516 448 0.4647 p.D786N Signed out No High +ERCC2 0 mskcc.org GRCh37 19 45868119 45868119 Missense_Mutation G G A test 1598 813 785 0.4912 p.P191S Signed out No High 9.42E-06 +ESR1 0 mskcc.org GRCh37 6 152129330 152129330 Missense_Mutation G G A test 699 415 284 0.4063 p.G95R Signed out No High +FAT1 0 mskcc.org GRCh37 4 187629805 187629805 Missense_Mutation G G A test 1031 65 966 0.937 p.P393S Signed out No High +HGF 0 mskcc.org GRCh37 7 81346589 81346589 Missense_Mutation C C T test 1046 572 474 0.4532 p.G455E Signed out No High +HIST3H3 0 mskcc.org GRCh37 1 228612807 228612807 Missense_Mutation C C T test 1156 599 557 0.4818 p.E74K Signed out No High +HLA-A 0 mskcc.org GRCh37 6 29911302 29911302 Missense_Mutation G G A test 712 587 125 0.1756 p.E201K Signed out 
No High +IKZF1 0 mskcc.org GRCh37 7 50367234 50367234 Missense_Mutation G G A test 447 272 175 0.3915 p.G14E Signed out No High +IL7R 0 mskcc.org GRCh37 5 35876230 35876230 Missense_Mutation G G A test 811 446 365 0.4501 p.G341E Signed out No High +KLF4 0 mskcc.org GRCh37 9 110249881 110249881 Missense_Mutation G G A test 858 422 436 0.5082 p.P265L Signed out No High +KMT2C 0 mskcc.org GRCh37 7 151845361 151845362 Missense_Mutation CC CC TT test 1216 745 471 0.3873 p.G4551S Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Frame_Shift_Del G G - test 590 556 34 0.0576 p.E110Kfs*15 Signed out No High +LYN 0 mskcc.org GRCh37 8 56863061 56863061 Missense_Mutation G G A test 590 96 494 0.8373 p.E110K Signed out No High +MAP2K1 0 mskcc.org GRCh37 15 66729162 66729163 Missense_Mutation CC CC TT test 1581 927 654 0.4137 p.P124L Hotspot Signed out No High +MET 0 mskcc.org GRCh37 7 116415001 116415001 Missense_Mutation C C T test 1904 1036 868 0.4559 p.S1032F Signed out No High +MGA 0 mskcc.org GRCh37 15 41961969 41961969 Missense_Mutation C C T test 1442 782 660 0.4577 p.R293C Signed out No High +MITF 0 mskcc.org GRCh37 3 69788756 69788757 Missense_Mutation CC CC TT test 890 469 421 0.473 p.S3F Signed out No High +NOTCH4 0 mskcc.org GRCh37 6 32183063 32183063 Missense_Mutation C C T test 1512 768 744 0.4921 p.G654E Signed out No High +NTRK2 0 mskcc.org GRCh37 9 87285689 87285689 Missense_Mutation G G A test 866 481 385 0.4446 p.G9E Signed out No High +PAK7 0 mskcc.org GRCh37 20 9546694 9546694 Missense_Mutation C C T test 481 236 245 0.5094 p.G443E Signed out No High +PIK3CG 0 mskcc.org GRCh37 7 106509612 106509612 Missense_Mutation C C T test 1103 553 550 0.4986 p.P536S Signed out No High +PPARG 0 mskcc.org GRCh37 3 12393114 12393114 Missense_Mutation C C T test 1319 741 578 0.4382 p.S8F Signed out No High +PRDM14 0 mskcc.org GRCh37 8 70970993 70970993 Missense_Mutation G G A test 829 110 719 0.8673 p.P423L Signed out No High +PTEN 0 mskcc.org GRCh37 10 
89717705 89717705 Missense_Mutation C C T test 1345 721 624 0.4639 p.P244S Signed out No High +PTPRD 0 mskcc.org GRCh37 9 8484270 8484270 Missense_Mutation G G A test 1128 515 613 0.5434 p.R1088C Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40980844 40980844 Missense_Mutation C C T test 846 463 383 0.4527 p.E548K Hotspot Signed out No High +PTPRT 0 mskcc.org GRCh37 20 41419877 41419878 Missense_Mutation CT CT TA test 687 403 284 0.4134 p.K148I Signed out No High +ROS1 0 mskcc.org GRCh37 6 117710750 117710750 Missense_Mutation C C T test 707 382 325 0.4597 p.D508N Signed out No High +RTEL1 0 mskcc.org GRCh37 20 62293236 62293236 Missense_Mutation C C A test 1552 832 720 0.4639 p.A112D Signed out No High +SYK 0 mskcc.org GRCh37 9 93624541 93624541 Missense_Mutation G G A test 685 369 316 0.4613 p.G211E Signed out No High +SYK 0 mskcc.org GRCh37 9 93637113 93637113 Missense_Mutation G G A test 836 434 402 0.4809 p.G388D Signed out No High +TEK 0 mskcc.org GRCh37 9 27173277 27173277 Missense_Mutation G G A test 1692 899 793 0.4687 p.G273E Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295229 5'Flank GG GG AA test 277 126 151 0.5451 Signed out No Low +TP53 0 mskcc.org GRCh37 17 7577539 7577539 Missense_Mutation G G A test 737 89 648 0.8792 p.R248W Hotspot Signed out Yes High +AGO2 0 mskcc.org GRCh37 8 141554401 141554401 Missense_Mutation G G A test 0 0 0 p.P584S Signed out No High +ARID1B 0 mskcc.org GRCh37 6 157528705 157528705 Missense_Mutation C C G test 0 0 0 p.L2144V Signed out No High +DIS3 0 mskcc.org GRCh37 13 73350187 73350187 Missense_Mutation G G A test 0 0 0 p.P233L Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93979282 93979282 Missense_Mutation G G A test 0 0 0 p.R516W Signed out No High +GNAS 0 mskcc.org GRCh37 20 57485864 57485864 Missense_Mutation C C T test 0 0 0 p.R389C Signed out No High +INPPL1 0 mskcc.org GRCh37 11 71943744 71943744 Missense_Mutation C C T test 0 0 0 p.S596F Signed out No High +PTPRS 0 mskcc.org GRCh37 19 5231320 
5231321 In_Frame_Ins - - CATCCTCGT test 0 0 0 p.D716_D718dup Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40727065 40727065 Missense_Mutation G G T test 0 0 0 p.A1300D Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 1552 1552 0 0 Signed out No Low +AGO2 0 mskcc.org GRCh37 8 141554401 141554401 Missense_Mutation G G A test 0 0 0 p.P584S Signed out No High +ARID1B 0 mskcc.org GRCh37 6 157528705 157528705 Missense_Mutation C C G test 0 0 0 p.L2144V Signed out No High +DIS3 0 mskcc.org GRCh37 13 73350187 73350187 Missense_Mutation G G A test 0 0 0 p.P233L Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93979282 93979282 Missense_Mutation G G A test 1 1 0 0 p.R516W Signed out No High +GNAS 0 mskcc.org GRCh37 20 57485864 57485864 Missense_Mutation C C T test 0 0 0 p.R389C Signed out No High +INPPL1 0 mskcc.org GRCh37 11 71943744 71943744 Missense_Mutation C C T test 0 0 0 p.S596F Signed out No High +PTPRS 0 mskcc.org GRCh37 19 5231320 5231321 In_Frame_Ins - - CATCCTCGT test 0 0 0 p.D716_D718dup Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40727065 40727065 Missense_Mutation G G T test 0 0 0 p.A1300D Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 1503 1503 0 0 Signed out No Low +AGO2 0 mskcc.org GRCh37 8 141554401 141554401 Missense_Mutation G G A test 0 0 0 p.P584S Signed out No High +ARID1B 0 mskcc.org GRCh37 6 157528705 157528705 Missense_Mutation C C G test 0 0 0 p.L2144V Signed out No High +DIS3 0 mskcc.org GRCh37 13 73350187 73350187 Missense_Mutation G G A test 0 0 0 p.P233L Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93979282 93979282 Missense_Mutation G G A test 0 0 0 p.R516W Signed out No High +GNAS 0 mskcc.org GRCh37 20 57485864 57485864 Missense_Mutation C C T test 0 0 0 p.R389C Signed out No High +INPPL1 0 mskcc.org GRCh37 11 71943744 71943744 Missense_Mutation C C T test 0 0 0 p.S596F Signed out No High +PTPRS 0 mskcc.org GRCh37 19 5231320 5231321 In_Frame_Ins - - CATCCTCGT test 0 0 0 
p.D716_D718dup Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40727065 40727065 Missense_Mutation G G T test 0 0 0 p.A1300D Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 1360 1360 0 0 Signed out No Low +AGO2 0 mskcc.org GRCh37 8 141554401 141554401 Missense_Mutation G G A test 0 0 0 p.P584S Signed out No High +ARID1B 0 mskcc.org GRCh37 6 157528705 157528705 Missense_Mutation C C G test 0 0 0 p.L2144V Signed out No High +DIS3 0 mskcc.org GRCh37 13 73350187 73350187 Missense_Mutation G G A test 0 0 0 p.P233L Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93979282 93979282 Missense_Mutation G G A test 0 0 0 p.R516W Signed out No High +GNAS 0 mskcc.org GRCh37 20 57485864 57485864 Missense_Mutation C C T test 0 0 0 p.R389C Signed out No High +INPPL1 0 mskcc.org GRCh37 11 71943744 71943744 Missense_Mutation C C T test 0 0 0 p.S596F Signed out No High +PTPRS 0 mskcc.org GRCh37 19 5231320 5231321 In_Frame_Ins - - CATCCTCGT test 0 0 0 p.D716_D718dup Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40727065 40727065 Missense_Mutation G G T test 0 0 0 p.A1300D Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 1649 1649 0 0 Signed out No Low +AGO2 0 mskcc.org GRCh37 8 141554401 141554401 Missense_Mutation G G A test 0 0 0 p.P584S Signed out No High +ARID1B 0 mskcc.org GRCh37 6 157528705 157528705 Missense_Mutation C C G test 0 0 0 p.L2144V Signed out No High +DIS3 0 mskcc.org GRCh37 13 73350187 73350187 Missense_Mutation G G A test 0 0 0 p.P233L Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93979282 93979282 Missense_Mutation G G A test 0 0 0 p.R516W Signed out No High +GNAS 0 mskcc.org GRCh37 20 57485864 57485864 Missense_Mutation C C T test 0 0 0 p.R389C Signed out No High +INPPL1 0 mskcc.org GRCh37 11 71943744 71943744 Missense_Mutation C C T test 0 0 0 p.S596F Signed out No High +PTPRS 0 mskcc.org GRCh37 19 5231320 5231321 In_Frame_Ins - - CATCCTCGT test 0 0 0 p.D716_D718dup Signed out No High +PTPRT 0 
mskcc.org GRCh37 20 40727065 40727065 Missense_Mutation G G T test 0 0 0 p.A1300D Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 202 202 0 0 Signed out No Low +AGO2 0 mskcc.org GRCh37 8 141554401 141554401 Missense_Mutation G G A test 1339 1014 325 0.2427 p.P584S Signed out No High +ARID1B 0 mskcc.org GRCh37 6 157528705 157528705 Missense_Mutation C C G test 589 472 117 0.1986 p.L2144V Signed out No High +DIS3 0 mskcc.org GRCh37 13 73350187 73350187 Missense_Mutation G G A test 500 416 84 0.168 p.P233L Signed out No High +EPHA7 0 mskcc.org GRCh37 6 93979282 93979282 Missense_Mutation G G A test 496 285 211 0.4254 p.R516W Signed out No High +GNAS 0 mskcc.org GRCh37 20 57485864 57485864 Missense_Mutation C C T test 881 522 359 0.4075 p.R389C Signed out No High +INPPL1 0 mskcc.org GRCh37 11 71943744 71943744 Missense_Mutation C C T test 2712 2281 431 0.1589 p.S596F Signed out No High +PTPRS 0 mskcc.org GRCh37 19 5231320 5231321 In_Frame_Ins - - CATCCTCGT test 554 448 106 0.1913 p.D716_D718dup Signed out No High +PTPRT 0 mskcc.org GRCh37 20 40727065 40727065 Missense_Mutation G G T test 703 598 105 0.1494 p.A1300D Signed out No High +TERT 0 mskcc.org GRCh37 5 1295228 1295228 5'Flank G G A test 439 338 101 0.2301 Signed out No Low \ No newline at end of file diff --git a/python/convert_csv_to_maf/example_output.xlsx b/python/convert_csv_to_maf/example_output.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..3e88d8998fc7e9c5d1db6f7433bdf09d6e4eedf6 GIT binary patch literal 42445 zcmeFY1zVd@*CiYrio3fzrMMR_?(T&acXugXv;>NKp@rb?uEik)DDF;iXF{KMzM1)E z{=rPHi~COCu(HqDd#x=HHAOghJOCm982|uK11wImtxaG6fJArz02hD^YbfR9=x*ic zZmQ+uY~^Om;q72gRR9mmlm~!?uK$0J|BFwcJY`g+mkUegM*dN1i(TPQjRZ39ap)ip zi@I1>-?xEZ9}RPCY+iDrAFw3z@oo95aVGxwasM5&u5ol|2#p$O)j*348S2tDA?4@n zA3UP(BqUCDHPAoJ!y}b=OKfTy_bC(b-KDWtgGYM(y+S#OH9mn@@MiYnpbqxxKr`Ae zgHpkvrS3HqofT|>k)JbIn^U;6wl>UuCKsFRAK+tY{QeMlniQ*1>uEAuXf(Utrzv*0 
zF*UuLGy4d;!WEI&jV6RSGa>-+@&X4?`(J2Tugyh!35_)+DCSQbTplgKjdcdd7;98$e#M@jznA2TF|D-BF9yT#>dpszUx9dvlI8ppIs$uNP~t4 z=azdyZje6jD#)OY=kI8?Pai)@Z`sh9I?nk+XvSJ?>Ry*o*B$rpFCqDLqJ15PN_8}(n za2@j!1NFPW%T-dA7GpiIA}5?vgwN^Npyv%Q#s(bjRX1yC3=X_6uKlSng45W=7#y<2 zF^D2m4gnYE=-{yI7!@PK4Yye=d5A3sPhjVkiJmyJ@CS|bW%G0dkL@^@2e+-;B>4z5 zW=89_g73d*9)UvZ8XKuHm_Kd5G)paABj$$sP)=PCqkIrS_W?}p%~G@8cFSBNpSL}J zQuZ0Q>n^-U+a@bhr1-@v>L1}w6*(kF8N{6Iwq20ks(rfdF8MIR+_`klHhOvbX=`|J zZWZe3|2<>~Q`0z$q2WRYRb5Qz3TVju&%sipwd7dHg%x0G^U{4T%*%jcoS)2+=1FSf zh40XK{BetjQ4b-rUYX<9{i!emwL?alV{8{8V*JGKtn}DXW&J!LmyRva!k55;4sSkP zYZPzwsTrfg+>%_An6(avdF4$k=YwgdQF*;jmfcenKw<%JcU&8VI(4_e@>)q(GQ=uh zd6>+8EEzI_oVs9ytV=-mruTb_5=)zu{BJ+5OAG2|B0G~b$!78>Yy>xn)t#Cec7x$n zI9=`wJzbc2D+DYzefdTQiMK31-1;mIkTyUge_bRW;t3H2Ne@;%W61tisfKGI$fW6A zFoW2Om4#ZJ1PNk5nRBb<{wXH&%Xp5^RNg## zWVRXCo5AkV8BxrL>=l8%>AzG*XocN(_yLHYm9m`2Y|~C<*77y~gn!Nb9$2Gnmv62y zdK5mO+0+Ru-brk*ROgH zqTS*~YM2s?3o!paRIBh8JM(Y+7_U(XWjrKN*FEFMo4O3*pFgl10}6{L+n@fx6VZOV zefOKLP#Jb?LP z_1P(E%nyUQad>;_=^1vwwY8qXsGmlJMh-ZS$8Rq_5zVFk3Vj383%w~H(A^)zbaA)| z6}2a>MW)iqUIW$7g(eEEa(_5PHGYf0Sj?Af0LeiA$=YkJ-m0jPIC)bdtx@3|S(SL# z87yy`;|WZ&(^nszH*#)us^&_*+i)zb%eRBk$~2R7n}yfJEf_#cr~~t#%IA<8A5NP9 zIHs(I-L6wlvVNnCCCCT+Q)pmei~HmIeH@q0!0OFu{)SoN@)HPBukadS5P&b>K!{~5 z4l4ak9@5jUOES;7m;R;-{(F&2ML6G3sC!~FxS-TBdk`Jp-=WEk_2-twYP!^C>$#IB zy2wAmv@Rk-d^T;KCY39`gw35Z&vVXcsBdG6-W9!-cT86&&MF9eYN`yb2|YN^Y3*tg^%3|sMSOr3oSuqthdH?cKH_5O>6Fj`1l-;7+Y7$ zdoPLk>6Tpow|h^LA%B{F=L6lpT+NQ)F{ zuCY}p^5p(@T_Ml)8(SB?7e{(O&DgOeZgh?a)JxGF@wvO;hsfGXv=-oTg%;s@ov%Qf z{<^{`OM2OHA$8Cti2YXf1Gu^9EssK@jG@CVs*wkIYOi1z4E#s%`5%3Wl!oh!nh8Lk zh{91>9R@NtgGwzlfTU9mr)+P1(eQQ zp3?FC8F!H?D-=BP=+7m;G_phrm;_jZyo6DPo zyrJp7=6s*{9>w+1sHF4%SWWEzpl|vFO$(Z&ubuF})ZlJyop z9fW(rSbnN(3&|G5NpOm(8+1pF26u$h}e+vwU`O%2X3N@Uj4 zeGBGQQ>DMKU1@}s@}zznz??^>to`))tT+RlT8-tG9K5@M6HBzv^Jqfy&2R$ z`phw4^+#+VQf!4hOR(ZN-mNraCePh|^+Ocx4n=Qz&)eZB$$N52?2V}tisE=Fy2|_3;Zk4}z6h|PKTgZ}lg`ySyXT~R=q5=`c*X)0`o@#6~JzPpUM-Ifx 
zUjU|@1`0|SRi2b>u3IdAGihyl^?UmYIdo5LDMn}G(t8u1nCf{q{uNlF#X5{dnrLl# zs5n7iv?V*(%Y$N0XP8&9A$ zP{@~i`dFsTkJPSz{hE&V@m-J#F~|n?vsHuV_VIUZke1@u&MzI_f^?1DQa(*$|E5wB zw()WlW8}~kgz`VZ*v6w6wJ4wHrac?oE@s&?uEJa783bs;6Y5rC#eR`6Du1*hk)@#A z7d*wI1UB*VE;O!r5-kJ8>xxX{A zg0L~&FGd_wggaRnwmId2u0$^rT2s3yMug=ui*VF`w%&Yo?`VMK(CW#5vyFo82g}i{ zbR29Qa;WEx{vP3Gc&C#p^1gFSP_eg&S1G_YcR*fEAVG@;c_FOEvfvcYVs7K8hED(e zkhInZqjbY+q^-oNj*1ZUfCeO{5HW-;IxxQK zD6DFq80?fbI$y)F-+Ak;e3pHwEW%gP{Niphi=)~!5hGx&aG$t^8C0XMu+T=Nu-Etn z6g1BmqAM;Df*j0^Lf`Fanvar@hZpgA#!%tD090;#Rn6JNB1MTrHi9?%?pL z=L5o44LVV1p_JA3)iX+4mxf%edhR2%)PTDD-z6p6*okK_srV%(-*2Y;*|C>zk}R5x z;OhEp+1TZ%9hV38dPKMivG(0dAZ=@HpYMjW^iPx9o&(LKBx0ZMlrA+aFPEb)cXzul zx7UL&w|_Zb`cJDr!n;<0UQ6wgYUT(CRfgT5LPZYZAZHhH{- zzDP7A+)_MaZ1zxxzDO}7+)+IH+tf))#WPQzvtG<*b}HR= zKaKsd!F&<2J25U1sb|(Y7b;-7q7I{)fti=X@ggLL`HXp9DIv~^N#Za+IYjXc zltmhhE^&}AY#v^jLC!0|&JsMkJF$m|o?I_=W5#jP3-y#S;?MBbcxb1zU7eyLCfhB=h_$Zd!li$pzN(bK z{d5B6nxS~M{PDmVxETIBD--!0jPIwO^uy}i*4i;=F^n+NlXif6?{Jibu|#KXnA}a; zb1`fsAbXG?Z*_W_dxfZ|(h)2a z9_o!ibnN4G(`avZb-%?(Y3Lha!}!v`UcNhCUbwESr|wC*EbfxUJBXCynZQiI_s$MO z`h4esfyjfl>FE;})PVTqg3We!um!WuK>o`JYIHCIolrS-lIY_H8h&pva!H&qIg#VWdBuVl;YmB7I!evX_ zCXuZr3pYtf7J74eT!B795j+{0$Z-H)lCkb@c1st_3nzc;R&y!HZ#6Tvl^u(bnf@RIOI4b{6fP)idk<(W7{HdlTX|+tO;d%0>Y|!J5)=h2P&&!@Qq27$1 z{O8O&?U&_+z)o>7vHPcIMTGdXeL31-NBBpL5Z&OwG4cP!NNS@%=If9Hl(?NzM~V;z z9-+$o*Im=jBdG2s%DUm^V@s&g3b>J9__L3(f z3?}a6>WX&A?y;7as$k0S@j|#@&&`Cgw7$wNo~1RI;fRDncUbEBlCkqo-=n3K%!i7d zjdyPu46xKx-$E#!dVUIvobo4xA(^r`TV{(4@6Ap4(EB2=#4_f=Ol@Epj0mHFikmlA zSJ!v9d@j0oP(cvH^he~+eYOgNPV?l)`=)}f1Y<5y(Y*Z0`E~%7zEG)~v?u$9xZ%jn z=eA|B`#9t1_)FH=jEAesTUf&clMsrh&^Ol7Ag(A`(CrO4-kI-9ZmlYp()$5D)Z&AHv5O*km+!(7W$a(acx8+3{%B7HbK zyxn7S8N+&A%9Qd8mB<@+8j_n1FcMk|x5ZT*BLbPHk^;kxtXRk!gdBmx?hsi`i)59A zFuP5iWHqmV;!%)L35MG;HO}}>pSI1Fd5$?bg;hKLXRvuOt!}zuG6R5iopr@%vq=ZENr+LHu5 z+K?^^T`J7xHo}mg8-F-yZycCN1Cjr2iz0}M=~ZB9Jb)J|pRvJazLl}1COvdJlgj$i zn9FJlwDE>e_)~NU|BhdsJsa{xX`a;j;Hk}HA=-Fn%ShvrS`4N-^8alb35s=C{guY2b 
z>Ovrd;xRhwgiA14YS!a)D7pO^SxaR*We-?-@qlm5gS8UYM7(8vbXbX=jN1a@jVsFu za(cR1Hz!PTqqr~38hft{=L%+l*(__FUoF73&%3opiTaXoETp}w3rQ()B{w_Gi{FOTPV zsD1p31fw1uSVY|avc$8zJW;P$Ws>ErSpd(XJD)JGrFUyj>Nqe zS*K7o)=27cvH|r12kCZSjDKn$!rNPL=Q~1e)3D1C6Ki=Rs;^I7uaKRmotFnD0~X19 zeJ9&xp27~?mcUe7Xi9~iu^-ASQ~j`H6pn|U=e=eD?bC>Lguxvhr$az;TIs42AdZRf z5T??_*?~v#67E(on&*|Lwod@?zBCFH%HX2BCJ1TI2KNx@+O1lYiUpUR%-80Zuu?o6 za8JHd5~LxX2U07W#WaI%6AWjv`MS?1K%feZay5zq$UrRELxFt=DRx$pdsR_0Q%|%C z8b>hrw$!nlc2!UhV5wCqBgdD2!z!D_Fk@YWQ8Nx1Q;d%SS|hnw$LU3T1{>^AQ}LNb zYv&Gr#;wICDhiXa7@HY(MYhIeg<2$2t|1rYY znA>*Dp-`Pw4EREVG%pLF-dmdK#4z+l9$Jess+v|s zPx*Xj&U#k?jRbhD0$RK6VFaxK5KcQR&;`|6qCy}lZGH=y1Gp4V?-f!F?K=}=;}Sz+ zow=PIbc^?wX5P8pNl73^Zn0ckGHJ?klmU;XF@T zhlMfU1ZOjF-yX5wJ%zp3{WPcVOV~Cct`taarJWHNo7Ex?u(Td3=T@Ge3Ahd^O(SUS z-2$jZ$Fu22u+J_~0t!Il0O?su04xc*;jqiIKvNnx-Wwu`Dh*>n4>a@;q5$uwG_jC( z%B1RuwSV*XD^VIOlqauSn=IKJYY69kdLHjV@>b)dCKU1WNeXe^13H|ea^zt*pUkb; zZ;{$mws*tucn(b%I}M+n+J}%`!;txVAXRIAqVaZxM0V8Pxoh(U#nM%ZAAPhI*aI&3 zoGKg{HF#sT_0FUs8Ma|4X_T8Y71pfCFG}N09!P0mZrA&F-4pOjWUy>1YoQf0 z0v9s3L24y-7oUX{4Z`KXkp7>o8!xc>_#HGOmm9nZ_7K9agaQeOole{lkzjZi;|0@r zbcIP(zV(P47R<%@mxkLJ&%({IY_++2bRn{{3Pr&JO${T8MF6n6uFoD`u?{;z-dT!4 zSyAw@2m;%#KN|W^Z3P%;66S53Ym>WM$T7*j#hXO#bRCaG2l7(%P|Zr_{K>s|KMbV- z^M7anyCyer!F8knx<`jaauzoO<9$i7unq@w@5DuLh;H z6Z6wx$`&e?rAP$kym^E8MEcG~&pwodEkp4mN%dSkpJFl|z*yZAL9|(`0*$rmuIb}j zo+2Y=>^%8mGBV$*pTbJxcPX!`)2u-QRUM25$!3ITDvSavmygVO^|UIHRIUYL|73?c z*n>~~M28vvc>nREc!cb1Xj0(cdOTz^)(jsnqTGw0T}dA2eFeJW%Bg$!rA5vmh%=UC zJ)?H#es*&5Y@R8f$B{BQ`z%N;f74_seQVV|knj_Ffh6QpS1Y&xWJbQrL^_^$pJ04* z6Z?-??7RF&?z1jnpmmR6A0^Vw^dhzk1C*W&$tVrec$2-I`4Ap%XZAZc1TpqB020iayd7NmjX zc~M^Je>6CmXizK^OanA<-qSUJYT$T@wA?+u?buVy4-`Ye+c#Q_eLhgHCHLODIL8?b zn+JFT%j+Qi#X13cxyrv8`n30-mPK4x{}Tc~Zq#3;%NRZfO2#bJe`$;b^KeE-7Ia=E zW>0HNA9?jE?@+We+^h?8(iB6F7$^~|>RDtuGj|=YvJ$$fn@2Y9nKXyPZ{Yrfo;Ux0 z4y)i+#55#t9AOi+r*);*zlUzCVymm;kk!~RUpK|bBZH#2WXk~fV|A>y@xGceObXf_ z_@?~a>Jf&mNvRaOp0UHFfF&_je1wuph*9UmBdfiKO4gF>p95u~yT^)ST+LEmMkPP` 
z&LFP6k#Zr$)6lf6iv^IvOk;_-spaMC2Gg~F?CTea?@G-o#{Sn)rLy%B92`yDO@ebT zs1GLY0ML9^h?dW&+d)+;lBZmJ_oKBuh+8><#$z&6D)+}J zMy;C|ugG+5B5DpjoO-@)D%(AORsg=Yg-|53gz~ns$#414T89`Q!&1M~;5qC!|__gH^Y*l2`&j{W6i`sSTYwSQV zAY7NwQrd~`FaW9%Y*1?dmqlxntV+=MNc|6$fZPCwaD!qrz|#K@0s>#G1A^@mDh9LQ znh*B zr$EP5P5iYY5iFgkz;THl)3u$$0l+1y+W}##av3AG|Fa{6S96@{Oa{sW)xQ4J63OoBf3kz{^pIEW-Z~nEB{|cJ|!yX;RJ_YjIH_#Ju8Gq(|HEQ^8ZJB$qHsfD&P;MAa zR-8du^6i69sLuHLYkpAL_*d7oh*#`FOOesnqVx^aUBvbOESk+&f+}Cf0`9B3$cE(D zLu*CKi!ls6SSY{?RUUf47$;T2)uXopG=$}JgsQW#8xr)DbL0%4eO~>p%8|$I2!v`n z+g1qTR!o?;&uepsd|^;gY1!O=joJ)B{xFEEf>_b}%iZ6v<@VD*&GHu+Nl8L2^{*`v z{@S^q8B%B$Ej|jT*Q`x{SXrg*$X>MR2Wkw{{bs6J#LF<-_ia_2$aPygJJu^-QRSRW zhkkQu=I#T9l#~f@*qeYkjKID<7(&j#_|GEZy0!V-t$dL9u+(hs7L>pSH&l!LmCv$G zD><73%-d<;RD#>?)g2$7%$6;42*q_mZBWNoFR+0-b6FCd3xQfj)>lG@GJdBlXjsE?a{h??^MH zPUW4?KimH5obDfj2ydIa3E1y6`ujAr*DAY`AXnwInq>mZkV~=<3r_9DOo|c766vl> zLeS8g4SqqZg621o&pc{g-~BHlOU2Rqel$S>F||~n)jtsW5}wE6Kr~92bCX!}bO^kQ zXb)+lJa69FchHLKM{wuw3eUM zFqBAA(1MISngqa?&NGNnNmp{`r7psTQNh{X`NG)KW~><2t?nHK%6TFsd~MH9KKIVN z-|S`xoYzhBYUm-n?*=em*g!6q;Yd)&GWy0;!p?y>XYzo20&>t}8d7;l$w}siEf5ZBN)RKlX!rW^yH% zQO#SSL;JoD#x>-ha@{Eez{;@P?KEV3WfrHKg;GHtIdD4>o0LWI!;cn;Q!6TFc$ry7*O59#Fa^;6gHG(-8 zBW+tjM|Ji{rvY_{2OlG)E;AieIWuJK`EH~-3 ztJZ)>gb_FE-ZKfu&&QL+*N|nA9C)=jX$Fa)Wy&qYRCV86ZpBZ)jtSezB@=`Ao^mzXiWc&Z1>Jl(83| zDT_|ma4B^7h=$?n+ZWeVg2l|TF#^i!5;jG)L8iLg0!60Gfq6>Blq*|jrn=DP7gzrk zd9pG^+^p=QDRWR4Ux6v1;C0RF@$tgFxL@JwvMk*ml-+D_?Sep->_Y=%xB^^YY&}uv zv?fOdj#T^wT2tWH;kCrrVv=RXU97fEQWl z($?PVJ2|&U)#i%)M2@kC>fG#rOHRTKMQQ+f#!prsQ>ht5Ms`$(D(B17M zPU6kSX2!PLbvft+RH`R+1Ysm)YSQOYPPujPc;}C_%HVDqv^vqdjZt{Oy|24syo_$? 
z{8lMJTeDK=Ep(qXXy-IxPTQg{#}_Ej)TSJ8UL|P0NkzKuMQEEe1!1xJ2(s)FGQ7*q zvt+Ar$?4!TPK^AVTV9nISDZ3sdv=EygO+=_7hZQVb?xLieY{pOnMOjfZ*Z&ki@5Ge z7_}hq7>Q@&W^(*cAYMvZNk17nZU7kvtciVenS1W;ZGtT z6Ek%>=vf|FR$=xr_!~ZpKBONN!^-*M+vX1h8ooEoC$JAUKsAv0@4cWB`*q5mnSGc> zSt^ZLwg#$3A5Hzv{^u8l(HZ7cnxowqeTi^==RFz7NmaIN!wt3(X9sl1YJk$Hz#Q7p z1JZ~IcHpt6Uv-KJv+LD4e`2^L-ZFbaTFU$LD;!a;)Z%Txu)xTK|MOcl(4vohHQ)NuE2^R1g#;2glCZnw;Su94ZXuE^3Le5kY)5o zET%=lpdmd8dP-Oxi+Rx)-*$he__o$1gJ}lY7lyv2S<34l>8t)7g#--kO#eInI(P-WC^hTEp}xfz?UGkCDm>p zB7MA*1N6lAl^GTi<0!>;A#zy4D#L(|wi`m)>F+EWO!u|Ru4uu;GPgh7*vgxOKqFP9kNz>}m$kKIo z77~OhEl`#hCzlb>nq|ise~!Xd+_F^-I=D3Kbm9_FDCt(1MJ~uOl9=FtpIBc1S(URO$AsRO{$L-0=R3Dw{dRrkJWAr9lUX(X^bTcmi`{PxD|^cxKkV5c z$mW!0Pec^!me#_R$pnit@2>CS!n>&-6l`FuV(I|1oC&$DQV;P%T=tYt74@&p|H;@P z+XaKYRWqX2j&dIMO>$Yz=4%7h<1=#wJR*Yx((8ios^W;7b(j%8ZykFD@|!u!S7cn5%UEaz(A#*+KL@_b9udrQl<3?oI4z}f0b_5$U?fWzt`U|QqcLO@@tA| zM-T$pTQ+R84P??@obE1<@Ba4@)M^oPp)ln$F!n$*XW9FePQpt|=bhhFGV#EnFSi)kt;?r)`6Af}$~_4X2Cg|*yqWiV|ZJ->M0U*6YSlkFaSbi8`t zTme8q^)6hxlb(Kr*5)(aJ$Pr>m<_6x9jZZ5- z0(~IgAvzr`gTAK$IY&rqzLIIw+CAVmP`&Aqq}HQj!wYRMnz9IAGh%_44zDVfG;82r zb`v;4d5i^uK=COA7{7%(y1vB)8R=fn1D(BlJV(DSF;kF6_>SxHU_K|fprzUGinfbOgr zztx0^qG!*yQwlIvdakUQy<8Y*qbH0ve=Z1ng6Q|^BKdLSV0_x8%S)R_YvN_h$_rTi za6LAseJg7Ogo|X%)bkS!B;TZ>7>eZqHGOv}_IECm2_(uoTNA&z5%CetN?;WJeX6Q0 z1aTQ?LH5ub$RFl5ZXNOYBY@?N**qqb9GASWyppcwPaV}nwuv_?xlbXV{p6EQhJpCb$bRB0dq$47% z^e+b%%)0G*cRg>mHnk`lu47KT*d3_vjK~i2_$JwD+wV zOL(aVrU>iOh%SCsJ%-cv=$kr>1<%SBQ|AQ|4ls%1ShEA@u{m+Lhwy4#0fH3%6m2>` z5FAl2@&3X8PndMIk`&#})5!8byIzrXXPO9`sZE|oRl}iVf1$CTCFz?WG0R`GSesy< zvkD`&ReNXfh>YH$R_mFo6wXvbZj^Skp<8k>xsa->0N-3e1f6tYs(E0mXy&q(B zUkV=-xkvhc$vm_d8Qm4YfeN3bUqmK9`K^qQUh>^)S9xG(v2FqmF&aW0_+j*j`6GL$ zIFmS&rTVPFLOfn3 zPyB6ub93RDv2uR?KadK7Vc_nA3OpWba&C?!o@9Cc0s4Z7(^j*G7zy}~Mj*S%_hO(5 z4Zb7A4;OH0&mZ9in)H_PjJpJ^DOuax5^2>9qACo7`SvvMIQrKKVV8pp32! 
zaW8EjT99CbI*-mfYeKa;|IH9^V3NIl`?F5(cn8Dg{5w7r!cq?g5mxctsw~Pp4=x&- zxl>ihln@#ACL!6^?BPsIVlz^7&#aPfW8KKo5{(88&H*W7-MvD~1)|S)`hooJq6B;%WZrT)@ z!~V&@J>^~-<Q>pdv^hSH#{Q{s;#R98&)S#``Q%v2)VM`6Hb>{7b-GWuKJ7rwmy#J3~B% zuGY#}vLe@NWTwVEo4o4%ufpXsJd2yT)5pV<{0-1gNA23G!}TL(g(cJ2^_dW8j$#pASagD=()tx_pqi7dpyYa(v%qW~kZozN^ZoMGTXd3aLUeWBu*AEe!MozDR8teOWr0MQ#*bG;I<_bYc`> z6ho}Su1MtsAbrU+Jd`B>cOy5_nfCp)*R*9Yr3=Q!F!s9EdU6gyMsB1stpYKxQ<<{i zVw{I}u7EvjqW#S~b8^e(a+~&ux|7vmP#cTowLo}v6%5MEb)0@M82~PiteKf zw*GoJ34XT;{xKlftg!FEEwpq@b1|VW?%44VAkl^J7Op-)gC6a8mjl7{fdsLXcogkR z1s0tm4lne-!a8p~%4A$%@xZQQzO|0n;wK4%?Wb5D%|Z;E<}?u3%(^lSk>Dt5?u6FZ zKM=PcAT?TQOxPIRGd47dhdEXp-^ApC^8 zcAfZZsvjs8DtVx{!^ypq(1NN`6rb&oy>?_H9 zvcBE`@0sCE%~mG8JEyEU@ooM#FbmpAEig2TM^*(b94exllCyo48l@TNhpGP{4C6Q! zofiGIV5M;L;Mb@$9u`-;_lN*8No?33#510>q1{kY3e(`p^PvOAvE9Ozc)pLrAR)Z zm<87*;tF@kDSF>M!J0 zVfqsxZA&<9X~)ItB_^C5FsEN0#H_SPcX-{sa@a#yv)BUC(v&KL+!|OnObo{0Th#eA ztlL9v*IHD_PpCS!_^?Un(9~wQz-%cPE`F6*?Rmy{99RFf?o-V|PB0L48s%ulc;r*I z`!vN%O62wcc`KcWxfOeM9wZoed2mkOV0l!jLN|ZHwg!%__m$LZV5YZkD_=#0Av@>R zRNc_yS7D$mvCs}h^@b+uNPtSC$Ko)e zM#z>qT0DP$y?dwzTtoNIEZIGkZ-+h+ID8#*(e*FfrN zx76i1KB6<$w42|#RE zwpe(K15i_jeu87-|9%p3c7Tr4HPVmMQRBQ#Q(lWQ8JET_ygIltQzDKufJ0BPf`+b2&R_LowutpwbX}+PMwE5OhyC=!BRa|z6xa4iUV|A!eux{m|F+X(|)gN&RP5=ET=|EfNp6!KF>T8ndREg z<99?=yo;M%Go!U@hB01Uup#AK=o|B5vU00VNxR}de}szuT)M_~esr=;O;ab`-_Vs? z;qHPA1%-qVEyw0NyZ(&>MqP2MvU9CqsO?=@vlt~0=k>;N1dR0=C|VAQl` zIC{jGAGR3K!jS!Lbh9&l=ob3!vAB;yN^ru@c4K$KcnAA&LbwIFVOEZaNm z1&`(^YY0P6)RRfYBj9iZli5cwVKVzuBepV7!TGQa`uSWkXY5~bw3!ZkYF^tL$voV= zDujVgQl`l^(GOkR>?mf3V%NVqxPK!f7#U3}5q)tdiTG6WTZV3B8(d(XY0Yr$H!RaN zB4fGfL2|;GLZ?fp3>-P(u=P&2G0*fM)z}!2UGptR^%fxfO!+&eaCv9rC``Y6RHajD zSP#D35KF}@Hz62HB~?Gtz2y(jJA2&qzCccWBt3nk%KPYs6W(UWPrtO#3<;~<`A-8> zMTlVRw)=o5r#u$>)Vmw8t?b=|aE*xcC}Isf@@MQ()YeKVYne;CS%xJ}_W4fmS&EnF z5ClQ1tqGvp8Hw}B3GcQMf+6ADIpSKdYkeEcp;>GzozwS#KFW_q*VyIoRn0*4*$YlI zX|9p?#cKFr4r4puWg+}R(dTrHRMLbaE!Vcq;B4;rDz0%KplTOXnd{{=i-nF9C`pZ! 
zLGBg0%(J(^{=;RtzC!bg#&MH?Z}ldTM?f3>V~?Q~vE`o1@Y6hWo;8q zi4C~!zcz@v`e-6V{Lv+BCX8Q?sJ;&I$~U~I>KUQZGUkRFS&l>=}kDT^%I_- zWPTdiT%RhnDyb<|77$6+A?JQNXr#I6+v+8!8gKY*(fQD6qKbWVRtnS8Aw^Y=)NY~E zw=QH1MXxz8r;1fmr$STJbExL9Up^x8&K833(e6p;P5VK={IZ?rTMZOLOz;MZIlUEo z&ZI#fZWlFkt+3RF{8A1*%vYBq;W5gZ3Ybj-JRC^L;o!1Rc#)`BYmDMJ9TCO53LSYh zdoGOWGJ9a_{E-D+Py9P*zLoA_K&lmFDfw?)e05lqU(_w#Af3ZVN;x3XCDJ7=As{K; zDGVSrbc&Rubi*JuAPoY7Gz=i!AV^5}eew4_-@VWMhj~QZv(J0>*?X~}?02Z`6wEP>Me3T~cQH!r}e-!HW! z?h0VhW#ehjJ3IOGlWV9s9>eFD` zTY^k2=Rrg?skrp61$1lX_;l+t@2^mW8u(2?FalMv?ibecqPZ5~@0u0^-g*BNj=-&| zSt4Q=GzEm;u~w_|$H{O{O^62y5v6PWD5^b$C`x$DAS_OJJl1t~m$vF4_$?CDb_we% z#O!)5{h0kWt?LneiZo%cuh>m^RA(7cOaz4==t0$5Ya6`W12fMp?30r^&=-Qel?ObV zzk_`oTaWq=(~|0)g3LXsF)D<>p2fu%z4^e7lndZ@h+?I{c}bQ0M9eUS-@Q8%RRC@F z_Re%|_r_^U1qb15H!vsy#9#r5OX}OAzk*fH<}1Z(W_T-ztxPy0*odUFYM$8Z%_nUj zD%-&65=BG0VN`lz7(r8+*6rl&&SvBo`76l@{z6e2c|nD3;Qd3l96Et0~uV(#Ii ziW}M$^}q-jbUR)Y&x_WWOOEJ2g06jB;oY*UM+4Yk`Bcb6O za)-iqy;TSPOdPys*9mxK0ZQCpO7JY6ja7I!Tq=}-A^wI&%(G_Ova@RqhX7sX_*m49yie1FXxVa99( z(MKV|5o$zEBTeavos{6)p6A_>B;3~}I%w1@qn@3y3Iwt*5tyC9j;gqgYVaB6ok}4v z4LIJV-ZaQOgoNm`8>`WMi?-XEX0^5-V(l%LC?SJj?X{ zo&6KMnJs&SC-ggbAZXG1o%bt*LG{ZT)Rj%-mATVhJ(}H&mgbnZ+1KrRWhryY^0_ACgE-C@+ye zm63N(Ggge@C?9pdM?y$;%}?Qit<$c2877;VgDI{xm}gv(UejTT?+~{*r`c}a)hT{Q zCqH40+;bi{QPED5C&1!{BbXNem4c>3*<*zS{UId{1DZmw)Udu4i1gRrwh=~6zPw?{S_o;L9h|Fe5YB`z|4l1h zFy^S8GP^IBM&Ix9^PBfS_q~S`##Xe`75zbD5ht#EM6wjvkI#fVFv4aZ!7?ZtrkF>? 
zKu`J+z%ieVqFLY#884dPH9xNQ1-_w_!eaaDm8EA+*Ql)jnC82(=J~OLz9NJi6J!Q@ zS8+c%*U!^V>8k!Eakp?yO#-XUSmcZ_M$ay#Z(CiXrEs_40+q5*LpSA3)JcnRgok1h z#dYgD4pVyPovA|Yn8I6h&Dl^z0jIYH(OfW3I&}NDuhEcl8;JQpwBqal!%lY%yM3i13cCOA)MF-oCModf0v&%y)OUuvB z#7rmB1u}URGY6~>9BtDjfqDxiUy4g<_vr1gm~(^obuHM?Jl}-#+lp!YFK>mYg2dFXPqCSonM7IEyADvqix> z3~&pI^n3vErnB}Ca{hydQE_fvU-=wxus$2wv$N|8kavi__A37IN1jI})=+(#7H>JQ z+1*~^9Ay{y+)Zdcf1qrey&aVzxURK&N=DVcQ^Tlp0VQA+FC+m%7Qfq*eCF+W4Wxc5 zdsocwSVozKyMKU$0KL|XHQbtV{)9q=5N;$w;K)f2FMl~qi2mG68vg6aJ_fudF{@bN zoF)Ru|G)9}jw5}un%!W-HJ&7Xv;^@SdgWZ~f?q)6SBZN-nw^VRejqA(ID^VHFvmst zm9JVDAX08JJ%IOgS*}Lyib^0x9v3q(-a1P@7k^?@S@l4L-R)a%OT<*2 zCCglQ^9|lIJ^&a8HJi#L0G=g%Q{Wdqo62DAuax=`li`0ha9X}^I1>{Xi;su9=Pp>K ztr*vz2COQq&!K_>KxuGSC;E!Je{MmuzRtyO8^#5q5^Wl`tT6q0UJ+8iG%vbtenzVV z6s8-?>W@HSI(vd&{SgkiqQyP(h-_5b6J~l_)%chOOjboHrs42XczwpRtraM~lD|C% zoFwy{S?A~9zeM0QTv7^5V=g9+ll0X(e&Q=+Ms*141FVs6=bHjSBRYdja?uc_bT$%Z z0zmvfd%B{4Jw#JI`0M{XwlJ8%E((a)$%%fC{mUbEKt_mrkP$v3KcdJBd%cC{zhpq` zg4oczB&e5;il*R($21wcdA#Z&x^5kiAVa=p7~gZ-{352xBU0v?Zv=UdLd1^ED$k}f z`Ie)R<$Vu{W76Py9LBSOn&V&zOf(V=C#n6wkW9A(u@nyMCYgihqEFnyZ;iF8Qp8Lv z?VR3=F6gWm;Sa7Xa3wEV_6+RQ4G8WQ<@Gq4V^*;>1cs|Ly*iG_Ur2HLyxG}|@t%g4 z5_Ol7|B;uDU~M+CRUD9nej;~TpNEC>aqYc6W_XHD-}<}tIlu?1U*X1|ktF$K+`Oe- zBAX5(+7xYU9^c?*UMAUEP57mliqeQBM>qR^Nh`}}_R)aIa!{R=_y%Y)>ifrijC!VJ zqdeUX?q$#k+6JfvD>GxiQkcFWzEDH;Ae z&Uk|?0Pr`J#7wUlK&(nA4I|@Pr#I5>oz+Csr4$?C1>gESeE*unIx;W7E+u6$OvM_G z3?~|KXC?PGg=i%10tSYyl@c>W_N$rJ-gZ|7yj^&nXvHT#998?xqBi`kzxZ!VSsI_^ zT`rB#y&onBg_lj?5_Nhkfr08MIO^|OjOOhNK!IVQ*KVIKunH|y z-?7sh46}o`Qg@Aas9SBExKWX+s}6NfttZ!b*IJQM`jG_oO5zg+q$7clSm1IgXT#+B zFDx&yjpTFQ5?->V0!#SCWyc(wx}h5U35Ez*W-t`o)%0|Y>m)${k{fO#k>4o&D6+lr zhyvcLS66l#V^yvrkxv{Lzo5i5P}!JL<+! 
zaZ?j)X&bP8uhg-3^0lgS=q5>kGtKl8UXbTJ^I^APb0$DJ()mEQ9X2|HdLJ7uU}8`nNOe01XtN6>@Lqpqahg@6D3+Cv6nb!^HGAG6C{iUSo_=385$EL zOihmer0pGr%{L+W@a7NBptI=lI6YZAvc{sJkHV9>vwq_tAfnSUih#es~p4wsY(6pkmrKDSgH4D`$eMc!(na&3qT>%Q(#Zjx|7 zH)4J7trw4?_k%uDTze$KUm(e_pB~P`asJ`eKDFf1)Rkg*7Ef^pWW%($)Vk|ys6{!{ zA+1vPG{)d!Zu{8Dmm`MxX^j2K!OlvTu)bKYPq=&nS74t@%go13HJ%+e_5tDL&7Ut@ z478TwBk9++#ClnsFxK#D(7nQ3TW30Tb@u)5JTnmn^2 zLC;%-@&V3u^r2!Nl*df4Mrn@SXYhPKZRUt*R%(|9bKk;<$7{K!xuQsSBU99{@;6Pr z%PUPDFJdG$w*74Xn1RLo;;-lXJEO<5e$Oo8cKuTLwk$L(nEslbO!NLi&6LdI@W{*@ z{RpaNBP1G0@L+t`(PkVGk4tmV?{4p(-yxyo%ZMH?z#`{nWDI}Ul~_^!C$&_(C>d#p zM&x@U(@L~wrh}uuyTaZdM?_AA#`3#zYK)SEIpGHdTR<#Ud4pEz_1HW}xKD7?KFcAd z*-R@>W6{9m)xYTgM>TlViIEXAgJgBc2t*>A)zT^7I_4p@TFq{V_(w^8y2tg>Cb4mY zHPkP-*YRhjR}Rwrj@s1KciDW=&&I0tcn)8a-MT=eNg=gkWo;y8rOrJKr{uw~dqYw0 z+YiA_laf(y8UJilaxDvD=%xy`R4d?Aa61;XZ5!&Hgyw>Ja9it8uhZT7dZOPKbkw(L zE@!*9LWBBfhfjZhcc`my2C{*|cIMZrX&#%(>n`K=!1}_>RF2g>!yPJ2Vm1=B?B7(N zsN8w!_CSBtCX1`@Vkh{O&g6((Nh$C!I{uH@21-OKci=9cwe#dPZrIxh0wuG0IsP&`9BhwWk#h$D%K zez;5Mi2rBIN7$+)^%+WrJ7yh4Qyr?kdzHQt^$YIlQsiR*5I+%Y}?L&u-lOO<)P%~1wH z6o(U@Z4WcVrLH~LG+ptGx;WRyrOp)9UW$l&1~bH=2H8s?`VT=Ix_1|sRWk~d&#?BH zK!0E^xWh2!r_~q^u~%sF?I(wglU`3iW{gIU0@Ni&V9X9FlU+GE_w{Jt?i#jPanC#x z76qJ@as78ns@_TnnXB)Ym{*~bLbM1r5PvZLOu(jvr4s8?J?BPwe}Cq@AF;fWAdS2p z=bpkg%}3ZvE}xihk8eWY8*>v`TeTm^9qIH;-$Z)H{Qme#ne==0rzxfW-3jpZT*%U@=0^>n@9J@z!{=kCp;h8{=3s~H~&6+_vfDE zCHl|->k=)V%%Wn`&yfI?eXjgXMxV7CG#{BLTEclgM6%F@e)n=~y`Z@c6T%H>WXPG$*A2kmRqq==Jq50S zR*&Uuw*)kgTql9JdAq;grz@xON;Dt!kuwVAfWa!B#pXAQ-JL)?WsQr6DvF@sUlZCw84t=DYE{q8fXt zx8l5TJ40)hGdkf6U(J-@#p`Za{`4i51=`9odP{K#AMg9h1}Xh=ddMLIGYe)Dk?j1} zqn%0N()D}?;=kyF@BCxzU&&(ye^SriWotYA>vorVpWPTr@_hETr8?Sw?GD>%3Fn*^ zd>*BgXY^xDChEr}W>h4(7!WsZXebvNF93s4Cpa7TRsEf{BbW1)cx!z8QTTZr1YA2E#fgjjL{L%dkXhkr z(nPF-mrC@4GEW&(Hlg6L>Oc*Z&IO9UrRa(!F|34?qp*SL@DejGYEpm}u?2cfCh?;# z56>|9{Rt53Z}TY$n{zE=0|S$t=s_gJ@Nq}RQu0NR2Nz2+9!|6C>bn2Vq#oHSREc#+ zQ?8MWC7E@hcSKTpUP9z>P{edlFs(FCTA!RkFx^CJh8FUh_Vln3RY*vh^r>HdJxr*p 
zOQ^$XZu^$T?_Zh93vN&|m}ZZCeHn(Gb|~MnPDh?Bu*C3E@ooO$8!3dbCT{H}9rs$P zq}dSRu>2C>U8OoWjz)TpbooJ_s)V6ez~;JBT)u}Ubd7zaIpI6J#O|~GP0RXF*jY)-GuA|{ zh*HK6*y`5N463)Bcen8&<25%5631G9ME=1Gi$|?~(*IT$YxZN_(^!^Lv&u z;#)FtcfDMkAsf+Qry*L-=hMg6b1DX=%=!)9KS@N3caOn>NxRDtDWhPPa>5y{;XziC zr(E=fC`>GFfBAh>6{rUw9^?S>A(mEnqfHbQmc%2PgGbMLxA-736Zt@Y7>~$D|CKY$ z5eLC`Ey=r@8DaHKIHtdTz3Ffci@j1-C7Yi(If?Tn4;g} z<@dNzlgGG%ZpNqIF-1G!uDL!PZC1?u@d9+YFBC8Hj!Q?yR}Ua`s9bdCaFIunbdUim zM}N)%Dnf*6$DUd3qL)iWWB>6k^jo5Mi=@H@SCsGTm&8B`ieqqj@NZ&SfAEwty*J&Z zKT74FIex(0gM{EBDK$N3oiBHpSE&yCKy3u#D9nW(1d|y_cHxrhzIG?BBIUYC>&nvYCYe4`H4tqboru9ouUs(i}{!(ssEG4ks7e2z2#m3_(4mgdC zNFf6=N1_@orGKUp|Hl*?o^Y2eBDA2hMi(ab_!EEc$a@`$amiVptq#bFX>mH1S)@zg zX-dCC9j8g1XIV!(VN0}YV6q$JChL*jK<6*V!xn`QHvpu>=?RRl?|cr9(-?3C^Y@bQ za8T`T2iz?IuYS5aetf@wBIz_JciJ-IkW=6{CgP$l^6p%&?z@FRFRJdCm!Ls1&o5D( zWFA^?6_L0%L=->R+{zfLs!dTFE6NK95B~HcDLNhzofd3FC-M;NQjTL4m%UrywOxw= zA(9eBu=n3gtI&B@n9&;~SC>0h0!xPq7e%P6`(`dhWW1bSqEu2Jc|5*60ojVOTMMnm z-fEF^iDRDno_uU7+^F%))FfMPq)#^T=(y+|QUs`f#z`GO{TnS?nSaW4WH(6VnDuXq z2|cp555+1y#lpig>OBjU0K!r~rShl~q5Nri-u5zY9j6-+`lPcaMRPyxiL1PU;Q&<@ z`gzQ*0IoRl8Bk1kuKxK(ntA8gM}C@!5&rZ$kxkV`G^tYx(*IcSr-VP$XArlv^Bbve z%%Jt38boerEjiC|uc-o(%M-XJxdjuw(6Eo$^mgQkFwM&-^1tD2{Y# zpc1gk*D1`a)W!sVnj55rd5X;p($=Mb_)G_9d~nyfm!9MFrp#DqU|N)mG1)o^{OQZ3 zmgj!`Xl6PF_PfT4NPnBt&6WPYK2r)2=fuPe#pG?{NWwQ zue>J~4W~_New0bBv^vb0*zVQpBX1w!AVke3-Sc&=f^9$Up4bcD;vy2H5#RW4Hw(xA z?5#hrH3aGmiz(C<9l6QhLMjBXz7bP^PDQ4MMo-aj>~ z?TK{8ODQddG0PH=vES{VcQc_a{8eeVlJ4-z- zBoAX9gJvWcoHdJUGz}ptRhzVVPNiDw2WI?CLx7yeg5ur5JmSQwbeiWZ|?I zq5$k@0zYsvXt)#jkuf0%7xj6ZUW&wCo6RjbJQ8PI08J(KI@Ggk)p-td^UAdXb4w3& zZIJUFiVrIu+|tN%sLL1n46r+z!l9J}GJCKmjC|fJHLlM`v>FVo)C5n7`EKz!5g?CL z4v2XEU*C+!87g>*khE(k_A!P#H10NxE|#AK&fL_k>GpWpbLh|b$e~!hJ_EiRBLS?E zG}7EBc>EgS9Tw2i~k@h6sWoC1W zzd+_&=c&f4QL}Rb5W98<|G%eE=D|>oV7Pp{anzT3`SxV;m0OXxnq+cK6p}>-IU6aa zyN>>PQ1r^0bQy4T@8XmwQvJ(*9oA@Ynp!1twrZi;clQBlUnr5FSZJde^N!ub0S_!z=XA#2BS23ceT56c3?nIO53~|& z=#utV@uDAtwv_Nm{~@p*ytsN6P*0TiyZf46u6H4RLjXo}dAtJr@AxTJ3V5VTt;6?Q 
zkM_*tP9~h`P5|(g1GRH{soiwMQVA32!xidyD=2Ih{7VN@oZC1uh)`W1-^0YbZddW9 zUgDEt?&AV6=3VjN3(v%`d~~qv%TII`AA}h=6funON$XuIN)XK-jWhp;t9_j6_r1}7 zt+L0)f;g&oOT#@7{VbS&sYF+FSI3(#Eile|j|8q-QSemaO9 ziKor0SJhf5f@6-(jw#v&`vT3CucKy9FI=n_m$B!Hlu4Wg3OM!RrDEUoA7I`$(=7ZI z+ECxm_)G0ePN_b0qqOvS66|l!PH?_%b`v{NJ;I42ad-*m?G?Q+jCrr`0R~Xe}giY_NJ6>ampgLYLc+7@Sc;&F0VSw?O*2J z05Ug6IOsCk&-KKhpYG^9#SH)r4g!~}yFLvDQ%rU%X-owh6WsyKe%igKS6J9pz&WQ| z%w3xYJ}`;DNqdh?GMSqYu2EiL(!bfq%QdN>>GMUmWsI;;WHbd7yWUeYtcoEaC+l(@ zNFF6o?YAKIC3-w48Hq9y;9X8x^wxRw`NaL5$#RZ3 z(eR>Ci`U;Zdj`QI=(A%jn$fY#0RLjfQK0SRo_r+UZuN9BUclm`^z^FqwDfI6dt(x2 zMUrKCI>@iVfPyML>#xgCN1gy;wc`uluWBxb_1X%ndo78QR!h#8S6BL}Ir+NVyamBk zavbhnvTcPg;*iu4xogsNLHH9{$jY|j3eG=H?*HP8qPBT!Y3$5OowR>Yw z($`LStp;WUUYhy>ewNZxxA9C!vA_}9|LSFJpg9Dm=WT17u9DW zjJY7}3Z+W*7w|BQ0N(JG=+xKqk*+Rqy@O{fP+|`1h@mdvAd~;Mxf4i0kjXAysa8R0 z{Y&K1+L!vjj-K%V06iRveiFYz?iV6x4!oEisL}Y27-y%6&{uR#Qp%&i8hyJ?f8l`b zA8_mIrT;M_b1OjfFcxVa_>X0INmN$3dMorXB2YfKWLDJ?E7;*QCMcMiknN|5x`l}5 z6^1_OMN=dXdQqjFZSJgHCZ?lpE@d3YZ%opTbT8YLna)!*T+>0sZ99*Q{r%Fhk~tdD7)b62^H6257;Y&^Od+h8sXb?STl@#r{5ObP-aM_4;mM*-Sa zQ!D3lI*1DXsrOHh>L?V!$v+2v9R?=Js1BuurA&)c%=)@zis>ysbC$QHDOU7P7M`wd zI31>Oy2)6*Rl&#H=a?>M@~+4Qa)t{2ki5E%Z7w)oM0%^Bj@VSDskQz|shJ~tW1?J( zLSGm>Q4Wtmnu$&Q)1(IsHxmC1%mTvke$Swf@M}a0CEmPtRjS;fzKqEF8mkJRwkhM+ zyAW6D$>HpV5hI%!F)HY~k~vo&vbMi}*?*{}JDXVmW>~@muNH!7$fcY^o%=KGZF4&$ zR-kV%0FFOr2>fZ*<{`#H+*+SX>vLe+7lKR+=fMqwzn_8x zA_zagwdF$K#6)nbRO@^lvs}tMFRMA}2f9ajHypluMg-hwUCgQZMcH8?{9-A!NKKvG z112v#984PLtt8w}Gl+8@Re_)c9vfLzD&L|0TSwI0Z=-;E3fI8zBRCDlGL=EJ)@QSs zI^r~v^8dqvQuz{$H3nAV}cBm-i*C*6o5#HZlnCj zq>!TA&`@K1RRAIVW5cr^B3;AsgI&_N>cOr67_>tXG|}8YVxadc;xTyONicvXbvFOL zeqCDKg!bFy2UVCSz8O03#T;<{U@`Vp0+5Ahot^NxL;boDHo$A5Fo2*n{uo$4JN%~-vc@UL1m26o1y1`GHdJ#- zJ@-MjKL?iS<`~LUV+Bh5WuU)Zufei<@aGqvK24vl_Q38#k)WVt%FG69PvVmXE{sPF zZwLul*kS@iDftQSxBC>;K91ELr%-VIUlB1$x!jen`IqtoTY%>xp(&M08oL#fgT9{It zuojHMw?u#R>|ltjS}^L|VxafpawMJ3Hn=Sly&*H?GaenxGN)hOGDDcIG77K$uyJf- zKO4&o7hvoX5=OzG3i_oMNsDlOl=zj45Zqs)pYx;0og 
zH~>9TLRfV5`se&Y9-LtAT|TgxvK6`l^rlka*+=04z(u{oXm@T|v$6t2M$~lVLT3;K@;LS}-TnN)JvQpb@#j`c)dxPQkJ+Z)ypsK5 zn$_-iWRKbNIhb3}8jql7)HPS#8Iq4ql%@v= z&@OqP>dBnp?l&K};Fz=LX_v42`m3^Xpo8PF)>BF@r9j;bDZq)VkNMD75Fy4b)n5!j zJ9F4g#t{d$#JQ5y$SS|hN~5RkQi9FUmHh&VPXZG-JC(+$6gc}+`Nz%>mt=9wqQ0_9 z?+X{3wib@$Gc8D;`2~WQ)6^uq-9q}ie@;L0(H7R;<;*T1hpsSJ0q;WJ0Axumo3m0* zj-CcjNZ8LWtf8h(%6P0si(k;3INtmq2hvbt@~e+mdCnSRRAFui)7*pZ?zV#YIo{9a zKo8)S!O^fi6TLArBAk4ESE0sFY#GMa{yIH8S}_)%3rMLbxh6yJKNF)E>K7HI1lv+-2**mvto5w|HZ!^ z=f5FD1tR@j!AEdbqr%q!8;IcwdaOsZMgT>mPI{y$#vM7Kq% z;u(yq2XE%HTs`n_?WfGq!B4A|@#!C$MU{tg!lG*u}Gvqbub60c#j~< zzCc4IIY4>rKZH8CIC7GsNrb(28B(FD8F$mI_aOdYs^xu>$H)cx8235nzL#h+=qN4t zC`=ZSLWDQ(Ua$8INWN38^*@Tm0qll71ZXU;oT|- zgJC1{i~2$F^zq<+yB{tyHC}ZFR2I~1wk>`0Wo?j;peJ7O=EPES$$Nu=kn8za=oPA% zyG@vI-e;SJ^cyh2!g~DmDdnZjlNO+XV9HKA9SR;-F+dF{u79(gOSLWC-kir9?j^dR zZWNdN!C391C@7vcR?&6Pq>L`eU2Uv*_m8=al3w{sX=fS(ZVb|y)SzQ}v>%BnYg>;q z)LX@X_a0R-Br&BYv;s$qrOr>*d1rk80#tHUzxAAPPR8My&mcET}00n74cS4Jz4j^l~{J)PFtg45JQ;xOpv3ZYNX4B9bdi;p?(@kt1 z*LX^&^DGg+L58HbN10nd@#AlAcn-`$@=v~Pii!w!kAwrHnr1HpA{ioT}#!PMSiJ#~dbhyWd!1k#L zRpZIXPGsRdbfNm@a+jb}eYmczd#ID<19#73HM^!dnVUDqWCr0aYDdO4k^sfd4itdp zY5LeZ0CM^(psEcSUo%!5? 
zUmM@wl8%G)lIj%u^83=TAP3GOQnYoM+9x|#AEYVyC0Wtftmi-x%c9+yF^icKL0f*) z?T-EuS|4}l%_q`F6PW6lw8e)X|Mso1J;@WBJU5>J<{A2dc?RJ6KgP^6sB@)=w*Kz7 zq(Df14%mN)f=@m(D>%`waG$q;_tBA;;wqH*O2!`Q6HAKq?Vg|a`|L36n#*UH7|JL< zCbF?l-Zhb)Pt05goA6~6|GqT$1z$g8AB-eb13(H)JH8K==c)Q-->iF#D6RzmRPOIp zAVeVtEYaF;CZTxMvz{*)&qx^87Txd|beoXPeSV87Sw6>%|9ut-l9aF(ZcfPF&P>ej zD=2=?PG)e-;qqAFI$FrPA}T*{2=^Y?kYn4kAKkoJt$$_p-+Khu|`A zQ8*5(bKK1puR>0hv#Vvzr^929K~%0-HAQ73=QEZTmL0~AJActWDh((Q&VQ<-_N^VM> zqpIej3(nQ}VQy6$-pS(D)W3FBe~7TkJlauBrD7+Zt`GtS3+kpafWZQde#$q)OF+k0 z_0;8+HJ2cXAi;v+pid;y&6vr7MwEh8$pK3yN5ik=DI1C)yJFgi%44#V9a6W6;M`1m z5gyhNny@oFZ8xi0;aGJ+p3LJKDeCf)u4pLS^^H=$jCFD z&6B@wisn!N!xZr>kv>MSS(ftx_LDTY{|+((&z__l6YM@y_6kM_VGLxA?-`G4eM-uEJq(HE)89$<*7HOHfsAy57a`~3f4us$Bf;ShdI#MSc-Kl;|B5h zaZ8~BE+b>QZmSlA3XJcz$M1k)H)y@5UAOyG`DAl;g0PhYQ@uBw9MMv$Fkf}x?q{u; z8eFXY)Hrhr^A2}MjGg3#(8>;}38B-*mH3(#8<)?>VVz#CjY*$YiH3c)bZSDnTq;$y zbZom2?ij|d61Y-%C=VFLu+n^_z`ijt@%5a}l}zP)(<*Xn;#kl4!eel}SjE{bOGr_a zQDj{b>r{V@)%M=(WCu(WxDeoc|95Gluy~KIcJ*A8{PA+!-%z5{^ZTQN`&0FM;`?jo z`=j`P_dY>&bygfbSlwDY5Q+55QhHYB8~5PkLYpbLhpw)Vzb|1UOZx5e?7RL}ec>l{ zIw?4Gi46m!l|xFl&W<9Y*Zaa(utm9!B`iOGAuyIi0tAynJpyX|5RgTl&U)pY& zk@>CgBtPw{bK(>vLFgnyGJ>1OSf4h)dCi9F>6n35hzT%Up=^Y!cJnB#p;^n$s0{k_ zP=9~rdF}jxtI9YBWMN(6TSS5S-fiZm|{KsWL*?NHZW8NP2| zc*^m{-aH`pg}=w$t=!_z`wruisT)Gy!5rO7(9=5($?s1r&U0E)x)%E~crVG6ZN0+^ zTW@5q6AO!Faw5M0aQ8(!kIzKMpA|eWPDv8Vl&?A*zXYHH29PMP(?g*+j_Swlton)r zRo0RGZ}64O{xJOk{n|^k_fSbgcIyeH8O=lkDlkLMdHw);>H-I*$K$&j?7sP^0Q z-6>y=&yYj?c@xv`-*i8=psiuzj*Qr@8`5a;gnrm@*P?Fb0H+vPQ#TW5jlLYxh)}Z+ zllDklHFJ&4kH~M`Cbt@jE_fIOxinn>x+CHh;TEF4W4LhenW1p74UZ^l1G)dzv1)rw zEg_dN$;XPF%+TmIs#{*EoGLw$3)z|VlizbJ9x03)(jIw|1K;TZ|C4+vq+aBCi%_I% z-?S2D1PVAvBbBnF;Fk`QRMhrkrthkSNRVG}?FrKZ1#GHLibS3jzI71VdVlltqfH3Q z_s8W|UgbMyq*P8`_Zv@jN8Oi1v_C%e86I`d70L|~wE|CMsEw)>FwL^Mw4d*tJ`o4B z1>aU%K?5C@IZ>VEPE8Lrp`5_ve0q>38Qjg|(Oikh_ag6F5tGs-EU!WE^$qpJacqlU zD6`^~>)_69Z0XI-nS@xtIIWefM_2)V4eny867-dWsFdhOS}S0um3(V(A4Vn1|JofR 
z?KHcAC-h=WXoKB!!v-5$0hLSP9KNyL^4CMe5=)lB-~7D%&97pD!W{n#AeBuc7FuTf z+!{7M_|s;suotj2{gnkERbN37ne5|Is2A?MdtK{y{-3{wwxsRSD=BqG8q|D*g@382 z1`h4lrly-#YLS@^jup!+4Tt3&s|5WWg9C|pie_PmwdNDY4##XvBC_|bATc=q7&@-awGnc(S=d4^CvMiG@jJ^ z5gnTPTh=zd^ShK46Q)*4igE1BF87~a)dHv~f9=~Rj0S5hjg05Jk0IeX3jxW7ukNpS zpZ~xVmlt4)Y;tOU6N&>hz?|jx_IHjS<^ut{0q1#-x@BQ7ap9vq@`n@=|Fv}n*7utm z1#aA#5vzMT|IiJ8Wy1p;2I09u)jFi`x5SLi<^8M6BS#}D=1KMP=q6aBs%Fm{4$Pa7 zHsMI}Lp`^Y+zn~dU`aazFj?BL2E%iiHY2q2x;;$b6pWJUg|mDHMtSVo9R2^gBz&bx z!{OpEgY5X}@1Bvv2{Y9LvYmWdBmeAx&J6D;`}A*ftj=#3hrPesX5?NlO#Z$}dSW=D zeiECzE$x)|eqic0ZQdgcFBhu)5J-!#KH4bh=7eJaa+Z+3w*|0{u`5~yTM1>{EeDia zC=c#CPC*xDbWa%jCpf0T&iViQ;zlrk->>z@To86C;alWyf@X$yVjggfH9f zMxA^mCwy73e zsR_#i?Fk#I@eO8>F$~VnJDG1jmNM*ocA7EZ;VBVg4HTH z0_eT=WY|0=8s!=o zooDibf9y@-)u%XjrP0uzFsyZ*uJ)r#&l~Y4d|8Zqt;N$kt1Kq{ui$(cqjznjhEvhG zwNw4Ui%!$_cl+{B{~k+!>I>;^MPkgjAg0qz9nW&PDt~A)r8`z9wo z4Obyez&97it;^THDIB)L;YGOq!VF1JI}3ZU&Hs$^JxKFHbzNs{~cW&Q@78EO#)t;dFAH36+vsXqy! zv~bHz&u!C8Y;ZRRbzjrO9??TTRl^BrR=D;hW#A{#aeQ{ceNeQW4)qtP&;+ISXKh+K z{R1SeX_$ddIv{vCq0bOIIai9w=AF)=|WIHqYUFj8*L?==M{!*7=Rs0x$fNtp#w-_dbOKU4PXucvWfZ6B&CF zydAa6%=!DLHzGLJ^z?HGvUS~G=kGE^qJQh&Di-^ELVT1%E-77v~2 zDBK4{So;;X;!#G3Om14oFbrYL8|2x-m{w-oyT8|YJSW1v3A`R2r%^%sj_XMTvvSg9 ztoP)y>0CR{;Y{z*CC6tlTX?d~^KUr^L>OBY)e1?+FJ(o}zVEF)dlcr5Pl!P{nJHEj zWOVk%(O>1M>Em@MF3?()A`%U0*?C6S+BQFOo_aGh{$;)v7(&JU%}U&1MK`U&;D_i% zBx3+gKJ!e^ljRKTU}py|ZQ%x27ud?2^k|^}P$SS6v1GKt0A>~Nfk7iiBOaTd^7vBu zG8P{wgRxw?h1e-=o8nQzubH803*Bcka3!+Mp>#*1E98*HYbVQFPmg}o&W2FG&X3!c zmFs=VZ3xgO9$<3tZ|HgeI&k4+1OMT|Nu}N0`v!x}0AF#d;^}87bOWcfK+9gqv(U^t zHICTYUqU()o2ke3jQ?ImA&nSYqBnua+eb^HNW>_iUT_W}?H-sl+k2XhUZHCDP8qN9 z`VUB}X*0l3e6u7KgtPkt6$y3QCXbx?Uda-E5rnTS)uimk9{2^sitSIAzzp15Q6z%% z=uPeN{xG}oquLyQaWTm1LbqvJoW%Sz5#4a==D?wT;JyK9kZh^Bq5vagq3y>jwCkpS zkU9sWIR8c(CCiJNCfw2Kb7Rn*$gx*-625`8LBLqcbG764Am3ibavqcU?-rhBm=#h^ zf(9D_h7Uaw5DHz_%4W2~`a{#Ou>t0L#n)~|3Ofc`kRR~3il5Z+obdim9_Rz_Fn0HZ zl1i~9y%H~yHTpcP29BE1V<&<0)jU#iiS<`JA0z<=BU!@?g^Q~6m~aNo=ekI)4qZgL 
z#S+}cSh8y?xfJl?g_|AfEsAhcBUU-`m(Swd|5MYr9%s%fUtl7Y(FF;3ojU^u|Wu&t-3LEG;(3t4sm zY{8nYH0*p!Ux$k^54ddQFzB#GDUAW#+#)CD9B`cHWn66@kslPAVCtn%#@KrmBSUIr z0a}kIFKTm&rHoveWsSyK5N7b1pht+PpGad=4EdO|$~ySRKaczC*-YyjQzoqAowdqy zCDq1o-yR{>Fcz<4egS_pEqdb5<*+nnjtoBD&q?_b3rn6fG(S2`rNpuE5c zRpy*wM&=J)mO@2>5w&C#iny3HDjgP@|8HLyeFB`!)82KkHPLPB(2)`mkQ$00(wlTa0a1DrMCnCDflxvM z(t8sr(gY14VgU(FdXXw1y$I4ikRSm;s?wX}?)aU1{PdiEaBuQF!;^%yX74>~zx$n8 z@0!UnGw~MTlIP{tYHPK!j|*r^r=Y zAXLcl8RQ5}M1gcf>GOD67k=oJw*1T;VGV-bm~S@fV{Ib~jBw%q3MjS05RaxTxm19u z1QHS@SLo7YDnW+OrzZ3ISN#;$@5iu+g5pbz%_r~8i)3@}32y^DJ|k}{F^~3BJSQKQ z?JZ7D0dUABkdPu7$IJzTOz3fSWVlDetP0@nXs9K^>8f}w*4NJyn#WU?=q47S=L|VtpH43sTeJ1W9l?t+jTTw;p%oDj3 z^LOUy+d*P`-$q$E+x}#=A-g=iUOS)ijeYE-WUbv%eGi^|?JN03Rh)&m+#8cVeE*G} zBX>fW`f9x=0)l%!3RCv(qO%n1L#JlJsFBf~Nhr_Tx4G~xqUHz5l-B9LQFhA5M5Wpz z@(V;Ug-vzH#+cWC{H1yoJMqE0GA*LmAhTE-97_ytcks&k(D=LgP0RY zFz1ilDF~tr65DdjP8dmHAEpk#YXK?6`l4#h5=v+Bfj>T-kbt9v!r+ydMotER)!tv_ zeerS#9r^LE7;V!VCe40EAf305Wjba$T)Tnb7J)lc2OFGBPZ9d$beSOWqhN+0=K3Sj@sj&}h6(*w~xB}@sM6hR#CSztnwtiby@hGEvbT6g>K zI#3@BP!Ro9h#62X*bTm$jX4sZOhb{_etHiiKu{b=A=lw%04cEa>c}AgtIlTA zN&#R5?mroUU#qYi`{F{{&k{Ru0O_DY(O@#ZxO!Zz#>{X3T)|*EZ^G}N}3GT2w8o(dyvkEWG-Xqa?zK@;j5W*k&>AC?>-2h8v-8K^Rb1~|M z3|`SfbhGIqXLe+SS485w#+3iikR5Ka}8u5%aiQV z?!U1D+!5+W1zSM8IpX23z_+t9wkv`Q*q22nrYVN~QM4@A83`dM9gAiYFTw-C{$ z!k2zLo{c_qd)!|SevZIYYOutB0YZUL0CGHo>*pjJ4zH2GU5-^ zHl8=)dvXMDA?)p;KBs5bIvAtBZ;Am=cP&fgXG+;-AsGbT)QKm4bmY#KG#5Nk6~s~l zbYWYM=YnGt`%TC=>r3p-L_@f0#p}Q$GFbXWr76b8Kt0441eQNb2G9N`4}j`SeiYIl z4vs>A8n@u5~O6_*6#I4A1PoDy5<+%c#-6wwE=E{?Xn_`}qbl ztg63*`Nn)WFZ&KsTZyY@sjxV(-@npEZgGBi->-kahB7?UzVFnqRTMfFPPe65JEEAY zGvy0cLAHhZ-`S}n<%B=3tsdR#0@d}LV>rp^1`$vF2)Rpma^(+j%6m$engID(T>sdWkFxx zkk~Rc*UfDBkhHcE$o92=T()GP_5;zYcE6|W9ERrf6(@^Uk|Nayp`tQk6Y7T1jvcNv z+oXC9X1u}iskZf$u596an!dN9RQA7D`1a9EB!tz}RP-bCx~FJDoyi1SJh)x|V&`xg z7Ib0l?W2qE6^$OMPb2Al>qwbrRG4JlZ)_cL? 
zlM|dRFJ>4z)YOv1TC}>2#tm@}Y`M4cY30Ba#VAv7#BT`|*Y%|Ca~+ z2F+yRr?@sY(Bj4q-zmZ8$l>N+s%SoTqxhQ(ll))s#w$gNhVHMcym7t2%NrS=YN()> z1u3|xzkYZwq0q}Q{${qE{s9^5mii^?#0OXXSM-@xg54ygPi?5XZ5+# ztelvpCD-libhIi?%Z2#`<<6@w3g5e#Ss2&)`Jttn#EYt991eeYxR6~vP;Fxafk5#1 zpPMp4ORE{|v)F(z;5Xm6ledgDH} zrGDf|bbdBladbq_qJiyKZ&GR!cq`85&9ts6#@^2qA}1g3o6lA+)~YDr6AI zG6Mv{2)^-g5l1=NAsucTdm^3O?xMteTwNke_1q`r=^Dh};9I^qgc6rtU25In(K^#p zCfbQK(aLc3J99lFA+kG}=fm68;XYXcX4Kh01nsK}MH5!ejY1$0hJKovF79U#E%uUxdv2sQ@;$aUD zS3|jCgFaT&rwJeCDl%O@p1c@$ScPjZRz`>=j67zGlDfp?a1rK`VJzwvRi)a6Kwx@0 z74P7>d;tw3lCZ03;EJ2%*>aCoH}HUR74Idmt%4BlS*XQ8)mczywe7IJ~4cD zN09{4t&ik4K@ZZTt~BC48JJnl?2p78TxsKf!j(TqefUbY|F2Z55WeW=9Y~)KVm)Z% zyp(SSO!i_>+D_$3Sq{=HUrCEpfiY$Y)6TF}A7Ln_lXQ9S>BUMWOCh1}qeG3jhfBE! z$a=I@T($YMgN4k;iE-z65k={&5Wl2`7ub6a!lv>!nH`?!IA4bk$%LiLz{R(~D4hq3ZbK@l_t09g`AuNabvu# z{&iuG>pGS@7t{M5{YaEYP{|q5t_tuJE7D%~zj5hffSjX#^KdTF`hnx{~j&y^p_GVU0Rgk)^4_{QeU(5pazdy-$^O{*5sx zRIYh`Xxz(Oc)<0(mW9)uF_5NJ0WV96L=C&*_vPQFrlbq$_Kp${g%RX-RWJBTVGQ(7 ze3Hm-&+1ySlxVR$5uEyr7(&KUegn>)o!TO<7T+hs;M%iMH-n*%05E= zX7ZY>*E7-EPE{c>{M(n~1sC~Fe!H~86^nV@gvl$(2v!|-lJ8K|zMg7C(ZBIhZia{L zC0C4=ZesfG2_AX@-6zy2=R64G`6uupA2<5M`({i2_y4-ma=T4}l5tHW`hGTIeMs6+ zwJ~pbZB-zV;*7poYMR>2Q`NE!=}O7$8t9_1W}po}>^z^LhH8|_*{(4}(>G*1f&t}x zd2ocjO~l^3igu)us(i}Qsf__ID--wFVtM4gxzKJ{pF#kgbO#Ug+se?BMaZ)y;|{+KmV@$%#TDTXD>H)F-BE11w0>x6 zb(NOpbE~EXCq;PlrM<`g`auc{$+rTXGiDg#PHU$L#N1*Gx&46JSKvcX^$B-FTaHJW zvf7tp{N6ZPS=pS{WyqybVtv&jPbB;8tka2Mz6%%fPP|X!$ay8Z0sY!;{v_=}urx)X zpno1kvlFZYL7jkH65tHUc^b1AH|L_hbaFTj(k~%B80N{ue=RJy9;yq+*GmXd-xanj z3Cu%H@nw{2Wq#rpzGFY;buB$!GT@lB$&Kr22PP-6wV34@W+qO8ySbAV9=MYif@CU0j-i^;-m-CG#xUYs02VwM|VQ4zLrFb92 z7$pe=Lif)wv_l~NKMH@%yv*c2w@C@Qh;_09DDk@Fd<7*{_$|7#f=EY0l6ym&Y362E z9F1n&NZ%S@A4SL`Cym_?rC5SsLzY}SIg64h2Bk(lSM#aj#6~Wy32j&W@cVxCbQ>L< zQt;NR1kI(tf{tuQ6}%o#@SeVPD&I&HRXv2ZjP-vOS@^;Qox0mR^d(DQ563ahw_DfI zc~B-&BF)KSAHRLo`~3A39A*J8QJeRi;jM7ZxyFZdftTMo z_AtEFcFXck1S)(@#{be^LyNv}St#*z_o=E-700c^W+iT#??o+l%W@Y?9wr!zb>fpD 
z;wIl2C@J1(Zzo>7!qluPD`O9e-d%!U9$gmYN%_F@ILcmOtRIe_5Yq@{TzJ=FOPtiM zFguD;IhZayv#ESP9W|R|H=Bi!CBb@r$#kzT%iN0ImE*Em%%99vv<>K43viSbyYF%@WYKlAxUX1{4M=uevD4?kS zOI&ZQWjR`d-x5h75Wrpkcv+~qyLmdedD>j}ML3|W2mx)V^Sgse6A9ieV5SRjnEE&K zbTB1skPa>=aWTU7Ps7klsp*@*pbi+a{bpEC{*yreZR_ohgx_*^hZFYwU;QwrrHj7; zKF$KGlfU^{2izPG@qc>wvk@T~?+BEZ)=C#o;G?|I_qqE?>WWE=-}Xq0w)vT z&wusu=b6VZMy85U2?+Ex$bKhtGF#i`G8tRaN SF@r#;z&ivSHMtjrPyYk6%g-(V literal 0 HcmV?d00001 From 06642398d457d96bd9021734e92a50a41d2050eb Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Thu, 30 Jun 2022 12:06:13 -0400 Subject: [PATCH 014/126] Formatting --- python/convert_csv_to_maf/README.md | 1 + python/convert_csv_to_maf/csv_to_maf.py | 232 +++++++++++++++++------- 2 files changed, 168 insertions(+), 65 deletions(-) diff --git a/python/convert_csv_to_maf/README.md b/python/convert_csv_to_maf/README.md index 41a9c80..be325d0 100644 --- a/python/convert_csv_to_maf/README.md +++ b/python/convert_csv_to_maf/README.md @@ -30,6 +30,7 @@ python csv_to_maf.py -l /path/to/FileOfFiles.txt ``` where **FileOfFiles.txt** + ```bash > cat FileOfFiles.txt /path/to/Test1.csv diff --git a/python/convert_csv_to_maf/csv_to_maf.py b/python/convert_csv_to_maf/csv_to_maf.py index 64034db..85cf2db 100644 --- a/python/convert_csv_to_maf/csv_to_maf.py +++ b/python/convert_csv_to_maf/csv_to_maf.py @@ -24,8 +24,8 @@ def main( help="File to convert from csv to maf. CSV file generated by Rscript filter_calls.R, Can be given multiple times", ), normal: bool = typer.Option( - False, - "--normal/--keep-normal", + False, + "--normal/--keep-normal", "-n/-N", help="Keep samples tagged as normal", ), @@ -39,13 +39,13 @@ def main( """ Tool does the following operations: - + A. Read one or more files from the inputs - + B. Removes unwanted columns, modifying the column headers depending on the requirements - + C. Massaging the data frame to make it compatible with MAF format - + D. 
Write the data frame to a file in MAF format and Excel format Requirement: @@ -53,15 +53,20 @@ def main( """ if not list_of_files: - typer.secho("File are not provided as file of files.", fg=typer.colors.BRIGHT_YELLOW) + typer.secho( + "File are not provided as file of files.", fg=typer.colors.BRIGHT_YELLOW + ) if not csv: - typer.secho("File were not provided via command line as well", fg=typer.colors.BRIGHT_RED) + typer.secho( + "File were not provided via command line as well", + fg=typer.colors.BRIGHT_RED, + ) raise typer.Abort() # Read file of files if not csv: - csv = [line.strip() for line in open(list_of_files, 'r')] - #print(csv) + csv = [line.strip() for line in open(list_of_files, "r")] + # print(csv) final_df = pd.DataFrame() for csv_file in csv: if Path(csv_file).is_file(): @@ -69,74 +74,171 @@ def main( typer.secho(f"Reading: {csv_file}", fg=typer.colors.BRIGHT_GREEN) csv_df = pd.read_csv(csv_file, sep=",", low_memory=False) # filter csv of "duplex.called columns" - csv_df = csv_df.loc[:, ~csv_df.columns.str.contains('__duplex.called')] + csv_df = csv_df.loc[:, ~csv_df.columns.str.contains("__duplex.called")] # filter csv of "duplex_support_num columns" - csv_df = csv_df.loc[:, ~csv_df.columns.str.contains('duplex_support_num')] + csv_df = csv_df.loc[:, ~csv_df.columns.str.contains("duplex_support_num")] # filter csv of "normal" samples if normal is not wanted - if(not normal): - csv_df = csv_df.loc[:, ~csv_df.columns.str.contains('normal')] + if not normal: + csv_df = csv_df.loc[:, ~csv_df.columns.str.contains("normal")] # filter rows that have call_confidence == "Drop" - csv_df = csv_df[csv_df['call_confidence'].astype(str).str.lower().str.contains("drop",na=False) == False] + csv_df = csv_df[ + csv_df["call_confidence"] + .astype(str) + .str.lower() + .str.contains("drop", na=False) + == False + ] # melt the data frame - melt_csv_df = csv_df.melt(id_vars 
=['Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','HGVSp_Short','Reference_Allele','Tumor_Seq_Allele2','ExAC_AF','Hotspot','DMP','CH','call_confidence'], var_name ='Tumor_Sample_Barcode', value_name ='Evidence') - #fix tumor_sample_barcode - melt_csv_df['Tumor_Sample_Barcode'] = melt_csv_df.Tumor_Sample_Barcode.str.split('___', 1).str.get(0) + melt_csv_df = csv_df.melt( + id_vars=[ + "Hugo_Symbol", + "Chromosome", + "Start_Position", + "End_Position", + "Variant_Classification", + "HGVSp_Short", + "Reference_Allele", + "Tumor_Seq_Allele2", + "ExAC_AF", + "Hotspot", + "DMP", + "CH", + "call_confidence", + ], + var_name="Tumor_Sample_Barcode", + value_name="Evidence", + ) + # fix tumor_sample_barcode + melt_csv_df[ + "Tumor_Sample_Barcode" + ] = melt_csv_df.Tumor_Sample_Barcode.str.split("___", 1).str.get(0) # convert Chromosome to string - melt_csv_df['Chromosome'] = melt_csv_df['Chromosome'].astype(str) + melt_csv_df["Chromosome"] = melt_csv_df["Chromosome"].astype(str) # split Evidence columns into multiple columns - melt_csv_df[['t_alt_count', 't_depth']] = melt_csv_df['Evidence'].str.split('/', 1, expand=True) + melt_csv_df[["t_alt_count", "t_depth"]] = melt_csv_df["Evidence"].str.split( + "/", 1, expand=True + ) # convert t_alt_count to to_numeric - melt_csv_df['t_alt_count'] = melt_csv_df['t_alt_count'].apply(pd.to_numeric, errors='coerce') - #remove variant frequency information - melt_csv_df['t_depth'] = melt_csv_df.t_depth.str.split('(', 1).str.get(0) + melt_csv_df["t_alt_count"] = melt_csv_df["t_alt_count"].apply( + pd.to_numeric, errors="coerce" + ) + # remove variant frequency information + melt_csv_df["t_depth"] = melt_csv_df.t_depth.str.split("(", 1).str.get(0) # convert t_depth to to_numeric - melt_csv_df['t_depth'] = melt_csv_df['t_depth'].apply(pd.to_numeric, errors='coerce') - #calculate t_ref_count - melt_csv_df = melt_csv_df.assign(t_ref_count=melt_csv_df['t_depth'] - melt_csv_df['t_alt_count']) - #calculate 
t_alt_freq - melt_csv_df = melt_csv_df.assign(t_alt_freq=(melt_csv_df['t_alt_count'] / melt_csv_df['t_depth']).round(4)) - #drop Evidence columns - melt_csv_df.drop(columns=['Evidence'], inplace=True) + melt_csv_df["t_depth"] = melt_csv_df["t_depth"].apply( + pd.to_numeric, errors="coerce" + ) + # calculate t_ref_count + melt_csv_df = melt_csv_df.assign( + t_ref_count=melt_csv_df["t_depth"] - melt_csv_df["t_alt_count"] + ) + # calculate t_alt_freq + melt_csv_df = melt_csv_df.assign( + t_alt_freq=(melt_csv_df["t_alt_count"] / melt_csv_df["t_depth"]).round( + 4 + ) + ) + # drop Evidence columns + melt_csv_df.drop(columns=["Evidence"], inplace=True) # add additional columns - melt_csv_df['Entrez_Gene_Id'] = 0 - melt_csv_df['Center'] = 'mskcc.org' - melt_csv_df['NCBI_Build'] = 'GRCh37' - melt_csv_df['Tumor_Seq_Allele1'] = melt_csv_df['Reference_Allele'] - melt_csv_df['Strand'] = '' - melt_csv_df['Consequence'] = '' - melt_csv_df['dbSNP_RS'] = '' - melt_csv_df['dbSNP_Val_Status'] = '' - melt_csv_df['Match_Norm_Seq_Allele1'] = '' - melt_csv_df['Match_Norm_Seq_Allele2'] = '' - melt_csv_df['Tumor_Validation_Allele1'] = '' - melt_csv_df['Tumor_Validation_Allele2'] = '' - melt_csv_df['Match_Norm_Validation_Allele1'] = '' - melt_csv_df['Match_Norm_Validation_Allele2'] = '' - melt_csv_df['Verification_Status'] = '' - melt_csv_df['Validation_Status'] = '' - melt_csv_df['Mutation_Status'] = '' - melt_csv_df['Sequencing_Phase'] = '' - melt_csv_df['Sequence_Source'] = '' - melt_csv_df['Validation_Method'] = '' - melt_csv_df['Score'] = '' - melt_csv_df['BAM_File'] = '' - melt_csv_df['Sequencer'] = '' - melt_csv_df['n_ref_count'] = '' - melt_csv_df['n_alt_count'] = '' - melt_csv_df['HGVSc'] = '' - melt_csv_df['HGVSp'] = '' - melt_csv_df['Transcript_ID'] = '' - melt_csv_df['RefSeq'] = '' - melt_csv_df['Protein_position'] = '' - melt_csv_df['Codons'] = '' - melt_csv_df = melt_csv_df.reindex(columns = 
['Hugo_Symbol','Entrez_Gene_Id','Center','NCBI_Build','Chromosome','Start_Position','End_Position','Strand','Consequence','Variant_Classification','Variant_Type','Reference_Allele','Tumor_Seq_Allele1','Tumor_Seq_Allele2','dbSNP_RS','dbSNP_Val_Status','Tumor_Sample_Barcode','Matched_Norm_Sample_Barcode','Match_Norm_Seq_Allele1','Match_Norm_Seq_Allele2','Tumor_Validation_Allele1','Tumor_Validation_Allele2','Match_Norm_Validation_Allele1','Match_Norm_Validation_Allele2','Verification_Status','Validation_Status','Mutation_Status','Sequencing_Phase','Sequence_Source','Validation_Method','Score','BAM_File','Sequencer','t_depth','t_ref_count','t_alt_count','t_alt_freq','n_ref_count','n_alt_count','HGVSc','HGVSp','HGVSp_Short','Transcript_ID','RefSeq','Protein_position','Codons','Hotspot','DMP','CH','call_confidence','ExAC_AF']) + melt_csv_df["Entrez_Gene_Id"] = 0 + melt_csv_df["Center"] = "mskcc.org" + melt_csv_df["NCBI_Build"] = "GRCh37" + melt_csv_df["Tumor_Seq_Allele1"] = melt_csv_df["Reference_Allele"] + melt_csv_df["Strand"] = "" + melt_csv_df["Consequence"] = "" + melt_csv_df["dbSNP_RS"] = "" + melt_csv_df["dbSNP_Val_Status"] = "" + melt_csv_df["Match_Norm_Seq_Allele1"] = "" + melt_csv_df["Match_Norm_Seq_Allele2"] = "" + melt_csv_df["Tumor_Validation_Allele1"] = "" + melt_csv_df["Tumor_Validation_Allele2"] = "" + melt_csv_df["Match_Norm_Validation_Allele1"] = "" + melt_csv_df["Match_Norm_Validation_Allele2"] = "" + melt_csv_df["Verification_Status"] = "" + melt_csv_df["Validation_Status"] = "" + melt_csv_df["Mutation_Status"] = "" + melt_csv_df["Sequencing_Phase"] = "" + melt_csv_df["Sequence_Source"] = "" + melt_csv_df["Validation_Method"] = "" + melt_csv_df["Score"] = "" + melt_csv_df["BAM_File"] = "" + melt_csv_df["Sequencer"] = "" + melt_csv_df["n_ref_count"] = "" + melt_csv_df["n_alt_count"] = "" + melt_csv_df["HGVSc"] = "" + melt_csv_df["HGVSp"] = "" + melt_csv_df["Transcript_ID"] = "" + melt_csv_df["RefSeq"] = "" + melt_csv_df["Protein_position"] = "" + 
melt_csv_df["Codons"] = "" + melt_csv_df = melt_csv_df.reindex( + columns=[ + "Hugo_Symbol", + "Entrez_Gene_Id", + "Center", + "NCBI_Build", + "Chromosome", + "Start_Position", + "End_Position", + "Strand", + "Consequence", + "Variant_Classification", + "Variant_Type", + "Reference_Allele", + "Tumor_Seq_Allele1", + "Tumor_Seq_Allele2", + "dbSNP_RS", + "dbSNP_Val_Status", + "Tumor_Sample_Barcode", + "Matched_Norm_Sample_Barcode", + "Match_Norm_Seq_Allele1", + "Match_Norm_Seq_Allele2", + "Tumor_Validation_Allele1", + "Tumor_Validation_Allele2", + "Match_Norm_Validation_Allele1", + "Match_Norm_Validation_Allele2", + "Verification_Status", + "Validation_Status", + "Mutation_Status", + "Sequencing_Phase", + "Sequence_Source", + "Validation_Method", + "Score", + "BAM_File", + "Sequencer", + "t_depth", + "t_ref_count", + "t_alt_count", + "t_alt_freq", + "n_ref_count", + "n_alt_count", + "HGVSc", + "HGVSp", + "HGVSp_Short", + "Transcript_ID", + "RefSeq", + "Protein_position", + "Codons", + "Hotspot", + "DMP", + "CH", + "call_confidence", + "ExAC_AF", + ] + ) final_df = final_df.append(melt_csv_df, ignore_index=True) else: typer.secho(f"{csv_file} file does not exists", fg=typer.colors.BRIGHT_RED) raise typer.Abort() - #write final_df to tsv - typer.secho(f"Done processing the CSV file writing output to {output_file_prefix} in txt and excel format", fg=typer.colors.GREEN) - final_df.to_csv(f"{output_file_prefix}.maf", index=False, sep='\t') + # write final_df to tsv + typer.secho( + f"Done processing the CSV file writing output to {output_file_prefix} in txt and excel format", + fg=typer.colors.GREEN, + ) + final_df.to_csv(f"{output_file_prefix}.maf", index=False, sep="\t") final_df.to_excel(f"{output_file_prefix}.xlsx", index=False) + + if __name__ == "__main__": typer.run(main) From ff2e4b66584d7b888d86a53cccf006ce257e70b4 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Mon, 12 Sep 2022 22:58:21 -0400 Subject: [PATCH 015/126] Update csv_to_maf.py --- 
python/convert_csv_to_maf/csv_to_maf.py | 282 ++++++++++++------------ 1 file changed, 144 insertions(+), 138 deletions(-) diff --git a/python/convert_csv_to_maf/csv_to_maf.py b/python/convert_csv_to_maf/csv_to_maf.py index 85cf2db..0efbc39 100644 --- a/python/convert_csv_to_maf/csv_to_maf.py +++ b/python/convert_csv_to_maf/csv_to_maf.py @@ -88,146 +88,152 @@ def main( .str.contains("drop", na=False) == False ] + is_csv_df_empty = csv_df.empty # melt the data frame - melt_csv_df = csv_df.melt( - id_vars=[ - "Hugo_Symbol", - "Chromosome", - "Start_Position", - "End_Position", - "Variant_Classification", - "HGVSp_Short", - "Reference_Allele", - "Tumor_Seq_Allele2", - "ExAC_AF", - "Hotspot", - "DMP", - "CH", - "call_confidence", - ], - var_name="Tumor_Sample_Barcode", - value_name="Evidence", - ) - # fix tumor_sample_barcode - melt_csv_df[ - "Tumor_Sample_Barcode" - ] = melt_csv_df.Tumor_Sample_Barcode.str.split("___", 1).str.get(0) - # convert Chromosome to string - melt_csv_df["Chromosome"] = melt_csv_df["Chromosome"].astype(str) - # split Evidence columns into multiple columns - melt_csv_df[["t_alt_count", "t_depth"]] = melt_csv_df["Evidence"].str.split( - "/", 1, expand=True - ) - # convert t_alt_count to to_numeric - melt_csv_df["t_alt_count"] = melt_csv_df["t_alt_count"].apply( - pd.to_numeric, errors="coerce" - ) - # remove variant frequency information - melt_csv_df["t_depth"] = melt_csv_df.t_depth.str.split("(", 1).str.get(0) - # convert t_depth to to_numeric - melt_csv_df["t_depth"] = melt_csv_df["t_depth"].apply( - pd.to_numeric, errors="coerce" - ) - # calculate t_ref_count - melt_csv_df = melt_csv_df.assign( - t_ref_count=melt_csv_df["t_depth"] - melt_csv_df["t_alt_count"] - ) - # calculate t_alt_freq - melt_csv_df = melt_csv_df.assign( - t_alt_freq=(melt_csv_df["t_alt_count"] / melt_csv_df["t_depth"]).round( - 4 + if is_csv_df_empty == False: + melt_csv_df = csv_df.melt( + id_vars=[ + "Hugo_Symbol", + "Chromosome", + "Start_Position", + "End_Position", + 
"Variant_Classification", + "HGVSp_Short", + "Reference_Allele", + "Tumor_Seq_Allele2", + "ExAC_AF", + "Hotspot", + "DMP", + "CH", + "call_confidence", + ], + var_name="Tumor_Sample_Barcode", + value_name="Evidence", ) - ) - # drop Evidence columns - melt_csv_df.drop(columns=["Evidence"], inplace=True) - # add additional columns - melt_csv_df["Entrez_Gene_Id"] = 0 - melt_csv_df["Center"] = "mskcc.org" - melt_csv_df["NCBI_Build"] = "GRCh37" - melt_csv_df["Tumor_Seq_Allele1"] = melt_csv_df["Reference_Allele"] - melt_csv_df["Strand"] = "" - melt_csv_df["Consequence"] = "" - melt_csv_df["dbSNP_RS"] = "" - melt_csv_df["dbSNP_Val_Status"] = "" - melt_csv_df["Match_Norm_Seq_Allele1"] = "" - melt_csv_df["Match_Norm_Seq_Allele2"] = "" - melt_csv_df["Tumor_Validation_Allele1"] = "" - melt_csv_df["Tumor_Validation_Allele2"] = "" - melt_csv_df["Match_Norm_Validation_Allele1"] = "" - melt_csv_df["Match_Norm_Validation_Allele2"] = "" - melt_csv_df["Verification_Status"] = "" - melt_csv_df["Validation_Status"] = "" - melt_csv_df["Mutation_Status"] = "" - melt_csv_df["Sequencing_Phase"] = "" - melt_csv_df["Sequence_Source"] = "" - melt_csv_df["Validation_Method"] = "" - melt_csv_df["Score"] = "" - melt_csv_df["BAM_File"] = "" - melt_csv_df["Sequencer"] = "" - melt_csv_df["n_ref_count"] = "" - melt_csv_df["n_alt_count"] = "" - melt_csv_df["HGVSc"] = "" - melt_csv_df["HGVSp"] = "" - melt_csv_df["Transcript_ID"] = "" - melt_csv_df["RefSeq"] = "" - melt_csv_df["Protein_position"] = "" - melt_csv_df["Codons"] = "" - melt_csv_df = melt_csv_df.reindex( - columns=[ - "Hugo_Symbol", - "Entrez_Gene_Id", - "Center", - "NCBI_Build", - "Chromosome", - "Start_Position", - "End_Position", - "Strand", - "Consequence", - "Variant_Classification", - "Variant_Type", - "Reference_Allele", - "Tumor_Seq_Allele1", - "Tumor_Seq_Allele2", - "dbSNP_RS", - "dbSNP_Val_Status", - "Tumor_Sample_Barcode", - "Matched_Norm_Sample_Barcode", - "Match_Norm_Seq_Allele1", - "Match_Norm_Seq_Allele2", - 
"Tumor_Validation_Allele1", - "Tumor_Validation_Allele2", - "Match_Norm_Validation_Allele1", - "Match_Norm_Validation_Allele2", - "Verification_Status", - "Validation_Status", - "Mutation_Status", - "Sequencing_Phase", - "Sequence_Source", - "Validation_Method", - "Score", - "BAM_File", - "Sequencer", - "t_depth", - "t_ref_count", - "t_alt_count", - "t_alt_freq", - "n_ref_count", - "n_alt_count", - "HGVSc", - "HGVSp", - "HGVSp_Short", - "Transcript_ID", - "RefSeq", - "Protein_position", - "Codons", - "Hotspot", - "DMP", - "CH", - "call_confidence", - "ExAC_AF", - ] - ) - final_df = final_df.append(melt_csv_df, ignore_index=True) + # fix tumor_sample_barcode + melt_csv_df[ + "Tumor_Sample_Barcode" + ] = melt_csv_df.Tumor_Sample_Barcode.str.split("___", 1).str.get(0) + # convert Chromosome to string + melt_csv_df["Chromosome"] = melt_csv_df["Chromosome"].astype(str) + # split Evidence columns into multiple columns + melt_csv_df[["t_alt_count", "t_depth"]] = melt_csv_df[ + "Evidence" + ].str.split("/", 1, expand=True) + # convert t_alt_count to to_numeric + melt_csv_df["t_alt_count"] = melt_csv_df["t_alt_count"].apply( + pd.to_numeric, errors="coerce" + ) + # remove variant frequency information + melt_csv_df["t_depth"] = melt_csv_df.t_depth.str.split("(", 1).str.get( + 0 + ) + # convert t_depth to to_numeric + melt_csv_df["t_depth"] = melt_csv_df["t_depth"].apply( + pd.to_numeric, errors="coerce" + ) + # calculate t_ref_count + melt_csv_df = melt_csv_df.assign( + t_ref_count=melt_csv_df["t_depth"] - melt_csv_df["t_alt_count"] + ) + # calculate t_alt_freq + melt_csv_df = melt_csv_df.assign( + t_alt_freq=( + melt_csv_df["t_alt_count"] / melt_csv_df["t_depth"] + ).round(4) + ) + # drop Evidence columns + melt_csv_df.drop(columns=["Evidence"], inplace=True) + # add additional columns + melt_csv_df["Entrez_Gene_Id"] = 0 + melt_csv_df["Center"] = "mskcc.org" + melt_csv_df["NCBI_Build"] = "GRCh37" + melt_csv_df["Tumor_Seq_Allele1"] = melt_csv_df["Reference_Allele"] + 
melt_csv_df["Strand"] = "" + melt_csv_df["Consequence"] = "" + melt_csv_df["dbSNP_RS"] = "" + melt_csv_df["dbSNP_Val_Status"] = "" + melt_csv_df["Match_Norm_Seq_Allele1"] = "" + melt_csv_df["Match_Norm_Seq_Allele2"] = "" + melt_csv_df["Tumor_Validation_Allele1"] = "" + melt_csv_df["Tumor_Validation_Allele2"] = "" + melt_csv_df["Match_Norm_Validation_Allele1"] = "" + melt_csv_df["Match_Norm_Validation_Allele2"] = "" + melt_csv_df["Verification_Status"] = "" + melt_csv_df["Validation_Status"] = "" + melt_csv_df["Mutation_Status"] = "" + melt_csv_df["Sequencing_Phase"] = "" + melt_csv_df["Sequence_Source"] = "" + melt_csv_df["Validation_Method"] = "" + melt_csv_df["Score"] = "" + melt_csv_df["BAM_File"] = "" + melt_csv_df["Sequencer"] = "" + melt_csv_df["n_ref_count"] = "" + melt_csv_df["n_alt_count"] = "" + melt_csv_df["HGVSc"] = "" + melt_csv_df["HGVSp"] = "" + melt_csv_df["Transcript_ID"] = "" + melt_csv_df["RefSeq"] = "" + melt_csv_df["Protein_position"] = "" + melt_csv_df["Codons"] = "" + melt_csv_df = melt_csv_df.reindex( + columns=[ + "Hugo_Symbol", + "Entrez_Gene_Id", + "Center", + "NCBI_Build", + "Chromosome", + "Start_Position", + "End_Position", + "Strand", + "Consequence", + "Variant_Classification", + "Variant_Type", + "Reference_Allele", + "Tumor_Seq_Allele1", + "Tumor_Seq_Allele2", + "dbSNP_RS", + "dbSNP_Val_Status", + "Tumor_Sample_Barcode", + "Matched_Norm_Sample_Barcode", + "Match_Norm_Seq_Allele1", + "Match_Norm_Seq_Allele2", + "Tumor_Validation_Allele1", + "Tumor_Validation_Allele2", + "Match_Norm_Validation_Allele1", + "Match_Norm_Validation_Allele2", + "Verification_Status", + "Validation_Status", + "Mutation_Status", + "Sequencing_Phase", + "Sequence_Source", + "Validation_Method", + "Score", + "BAM_File", + "Sequencer", + "t_depth", + "t_ref_count", + "t_alt_count", + "t_alt_freq", + "n_ref_count", + "n_alt_count", + "HGVSc", + "HGVSp", + "HGVSp_Short", + "Transcript_ID", + "RefSeq", + "Protein_position", + "Codons", + "Hotspot", + "DMP", + 
"CH", + "call_confidence", + "ExAC_AF", + ] + ) + final_df = final_df.append(melt_csv_df, ignore_index=True) + else: + continue else: typer.secho(f"{csv_file} file does not exists", fg=typer.colors.BRIGHT_RED) raise typer.Abort() From 5e4e3b7369f0a9c9bdbb5097497d35185e6d7998 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Mon, 19 Sep 2022 13:32:34 -0400 Subject: [PATCH 016/126] Update SV_incorporation.R --- R/SV_incorporation.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/SV_incorporation.R b/R/SV_incorporation.R index 0658840..c482b3a 100644 --- a/R/SV_incorporation.R +++ b/R/SV_incorporation.R @@ -18,10 +18,10 @@ SV_incorporation = function( # criteria <- 'stringent' # # DMP fusion calls -------------------------------------------------------- - DMP.fusion <- fread(paste0(dmp.dir,'/data_SV.txt')) %>% - transmute(DMP_SAMPLE_ID = SampleId,EventType = Sv_Class_Name,Gene1 = Site1_Gene,Gene2 = Site2_Gene, - Chr1 = Site1_Chrom,Chr2 = Site2_Chrom,Pos1 = Site1_Pos,Pos2 = Site2_Pos,PairedReadCount = Paired_End_Read_Support, - SplitReadCount = Split_Read_Support,TumorReadCount = Tumor_Read_Count,EventInfo = Annotation) %>% data.table() + DMP.fusion <- fread(paste0(dmp.dir,'/data_sv.txt')) %>% + transmute(DMP_SAMPLE_ID = Sample_ID,EventType = Class,Gene1 = Site1_Hugo_Symbol,Gene2 = Site2_Hugo_Symbol, + Chr1 = Site1_Chromosome,Chr2 = Site2_Chromosome,Pos1 = Site1_Position,Pos2 = Site2_Position,PairedReadCount = Tumor_Paired_End_Read_Count, + SplitReadCount = Tumor_Split_Read_Count,TumorReadCount = Tumor_Read_Count,EventInfo = Event_Info) %>% data.table() # execution --------------------------------------------------------------- From 7a2e89f9dceba51972ab01073cf0ee96c9a3a905 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Fri, 23 Sep 2022 23:09:11 -0400 Subject: [PATCH 017/126] Update convert_dates_to_days.py --- .../convert_dates_to_days.py | 139 ++++++++++++------ 1 file changed, 94 insertions(+), 45 deletions(-) diff --git 
a/python/convert_dates_to_days/convert_dates_to_days.py b/python/convert_dates_to_days/convert_dates_to_days.py index 25d84fd..c498beb 100644 --- a/python/convert_dates_to_days/convert_dates_to_days.py +++ b/python/convert_dates_to_days/convert_dates_to_days.py @@ -5,17 +5,26 @@ import arrow from datetime import datetime + def validate_date(date_string): - date_format = ['MM/DD/YY','M/D/YY','MM/D/YY','M/DD/YY','MM/DD/YYYY','YYYY/MM/DD'] + date_format = [ + "MM/DD/YY", + "M/D/YY", + "MM/D/YY", + "M/DD/YY", + "MM/DD/YYYY", + "YYYY/MM/DD", + "YYYY-MM-DD", + ] for fmt in date_format: try: - date_obj = arrow.get(date_string, fmt).date() - return date_obj + return arrow.get(date_string, fmt).date() except ValueError: pass except: print("Something else went wrong") - raise ValueError('no valid date format found') + raise ValueError("no valid date format found") + def main( input: Path = typer.Option( @@ -34,76 +43,116 @@ def main( "C1D1", "--timepoint1", "-t1", - help="Column name which has timpoint information to use the baseline date, first preference", - ), + help="timepoint name which in the timepoint column to use for the baseline date, first preference", + ), timepoint_label_for_baseline_second: str = typer.Option( "", "--timepoint2", "-t2", - help="Column name which has timpoint information to use the baseline date, second preference", - ), - + help="timepoint name which in the timepoint column to use for the baseline date, second preference", + ), + timepoint_label_for_baseline_third: str = typer.Option( + "", + "--timepoint3", + "-t3", + help="timepoint name which in the timepoint column to use for the baseline date, third preference", + ), output_file: str = typer.Option( - "output.txt", - "--output", + "output.txt", + "--output", "-o", help="Name of the output file", - ), ): - - ''' + + """ Tool to do the following operations: A. 
Reads meta data file, and based on the timepoint information given convert them to days for a samples belonging to a given patient_id B. Supports following date formats: 'MM/DD/YY','M/D/YY','MM/D/YY','M/DD/YY','MM/DD/YYYY','YYYY/MM/DD' - + Requirement: pandas; typer; arrow - ''' + """ - #Read input file - i_df = pd.read_csv(input,sep='\t',comment='#',low_memory=False) - #group by cmo_patient_id - grouped = i_df.groupby('cmo_patient_id') + # Read input file + i_df = pd.read_csv(input, sep="\t", comment="#", low_memory=False) + # group by cmo_patient_id + grouped = i_df.groupby("cmo_patient_id") keys = grouped.groups.keys() df_list = [] - #tarverse via cmo_patient_id to get associated samples + # tarverse via cmo_patient_id to get associated samples for i in keys: t_df = pd.DataFrame() t_df = grouped.get_group(i) baseline_date = None + # Get the baseline date - try: - baseline_date = t_df.loc[t_df['timepoint'] == timepoint_label_for_baseline_first, 'collection_date'].iloc[0] + if len(t_df) > 1: + try: + baseline_date = t_df.loc[ + t_df["timepoint"] == timepoint_label_for_baseline_first, + "collection_date", + ].iloc[0] + baseline_date = validate_date(baseline_date) + except IndexError: + print( + i, + "patient does not have first preference timepoint:", + timepoint_label_for_baseline_first, + ) + print( + "We will try to use second timepoint if available to use as baseline\n" + ) + if timepoint_label_for_baseline_second: + try: + baseline_date = t_df.loc[ + t_df["timepoint"] == timepoint_label_for_baseline_second, + "collection_date", + ].iloc[0] + baseline_date = validate_date(baseline_date) + except IndexError as e: + print( + i, + "patient does not have second preference timepoint:", + timepoint_label_for_baseline_second, + "\n", + ) + print(e) + if timepoint_label_for_baseline_third: + try: + baseline_date = t_df.loc[ + t_df["timepoint"] + == timepoint_label_for_baseline_third, + "collection_date", + ].iloc[0] + baseline_date = validate_date(baseline_date) + 
except IndexError as e: + print( + i, + "patient does not have third preference timepoint:", + timepoint_label_for_baseline_third, + "\n", + ) + print(e) + exit(1) + else: + baseline_date = str(t_df["collection_date"]) baseline_date = validate_date(baseline_date) - except IndexError: - print(i ,"patient does not have first preference timepoint:", timepoint_label_for_baseline_first) - print ("We will try to use second timepoint if available to use as baseline\n") - if timepoint_label_for_baseline_second: - try: - baseline_date = t_df.loc[t_df['timepoint'] == timepoint_label_for_baseline_second, 'collection_date'].iloc[0] - baseline_date = validate_date(baseline_date) - except IndexError as e: - print(i ,"patient does not have second preference timepoint:", timepoint_label_for_baseline_second,"\n") - print(e) - except: - print("Something else went wrong") - except: - print("Something else went wrong") - #convert to days + # convert to days days_list = [] - for a, b in zip(t_df['collection_date'], t_df['timepoint']): + for a, b in zip(t_df["collection_date"], t_df["timepoint"]): fmt_date = validate_date(a) delta = fmt_date - baseline_date days_list.append(delta.days) - #make list of modified dataframes + # make list of modified dataframes t_df_copy = t_df.copy(deep=True) - t_df_copy['collection_in_days'] = days_list + t_df_copy["collection_in_days"] = days_list df_list.append(t_df_copy) - #merge and write the dataframe - results = pd.concat(df_list, axis=0, join='outer') - results.to_csv(output_file, sep='\t', index=False) + # merge and write the dataframe + results = pd.concat(df_list, axis=0, join="outer") + results.to_csv(output_file, sep="\t", index=False) + if __name__ == "__main__": - typer.run(main) \ No newline at end of file + typer.run(main) From 44248757c9a1b3ae1f90577fe10d087cde14fd9f Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 08:09:42 -0400 Subject: [PATCH 018/126] adding additional functionality --- reports/create_report.R | 12 
+++++++++--- reports/template_days.Rmd | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/reports/create_report.R b/reports/create_report.R index 8a46dcc..0ec7bc1 100644 --- a/reports/create_report.R +++ b/reports/create_report.R @@ -16,9 +16,11 @@ parser$add_argument("-m", "--metadata", required=T, help="Path to file containin parser$add_argument("-d", "--dmp-id", help="DMP patient ID (optional).") parser$add_argument("-ds", "--dmp-sample-id", help="DMP sample ID (optional).") parser$add_argument("-dm", "--dmp-maf", help="Path to DMP MAF file (optional).") -parser$add_argument("-o", "--output", help="Output file") +parser$add_argument("-o", "--output", help="Output file with .html extension") parser$add_argument( - "-ca", "--combine-access", help="Don't splite VAF plots by clonality.", action="store_true") + "-md", "--keep-rmarkdown", help="Dont make tmp file for markdown, keep it in the same directory", action="store_true") +parser$add_argument( + "-ca", "--combine-access", help="Don't split VAF plots by clonality.", action="store_true") parser$add_argument( "-pi", "--plot-impact", help="Also plot VAFs from IMPACT samples.", action="store_true") @@ -43,8 +45,12 @@ input_text <- knitr::knit_expand( COMBINE_ACCESS=args$combine_access, PLOT_IMPACT=args$plot_impact ) - +if(arg$keep_rmarkdown) { + file = gsub(".html",".Rmd",args$output_file) +} +else { tmp <- tempfile(fileext = ".Rmd") +} cat(input_text, file = tmp) rmarkdown::render( diff --git a/reports/template_days.Rmd b/reports/template_days.Rmd index 9cd3dd3..b2ebb40 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -404,7 +404,7 @@ if (nrow(clonal)>0) { clonaltoplot$vaf<-round(clonaltoplot$vaf,4) clonaltoplot$adjustedvaf<-round(clonaltoplot$adjustedvaf,4) } - write.csv(clonaltoplot, file_path, sep = "\t", quote = F, row.names = F) + write_tsv(clonaltoplot, file_path, quote = F, row.names = F) fig2<-vaf_plot(clonaltoplot, xlimits, xbreaks, xlabels, varcolors, 
yaccuracy=0.01, log=FALSE, cnadjusted = TRUE) subplot(fig1,fig2,nrows=2,shareX=TRUE, heights=c(0.2,0.8)) } From a687e8357e4cfb2a00f16a8e0a2b3fe046e4f022 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 08:10:51 -0400 Subject: [PATCH 019/126] Update create_report.R --- reports/create_report.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reports/create_report.R b/reports/create_report.R index 0ec7bc1..a64daff 100644 --- a/reports/create_report.R +++ b/reports/create_report.R @@ -45,7 +45,7 @@ input_text <- knitr::knit_expand( COMBINE_ACCESS=args$combine_access, PLOT_IMPACT=args$plot_impact ) -if(arg$keep_rmarkdown) { +if(args$keep_rmarkdown) { file = gsub(".html",".Rmd",args$output_file) } else { From 85850803d19a705bbe0c268703031453b0695551 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 08:12:13 -0400 Subject: [PATCH 020/126] Update create_report.R --- reports/create_report.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reports/create_report.R b/reports/create_report.R index a64daff..b35bd71 100644 --- a/reports/create_report.R +++ b/reports/create_report.R @@ -46,7 +46,7 @@ input_text <- knitr::knit_expand( PLOT_IMPACT=args$plot_impact ) if(args$keep_rmarkdown) { - file = gsub(".html",".Rmd",args$output_file) + tmp <- gsub(".html",".Rmd",args$output_file) } else { tmp <- tempfile(fileext = ".Rmd") From 13cb07032c54bd503a9100b9f89580c9d2601682 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 08:12:30 -0400 Subject: [PATCH 021/126] Update create_report.R --- reports/create_report.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reports/create_report.R b/reports/create_report.R index b35bd71..56925bd 100644 --- a/reports/create_report.R +++ b/reports/create_report.R @@ -49,7 +49,7 @@ if(args$keep_rmarkdown) { tmp <- gsub(".html",".Rmd",args$output_file) } else { -tmp <- tempfile(fileext = ".Rmd") + tmp <- tempfile(fileext = ".Rmd") } cat(input_text, file = 
tmp) From 92e246c8dd732f59a303b6a1b34de0ef6619780f Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 08:15:09 -0400 Subject: [PATCH 022/126] Update create_report.R --- reports/create_report.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/reports/create_report.R b/reports/create_report.R index 56925bd..f1cc6b4 100644 --- a/reports/create_report.R +++ b/reports/create_report.R @@ -45,12 +45,13 @@ input_text <- knitr::knit_expand( COMBINE_ACCESS=args$combine_access, PLOT_IMPACT=args$plot_impact ) +tmp <- NULL if(args$keep_rmarkdown) { tmp <- gsub(".html",".Rmd",args$output_file) -} -else { +} else { tmp <- tempfile(fileext = ".Rmd") } + cat(input_text, file = tmp) rmarkdown::render( From a7faba53cdf3a5c03f2e3fb3b0f14272e1f1e1af Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 08:16:56 -0400 Subject: [PATCH 023/126] Update template_days.Rmd --- reports/template_days.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reports/template_days.Rmd b/reports/template_days.Rmd index b2ebb40..db0ccfa 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -404,7 +404,7 @@ if (nrow(clonal)>0) { clonaltoplot$vaf<-round(clonaltoplot$vaf,4) clonaltoplot$adjustedvaf<-round(clonaltoplot$adjustedvaf,4) } - write_tsv(clonaltoplot, file_path, quote = F, row.names = F) + write.table(clonaltoplot, file_path, sep="\t", quote = F, row.names = F) fig2<-vaf_plot(clonaltoplot, xlimits, xbreaks, xlabels, varcolors, yaccuracy=0.01, log=FALSE, cnadjusted = TRUE) subplot(fig1,fig2,nrows=2,shareX=TRUE, heights=c(0.2,0.8)) } From 6d8cfe0d360d3429fa85e860364957b46381bf82 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 14:32:24 -0400 Subject: [PATCH 024/126] Update create_report.R --- reports/create_report.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reports/create_report.R b/reports/create_report.R index f1cc6b4..d2ae282 100644 --- a/reports/create_report.R +++ 
b/reports/create_report.R @@ -47,7 +47,7 @@ input_text <- knitr::knit_expand( ) tmp <- NULL if(args$keep_rmarkdown) { - tmp <- gsub(".html",".Rmd",args$output_file) + tmp <- gsub("\.html","\.Rmd",args$output_file) } else { tmp <- tempfile(fileext = ".Rmd") } From 39ce5d5852d2cf4d6565de76ca3269946d92772e Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 14:33:38 -0400 Subject: [PATCH 025/126] Update create_report.R --- reports/create_report.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reports/create_report.R b/reports/create_report.R index d2ae282..dffdb29 100644 --- a/reports/create_report.R +++ b/reports/create_report.R @@ -47,7 +47,7 @@ input_text <- knitr::knit_expand( ) tmp <- NULL if(args$keep_rmarkdown) { - tmp <- gsub("\.html","\.Rmd",args$output_file) + tmp <- gsub("html","Rmd",args$output_file) } else { tmp <- tempfile(fileext = ".Rmd") } From 4fe3ca1f252fd7d7b5f2f851f8f2961a20866416 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 14:45:29 -0400 Subject: [PATCH 026/126] Update create_report.R --- reports/create_report.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/reports/create_report.R b/reports/create_report.R index dffdb29..fcad7d4 100644 --- a/reports/create_report.R +++ b/reports/create_report.R @@ -45,15 +45,15 @@ input_text <- knitr::knit_expand( COMBINE_ACCESS=args$combine_access, PLOT_IMPACT=args$plot_impact ) -tmp <- NULL -if(args$keep_rmarkdown) { - tmp <- gsub("html","Rmd",args$output_file) -} else { - tmp <- tempfile(fileext = ".Rmd") -} +rmd_name <- gsub(".html","", args$output_file) + +tmp <- tempfile(rmd_name,fileext = ".Rmd") cat(input_text, file = tmp) +if (args$keep_rmarkdown){ + file.copy(tmp,cwd) +} rmarkdown::render( tmp, output_format = "html_document", From 99fee6bcc4e10fc870bc9b987081e273a91fb448 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 14:49:55 -0400 Subject: [PATCH 027/126] Update create_report.R --- 
reports/create_report.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/reports/create_report.R b/reports/create_report.R index fcad7d4..d805072 100644 --- a/reports/create_report.R +++ b/reports/create_report.R @@ -46,13 +46,14 @@ input_text <- knitr::knit_expand( PLOT_IMPACT=args$plot_impact ) -rmd_name <- gsub(".html","", args$output_file) - tmp <- tempfile(rmd_name,fileext = ".Rmd") cat(input_text, file = tmp) if (args$keep_rmarkdown){ - file.copy(tmp,cwd) + rmd_name <- gsub(".html",".Rmd", args$output_file) + output_cwd <- getwd() + output_rmd_path <- paste(output_cwd,"/",rmd_name) + file.copy(tmp,output_rmd_path) } rmarkdown::render( tmp, From 9dd99d18b3e51ee5579d6495601286bf1746cf93 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 14:52:02 -0400 Subject: [PATCH 028/126] Update create_report.R --- reports/create_report.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reports/create_report.R b/reports/create_report.R index d805072..bde7aa4 100644 --- a/reports/create_report.R +++ b/reports/create_report.R @@ -46,7 +46,7 @@ input_text <- knitr::knit_expand( PLOT_IMPACT=args$plot_impact ) -tmp <- tempfile(rmd_name,fileext = ".Rmd") +tmp <- tempfile(fileext = ".Rmd") cat(input_text, file = tmp) if (args$keep_rmarkdown){ From ab767f1f6f6ab0d44e8574e48bcf1ad1bc54ae9a Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 14:56:13 -0400 Subject: [PATCH 029/126] Update create_report.R --- reports/create_report.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reports/create_report.R b/reports/create_report.R index bde7aa4..3adca2a 100644 --- a/reports/create_report.R +++ b/reports/create_report.R @@ -50,8 +50,8 @@ tmp <- tempfile(fileext = ".Rmd") cat(input_text, file = tmp) if (args$keep_rmarkdown){ - rmd_name <- gsub(".html",".Rmd", args$output_file) - output_cwd <- getwd() + rmd_name <- gsub(".html",".Rmd", args$output) + output_cwd <- normalizePath(dirname(args$output)) 
output_rmd_path <- paste(output_cwd,"/",rmd_name) file.copy(tmp,output_rmd_path) } From a8a3bbd811340682300984906b799c8d28813914 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 14:58:12 -0400 Subject: [PATCH 030/126] Update create_report.R --- reports/create_report.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reports/create_report.R b/reports/create_report.R index 3adca2a..c055654 100644 --- a/reports/create_report.R +++ b/reports/create_report.R @@ -52,7 +52,7 @@ cat(input_text, file = tmp) if (args$keep_rmarkdown){ rmd_name <- gsub(".html",".Rmd", args$output) output_cwd <- normalizePath(dirname(args$output)) - output_rmd_path <- paste(output_cwd,"/",rmd_name) + output_rmd_path <- paste(output_cwd,"/",rmd_name, sep='') file.copy(tmp,output_rmd_path) } rmarkdown::render( From dc356f2f0f2f21016423f31ebb669e2eff0bdde0 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 15:05:43 -0400 Subject: [PATCH 031/126] Update template_days.Rmd --- reports/template_days.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reports/template_days.Rmd b/reports/template_days.Rmd index db0ccfa..5e0a2a3 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -404,7 +404,7 @@ if (nrow(clonal)>0) { clonaltoplot$vaf<-round(clonaltoplot$vaf,4) clonaltoplot$adjustedvaf<-round(clonaltoplot$adjustedvaf,4) } - write.table(clonaltoplot, file_path, sep="\t", quote = F, row.names = F) + write.table(clonal, file_path, sep="\t", quote = F, row.names = F) fig2<-vaf_plot(clonaltoplot, xlimits, xbreaks, xlabels, varcolors, yaccuracy=0.01, log=FALSE, cnadjusted = TRUE) subplot(fig1,fig2,nrows=2,shareX=TRUE, heights=c(0.2,0.8)) } From 388e4255475480323b2c3aeb6da90a0f4ccdcf26 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 15:08:17 -0400 Subject: [PATCH 032/126] Update template_days.Rmd --- reports/template_days.Rmd | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/reports/template_days.Rmd b/reports/template_days.Rmd index 5e0a2a3..e4df017 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -387,7 +387,7 @@ if (nrow(clonal)>0) { clonal$adjustedvaf <- clonal$vaf*clonal$ncn / (clonal$expected_alt_copies + (clonal$ncn - clonal$tcn)*clonal$vaf) clonal$adjustedvaf <- round(clonal$adjustedvaf,4) clonaltoplot <- clonal - + write.table(clonal, file_path, sep="\t", quote = F, row.names = F) if (length(unique(clonal$VarName))>1) { clonal_mean <- data.frame( @@ -404,7 +404,6 @@ if (nrow(clonal)>0) { clonaltoplot$vaf<-round(clonaltoplot$vaf,4) clonaltoplot$adjustedvaf<-round(clonaltoplot$adjustedvaf,4) } - write.table(clonal, file_path, sep="\t", quote = F, row.names = F) fig2<-vaf_plot(clonaltoplot, xlimits, xbreaks, xlabels, varcolors, yaccuracy=0.01, log=FALSE, cnadjusted = TRUE) subplot(fig1,fig2,nrows=2,shareX=TRUE, heights=c(0.2,0.8)) } From bb9c35cd6d48a793d0dafa328d8545d4ed78e2ae Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 15:16:11 -0400 Subject: [PATCH 033/126] Update template_days.Rmd --- reports/template_days.Rmd | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/reports/template_days.Rmd b/reports/template_days.Rmd index e4df017..be8dbf3 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -376,9 +376,6 @@ subplot(fig1,fig2,nrows=2,shareX=TRUE, heights=c(0.2,0.7), which_layout=1) ```{r adjustedvaf-linear, fig.height=4, eval=has_dmp} sample="{{PATIENT_ID}}" -filename = paste(sample,"_clonal.csv") -path = getwd() -file_path = file.path(path,filename) clonal <- subset(final, final$clonality=="CLONAL") if (nrow(clonal)>0) { @@ -387,7 +384,6 @@ if (nrow(clonal)>0) { clonal$adjustedvaf <- clonal$vaf*clonal$ncn / (clonal$expected_alt_copies + (clonal$ncn - clonal$tcn)*clonal$vaf) clonal$adjustedvaf <- round(clonal$adjustedvaf,4) clonaltoplot <- clonal - write.table(clonal, file_path, sep="\t", quote = F, row.names = F) if 
(length(unique(clonal$VarName))>1) { clonal_mean <- data.frame( @@ -408,6 +404,13 @@ if (nrow(clonal)>0) { subplot(fig1,fig2,nrows=2,shareX=TRUE, heights=c(0.2,0.8)) } ``` +```{r write_clonal, eval=has_dmp} +sample="{{PATIENT_ID}}" +filename = paste(sample,"_clonal.csv",sep='') +path = getwd() +file_path = file.path(path,filename) +write.table(clonal, file_path, sep="\t", quote = F, row.names = F) +``` ```{asis echo=has_dmp} ### Log From e592b1f85b289536fe34c91884b35a37a66d041e Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 15:17:48 -0400 Subject: [PATCH 034/126] Update template_days.Rmd --- reports/template_days.Rmd | 1 + 1 file changed, 1 insertion(+) diff --git a/reports/template_days.Rmd b/reports/template_days.Rmd index be8dbf3..20a03ed 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -409,6 +409,7 @@ sample="{{PATIENT_ID}}" filename = paste(sample,"_clonal.csv",sep='') path = getwd() file_path = file.path(path,filename) +cat(file_path) write.table(clonal, file_path, sep="\t", quote = F, row.names = F) ``` From bb6963f0860933e79d5b7980b64366fca51686cd Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 15:23:18 -0400 Subject: [PATCH 035/126] Update template_days.Rmd --- reports/template_days.Rmd | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/reports/template_days.Rmd b/reports/template_days.Rmd index 20a03ed..1415711 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -409,8 +409,7 @@ sample="{{PATIENT_ID}}" filename = paste(sample,"_clonal.csv",sep='') path = getwd() file_path = file.path(path,filename) -cat(file_path) -write.table(clonal, file_path, sep="\t", quote = F, row.names = F) +fwrite(clonal, file=file_path) ``` ```{asis echo=has_dmp} From 7126efa50e671d6dd75a6908eb3a01c2927f2498 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 15:25:21 -0400 Subject: [PATCH 036/126] Update template_days.Rmd --- reports/template_days.Rmd | 4 +--- 1 file 
changed, 1 insertion(+), 3 deletions(-) diff --git a/reports/template_days.Rmd b/reports/template_days.Rmd index 1415711..7615bf4 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -407,9 +407,7 @@ if (nrow(clonal)>0) { ```{r write_clonal, eval=has_dmp} sample="{{PATIENT_ID}}" filename = paste(sample,"_clonal.csv",sep='') -path = getwd() -file_path = file.path(path,filename) -fwrite(clonal, file=file_path) +fwrite(clonal, file=filename) ``` ```{asis echo=has_dmp} From 3e546e647ad3ee15e248fe8ab7ad76f15d07c377 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 15:28:17 -0400 Subject: [PATCH 037/126] Update template_days.Rmd --- reports/template_days.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reports/template_days.Rmd b/reports/template_days.Rmd index 7615bf4..cb949ab 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -407,7 +407,7 @@ if (nrow(clonal)>0) { ```{r write_clonal, eval=has_dmp} sample="{{PATIENT_ID}}" filename = paste(sample,"_clonal.csv",sep='') -fwrite(clonal, file=filename) +fwrite(clonal[,c("sample_id","collection_day","VarName","tcn","expected_alt_copies","ncn", "vaf","adjustedvaf")], file=filename) ``` ```{asis echo=has_dmp} From 314484a14fcaa63fffba23792a663127be15df4b Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 15:29:04 -0400 Subject: [PATCH 038/126] Update template_days.Rmd --- reports/template_days.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reports/template_days.Rmd b/reports/template_days.Rmd index cb949ab..6c4de16 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -407,7 +407,7 @@ if (nrow(clonal)>0) { ```{r write_clonal, eval=has_dmp} sample="{{PATIENT_ID}}" filename = paste(sample,"_clonal.csv",sep='') -fwrite(clonal[,c("sample_id","collection_day","VarName","tcn","expected_alt_copies","ncn", "vaf","adjustedvaf")], file=filename) 
+fwrite(clonal[,c("sample_id","collection_day","VarName","tcn","expected_alt_copies","ncn", "vaf","adjustedvaf")], file="test.csv") ``` ```{asis echo=has_dmp} From c66342ba4d742fc47ae5daad68aa93f01cb135f3 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 15:35:58 -0400 Subject: [PATCH 039/126] Update template_days.Rmd --- reports/template_days.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reports/template_days.Rmd b/reports/template_days.Rmd index 6c4de16..d17458e 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -167,7 +167,7 @@ cna_plot<-function(cna, xlimits, xbreaks){ } print_table<-function(table){ - datatable(table, rownames=FALSE, escape=FALSE, options=list(scrollX=T, autoWidth = TRUE)) + datatable(table, extensions = 'Buttons', rownames=FALSE, escape=FALSE, options=list(scrollX=T, autoWidth = TRUE, dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel'))) } ``` From 02db997e2d4f82514bc4a2130dcd503a36823411 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 15:37:07 -0400 Subject: [PATCH 040/126] Update template_days.Rmd --- reports/template_days.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reports/template_days.Rmd b/reports/template_days.Rmd index d17458e..90324ac 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -407,7 +407,7 @@ if (nrow(clonal)>0) { ```{r write_clonal, eval=has_dmp} sample="{{PATIENT_ID}}" filename = paste(sample,"_clonal.csv",sep='') -fwrite(clonal[,c("sample_id","collection_day","VarName","tcn","expected_alt_copies","ncn", "vaf","adjustedvaf")], file="test.csv") +fwrite(as.dataframe(clonal), file=filename) ``` ```{asis echo=has_dmp} From bf2660f658d0564da0fa67a877ece049abe9f0f5 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 15:38:43 -0400 Subject: [PATCH 041/126] Update template_days.Rmd --- reports/template_days.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/reports/template_days.Rmd b/reports/template_days.Rmd index 90324ac..7544ed6 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -407,7 +407,7 @@ if (nrow(clonal)>0) { ```{r write_clonal, eval=has_dmp} sample="{{PATIENT_ID}}" filename = paste(sample,"_clonal.csv",sep='') -fwrite(as.dataframe(clonal), file=filename) +fwrite(clonal, file=paste(getwd(),filename,sep='/')) ``` ```{asis echo=has_dmp} From 897c14f5f1059e864d372614859e111f1ba6bafc Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 15:54:09 -0400 Subject: [PATCH 042/126] Update template_days.Rmd --- reports/template_days.Rmd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/reports/template_days.Rmd b/reports/template_days.Rmd index 7544ed6..beca6da 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -167,7 +167,8 @@ cna_plot<-function(cna, xlimits, xbreaks){ } print_table<-function(table){ - datatable(table, extensions = 'Buttons', rownames=FALSE, escape=FALSE, options=list(scrollX=T, autoWidth = TRUE, dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel'))) + datatable(table, rownames=FALSE, escape=FALSE, extensions = c('Buttons','Responsive'), + options=list(scrollX=T, autoWidth = TRUE, dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel'))) } ``` From a8287a2203f59a454de7da6d83fe8d4c62768c04 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 16:03:12 -0400 Subject: [PATCH 043/126] Update template_days.Rmd --- reports/template_days.Rmd | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/reports/template_days.Rmd b/reports/template_days.Rmd index beca6da..b87475f 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -167,8 +167,7 @@ cna_plot<-function(cna, xlimits, xbreaks){ } print_table<-function(table){ - datatable(table, rownames=FALSE, escape=FALSE, extensions = c('Buttons','Responsive'), - options=list(scrollX=T, autoWidth = TRUE, dom = 'Bfrtip',buttons = c('copy', 'csv', 
'excel'))) + datatable(table, class='cell-border stripe compact', filter = 'top', rownames=FALSE, escape=FALSE, extensions = 'Buttons', options=list(scrollX=T, autoWidth = TRUE, dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel', 'pdf', 'print'))) } ``` From 488dea8063a7a3c46054f8590c9eb3df0725c450 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 16:31:47 -0400 Subject: [PATCH 044/126] Fix for file output --- reports/create_report.R | 13 ++++++++----- reports/template_days.Rmd | 8 +++++--- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/reports/create_report.R b/reports/create_report.R index c055654..b3e8419 100644 --- a/reports/create_report.R +++ b/reports/create_report.R @@ -3,6 +3,7 @@ library(knitr) library(rmarkdown) library(argparse) +library(quarto) parser <- ArgumentParser() @@ -55,8 +56,10 @@ if (args$keep_rmarkdown){ output_rmd_path <- paste(output_cwd,"/",rmd_name, sep='') file.copy(tmp,output_rmd_path) } -rmarkdown::render( - tmp, - output_format = "html_document", - output_dir = normalizePath(dirname(args$output)), - output_file=args$output) +#rmarkdown::render( +# tmp, +# output_format = "html_document", +# output_dir = normalizePath(dirname(args$output)), +# output_file=args$output) + +quarto::render(tmp,output_file=args$output) diff --git a/reports/template_days.Rmd b/reports/template_days.Rmd index b87475f..e308e51 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -405,9 +405,11 @@ if (nrow(clonal)>0) { } ``` ```{r write_clonal, eval=has_dmp} -sample="{{PATIENT_ID}}" -filename = paste(sample,"_clonal.csv",sep='') -fwrite(clonal, file=paste(getwd(),filename,sep='/')) +if(nrow(clonal)>0){ + sample="{{PATIENT_ID}}" + filename = paste(sample,"_clonal_adjvaf.csv",sep='') + fwrite(clonal, file=paste(getwd(),filename,sep='/')) +} ``` ```{asis echo=has_dmp} From 2abb5d4415b1eb861af0a71060c322890faffd8d Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 16:32:45 -0400 Subject: [PATCH 
045/126] Update create_report.R --- reports/create_report.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reports/create_report.R b/reports/create_report.R index b3e8419..d73254b 100644 --- a/reports/create_report.R +++ b/reports/create_report.R @@ -62,4 +62,4 @@ if (args$keep_rmarkdown){ # output_dir = normalizePath(dirname(args$output)), # output_file=args$output) -quarto::render(tmp,output_file=args$output) +quarto::quarto_render(tmp,output_file=args$output) From 4020cff71cc2a5acce2b33f5d84936b5e45d849a Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 16:43:34 -0400 Subject: [PATCH 046/126] Update create_report.R --- reports/create_report.R | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/reports/create_report.R b/reports/create_report.R index d73254b..e2b5764 100644 --- a/reports/create_report.R +++ b/reports/create_report.R @@ -56,10 +56,8 @@ if (args$keep_rmarkdown){ output_rmd_path <- paste(output_cwd,"/",rmd_name, sep='') file.copy(tmp,output_rmd_path) } -#rmarkdown::render( -# tmp, -# output_format = "html_document", -# output_dir = normalizePath(dirname(args$output)), -# output_file=args$output) - -quarto::quarto_render(tmp,output_file=args$output) +rmarkdown::render( + tmp, + output_format = "html_document", + output_dir = normalizePath(dirname(args$output)), + output_file=args$output) From 96dcd4439a6be1ad3a0a6a551a1df83e702fcc91 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 24 Sep 2022 16:44:07 -0400 Subject: [PATCH 047/126] Update create_report.R --- reports/create_report.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/reports/create_report.R b/reports/create_report.R index e2b5764..630daec 100644 --- a/reports/create_report.R +++ b/reports/create_report.R @@ -3,8 +3,6 @@ library(knitr) library(rmarkdown) library(argparse) -library(quarto) - parser <- ArgumentParser() From 103dbae2dd34bfe7d49e1e0528ab4199da4d4732 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Mon, 26 Sep 
2022 16:01:56 -0400 Subject: [PATCH 048/126] Update template_days.Rmd --- reports/template_days.Rmd | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/reports/template_days.Rmd b/reports/template_days.Rmd index e308e51..dc5b71d 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -1,4 +1,5 @@ --- +title: "{{PATIENT_ID}}" output: html_document: df_print: paged @@ -6,6 +7,7 @@ output: ```{r global_options, include=FALSE} knitr::opts_chunk$set(echo=FALSE, warning=FALSE, message=FALSE, fig.width=10, fig.height=6) +library(knitr) library(data.table) library(tidyr) library(dplyr) @@ -19,17 +21,19 @@ library(RColorBrewer) theme_set(theme_bw()) show_text <- TRUE +``` +```{r echo=FALSE, eval=show_text} +if ("{{DMP_ID}}" != "") { + asis_output("### DMP ID: {{DMP_ID}}} \n") +} +``` +```{r echo=FALSE} dmp_id <- "{{DMP_ID}}" has_dmp <- F if (dmp_id != "") { has_dmp = T } - -``` - -```{r echo=FALSE} - if (dmp_id != "") { page_title = "{{PATIENT_ID}} ({{DMP_ID}})" } else { @@ -37,10 +41,6 @@ if (dmp_id != "") { } ``` ---- -title: "`r page_title`" ---- - ```{r echo=FALSE, eval=show_text} if ("{{TUMOR_TYPE}}" != "") { asis_output("### {{TUMOR_TYPE}} \n") From fb9e7045ae4acf6bf5b388a540107a70e4cc9c78 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Mon, 26 Sep 2022 16:05:40 -0400 Subject: [PATCH 049/126] Update template_days.Rmd --- reports/template_days.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reports/template_days.Rmd b/reports/template_days.Rmd index dc5b71d..caa1264 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -25,7 +25,7 @@ show_text <- TRUE ```{r echo=FALSE, eval=show_text} if ("{{DMP_ID}}" != "") { - asis_output("### DMP ID: {{DMP_ID}}} \n") + asis_output("### DMP ID: {{DMP_ID}} \n") } ``` ```{r echo=FALSE} From c14667ef697a02705fbc38fb14806008b37356e9 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Mon, 26 Sep 2022 16:15:18 -0400 Subject: [PATCH 050/126] Update 
template_days.Rmd --- reports/template_days.Rmd | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/reports/template_days.Rmd b/reports/template_days.Rmd index caa1264..91bff1a 100644 --- a/reports/template_days.Rmd +++ b/reports/template_days.Rmd @@ -25,9 +25,16 @@ show_text <- TRUE ```{r echo=FALSE, eval=show_text} if ("{{DMP_ID}}" != "") { - asis_output("### DMP ID: {{DMP_ID}} \n") + asis_output("### DMP Patient ID: {{DMP_ID}} \n") } ``` + +```{r echo=FALSE, eval=show_text} +if ("{{DMP_SAMPLE_ID}}" != "") { + asis_output("### DMP Sample ID: {{DMP_SAMPLE_ID}} \n") +} +``` + ```{r echo=FALSE} dmp_id <- "{{DMP_ID}}" has_dmp <- F @@ -435,7 +442,8 @@ if (nrow(clonal) > 0) { ### Description We adjust the variant allele fractions to account for the copy number alterations of the segments they are in. \ -Since it is not easy to call copy number changes from ACCESS data, here we rely on the copy number alterations called by FACETS in the IMPACT sample `r toString(impact_sample_id)`.\ +Since it is not easy to call copy number changes from ACCESS data, here we rely on the copy number alterations called by FACETS in the IMPACT sample.\ + *Note: This assumes that there are no changes to copy numbers of these segments between the IMPACT and ACCESS samples.* \ From 9b1f1341097dcd10f11f3ff30fa7e665d2ceb8f2 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Fri, 30 Sep 2022 11:12:54 -0400 Subject: [PATCH 051/126] Update get_cbioportal_variants.py Updating to make sure we dont have OverflowError --- .../get_cbioportal_variants.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index 04a4ea9..c62a0e5 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -4,7 +4,7 @@ import typer import pandas as pd import csv - +import 
sys def main( maf: Path = typer.Option( @@ -93,12 +93,19 @@ def main( # preprocessing def get_row(file): + maxInt = sys.maxsize + while True: + # decrease the maxInt value by factor 10 + # as long as the OverflowError occurs. + try: + csv.field_size_limit(maxInt) + break + except OverflowError: + maxInt = int(maxInt/10) skipped = [] - with open(file, "r") as csvfile: - reader = csv.reader(csvfile, delimiter="\t") - for i, row in enumerate(reader): - if row[0].strip()[:2] == "#": - skipped.append(i) + with open(file, "r") as csv_file: + reader = csv.reader(csv_file, delimiter="\t") + skipped.extend(i for i, row in enumerate(reader) if row[0].strip()[:2] == "#") return skipped From 9c7c10f308d7b001d778518e35f3bd870948af4e Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Fri, 30 Sep 2022 11:20:39 -0400 Subject: [PATCH 052/126] Update get_cbioportal_variants.py --- .../get_cbioportal_variants.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index c62a0e5..ed7a2f7 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -25,7 +25,7 @@ def main( "-i", help="List of ids to search for in the 'Tumor_Sample_Barcode' column. Header of this file is 'sample_id'", ), - id: Optional[List[str]] = typer.Option( + sid: Optional[List[str]] = typer.Option( "", help="Identifiers to search for in the 'Tumor_Sample_Barcode' column. 
Can be given multiple times", ), @@ -60,21 +60,21 @@ def main( """ if not ids: typer.echo("Identifiers were not provided in a text file") - if not id: + if not sid: typer.echo("Identifiers were not provided via command line as well") raise typer.Abort() # Read maf files skip = get_row(maf) + print("SkipRows:", skip) maf_df = pd.read_csv(maf, sep="\t", skiprows=skip, low_memory=False) # Read Identifiers - if not id: - file = open(ids) - id = file.read().splitlines()[1:] - file.close() + if not sid: + with open(ids) as file: + sid = file.read().splitlines()[1:] # filter for ids - ns = set(id) - pattern = "|".join([r"\b{}\b".format(i) for i in ns]) + ns = set(sid) + pattern = "|".join([f"\b{i}\b" for i in ns]) result = maf_df[maf_df["Tumor_Sample_Barcode"].str.contains(pattern, regex=True)] results_covered = result.copy(deep=True) results_covered["Chromosome"] = results_covered["Chromosome"].apply(str) From c9245fb747fa0118c455463aba0cbb2e8164799a Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Fri, 30 Sep 2022 11:30:20 -0400 Subject: [PATCH 053/126] Update get_cbioportal_variants.py --- python/get_cbioportal_variants/get_cbioportal_variants.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index ed7a2f7..610523d 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -103,9 +103,11 @@ def get_row(file): except OverflowError: maxInt = int(maxInt/10) skipped = [] - with open(file, "r") as csv_file: - reader = csv.reader(csv_file, delimiter="\t") - skipped.extend(i for i, row in enumerate(reader) if row[0].strip()[:2] == "#") + with open(file, "r") as csvfile: + reader = csv.reader(csvfile, delimiter="\t") + for i, row in enumerate(reader): + if row[0].strip()[:2] == "#": + skipped.append(i) return skipped From 
fa4824873c1b3bc50cf617ceafa659947f8d529b Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Fri, 30 Sep 2022 11:46:49 -0400 Subject: [PATCH 054/126] Update get_cbioportal_variants.py --- .../get_cbioportal_variants.py | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index 610523d..c85d911 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -3,8 +3,6 @@ from bed_lookup import BedFile import typer import pandas as pd -import csv -import sys def main( maf: Path = typer.Option( @@ -93,21 +91,9 @@ def main( # preprocessing def get_row(file): - maxInt = sys.maxsize - while True: - # decrease the maxInt value by factor 10 - # as long as the OverflowError occurs. - try: - csv.field_size_limit(maxInt) - break - except OverflowError: - maxInt = int(maxInt/10) skipped = [] - with open(file, "r") as csvfile: - reader = csv.reader(csvfile, delimiter="\t") - for i, row in enumerate(reader): - if row[0].strip()[:2] == "#": - skipped.append(i) + with open(file, "r") as csv_file: + skipped.extend(i for i, line in enumerate(csv_file) if line.startswith("#")) return skipped From 7c468c665d24d4a7e584fe3e7d2572bb3a13634d Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Fri, 30 Sep 2022 11:49:46 -0400 Subject: [PATCH 055/126] Update get_cbioportal_variants.py --- .../get_cbioportal_variants/get_cbioportal_variants.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index c85d911..ebd92a4 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -64,15 +64,16 @@ def main( # Read maf files skip = get_row(maf) - print("SkipRows:", skip) 
+ typer.echo("Skipping Rows:", skip) maf_df = pd.read_csv(maf, sep="\t", skiprows=skip, low_memory=False) # Read Identifiers if not sid: - with open(ids) as file: - sid = file.read().splitlines()[1:] + file = open(ids) + sid = file.read().splitlines()[1:] + file.close() # filter for ids ns = set(sid) - pattern = "|".join([f"\b{i}\b" for i in ns]) + pattern = "|".join([r"\b{}\b".format(i) for i in ns]) result = maf_df[maf_df["Tumor_Sample_Barcode"].str.contains(pattern, regex=True)] results_covered = result.copy(deep=True) results_covered["Chromosome"] = results_covered["Chromosome"].apply(str) From a8de8e9eec007f96810a5550ed9d8b2eed605f8e Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Fri, 30 Sep 2022 11:51:00 -0400 Subject: [PATCH 056/126] Update get_cbioportal_variants.py --- python/get_cbioportal_variants/get_cbioportal_variants.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/get_cbioportal_variants/get_cbioportal_variants.py b/python/get_cbioportal_variants/get_cbioportal_variants.py index ebd92a4..3ed6ddc 100644 --- a/python/get_cbioportal_variants/get_cbioportal_variants.py +++ b/python/get_cbioportal_variants/get_cbioportal_variants.py @@ -64,7 +64,6 @@ def main( # Read maf files skip = get_row(maf) - typer.echo("Skipping Rows:", skip) maf_df = pd.read_csv(maf, sep="\t", skiprows=skip, low_memory=False) # Read Identifiers if not sid: From 5a1a6a879d5642a14f5515f99a51ad40db477b75 Mon Sep 17 00:00:00 2001 From: carmelinacharalambous Date: Mon, 31 Oct 2022 09:48:19 -0400 Subject: [PATCH 057/126] adding requirements --- python/convert_csv_to_maf/README.md | 8 +++----- python/convert_csv_to_maf/requirements.txt | 4 ++++ 2 files changed, 7 insertions(+), 5 deletions(-) create mode 100644 python/convert_csv_to_maf/requirements.txt diff --git a/python/convert_csv_to_maf/README.md b/python/convert_csv_to_maf/README.md index be325d0..03c0767 100644 --- a/python/convert_csv_to_maf/README.md +++ b/python/convert_csv_to_maf/README.md @@ -8,12 +8,10 @@ Tool 
does the following operations: * Massaging the data frame to make it compatible with MAF format * Write the data frame to a file in MAF format and Excel format -## Requirements +## Installation -* pandas -* openpyxl -* typing -* typer +Dependencies may be installed from the requirements.txt file using ```pip install -r requirements.txt```. +This should contain all the Python packages required to run csv_to_maf.py and convert CSV files to MAF. ## Example command diff --git a/python/convert_csv_to_maf/requirements.txt b/python/convert_csv_to_maf/requirements.txt new file mode 100644 index 0000000..849b3a2 --- /dev/null +++ b/python/convert_csv_to_maf/requirements.txt @@ -0,0 +1,4 @@ +typer==0.3.2 +openpyxl==3.0.9 +typing_extensions==3.10.0.0 +pandas==1.2.5 From e949a2aa399488a4d8bdf3250b501f15a7f2bfd5 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 25 Jan 2023 15:50:53 -0500 Subject: [PATCH 058/126] Update README.md --- python/get_cbioportal_variants/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/get_cbioportal_variants/README.md b/python/get_cbioportal_variants/README.md index 0aa5a83..78a7252 100644 --- a/python/get_cbioportal_variants/README.md +++ b/python/get_cbioportal_variants/README.md @@ -13,7 +13,7 @@ Tool to do the following operations: ### Example command ```bash -python get_cbioportal_variants.py --id "Test1" --id "Test2" --id "Test3" +python get_cbioportal_variants.py --sid "Test1" --sid "Test2" --sid "Test3" ``` ```bash @@ -43,7 +43,7 @@ Options: 'Tumor_Sample_Barcode' column. Header of this file is 'sample_id' [default: ] - --id TEXT Identifiers to search for in the + --sid TEXT Identifiers to search for in the 'Tumor_Sample_Barcode' column. Can be given multiple times [default: ] @@ -58,4 +58,4 @@ Options: customize the installation. --help Show this message and exit. 
-``` \ No newline at end of file +``` From 56a0db173a44b944056cfab44283fa61060469e9 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:06:11 -0500 Subject: [PATCH 059/126] adding access samples to genotype --- .gitignore | 1 + R/compile_reads.R | 745 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 581 insertions(+), 165 deletions(-) diff --git a/.gitignore b/.gitignore index d22071f..4b402ae 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ # RStudio files .Rproj.user/ *.Rproj +/R_scripts # produced vignettes vignettes/*.html diff --git a/R/compile_reads.R b/R/compile_reads.R index f30735c..a3d5b92 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -5,12 +5,17 @@ #' @export -compile_reads <- function( - master.ref, results.dir, project.ID, pooled.bam.dir = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/", +compile_reads <- function(master.ref, + results.dir, + project.ID, + pooled.bam.dir = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/", fasta.path = "/juno/work/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta", genotyper.path = "/work/access/production/resources/tools/GetBaseCountsMultiSample/current/GetBaseCountsMultiSample", - dmp.dir = "/juno/work/access/production/resources/cbioportal/current/msk_solid_heme", mirror.bam.dir = "/juno/res/dmpcollab/dmpshare/share/irb12_245", - dmp.key.path = "/juno/res/dmpcollab/dmprequest/12-245/key.txt") { + dmp.dir = "/juno/work/access/production/resources/cbioportal/current/msk_solid_heme", + mirror.bam.dir = "/juno/res/dmpcollab/dmpshare/share/irb12_245", + mirror.access.bam.dir = "/juno/res/dmpcollab/dmpshare/share/access_12_245/", + dmp.key.path = "/juno/res/dmpcollab/dmprequest/12-245/key.txt", + access.key.path = "/juno/res/dmpcollab/dmprequest/ACCESS-12-245/key.txt") { # # test input section 
----------------------------------------------------------- # master.ref = fread('/juno/work/bergerm1/bergerlab/zhengy1/access_data_analysis/data/example_master_file.csv') # results.dir = paste0('/juno/work/bergerm1/MSK-ACCESS/ACCESS-Projects/test_access/access_data_analysis/output_',format(Sys.time(),'%m%d%y')) @@ -28,174 +33,550 @@ compile_reads <- function( geno.bash <- system("which genotype_variants", intern = T) if (length(geno.bash) == 0) { # print(pyclone.path) - stop("needs to run \nsource /home/accessbot/miniconda3/bin/activate && conda activate genotype-variants-0.3.0") + stop( + "needs to run \nsource /home/accessbot/miniconda3/bin/activate && conda activate genotype-variants-0.3.0" + ) } # data from DMP ----------------------------------------------------------- DMP.key <- fread(dmp.key.path) - if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.|-T..-XS", "", DMP.key[grepl("IH|IM|XS", V1)]$V1))) { + if (any( + !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.|-T..-XS", "", DMP.key[grepl("IH|IM|XS", V1)]$V1) + )) { stop(paste0( "These DMP IDs are not found in DMP key file: ", - paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% - gsub("-T..-IH.|-T..-IM.|-T..-XS", "", DMP.key[grepl("IH|IM|XS", V1)]$V1))], collapse = " ,") + paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which( + !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% + gsub("-T..-IH.|-T..-IM.|-T..-XS", "", DMP.key[grepl("IH|IM|XS", V1)]$V1) + )], collapse = " ,") )) } - DMP.maf <- fread(paste0(dmp.dir, "/data_mutations_extended.txt")) %>% + acccess.key <- fread(access.key.path) + if (any( + !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.|-T..-XS", "", access.key[grepl("IH|IM|XS", V1)]$V1) + )) { + stop(paste0( + "These DMP IDs are not found in DMP key file: ", + 
paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which( + !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% + gsub("-T..-IH.|-T..-IM.|-T..-XS", "", access.key[grepl("IH|IM|XS", V1)]$V1) + )], collapse = " ,") + )) + } + DMP.maf <- + fread(paste0(dmp.dir, "/data_mutations_extended.txt")) %>% filter(Mutation_Status != "GERMLINE") %>% data.table() - DMP.RET.maf <- DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] + DMP.RET.maf <- + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] # Pooled normal samples --------------------------------------------------- - pooled.bams <- list.files(pooled.bam.dir, pattern = ".bam", full.names = T) + pooled.bams <- + list.files(pooled.bam.dir, pattern = ".bam", full.names = T) # For each patient -------------------------------------------------------- x <- unique(master.ref$cmo_patient_id)[1] # x = unique(master.ref$cmo_sample_id_plasma)[16] # x = 'C-YW82CY' print("Compiling reads per patient") - all.fillout.id <- lapply(unique(master.ref$cmo_patient_id), function(x) { - print(x) - dir.create(paste0(results.dir, "/", x)) - dmp_id <- unique(master.ref[cmo_patient_id == x]$dmp_patient_id) - # sample sheet with colummns -- TSB, sample type, bam path, treatm -------- - # need to get DMP tumor, DMP normal, plasma, plasma normal (if there is any), pooled normal - # DMP sample sheet - if (is.na(dmp_id) | dmp_id == '') { - dmp.sample.sheet <- NULL - } else { - all.dmp.ids.IM <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V1 - all.dmp.ids.IH <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V1 - all.dmp.ids.XS <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V1 - all.dmp.ids <- c(all.dmp.ids.IM,all.dmp.ids.IH,all.dmp.ids.XS) - all.dmp.bam.ids.IM <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V2 - all.dmp.bam.ids.IH <- 
DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V2 - all.dmp.bam.ids.XS <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V2 - all.dmp.bam.ids <- c(all.dmp.bam.ids.IM,all.dmp.bam.ids.IH,all.dmp.bam.ids.XS) - bam.sub.dir <- unlist(lapply(strsplit(substr(all.dmp.bam.ids, 1, 2), ""), function(x) { - paste0(x, collapse = "/") - })) - dmp.sample.sheet <- data.frame( - Sample_Barcode = all.dmp.ids, - standard_bam = paste0(mirror.bam.dir, "/", bam.sub.dir, "/", all.dmp.bam.ids, ".bam") - ) %>% - mutate(cmo_patient_id = x, Sample_Type = ifelse(grepl("-T", Sample_Barcode), "DMP_Tumor", "DMP_Normal"), dmp_patient_id = dmp_id) - } - # total sample sheet - sample.sheet <- master.ref[ - cmo_patient_id == x, - # plasma bams -- duplex and simplex bam - .( - Sample_Barcode = as.character(cmo_sample_id_plasma), duplex_bam = bam_path_plasma_duplex, - simplex_bam = bam_path_plasma_simplex, cmo_patient_id, Sample_Type = "duplex", dmp_patient_id - ) - ] %>% - merge(rbind( - unique(master.ref[ - cmo_patient_id == x&paired=='Paired', - # buffy coat + DMP bams -- standard bam only - .( - Sample_Barcode = as.character(cmo_sample_id_normal), standard_bam = bam_path_normal, - cmo_patient_id, Sample_Type = "unfilterednormal", dmp_patient_id - ) - ]), - dmp.sample.sheet - ), all = T) - # catch '' or NA for empty cells for some cmo_sample_id_normal - sample.sheet <- sample.sheet[!is.na(Sample_Barcode) | Sample_Barcode != ""] - write.table(sample.sheet, paste0(results.dir, "/", x, "/", x, "_sample_sheet.tsv"), sep = "\t", quote = F, row.names = F) - # piece together all unique calls ----------------------------------------- - # get duplex calls - duplex.calls <- do.call(rbind, lapply(master.ref[cmo_patient_id == x]$maf_path, function(x) { - # fread(x) %>% filter(as.numeric(D_t_alt_count_fragment) > 0) %>% data.table() - selectcolumns <- 
c("Hugo_Symbol","Entrez_Gene_Id","Center","NCBI_Build","Chromosome","Start_Position","End_Position","Strand","Variant_Classification","Variant_Type","Reference_Allele","Tumor_Seq_Allele1","Tumor_Seq_Allele2","dbSNP_RS","dbSNP_Val_Status","Tumor_Sample_Barcode","caller_Norm_Sample_Barcode","Match_Norm_Seq_Allele1","Match_Norm_Seq_Allele2","Tumor_Validation_Allele1","Tumor_Validation_Allele2","Match_Norm_Validation_Allele1","Match_Norm_Validation_Allele2","Verification_Status","Validation_Status","Mutation_Status","Sequencing_Phase","Sequence_Source","Validation_Method","Score","BAM_File","Sequencer","Tumor_Sample_UUID","Matched_Norm_Sample_UUID","HGVSc","HGVSp","HGVSp_Short","Transcript_ID","Exon_Number","caller_t_depth","caller_t_ref_count","caller_t_alt_count","caller_n_depth","caller_n_ref_count","caller_n_alt_count","all_effects","Allele","Gene","Feature","Feature_type","Consequence","cDNA_position","CDS_position","Protein_position","Amino_acids","Codons","Existing_variation","ALLELE_NUM","DISTANCE","STRAND_VEP","SYMBOL","SYMBOL_SOURCE","HGNC_ID","BIOTYPE","CANONICAL","CCDS","ENSP","SWISSPROT","TREMBL","UNIPARC","RefSeq","SIFT","PolyPhen","EXON","INTRON","DOMAINS","AF","AFR_AF","AMR_AF","ASN_AF","EAS_AF","EUR_AF","SAS_AF","AA_AF","EA_AF","CLIN_SIG","SOMATIC","PUBMED","MOTIF_NAME","MOTIF_POS","HIGH_INF_POS","MOTIF_SCORE_CHANGE","IMPACT","PICK","VARIANT_CLASS","TSL","HGVS_OFFSET","PHENO","MINIMISED","ExAC_AF","ExAC_AF_AFR","ExAC_AF_AMR","ExAC_AF_EAS","ExAC_AF_FIN","ExAC_AF_NFE","ExAC_AF_OTH","ExAC_AF_SAS","GENE_PHENO","FILTER","flanking_bps","variant_id","variant_qual","ExAC_AF_Adj","ExAC_AC_AN_Adj","ExAC_AC_AN","ExAC_AC_AN_AFR","ExAC_AC_AN_AMR","ExAC_AC_AN_EAS","ExAC_AC_AN_FIN","ExAC_AC_AN_NFE","ExAC_AC_AN_OTH","ExAC_AC_AN_SAS","ExAC_FILTER","gnomAD_AF","gnomAD_AFR_AF","gnomAD_AMR_AF","gnomAD_ASJ_AF","gnomAD_EAS_AF","gnomAD_FIN_AF","gnomAD_NFE_AF","gnomAD_OTH_AF","gnomAD_SAS_AF","CallMethod","VCF_POS","VCF_REF","VCF_ALT","hotspot_whitelist","Status","D_t_alt_count
_fragment","D_t_ref_count_fragment","D_t_vaf_fragment","SD_t_alt_count_fragment","SD_t_ref_count_fragment","SD_t_vaf_fragment","Matched_Norm_Sample_Barcode","Matched_Norm_Bamfile","n_alt_count_fragment","n_ref_count_fragment","n_vaf_fragment") - if("Status" %in% names(fread(x))){ - fread(x) %>% select(one_of(selectcolumns)) %>% subset((Status == "") | (is.na(Status))) + all.fillout.id <- + lapply(unique(master.ref$cmo_patient_id), function(x) { + print(x) + dir.create(paste0(results.dir, "/", x)) + dmp_id <- + unique(master.ref[cmo_patient_id == x]$dmp_patient_id) + # sample sheet with colummns -- TSB, sample type, bam path, treatm -------- + # need to get DMP tumor, DMP normal, plasma, plasma normal (if there is any), pooled normal + # DMP sample sheet + if (is.na(dmp_id) | dmp_id == '') { + dmp.sample.sheet <- NULL } else { - fread(x) %>% select(one_of(selectcolumns)) - } -# fread(x) - # %>% - # filter(as.numeric(t_alt_count) > 0) %>% - # data.table() - })) - # get impact calls - impact.calls <- DMP.RET.maf[Tumor_Sample_Barcode %in% sample.sheet$Sample_Barcode] - write.table(impact.calls[, .(Hugo_Symbol, Chromosome, Start_Position, End_Position, Variant_Classification, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2)], - paste0(results.dir, "/", x, "/", x, "_impact_calls.maf"), - sep = "\t", quote = F, row.names = F - ) - # combining plasma and impact calls - all.calls <- rbind( - duplex.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F], - impact.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F] - ) - # getting rid of duplicate calls and take the first occurence of all events - all.calls <- all.calls[which(!duplicated(all.calls[, .(Hugo_Symbol, Chromosome, Start_Position, End_Position, Variant_Classification, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2)])), ] %>% - mutate(t_ref_count=0, t_alt_count=0, n_ref_count=0, n_alt_count=0, Matched_Norm_Sample_Barcode=NA ) %>% - filter(Variant_Classification != 
"Silent" & !grepl("RP11-", Hugo_Symbol) & !grepl("Intron", Variant_Classification)) - write.table(all.calls, paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), sep = "\t", quote = F, row.names = F) - # tagging hotspots - system(paste0( - 'bsub -R "rusage[mem=4]" -cwd ', results.dir, "/", x, "/ -oo hotspot.o -eo hotspot.e -W 00:59 ", - " -P ", project.ID, " -J ", x, "_tag_hotspot ", - " python /work/access/production/workflows/access_workflows/v1/pipeline_2.0.0/ACCESS-Pipeline/cwl_tools/hotspots/tag_hotspots.py ", - " -m ", results.dir, "/", x, "/", x, "_all_unique_calls.maf", - " -itxt /work/access/production/resources/msk-access/current/regions_of_interest/current/hotspot-list-union-v1-v2_with_TERT.txt ", - " -o ", results.dir, "/", x, "/", x, "_all_unique_calls_hotspots.maf", - " -outdir ", results.dir, "/", x, "/", x - )) - # genotype all bams in this patient directory ----------------------------- - # genotyping plasma samples -- plasma duplex&simplex, plasma normal, pooled plasma normal - write.table(sample.sheet[, .( - sample_id = Sample_Barcode, maf = paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), - standard_bam, duplex_bam, simplex_bam - )], - paste0(results.dir, "/", x, "/", x, "_genotype_metadata.tsv"), - sep = "\t", quote = F, row.names = F - ) - job.ids <- system(paste0( - "bsub -cwd ", results.dir, "/", x, ' -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', - " -P ", project.ID, " -J ", x, "_genotype_variants ", - " genotype_variants small_variants multiple-samples -i ", results.dir, "/", x, "/", x, "_genotype_metadata.tsv", - " -r ", fasta.path, " -g ", genotyper.path, " -v DEBUG " - ), intern = T) - job.ids <- as.numeric(gsub("Job <|> is.*.$", "", job.ids)) - }) + all.dmp.ids.IM <- + DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V1 + all.dmp.ids.IH <- + DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V1 + all.dmp.ids.XS <- + acccess.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V1 + all.dmp.ids <- 
c(all.dmp.ids.IM, all.dmp.ids.IH) + all.dmp.bam.ids.IM <- + DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V2 + all.dmp.bam.ids.IH <- + DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V2 + all.dmp.bam.ids.XS <- + gsub("-standard|-unfilter|-simplex|-duplex", + "", + access.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V2) + all.dmp.bam.ids <- + c(all.dmp.bam.ids.IM, + all.dmp.bam.ids.IH) + bam.sub.dir <- + unlist(lapply(strsplit(substr( + all.dmp.bam.ids, 1, 2 + ), ""), function(x) { + paste0(x, collapse = "/") + })) + dmp.sample.sheet <- data.frame( + Sample_Barcode = all.dmp.ids, + standard_bam = paste0( + mirror.bam.dir, + "/", + bam.sub.dir, + "/", + all.dmp.bam.ids, + ".bam" + ) + access.bam.sub.dir <- + unlist(lapply(strsplit( + substr(all.dmp.bam.ids.XS, 1, 2), "" + ), function(x) { + paste0(x, collapse = "/") + })) + access.sample.sheet <- unique( + data.frame( + Sample_Barcode = all.dmp.ids.XS, + duplex_bam = paste0( + mirror.access.bam.dir, + "/", + access / bam.sub.dir, + "/", + all.dmp.bam.ids.XS, + "-duplex.bam", + simplex_bam = paste0( + mirror.access.bam.dir, + "/", + access / bam.sub.dir, + "/", + all.dmp.bam.ids.XS, + "-simplex.bam" + ) + ) + dmp.sample.sheet <- + bind_row(dmp.sample.sheet, access.sample.sheet) + ) %>% + mutate( + cmo_patient_id = x, + Sample_Type = ifelse(grepl("-T", Sample_Barcode), "DMP_Tumor", "DMP_Normal"), + dmp_patient_id = dmp_id + ) + } + # total sample sheet + sample.sheet <- master.ref[cmo_patient_id == x, + # plasma bams -- duplex and simplex bam + .( + Sample_Barcode = as.character(cmo_sample_id_plasma), + duplex_bam = bam_path_plasma_duplex, + simplex_bam = bam_path_plasma_simplex, + cmo_patient_id, + Sample_Type = "duplex", + dmp_patient_id + )] %>% + merge(rbind(unique(master.ref[cmo_patient_id == x & + paired == 'Paired', + # buffy coat + DMP bams -- standard bam only + .( + Sample_Barcode = as.character(cmo_sample_id_normal), + standard_bam = bam_path_normal, + cmo_patient_id, + Sample_Type = 
"unfilterednormal", + dmp_patient_id + )]), + dmp.sample.sheet), all = T) + # catch '' or NA for empty cells for some cmo_sample_id_normal + sample.sheet <- + sample.sheet[!is.na(Sample_Barcode) | Sample_Barcode != ""] + write.table( + sample.sheet, + paste0(results.dir, "/", x, "/", x, "_sample_sheet.tsv"), + sep = "\t", + quote = F, + row.names = F + ) + # piece together all unique calls ----------------------------------------- + # get duplex calls + duplex.calls <- + do.call(rbind, lapply(master.ref[cmo_patient_id == x]$maf_path, function(x) { + # fread(x) %>% filter(as.numeric(D_t_alt_count_fragment) > 0) %>% data.table() + selectcolumns <- + c( + "Hugo_Symbol", + "Entrez_Gene_Id", + "Center", + "NCBI_Build", + "Chromosome", + "Start_Position", + "End_Position", + "Strand", + "Variant_Classification", + "Variant_Type", + "Reference_Allele", + "Tumor_Seq_Allele1", + "Tumor_Seq_Allele2", + "dbSNP_RS", + "dbSNP_Val_Status", + "Tumor_Sample_Barcode", + "caller_Norm_Sample_Barcode", + "Match_Norm_Seq_Allele1", + "Match_Norm_Seq_Allele2", + "Tumor_Validation_Allele1", + "Tumor_Validation_Allele2", + "Match_Norm_Validation_Allele1", + "Match_Norm_Validation_Allele2", + "Verification_Status", + "Validation_Status", + "Mutation_Status", + "Sequencing_Phase", + "Sequence_Source", + "Validation_Method", + "Score", + "BAM_File", + "Sequencer", + "Tumor_Sample_UUID", + "Matched_Norm_Sample_UUID", + "HGVSc", + "HGVSp", + "HGVSp_Short", + "Transcript_ID", + "Exon_Number", + "caller_t_depth", + "caller_t_ref_count", + "caller_t_alt_count", + "caller_n_depth", + "caller_n_ref_count", + "caller_n_alt_count", + "all_effects", + "Allele", + "Gene", + "Feature", + "Feature_type", + "Consequence", + "cDNA_position", + "CDS_position", + "Protein_position", + "Amino_acids", + "Codons", + "Existing_variation", + "ALLELE_NUM", + "DISTANCE", + "STRAND_VEP", + "SYMBOL", + "SYMBOL_SOURCE", + "HGNC_ID", + "BIOTYPE", + "CANONICAL", + "CCDS", + "ENSP", + "SWISSPROT", + "TREMBL", + "UNIPARC", 
+ "RefSeq", + "SIFT", + "PolyPhen", + "EXON", + "INTRON", + "DOMAINS", + "AF", + "AFR_AF", + "AMR_AF", + "ASN_AF", + "EAS_AF", + "EUR_AF", + "SAS_AF", + "AA_AF", + "EA_AF", + "CLIN_SIG", + "SOMATIC", + "PUBMED", + "MOTIF_NAME", + "MOTIF_POS", + "HIGH_INF_POS", + "MOTIF_SCORE_CHANGE", + "IMPACT", + "PICK", + "VARIANT_CLASS", + "TSL", + "HGVS_OFFSET", + "PHENO", + "MINIMISED", + "ExAC_AF", + "ExAC_AF_AFR", + "ExAC_AF_AMR", + "ExAC_AF_EAS", + "ExAC_AF_FIN", + "ExAC_AF_NFE", + "ExAC_AF_OTH", + "ExAC_AF_SAS", + "GENE_PHENO", + "FILTER", + "flanking_bps", + "variant_id", + "variant_qual", + "ExAC_AF_Adj", + "ExAC_AC_AN_Adj", + "ExAC_AC_AN", + "ExAC_AC_AN_AFR", + "ExAC_AC_AN_AMR", + "ExAC_AC_AN_EAS", + "ExAC_AC_AN_FIN", + "ExAC_AC_AN_NFE", + "ExAC_AC_AN_OTH", + "ExAC_AC_AN_SAS", + "ExAC_FILTER", + "gnomAD_AF", + "gnomAD_AFR_AF", + "gnomAD_AMR_AF", + "gnomAD_ASJ_AF", + "gnomAD_EAS_AF", + "gnomAD_FIN_AF", + "gnomAD_NFE_AF", + "gnomAD_OTH_AF", + "gnomAD_SAS_AF", + "CallMethod", + "VCF_POS", + "VCF_REF", + "VCF_ALT", + "hotspot_whitelist", + "Status", + "D_t_alt_count_fragment", + "D_t_ref_count_fragment", + "D_t_vaf_fragment", + "SD_t_alt_count_fragment", + "SD_t_ref_count_fragment", + "SD_t_vaf_fragment", + "Matched_Norm_Sample_Barcode", + "Matched_Norm_Bamfile", + "n_alt_count_fragment", + "n_ref_count_fragment", + "n_vaf_fragment" + ) + if ("Status" %in% names(fread(x))) { + fread(x) %>% select(one_of(selectcolumns)) %>% subset((Status == "") | + (is.na(Status))) + } else { + fread(x) %>% select(one_of(selectcolumns)) + } + # fread(x) + # %>% + # filter(as.numeric(t_alt_count) > 0) %>% + # data.table() + })) + # get impact calls + impact.calls <- + DMP.RET.maf[Tumor_Sample_Barcode %in% sample.sheet$Sample_Barcode] + write.table( + impact.calls[, .( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2 + )], + paste0(results.dir, "/", x, "/", x, "_impact_calls.maf"), + sep = "\t", + 
quote = F, + row.names = F + ) + # combining plasma and impact calls + all.calls <- + rbind(duplex.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F], + impact.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F]) + # getting rid of duplicate calls and take the first occurence of all events + all.calls <- + all.calls[which(!duplicated(all.calls[, .( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2 + )])), ] %>% + mutate( + t_ref_count = 0, + t_alt_count = 0, + n_ref_count = 0, + n_alt_count = 0, + Matched_Norm_Sample_Barcode = NA + ) %>% + filter( + Variant_Classification != "Silent" & + !grepl("RP11-", Hugo_Symbol) & + !grepl("Intron", Variant_Classification) + ) + write.table( + all.calls, + paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), + sep = "\t", + quote = F, + row.names = F + ) + # tagging hotspots + system( + paste0( + 'bsub -R "rusage[mem=4]" -cwd ', + results.dir, + "/", + x, + "/ -oo hotspot.o -eo hotspot.e -W 00:59 ", + " -P ", + project.ID, + " -J ", + x, + "_tag_hotspot ", + " python /work/access/production/workflows/access_workflows/v1/pipeline_2.0.0/ACCESS-Pipeline/cwl_tools/hotspots/tag_hotspots.py ", + " -m ", + results.dir, + "/", + x, + "/", + x, + "_all_unique_calls.maf", + " -itxt /work/access/production/resources/msk-access/current/regions_of_interest/current/hotspot-list-union-v1-v2_with_TERT.txt ", + " -o ", + results.dir, + "/", + x, + "/", + x, + "_all_unique_calls_hotspots.maf", + " -outdir ", + results.dir, + "/", + x, + "/", + x + ) + ) + # genotype all bams in this patient directory ----------------------------- + # genotyping plasma samples -- plasma duplex&simplex, plasma normal, pooled plasma normal + write.table( + sample.sheet[, .( + sample_id = Sample_Barcode, + maf = paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), + standard_bam, + duplex_bam, + simplex_bam 
+ )], + paste0(results.dir, "/", x, "/", x, "_genotype_metadata.tsv"), + sep = "\t", + quote = F, + row.names = F + ) + job.ids <- system( + paste0( + "bsub -cwd ", + results.dir, + "/", + x, + ' -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', + " -P ", + project.ID, + " -J ", + x, + "_genotype_variants ", + " genotype_variants small_variants multiple-samples -i ", + results.dir, + "/", + x, + "/", + x, + "_genotype_metadata.tsv", + " -r ", + fasta.path, + " -g ", + genotyper.path, + " -v DEBUG " + ), + intern = T + ) + job.ids <- as.numeric(gsub("Job <|> is.*.$", "", job.ids)) + }) - # Get base count multi sample in pooled normal ---------------------------- - # all all unique calls in entire cohort - print("Compiling reads in pooled samples") - dir.create(paste0(results.dir, "/pooled")) - all.all.unique.mafs <- do.call(rbind, lapply(unique(master.ref$cmo_patient_id), function(x) { - fread(list.files(paste0(results.dir, "/", x), pattern = "unique_calls.maf$", full.names = T)) - })) - all.all.unique.mafs <- all.all.unique.mafs[!duplicated(all.all.unique.mafs[, .(Hugo_Symbol, Chromosome, Start_Position, End_Position, Variant_Classification, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2)]),] - write.table(all.all.unique.mafs, paste0(results.dir, "/pooled/all_all_unique.maf"), sep = "\t", quote = F, row.names = F) + # Get base count multi sample in pooled normal ---------------------------- + # all all unique calls in entire cohort + print("Compiling reads in pooled samples") + dir.create(paste0(results.dir, "/pooled")) + all.all.unique.mafs <- + do.call(rbind, lapply(unique(master.ref$cmo_patient_id), function(x) { + fread(list.files( + paste0(results.dir, "/", x), + pattern = "unique_calls.maf$", + full.names = T + )) + })) + all.all.unique.mafs <- + all.all.unique.mafs[!duplicated(all.all.unique.mafs[, .( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + 
Tumor_Seq_Allele2 + )]),] + write.table( + all.all.unique.mafs, + paste0(results.dir, "/pooled/all_all_unique.maf"), + sep = "\t", + quote = F, + row.names = F + ) - write.table(data.frame( - sample_id = gsub("^.*./|.bam", "", pooled.bams), maf = paste0(results.dir, "/pooled/all_all_unique.maf"), - standard_bam = pooled.bams, duplex_bam = "", simplex_bam = "" - ), - paste0(results.dir, "/pooled/pooled_metadata.tsv"), - sep = "\t", quote = F, row.names = F - ) + write.table( + data.frame( + sample_id = gsub("^.*./|.bam", "", pooled.bams), + maf = paste0(results.dir, "/pooled/all_all_unique.maf"), + standard_bam = pooled.bams, + duplex_bam = "", + simplex_bam = "" + ), + paste0(results.dir, "/pooled/pooled_metadata.tsv"), + sep = "\t", + quote = F, + row.names = F + ) - pooled.sample.job.id <- system(paste0( - "bsub -cwd ", results.dir, '/pooled -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', - " -w ", ' \"', paste0(paste0("done(", unlist(all.fillout.id), ")"), collapse = "&&"), '\" ', - " -P ", project.ID, " -J pooled_genotype_variants ", - " genotype_variants small_variants multiple-samples -i ", results.dir, "/pooled/pooled_metadata.tsv", - " -r ", fasta.path, " -g ", genotyper.path, " -v DEBUG " - ), intern = T) - pooled.sample.job.id <- as.numeric(gsub("Job <|> is.*.$", "", pooled.sample.job.id)) - while ( - !any(grepl("Done successfully", system(paste0("bjobs -l ", pooled.sample.job.id), intern = T))) - ) { - Sys.sleep(120) - } - print("Compile reads done!") -} + pooled.sample.job.id <- system( + paste0( + "bsub -cwd ", + results.dir, + '/pooled -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', + " -w ", + ' \"', + paste0(paste0( + "done(", unlist(all.fillout.id), ")" + ), collapse = "&&"), + '\" ', + " -P ", + project.ID, + " -J pooled_genotype_variants ", + " genotype_variants small_variants multiple-samples -i ", + results.dir, + "/pooled/pooled_metadata.tsv", + " -r ", + fasta.path, + " -g ", + genotyper.path, + " -v DEBUG " 
+ ), + intern = T + ) + pooled.sample.job.id <- + as.numeric(gsub( + "Job <|> is.*.$", "", pooled.sample.job.id + )) + while (!any(grepl( + "Done successfully", system(paste0("bjobs -l ", pooled.sample.job.id), intern = T) + ))) { + Sys.sleep(120) + } + print("Compile reads done!") + } # Executable ----------------------------------------------------------------------------------------------------------- # Minimal columns for input mafs @@ -214,32 +595,53 @@ if (!interactive()) { parser <- ArgumentParser() parser$add_argument("-m", "--masterref", type = "character", help = "File path to master reference file") parser$add_argument("-o", "--resultsdir", type = "character", help = "Output directory") - parser$add_argument("-pid", "--projectid", - type = "character", default = "", + parser$add_argument( + "-pid", + "--projectid", + type = "character", + default = "", help = "Project ID for submitted jobs involved in this run" ) - parser$add_argument("-pb", "--pooledbamdir", - type = "character", default = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/", + parser$add_argument( + "-pb", + "--pooledbamdir", + type = "character", + default = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/", help = "Directory for all pooled bams [default]" ) - parser$add_argument("-fa", "--fastapath", - type = "character", default = "/juno/work/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta", + parser$add_argument( + "-fa", + "--fastapath", + type = "character", + default = "/juno/work/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta", help = "Reference fasta path [default]" ) - parser$add_argument("-gt", "--genotyperpath", - type = "character", default = "/work/access/production/resources/tools/GetBaseCountsMultiSample/current/GetBaseCountsMultiSample", + parser$add_argument( + "-gt", + "--genotyperpath", + type = "character", + 
default = "/work/access/production/resources/tools/GetBaseCountsMultiSample/current/GetBaseCountsMultiSample", help = "Genotyper executable path [default]" ) - parser$add_argument("-dmp", "--dmpdir", - type = "character", default = "/juno/work/access/production/resources/cbioportal/current/msk_solid_heme", + parser$add_argument( + "-dmp", + "--dmpdir", + type = "character", + default = "/juno/work/access/production/resources/cbioportal/current/msk_solid_heme", help = "Directory of clinical DMP repository [default]" ) - parser$add_argument("-mb", "--mirrorbamdir", - type = "character", default = "/juno/res/dmpcollab/dmpshare/share/irb12_245", + parser$add_argument( + "-mb", + "--mirrorbamdir", + type = "character", + default = "/juno/res/dmpcollab/dmpshare/share/irb12_245", help = "Mirror BAM file directory [default]" ) - parser$add_argument("-dmpk", "--dmpkeypath", - type = "character", default = "/juno/res/dmpcollab/dmprequest/12-245/key.txt", + parser$add_argument( + "-dmpk", + "--dmpkeypath", + type = "character", + default = "/juno/res/dmpcollab/dmprequest/12-245/key.txt", help = "DMP mirror BAM key file [default]" ) args <- parser$parse_args() @@ -256,7 +658,8 @@ if (!interactive()) { if (project.ID == "") { - project.ID <- paste0(sample(c(0:9), size = 10, replace = T), collapse = "") + project.ID <- + paste0(sample(c(0:9), size = 10, replace = T), collapse = "") } print(paste0("Input parameters for run ", project.ID)) @@ -268,6 +671,18 @@ if (!interactive()) { print(dmp.dir) print(mirror.bam.dir) print(dmp.key.path) - suppressWarnings(compile_reads(fread(master.ref), results.dir, project.ID, pooled.bam.dir, fasta.path, genotyper.path, dmp.dir, mirror.bam.dir, dmp.key.path)) + suppressWarnings( + compile_reads( + fread(master.ref), + results.dir, + project.ID, + pooled.bam.dir, + fasta.path, + genotyper.path, + dmp.dir, + mirror.bam.dir, + dmp.key.path + ) + ) print("compile reads function finished") } From f1219d256fead514bb2f6d6024084cf045620674 Mon Sep 17 
00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:12:12 -0500 Subject: [PATCH 060/126] adding access samples to genotype --- R/compile_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index a3d5b92..6b615d2 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -137,7 +137,7 @@ compile_reads <- function(master.ref, duplex_bam = paste0( mirror.access.bam.dir, "/", - access / bam.sub.dir, + access.bam.sub.dir, "/", all.dmp.bam.ids.XS, "-duplex.bam", From 9354f02150069a5a344ff67b1c4c0bff925321d8 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:13:38 -0500 Subject: [PATCH 061/126] adding access samples to genotype --- R/compile_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 6b615d2..ad07cbf 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -144,7 +144,7 @@ compile_reads <- function(master.ref, simplex_bam = paste0( mirror.access.bam.dir, "/", - access / bam.sub.dir, + access.bam.sub.dir, "/", all.dmp.bam.ids.XS, "-simplex.bam" From ac4a3dd2a9ce789113602175043f7e5f9e349459 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:15:29 -0500 Subject: [PATCH 062/126] adding access samples to genotype --- R/compile_reads.R | 195 +++++++++++++++++++++++----------------------- 1 file changed, 97 insertions(+), 98 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index ad07cbf..b88bba3 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -68,7 +68,7 @@ compile_reads <- function(master.ref, filter(Mutation_Status != "GERMLINE") %>% data.table() DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode),] # Pooled normal samples 
--------------------------------------------------- pooled.bams <- @@ -140,25 +140,26 @@ compile_reads <- function(master.ref, access.bam.sub.dir, "/", all.dmp.bam.ids.XS, - "-duplex.bam", - simplex_bam = paste0( - mirror.access.bam.dir, - "/", - access.bam.sub.dir, - "/", - all.dmp.bam.ids.XS, - "-simplex.bam" - ) + "-duplex.bam" ) - dmp.sample.sheet <- - bind_row(dmp.sample.sheet, access.sample.sheet) - ) %>% - mutate( - cmo_patient_id = x, - Sample_Type = ifelse(grepl("-T", Sample_Barcode), "DMP_Tumor", "DMP_Normal"), - dmp_patient_id = dmp_id + simplex_bam = paste0( + mirror.access.bam.dir, + "/", + access.bam.sub.dir, + "/", + all.dmp.bam.ids.XS, + "-simplex.bam" ) - } + ) + dmp.sample.sheet <- + bind_row(dmp.sample.sheet, access.sample.sheet) + ) %>% + mutate( + cmo_patient_id = x, + Sample_Type = ifelse(grepl("-T", Sample_Barcode), "DMP_Tumor", "DMP_Normal"), + dmp_patient_id = dmp_id + ) + } # total sample sheet sample.sheet <- master.ref[cmo_patient_id == x, # plasma bams -- duplex and simplex bam @@ -393,7 +394,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )])), ] %>% + )])),] %>% mutate( t_ref_count = 0, t_alt_count = 0, @@ -496,87 +497,85 @@ compile_reads <- function(master.ref, }) - # Get base count multi sample in pooled normal ---------------------------- - # all all unique calls in entire cohort - print("Compiling reads in pooled samples") - dir.create(paste0(results.dir, "/pooled")) - all.all.unique.mafs <- - do.call(rbind, lapply(unique(master.ref$cmo_patient_id), function(x) { - fread(list.files( - paste0(results.dir, "/", x), - pattern = "unique_calls.maf$", - full.names = T - )) - })) - all.all.unique.mafs <- - all.all.unique.mafs[!duplicated(all.all.unique.mafs[, .( - Hugo_Symbol, - Chromosome, - Start_Position, - End_Position, - Variant_Classification, - HGVSp_Short, - Reference_Allele, - Tumor_Seq_Allele2 - )]),] - write.table( - all.all.unique.mafs, - paste0(results.dir, 
"/pooled/all_all_unique.maf"), - sep = "\t", - quote = F, - row.names = F - ) + # Get base count multi sample in pooled normal ---------------------------- + # all all unique calls in entire cohort + print("Compiling reads in pooled samples") + dir.create(paste0(results.dir, "/pooled")) + all.all.unique.mafs <- + do.call(rbind, lapply(unique(master.ref$cmo_patient_id), function(x) { + fread(list.files( + paste0(results.dir, "/", x), + pattern = "unique_calls.maf$", + full.names = T + )) + })) + all.all.unique.mafs <- + all.all.unique.mafs[!duplicated(all.all.unique.mafs[, .( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2 + )]), ] + write.table( + all.all.unique.mafs, + paste0(results.dir, "/pooled/all_all_unique.maf"), + sep = "\t", + quote = F, + row.names = F + ) - write.table( - data.frame( - sample_id = gsub("^.*./|.bam", "", pooled.bams), - maf = paste0(results.dir, "/pooled/all_all_unique.maf"), - standard_bam = pooled.bams, - duplex_bam = "", - simplex_bam = "" - ), - paste0(results.dir, "/pooled/pooled_metadata.tsv"), - sep = "\t", - quote = F, - row.names = F - ) + write.table( + data.frame( + sample_id = gsub("^.*./|.bam", "", pooled.bams), + maf = paste0(results.dir, "/pooled/all_all_unique.maf"), + standard_bam = pooled.bams, + duplex_bam = "", + simplex_bam = "" + ), + paste0(results.dir, "/pooled/pooled_metadata.tsv"), + sep = "\t", + quote = F, + row.names = F + ) - pooled.sample.job.id <- system( - paste0( - "bsub -cwd ", - results.dir, - '/pooled -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', - " -w ", - ' \"', - paste0(paste0( - "done(", unlist(all.fillout.id), ")" - ), collapse = "&&"), - '\" ', - " -P ", - project.ID, - " -J pooled_genotype_variants ", - " genotype_variants small_variants multiple-samples -i ", - results.dir, - "/pooled/pooled_metadata.tsv", - " -r ", - fasta.path, - " -g ", - genotyper.path, - " -v DEBUG " - ), - 
intern = T - ) - pooled.sample.job.id <- - as.numeric(gsub( - "Job <|> is.*.$", "", pooled.sample.job.id - )) - while (!any(grepl( - "Done successfully", system(paste0("bjobs -l ", pooled.sample.job.id), intern = T) - ))) { - Sys.sleep(120) - } - print("Compile reads done!") - } + pooled.sample.job.id <- system( + paste0( + "bsub -cwd ", + results.dir, + '/pooled -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', + " -w ", + ' \"', + paste0(paste0( + "done(", unlist(all.fillout.id), ")" + ), collapse = "&&"), + '\" ', + " -P ", + project.ID, + " -J pooled_genotype_variants ", + " genotype_variants small_variants multiple-samples -i ", + results.dir, + "/pooled/pooled_metadata.tsv", + " -r ", + fasta.path, + " -g ", + genotyper.path, + " -v DEBUG " + ), + intern = T + ) + pooled.sample.job.id <- + as.numeric(gsub("Job <|> is.*.$", "", pooled.sample.job.id)) + while (!any(grepl("Done successfully", system( + paste0("bjobs -l ", pooled.sample.job.id), intern = T + )))) { + Sys.sleep(120) + } + print("Compile reads done!") + } # Executable ----------------------------------------------------------------------------------------------------------- # Minimal columns for input mafs From 1bf18b77340dd65249c7a7eb6d6aadc3a7c52111 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:16:23 -0500 Subject: [PATCH 063/126] adding access samples to genotype --- R/compile_reads.R | 177 +++++++++++++++++++++++----------------------- 1 file changed, 88 insertions(+), 89 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index b88bba3..d574c1c 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -68,7 +68,7 @@ compile_reads <- function(master.ref, filter(Mutation_Status != "GERMLINE") %>% data.table() DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode),] + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = 
"|"), Tumor_Sample_Barcode), ] # Pooled normal samples --------------------------------------------------- pooled.bams <- @@ -151,15 +151,16 @@ compile_reads <- function(master.ref, "-simplex.bam" ) ) - dmp.sample.sheet <- - bind_row(dmp.sample.sheet, access.sample.sheet) - ) %>% - mutate( - cmo_patient_id = x, - Sample_Type = ifelse(grepl("-T", Sample_Barcode), "DMP_Tumor", "DMP_Normal"), - dmp_patient_id = dmp_id - ) - } + ) + dmp.sample.sheet <- + bind_row(dmp.sample.sheet, access.sample.sheet) + ) %>% + mutate( + cmo_patient_id = x, + Sample_Type = ifelse(grepl("-T", Sample_Barcode), "DMP_Tumor", "DMP_Normal"), + dmp_patient_id = dmp_id + ) + } # total sample sheet sample.sheet <- master.ref[cmo_patient_id == x, # plasma bams -- duplex and simplex bam @@ -394,7 +395,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )])),] %>% + )])), ] %>% mutate( t_ref_count = 0, t_alt_count = 0, @@ -494,88 +495,86 @@ compile_reads <- function(master.ref, intern = T ) job.ids <- as.numeric(gsub("Job <|> is.*.$", "", job.ids)) - }) + }) - # Get base count multi sample in pooled normal ---------------------------- - # all all unique calls in entire cohort - print("Compiling reads in pooled samples") - dir.create(paste0(results.dir, "/pooled")) - all.all.unique.mafs <- - do.call(rbind, lapply(unique(master.ref$cmo_patient_id), function(x) { - fread(list.files( - paste0(results.dir, "/", x), - pattern = "unique_calls.maf$", - full.names = T - )) - })) - all.all.unique.mafs <- - all.all.unique.mafs[!duplicated(all.all.unique.mafs[, .( - Hugo_Symbol, - Chromosome, - Start_Position, - End_Position, - Variant_Classification, - HGVSp_Short, - Reference_Allele, - Tumor_Seq_Allele2 - )]), ] - write.table( - all.all.unique.mafs, - paste0(results.dir, "/pooled/all_all_unique.maf"), - sep = "\t", - quote = F, - row.names = F - ) + # Get base count multi sample in pooled normal ---------------------------- + # all all unique calls in entire cohort 
+ print("Compiling reads in pooled samples") + dir.create(paste0(results.dir, "/pooled")) + all.all.unique.mafs <- + do.call(rbind, lapply(unique(master.ref$cmo_patient_id), function(x) { + fread(list.files( + paste0(results.dir, "/", x), + pattern = "unique_calls.maf$", + full.names = T + )) + })) + all.all.unique.mafs <- + all.all.unique.mafs[!duplicated(all.all.unique.mafs[, .( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2 + )]),] + write.table( + all.all.unique.mafs, + paste0(results.dir, "/pooled/all_all_unique.maf"), + sep = "\t", + quote = F, + row.names = F + ) - write.table( - data.frame( - sample_id = gsub("^.*./|.bam", "", pooled.bams), - maf = paste0(results.dir, "/pooled/all_all_unique.maf"), - standard_bam = pooled.bams, - duplex_bam = "", - simplex_bam = "" - ), - paste0(results.dir, "/pooled/pooled_metadata.tsv"), - sep = "\t", - quote = F, - row.names = F - ) + write.table( + data.frame( + sample_id = gsub("^.*./|.bam", "", pooled.bams), + maf = paste0(results.dir, "/pooled/all_all_unique.maf"), + standard_bam = pooled.bams, + duplex_bam = "", + simplex_bam = "" + ), + paste0(results.dir, "/pooled/pooled_metadata.tsv"), + sep = "\t", + quote = F, + row.names = F + ) - pooled.sample.job.id <- system( - paste0( - "bsub -cwd ", - results.dir, - '/pooled -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', - " -w ", - ' \"', - paste0(paste0( - "done(", unlist(all.fillout.id), ")" - ), collapse = "&&"), - '\" ', - " -P ", - project.ID, - " -J pooled_genotype_variants ", - " genotype_variants small_variants multiple-samples -i ", - results.dir, - "/pooled/pooled_metadata.tsv", - " -r ", - fasta.path, - " -g ", - genotyper.path, - " -v DEBUG " - ), - intern = T - ) - pooled.sample.job.id <- - as.numeric(gsub("Job <|> is.*.$", "", pooled.sample.job.id)) - while (!any(grepl("Done successfully", system( - paste0("bjobs -l ", pooled.sample.job.id), 
intern = T - )))) { - Sys.sleep(120) - } - print("Compile reads done!") - } + pooled.sample.job.id <- system( + paste0( + "bsub -cwd ", + results.dir, + '/pooled -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', + " -w ", + ' \"', + paste0(paste0("done(", unlist(all.fillout.id), ")"), collapse = "&&"), + '\" ', + " -P ", + project.ID, + " -J pooled_genotype_variants ", + " genotype_variants small_variants multiple-samples -i ", + results.dir, + "/pooled/pooled_metadata.tsv", + " -r ", + fasta.path, + " -g ", + genotyper.path, + " -v DEBUG " + ), + intern = T + ) + pooled.sample.job.id <- + as.numeric(gsub("Job <|> is.*.$", "", pooled.sample.job.id)) + while (!any(grepl("Done successfully", system( + paste0("bjobs -l ", pooled.sample.job.id), intern = T + )))) { + Sys.sleep(120) + } + print("Compile reads done!") +} # Executable ----------------------------------------------------------------------------------------------------------- # Minimal columns for input mafs From 5ff880b68a2ca24d0464e3e4c8142b8a9fe62d1b Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:20:07 -0500 Subject: [PATCH 064/126] adding access samples to genotype --- R/compile_reads.R | 224 +++++++++++++++++++++++----------------------- 1 file changed, 113 insertions(+), 111 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index d574c1c..e581599 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -68,7 +68,7 @@ compile_reads <- function(master.ref, filter(Mutation_Status != "GERMLINE") %>% data.table() DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode),] # Pooled normal samples --------------------------------------------------- pooled.bams <- @@ -125,42 +125,44 @@ compile_reads <- function(master.ref, all.dmp.bam.ids, ".bam" ) - 
access.bam.sub.dir <- - unlist(lapply(strsplit( - substr(all.dmp.bam.ids.XS, 1, 2), "" - ), function(x) { - paste0(x, collapse = "/") - })) - access.sample.sheet <- unique( - data.frame( - Sample_Barcode = all.dmp.ids.XS, - duplex_bam = paste0( - mirror.access.bam.dir, - "/", - access.bam.sub.dir, - "/", - all.dmp.bam.ids.XS, - "-duplex.bam" - ) - simplex_bam = paste0( - mirror.access.bam.dir, - "/", - access.bam.sub.dir, - "/", - all.dmp.bam.ids.XS, - "-simplex.bam" - ) + ) + access.bam.sub.dir <- + unlist(lapply(strsplit( + substr(all.dmp.bam.ids.XS, 1, 2), "" + ), function(x) { + paste0(x, collapse = "/") + })) + access.sample.sheet <- unique( + data.frame( + Sample_Barcode = all.dmp.ids.XS, + duplex_bam = paste0( + mirror.access.bam.dir, + "/", + access.bam.sub.dir, + "/", + all.dmp.bam.ids.XS, + "-duplex.bam" + ) + simplex_bam = paste0( + mirror.access.bam.dir, + "/", + access.bam.sub.dir, + "/", + all.dmp.bam.ids.XS, + "-simplex.bam" ) ) - dmp.sample.sheet <- - bind_row(dmp.sample.sheet, access.sample.sheet) - ) %>% - mutate( - cmo_patient_id = x, - Sample_Type = ifelse(grepl("-T", Sample_Barcode), "DMP_Tumor", "DMP_Normal"), - dmp_patient_id = dmp_id - ) - } + ) + dmp.sample.sheet <- + bind_row(dmp.sample.sheet, access.sample.sheet) + ) + %>% + mutate( + cmo_patient_id = x, + Sample_Type = ifelse(grepl("-T", Sample_Barcode), "DMP_Tumor", "DMP_Normal"), + dmp_patient_id = dmp_id + ) + } # total sample sheet sample.sheet <- master.ref[cmo_patient_id == x, # plasma bams -- duplex and simplex bam @@ -395,7 +397,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )])), ] %>% + )])),] %>% mutate( t_ref_count = 0, t_alt_count = 0, @@ -495,85 +497,85 @@ compile_reads <- function(master.ref, intern = T ) job.ids <- as.numeric(gsub("Job <|> is.*.$", "", job.ids)) - }) + }) - # Get base count multi sample in pooled normal ---------------------------- - # all all unique calls in entire cohort - print("Compiling reads in pooled 
samples") - dir.create(paste0(results.dir, "/pooled")) - all.all.unique.mafs <- - do.call(rbind, lapply(unique(master.ref$cmo_patient_id), function(x) { - fread(list.files( - paste0(results.dir, "/", x), - pattern = "unique_calls.maf$", - full.names = T - )) - })) - all.all.unique.mafs <- - all.all.unique.mafs[!duplicated(all.all.unique.mafs[, .( - Hugo_Symbol, - Chromosome, - Start_Position, - End_Position, - Variant_Classification, - HGVSp_Short, - Reference_Allele, - Tumor_Seq_Allele2 - )]),] - write.table( - all.all.unique.mafs, - paste0(results.dir, "/pooled/all_all_unique.maf"), - sep = "\t", - quote = F, - row.names = F - ) +# Get base count multi sample in pooled normal ---------------------------- +# all all unique calls in entire cohort +print("Compiling reads in pooled samples") +dir.create(paste0(results.dir, "/pooled")) +all.all.unique.mafs <- + do.call(rbind, lapply(unique(master.ref$cmo_patient_id), function(x) { + fread(list.files( + paste0(results.dir, "/", x), + pattern = "unique_calls.maf$", + full.names = T + )) + })) +all.all.unique.mafs <- + all.all.unique.mafs[!duplicated(all.all.unique.mafs[, .( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2 + )]),] +write.table( + all.all.unique.mafs, + paste0(results.dir, "/pooled/all_all_unique.maf"), + sep = "\t", + quote = F, + row.names = F +) - write.table( - data.frame( - sample_id = gsub("^.*./|.bam", "", pooled.bams), - maf = paste0(results.dir, "/pooled/all_all_unique.maf"), - standard_bam = pooled.bams, - duplex_bam = "", - simplex_bam = "" - ), - paste0(results.dir, "/pooled/pooled_metadata.tsv"), - sep = "\t", - quote = F, - row.names = F - ) +write.table( + data.frame( + sample_id = gsub("^.*./|.bam", "", pooled.bams), + maf = paste0(results.dir, "/pooled/all_all_unique.maf"), + standard_bam = pooled.bams, + duplex_bam = "", + simplex_bam = "" + ), + paste0(results.dir, 
"/pooled/pooled_metadata.tsv"), + sep = "\t", + quote = F, + row.names = F +) - pooled.sample.job.id <- system( - paste0( - "bsub -cwd ", - results.dir, - '/pooled -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', - " -w ", - ' \"', - paste0(paste0("done(", unlist(all.fillout.id), ")"), collapse = "&&"), - '\" ', - " -P ", - project.ID, - " -J pooled_genotype_variants ", - " genotype_variants small_variants multiple-samples -i ", - results.dir, - "/pooled/pooled_metadata.tsv", - " -r ", - fasta.path, - " -g ", - genotyper.path, - " -v DEBUG " - ), - intern = T - ) - pooled.sample.job.id <- - as.numeric(gsub("Job <|> is.*.$", "", pooled.sample.job.id)) - while (!any(grepl("Done successfully", system( - paste0("bjobs -l ", pooled.sample.job.id), intern = T - )))) { - Sys.sleep(120) - } - print("Compile reads done!") +pooled.sample.job.id <- system( + paste0( + "bsub -cwd ", + results.dir, + '/pooled -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', + " -w ", + ' \"', + paste0(paste0("done(", unlist(all.fillout.id), ")"), collapse = "&&"), + '\" ', + " -P ", + project.ID, + " -J pooled_genotype_variants ", + " genotype_variants small_variants multiple-samples -i ", + results.dir, + "/pooled/pooled_metadata.tsv", + " -r ", + fasta.path, + " -g ", + genotyper.path, + " -v DEBUG " + ), + intern = T +) +pooled.sample.job.id <- + as.numeric(gsub("Job <|> is.*.$", "", pooled.sample.job.id)) +while (!any(grepl("Done successfully", system( + paste0("bjobs -l ", pooled.sample.job.id), intern = T +)))) { + Sys.sleep(120) +} +print("Compile reads done!") } # Executable ----------------------------------------------------------------------------------------------------------- From ed8e2f0d1c83d219e5a40bd13ab018b0fec85f1d Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:21:31 -0500 Subject: [PATCH 065/126] adding access samples to genotype --- R/compile_reads.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/R/compile_reads.R b/R/compile_reads.R index e581599..af7b655 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -68,7 +68,7 @@ compile_reads <- function(master.ref, filter(Mutation_Status != "GERMLINE") %>% data.table() DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode),] + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] # Pooled normal samples --------------------------------------------------- pooled.bams <- @@ -142,7 +142,7 @@ compile_reads <- function(master.ref, "/", all.dmp.bam.ids.XS, "-duplex.bam" - ) + ), simplex_bam = paste0( mirror.access.bam.dir, "/", @@ -397,7 +397,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )])),] %>% + )])), ] %>% mutate( t_ref_count = 0, t_alt_count = 0, From 33f0e4b1c52eab18be13ca5aa41effad9409c6b5 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:22:33 -0500 Subject: [PATCH 066/126] adding access samples to genotype --- R/compile_reads.R | 169 +++++++++++++++++++++++----------------------- 1 file changed, 84 insertions(+), 85 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index af7b655..fe6bee5 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -68,7 +68,7 @@ compile_reads <- function(master.ref, filter(Mutation_Status != "GERMLINE") %>% data.table() DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode),] # Pooled normal samples --------------------------------------------------- pooled.bams <- @@ -155,14 +155,13 @@ compile_reads <- function(master.ref, ) dmp.sample.sheet <- bind_row(dmp.sample.sheet, access.sample.sheet) - ) - %>% - mutate( - 
cmo_patient_id = x, - Sample_Type = ifelse(grepl("-T", Sample_Barcode), "DMP_Tumor", "DMP_Normal"), - dmp_patient_id = dmp_id - ) - } + %>% + mutate( + cmo_patient_id = x, + Sample_Type = ifelse(grepl("-T", Sample_Barcode), "DMP_Tumor", "DMP_Normal"), + dmp_patient_id = dmp_id + ) + } # total sample sheet sample.sheet <- master.ref[cmo_patient_id == x, # plasma bams -- duplex and simplex bam @@ -397,7 +396,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )])), ] %>% + )])),] %>% mutate( t_ref_count = 0, t_alt_count = 0, @@ -497,85 +496,85 @@ compile_reads <- function(master.ref, intern = T ) job.ids <- as.numeric(gsub("Job <|> is.*.$", "", job.ids)) - }) + }) -# Get base count multi sample in pooled normal ---------------------------- -# all all unique calls in entire cohort -print("Compiling reads in pooled samples") -dir.create(paste0(results.dir, "/pooled")) -all.all.unique.mafs <- - do.call(rbind, lapply(unique(master.ref$cmo_patient_id), function(x) { - fread(list.files( - paste0(results.dir, "/", x), - pattern = "unique_calls.maf$", - full.names = T - )) - })) -all.all.unique.mafs <- - all.all.unique.mafs[!duplicated(all.all.unique.mafs[, .( - Hugo_Symbol, - Chromosome, - Start_Position, - End_Position, - Variant_Classification, - HGVSp_Short, - Reference_Allele, - Tumor_Seq_Allele2 - )]),] -write.table( - all.all.unique.mafs, - paste0(results.dir, "/pooled/all_all_unique.maf"), - sep = "\t", - quote = F, - row.names = F -) + # Get base count multi sample in pooled normal ---------------------------- + # all all unique calls in entire cohort + print("Compiling reads in pooled samples") + dir.create(paste0(results.dir, "/pooled")) + all.all.unique.mafs <- + do.call(rbind, lapply(unique(master.ref$cmo_patient_id), function(x) { + fread(list.files( + paste0(results.dir, "/", x), + pattern = "unique_calls.maf$", + full.names = T + )) + })) + all.all.unique.mafs <- + all.all.unique.mafs[!duplicated(all.all.unique.mafs[, 
.( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2 + )]), ] + write.table( + all.all.unique.mafs, + paste0(results.dir, "/pooled/all_all_unique.maf"), + sep = "\t", + quote = F, + row.names = F + ) -write.table( - data.frame( - sample_id = gsub("^.*./|.bam", "", pooled.bams), - maf = paste0(results.dir, "/pooled/all_all_unique.maf"), - standard_bam = pooled.bams, - duplex_bam = "", - simplex_bam = "" - ), - paste0(results.dir, "/pooled/pooled_metadata.tsv"), - sep = "\t", - quote = F, - row.names = F -) + write.table( + data.frame( + sample_id = gsub("^.*./|.bam", "", pooled.bams), + maf = paste0(results.dir, "/pooled/all_all_unique.maf"), + standard_bam = pooled.bams, + duplex_bam = "", + simplex_bam = "" + ), + paste0(results.dir, "/pooled/pooled_metadata.tsv"), + sep = "\t", + quote = F, + row.names = F + ) -pooled.sample.job.id <- system( - paste0( - "bsub -cwd ", - results.dir, - '/pooled -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', - " -w ", - ' \"', - paste0(paste0("done(", unlist(all.fillout.id), ")"), collapse = "&&"), - '\" ', - " -P ", - project.ID, - " -J pooled_genotype_variants ", - " genotype_variants small_variants multiple-samples -i ", - results.dir, - "/pooled/pooled_metadata.tsv", - " -r ", - fasta.path, - " -g ", - genotyper.path, - " -v DEBUG " - ), - intern = T -) -pooled.sample.job.id <- - as.numeric(gsub("Job <|> is.*.$", "", pooled.sample.job.id)) -while (!any(grepl("Done successfully", system( - paste0("bjobs -l ", pooled.sample.job.id), intern = T -)))) { - Sys.sleep(120) -} -print("Compile reads done!") + pooled.sample.job.id <- system( + paste0( + "bsub -cwd ", + results.dir, + '/pooled -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', + " -w ", + ' \"', + paste0(paste0("done(", unlist(all.fillout.id), ")"), collapse = "&&"), + '\" ', + " -P ", + project.ID, + " -J pooled_genotype_variants ", + " 
genotype_variants small_variants multiple-samples -i ", + results.dir, + "/pooled/pooled_metadata.tsv", + " -r ", + fasta.path, + " -g ", + genotyper.path, + " -v DEBUG " + ), + intern = T + ) + pooled.sample.job.id <- + as.numeric(gsub("Job <|> is.*.$", "", pooled.sample.job.id)) + while (!any(grepl("Done successfully", system( + paste0("bjobs -l ", pooled.sample.job.id), intern = T + )))) { + Sys.sleep(120) + } + print("Compile reads done!") } # Executable ----------------------------------------------------------------------------------------------------------- From 6de50741aad96457938a6e8099c846d16266b48d Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:25:06 -0500 Subject: [PATCH 067/126] adding access samples to genotype --- R/compile_reads.R | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index fe6bee5..229fef2 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -68,7 +68,7 @@ compile_reads <- function(master.ref, filter(Mutation_Status != "GERMLINE") %>% data.table() DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode),] + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] # Pooled normal samples --------------------------------------------------- pooled.bams <- @@ -154,8 +154,7 @@ compile_reads <- function(master.ref, ) ) dmp.sample.sheet <- - bind_row(dmp.sample.sheet, access.sample.sheet) - %>% + bind_row(dmp.sample.sheet, access.sample.sheet) %>% mutate( cmo_patient_id = x, Sample_Type = ifelse(grepl("-T", Sample_Barcode), "DMP_Tumor", "DMP_Normal"), @@ -396,7 +395,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )])),] %>% + )])), ] %>% mutate( t_ref_count = 0, t_alt_count = 0, @@ -521,7 +520,7 @@ compile_reads <- function(master.ref, HGVSp_Short, 
Reference_Allele, Tumor_Seq_Allele2 - )]), ] + )]),] write.table( all.all.unique.mafs, paste0(results.dir, "/pooled/all_all_unique.maf"), From 5f3fcfccf2b7b10c94b5fbf6b1462ab2cc29ee6d Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:27:15 -0500 Subject: [PATCH 068/126] adding access samples to genotype --- R/compile_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 229fef2..a979def 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -43,7 +43,7 @@ compile_reads <- function(master.ref, if (any( !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.|-T..-XS", "", DMP.key[grepl("IH|IM|XS", V1)]$V1) )) { - stop(paste0( + warning(paste0( "These DMP IDs are not found in DMP key file: ", paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which( !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% From fac54d0b36b7057cf1a9a9373ca680141c0f3dd2 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:32:16 -0500 Subject: [PATCH 069/126] adding access samples to genotype --- R/compile_reads.R | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/R/compile_reads.R b/R/compile_reads.R index a979def..605bd7a 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -635,6 +635,13 @@ if (!interactive()) { default = "/juno/res/dmpcollab/dmpshare/share/irb12_245", help = "Mirror BAM file directory [default]" ) + parser$add_argument( + "-mab", + "--mirroraccessbamdir", + type = "character", + default = "/juno/res/dmpcollab/dmpshare/share/access_12_245", + help = "Mirror BAM file directory for MSK-ACCESS [default]" + ) parser$add_argument( "-dmpk", "--dmpkeypath", @@ -642,6 +649,13 @@ if (!interactive()) { default = "/juno/res/dmpcollab/dmprequest/12-245/key.txt", help = "DMP mirror BAM key file [default]" ) + parser$add_argument( + "-dmpak", + "--dmpaccesskeypath", + type = "character", + default = " 
/juno/res/dmpcollab/dmprequest/ACCESS-12-245/key.txt", + help = "DMP mirror BAM key file for MSK-ACCESS [default]" + ) args <- parser$parse_args() master.ref <- args$masterref @@ -652,7 +666,9 @@ if (!interactive()) { genotyper.path <- args$genotyperpath dmp.dir <- args$dmpdir mirror.bam.dir <- args$mirrorbamdir + mirror.acess.bam.dir <- args$mirroraccessbamdir dmp.key.path <- args$dmpkeypath + access.key.path <- args$dmpaccesskeypath if (project.ID == "") { From 0d4e7dd4a96212d1ec95bdf77bcf55617c40c6be Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:34:16 -0500 Subject: [PATCH 070/126] adding access samples to genotype --- R/compile_reads.R | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 605bd7a..7274bfb 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -51,7 +51,7 @@ compile_reads <- function(master.ref, )], collapse = " ,") )) } - acccess.key <- fread(access.key.path) + access.key <- fread(access.key.path) if (any( !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.|-T..-XS", "", access.key[grepl("IH|IM|XS", V1)]$V1) )) { @@ -68,7 +68,7 @@ compile_reads <- function(master.ref, filter(Mutation_Status != "GERMLINE") %>% data.table() DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode),] # Pooled normal samples --------------------------------------------------- pooled.bams <- @@ -395,7 +395,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )])), ] %>% + )])),] %>% mutate( t_ref_count = 0, t_alt_count = 0, @@ -520,7 +520,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )]),] + )]), ] write.table( all.all.unique.mafs, 
paste0(results.dir, "/pooled/all_all_unique.maf"), @@ -666,7 +666,7 @@ if (!interactive()) { genotyper.path <- args$genotyperpath dmp.dir <- args$dmpdir mirror.bam.dir <- args$mirrorbamdir - mirror.acess.bam.dir <- args$mirroraccessbamdir + mirror.access.bam.dir <- args$mirroraccessbamdir dmp.key.path <- args$dmpkeypath access.key.path <- args$dmpaccesskeypath @@ -684,7 +684,9 @@ if (!interactive()) { print(genotyper.path) print(dmp.dir) print(mirror.bam.dir) - print(dmp.key.path) + print(mirror.bam.dir) + print(dmp.access.key.path) + print(access.key.path) suppressWarnings( compile_reads( fread(master.ref), From fa8f465418365a124c50170c11e7e105441cc44b Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:35:54 -0500 Subject: [PATCH 071/126] adding access samples to genotype --- R/compile_reads.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 7274bfb..9c4c22f 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -684,8 +684,8 @@ if (!interactive()) { print(genotyper.path) print(dmp.dir) print(mirror.bam.dir) - print(mirror.bam.dir) - print(dmp.access.key.path) + print(mirror.access.bam.dir) + print(dmp.key.path) print(access.key.path) suppressWarnings( compile_reads( From 22c161f7744112fa47dfe78596f021d8adb360e6 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:37:22 -0500 Subject: [PATCH 072/126] adding access samples to genotype --- R/compile_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 9c4c22f..a1d14b9 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -55,7 +55,7 @@ compile_reads <- function(master.ref, if (any( !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.|-T..-XS", "", access.key[grepl("IH|IM|XS", V1)]$V1) )) { - stop(paste0( + warning(paste0( "These DMP IDs are not found in DMP key file: ", paste0(master.ref[grepl("^P-", 
dmp_patient_id)]$dmp_patient_id[which( !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% From ae9a31f5918749579464f54c5dd6a36792e7b389 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:39:04 -0500 Subject: [PATCH 073/126] adding access samples to genotype --- R/compile_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index a1d14b9..9a46e76 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -96,7 +96,7 @@ compile_reads <- function(master.ref, all.dmp.ids.IH <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V1 all.dmp.ids.XS <- - acccess.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V1 + access.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V1 all.dmp.ids <- c(all.dmp.ids.IM, all.dmp.ids.IH) all.dmp.bam.ids.IM <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V2 From eef32cdf44e4405c8d6acdcc532e262c5386baa6 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:45:26 -0500 Subject: [PATCH 074/126] adding access samples to genotype --- R/compile_reads.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/compile_reads.R b/R/compile_reads.R index 9a46e76..3026cf6 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -102,10 +102,14 @@ compile_reads <- function(master.ref, DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V2 all.dmp.bam.ids.IH <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V2 + all.dmp.bam.ids.XS <- + access.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V2) + cat(all.dmp.bam.ids.XS) all.dmp.bam.ids.XS <- gsub("-standard|-unfilter|-simplex|-duplex", "", access.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V2) + cat(all.dmp.bam.ids.XS) all.dmp.bam.ids <- c(all.dmp.bam.ids.IM, all.dmp.bam.ids.IH) From 43ef060f9b5f12a3c9eca8cb29887bf66e569f5d Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:46:30 -0500 Subject: [PATCH 075/126] adding access samples to genotype --- R/compile_reads.R | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 3026cf6..7608566 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -103,7 +103,7 @@ compile_reads <- function(master.ref, all.dmp.bam.ids.IH <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V2 all.dmp.bam.ids.XS <- - access.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V2) + access.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V2 cat(all.dmp.bam.ids.XS) all.dmp.bam.ids.XS <- gsub("-standard|-unfilter|-simplex|-duplex", From 3eba1791e1409bed71e99d64e86abeec4af2f49b Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:47:43 -0500 Subject: [PATCH 076/126] adding access samples to genotype --- R/compile_reads.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 7608566..c0b3c15 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -104,12 +104,12 @@ compile_reads <- function(master.ref, DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V2 all.dmp.bam.ids.XS <- access.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V2 - cat(all.dmp.bam.ids.XS) + print(all.dmp.bam.ids.XS) all.dmp.bam.ids.XS <- gsub("-standard|-unfilter|-simplex|-duplex", "", access.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V2) - cat(all.dmp.bam.ids.XS) + print(all.dmp.bam.ids.XS) all.dmp.bam.ids <- c(all.dmp.bam.ids.IM, all.dmp.bam.ids.IH) From b68868f4d8d6fdd4da1c545b283f6f28968369fb Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 14:57:34 -0500 Subject: [PATCH 077/126] adding access samples to genotype --- R/compile_reads.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index c0b3c15..4d15cbb 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -43,7 +43,7 @@ compile_reads <- function(master.ref, if (any( !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.|-T..-XS", "", DMP.key[grepl("IH|IM|XS", 
V1)]$V1) )) { - warning(paste0( + message(paste0( "These DMP IDs are not found in DMP key file: ", paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which( !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% @@ -55,7 +55,7 @@ compile_reads <- function(master.ref, if (any( !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.|-T..-XS", "", access.key[grepl("IH|IM|XS", V1)]$V1) )) { - warning(paste0( + message(paste0( "These DMP IDs are not found in DMP key file: ", paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which( !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% @@ -63,6 +63,7 @@ compile_reads <- function(master.ref, )], collapse = " ,") )) } + print(dmp_patient_id) DMP.maf <- fread(paste0(dmp.dir, "/data_mutations_extended.txt")) %>% filter(Mutation_Status != "GERMLINE") %>% From 8f9defecf874ed17bad3028c43af9b8ab30a2c14 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 15:29:38 -0500 Subject: [PATCH 078/126] adding access samples to genotype --- R/compile_reads.R | 1033 +++++++++++++++++++++++---------------------- 1 file changed, 525 insertions(+), 508 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 4d15cbb..296c02e 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -63,13 +63,12 @@ compile_reads <- function(master.ref, )], collapse = " ,") )) } - print(dmp_patient_id) DMP.maf <- fread(paste0(dmp.dir, "/data_mutations_extended.txt")) %>% filter(Mutation_Status != "GERMLINE") %>% data.table() DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode),] + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] # Pooled normal samples --------------------------------------------------- pooled.bams <- @@ -103,404 +102,422 @@ compile_reads <- function(master.ref, 
DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V2 all.dmp.bam.ids.IH <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V2 - all.dmp.bam.ids.XS <- - access.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V2 - print(all.dmp.bam.ids.XS) all.dmp.bam.ids.XS <- gsub("-standard|-unfilter|-simplex|-duplex", "", access.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V2) - print(all.dmp.bam.ids.XS) - all.dmp.bam.ids <- - c(all.dmp.bam.ids.IM, - all.dmp.bam.ids.IH) - bam.sub.dir <- - unlist(lapply(strsplit(substr( - all.dmp.bam.ids, 1, 2 - ), ""), function(x) { - paste0(x, collapse = "/") - })) - dmp.sample.sheet <- data.frame( - Sample_Barcode = all.dmp.ids, - standard_bam = paste0( - mirror.bam.dir, - "/", - bam.sub.dir, - "/", - all.dmp.bam.ids, - ".bam" - ) - ) - access.bam.sub.dir <- - unlist(lapply(strsplit( - substr(all.dmp.bam.ids.XS, 1, 2), "" - ), function(x) { - paste0(x, collapse = "/") - })) - access.sample.sheet <- unique( - data.frame( - Sample_Barcode = all.dmp.ids.XS, - duplex_bam = paste0( - mirror.access.bam.dir, - "/", - access.bam.sub.dir, + if (is.null(all.dmp.ids)) { + dmp.sample.sheet <- NULL + } else{ + all.dmp.bam.ids <- + c(all.dmp.bam.ids.IM, + all.dmp.bam.ids.IH) + bam.sub.dir <- + unlist(lapply(strsplit(substr( + all.dmp.bam.ids, 1, 2 + ), ""), function(x) { + paste0(x, collapse = "/") + })) + dmp.sample.sheet <- data.frame( + Sample_Barcode = all.dmp.ids, + standard_bam = paste0( + mirror.bam.dir, "/", - all.dmp.bam.ids.XS, - "-duplex.bam" - ), - simplex_bam = paste0( - mirror.access.bam.dir, + bam.sub.dir, "/", - access.bam.sub.dir, - "/", - all.dmp.bam.ids.XS, - "-simplex.bam" + all.dmp.bam.ids, + ".bam" + ) + ) + } + if (is.null(all.dmp.bam.ids.XS)) { + access.sample.sheet <- NULL + } else{ + access.bam.sub.dir <- + unlist(lapply(strsplit( + substr(all.dmp.bam.ids.XS, 1, 2), "" + ), function(x) { + paste0(x, collapse = "/") + })) + access.sample.sheet <- unique( + data.frame( + Sample_Barcode = all.dmp.ids.XS, + duplex_bam = paste0( + 
mirror.access.bam.dir, + "/", + access.bam.sub.dir, + "/", + all.dmp.bam.ids.XS, + "-duplex.bam" + ), + simplex_bam = paste0( + mirror.access.bam.dir, + "/", + access.bam.sub.dir, + "/", + all.dmp.bam.ids.XS, + "-simplex.bam" + ) ) ) + if (is.null(dmp.sample.sheet) & + is.null(access.sample.sheet)) { + dmp.sample.sheet <- NULL + } else if (is.null(dmp.sample.sheet) & + !is.null(access.sample.sheet)) { + dmp.sample.sheet <- access.sample.sheet + } else if (!is.null(dmp.sample.sheet) & + is.null(access.sample.sheet)) { + dmp.sample.sheet <- dmp.sample.sheet + } else{ + dmp.sample.sheet <- + bind_row(dmp.sample.sheet, access.sample.sheet) + } %>% + mutate( + cmo_patient_id = x, + Sample_Type = ifelse( + grepl("-T", Sample_Barcode), + "DMP_Tumor", + "DMP_Normal" + ), + dmp_patient_id = dmp_id + ) + } + # total sample sheet + sample.sheet <- master.ref[cmo_patient_id == x, + # plasma bams -- duplex and simplex bam + .( + Sample_Barcode = as.character(cmo_sample_id_plasma), + duplex_bam = bam_path_plasma_duplex, + simplex_bam = bam_path_plasma_simplex, + cmo_patient_id, + Sample_Type = "duplex", + dmp_patient_id + )] %>% + merge(rbind(unique(master.ref[cmo_patient_id == x & + paired == 'Paired', + # buffy coat + DMP bams -- standard bam only + .( + Sample_Barcode = as.character(cmo_sample_id_normal), + standard_bam = bam_path_normal, + cmo_patient_id, + Sample_Type = "unfilterednormal", + dmp_patient_id + )]), + dmp.sample.sheet), all = T) + # catch '' or NA for empty cells for some cmo_sample_id_normal + sample.sheet <- + sample.sheet[!is.na(Sample_Barcode) | Sample_Barcode != ""] + write.table( + sample.sheet, + paste0(results.dir, "/", x, "/", x, "_sample_sheet.tsv"), + sep = "\t", + quote = F, + row.names = F ) - dmp.sample.sheet <- - bind_row(dmp.sample.sheet, access.sample.sheet) %>% + # piece together all unique calls ----------------------------------------- + # get duplex calls + duplex.calls <- + do.call(rbind, lapply(master.ref[cmo_patient_id == x]$maf_path, 
function(x) { + # fread(x) %>% filter(as.numeric(D_t_alt_count_fragment) > 0) %>% data.table() + selectcolumns <- + c( + "Hugo_Symbol", + "Entrez_Gene_Id", + "Center", + "NCBI_Build", + "Chromosome", + "Start_Position", + "End_Position", + "Strand", + "Variant_Classification", + "Variant_Type", + "Reference_Allele", + "Tumor_Seq_Allele1", + "Tumor_Seq_Allele2", + "dbSNP_RS", + "dbSNP_Val_Status", + "Tumor_Sample_Barcode", + "caller_Norm_Sample_Barcode", + "Match_Norm_Seq_Allele1", + "Match_Norm_Seq_Allele2", + "Tumor_Validation_Allele1", + "Tumor_Validation_Allele2", + "Match_Norm_Validation_Allele1", + "Match_Norm_Validation_Allele2", + "Verification_Status", + "Validation_Status", + "Mutation_Status", + "Sequencing_Phase", + "Sequence_Source", + "Validation_Method", + "Score", + "BAM_File", + "Sequencer", + "Tumor_Sample_UUID", + "Matched_Norm_Sample_UUID", + "HGVSc", + "HGVSp", + "HGVSp_Short", + "Transcript_ID", + "Exon_Number", + "caller_t_depth", + "caller_t_ref_count", + "caller_t_alt_count", + "caller_n_depth", + "caller_n_ref_count", + "caller_n_alt_count", + "all_effects", + "Allele", + "Gene", + "Feature", + "Feature_type", + "Consequence", + "cDNA_position", + "CDS_position", + "Protein_position", + "Amino_acids", + "Codons", + "Existing_variation", + "ALLELE_NUM", + "DISTANCE", + "STRAND_VEP", + "SYMBOL", + "SYMBOL_SOURCE", + "HGNC_ID", + "BIOTYPE", + "CANONICAL", + "CCDS", + "ENSP", + "SWISSPROT", + "TREMBL", + "UNIPARC", + "RefSeq", + "SIFT", + "PolyPhen", + "EXON", + "INTRON", + "DOMAINS", + "AF", + "AFR_AF", + "AMR_AF", + "ASN_AF", + "EAS_AF", + "EUR_AF", + "SAS_AF", + "AA_AF", + "EA_AF", + "CLIN_SIG", + "SOMATIC", + "PUBMED", + "MOTIF_NAME", + "MOTIF_POS", + "HIGH_INF_POS", + "MOTIF_SCORE_CHANGE", + "IMPACT", + "PICK", + "VARIANT_CLASS", + "TSL", + "HGVS_OFFSET", + "PHENO", + "MINIMISED", + "ExAC_AF", + "ExAC_AF_AFR", + "ExAC_AF_AMR", + "ExAC_AF_EAS", + "ExAC_AF_FIN", + "ExAC_AF_NFE", + "ExAC_AF_OTH", + "ExAC_AF_SAS", + "GENE_PHENO", + "FILTER", + 
"flanking_bps", + "variant_id", + "variant_qual", + "ExAC_AF_Adj", + "ExAC_AC_AN_Adj", + "ExAC_AC_AN", + "ExAC_AC_AN_AFR", + "ExAC_AC_AN_AMR", + "ExAC_AC_AN_EAS", + "ExAC_AC_AN_FIN", + "ExAC_AC_AN_NFE", + "ExAC_AC_AN_OTH", + "ExAC_AC_AN_SAS", + "ExAC_FILTER", + "gnomAD_AF", + "gnomAD_AFR_AF", + "gnomAD_AMR_AF", + "gnomAD_ASJ_AF", + "gnomAD_EAS_AF", + "gnomAD_FIN_AF", + "gnomAD_NFE_AF", + "gnomAD_OTH_AF", + "gnomAD_SAS_AF", + "CallMethod", + "VCF_POS", + "VCF_REF", + "VCF_ALT", + "hotspot_whitelist", + "Status", + "D_t_alt_count_fragment", + "D_t_ref_count_fragment", + "D_t_vaf_fragment", + "SD_t_alt_count_fragment", + "SD_t_ref_count_fragment", + "SD_t_vaf_fragment", + "Matched_Norm_Sample_Barcode", + "Matched_Norm_Bamfile", + "n_alt_count_fragment", + "n_ref_count_fragment", + "n_vaf_fragment" + ) + if ("Status" %in% names(fread(x))) { + fread(x) %>% select(one_of(selectcolumns)) %>% subset((Status == "") | + (is.na(Status))) + } else { + fread(x) %>% select(one_of(selectcolumns)) + } + # fread(x) + # %>% + # filter(as.numeric(t_alt_count) > 0) %>% + # data.table() + })) + # get impact calls + impact.calls <- + DMP.RET.maf[Tumor_Sample_Barcode %in% sample.sheet$Sample_Barcode] + write.table( + impact.calls[, .( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2 + )], + paste0(results.dir, "/", x, "/", x, "_impact_calls.maf"), + sep = "\t", + quote = F, + row.names = F + ) + # combining plasma and impact calls + all.calls <- + rbind(duplex.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F], + impact.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F]) + # getting rid of duplicate calls and take the first occurence of all events + all.calls <- + all.calls[which(!duplicated(all.calls[, .( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + 
Tumor_Seq_Allele2 + )])), ] %>% mutate( - cmo_patient_id = x, - Sample_Type = ifelse(grepl("-T", Sample_Barcode), "DMP_Tumor", "DMP_Normal"), - dmp_patient_id = dmp_id + t_ref_count = 0, + t_alt_count = 0, + n_ref_count = 0, + n_alt_count = 0, + Matched_Norm_Sample_Barcode = NA + ) %>% + filter( + Variant_Classification != "Silent" & + !grepl("RP11-", Hugo_Symbol) & + !grepl("Intron", Variant_Classification) ) - } - # total sample sheet - sample.sheet <- master.ref[cmo_patient_id == x, - # plasma bams -- duplex and simplex bam - .( - Sample_Barcode = as.character(cmo_sample_id_plasma), - duplex_bam = bam_path_plasma_duplex, - simplex_bam = bam_path_plasma_simplex, - cmo_patient_id, - Sample_Type = "duplex", - dmp_patient_id - )] %>% - merge(rbind(unique(master.ref[cmo_patient_id == x & - paired == 'Paired', - # buffy coat + DMP bams -- standard bam only - .( - Sample_Barcode = as.character(cmo_sample_id_normal), - standard_bam = bam_path_normal, - cmo_patient_id, - Sample_Type = "unfilterednormal", - dmp_patient_id - )]), - dmp.sample.sheet), all = T) - # catch '' or NA for empty cells for some cmo_sample_id_normal - sample.sheet <- - sample.sheet[!is.na(Sample_Barcode) | Sample_Barcode != ""] - write.table( - sample.sheet, - paste0(results.dir, "/", x, "/", x, "_sample_sheet.tsv"), - sep = "\t", - quote = F, - row.names = F - ) - # piece together all unique calls ----------------------------------------- - # get duplex calls - duplex.calls <- - do.call(rbind, lapply(master.ref[cmo_patient_id == x]$maf_path, function(x) { - # fread(x) %>% filter(as.numeric(D_t_alt_count_fragment) > 0) %>% data.table() - selectcolumns <- - c( - "Hugo_Symbol", - "Entrez_Gene_Id", - "Center", - "NCBI_Build", - "Chromosome", - "Start_Position", - "End_Position", - "Strand", - "Variant_Classification", - "Variant_Type", - "Reference_Allele", - "Tumor_Seq_Allele1", - "Tumor_Seq_Allele2", - "dbSNP_RS", - "dbSNP_Val_Status", - "Tumor_Sample_Barcode", - "caller_Norm_Sample_Barcode", - 
"Match_Norm_Seq_Allele1", - "Match_Norm_Seq_Allele2", - "Tumor_Validation_Allele1", - "Tumor_Validation_Allele2", - "Match_Norm_Validation_Allele1", - "Match_Norm_Validation_Allele2", - "Verification_Status", - "Validation_Status", - "Mutation_Status", - "Sequencing_Phase", - "Sequence_Source", - "Validation_Method", - "Score", - "BAM_File", - "Sequencer", - "Tumor_Sample_UUID", - "Matched_Norm_Sample_UUID", - "HGVSc", - "HGVSp", - "HGVSp_Short", - "Transcript_ID", - "Exon_Number", - "caller_t_depth", - "caller_t_ref_count", - "caller_t_alt_count", - "caller_n_depth", - "caller_n_ref_count", - "caller_n_alt_count", - "all_effects", - "Allele", - "Gene", - "Feature", - "Feature_type", - "Consequence", - "cDNA_position", - "CDS_position", - "Protein_position", - "Amino_acids", - "Codons", - "Existing_variation", - "ALLELE_NUM", - "DISTANCE", - "STRAND_VEP", - "SYMBOL", - "SYMBOL_SOURCE", - "HGNC_ID", - "BIOTYPE", - "CANONICAL", - "CCDS", - "ENSP", - "SWISSPROT", - "TREMBL", - "UNIPARC", - "RefSeq", - "SIFT", - "PolyPhen", - "EXON", - "INTRON", - "DOMAINS", - "AF", - "AFR_AF", - "AMR_AF", - "ASN_AF", - "EAS_AF", - "EUR_AF", - "SAS_AF", - "AA_AF", - "EA_AF", - "CLIN_SIG", - "SOMATIC", - "PUBMED", - "MOTIF_NAME", - "MOTIF_POS", - "HIGH_INF_POS", - "MOTIF_SCORE_CHANGE", - "IMPACT", - "PICK", - "VARIANT_CLASS", - "TSL", - "HGVS_OFFSET", - "PHENO", - "MINIMISED", - "ExAC_AF", - "ExAC_AF_AFR", - "ExAC_AF_AMR", - "ExAC_AF_EAS", - "ExAC_AF_FIN", - "ExAC_AF_NFE", - "ExAC_AF_OTH", - "ExAC_AF_SAS", - "GENE_PHENO", - "FILTER", - "flanking_bps", - "variant_id", - "variant_qual", - "ExAC_AF_Adj", - "ExAC_AC_AN_Adj", - "ExAC_AC_AN", - "ExAC_AC_AN_AFR", - "ExAC_AC_AN_AMR", - "ExAC_AC_AN_EAS", - "ExAC_AC_AN_FIN", - "ExAC_AC_AN_NFE", - "ExAC_AC_AN_OTH", - "ExAC_AC_AN_SAS", - "ExAC_FILTER", - "gnomAD_AF", - "gnomAD_AFR_AF", - "gnomAD_AMR_AF", - "gnomAD_ASJ_AF", - "gnomAD_EAS_AF", - "gnomAD_FIN_AF", - "gnomAD_NFE_AF", - "gnomAD_OTH_AF", - "gnomAD_SAS_AF", - "CallMethod", - "VCF_POS", - 
"VCF_REF", - "VCF_ALT", - "hotspot_whitelist", - "Status", - "D_t_alt_count_fragment", - "D_t_ref_count_fragment", - "D_t_vaf_fragment", - "SD_t_alt_count_fragment", - "SD_t_ref_count_fragment", - "SD_t_vaf_fragment", - "Matched_Norm_Sample_Barcode", - "Matched_Norm_Bamfile", - "n_alt_count_fragment", - "n_ref_count_fragment", - "n_vaf_fragment" - ) - if ("Status" %in% names(fread(x))) { - fread(x) %>% select(one_of(selectcolumns)) %>% subset((Status == "") | - (is.na(Status))) - } else { - fread(x) %>% select(one_of(selectcolumns)) - } - # fread(x) - # %>% - # filter(as.numeric(t_alt_count) > 0) %>% - # data.table() - })) - # get impact calls - impact.calls <- - DMP.RET.maf[Tumor_Sample_Barcode %in% sample.sheet$Sample_Barcode] - write.table( - impact.calls[, .( - Hugo_Symbol, - Chromosome, - Start_Position, - End_Position, - Variant_Classification, - HGVSp_Short, - Reference_Allele, - Tumor_Seq_Allele2 - )], - paste0(results.dir, "/", x, "/", x, "_impact_calls.maf"), - sep = "\t", - quote = F, - row.names = F - ) - # combining plasma and impact calls - all.calls <- - rbind(duplex.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F], - impact.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F]) - # getting rid of duplicate calls and take the first occurence of all events - all.calls <- - all.calls[which(!duplicated(all.calls[, .( - Hugo_Symbol, - Chromosome, - Start_Position, - End_Position, - Variant_Classification, - HGVSp_Short, - Reference_Allele, - Tumor_Seq_Allele2 - )])),] %>% - mutate( - t_ref_count = 0, - t_alt_count = 0, - n_ref_count = 0, - n_alt_count = 0, - Matched_Norm_Sample_Barcode = NA - ) %>% - filter( - Variant_Classification != "Silent" & - !grepl("RP11-", Hugo_Symbol) & - !grepl("Intron", Variant_Classification) + write.table( + all.calls, + paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), + sep = "\t", + quote = F, + row.names = F ) - write.table( - all.calls, - paste0(results.dir, 
"/", x, "/", x, "_all_unique_calls.maf"), - sep = "\t", - quote = F, - row.names = F - ) - # tagging hotspots - system( - paste0( - 'bsub -R "rusage[mem=4]" -cwd ', - results.dir, - "/", - x, - "/ -oo hotspot.o -eo hotspot.e -W 00:59 ", - " -P ", - project.ID, - " -J ", - x, - "_tag_hotspot ", - " python /work/access/production/workflows/access_workflows/v1/pipeline_2.0.0/ACCESS-Pipeline/cwl_tools/hotspots/tag_hotspots.py ", - " -m ", - results.dir, - "/", - x, - "/", - x, - "_all_unique_calls.maf", - " -itxt /work/access/production/resources/msk-access/current/regions_of_interest/current/hotspot-list-union-v1-v2_with_TERT.txt ", - " -o ", - results.dir, - "/", - x, - "/", - x, - "_all_unique_calls_hotspots.maf", - " -outdir ", - results.dir, - "/", - x, - "/", - x + # tagging hotspots + system( + paste0( + 'bsub -R "rusage[mem=4]" -cwd ', + results.dir, + "/", + x, + "/ -oo hotspot.o -eo hotspot.e -W 00:59 ", + " -P ", + project.ID, + " -J ", + x, + "_tag_hotspot ", + " python /work/access/production/workflows/access_workflows/v1/pipeline_2.0.0/ACCESS-Pipeline/cwl_tools/hotspots/tag_hotspots.py ", + " -m ", + results.dir, + "/", + x, + "/", + x, + "_all_unique_calls.maf", + " -itxt /work/access/production/resources/msk-access/current/regions_of_interest/current/hotspot-list-union-v1-v2_with_TERT.txt ", + " -o ", + results.dir, + "/", + x, + "/", + x, + "_all_unique_calls_hotspots.maf", + " -outdir ", + results.dir, + "/", + x, + "/", + x + ) ) - ) - # genotype all bams in this patient directory ----------------------------- - # genotyping plasma samples -- plasma duplex&simplex, plasma normal, pooled plasma normal - write.table( - sample.sheet[, .( - sample_id = Sample_Barcode, - maf = paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), - standard_bam, - duplex_bam, - simplex_bam - )], - paste0(results.dir, "/", x, "/", x, "_genotype_metadata.tsv"), - sep = "\t", - quote = F, - row.names = F - ) - job.ids <- system( - paste0( - "bsub -cwd ", - 
results.dir, - "/", - x, - ' -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', - " -P ", - project.ID, - " -J ", - x, - "_genotype_variants ", - " genotype_variants small_variants multiple-samples -i ", - results.dir, - "/", - x, - "/", - x, - "_genotype_metadata.tsv", - " -r ", - fasta.path, - " -g ", - genotyper.path, - " -v DEBUG " - ), - intern = T - ) - job.ids <- as.numeric(gsub("Job <|> is.*.$", "", job.ids)) - }) + # genotype all bams in this patient directory ----------------------------- + # genotyping plasma samples -- plasma duplex&simplex, plasma normal, pooled plasma normal + write.table( + sample.sheet[, .( + sample_id = Sample_Barcode, + maf = paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), + standard_bam, + duplex_bam, + simplex_bam + )], + paste0(results.dir, "/", x, "/", x, "_genotype_metadata.tsv"), + sep = "\t", + quote = F, + row.names = F + ) + job.ids <- system( + paste0( + "bsub -cwd ", + results.dir, + "/", + x, + ' -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', + " -P ", + project.ID, + " -J ", + x, + "_genotype_variants ", + " genotype_variants small_variants multiple-samples -i ", + results.dir, + "/", + x, + "/", + x, + "_genotype_metadata.tsv", + " -r ", + fasta.path, + " -g ", + genotyper.path, + " -v DEBUG " + ), + intern = T + ) + job.ids <- as.numeric(gsub("Job <|> is.*.$", "", job.ids)) + }) # Get base count multi sample in pooled normal ---------------------------- @@ -525,7 +542,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )]), ] + )]),] write.table( all.all.unique.mafs, paste0(results.dir, "/pooled/all_all_unique.maf"), @@ -579,131 +596,131 @@ compile_reads <- function(master.ref, Sys.sleep(120) } print("Compile reads done!") -} + } -# Executable ----------------------------------------------------------------------------------------------------------- -# Minimal columns for input mafs -# -# 
Hugo_Symbol,Chromosome,Start_Position,End_Position,Tumor_Sample_Barcode,Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2,D_t_alt_count_fragment + # Executable ----------------------------------------------------------------------------------------------------------- + # Minimal columns for input mafs + # + # Hugo_Symbol,Chromosome,Start_Position,End_Position,Tumor_Sample_Barcode,Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2,D_t_alt_count_fragment -suppressPackageStartupMessages({ - library(data.table) - library(tidyr) - library(stringr) - library(dplyr) - library(argparse) -}) + suppressPackageStartupMessages({ + library(data.table) + library(tidyr) + library(stringr) + library(dplyr) + library(argparse) + }) -if (!interactive()) { - parser <- ArgumentParser() - parser$add_argument("-m", "--masterref", type = "character", help = "File path to master reference file") - parser$add_argument("-o", "--resultsdir", type = "character", help = "Output directory") - parser$add_argument( - "-pid", - "--projectid", - type = "character", - default = "", - help = "Project ID for submitted jobs involved in this run" - ) - parser$add_argument( - "-pb", - "--pooledbamdir", - type = "character", - default = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/", - help = "Directory for all pooled bams [default]" - ) - parser$add_argument( - "-fa", - "--fastapath", - type = "character", - default = "/juno/work/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta", - help = "Reference fasta path [default]" - ) - parser$add_argument( - "-gt", - "--genotyperpath", - type = "character", - default = "/work/access/production/resources/tools/GetBaseCountsMultiSample/current/GetBaseCountsMultiSample", - help = "Genotyper executable path [default]" - ) - parser$add_argument( - "-dmp", - "--dmpdir", - type = "character", - default = 
"/juno/work/access/production/resources/cbioportal/current/msk_solid_heme", - help = "Directory of clinical DMP repository [default]" - ) - parser$add_argument( - "-mb", - "--mirrorbamdir", - type = "character", - default = "/juno/res/dmpcollab/dmpshare/share/irb12_245", - help = "Mirror BAM file directory [default]" - ) - parser$add_argument( - "-mab", - "--mirroraccessbamdir", - type = "character", - default = "/juno/res/dmpcollab/dmpshare/share/access_12_245", - help = "Mirror BAM file directory for MSK-ACCESS [default]" - ) - parser$add_argument( - "-dmpk", - "--dmpkeypath", - type = "character", - default = "/juno/res/dmpcollab/dmprequest/12-245/key.txt", - help = "DMP mirror BAM key file [default]" - ) - parser$add_argument( - "-dmpak", - "--dmpaccesskeypath", - type = "character", - default = " /juno/res/dmpcollab/dmprequest/ACCESS-12-245/key.txt", - help = "DMP mirror BAM key file for MSK-ACCESS [default]" - ) - args <- parser$parse_args() + if (!interactive()) { + parser <- ArgumentParser() + parser$add_argument("-m", "--masterref", type = "character", help = "File path to master reference file") + parser$add_argument("-o", "--resultsdir", type = "character", help = "Output directory") + parser$add_argument( + "-pid", + "--projectid", + type = "character", + default = "", + help = "Project ID for submitted jobs involved in this run" + ) + parser$add_argument( + "-pb", + "--pooledbamdir", + type = "character", + default = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/", + help = "Directory for all pooled bams [default]" + ) + parser$add_argument( + "-fa", + "--fastapath", + type = "character", + default = "/juno/work/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta", + help = "Reference fasta path [default]" + ) + parser$add_argument( + "-gt", + "--genotyperpath", + type = "character", + default = 
"/work/access/production/resources/tools/GetBaseCountsMultiSample/current/GetBaseCountsMultiSample", + help = "Genotyper executable path [default]" + ) + parser$add_argument( + "-dmp", + "--dmpdir", + type = "character", + default = "/juno/work/access/production/resources/cbioportal/current/msk_solid_heme", + help = "Directory of clinical DMP repository [default]" + ) + parser$add_argument( + "-mb", + "--mirrorbamdir", + type = "character", + default = "/juno/res/dmpcollab/dmpshare/share/irb12_245", + help = "Mirror BAM file directory [default]" + ) + parser$add_argument( + "-mab", + "--mirroraccessbamdir", + type = "character", + default = "/juno/res/dmpcollab/dmpshare/share/access_12_245", + help = "Mirror BAM file directory for MSK-ACCESS [default]" + ) + parser$add_argument( + "-dmpk", + "--dmpkeypath", + type = "character", + default = "/juno/res/dmpcollab/dmprequest/12-245/key.txt", + help = "DMP mirror BAM key file [default]" + ) + parser$add_argument( + "-dmpak", + "--dmpaccesskeypath", + type = "character", + default = " /juno/res/dmpcollab/dmprequest/ACCESS-12-245/key.txt", + help = "DMP mirror BAM key file for MSK-ACCESS [default]" + ) + args <- parser$parse_args() - master.ref <- args$masterref - results.dir <- args$resultsdir - project.ID <- args$projectid - pooled.bam.dir <- args$pooledbamdir - fasta.path <- args$fastapath - genotyper.path <- args$genotyperpath - dmp.dir <- args$dmpdir - mirror.bam.dir <- args$mirrorbamdir - mirror.access.bam.dir <- args$mirroraccessbamdir - dmp.key.path <- args$dmpkeypath - access.key.path <- args$dmpaccesskeypath + master.ref <- args$masterref + results.dir <- args$resultsdir + project.ID <- args$projectid + pooled.bam.dir <- args$pooledbamdir + fasta.path <- args$fastapath + genotyper.path <- args$genotyperpath + dmp.dir <- args$dmpdir + mirror.bam.dir <- args$mirrorbamdir + mirror.access.bam.dir <- args$mirroraccessbamdir + dmp.key.path <- args$dmpkeypath + access.key.path <- args$dmpaccesskeypath - if (project.ID 
== "") { - project.ID <- - paste0(sample(c(0:9), size = 10, replace = T), collapse = "") - } + if (project.ID == "") { + project.ID <- + paste0(sample(c(0:9), size = 10, replace = T), collapse = "") + } - print(paste0("Input parameters for run ", project.ID)) - print(master.ref) - print(results.dir) - print(pooled.bam.dir) - print(fasta.path) - print(genotyper.path) - print(dmp.dir) - print(mirror.bam.dir) - print(mirror.access.bam.dir) - print(dmp.key.path) - print(access.key.path) - suppressWarnings( - compile_reads( - fread(master.ref), - results.dir, - project.ID, - pooled.bam.dir, - fasta.path, - genotyper.path, - dmp.dir, - mirror.bam.dir, - dmp.key.path + print(paste0("Input parameters for run ", project.ID)) + print(master.ref) + print(results.dir) + print(pooled.bam.dir) + print(fasta.path) + print(genotyper.path) + print(dmp.dir) + print(mirror.bam.dir) + print(mirror.access.bam.dir) + print(dmp.key.path) + print(access.key.path) + suppressWarnings( + compile_reads( + fread(master.ref), + results.dir, + project.ID, + pooled.bam.dir, + fasta.path, + genotyper.path, + dmp.dir, + mirror.bam.dir, + dmp.key.path + ) ) - ) - print("compile reads function finished") -} + print("compile reads function finished") + } From 07bae6de17ed67ec1940b53424757f5d5d476279 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 15:32:46 -0500 Subject: [PATCH 079/126] adding access samples to genotype --- R/compile_reads.R | 950 +++++++++++++++++++++++----------------------- 1 file changed, 475 insertions(+), 475 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 296c02e..9c3208b 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -68,7 +68,7 @@ compile_reads <- function(master.ref, filter(Mutation_Status != "GERMLINE") %>% data.table() DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", 
dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode),] # Pooled normal samples --------------------------------------------------- pooled.bams <- @@ -160,364 +160,364 @@ compile_reads <- function(master.ref, ) ) ) - if (is.null(dmp.sample.sheet) & - is.null(access.sample.sheet)) { - dmp.sample.sheet <- NULL - } else if (is.null(dmp.sample.sheet) & - !is.null(access.sample.sheet)) { - dmp.sample.sheet <- access.sample.sheet - } else if (!is.null(dmp.sample.sheet) & - is.null(access.sample.sheet)) { - dmp.sample.sheet <- dmp.sample.sheet - } else{ - dmp.sample.sheet <- - bind_row(dmp.sample.sheet, access.sample.sheet) - } %>% - mutate( - cmo_patient_id = x, - Sample_Type = ifelse( - grepl("-T", Sample_Barcode), - "DMP_Tumor", - "DMP_Normal" - ), - dmp_patient_id = dmp_id - ) } - # total sample sheet - sample.sheet <- master.ref[cmo_patient_id == x, - # plasma bams -- duplex and simplex bam - .( - Sample_Barcode = as.character(cmo_sample_id_plasma), - duplex_bam = bam_path_plasma_duplex, - simplex_bam = bam_path_plasma_simplex, - cmo_patient_id, - Sample_Type = "duplex", - dmp_patient_id - )] %>% - merge(rbind(unique(master.ref[cmo_patient_id == x & - paired == 'Paired', - # buffy coat + DMP bams -- standard bam only - .( - Sample_Barcode = as.character(cmo_sample_id_normal), - standard_bam = bam_path_normal, - cmo_patient_id, - Sample_Type = "unfilterednormal", - dmp_patient_id - )]), - dmp.sample.sheet), all = T) - # catch '' or NA for empty cells for some cmo_sample_id_normal - sample.sheet <- - sample.sheet[!is.na(Sample_Barcode) | Sample_Barcode != ""] - write.table( - sample.sheet, - paste0(results.dir, "/", x, "/", x, "_sample_sheet.tsv"), - sep = "\t", - quote = F, - row.names = F - ) - # piece together all unique calls ----------------------------------------- - # get duplex calls - duplex.calls <- - do.call(rbind, lapply(master.ref[cmo_patient_id == x]$maf_path, function(x) { - # fread(x) %>% filter(as.numeric(D_t_alt_count_fragment) > 0) 
%>% data.table() - selectcolumns <- - c( - "Hugo_Symbol", - "Entrez_Gene_Id", - "Center", - "NCBI_Build", - "Chromosome", - "Start_Position", - "End_Position", - "Strand", - "Variant_Classification", - "Variant_Type", - "Reference_Allele", - "Tumor_Seq_Allele1", - "Tumor_Seq_Allele2", - "dbSNP_RS", - "dbSNP_Val_Status", - "Tumor_Sample_Barcode", - "caller_Norm_Sample_Barcode", - "Match_Norm_Seq_Allele1", - "Match_Norm_Seq_Allele2", - "Tumor_Validation_Allele1", - "Tumor_Validation_Allele2", - "Match_Norm_Validation_Allele1", - "Match_Norm_Validation_Allele2", - "Verification_Status", - "Validation_Status", - "Mutation_Status", - "Sequencing_Phase", - "Sequence_Source", - "Validation_Method", - "Score", - "BAM_File", - "Sequencer", - "Tumor_Sample_UUID", - "Matched_Norm_Sample_UUID", - "HGVSc", - "HGVSp", - "HGVSp_Short", - "Transcript_ID", - "Exon_Number", - "caller_t_depth", - "caller_t_ref_count", - "caller_t_alt_count", - "caller_n_depth", - "caller_n_ref_count", - "caller_n_alt_count", - "all_effects", - "Allele", - "Gene", - "Feature", - "Feature_type", - "Consequence", - "cDNA_position", - "CDS_position", - "Protein_position", - "Amino_acids", - "Codons", - "Existing_variation", - "ALLELE_NUM", - "DISTANCE", - "STRAND_VEP", - "SYMBOL", - "SYMBOL_SOURCE", - "HGNC_ID", - "BIOTYPE", - "CANONICAL", - "CCDS", - "ENSP", - "SWISSPROT", - "TREMBL", - "UNIPARC", - "RefSeq", - "SIFT", - "PolyPhen", - "EXON", - "INTRON", - "DOMAINS", - "AF", - "AFR_AF", - "AMR_AF", - "ASN_AF", - "EAS_AF", - "EUR_AF", - "SAS_AF", - "AA_AF", - "EA_AF", - "CLIN_SIG", - "SOMATIC", - "PUBMED", - "MOTIF_NAME", - "MOTIF_POS", - "HIGH_INF_POS", - "MOTIF_SCORE_CHANGE", - "IMPACT", - "PICK", - "VARIANT_CLASS", - "TSL", - "HGVS_OFFSET", - "PHENO", - "MINIMISED", - "ExAC_AF", - "ExAC_AF_AFR", - "ExAC_AF_AMR", - "ExAC_AF_EAS", - "ExAC_AF_FIN", - "ExAC_AF_NFE", - "ExAC_AF_OTH", - "ExAC_AF_SAS", - "GENE_PHENO", - "FILTER", - "flanking_bps", - "variant_id", - "variant_qual", - "ExAC_AF_Adj", - 
"ExAC_AC_AN_Adj", - "ExAC_AC_AN", - "ExAC_AC_AN_AFR", - "ExAC_AC_AN_AMR", - "ExAC_AC_AN_EAS", - "ExAC_AC_AN_FIN", - "ExAC_AC_AN_NFE", - "ExAC_AC_AN_OTH", - "ExAC_AC_AN_SAS", - "ExAC_FILTER", - "gnomAD_AF", - "gnomAD_AFR_AF", - "gnomAD_AMR_AF", - "gnomAD_ASJ_AF", - "gnomAD_EAS_AF", - "gnomAD_FIN_AF", - "gnomAD_NFE_AF", - "gnomAD_OTH_AF", - "gnomAD_SAS_AF", - "CallMethod", - "VCF_POS", - "VCF_REF", - "VCF_ALT", - "hotspot_whitelist", - "Status", - "D_t_alt_count_fragment", - "D_t_ref_count_fragment", - "D_t_vaf_fragment", - "SD_t_alt_count_fragment", - "SD_t_ref_count_fragment", - "SD_t_vaf_fragment", - "Matched_Norm_Sample_Barcode", - "Matched_Norm_Bamfile", - "n_alt_count_fragment", - "n_ref_count_fragment", - "n_vaf_fragment" - ) - if ("Status" %in% names(fread(x))) { - fread(x) %>% select(one_of(selectcolumns)) %>% subset((Status == "") | - (is.na(Status))) - } else { - fread(x) %>% select(one_of(selectcolumns)) - } - # fread(x) - # %>% - # filter(as.numeric(t_alt_count) > 0) %>% - # data.table() - })) - # get impact calls - impact.calls <- - DMP.RET.maf[Tumor_Sample_Barcode %in% sample.sheet$Sample_Barcode] - write.table( - impact.calls[, .( - Hugo_Symbol, - Chromosome, - Start_Position, - End_Position, - Variant_Classification, - HGVSp_Short, - Reference_Allele, - Tumor_Seq_Allele2 - )], - paste0(results.dir, "/", x, "/", x, "_impact_calls.maf"), - sep = "\t", - quote = F, - row.names = F - ) - # combining plasma and impact calls - all.calls <- - rbind(duplex.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F], - impact.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F]) - # getting rid of duplicate calls and take the first occurence of all events - all.calls <- - all.calls[which(!duplicated(all.calls[, .( - Hugo_Symbol, - Chromosome, - Start_Position, - End_Position, - Variant_Classification, - HGVSp_Short, - Reference_Allele, - Tumor_Seq_Allele2 - )])), ] %>% + if (is.null(dmp.sample.sheet) & + 
is.null(access.sample.sheet)) { + dmp.sample.sheet <- NULL + } else if (is.null(dmp.sample.sheet) & + !is.null(access.sample.sheet)) { + dmp.sample.sheet <- access.sample.sheet + } else if (!is.null(dmp.sample.sheet) & + is.null(access.sample.sheet)) { + dmp.sample.sheet <- dmp.sample.sheet + } else{ + dmp.sample.sheet <- + bind_row(dmp.sample.sheet, access.sample.sheet) + } %>% mutate( - t_ref_count = 0, - t_alt_count = 0, - n_ref_count = 0, - n_alt_count = 0, - Matched_Norm_Sample_Barcode = NA - ) %>% - filter( - Variant_Classification != "Silent" & - !grepl("RP11-", Hugo_Symbol) & - !grepl("Intron", Variant_Classification) - ) - write.table( - all.calls, - paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), - sep = "\t", - quote = F, - row.names = F - ) - # tagging hotspots - system( - paste0( - 'bsub -R "rusage[mem=4]" -cwd ', - results.dir, - "/", - x, - "/ -oo hotspot.o -eo hotspot.e -W 00:59 ", - " -P ", - project.ID, - " -J ", - x, - "_tag_hotspot ", - " python /work/access/production/workflows/access_workflows/v1/pipeline_2.0.0/ACCESS-Pipeline/cwl_tools/hotspots/tag_hotspots.py ", - " -m ", - results.dir, - "/", - x, - "/", - x, - "_all_unique_calls.maf", - " -itxt /work/access/production/resources/msk-access/current/regions_of_interest/current/hotspot-list-union-v1-v2_with_TERT.txt ", - " -o ", - results.dir, - "/", - x, - "/", - x, - "_all_unique_calls_hotspots.maf", - " -outdir ", - results.dir, - "/", - x, - "/", - x + cmo_patient_id = x, + Sample_Type = ifelse(grepl("-T", Sample_Barcode), + "DMP_Tumor", + "DMP_Normal"), + dmp_patient_id = dmp_id ) + } + # total sample sheet + sample.sheet <- master.ref[cmo_patient_id == x, + # plasma bams -- duplex and simplex bam + .( + Sample_Barcode = as.character(cmo_sample_id_plasma), + duplex_bam = bam_path_plasma_duplex, + simplex_bam = bam_path_plasma_simplex, + cmo_patient_id, + Sample_Type = "duplex", + dmp_patient_id + )] %>% + merge(rbind(unique(master.ref[cmo_patient_id == x & + paired == 
'Paired', + # buffy coat + DMP bams -- standard bam only + .( + Sample_Barcode = as.character(cmo_sample_id_normal), + standard_bam = bam_path_normal, + cmo_patient_id, + Sample_Type = "unfilterednormal", + dmp_patient_id + )]), + dmp.sample.sheet), all = T) + # catch '' or NA for empty cells for some cmo_sample_id_normal + sample.sheet <- + sample.sheet[!is.na(Sample_Barcode) | + Sample_Barcode != ""] + write.table( + sample.sheet, + paste0(results.dir, "/", x, "/", x, "_sample_sheet.tsv"), + sep = "\t", + quote = F, + row.names = F + ) + # piece together all unique calls ----------------------------------------- + # get duplex calls + duplex.calls <- + do.call(rbind, lapply(master.ref[cmo_patient_id == x]$maf_path, function(x) { + # fread(x) %>% filter(as.numeric(D_t_alt_count_fragment) > 0) %>% data.table() + selectcolumns <- + c( + "Hugo_Symbol", + "Entrez_Gene_Id", + "Center", + "NCBI_Build", + "Chromosome", + "Start_Position", + "End_Position", + "Strand", + "Variant_Classification", + "Variant_Type", + "Reference_Allele", + "Tumor_Seq_Allele1", + "Tumor_Seq_Allele2", + "dbSNP_RS", + "dbSNP_Val_Status", + "Tumor_Sample_Barcode", + "caller_Norm_Sample_Barcode", + "Match_Norm_Seq_Allele1", + "Match_Norm_Seq_Allele2", + "Tumor_Validation_Allele1", + "Tumor_Validation_Allele2", + "Match_Norm_Validation_Allele1", + "Match_Norm_Validation_Allele2", + "Verification_Status", + "Validation_Status", + "Mutation_Status", + "Sequencing_Phase", + "Sequence_Source", + "Validation_Method", + "Score", + "BAM_File", + "Sequencer", + "Tumor_Sample_UUID", + "Matched_Norm_Sample_UUID", + "HGVSc", + "HGVSp", + "HGVSp_Short", + "Transcript_ID", + "Exon_Number", + "caller_t_depth", + "caller_t_ref_count", + "caller_t_alt_count", + "caller_n_depth", + "caller_n_ref_count", + "caller_n_alt_count", + "all_effects", + "Allele", + "Gene", + "Feature", + "Feature_type", + "Consequence", + "cDNA_position", + "CDS_position", + "Protein_position", + "Amino_acids", + "Codons", + 
"Existing_variation", + "ALLELE_NUM", + "DISTANCE", + "STRAND_VEP", + "SYMBOL", + "SYMBOL_SOURCE", + "HGNC_ID", + "BIOTYPE", + "CANONICAL", + "CCDS", + "ENSP", + "SWISSPROT", + "TREMBL", + "UNIPARC", + "RefSeq", + "SIFT", + "PolyPhen", + "EXON", + "INTRON", + "DOMAINS", + "AF", + "AFR_AF", + "AMR_AF", + "ASN_AF", + "EAS_AF", + "EUR_AF", + "SAS_AF", + "AA_AF", + "EA_AF", + "CLIN_SIG", + "SOMATIC", + "PUBMED", + "MOTIF_NAME", + "MOTIF_POS", + "HIGH_INF_POS", + "MOTIF_SCORE_CHANGE", + "IMPACT", + "PICK", + "VARIANT_CLASS", + "TSL", + "HGVS_OFFSET", + "PHENO", + "MINIMISED", + "ExAC_AF", + "ExAC_AF_AFR", + "ExAC_AF_AMR", + "ExAC_AF_EAS", + "ExAC_AF_FIN", + "ExAC_AF_NFE", + "ExAC_AF_OTH", + "ExAC_AF_SAS", + "GENE_PHENO", + "FILTER", + "flanking_bps", + "variant_id", + "variant_qual", + "ExAC_AF_Adj", + "ExAC_AC_AN_Adj", + "ExAC_AC_AN", + "ExAC_AC_AN_AFR", + "ExAC_AC_AN_AMR", + "ExAC_AC_AN_EAS", + "ExAC_AC_AN_FIN", + "ExAC_AC_AN_NFE", + "ExAC_AC_AN_OTH", + "ExAC_AC_AN_SAS", + "ExAC_FILTER", + "gnomAD_AF", + "gnomAD_AFR_AF", + "gnomAD_AMR_AF", + "gnomAD_ASJ_AF", + "gnomAD_EAS_AF", + "gnomAD_FIN_AF", + "gnomAD_NFE_AF", + "gnomAD_OTH_AF", + "gnomAD_SAS_AF", + "CallMethod", + "VCF_POS", + "VCF_REF", + "VCF_ALT", + "hotspot_whitelist", + "Status", + "D_t_alt_count_fragment", + "D_t_ref_count_fragment", + "D_t_vaf_fragment", + "SD_t_alt_count_fragment", + "SD_t_ref_count_fragment", + "SD_t_vaf_fragment", + "Matched_Norm_Sample_Barcode", + "Matched_Norm_Bamfile", + "n_alt_count_fragment", + "n_ref_count_fragment", + "n_vaf_fragment" + ) + if ("Status" %in% names(fread(x))) { + fread(x) %>% select(one_of(selectcolumns)) %>% subset((Status == "") | + (is.na(Status))) + } else { + fread(x) %>% select(one_of(selectcolumns)) + } + # fread(x) + # %>% + # filter(as.numeric(t_alt_count) > 0) %>% + # data.table() + })) + # get impact calls + impact.calls <- + DMP.RET.maf[Tumor_Sample_Barcode %in% sample.sheet$Sample_Barcode] + write.table( + impact.calls[, .( + Hugo_Symbol, + 
Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2 + )], + paste0(results.dir, "/", x, "/", x, "_impact_calls.maf"), + sep = "\t", + quote = F, + row.names = F + ) + # combining plasma and impact calls + all.calls <- + rbind(duplex.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F], + impact.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F]) + # getting rid of duplicate calls and take the first occurence of all events + all.calls <- + all.calls[which(!duplicated(all.calls[, .( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2 + )])),] %>% + mutate( + t_ref_count = 0, + t_alt_count = 0, + n_ref_count = 0, + n_alt_count = 0, + Matched_Norm_Sample_Barcode = NA + ) %>% + filter( + Variant_Classification != "Silent" & + !grepl("RP11-", Hugo_Symbol) & + !grepl("Intron", Variant_Classification) ) - # genotype all bams in this patient directory ----------------------------- - # genotyping plasma samples -- plasma duplex&simplex, plasma normal, pooled plasma normal - write.table( - sample.sheet[, .( - sample_id = Sample_Barcode, - maf = paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), - standard_bam, - duplex_bam, - simplex_bam - )], - paste0(results.dir, "/", x, "/", x, "_genotype_metadata.tsv"), - sep = "\t", - quote = F, - row.names = F - ) - job.ids <- system( - paste0( - "bsub -cwd ", - results.dir, - "/", - x, - ' -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', - " -P ", - project.ID, - " -J ", - x, - "_genotype_variants ", - " genotype_variants small_variants multiple-samples -i ", - results.dir, - "/", - x, - "/", - x, - "_genotype_metadata.tsv", - " -r ", - fasta.path, - " -g ", - genotyper.path, - " -v DEBUG " - ), - intern = T + write.table( + all.calls, + paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), 
+ sep = "\t", + quote = F, + row.names = F + ) + # tagging hotspots + system( + paste0( + 'bsub -R "rusage[mem=4]" -cwd ', + results.dir, + "/", + x, + "/ -oo hotspot.o -eo hotspot.e -W 00:59 ", + " -P ", + project.ID, + " -J ", + x, + "_tag_hotspot ", + " python /work/access/production/workflows/access_workflows/v1/pipeline_2.0.0/ACCESS-Pipeline/cwl_tools/hotspots/tag_hotspots.py ", + " -m ", + results.dir, + "/", + x, + "/", + x, + "_all_unique_calls.maf", + " -itxt /work/access/production/resources/msk-access/current/regions_of_interest/current/hotspot-list-union-v1-v2_with_TERT.txt ", + " -o ", + results.dir, + "/", + x, + "/", + x, + "_all_unique_calls_hotspots.maf", + " -outdir ", + results.dir, + "/", + x, + "/", + x ) - job.ids <- as.numeric(gsub("Job <|> is.*.$", "", job.ids)) - }) + ) + # genotype all bams in this patient directory ----------------------------- + # genotyping plasma samples -- plasma duplex&simplex, plasma normal, pooled plasma normal + write.table( + sample.sheet[, .( + sample_id = Sample_Barcode, + maf = paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), + standard_bam, + duplex_bam, + simplex_bam + )], + paste0(results.dir, "/", x, "/", x, "_genotype_metadata.tsv"), + sep = "\t", + quote = F, + row.names = F + ) + job.ids <- system( + paste0( + "bsub -cwd ", + results.dir, + "/", + x, + ' -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', + " -P ", + project.ID, + " -J ", + x, + "_genotype_variants ", + " genotype_variants small_variants multiple-samples -i ", + results.dir, + "/", + x, + "/", + x, + "_genotype_metadata.tsv", + " -r ", + fasta.path, + " -g ", + genotyper.path, + " -v DEBUG " + ), + intern = T + ) + job.ids <- as.numeric(gsub("Job <|> is.*.$", "", job.ids)) + }) # Get base count multi sample in pooled normal ---------------------------- @@ -542,7 +542,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )]),] + )]), ] write.table( all.all.unique.mafs, 
paste0(results.dir, "/pooled/all_all_unique.maf"), @@ -596,131 +596,131 @@ compile_reads <- function(master.ref, Sys.sleep(120) } print("Compile reads done!") - } +} - # Executable ----------------------------------------------------------------------------------------------------------- - # Minimal columns for input mafs - # - # Hugo_Symbol,Chromosome,Start_Position,End_Position,Tumor_Sample_Barcode,Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2,D_t_alt_count_fragment +# Executable ----------------------------------------------------------------------------------------------------------- +# Minimal columns for input mafs +# +# Hugo_Symbol,Chromosome,Start_Position,End_Position,Tumor_Sample_Barcode,Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2,D_t_alt_count_fragment - suppressPackageStartupMessages({ - library(data.table) - library(tidyr) - library(stringr) - library(dplyr) - library(argparse) - }) +suppressPackageStartupMessages({ + library(data.table) + library(tidyr) + library(stringr) + library(dplyr) + library(argparse) +}) - if (!interactive()) { - parser <- ArgumentParser() - parser$add_argument("-m", "--masterref", type = "character", help = "File path to master reference file") - parser$add_argument("-o", "--resultsdir", type = "character", help = "Output directory") - parser$add_argument( - "-pid", - "--projectid", - type = "character", - default = "", - help = "Project ID for submitted jobs involved in this run" - ) - parser$add_argument( - "-pb", - "--pooledbamdir", - type = "character", - default = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/", - help = "Directory for all pooled bams [default]" - ) - parser$add_argument( - "-fa", - "--fastapath", - type = "character", - default = "/juno/work/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta", - help = "Reference fasta path [default]" - ) - parser$add_argument( - "-gt", - 
"--genotyperpath", - type = "character", - default = "/work/access/production/resources/tools/GetBaseCountsMultiSample/current/GetBaseCountsMultiSample", - help = "Genotyper executable path [default]" - ) - parser$add_argument( - "-dmp", - "--dmpdir", - type = "character", - default = "/juno/work/access/production/resources/cbioportal/current/msk_solid_heme", - help = "Directory of clinical DMP repository [default]" - ) - parser$add_argument( - "-mb", - "--mirrorbamdir", - type = "character", - default = "/juno/res/dmpcollab/dmpshare/share/irb12_245", - help = "Mirror BAM file directory [default]" - ) - parser$add_argument( - "-mab", - "--mirroraccessbamdir", - type = "character", - default = "/juno/res/dmpcollab/dmpshare/share/access_12_245", - help = "Mirror BAM file directory for MSK-ACCESS [default]" - ) - parser$add_argument( - "-dmpk", - "--dmpkeypath", - type = "character", - default = "/juno/res/dmpcollab/dmprequest/12-245/key.txt", - help = "DMP mirror BAM key file [default]" - ) - parser$add_argument( - "-dmpak", - "--dmpaccesskeypath", - type = "character", - default = " /juno/res/dmpcollab/dmprequest/ACCESS-12-245/key.txt", - help = "DMP mirror BAM key file for MSK-ACCESS [default]" - ) - args <- parser$parse_args() +if (!interactive()) { + parser <- ArgumentParser() + parser$add_argument("-m", "--masterref", type = "character", help = "File path to master reference file") + parser$add_argument("-o", "--resultsdir", type = "character", help = "Output directory") + parser$add_argument( + "-pid", + "--projectid", + type = "character", + default = "", + help = "Project ID for submitted jobs involved in this run" + ) + parser$add_argument( + "-pb", + "--pooledbamdir", + type = "character", + default = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/", + help = "Directory for all pooled bams [default]" + ) + parser$add_argument( + "-fa", + "--fastapath", + type = "character", + default = 
"/juno/work/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta", + help = "Reference fasta path [default]" + ) + parser$add_argument( + "-gt", + "--genotyperpath", + type = "character", + default = "/work/access/production/resources/tools/GetBaseCountsMultiSample/current/GetBaseCountsMultiSample", + help = "Genotyper executable path [default]" + ) + parser$add_argument( + "-dmp", + "--dmpdir", + type = "character", + default = "/juno/work/access/production/resources/cbioportal/current/msk_solid_heme", + help = "Directory of clinical DMP repository [default]" + ) + parser$add_argument( + "-mb", + "--mirrorbamdir", + type = "character", + default = "/juno/res/dmpcollab/dmpshare/share/irb12_245", + help = "Mirror BAM file directory [default]" + ) + parser$add_argument( + "-mab", + "--mirroraccessbamdir", + type = "character", + default = "/juno/res/dmpcollab/dmpshare/share/access_12_245", + help = "Mirror BAM file directory for MSK-ACCESS [default]" + ) + parser$add_argument( + "-dmpk", + "--dmpkeypath", + type = "character", + default = "/juno/res/dmpcollab/dmprequest/12-245/key.txt", + help = "DMP mirror BAM key file [default]" + ) + parser$add_argument( + "-dmpak", + "--dmpaccesskeypath", + type = "character", + default = " /juno/res/dmpcollab/dmprequest/ACCESS-12-245/key.txt", + help = "DMP mirror BAM key file for MSK-ACCESS [default]" + ) + args <- parser$parse_args() - master.ref <- args$masterref - results.dir <- args$resultsdir - project.ID <- args$projectid - pooled.bam.dir <- args$pooledbamdir - fasta.path <- args$fastapath - genotyper.path <- args$genotyperpath - dmp.dir <- args$dmpdir - mirror.bam.dir <- args$mirrorbamdir - mirror.access.bam.dir <- args$mirroraccessbamdir - dmp.key.path <- args$dmpkeypath - access.key.path <- args$dmpaccesskeypath + master.ref <- args$masterref + results.dir <- args$resultsdir + project.ID <- args$projectid + pooled.bam.dir <- args$pooledbamdir + fasta.path <- args$fastapath + genotyper.path <- 
args$genotyperpath + dmp.dir <- args$dmpdir + mirror.bam.dir <- args$mirrorbamdir + mirror.access.bam.dir <- args$mirroraccessbamdir + dmp.key.path <- args$dmpkeypath + access.key.path <- args$dmpaccesskeypath - if (project.ID == "") { - project.ID <- - paste0(sample(c(0:9), size = 10, replace = T), collapse = "") - } + if (project.ID == "") { + project.ID <- + paste0(sample(c(0:9), size = 10, replace = T), collapse = "") + } - print(paste0("Input parameters for run ", project.ID)) - print(master.ref) - print(results.dir) - print(pooled.bam.dir) - print(fasta.path) - print(genotyper.path) - print(dmp.dir) - print(mirror.bam.dir) - print(mirror.access.bam.dir) - print(dmp.key.path) - print(access.key.path) - suppressWarnings( - compile_reads( - fread(master.ref), - results.dir, - project.ID, - pooled.bam.dir, - fasta.path, - genotyper.path, - dmp.dir, - mirror.bam.dir, - dmp.key.path - ) + print(paste0("Input parameters for run ", project.ID)) + print(master.ref) + print(results.dir) + print(pooled.bam.dir) + print(fasta.path) + print(genotyper.path) + print(dmp.dir) + print(mirror.bam.dir) + print(mirror.access.bam.dir) + print(dmp.key.path) + print(access.key.path) + suppressWarnings( + compile_reads( + fread(master.ref), + results.dir, + project.ID, + pooled.bam.dir, + fasta.path, + genotyper.path, + dmp.dir, + mirror.bam.dir, + dmp.key.path ) - print("compile reads function finished") - } + ) + print("compile reads function finished") +} From 11b450fd3959e77eb54d8877770066d682f3f68c Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 15:39:53 -0500 Subject: [PATCH 080/126] adding access samples to genotype --- R/compile_reads.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/compile_reads.R b/R/compile_reads.R index 9c3208b..29443c2 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -139,6 +139,10 @@ compile_reads <- function(master.ref, ), function(x) { paste0(x, collapse = "/") })) + print("ID\n") + print(all.dmp.ids.XS) + 
print("\nDIR\n") + print(access.bam.sub.dir) access.sample.sheet <- unique( data.frame( Sample_Barcode = all.dmp.ids.XS, From 40db560a52d329e5aff0b0fb973e730588506f4e Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 15:52:24 -0500 Subject: [PATCH 081/126] adding access samples to genotype --- R/compile_reads.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 29443c2..b875eac 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -68,7 +68,7 @@ compile_reads <- function(master.ref, filter(Mutation_Status != "GERMLINE") %>% data.table() DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode),] + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] # Pooled normal samples --------------------------------------------------- pooled.bams <- @@ -106,12 +106,12 @@ compile_reads <- function(master.ref, gsub("-standard|-unfilter|-simplex|-duplex", "", access.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V2) + all.dmp.bam.ids <- + c(all.dmp.bam.ids.IM, + all.dmp.bam.ids.IH) if (is.null(all.dmp.ids)) { dmp.sample.sheet <- NULL } else{ - all.dmp.bam.ids <- - c(all.dmp.bam.ids.IM, - all.dmp.bam.ids.IH) bam.sub.dir <- unlist(lapply(strsplit(substr( all.dmp.bam.ids, 1, 2 @@ -130,7 +130,7 @@ compile_reads <- function(master.ref, ) ) } - if (is.null(all.dmp.bam.ids.XS)) { + if (is.null(all.dmp.ids.XS) | is.null(all.dmp.bam.ids.XS)) { access.sample.sheet <- NULL } else{ access.bam.sub.dir <- @@ -421,7 +421,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )])),] %>% + )])), ] %>% mutate( t_ref_count = 0, t_alt_count = 0, @@ -546,7 +546,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )]), ] + )]),] write.table( all.all.unique.mafs, 
paste0(results.dir, "/pooled/all_all_unique.maf"), From a75a94d6bc4aee793fc29198c388e6b0499ceaac Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 15:56:55 -0500 Subject: [PATCH 082/126] adding access samples to genotype --- R/compile_reads.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index b875eac..764cf9d 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -109,7 +109,7 @@ compile_reads <- function(master.ref, all.dmp.bam.ids <- c(all.dmp.bam.ids.IM, all.dmp.bam.ids.IH) - if (is.null(all.dmp.ids)) { + if (length(all.dmp.ids)==0) { dmp.sample.sheet <- NULL } else{ bam.sub.dir <- @@ -130,7 +130,7 @@ compile_reads <- function(master.ref, ) ) } - if (is.null(all.dmp.ids.XS) | is.null(all.dmp.bam.ids.XS)) { + if (length(all.dmp.ids.XS) == 0){ access.sample.sheet <- NULL } else{ access.bam.sub.dir <- From 1111edc519b0c7339623279054628f195e90e19c Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 21:20:27 -0500 Subject: [PATCH 083/126] adding access samples to genotype --- R/compile_reads.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/compile_reads.R b/R/compile_reads.R index 764cf9d..76a58fb 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -186,6 +186,8 @@ compile_reads <- function(master.ref, dmp_patient_id = dmp_id ) } + print(dmp.sample.sheet) + stop() # total sample sheet sample.sheet <- master.ref[cmo_patient_id == x, # plasma bams -- duplex and simplex bam From bc9f504d4cfbfe1cd81f0a0e3923ec99f807745b Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 21:23:28 -0500 Subject: [PATCH 084/126] adding access samples to genotype --- R/compile_reads.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 76a58fb..1dab03b 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -177,7 +177,8 @@ compile_reads <- function(master.ref, } else{ dmp.sample.sheet <- bind_row(dmp.sample.sheet, 
access.sample.sheet) - } %>% + } + dmp.sample.sheet %>% mutate( cmo_patient_id = x, Sample_Type = ifelse(grepl("-T", Sample_Barcode), From 39c6a0493ac45d60a109a3a63be916692f99cb27 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 21:29:53 -0500 Subject: [PATCH 085/126] Update compile_reads.R --- R/compile_reads.R | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 1dab03b..5bb5aa1 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -109,7 +109,7 @@ compile_reads <- function(master.ref, all.dmp.bam.ids <- c(all.dmp.bam.ids.IM, all.dmp.bam.ids.IH) - if (length(all.dmp.ids)==0) { + if (length(all.dmp.ids) == 0) { dmp.sample.sheet <- NULL } else{ bam.sub.dir <- @@ -130,7 +130,7 @@ compile_reads <- function(master.ref, ) ) } - if (length(all.dmp.ids.XS) == 0){ + if (length(all.dmp.ids.XS) == 0) { access.sample.sheet <- NULL } else{ access.bam.sub.dir <- @@ -165,9 +165,11 @@ compile_reads <- function(master.ref, ) ) } - if (is.null(dmp.sample.sheet) & - is.null(access.sample.sheet)) { - dmp.sample.sheet <- NULL + if (!is.null(dmp.sample.sheet) & + !is.null(access.sample.sheet)) { + dmp.sample.sheet <- + bind_row(dmp.sample.sheet, access.sample.sheet) + } else if (is.null(dmp.sample.sheet) & !is.null(access.sample.sheet)) { dmp.sample.sheet <- access.sample.sheet @@ -175,17 +177,22 @@ compile_reads <- function(master.ref, is.null(access.sample.sheet)) { dmp.sample.sheet <- dmp.sample.sheet } else{ - dmp.sample.sheet <- - bind_row(dmp.sample.sheet, access.sample.sheet) + dmp.sample.sheet <- NULL + } + if (!is.null(dmp.sample.sheet)) { + dmp.sample.sheet %>% + mutate( + cmo_patient_id = x, + Sample_Type = ifelse( + grepl("-T", Sample_Barcode), + "DMP_Tumor", + "DMP_Normal" + ), + dmp_patient_id = dmp_id + ) + } else{ + dmp.sample.shett <- NULL } - dmp.sample.sheet %>% - mutate( - cmo_patient_id = x, - Sample_Type = ifelse(grepl("-T", Sample_Barcode), - 
"DMP_Tumor", - "DMP_Normal"), - dmp_patient_id = dmp_id - ) } print(dmp.sample.sheet) stop() From 5b49ff00b7adc89a4a664c74faffa5d8c09a0de0 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 21:33:40 -0500 Subject: [PATCH 086/126] Update compile_reads.R --- R/compile_reads.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 5bb5aa1..f031f19 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -167,19 +167,24 @@ compile_reads <- function(master.ref, } if (!is.null(dmp.sample.sheet) & !is.null(access.sample.sheet)) { + print("I am in 1") dmp.sample.sheet <- bind_row(dmp.sample.sheet, access.sample.sheet) } else if (is.null(dmp.sample.sheet) & !is.null(access.sample.sheet)) { + print("I am in 2") dmp.sample.sheet <- access.sample.sheet } else if (!is.null(dmp.sample.sheet) & is.null(access.sample.sheet)) { + print("I am in 3") dmp.sample.sheet <- dmp.sample.sheet } else{ + print("I am in 4") dmp.sample.sheet <- NULL } if (!is.null(dmp.sample.sheet)) { + print("I am in 5") dmp.sample.sheet %>% mutate( cmo_patient_id = x, @@ -191,7 +196,8 @@ compile_reads <- function(master.ref, dmp_patient_id = dmp_id ) } else{ - dmp.sample.shett <- NULL + print("I am in 6") + dmp.sample.sheet <- NULL } } print(dmp.sample.sheet) From cd265dc652e57e011ab61c04e789bac2a852c2a1 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 21:38:43 -0500 Subject: [PATCH 087/126] Update compile_reads.R --- R/compile_reads.R | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index f031f19..fd660e1 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -68,7 +68,7 @@ compile_reads <- function(master.ref, filter(Mutation_Status != "GERMLINE") %>% data.table() DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] + 
DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode),] # Pooled normal samples --------------------------------------------------- pooled.bams <- @@ -128,7 +128,16 @@ compile_reads <- function(master.ref, all.dmp.bam.ids, ".bam" ) - ) + ) %>% + mutate( + cmo_patient_id = x, + Sample_Type = ifelse( + grepl("-T", Sample_Barcode), + "DMP_Tumor", + "DMP_Normal" + ), + dmp_patient_id = dmp_id + ) } if (length(all.dmp.ids.XS) == 0) { access.sample.sheet <- NULL @@ -162,7 +171,16 @@ compile_reads <- function(master.ref, all.dmp.bam.ids.XS, "-simplex.bam" ) - ) + ) %>% + mutate( + cmo_patient_id = x, + Sample_Type = ifelse( + grepl("-T", Sample_Barcode), + "DMP_Tumor", + "DMP_Normal" + ), + dmp_patient_id = dmp_id + ) ) } if (!is.null(dmp.sample.sheet) & @@ -183,22 +201,6 @@ compile_reads <- function(master.ref, print("I am in 4") dmp.sample.sheet <- NULL } - if (!is.null(dmp.sample.sheet)) { - print("I am in 5") - dmp.sample.sheet %>% - mutate( - cmo_patient_id = x, - Sample_Type = ifelse( - grepl("-T", Sample_Barcode), - "DMP_Tumor", - "DMP_Normal" - ), - dmp_patient_id = dmp_id - ) - } else{ - print("I am in 6") - dmp.sample.sheet <- NULL - } } print(dmp.sample.sheet) stop() @@ -437,7 +439,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )])), ] %>% + )])),] %>% mutate( t_ref_count = 0, t_alt_count = 0, @@ -562,7 +564,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )]),] + )]), ] write.table( all.all.unique.mafs, paste0(results.dir, "/pooled/all_all_unique.maf"), From c28287ec296e95021a2d07be45c863609ee3dfc0 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 21:40:24 -0500 Subject: [PATCH 088/126] Update compile_reads.R --- R/compile_reads.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index fd660e1..44bf85b 100644 --- a/R/compile_reads.R +++ 
b/R/compile_reads.R @@ -203,7 +203,6 @@ compile_reads <- function(master.ref, } } print(dmp.sample.sheet) - stop() # total sample sheet sample.sheet <- master.ref[cmo_patient_id == x, # plasma bams -- duplex and simplex bam From 0c1ce6fae0a9ffd984cb1125c36ae4951ec1aa25 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 22:16:05 -0500 Subject: [PATCH 089/126] Update compile_reads.R --- R/compile_reads.R | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 44bf85b..abf0735 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -41,25 +41,25 @@ compile_reads <- function(master.ref, # data from DMP ----------------------------------------------------------- DMP.key <- fread(dmp.key.path) if (any( - !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.|-T..-XS", "", DMP.key[grepl("IH|IM|XS", V1)]$V1) + !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1) )) { message(paste0( "These DMP IDs are not found in DMP key file: ", paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which( !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% - gsub("-T..-IH.|-T..-IM.|-T..-XS", "", DMP.key[grepl("IH|IM|XS", V1)]$V1) + gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1) )], collapse = " ,") )) } access.key <- fread(access.key.path) if (any( - !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.|-T..-XS", "", access.key[grepl("IH|IM|XS", V1)]$V1) + !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1) )) { message(paste0( - "These DMP IDs are not found in DMP key file: ", + "These DMP IDs are not found in DMP ACCESS key file: ", paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which( !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% - 
gsub("-T..-IH.|-T..-IM.|-T..-XS", "", access.key[grepl("IH|IM|XS", V1)]$V1) + gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1) )], collapse = " ,") )) } @@ -148,10 +148,6 @@ compile_reads <- function(master.ref, ), function(x) { paste0(x, collapse = "/") })) - print("ID\n") - print(all.dmp.ids.XS) - print("\nDIR\n") - print(access.bam.sub.dir) access.sample.sheet <- unique( data.frame( Sample_Barcode = all.dmp.ids.XS, From 57500c189873bbedbe5e76b218691759411fb05c Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 22:16:24 -0500 Subject: [PATCH 090/126] Update compile_reads.R --- R/compile_reads.R | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index abf0735..6df0390 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -40,27 +40,19 @@ compile_reads <- function(master.ref, # data from DMP ----------------------------------------------------------- DMP.key <- fread(dmp.key.path) - if (any( - !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1) - )) { + if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { message(paste0( "These DMP IDs are not found in DMP key file: ", - paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which( - !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% - gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1) - )], collapse = " ,") + paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% + gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))], collapse = " ,") )) } access.key <- fread(access.key.path) - if (any( - !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1) - )) { + if (any(!master.ref[grepl("^P-", 
dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))) { message(paste0( "These DMP IDs are not found in DMP ACCESS key file: ", - paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which( - !master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% - gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1) - )], collapse = " ,") + paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% + gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))], collapse = " ,") )) } DMP.maf <- @@ -68,7 +60,7 @@ compile_reads <- function(master.ref, filter(Mutation_Status != "GERMLINE") %>% data.table() DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode),] + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] # Pooled normal samples --------------------------------------------------- pooled.bams <- @@ -434,7 +426,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )])),] %>% + )])), ] %>% mutate( t_ref_count = 0, t_alt_count = 0, @@ -559,7 +551,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )]), ] + )]),] write.table( all.all.unique.mafs, paste0(results.dir, "/pooled/all_all_unique.maf"), From a34016198dd5c57060da825a152d8e00693fa401 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 22:29:50 -0500 Subject: [PATCH 091/126] Update compile_reads.R --- R/compile_reads.R | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 6df0390..4d998c9 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -40,27 +40,28 @@ compile_reads <- function(master.ref, # data from DMP ----------------------------------------------------------- DMP.key 
<- fread(dmp.key.path) + access.key <- fread(access.key.path) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { - message(paste0( + warnings(paste0( "These DMP IDs are not found in DMP key file: ", paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))], collapse = " ,") )) } - access.key <- fread(access.key.path) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))) { - message(paste0( + warnings(paste0( "These DMP IDs are not found in DMP ACCESS key file: ", paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))], collapse = " ,") )) } - DMP.maf <- + + if DMP.maf <- fread(paste0(dmp.dir, "/data_mutations_extended.txt")) %>% filter(Mutation_Status != "GERMLINE") %>% data.table() DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode),] # Pooled normal samples --------------------------------------------------- pooled.bams <- @@ -426,7 +427,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )])), ] %>% + )])),] %>% mutate( t_ref_count = 0, t_alt_count = 0, @@ -551,7 +552,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )]),] + )]), ] write.table( all.all.unique.mafs, paste0(results.dir, "/pooled/all_all_unique.maf"), From aaf90f77c6036201b04fd71d28c6b4869297a4d4 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 22:32:01 -0500 Subject: 
[PATCH 092/126] Update compile_reads.R --- R/compile_reads.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 4d998c9..7e8b85a 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -56,12 +56,12 @@ compile_reads <- function(master.ref, )) } - if DMP.maf <- + DMP.maf <- fread(paste0(dmp.dir, "/data_mutations_extended.txt")) %>% filter(Mutation_Status != "GERMLINE") %>% data.table() DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode),] + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] # Pooled normal samples --------------------------------------------------- pooled.bams <- @@ -427,7 +427,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )])),] %>% + )])), ] %>% mutate( t_ref_count = 0, t_alt_count = 0, @@ -552,7 +552,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )]), ] + )]),] write.table( all.all.unique.mafs, paste0(results.dir, "/pooled/all_all_unique.maf"), From d5dd4dcfdd82b869f3b7b68341eeafb2002c2352 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 22:35:27 -0500 Subject: [PATCH 093/126] Update compile_reads.R --- R/compile_reads.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 7e8b85a..d7e3f63 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -42,14 +42,14 @@ compile_reads <- function(master.ref, DMP.key <- fread(dmp.key.path) access.key <- fread(access.key.path) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { - warnings(paste0( + message(paste0( "These DMP IDs are not found in DMP key file: ", paste0(master.ref[grepl("^P-", 
dmp_patient_id)]$dmp_patient_id[which(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))], collapse = " ,") )) } if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))) { - warnings(paste0( + stop(paste0( "These DMP IDs are not found in DMP ACCESS key file: ", paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))], collapse = " ,") From d2884548b03b09b46da4ac63ddccd069a7caca30 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 22:45:41 -0500 Subject: [PATCH 094/126] Update compile_reads.R --- R/compile_reads.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/compile_reads.R b/R/compile_reads.R index d7e3f63..e11aac8 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -41,6 +41,7 @@ compile_reads <- function(master.ref, # data from DMP ----------------------------------------------------------- DMP.key <- fread(dmp.key.path) access.key <- fread(access.key.path) + print(head(access.key)) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { message(paste0( "These DMP IDs are not found in DMP key file: ", From 25ac7b6b66aa1347e915bff615c1f1f0122ca0bf Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 22:47:53 -0500 Subject: [PATCH 095/126] Update compile_reads.R --- R/compile_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index e11aac8..5014c22 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -41,7 +41,7 @@ compile_reads <- function(master.ref, # data from DMP ----------------------------------------------------------- DMP.key <- fread(dmp.key.path) access.key <- fread(access.key.path) - print(head(access.key)) 
+ print(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { message(paste0( "These DMP IDs are not found in DMP key file: ", From 8d98e5b38e7703edcfcf93118a480ad9a3caaaf4 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 22:50:31 -0500 Subject: [PATCH 096/126] Update compile_reads.R --- R/compile_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 5014c22..1eb5e27 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -41,7 +41,7 @@ compile_reads <- function(master.ref, # data from DMP ----------------------------------------------------------- DMP.key <- fread(dmp.key.path) access.key <- fread(access.key.path) - print(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) + print(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T07-XS1", "", access.key[grepl("XS", V1)]$V1)) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { message(paste0( "These DMP IDs are not found in DMP key file: ", From edd59407b953b6b578601da257bc5cf43d74aca8 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 22:52:25 -0500 Subject: [PATCH 097/126] Update compile_reads.R --- R/compile_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 1eb5e27..a5c1916 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -41,7 +41,7 @@ compile_reads <- function(master.ref, # data from DMP ----------------------------------------------------------- DMP.key <- fread(dmp.key.path) access.key <- fread(access.key.path) - print(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% 
gsub("-T07-XS1", "", access.key[grepl("XS", V1)]$V1)) + print(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% access.key[grepl("XS", V1)]$V1) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { message(paste0( "These DMP IDs are not found in DMP key file: ", From dd002450b2b2bc4a6a97094f4f693f091a315808 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 22:55:18 -0500 Subject: [PATCH 098/126] Update compile_reads.R --- R/compile_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index a5c1916..b4d709a 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -41,7 +41,7 @@ compile_reads <- function(master.ref, # data from DMP ----------------------------------------------------------- DMP.key <- fread(dmp.key.path) access.key <- fread(access.key.path) - print(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% access.key[grepl("XS", V1)]$V1) + print(head(gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { message(paste0( "These DMP IDs are not found in DMP key file: ", From 1a7065b80bc2981b14ef77dd73c8c5b8cae50be5 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 22:57:06 -0500 Subject: [PATCH 099/126] Update compile_reads.R --- R/compile_reads.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/compile_reads.R b/R/compile_reads.R index b4d709a..725eb28 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -41,6 +41,7 @@ compile_reads <- function(master.ref, # data from DMP ----------------------------------------------------------- DMP.key <- fread(dmp.key.path) access.key <- fread(access.key.path) + print(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id) print(head(gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))) if 
(any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { message(paste0( From cdffe40636ca4aa2a44f925b39aa84c63bb7ed82 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 22:58:40 -0500 Subject: [PATCH 100/126] Update compile_reads.R --- R/compile_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 725eb28..fce6b17 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -41,7 +41,7 @@ compile_reads <- function(master.ref, # data from DMP ----------------------------------------------------------- DMP.key <- fread(dmp.key.path) access.key <- fread(access.key.path) - print(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id) + print(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) print(head(gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { message(paste0( From ca57ca616b83b82cf0f51abf76d635090c244c08 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 22:59:52 -0500 Subject: [PATCH 101/126] Update compile_reads.R --- R/compile_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index fce6b17..0791cfe 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -41,7 +41,7 @@ compile_reads <- function(master.ref, # data from DMP ----------------------------------------------------------- DMP.key <- fread(dmp.key.path) access.key <- fread(access.key.path) - print(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) + print(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) print(head(gsub("-T..-XS.", "", 
access.key[grepl("XS", V1)]$V1))) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { message(paste0( From 0c9bfa29666330698956d656b843546e28752c33 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 23:05:31 -0500 Subject: [PATCH 102/126] Update compile_reads.R --- R/compile_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 0791cfe..8d6979d 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -42,7 +42,7 @@ compile_reads <- function(master.ref, DMP.key <- fread(dmp.key.path) access.key <- fread(access.key.path) print(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) - print(head(gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))) + print(head("P-0069484" %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { message(paste0( "These DMP IDs are not found in DMP key file: ", From 28c9f5ebf5d931ed7cf5596d3ee1f838ed26f774 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 23:06:50 -0500 Subject: [PATCH 103/126] Update compile_reads.R --- R/compile_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 8d6979d..3bdfb99 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -42,7 +42,7 @@ compile_reads <- function(master.ref, DMP.key <- fread(dmp.key.path) access.key <- fread(access.key.path) print(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) - print(head("P-0069484" %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))) + print(gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id 
%in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { message(paste0( "These DMP IDs are not found in DMP key file: ", From 98f31ce98ac33de1eaee87e73c752e3639b172d7 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 23:08:12 -0500 Subject: [PATCH 104/126] Update compile_reads.R --- R/compile_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 3bdfb99..911dc2d 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -42,7 +42,7 @@ compile_reads <- function(master.ref, DMP.key <- fread(dmp.key.path) access.key <- fread(access.key.path) print(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) - print(gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1) + print(gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { message(paste0( "These DMP IDs are not found in DMP key file: ", From d4acb114f3505e0fa4b2fb5729e1e81171ffd05a Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 23:25:32 -0500 Subject: [PATCH 105/126] Update compile_reads.R --- R/compile_reads.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 911dc2d..da9f103 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -39,8 +39,8 @@ compile_reads <- function(master.ref, } # data from DMP ----------------------------------------------------------- - DMP.key <- fread(dmp.key.path) - access.key <- fread(access.key.path) + DMP.key <- fread(dmp.key.path, select = c(1, 2)) + access.key <- fread(access.key.path, select = c(1, 2)) print(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) print(gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) if 
(any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { @@ -51,7 +51,7 @@ compile_reads <- function(master.ref, )) } if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))) { - stop(paste0( + messsage(paste0( "These DMP IDs are not found in DMP ACCESS key file: ", paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))], collapse = " ,") @@ -63,7 +63,7 @@ compile_reads <- function(master.ref, filter(Mutation_Status != "GERMLINE") %>% data.table() DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode),] # Pooled normal samples --------------------------------------------------- pooled.bams <- @@ -429,7 +429,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )])), ] %>% + )])),] %>% mutate( t_ref_count = 0, t_alt_count = 0, @@ -554,7 +554,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )]),] + )]), ] write.table( all.all.unique.mafs, paste0(results.dir, "/pooled/all_all_unique.maf"), From 1608eed2683d88170abec8399d2c394d27946780 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 23:29:24 -0500 Subject: [PATCH 106/126] Update compile_reads.R --- R/compile_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index da9f103..d5ef074 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -40,7 +40,7 @@ compile_reads <- function(master.ref, # data from DMP ----------------------------------------------------------- DMP.key <- 
fread(dmp.key.path, select = c(1, 2)) - access.key <- fread(access.key.path, select = c(1, 2)) + access.key <- read.csv(access.key.path, header = FALSE, quote = FALSE) print(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) print(gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { From c21e095832b1dea4f7c16791c658fe1d2fe6205f Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 23:30:46 -0500 Subject: [PATCH 107/126] Update compile_reads.R --- R/compile_reads.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index d5ef074..b4ccdd1 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -39,8 +39,8 @@ compile_reads <- function(master.ref, } # data from DMP ----------------------------------------------------------- - DMP.key <- fread(dmp.key.path, select = c(1, 2)) - access.key <- read.csv(access.key.path, header = FALSE, quote = FALSE) + DMP.key <- fread(dmp.key.path) + access.key <- read.csv(access.key.path, header = FALSE, sep = ",") print(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) print(gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { From 6a91d6710f2ae51bc062ca1d9d4a623b08a251d6 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 23:38:11 -0500 Subject: [PATCH 108/126] Update compile_reads.R --- R/compile_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index b4ccdd1..f855f32 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -40,7 +40,7 @@ compile_reads <- function(master.ref, # data from DMP 
----------------------------------------------------------- DMP.key <- fread(dmp.key.path) - access.key <- read.csv(access.key.path, header = FALSE, sep = ",") + access.key <- as.data.table(read.csv(access.key.path, header = FALSE, sep = ",") print(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) print(gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { From 6184eaec2ff551b9e9e3116e37b03ef757085f52 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 23:38:47 -0500 Subject: [PATCH 109/126] Update compile_reads.R --- R/compile_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index f855f32..eac6b53 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -40,7 +40,7 @@ compile_reads <- function(master.ref, # data from DMP ----------------------------------------------------------- DMP.key <- fread(dmp.key.path) - access.key <- as.data.table(read.csv(access.key.path, header = FALSE, sep = ",") + access.key <- as.data.table(read.csv(access.key.path, header = FALSE, sep = ",")) print(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) print(gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { From 82d266a70f826d4142096cd4c25439b387505d08 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 23:50:50 -0500 Subject: [PATCH 110/126] Update compile_reads.R --- R/compile_reads.R | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index eac6b53..fddbf10 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -40,9 +40,8 
@@ compile_reads <- function(master.ref, # data from DMP ----------------------------------------------------------- DMP.key <- fread(dmp.key.path) - access.key <- as.data.table(read.csv(access.key.path, header = FALSE, sep = ",")) - print(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) - print(gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1)) + access.key <- + as.data.table(read.csv(access.key.path, header = FALSE, sep = ",")) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { message(paste0( "These DMP IDs are not found in DMP key file: ", @@ -63,7 +62,7 @@ compile_reads <- function(master.ref, filter(Mutation_Status != "GERMLINE") %>% data.table() DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode),] + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] # Pooled normal samples --------------------------------------------------- pooled.bams <- @@ -122,7 +121,9 @@ compile_reads <- function(master.ref, "/", all.dmp.bam.ids, ".bam" - ) + ), + duplex_bam = NA, + simplex_bam = NA ) %>% mutate( cmo_patient_id = x, @@ -146,6 +147,7 @@ compile_reads <- function(master.ref, access.sample.sheet <- unique( data.frame( Sample_Barcode = all.dmp.ids.XS, + standard_bam = NA, duplex_bam = paste0( mirror.access.bam.dir, "/", @@ -199,6 +201,7 @@ compile_reads <- function(master.ref, # plasma bams -- duplex and simplex bam .( Sample_Barcode = as.character(cmo_sample_id_plasma), + standard_bam = NA, duplex_bam = bam_path_plasma_duplex, simplex_bam = bam_path_plasma_simplex, cmo_patient_id, @@ -211,6 +214,8 @@ compile_reads <- function(master.ref, .( Sample_Barcode = as.character(cmo_sample_id_normal), standard_bam = bam_path_normal, + duplex_bam = NA, + simplex_bam 
= NA, cmo_patient_id, Sample_Type = "unfilterednormal", dmp_patient_id @@ -429,7 +434,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )])),] %>% + )])), ] %>% mutate( t_ref_count = 0, t_alt_count = 0, @@ -554,7 +559,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )]), ] + )]),] write.table( all.all.unique.mafs, paste0(results.dir, "/pooled/all_all_unique.maf"), From 81afdbcbf25cc0e6391a07c2dbbc6bc26ca7b575 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Wed, 1 Feb 2023 23:58:56 -0500 Subject: [PATCH 111/126] Update compile_reads.R --- R/compile_reads.R | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index fddbf10..ab324b0 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -40,8 +40,6 @@ compile_reads <- function(master.ref, # data from DMP ----------------------------------------------------------- DMP.key <- fread(dmp.key.path) - access.key <- - as.data.table(read.csv(access.key.path, header = FALSE, sep = ",")) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { message(paste0( "These DMP IDs are not found in DMP key file: ", @@ -49,8 +47,11 @@ compile_reads <- function(master.ref, gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))], collapse = " ,") )) } + # data from DMP ACCESS ---------------------------------------------------- + access.key <- + as.data.table(read.csv(access.key.path, header = FALSE, sep = ",")) if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))) { - messsage(paste0( + message(paste0( "These DMP IDs are not found in DMP ACCESS key file: ", paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", 
V1)]$V1))], collapse = " ,") @@ -62,7 +63,7 @@ compile_reads <- function(master.ref, filter(Mutation_Status != "GERMLINE") %>% data.table() DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode),] # Pooled normal samples --------------------------------------------------- pooled.bams <- @@ -434,7 +435,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )])), ] %>% + )])),] %>% mutate( t_ref_count = 0, t_alt_count = 0, @@ -559,7 +560,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )]),] + )]), ] write.table( all.all.unique.mafs, paste0(results.dir, "/pooled/all_all_unique.maf"), From fded98c160ecbc061f3597dfa5831a596c0740d0 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Thu, 2 Feb 2023 00:00:30 -0500 Subject: [PATCH 112/126] Update compile_reads.R --- R/compile_reads.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index ab324b0..5fa1a08 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -181,7 +181,7 @@ compile_reads <- function(master.ref, !is.null(access.sample.sheet)) { print("I am in 1") dmp.sample.sheet <- - bind_row(dmp.sample.sheet, access.sample.sheet) + bind_rows(dmp.sample.sheet, access.sample.sheet) } else if (is.null(dmp.sample.sheet) & !is.null(access.sample.sheet)) { From 32272be98874c80d574a73da95a8947dd721c9b1 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Thu, 2 Feb 2023 10:14:55 -0500 Subject: [PATCH 113/126] Update compile_reads.R --- R/compile_reads.R | 52 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 5fa1a08..b0cf1b7 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ 
-91,7 +91,9 @@ compile_reads <- function(master.ref, all.dmp.ids.IH <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V1 all.dmp.ids.XS <- - access.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V1 + access.key[grepl(paste0(dmp_id, "-T..-XS."), V1)]$V1 + all.dmp.ids.normal.XS <- + access.key[grepl(paste0(dmp_id, "-N..-XS."), V1)]$V1 all.dmp.ids <- c(all.dmp.ids.IM, all.dmp.ids.IH) all.dmp.bam.ids.IM <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V2 @@ -101,6 +103,10 @@ compile_reads <- function(master.ref, gsub("-standard|-unfilter|-simplex|-duplex", "", access.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V2) + all.dmp.bam.ids.normal.XS <- + gsub("-standard|-unfilter|-simplex|-duplex", + "", + access.key[grepl(paste0(dmp_id, "-N..-XS."), V1)]$V2) all.dmp.bam.ids <- c(all.dmp.bam.ids.IM, all.dmp.bam.ids.IH) @@ -170,33 +176,63 @@ compile_reads <- function(master.ref, cmo_patient_id = x, Sample_Type = ifelse( grepl("-T", Sample_Barcode), - "DMP_Tumor", - "DMP_Normal" + "duplex", + "unfilterednormal" + ), + dmp_patient_id = dmp_id + ) + ) + access.normal.bam.sub.dir <- + unlist(lapply(strsplit( + substr(all.dmp.bam.ids.normal.XS, 1, 2), "" + ), function(x) { + paste0(x, collapse = "/") + })) + access.normal.sample.sheet <- unique( + data.frame( + Sample_Barcode = all.dmp.ids.normal.XS, + standard_bam = paste0( + mirror.access.bam.dir, + "/", + access.normal.bam.sub.dir, + "/", + all.dmp.bam.ids.normal.XS, + "-unfilter.bam" + ), + duplex_bam = NA, + simplex_bam = NA + ) %>% + mutate( + cmo_patient_id = x, + Sample_Type = ifelse( + grepl("-N", Sample_Barcode), + "unfilterednormal", + "duplex" ), dmp_patient_id = dmp_id ) ) + access.sample.sheet = bind_rows(access.sample.sheet, access.normal.sample.sheet) } if (!is.null(dmp.sample.sheet) & !is.null(access.sample.sheet)) { - print("I am in 1") + print("DMP IMPACT and DMP ACCESS samples are available") dmp.sample.sheet <- bind_rows(dmp.sample.sheet, access.sample.sheet) } else if (is.null(dmp.sample.sheet) & 
!is.null(access.sample.sheet)) { - print("I am in 2") + print("DMP IMPACT samples are available and DMP ACCESS samples are NOT available") dmp.sample.sheet <- access.sample.sheet } else if (!is.null(dmp.sample.sheet) & is.null(access.sample.sheet)) { - print("I am in 3") + print("DMP IMPACT samples are NOT available and DMP ACCESS samples are available") dmp.sample.sheet <- dmp.sample.sheet } else{ - print("I am in 4") + print("No DMP IMPACT samples or DMP ACCESS samples are available") dmp.sample.sheet <- NULL } } - print(dmp.sample.sheet) # total sample sheet sample.sheet <- master.ref[cmo_patient_id == x, # plasma bams -- duplex and simplex bam From 059146937c72073e3db59fa02137a96faec1dcc7 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Thu, 2 Feb 2023 10:39:04 -0500 Subject: [PATCH 114/126] Update compile_reads.R --- R/compile_reads.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/compile_reads.R b/R/compile_reads.R index b0cf1b7..72aeab4 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -233,6 +233,8 @@ compile_reads <- function(master.ref, dmp.sample.sheet <- NULL } } + print(dmp.sample.sheet) + stop() # total sample sheet sample.sheet <- master.ref[cmo_patient_id == x, # plasma bams -- duplex and simplex bam From b28c5f643fc41ec725f62779c536eef4cfa66368 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Thu, 2 Feb 2023 10:42:26 -0500 Subject: [PATCH 115/126] Update compile_reads.R --- R/compile_reads.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 72aeab4..b4166a8 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -233,8 +233,6 @@ compile_reads <- function(master.ref, dmp.sample.sheet <- NULL } } - print(dmp.sample.sheet) - stop() # total sample sheet sample.sheet <- master.ref[cmo_patient_id == x, # plasma bams -- duplex and simplex bam @@ -775,7 +773,9 @@ if (!interactive()) { genotyper.path, dmp.dir, mirror.bam.dir, - dmp.key.path + mirror.access.bam.dir, + 
dmp.key.path, + access.key.path ) ) print("compile reads function finished") From f4db327152db9585ed591457e6c78be49911d062 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Thu, 2 Feb 2023 10:45:55 -0500 Subject: [PATCH 116/126] Update compile_reads.R --- R/compile_reads.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index b4166a8..cf85112 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -63,7 +63,7 @@ compile_reads <- function(master.ref, filter(Mutation_Status != "GERMLINE") %>% data.table() DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode),] + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] # Pooled normal samples --------------------------------------------------- pooled.bams <- @@ -471,7 +471,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )])),] %>% + )])), ] %>% mutate( t_ref_count = 0, t_alt_count = 0, @@ -596,7 +596,7 @@ compile_reads <- function(master.ref, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2 - )]), ] + )]),] write.table( all.all.unique.mafs, paste0(results.dir, "/pooled/all_all_unique.maf"), @@ -729,7 +729,7 @@ if (!interactive()) { "-dmpak", "--dmpaccesskeypath", type = "character", - default = " /juno/res/dmpcollab/dmprequest/ACCESS-12-245/key.txt", + default = "/juno/res/dmpcollab/dmprequest/ACCESS-12-245/key.txt", help = "DMP mirror BAM key file for MSK-ACCESS [default]" ) args <- parser$parse_args() From 0ab79fc43993c6abaf1be45a7adfe6a7d4595d19 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Thu, 2 Feb 2023 10:48:04 -0500 Subject: [PATCH 117/126] Update compile_reads.R --- R/compile_reads.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index cf85112..6bfd009 100644 --- a/R/compile_reads.R 
+++ b/R/compile_reads.R @@ -222,11 +222,11 @@ compile_reads <- function(master.ref, } else if (is.null(dmp.sample.sheet) & !is.null(access.sample.sheet)) { - print("DMP IMPACT samples are available and DMP ACCESS samples are NOT available") + print("DMP IMPACT samples are NOT available and DMP ACCESS samples are available") dmp.sample.sheet <- access.sample.sheet } else if (!is.null(dmp.sample.sheet) & is.null(access.sample.sheet)) { - print("DMP IMPACT samples are NOT available and DMP ACCESS samples are available") + print("DMP IMPACT samples are available and DMP ACCESS samples are NOT available") dmp.sample.sheet <- dmp.sample.sheet } else{ print("No DMP IMPACT samples or DMP ACCESS samples are available") From 484279505fc59903ec1e4c58e14249c768351cd8 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Thu, 2 Feb 2023 21:26:13 -0500 Subject: [PATCH 118/126] Update filter_calls.R --- R/filter_calls.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/filter_calls.R b/R/filter_calls.R index 32c5369..1f99e70 100644 --- a/R/filter_calls.R +++ b/R/filter_calls.R @@ -195,6 +195,7 @@ filter_calls = function( # germline filtering for matched and unmatched ---------------------------- plasma.samples <- sample.sheet[Sample_Type %in% c('duplex')]$column.names + print(plasma.samples) normal.samples <- sample.sheet[Sample_Type %in% c('unfilterednormal','normal_DMP')]$column.names fillouts.dt[,c( paste0(plasma.samples,'.called') From b0647fb9deb793ce19dec5b9dff97d822091890a Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Thu, 2 Feb 2023 21:30:49 -0500 Subject: [PATCH 119/126] Update filter_calls.R --- R/filter_calls.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/filter_calls.R b/R/filter_calls.R index 1f99e70..f2065d7 100644 --- a/R/filter_calls.R +++ b/R/filter_calls.R @@ -51,11 +51,13 @@ filter_calls = function( # compiling a sample sheet with duplex, simplex, normal, DMP tumor and DMP normal sample.sheet <- 
fread(paste0(results.dir,'/',x,'/',x,'_sample_sheet.tsv'))[,.(Sample_Barcode,cmo_patient_id,Sample_Type)] + print(sample.sheet) + print("Duplex-SIMPEX") simplex.sample.sheet = sample.sheet[Sample_Type == 'duplex',.(Sample_Barcode,cmo_patient_id,Sample_Type = 'simplex')] sample.sheet = rbind(sample.sheet,simplex.sample.sheet) %>% mutate(column.names = paste0(Sample_Barcode,'___',Sample_Type)) %>% data.table() - + print(sample.sheet) # compiling different genotype files from step 1 fillouts.dt <- do.call(rbind,lapply(fillouts.filenames,function(y){ sample.name = gsub('.*./|-ORG.*.','',y) From 06a32f84d46ef7bb1046f8c5812bafab131af225 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Thu, 2 Feb 2023 21:42:03 -0500 Subject: [PATCH 120/126] Update compile_Reads --- R/compile_reads.R | 2 +- R/filter_calls.R | 776 +++++++++++++++++++++++++++++++--------------- 2 files changed, 529 insertions(+), 249 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 6bfd009..142b39a 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -102,7 +102,7 @@ compile_reads <- function(master.ref, all.dmp.bam.ids.XS <- gsub("-standard|-unfilter|-simplex|-duplex", "", - access.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V2) + access.key[grepl(paste0(dmp_id, "-T..-XS."), V1)]$V2) all.dmp.bam.ids.normal.XS <- gsub("-standard|-unfilter|-simplex|-duplex", "", diff --git a/R/filter_calls.R b/R/filter_calls.R index f2065d7..6ecfed3 100644 --- a/R/filter_calls.R +++ b/R/filter_calls.R @@ -4,11 +4,10 @@ # library(dplyr) #' @export -filter_calls = function( - master.ref,results.dir, - CH.path = '/juno/work/access/production/resources/dmp_signedout_CH/current/signedout_CH.txt', - criteria = 'stringent' -){ +filter_calls = function(master.ref, + results.dir, + CH.path = '/juno/work/access/production/resources/dmp_signedout_CH/current/signedout_CH.txt', + criteria = 'stringent') { # # test input section ----------------------------------------------------------- # master.ref = 
fread('/juno/work/bergerm1/bergerlab/zhengy1/access_data_analysis/data/example_master_file.csv') # results.dir = paste0('/juno/work/bergerm1/MSK-ACCESS/ACCESS-Projects/test_access/access_data_analysis/output_042020/') @@ -18,273 +17,535 @@ filter_calls = function( # criteria <- 'stringent' # # criteria definition ----------------------------------------------------- - if(criteria == 'permissive'){ + if (criteria == 'permissive') { hotspot.support <- 1 non.hotspot.support <- 3 - }else{ + } else{ hotspot.support <- 3 non.hotspot.support <- 5 } - dir.create(paste0(results.dir,'/results_',criteria)) + dir.create(paste0(results.dir, '/results_', criteria)) # inputs --------------------------------------------------------------- # DMP.key <- fread(dmp.key.path) CH.calls = fread(CH.path) pooled.normal.mafs <- - fread(paste0(results.dir,'/pooled/all_all_unique.maf')) %>% - mutate(Tumor_Sample_Barcode = paste0(Tumor_Sample_Barcode,'___pooled')) %>% - select(Hugo_Symbol,Tumor_Sample_Barcode,Chromosome,Start_Position,End_Position,Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2,t_alt_count) %>% - group_by(Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Reference_Allele,Tumor_Seq_Allele2) %>% + fread(paste0(results.dir, '/pooled/all_all_unique.maf')) %>% + mutate(Tumor_Sample_Barcode = paste0(Tumor_Sample_Barcode, '___pooled')) %>% + select( + Hugo_Symbol, + Tumor_Sample_Barcode, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2, + t_alt_count + ) %>% + group_by( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + Reference_Allele, + Tumor_Seq_Allele2 + ) %>% summarise(duplex_support_num = length(which(t_alt_count >= 2))) %>% - filter(duplex_support_num > 0,.preserve = T) %>% - transmute(Hugo_Symbol, Chromosome=as.character(Chromosome), Start_Position, End_Position, Variant_Classification, Reference_Allele, 
Tumor_Seq_Allele2, duplex_support_num) %>% + filter(duplex_support_num > 0, .preserve = T) %>% + transmute( + Hugo_Symbol, + Chromosome = as.character(Chromosome), + Start_Position, + End_Position, + Variant_Classification, + Reference_Allele, + Tumor_Seq_Allele2, + duplex_support_num + ) %>% data.table() # for each patient produce the correct results ---------------------------- # x <- unique(master.ref$cmo_patient_id)[1] - all.fillout.dim <- lapply(unique(master.ref$cmo_patient_id),function(x){ - print(paste0('Processing patient ',x)) - - # Inputs and sanity checks ------------------------------------------------ - fillouts.filenames <- list.files(paste0(results.dir,'/',x,'/'),'ORG-STD_genotyped.maf|ORG-SIMPLEX-DUPLEX_genotyped.maf',full.names = T) - - # compiling a sample sheet with duplex, simplex, normal, DMP tumor and DMP normal - sample.sheet <- fread(paste0(results.dir,'/',x,'/',x,'_sample_sheet.tsv'))[,.(Sample_Barcode,cmo_patient_id,Sample_Type)] - print(sample.sheet) - print("Duplex-SIMPEX") - simplex.sample.sheet = sample.sheet[Sample_Type == 'duplex',.(Sample_Barcode,cmo_patient_id,Sample_Type = 'simplex')] - sample.sheet = rbind(sample.sheet,simplex.sample.sheet) %>% - mutate(column.names = paste0(Sample_Barcode,'___',Sample_Type)) %>% - data.table() - print(sample.sheet) - # compiling different genotype files from step 1 - fillouts.dt <- do.call(rbind,lapply(fillouts.filenames,function(y){ - sample.name = gsub('.*./|-ORG.*.','',y) - sample.type = unique(sample.sheet[Sample_Barcode == sample.name]$Sample_Type) - - # t_alt_count,t_ref_count,t_depth these columns are useless, have to use duplex/simplex/standard columms - maf.file <- fread(y) %>% - select(-c(t_alt_count,t_ref_count)) %>% + all.fillout.dim <- + lapply(unique(master.ref$cmo_patient_id), function(x) { + print(paste0('Processing patient ', x)) + + # Inputs and sanity checks ------------------------------------------------ + fillouts.filenames <- + list.files( + paste0(results.dir, '/', x, 
'/'), + 'ORG-STD_genotyped.maf|ORG-SIMPLEX-DUPLEX_genotyped.maf', + full.names = T + ) + + # compiling a sample sheet with duplex, simplex, normal, DMP tumor and DMP normal + sample.sheet <- + fread(paste0(results.dir, '/', x, '/', x, '_sample_sheet.tsv'))[, .(Sample_Barcode, cmo_patient_id, Sample_Type)] + simplex.sample.sheet = sample.sheet[Sample_Type == 'duplex', .(Sample_Barcode, cmo_patient_id, Sample_Type = 'simplex')] + sample.sheet = rbind(sample.sheet, simplex.sample.sheet) %>% + mutate(column.names = paste0(Sample_Barcode, '___', Sample_Type)) %>% data.table() - if (nrow(maf.file) == 0) { - - columns <- c( - "Hugo_Symbol", "Tumor_Sample_Barcode", "Chromosome", "Start_Position", - "End_Position", "Variant_Classification", "HGVSp_Short", - "Reference_Allele", "Tumor_Seq_Allele2", "t_var_freq", "ExAC_AF") - df <- data.frame(matrix(ncol = length(columns), nrow = 0)) - colnames(df) <- columns - - return(df) - } + # compiling different genotype files from step 1 + fillouts.dt <- + do.call(rbind, lapply(fillouts.filenames, function(y) { + sample.name = gsub('.*./|-ORG.*.', '', y) + sample.type = unique(sample.sheet[Sample_Barcode == sample.name]$Sample_Type) + + # t_alt_count,t_ref_count,t_depth these columns are useless, have to use duplex/simplex/standard columms + maf.file <- fread(y) %>% + select(-c(t_alt_count, t_ref_count)) %>% + data.table() + + if (nrow(maf.file) == 0) { + columns <- c( + "Hugo_Symbol", + "Tumor_Sample_Barcode", + "Chromosome", + "Start_Position", + "End_Position", + "Variant_Classification", + "HGVSp_Short", + "Reference_Allele", + "Tumor_Seq_Allele2", + "t_var_freq", + "ExAC_AF" + ) + df <- data.frame(matrix(ncol = length(columns), nrow = 0)) + colnames(df) <- columns + + return(df) + } + + # fragment counts replacing actual allele counts + if (grepl('SIMPLEX-DUPLEX_genotyped', y)) { + melt.id.vars = colnames(maf.file)[!grepl('fragment', colnames(maf.file))] + + # get rid of simplex duplex aggregate columns + maf.file %>% + 
select(-c(contains('simplex_duplex'))) %>% + # melting and dcasting columns back but separating duplex and simplex columns + # t_alt_duplex, t_depth_duplex, t_alt_simplex, t_depth_simplex --> t_alt, t_depth + melt.data.table( + id.vars = melt.id.vars, + variable.name = 'variable', + value.name = 'value' + ) %>% + mutate(variable = gsub('fragment', '_', variable)) %>% + separate(variable, c('variable', 'Sample_Type'), sep = '___') %>% + mutate(Tumor_Sample_Barcode = paste0(sample.name, '___', Sample_Type)) %>% + select(-Sample_Type) %>% + data.table() %>% + unique() %>% + dcast.data.table(as.formula(paste0( + paste0(melt.id.vars, collapse = ' + '), ' ~ variable' + )), value.var = 'value') -> maf.file + } else{ + maf.file <- maf.file %>% + mutate(Tumor_Sample_Barcode = paste0(sample.name, '___', sample.type)) %>% + # swaping the t_alt_count(etc)_standard for t_alt_count(etc) + mutate(t_alt_count = t_alt_count_standard, t_total_count = t_total_count_standard) + } + + maf.file = maf.file %>% + mutate(t_var_freq = paste0( + t_alt_count, + '/', + t_total_count, + '(', + round(t_alt_count / t_total_count, 4), + ')' + )) %>% + transmute( + Hugo_Symbol, + Tumor_Sample_Barcode, + Chromosome = as.character(Chromosome), + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short = as.character(HGVSp_Short), + Reference_Allele, + Tumor_Seq_Allele2, + t_var_freq, + ExAC_AF + ) %>% + data.table() + + return(maf.file) + })) %>% + unique() %>% + data.table() - # fragment counts replacing actual allele counts - if(grepl('SIMPLEX-DUPLEX_genotyped',y)){ - melt.id.vars = colnames(maf.file)[!grepl('fragment',colnames(maf.file))] - - # get rid of simplex duplex aggregate columns - maf.file %>% - select(-c(contains('simplex_duplex'))) %>% - # melting and dcasting columns back but separating duplex and simplex columns - # t_alt_duplex, t_depth_duplex, t_alt_simplex, t_depth_simplex --> t_alt, t_depth - melt.data.table(id.vars = melt.id.vars,variable.name = 
'variable',value.name = 'value') %>% - mutate(variable = gsub('fragment','_',variable)) %>% - separate(variable,c('variable','Sample_Type'),sep = '___') %>% - mutate(Tumor_Sample_Barcode = paste0(sample.name,'___',Sample_Type)) %>% - select(-Sample_Type) %>% - data.table() %>% - unique() %>% - dcast.data.table(as.formula(paste0(paste0(melt.id.vars,collapse = ' + '),' ~ variable')),value.var = 'value') -> maf.file - }else{ - maf.file <- maf.file %>% - mutate(Tumor_Sample_Barcode = paste0(sample.name,'___',sample.type)) %>% - # swaping the t_alt_count(etc)_standard for t_alt_count(etc) - mutate(t_alt_count = t_alt_count_standard,t_total_count = t_total_count_standard) - } + # merging and melting ----------------------------------------------------- + hotspot.maf <- + fread(paste0( + results.dir, + '/', + x, + '/', + x, + '_all_unique_calls_hotspots.maf' + )) %>% + rowwise() %>% + transmute( + Hugo_Symbol, + Chromosome = as.character(Chromosome), + Start_Position, + End_Position, + Variant_Classification, + # HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2, + Hotspot = ifelse(hotspot_whitelist, 'Hotspot', NA) + ) %>% + data.table() - maf.file = maf.file %>% - mutate(t_var_freq = paste0(t_alt_count,'/',t_total_count,'(',round(t_alt_count/t_total_count,4),')')) %>% - transmute(Hugo_Symbol,Tumor_Sample_Barcode,Chromosome = as.character(Chromosome),Start_Position,End_Position,Variant_Classification, - HGVSp_Short=as.character(HGVSp_Short),Reference_Allele,Tumor_Seq_Allele2,t_var_freq,ExAC_AF) %>% + dmp.maf <- + fread(paste0(results.dir, '/', x, '/', x, '_impact_calls.maf')) %>% + transmute( + Hugo_Symbol, + Chromosome = as.character(Chromosome), + Start_Position, + End_Position, + Variant_Classification, + # HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2 + ) %>% + mutate(DMP = 'Signed out') %>% + unique() %>% data.table() - return(maf.file) - })) %>% - unique() %>% - data.table() - - # merging and melting ----------------------------------------------------- - 
hotspot.maf <- fread(paste0(results.dir,'/',x,'/',x,'_all_unique_calls_hotspots.maf')) %>% - rowwise() %>% - transmute(Hugo_Symbol,Chromosome = as.character(Chromosome),Start_Position,End_Position,Variant_Classification, - # HGVSp_Short, - Reference_Allele,Tumor_Seq_Allele2,Hotspot = ifelse(hotspot_whitelist,'Hotspot',NA)) %>% - data.table() - - dmp.maf <- fread(paste0(results.dir,'/',x,'/',x,'_impact_calls.maf')) %>% - transmute(Hugo_Symbol,Chromosome = as.character(Chromosome),Start_Position,End_Position,Variant_Classification, - # HGVSp_Short, - Reference_Allele,Tumor_Seq_Allele2) %>% - mutate(DMP = 'Signed out') %>% - unique() %>% - data.table() - - if((nrow(dmp.maf) > 0) && (nrow(fillouts.dt) > 0)){ + if ((nrow(dmp.maf) > 0) && (nrow(fillouts.dt) > 0)) { fillouts.dt <- fillouts.dt %>% - dcast.data.table(Hugo_Symbol + Chromosome + Start_Position + End_Position + Variant_Classification + - HGVSp_Short + Reference_Allele + Tumor_Seq_Allele2 + ExAC_AF ~ Tumor_Sample_Barcode, - value.var = 't_var_freq') %>% + dcast.data.table( + Hugo_Symbol + Chromosome + Start_Position + End_Position + Variant_Classification + + HGVSp_Short + Reference_Allele + Tumor_Seq_Allele2 + ExAC_AF ~ Tumor_Sample_Barcode, + value.var = 't_var_freq' + ) %>% # hotspot information merge( hotspot.maf, - by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'), - all.x = T) %>% + by = c( + 'Hugo_Symbol', + 'Chromosome', + 'Start_Position', + 'End_Position', + 'Variant_Classification', + 'Reference_Allele', + 'Tumor_Seq_Allele2' + ), + all.x = T + ) %>% # Identifying signed out calls merge( dmp.maf, - by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'), - all.x = T) %>% + by = c( + 'Hugo_Symbol', + 'Chromosome', + 'Start_Position', + 'End_Position', + 'Variant_Classification', + 'Reference_Allele', + 'Tumor_Seq_Allele2' + ), + all.x = T + ) %>% # 
pooled normal for systemic artifacts merge( pooled.normal.mafs, - by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'), - all.x = T) %>% + by = c( + 'Hugo_Symbol', + 'Chromosome', + 'Start_Position', + 'End_Position', + 'Variant_Classification', + 'Reference_Allele', + 'Tumor_Seq_Allele2' + ), + all.x = T + ) %>% data.table() - } else if (nrow(fillouts.dt) > 0){ - fillouts.dt <- fillouts.dt %>% - dcast.data.table( - Hugo_Symbol + Chromosome + Start_Position + End_Position + Variant_Classification + - HGVSp_Short + Reference_Allele + Tumor_Seq_Allele2 + ExAC_AF ~ Tumor_Sample_Barcode, - value.var = 't_var_freq') %>% - # hotspot information - merge( - hotspot.maf, - by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'), - all.x = T) %>% - # pooled normal for systemic artifacts - merge( - pooled.normal.mafs, - by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'), - all.x = T) %>% - mutate(DMP = NA) %>% - data.table() - } else { - - print(paste0("Found no tumor or DMP mutations for ", x, ". 
Writing an empty data.frame to CSV.")) - - # if fillouts.dt has no data, then add the needed columns with no data - fillouts.dt[,c("DMP", "Hotspot", "duplex_support_num", "call_confidence", "CH") := NA] - - fillouts.dt <- fillouts.dt %>% select( - Hugo_Symbol,Chromosome,Start_Position,End_Position, - Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2, - ExAC_AF,Hotspot,DMP,CH,duplex_support_num,call_confidence,sort(everything())) - - write.csv( - fillouts.dt, - paste0(results.dir,'/results_',criteria,'/',x,'_SNV_table.csv'), - row.names = F) + } else if (nrow(fillouts.dt) > 0) { + fillouts.dt <- fillouts.dt %>% + dcast.data.table( + Hugo_Symbol + Chromosome + Start_Position + End_Position + Variant_Classification + + HGVSp_Short + Reference_Allele + Tumor_Seq_Allele2 + ExAC_AF ~ Tumor_Sample_Barcode, + value.var = 't_var_freq' + ) %>% + # hotspot information + merge( + hotspot.maf, + by = c( + 'Hugo_Symbol', + 'Chromosome', + 'Start_Position', + 'End_Position', + 'Variant_Classification', + 'Reference_Allele', + 'Tumor_Seq_Allele2' + ), + all.x = T + ) %>% + # pooled normal for systemic artifacts + merge( + pooled.normal.mafs, + by = c( + 'Hugo_Symbol', + 'Chromosome', + 'Start_Position', + 'End_Position', + 'Variant_Classification', + 'Reference_Allele', + 'Tumor_Seq_Allele2' + ), + all.x = T + ) %>% + mutate(DMP = NA) %>% + data.table() + } else { + print( + paste0( + "Found no tumor or DMP mutations for ", + x, + ". Writing an empty data.frame to CSV." 
+ ) + ) + + # if fillouts.dt has no data, then add the needed columns with no data + fillouts.dt[, c("DMP", + "Hotspot", + "duplex_support_num", + "call_confidence", + "CH") := NA] + + fillouts.dt <- fillouts.dt %>% select( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2, + ExAC_AF, + Hotspot, + DMP, + CH, + duplex_support_num, + call_confidence, + sort(everything()) + ) + + write.csv( + fillouts.dt, + paste0( + results.dir, + '/results_', + criteria, + '/', + x, + '_SNV_table.csv' + ), + row.names = F + ) + + return() + } - return() - } + # Interesting cases where DMP signed out calls are artifacets + if (any(!is.na(fillouts.dt$DMP) & + !is.na(fillouts.dt$duplex_support_num))) { + print(paste0('Look at ', x, ' for DMP signed out plasma artifacts...')) + } - # Interesting cases where DMP signed out calls are artifacets - if(any(!is.na(fillouts.dt$DMP) & !is.na(fillouts.dt$duplex_support_num))){ - print(paste0('Look at ',x,' for DMP signed out plasma artifacts...')) - } + # germline filtering for matched and unmatched ---------------------------- + plasma.samples <- + sample.sheet[Sample_Type %in% c('duplex')]$column.names + normal.samples <- + sample.sheet[Sample_Type %in% c('unfilterednormal', 'normal_DMP')]$column.names + fillouts.dt[, c(paste0(plasma.samples, '.called') + # paste0(gsub('duplex','simplex',plasma.samples),'.called')) := 'Not Called'] + + # preliminary calling + # tmp.col.name <- plasma.samples[1] + lapply(plasma.samples, function(tmp.col.name) { + # genotyping (signed out stuff) + fillouts.dt[(as.numeric(gsub("/.*.$", '', get(tmp.col.name))) >= 1 | + as.numeric(gsub("/.*.$", '', get( + paste0(gsub('duplex', 'simplex', tmp.col.name)) + ))) > 1) & DMP == 'Signed out', + eval(paste0(tmp.col.name, '.called')) := 'Genotyped'] + # c(eval(paste0(tmp.col.name,'.called')),eval(paste0(gsub('duplex','simplex',tmp.col.name),'.called'))) := list('Called','Called')] 
+ # hotspot reads + fillouts.dt[as.numeric(gsub("/.*.$", '', get(tmp.col.name))) >= hotspot.support & + Hotspot == 'Hotspot', + eval(paste0(tmp.col.name, '.called')) := 'Called'] + # c(eval(paste0(tmp.col.name,'.called')),eval(paste0(gsub('duplex','simplex',tmp.col.name),'.called'))) := list('Called','Called')] + # non hotspot reads + fillouts.dt[as.numeric(gsub("/.*.$", '', get(tmp.col.name))) >= non.hotspot.support & + is.na(Hotspot), + eval(paste0(tmp.col.name, '.called')) := 'Called'] + # c(eval(paste0(tmp.col.name,'.called')),eval(paste0(gsub('duplex','simplex',tmp.col.name),'.called'))) := list('Called','Called')] + # print(table(fillouts.dt[,get(paste0(tmp.col.name,'.called'))])) + }) + + if (all(!c('unfilterednormal', 'normal_DMP') %in% sample.sheet$Sample_Type)) { + tmp.col.name <- plasma.samples[1] + lapply(plasma.samples, function(tmp.col.name) { + #fillouts.dt[as.numeric(gsub("\\(|\\)",'',str_extract(get(tmp.col.name),"\\(.*.\\)"))) >= 0.3 | ExAC_AF >= 0.0001,eval(paste0(tmp.col.name,'.called')) := 'Not Called'] + fillouts.dt[get(tmp.col.name) == '0/0(NaN)', eval(paste0(tmp.col.name, '.called')) := 'Not Covered'] + }) + } else{ + lapply(plasma.samples, function(tmp.col.name) { + lapply(normal.samples, function(tmp.col.name.normal) { + # duplex tvar/nvar > 5 + fillouts.dt[(as.numeric(gsub( + "\\(|\\)", '', str_extract(get(tmp.col.name), "\\(.*.\\)") + )) / as.numeric(gsub( + "\\(|\\)", '', str_extract(get(tmp.col.name.normal), "\\(.*.\\)") + )) < 2) | + # if duplex have no reads, use simplex tvar + (as.numeric(gsub( + "\\(|\\)", '', str_extract(get( + gsub('duplex', 'simplex', tmp.col.name) + ), "\\(.*.\\)") + )) / as.numeric(gsub( + "\\(|\\)", '', str_extract(get(tmp.col.name.normal), "\\(.*.\\)") + )) < 2 & + as.numeric(gsub("/.*.$", '', get( + tmp.col.name + ))) == 0), + eval(paste0(tmp.col.name, '.called')) := 'Not Called'] + fillouts.dt[get(tmp.col.name) == '0/0(NaN)', eval(paste0(tmp.col.name, '.called')) := 'Not Covered'] + }) + }) + } + + # final 
processing -------------------------------------------------------- + # Save only the useful column + #print(fillouts.dt) + #print("#######") + fillouts.dt <- + fillouts.dt[DMP == 'Signed out' | + fillouts.dt[, apply(.SD, 1, function(x) { + any(x == 'Called') + })]] + #print(fillouts.dt) + # combining duplex and simplex counts + lapply(plasma.samples, function(tmp.col.name) { + # hotspot reads + fillouts.dt[, eval(gsub('duplex', 'total', tmp.col.name)) := paste0( + as.numeric(gsub("/.*.$", '', get(tmp.col.name))) + as.numeric(gsub("/.*.$", '', get( + gsub('duplex', 'simplex', tmp.col.name) + ))), + '/', + as.numeric(gsub( + "^.*./|\\(.*.$", '', get(tmp.col.name) + )) + as.numeric(gsub("^.*./|\\(.*.$", '', get( + gsub('duplex', 'simplex', tmp.col.name) + ))), + '(', + round((as.numeric(gsub( + "/.*.$", '', get(tmp.col.name) + )) + as.numeric(gsub( + "/.*.$", '', get(gsub('duplex', 'simplex', tmp.col.name)) + ))) / + (as.numeric(gsub( + "^.*./|\\(.*.$", '', get(tmp.col.name) + )) + as.numeric(gsub( + "^.*./|\\(.*.$", '', get(gsub('duplex', 'simplex', tmp.col.name)) + ))), + 4 + ), + ')' + )] + fillouts.dt[, c(eval(gsub('duplex', 'simplex', tmp.col.name)), eval(tmp.col.name)) := list(NULL, NULL)] + }) + + fillouts.dt <- + fillouts.dt[, order(colnames(fillouts.dt)), with = F] %>% + # filter for artifacts + mutate(call_confidence = case_when( + (Hugo_Symbol == 'TERT' & + is.na(Hotspot)) | + ( + Hugo_Symbol == 'ERBB2' & grepl('[A-Z]90[0-9][A-Z]', HGVSp_Short) + ) | + (Hugo_Symbol == 'BRAF' & + grepl('711', HGVSp_Short)) | + ( + Hugo_Symbol == 'NF1' & + grepl('[A-Z]106[0-9][A-Z]', HGVSp_Short) + ) ~ 'Low', + DMP == 'Signed out' ~ 'High', + TRUE ~ '' + )) %>% + merge( + CH.calls[, .( + Hugo_Symbol = Gene, + Chromosome = Chrom, + Start_Position = Start, + Reference_Allele = Ref, + Tumor_Seq_Allele2 = Alt, + HGVSp_Short = AAchange, + Variant_Classification = VariantClass, + CH = 'Yes' + )], + by = c( + 'Hugo_Symbol', + 'Chromosome', + 'Start_Position', + 
'Variant_Classification', + 'HGVSp_Short', + 'Reference_Allele', + 'Tumor_Seq_Allele2' + ), + all.x = T + ) %>% + mutate(CH = ifelse(is.na(CH), 'No', 'Yes')) %>% + select( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2, + ExAC_AF, + Hotspot, + DMP, + CH, + duplex_support_num, + call_confidence, + sort(everything()) + ) + + write.csv( + fillouts.dt, + paste0( + results.dir, + '/results_', + criteria, + '/', + x, + '_SNV_table.csv' + ), + row.names = F + ) + }) + + if (all(unlist(all.fillout.dim))) { + print('All dimension of fillout mafs for each patient looks correct') + } - # germline filtering for matched and unmatched ---------------------------- - plasma.samples <- sample.sheet[Sample_Type %in% c('duplex')]$column.names - print(plasma.samples) - normal.samples <- sample.sheet[Sample_Type %in% c('unfilterednormal','normal_DMP')]$column.names - fillouts.dt[,c( - paste0(plasma.samples,'.called') - # paste0(gsub('duplex','simplex',plasma.samples),'.called') - - ) := 'Not Called'] - - # preliminary calling - # tmp.col.name <- plasma.samples[1] - lapply(plasma.samples,function(tmp.col.name){ - # genotyping (signed out stuff) - fillouts.dt[(as.numeric(gsub("/.*.$",'',get(tmp.col.name))) >= 1 | as.numeric(gsub("/.*.$",'',get(paste0(gsub('duplex','simplex',tmp.col.name))))) > 1) & DMP == 'Signed out', - eval(paste0(tmp.col.name,'.called')) := 'Genotyped'] - # c(eval(paste0(tmp.col.name,'.called')),eval(paste0(gsub('duplex','simplex',tmp.col.name),'.called'))) := list('Called','Called')] - # hotspot reads - fillouts.dt[as.numeric(gsub("/.*.$",'',get(tmp.col.name))) >= hotspot.support & Hotspot == 'Hotspot', - eval(paste0(tmp.col.name,'.called')) := 'Called'] - # c(eval(paste0(tmp.col.name,'.called')),eval(paste0(gsub('duplex','simplex',tmp.col.name),'.called'))) := list('Called','Called')] - # non hotspot reads - fillouts.dt[as.numeric(gsub("/.*.$",'',get(tmp.col.name))) >= 
non.hotspot.support & is.na(Hotspot), - eval(paste0(tmp.col.name,'.called')) := 'Called'] - # c(eval(paste0(tmp.col.name,'.called')),eval(paste0(gsub('duplex','simplex',tmp.col.name),'.called'))) := list('Called','Called')] - # print(table(fillouts.dt[,get(paste0(tmp.col.name,'.called'))])) - }) - - if(all(!c('unfilterednormal','normal_DMP') %in% sample.sheet$Sample_Type)){ - tmp.col.name <- plasma.samples[1] - lapply(plasma.samples,function(tmp.col.name){ - #fillouts.dt[as.numeric(gsub("\\(|\\)",'',str_extract(get(tmp.col.name),"\\(.*.\\)"))) >= 0.3 | ExAC_AF >= 0.0001,eval(paste0(tmp.col.name,'.called')) := 'Not Called'] - fillouts.dt[get(tmp.col.name) == '0/0(NaN)',eval(paste0(tmp.col.name,'.called')) := 'Not Covered'] - }) - }else{ - lapply(plasma.samples,function(tmp.col.name){ - lapply(normal.samples,function(tmp.col.name.normal){ - # duplex tvar/nvar > 5 - fillouts.dt[(as.numeric(gsub("\\(|\\)",'',str_extract(get(tmp.col.name),"\\(.*.\\)")))/as.numeric(gsub("\\(|\\)",'',str_extract(get(tmp.col.name.normal),"\\(.*.\\)"))) < 2) | - # if duplex have no reads, use simplex tvar - (as.numeric(gsub("\\(|\\)",'',str_extract(get(gsub('duplex','simplex',tmp.col.name)),"\\(.*.\\)")))/as.numeric(gsub("\\(|\\)",'',str_extract(get(tmp.col.name.normal),"\\(.*.\\)"))) < 2 & - as.numeric(gsub("/.*.$",'',get(tmp.col.name))) == 0), - eval(paste0(tmp.col.name,'.called')) := 'Not Called'] - fillouts.dt[get(tmp.col.name) == '0/0(NaN)',eval(paste0(tmp.col.name,'.called')) := 'Not Covered'] - }) - }) } - # final processing -------------------------------------------------------- - # Save only the useful column - #print(fillouts.dt) - #print("#######") - fillouts.dt <- fillouts.dt[DMP == 'Signed out' | fillouts.dt[,apply(.SD,1,function(x){any(x == 'Called')})]] - #print(fillouts.dt) - # combining duplex and simplex counts - lapply(plasma.samples,function(tmp.col.name){ - # hotspot reads - fillouts.dt[,eval(gsub('duplex','total',tmp.col.name)) := paste0( - 
as.numeric(gsub("/.*.$",'',get(tmp.col.name)))+as.numeric(gsub("/.*.$",'',get(gsub('duplex','simplex',tmp.col.name)))),'/', - as.numeric(gsub("^.*./|\\(.*.$",'',get(tmp.col.name)))+as.numeric(gsub("^.*./|\\(.*.$",'',get(gsub('duplex','simplex',tmp.col.name)))),'(', - round((as.numeric(gsub("/.*.$",'',get(tmp.col.name)))+as.numeric(gsub("/.*.$",'',get(gsub('duplex','simplex',tmp.col.name)))))/ - (as.numeric(gsub("^.*./|\\(.*.$",'',get(tmp.col.name)))+as.numeric(gsub("^.*./|\\(.*.$",'',get(gsub('duplex','simplex',tmp.col.name))))),4),')' - )] - fillouts.dt[,c(eval(gsub('duplex','simplex',tmp.col.name)),eval(tmp.col.name)):= list(NULL,NULL)] - }) - - fillouts.dt <- fillouts.dt[,order(colnames(fillouts.dt)),with = F] %>% - # filter for artifacts - mutate(call_confidence = case_when( - (Hugo_Symbol == 'TERT' & is.na(Hotspot)) | (Hugo_Symbol == 'ERBB2' & grepl('[A-Z]90[0-9][A-Z]',HGVSp_Short)) | - (Hugo_Symbol == 'BRAF' & grepl('711',HGVSp_Short)) | (Hugo_Symbol == 'NF1' & grepl('[A-Z]106[0-9][A-Z]',HGVSp_Short)) ~ 'Low', - DMP == 'Signed out' ~ 'High', - TRUE ~ '' - )) %>% - merge(CH.calls[,.(Hugo_Symbol = Gene,Chromosome = Chrom,Start_Position = Start,Reference_Allele = Ref,Tumor_Seq_Allele2 = Alt,HGVSp_Short = AAchange,Variant_Classification = VariantClass,CH = 'Yes')], - by = c('Hugo_Symbol','Chromosome','Start_Position','Variant_Classification','HGVSp_Short','Reference_Allele','Tumor_Seq_Allele2'), - all.x = T) %>% - mutate(CH = ifelse(is.na(CH),'No','Yes')) %>% - select(Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2, - ExAC_AF,Hotspot,DMP,CH,duplex_support_num,call_confidence,sort(everything())) - - write.csv(fillouts.dt,paste0(results.dir,'/results_',criteria,'/',x,'_SNV_table.csv'),row.names = F) - }) - - if(all(unlist(all.fillout.dim))){ - print('All dimension of fillout mafs for each patient looks correct') - } - -} - # Executable 
----------------------------------------------------------------------------------------------------------- suppressPackageStartupMessages({ library(data.table) @@ -295,28 +556,47 @@ suppressPackageStartupMessages({ }) if (!interactive()) { - - parser=ArgumentParser() - parser$add_argument('-m', '--masterref', type='character', help='File path to master reference file') - parser$add_argument('-o', '--resultsdir', type='character', help='Output directory') - parser$add_argument('-ch', '--chlist', type='character', default = '/juno/work/access/production/resources/dmp_signedout_CH/current/signedout_CH.txt', - help='List of signed out CH calls [default]') - parser$add_argument('-c', '--criteria', type='character', default = 'stringent', - help='Calling criteria [default]') - args=parser$parse_args() + parser = ArgumentParser() + parser$add_argument('-m', '--masterref', type = 'character', help = 'File path to master reference file') + parser$add_argument('-o', '--resultsdir', type = 'character', help = 'Output directory') + parser$add_argument( + '-ch', + '--chlist', + type = 'character', + default = '/juno/work/access/production/resources/dmp_signedout_CH/current/signedout_CH.txt', + help = 'List of signed out CH calls [default]' + ) + parser$add_argument( + '-c', + '--criteria', + type = 'character', + default = 'stringent', + help = 'Calling criteria [default]' + ) + args = parser$parse_args() master.ref = args$masterref results.dir = args$resultsdir chlist = args$chlist criteria = args$criteria - cat(paste0(paste0(c(paste0(rep('-',15),collapse = ''),'Arguments input: ',master.ref,results.dir,chlist,criteria, - paste0(rep('-',15),collapse = '')),collapse = "\n"),'\n')) - - if(!criteria %in% c('stringent','permissive')){ + cat(paste0(paste0( + c( + paste0(rep('-', 15), collapse = ''), + 'Arguments input: ', + master.ref, + results.dir, + chlist, + criteria, + paste0(rep('-', 15), collapse = '') + ), + collapse = "\n" + ), '\n')) + + if (!criteria %in% c('stringent', 
'permissive')) { stop('Criteria argument should be either stringent or permissive') } - suppressWarnings(filter_calls(fread(master.ref),results.dir,chlist,criteria)) + suppressWarnings(filter_calls(fread(master.ref), results.dir, chlist, criteria)) } From f4e6a7362351e52adba55323354e41934d6061bc Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Fri, 3 Feb 2023 08:20:46 -0500 Subject: [PATCH 121/126] Update filter_calls.R --- R/filter_calls.R | 773 +++++++++++++++-------------------------------- 1 file changed, 245 insertions(+), 528 deletions(-) diff --git a/R/filter_calls.R b/R/filter_calls.R index 6ecfed3..32c5369 100644 --- a/R/filter_calls.R +++ b/R/filter_calls.R @@ -4,10 +4,11 @@ # library(dplyr) #' @export -filter_calls = function(master.ref, - results.dir, - CH.path = '/juno/work/access/production/resources/dmp_signedout_CH/current/signedout_CH.txt', - criteria = 'stringent') { +filter_calls = function( + master.ref,results.dir, + CH.path = '/juno/work/access/production/resources/dmp_signedout_CH/current/signedout_CH.txt', + criteria = 'stringent' +){ # # test input section ----------------------------------------------------------- # master.ref = fread('/juno/work/bergerm1/bergerlab/zhengy1/access_data_analysis/data/example_master_file.csv') # results.dir = paste0('/juno/work/bergerm1/MSK-ACCESS/ACCESS-Projects/test_access/access_data_analysis/output_042020/') @@ -17,535 +18,270 @@ filter_calls = function(master.ref, # criteria <- 'stringent' # # criteria definition ----------------------------------------------------- - if (criteria == 'permissive') { + if(criteria == 'permissive'){ hotspot.support <- 1 non.hotspot.support <- 3 - } else{ + }else{ hotspot.support <- 3 non.hotspot.support <- 5 } - dir.create(paste0(results.dir, '/results_', criteria)) + dir.create(paste0(results.dir,'/results_',criteria)) # inputs --------------------------------------------------------------- # DMP.key <- fread(dmp.key.path) CH.calls = fread(CH.path) pooled.normal.mafs <- - 
fread(paste0(results.dir, '/pooled/all_all_unique.maf')) %>% - mutate(Tumor_Sample_Barcode = paste0(Tumor_Sample_Barcode, '___pooled')) %>% - select( - Hugo_Symbol, - Tumor_Sample_Barcode, - Chromosome, - Start_Position, - End_Position, - Variant_Classification, - HGVSp_Short, - Reference_Allele, - Tumor_Seq_Allele2, - t_alt_count - ) %>% - group_by( - Hugo_Symbol, - Chromosome, - Start_Position, - End_Position, - Variant_Classification, - Reference_Allele, - Tumor_Seq_Allele2 - ) %>% + fread(paste0(results.dir,'/pooled/all_all_unique.maf')) %>% + mutate(Tumor_Sample_Barcode = paste0(Tumor_Sample_Barcode,'___pooled')) %>% + select(Hugo_Symbol,Tumor_Sample_Barcode,Chromosome,Start_Position,End_Position,Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2,t_alt_count) %>% + group_by(Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Reference_Allele,Tumor_Seq_Allele2) %>% summarise(duplex_support_num = length(which(t_alt_count >= 2))) %>% - filter(duplex_support_num > 0, .preserve = T) %>% - transmute( - Hugo_Symbol, - Chromosome = as.character(Chromosome), - Start_Position, - End_Position, - Variant_Classification, - Reference_Allele, - Tumor_Seq_Allele2, - duplex_support_num - ) %>% + filter(duplex_support_num > 0,.preserve = T) %>% + transmute(Hugo_Symbol, Chromosome=as.character(Chromosome), Start_Position, End_Position, Variant_Classification, Reference_Allele, Tumor_Seq_Allele2, duplex_support_num) %>% data.table() # for each patient produce the correct results ---------------------------- # x <- unique(master.ref$cmo_patient_id)[1] - all.fillout.dim <- - lapply(unique(master.ref$cmo_patient_id), function(x) { - print(paste0('Processing patient ', x)) - - # Inputs and sanity checks ------------------------------------------------ - fillouts.filenames <- - list.files( - paste0(results.dir, '/', x, '/'), - 'ORG-STD_genotyped.maf|ORG-SIMPLEX-DUPLEX_genotyped.maf', - full.names = T - ) - - # compiling a sample sheet with 
duplex, simplex, normal, DMP tumor and DMP normal - sample.sheet <- - fread(paste0(results.dir, '/', x, '/', x, '_sample_sheet.tsv'))[, .(Sample_Barcode, cmo_patient_id, Sample_Type)] - simplex.sample.sheet = sample.sheet[Sample_Type == 'duplex', .(Sample_Barcode, cmo_patient_id, Sample_Type = 'simplex')] - sample.sheet = rbind(sample.sheet, simplex.sample.sheet) %>% - mutate(column.names = paste0(Sample_Barcode, '___', Sample_Type)) %>% + all.fillout.dim <- lapply(unique(master.ref$cmo_patient_id),function(x){ + print(paste0('Processing patient ',x)) + + # Inputs and sanity checks ------------------------------------------------ + fillouts.filenames <- list.files(paste0(results.dir,'/',x,'/'),'ORG-STD_genotyped.maf|ORG-SIMPLEX-DUPLEX_genotyped.maf',full.names = T) + + # compiling a sample sheet with duplex, simplex, normal, DMP tumor and DMP normal + sample.sheet <- fread(paste0(results.dir,'/',x,'/',x,'_sample_sheet.tsv'))[,.(Sample_Barcode,cmo_patient_id,Sample_Type)] + simplex.sample.sheet = sample.sheet[Sample_Type == 'duplex',.(Sample_Barcode,cmo_patient_id,Sample_Type = 'simplex')] + sample.sheet = rbind(sample.sheet,simplex.sample.sheet) %>% + mutate(column.names = paste0(Sample_Barcode,'___',Sample_Type)) %>% + data.table() + + # compiling different genotype files from step 1 + fillouts.dt <- do.call(rbind,lapply(fillouts.filenames,function(y){ + sample.name = gsub('.*./|-ORG.*.','',y) + sample.type = unique(sample.sheet[Sample_Barcode == sample.name]$Sample_Type) + + # t_alt_count,t_ref_count,t_depth these columns are useless, have to use duplex/simplex/standard columms + maf.file <- fread(y) %>% + select(-c(t_alt_count,t_ref_count)) %>% data.table() - # compiling different genotype files from step 1 - fillouts.dt <- - do.call(rbind, lapply(fillouts.filenames, function(y) { - sample.name = gsub('.*./|-ORG.*.', '', y) - sample.type = unique(sample.sheet[Sample_Barcode == sample.name]$Sample_Type) - - # t_alt_count,t_ref_count,t_depth these columns are 
useless, have to use duplex/simplex/standard columms - maf.file <- fread(y) %>% - select(-c(t_alt_count, t_ref_count)) %>% - data.table() - - if (nrow(maf.file) == 0) { - columns <- c( - "Hugo_Symbol", - "Tumor_Sample_Barcode", - "Chromosome", - "Start_Position", - "End_Position", - "Variant_Classification", - "HGVSp_Short", - "Reference_Allele", - "Tumor_Seq_Allele2", - "t_var_freq", - "ExAC_AF" - ) - df <- data.frame(matrix(ncol = length(columns), nrow = 0)) - colnames(df) <- columns - - return(df) - } - - # fragment counts replacing actual allele counts - if (grepl('SIMPLEX-DUPLEX_genotyped', y)) { - melt.id.vars = colnames(maf.file)[!grepl('fragment', colnames(maf.file))] - - # get rid of simplex duplex aggregate columns - maf.file %>% - select(-c(contains('simplex_duplex'))) %>% - # melting and dcasting columns back but separating duplex and simplex columns - # t_alt_duplex, t_depth_duplex, t_alt_simplex, t_depth_simplex --> t_alt, t_depth - melt.data.table( - id.vars = melt.id.vars, - variable.name = 'variable', - value.name = 'value' - ) %>% - mutate(variable = gsub('fragment', '_', variable)) %>% - separate(variable, c('variable', 'Sample_Type'), sep = '___') %>% - mutate(Tumor_Sample_Barcode = paste0(sample.name, '___', Sample_Type)) %>% - select(-Sample_Type) %>% - data.table() %>% - unique() %>% - dcast.data.table(as.formula(paste0( - paste0(melt.id.vars, collapse = ' + '), ' ~ variable' - )), value.var = 'value') -> maf.file - } else{ - maf.file <- maf.file %>% - mutate(Tumor_Sample_Barcode = paste0(sample.name, '___', sample.type)) %>% - # swaping the t_alt_count(etc)_standard for t_alt_count(etc) - mutate(t_alt_count = t_alt_count_standard, t_total_count = t_total_count_standard) - } - - maf.file = maf.file %>% - mutate(t_var_freq = paste0( - t_alt_count, - '/', - t_total_count, - '(', - round(t_alt_count / t_total_count, 4), - ')' - )) %>% - transmute( - Hugo_Symbol, - Tumor_Sample_Barcode, - Chromosome = as.character(Chromosome), - Start_Position, - 
End_Position, - Variant_Classification, - HGVSp_Short = as.character(HGVSp_Short), - Reference_Allele, - Tumor_Seq_Allele2, - t_var_freq, - ExAC_AF - ) %>% - data.table() - - return(maf.file) - })) %>% - unique() %>% - data.table() + if (nrow(maf.file) == 0) { - # merging and melting ----------------------------------------------------- - hotspot.maf <- - fread(paste0( - results.dir, - '/', - x, - '/', - x, - '_all_unique_calls_hotspots.maf' - )) %>% - rowwise() %>% - transmute( - Hugo_Symbol, - Chromosome = as.character(Chromosome), - Start_Position, - End_Position, - Variant_Classification, - # HGVSp_Short, - Reference_Allele, - Tumor_Seq_Allele2, - Hotspot = ifelse(hotspot_whitelist, 'Hotspot', NA) - ) %>% - data.table() + columns <- c( + "Hugo_Symbol", "Tumor_Sample_Barcode", "Chromosome", "Start_Position", + "End_Position", "Variant_Classification", "HGVSp_Short", + "Reference_Allele", "Tumor_Seq_Allele2", "t_var_freq", "ExAC_AF") + df <- data.frame(matrix(ncol = length(columns), nrow = 0)) + colnames(df) <- columns + + return(df) + } - dmp.maf <- - fread(paste0(results.dir, '/', x, '/', x, '_impact_calls.maf')) %>% - transmute( - Hugo_Symbol, - Chromosome = as.character(Chromosome), - Start_Position, - End_Position, - Variant_Classification, - # HGVSp_Short, - Reference_Allele, - Tumor_Seq_Allele2 - ) %>% - mutate(DMP = 'Signed out') %>% - unique() %>% + # fragment counts replacing actual allele counts + if(grepl('SIMPLEX-DUPLEX_genotyped',y)){ + melt.id.vars = colnames(maf.file)[!grepl('fragment',colnames(maf.file))] + + # get rid of simplex duplex aggregate columns + maf.file %>% + select(-c(contains('simplex_duplex'))) %>% + # melting and dcasting columns back but separating duplex and simplex columns + # t_alt_duplex, t_depth_duplex, t_alt_simplex, t_depth_simplex --> t_alt, t_depth + melt.data.table(id.vars = melt.id.vars,variable.name = 'variable',value.name = 'value') %>% + mutate(variable = gsub('fragment','_',variable)) %>% + 
separate(variable,c('variable','Sample_Type'),sep = '___') %>% + mutate(Tumor_Sample_Barcode = paste0(sample.name,'___',Sample_Type)) %>% + select(-Sample_Type) %>% + data.table() %>% + unique() %>% + dcast.data.table(as.formula(paste0(paste0(melt.id.vars,collapse = ' + '),' ~ variable')),value.var = 'value') -> maf.file + }else{ + maf.file <- maf.file %>% + mutate(Tumor_Sample_Barcode = paste0(sample.name,'___',sample.type)) %>% + # swaping the t_alt_count(etc)_standard for t_alt_count(etc) + mutate(t_alt_count = t_alt_count_standard,t_total_count = t_total_count_standard) + } + + maf.file = maf.file %>% + mutate(t_var_freq = paste0(t_alt_count,'/',t_total_count,'(',round(t_alt_count/t_total_count,4),')')) %>% + transmute(Hugo_Symbol,Tumor_Sample_Barcode,Chromosome = as.character(Chromosome),Start_Position,End_Position,Variant_Classification, + HGVSp_Short=as.character(HGVSp_Short),Reference_Allele,Tumor_Seq_Allele2,t_var_freq,ExAC_AF) %>% data.table() - if ((nrow(dmp.maf) > 0) && (nrow(fillouts.dt) > 0)) { + return(maf.file) + })) %>% + unique() %>% + data.table() + + # merging and melting ----------------------------------------------------- + hotspot.maf <- fread(paste0(results.dir,'/',x,'/',x,'_all_unique_calls_hotspots.maf')) %>% + rowwise() %>% + transmute(Hugo_Symbol,Chromosome = as.character(Chromosome),Start_Position,End_Position,Variant_Classification, + # HGVSp_Short, + Reference_Allele,Tumor_Seq_Allele2,Hotspot = ifelse(hotspot_whitelist,'Hotspot',NA)) %>% + data.table() + + dmp.maf <- fread(paste0(results.dir,'/',x,'/',x,'_impact_calls.maf')) %>% + transmute(Hugo_Symbol,Chromosome = as.character(Chromosome),Start_Position,End_Position,Variant_Classification, + # HGVSp_Short, + Reference_Allele,Tumor_Seq_Allele2) %>% + mutate(DMP = 'Signed out') %>% + unique() %>% + data.table() + + if((nrow(dmp.maf) > 0) && (nrow(fillouts.dt) > 0)){ fillouts.dt <- fillouts.dt %>% - dcast.data.table( - Hugo_Symbol + Chromosome + Start_Position + End_Position + 
Variant_Classification + - HGVSp_Short + Reference_Allele + Tumor_Seq_Allele2 + ExAC_AF ~ Tumor_Sample_Barcode, - value.var = 't_var_freq' - ) %>% + dcast.data.table(Hugo_Symbol + Chromosome + Start_Position + End_Position + Variant_Classification + + HGVSp_Short + Reference_Allele + Tumor_Seq_Allele2 + ExAC_AF ~ Tumor_Sample_Barcode, + value.var = 't_var_freq') %>% # hotspot information merge( hotspot.maf, - by = c( - 'Hugo_Symbol', - 'Chromosome', - 'Start_Position', - 'End_Position', - 'Variant_Classification', - 'Reference_Allele', - 'Tumor_Seq_Allele2' - ), - all.x = T - ) %>% + by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'), + all.x = T) %>% # Identifying signed out calls merge( dmp.maf, - by = c( - 'Hugo_Symbol', - 'Chromosome', - 'Start_Position', - 'End_Position', - 'Variant_Classification', - 'Reference_Allele', - 'Tumor_Seq_Allele2' - ), - all.x = T - ) %>% + by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'), + all.x = T) %>% # pooled normal for systemic artifacts merge( pooled.normal.mafs, - by = c( - 'Hugo_Symbol', - 'Chromosome', - 'Start_Position', - 'End_Position', - 'Variant_Classification', - 'Reference_Allele', - 'Tumor_Seq_Allele2' - ), - all.x = T - ) %>% + by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'), + all.x = T) %>% data.table() - } else if (nrow(fillouts.dt) > 0) { - fillouts.dt <- fillouts.dt %>% - dcast.data.table( - Hugo_Symbol + Chromosome + Start_Position + End_Position + Variant_Classification + - HGVSp_Short + Reference_Allele + Tumor_Seq_Allele2 + ExAC_AF ~ Tumor_Sample_Barcode, - value.var = 't_var_freq' - ) %>% - # hotspot information - merge( - hotspot.maf, - by = c( - 'Hugo_Symbol', - 'Chromosome', - 'Start_Position', - 'End_Position', - 'Variant_Classification', - 
'Reference_Allele', - 'Tumor_Seq_Allele2' - ), - all.x = T - ) %>% - # pooled normal for systemic artifacts - merge( - pooled.normal.mafs, - by = c( - 'Hugo_Symbol', - 'Chromosome', - 'Start_Position', - 'End_Position', - 'Variant_Classification', - 'Reference_Allele', - 'Tumor_Seq_Allele2' - ), - all.x = T - ) %>% - mutate(DMP = NA) %>% - data.table() - } else { - print( - paste0( - "Found no tumor or DMP mutations for ", - x, - ". Writing an empty data.frame to CSV." - ) - ) - - # if fillouts.dt has no data, then add the needed columns with no data - fillouts.dt[, c("DMP", - "Hotspot", - "duplex_support_num", - "call_confidence", - "CH") := NA] - - fillouts.dt <- fillouts.dt %>% select( - Hugo_Symbol, - Chromosome, - Start_Position, - End_Position, - Variant_Classification, - HGVSp_Short, - Reference_Allele, - Tumor_Seq_Allele2, - ExAC_AF, - Hotspot, - DMP, - CH, - duplex_support_num, - call_confidence, - sort(everything()) - ) - - write.csv( - fillouts.dt, - paste0( - results.dir, - '/results_', - criteria, - '/', - x, - '_SNV_table.csv' - ), - row.names = F - ) - - return() - } + } else if (nrow(fillouts.dt) > 0){ + fillouts.dt <- fillouts.dt %>% + dcast.data.table( + Hugo_Symbol + Chromosome + Start_Position + End_Position + Variant_Classification + + HGVSp_Short + Reference_Allele + Tumor_Seq_Allele2 + ExAC_AF ~ Tumor_Sample_Barcode, + value.var = 't_var_freq') %>% + # hotspot information + merge( + hotspot.maf, + by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'), + all.x = T) %>% + # pooled normal for systemic artifacts + merge( + pooled.normal.mafs, + by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'), + all.x = T) %>% + mutate(DMP = NA) %>% + data.table() + } else { - # Interesting cases where DMP signed out calls are artifacets - if (any(!is.na(fillouts.dt$DMP) & - 
!is.na(fillouts.dt$duplex_support_num))) { - print(paste0('Look at ', x, ' for DMP signed out plasma artifacts...')) - } + print(paste0("Found no tumor or DMP mutations for ", x, ". Writing an empty data.frame to CSV.")) + + # if fillouts.dt has no data, then add the needed columns with no data + fillouts.dt[,c("DMP", "Hotspot", "duplex_support_num", "call_confidence", "CH") := NA] + + fillouts.dt <- fillouts.dt %>% select( + Hugo_Symbol,Chromosome,Start_Position,End_Position, + Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2, + ExAC_AF,Hotspot,DMP,CH,duplex_support_num,call_confidence,sort(everything())) + + write.csv( + fillouts.dt, + paste0(results.dir,'/results_',criteria,'/',x,'_SNV_table.csv'), + row.names = F) + + return() + } - # germline filtering for matched and unmatched ---------------------------- - plasma.samples <- - sample.sheet[Sample_Type %in% c('duplex')]$column.names - normal.samples <- - sample.sheet[Sample_Type %in% c('unfilterednormal', 'normal_DMP')]$column.names - fillouts.dt[, c(paste0(plasma.samples, '.called') - # paste0(gsub('duplex','simplex',plasma.samples),'.called')) := 'Not Called'] - - # preliminary calling - # tmp.col.name <- plasma.samples[1] - lapply(plasma.samples, function(tmp.col.name) { - # genotyping (signed out stuff) - fillouts.dt[(as.numeric(gsub("/.*.$", '', get(tmp.col.name))) >= 1 | - as.numeric(gsub("/.*.$", '', get( - paste0(gsub('duplex', 'simplex', tmp.col.name)) - ))) > 1) & DMP == 'Signed out', - eval(paste0(tmp.col.name, '.called')) := 'Genotyped'] - # c(eval(paste0(tmp.col.name,'.called')),eval(paste0(gsub('duplex','simplex',tmp.col.name),'.called'))) := list('Called','Called')] - # hotspot reads - fillouts.dt[as.numeric(gsub("/.*.$", '', get(tmp.col.name))) >= hotspot.support & - Hotspot == 'Hotspot', - eval(paste0(tmp.col.name, '.called')) := 'Called'] - # c(eval(paste0(tmp.col.name,'.called')),eval(paste0(gsub('duplex','simplex',tmp.col.name),'.called'))) := list('Called','Called')] - 
# non hotspot reads - fillouts.dt[as.numeric(gsub("/.*.$", '', get(tmp.col.name))) >= non.hotspot.support & - is.na(Hotspot), - eval(paste0(tmp.col.name, '.called')) := 'Called'] - # c(eval(paste0(tmp.col.name,'.called')),eval(paste0(gsub('duplex','simplex',tmp.col.name),'.called'))) := list('Called','Called')] - # print(table(fillouts.dt[,get(paste0(tmp.col.name,'.called'))])) - }) - - if (all(!c('unfilterednormal', 'normal_DMP') %in% sample.sheet$Sample_Type)) { - tmp.col.name <- plasma.samples[1] - lapply(plasma.samples, function(tmp.col.name) { - #fillouts.dt[as.numeric(gsub("\\(|\\)",'',str_extract(get(tmp.col.name),"\\(.*.\\)"))) >= 0.3 | ExAC_AF >= 0.0001,eval(paste0(tmp.col.name,'.called')) := 'Not Called'] - fillouts.dt[get(tmp.col.name) == '0/0(NaN)', eval(paste0(tmp.col.name, '.called')) := 'Not Covered'] - }) - } else{ - lapply(plasma.samples, function(tmp.col.name) { - lapply(normal.samples, function(tmp.col.name.normal) { - # duplex tvar/nvar > 5 - fillouts.dt[(as.numeric(gsub( - "\\(|\\)", '', str_extract(get(tmp.col.name), "\\(.*.\\)") - )) / as.numeric(gsub( - "\\(|\\)", '', str_extract(get(tmp.col.name.normal), "\\(.*.\\)") - )) < 2) | - # if duplex have no reads, use simplex tvar - (as.numeric(gsub( - "\\(|\\)", '', str_extract(get( - gsub('duplex', 'simplex', tmp.col.name) - ), "\\(.*.\\)") - )) / as.numeric(gsub( - "\\(|\\)", '', str_extract(get(tmp.col.name.normal), "\\(.*.\\)") - )) < 2 & - as.numeric(gsub("/.*.$", '', get( - tmp.col.name - ))) == 0), - eval(paste0(tmp.col.name, '.called')) := 'Not Called'] - fillouts.dt[get(tmp.col.name) == '0/0(NaN)', eval(paste0(tmp.col.name, '.called')) := 'Not Covered'] - }) - }) - } - - # final processing -------------------------------------------------------- - # Save only the useful column - #print(fillouts.dt) - #print("#######") - fillouts.dt <- - fillouts.dt[DMP == 'Signed out' | - fillouts.dt[, apply(.SD, 1, function(x) { - any(x == 'Called') - })]] - #print(fillouts.dt) - # combining duplex and 
simplex counts - lapply(plasma.samples, function(tmp.col.name) { - # hotspot reads - fillouts.dt[, eval(gsub('duplex', 'total', tmp.col.name)) := paste0( - as.numeric(gsub("/.*.$", '', get(tmp.col.name))) + as.numeric(gsub("/.*.$", '', get( - gsub('duplex', 'simplex', tmp.col.name) - ))), - '/', - as.numeric(gsub( - "^.*./|\\(.*.$", '', get(tmp.col.name) - )) + as.numeric(gsub("^.*./|\\(.*.$", '', get( - gsub('duplex', 'simplex', tmp.col.name) - ))), - '(', - round((as.numeric(gsub( - "/.*.$", '', get(tmp.col.name) - )) + as.numeric(gsub( - "/.*.$", '', get(gsub('duplex', 'simplex', tmp.col.name)) - ))) / - (as.numeric(gsub( - "^.*./|\\(.*.$", '', get(tmp.col.name) - )) + as.numeric(gsub( - "^.*./|\\(.*.$", '', get(gsub('duplex', 'simplex', tmp.col.name)) - ))), - 4 - ), - ')' - )] - fillouts.dt[, c(eval(gsub('duplex', 'simplex', tmp.col.name)), eval(tmp.col.name)) := list(NULL, NULL)] - }) - - fillouts.dt <- - fillouts.dt[, order(colnames(fillouts.dt)), with = F] %>% - # filter for artifacts - mutate(call_confidence = case_when( - (Hugo_Symbol == 'TERT' & - is.na(Hotspot)) | - ( - Hugo_Symbol == 'ERBB2' & grepl('[A-Z]90[0-9][A-Z]', HGVSp_Short) - ) | - (Hugo_Symbol == 'BRAF' & - grepl('711', HGVSp_Short)) | - ( - Hugo_Symbol == 'NF1' & - grepl('[A-Z]106[0-9][A-Z]', HGVSp_Short) - ) ~ 'Low', - DMP == 'Signed out' ~ 'High', - TRUE ~ '' - )) %>% - merge( - CH.calls[, .( - Hugo_Symbol = Gene, - Chromosome = Chrom, - Start_Position = Start, - Reference_Allele = Ref, - Tumor_Seq_Allele2 = Alt, - HGVSp_Short = AAchange, - Variant_Classification = VariantClass, - CH = 'Yes' - )], - by = c( - 'Hugo_Symbol', - 'Chromosome', - 'Start_Position', - 'Variant_Classification', - 'HGVSp_Short', - 'Reference_Allele', - 'Tumor_Seq_Allele2' - ), - all.x = T - ) %>% - mutate(CH = ifelse(is.na(CH), 'No', 'Yes')) %>% - select( - Hugo_Symbol, - Chromosome, - Start_Position, - End_Position, - Variant_Classification, - HGVSp_Short, - Reference_Allele, - Tumor_Seq_Allele2, - ExAC_AF, - 
Hotspot, - DMP, - CH, - duplex_support_num, - call_confidence, - sort(everything()) - ) - - write.csv( - fillouts.dt, - paste0( - results.dir, - '/results_', - criteria, - '/', - x, - '_SNV_table.csv' - ), - row.names = F - ) - }) - - if (all(unlist(all.fillout.dim))) { - print('All dimension of fillout mafs for each patient looks correct') - } + # Interesting cases where DMP signed out calls are artifacets + if(any(!is.na(fillouts.dt$DMP) & !is.na(fillouts.dt$duplex_support_num))){ + print(paste0('Look at ',x,' for DMP signed out plasma artifacts...')) + } + # germline filtering for matched and unmatched ---------------------------- + plasma.samples <- sample.sheet[Sample_Type %in% c('duplex')]$column.names + normal.samples <- sample.sheet[Sample_Type %in% c('unfilterednormal','normal_DMP')]$column.names + fillouts.dt[,c( + paste0(plasma.samples,'.called') + # paste0(gsub('duplex','simplex',plasma.samples),'.called') + + ) := 'Not Called'] + + # preliminary calling + # tmp.col.name <- plasma.samples[1] + lapply(plasma.samples,function(tmp.col.name){ + # genotyping (signed out stuff) + fillouts.dt[(as.numeric(gsub("/.*.$",'',get(tmp.col.name))) >= 1 | as.numeric(gsub("/.*.$",'',get(paste0(gsub('duplex','simplex',tmp.col.name))))) > 1) & DMP == 'Signed out', + eval(paste0(tmp.col.name,'.called')) := 'Genotyped'] + # c(eval(paste0(tmp.col.name,'.called')),eval(paste0(gsub('duplex','simplex',tmp.col.name),'.called'))) := list('Called','Called')] + # hotspot reads + fillouts.dt[as.numeric(gsub("/.*.$",'',get(tmp.col.name))) >= hotspot.support & Hotspot == 'Hotspot', + eval(paste0(tmp.col.name,'.called')) := 'Called'] + # c(eval(paste0(tmp.col.name,'.called')),eval(paste0(gsub('duplex','simplex',tmp.col.name),'.called'))) := list('Called','Called')] + # non hotspot reads + fillouts.dt[as.numeric(gsub("/.*.$",'',get(tmp.col.name))) >= non.hotspot.support & is.na(Hotspot), + eval(paste0(tmp.col.name,'.called')) := 'Called'] + # 
c(eval(paste0(tmp.col.name,'.called')),eval(paste0(gsub('duplex','simplex',tmp.col.name),'.called'))) := list('Called','Called')] + # print(table(fillouts.dt[,get(paste0(tmp.col.name,'.called'))])) + }) + + if(all(!c('unfilterednormal','normal_DMP') %in% sample.sheet$Sample_Type)){ + tmp.col.name <- plasma.samples[1] + lapply(plasma.samples,function(tmp.col.name){ + #fillouts.dt[as.numeric(gsub("\\(|\\)",'',str_extract(get(tmp.col.name),"\\(.*.\\)"))) >= 0.3 | ExAC_AF >= 0.0001,eval(paste0(tmp.col.name,'.called')) := 'Not Called'] + fillouts.dt[get(tmp.col.name) == '0/0(NaN)',eval(paste0(tmp.col.name,'.called')) := 'Not Covered'] + }) + }else{ + lapply(plasma.samples,function(tmp.col.name){ + lapply(normal.samples,function(tmp.col.name.normal){ + # duplex tvar/nvar > 5 + fillouts.dt[(as.numeric(gsub("\\(|\\)",'',str_extract(get(tmp.col.name),"\\(.*.\\)")))/as.numeric(gsub("\\(|\\)",'',str_extract(get(tmp.col.name.normal),"\\(.*.\\)"))) < 2) | + # if duplex have no reads, use simplex tvar + (as.numeric(gsub("\\(|\\)",'',str_extract(get(gsub('duplex','simplex',tmp.col.name)),"\\(.*.\\)")))/as.numeric(gsub("\\(|\\)",'',str_extract(get(tmp.col.name.normal),"\\(.*.\\)"))) < 2 & + as.numeric(gsub("/.*.$",'',get(tmp.col.name))) == 0), + eval(paste0(tmp.col.name,'.called')) := 'Not Called'] + fillouts.dt[get(tmp.col.name) == '0/0(NaN)',eval(paste0(tmp.col.name,'.called')) := 'Not Covered'] + }) + }) } + # final processing -------------------------------------------------------- + # Save only the useful column + #print(fillouts.dt) + #print("#######") + fillouts.dt <- fillouts.dt[DMP == 'Signed out' | fillouts.dt[,apply(.SD,1,function(x){any(x == 'Called')})]] + #print(fillouts.dt) + # combining duplex and simplex counts + lapply(plasma.samples,function(tmp.col.name){ + # hotspot reads + fillouts.dt[,eval(gsub('duplex','total',tmp.col.name)) := paste0( + 
as.numeric(gsub("/.*.$",'',get(tmp.col.name)))+as.numeric(gsub("/.*.$",'',get(gsub('duplex','simplex',tmp.col.name)))),'/', + as.numeric(gsub("^.*./|\\(.*.$",'',get(tmp.col.name)))+as.numeric(gsub("^.*./|\\(.*.$",'',get(gsub('duplex','simplex',tmp.col.name)))),'(', + round((as.numeric(gsub("/.*.$",'',get(tmp.col.name)))+as.numeric(gsub("/.*.$",'',get(gsub('duplex','simplex',tmp.col.name)))))/ + (as.numeric(gsub("^.*./|\\(.*.$",'',get(tmp.col.name)))+as.numeric(gsub("^.*./|\\(.*.$",'',get(gsub('duplex','simplex',tmp.col.name))))),4),')' + )] + fillouts.dt[,c(eval(gsub('duplex','simplex',tmp.col.name)),eval(tmp.col.name)):= list(NULL,NULL)] + }) + + fillouts.dt <- fillouts.dt[,order(colnames(fillouts.dt)),with = F] %>% + # filter for artifacts + mutate(call_confidence = case_when( + (Hugo_Symbol == 'TERT' & is.na(Hotspot)) | (Hugo_Symbol == 'ERBB2' & grepl('[A-Z]90[0-9][A-Z]',HGVSp_Short)) | + (Hugo_Symbol == 'BRAF' & grepl('711',HGVSp_Short)) | (Hugo_Symbol == 'NF1' & grepl('[A-Z]106[0-9][A-Z]',HGVSp_Short)) ~ 'Low', + DMP == 'Signed out' ~ 'High', + TRUE ~ '' + )) %>% + merge(CH.calls[,.(Hugo_Symbol = Gene,Chromosome = Chrom,Start_Position = Start,Reference_Allele = Ref,Tumor_Seq_Allele2 = Alt,HGVSp_Short = AAchange,Variant_Classification = VariantClass,CH = 'Yes')], + by = c('Hugo_Symbol','Chromosome','Start_Position','Variant_Classification','HGVSp_Short','Reference_Allele','Tumor_Seq_Allele2'), + all.x = T) %>% + mutate(CH = ifelse(is.na(CH),'No','Yes')) %>% + select(Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2, + ExAC_AF,Hotspot,DMP,CH,duplex_support_num,call_confidence,sort(everything())) + + write.csv(fillouts.dt,paste0(results.dir,'/results_',criteria,'/',x,'_SNV_table.csv'),row.names = F) + }) + + if(all(unlist(all.fillout.dim))){ + print('All dimension of fillout mafs for each patient looks correct') + } + +} + # Executable 
----------------------------------------------------------------------------------------------------------- suppressPackageStartupMessages({ library(data.table) @@ -556,47 +292,28 @@ suppressPackageStartupMessages({ }) if (!interactive()) { - parser = ArgumentParser() - parser$add_argument('-m', '--masterref', type = 'character', help = 'File path to master reference file') - parser$add_argument('-o', '--resultsdir', type = 'character', help = 'Output directory') - parser$add_argument( - '-ch', - '--chlist', - type = 'character', - default = '/juno/work/access/production/resources/dmp_signedout_CH/current/signedout_CH.txt', - help = 'List of signed out CH calls [default]' - ) - parser$add_argument( - '-c', - '--criteria', - type = 'character', - default = 'stringent', - help = 'Calling criteria [default]' - ) - args = parser$parse_args() + + parser=ArgumentParser() + parser$add_argument('-m', '--masterref', type='character', help='File path to master reference file') + parser$add_argument('-o', '--resultsdir', type='character', help='Output directory') + parser$add_argument('-ch', '--chlist', type='character', default = '/juno/work/access/production/resources/dmp_signedout_CH/current/signedout_CH.txt', + help='List of signed out CH calls [default]') + parser$add_argument('-c', '--criteria', type='character', default = 'stringent', + help='Calling criteria [default]') + args=parser$parse_args() master.ref = args$masterref results.dir = args$resultsdir chlist = args$chlist criteria = args$criteria - cat(paste0(paste0( - c( - paste0(rep('-', 15), collapse = ''), - 'Arguments input: ', - master.ref, - results.dir, - chlist, - criteria, - paste0(rep('-', 15), collapse = '') - ), - collapse = "\n" - ), '\n')) - - if (!criteria %in% c('stringent', 'permissive')) { + cat(paste0(paste0(c(paste0(rep('-',15),collapse = ''),'Arguments input: ',master.ref,results.dir,chlist,criteria, + paste0(rep('-',15),collapse = '')),collapse = "\n"),'\n')) + + if(!criteria %in% 
c('stringent','permissive')){ stop('Criteria argument should be either stringent or permissive') } - suppressWarnings(filter_calls(fread(master.ref), results.dir, chlist, criteria)) + suppressWarnings(filter_calls(fread(master.ref),results.dir,chlist,criteria)) } From a29b031b5eedf233a973ba40ad5681cc2c4a26f1 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Fri, 3 Feb 2023 08:54:47 -0500 Subject: [PATCH 122/126] adding access samples to genotype --- R/compile_reads.R | 819 +++++++++------------------------------------- 1 file changed, 155 insertions(+), 664 deletions(-) diff --git a/R/compile_reads.R b/R/compile_reads.R index 142b39a..f30735c 100644 --- a/R/compile_reads.R +++ b/R/compile_reads.R @@ -5,17 +5,12 @@ #' @export -compile_reads <- function(master.ref, - results.dir, - project.ID, - pooled.bam.dir = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/", +compile_reads <- function( + master.ref, results.dir, project.ID, pooled.bam.dir = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/", fasta.path = "/juno/work/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta", genotyper.path = "/work/access/production/resources/tools/GetBaseCountsMultiSample/current/GetBaseCountsMultiSample", - dmp.dir = "/juno/work/access/production/resources/cbioportal/current/msk_solid_heme", - mirror.bam.dir = "/juno/res/dmpcollab/dmpshare/share/irb12_245", - mirror.access.bam.dir = "/juno/res/dmpcollab/dmpshare/share/access_12_245/", - dmp.key.path = "/juno/res/dmpcollab/dmprequest/12-245/key.txt", - access.key.path = "/juno/res/dmpcollab/dmprequest/ACCESS-12-245/key.txt") { + dmp.dir = "/juno/work/access/production/resources/cbioportal/current/msk_solid_heme", mirror.bam.dir = "/juno/res/dmpcollab/dmpshare/share/irb12_245", + dmp.key.path = "/juno/res/dmpcollab/dmprequest/12-245/key.txt") { # # test input section 
----------------------------------------------------------- # master.ref = fread('/juno/work/bergerm1/bergerlab/zhengy1/access_data_analysis/data/example_master_file.csv') # results.dir = paste0('/juno/work/bergerm1/MSK-ACCESS/ACCESS-Projects/test_access/access_data_analysis/output_',format(Sys.time(),'%m%d%y')) @@ -33,620 +28,170 @@ compile_reads <- function(master.ref, geno.bash <- system("which genotype_variants", intern = T) if (length(geno.bash) == 0) { # print(pyclone.path) - stop( - "needs to run \nsource /home/accessbot/miniconda3/bin/activate && conda activate genotype-variants-0.3.0" - ) + stop("needs to run \nsource /home/accessbot/miniconda3/bin/activate && conda activate genotype-variants-0.3.0") } # data from DMP ----------------------------------------------------------- DMP.key <- fread(dmp.key.path) - if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { - message(paste0( + if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.|-T..-XS", "", DMP.key[grepl("IH|IM|XS", V1)]$V1))) { + stop(paste0( "These DMP IDs are not found in DMP key file: ", paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% - gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))], collapse = " ,") - )) - } - # data from DMP ACCESS ---------------------------------------------------- - access.key <- - as.data.table(read.csv(access.key.path, header = FALSE, sep = ",")) - if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))) { - message(paste0( - "These DMP IDs are not found in DMP ACCESS key file: ", - paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% - gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))], collapse = " ,") + 
gsub("-T..-IH.|-T..-IM.|-T..-XS", "", DMP.key[grepl("IH|IM|XS", V1)]$V1))], collapse = " ,") )) } - - DMP.maf <- - fread(paste0(dmp.dir, "/data_mutations_extended.txt")) %>% + DMP.maf <- fread(paste0(dmp.dir, "/data_mutations_extended.txt")) %>% filter(Mutation_Status != "GERMLINE") %>% data.table() - DMP.RET.maf <- - DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] + DMP.RET.maf <- DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] # Pooled normal samples --------------------------------------------------- - pooled.bams <- - list.files(pooled.bam.dir, pattern = ".bam", full.names = T) + pooled.bams <- list.files(pooled.bam.dir, pattern = ".bam", full.names = T) # For each patient -------------------------------------------------------- x <- unique(master.ref$cmo_patient_id)[1] # x = unique(master.ref$cmo_sample_id_plasma)[16] # x = 'C-YW82CY' print("Compiling reads per patient") - all.fillout.id <- - lapply(unique(master.ref$cmo_patient_id), function(x) { - print(x) - dir.create(paste0(results.dir, "/", x)) - dmp_id <- - unique(master.ref[cmo_patient_id == x]$dmp_patient_id) - # sample sheet with colummns -- TSB, sample type, bam path, treatm -------- - # need to get DMP tumor, DMP normal, plasma, plasma normal (if there is any), pooled normal - # DMP sample sheet - if (is.na(dmp_id) | dmp_id == '') { - dmp.sample.sheet <- NULL - } else { - all.dmp.ids.IM <- - DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V1 - all.dmp.ids.IH <- - DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V1 - all.dmp.ids.XS <- - access.key[grepl(paste0(dmp_id, "-T..-XS."), V1)]$V1 - all.dmp.ids.normal.XS <- - access.key[grepl(paste0(dmp_id, "-N..-XS."), V1)]$V1 - all.dmp.ids <- c(all.dmp.ids.IM, all.dmp.ids.IH) - all.dmp.bam.ids.IM <- - DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V2 - all.dmp.bam.ids.IH <- - 
DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V2 - all.dmp.bam.ids.XS <- - gsub("-standard|-unfilter|-simplex|-duplex", - "", - access.key[grepl(paste0(dmp_id, "-T..-XS."), V1)]$V2) - all.dmp.bam.ids.normal.XS <- - gsub("-standard|-unfilter|-simplex|-duplex", - "", - access.key[grepl(paste0(dmp_id, "-N..-XS."), V1)]$V2) - all.dmp.bam.ids <- - c(all.dmp.bam.ids.IM, - all.dmp.bam.ids.IH) - if (length(all.dmp.ids) == 0) { - dmp.sample.sheet <- NULL - } else{ - bam.sub.dir <- - unlist(lapply(strsplit(substr( - all.dmp.bam.ids, 1, 2 - ), ""), function(x) { - paste0(x, collapse = "/") - })) - dmp.sample.sheet <- data.frame( - Sample_Barcode = all.dmp.ids, - standard_bam = paste0( - mirror.bam.dir, - "/", - bam.sub.dir, - "/", - all.dmp.bam.ids, - ".bam" - ), - duplex_bam = NA, - simplex_bam = NA - ) %>% - mutate( - cmo_patient_id = x, - Sample_Type = ifelse( - grepl("-T", Sample_Barcode), - "DMP_Tumor", - "DMP_Normal" - ), - dmp_patient_id = dmp_id - ) - } - if (length(all.dmp.ids.XS) == 0) { - access.sample.sheet <- NULL - } else{ - access.bam.sub.dir <- - unlist(lapply(strsplit( - substr(all.dmp.bam.ids.XS, 1, 2), "" - ), function(x) { - paste0(x, collapse = "/") - })) - access.sample.sheet <- unique( - data.frame( - Sample_Barcode = all.dmp.ids.XS, - standard_bam = NA, - duplex_bam = paste0( - mirror.access.bam.dir, - "/", - access.bam.sub.dir, - "/", - all.dmp.bam.ids.XS, - "-duplex.bam" - ), - simplex_bam = paste0( - mirror.access.bam.dir, - "/", - access.bam.sub.dir, - "/", - all.dmp.bam.ids.XS, - "-simplex.bam" - ) - ) %>% - mutate( - cmo_patient_id = x, - Sample_Type = ifelse( - grepl("-T", Sample_Barcode), - "duplex", - "unfilterednormal" - ), - dmp_patient_id = dmp_id - ) - ) - access.normal.bam.sub.dir <- - unlist(lapply(strsplit( - substr(all.dmp.bam.ids.normal.XS, 1, 2), "" - ), function(x) { - paste0(x, collapse = "/") - })) - access.normal.sample.sheet <- unique( - data.frame( - Sample_Barcode = all.dmp.ids.normal.XS, - standard_bam = paste0( - 
mirror.access.bam.dir, - "/", - access.normal.bam.sub.dir, - "/", - all.dmp.bam.ids.normal.XS, - "-unfilter.bam" - ), - duplex_bam = NA, - simplex_bam = NA - ) %>% - mutate( - cmo_patient_id = x, - Sample_Type = ifelse( - grepl("-N", Sample_Barcode), - "unfilterednormal", - "duplex" - ), - dmp_patient_id = dmp_id - ) + all.fillout.id <- lapply(unique(master.ref$cmo_patient_id), function(x) { + print(x) + dir.create(paste0(results.dir, "/", x)) + dmp_id <- unique(master.ref[cmo_patient_id == x]$dmp_patient_id) + # sample sheet with colummns -- TSB, sample type, bam path, treatm -------- + # need to get DMP tumor, DMP normal, plasma, plasma normal (if there is any), pooled normal + # DMP sample sheet + if (is.na(dmp_id) | dmp_id == '') { + dmp.sample.sheet <- NULL + } else { + all.dmp.ids.IM <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V1 + all.dmp.ids.IH <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V1 + all.dmp.ids.XS <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V1 + all.dmp.ids <- c(all.dmp.ids.IM,all.dmp.ids.IH,all.dmp.ids.XS) + all.dmp.bam.ids.IM <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V2 + all.dmp.bam.ids.IH <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V2 + all.dmp.bam.ids.XS <- DMP.key[grepl(paste0(dmp_id, "-(T|N)..-XS."), V1)]$V2 + all.dmp.bam.ids <- c(all.dmp.bam.ids.IM,all.dmp.bam.ids.IH,all.dmp.bam.ids.XS) + bam.sub.dir <- unlist(lapply(strsplit(substr(all.dmp.bam.ids, 1, 2), ""), function(x) { + paste0(x, collapse = "/") + })) + dmp.sample.sheet <- data.frame( + Sample_Barcode = all.dmp.ids, + standard_bam = paste0(mirror.bam.dir, "/", bam.sub.dir, "/", all.dmp.bam.ids, ".bam") + ) %>% + mutate(cmo_patient_id = x, Sample_Type = ifelse(grepl("-T", Sample_Barcode), "DMP_Tumor", "DMP_Normal"), dmp_patient_id = dmp_id) + } + # total sample sheet + sample.sheet <- master.ref[ + cmo_patient_id == x, + # plasma bams -- duplex and simplex bam + .( + Sample_Barcode = as.character(cmo_sample_id_plasma), duplex_bam = 
bam_path_plasma_duplex, + simplex_bam = bam_path_plasma_simplex, cmo_patient_id, Sample_Type = "duplex", dmp_patient_id + ) + ] %>% + merge(rbind( + unique(master.ref[ + cmo_patient_id == x&paired=='Paired', + # buffy coat + DMP bams -- standard bam only + .( + Sample_Barcode = as.character(cmo_sample_id_normal), standard_bam = bam_path_normal, + cmo_patient_id, Sample_Type = "unfilterednormal", dmp_patient_id ) - access.sample.sheet = bind_rows(access.sample.sheet, access.normal.sample.sheet) - } - if (!is.null(dmp.sample.sheet) & - !is.null(access.sample.sheet)) { - print("DMP IMPACT and DMP ACCESS samples are available") - dmp.sample.sheet <- - bind_rows(dmp.sample.sheet, access.sample.sheet) - - } else if (is.null(dmp.sample.sheet) & - !is.null(access.sample.sheet)) { - print("DMP IMPACT samples are NOT available and DMP ACCESS samples are available") - dmp.sample.sheet <- access.sample.sheet - } else if (!is.null(dmp.sample.sheet) & - is.null(access.sample.sheet)) { - print("DMP IMPACT samples are available and DMP ACCESS samples are NOT available") - dmp.sample.sheet <- dmp.sample.sheet - } else{ - print("No DMP IMPACT samples or DMP ACCESS samples are available") - dmp.sample.sheet <- NULL - } + ]), + dmp.sample.sheet + ), all = T) + # catch '' or NA for empty cells for some cmo_sample_id_normal + sample.sheet <- sample.sheet[!is.na(Sample_Barcode) | Sample_Barcode != ""] + write.table(sample.sheet, paste0(results.dir, "/", x, "/", x, "_sample_sheet.tsv"), sep = "\t", quote = F, row.names = F) + # piece together all unique calls ----------------------------------------- + # get duplex calls + duplex.calls <- do.call(rbind, lapply(master.ref[cmo_patient_id == x]$maf_path, function(x) { + # fread(x) %>% filter(as.numeric(D_t_alt_count_fragment) > 0) %>% data.table() + selectcolumns <- 
c("Hugo_Symbol","Entrez_Gene_Id","Center","NCBI_Build","Chromosome","Start_Position","End_Position","Strand","Variant_Classification","Variant_Type","Reference_Allele","Tumor_Seq_Allele1","Tumor_Seq_Allele2","dbSNP_RS","dbSNP_Val_Status","Tumor_Sample_Barcode","caller_Norm_Sample_Barcode","Match_Norm_Seq_Allele1","Match_Norm_Seq_Allele2","Tumor_Validation_Allele1","Tumor_Validation_Allele2","Match_Norm_Validation_Allele1","Match_Norm_Validation_Allele2","Verification_Status","Validation_Status","Mutation_Status","Sequencing_Phase","Sequence_Source","Validation_Method","Score","BAM_File","Sequencer","Tumor_Sample_UUID","Matched_Norm_Sample_UUID","HGVSc","HGVSp","HGVSp_Short","Transcript_ID","Exon_Number","caller_t_depth","caller_t_ref_count","caller_t_alt_count","caller_n_depth","caller_n_ref_count","caller_n_alt_count","all_effects","Allele","Gene","Feature","Feature_type","Consequence","cDNA_position","CDS_position","Protein_position","Amino_acids","Codons","Existing_variation","ALLELE_NUM","DISTANCE","STRAND_VEP","SYMBOL","SYMBOL_SOURCE","HGNC_ID","BIOTYPE","CANONICAL","CCDS","ENSP","SWISSPROT","TREMBL","UNIPARC","RefSeq","SIFT","PolyPhen","EXON","INTRON","DOMAINS","AF","AFR_AF","AMR_AF","ASN_AF","EAS_AF","EUR_AF","SAS_AF","AA_AF","EA_AF","CLIN_SIG","SOMATIC","PUBMED","MOTIF_NAME","MOTIF_POS","HIGH_INF_POS","MOTIF_SCORE_CHANGE","IMPACT","PICK","VARIANT_CLASS","TSL","HGVS_OFFSET","PHENO","MINIMISED","ExAC_AF","ExAC_AF_AFR","ExAC_AF_AMR","ExAC_AF_EAS","ExAC_AF_FIN","ExAC_AF_NFE","ExAC_AF_OTH","ExAC_AF_SAS","GENE_PHENO","FILTER","flanking_bps","variant_id","variant_qual","ExAC_AF_Adj","ExAC_AC_AN_Adj","ExAC_AC_AN","ExAC_AC_AN_AFR","ExAC_AC_AN_AMR","ExAC_AC_AN_EAS","ExAC_AC_AN_FIN","ExAC_AC_AN_NFE","ExAC_AC_AN_OTH","ExAC_AC_AN_SAS","ExAC_FILTER","gnomAD_AF","gnomAD_AFR_AF","gnomAD_AMR_AF","gnomAD_ASJ_AF","gnomAD_EAS_AF","gnomAD_FIN_AF","gnomAD_NFE_AF","gnomAD_OTH_AF","gnomAD_SAS_AF","CallMethod","VCF_POS","VCF_REF","VCF_ALT","hotspot_whitelist","Status","D_t_alt_count
_fragment","D_t_ref_count_fragment","D_t_vaf_fragment","SD_t_alt_count_fragment","SD_t_ref_count_fragment","SD_t_vaf_fragment","Matched_Norm_Sample_Barcode","Matched_Norm_Bamfile","n_alt_count_fragment","n_ref_count_fragment","n_vaf_fragment") + if("Status" %in% names(fread(x))){ + fread(x) %>% select(one_of(selectcolumns)) %>% subset((Status == "") | (is.na(Status))) + } else { + fread(x) %>% select(one_of(selectcolumns)) } - # total sample sheet - sample.sheet <- master.ref[cmo_patient_id == x, - # plasma bams -- duplex and simplex bam - .( - Sample_Barcode = as.character(cmo_sample_id_plasma), - standard_bam = NA, - duplex_bam = bam_path_plasma_duplex, - simplex_bam = bam_path_plasma_simplex, - cmo_patient_id, - Sample_Type = "duplex", - dmp_patient_id - )] %>% - merge(rbind(unique(master.ref[cmo_patient_id == x & - paired == 'Paired', - # buffy coat + DMP bams -- standard bam only - .( - Sample_Barcode = as.character(cmo_sample_id_normal), - standard_bam = bam_path_normal, - duplex_bam = NA, - simplex_bam = NA, - cmo_patient_id, - Sample_Type = "unfilterednormal", - dmp_patient_id - )]), - dmp.sample.sheet), all = T) - # catch '' or NA for empty cells for some cmo_sample_id_normal - sample.sheet <- - sample.sheet[!is.na(Sample_Barcode) | - Sample_Barcode != ""] - write.table( - sample.sheet, - paste0(results.dir, "/", x, "/", x, "_sample_sheet.tsv"), - sep = "\t", - quote = F, - row.names = F - ) - # piece together all unique calls ----------------------------------------- - # get duplex calls - duplex.calls <- - do.call(rbind, lapply(master.ref[cmo_patient_id == x]$maf_path, function(x) { - # fread(x) %>% filter(as.numeric(D_t_alt_count_fragment) > 0) %>% data.table() - selectcolumns <- - c( - "Hugo_Symbol", - "Entrez_Gene_Id", - "Center", - "NCBI_Build", - "Chromosome", - "Start_Position", - "End_Position", - "Strand", - "Variant_Classification", - "Variant_Type", - "Reference_Allele", - "Tumor_Seq_Allele1", - "Tumor_Seq_Allele2", - "dbSNP_RS", - 
"dbSNP_Val_Status", - "Tumor_Sample_Barcode", - "caller_Norm_Sample_Barcode", - "Match_Norm_Seq_Allele1", - "Match_Norm_Seq_Allele2", - "Tumor_Validation_Allele1", - "Tumor_Validation_Allele2", - "Match_Norm_Validation_Allele1", - "Match_Norm_Validation_Allele2", - "Verification_Status", - "Validation_Status", - "Mutation_Status", - "Sequencing_Phase", - "Sequence_Source", - "Validation_Method", - "Score", - "BAM_File", - "Sequencer", - "Tumor_Sample_UUID", - "Matched_Norm_Sample_UUID", - "HGVSc", - "HGVSp", - "HGVSp_Short", - "Transcript_ID", - "Exon_Number", - "caller_t_depth", - "caller_t_ref_count", - "caller_t_alt_count", - "caller_n_depth", - "caller_n_ref_count", - "caller_n_alt_count", - "all_effects", - "Allele", - "Gene", - "Feature", - "Feature_type", - "Consequence", - "cDNA_position", - "CDS_position", - "Protein_position", - "Amino_acids", - "Codons", - "Existing_variation", - "ALLELE_NUM", - "DISTANCE", - "STRAND_VEP", - "SYMBOL", - "SYMBOL_SOURCE", - "HGNC_ID", - "BIOTYPE", - "CANONICAL", - "CCDS", - "ENSP", - "SWISSPROT", - "TREMBL", - "UNIPARC", - "RefSeq", - "SIFT", - "PolyPhen", - "EXON", - "INTRON", - "DOMAINS", - "AF", - "AFR_AF", - "AMR_AF", - "ASN_AF", - "EAS_AF", - "EUR_AF", - "SAS_AF", - "AA_AF", - "EA_AF", - "CLIN_SIG", - "SOMATIC", - "PUBMED", - "MOTIF_NAME", - "MOTIF_POS", - "HIGH_INF_POS", - "MOTIF_SCORE_CHANGE", - "IMPACT", - "PICK", - "VARIANT_CLASS", - "TSL", - "HGVS_OFFSET", - "PHENO", - "MINIMISED", - "ExAC_AF", - "ExAC_AF_AFR", - "ExAC_AF_AMR", - "ExAC_AF_EAS", - "ExAC_AF_FIN", - "ExAC_AF_NFE", - "ExAC_AF_OTH", - "ExAC_AF_SAS", - "GENE_PHENO", - "FILTER", - "flanking_bps", - "variant_id", - "variant_qual", - "ExAC_AF_Adj", - "ExAC_AC_AN_Adj", - "ExAC_AC_AN", - "ExAC_AC_AN_AFR", - "ExAC_AC_AN_AMR", - "ExAC_AC_AN_EAS", - "ExAC_AC_AN_FIN", - "ExAC_AC_AN_NFE", - "ExAC_AC_AN_OTH", - "ExAC_AC_AN_SAS", - "ExAC_FILTER", - "gnomAD_AF", - "gnomAD_AFR_AF", - "gnomAD_AMR_AF", - "gnomAD_ASJ_AF", - "gnomAD_EAS_AF", - "gnomAD_FIN_AF", - 
"gnomAD_NFE_AF", - "gnomAD_OTH_AF", - "gnomAD_SAS_AF", - "CallMethod", - "VCF_POS", - "VCF_REF", - "VCF_ALT", - "hotspot_whitelist", - "Status", - "D_t_alt_count_fragment", - "D_t_ref_count_fragment", - "D_t_vaf_fragment", - "SD_t_alt_count_fragment", - "SD_t_ref_count_fragment", - "SD_t_vaf_fragment", - "Matched_Norm_Sample_Barcode", - "Matched_Norm_Bamfile", - "n_alt_count_fragment", - "n_ref_count_fragment", - "n_vaf_fragment" - ) - if ("Status" %in% names(fread(x))) { - fread(x) %>% select(one_of(selectcolumns)) %>% subset((Status == "") | - (is.na(Status))) - } else { - fread(x) %>% select(one_of(selectcolumns)) - } - # fread(x) - # %>% - # filter(as.numeric(t_alt_count) > 0) %>% - # data.table() - })) - # get impact calls - impact.calls <- - DMP.RET.maf[Tumor_Sample_Barcode %in% sample.sheet$Sample_Barcode] - write.table( - impact.calls[, .( - Hugo_Symbol, - Chromosome, - Start_Position, - End_Position, - Variant_Classification, - HGVSp_Short, - Reference_Allele, - Tumor_Seq_Allele2 - )], - paste0(results.dir, "/", x, "/", x, "_impact_calls.maf"), - sep = "\t", - quote = F, - row.names = F - ) - # combining plasma and impact calls - all.calls <- - rbind(duplex.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F], - impact.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F]) - # getting rid of duplicate calls and take the first occurence of all events - all.calls <- - all.calls[which(!duplicated(all.calls[, .( - Hugo_Symbol, - Chromosome, - Start_Position, - End_Position, - Variant_Classification, - HGVSp_Short, - Reference_Allele, - Tumor_Seq_Allele2 - )])), ] %>% - mutate( - t_ref_count = 0, - t_alt_count = 0, - n_ref_count = 0, - n_alt_count = 0, - Matched_Norm_Sample_Barcode = NA - ) %>% - filter( - Variant_Classification != "Silent" & - !grepl("RP11-", Hugo_Symbol) & - !grepl("Intron", Variant_Classification) - ) - write.table( - all.calls, - paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), - sep 
= "\t", - quote = F, - row.names = F - ) - # tagging hotspots - system( - paste0( - 'bsub -R "rusage[mem=4]" -cwd ', - results.dir, - "/", - x, - "/ -oo hotspot.o -eo hotspot.e -W 00:59 ", - " -P ", - project.ID, - " -J ", - x, - "_tag_hotspot ", - " python /work/access/production/workflows/access_workflows/v1/pipeline_2.0.0/ACCESS-Pipeline/cwl_tools/hotspots/tag_hotspots.py ", - " -m ", - results.dir, - "/", - x, - "/", - x, - "_all_unique_calls.maf", - " -itxt /work/access/production/resources/msk-access/current/regions_of_interest/current/hotspot-list-union-v1-v2_with_TERT.txt ", - " -o ", - results.dir, - "/", - x, - "/", - x, - "_all_unique_calls_hotspots.maf", - " -outdir ", - results.dir, - "/", - x, - "/", - x - ) - ) - # genotype all bams in this patient directory ----------------------------- - # genotyping plasma samples -- plasma duplex&simplex, plasma normal, pooled plasma normal - write.table( - sample.sheet[, .( - sample_id = Sample_Barcode, - maf = paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), - standard_bam, - duplex_bam, - simplex_bam - )], - paste0(results.dir, "/", x, "/", x, "_genotype_metadata.tsv"), - sep = "\t", - quote = F, - row.names = F - ) - job.ids <- system( - paste0( - "bsub -cwd ", - results.dir, - "/", - x, - ' -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', - " -P ", - project.ID, - " -J ", - x, - "_genotype_variants ", - " genotype_variants small_variants multiple-samples -i ", - results.dir, - "/", - x, - "/", - x, - "_genotype_metadata.tsv", - " -r ", - fasta.path, - " -g ", - genotyper.path, - " -v DEBUG " - ), - intern = T - ) - job.ids <- as.numeric(gsub("Job <|> is.*.$", "", job.ids)) - }) +# fread(x) + # %>% + # filter(as.numeric(t_alt_count) > 0) %>% + # data.table() + })) + # get impact calls + impact.calls <- DMP.RET.maf[Tumor_Sample_Barcode %in% sample.sheet$Sample_Barcode] + write.table(impact.calls[, .(Hugo_Symbol, Chromosome, Start_Position, End_Position, Variant_Classification, 
HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2)], + paste0(results.dir, "/", x, "/", x, "_impact_calls.maf"), + sep = "\t", quote = F, row.names = F + ) + # combining plasma and impact calls + all.calls <- rbind( + duplex.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F], + impact.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F] + ) + # getting rid of duplicate calls and take the first occurence of all events + all.calls <- all.calls[which(!duplicated(all.calls[, .(Hugo_Symbol, Chromosome, Start_Position, End_Position, Variant_Classification, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2)])), ] %>% + mutate(t_ref_count=0, t_alt_count=0, n_ref_count=0, n_alt_count=0, Matched_Norm_Sample_Barcode=NA ) %>% + filter(Variant_Classification != "Silent" & !grepl("RP11-", Hugo_Symbol) & !grepl("Intron", Variant_Classification)) + write.table(all.calls, paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), sep = "\t", quote = F, row.names = F) + # tagging hotspots + system(paste0( + 'bsub -R "rusage[mem=4]" -cwd ', results.dir, "/", x, "/ -oo hotspot.o -eo hotspot.e -W 00:59 ", + " -P ", project.ID, " -J ", x, "_tag_hotspot ", + " python /work/access/production/workflows/access_workflows/v1/pipeline_2.0.0/ACCESS-Pipeline/cwl_tools/hotspots/tag_hotspots.py ", + " -m ", results.dir, "/", x, "/", x, "_all_unique_calls.maf", + " -itxt /work/access/production/resources/msk-access/current/regions_of_interest/current/hotspot-list-union-v1-v2_with_TERT.txt ", + " -o ", results.dir, "/", x, "/", x, "_all_unique_calls_hotspots.maf", + " -outdir ", results.dir, "/", x, "/", x + )) + # genotype all bams in this patient directory ----------------------------- + # genotyping plasma samples -- plasma duplex&simplex, plasma normal, pooled plasma normal + write.table(sample.sheet[, .( + sample_id = Sample_Barcode, maf = paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), + standard_bam, duplex_bam, simplex_bam + )], 
+ paste0(results.dir, "/", x, "/", x, "_genotype_metadata.tsv"), + sep = "\t", quote = F, row.names = F + ) + job.ids <- system(paste0( + "bsub -cwd ", results.dir, "/", x, ' -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', + " -P ", project.ID, " -J ", x, "_genotype_variants ", + " genotype_variants small_variants multiple-samples -i ", results.dir, "/", x, "/", x, "_genotype_metadata.tsv", + " -r ", fasta.path, " -g ", genotyper.path, " -v DEBUG " + ), intern = T) + job.ids <- as.numeric(gsub("Job <|> is.*.$", "", job.ids)) + }) # Get base count multi sample in pooled normal ---------------------------- # all all unique calls in entire cohort print("Compiling reads in pooled samples") dir.create(paste0(results.dir, "/pooled")) - all.all.unique.mafs <- - do.call(rbind, lapply(unique(master.ref$cmo_patient_id), function(x) { - fread(list.files( - paste0(results.dir, "/", x), - pattern = "unique_calls.maf$", - full.names = T - )) - })) - all.all.unique.mafs <- - all.all.unique.mafs[!duplicated(all.all.unique.mafs[, .( - Hugo_Symbol, - Chromosome, - Start_Position, - End_Position, - Variant_Classification, - HGVSp_Short, - Reference_Allele, - Tumor_Seq_Allele2 - )]),] - write.table( - all.all.unique.mafs, - paste0(results.dir, "/pooled/all_all_unique.maf"), - sep = "\t", - quote = F, - row.names = F - ) + all.all.unique.mafs <- do.call(rbind, lapply(unique(master.ref$cmo_patient_id), function(x) { + fread(list.files(paste0(results.dir, "/", x), pattern = "unique_calls.maf$", full.names = T)) + })) + all.all.unique.mafs <- all.all.unique.mafs[!duplicated(all.all.unique.mafs[, .(Hugo_Symbol, Chromosome, Start_Position, End_Position, Variant_Classification, HGVSp_Short, Reference_Allele, Tumor_Seq_Allele2)]),] + write.table(all.all.unique.mafs, paste0(results.dir, "/pooled/all_all_unique.maf"), sep = "\t", quote = F, row.names = F) - write.table( - data.frame( - sample_id = gsub("^.*./|.bam", "", pooled.bams), - maf = paste0(results.dir, 
"/pooled/all_all_unique.maf"), - standard_bam = pooled.bams, - duplex_bam = "", - simplex_bam = "" - ), - paste0(results.dir, "/pooled/pooled_metadata.tsv"), - sep = "\t", - quote = F, - row.names = F + write.table(data.frame( + sample_id = gsub("^.*./|.bam", "", pooled.bams), maf = paste0(results.dir, "/pooled/all_all_unique.maf"), + standard_bam = pooled.bams, duplex_bam = "", simplex_bam = "" + ), + paste0(results.dir, "/pooled/pooled_metadata.tsv"), + sep = "\t", quote = F, row.names = F ) - pooled.sample.job.id <- system( - paste0( - "bsub -cwd ", - results.dir, - '/pooled -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', - " -w ", - ' \"', - paste0(paste0("done(", unlist(all.fillout.id), ")"), collapse = "&&"), - '\" ', - " -P ", - project.ID, - " -J pooled_genotype_variants ", - " genotype_variants small_variants multiple-samples -i ", - results.dir, - "/pooled/pooled_metadata.tsv", - " -r ", - fasta.path, - " -g ", - genotyper.path, - " -v DEBUG " - ), - intern = T - ) - pooled.sample.job.id <- - as.numeric(gsub("Job <|> is.*.$", "", pooled.sample.job.id)) - while (!any(grepl("Done successfully", system( - paste0("bjobs -l ", pooled.sample.job.id), intern = T - )))) { + pooled.sample.job.id <- system(paste0( + "bsub -cwd ", results.dir, '/pooled -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', + " -w ", ' \"', paste0(paste0("done(", unlist(all.fillout.id), ")"), collapse = "&&"), '\" ', + " -P ", project.ID, " -J pooled_genotype_variants ", + " genotype_variants small_variants multiple-samples -i ", results.dir, "/pooled/pooled_metadata.tsv", + " -r ", fasta.path, " -g ", genotyper.path, " -v DEBUG " + ), intern = T) + pooled.sample.job.id <- as.numeric(gsub("Job <|> is.*.$", "", pooled.sample.job.id)) + while ( + !any(grepl("Done successfully", system(paste0("bjobs -l ", pooled.sample.job.id), intern = T))) + ) { Sys.sleep(120) } print("Compile reads done!") @@ -669,69 +214,34 @@ if (!interactive()) { parser <- 
ArgumentParser() parser$add_argument("-m", "--masterref", type = "character", help = "File path to master reference file") parser$add_argument("-o", "--resultsdir", type = "character", help = "Output directory") - parser$add_argument( - "-pid", - "--projectid", - type = "character", - default = "", + parser$add_argument("-pid", "--projectid", + type = "character", default = "", help = "Project ID for submitted jobs involved in this run" ) - parser$add_argument( - "-pb", - "--pooledbamdir", - type = "character", - default = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/", + parser$add_argument("-pb", "--pooledbamdir", + type = "character", default = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/", help = "Directory for all pooled bams [default]" ) - parser$add_argument( - "-fa", - "--fastapath", - type = "character", - default = "/juno/work/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta", + parser$add_argument("-fa", "--fastapath", + type = "character", default = "/juno/work/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta", help = "Reference fasta path [default]" ) - parser$add_argument( - "-gt", - "--genotyperpath", - type = "character", - default = "/work/access/production/resources/tools/GetBaseCountsMultiSample/current/GetBaseCountsMultiSample", + parser$add_argument("-gt", "--genotyperpath", + type = "character", default = "/work/access/production/resources/tools/GetBaseCountsMultiSample/current/GetBaseCountsMultiSample", help = "Genotyper executable path [default]" ) - parser$add_argument( - "-dmp", - "--dmpdir", - type = "character", - default = "/juno/work/access/production/resources/cbioportal/current/msk_solid_heme", + parser$add_argument("-dmp", "--dmpdir", + type = "character", default = "/juno/work/access/production/resources/cbioportal/current/msk_solid_heme", help = "Directory of clinical 
DMP repository [default]" ) - parser$add_argument( - "-mb", - "--mirrorbamdir", - type = "character", - default = "/juno/res/dmpcollab/dmpshare/share/irb12_245", + parser$add_argument("-mb", "--mirrorbamdir", + type = "character", default = "/juno/res/dmpcollab/dmpshare/share/irb12_245", help = "Mirror BAM file directory [default]" ) - parser$add_argument( - "-mab", - "--mirroraccessbamdir", - type = "character", - default = "/juno/res/dmpcollab/dmpshare/share/access_12_245", - help = "Mirror BAM file directory for MSK-ACCESS [default]" - ) - parser$add_argument( - "-dmpk", - "--dmpkeypath", - type = "character", - default = "/juno/res/dmpcollab/dmprequest/12-245/key.txt", + parser$add_argument("-dmpk", "--dmpkeypath", + type = "character", default = "/juno/res/dmpcollab/dmprequest/12-245/key.txt", help = "DMP mirror BAM key file [default]" ) - parser$add_argument( - "-dmpak", - "--dmpaccesskeypath", - type = "character", - default = "/juno/res/dmpcollab/dmprequest/ACCESS-12-245/key.txt", - help = "DMP mirror BAM key file for MSK-ACCESS [default]" - ) args <- parser$parse_args() master.ref <- args$masterref @@ -742,14 +252,11 @@ if (!interactive()) { genotyper.path <- args$genotyperpath dmp.dir <- args$dmpdir mirror.bam.dir <- args$mirrorbamdir - mirror.access.bam.dir <- args$mirroraccessbamdir dmp.key.path <- args$dmpkeypath - access.key.path <- args$dmpaccesskeypath if (project.ID == "") { - project.ID <- - paste0(sample(c(0:9), size = 10, replace = T), collapse = "") + project.ID <- paste0(sample(c(0:9), size = 10, replace = T), collapse = "") } print(paste0("Input parameters for run ", project.ID)) @@ -760,23 +267,7 @@ if (!interactive()) { print(genotyper.path) print(dmp.dir) print(mirror.bam.dir) - print(mirror.access.bam.dir) print(dmp.key.path) - print(access.key.path) - suppressWarnings( - compile_reads( - fread(master.ref), - results.dir, - project.ID, - pooled.bam.dir, - fasta.path, - genotyper.path, - dmp.dir, - mirror.bam.dir, - mirror.access.bam.dir, 
- dmp.key.path, - access.key.path - ) - ) + suppressWarnings(compile_reads(fread(master.ref), results.dir, project.ID, pooled.bam.dir, fasta.path, genotyper.path, dmp.dir, mirror.bam.dir, dmp.key.path)) print("compile reads function finished") } From 0589e751968976e73b218e535c6abba0fa9ac4be Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Fri, 3 Feb 2023 08:55:04 -0500 Subject: [PATCH 123/126] adding access samples to genotype --- R/compile_reads_all.R | 782 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 782 insertions(+) create mode 100644 R/compile_reads_all.R diff --git a/R/compile_reads_all.R b/R/compile_reads_all.R new file mode 100644 index 0000000..142b39a --- /dev/null +++ b/R/compile_reads_all.R @@ -0,0 +1,782 @@ +#library(data.table) +#library(tidyr) +#library(stringr) +#library(dplyr) + + +#' @export +compile_reads <- function(master.ref, + results.dir, + project.ID, + pooled.bam.dir = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/", + fasta.path = "/juno/work/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta", + genotyper.path = "/work/access/production/resources/tools/GetBaseCountsMultiSample/current/GetBaseCountsMultiSample", + dmp.dir = "/juno/work/access/production/resources/cbioportal/current/msk_solid_heme", + mirror.bam.dir = "/juno/res/dmpcollab/dmpshare/share/irb12_245", + mirror.access.bam.dir = "/juno/res/dmpcollab/dmpshare/share/access_12_245/", + dmp.key.path = "/juno/res/dmpcollab/dmprequest/12-245/key.txt", + access.key.path = "/juno/res/dmpcollab/dmprequest/ACCESS-12-245/key.txt") { + # # test input section ----------------------------------------------------------- + # master.ref = fread('/juno/work/bergerm1/bergerlab/zhengy1/access_data_analysis/data/example_master_file.csv') + # results.dir = paste0('/juno/work/bergerm1/MSK-ACCESS/ACCESS-Projects/test_access/access_data_analysis/output_',format(Sys.time(),'%m%d%y')) + # pooled.bam.dir = 
'/ifs/work/bergerm1/ACCESS-Projects/novaseq_curated_duplex_v2/' + # fasta.path = '/work/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta' + # genotyper.path = '/ifs/work/bergerm1/Innovation/software/maysun/GetBaseCountsMultiSample/GetBaseCountsMultiSample' + # dmp.dir = '/ifs/work/bergerm1/zhengy1/dmp/mskimpact/' + # mirror.bam.dir = '/ifs/dmpshare/share/irb12_245/' + # dmp.key.path = '/ifs/dmprequest/12-245/key.txt' + # setting up directory ---------------------------------------------------- + dir.create(results.dir) + # make tmp directory in output directory + dir.create(paste0(results.dir, "/tmp")) + # checking virtualenv ----------------------------------------------------- + geno.bash <- system("which genotype_variants", intern = T) + if (length(geno.bash) == 0) { + # print(pyclone.path) + stop( + "needs to run \nsource /home/accessbot/miniconda3/bin/activate && conda activate genotype-variants-0.3.0" + ) + } + + # data from DMP ----------------------------------------------------------- + DMP.key <- fread(dmp.key.path) + if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { + message(paste0( + "These DMP IDs are not found in DMP key file: ", + paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% + gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))], collapse = " ,") + )) + } + # data from DMP ACCESS ---------------------------------------------------- + access.key <- + as.data.table(read.csv(access.key.path, header = FALSE, sep = ",")) + if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))) { + message(paste0( + "These DMP IDs are not found in DMP ACCESS key file: ", + paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id 
%in% + gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))], collapse = " ,") + )) + } + + DMP.maf <- + fread(paste0(dmp.dir, "/data_mutations_extended.txt")) %>% + filter(Mutation_Status != "GERMLINE") %>% + data.table() + DMP.RET.maf <- + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] + + # Pooled normal samples --------------------------------------------------- + pooled.bams <- + list.files(pooled.bam.dir, pattern = ".bam", full.names = T) + + # For each patient -------------------------------------------------------- + x <- unique(master.ref$cmo_patient_id)[1] + # x = unique(master.ref$cmo_sample_id_plasma)[16] + # x = 'C-YW82CY' + print("Compiling reads per patient") + all.fillout.id <- + lapply(unique(master.ref$cmo_patient_id), function(x) { + print(x) + dir.create(paste0(results.dir, "/", x)) + dmp_id <- + unique(master.ref[cmo_patient_id == x]$dmp_patient_id) + # sample sheet with colummns -- TSB, sample type, bam path, treatm -------- + # need to get DMP tumor, DMP normal, plasma, plasma normal (if there is any), pooled normal + # DMP sample sheet + if (is.na(dmp_id) | dmp_id == '') { + dmp.sample.sheet <- NULL + } else { + all.dmp.ids.IM <- + DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V1 + all.dmp.ids.IH <- + DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V1 + all.dmp.ids.XS <- + access.key[grepl(paste0(dmp_id, "-T..-XS."), V1)]$V1 + all.dmp.ids.normal.XS <- + access.key[grepl(paste0(dmp_id, "-N..-XS."), V1)]$V1 + all.dmp.ids <- c(all.dmp.ids.IM, all.dmp.ids.IH) + all.dmp.bam.ids.IM <- + DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V2 + all.dmp.bam.ids.IH <- + DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V2 + all.dmp.bam.ids.XS <- + gsub("-standard|-unfilter|-simplex|-duplex", + "", + access.key[grepl(paste0(dmp_id, "-T..-XS."), V1)]$V2) + all.dmp.bam.ids.normal.XS <- + gsub("-standard|-unfilter|-simplex|-duplex", + "", + 
access.key[grepl(paste0(dmp_id, "-N..-XS."), V1)]$V2) + all.dmp.bam.ids <- + c(all.dmp.bam.ids.IM, + all.dmp.bam.ids.IH) + if (length(all.dmp.ids) == 0) { + dmp.sample.sheet <- NULL + } else{ + bam.sub.dir <- + unlist(lapply(strsplit(substr( + all.dmp.bam.ids, 1, 2 + ), ""), function(x) { + paste0(x, collapse = "/") + })) + dmp.sample.sheet <- data.frame( + Sample_Barcode = all.dmp.ids, + standard_bam = paste0( + mirror.bam.dir, + "/", + bam.sub.dir, + "/", + all.dmp.bam.ids, + ".bam" + ), + duplex_bam = NA, + simplex_bam = NA + ) %>% + mutate( + cmo_patient_id = x, + Sample_Type = ifelse( + grepl("-T", Sample_Barcode), + "DMP_Tumor", + "DMP_Normal" + ), + dmp_patient_id = dmp_id + ) + } + if (length(all.dmp.ids.XS) == 0) { + access.sample.sheet <- NULL + } else{ + access.bam.sub.dir <- + unlist(lapply(strsplit( + substr(all.dmp.bam.ids.XS, 1, 2), "" + ), function(x) { + paste0(x, collapse = "/") + })) + access.sample.sheet <- unique( + data.frame( + Sample_Barcode = all.dmp.ids.XS, + standard_bam = NA, + duplex_bam = paste0( + mirror.access.bam.dir, + "/", + access.bam.sub.dir, + "/", + all.dmp.bam.ids.XS, + "-duplex.bam" + ), + simplex_bam = paste0( + mirror.access.bam.dir, + "/", + access.bam.sub.dir, + "/", + all.dmp.bam.ids.XS, + "-simplex.bam" + ) + ) %>% + mutate( + cmo_patient_id = x, + Sample_Type = ifelse( + grepl("-T", Sample_Barcode), + "duplex", + "unfilterednormal" + ), + dmp_patient_id = dmp_id + ) + ) + access.normal.bam.sub.dir <- + unlist(lapply(strsplit( + substr(all.dmp.bam.ids.normal.XS, 1, 2), "" + ), function(x) { + paste0(x, collapse = "/") + })) + access.normal.sample.sheet <- unique( + data.frame( + Sample_Barcode = all.dmp.ids.normal.XS, + standard_bam = paste0( + mirror.access.bam.dir, + "/", + access.normal.bam.sub.dir, + "/", + all.dmp.bam.ids.normal.XS, + "-unfilter.bam" + ), + duplex_bam = NA, + simplex_bam = NA + ) %>% + mutate( + cmo_patient_id = x, + Sample_Type = ifelse( + grepl("-N", Sample_Barcode), + "unfilterednormal", + 
"duplex" + ), + dmp_patient_id = dmp_id + ) + ) + access.sample.sheet = bind_rows(access.sample.sheet, access.normal.sample.sheet) + } + if (!is.null(dmp.sample.sheet) & + !is.null(access.sample.sheet)) { + print("DMP IMPACT and DMP ACCESS samples are available") + dmp.sample.sheet <- + bind_rows(dmp.sample.sheet, access.sample.sheet) + + } else if (is.null(dmp.sample.sheet) & + !is.null(access.sample.sheet)) { + print("DMP IMPACT samples are NOT available and DMP ACCESS samples are available") + dmp.sample.sheet <- access.sample.sheet + } else if (!is.null(dmp.sample.sheet) & + is.null(access.sample.sheet)) { + print("DMP IMPACT samples are available and DMP ACCESS samples are NOT available") + dmp.sample.sheet <- dmp.sample.sheet + } else{ + print("No DMP IMPACT samples or DMP ACCESS samples are available") + dmp.sample.sheet <- NULL + } + } + # total sample sheet + sample.sheet <- master.ref[cmo_patient_id == x, + # plasma bams -- duplex and simplex bam + .( + Sample_Barcode = as.character(cmo_sample_id_plasma), + standard_bam = NA, + duplex_bam = bam_path_plasma_duplex, + simplex_bam = bam_path_plasma_simplex, + cmo_patient_id, + Sample_Type = "duplex", + dmp_patient_id + )] %>% + merge(rbind(unique(master.ref[cmo_patient_id == x & + paired == 'Paired', + # buffy coat + DMP bams -- standard bam only + .( + Sample_Barcode = as.character(cmo_sample_id_normal), + standard_bam = bam_path_normal, + duplex_bam = NA, + simplex_bam = NA, + cmo_patient_id, + Sample_Type = "unfilterednormal", + dmp_patient_id + )]), + dmp.sample.sheet), all = T) + # catch '' or NA for empty cells for some cmo_sample_id_normal + sample.sheet <- + sample.sheet[!is.na(Sample_Barcode) | + Sample_Barcode != ""] + write.table( + sample.sheet, + paste0(results.dir, "/", x, "/", x, "_sample_sheet.tsv"), + sep = "\t", + quote = F, + row.names = F + ) + # piece together all unique calls ----------------------------------------- + # get duplex calls + duplex.calls <- + do.call(rbind, 
lapply(master.ref[cmo_patient_id == x]$maf_path, function(x) { + # fread(x) %>% filter(as.numeric(D_t_alt_count_fragment) > 0) %>% data.table() + selectcolumns <- + c( + "Hugo_Symbol", + "Entrez_Gene_Id", + "Center", + "NCBI_Build", + "Chromosome", + "Start_Position", + "End_Position", + "Strand", + "Variant_Classification", + "Variant_Type", + "Reference_Allele", + "Tumor_Seq_Allele1", + "Tumor_Seq_Allele2", + "dbSNP_RS", + "dbSNP_Val_Status", + "Tumor_Sample_Barcode", + "caller_Norm_Sample_Barcode", + "Match_Norm_Seq_Allele1", + "Match_Norm_Seq_Allele2", + "Tumor_Validation_Allele1", + "Tumor_Validation_Allele2", + "Match_Norm_Validation_Allele1", + "Match_Norm_Validation_Allele2", + "Verification_Status", + "Validation_Status", + "Mutation_Status", + "Sequencing_Phase", + "Sequence_Source", + "Validation_Method", + "Score", + "BAM_File", + "Sequencer", + "Tumor_Sample_UUID", + "Matched_Norm_Sample_UUID", + "HGVSc", + "HGVSp", + "HGVSp_Short", + "Transcript_ID", + "Exon_Number", + "caller_t_depth", + "caller_t_ref_count", + "caller_t_alt_count", + "caller_n_depth", + "caller_n_ref_count", + "caller_n_alt_count", + "all_effects", + "Allele", + "Gene", + "Feature", + "Feature_type", + "Consequence", + "cDNA_position", + "CDS_position", + "Protein_position", + "Amino_acids", + "Codons", + "Existing_variation", + "ALLELE_NUM", + "DISTANCE", + "STRAND_VEP", + "SYMBOL", + "SYMBOL_SOURCE", + "HGNC_ID", + "BIOTYPE", + "CANONICAL", + "CCDS", + "ENSP", + "SWISSPROT", + "TREMBL", + "UNIPARC", + "RefSeq", + "SIFT", + "PolyPhen", + "EXON", + "INTRON", + "DOMAINS", + "AF", + "AFR_AF", + "AMR_AF", + "ASN_AF", + "EAS_AF", + "EUR_AF", + "SAS_AF", + "AA_AF", + "EA_AF", + "CLIN_SIG", + "SOMATIC", + "PUBMED", + "MOTIF_NAME", + "MOTIF_POS", + "HIGH_INF_POS", + "MOTIF_SCORE_CHANGE", + "IMPACT", + "PICK", + "VARIANT_CLASS", + "TSL", + "HGVS_OFFSET", + "PHENO", + "MINIMISED", + "ExAC_AF", + "ExAC_AF_AFR", + "ExAC_AF_AMR", + "ExAC_AF_EAS", + "ExAC_AF_FIN", + "ExAC_AF_NFE", + 
"ExAC_AF_OTH", + "ExAC_AF_SAS", + "GENE_PHENO", + "FILTER", + "flanking_bps", + "variant_id", + "variant_qual", + "ExAC_AF_Adj", + "ExAC_AC_AN_Adj", + "ExAC_AC_AN", + "ExAC_AC_AN_AFR", + "ExAC_AC_AN_AMR", + "ExAC_AC_AN_EAS", + "ExAC_AC_AN_FIN", + "ExAC_AC_AN_NFE", + "ExAC_AC_AN_OTH", + "ExAC_AC_AN_SAS", + "ExAC_FILTER", + "gnomAD_AF", + "gnomAD_AFR_AF", + "gnomAD_AMR_AF", + "gnomAD_ASJ_AF", + "gnomAD_EAS_AF", + "gnomAD_FIN_AF", + "gnomAD_NFE_AF", + "gnomAD_OTH_AF", + "gnomAD_SAS_AF", + "CallMethod", + "VCF_POS", + "VCF_REF", + "VCF_ALT", + "hotspot_whitelist", + "Status", + "D_t_alt_count_fragment", + "D_t_ref_count_fragment", + "D_t_vaf_fragment", + "SD_t_alt_count_fragment", + "SD_t_ref_count_fragment", + "SD_t_vaf_fragment", + "Matched_Norm_Sample_Barcode", + "Matched_Norm_Bamfile", + "n_alt_count_fragment", + "n_ref_count_fragment", + "n_vaf_fragment" + ) + if ("Status" %in% names(fread(x))) { + fread(x) %>% select(one_of(selectcolumns)) %>% subset((Status == "") | + (is.na(Status))) + } else { + fread(x) %>% select(one_of(selectcolumns)) + } + # fread(x) + # %>% + # filter(as.numeric(t_alt_count) > 0) %>% + # data.table() + })) + # get impact calls + impact.calls <- + DMP.RET.maf[Tumor_Sample_Barcode %in% sample.sheet$Sample_Barcode] + write.table( + impact.calls[, .( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2 + )], + paste0(results.dir, "/", x, "/", x, "_impact_calls.maf"), + sep = "\t", + quote = F, + row.names = F + ) + # combining plasma and impact calls + all.calls <- + rbind(duplex.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F], + impact.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F]) + # getting rid of duplicate calls and take the first occurence of all events + all.calls <- + all.calls[which(!duplicated(all.calls[, .( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + 
Variant_Classification, + HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2 + )])), ] %>% + mutate( + t_ref_count = 0, + t_alt_count = 0, + n_ref_count = 0, + n_alt_count = 0, + Matched_Norm_Sample_Barcode = NA + ) %>% + filter( + Variant_Classification != "Silent" & + !grepl("RP11-", Hugo_Symbol) & + !grepl("Intron", Variant_Classification) + ) + write.table( + all.calls, + paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), + sep = "\t", + quote = F, + row.names = F + ) + # tagging hotspots + system( + paste0( + 'bsub -R "rusage[mem=4]" -cwd ', + results.dir, + "/", + x, + "/ -oo hotspot.o -eo hotspot.e -W 00:59 ", + " -P ", + project.ID, + " -J ", + x, + "_tag_hotspot ", + " python /work/access/production/workflows/access_workflows/v1/pipeline_2.0.0/ACCESS-Pipeline/cwl_tools/hotspots/tag_hotspots.py ", + " -m ", + results.dir, + "/", + x, + "/", + x, + "_all_unique_calls.maf", + " -itxt /work/access/production/resources/msk-access/current/regions_of_interest/current/hotspot-list-union-v1-v2_with_TERT.txt ", + " -o ", + results.dir, + "/", + x, + "/", + x, + "_all_unique_calls_hotspots.maf", + " -outdir ", + results.dir, + "/", + x, + "/", + x + ) + ) + # genotype all bams in this patient directory ----------------------------- + # genotyping plasma samples -- plasma duplex&simplex, plasma normal, pooled plasma normal + write.table( + sample.sheet[, .( + sample_id = Sample_Barcode, + maf = paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), + standard_bam, + duplex_bam, + simplex_bam + )], + paste0(results.dir, "/", x, "/", x, "_genotype_metadata.tsv"), + sep = "\t", + quote = F, + row.names = F + ) + job.ids <- system( + paste0( + "bsub -cwd ", + results.dir, + "/", + x, + ' -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', + " -P ", + project.ID, + " -J ", + x, + "_genotype_variants ", + " genotype_variants small_variants multiple-samples -i ", + results.dir, + "/", + x, + "/", + x, + "_genotype_metadata.tsv", + " -r ", + 
fasta.path, + " -g ", + genotyper.path, + " -v DEBUG " + ), + intern = T + ) + job.ids <- as.numeric(gsub("Job <|> is.*.$", "", job.ids)) + }) + + + # Get base count multi sample in pooled normal ---------------------------- + # all all unique calls in entire cohort + print("Compiling reads in pooled samples") + dir.create(paste0(results.dir, "/pooled")) + all.all.unique.mafs <- + do.call(rbind, lapply(unique(master.ref$cmo_patient_id), function(x) { + fread(list.files( + paste0(results.dir, "/", x), + pattern = "unique_calls.maf$", + full.names = T + )) + })) + all.all.unique.mafs <- + all.all.unique.mafs[!duplicated(all.all.unique.mafs[, .( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2 + )]),] + write.table( + all.all.unique.mafs, + paste0(results.dir, "/pooled/all_all_unique.maf"), + sep = "\t", + quote = F, + row.names = F + ) + + write.table( + data.frame( + sample_id = gsub("^.*./|.bam", "", pooled.bams), + maf = paste0(results.dir, "/pooled/all_all_unique.maf"), + standard_bam = pooled.bams, + duplex_bam = "", + simplex_bam = "" + ), + paste0(results.dir, "/pooled/pooled_metadata.tsv"), + sep = "\t", + quote = F, + row.names = F + ) + + pooled.sample.job.id <- system( + paste0( + "bsub -cwd ", + results.dir, + '/pooled -W 12:00 -R "rusage[mem=8]" -oo genotyping.o -eo genotyping.e ', + " -w ", + ' \"', + paste0(paste0("done(", unlist(all.fillout.id), ")"), collapse = "&&"), + '\" ', + " -P ", + project.ID, + " -J pooled_genotype_variants ", + " genotype_variants small_variants multiple-samples -i ", + results.dir, + "/pooled/pooled_metadata.tsv", + " -r ", + fasta.path, + " -g ", + genotyper.path, + " -v DEBUG " + ), + intern = T + ) + pooled.sample.job.id <- + as.numeric(gsub("Job <|> is.*.$", "", pooled.sample.job.id)) + while (!any(grepl("Done successfully", system( + paste0("bjobs -l ", pooled.sample.job.id), intern = T + )))) { + Sys.sleep(120) + } + 
print("Compile reads done!") +} + +# Executable ----------------------------------------------------------------------------------------------------------- +# Minimal columns for input mafs +# +# Hugo_Symbol,Chromosome,Start_Position,End_Position,Tumor_Sample_Barcode,Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2,D_t_alt_count_fragment + +suppressPackageStartupMessages({ + library(data.table) + library(tidyr) + library(stringr) + library(dplyr) + library(argparse) +}) + +if (!interactive()) { + parser <- ArgumentParser() + parser$add_argument("-m", "--masterref", type = "character", help = "File path to master reference file") + parser$add_argument("-o", "--resultsdir", type = "character", help = "Output directory") + parser$add_argument( + "-pid", + "--projectid", + type = "character", + default = "", + help = "Project ID for submitted jobs involved in this run" + ) + parser$add_argument( + "-pb", + "--pooledbamdir", + type = "character", + default = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/", + help = "Directory for all pooled bams [default]" + ) + parser$add_argument( + "-fa", + "--fastapath", + type = "character", + default = "/juno/work/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta", + help = "Reference fasta path [default]" + ) + parser$add_argument( + "-gt", + "--genotyperpath", + type = "character", + default = "/work/access/production/resources/tools/GetBaseCountsMultiSample/current/GetBaseCountsMultiSample", + help = "Genotyper executable path [default]" + ) + parser$add_argument( + "-dmp", + "--dmpdir", + type = "character", + default = "/juno/work/access/production/resources/cbioportal/current/msk_solid_heme", + help = "Directory of clinical DMP repository [default]" + ) + parser$add_argument( + "-mb", + "--mirrorbamdir", + type = "character", + default = "/juno/res/dmpcollab/dmpshare/share/irb12_245", + help = "Mirror BAM file directory 
[default]" + ) + parser$add_argument( + "-mab", + "--mirroraccessbamdir", + type = "character", + default = "/juno/res/dmpcollab/dmpshare/share/access_12_245", + help = "Mirror BAM file directory for MSK-ACCESS [default]" + ) + parser$add_argument( + "-dmpk", + "--dmpkeypath", + type = "character", + default = "/juno/res/dmpcollab/dmprequest/12-245/key.txt", + help = "DMP mirror BAM key file [default]" + ) + parser$add_argument( + "-dmpak", + "--dmpaccesskeypath", + type = "character", + default = "/juno/res/dmpcollab/dmprequest/ACCESS-12-245/key.txt", + help = "DMP mirror BAM key file for MSK-ACCESS [default]" + ) + args <- parser$parse_args() + + master.ref <- args$masterref + results.dir <- args$resultsdir + project.ID <- args$projectid + pooled.bam.dir <- args$pooledbamdir + fasta.path <- args$fastapath + genotyper.path <- args$genotyperpath + dmp.dir <- args$dmpdir + mirror.bam.dir <- args$mirrorbamdir + mirror.access.bam.dir <- args$mirroraccessbamdir + dmp.key.path <- args$dmpkeypath + access.key.path <- args$dmpaccesskeypath + + + if (project.ID == "") { + project.ID <- + paste0(sample(c(0:9), size = 10, replace = T), collapse = "") + } + + print(paste0("Input parameters for run ", project.ID)) + print(master.ref) + print(results.dir) + print(pooled.bam.dir) + print(fasta.path) + print(genotyper.path) + print(dmp.dir) + print(mirror.bam.dir) + print(mirror.access.bam.dir) + print(dmp.key.path) + print(access.key.path) + suppressWarnings( + compile_reads( + fread(master.ref), + results.dir, + project.ID, + pooled.bam.dir, + fasta.path, + genotyper.path, + dmp.dir, + mirror.bam.dir, + mirror.access.bam.dir, + dmp.key.path, + access.key.path + ) + ) + print("compile reads function finished") +} From b3f43c346190223d89f6d5f0f1741fb45db8df9e Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Fri, 3 Feb 2023 08:57:58 -0500 Subject: [PATCH 124/126] Update compile_reads_all.R --- R/compile_reads_all.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/R/compile_reads_all.R b/R/compile_reads_all.R index 142b39a..364c4b8 100644 --- a/R/compile_reads_all.R +++ b/R/compile_reads_all.R @@ -5,7 +5,7 @@ #' @export -compile_reads <- function(master.ref, +compile_reads_all <- function(master.ref, results.dir, project.ID, pooled.bam.dir = "/juno/work/access/production/resources/msk-access/current/novaseq_curated_duplex_bams_dmp/current/", @@ -764,7 +764,7 @@ if (!interactive()) { print(dmp.key.path) print(access.key.path) suppressWarnings( - compile_reads( + compile_reads_all( fread(master.ref), results.dir, project.ID, From 50212a8973155b2f9688800c0bd93ae9afb7653d Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Fri, 3 Feb 2023 09:04:30 -0500 Subject: [PATCH 125/126] fixing things for release --- fof.txt | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 fof.txt diff --git a/fof.txt b/fof.txt deleted file mode 100644 index d90329c..0000000 --- a/fof.txt +++ /dev/null @@ -1,19 +0,0 @@ -/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-2AVE7W_SNV_table.csv -/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-2CJKAC_SNV_table.csv -/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-4PX38M_SNV_table.csv -/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-5DUJR8_SNV_table.csv -/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-5KCFV3_SNV_table.csv -/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-5PLA6N_SNV_table.csv 
-/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-70H905_SNV_table.csv -/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-84KMCA_SNV_table.csv -/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-8W2E8L_SNV_table.csv -/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-AME7C6_SNV_table.csv -/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-DDK2LJ_SNV_table.csv -/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-DFJ7RT_SNV_table.csv -/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-KPNF34_SNV_table.csv -/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-PXVUM9_SNV_table.csv -/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-R9MPAU_SNV_table.csv -/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-VUEN2P_SNV_table.csv -/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-WJPT69_SNV_table.csv -/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-XFV0RE_SNV_table.csv 
-/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-Y5K7R2_SNV_table.csv From 6f3cdb285a5f3ba8fc91fd9c509e1209cfe9b26e Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Fri, 3 Feb 2023 09:04:51 -0500 Subject: [PATCH 126/126] fixing things for release --- python/convert_csv_to_maf/fof.txt | 19 +++++++++++++++++++ .../get_cbioportal_variants/requirements.txt | 5 +++++ 2 files changed, 24 insertions(+) create mode 100644 python/convert_csv_to_maf/fof.txt create mode 100644 python/get_cbioportal_variants/requirements.txt diff --git a/python/convert_csv_to_maf/fof.txt b/python/convert_csv_to_maf/fof.txt new file mode 100644 index 0000000..d90329c --- /dev/null +++ b/python/convert_csv_to_maf/fof.txt @@ -0,0 +1,19 @@ +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-2AVE7W_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-2CJKAC_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-4PX38M_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-5DUJR8_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-5KCFV3_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-5PLA6N_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-70H905_SNV_table.csv 
+/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-84KMCA_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-8W2E8L_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-AME7C6_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-DDK2LJ_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-DFJ7RT_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-KPNF34_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-PXVUM9_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-R9MPAU_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-VUEN2P_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-WJPT69_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-XFV0RE_SNV_table.csv +/Users/shahr2/Library/CloudStorage/OneDrive-MemorialSloanKetteringCancerCenter/Projects/Melanoma_Postow/results_21July2020/results_stringent/C-Y5K7R2_SNV_table.csv diff --git 
a/python/get_cbioportal_variants/requirements.txt b/python/get_cbioportal_variants/requirements.txt new file mode 100644 index 0000000..a3d8c94 --- /dev/null +++ b/python/get_cbioportal_variants/requirements.txt @@ -0,0 +1,5 @@ +typer==0.3.2 +openpyxl==3.0.9 +typing_extensions==3.10.0.0 +pandas==1.2.5 +bed_lookup