From 4ac5ef9cff3d49f6f8a68a10fd05fab0fc52c924 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Thu, 23 Jun 2022 21:17:10 -0400 Subject: [PATCH 01/26] change bedpe filtering script to handle both bedpe and bed regions files, and renamed the python script to better reflect its purpose --- .../iannotatesv/filter_regions_bedpe.py | 59 +++++++++++++++++++ containers/iannotatesv/pair_to_bed_annot.py | 46 --------------- containers/iannotatesv/utils.py | 8 +++ modules/process/SV/SomaticAnnotateSVBedpe.nf | 4 +- 4 files changed, 69 insertions(+), 48 deletions(-) create mode 100644 containers/iannotatesv/filter_regions_bedpe.py delete mode 100644 containers/iannotatesv/pair_to_bed_annot.py diff --git a/containers/iannotatesv/filter_regions_bedpe.py b/containers/iannotatesv/filter_regions_bedpe.py new file mode 100644 index 00000000..88d29f62 --- /dev/null +++ b/containers/iannotatesv/filter_regions_bedpe.py @@ -0,0 +1,59 @@ +import argparse, sys, os +import numpy as np +import pandas as pd +import pybedtools +from utils import * + +def usage(): + parser = argparse.ArgumentParser() + parser.add_argument('--blacklist-regions', dest="regions", help = 'regions bed/bedpe file', required = True) + parser.add_argument('--bedpe', help = 'bedpe file', required = True) + parser.add_argument('--tag', help = 'string to update FILTER column', required = True) + parser.add_argument('--output',dest="outfile",help = 'output file' , required = True) + parser.add_argument('--match-type',dest="type",help = 'both/either' , required = True) + parser.add_argument('--ignore-strand',dest="ignore_strand", default=False, action="store_true", help = 'Use flag to ignore strand when running bedtools. Default False' ) + return parser.parse_args() + +def main(): + args = usage() + + # determine validity of args.type + if args.type not in ["both","either"]: + sys.exit("--match-type must be both or either") + match_type = args.type + + # determine if regions file is bed or bedpe + filename=os.path.basename(args.regions) + is_bed = False + if filename.endswith("bed") or filename.endswith("bed.gz"): + is_bed = True + if filename.endswith("bedpe") or filename.endswith("bedpe.gz"): + pass + else: + sys.exit("--blacklist-regions must have .bed/.bed.gz/.bedpe/.bedpe.gz extension") + + # parse input files + [meta_header, bedpe_header_list, bedpe_df] = parse_svtools_bedpe_file(args.bedpe) + bedpe_bt = pybedtools.BedTool.from_dataframe(bedpe_df) + regions_bt = pybedtools.BedTool(args.regions) + + # run pair_to_bed if regions file is bed, pair_to_pair if regions file is bedpe + if is_bed: + intersect = run_pair_to_bed(bedpe_bt,regions_bt,match_type) + else: + intersect = run_pair_to_pair(bedpe_bt,regions_bt,match_type, ignore_strand=args.ignore_strand) + + # annotate bedpe based on matching IDs + intersect_df = bedtool_to_df(intersect,bedpe_header_list)[["ID"]].drop_duplicates() + intersect_df["FILTER_NEW"] = args.tag + filtered_bedpe_df = bedpe_df.merge(intersect_df, on="ID", how="left") + filtered_bedpe_df = filtered_bedpe_df.apply(lambda x: update_filter(x,x["FILTER_NEW"]) if not pd.isna(x["FILTER_NEW"]) else x, axis=1) + filtered_bed_df = filtered_bedpe_df.drop(['FILTER_NEW'], axis=1) + + # write result + with open(args.outfile, "w") as fw: + fw.write("".join(meta_header)) + filtered_bed_df.to_csv(args.outfile, header=True, index=False, sep="\t", mode="a") + +if __name__ == "__main__": + main() diff --git a/containers/iannotatesv/pair_to_bed_annot.py b/containers/iannotatesv/pair_to_bed_annot.py deleted file mode 100644 index 387150d1..00000000 --- a/containers/iannotatesv/pair_to_bed_annot.py +++ /dev/null @@ -1,46 +0,0 @@ -import argparse, sys -import numpy as np -import pandas as pd -import pybedtools -from utils import * - -def usage(): - parser = argparse.ArgumentParser() - parser.add_argument('--blacklist-regions', dest="regions", help = 'regions bed/bedpe file', required = True) - parser.add_argument('--bedpe', help = 'bedpe file', required = True) - parser.add_argument('--tag', help = 'string to update FILTER column', required = True) - parser.add_argument('--output',dest="outfile",help = 'output file' , required = True) - parser.add_argument('--match-type',dest="type",help = 'both/either' , required = True) - - return parser.parse_args() - -def main(): - args = usage() - - if args.type not in ["both","either"]: - sys.exit("--match-type must be both or either") - - [meta_header, bedpe_header_list, bedpe_df] = parse_svtools_bedpe_file(args.bedpe) - bedpe_bt = pybedtools.BedTool.from_dataframe(bedpe_df) - regions_bt = pybedtools.BedTool(args.regions) - intersect,outersect = filter_bedpe(bedpe_bt,regions_bt,args.type) - intersect_df = bedtool_to_df(intersect,bedpe_header_list) - intersect_df["FILTER"] = intersect_df["FILTER"].apply(lambda x: add_tag(x,args.tag)) - outersect_df = bedtool_to_df(outersect,bedpe_header_list) - filtered_bed_df = pd.concat([intersect_df,outersect_df]) - - with open(args.outfile, "w") as fw: - fw.write("".join(meta_header)) - filtered_bed_df.to_csv(args.outfile, header=True, index=False, sep="\t", mode="a") - -def filter_bedpe(bedpe_bt,regions_bt,match_type="either"): - outersect_match_type = "notboth" if match_type == "both" else "neither" - intersect = run_pair_to_bed(bedpe_bt,regions_bt,match_type) - outersect = run_pair_to_bed(bedpe_bt,regions_bt, outersect_match_type) - #intersect["FILTER"] = intersect["FILTER"].apply(lambda x: add_tag(x,tag_str)) - - return [intersect,outersect] - #return pd.concat([intersect,outersect]) - -if __name__ == "__main__": - main() diff --git a/containers/iannotatesv/utils.py b/containers/iannotatesv/utils.py index 2e9f84fb..f30ebc1b 100644 --- a/containers/iannotatesv/utils.py +++ b/containers/iannotatesv/utils.py @@ -27,6 +27,14 @@ def run_pair_to_bed(bedpe,bed,match_type): result = bedpe.pair_to_bed(bed, **{'type': match_type}) return result +def run_pair_to_pair(bedpe,filter_bedpe,match_type,ignore_strand=False): + """ + use pybedtools to run bedtools pairtopair + a length of 1 base must be artificially added to each end with a length of 0, otherwise no overlap + """ + result = bedpe.pair_to_pair(filter_bedpe, **{'type': match_type,'is':ignore_strand}) + return result + def bedtool_to_df(bt,header_list): try: diff --git a/modules/process/SV/SomaticAnnotateSVBedpe.nf b/modules/process/SV/SomaticAnnotateSVBedpe.nf index b8e0627e..96d46b27 100644 --- a/modules/process/SV/SomaticAnnotateSVBedpe.nf +++ b/modules/process/SV/SomaticAnnotateSVBedpe.nf @@ -19,14 +19,14 @@ process SomaticAnnotateSVBedpe { outputPrefix = "${idTumor}__${idNormal}" genome_ = ["GRCh37","smallGRCh37"].contains(genome) ? "hg19" : genome == "GRCh38" ? "hg38" : "hg18" """ - python ${custom_scripts}/pair_to_bed_annot.py \\ + python ${custom_scripts}/filter_regions_bedpe.py \\ --blacklist-regions ${mapabilityBlacklist} \\ --bedpe ${bedpein} \\ --tag mappability \\ --output ${outputPrefix}.combined.dac.bedpe \\ --match-type either - python ${custom_scripts}/pair_to_bed_annot.py \\ + python ${custom_scripts}/filter_regions_bedpe.py \\ --blacklist-regions ${repeatMasker} \\ --bedpe ${outputPrefix}.combined.dac.bedpe \\ --tag repeat_masker \\ From bd5f86e6c6a105272a39e2d295be9928ceedee59 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Thu, 23 Jun 2022 21:35:53 -0400 Subject: [PATCH 02/26] add pcawg blacklist filtering --- conf/references.config | 3 ++ containers/iannotatesv/pair_to_pair_annot.py | 55 ++++++++++++++++++++ modules/process/SV/SomaticAnnotateSVBedpe.nf | 35 +++++++++++-- modules/subworkflow/sv_wf.nf | 3 ++ 4 files changed, 91 insertions(+), 5 deletions(-) create mode 100644 containers/iannotatesv/pair_to_pair_annot.py diff --git a/conf/references.config b/conf/references.config index 699ce43c..e1b792cb 100644 --- a/conf/references.config +++ b/conf/references.config @@ -101,6 +101,9 @@ params { spliceSites = "${params.reference_base}/mskcc-igenomes/grch37/splice_sites/splice_sites.bed" brassRefDir = "${params.reference_base}/mskcc-igenomes/grch37/brass" vagrentRefDir = "${params.reference_base}/mskcc-igenomes/grch37/vagrent" + svBlacklistBed = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling//juno/work/taylorlab/cmopipeline/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bed" + svBlacklistBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling//juno/work/taylorlab/cmopipeline/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bedpe" + svBlacklistFoldbackBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling//juno/work/taylorlab/cmopipeline/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist_foldback_artefacts.bedpe" } 'GRCh38' { acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci" diff --git a/containers/iannotatesv/pair_to_pair_annot.py b/containers/iannotatesv/pair_to_pair_annot.py new file mode 100644 index 00000000..48c05d48 --- /dev/null +++ b/containers/iannotatesv/pair_to_pair_annot.py @@ -0,0 +1,55 @@ +import argparse, sys, os +import numpy as np +import pandas as pd +import pybedtools +from utils import * + +def usage(): + parser = argparse.ArgumentParser() + parser.add_argument('--blacklist-regions', dest="regions", help = 'regions bed/bedpe file', required = True) + parser.add_argument('--bedpe', help = 'bedpe file', required = True) + parser.add_argument('--tag', help = 'string to update FILTER column', required = True) + parser.add_argument('--output',dest="outfile",help = 'output file' , required = True) + parser.add_argument('--match-type',dest="type",help = 'both/either' , required = True) + parser.add_argument('--ignore-strand',dest="ignore_strand", default=False, action="store_true", help = 'Use flag to ignore strand when running bedtools. Default False' ) + return parser.parse_args() + + +def main(): + args = usage() + + if args.type not in ["both","either"]: + sys.exit("--match-type must be both or either") + + match_type = args.type + # determine if regions file is bed or bedpe + filename=os.path.basename(args.regions) + is_bed = False + if filename.endswith("bed") or filename.endswith("bed.gz"): + is_bed = True + if filename.endswith("bedpe") or filename.endswith("bedpe.gz"): + pass + else: + sys.exit("--blacklist-regions must have .bed/.bed.gz/.bedpe/.bedpe.gz extension") + + [meta_header, bedpe_header_list, bedpe_df] = parse_svtools_bedpe_file(args.bedpe) + bedpe_bt = pybedtools.BedTool.from_dataframe(bedpe_df) + regions_bt = pybedtools.BedTool(args.regions) + + if is_bed: + intersect = run_pair_to_bed(bedpe_bt,regions_bt,match_type) + else: + intersect = run_pair_to_pair(bedpe_bt,regions_bt,match_type, ignore_strand=args.ignore_strand) + + intersect_df = bedtool_to_df(intersect,bedpe_header_list)[["ID"]] + intersect_df["FILTER_NEW"] = args.tag + filtered_bedpe_df = bedpe_df.merge(intersect_df, on="ID", how="left") + filtered_bedpe_df = filtered_bedpe_df.apply(lambda x: update_filter(x,x["FILTER_NEW"]) if not pd.isna(x["FILTER_NEW"]) else x, axis=1) + filtered_bed_df = filtered_bedpe_df.drop(['FILTER_NEW'], axis=1) + + with open(args.outfile, "w") as fw: + fw.write("".join(meta_header)) + filtered_bed_df.to_csv(args.outfile, header=True, index=False, sep="\t", mode="a") + +if __name__ == "__main__": + main() diff --git a/modules/process/SV/SomaticAnnotateSVBedpe.nf b/modules/process/SV/SomaticAnnotateSVBedpe.nf index 96d46b27..f86c33cb 100644 --- a/modules/process/SV/SomaticAnnotateSVBedpe.nf +++ b/modules/process/SV/SomaticAnnotateSVBedpe.nf @@ -7,6 +7,9 @@ process SomaticAnnotateSVBedpe { tuple val(idTumor), val(idNormal), val(target), path(bedpein) path(repeatMasker) path(mapabilityBlacklist) + path(svBlacklistBed) + path(svBlacklistBedpe) + path(svBlacklistFoldbackBedpe) path(spliceSites) path(custom_scripts) val(genome) @@ -31,19 +34,41 @@ process SomaticAnnotateSVBedpe { --bedpe ${outputPrefix}.combined.dac.bedpe \\ --tag repeat_masker \\ --output ${outputPrefix}.combined.dac.rm.bedpe \\ - --match-type either + --match-type either + + python ${custom_scripts}/filter_regions_bedpe.py \\ + --blacklist-regions ${svBlacklistBed} \\ + --bedpe ${outputPrefix}.combined.dac.rm.bedpe \\ + --tag pcawg_blacklist_bed \\ + --output ${outputPrefix}.combined.dac.rm.pcawg.1.bedpe \\ + --match-type either + + python ${custom_scripts}/filter_regions_bedpe.py \\ + --blacklist-regions ${svBlacklistBedpe} \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.1.bedpe \\ + --tag pcawg_blacklist_bed \\ + --output ${outputPrefix}.combined.dac.rm.pcawg.2.bedpe \\ + --match-type both \\ + --ignore-strand + + python ${custom_scripts}/filter_regions_bedpe.py \\ + --blacklist-regions ${svBlacklistFoldbackBedpe} \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.2.bedpe \\ + --tag pcawg_blacklist_bed \\ + --output ${outputPrefix}.combined.dac.rm.pcawg.3.bedpe \\ + --match-type both python ${custom_scripts}/detect_cdna.py \\ --exon-junct ${spliceSites} \\ - --bedpe ${outputPrefix}.combined.dac.rm.bedpe \\ - --out-bedpe ${outputPrefix}.combined.dac.rm.cdna.bedpe \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.3.bedpe \\ + --out-bedpe ${outputPrefix}.combined.dac.rm.pcawg.cdna.bedpe \\ --out ${outputPrefix}.contamination.tsv python ${custom_scripts}/run_iannotatesv.py \\ - --bedpe ${outputPrefix}.combined.dac.rm.cdna.bedpe \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.cdna.bedpe \\ --genome ${genome_} - cp ${outputPrefix}.combined.dac.rm.cdna.iannotate.bedpe \\ + cp ${outputPrefix}.combined.dac.rm.pcawg.cdna.iannotate.bedpe \\ ${outputPrefix}.combined.annot.filtered.bedpe awk -F"\\t" '\$1 ~ /#/ || \$12 == "PASS"' \\ diff --git a/modules/subworkflow/sv_wf.nf b/modules/subworkflow/sv_wf.nf index 5cbdf138..5c5e7fba 100644 --- a/modules/subworkflow/sv_wf.nf +++ b/modules/subworkflow/sv_wf.nf @@ -75,6 +75,9 @@ workflow sv_wf SomaticSVVcf2Bedpe.out.SomaticCombinedUnfilteredBedpe, referenceMap.repeatMasker, referenceMap.mapabilityBlacklist, + referenceMap.svBlacklistBed, + referenceMap.svBlacklistBedpe, + referenceMap.svBlacklistFoldbackBedpe, referenceMap.spliceSites, workflow.projectDir + "/containers/iannotatesv", params.genome From 928ef4e9f35c765e9e681be7f00b3031d8db96da Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Thu, 23 Jun 2022 22:39:08 -0400 Subject: [PATCH 03/26] added reference definition for pcawg blacklist files for loading them into referenceMap --- modules/function/define_maps.nf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/function/define_maps.nf b/modules/function/define_maps.nf index 2248ba2d..2ab19dac 100644 --- a/modules/function/define_maps.nf +++ b/modules/function/define_maps.nf @@ -62,6 +62,9 @@ def defineReferenceMap() { if (! workflow.profile.startsWith("test") ){ result_array << ['brassRefDir' : checkParamReturnFile('brassRefDir')] result_array << ['vagrentRefDir' : checkParamReturnFile('vagrentRefDir')] + result_array << ['svBlacklistBed' : checkParamReturnFile('svBlacklistBed')] + result_array << ['svBlacklistBedpe' : checkParamReturnFile('svBlacklistBedpe')] + result_array << ['svBlacklistFoldbackBedpe' : checkParamReturnFile('svBlacklistFoldbackBedpe')] } return result_array From 888a9041242f34213bfb57767bf8a04c5f7308d5 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Thu, 23 Jun 2022 22:43:50 -0400 Subject: [PATCH 04/26] fixed filepaths for pcawg SV blacklist files --- conf/references.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/references.config b/conf/references.config index e1b792cb..453c5ed2 100644 --- a/conf/references.config +++ b/conf/references.config @@ -101,9 +101,9 @@ params { spliceSites = "${params.reference_base}/mskcc-igenomes/grch37/splice_sites/splice_sites.bed" brassRefDir = "${params.reference_base}/mskcc-igenomes/grch37/brass" vagrentRefDir = "${params.reference_base}/mskcc-igenomes/grch37/vagrent" - svBlacklistBed = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling//juno/work/taylorlab/cmopipeline/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bed" - svBlacklistBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling//juno/work/taylorlab/cmopipeline/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bedpe" - svBlacklistFoldbackBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling//juno/work/taylorlab/cmopipeline/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist_foldback_artefacts.bedpe" + svBlacklistBed = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bed" + svBlacklistBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bedpe" + svBlacklistFoldbackBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist_foldback_artefacts.bedpe" } 'GRCh38' { acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci" From 188f64205d6790d6753104661f49f0ea79570c53 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Mon, 27 Jun 2022 18:49:28 -0400 Subject: [PATCH 05/26] bugfix of file name for germline annotation --- modules/process/GermSV/GermlineAnnotateSVBedpe.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/process/GermSV/GermlineAnnotateSVBedpe.nf b/modules/process/GermSV/GermlineAnnotateSVBedpe.nf index 11a320cc..00a2c53f 100644 --- a/modules/process/GermSV/GermlineAnnotateSVBedpe.nf +++ b/modules/process/GermSV/GermlineAnnotateSVBedpe.nf @@ -19,14 +19,14 @@ process GermlineAnnotateSVBedpe { outputPrefix = "${idNormal}" genome_ = ["GRCh37","smallGRCh37"].contains(genome) ? "hg19" : genome == "GRCh38" ? "hg38" : "hg18" """ - python ${custom_scripts}/pair_to_bed_annot.py \\ + python ${custom_scripts}/filter_regions_bedpe.py \\ --blacklist-regions ${mapabilityBlacklist} \\ --bedpe ${bedpein} \\ --tag mappability \\ --output ${outputPrefix}.combined.dac.bedpe \\ --match-type either - python ${custom_scripts}/pair_to_bed_annot.py \\ + python ${custom_scripts}/filter_regions_bedpe.py \\ --blacklist-regions ${repeatMasker} \\ --bedpe ${outputPrefix}.combined.dac.bedpe \\ --tag repeat_masker \\ From 2a2a9ee0f401017f7a741a0ccb0d67fbcf452b8f Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Mon, 27 Jun 2022 18:49:58 -0400 Subject: [PATCH 06/26] reorganized filter_regions_bedpe.py to be more modularized --- .../iannotatesv/filter_regions_bedpe.py | 102 +++++++++++++----- containers/iannotatesv/utils.py | 7 ++ 2 files changed, 81 insertions(+), 28 deletions(-) diff --git a/containers/iannotatesv/filter_regions_bedpe.py b/containers/iannotatesv/filter_regions_bedpe.py index 88d29f62..86021eef 100644 --- a/containers/iannotatesv/filter_regions_bedpe.py +++ b/containers/iannotatesv/filter_regions_bedpe.py @@ -10,50 +10,96 @@ def usage(): parser.add_argument('--bedpe', help = 'bedpe file', required = True) parser.add_argument('--tag', help = 'string to update FILTER column', required = True) parser.add_argument('--output',dest="outfile",help = 'output file' , required = True) - parser.add_argument('--match-type',dest="type",help = 'both/either' , required = True) + parser.add_argument('--match-type',dest="type",help = 'both/either/notboth/neither' , required = True) parser.add_argument('--ignore-strand',dest="ignore_strand", default=False, action="store_true", help = 'Use flag to ignore strand when running bedtools. Default False' ) return parser.parse_args() +def determine_regions_filetype(filepath): + """ + Determine if the regions file is bed or bedpe, and raise error if neither + """ + if not any([filepath.endswith(i) for i in "bed|bedpe|bed.gz|bedpe.gz".split("|")]): + raise ValueError("--bedpe requires bed, bedpe, bed.gz or bedpe.gz file.") + elif any([filepath.endswith(i) for i in "bed|bed.gz".split("|")]): + return True + else: return False + +def validate_bedpe_input(filepath): + """ + Raise error if filetype is not bedpe + """ + if not any([filepath.endswith(i) for i in "bedpe|bedpe.gz".split("|")]): + raise ValueError("--bedpe requires bedpe or bedpe.gz file.") + +def validate_match_type(type): + """ + Raise error if type is not acceptable for use in pybedtools pair_to_pair or pair_to_bed + """ + if type not in overlap_type.keys(): + raise ValueError( "--match-type must be {}.".format(" or ".join(overlap_type.keys())) ) + def main(): args = usage() - # determine validity of args.type - if args.type not in ["both","either"]: - sys.exit("--match-type must be both or either") - match_type = args.type - - # determine if regions file is bed or bedpe - filename=os.path.basename(args.regions) - is_bed = False - if filename.endswith("bed") or filename.endswith("bed.gz"): - is_bed = True - if filename.endswith("bedpe") or filename.endswith("bedpe.gz"): - pass - else: - sys.exit("--blacklist-regions must have .bed/.bed.gz/.bedpe/.bedpe.gz extension") + # parse inputs + try: + validate_bedpe_input(args.bedpe) + [meta_header, bedpe_header_list, bedpe_df] = parse_svtools_bedpe_file(args.bedpe) + except Exception as e: + print(e) + sys.exit("Unable to parse --bedpe") + + try: + is_bed = determine_regions_filetype(args.regions) + regions_bt = pybedtools.BedTool(args.regions) + except Exception as e: + print(e) + sys.exit("Unable to parse --blacklist-regions") + + # annotate bedpe + try: + bedpe_bt = pybedtools.BedTool.from_dataframe(bedpe_df) + ids_df = find_overlapped_ids(bedpe_bt, regions_bt, args.type, args.tag, args.ignore_strand, is_bed, bedpe_header_list) + bedpe_df = add_filter_by_id(bedpe_df,ids_df,args.tag) + except Exception as e: + print(e) + sys.exit() + + # write result + #bedpe_df = bedtool_to_df(bedpe_bt,bedpe_header_list) + with open(args.outfile, "w") as fw: + fw.write("".join(meta_header)) + #filtered_bed_df.to_csv(args.outfile, header=True, index=False, sep="\t", mode="a") + bedpe_df.to_csv(args.outfile, header=True, index=False, sep="\t", mode="a") - # parse input files - [meta_header, bedpe_header_list, bedpe_df] = parse_svtools_bedpe_file(args.bedpe) - bedpe_bt = pybedtools.BedTool.from_dataframe(bedpe_df) - regions_bt = pybedtools.BedTool(args.regions) +def find_overlapped_ids(bedpe_bt,regions_bt,match_type,tag,ignore_strand,is_bed,bedpe_header_list): + # determine validity of match_type + validate_match_type(match_type) # run pair_to_bed if regions file is bed, pair_to_pair if regions file is bedpe if is_bed: intersect = run_pair_to_bed(bedpe_bt,regions_bt,match_type) else: - intersect = run_pair_to_pair(bedpe_bt,regions_bt,match_type, ignore_strand=args.ignore_strand) + intersect = run_pair_to_pair(bedpe_bt,regions_bt,match_type, ignore_strand=ignore_strand) + + # extract ids + try: + ids_df = intersect.to_dataframe(header=None)[6].drop_duplicates() + except: + ids_df = pd.DataFrame(columns=["name"]) + return ids_df +def add_filter_by_id(bedpe_df,ids_df,tag): # annotate bedpe based on matching IDs - intersect_df = bedtool_to_df(intersect,bedpe_header_list)[["ID"]].drop_duplicates() - intersect_df["FILTER_NEW"] = args.tag - filtered_bedpe_df = bedpe_df.merge(intersect_df, on="ID", how="left") + # intersect_df = bedtool_to_df(intersect,bedpe_header_list)[["ID"]].drop_duplicates() + # intersect_df["FILTER_NEW"] = tag + ids_df.columns = ["ID"] + ids_df["FILTER_NEW"] = tag + filtered_bedpe_df = bedpe_df.merge(ids_df, on="ID", how="left") filtered_bedpe_df = filtered_bedpe_df.apply(lambda x: update_filter(x,x["FILTER_NEW"]) if not pd.isna(x["FILTER_NEW"]) else x, axis=1) - filtered_bed_df = filtered_bedpe_df.drop(['FILTER_NEW'], axis=1) + filtered_bedpe_df = filtered_bedpe_df.drop(['FILTER_NEW'], axis=1) - # write result - with open(args.outfile, "w") as fw: - fw.write("".join(meta_header)) - filtered_bed_df.to_csv(args.outfile, header=True, index=False, sep="\t", mode="a") + return filtered_bedpe_dfs if __name__ == "__main__": main() diff --git a/containers/iannotatesv/utils.py b/containers/iannotatesv/utils.py index f30ebc1b..c24da536 100644 --- a/containers/iannotatesv/utils.py +++ b/containers/iannotatesv/utils.py @@ -3,6 +3,13 @@ import pandas as pd import pybedtools + +overlap_type = {"both":"notboth", + "notboth":"both", + "either":"neither", + "neither":"either" + } + def add_tag(val,tag ): """ update FILTER value From 2fbc02bc768833646865f5e61de6cd80cff1c33d Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Mon, 27 Jun 2022 18:52:30 -0400 Subject: [PATCH 07/26] cleanup filter_regions_bedpe.py --- containers/iannotatesv/filter_regions_bedpe.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/containers/iannotatesv/filter_regions_bedpe.py b/containers/iannotatesv/filter_regions_bedpe.py index 86021eef..5a70dadc 100644 --- a/containers/iannotatesv/filter_regions_bedpe.py +++ b/containers/iannotatesv/filter_regions_bedpe.py @@ -39,6 +39,9 @@ def validate_match_type(type): raise ValueError( "--match-type must be {}.".format(" or ".join(overlap_type.keys())) ) def main(): + """ + validate inputs, read input beds/bedpes as + """ args = usage() # parse inputs @@ -59,20 +62,18 @@ def main(): # annotate bedpe try: bedpe_bt = pybedtools.BedTool.from_dataframe(bedpe_df) - ids_df = find_overlapped_ids(bedpe_bt, regions_bt, args.type, args.tag, args.ignore_strand, is_bed, bedpe_header_list) + ids_df = find_overlapped_ids(bedpe_bt, regions_bt, args.type, args.ignore_strand, is_bed) bedpe_df = add_filter_by_id(bedpe_df,ids_df,args.tag) except Exception as e: print(e) sys.exit() # write result - #bedpe_df = bedtool_to_df(bedpe_bt,bedpe_header_list) with open(args.outfile, "w") as fw: fw.write("".join(meta_header)) - #filtered_bed_df.to_csv(args.outfile, header=True, index=False, sep="\t", mode="a") bedpe_df.to_csv(args.outfile, header=True, index=False, sep="\t", mode="a") -def find_overlapped_ids(bedpe_bt,regions_bt,match_type,tag,ignore_strand,is_bed,bedpe_header_list): +def find_overlapped_ids(bedpe_bt,regions_bt,match_type,ignore_strand,is_bed): # determine validity of match_type validate_match_type(match_type) @@ -91,8 +92,6 @@ def find_overlapped_ids(bedpe_bt,regions_bt,match_type,tag,ignore_strand,is_bed, def add_filter_by_id(bedpe_df,ids_df,tag): # annotate bedpe based on matching IDs - # intersect_df = bedtool_to_df(intersect,bedpe_header_list)[["ID"]].drop_duplicates() - # intersect_df["FILTER_NEW"] = tag ids_df.columns = ["ID"] ids_df["FILTER_NEW"] = tag filtered_bedpe_df = bedpe_df.merge(ids_df, on="ID", how="left") From 5bf1be36f703513cb90887c03386665a101f1075 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Mon, 27 Jun 2022 18:56:37 -0400 Subject: [PATCH 08/26] add notes --- containers/iannotatesv/filter_regions_bedpe.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/containers/iannotatesv/filter_regions_bedpe.py b/containers/iannotatesv/filter_regions_bedpe.py index 5a70dadc..52bd07bb 100644 --- a/containers/iannotatesv/filter_regions_bedpe.py +++ b/containers/iannotatesv/filter_regions_bedpe.py @@ -40,7 +40,10 @@ def validate_match_type(type): def main(): """ - validate inputs, read input beds/bedpes as + 1. validate inputs + 2. read input beds/bedpes as pybedtools.BedTool objects + 3. extract variant IDs from overlapping regions + 4. update the FILTER tag for identified variants """ args = usage() From 328968cbfd4b444a18ca6c85b6313bd85ad2dabe Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Tue, 28 Jun 2022 17:05:40 -0400 Subject: [PATCH 09/26] fix filtering tags for sv annotation --- modules/process/SV/SomaticAnnotateSVBedpe.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/process/SV/SomaticAnnotateSVBedpe.nf b/modules/process/SV/SomaticAnnotateSVBedpe.nf index f86c33cb..c5699d17 100644 --- a/modules/process/SV/SomaticAnnotateSVBedpe.nf +++ b/modules/process/SV/SomaticAnnotateSVBedpe.nf @@ -46,7 +46,7 @@ process SomaticAnnotateSVBedpe { python ${custom_scripts}/filter_regions_bedpe.py \\ --blacklist-regions ${svBlacklistBedpe} \\ --bedpe ${outputPrefix}.combined.dac.rm.pcawg.1.bedpe \\ - --tag pcawg_blacklist_bed \\ + --tag pcawg_blacklist_bedpe \\ --output ${outputPrefix}.combined.dac.rm.pcawg.2.bedpe \\ --match-type both \\ --ignore-strand @@ -54,7 +54,7 @@ process SomaticAnnotateSVBedpe { python ${custom_scripts}/filter_regions_bedpe.py \\ --blacklist-regions ${svBlacklistFoldbackBedpe} \\ --bedpe ${outputPrefix}.combined.dac.rm.pcawg.2.bedpe \\ - --tag pcawg_blacklist_bed \\ + --tag pcawg_blacklist_fb_bedpe \\ --output ${outputPrefix}.combined.dac.rm.pcawg.3.bedpe \\ --match-type both From 981bce0f35a155aa9452a5fefb1570aa2752c32f Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Wed, 29 Jun 2022 13:13:33 -0400 Subject: [PATCH 10/26] added pcawg blacklist filtering to Germline SV annotation, and fixed publishing of files --- .../process/GermSV/GermlineAnnotateSVBedpe.nf | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/modules/process/GermSV/GermlineAnnotateSVBedpe.nf b/modules/process/GermSV/GermlineAnnotateSVBedpe.nf index 00a2c53f..4d897c9a 100644 --- a/modules/process/GermSV/GermlineAnnotateSVBedpe.nf +++ b/modules/process/GermSV/GermlineAnnotateSVBedpe.nf @@ -1,7 +1,7 @@ process GermlineAnnotateSVBedpe { tag "${idNormal}" - publishDir "${params.outDir}/germline/${outputPrefix}/combined_svs", mode: params.publishDirMode, pattern: "*.combined.filtered.bedpe" + publishDir "${params.outDir}/germline/${outputPrefix}/combined_svs", mode: params.publishDirMode, pattern: "*.combined.annot.filtered*.bedpe" input: tuple val(idNormal), val(target), path(bedpein) @@ -32,18 +32,40 @@ process GermlineAnnotateSVBedpe { --tag repeat_masker \\ --output ${outputPrefix}.combined.dac.rm.bedpe \\ --match-type either + + python ${custom_scripts}/filter_regions_bedpe.py \\ + --blacklist-regions ${svBlacklistBed} \\ + --bedpe ${outputPrefix}.combined.dac.rm.bedpe \\ + --tag pcawg_blacklist_bed \\ + --output ${outputPrefix}.combined.dac.rm.pcawg.1.bedpe \\ + --match-type either + + python ${custom_scripts}/filter_regions_bedpe.py \\ + --blacklist-regions ${svBlacklistBedpe} \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.1.bedpe \\ + --tag pcawg_blacklist_bedpe \\ + --output ${outputPrefix}.combined.dac.rm.pcawg.2.bedpe \\ + --match-type both \\ + --ignore-strand + + python ${custom_scripts}/filter_regions_bedpe.py \\ + --blacklist-regions ${svBlacklistFoldbackBedpe} \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.2.bedpe \\ + --tag pcawg_blacklist_fb_bedpe \\ + --output ${outputPrefix}.combined.dac.rm.pcawg.3.bedpe \\ + --match-type both python ${custom_scripts}/detect_cdna.py \\ --exon-junct ${spliceSites} \\ - --bedpe ${outputPrefix}.combined.dac.rm.bedpe \\ - --out-bedpe ${outputPrefix}.combined.dac.rm.cdna.bedpe \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.3.bedpe \\ + --out-bedpe ${outputPrefix}.combined.dac.rm.pcawg.cdna.bedpe \\ --out ${outputPrefix}.contamination.tsv python ${custom_scripts}/run_iannotatesv.py \\ - --bedpe ${outputPrefix}.combined.dac.rm.cdna.bedpe \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.cdna.bedpe \\ --genome ${genome_} - cp ${outputPrefix}.combined.dac.rm.cdna.iannotate.bedpe \\ + cp ${outputPrefix}.combined.dac.rm.pcawg.cdna.iannotate.bedpe \\ ${outputPrefix}.combined.annot.filtered.bedpe awk -F"\\t" '\$1 ~ /#/ || \$12 == "PASS"' \\ From da204262a96aff55036f2a1c863d16118dff4558 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Wed, 29 Jun 2022 13:13:45 -0400 Subject: [PATCH 11/26] bugfixes --- containers/iannotatesv/filter_regions_bedpe.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/containers/iannotatesv/filter_regions_bedpe.py b/containers/iannotatesv/filter_regions_bedpe.py index 52bd07bb..a197d97b 100644 --- a/containers/iannotatesv/filter_regions_bedpe.py +++ b/containers/iannotatesv/filter_regions_bedpe.py @@ -69,7 +69,7 @@ def main(): bedpe_df = add_filter_by_id(bedpe_df,ids_df,args.tag) except Exception as e: print(e) - sys.exit() + sys.exit("Filtering failed with inputs {} and {}".format(args.bedpe, args.regions)) # write result with open(args.outfile, "w") as fw: @@ -88,9 +88,9 @@ def find_overlapped_ids(bedpe_bt,regions_bt,match_type,ignore_strand,is_bed): # extract ids try: - ids_df = intersect.to_dataframe(header=None)[6].drop_duplicates() + ids_df = pd.DataFrame({"ID":list( intersect.to_dataframe(header=None)[6].drop_duplicates() )}) except: - ids_df = pd.DataFrame(columns=["name"]) + ids_df = pd.DataFrame(columns=["ID"]) return ids_df def add_filter_by_id(bedpe_df,ids_df,tag): @@ -101,7 +101,7 @@ def add_filter_by_id(bedpe_df,ids_df,tag): filtered_bedpe_df = filtered_bedpe_df.apply(lambda x: update_filter(x,x["FILTER_NEW"]) if not pd.isna(x["FILTER_NEW"]) else x, axis=1) filtered_bedpe_df = filtered_bedpe_df.drop(['FILTER_NEW'], axis=1) - return filtered_bedpe_dfs + return filtered_bedpe_df if __name__ == "__main__": main() From feea8a37b90df7d1c389e3c122b7ce4d2ed6b05c Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Thu, 30 Jun 2022 13:58:25 -0400 Subject: [PATCH 12/26] bugfix for GermlineAnnotateSVBedpe inputs --- modules/process/GermSV/GermlineAnnotateSVBedpe.nf | 3 +++ modules/subworkflow/germlineSV_wf.nf | 3 +++ 2 files changed, 6 insertions(+) diff --git a/modules/process/GermSV/GermlineAnnotateSVBedpe.nf b/modules/process/GermSV/GermlineAnnotateSVBedpe.nf index 4d897c9a..05c78a81 100644 --- a/modules/process/GermSV/GermlineAnnotateSVBedpe.nf +++ b/modules/process/GermSV/GermlineAnnotateSVBedpe.nf @@ -7,6 +7,9 @@ process GermlineAnnotateSVBedpe { tuple val(idNormal), val(target), path(bedpein) path(repeatMasker) path(mapabilityBlacklist) + path(svBlacklistBed) + path(svBlacklistBedpe) + path(svBlacklistFoldbackBedpe) path(spliceSites) path(custom_scripts) val(genome) diff --git a/modules/subworkflow/germlineSV_wf.nf b/modules/subworkflow/germlineSV_wf.nf index 3ae67067..a184b670 100644 --- a/modules/subworkflow/germlineSV_wf.nf +++ b/modules/subworkflow/germlineSV_wf.nf @@ -43,6 +43,9 @@ workflow germlineSV_wf GermlineSVVcf2Bedpe.out.GermlineCombinedUnfilteredBedpe, referenceMap.repeatMasker, referenceMap.mapabilityBlacklist, + referenceMap.svBlacklistBed, + referenceMap.svBlacklistBedpe, + referenceMap.svBlacklistFoldbackBedpe, referenceMap.spliceSites, workflow.projectDir + "/containers/iannotatesv", params.genome From ecca378285f22688e1b583a0871980b28e3be14d Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Thu, 30 Jun 2022 14:21:24 -0400 Subject: [PATCH 13/26] add reference for trasposable element filtering in SV --- conf/references.config | 1 + modules/function/define_maps.nf | 1 + modules/process/GermSV/GermlineAnnotateSVBedpe.nf | 10 +++++++++- modules/process/SV/SomaticAnnotateSVBedpe.nf | 10 +++++++++- modules/subworkflow/germlineSV_wf.nf | 1 + modules/subworkflow/sv_wf.nf | 1 + 6 files changed, 22 insertions(+), 2 deletions(-) diff --git a/conf/references.config b/conf/references.config index 453c5ed2..44efbadc 100644 --- a/conf/references.config +++ b/conf/references.config @@ -104,6 +104,7 @@ params { svBlacklistBed = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bed" svBlacklistBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bedpe" svBlacklistFoldbackBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist_foldback_artefacts.bedpe" + svBlacklistTEBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist_TE_pseudogene.bedpe" } 'GRCh38' { acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci" diff --git a/modules/function/define_maps.nf b/modules/function/define_maps.nf index 2ab19dac..b00762f9 100644 --- a/modules/function/define_maps.nf +++ b/modules/function/define_maps.nf @@ -65,6 +65,7 @@ def defineReferenceMap() { result_array << ['svBlacklistBed' : checkParamReturnFile('svBlacklistBed')] result_array << ['svBlacklistBedpe' : checkParamReturnFile('svBlacklistBedpe')] result_array << ['svBlacklistFoldbackBedpe' : checkParamReturnFile('svBlacklistFoldbackBedpe')] + result_array << ['svBlacklistTEBedpe' : checkParamReturnFile('svBlacklistTEBedpe')] } return result_array diff --git a/modules/process/GermSV/GermlineAnnotateSVBedpe.nf b/modules/process/GermSV/GermlineAnnotateSVBedpe.nf index 05c78a81..eb3898fb 100644 --- a/modules/process/GermSV/GermlineAnnotateSVBedpe.nf +++ b/modules/process/GermSV/GermlineAnnotateSVBedpe.nf @@ -10,6 +10,7 @@ process GermlineAnnotateSVBedpe { path(svBlacklistBed) path(svBlacklistBedpe) path(svBlacklistFoldbackBedpe) + path(svBlacklistTEBedpe) path(spliceSites) path(custom_scripts) val(genome) @@ -58,9 +59,16 @@ process GermlineAnnotateSVBedpe { --output ${outputPrefix}.combined.dac.rm.pcawg.3.bedpe \\ --match-type both + python ${custom_scripts}/filter_regions_bedpe.py \\ + --blacklist-regions ${svBlacklistTEBedpe} \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.3.bedpe \\ + --tag pcawg_blacklist_te_bedpe \\ + --output ${outputPrefix}.combined.dac.rm.pcawg.4.bedpe \\ + --match-type either + python ${custom_scripts}/detect_cdna.py \\ --exon-junct ${spliceSites} \\ - --bedpe ${outputPrefix}.combined.dac.rm.pcawg.3.bedpe \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.4.bedpe \\ --out-bedpe ${outputPrefix}.combined.dac.rm.pcawg.cdna.bedpe \\ --out ${outputPrefix}.contamination.tsv diff --git a/modules/process/SV/SomaticAnnotateSVBedpe.nf b/modules/process/SV/SomaticAnnotateSVBedpe.nf index c5699d17..15263d0a 100644 --- a/modules/process/SV/SomaticAnnotateSVBedpe.nf +++ b/modules/process/SV/SomaticAnnotateSVBedpe.nf @@ -10,6 +10,7 @@ process SomaticAnnotateSVBedpe { path(svBlacklistBed) path(svBlacklistBedpe) path(svBlacklistFoldbackBedpe) + path(svBlacklistTEBedpe) path(spliceSites) path(custom_scripts) val(genome) @@ -58,9 +59,16 @@ process SomaticAnnotateSVBedpe { --output ${outputPrefix}.combined.dac.rm.pcawg.3.bedpe \\ --match-type both + python ${custom_scripts}/filter_regions_bedpe.py \\ + --blacklist-regions ${svBlacklistTEBedpe} \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.3.bedpe \\ + --tag pcawg_blacklist_te_bedpe \\ + --output ${outputPrefix}.combined.dac.rm.pcawg.4.bedpe \\ + --match-type either + python ${custom_scripts}/detect_cdna.py \\ --exon-junct ${spliceSites} \\ - --bedpe ${outputPrefix}.combined.dac.rm.pcawg.3.bedpe \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.4.bedpe \\ --out-bedpe ${outputPrefix}.combined.dac.rm.pcawg.cdna.bedpe \\ --out ${outputPrefix}.contamination.tsv diff --git a/modules/subworkflow/germlineSV_wf.nf b/modules/subworkflow/germlineSV_wf.nf index a184b670..5a8c0a11 100644 --- a/modules/subworkflow/germlineSV_wf.nf +++ b/modules/subworkflow/germlineSV_wf.nf @@ -46,6 +46,7 @@ workflow germlineSV_wf referenceMap.svBlacklistBed, referenceMap.svBlacklistBedpe, referenceMap.svBlacklistFoldbackBedpe, + referenceMap.svBlacklistTEBedpe, referenceMap.spliceSites, workflow.projectDir + "/containers/iannotatesv", params.genome diff --git a/modules/subworkflow/sv_wf.nf b/modules/subworkflow/sv_wf.nf index 5c5e7fba..0ba87197 100644 --- a/modules/subworkflow/sv_wf.nf +++ b/modules/subworkflow/sv_wf.nf @@ -78,6 +78,7 @@ workflow sv_wf referenceMap.svBlacklistBed, referenceMap.svBlacklistBedpe, referenceMap.svBlacklistFoldbackBedpe, + referenceMap.svBlacklistTEBedpe, referenceMap.spliceSites, workflow.projectDir + "/containers/iannotatesv", params.genome From ddaff0eea282a8e23eb918625d264ea697d9ea69 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Fri, 1 Jul 2022 22:42:59 -0400 Subject: [PATCH 14/26] added print statements for logging purposes, and made it so iannotatesv processes only 1000 rows at a time --- containers/iannotatesv/detect_cdna.py | 2 + .../iannotatesv/filter_regions_bedpe.py | 2 + containers/iannotatesv/run_iannotatesv.py | 44 ++++++++---------- containers/iannotatesv/utils.py | 46 +++++++++++++------ 4 files changed, 55 insertions(+), 39 deletions(-) diff --git a/containers/iannotatesv/detect_cdna.py b/containers/iannotatesv/detect_cdna.py index 2da81d79..62f87f63 100644 --- a/containers/iannotatesv/detect_cdna.py +++ b/containers/iannotatesv/detect_cdna.py @@ -51,6 +51,8 @@ def prep_intersection(intersect=pd.DataFrame(),bedpe_header_list=[]): def main(): args = usage() + print("Detecting possible cdna contamination in {}".format(os.path.basename(args.bedpe))) + [meta_header, bedpe_header_list, bedpe_df] = parse_svtools_bedpe_file(args.bedpe) bedpe_bt = pybedtools.BedTool.from_dataframe(bedpe_df) diff --git a/containers/iannotatesv/filter_regions_bedpe.py b/containers/iannotatesv/filter_regions_bedpe.py index a197d97b..258e9814 100644 --- a/containers/iannotatesv/filter_regions_bedpe.py +++ b/containers/iannotatesv/filter_regions_bedpe.py @@ -47,6 +47,8 @@ def main(): """ args = usage() + print("Filtering {} with {} regions".format(os.path.basename(args.bedpe), args.tag)) + # parse inputs try: validate_bedpe_input(args.bedpe) diff --git a/containers/iannotatesv/run_iannotatesv.py b/containers/iannotatesv/run_iannotatesv.py index e45c00c4..2147340c 100644 --- a/containers/iannotatesv/run_iannotatesv.py +++ b/containers/iannotatesv/run_iannotatesv.py @@ -1,6 +1,7 @@ import argparse, sys, os, subprocess import numpy as np import pandas as pd +from utils import * def usage(): parser = argparse.ArgumentParser() @@ -12,39 +13,32 @@ def usage(): def main(): args = usage() - meta="" - with open(args.bedpe, 'r') as f: - main_data = False - while main_data == False: - x = f.readline() - if x.startswith("##"): - meta += x - else: - header=x - main_data = True - try: - data = pd.read_csv(f, header=None, sep="\t" ) - data.columns = header.strip().split("\t") - except: - data = pd.DataFrame(columns = header.strip().split("\t")) + print("Running iAnnotateSV on {}".format(os.path.basename(args.bedpe))) + data = parse_svtools_bedpe_file(args.bedpe) iAnnotate_input = data["#CHROM_A|START_A|STRAND_A|CHROM_B|START_B|STRAND_B|ID".split("|")] iAnnotate_input.columns = "chr1|pos1|str1|chr2|pos2|str2|ID".split("|") iAnnotate_input = iAnnotate_input.replace("-", 1).replace("+", 0) - iAnnotate_input.to_csv("input.txt",sep="\t",header=True,index=False) - - if not os.path.isdir("outputdir"): - os.mkdir("outputdir") - run_args = "python /usr/bin/iAnnotateSV/iAnnotateSV/iAnnotateSV.py -i input.txt -ofp outputfilePrefix -o outputdir -r {} -d 3000""".format(args.genome).split(" ") - p = subprocess.Popen(run_args) - p.communicate() - - iAnnotate_output = pd.read_csv("outputdir/outputfilePrefix_Annotated.txt", sep="\t",header=0) - iAnnotate_output = pd.merge(iAnnotate_input,iAnnotate_output, on="chr1|pos1|str1|chr2|pos2|str2".split("|"), how="inner") + n_chunks = int((999+iAnnotate_input.shape[0])/1000) + iAnnotate_output = pd.DataFrame() + + for i in range(n_chunks): + iAnnotate_input_chunk = iAnnotate_input.iloc[(i-1)*1000:i*1000] + input_file = "input_" + str(i) + ".txt" + output_dir = "output_" + str(i) + iAnnotate_input_chunk.to_csv(input_file,sep="\t",header=True,index=False) + if not os.path.isdir(output_dir): os.mkdir(output_dir) + run_args = "python /usr/bin/iAnnotateSV/iAnnotateSV/iAnnotateSV.py -i {} -ofp outputfilePrefix -o {} -r {} -d 3000""".format(input_file, output_dir, args.genome).split(" ") + p = subprocess.Popen(run_args) + p.communicate() + iAnnotate_output_chunk = pd.read_csv(os.path.join(output_dir,"outputfilePrefix_Annotated.txt"), sep="\t",header=0) + iAnnotate_output_chunk = pd.merge(iAnnotate_input_chunk,iAnnotate_output_chunk, on="chr1|pos1|str1|chr2|pos2|str2".split("|"), how="inner") + iAnnotate_output = pd.concat([iAnnotate_output,iAnnotate_output_chunk]) annot_data = iAnnotate_output.drop("chr1|pos1|str1|chr2|pos2|str2".split("|"), axis=1) + final_data = pd.merge(data, annot_data, on="ID",how="left") final_data = pd.merge(data, annot_data, on="ID",how="left") diff --git a/containers/iannotatesv/utils.py b/containers/iannotatesv/utils.py index c24da536..c6d257d5 100644 --- a/containers/iannotatesv/utils.py +++ b/containers/iannotatesv/utils.py @@ -62,27 +62,45 @@ def bedtool_to_df(bt,header_list): print("Unable to apply formatting to bedpe columns using pandas, when converting pybedtool.Bedtool to pandas.DataFrame. Skipping...") return df.drop_duplicates() -def parse_svtools_bedpe_file(bedpe): - with open(bedpe, 'r') as f: - x = f.readlines() - meta_header = [x[y] for y in range(len(x)) if x[y].startswith("##")] - header_idx = [y for y in range(len(x)) if x[y].startswith("#CHROM")][0] - bedpe_header_list = x[header_idx].rstrip().split("\t") - bedpe_df = pd.read_csv(bedpe, skiprows=header_idx,header=0, sep="\t") +def parse_svtools_bedpe_file(bedpe, offset = True): + """ + Read bedpe file + 1. Separate components meta-data, header line, and records (main data) + 2. Coerce chromosome values to string, and position coordinates to integer + 3. if offset set to True, add +1 to END_* coordinates. This may be necessary for certain pybedtools operations. + """ + meta_header="" + with open(args.bedpe, 'r') as f: + main_data = False + while main_data == False: + x = f.readline() + if x.startswith("##"): + meta_header += x + else: + header = x + header_list = header.strip().split("\t") + main_data = True + try: + records_df = pd.read_csv(f, header=None, sep="\t" ) + records_df.columns = header_list + except: + records_df = pd.DataFrame(columns = header_list) + try: - bedpe_df = bedpe_df.astype({i:int for i in "START_A|END_A|START_B|END_B".split("|")}) + records_df = records_df.astype({i:int for i in "START_A|END_A|START_B|END_B".split("|")}) for j in ["#CHROM_A","CHROM_B"]: - if bedpe_df[j].dtype == 'float64': - bedpe_df = bedpe_df.astype({j:int}).astype({j:str}) + if records_df[j].dtype == 'float64': + records_df = records_df.astype({j:int}).astype({j:str}) except Exception as e: print(e) print("Unable to apply formatting to bedpe columns in pandas. Skipping...") try: - if bedpe_df.shape[0] > 0: - bedpe_df["END_A"] = bedpe_df.apply(lambda row: row["END_A"] + 1 if row["START_A"] == row["END_A"] else row["END_A"],axis=1) - bedpe_df["END_B"] = bedpe_df.apply(lambda row: row["END_B"] + 1 if row["START_B"] == row["END_B"] else row["END_B"],axis=1) + if offset: + if records_df.shape[0] > 0: + records_df["END_A"] = records_df.apply(lambda row: row["END_A"] + 1 if row["START_A"] == row["END_A"] else row["END_A"],axis=1) + records_df["END_B"] = records_df.apply(lambda row: row["END_B"] + 1 if row["START_B"] == row["END_B"] else row["END_B"],axis=1) except Exception as e: print(e) print("Unable to add +1 offset to END_A and END_B columns in pandas. There may be issues with pybedtools. Skipping...") - return [meta_header, bedpe_header_list,bedpe_df] + return [meta_header, header_list, records_df] From 21ad22a4768b4fd56c7237c94754b934dbb0e685 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Tue, 5 Jul 2022 14:30:53 -0400 Subject: [PATCH 15/26] bugfixes --- containers/iannotatesv/run_iannotatesv.py | 17 ++++++++++++----- containers/iannotatesv/utils.py | 2 +- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/containers/iannotatesv/run_iannotatesv.py b/containers/iannotatesv/run_iannotatesv.py index 2147340c..456c6c3d 100644 --- a/containers/iannotatesv/run_iannotatesv.py +++ b/containers/iannotatesv/run_iannotatesv.py @@ -1,4 +1,5 @@ import argparse, sys, os, subprocess +from collections import OrderedDict import numpy as np import pandas as pd from utils import * @@ -15,17 +16,20 @@ def main(): print("Running iAnnotateSV on {}".format(os.path.basename(args.bedpe))) - data = parse_svtools_bedpe_file(args.bedpe) + meta, header_list, data = parse_svtools_bedpe_file(args.bedpe) iAnnotate_input = data["#CHROM_A|START_A|STRAND_A|CHROM_B|START_B|STRAND_B|ID".split("|")] - iAnnotate_input.columns = "chr1|pos1|str1|chr2|pos2|str2|ID".split("|") - iAnnotate_input = iAnnotate_input.replace("-", 1).replace("+", 0) + key_col = OrderedDict(zip("chr1|pos1|str1|chr2|pos2|str2".split("|"), [str,int,int,str,int,int])) + iAnnotate_input.columns = key_col.keys() + ["ID"] + for k,v in key_col.items(): + iAnnotate_input[k] = iAnnotate_input[k].astype(v) n_chunks = int((999+iAnnotate_input.shape[0])/1000) iAnnotate_output = pd.DataFrame() for i in range(n_chunks): - iAnnotate_input_chunk = iAnnotate_input.iloc[(i-1)*1000:i*1000] + print("Processing rows {} through {}".format(1+(i*1000), (i+1)*1000)) + iAnnotate_input_chunk = iAnnotate_input.iloc[i*1000:(i+1)*1000] input_file = "input_" + str(i) + ".txt" output_dir = "output_" + str(i) iAnnotate_input_chunk.to_csv(input_file,sep="\t",header=True,index=False) @@ -34,10 +38,13 @@ def main(): p = subprocess.Popen(run_args) p.communicate() iAnnotate_output_chunk = pd.read_csv(os.path.join(output_dir,"outputfilePrefix_Annotated.txt"), sep="\t",header=0) + for k,v in key_col.items(): + iAnnotate_output_chunk[k] = iAnnotate_output_chunk[k].astype(v) iAnnotate_output_chunk = pd.merge(iAnnotate_input_chunk,iAnnotate_output_chunk, on="chr1|pos1|str1|chr2|pos2|str2".split("|"), how="inner") iAnnotate_output = pd.concat([iAnnotate_output,iAnnotate_output_chunk]) - annot_data = iAnnotate_output.drop("chr1|pos1|str1|chr2|pos2|str2".split("|"), axis=1) + print("Merging annotation with bedpe") + annot_data = iAnnotate_output.drop(key_col.keys(), axis=1) final_data = pd.merge(data, annot_data, on="ID",how="left") final_data = pd.merge(data, annot_data, on="ID",how="left") diff --git a/containers/iannotatesv/utils.py b/containers/iannotatesv/utils.py index c6d257d5..c8d2ae4d 100644 --- a/containers/iannotatesv/utils.py +++ b/containers/iannotatesv/utils.py @@ -70,7 +70,7 @@ def parse_svtools_bedpe_file(bedpe, offset = True): 3. if offset set to True, add +1 to END_* coordinates. This may be necessary for certain pybedtools operations. """ meta_header="" - with open(args.bedpe, 'r') as f: + with open(bedpe, 'r') as f: main_data = False while main_data == False: x = f.readline() From 37ab8ad1ffcbf9933c28423f7ea8ce0621cbce4f Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Tue, 5 Jul 2022 14:31:27 -0400 Subject: [PATCH 16/26] fix header of bedpe due to improper handling in svtools --- modules/process/GermSV/GermlineSVVcf2Bedpe.nf | 5 ++++- modules/process/SV/SomaticSVVcf2Bedpe.nf | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/modules/process/GermSV/GermlineSVVcf2Bedpe.nf b/modules/process/GermSV/GermlineSVVcf2Bedpe.nf index d2da0f0c..d45be40a 100644 --- a/modules/process/GermSV/GermlineSVVcf2Bedpe.nf +++ b/modules/process/GermSV/GermlineSVVcf2Bedpe.nf @@ -16,9 +16,12 @@ process GermlineSVVcf2Bedpe { svtools vcftobedpe \\ -i ${vcfFile} \\ - -o ${outputPrefix}.combined.unsorted.bedpe \\ + -o ${outputPrefix}.combined.tmp.bedpe \\ -t ${outputPrefix}_tmp + zgrep "^#" ${vcfFile} | sed "s/##fileformat=*/##fileformat=BEDPE/g" > ${outputPrefix}.combined.unsorted.bedpe + grep -v "^#" ${outputPrefix}.combined.tmp.bedpe >> ${outputPrefix}.combined.unsorted.bedpe + if [ ! -s ${outputPrefix}.combined.unsorted.bedpe ] ; then echo -e "#CHROM_A\\tSTART_A\\tEND_A\\tCHROM_B\\tSTART_B\\tEND_B\\tID\\tQUAL\\tSTRAND_A\\tSTRAND_B\\tTYPE\\tFILTER\\tNAME_A\\tREF_A\\tALT_A\\tNAME_B\\tREF_B\\tALT_B\\tINFO_A\\tINFO_B\\tFORMAT\\t${idNormal}" >> ${outputPrefix}.combined.unsorted.bedpe fi diff --git a/modules/process/SV/SomaticSVVcf2Bedpe.nf b/modules/process/SV/SomaticSVVcf2Bedpe.nf index 89881e2e..cb83cd47 100644 --- a/modules/process/SV/SomaticSVVcf2Bedpe.nf +++ b/modules/process/SV/SomaticSVVcf2Bedpe.nf @@ -16,9 +16,12 @@ process SomaticSVVcf2Bedpe { svtools vcftobedpe \\ -i ${vcfFile} \\ - -o ${outputPrefix}.combined.unsorted.bedpe \\ + -o ${outputPrefix}.combined.tmp.bedpe \\ -t ${outputPrefix}_tmp + zgrep "^#" ${vcfFile} | sed "s/##fileformat=*/##fileformat=BEDPE/g" > ${outputPrefix}.combined.unsorted.bedpe + grep -v "^#" ${outputPrefix}.combined.tmp.bedpe >> ${outputPrefix}.combined.unsorted.bedpe + if [ ! -s ${outputPrefix}.combined.unsorted.bedpe ] ; then echo -e "#CHROM_A\\tSTART_A\\tEND_A\\tCHROM_B\\tSTART_B\\tEND_B\\tID\\tQUAL\\tSTRAND_A\\tSTRAND_B\\tTYPE\\tFILTER\\tNAME_A\\tREF_A\\tALT_A\\tNAME_B\\tREF_B\\tALT_B\\tINFO_A\\tINFO_B\\tFORMAT\\t${idNormal}\\t${idTumor}" >> ${outputPrefix}.combined.unsorted.bedpe fi From 063e5f240aed492aaf17bfc99b3f23855bf6f0dc Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Tue, 5 Jul 2022 15:10:21 -0400 Subject: [PATCH 17/26] fix bedpe header --- modules/process/GermSV/GermlineSVVcf2Bedpe.nf | 4 ++-- modules/process/SV/SomaticSVVcf2Bedpe.nf | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/process/GermSV/GermlineSVVcf2Bedpe.nf b/modules/process/GermSV/GermlineSVVcf2Bedpe.nf index d45be40a..6f2a1737 100644 --- a/modules/process/GermSV/GermlineSVVcf2Bedpe.nf +++ b/modules/process/GermSV/GermlineSVVcf2Bedpe.nf @@ -19,8 +19,8 @@ process GermlineSVVcf2Bedpe { -o ${outputPrefix}.combined.tmp.bedpe \\ -t ${outputPrefix}_tmp - zgrep "^#" ${vcfFile} | sed "s/##fileformat=*/##fileformat=BEDPE/g" > ${outputPrefix}.combined.unsorted.bedpe - grep -v "^#" ${outputPrefix}.combined.tmp.bedpe >> ${outputPrefix}.combined.unsorted.bedpe + zgrep "^##" ${vcfFile} | sed "s/##fileformat=*/##fileformat=BEDPE/g" > ${outputPrefix}.combined.unsorted.bedpe + grep -v "^##" ${outputPrefix}.combined.tmp.bedpe >> ${outputPrefix}.combined.unsorted.bedpe if [ ! -s ${outputPrefix}.combined.unsorted.bedpe ] ; then echo -e "#CHROM_A\\tSTART_A\\tEND_A\\tCHROM_B\\tSTART_B\\tEND_B\\tID\\tQUAL\\tSTRAND_A\\tSTRAND_B\\tTYPE\\tFILTER\\tNAME_A\\tREF_A\\tALT_A\\tNAME_B\\tREF_B\\tALT_B\\tINFO_A\\tINFO_B\\tFORMAT\\t${idNormal}" >> ${outputPrefix}.combined.unsorted.bedpe diff --git a/modules/process/SV/SomaticSVVcf2Bedpe.nf b/modules/process/SV/SomaticSVVcf2Bedpe.nf index cb83cd47..0662ab22 100644 --- a/modules/process/SV/SomaticSVVcf2Bedpe.nf +++ b/modules/process/SV/SomaticSVVcf2Bedpe.nf @@ -19,8 +19,8 @@ process SomaticSVVcf2Bedpe { -o ${outputPrefix}.combined.tmp.bedpe \\ -t ${outputPrefix}_tmp - zgrep "^#" ${vcfFile} | sed "s/##fileformat=*/##fileformat=BEDPE/g" > ${outputPrefix}.combined.unsorted.bedpe - grep -v "^#" ${outputPrefix}.combined.tmp.bedpe >> ${outputPrefix}.combined.unsorted.bedpe + zgrep "^##" ${vcfFile} | sed "s/##fileformat=*/##fileformat=BEDPE/g" > ${outputPrefix}.combined.unsorted.bedpe + grep -v "^##" ${outputPrefix}.combined.tmp.bedpe >> ${outputPrefix}.combined.unsorted.bedpe if [ ! -s ${outputPrefix}.combined.unsorted.bedpe ] ; then echo -e "#CHROM_A\\tSTART_A\\tEND_A\\tCHROM_B\\tSTART_B\\tEND_B\\tID\\tQUAL\\tSTRAND_A\\tSTRAND_B\\tTYPE\\tFILTER\\tNAME_A\\tREF_A\\tALT_A\\tNAME_B\\tREF_B\\tALT_B\\tINFO_A\\tINFO_B\\tFORMAT\\t${idNormal}\\t${idTumor}" >> ${outputPrefix}.combined.unsorted.bedpe From 75e35e2ba3268d6b1b672bda0a8435dd21d30a8e Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Tue, 5 Jul 2022 17:31:57 -0400 Subject: [PATCH 18/26] parallelize iannotatesv --- containers/iannotatesv/run_iannotatesv.py | 53 +++++++++++++------ .../process/GermSV/GermlineAnnotateSVBedpe.nf | 4 +- modules/process/SV/SomaticAnnotateSVBedpe.nf | 4 +- 3 files changed, 42 insertions(+), 19 deletions(-) diff --git a/containers/iannotatesv/run_iannotatesv.py b/containers/iannotatesv/run_iannotatesv.py index 456c6c3d..84c3bc04 100644 --- a/containers/iannotatesv/run_iannotatesv.py +++ b/containers/iannotatesv/run_iannotatesv.py @@ -1,5 +1,7 @@ import argparse, sys, os, subprocess from collections import OrderedDict +from datetime import datetime +import multiprocessing as mp import numpy as np import pandas as pd from utils import * @@ -8,13 +10,25 @@ def usage(): parser = argparse.ArgumentParser() parser.add_argument('--bedpe', help = 'bedpe file', required = True) parser.add_argument('--genome', default = "hg19", help = 'hg19/hg38/hg18') + parser.add_argument('--threads', type=int, default = 4, help = 'number of threads for parallelization') + parser.add_argument('--chunk', type=int, dest = "chunk_size", default = 500, help = 'how many records to process with a single thread') return parser.parse_args() +def run_iannotate_cmd(input_file,output_dir,output_pre,genome="hg19"): + print("Spawning in parallel: annotation of " + input_file) + run_args = "python /usr/bin/iAnnotateSV/iAnnotateSV/iAnnotateSV.py -i {} -o {} -ofp {} -r {} -d 3000""".format(input_file, output_dir, output_pre, genome).split(" ") + p = subprocess.Popen(run_args) + p.communicate() + +def read_iannotatesv_result(path): + return pd.read_csv(path, sep="\t",header=0) + def main(): args = usage() - print("Running iAnnotateSV on {}".format(os.path.basename(args.bedpe))) + dt = datetime.now() + print("[{}] Running iAnnotateSV on {}".format(dt,os.path.basename(args.bedpe))) meta, header_list, data = parse_svtools_bedpe_file(args.bedpe) iAnnotate_input = data["#CHROM_A|START_A|STRAND_A|CHROM_B|START_B|STRAND_B|ID".split("|")] @@ -24,26 +38,29 @@ def main(): for k,v in key_col.items(): iAnnotate_input[k] = iAnnotate_input[k].astype(v) - n_chunks = int((999+iAnnotate_input.shape[0])/1000) + k = int(args.chunk_size) + n_chunks = int(((k-1)+iAnnotate_input.shape[0])/k) iAnnotate_output = pd.DataFrame() + if not os.path.isdir("inputs"): os.mkdir("inputs") + if not os.path.isdir("outputs"): os.mkdir("outputs") + pool = mp.Pool(args.threads) for i in range(n_chunks): - print("Processing rows {} through {}".format(1+(i*1000), (i+1)*1000)) - iAnnotate_input_chunk = iAnnotate_input.iloc[i*1000:(i+1)*1000] - input_file = "input_" + str(i) + ".txt" - output_dir = "output_" + str(i) + iAnnotate_input_chunk = iAnnotate_input.iloc[i*k:(i+1)*k] + input_file = os.path.join("inputs","input_" + str(i) + ".txt") iAnnotate_input_chunk.to_csv(input_file,sep="\t",header=True,index=False) - if not os.path.isdir(output_dir): os.mkdir(output_dir) - run_args = "python /usr/bin/iAnnotateSV/iAnnotateSV/iAnnotateSV.py -i {} -ofp outputfilePrefix -o {} -r {} -d 3000""".format(input_file, output_dir, args.genome).split(" ") - p = subprocess.Popen(run_args) - p.communicate() - iAnnotate_output_chunk = pd.read_csv(os.path.join(output_dir,"outputfilePrefix_Annotated.txt"), sep="\t",header=0) - for k,v in key_col.items(): - iAnnotate_output_chunk[k] = iAnnotate_output_chunk[k].astype(v) - iAnnotate_output_chunk = pd.merge(iAnnotate_input_chunk,iAnnotate_output_chunk, on="chr1|pos1|str1|chr2|pos2|str2".split("|"), how="inner") - iAnnotate_output = pd.concat([iAnnotate_output,iAnnotate_output_chunk]) - - print("Merging annotation with bedpe") + pool.apply_async(run_iannotate_cmd, args=(input_file, "outputs", "output_" + str(i), args.genome )) + + pool.close() + pool.join() + + iAnnotate_output = pd.concat(map(read_iannotatesv_result, [os.path.join("outputs","output_" + str(i) + "_Annotated.txt") for i in range(n_chunks)])) + for key,val in key_col.items(): + iAnnotate_output[key] = iAnnotate_output[key].astype(val) + + iAnnotate_output = pd.merge(iAnnotate_input,iAnnotate_output, on=key_col.keys(), how="inner") + + print("[{}] Merging annotation with bedpe".format(datetime.now())) annot_data = iAnnotate_output.drop(key_col.keys(), axis=1) final_data = pd.merge(data, annot_data, on="ID",how="left") @@ -55,5 +72,7 @@ def main(): fw.write(meta) final_data.to_csv(outputbed,sep="\t",header=True,index=False, mode='a') + print("[{}] iAnnotateSV complete".format(datetime.now())) + if __name__ == "__main__": main() diff --git a/modules/process/GermSV/GermlineAnnotateSVBedpe.nf b/modules/process/GermSV/GermlineAnnotateSVBedpe.nf index eb3898fb..e33a1322 100644 --- a/modules/process/GermSV/GermlineAnnotateSVBedpe.nf +++ b/modules/process/GermSV/GermlineAnnotateSVBedpe.nf @@ -74,7 +74,9 @@ process GermlineAnnotateSVBedpe { python ${custom_scripts}/run_iannotatesv.py \\ --bedpe ${outputPrefix}.combined.dac.rm.pcawg.cdna.bedpe \\ - --genome ${genome_} + --genome ${genome_} \\ + --threads ${task.cpus * 2} \\ + --chunk 500 cp ${outputPrefix}.combined.dac.rm.pcawg.cdna.iannotate.bedpe \\ ${outputPrefix}.combined.annot.filtered.bedpe diff --git a/modules/process/SV/SomaticAnnotateSVBedpe.nf b/modules/process/SV/SomaticAnnotateSVBedpe.nf index 15263d0a..f7b8b05b 100644 --- a/modules/process/SV/SomaticAnnotateSVBedpe.nf +++ b/modules/process/SV/SomaticAnnotateSVBedpe.nf @@ -74,7 +74,9 @@ process SomaticAnnotateSVBedpe { python ${custom_scripts}/run_iannotatesv.py \\ --bedpe ${outputPrefix}.combined.dac.rm.pcawg.cdna.bedpe \\ - --genome ${genome_} + --genome ${genome_} \\ + --threads ${task.cpus * 2} \\ + --chunk 500 cp ${outputPrefix}.combined.dac.rm.pcawg.cdna.iannotate.bedpe \\ ${outputPrefix}.combined.annot.filtered.bedpe From 5563d0d18eef4e69f5771fc4b97db09e10de9d9c Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Wed, 6 Jul 2022 17:32:59 -0400 Subject: [PATCH 19/26] add resource configuration for *AnnotateSVBedpe processes --- conf/resources_juno.config | 8 ++++++++ conf/resources_juno_genome.config | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/conf/resources_juno.config b/conf/resources_juno.config index 026c462e..e023c58f 100755 --- a/conf/resources_juno.config +++ b/conf/resources_juno.config @@ -113,6 +113,10 @@ cpus = { task.attempt < 4 ? 1 : 2 } memory = { task.attempt < 3 ? 1.GB * task.attempt : 5.Gb * task.attempt } } + withName:SomaticAnnotateSVBedpe { + cpus = { task.attempt < 4 ? 1 : 2 } + memory = { 8.GB * task.attempt } + } //------------- Germline pipeline @@ -156,6 +160,10 @@ cpus = { task.attempt < 4 ? 1 : 2 } memory = { task.attempt < 3 ? 1.GB * task.attempt : 5.Gb * task.attempt } } + withName:GermlineAnnotateSVBedpe { + cpus = { task.attempt < 4 ? 1 : 2 } + memory = { 8.GB * task.attempt } + } //------------- Quality Control diff --git a/conf/resources_juno_genome.config b/conf/resources_juno_genome.config index 3e56bc02..8cbe34b2 100644 --- a/conf/resources_juno_genome.config +++ b/conf/resources_juno_genome.config @@ -152,6 +152,10 @@ cpus = { task.attempt < 4 ? 1 : 2 } memory = { 4.GB * task.attempt } } + withName:SomaticAnnotateSVBedpe { + cpus = { task.attempt < 4 ? 1 : 2 } + memory = { 8.GB * task.attempt } + } withName:HRDetect { cpus = { 2 } memory = { 4.GB * task.attempt } @@ -203,6 +207,10 @@ cpus = { task.attempt < 4 ? 1 : 2 } memory = { task.attempt < 3 ? 1.GB * task.attempt : 5.Gb * task.attempt } } + withName:GermlineAnnotateSVBedpe { + cpus = { task.attempt < 4 ? 1 : 2 } + memory = { 8.GB * task.attempt } + } //------------- Quality Control From d4cdeb3e75e172597fa39ab36f40b52362f36a4d Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Mon, 11 Jul 2022 11:10:04 -0400 Subject: [PATCH 20/26] raise resources for *AnnotateSVBedpe --- conf/resources_juno_genome.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/resources_juno_genome.config b/conf/resources_juno_genome.config index 8cbe34b2..19f80ec2 100644 --- a/conf/resources_juno_genome.config +++ b/conf/resources_juno_genome.config @@ -153,7 +153,7 @@ memory = { 4.GB * task.attempt } } withName:SomaticAnnotateSVBedpe { - cpus = { task.attempt < 4 ? 1 : 2 } + cpus = { task.attempt < 3 ? 2 : 4 } memory = { 8.GB * task.attempt } } withName:HRDetect { @@ -208,7 +208,7 @@ memory = { task.attempt < 3 ? 1.GB * task.attempt : 5.Gb * task.attempt } } withName:GermlineAnnotateSVBedpe { - cpus = { task.attempt < 4 ? 1 : 2 } + cpus = { task.attempt < 3 ? 2 : 4 } memory = { 8.GB * task.attempt } } From 92764d0a8c42cf7d78367b07641bedcd1f64fa7a Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Mon, 11 Jul 2022 12:43:40 -0400 Subject: [PATCH 21/26] edited travis test references for filtering SV events --- .travis.yml | 2 +- conf/references.config | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 1fb4ca5b..a041f85e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ before_install: - chmod 777 nextflow # to change the test-data for travis, please download using the following command, extract, make changes, tarball again with gzip, and upload to google drive. # you will have to change the link below as well. Click to share the link, making it so anyone with the link can access, then extract the id in the link and put it here after "id=" - - wget -O test-data.tar.gz --no-check-certificate 'https://docs.google.com/uc?export=download&confirm=no_antivirus&id=1XuitIb02GZ5v928Do2Vh4hnQNkbSAT0V' + - wget -O test-data.tar.gz --no-check-certificate 'https://docs.google.com/uc?export=download&confirm=no_antivirus&id=13zUVw4BZ0_5QAW7zmdCsZ_CZDHbNxyMV' - tar -xzvf test-data.tar.gz script: diff --git a/conf/references.config b/conf/references.config index 44efbadc..a81f44ea 100644 --- a/conf/references.config +++ b/conf/references.config @@ -60,6 +60,10 @@ params { hlaDat = "${params.reference_base}/hla/hla.dat" neoantigenCDNA = "${params.reference_base}/neoantigen/Homo_sapiens.GRCh37.75.cdna.all.fa.gz" neoantigenCDS = "${params.reference_base}/neoantigen/Homo_sapiens.GRCh37.75.cds.all.fa.gz" + svBlacklistBed = "${params.genome_base}/sv_calling/pcawg6_blacklist.slop.bed.gz" + svBlacklistBedpe = "${params.genome_base}/sv_calling/pcawg6_blacklist.slop.trunc.bedpe.gz" + svBlacklistFoldbackBedpe = "${params.genome_base}/sv_calling/pcawg6_blacklist_foldback_artefacts.slop.trunc.bedpe.gz" + svBlacklistTEBedpe = "${params.genome_base}/sv_calling/pcawg6_blacklist_TE_pseudogene.bedpe.gz" } 'GRCh37' { acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_20130502_SNP_maf0.3.loci" From d0ebe80471a62a6798a06488d28e10e5c315668b Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Mon, 11 Jul 2022 13:12:10 -0400 Subject: [PATCH 22/26] bugfix --- modules/function/define_maps.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/function/define_maps.nf b/modules/function/define_maps.nf index b00762f9..d31363d7 100644 --- a/modules/function/define_maps.nf +++ b/modules/function/define_maps.nf @@ -62,11 +62,11 @@ def defineReferenceMap() { if (! workflow.profile.startsWith("test") ){ result_array << ['brassRefDir' : checkParamReturnFile('brassRefDir')] result_array << ['vagrentRefDir' : checkParamReturnFile('vagrentRefDir')] + } result_array << ['svBlacklistBed' : checkParamReturnFile('svBlacklistBed')] result_array << ['svBlacklistBedpe' : checkParamReturnFile('svBlacklistBedpe')] result_array << ['svBlacklistFoldbackBedpe' : checkParamReturnFile('svBlacklistFoldbackBedpe')] result_array << ['svBlacklistTEBedpe' : checkParamReturnFile('svBlacklistTEBedpe')] - } return result_array } From 20bc35baf4d85e082764474d3ca9e3bae6ac1050 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Mon, 11 Jul 2022 16:21:18 -0400 Subject: [PATCH 23/26] remove duplicated line --- containers/iannotatesv/run_iannotatesv.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/containers/iannotatesv/run_iannotatesv.py b/containers/iannotatesv/run_iannotatesv.py index 84c3bc04..c60291a3 100644 --- a/containers/iannotatesv/run_iannotatesv.py +++ b/containers/iannotatesv/run_iannotatesv.py @@ -64,8 +64,6 @@ def main(): annot_data = iAnnotate_output.drop(key_col.keys(), axis=1) final_data = pd.merge(data, annot_data, on="ID",how="left") - final_data = pd.merge(data, annot_data, on="ID",how="left") - outputbed = os.path.basename(args.bedpe)[:-6] if args.bedpe.endswith(".bedpe") else os.path.basename(args.bedpe) outputbed = os.path.join(os.path.dirname(args.bedpe),outputbed + ".iannotate.bedpe") with open(outputbed, "w") as fw: From 32978ee74d500034de73c28ee50264b293b446e5 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Fri, 22 Jul 2022 14:45:19 -0400 Subject: [PATCH 24/26] remove chunk parameter from run_iannotatesv.py because finetuning it does not meaningfully impact run time --- conf/references.config | 8 ++++---- containers/iannotatesv/run_iannotatesv.py | 3 +-- modules/process/GermSV/GermlineAnnotateSVBedpe.nf | 3 +-- modules/process/SV/SomaticAnnotateSVBedpe.nf | 3 +-- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/conf/references.config b/conf/references.config index a81f44ea..e03f92d1 100644 --- a/conf/references.config +++ b/conf/references.config @@ -105,10 +105,10 @@ params { spliceSites = "${params.reference_base}/mskcc-igenomes/grch37/splice_sites/splice_sites.bed" brassRefDir = "${params.reference_base}/mskcc-igenomes/grch37/brass" vagrentRefDir = "${params.reference_base}/mskcc-igenomes/grch37/vagrent" - svBlacklistBed = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bed" - svBlacklistBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bedpe" - svBlacklistFoldbackBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist_foldback_artefacts.bedpe" - svBlacklistTEBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist_TE_pseudogene.bedpe" + svBlacklistBed = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bed.gz" + svBlacklistBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bedpe.gz" + svBlacklistFoldbackBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist_foldback_artefacts.slop.bedpe.gz" + svBlacklistTEBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist_TE_pseudogene.bedpe.gz" } 'GRCh38' { acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci" diff --git a/containers/iannotatesv/run_iannotatesv.py b/containers/iannotatesv/run_iannotatesv.py index c60291a3..e1a0a8c3 100644 --- a/containers/iannotatesv/run_iannotatesv.py +++ b/containers/iannotatesv/run_iannotatesv.py @@ -11,7 +11,6 @@ def usage(): parser.add_argument('--bedpe', help = 'bedpe file', required = True) parser.add_argument('--genome', default = "hg19", help = 'hg19/hg38/hg18') parser.add_argument('--threads', type=int, default = 4, help = 'number of threads for parallelization') - parser.add_argument('--chunk', type=int, dest = "chunk_size", default = 500, help = 'how many records to process with a single thread') return parser.parse_args() @@ -38,7 +37,7 @@ def main(): for k,v in key_col.items(): iAnnotate_input[k] = iAnnotate_input[k].astype(v) - k = int(args.chunk_size) + k = int(500) n_chunks = int(((k-1)+iAnnotate_input.shape[0])/k) iAnnotate_output = pd.DataFrame() if not os.path.isdir("inputs"): os.mkdir("inputs") diff --git a/modules/process/GermSV/GermlineAnnotateSVBedpe.nf b/modules/process/GermSV/GermlineAnnotateSVBedpe.nf index e33a1322..c8a1c6f4 100644 --- a/modules/process/GermSV/GermlineAnnotateSVBedpe.nf +++ b/modules/process/GermSV/GermlineAnnotateSVBedpe.nf @@ -75,8 +75,7 @@ process GermlineAnnotateSVBedpe { python ${custom_scripts}/run_iannotatesv.py \\ --bedpe ${outputPrefix}.combined.dac.rm.pcawg.cdna.bedpe \\ --genome ${genome_} \\ - --threads ${task.cpus * 2} \\ - --chunk 500 + --threads ${task.cpus * 2} cp ${outputPrefix}.combined.dac.rm.pcawg.cdna.iannotate.bedpe \\ ${outputPrefix}.combined.annot.filtered.bedpe diff --git a/modules/process/SV/SomaticAnnotateSVBedpe.nf b/modules/process/SV/SomaticAnnotateSVBedpe.nf index f7b8b05b..25f08b9c 100644 --- a/modules/process/SV/SomaticAnnotateSVBedpe.nf +++ b/modules/process/SV/SomaticAnnotateSVBedpe.nf @@ -75,8 +75,7 @@ process SomaticAnnotateSVBedpe { python ${custom_scripts}/run_iannotatesv.py \\ --bedpe ${outputPrefix}.combined.dac.rm.pcawg.cdna.bedpe \\ --genome ${genome_} \\ - --threads ${task.cpus * 2} \\ - --chunk 500 + --threads ${task.cpus * 2} cp ${outputPrefix}.combined.dac.rm.pcawg.cdna.iannotate.bedpe \\ ${outputPrefix}.combined.annot.filtered.bedpe From c0aeca7225dbc1288ec6b3c211755025bdef6489 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Fri, 22 Jul 2022 15:27:09 -0400 Subject: [PATCH 25/26] add note about sources for pcawg blacklist files --- conf/references.config | 1 + containers/iannotatesv/pair_to_pair_annot.py | 55 -------------------- 2 files changed, 1 insertion(+), 55 deletions(-) delete mode 100644 containers/iannotatesv/pair_to_pair_annot.py diff --git a/conf/references.config b/conf/references.config index e03f92d1..abcfc63d 100644 --- a/conf/references.config +++ b/conf/references.config @@ -105,6 +105,7 @@ params { spliceSites = "${params.reference_base}/mskcc-igenomes/grch37/splice_sites/splice_sites.bed" brassRefDir = "${params.reference_base}/mskcc-igenomes/grch37/brass" vagrentRefDir = "${params.reference_base}/mskcc-igenomes/grch37/vagrent" + // svBlacklist* source: https://bitbucket.org/weischenfeldt/pcawg_sv_merge/src/docker/data/blacklist_files/ svBlacklistBed = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bed.gz" svBlacklistBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bedpe.gz" svBlacklistFoldbackBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist_foldback_artefacts.slop.bedpe.gz" diff --git a/containers/iannotatesv/pair_to_pair_annot.py b/containers/iannotatesv/pair_to_pair_annot.py deleted file mode 100644 index 48c05d48..00000000 --- a/containers/iannotatesv/pair_to_pair_annot.py +++ /dev/null @@ -1,55 +0,0 @@ -import argparse, sys, os -import numpy as np -import pandas as pd -import pybedtools -from utils import * - -def usage(): - parser = argparse.ArgumentParser() - parser.add_argument('--blacklist-regions', dest="regions", help = 'regions bed/bedpe file', required = True) - parser.add_argument('--bedpe', help = 'bedpe file', required = True) - parser.add_argument('--tag', help = 'string to update FILTER column', required = True) - parser.add_argument('--output',dest="outfile",help = 'output file' , required = True) - parser.add_argument('--match-type',dest="type",help = 'both/either' , required = True) - parser.add_argument('--ignore-strand',dest="ignore_strand", default=False, action="store_true", help = 'Use flag to ignore strand when running bedtools. Default False' ) - return parser.parse_args() - - -def main(): - args = usage() - - if args.type not in ["both","either"]: - sys.exit("--match-type must be both or either") - - match_type = args.type - # determine if regions file is bed or bedpe - filename=os.path.basename(args.regions) - is_bed = False - if filename.endswith("bed") or filename.endswith("bed.gz"): - is_bed = True - if filename.endswith("bedpe") or filename.endswith("bedpe.gz"): - pass - else: - sys.exit("--blacklist-regions must have .bed/.bed.gz/.bedpe/.bedpe.gz extension") - - [meta_header, bedpe_header_list, bedpe_df] = parse_svtools_bedpe_file(args.bedpe) - bedpe_bt = pybedtools.BedTool.from_dataframe(bedpe_df) - regions_bt = pybedtools.BedTool(args.regions) - - if is_bed: - intersect = run_pair_to_bed(bedpe_bt,regions_bt,match_type) - else: - intersect = run_pair_to_pair(bedpe_bt,regions_bt,match_type, ignore_strand=args.ignore_strand) - - intersect_df = bedtool_to_df(intersect,bedpe_header_list)[["ID"]] - intersect_df["FILTER_NEW"] = args.tag - filtered_bedpe_df = bedpe_df.merge(intersect_df, on="ID", how="left") - filtered_bedpe_df = filtered_bedpe_df.apply(lambda x: update_filter(x,x["FILTER_NEW"]) if not pd.isna(x["FILTER_NEW"]) else x, axis=1) - filtered_bed_df = filtered_bedpe_df.drop(['FILTER_NEW'], axis=1) - - with open(args.outfile, "w") as fw: - fw.write("".join(meta_header)) - filtered_bed_df.to_csv(args.outfile, header=True, index=False, sep="\t", mode="a") - -if __name__ == "__main__": - main() From 54209cb1144e15473941b62fafb46f510f8e7d52 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Mon, 25 Jul 2022 13:24:33 -0400 Subject: [PATCH 26/26] add description and authorship info for each python file --- containers/iannotatesv/detect_cdna.py | 9 +++++++++ containers/iannotatesv/filter_regions_bedpe.py | 9 +++++++++ containers/iannotatesv/run_iannotatesv.py | 8 ++++++++ containers/iannotatesv/utils.py | 7 +++++++ 4 files changed, 33 insertions(+) mode change 100644 => 100755 containers/iannotatesv/detect_cdna.py diff --git a/containers/iannotatesv/detect_cdna.py b/containers/iannotatesv/detect_cdna.py old mode 100644 new mode 100755 index fcf6efe4..73af4507 --- a/containers/iannotatesv/detect_cdna.py +++ b/containers/iannotatesv/detect_cdna.py @@ -1,3 +1,12 @@ +#!/usr/bin/env python + +"""Identify deletion rearrangements that may pertain to cDNA contaminants""" + +__author__ = "Anne Marie Noronha" +__email__ = "noronhaa@mskcc.org" +__version__ = "0.0.1" +__status__ = "Dev" + import argparse, sys, os import numpy as np import pandas as pd diff --git a/containers/iannotatesv/filter_regions_bedpe.py b/containers/iannotatesv/filter_regions_bedpe.py index 258e9814..48ba4f82 100644 --- a/containers/iannotatesv/filter_regions_bedpe.py +++ b/containers/iannotatesv/filter_regions_bedpe.py @@ -1,3 +1,12 @@ +#!/usr/bin/env python + +"""Filter rearrangements that overlap with a set of regions""" + +__author__ = "Anne Marie Noronha" +__email__ = "noronhaa@mskcc.org" +__version__ = "0.0.1" +__status__ = "Dev" + import argparse, sys, os import numpy as np import pandas as pd diff --git a/containers/iannotatesv/run_iannotatesv.py b/containers/iannotatesv/run_iannotatesv.py index 780a588c..181737bc 100644 --- a/containers/iannotatesv/run_iannotatesv.py +++ b/containers/iannotatesv/run_iannotatesv.py @@ -1,3 +1,11 @@ +#!/usr/bin/env python + +"""Annotate SV Bedpe file with iAnnotateSV""" +__author__ = "Anne Marie Noronha" +__email__ = "noronhaa@mskcc.org" +__version__ = "0.0.1" +__status__ = "Dev" + import argparse, sys, os, subprocess from collections import OrderedDict from datetime import datetime diff --git a/containers/iannotatesv/utils.py b/containers/iannotatesv/utils.py index c8d2ae4d..db39d38b 100644 --- a/containers/iannotatesv/utils.py +++ b/containers/iannotatesv/utils.py @@ -1,3 +1,10 @@ +#!/usr/bin/env python + +__author__ = "Anne Marie Noronha" +__email__ = "noronhaa@mskcc.org" +__version__ = "0.0.1" +__status__ = "Dev" + import argparse import numpy as np import pandas as pd