diff --git a/.travis.yml b/.travis.yml index 1fb4ca5b..a041f85e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ before_install: - chmod 777 nextflow # to change the test-data for travis, please download using the following command, extract, make changes, tarball again with gzip, and upload to google drive. # you will have to change the link below as well. Click to share the link, making it so anyone with the link can access, then extract the id in the link and put it here after "id=" - - wget -O test-data.tar.gz --no-check-certificate 'https://docs.google.com/uc?export=download&confirm=no_antivirus&id=1XuitIb02GZ5v928Do2Vh4hnQNkbSAT0V' + - wget -O test-data.tar.gz --no-check-certificate 'https://docs.google.com/uc?export=download&confirm=no_antivirus&id=13zUVw4BZ0_5QAW7zmdCsZ_CZDHbNxyMV' - tar -xzvf test-data.tar.gz script: diff --git a/conf/references.config b/conf/references.config index 699ce43c..abcfc63d 100644 --- a/conf/references.config +++ b/conf/references.config @@ -60,6 +60,10 @@ params { hlaDat = "${params.reference_base}/hla/hla.dat" neoantigenCDNA = "${params.reference_base}/neoantigen/Homo_sapiens.GRCh37.75.cdna.all.fa.gz" neoantigenCDS = "${params.reference_base}/neoantigen/Homo_sapiens.GRCh37.75.cds.all.fa.gz" + svBlacklistBed = "${params.genome_base}/sv_calling/pcawg6_blacklist.slop.bed.gz" + svBlacklistBedpe = "${params.genome_base}/sv_calling/pcawg6_blacklist.slop.trunc.bedpe.gz" + svBlacklistFoldbackBedpe = "${params.genome_base}/sv_calling/pcawg6_blacklist_foldback_artefacts.slop.trunc.bedpe.gz" + svBlacklistTEBedpe = "${params.genome_base}/sv_calling/pcawg6_blacklist_TE_pseudogene.bedpe.gz" } 'GRCh37' { acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_20130502_SNP_maf0.3.loci" @@ -101,6 +105,11 @@ params { spliceSites = "${params.reference_base}/mskcc-igenomes/grch37/splice_sites/splice_sites.bed" brassRefDir = "${params.reference_base}/mskcc-igenomes/grch37/brass" vagrentRefDir = "${params.reference_base}/mskcc-igenomes/grch37/vagrent" + // svBlacklist* source: https://bitbucket.org/weischenfeldt/pcawg_sv_merge/src/docker/data/blacklist_files/ + svBlacklistBed = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bed.gz" + svBlacklistBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bedpe.gz" + svBlacklistFoldbackBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist_foldback_artefacts.slop.bedpe.gz" + svBlacklistTEBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist_TE_pseudogene.bedpe.gz" } 'GRCh38' { acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci" diff --git a/conf/resources_juno.config b/conf/resources_juno.config index 026c462e..e023c58f 100755 --- a/conf/resources_juno.config +++ b/conf/resources_juno.config @@ -113,6 +113,10 @@ cpus = { task.attempt < 4 ? 1 : 2 } memory = { task.attempt < 3 ? 1.GB * task.attempt : 5.Gb * task.attempt } } + withName:SomaticAnnotateSVBedpe { + cpus = { task.attempt < 4 ? 1 : 2 } + memory = { 8.GB * task.attempt } + } //------------- Germline pipeline @@ -156,6 +160,10 @@ cpus = { task.attempt < 4 ? 1 : 2 } memory = { task.attempt < 3 ? 1.GB * task.attempt : 5.Gb * task.attempt } } + withName:GermlineAnnotateSVBedpe { + cpus = { task.attempt < 4 ? 1 : 2 } + memory = { 8.GB * task.attempt } + } //------------- Quality Control diff --git a/conf/resources_juno_genome.config b/conf/resources_juno_genome.config index 3e56bc02..19f80ec2 100644 --- a/conf/resources_juno_genome.config +++ b/conf/resources_juno_genome.config @@ -152,6 +152,10 @@ cpus = { task.attempt < 4 ? 1 : 2 } memory = { 4.GB * task.attempt } } + withName:SomaticAnnotateSVBedpe { + cpus = { task.attempt < 3 ? 2 : 4 } + memory = { 8.GB * task.attempt } + } withName:HRDetect { cpus = { 2 } memory = { 4.GB * task.attempt } @@ -203,6 +207,10 @@ cpus = { task.attempt < 4 ? 1 : 2 } memory = { task.attempt < 3 ? 1.GB * task.attempt : 5.Gb * task.attempt } } + withName:GermlineAnnotateSVBedpe { + cpus = { task.attempt < 3 ? 2 : 4 } + memory = { 8.GB * task.attempt } + } //------------- Quality Control diff --git a/containers/iannotatesv/detect_cdna.py b/containers/iannotatesv/detect_cdna.py old mode 100644 new mode 100755 index a0e9b5a8..73af4507 --- a/containers/iannotatesv/detect_cdna.py +++ b/containers/iannotatesv/detect_cdna.py @@ -1,3 +1,12 @@ +#!/usr/bin/env python + +"""Identify deletion rearrangements that may pertain to cDNA contaminants""" + +__author__ = "Anne Marie Noronha" +__email__ = "noronhaa@mskcc.org" +__version__ = "0.0.1" +__status__ = "Dev" + import argparse, sys, os import numpy as np import pandas as pd @@ -51,6 +60,8 @@ def prep_intersection(intersect=pd.DataFrame(),bedpe_header_list=[]): def main(): args = usage() + print("Detecting possible cdna contamination in {}".format(os.path.basename(args.bedpe))) + [meta_header, bedpe_header_list, bedpe_df] = parse_svtools_bedpe_file(args.bedpe) bedpe_bt = pybedtools.BedTool.from_dataframe(bedpe_df) diff --git a/containers/iannotatesv/filter_regions_bedpe.py b/containers/iannotatesv/filter_regions_bedpe.py new file mode 100644 index 00000000..48ba4f82 --- /dev/null +++ b/containers/iannotatesv/filter_regions_bedpe.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python + +"""Filter rearrangements that overlap with a set of regions""" + +__author__ = "Anne Marie Noronha" +__email__ = "noronhaa@mskcc.org" +__version__ = "0.0.1" +__status__ = "Dev" + +import argparse, sys, os +import numpy as np +import pandas as pd +import pybedtools +from utils import * + +def usage(): + parser = argparse.ArgumentParser() + parser.add_argument('--blacklist-regions', dest="regions", help = 'regions bed/bedpe file', required = True) + parser.add_argument('--bedpe', help = 'bedpe file', required = True) + parser.add_argument('--tag', help = 'string to update FILTER column', required = True) + parser.add_argument('--output',dest="outfile",help = 'output file' , required = True) + parser.add_argument('--match-type',dest="type",help = 'both/either/notboth/neither' , required = True) + parser.add_argument('--ignore-strand',dest="ignore_strand", default=False, action="store_true", help = 'Use flag to ignore strand when running bedtools. Default False' ) + return parser.parse_args() + +def determine_regions_filetype(filepath): + """ + Determine if the regions file is bed or bedpe, and raise error if neither + """ + if not any([filepath.endswith(i) for i in "bed|bedpe|bed.gz|bedpe.gz".split("|")]): + raise ValueError("--bedpe requires bed, bedpe, bed.gz or bedpe.gz file.") + elif any([filepath.endswith(i) for i in "bed|bed.gz".split("|")]): + return True + else: return False + +def validate_bedpe_input(filepath): + """ + Raise error if filetype is not bedpe + """ + if not any([filepath.endswith(i) for i in "bedpe|bedpe.gz".split("|")]): + raise ValueError("--bedpe requires bedpe or bedpe.gz file.") + +def validate_match_type(type): + """ + Raise error if type is not acceptable for use in pybedtools pair_to_pair or pair_to_bed + """ + if type not in overlap_type.keys(): + raise ValueError( "--match-type must be {}.".format(" or ".join(overlap_type.keys())) ) + +def main(): + """ + 1. validate inputs + 2. read input beds/bedpes as pybedtools.BedTool objects + 3. extract variant IDs from overlapping regions + 4. update the FILTER tag for identified variants + """ + args = usage() + + print("Filtering {} with {} regions".format(os.path.basename(args.bedpe), args.tag)) + + # parse inputs + try: + validate_bedpe_input(args.bedpe) + [meta_header, bedpe_header_list, bedpe_df] = parse_svtools_bedpe_file(args.bedpe) + except Exception as e: + print(e) + sys.exit("Unable to parse --bedpe") + + try: + is_bed = determine_regions_filetype(args.regions) + regions_bt = pybedtools.BedTool(args.regions) + except Exception as e: + print(e) + sys.exit("Unable to parse --blacklist-regions") + + # annotate bedpe + try: + bedpe_bt = pybedtools.BedTool.from_dataframe(bedpe_df) + ids_df = find_overlapped_ids(bedpe_bt, regions_bt, args.type, args.ignore_strand, is_bed) + bedpe_df = add_filter_by_id(bedpe_df,ids_df,args.tag) + except Exception as e: + print(e) + sys.exit("Filtering failed with inputs {} and {}".format(args.bedpe, args.regions)) + + # write result + with open(args.outfile, "w") as fw: + fw.write("".join(meta_header)) + bedpe_df.to_csv(args.outfile, header=True, index=False, sep="\t", mode="a") + +def find_overlapped_ids(bedpe_bt,regions_bt,match_type,ignore_strand,is_bed): + # determine validity of match_type + validate_match_type(match_type) + + # run pair_to_bed if regions file is bed, pair_to_pair if regions file is bedpe + if is_bed: + intersect = run_pair_to_bed(bedpe_bt,regions_bt,match_type) + else: + intersect = run_pair_to_pair(bedpe_bt,regions_bt,match_type, ignore_strand=ignore_strand) + + # extract ids + try: + ids_df = pd.DataFrame({"ID":list( intersect.to_dataframe(header=None)[6].drop_duplicates() )}) + except: + ids_df = pd.DataFrame(columns=["ID"]) + return ids_df + +def add_filter_by_id(bedpe_df,ids_df,tag): + # annotate bedpe based on matching IDs + ids_df.columns = ["ID"] + ids_df["FILTER_NEW"] = tag + filtered_bedpe_df = bedpe_df.merge(ids_df, on="ID", how="left") + filtered_bedpe_df = filtered_bedpe_df.apply(lambda x: update_filter(x,x["FILTER_NEW"]) if not pd.isna(x["FILTER_NEW"]) else x, axis=1) + filtered_bedpe_df = filtered_bedpe_df.drop(['FILTER_NEW'], axis=1) + + return filtered_bedpe_df + +if __name__ == "__main__": + main() diff --git a/containers/iannotatesv/pair_to_bed_annot.py b/containers/iannotatesv/pair_to_bed_annot.py deleted file mode 100644 index 387150d1..00000000 --- a/containers/iannotatesv/pair_to_bed_annot.py +++ /dev/null @@ -1,46 +0,0 @@ -import argparse, sys -import numpy as np -import pandas as pd -import pybedtools -from utils import * - -def usage(): - parser = argparse.ArgumentParser() - parser.add_argument('--blacklist-regions', dest="regions", help = 'regions bed/bedpe file', required = True) - parser.add_argument('--bedpe', help = 'bedpe file', required = True) - parser.add_argument('--tag', help = 'string to update FILTER column', required = True) - parser.add_argument('--output',dest="outfile",help = 'output file' , required = True) - parser.add_argument('--match-type',dest="type",help = 'both/either' , required = True) - - return parser.parse_args() - -def main(): - args = usage() - - if args.type not in ["both","either"]: - sys.exit("--match-type must be both or either") - - [meta_header, bedpe_header_list, bedpe_df] = parse_svtools_bedpe_file(args.bedpe) - bedpe_bt = pybedtools.BedTool.from_dataframe(bedpe_df) - regions_bt = pybedtools.BedTool(args.regions) - intersect,outersect = filter_bedpe(bedpe_bt,regions_bt,args.type) - intersect_df = bedtool_to_df(intersect,bedpe_header_list) - intersect_df["FILTER"] = intersect_df["FILTER"].apply(lambda x: add_tag(x,args.tag)) - outersect_df = bedtool_to_df(outersect,bedpe_header_list) - filtered_bed_df = pd.concat([intersect_df,outersect_df]) - - with open(args.outfile, "w") as fw: - fw.write("".join(meta_header)) - filtered_bed_df.to_csv(args.outfile, header=True, index=False, sep="\t", mode="a") - -def filter_bedpe(bedpe_bt,regions_bt,match_type="either"): - outersect_match_type = "notboth" if match_type == "both" else "neither" - intersect = run_pair_to_bed(bedpe_bt,regions_bt,match_type) - outersect = run_pair_to_bed(bedpe_bt,regions_bt, outersect_match_type) - #intersect["FILTER"] = intersect["FILTER"].apply(lambda x: add_tag(x,tag_str)) - - return [intersect,outersect] - #return pd.concat([intersect,outersect]) - -if __name__ == "__main__": - main() diff --git a/containers/iannotatesv/run_iannotatesv.py b/containers/iannotatesv/run_iannotatesv.py index f44f4c3e..181737bc 100644 --- a/containers/iannotatesv/run_iannotatesv.py +++ b/containers/iannotatesv/run_iannotatesv.py @@ -1,52 +1,74 @@ +#!/usr/bin/env python + +"""Annotate SV Bedpe file with iAnnotateSV""" +__author__ = "Anne Marie Noronha" +__email__ = "noronhaa@mskcc.org" +__version__ = "0.0.1" +__status__ = "Dev" + import argparse, sys, os, subprocess +from collections import OrderedDict +from datetime import datetime +import multiprocessing as mp import numpy as np import pandas as pd +from utils import * def usage(): parser = argparse.ArgumentParser() parser.add_argument('--bedpe', help = 'bedpe file', required = True) parser.add_argument('--genome', default = "hg19", help = 'hg19/hg38/hg18') + parser.add_argument('--threads', type=int, default = 4, help = 'number of threads for parallelization') return parser.parse_args() +def run_iannotate_cmd(input_file,output_dir,output_pre,genome="hg19"): + print("Spawning in parallel: annotation of " + input_file) + run_args = "python /usr/bin/iAnnotateSV/iAnnotateSV/iAnnotateSV.py -i {} -o {} -ofp {} -r {} -d 3000""".format(input_file, output_dir, output_pre, genome).split(" ") + p = subprocess.Popen(run_args) + p.communicate() + +def read_iannotatesv_result(path): + return pd.read_csv(path, sep="\t",header=0) + def main(): args = usage() - meta="" - with open(args.bedpe, 'r') as f: - main_data = False - while main_data == False: - x = f.readline() - if x.startswith("##"): - meta += x - else: - header=x - main_data = True - try: - data = pd.read_csv(f, header=None, sep="\t" ) - data.columns = header.strip().split("\t") - except: - data = pd.DataFrame(columns = header.strip().split("\t")) + dt = datetime.now() + print("[{}] Running iAnnotateSV on {}".format(dt,os.path.basename(args.bedpe))) + meta, header_list, data = parse_svtools_bedpe_file(args.bedpe) iAnnotate_input = data["#CHROM_A|START_A|STRAND_A|CHROM_B|START_B|STRAND_B|ID".split("|")] - iAnnotate_input.columns = "chr1|pos1|str1|chr2|pos2|str2|ID".split("|") - iAnnotate_input = iAnnotate_input.replace("-", 1).replace("+", 0) + key_col = OrderedDict(zip("chr1|pos1|str1|chr2|pos2|str2".split("|"), [str,int,int,str,int,int])) + iAnnotate_input.columns = key_col.keys() + ["ID"] + for k,v in key_col.items(): + iAnnotate_input[k] = iAnnotate_input[k].astype(v) - iAnnotate_input.to_csv("input.txt",sep="\t",header=True,index=False) + k = int(500) + n_chunks = int(((k-1)+iAnnotate_input.shape[0])/k) + iAnnotate_output = pd.DataFrame() + if not os.path.isdir("inputs"): os.mkdir("inputs") + if not os.path.isdir("outputs"): os.mkdir("outputs") - if not os.path.isdir("outputdir"): - os.mkdir("outputdir") - run_args = "python /usr/bin/iAnnotateSV/iAnnotateSV/iAnnotateSV.py -i input.txt -ofp outputfilePrefix -o outputdir -r {} -d 3000""".format(args.genome).split(" ") - p = subprocess.Popen(run_args) - p.communicate() + pool = mp.Pool(args.threads) + for i in range(n_chunks): + iAnnotate_input_chunk = iAnnotate_input.iloc[i*k:(i+1)*k] + input_file = os.path.join("inputs","input_" + str(i) + ".txt") + iAnnotate_input_chunk.to_csv(input_file,sep="\t",header=True,index=False) + pool.apply_async(run_iannotate_cmd, args=(input_file, "outputs", "output_" + str(i), args.genome )) - iAnnotate_output = pd.read_csv("outputdir/outputfilePrefix_Annotated.txt", sep="\t",header=0) - iAnnotate_output = pd.merge(iAnnotate_input,iAnnotate_output, on="chr1|pos1|str1|chr2|pos2|str2".split("|"), how="inner") + pool.close() + pool.join() - annot_data = iAnnotate_output.drop("chr1|pos1|str1|chr2|pos2|str2".split("|"), axis=1) - annot_data = annot_data.replace(np.nan, ".") + iAnnotate_output = pd.concat(map(read_iannotatesv_result, [os.path.join("outputs","output_" + str(i) + "_Annotated.txt") for i in range(n_chunks)])) + for key,val in key_col.items(): + iAnnotate_output[key] = iAnnotate_output[key].astype(val) + iAnnotate_output = pd.merge(iAnnotate_input,iAnnotate_output, on=key_col.keys(), how="inner") + + print("[{}] Merging annotation with bedpe".format(datetime.now())) + annot_data = iAnnotate_output.drop(key_col.keys(), axis=1).replace(np.nan, ".") final_data = pd.merge(data, annot_data, on="ID",how="left") outputbed = os.path.basename(args.bedpe)[:-6] if args.bedpe.endswith(".bedpe") else os.path.basename(args.bedpe) @@ -55,5 +77,7 @@ def main(): fw.write(meta) final_data.to_csv(outputbed,sep="\t",header=True,index=False, mode='a') + print("[{}] iAnnotateSV complete".format(datetime.now())) + if __name__ == "__main__": main() diff --git a/containers/iannotatesv/utils.py b/containers/iannotatesv/utils.py index 2e9f84fb..db39d38b 100644 --- a/containers/iannotatesv/utils.py +++ b/containers/iannotatesv/utils.py @@ -1,8 +1,22 @@ +#!/usr/bin/env python + +__author__ = "Anne Marie Noronha" +__email__ = "noronhaa@mskcc.org" +__version__ = "0.0.1" +__status__ = "Dev" + import argparse import numpy as np import pandas as pd import pybedtools + +overlap_type = {"both":"notboth", + "notboth":"both", + "either":"neither", + "neither":"either" + } + def add_tag(val,tag ): """ update FILTER value @@ -27,6 +41,14 @@ def run_pair_to_bed(bedpe,bed,match_type): result = bedpe.pair_to_bed(bed, **{'type': match_type}) return result +def run_pair_to_pair(bedpe,filter_bedpe,match_type,ignore_strand=False): + """ + use pybedtools to run bedtools pairtopair + a length of 1 base must be artificially added to each end with a length of 0, otherwise no overlap + """ + result = bedpe.pair_to_pair(filter_bedpe, **{'type': match_type,'is':ignore_strand}) + return result + def bedtool_to_df(bt,header_list): try: @@ -47,27 +69,45 @@ def bedtool_to_df(bt,header_list): print("Unable to apply formatting to bedpe columns using pandas, when converting pybedtool.Bedtool to pandas.DataFrame. Skipping...") return df.drop_duplicates() -def parse_svtools_bedpe_file(bedpe): +def parse_svtools_bedpe_file(bedpe, offset = True): + """ + Read bedpe file + 1. Separate components meta-data, header line, and records (main data) + 2. Coerce chromosome values to string, and position coordinates to integer + 3. if offset set to True, add +1 to END_* coordinates. This may be necessary for certain pybedtools operations. + """ + meta_header="" with open(bedpe, 'r') as f: - x = f.readlines() - meta_header = [x[y] for y in range(len(x)) if x[y].startswith("##")] - header_idx = [y for y in range(len(x)) if x[y].startswith("#CHROM")][0] - bedpe_header_list = x[header_idx].rstrip().split("\t") - bedpe_df = pd.read_csv(bedpe, skiprows=header_idx,header=0, sep="\t") + main_data = False + while main_data == False: + x = f.readline() + if x.startswith("##"): + meta_header += x + else: + header = x + header_list = header.strip().split("\t") + main_data = True + try: + records_df = pd.read_csv(f, header=None, sep="\t" ) + records_df.columns = header_list + except: + records_df = pd.DataFrame(columns = header_list) + try: - bedpe_df = bedpe_df.astype({i:int for i in "START_A|END_A|START_B|END_B".split("|")}) + records_df = records_df.astype({i:int for i in "START_A|END_A|START_B|END_B".split("|")}) for j in ["#CHROM_A","CHROM_B"]: - if bedpe_df[j].dtype == 'float64': - bedpe_df = bedpe_df.astype({j:int}).astype({j:str}) + if records_df[j].dtype == 'float64': + records_df = records_df.astype({j:int}).astype({j:str}) except Exception as e: print(e) print("Unable to apply formatting to bedpe columns in pandas. Skipping...") try: - if bedpe_df.shape[0] > 0: - bedpe_df["END_A"] = bedpe_df.apply(lambda row: row["END_A"] + 1 if row["START_A"] == row["END_A"] else row["END_A"],axis=1) - bedpe_df["END_B"] = bedpe_df.apply(lambda row: row["END_B"] + 1 if row["START_B"] == row["END_B"] else row["END_B"],axis=1) + if offset: + if records_df.shape[0] > 0: + records_df["END_A"] = records_df.apply(lambda row: row["END_A"] + 1 if row["START_A"] == row["END_A"] else row["END_A"],axis=1) + records_df["END_B"] = records_df.apply(lambda row: row["END_B"] + 1 if row["START_B"] == row["END_B"] else row["END_B"],axis=1) except Exception as e: print(e) print("Unable to add +1 offset to END_A and END_B columns in pandas. There may be issues with pybedtools. Skipping...") - return [meta_header, bedpe_header_list,bedpe_df] + return [meta_header, header_list, records_df] diff --git a/modules/function/define_maps.nf b/modules/function/define_maps.nf index 2248ba2d..d31363d7 100644 --- a/modules/function/define_maps.nf +++ b/modules/function/define_maps.nf @@ -63,6 +63,10 @@ def defineReferenceMap() { result_array << ['brassRefDir' : checkParamReturnFile('brassRefDir')] result_array << ['vagrentRefDir' : checkParamReturnFile('vagrentRefDir')] } + result_array << ['svBlacklistBed' : checkParamReturnFile('svBlacklistBed')] + result_array << ['svBlacklistBedpe' : checkParamReturnFile('svBlacklistBedpe')] + result_array << ['svBlacklistFoldbackBedpe' : checkParamReturnFile('svBlacklistFoldbackBedpe')] + result_array << ['svBlacklistTEBedpe' : checkParamReturnFile('svBlacklistTEBedpe')] return result_array } diff --git a/modules/process/GermSV/GermlineAnnotateSVBedpe.nf b/modules/process/GermSV/GermlineAnnotateSVBedpe.nf index 11a320cc..c8a1c6f4 100644 --- a/modules/process/GermSV/GermlineAnnotateSVBedpe.nf +++ b/modules/process/GermSV/GermlineAnnotateSVBedpe.nf @@ -1,12 +1,16 @@ process GermlineAnnotateSVBedpe { tag "${idNormal}" - publishDir "${params.outDir}/germline/${outputPrefix}/combined_svs", mode: params.publishDirMode, pattern: "*.combined.filtered.bedpe" + publishDir "${params.outDir}/germline/${outputPrefix}/combined_svs", mode: params.publishDirMode, pattern: "*.combined.annot.filtered*.bedpe" input: tuple val(idNormal), val(target), path(bedpein) path(repeatMasker) path(mapabilityBlacklist) + path(svBlacklistBed) + path(svBlacklistBedpe) + path(svBlacklistFoldbackBedpe) + path(svBlacklistTEBedpe) path(spliceSites) path(custom_scripts) val(genome) @@ -19,31 +23,61 @@ process GermlineAnnotateSVBedpe { outputPrefix = "${idNormal}" genome_ = ["GRCh37","smallGRCh37"].contains(genome) ? "hg19" : genome == "GRCh38" ? "hg38" : "hg18" """ - python ${custom_scripts}/pair_to_bed_annot.py \\ + python ${custom_scripts}/filter_regions_bedpe.py \\ --blacklist-regions ${mapabilityBlacklist} \\ --bedpe ${bedpein} \\ --tag mappability \\ --output ${outputPrefix}.combined.dac.bedpe \\ --match-type either - python ${custom_scripts}/pair_to_bed_annot.py \\ + python ${custom_scripts}/filter_regions_bedpe.py \\ --blacklist-regions ${repeatMasker} \\ --bedpe ${outputPrefix}.combined.dac.bedpe \\ --tag repeat_masker \\ --output ${outputPrefix}.combined.dac.rm.bedpe \\ --match-type either + + python ${custom_scripts}/filter_regions_bedpe.py \\ + --blacklist-regions ${svBlacklistBed} \\ + --bedpe ${outputPrefix}.combined.dac.rm.bedpe \\ + --tag pcawg_blacklist_bed \\ + --output ${outputPrefix}.combined.dac.rm.pcawg.1.bedpe \\ + --match-type either + + python ${custom_scripts}/filter_regions_bedpe.py \\ + --blacklist-regions ${svBlacklistBedpe} \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.1.bedpe \\ + --tag pcawg_blacklist_bedpe \\ + --output ${outputPrefix}.combined.dac.rm.pcawg.2.bedpe \\ + --match-type both \\ + --ignore-strand + + python ${custom_scripts}/filter_regions_bedpe.py \\ + --blacklist-regions ${svBlacklistFoldbackBedpe} \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.2.bedpe \\ + --tag pcawg_blacklist_fb_bedpe \\ + --output ${outputPrefix}.combined.dac.rm.pcawg.3.bedpe \\ + --match-type both + python ${custom_scripts}/filter_regions_bedpe.py \\ + --blacklist-regions ${svBlacklistTEBedpe} \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.3.bedpe \\ + --tag pcawg_blacklist_te_bedpe \\ + --output ${outputPrefix}.combined.dac.rm.pcawg.4.bedpe \\ + --match-type either + python ${custom_scripts}/detect_cdna.py \\ --exon-junct ${spliceSites} \\ - --bedpe ${outputPrefix}.combined.dac.rm.bedpe \\ - --out-bedpe ${outputPrefix}.combined.dac.rm.cdna.bedpe \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.4.bedpe \\ + --out-bedpe ${outputPrefix}.combined.dac.rm.pcawg.cdna.bedpe \\ --out ${outputPrefix}.contamination.tsv python ${custom_scripts}/run_iannotatesv.py \\ - --bedpe ${outputPrefix}.combined.dac.rm.cdna.bedpe \\ - --genome ${genome_} + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.cdna.bedpe \\ + --genome ${genome_} \\ + --threads ${task.cpus * 2} - cp ${outputPrefix}.combined.dac.rm.cdna.iannotate.bedpe \\ + cp ${outputPrefix}.combined.dac.rm.pcawg.cdna.iannotate.bedpe \\ ${outputPrefix}.combined.annot.filtered.bedpe awk -F"\\t" '\$1 ~ /#/ || \$12 == "PASS"' \\ diff --git a/modules/process/GermSV/GermlineSVVcf2Bedpe.nf b/modules/process/GermSV/GermlineSVVcf2Bedpe.nf index a8523e33..7679d8c1 100644 --- a/modules/process/GermSV/GermlineSVVcf2Bedpe.nf +++ b/modules/process/GermSV/GermlineSVVcf2Bedpe.nf @@ -16,9 +16,12 @@ process GermlineSVVcf2Bedpe { svtools vcftobedpe \\ -i ${vcfFile} \\ - -o ${outputPrefix}.combined.unsorted.bedpe \\ + -o ${outputPrefix}.combined.tmp.bedpe \\ -t ${outputPrefix}_tmp + zgrep "^##" ${vcfFile} | sed "s/##fileformat=*/##fileformat=BEDPE/g" > ${outputPrefix}.combined.unsorted.bedpe + grep -v "^##" ${outputPrefix}.combined.tmp.bedpe >> ${outputPrefix}.combined.unsorted.bedpe + if [ ! -s ${outputPrefix}.combined.unsorted.bedpe ] ; then echo -e "#CHROM_A\\tSTART_A\\tEND_A\\tCHROM_B\\tSTART_B\\tEND_B\\tID\\tQUAL\\tSTRAND_A\\tSTRAND_B\\tTYPE\\tFILTER\\tNAME_A\\tREF_A\\tALT_A\\tNAME_B\\tREF_B\\tALT_B\\tINFO_A\\tINFO_B\\tFORMAT\\t${idNormal}" >> ${outputPrefix}.combined.unsorted.bedpe fi diff --git a/modules/process/SV/SomaticAnnotateSVBedpe.nf b/modules/process/SV/SomaticAnnotateSVBedpe.nf index b8e0627e..25f08b9c 100644 --- a/modules/process/SV/SomaticAnnotateSVBedpe.nf +++ b/modules/process/SV/SomaticAnnotateSVBedpe.nf @@ -7,6 +7,10 @@ process SomaticAnnotateSVBedpe { tuple val(idTumor), val(idNormal), val(target), path(bedpein) path(repeatMasker) path(mapabilityBlacklist) + path(svBlacklistBed) + path(svBlacklistBedpe) + path(svBlacklistFoldbackBedpe) + path(svBlacklistTEBedpe) path(spliceSites) path(custom_scripts) val(genome) @@ -19,31 +23,61 @@ process SomaticAnnotateSVBedpe { outputPrefix = "${idTumor}__${idNormal}" genome_ = ["GRCh37","smallGRCh37"].contains(genome) ? "hg19" : genome == "GRCh38" ? "hg38" : "hg18" """ - python ${custom_scripts}/pair_to_bed_annot.py \\ + python ${custom_scripts}/filter_regions_bedpe.py \\ --blacklist-regions ${mapabilityBlacklist} \\ --bedpe ${bedpein} \\ --tag mappability \\ --output ${outputPrefix}.combined.dac.bedpe \\ --match-type either - python ${custom_scripts}/pair_to_bed_annot.py \\ + python ${custom_scripts}/filter_regions_bedpe.py \\ --blacklist-regions ${repeatMasker} \\ --bedpe ${outputPrefix}.combined.dac.bedpe \\ --tag repeat_masker \\ --output ${outputPrefix}.combined.dac.rm.bedpe \\ - --match-type either + --match-type either + + python ${custom_scripts}/filter_regions_bedpe.py \\ + --blacklist-regions ${svBlacklistBed} \\ + --bedpe ${outputPrefix}.combined.dac.rm.bedpe \\ + --tag pcawg_blacklist_bed \\ + --output ${outputPrefix}.combined.dac.rm.pcawg.1.bedpe \\ + --match-type either + + python ${custom_scripts}/filter_regions_bedpe.py \\ + --blacklist-regions ${svBlacklistBedpe} \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.1.bedpe \\ + --tag pcawg_blacklist_bedpe \\ + --output ${outputPrefix}.combined.dac.rm.pcawg.2.bedpe \\ + --match-type both \\ + --ignore-strand + + python ${custom_scripts}/filter_regions_bedpe.py \\ + --blacklist-regions ${svBlacklistFoldbackBedpe} \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.2.bedpe \\ + --tag pcawg_blacklist_fb_bedpe \\ + --output ${outputPrefix}.combined.dac.rm.pcawg.3.bedpe \\ + --match-type both + python ${custom_scripts}/filter_regions_bedpe.py \\ + --blacklist-regions ${svBlacklistTEBedpe} \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.3.bedpe \\ + --tag pcawg_blacklist_te_bedpe \\ + --output ${outputPrefix}.combined.dac.rm.pcawg.4.bedpe \\ + --match-type either + python ${custom_scripts}/detect_cdna.py \\ --exon-junct ${spliceSites} \\ - --bedpe ${outputPrefix}.combined.dac.rm.bedpe \\ - --out-bedpe ${outputPrefix}.combined.dac.rm.cdna.bedpe \\ + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.4.bedpe \\ + --out-bedpe ${outputPrefix}.combined.dac.rm.pcawg.cdna.bedpe \\ --out ${outputPrefix}.contamination.tsv python ${custom_scripts}/run_iannotatesv.py \\ - --bedpe ${outputPrefix}.combined.dac.rm.cdna.bedpe \\ - --genome ${genome_} + --bedpe ${outputPrefix}.combined.dac.rm.pcawg.cdna.bedpe \\ + --genome ${genome_} \\ + --threads ${task.cpus * 2} - cp ${outputPrefix}.combined.dac.rm.cdna.iannotate.bedpe \\ + cp ${outputPrefix}.combined.dac.rm.pcawg.cdna.iannotate.bedpe \\ ${outputPrefix}.combined.annot.filtered.bedpe awk -F"\\t" '\$1 ~ /#/ || \$12 == "PASS"' \\ diff --git a/modules/process/SV/SomaticSVVcf2Bedpe.nf b/modules/process/SV/SomaticSVVcf2Bedpe.nf index e583eb99..2a2f364f 100644 --- a/modules/process/SV/SomaticSVVcf2Bedpe.nf +++ b/modules/process/SV/SomaticSVVcf2Bedpe.nf @@ -16,9 +16,12 @@ process SomaticSVVcf2Bedpe { svtools vcftobedpe \\ -i ${vcfFile} \\ - -o ${outputPrefix}.combined.unsorted.bedpe \\ + -o ${outputPrefix}.combined.tmp.bedpe \\ -t ${outputPrefix}_tmp + zgrep "^##" ${vcfFile} | sed "s/##fileformat=*/##fileformat=BEDPE/g" > ${outputPrefix}.combined.unsorted.bedpe + grep -v "^##" ${outputPrefix}.combined.tmp.bedpe >> ${outputPrefix}.combined.unsorted.bedpe + if [ ! -s ${outputPrefix}.combined.unsorted.bedpe ] ; then echo -e "#CHROM_A\\tSTART_A\\tEND_A\\tCHROM_B\\tSTART_B\\tEND_B\\tID\\tQUAL\\tSTRAND_A\\tSTRAND_B\\tTYPE\\tFILTER\\tNAME_A\\tREF_A\\tALT_A\\tNAME_B\\tREF_B\\tALT_B\\tINFO_A\\tINFO_B\\tFORMAT\\t${idNormal}\\t${idTumor}" >> ${outputPrefix}.combined.unsorted.bedpe fi diff --git a/modules/subworkflow/germlineSV_wf.nf b/modules/subworkflow/germlineSV_wf.nf index f8d45811..5b0aa5d7 100644 --- a/modules/subworkflow/germlineSV_wf.nf +++ b/modules/subworkflow/germlineSV_wf.nf @@ -55,6 +55,10 @@ workflow germlineSV_wf GermlineSVVcf2Bedpe.out.GermlineCombinedUnfilteredBedpe, referenceMap.repeatMasker, referenceMap.mapabilityBlacklist, + referenceMap.svBlacklistBed, + referenceMap.svBlacklistBedpe, + referenceMap.svBlacklistFoldbackBedpe, + referenceMap.svBlacklistTEBedpe, referenceMap.spliceSites, workflow.projectDir + "/containers/iannotatesv", params.genome diff --git a/modules/subworkflow/sv_wf.nf b/modules/subworkflow/sv_wf.nf index 3feac813..6eb45e4d 100644 --- a/modules/subworkflow/sv_wf.nf +++ b/modules/subworkflow/sv_wf.nf @@ -81,6 +81,10 @@ workflow sv_wf SomaticSVVcf2Bedpe.out.SomaticCombinedUnfilteredBedpe, referenceMap.repeatMasker, referenceMap.mapabilityBlacklist, + referenceMap.svBlacklistBed, + referenceMap.svBlacklistBedpe, + referenceMap.svBlacklistFoldbackBedpe, + referenceMap.svBlacklistTEBedpe, referenceMap.spliceSites, workflow.projectDir + "/containers/iannotatesv", params.genome