Merge pull request #965 from mskcc/dsl2/filterSVBlacklist

Dsl2/filter sv blacklist
mskcc · Jul 25, 2022 · 2d4e831 · 2d4e831
2 parents 7625b80 + 54209cb
commit 2d4e831
Show file tree

Hide file tree

Showing 16 changed files with 363 additions and 105 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -14,7 +14,7 @@ before_install:
   - chmod 777 nextflow
   # to change the test-data for travis, please download using the following command, extract, make changes, tarball again with gzip, and upload to google drive. 
   # you will have to change the link below as well. Click to share the link, making it so anyone with the link can access, then extract the id in the link and put it here after "id="
-  - wget -O test-data.tar.gz --no-check-certificate 'https://docs.google.com/uc?export=download&confirm=no_antivirus&id=1XuitIb02GZ5v928Do2Vh4hnQNkbSAT0V'
+  - wget -O test-data.tar.gz --no-check-certificate 'https://docs.google.com/uc?export=download&confirm=no_antivirus&id=13zUVw4BZ0_5QAW7zmdCsZ_CZDHbNxyMV'
   - tar -xzvf test-data.tar.gz 
 
 script:

diff --git a/conf/references.config b/conf/references.config
@@ -60,6 +60,10 @@ params {
       hlaDat = "${params.reference_base}/hla/hla.dat"
       neoantigenCDNA = "${params.reference_base}/neoantigen/Homo_sapiens.GRCh37.75.cdna.all.fa.gz"
       neoantigenCDS = "${params.reference_base}/neoantigen/Homo_sapiens.GRCh37.75.cds.all.fa.gz"
+      svBlacklistBed   = "${params.genome_base}/sv_calling/pcawg6_blacklist.slop.bed.gz"
+      svBlacklistBedpe = "${params.genome_base}/sv_calling/pcawg6_blacklist.slop.trunc.bedpe.gz"
+      svBlacklistFoldbackBedpe = "${params.genome_base}/sv_calling/pcawg6_blacklist_foldback_artefacts.slop.trunc.bedpe.gz"
+      svBlacklistTEBedpe       = "${params.genome_base}/sv_calling/pcawg6_blacklist_TE_pseudogene.bedpe.gz"
     }
     'GRCh37' {
       acLoci      = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_20130502_SNP_maf0.3.loci"
@@ -101,6 +105,11 @@ params {
       spliceSites = "${params.reference_base}/mskcc-igenomes/grch37/splice_sites/splice_sites.bed"
       brassRefDir      = "${params.reference_base}/mskcc-igenomes/grch37/brass"
       vagrentRefDir    = "${params.reference_base}/mskcc-igenomes/grch37/vagrent"
+      // svBlacklist* source: https://bitbucket.org/weischenfeldt/pcawg_sv_merge/src/docker/data/blacklist_files/
+      svBlacklistBed   = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bed.gz"
+      svBlacklistBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bedpe.gz"
+      svBlacklistFoldbackBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist_foldback_artefacts.slop.bedpe.gz"
+      svBlacklistTEBedpe       = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist_TE_pseudogene.bedpe.gz"
     } 
     'GRCh38' {
       acLoci           = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci"

diff --git a/conf/resources_juno.config b/conf/resources_juno.config
@@ -113,6 +113,10 @@
     cpus = { task.attempt < 4 ? 1 : 2 } 
     memory = { task.attempt < 3 ? 1.GB * task.attempt : 5.Gb * task.attempt } 
   }
+  withName:SomaticAnnotateSVBedpe {
+    cpus = { task.attempt < 4 ? 1 : 2 }
+    memory = { 8.GB * task.attempt }
+  }
 
 //------------- Germline pipeline
 
@@ -156,6 +160,10 @@
     cpus = { task.attempt < 4 ? 1 : 2 } 
     memory = { task.attempt < 3 ? 1.GB * task.attempt : 5.Gb * task.attempt } 
   }
+  withName:GermlineAnnotateSVBedpe {
+    cpus = { task.attempt < 4 ? 1 : 2 }
+    memory = { 8.GB * task.attempt }
+  }
 
 //------------- Quality Control
 

diff --git a/conf/resources_juno_genome.config b/conf/resources_juno_genome.config
@@ -152,6 +152,10 @@
     cpus = { task.attempt < 4 ? 1 : 2 }
     memory = { 4.GB * task.attempt } 
   }
+  withName:SomaticAnnotateSVBedpe {
+    cpus = { task.attempt < 3 ? 2 : 4 }
+    memory = { 8.GB * task.attempt }
+  }
   withName:HRDetect {
     cpus = { 2 }
     memory = { 4.GB * task.attempt }
@@ -203,6 +207,10 @@
     cpus = { task.attempt < 4 ? 1 : 2 } 
     memory = { task.attempt < 3 ? 1.GB * task.attempt : 5.Gb * task.attempt } 
   }
+  withName:GermlineAnnotateSVBedpe {
+    cpus = { task.attempt < 3 ? 2 : 4 }
+    memory = { 8.GB * task.attempt }
+  }
 
 //------------- Quality Control
 

diff --git a/containers/iannotatesv/detect_cdna.py b/containers/iannotatesv/detect_cdna.py
@@ -1,3 +1,12 @@
+#!/usr/bin/env python
+
+"""Identify deletion rearrangements that may pertain to cDNA contaminants"""
+
+__author__  = "Anne Marie Noronha"
+__email__   = "noronhaa@mskcc.org"
+__version__ = "0.0.1"
+__status__  = "Dev"
+
 import argparse, sys, os
 import numpy as np
 import pandas as pd
@@ -51,6 +60,8 @@ def prep_intersection(intersect=pd.DataFrame(),bedpe_header_list=[]):
 def main():
 	args = usage()
 
+	print("Detecting possible cdna contamination in {}".format(os.path.basename(args.bedpe)))
+
 	[meta_header, bedpe_header_list, bedpe_df] = parse_svtools_bedpe_file(args.bedpe)
 
 	bedpe_bt = pybedtools.BedTool.from_dataframe(bedpe_df)

diff --git a/containers/iannotatesv/filter_regions_bedpe.py b/containers/iannotatesv/filter_regions_bedpe.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+
+"""Filter rearrangements that overlap with a set of regions"""
+
+__author__  = "Anne Marie Noronha"
+__email__   = "noronhaa@mskcc.org"
+__version__ = "0.0.1"
+__status__  = "Dev"
+
+import argparse, sys, os
+import numpy as np
+import pandas as pd
+import pybedtools
+from utils import *
+
+def usage():
+	parser = argparse.ArgumentParser()
+	parser.add_argument('--blacklist-regions', dest="regions", help = 'regions bed/bedpe file', required = True)
+	parser.add_argument('--bedpe', help = 'bedpe file', required = True)
+	parser.add_argument('--tag', help = 'string to update FILTER column', required = True)
+	parser.add_argument('--output',dest="outfile",help = 'output file' , required = True)
+	parser.add_argument('--match-type',dest="type",help = 'both/either/notboth/neither' , required = True)
+	parser.add_argument('--ignore-strand',dest="ignore_strand", default=False, action="store_true", help = 'Use flag to ignore strand when running bedtools. Default False' )
+	return parser.parse_args()
+
+def determine_regions_filetype(filepath):
+	"""
+	Determine if the regions file is bed or bedpe, and raise error if neither
+	"""
+	if not any([filepath.endswith(i) for i in "bed|bedpe|bed.gz|bedpe.gz".split("|")]):
+		raise ValueError("--bedpe requires bed, bedpe, bed.gz or bedpe.gz file.")
+	elif any([filepath.endswith(i) for i in "bed|bed.gz".split("|")]):
+		return True
+	else: return False
+
+def validate_bedpe_input(filepath):
+	"""
+	Raise error if filetype is not bedpe
+	"""
+	if not any([filepath.endswith(i) for i in "bedpe|bedpe.gz".split("|")]):
+		raise ValueError("--bedpe requires bedpe or bedpe.gz file.")
+
+def validate_match_type(type):
+	"""
+	Raise error if type is not acceptable for use in pybedtools pair_to_pair or pair_to_bed
+	"""
+	if type not in overlap_type.keys():
+		raise ValueError( "--match-type must be {}.".format(" or ".join(overlap_type.keys())) )
+
+def main():
+	"""
+	1. validate inputs
+	2. read input beds/bedpes as pybedtools.BedTool objects
+	3. extract variant IDs from overlapping regions
+	4. update the FILTER tag for identified variants
+	"""
+	args = usage()
+
+	print("Filtering {} with {} regions".format(os.path.basename(args.bedpe), args.tag))
+
+	# parse inputs
+	try:
+		validate_bedpe_input(args.bedpe)
+		[meta_header, bedpe_header_list, bedpe_df] = parse_svtools_bedpe_file(args.bedpe)
+	except Exception as e:
+		print(e)
+		sys.exit("Unable to parse --bedpe")
+
+	try:
+		is_bed = determine_regions_filetype(args.regions)
+		regions_bt = pybedtools.BedTool(args.regions)
+	except Exception as e:
+		print(e)
+		sys.exit("Unable to parse --blacklist-regions")
+
+	# annotate bedpe
+	try:
+		bedpe_bt = pybedtools.BedTool.from_dataframe(bedpe_df)
+		ids_df = find_overlapped_ids(bedpe_bt, regions_bt, args.type, args.ignore_strand, is_bed)
+		bedpe_df = add_filter_by_id(bedpe_df,ids_df,args.tag)
+	except Exception as e:
+		print(e)
+		sys.exit("Filtering failed with inputs {} and {}".format(args.bedpe, args.regions))
+
+	# write result
+	with open(args.outfile, "w") as fw:
+		fw.write("".join(meta_header))
+	bedpe_df.to_csv(args.outfile, header=True, index=False, sep="\t", mode="a")
+
+def find_overlapped_ids(bedpe_bt,regions_bt,match_type,ignore_strand,is_bed):
+	# determine validity of match_type
+	validate_match_type(match_type)
+
+	# run pair_to_bed if regions file is bed, pair_to_pair if regions file is bedpe
+	if is_bed:
+		intersect = run_pair_to_bed(bedpe_bt,regions_bt,match_type)
+	else:
+		intersect = run_pair_to_pair(bedpe_bt,regions_bt,match_type, ignore_strand=ignore_strand)
+
+	# extract ids
+	try:
+		ids_df = pd.DataFrame({"ID":list( intersect.to_dataframe(header=None)[6].drop_duplicates() )})
+	except:
+		ids_df = pd.DataFrame(columns=["ID"])
+	return ids_df
+
+def add_filter_by_id(bedpe_df,ids_df,tag):
+	# annotate bedpe based on matching IDs
+	ids_df.columns = ["ID"]
+	ids_df["FILTER_NEW"] = tag
+	filtered_bedpe_df = bedpe_df.merge(ids_df, on="ID", how="left")
+	filtered_bedpe_df = filtered_bedpe_df.apply(lambda x: update_filter(x,x["FILTER_NEW"]) if not pd.isna(x["FILTER_NEW"]) else x, axis=1)
+	filtered_bedpe_df = filtered_bedpe_df.drop(['FILTER_NEW'], axis=1)
+
+	return filtered_bedpe_df
+
+if __name__ == "__main__":
+	main()
diff --git a/containers/iannotatesv/pair_to_bed_annot.py b/containers/iannotatesv/pair_to_bed_annot.py
diff --git a/containers/iannotatesv/run_iannotatesv.py b/containers/iannotatesv/run_iannotatesv.py
@@ -1,52 +1,74 @@
+#!/usr/bin/env python
+
+"""Annotate SV Bedpe file with iAnnotateSV"""
+__author__  = "Anne Marie Noronha"
+__email__   = "noronhaa@mskcc.org"
+__version__ = "0.0.1"
+__status__  = "Dev"
+
 import argparse, sys, os, subprocess
+from collections import OrderedDict
+from datetime import datetime
+import multiprocessing as mp
 import numpy as np
 import pandas as pd
+from utils import *
 
 def usage():
 	parser = argparse.ArgumentParser()
 	parser.add_argument('--bedpe', help = 'bedpe file', required = True)
 	parser.add_argument('--genome', default = "hg19", help = 'hg19/hg38/hg18')
+	parser.add_argument('--threads', type=int, default = 4, help = 'number of threads for parallelization')
 
 	return parser.parse_args()
 
+def run_iannotate_cmd(input_file,output_dir,output_pre,genome="hg19"):
+	print("Spawning in parallel: annotation of " + input_file)
+	run_args = "python /usr/bin/iAnnotateSV/iAnnotateSV/iAnnotateSV.py -i {} -o {} -ofp {} -r {} -d 3000""".format(input_file, output_dir, output_pre, genome).split(" ")
+	p = subprocess.Popen(run_args)
+	p.communicate()
+
+def read_iannotatesv_result(path):
+	return pd.read_csv(path, sep="\t",header=0)
+
 def main():
 	args = usage()
 
-	meta=""
-	with open(args.bedpe, 'r') as f:
-		main_data = False
-		while main_data == False:
-			x = f.readline()
-			if x.startswith("##"):
-				meta += x
-			else: 
-				header=x
-				main_data = True
-		try:
-			data = pd.read_csv(f, header=None, sep="\t" )
-			data.columns = header.strip().split("\t")
-		except:
-			data = pd.DataFrame(columns = header.strip().split("\t"))
+	dt = datetime.now()
+	print("[{}] Running iAnnotateSV on {}".format(dt,os.path.basename(args.bedpe)))
 
+	meta, header_list, data = parse_svtools_bedpe_file(args.bedpe)
 	iAnnotate_input = data["#CHROM_A|START_A|STRAND_A|CHROM_B|START_B|STRAND_B|ID".split("|")]
-	iAnnotate_input.columns = "chr1|pos1|str1|chr2|pos2|str2|ID".split("|")
-
 	iAnnotate_input = iAnnotate_input.replace("-", 1).replace("+", 0)
+	key_col = OrderedDict(zip("chr1|pos1|str1|chr2|pos2|str2".split("|"), [str,int,int,str,int,int]))
+	iAnnotate_input.columns = key_col.keys() + ["ID"]
+	for k,v in key_col.items():
+		iAnnotate_input[k] = iAnnotate_input[k].astype(v)
 
-	iAnnotate_input.to_csv("input.txt",sep="\t",header=True,index=False)
+	k = int(500)
+	n_chunks = int(((k-1)+iAnnotate_input.shape[0])/k)
+	iAnnotate_output = pd.DataFrame()
+	if not os.path.isdir("inputs"): os.mkdir("inputs")
+	if not os.path.isdir("outputs"): os.mkdir("outputs")
 
-	if not os.path.isdir("outputdir"):
-		os.mkdir("outputdir")
-	run_args = "python /usr/bin/iAnnotateSV/iAnnotateSV/iAnnotateSV.py -i input.txt -ofp outputfilePrefix -o outputdir -r {} -d 3000""".format(args.genome).split(" ")
-	p = subprocess.Popen(run_args)
-	p.communicate()
+	pool = mp.Pool(args.threads)
+	for i in range(n_chunks):
+		iAnnotate_input_chunk = iAnnotate_input.iloc[i*k:(i+1)*k]
+		input_file = os.path.join("inputs","input_" + str(i) + ".txt")
+		iAnnotate_input_chunk.to_csv(input_file,sep="\t",header=True,index=False)
+		pool.apply_async(run_iannotate_cmd, args=(input_file, "outputs", "output_" + str(i), args.genome ))
 
-	iAnnotate_output = pd.read_csv("outputdir/outputfilePrefix_Annotated.txt", sep="\t",header=0)
-	iAnnotate_output = pd.merge(iAnnotate_input,iAnnotate_output, on="chr1|pos1|str1|chr2|pos2|str2".split("|"), how="inner")
+	pool.close()
+	pool.join()
 
-	annot_data = iAnnotate_output.drop("chr1|pos1|str1|chr2|pos2|str2".split("|"), axis=1)
-	annot_data = annot_data.replace(np.nan, ".")
+	iAnnotate_output = pd.concat(map(read_iannotatesv_result, [os.path.join("outputs","output_" + str(i) + "_Annotated.txt") for i in range(n_chunks)]))
+	for key,val in key_col.items():
+		iAnnotate_output[key] = iAnnotate_output[key].astype(val)
 
+	iAnnotate_output = pd.merge(iAnnotate_input,iAnnotate_output, on=key_col.keys(), how="inner")
+
+	print("[{}] Merging annotation with bedpe".format(datetime.now()))
+	annot_data = iAnnotate_output.drop(key_col.keys(), axis=1).replace(np.nan, ".")
 	final_data = pd.merge(data, annot_data, on="ID",how="left")
 
 	outputbed = os.path.basename(args.bedpe)[:-6] if args.bedpe.endswith(".bedpe") else os.path.basename(args.bedpe)
@@ -55,5 +77,7 @@ def main():
 		fw.write(meta)
 	final_data.to_csv(outputbed,sep="\t",header=True,index=False, mode='a')
 
+	print("[{}] iAnnotateSV complete".format(datetime.now()))
+
 if __name__ == "__main__":
 	main()