Skip to content

Commit

Permalink
Merge pull request #965 from mskcc/dsl2/filterSVBlacklist
Browse files Browse the repository at this point in the history
Dsl2/filter sv blacklist
  • Loading branch information
anoronh4 authored Jul 25, 2022
2 parents 7625b80 + 54209cb commit 2d4e831
Show file tree
Hide file tree
Showing 16 changed files with 363 additions and 105 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ before_install:
- chmod 777 nextflow
# to change the test-data for travis, please download using the following command, extract, make changes, tarball again with gzip, and upload to google drive.
# you will have to change the link below as well. Click to share the link, making it so anyone with the link can access, then extract the id in the link and put it here after "id="
- wget -O test-data.tar.gz --no-check-certificate 'https://docs.google.com/uc?export=download&confirm=no_antivirus&id=1XuitIb02GZ5v928Do2Vh4hnQNkbSAT0V'
- wget -O test-data.tar.gz --no-check-certificate 'https://docs.google.com/uc?export=download&confirm=no_antivirus&id=13zUVw4BZ0_5QAW7zmdCsZ_CZDHbNxyMV'
- tar -xzvf test-data.tar.gz

script:
Expand Down
9 changes: 9 additions & 0 deletions conf/references.config
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ params {
hlaDat = "${params.reference_base}/hla/hla.dat"
neoantigenCDNA = "${params.reference_base}/neoantigen/Homo_sapiens.GRCh37.75.cdna.all.fa.gz"
neoantigenCDS = "${params.reference_base}/neoantigen/Homo_sapiens.GRCh37.75.cds.all.fa.gz"
svBlacklistBed = "${params.genome_base}/sv_calling/pcawg6_blacklist.slop.bed.gz"
svBlacklistBedpe = "${params.genome_base}/sv_calling/pcawg6_blacklist.slop.trunc.bedpe.gz"
svBlacklistFoldbackBedpe = "${params.genome_base}/sv_calling/pcawg6_blacklist_foldback_artefacts.slop.trunc.bedpe.gz"
svBlacklistTEBedpe = "${params.genome_base}/sv_calling/pcawg6_blacklist_TE_pseudogene.bedpe.gz"
}
'GRCh37' {
acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_20130502_SNP_maf0.3.loci"
Expand Down Expand Up @@ -101,6 +105,11 @@ params {
spliceSites = "${params.reference_base}/mskcc-igenomes/grch37/splice_sites/splice_sites.bed"
brassRefDir = "${params.reference_base}/mskcc-igenomes/grch37/brass"
vagrentRefDir = "${params.reference_base}/mskcc-igenomes/grch37/vagrent"
// svBlacklist* source: https://bitbucket.org/weischenfeldt/pcawg_sv_merge/src/docker/data/blacklist_files/
svBlacklistBed = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bed.gz"
svBlacklistBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bedpe.gz"
svBlacklistFoldbackBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist_foldback_artefacts.slop.bedpe.gz"
svBlacklistTEBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist_TE_pseudogene.bedpe.gz"
}
'GRCh38' {
acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci"
Expand Down
8 changes: 8 additions & 0 deletions conf/resources_juno.config
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,10 @@
cpus = { task.attempt < 4 ? 1 : 2 }
memory = { task.attempt < 3 ? 1.GB * task.attempt : 5.Gb * task.attempt }
}
withName:SomaticAnnotateSVBedpe {
cpus = { task.attempt < 4 ? 1 : 2 }
memory = { 8.GB * task.attempt }
}

//------------- Germline pipeline

Expand Down Expand Up @@ -156,6 +160,10 @@
cpus = { task.attempt < 4 ? 1 : 2 }
memory = { task.attempt < 3 ? 1.GB * task.attempt : 5.Gb * task.attempt }
}
withName:GermlineAnnotateSVBedpe {
cpus = { task.attempt < 4 ? 1 : 2 }
memory = { 8.GB * task.attempt }
}

//------------- Quality Control

Expand Down
8 changes: 8 additions & 0 deletions conf/resources_juno_genome.config
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,10 @@
cpus = { task.attempt < 4 ? 1 : 2 }
memory = { 4.GB * task.attempt }
}
withName:SomaticAnnotateSVBedpe {
cpus = { task.attempt < 3 ? 2 : 4 }
memory = { 8.GB * task.attempt }
}
withName:HRDetect {
cpus = { 2 }
memory = { 4.GB * task.attempt }
Expand Down Expand Up @@ -203,6 +207,10 @@
cpus = { task.attempt < 4 ? 1 : 2 }
memory = { task.attempt < 3 ? 1.GB * task.attempt : 5.Gb * task.attempt }
}
withName:GermlineAnnotateSVBedpe {
cpus = { task.attempt < 3 ? 2 : 4 }
memory = { 8.GB * task.attempt }
}

//------------- Quality Control

Expand Down
11 changes: 11 additions & 0 deletions containers/iannotatesv/detect_cdna.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
#!/usr/bin/env python

"""Identify deletion rearrangements that may pertain to cDNA contaminants"""

__author__ = "Anne Marie Noronha"
__email__ = "noronhaa@mskcc.org"
__version__ = "0.0.1"
__status__ = "Dev"

import argparse, sys, os
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -51,6 +60,8 @@ def prep_intersection(intersect=pd.DataFrame(),bedpe_header_list=[]):
def main():
args = usage()

print("Detecting possible cdna contamination in {}".format(os.path.basename(args.bedpe)))

[meta_header, bedpe_header_list, bedpe_df] = parse_svtools_bedpe_file(args.bedpe)

bedpe_bt = pybedtools.BedTool.from_dataframe(bedpe_df)
Expand Down
118 changes: 118 additions & 0 deletions containers/iannotatesv/filter_regions_bedpe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#!/usr/bin/env python

"""Filter rearrangements that overlap with a set of regions"""

__author__ = "Anne Marie Noronha"
__email__ = "noronhaa@mskcc.org"
__version__ = "0.0.1"
__status__ = "Dev"

import argparse, sys, os
import numpy as np
import pandas as pd
import pybedtools
from utils import *

def usage():
parser = argparse.ArgumentParser()
parser.add_argument('--blacklist-regions', dest="regions", help = 'regions bed/bedpe file', required = True)
parser.add_argument('--bedpe', help = 'bedpe file', required = True)
parser.add_argument('--tag', help = 'string to update FILTER column', required = True)
parser.add_argument('--output',dest="outfile",help = 'output file' , required = True)
parser.add_argument('--match-type',dest="type",help = 'both/either/notboth/neither' , required = True)
parser.add_argument('--ignore-strand',dest="ignore_strand", default=False, action="store_true", help = 'Use flag to ignore strand when running bedtools. Default False' )
return parser.parse_args()

def determine_regions_filetype(filepath):
"""
Determine if the regions file is bed or bedpe, and raise error if neither
"""
if not any([filepath.endswith(i) for i in "bed|bedpe|bed.gz|bedpe.gz".split("|")]):
raise ValueError("--bedpe requires bed, bedpe, bed.gz or bedpe.gz file.")
elif any([filepath.endswith(i) for i in "bed|bed.gz".split("|")]):
return True
else: return False

def validate_bedpe_input(filepath):
"""
Raise error if filetype is not bedpe
"""
if not any([filepath.endswith(i) for i in "bedpe|bedpe.gz".split("|")]):
raise ValueError("--bedpe requires bedpe or bedpe.gz file.")

def validate_match_type(type):
"""
Raise error if type is not acceptable for use in pybedtools pair_to_pair or pair_to_bed
"""
if type not in overlap_type.keys():
raise ValueError( "--match-type must be {}.".format(" or ".join(overlap_type.keys())) )

def main():
"""
1. validate inputs
2. read input beds/bedpes as pybedtools.BedTool objects
3. extract variant IDs from overlapping regions
4. update the FILTER tag for identified variants
"""
args = usage()

print("Filtering {} with {} regions".format(os.path.basename(args.bedpe), args.tag))

# parse inputs
try:
validate_bedpe_input(args.bedpe)
[meta_header, bedpe_header_list, bedpe_df] = parse_svtools_bedpe_file(args.bedpe)
except Exception as e:
print(e)
sys.exit("Unable to parse --bedpe")

try:
is_bed = determine_regions_filetype(args.regions)
regions_bt = pybedtools.BedTool(args.regions)
except Exception as e:
print(e)
sys.exit("Unable to parse --blacklist-regions")

# annotate bedpe
try:
bedpe_bt = pybedtools.BedTool.from_dataframe(bedpe_df)
ids_df = find_overlapped_ids(bedpe_bt, regions_bt, args.type, args.ignore_strand, is_bed)
bedpe_df = add_filter_by_id(bedpe_df,ids_df,args.tag)
except Exception as e:
print(e)
sys.exit("Filtering failed with inputs {} and {}".format(args.bedpe, args.regions))

# write result
with open(args.outfile, "w") as fw:
fw.write("".join(meta_header))
bedpe_df.to_csv(args.outfile, header=True, index=False, sep="\t", mode="a")

def find_overlapped_ids(bedpe_bt,regions_bt,match_type,ignore_strand,is_bed):
# determine validity of match_type
validate_match_type(match_type)

# run pair_to_bed if regions file is bed, pair_to_pair if regions file is bedpe
if is_bed:
intersect = run_pair_to_bed(bedpe_bt,regions_bt,match_type)
else:
intersect = run_pair_to_pair(bedpe_bt,regions_bt,match_type, ignore_strand=ignore_strand)

# extract ids
try:
ids_df = pd.DataFrame({"ID":list( intersect.to_dataframe(header=None)[6].drop_duplicates() )})
except:
ids_df = pd.DataFrame(columns=["ID"])
return ids_df

def add_filter_by_id(bedpe_df,ids_df,tag):
# annotate bedpe based on matching IDs
ids_df.columns = ["ID"]
ids_df["FILTER_NEW"] = tag
filtered_bedpe_df = bedpe_df.merge(ids_df, on="ID", how="left")
filtered_bedpe_df = filtered_bedpe_df.apply(lambda x: update_filter(x,x["FILTER_NEW"]) if not pd.isna(x["FILTER_NEW"]) else x, axis=1)
filtered_bedpe_df = filtered_bedpe_df.drop(['FILTER_NEW'], axis=1)

return filtered_bedpe_df

if __name__ == "__main__":
main()
46 changes: 0 additions & 46 deletions containers/iannotatesv/pair_to_bed_annot.py

This file was deleted.

78 changes: 51 additions & 27 deletions containers/iannotatesv/run_iannotatesv.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,74 @@
#!/usr/bin/env python

"""Annotate SV Bedpe file with iAnnotateSV"""
__author__ = "Anne Marie Noronha"
__email__ = "noronhaa@mskcc.org"
__version__ = "0.0.1"
__status__ = "Dev"

import argparse, sys, os, subprocess
from collections import OrderedDict
from datetime import datetime
import multiprocessing as mp
import numpy as np
import pandas as pd
from utils import *

def usage():
parser = argparse.ArgumentParser()
parser.add_argument('--bedpe', help = 'bedpe file', required = True)
parser.add_argument('--genome', default = "hg19", help = 'hg19/hg38/hg18')
parser.add_argument('--threads', type=int, default = 4, help = 'number of threads for parallelization')

return parser.parse_args()

def run_iannotate_cmd(input_file,output_dir,output_pre,genome="hg19"):
print("Spawning in parallel: annotation of " + input_file)
run_args = "python /usr/bin/iAnnotateSV/iAnnotateSV/iAnnotateSV.py -i {} -o {} -ofp {} -r {} -d 3000""".format(input_file, output_dir, output_pre, genome).split(" ")
p = subprocess.Popen(run_args)
p.communicate()

def read_iannotatesv_result(path):
return pd.read_csv(path, sep="\t",header=0)

def main():
args = usage()

meta=""
with open(args.bedpe, 'r') as f:
main_data = False
while main_data == False:
x = f.readline()
if x.startswith("##"):
meta += x
else:
header=x
main_data = True
try:
data = pd.read_csv(f, header=None, sep="\t" )
data.columns = header.strip().split("\t")
except:
data = pd.DataFrame(columns = header.strip().split("\t"))
dt = datetime.now()
print("[{}] Running iAnnotateSV on {}".format(dt,os.path.basename(args.bedpe)))

meta, header_list, data = parse_svtools_bedpe_file(args.bedpe)
iAnnotate_input = data["#CHROM_A|START_A|STRAND_A|CHROM_B|START_B|STRAND_B|ID".split("|")]
iAnnotate_input.columns = "chr1|pos1|str1|chr2|pos2|str2|ID".split("|")

iAnnotate_input = iAnnotate_input.replace("-", 1).replace("+", 0)
key_col = OrderedDict(zip("chr1|pos1|str1|chr2|pos2|str2".split("|"), [str,int,int,str,int,int]))
iAnnotate_input.columns = key_col.keys() + ["ID"]
for k,v in key_col.items():
iAnnotate_input[k] = iAnnotate_input[k].astype(v)

iAnnotate_input.to_csv("input.txt",sep="\t",header=True,index=False)
k = int(500)
n_chunks = int(((k-1)+iAnnotate_input.shape[0])/k)
iAnnotate_output = pd.DataFrame()
if not os.path.isdir("inputs"): os.mkdir("inputs")
if not os.path.isdir("outputs"): os.mkdir("outputs")

if not os.path.isdir("outputdir"):
os.mkdir("outputdir")
run_args = "python /usr/bin/iAnnotateSV/iAnnotateSV/iAnnotateSV.py -i input.txt -ofp outputfilePrefix -o outputdir -r {} -d 3000""".format(args.genome).split(" ")
p = subprocess.Popen(run_args)
p.communicate()
pool = mp.Pool(args.threads)
for i in range(n_chunks):
iAnnotate_input_chunk = iAnnotate_input.iloc[i*k:(i+1)*k]
input_file = os.path.join("inputs","input_" + str(i) + ".txt")
iAnnotate_input_chunk.to_csv(input_file,sep="\t",header=True,index=False)
pool.apply_async(run_iannotate_cmd, args=(input_file, "outputs", "output_" + str(i), args.genome ))

iAnnotate_output = pd.read_csv("outputdir/outputfilePrefix_Annotated.txt", sep="\t",header=0)
iAnnotate_output = pd.merge(iAnnotate_input,iAnnotate_output, on="chr1|pos1|str1|chr2|pos2|str2".split("|"), how="inner")
pool.close()
pool.join()

annot_data = iAnnotate_output.drop("chr1|pos1|str1|chr2|pos2|str2".split("|"), axis=1)
annot_data = annot_data.replace(np.nan, ".")
iAnnotate_output = pd.concat(map(read_iannotatesv_result, [os.path.join("outputs","output_" + str(i) + "_Annotated.txt") for i in range(n_chunks)]))
for key,val in key_col.items():
iAnnotate_output[key] = iAnnotate_output[key].astype(val)

iAnnotate_output = pd.merge(iAnnotate_input,iAnnotate_output, on=key_col.keys(), how="inner")

print("[{}] Merging annotation with bedpe".format(datetime.now()))
annot_data = iAnnotate_output.drop(key_col.keys(), axis=1).replace(np.nan, ".")
final_data = pd.merge(data, annot_data, on="ID",how="left")

outputbed = os.path.basename(args.bedpe)[:-6] if args.bedpe.endswith(".bedpe") else os.path.basename(args.bedpe)
Expand All @@ -55,5 +77,7 @@ def main():
fw.write(meta)
final_data.to_csv(outputbed,sep="\t",header=True,index=False, mode='a')

print("[{}] iAnnotateSV complete".format(datetime.now()))

if __name__ == "__main__":
main()
Loading

0 comments on commit 2d4e831

Please sign in to comment.