Dsl2/filter sv blacklist #965

Merged Jul 25, 2022 (28 commits)

Commits
4ac5ef9
change bedpe filtering script to handle both bedpe and bed regions fi…
anoronh4 Jun 24, 2022
bd5f86e
add pcawg blacklist filtering
anoronh4 Jun 24, 2022
928ef4e
added reference definition for pcawg blacklist files for loading them…
anoronh4 Jun 24, 2022
888a904
fixed filepaths for pcawg SV blacklist files
anoronh4 Jun 24, 2022
188f642
bugfix of file name for germline annotation
anoronh4 Jun 27, 2022
2a2a9ee
reorganized filter_regions_bedpe.py to be more modularized
anoronh4 Jun 27, 2022
2fbc02b
cleanup filter_regions_bedpe.py
anoronh4 Jun 27, 2022
5bf1be3
add notes
anoronh4 Jun 27, 2022
328968c
fix filtering tags for sv annotation
anoronh4 Jun 28, 2022
981bce0
added pcawg blacklist filtering to Germline SV annotation, and fixed …
anoronh4 Jun 29, 2022
da20426
bugfixes
anoronh4 Jun 29, 2022
feea8a3
bugfix for GermlineAnnotateSVBedpe inputs
anoronh4 Jun 30, 2022
ecca378
add reference for transposable element filtering in SV
anoronh4 Jun 30, 2022
ddaff0e
added print statements for logging purposes, and made it so iannotate…
anoronh4 Jul 2, 2022
21ad22a
bugfixes
anoronh4 Jul 5, 2022
37ab8ad
fix header of bedpe due to improper handling in svtools
anoronh4 Jul 5, 2022
063e5f2
fix bedpe header
anoronh4 Jul 5, 2022
75e35e2
parallelize iannotatesv
anoronh4 Jul 5, 2022
5563d0d
add resource configuration for *AnnotateSVBedpe processes
anoronh4 Jul 6, 2022
d4cdeb3
raise resources for *AnnotateSVBedpe
anoronh4 Jul 11, 2022
92764d0
edited travis test references for filtering SV events
anoronh4 Jul 11, 2022
d0ebe80
bugfix
anoronh4 Jul 11, 2022
5c9faca
Merge branch 'dsl2/enhancement/WGS_SV' into dsl2/filterSVBlacklist
anoronh4 Jul 11, 2022
20bc35b
remove duplicated line
anoronh4 Jul 11, 2022
32978ee
remove chunk parameter from run_iannotatesv.py because finetuning it …
anoronh4 Jul 22, 2022
2d841c8
Merge branch 'dsl2/filterSVBlacklist' of github.com:mskcc/tempo into …
anoronh4 Jul 22, 2022
c0aeca7
add note about sources for pcawg blacklist files
anoronh4 Jul 22, 2022
54209cb
add description and authorship info for each python file
anoronh4 Jul 25, 2022

.travis.yml: 2 changes (1 addition, 1 deletion)
@@ -14,7 +14,7 @@ before_install:
- chmod 777 nextflow
# to change the test-data for travis, please download using the following command, extract, make changes, tarball again with gzip, and upload to google drive.
# you will have to change the link below as well. Click to share the link, making it so anyone with the link can access, then extract the id in the link and put it here after "id="
- wget -O test-data.tar.gz --no-check-certificate 'https://docs.google.com/uc?export=download&confirm=no_antivirus&id=1XuitIb02GZ5v928Do2Vh4hnQNkbSAT0V'
- wget -O test-data.tar.gz --no-check-certificate 'https://docs.google.com/uc?export=download&confirm=no_antivirus&id=13zUVw4BZ0_5QAW7zmdCsZ_CZDHbNxyMV'
- tar -xzvf test-data.tar.gz

script:

conf/references.config: 9 changes (9 additions, 0 deletions)
@@ -60,6 +60,10 @@ params {
hlaDat = "${params.reference_base}/hla/hla.dat"
neoantigenCDNA = "${params.reference_base}/neoantigen/Homo_sapiens.GRCh37.75.cdna.all.fa.gz"
neoantigenCDS = "${params.reference_base}/neoantigen/Homo_sapiens.GRCh37.75.cds.all.fa.gz"
svBlacklistBed = "${params.genome_base}/sv_calling/pcawg6_blacklist.slop.bed.gz"
svBlacklistBedpe = "${params.genome_base}/sv_calling/pcawg6_blacklist.slop.trunc.bedpe.gz"
svBlacklistFoldbackBedpe = "${params.genome_base}/sv_calling/pcawg6_blacklist_foldback_artefacts.slop.trunc.bedpe.gz"
svBlacklistTEBedpe = "${params.genome_base}/sv_calling/pcawg6_blacklist_TE_pseudogene.bedpe.gz"
}
'GRCh37' {
acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_20130502_SNP_maf0.3.loci"
@@ -101,6 +105,11 @@ params {
spliceSites = "${params.reference_base}/mskcc-igenomes/grch37/splice_sites/splice_sites.bed"
brassRefDir = "${params.reference_base}/mskcc-igenomes/grch37/brass"
vagrentRefDir = "${params.reference_base}/mskcc-igenomes/grch37/vagrent"
// svBlacklist* source: https://bitbucket.org/weischenfeldt/pcawg_sv_merge/src/docker/data/blacklist_files/
svBlacklistBed = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bed.gz"
svBlacklistBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist.slop.bedpe.gz"
svBlacklistFoldbackBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist_foldback_artefacts.slop.bedpe.gz"
svBlacklistTEBedpe = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/pcawg6_blacklist_TE_pseudogene.bedpe.gz"
}
'GRCh38' {
acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci"

conf/resources_juno.config: 8 changes (8 additions, 0 deletions)
@@ -113,6 +113,10 @@
cpus = { task.attempt < 4 ? 1 : 2 }
memory = { task.attempt < 3 ? 1.GB * task.attempt : 5.Gb * task.attempt }
}
withName:SomaticAnnotateSVBedpe {
cpus = { task.attempt < 4 ? 1 : 2 }
memory = { 8.GB * task.attempt }
}

//------------- Germline pipeline

@@ -156,6 +160,10 @@
cpus = { task.attempt < 4 ? 1 : 2 }
memory = { task.attempt < 3 ? 1.GB * task.attempt : 5.Gb * task.attempt }
}
withName:GermlineAnnotateSVBedpe {
cpus = { task.attempt < 4 ? 1 : 2 }
memory = { 8.GB * task.attempt }
}

//------------- Quality Control

conf/resources_juno_genome.config: 8 changes (8 additions, 0 deletions)
@@ -152,6 +152,10 @@
cpus = { task.attempt < 4 ? 1 : 2 }
memory = { 4.GB * task.attempt }
}
withName:SomaticAnnotateSVBedpe {
cpus = { task.attempt < 3 ? 2 : 4 }
memory = { 8.GB * task.attempt }
}
withName:HRDetect {
cpus = { 2 }
memory = { 4.GB * task.attempt }
@@ -203,6 +207,10 @@
cpus = { task.attempt < 4 ? 1 : 2 }
memory = { task.attempt < 3 ? 1.GB * task.attempt : 5.Gb * task.attempt }
}
withName:GermlineAnnotateSVBedpe {
cpus = { task.attempt < 3 ? 2 : 4 }
memory = { 8.GB * task.attempt }
}

//------------- Quality Control

containers/iannotatesv/detect_cdna.py: 2 changes (2 additions, 0 deletions)
@@ -51,6 +51,8 @@ def prep_intersection(intersect=pd.DataFrame(),bedpe_header_list=[]):
def main():
args = usage()

print("Detecting possible cdna contamination in {}".format(os.path.basename(args.bedpe)))

[meta_header, bedpe_header_list, bedpe_df] = parse_svtools_bedpe_file(args.bedpe)

bedpe_bt = pybedtools.BedTool.from_dataframe(bedpe_df)
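
detect_cdna.py above and the scripts below all load their input through parse_svtools_bedpe_file from utils.py (pulled in with "from utils import *"), which is not part of this diff. Below is a minimal sketch of what that helper plausibly looks like, modeled on the inline parsing logic this PR removes from run_iannotatesv.py; the name comes from the call sites shown here, but the exact signature and return shape are assumptions.

import pandas as pd

def parse_svtools_bedpe_file(path):
    # Hypothetical reconstruction: the real helper lives in utils.py,
    # outside this diff. Mirrors the inline parser removed from
    # run_iannotatesv.py below: "##" lines form the meta header, the next
    # line holds the column names, and everything after is TSV data.
    meta_header = ""
    with open(path, "r") as f:
        line = f.readline()
        while line.startswith("##"):
            meta_header += line
            line = f.readline()
        header_list = line.strip().split("\t")
        try:
            bedpe_df = pd.read_csv(f, header=None, sep="\t")
            bedpe_df.columns = header_list
        except pd.errors.EmptyDataError:
            bedpe_df = pd.DataFrame(columns=header_list)
    return meta_header, header_list, bedpe_df
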
containers/iannotatesv/filter_regions_bedpe.py: 109 changes (109 additions, 0 deletions)
@@ -0,0 +1,109 @@
import argparse, sys, os
import numpy as np
import pandas as pd
import pybedtools
from utils import *

def usage():
parser = argparse.ArgumentParser()
parser.add_argument('--blacklist-regions', dest="regions", help = 'regions bed/bedpe file', required = True)
parser.add_argument('--bedpe', help = 'bedpe file', required = True)
parser.add_argument('--tag', help = 'string to update FILTER column', required = True)
parser.add_argument('--output',dest="outfile",help = 'output file' , required = True)
parser.add_argument('--match-type',dest="type",help = 'both/either/notboth/neither' , required = True)
parser.add_argument('--ignore-strand',dest="ignore_strand", default=False, action="store_true", help = 'Use flag to ignore strand when running bedtools. Default False' )
return parser.parse_args()

def determine_regions_filetype(filepath):
"""
Determine if the regions file is bed or bedpe, and raise error if neither
"""
if not any([filepath.endswith(i) for i in "bed|bedpe|bed.gz|bedpe.gz".split("|")]):
raise ValueError("--bedpe requires bed, bedpe, bed.gz or bedpe.gz file.")
elif any([filepath.endswith(i) for i in "bed|bed.gz".split("|")]):
return True
else: return False

def validate_bedpe_input(filepath):
"""
Raise error if filetype is not bedpe
"""
if not any([filepath.endswith(i) for i in "bedpe|bedpe.gz".split("|")]):
raise ValueError("--bedpe requires bedpe or bedpe.gz file.")

def validate_match_type(type):
"""
Raise error if type is not acceptable for use in pybedtools pair_to_pair or pair_to_bed
"""
if type not in overlap_type.keys():
raise ValueError( "--match-type must be {}.".format(" or ".join(overlap_type.keys())) )

def main():
"""
1. validate inputs
2. read input beds/bedpes as pybedtools.BedTool objects
3. extract variant IDs from overlapping regions
4. update the FILTER tag for identified variants
"""
args = usage()

print("Filtering {} with {} regions".format(os.path.basename(args.bedpe), args.tag))

# parse inputs
try:
validate_bedpe_input(args.bedpe)
[meta_header, bedpe_header_list, bedpe_df] = parse_svtools_bedpe_file(args.bedpe)
except Exception as e:
print(e)
sys.exit("Unable to parse --bedpe")

try:
is_bed = determine_regions_filetype(args.regions)
regions_bt = pybedtools.BedTool(args.regions)
except Exception as e:
print(e)
sys.exit("Unable to parse --blacklist-regions")

# annotate bedpe
try:
bedpe_bt = pybedtools.BedTool.from_dataframe(bedpe_df)
ids_df = find_overlapped_ids(bedpe_bt, regions_bt, args.type, args.ignore_strand, is_bed)
bedpe_df = add_filter_by_id(bedpe_df,ids_df,args.tag)
except Exception as e:
print(e)
sys.exit("Filtering failed with inputs {} and {}".format(args.bedpe, args.regions))

# write result
with open(args.outfile, "w") as fw:
fw.write("".join(meta_header))
bedpe_df.to_csv(args.outfile, header=True, index=False, sep="\t", mode="a")

def find_overlapped_ids(bedpe_bt,regions_bt,match_type,ignore_strand,is_bed):
# determine validity of match_type
validate_match_type(match_type)

# run pair_to_bed if regions file is bed, pair_to_pair if regions file is bedpe
if is_bed:
intersect = run_pair_to_bed(bedpe_bt,regions_bt,match_type)
else:
intersect = run_pair_to_pair(bedpe_bt,regions_bt,match_type, ignore_strand=ignore_strand)

# extract ids
try:
ids_df = pd.DataFrame({"ID":list( intersect.to_dataframe(header=None)[6].drop_duplicates() )})
except:
ids_df = pd.DataFrame(columns=["ID"])
return ids_df

def add_filter_by_id(bedpe_df,ids_df,tag):
# annotate bedpe based on matching IDs
ids_df.columns = ["ID"]
ids_df["FILTER_NEW"] = tag
filtered_bedpe_df = bedpe_df.merge(ids_df, on="ID", how="left")
filtered_bedpe_df = filtered_bedpe_df.apply(lambda x: update_filter(x,x["FILTER_NEW"]) if not pd.isna(x["FILTER_NEW"]) else x, axis=1)
filtered_bedpe_df = filtered_bedpe_df.drop(['FILTER_NEW'], axis=1)

return filtered_bedpe_df

if __name__ == "__main__":
main()
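
A hedged invocation example for this new script: the flags match usage() above and the blacklist file name matches the svBlacklistBed reference added in conf/references.config, but the sample file names, paths, and tag value are placeholders.

import subprocess

# Hypothetical invocation; file names and the FILTER tag are placeholders.
# With a bed regions file and --match-type both, records whose breakends
# both overlap blacklisted regions get the tag appended to their FILTER
# column (bed regions go through pairToBed, bedpe regions through pairToPair).
cmd = [
    "python", "filter_regions_bedpe.py",
    "--blacklist-regions", "pcawg6_blacklist.slop.bed.gz",
    "--bedpe", "sample.sv.bedpe",
    "--tag", "blacklist",
    "--match-type", "both",            # both/either/notboth/neither
    "--output", "sample.sv.filtered.bedpe",
    "--ignore-strand",                 # only consulted when regions are bedpe
]
subprocess.run(cmd, check=True)
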
containers/iannotatesv/pair_to_bed_annot.py: 46 changes (0 additions, 46 deletions)

This file was deleted.

containers/iannotatesv/run_iannotatesv.py: 70 changes (43 additions, 27 deletions)
@@ -1,52 +1,66 @@
import argparse, sys, os, subprocess
from collections import OrderedDict
from datetime import datetime
import multiprocessing as mp
import numpy as np
import pandas as pd
from utils import *

def usage():
parser = argparse.ArgumentParser()
parser.add_argument('--bedpe', help = 'bedpe file', required = True)
parser.add_argument('--genome', default = "hg19", help = 'hg19/hg38/hg18')
parser.add_argument('--threads', type=int, default = 4, help = 'number of threads for parallelization')

return parser.parse_args()

def run_iannotate_cmd(input_file,output_dir,output_pre,genome="hg19"):
print("Spawning in parallel: annotation of " + input_file)
run_args = "python /usr/bin/iAnnotateSV/iAnnotateSV/iAnnotateSV.py -i {} -o {} -ofp {} -r {} -d 3000""".format(input_file, output_dir, output_pre, genome).split(" ")
p = subprocess.Popen(run_args)
p.communicate()

def read_iannotatesv_result(path):
return pd.read_csv(path, sep="\t",header=0)

def main():
args = usage()

meta=""
with open(args.bedpe, 'r') as f:
main_data = False
while main_data == False:
x = f.readline()
if x.startswith("##"):
meta += x
else:
header=x
main_data = True
try:
data = pd.read_csv(f, header=None, sep="\t" )
data.columns = header.strip().split("\t")
except:
data = pd.DataFrame(columns = header.strip().split("\t"))
dt = datetime.now()
print("[{}] Running iAnnotateSV on {}".format(dt,os.path.basename(args.bedpe)))

meta, header_list, data = parse_svtools_bedpe_file(args.bedpe)
iAnnotate_input = data["#CHROM_A|START_A|STRAND_A|CHROM_B|START_B|STRAND_B|ID".split("|")]
iAnnotate_input.columns = "chr1|pos1|str1|chr2|pos2|str2|ID".split("|")

iAnnotate_input = iAnnotate_input.replace("-", 1).replace("+", 0)
key_col = OrderedDict(zip("chr1|pos1|str1|chr2|pos2|str2".split("|"), [str,int,int,str,int,int]))
iAnnotate_input.columns = key_col.keys() + ["ID"]
for k,v in key_col.items():
iAnnotate_input[k] = iAnnotate_input[k].astype(v)

iAnnotate_input.to_csv("input.txt",sep="\t",header=True,index=False)
k = int(500)
n_chunks = int(((k-1)+iAnnotate_input.shape[0])/k)
iAnnotate_output = pd.DataFrame()
if not os.path.isdir("inputs"): os.mkdir("inputs")
if not os.path.isdir("outputs"): os.mkdir("outputs")

if not os.path.isdir("outputdir"):
os.mkdir("outputdir")
run_args = "python /usr/bin/iAnnotateSV/iAnnotateSV/iAnnotateSV.py -i input.txt -ofp outputfilePrefix -o outputdir -r {} -d 3000""".format(args.genome).split(" ")
p = subprocess.Popen(run_args)
p.communicate()
pool = mp.Pool(args.threads)
for i in range(n_chunks):
iAnnotate_input_chunk = iAnnotate_input.iloc[i*k:(i+1)*k]
input_file = os.path.join("inputs","input_" + str(i) + ".txt")
iAnnotate_input_chunk.to_csv(input_file,sep="\t",header=True,index=False)
pool.apply_async(run_iannotate_cmd, args=(input_file, "outputs", "output_" + str(i), args.genome ))

iAnnotate_output = pd.read_csv("outputdir/outputfilePrefix_Annotated.txt", sep="\t",header=0)
iAnnotate_output = pd.merge(iAnnotate_input,iAnnotate_output, on="chr1|pos1|str1|chr2|pos2|str2".split("|"), how="inner")
pool.close()
pool.join()

annot_data = iAnnotate_output.drop("chr1|pos1|str1|chr2|pos2|str2".split("|"), axis=1)
annot_data = annot_data.replace(np.nan, ".")
iAnnotate_output = pd.concat(map(read_iannotatesv_result, [os.path.join("outputs","output_" + str(i) + "_Annotated.txt") for i in range(n_chunks)]))
for key,val in key_col.items():
iAnnotate_output[key] = iAnnotate_output[key].astype(val)

iAnnotate_output = pd.merge(iAnnotate_input,iAnnotate_output, on=key_col.keys(), how="inner")

print("[{}] Merging annotation with bedpe".format(datetime.now()))
annot_data = iAnnotate_output.drop(key_col.keys(), axis=1).replace(np.nan, ".")
final_data = pd.merge(data, annot_data, on="ID",how="left")

outputbed = os.path.basename(args.bedpe)[:-6] if args.bedpe.endswith(".bedpe") else os.path.basename(args.bedpe)
@@ -55,5 +69,7 @@ def main():
fw.write(meta)
final_data.to_csv(outputbed,sep="\t",header=True,index=False, mode='a')

print("[{}] iAnnotateSV complete".format(datetime.now()))

if __name__ == "__main__":
main()
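
The parallelization added above follows a chunk-and-pool pattern: split the breakpoint table into fixed-size chunks (n_chunks is a ceiling division), annotate each chunk in a separate process via multiprocessing.Pool.apply_async, then concatenate the per-chunk outputs. Below is a self-contained sketch of the same pattern; the chunk size matches the script, but the stand-in worker is illustrative only, not the iAnnotateSV command itself.

import multiprocessing as mp
import pandas as pd

def annotate_chunk(df_chunk):
    # Stand-in for one iAnnotateSV subprocess over a chunk of breakpoints.
    out = df_chunk.copy()
    out["annotated"] = True
    return out

if __name__ == "__main__":
    data = pd.DataFrame({"ID": range(1203)})
    k = 500                                   # rows per chunk, as in the script
    n_chunks = ((k - 1) + len(data)) // k     # ceiling division: 3 chunks here
    with mp.Pool(4) as pool:
        results = [
            pool.apply_async(annotate_chunk, (data.iloc[i * k:(i + 1) * k],))
            for i in range(n_chunks)
        ]
        annotated = pd.concat(r.get() for r in results)
    assert len(annotated) == len(data)
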