-
Notifications
You must be signed in to change notification settings - Fork 2
/
getJunctionsFromTxt.py
129 lines (104 loc) · 6.02 KB
/
getJunctionsFromTxt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# Determines equivalent junction sequences from txt file containing chr, donor, and acceptor coordinates.
# Handling coordinates and strand is based on how data were presented in circBase txt downloads, so
# may need to be modified for other data sources.
import sys
import argparse
from Bio import SeqIO
from BCBio import GFF
# usage: python getJunctionsFromTxt.py -f /scratch/PI/horence/linda/EquivJuncPaper/hg19_genome.fa
# -t /scratch/PI/horence/linda/EquivJuncPaper/hsa_hg19_circRNA.txt
# -o /scratch/PI/horence/linda/EquivJuncPaper/hsa_hg19_circRNA_EquivSeqs.txt
# -v
# output: tab delimited file with equivJuncSeq, chr, donor, acceptor
# TO GENERATE REPORT FROM THIS OUTPUT: cat hsa_hg19_circRNA_EquivSeqs.txt Sim1deletionEquivSeqs.txt | sort -k1 | uniq | cut -f1 | sort | uniq -c | sort -k1nr
# this removes double-counting cases if the same junction is listed multiple times.
CHECK_EQUIV_LEN = 30
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
def getRevComp(inStr):
revStr = inStr[::-1]
revComp = []
for a in revStr:
revComp.append(complement[a])
return "".join(revComp)
def getEquivSeqs(txtFile, chrCol, donorCol, acceptorCol, strandCol):
t_handle = open(txtFile, "rU")
for line in t_handle:
try:
splits = line.strip().split()
juncChr = splits[chrCol]
juncStrand = splits[strandCol]
if juncStrand == "+":
donorLoc = int(splits[acceptorCol])
acceptorLoc = int(splits[donorCol])
else:
donorLoc = int(splits[donorCol])
acceptorLoc = int(splits[acceptorCol])
if args.verbose:
print (juncChr, juncStrand, donorLoc, acceptorLoc)
if juncChr in f_dict:
if juncStrand == "+":
dseq_prev = str(f_dict[juncChr][donorLoc-CHECK_EQUIV_LEN:donorLoc].seq).upper()
dseq_next = str(f_dict[juncChr][donorLoc:donorLoc+CHECK_EQUIV_LEN].seq).upper()
aseq_prev = str(f_dict[juncChr][acceptorLoc-CHECK_EQUIV_LEN:acceptorLoc].seq).upper()
aseq_next = str(f_dict[juncChr][acceptorLoc:acceptorLoc+CHECK_EQUIV_LEN].seq).upper()
else:
dseq = str(f_dict[juncChr][donorLoc-CHECK_EQUIV_LEN:donorLoc+CHECK_EQUIV_LEN].seq).upper()
dseq = getRevComp(dseq)
dseq_prev = dseq[:CHECK_EQUIV_LEN]
dseq_next = dseq[CHECK_EQUIV_LEN:]
aseq = str(f_dict[juncChr][acceptorLoc-CHECK_EQUIV_LEN:acceptorLoc+CHECK_EQUIV_LEN].seq).upper()
aseq = getRevComp(aseq)
aseq_prev = aseq[:CHECK_EQUIV_LEN]
aseq_next = aseq[CHECK_EQUIV_LEN:]
if args.verbose:
print ("dseq_prev:", dseq_prev)
print ("dseq_next:", dseq_next)
print ("aseq_prev:", aseq_prev)
print ("aseq_next:", aseq_next)
# generate equiv junc seq
equivJuncSeq = ""
# check moving breakpoint more upstream in gene
e_ind = CHECK_EQUIV_LEN - 1
while e_ind != -1:
if dseq_prev[e_ind] != aseq_prev[e_ind]:
break
equivJuncSeq = equivJuncSeq + dseq_prev[e_ind]
e_ind = e_ind - 1
equivJuncSeq = equivJuncSeq[::-1] # since it was added from end 1 by 1, need to reverse to get coorrect order
if args.verbose:
print ("equiv from moving upstream:", equivJuncSeq)
# check moving breakpoint more downstream in gene
e_ind = 0
while e_ind < CHECK_EQUIV_LEN:
if dseq_next[e_ind] != aseq_next[e_ind]:
break
equivJuncSeq = equivJuncSeq + dseq_next[e_ind]
e_ind = e_ind + 1
if args.verbose:
print ("equiv from checking both directions:", equivJuncSeq)
o_handle.write("\t".join([equivJuncSeq, juncChr, str(juncStrand), str(donorLoc), str(acceptorLoc)]))
o_handle.write("\n")
except Exception as e:
print ("Exception")
print (e)
print ("error:", sys.exc_info()[0])
print ("parsing line", line)
t_handle.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--fastaFile', required=True, help='path to fasta file with chromosome sequences')
parser.add_argument('-t', '--txtFile', required=True, help='txt file containing splice junction coordinate data')
parser.add_argument('-o', '--outFile', required=True, help='output file name (including full path)')
parser.add_argument('-c', '--chrCol', type=int, default=0, help='0-based column containing chromosome info')
parser.add_argument('-d', '--donorCol', type=int, default=1, help='0-based column containing donor coordinate')
parser.add_argument('-a', '--acceptorCol', type=int, default=2, help='0-based column containing acceptor coordinate')
parser.add_argument('-s', '--strandCol', type=int, default=3, help='0-based column containing strand (+/-)')
parser.add_argument('-v', '--verbose', help='print info about data obtained', action='store_true')
args = parser.parse_args()
# read in the sequences
f_handle = open(args.fastaFile, "rU")
f_dict = SeqIO.to_dict(SeqIO.parse(f_handle, "fasta"))
f_handle.close()
o_handle = open(args.outFile, "w")
getEquivSeqs(args.txtFile, args.chrCol, args.donorCol, args.acceptorCol, args.strandCol)
o_handle.close()