-
Notifications
You must be signed in to change notification settings - Fork 10
/
reindex_pdb.py
executable file
·176 lines (155 loc) · 6.59 KB
/
reindex_pdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#!/usr/bin/env python
docstring='''
reindex_pdb.py startindex infile.pdb outfile.pdb
Rearrange residue number of "infile.pdb" so the first residue start from
"startindex". if the structure contains any missing residue, residue
number gaps will be removed.
reindex_pdb.py seq.fasta infile.pdb outfile.pdb
Rearrange residue number of "infile.pdb" according to "seq.fasta" so the
residue index is the same as that in "seq.fasta"
options:
-clean={true,false} whether to remove lines other than "ATOM " entries
'''
import sys,os
import re
def reindex_pdb_by_index(startindex=1,PDBtxt=''):
'''
reindex residue number of PDB format text
options:
startindex - index of first residue
PDBtxt - text of input PDB to be reindexed
'''
PDBtxt_reindex=''
current_old_index='' # residue number in origin PDB
warn_chainID='' # warning about new chain ID
for line in PDBtxt.splitlines():
if len(line)<27 or (not line.startswith("ATOM ") and \
not line.startswith("HETATM") and not line.startswith("TER")):
PDBtxt_reindex+=line+'\n'
continue
elif not line[16] in ['A',' ']: # alternative location identifier
continue
resSeq=line[22:27] # residue sequence number
current_chainID=line[21] # chain identifier
if not current_old_index: # first residue encountered
current_old_index=resSeq # residue number in origin PDB
current_new_index=int(startindex)
chainID=current_chainID
resSeq_new=str(current_new_index)
resSeq_new=' '*(4-len(resSeq_new))+resSeq_new+' '
elif current_chainID!=chainID:
if warn_chainID!=current_chainID:
sys.stderr.write(
"Warning! Discarding chain '%s'\n"%current_chainID)
warn_chainID=current_chainID
continue
elif resSeq!=current_old_index:
current_new_index+=1
current_old_index=resSeq
resSeq_new=str(current_new_index)
resSeq_new=' '*(4-len(resSeq_new))+resSeq_new+' '
PDBtxt_reindex+=line[:16]+' '+line[17:22]+resSeq_new+line[27:]+'\n'
return PDBtxt_reindex
def find_next_amino_acid(sequence_ref,sequence_pdb,startindex=0):
'''find the first instance of amino acid whose index is greater than
startindex. index is one-based. return -1 if not found. return negative
if the position is gap in sequence_ref
'''
startindex=abs(startindex)
for resi in range(startindex,len(sequence_pdb)):
if sequence_pdb[resi]!='-':
return (resi+1)*(-1)**(sequence_ref[resi]=='-')
return 0
def reindex_pdb_by_sequence(sequence_ref,PDBtxt):
'''NWalign "sequence_pdb" to "sequence_ref" and renumber residue index
according to sequence_ref.
unaligned residues in sequence_pdb will be removed.
only ATOM or MSE will be parsed
'''
#### convert PDB to fasta ####
from pdb2fasta import pdbtxt2seq
header_list,sequence_list=pdbtxt2seq(PDBtxt,PERMISSIVE="MSE",allowX=True)
sequence_pdb=sequence_list[0]
#### perform NeedlemanWunsch ####
from NWalign import calcualte_score_gotoh,trace_back_gotoh
# idir (DP path); jpV (Horizontal jump number); jpH (Vertical jump number)
idir,jpV,jpH=calcualte_score_gotoh(sequence_ref,sequence_pdb)
# sequenceA (aligned f1); sequenceB (aligned f2)
sequence_ref,sequence_pdb=trace_back_gotoh(idir,jpV,jpH,
sequence_ref,sequence_pdb)
#### reindex PDB text ####
PDBtxt_reindex=''
current_old_index='' # residue number in origin PDB
warn_chainID='' # warning about new chain ID
for line in PDBtxt.splitlines():
if len(line)<27 or not line[16] in ['A',' '] or (
not line.startswith("ATOM ") and not line.startswith("TER"
) and not (line.startswith("HETATM") and line[17:20]=="MSE")):
continue
resSeq=line[22:27] # residue sequence number
current_chainID=line[21] # chain identifier
if not current_old_index: # first residue encountered
current_old_index=resSeq # residue number in origin PDB
current_new_index=find_next_amino_acid(sequence_ref,sequence_pdb)
chainID=current_chainID
resSeq_new=str(current_new_index)
resSeq_new=' '*(4-len(resSeq_new))+resSeq_new+' '
elif resSeq!=current_old_index:
current_new_index=find_next_amino_acid(sequence_ref,
sequence_pdb,current_new_index)
current_old_index=resSeq
resSeq_new=str(current_new_index)
resSeq_new=' '*(4-len(resSeq_new))+resSeq_new+' '
if current_new_index==0: # reach the end of alignment
break
elif current_new_index>0: # no gap in sequence_ref:
PDBtxt_reindex+=line[:16]+' '+line[17:22]+resSeq_new+line[27:]+'\n'
return PDBtxt_reindex
def reindex_pdb(startindex,infile,clean=True):
'''parse PDB file "infile", reindex it according to start index or
sequence file "startindex", and return the text of renumbered PDB
'''
fp=open(infile,'rU')
PDBtxt=''
for line in fp.read().splitlines():
if line.startswith("END"):
if clean:
line=line.replace("ENDMDL","END ")
PDBtxt+=line+'\n'
break
if line.startswith("ATOM ") or line.startswith("TER") or (
clean==False and not line[:6] in ["DBREF ","SEQADV","MODRES",
"HELIX ","SHEET ","SSBOND","SITE "]):
PDBtxt+=line+'\n'
fp.close()
if os.path.isfile(startindex):
from NWalign import readFastaOrRawSequence
sequence_ref=readFastaOrRawSequence(startindex).replace('-','')
PDBtxt_reindex=reindex_pdb_by_sequence(sequence_ref,PDBtxt)
else:
PDBtxt_reindex=reindex_pdb_by_index(startindex,PDBtxt)
return PDBtxt_reindex
if __name__=="__main__":
#### parse commandline arguments ####
clean=True
argv=[]
for arg in sys.argv[1:]:
if arg.startswith("-clean="):
clean=(arg[len("-clean="):].lower()=="true")
elif arg.startswith('-') and len(set(arg[1:])-set(map(str,range(10)))):
sys.stderr.write("ERROR! Unknown option %s\n"%arg)
exit()
else:
argv.append(arg)
if len(argv)<2:
sys.stderr.write(docstring)
exit()
#### parse PDB file ####
PDBtxt_reindex=reindex_pdb(argv[0],argv[1],clean)
#### write PDB file ####
if len(argv)>1:
fp=open(argv[2],'w')
fp.write(PDBtxt_reindex)
fp.close()
else:
sys.stdout.write(PDBtxt)