-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathreorder_pdb_and_mdcrd.py
executable file
·262 lines (218 loc) · 10.8 KB
/
reorder_pdb_and_mdcrd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
#!/usr/bin/env python
#
# Strategy:
#
# Use SQLite to reorder the atoms so that atoms in the same residue are grouped together. We can then stuff this into
# AMBER's leap and make a prmtop out of it. Leap will further reorder the atoms, and we can make a new PDB.
# Now, given the old and new PDB files, we know the mapping of old to new, and can then reorder the trajectory (mdcrd format,
# converted from the DCD version using VMD).
import sys
import sqlite3 as sql
import gzip
class AtomRecord:
'''Represents a single atom.'''
def __init__(self, line):
self.atomid = int(line[6:11])
self.atomname = line[12:16].strip()
self.resname = line[17:20].strip()
self.chain = line[21]
if self.chain == " ":
self.chain = "X"
# Solvent residue ids are frequently mangled
# so just set the mangled ones to 0
try:
self.resid = int(line[22:26])
except:
self.resid = 0
self.x = float(line[30:38])
self.y = float(line[38:46])
self.z = float(line[46:54])
self.occupancy = float(line[54:60])
self.tempfactor = float(line[60:66])
self.rename_opls_to_amber()
def __str__(self):
return "ATOM %5d %4s %3s %1s%4d %8.3f%8.3f%8.3f%6.2f%6.2f " % \
(self.atomid, self.atomname, self.resname, self.chain, self.resid, self.x, self.y, self.z, \
self.occupancy, self.tempfactor)
def rename_opls_to_amber(self):
'''Converts what I assume to be OPLS atom names to AMBER convention.
This is done so that we can feed the resulting PDB into leap and not have it add heavy atoms or hydrogens,
so we can generate a prmtop that corresponds to the original PDB. These atom names might not be anything
special as they came from the Desmond toolset.'''
if self.resname == 'ACE':
if self.atomname == '1H': self.atomname = 'HH31'
elif self.atomname == '2H': self.atomname = 'HH32'
elif self.atomname == '3H': self.atomname = 'HH33'
if self.resname == 'NMA':
if self.atomname == '1HA': self.atomname = 'HH31'
elif self.atomname == '2HA': self.atomname = 'HH32'
elif self.atomname == '3HA': self.atomname = 'HH33'
elif self.atomname == 'CA': self.atomname = 'CH3'
self.resname = 'NME'
if self.resname == 'GLY':
if self.atomname == 'H2': self.atomname = 'H'
if self.resname == 'GLU':
if self.atomname == 'OXT': self.atomname = 'O'
if self.resname == 'SER':
if self.atomname == 'OXT': self.atomname = 'O'
if self.resname == 'ASN':
if self.atomname == 'H22': self.atomname = 'H'
if self.resname == 'LYS':
if self.atomname == 'H2': self.atomname = 'H'
if self.resname == 'ILE':
if self.atomname == 'H2': self.atomname = 'H'
# if self.resname == 'HIS':
# self.resname = 'HID'
class Molecule:
'''Represents an entire molecule loaded from a PDB file.'''
solvent_residues = ["WAT", "T3P", "HOH"]
def __init__(self, filename):
self.atoms = []
self.load_from_pdb(filename)
def load_from_pdb(self, filename):
pdb = open(filename)
for line in pdb:
if line[0:6] == "ATOM ":
atom = AtomRecord(line)
self.atoms.append(atom)
print >>sys.stderr, "Read %d atoms from %s." % (len(self.atoms), filename)
def nuke_solvent(self):
print >>sys.stderr, "Stripping solvent residues."
self.atoms = [atom for atom in self.atoms if atom.resname not in self.solvent_residues]
def keep_only_atomname(self, atomname):
'''Keeps only a certain type of atom. (e.g. CA, CB, N, ...)'''
print >>sys.stderr, "Keeping only %s atoms." % atomname
self.atoms = [atom for atom in self.atoms if atom.atomname == atomname]
def atom_record(atomid, atomname, resname, chain, resid, x, y, z, occupancy, tempfactor):
"""Returns a PDB ATOM record string ('card' in FORTRAN parlance)"""
return "ATOM %5d %4s %3s %1s%4d %8.3f%8.3f%8.3f%6.2f%6.2f " % \
(atomid, atomname, resname, chain, resid, x, y, z, occupancy, tempfactor)
def print_molecule_from_sql(conn):
'''Given an sqlite connection, dumps the molecule in table 'mol' to stdout in PDB format.'''
c = conn.cursor()
c.execute("SELECT * FROM mol ORDER BY chain, resid, atomid")
conn.commit()
recs = c.fetchall()
for i in xrange(len(recs)):
rec = recs[i]
(atomid, atomname, resname, chain, resid, x, y, z, occupancy, tempfactor) = rec
print atom_record(i+1, atomname, resname, chain, resid, x, y, z, occupancy, tempfactor)
def split_mdcrd_line(s):
r = []
for i in xrange(0, 10):
tmp = s[i*8:i*8+8].strip()
if tmp != '': r.append(tmp)
return r
def load_mdcrd_frame(mdcrd, num_atoms, box=False):
"""Loads a frame from an open mdcrd file, returning a list of coordinates that is 3*num_atoms in length.
Box information is discarded."""
num_lines = num_atoms * 3 / 10
if num_atoms*3%10 != 0: num_lines += 1
frame = []
for i in xrange(num_lines):
s = mdcrd.readline()
# If we've (possibly prematurely) reached EOF, this is not a complete frame, so return None
if s == '': return None
frame += [float(x) for x in split_mdcrd_line(s)]
# Discard box information if it is present
if box: mdcrd.readline()
if len(frame) != num_atoms * 3:
print >>sys.stderr, "There are %d atoms but %d coordinates read from the mdcrd. Aw." % (num_atoms, len(frame))
return frame
def print_frame(frame):
'''Prints a trajectory snapshot in mdcrd format.'''
for i in xrange(len(frame)):
sys.stdout.write("%8.3f" % frame[i])
if i != 0 and (i+1) % 10 == 0:
print ""
print ""
def rename_cys_for_disulfide_bridges(conn):
"""Renames CYS residues that do not have an HG atom to CYX.
This has the effect of allowing LEaP to infer the presence of disulfide bridges,
instead of adding on the HG itself. (CYS is the non-bridged cysteine, while CYX
is the bridged one.)
"""
c = conn.cursor()
all_cys_query = "SELECT DISTINCT resid, chain FROM mol WHERE resname = 'CYS' "
nonbridged_cys_query = "SELECT DISTINCT resid, chain FROM mol WHERE resname = 'CYS' AND atomname = 'HG'"
c.execute(all_cys_query)
conn.commit()
# Return value of c.fetchall() has structure [(1),(2),(3)]
all_cys = [str(x[0])+" "+x[1] for x in c.fetchall()]
c.execute(nonbridged_cys_query)
conn.commit()
nonbridged_cys = [str(x[0])+" "+x[1] for x in c.fetchall()]
bridged_cys = set(all_cys) - set(nonbridged_cys)
# Now that we have found the set of CYS residues we need to rename,
# we can go ahead and rename them
rename_query = "UPDATE mol SET resname = 'CYX' WHERE resid = :resid AND chain = :chain"
for residue in bridged_cys:
(resid, chain) = residue.split()
c.execute(rename_query, {"resid": resid, "chain": chain})
conn.commit()
if __name__ == '__main__':
ordered_mol, scrambled_mdcrd = None, None
scrambled_mol = Molecule(sys.argv[1])
scrambled_mol.nuke_solvent()
# If we just want to reorder a molecule, there will be only one command-line argument.
# If there are second and third arguments, we want to map the atoms to each other and convert a trajectory.
if len(sys.argv) == 4:
ordered_mol = Molecule(sys.argv[2])
ordered_mol.nuke_solvent()
if len(scrambled_mol.atoms) != len(ordered_mol.atoms):
print >>sys.stderr, "Number of atoms, %d vs %d does not match, fool." % (len(scrambled_mol.atoms), len(ordered_mol.atoms))
# sys.exit(1)
# Open the trajectory file and discard the header line
scrambled_mdcrd = open(sys.argv[3], 'r')
scrambled_mdcrd.readline()
# Create a temporary in-memory table to store the structure
conn = sql.connect(":memory:")
c = conn.cursor()
c.execute("CREATE TABLE mol (atomid integer, atomname text, resname text, chain text, resid integer, x real, y real, z real, occupancy real, tempfactor real)")
# Insert atoms into in-memory table
insert_query = "INSERT INTO mol (atomid, atomname, resname, chain, resid, x, y, z, occupancy, tempfactor) VALUES (:atomid, :atomname, :resname, :chain, :resid, :x, :y, :z, :occupancy, :tempfactor)"
for a in scrambled_mol.atoms:
c.execute(insert_query, {"atomid": a.atomid, "atomname": a.atomname, "resname": a.resname, "chain": a.chain, "resid": a.resid, "x": a.x, "y": a.y, "z": a.z, "occupancy": a.occupancy, "tempfactor": a.tempfactor})
conn.commit()
rename_cys_for_disulfide_bridges(conn)
# If we only wanted to reorder the PDB, just do that and exit
if ordered_mol is None:
print_molecule_from_sql(conn)
sys.exit(0)
# We must now create a 1:1 mapping between scrambled_mol and ordered_mol.
# The trajectory is scrambled, so for each atom of a frame, we want to do:
# Ordered[i] = Scrambled[Map[i]]
# where i is the atom id in the ordered trajectory.
# XXX:
# Running a SELECT for each atom in ordered_mol is incredibly stupid. I admit this.
# A much faster and cleaner solution would be to do a JOIN. But I am lazy.
scrambled_to_ordered = {}
for i in xrange(len(ordered_mol.atoms)):
# Try to find this atom in the scrambled atom table
a = ordered_mol.atoms[i]
q = "SELECT * FROM mol WHERE x = :x AND y = :y AND z = :z"
c.execute(q, {"x": a.x, "y": a.y, "z": a.z})
conn.commit()
recs = c.fetchall()
if len(recs) != 1:
print >>sys.stderr, "Ambiguous mapping between scrambled and ordered molecules!"
print >>sys.stderr, a
sys.exit(1)
r = recs[0]
# print "atomid: ordered[%d] = scrambled[%d]" % (a.atomid, r[0])
scrambled_to_ordered[r[0]] = a.atomid
print >>sys.stderr, "Done creating mapping between scrambled and ordered molecules."
# Now we can iterate through our scrambled trajectory and our mapping tells us what maps to what
num_atoms = len(scrambled_mol.atoms)
print "Generated by %s" % sys.argv[0]
while True:
scrambled_frame = load_mdcrd_frame(scrambled_mdcrd, num_atoms)
ordered_frame = [0] * num_atoms * 3
if scrambled_frame is None: break
for i in xrange(num_atoms):
# atom ids are 1-based. I hate fortran.
ordered_i = scrambled_to_ordered[i+1]-1
ordered_frame[ordered_i*3] = scrambled_frame[i*3]
ordered_frame[ordered_i*3+1] = scrambled_frame[i*3+1]
ordered_frame[ordered_i*3+2] = scrambled_frame[i*3+2]
print_frame(ordered_frame)