-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathfepout_cat_legacy.py
executable file
·379 lines (321 loc) · 17.1 KB
/
fepout_cat_legacy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
#!/usr/bin/env python
#
# This script is intended to combine multiple .fepout files from a single NAMD FEP calculation
# that was prematurely terminated (perhaps due to cluster queue time limits) and then restarted.
#
# We make the following assumptions in this script:
# - The beginning of each fepout file is not corrupted
#
# Input: Multiple .fepout files, in lambda order
# Output: Single .fepout, sent to stdout.
import re
import argparse
import sys
from os.path import isfile
import io as StringIO
import yaml
from natsort import natsorted
import intervaltree
def readline_and_offset(f):
    """Read one line from *f* and return (line, offset at which the line began)."""
    position = f.tell()
    text = f.readline()
    return (text, position)
def readline_and_offset_until(f, until_re):
    """Advance *f* line by line until a line matches *until_re* or EOF is hit.

    Returns (line, offset-where-the-line-began).  At EOF the returned line is
    the empty string, which is the caller's signal to stop looking.
    """
    while True:
        where = f.tell()
        candidate = f.readline()
        if candidate == '' or until_re.match(candidate):
            return candidate, where
def scan_fepout_file(fname):
    """Scan one NAMD .fepout file and index its lambda windows by byte offset.

    Returns a dict keyed by '<lambda>_<lambda2>' (e.g. '0.000000_0.020000').
    Each value records: the source filename, the offset where the two-line
    file header ends, the offsets delimiting the equilibration fragment
    (both None when the window has no equilibration part) and the production
    fragment, the lambda delta, and the window's reported free-energy change.
    Windows whose production fragment is incomplete (EOF before the closing
    'Free energy change' line) are omitted from the result.
    """
    new_fep_window_re = re.compile(r'#NEW FEP WINDOW: LAMBDA SET TO ([\d.]+) LAMBDA2 ([\d.]+)')
    equil_end_re = re.compile(r'#\d+ STEPS OF EQUILIBRATION AT LAMBDA [\d.]+ COMPLETED')
    prod_start_re = re.compile(r'#STARTING COLLECTION OF ENSEMBLE AVERAGE')
    prod_end_re = re.compile(r'#Free energy change for lambda window \[ ([\d.]+) ([\d.]+) \] is ([-\deE.]+) ; net change until now is [-\deE.]+')
    # Context manager so the handle is closed on every return path (the
    # original opened the file and leaked the handle on each early return).
    with open(fname) as f:
        # Eat the start-of-file header, which should span two lines.
        readline_and_offset(f)
        readline_and_offset(f)
        header_end_offset = f.tell()
        lambdas = {}
        while True:
            line, offset = readline_and_offset_until(f, new_fep_window_re)
            if line == '':
                return lambdas  # Give up on EOF
            # We have started a new window so extract the current lambdas.
            # Assume this is a production fragment until proven otherwise.
            m = new_fep_window_re.match(line)
            l0, l1 = float(m.group(1)), float(m.group(2))
            key = '%f_%f' % (l0, l1)
            current_lambda = {'fname': fname, 'header_end_offset': header_end_offset,
                              'prod_start_offset': offset, 'delta': l1 - l0}
            old_offset = offset
            # NOTE(review): this search is not bounded to the current window;
            # if this window lacks an equilibration line but a later window
            # has one, it scans into the later window before rewinding.
            # Legacy behavior preserved.
            line, offset = readline_and_offset_until(f, equil_end_re)
            if line != '':
                # Found an end-of-equilibration line, so this wasn't a pure
                # production fragment after all.
                current_lambda['equil_start_offset'] = current_lambda['prod_start_offset']
                current_lambda['equil_end_offset'] = f.tell()
                # Now find the production part.  Its marker line only occurs
                # when an equilibration fragment was present.
                line, offset = readline_and_offset_until(f, prod_start_re)
                if line == '':
                    return lambdas
                current_lambda['prod_start_offset'] = offset
            else:
                # No end-of-equilibration line: rewind to the window start and
                # look for the end of the production fragment from there.
                f.seek(old_offset)
                # Mark the equilibration fragment as absent.
                current_lambda['equil_start_offset'] = current_lambda['equil_end_offset'] = None
            # The equilibration block (if any) has been handled; now find the
            # end of the production fragment.
            line, offset = readline_and_offset_until(f, prod_end_re)
            if line == '':
                return lambdas
            current_lambda['prod_end_offset'] = f.tell()
            # Save the free-energy change for this window so we can do some
            # calculations with it later.
            current_lambda['energy_change'] = float(prod_end_re.match(line).group(3))
            lambdas[key] = current_lambda
def get_block_from_file(fname, start, end):
    """Return the slice of *fname* between byte offsets *start* and *end*.

    Returns None when either offset is None (i.e. the fragment is absent).
    """
    if start is None or end is None:
        return None
    with open(fname) as handle:
        handle.seek(start)
        return handle.read(end - start)
def trim_timesteps_from_block(block, numsteps=0):
    """Trim the first *numsteps* timesteps' worth of FepEnergy lines from a block.

    Lines that are not FepEnergy records (comments, headers) are always kept.
    Note this is insensitive to the interval of the steps: the same amount of
    simulation time is trimmed regardless.  So if you wanted 100000 steps
    trimmed but had an output interval of 20000 (for some reason), only five
    FepEnergy values would be trimmed.
    """
    kept_lines = []
    base_step = None
    for line in block.splitlines(True):
        if not line.startswith('FepEnergy:'):
            kept_lines.append(line)
            continue
        step = int(line.split()[1])
        if base_step is None:
            base_step = step
        # Keep the record once we are past the requested span of simulation time.
        if step - base_step >= numsteps:
            kept_lines.append(line)
    return ''.join(kept_lines)
def concat_block_prod(blocks, ignore_steps=0):
    """Return a string combining the production parts of the given blocks,
    including whatever header is necessary for VMD ParseFEP to be happy.

    *blocks* is an iterable of intervaltree Intervals whose .data dicts came
    from scan_fepout_file (filename plus fragment byte offsets).  The first
    available equilibration fragment is emitted verbatim; then each block's
    production FepEnergy lines are renumbered into one continuous timestep
    sequence, skipping the first *ignore_steps* steps of each block.
    Returns None when the blocks do not all cover the same lambda range.

    NOTE(review): *blocks* typically arrives as a set from
    IntervalTree.overlap(), so when several production fragments exist their
    concatenation order is not guaranteed — legacy behavior preserved.
    """
    # Load the raw production data content of each block.
    blocks_data = [get_block_from_file(b.data['fname'], b.data['prod_start_offset'], b.data['prod_end_offset']) for b in blocks]
    out_buf = StringIO.StringIO()
    comments, fepenergy = [], []
    lambdas = set()
    lambda_range_re = re.compile(r'#NEW FEP WINDOW: LAMBDA SET TO (.+) LAMBDA2 (.+)')
    # Split each block into alternating comment / FepEnergy fragments, and
    # collect every lambda range seen so we can check they all agree.
    for block_data in blocks_data:
        in_buf = StringIO.StringIO(block_data)
        comment_fragments, current_comment_fragment = [], []
        fepenergy_fragments, current_fepenergy_fragment = [], []
        for line in in_buf.readlines():
            if line.startswith('#'):
                # We are in a comment fragment.  If we were previously in a
                # FepEnergy fragment, "flush" it.
                if len(current_fepenergy_fragment) > 0:
                    fepenergy_fragments.append(current_fepenergy_fragment)
                    current_fepenergy_fragment = []
                current_comment_fragment.append(line)
                # Also, since this is a comment, try to extract the lambda
                # range for this block.
                lambda_range_match = lambda_range_re.match(line)
                if lambda_range_match:
                    b, e = float(lambda_range_match.group(1)), float(lambda_range_match.group(2))
                    lambdas.add((b, e))
            elif line.startswith('FepEnergy:'):
                # We are in a FepEnergy fragment.  If we were previously in a
                # comment fragment, "flush" it.
                if len(current_comment_fragment) > 0:
                    comment_fragments.append(current_comment_fragment)
                    current_comment_fragment = []
                current_fepenergy_fragment.append(line)
        # Done with this block: flush whichever fragment was still open.
        if len(current_comment_fragment) > 0:
            comment_fragments.append(current_comment_fragment)
        if len(current_fepenergy_fragment) > 0:
            fepenergy_fragments.append(current_fepenergy_fragment)
        comments.append(comment_fragments)
        fepenergy.append(fepenergy_fragments)
    # We stored all the lambda ranges in a set.  More than one element means
    # the blocks disagree about which lambda window they cover.
    if len(lambdas) > 1:
        print('Um I think your lambda spec is messed up. Here is what I have for the same group of blocks:', file=sys.stderr)
        print(lambdas, file=sys.stderr)
        return None

    # The overall file header is emitted elsewhere (see main), so we do not
    # re-emit the first block's comment header here.
    def emit_fepenergy_block(out_buf, lines, first_timestep=0, ignore_steps=0):
        # Renumber one production fragment's FepEnergy lines so they continue
        # from first_timestep, skipping the first ignore_steps steps; returns
        # the timestep the next fragment should start from.  Lines look like:
        #   FepEnergy:     50 ...
        #   FepEnergy:    100 ...
        # Extracting the step interval requires at least two lines — the
        # original asserted '> 2', contradicting its own comment and rejecting
        # valid two-line fragments.
        assert len(lines) >= 2, 'FepEnergy block is only %d lines which is not very long' % len(lines)
        delta = int(lines[1].split()[1]) - int(lines[0].split()[1])
        # Floor division restores the legacy Python 2 semantics: skip a whole
        # number of output lines ('/' yields a float threshold in Python 3,
        # which skips one extra line whenever the division is inexact).
        lines_to_skip = ignore_steps // delta
        timestep = first_timestep
        line_count = 0
        for line in lines:
            # Write this line if we weren't supposed to skip it.
            if line_count >= lines_to_skip:
                # FepEnergy: %6d %14.4f %14.4f...
                out_buf.write('FepEnergy: %6d' % timestep)
                tokens = line.strip().split()
                for i in range(2, len(tokens)):
                    out_buf.write(' %14.4f' % float(tokens[i]))
                out_buf.write('\n')
                # Renumber the emitted lines contiguously (skipped lines do
                # not advance the counter, so the output has no gaps).
                timestep += delta
            line_count += 1
        return timestep

    # Next step: spit out an equilibration fragment, which we take to be the
    # first one we find among the blocks.  Take note of the last timestep.
    equil_data = None
    for b in blocks:
        equil_data = get_block_from_file(b.data['fname'], b.data['equil_start_offset'], b.data['equil_end_offset'])
        if equil_data is not None:
            break
    timestep, last_timestep = 0, 0
    if equil_data is not None:
        in_buf = StringIO.StringIO(equil_data)
        for line in in_buf.readlines():
            if line.startswith('FepEnergy:'):
                last_timestep = timestep
                timestep = int(line.split()[1])
            out_buf.write(line)
        # Advance one step interval past the last equilibration timestep so
        # the production numbering picks up where equilibration left off.
        timestep += timestep - last_timestep
    # Iterate over each block index and spit out its (first) production
    # fragment, renumbered.
    for block_i in range(len(fepenergy)):
        timestep = emit_fepenergy_block(out_buf, fepenergy[block_i][0], first_timestep=timestep, ignore_steps=ignore_steps)
    # Now emit the footer (the trailing comment fragment of the last block).
    for line in comments[-1][1]:
        out_buf.write(line)
    # Return the assembled concatenated block.
    new_block = out_buf.getvalue()
    out_buf.close()
    return new_block
def load_spec_file(specfile):
    """Load a YAML FEP spec mapping '<lambda_b> <lambda_e>' strings to lists
    of fepout filenames.

    Returns (interval_tree, set_of_fepout_files), where the tree maps each
    lambda interval to the list(s) of files allowed to supply it.  Returns
    (None, None) after printing an error when two spec entries partially
    overlap.
    """
    spec = intervaltree.IntervalTree()
    spec_fepout_files = set()
    # safe_load: the spec contains only plain scalars and lists, and
    # yaml.load without an explicit Loader is unsafe on untrusted input and
    # raises TypeError under PyYAML >= 6.  The context manager also closes
    # the handle, which the original leaked.
    with open(specfile) as handle:
        spec_yaml = yaml.safe_load(handle)
    # Parse the spec file and save it in an interval tree.
    for lambda_range in spec_yaml:
        fepouts = spec_yaml[lambda_range]
        (spec_b, spec_e) = sorted(float(x) for x in lambda_range.strip().split())
        # If we haven't seen this interval before, store it
        # NOTE(review): 'strict=True' is not a documented argument of
        # IntervalTree.overlap — verify against the installed intervaltree
        # version.  Legacy call preserved.
        if not spec.overlap(spec_b, spec_e, strict=True):
            # ...But we may overlap with a previous interval, which is an error
            if spec.overlap(spec_b, spec_e):
                sys.stderr.write(
                    '(%f, %f) overlaps with something else in the spec. I am too lazy to tell you what\n' % (
                        spec_b, spec_e))
                return None, None
            spec[spec_b:spec_e] = []
        # Append this entry's file list to the interval's data and re-store it.
        current = list(spec[spec_b:spec_e])[0].data
        current.append(fepouts)
        spec[spec_b:spec_e] = current
        spec_fepout_files.update(set(fepouts))
    return spec, spec_fepout_files
def main():
    """Command-line entry point.

    Modes:
      --generate-spec : scan the given fepout files and print a prototype
                        YAML spec to stdout.
      --spec FILE     : restrict which lambda ranges may come from which
                        fepout files.
      (default)       : concatenate all complete lambda windows to stdout.

    Returns a process exit code (0 on success, 1 on error).
    """
    ap = argparse.ArgumentParser(description='Stitch together NAMD fepout files, discarding incomplete lambda windows, and dump the result to stdout')
    ap.add_argument('--spec', help='FEP specification that dictates which lambda ranges should be used from which fepout files')
    ap.add_argument('--generate-spec', action='store_true', help='Generate a prototype specification file from the fepout files specified')
    ap.add_argument('--ignore-steps', type=int, default=0, help='Ignore this many steps at the start of each production window')
    ap.add_argument('fepout_file', nargs='+', help='.fepout files, in order of lambdas you want')
    args = ap.parse_args()
    # The general strategy is to record offsets within files delineating blocks
    # we want to keep.  This way we can avoid keeping big blocks of data around
    # in memory.
    spec, spec_fepout_files = None, set()
    # It's annoying to generate spec files by hand, so here at user request we
    # scan all fepout files specified and dump which blocks they contain, in a
    # convenient YAML format.
    if args.generate_spec:
        sys.stderr.write('Generating a spec file, to stdout.\n')
        sys.stderr.write('It may not make sense, so please check it manually.\n')
        sys.stderr.write('Scanning fepout files...\n')
        # Make a list of all blocks
        all_fepout_files = set(args.fepout_file)
        per_file_lambdas = []
        for fname in natsorted(all_fepout_files):
            per_file_lambdas.append(scan_fepout_file(fname))
        # Rearrange the list into a dict indexed by lambda, merging where necessary
        all_lambdas = {}
        for lambdas in per_file_lambdas:
            for key in list(lambdas.keys()):
                if key in all_lambdas:
                    all_lambdas[key].append(lambdas[key]['fname'])
                else:
                    all_lambdas[key] = [lambdas[key]['fname'],]
        # Spit out the blocks lambda dict in YAML format
        for key in natsorted(list(all_lambdas.keys())):
            print('"%s":' % key.replace('_', ' '))
            for fname in all_lambdas[key]:
                print(' - %s' % fname)
        # Don't do anything else
        return 0
    # If user provides a yaml file that describes which lambda ranges can come
    # from which files, go through each block that we inhaled and keep only
    # those which meet the criteria.
    # Bonus TODO: Maybe check to ensure that we have covered the entire transformation.
    elif args.spec:
        if not isfile(args.spec):
            sys.stderr.write('Cannot find your FEP spec file %s.\n' % args.spec)
            return 1
        spec, spec_fepout_files = load_spec_file(args.spec)
        # load_spec_file reports overlap errors and returns (None, None);
        # bail out instead of crashing on the None values below.
        if spec is None:
            return 1
    # If user provided a spec, there's no need to load fepout files that aren't
    # in the spec.  So get rid of all fepout files in our list that we can't need.
    all_fepout_files = set(args.fepout_file)
    if len(spec_fepout_files) > 0:
        all_fepout_files = all_fepout_files & spec_fepout_files
    # Load all blocks from all fepout files and store them in an interval tree
    all_blocks = intervaltree.IntervalTree()
    all_lambda_ranges = set()
    for fname in natsorted(all_fepout_files):
        # Get a list of blocks in this fepout file
        blocks = scan_fepout_file(fname)
        # The keys are of the form 0.0_0.02 which represents lambda range 0.0 to 0.02
        for key in blocks:
            (block_b, block_e) = sorted(float(l) for l in key.split('_'))
            all_lambda_ranges.add((block_b, block_e))
            # Look through all lambda ranges in spec to see whether this block is allowed
            if spec:
                # overlap(), not search(): IntervalTree.search was removed in
                # intervaltree 3.x, and the rest of this script already uses
                # overlap().
                spec_files = spec.overlap(block_b, block_e)
                if len(spec_files) == 0:  # Not in spec, keep on truckin'
                    continue
                elif len(spec_files) > 1:
                    sys.stderr.write('ERROR: (%f, %f) spans more than one spec file entry. What the heck?\n' % (block_b, block_e))
                    return 1
                else:  # We found exactly one result so keep going
                    sys.stderr.write('Keeping (%f, %f) from %s\n' % (block_b, block_e, fname))  # TODO: More detail
            # Dump this block into the tree.  Duplicates are OK because they'll
            # be returned together by IntervalTree.overlap() and we can then
            # concatenate them.
            all_blocks[block_b:block_e] = blocks[key]
            # We use the last delta value encountered to decide whether lambdas
            # increase or decrease.  Naturally this does not protect against
            # weird pathological cases where the user provides both increasing
            # and decreasing lambda windows.  I guess we should error out in
            # that case, but such intelligence is not yet implemented.  Could
            # easily be done by comparing the sign of last_delta and
            # offsets[key]['delta']; if they are different, it's bad news bears.
            # last_delta = blocks[key]['delta']
    # Spit out a single header block from any .fepout file, because we only
    # want one of these in the resulting concatenated fepout output.
    # Should not include the 'NEW FEP WINDOW' line.
    for block in all_blocks:
        sys.stdout.write(get_block_from_file(block.data['fname'], 0, block.data['header_end_offset']))
        break
    # Iterate over the set of lambda ranges we had assembled above
    for (block_b, block_e) in natsorted(all_lambda_ranges):
        # Find all blocks with this lambda range
        blocks = all_blocks.overlap(block_b, block_e)
        # And merge them.  concat_block_prod will go read the block data from
        # the fepout files and merge only their production fragment, keeping an
        # equilibration fragment chosen arbitrarily.
        merged = concat_block_prod(blocks, ignore_steps=args.ignore_steps)
        if merged is None:
            # concat_block_prod found inconsistent lambda ranges and already
            # complained to stderr; don't write None to stdout.
            return 1
        sys.stdout.write(merged)
    sys.stderr.write('Kept %d lambda windows.\n' % len(all_blocks))
    return 0
if __name__ == '__main__':
    # main() returns an exit code (or falls through to None, mapped to 0).
    sys.exit(int(main() or 0))