-
Notifications
You must be signed in to change notification settings - Fork 7
/
process_olg.py
93 lines (63 loc) · 2.33 KB
/
process_olg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/python
'''
'''
import sys
from collections import OrderedDict
import numpy as np
from scipy.stats import hmean
def gini(x):
'''
`http://www.ellipsix.net/blog/2012/11/the-gini-coefficient-for-distribution-inequality.html`
'''
# REQUIRES ALL VALUES IN X TO BE ZERO OR POSITIVE NUMBERS,
# OTHERWISE RESULTS ARE UNDEFINED
x = np.array(x)
n = len(x)
s = x.sum()
r = np.argsort(np.argsort(-x)) # calculates zero-based ranks
return 1 - (2.0 * (r*x).sum() + s)/(n*s)
if __name__ == '__main__':
olg_filename = sys.argv[1]
summary_tsv_filename = olg_filename + '_sum.tsv'
per_atom_tsv_filename = olg_filename + '.tsv'
data = OrderedDict()
# PARSE OLG PDB FILE
with open(olg_filename, 'rb') as fo:
for line in fo:
line = line.strip()
if line.startswith('HETATM'):
chain = line[21:22]
resnum = line[22:26].strip()
inscode = line[26:27].strip()
atom_name = line[11:15].strip()
pymol_string = '/{}/{}{}/{}'.format(chain,
resnum,
inscode,
atom_name)
rinacc = float(line[73:78])
data[pymol_string] = rinacc
# SUMMARY STATISTICS
all_rinaccs = data.values()
# MIN
min_rinacc = min(all_rinaccs)
# MAX
max_rinacc = max(all_rinaccs)
# MEAN
mean_rinacc = np.mean(all_rinaccs)
# SD
sd_rinacc = np.std(all_rinaccs)
# MEDIAN
med_rinacc = np.median(all_rinaccs)
# GINI
gini_rinacc = gini(all_rinaccs)
# HARMONIC MEAN
harmonic_mean_rinacc = hmean(all_rinaccs)
# SUMMARY OUTPUT
output = [min_rinacc, max_rinacc, mean_rinacc, sd_rinacc, med_rinacc,
gini_rinacc, harmonic_mean_rinacc]
with open(summary_tsv_filename, 'wb') as fo:
fo.write('{}\n'.format('\t'.join([str(x) for x in output])))
# PER-ATOM OUTPUT
with open(per_atom_tsv_filename, 'wb') as fo:
for k, v in data.iteritems():
fo.write('{}\t{}\n'.format(k, v))