-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbenchmarks.py
executable file
·89 lines (79 loc) · 2.91 KB
/
benchmarks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/python
# -*- coding: utf8 -*-
import sys, random, time, gc
from codecs import open
import mspace
from mspace import BKTree, VPTree, levenshtein, tokenizer
def get_sample(objects, num):
    """Return the first `num` elements of a shuffled copy of `objects`.

    `objects` may be any iterable; it is materialised into a new list
    before shuffling, so the caller's container is left untouched.  When
    `num` exceeds the number of elements, every element is returned.
    """
    shuffled = list(objects)
    random.shuffle(shuffled)
    # Slicing (not indexing) so an oversized `num` simply yields everything.
    return shuffled[:num]
def build_tree(cls, objects, metric):
    """Instantiate `cls` with `objects` and `metric` and return the result."""
    return cls(objects, metric)
def benchmark_k(tree, objects, min_dist, max_dist, step):
    """Benchmark `tree.search` for a range of search radii and print a table.

    For every `k` in ``range(min_dist, max_dist + 1, step)`` each element of
    `objects` is looked up with ``tree.search(o, k)``.  One tab-separated
    row is printed per `k` containing, averaged per query object:

    * elapsed seconds,
    * the value of ``mspace.dist_ctr`` (presumably the number of metric
      evaluations counted by `mspace` -- confirm against that module),
    * elapsed seconds divided by that counter,
    * the number of search results.

    Args:
        tree: index to query; must support ``len()``, ``.height``,
            ``.num_nodes`` and ``.search(obj, k)``.
        objects: sized collection of query objects.
        min_dist: smallest search radius (inclusive).
        max_dist: largest search radius (inclusive).
        step: increment between consecutive radii.

    Side effects:
        Resets ``mspace.dist_ctr`` to 0 before the first and after every
        radius.
    """
    mspace.dist_ctr = 0
    num_queries = len(objects)
    print("#%s, size: %s, height: %s, nodes: %s" % (
        type(tree), len(tree), tree.height, tree.num_nodes))
    print("#doing %s searches" % num_queries)
    print("#k\t%6s\t%8s\t%8s\t%8s" % (
        "time", "distCt", "ratio", "resCt"))
    if num_queries == 0:
        # Every column is a per-query average; with no queries there is
        # nothing to report and the divisions below would fail.
        return
    for k in range(min_dist, max_dist + 1, step):
        print("%s\t" % k, end=' ')
        # perf_counter() is monotonic and has the highest available
        # resolution, unlike time.time() which can jump with clock changes.
        start = time.perf_counter()
        num_results = 0
        for o in objects:
            num_results += len(tree.search(o, k))
        elapsed = time.perf_counter() - start
        time_per_object = elapsed / num_queries
        dist_per_object = mspace.dist_ctr / num_queries
        if dist_per_object:
            time_per_dist = time_per_object / dist_per_object
        else:
            # Counter never moved: the ratio is undefined, not an error.
            time_per_dist = float("nan")
        res_per_object = num_results / num_queries
        print("%6.4f\t%8.2f\t%0.6f\t%8.2f" % (
            time_per_object, dist_per_object,
            time_per_dist, res_per_object))
        mspace.dist_ctr = 0
def benchmark_construction(cls, objects, metric, max, step):
    """Time index construction for growing prefixes of `objects`.

    For every `n` in ``range(step, max + 1, step)`` an index is built from
    ``objects[:n]`` via ``build_tree(cls, ..., metric)`` and one
    tab-separated row is printed: index size, index height, build time in
    seconds and build time per element.  A blank line follows the table.

    Args:
        cls: index class handed to `build_tree`.
        objects: sliceable sequence of objects to index.
        metric: distance function handed to `build_tree`.
        max: largest prefix length to benchmark (inclusive).  The name
            shadows the builtin but is kept for keyword-argument callers.
        step: prefix length increment; also the first prefix length.

    Raises:
        ValueError: if `step` is 0 (raised by ``range``).
    """
    print("#%s" % (cls,))
    print("#%8s\t%7s\t%12s\t%8s" % ("size", "height", "time", "per node"))
    for n in range(step, max + 1, step):
        prefix = objects[:n]
        # perf_counter() is monotonic and high-resolution, which
        # time.time() does not guarantee.
        start = time.perf_counter()
        t = build_tree(cls, prefix, metric)
        buildtime = time.perf_counter() - start
        size = len(t)
        # An empty index (e.g. `objects` was empty) has no per-node cost.
        time_per_node = buildtime / size if size else float("nan")
        print("%8s\t%7s\t%12.2f\t%8.6f" % (size, t.height, buildtime,
                                           time_per_node))
        # Drop the index before the next round so the old one is not
        # still occupying memory while the next one is built.
        del t
        gc.collect()
    print()
def main():
    """Run the search benchmark for each index class.

    Command line: ``benchmarks.py FILE NUM``.

    FILE is read as ISO-8859-1 and passed through `tokenizer`.  A random
    sample of NUM tokens is indexed; an independent random sample of 1% of
    NUM tokens, clamped to between 20 and 100, is used as the query set.
    For `BKTree` and `VPTree` in turn an index is built with the
    `levenshtein` metric and `benchmark_k` is run for radii 0 through 3.

    Exits with a usage message if fewer than two arguments are given.
    """
    if len(sys.argv) < 3:
        sys.exit("usage: %s FILE NUM" % sys.argv[0])
    filename = sys.argv[1]
    num = int(sys.argv[2])
    # `open` is codecs.open here (see the file's imports).  The sample is
    # fully materialised by get_sample() inside each `with` body, so the
    # file can be closed immediately afterwards.
    with open(filename, encoding='iso-8859-1') as infile:
        objects = get_sample(tokenizer(infile), num)
    with open(filename, encoding='iso-8859-1') as infile:
        toSearch = get_sample(tokenizer(infile),
                              min(100, max(int(.01 * num), 20)))
    gc.collect()
    for cls in [BKTree, VPTree]:
        print()
        #benchmark_construction(cls, objects, levenshtein, num, int(num/6))
        tree = build_tree(cls, objects, levenshtein)
        benchmark_k(tree, toSearch, 0, 3, 1)
        # Free the index before the next class is benchmarked.
        del tree
        gc.collect()
if __name__ == '__main__':
    # NOTE: for several reasons the numbers this script prints are highly
    # inaccurate.  For more accurate numbers, use the `timeit` module and
    # disable shuffling of objects in tree indexing.
    try:
        main()
    except KeyboardInterrupt:
        # Flush both streams first so the "^C" marker is written after any
        # output that is still buffered.
        for stream in (sys.stdout, sys.stderr):
            stream.flush()
        sys.stderr.write("\n^C\n")
# vim: set et ts=4 sw=4 tw=76 nu: