-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLSH.py
65 lines (45 loc) · 1.53 KB
/
LSH.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from sklearn.neighbors import LSHForest
from utils import *
import os
from config import Config
import time
import numpy as np
# Tag embedded in every cache/output file name, and the number of leading
# items kept from the raw corpus when the sentence cache is first built.
remarks = '100k'
nums = 100000

# Pickle locations. The intermediate artefacts (sentences, TF-IDF matrix,
# fitted LSH forest) go under Config.cache_dir; the neighbour indices that
# the script produces go under Config.data_dir.
sentences_path = Config.cache_dir + ("/sentences_%s.pkl" % remarks)
lshf_path = Config.cache_dir + ("/lshf_%s.pkl" % remarks)
tfidf_vec_path = Config.cache_dir + ("/tfidf_%s.pkl" % remarks)
indices_path = Config.data_dir + ("/indices_%s.pkl" % remarks)
# Sentence data: reuse the pickled copy when it exists, otherwise take the
# first `nums` items returned by get_para_5m_raw_data() and cache them so
# later runs can skip that step.
if os.path.exists(sentences_path):
    # Context manager so the handle is closed as soon as loading finishes.
    with open(sentences_path, 'rb') as f:
        raw_para_sentences = pickle.load(f)
else:
    raw_para_sentences = get_para_5m_raw_data()[:nums]
    with open(sentences_path, 'wb') as f:
        pickle.dump(raw_para_sentences, f)
# TF-IDF features: reuse the pickled copy when it exists, otherwise compute
# them from the sentences with get_tfidf_feature() and cache the result.
if os.path.exists(tfidf_vec_path):
    # Context manager so the handle is closed as soon as loading finishes.
    with open(tfidf_vec_path, 'rb') as f:
        tfidf_vec = pickle.load(f)
else:
    tfidf_vec = get_tfidf_feature(raw_para_sentences, remarks)
    with open(tfidf_vec_path, 'wb') as f:
        pickle.dump(tfidf_vec, f)
# LSH forest: reuse the pickled fitted model when it exists, otherwise fit a
# new one on the TF-IDF features and cache it.
# NOTE(review): LSHForest was deprecated in scikit-learn 0.19 and removed in
# 0.21, so this script needs an older scikit-learn release to run.
if os.path.exists(lshf_path):
    # Context manager so the handle is closed as soon as loading finishes.
    with open(lshf_path, 'rb') as f:
        lshf = pickle.load(f)
else:
    # Fixed seed so the random projections are reproducible between runs.
    lshf = LSHForest(random_state=42)
    # NOTE(review): .toarray() suggests tfidf_vec is a scipy sparse matrix;
    # densifying every row can need a lot of memory. LSHForest.fit is
    # documented to accept CSR input directly - confirm and consider
    # dropping the conversion.
    lshf.fit(tfidf_vec.toarray())
    with open(lshf_path, 'wb') as f:
        # Protocol 4 is requested explicitly; presumably because it supports
        # very large objects - confirm this is still needed.
        pickle.dump(lshf, f, protocol=4)
# Query the forest with every second row of the TF-IDF matrix.
# NOTE(review): presumably even rows are the original sentences and odd rows
# their paraphrases - confirm against get_para_5m_raw_data().
orig_tfidf_vecs = tfidf_vec[::2]

# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments.
t1 = time.perf_counter()

# Three approximate nearest neighbours per query row; only the neighbour
# indices are persisted, the distances are not used further in this script.
distances, indices = lshf.kneighbors(orig_tfidf_vecs.toarray(), n_neighbors=3)
with open(indices_path, 'wb') as f:
    pickle.dump(indices, f)

t2 = time.perf_counter()
# Elapsed seconds for the neighbour query plus writing the indices to disk.
print(t2 - t1)