-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathextractVectors.py
36 lines (26 loc) · 1.03 KB
/
extractVectors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/usr/bin/python
import sys
from decimal import Decimal
import numpy as np
import gensim
# Length of the zero vector written for tokens that have no embedding
# (the original used np.zeros(300) inline).
EMBEDDING_DIM = 300

# Pretrained vectors file named in the original (commented-out) load call.
WORD2VEC_PATH = 'GoogleNews-vectors-negative300.bin'


def _format_vector(values):
    """Return *values* as a comma-separated string, six decimals per number."""
    return ','.join(['%.6f' % num for num in values])


def _token_vector(token, model, zero_vec):
    """Return the formatted vector for *token* (lower-cased), or *zero_vec*.

    *model* only needs to support ``key in model`` and ``model[key]``.
    """
    key = token.lower()
    if key in model:
        return _format_vector(model[key])
    return zero_vec


def extract_vectors(token_path, out_path, model):
    """Write one embedding row per comma-separated line of *token_path*.

    Each non-blank input line is split on "," and double quotes are removed
    from the fields. The output line is
    ``<vector of token1>,<vector of token2>,<label>``.

    Args:
        token_path: path of the input file to read.
        out_path: path of the output file (created or overwritten).
        model: mapping from lower-cased token to a sequence of numbers.

    Raises:
        ValueError: if a non-blank line has fewer than three fields.
    """
    # Format the fallback vector once instead of once per missing token.
    zero_vec = _format_vector(np.zeros(EMBEDDING_DIM))

    # `with` closes both files even if a line fails to parse.
    with open(token_path, "r") as token_fin, open(out_path, "w") as embed_fout:
        # Iterate the file lazily rather than loading it with readlines().
        for line_no, pair in enumerate(token_fin, 1):
            stripped = pair.strip()
            if not stripped:
                # A blank line has no columns; the original raised IndexError.
                continue

            cols = stripped.split(",")
            if len(cols) < 3:
                raise ValueError(
                    "%s:%d: expected at least 3 comma-separated fields, got %d"
                    % (token_path, line_no, len(cols)))

            token1 = cols[0].replace("\"", "")
            token2 = cols[1].replace("\"", "")
            # NOTE(review): the original wrote an undefined name `label`.
            # It is assumed here to be the third column -- confirm against
            # the actual tokens-file format.
            label = cols[2].replace("\"", "")

            vec1 = _token_vector(token1, model, zero_vec)
            if token2 != "O":
                vec2 = _token_vector(token2, model, zero_vec)
            else:
                # NOTE(review): the original left vec2 unassigned (or stale
                # from the previous line) when token2 == "O". Writing the
                # zero vector is an assumption -- confirm the intended
                # handling of "O".
                vec2 = zero_vec

            embed_fout.write(vec1 + ',' + vec2 + ',' + label + '\n')


def main(argv=None):
    """Command-line entry point: ``extractVectors.py <tokens-file> [model-bin]``.

    The output path is the input path with "tokens" replaced by
    "embedding-word2vec-300". The optional second argument overrides
    WORD2VEC_PATH.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: extractVectors.py <tokens-file> [word2vec-bin]")

    token_file = argv[1]
    out_file = token_file.replace("tokens", "embedding-word2vec-300")
    if out_file == token_file:
        # Without "tokens" in the path the output would be opened for writing
        # on top of the input, truncating it before it is read.
        sys.exit("input path %r does not contain 'tokens'; refusing to "
                 "overwrite it" % token_file)

    model_path = argv[2] if len(argv) > 2 else WORD2VEC_PATH
    # NOTE(review): the original load call was commented out, leaving `model`
    # undefined. It is restored here through KeyedVectors, which replaces the
    # removed Word2Vec.load_word2vec_format.
    model = gensim.models.KeyedVectors.load_word2vec_format(
        model_path, binary=True)
    extract_vectors(token_file, out_file, model)


if __name__ == "__main__":
    main()