obtain_pretrained_word2vec.py
import numpy as np
import cPickle
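# Build a vocabulary-sized matrix of 300-d word2vec embeddings for the
# youtube2text corpus and pickle it (together with the word-to-index map)
# as word2vec.p. Written for Python 2 (cPickle, xrange, print statement).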
def get_W(w2v, word2idx, k=300):
    """
    Get word matrix. W[i] is the vector for word indexed by i
    """
    vocab_size = len(w2v)
    W = np.zeros(shape=(vocab_size, k))
    for word in w2v:
        W[word2idx[word]] = w2v[word]
    return W
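# get_W indexes W with word2idx[word], so it assumes every index produced by
# word2idx for a word in w2v falls within range(len(w2v)); after
# add_unknown_words below, w2v covers the whole vocabulary. A typical use of
# the result is an embedding lookup, e.g. W[word2idx['man']] (the word 'man'
# is only an illustration).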
def load_bin_vec(fname, vocab):
    """
    Loads 300x1 word vecs from Google (Mikolov) word2vec
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for line in xrange(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            if word in vocab:
                word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)
    return word_vecs
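# The .bin format read above is: an ASCII header line "<vocab_size> <dim>",
# then for each word a space-terminated string followed by <dim> raw float32
# values. Words not in our vocab are skipped by reading past their vectors.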
def add_unknown_words(word_vecs, vocab, k=300):
    """
    For words in vocab that have no pretrained vector, create a random word vector.
    0.25 is chosen so the unknown vectors have (approximately) the same variance as the pre-trained ones
    """
    for word in vocab:
        if word not in word_vecs:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
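# The main block below expects two inputs at the paths used in this repo:
# the pretrained GoogleNews-vectors-negative300.bin file from the word2vec
# project, and ./data/youtube2text/corpus.p, whose entries at positions 3
# and 4 are the wordtoix / ixtoword mappings.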
if __name__ == "__main__":
    w2v_file = 'GoogleNews-vectors-negative300.bin'
    x = cPickle.load(open("./data/youtube2text/corpus.p", "rb"))
    # train, val, test = x[0], x[1], x[2]
    wordtoix, ixtoword = x[3], x[4]
    del x
    n_words = len(ixtoword)
    w2v = load_bin_vec(w2v_file, wordtoix)
    add_unknown_words(w2v, wordtoix)
    W = get_W(w2v, wordtoix)
    # rand_vecs = {}
    # add_unknown_words(rand_vecs, wordtoix)
    # W2 = get_W(rand_vecs, wordtoix)
    cPickle.dump([W, wordtoix], open("word2vec.p", "wb"))
    print "pretrained word vector created!"