-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathembedding_utils.py
62 lines (50 loc) · 1.87 KB
/
embedding_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
def get_glove(fname):
    """Load whitespace-separated text embeddings (GloVe-style) from a file.

    Each non-blank line is expected to hold a token followed by the
    components of its vector. Blank lines are skipped. If a token occurs
    more than once, the last occurrence wins.

    Args:
        fname: Path to the embeddings file.

    Returns:
        dict mapping each token (decoded as UTF-8) to a 1-D ``np.float32``
        array built from the remaining fields of its line.
    """
    wvec = {}
    # Read as bytes: bytes.split() breaks on ASCII whitespace only, whereas
    # str.split() would also break on Unicode whitespace inside a token.
    with open(fname, "rb") as lines:
        for line in lines:
            fields = line.split()  # split once, reuse for token and values
            if not fields:
                # Blank line: nothing to index, and fields[0] would raise.
                continue
            wvec[fields[0].decode("utf-8")] = np.array(fields[1:], dtype=np.float32)
    return wvec
# Vectorizers exposing fit/transform, usable as a feature step before sklearn's classifiers
class SumEmbeddingVectorizer(object):
    """Turn tokenised documents into the sum of their tokens' embeddings.

    Tokens that are not keys of ``word2vec`` are ignored. A document with no
    known token (including an empty document) becomes a zero vector of
    length ``dim``.
    """

    def __init__(self, word2vec):
        """Store the embedding table and derive its dimensionality.

        Args:
            word2vec: Mapping from token to its embedding vector. ``dim`` is
                taken from the length of the first vector in the mapping,
                or is 0 when the mapping is empty.
        """
        self.word2vec = word2vec
        if len(word2vec) > 0:
            # Look at the table passed in; the previous code iterated over
            # ``wvec``, a name that does not exist in this scope.
            self.dim = len(word2vec[next(iter(word2vec))])
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn.

        ``X`` and ``y`` are accepted only for fit/transform compatibility.
        """
        return self

    def transform(self, X):
        """Embed every document in ``X``.

        Args:
            X: Iterable of documents, each an iterable of tokens.

        Returns:
            ``np.ndarray`` with one row per document, each row being the
            element-wise sum of the embeddings of that document's known
            tokens.
        """
        return np.array([
            # ``or`` substitutes a single zero vector when no token of the
            # document is in the table, so np.sum always has a row to reduce.
            np.sum([self.word2vec[w] for w in words if w in self.word2vec]
                   or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
class TfidfEmbeddingVectorizer(object):
    """Turn tokenised documents into an IDF-weighted sum of token embeddings.

    ``fit`` learns one IDF weight per token from the training documents;
    ``transform`` multiplies each known token's embedding by its weight and
    sums the results per document. Tokens that are not keys of ``word2vec``
    are ignored, and a document with no known token becomes a zero vector
    of length ``dim``.
    """

    def __init__(self, word2vec):
        """Store the embedding table and derive its dimensionality.

        Args:
            word2vec: Mapping from token to its embedding vector. ``dim`` is
                taken from the length of the first vector in the mapping,
                or is 0 when the mapping is empty.
        """
        self.word2vec = word2vec
        # Filled in by fit(); transform() needs it for any document that
        # contains a token present in ``word2vec``.
        self.word2weight = None
        if len(word2vec) > 0:
            # Look at the table passed in; the previous code iterated over
            # ``wvec``, a name that does not exist in this scope.
            self.dim = len(word2vec[next(iter(word2vec))])
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """Learn per-token IDF weights from ``X`` and return ``self``.

        Args:
            X: Iterable of documents, each an iterable of tokens.
            y: Ignored; accepted for fit/transform compatibility.
        """
        # The identity analyzer makes TfidfVectorizer treat each document
        # as its already-tokenised sequence.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
        return self

    def transform(self, X):
        """Embed every document in ``X`` using the learned weights.

        Args:
            X: Iterable of documents, each an iterable of tokens.

        Returns:
            ``np.ndarray`` with one row per document, each row being the
            sum over the document's known tokens of
            ``word2vec[token] * word2weight[token]``.
        """
        return np.array([
            # ``or`` substitutes a single zero vector when no token of the
            # document is in the table, so np.sum always has a row to reduce.
            np.sum([self.word2vec[w] * self.word2weight[w]
                    for w in words if w in self.word2vec] or
                   [np.zeros(self.dim)], axis=0)
            for words in X
        ])