-
Notifications
You must be signed in to change notification settings - Fork 9
/
my_embedding.py
110 lines (84 loc) · 3 KB
/
my_embedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os, sys
from time import time
import numpy as np
import random
def ngrams(sentence, n):
    """Return every contiguous window of ``n`` items taken from ``sentence``.

    Args:
        sentence: a sliceable sequence, e.g. a list of words or a string.
        n: the window length.

    Returns:
        list: the slices ``sentence[i:i + n]`` for each start index ``i``, in
        order. Empty when ``n`` is not positive or exceeds ``len(sentence)``.
    """
    # With n <= 0 the slices below would be empty or wrap around the end of
    # the sequence, so there is no meaningful n-gram to report.
    if n <= 0:
        return []
    return [sentence[i:i + n] for i in range(len(sentence) - n + 1)]
class GloveEmbedding(object):
    """Lookup table over pre-trained GloVe word vectors.

    Reference: http://nlp.stanford.edu/projects/glove
    """

    def __init__(self, default='zero'):
        """Load the 300-dimensional GloVe table from its fixed relative path.

        Args:
            default: stored on the instance as ``self.default``. ``emb`` does
                not read it; the out-of-vocabulary policy is selected per call
                through the ``oov_default`` argument of ``emb``.
        """
        self.default = default
        self.dim = 300
        start = time()
        self.dict = self.load_emb('embeddings/glove/glove.840B.300d.txt')
        print('Pre-trained Glove embeddings loaded in {} seconds!'.format(time()-start))

    def load_emb(self, filename):
        """Parse a text embedding file into a ``{word: vector}`` dict.

        Each line holds a token followed by ``self.dim`` space-separated
        floats.

        Args:
            filename: path of the embedding file; it is read as UTF-8.

        Returns:
            dict: maps each token to its vector, a list of ``self.dim`` floats.

        Raises:
            ValueError: if a line does not carry exactly ``self.dim`` values,
                or if one of the values cannot be parsed as a float.
        """
        emb_dict = {}
        with open(filename, encoding='utf-8') as fid:
            for line_no, line in enumerate(fid, 1):
                # Split from the right: the last ``dim`` fields are the vector,
                # so a token that itself contains spaces stays in one piece
                # instead of leaking into the float conversion.
                items = line.rstrip().rsplit(' ', self.dim)
                word = items[0]
                embd = [float(x) for x in items[1:]]
                # Explicit check rather than ``assert`` so that it still runs
                # when Python is started with -O.
                if len(embd) != self.dim:
                    raise ValueError(
                        '{}:{}: expected {} values, found {}'.format(
                            filename, line_no, self.dim, len(embd)))
                emb_dict[word] = embd
        return emb_dict

    def emb(self, word, oov_default = 'zero'):
        """Return the vector for ``word``, or a fallback when it is unknown.

        Args:
            word: the token to look up.
            oov_default: ``'zero'`` gives an all-zero vector for an unknown
                word; any other value gives a fixed pseudo-random vector whose
                components are drawn uniformly from [-0.1, 0.1).

        Returns:
            list: ``self.dim`` floats. For a known word this is the list held
            in the table itself, not a copy.
        """
        embd = self.dict.get(word)
        if embd is None:
            if oov_default == 'zero':
                embd = [0.0] * self.dim
            else:
                # Draw from a private generator seeded with 123: every unknown
                # word gets the same reproducible vector, and the global
                # NumPy / stdlib random state used elsewhere is left alone.
                rng = np.random.RandomState(123)
                embd = rng.uniform(-0.1, 0.1, self.dim).tolist()
        return embd
class KazumaCharEmbedding(object):
    """Word vectors built by averaging pre-trained character n-gram vectors.

    Reference: http://www.logos.t.u-tokyo.ac.jp/~hassy/publications/arxiv2016jmt/
    """

    def __init__(self):
        """Load the 100-dimensional n-gram table from its fixed relative path."""
        self.dim = 100
        start = time()
        self.dict = self.load_emb('embeddings/kazuma/charNgram.txt')
        print('Pre-trained character Ngram embeddings loaded in {} seconds!'.format(time()-start))

    def load_emb(self, filename):
        """Parse a text embedding file into a ``{key: vector}`` dict.

        Each line holds a key followed by ``self.dim`` space-separated floats.

        Args:
            filename: path of the embedding file; it is read as UTF-8.

        Returns:
            dict: maps each key to its vector, a list of ``self.dim`` floats.

        Raises:
            ValueError: if a line does not carry exactly ``self.dim`` values,
                or if one of the values cannot be parsed as a float.
        """
        emb_dict = {}
        with open(filename, encoding='utf-8') as fid:
            for line_no, line in enumerate(fid, 1):
                # Split from the right: the last ``dim`` fields are the vector,
                # so a key that contains a space character stays in one piece.
                items = line.rstrip().rsplit(' ', self.dim)
                word = items[0]
                embd = [float(x) for x in items[1:]]
                # Explicit check rather than ``assert`` so that it still runs
                # when Python is started with -O.
                if len(embd) != self.dim:
                    raise ValueError(
                        '{}:{}: expected {} values, found {}'.format(
                            filename, line_no, self.dim, len(embd)))
                emb_dict[word] = embd
        return emb_dict

    def emb(self, word, oov_default='zero'):
        """Embed ``word`` as the mean of its known character n-gram vectors.

        The word is wrapped in ``#BEGIN#`` / ``#END#`` markers, every 2-, 3-
        and 4-gram over that character sequence is looked up under the key
        ``'<n>gram-<chars>'``, and the vectors of the distinct keys found are
        averaged.

        Args:
            word: the string to embed.
            oov_default: used only when no n-gram is found. ``'zero'`` gives an
                all-zero vector; any other value gives a fixed pseudo-random
                vector with components drawn uniformly from [-0.1, 0.1).

        Returns:
            list: ``self.dim`` floats.
        """
        chars = ['#BEGIN#'] + list(word) + ['#END#']
        # Keyed by n-gram string, so an n-gram that occurs several times in
        # the word contributes to the average only once.
        match = {}
        for i in [2, 3, 4]:
            for g in ngrams(chars, i):
                g = '{}gram-{}'.format(i, ''.join(g))
                e = self.dict.get(g)
                if e is not None:
                    match[g] = np.array(e, np.float32)
        if match:
            embs = sum(match.values()) / len(match)
        elif oov_default == 'zero':
            embs = np.zeros(self.dim, dtype=np.float32)
        else:
            # A private generator seeded with 123 yields the same values as
            # seeding the global NumPy generator would, without resetting the
            # random state that the rest of the program relies on.
            embs = np.random.RandomState(123).uniform(-0.1, 0.1, self.dim)
        return embs.tolist()
if __name__ == '__main__':
    # Smoke test: embed a few words with the character n-gram model and print
    # each vector together with the time the lookup took.
    embedder = KazumaCharEmbedding()
    for word in ('canada', 'vancouver', 'toronto'):
        began = time()
        print('embedding {}'.format(word))
        vector = embedder.emb(word)
        print(vector)
        print('took {}s'.format(time() - began))