# fasttext.py (forked from babylonhealth/fastText_multilingual)
#
# Copyright (c) 2017-present, babylon health
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#

import numpy as np


class FastVector:
"""
Minimal wrapper for fastvector embeddings.
```
Usage:
$ model = FastVector(vector_file='/path/to/wiki.en.vec')
$ 'apple' in model
> TRUE
$ model['apple'].shape
> (300,)
```
"""

    def __init__(self, vector_file='', transform=None):
        """Read in word vectors in fastText format."""
        self.word2id = {}
        # Captures word order, for export() and translate methods
        self.id2word = []

        print('reading word vectors from %s' % vector_file)
        # .vec files are plain UTF-8 text; be explicit so non-ASCII tokens load correctly
        with open(vector_file, 'r', encoding='utf-8') as f:
            (self.n_words, self.n_dim) = \
                (int(x) for x in f.readline().rstrip('\n').split(' '))
            self.embed = np.zeros((self.n_words, self.n_dim))
            for i, line in enumerate(f):
                elems = line.rstrip('\n').split(' ')
                self.word2id[elems[0]] = i
                self.embed[i] = elems[1:self.n_dim+1]
                self.id2word.append(elems[0])

        # Used in translate_inverted_softmax()
        self.softmax_denominators = None
        if transform is not None:
            print('Applying transformation to embedding')
            self.apply_transform(transform)

    def apply_transform(self, transform):
        """
        Apply the given transformation to the vector space.

        Right-multiplies the embedding matrix E by the transform:
            E = E * transform

        `transform` can be either the filename of a text file containing
        an ndarray (compatible with np.loadtxt) or a numpy ndarray.
        """
        transmat = np.loadtxt(transform) if isinstance(transform, str) else transform
        self.embed = np.matmul(self.embed, transmat)
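
    # Hypothetical sketch (not part of this module): the matrix passed to
    # apply_transform() is typically an alignment learned from a bilingual
    # dictionary by orthogonal Procrustes. Given row-matched matrices
    # X (source vectors) and Y (target vectors):
    #
    #     u, _, vt = np.linalg.svd(np.matmul(X.transpose(), Y))
    #     transform = np.matmul(u, vt)   # orthogonal map minimising ||X @ transform - Y||_F
    #
    # source_model.apply_transform(transform) then maps the source space
    # into the target space.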

    def export(self, outpath):
        """
        Transforming a large matrix of word vectors is expensive.
        This method lets you write the transformed matrix back to a file for future use.
        :param outpath: path of the output file to be written
        """
        with open(outpath, 'w', encoding='utf-8') as fout:
            # Header takes the guesswork out of loading by recording how many lines, vector dims
            fout.write(str(self.n_words) + " " + str(self.n_dim) + "\n")
            for token in self.id2word:
                vector_components = ["%.6f" % number for number in self[token]]
                vector_as_string = " ".join(vector_components)
                out_line = token + " " + vector_as_string + "\n"
                fout.write(out_line)
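
    # Example (hypothetical paths): persist an aligned space and reload it later:
    #     model.export('/path/to/wiki.fr.aligned.vec')
    #     aligned = FastVector(vector_file='/path/to/wiki.fr.aligned.vec')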

    def translate_nearest_neighbour(self, source_vector):
        """Obtain translation of source_vector using nearest neighbour retrieval"""
        similarity_vector = np.matmul(FastVector.normalised(self.embed), source_vector)
        target_id = np.argmax(similarity_vector)
        return self.id2word[target_id]
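
    # Example (hypothetical model names; assumes both spaces have been aligned
    # with apply_transform):
    #     en_model.translate_nearest_neighbour(fr_model['chat'])   # e.g. 'cat'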

    def translate_inverted_softmax(self, source_vector, source_space, nsamples,
                                   beta=10., batch_size=100, recalculate=True):
        """
        Obtain translation of source_vector using sampled inverted softmax retrieval
        with inverse temperature beta.

        nsamples vectors are drawn from source_space in batches of batch_size
        to calculate the inverted softmax denominators.
        Denominators from a previous call are reused if recalculate=False. This
        saves time if multiple words are translated from the same source language.
        """
        embed_normalised = FastVector.normalised(self.embed)

        # calculate contributions to softmax denominators in batches
        # to save memory
        if self.softmax_denominators is None or recalculate is True:
            self.softmax_denominators = np.zeros(self.embed.shape[0])
            while nsamples > 0:
                # get batch of randomly sampled vectors from source space
                sample_vectors = source_space.get_samples(min(nsamples, batch_size))

                # calculate cosine similarities between sampled vectors and
                # all vectors in the target space
                sample_similarities = \
                    np.matmul(embed_normalised,
                              FastVector.normalised(sample_vectors).transpose())

                # accumulate contribution to denominators
                self.softmax_denominators \
                    += np.sum(np.exp(beta * sample_similarities), axis=1)
                nsamples -= batch_size

        # cosine similarities between source_vector and all target vectors
        similarity_vector = np.matmul(embed_normalised,
                                      source_vector/np.linalg.norm(source_vector))

        # exponentiate and normalise with denominators to obtain inverted softmax
        softmax_scores = np.exp(beta * similarity_vector) / \
            self.softmax_denominators

        # pick the highest score as the translation
        target_id = np.argmax(softmax_scores)
        return self.id2word[target_id]
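
    # Example (hypothetical model names): translate a French vector into English,
    # sampling the French space to estimate the denominators; the score for a
    # target word t is exp(beta*cos(source, t)) / sum_s' exp(beta*cos(s', t)).
    #     en_model.translate_inverted_softmax(fr_model['chat'], fr_model, nsamples=1000)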

    def get_samples(self, nsamples):
        """Return a matrix of nsamples randomly sampled vectors from embed"""
        sample_ids = np.random.choice(self.embed.shape[0], nsamples, replace=False)
        return self.embed[sample_ids]

    @classmethod
    def normalised(cls, mat, axis=-1, order=2):
        """Utility function to normalise the rows of a numpy array."""
        norm = np.linalg.norm(
            mat, axis=axis, ord=order, keepdims=True)
        norm[norm == 0] = 1
        return mat / norm

    @classmethod
    def cosine_similarity(cls, vec_a, vec_b):
        """Compute cosine similarity between vec_a and vec_b"""
        return np.dot(vec_a, vec_b) / \
            (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

    def __contains__(self, key):
        return key in self.word2id

    def __getitem__(self, key):
        return self.embed[self.word2id[key]]
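

# Minimal usage sketch (the file paths and the alignment matrix below are
# hypothetical placeholders, not shipped with this module):
if __name__ == '__main__':
    en_model = FastVector(vector_file='/path/to/wiki.en.vec')
    fr_model = FastVector(vector_file='/path/to/wiki.fr.vec')

    # Map the French space into the English space with a precomputed matrix.
    fr_model.apply_transform('/path/to/alignment_matrices/fr.txt')

    # After alignment, cross-lingual similarity and retrieval become meaningful.
    print(FastVector.cosine_similarity(fr_model['chat'], en_model['cat']))
    print(en_model.translate_nearest_neighbour(fr_model['chat']))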