#!/usr/bin/env python
# Create a bijection between base32 nmers and LANGUAGE.
from string import ascii_lowercase
from itertools import product
from scipy.spatial.distance import pdist, squareform
import editdistance
import numpy as np
import sys
import re
import gzip

NMER_SIZE = 3

# --- loading and prep --- #
def get_base32():
    """
    Generate nmers of base32 characters.
    """
    alphabet = ascii_lowercase
    base32_chars = alphabet + '234567'
    base32_nmers = [''.join(x) for x in product(base32_chars, repeat=NMER_SIZE)]
    return base32_nmers
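# Hedged usage sketch: get_base32() enumerates every NMER_SIZE-character base32
# string in lexicographic order, 32**NMER_SIZE == 32768 of them for NMER_SIZE = 3.
#   nmers = get_base32()
#   nmers[:2]   # -> ['aaa', 'aab']
#   len(nmers)  # -> 32768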
def subset_language(vocabulary, vectors, wordlist, N=32768):
    """
    Subset the vocabulary/vectors to those in a wordlist.
    The wordlist is a list arranged in order of 'preference'.
    Note: we hope the vocabulary is contained in the wordlist,
    but it might not be. N is the number of words we require.
    If the wordlist contains fewer than N words (but the vocabulary has >= N),
    we supplement the result from the vocabulary randomly.
    Also, we want to make sure the order of vocabulary is random (because some
    structure could negatively influence the optimisation procedure later).
    Arguments:
        vocabulary  list of words (strings)
        vectors     ndarray of vectors corresponding to vocabulary
        wordlist    str, list, or array; if str: path to a plain-text file of
                    words (loaded with np.loadtxt)
        N           int: number of words we want vectors for
    Returns:
        vocabulary_subset   list of words (len N)
        vectors_subset      ndarray of vectors corresponding to words (shape[0] = N)
    """
    keep_indices = []  # indices of vocabulary/vectors to keep
    added = 0
    if isinstance(wordlist, str):
        # load from path
        print 'Loading wordlist from', wordlist
        wordlist = np.loadtxt(wordlist, dtype=str)
    else:
        assert isinstance(wordlist, (list, np.ndarray))
    print 'Subsetting vocabulary.'
    for word in wordlist:
        if added == N:
            break
        try:
            word_index = vocabulary.index(word)
            keep_indices.append(word_index)
            added += 1
        except ValueError:
            continue
    print 'Acquired', len(keep_indices), 'words.'
    miss = N - len(keep_indices)
    if miss > 0:
        print 'Supplementing with', miss, 'random words.'
        for i in xrange(miss):
            # draw a scalar index (np.random.choice with size=1 returns an
            # array, which would break the membership test below)
            random_index = np.random.choice(len(vocabulary))
            while random_index in keep_indices:
                random_index = np.random.choice(len(vocabulary))
            keep_indices.append(random_index)
    print 'Shuffling.'
    # shuffle so leftover wordlist structure cannot bias the optimisation later
    np.random.shuffle(keep_indices)
    # populate new arrays
    print 'Populating subsetted arrays.'
    vectors_subset = vectors[keep_indices]
    vocabulary_subset = [vocabulary[i] for i in keep_indices]
    return vocabulary_subset, vectors_subset
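# Hedged usage sketch (the file names are illustrative, not from the repo):
#   vocabulary, vectors = get_language('embeddings.txt')
#   vocab_sub, vec_sub = subset_language(vocabulary, vectors,
#                                        wordlist='preferred_words.txt',
#                                        N=32768)
#   assert len(vocab_sub) == vec_sub.shape[0] == 32768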
def get_language(path):
    """
    Get the LANGUAGE words, and vectors!
    Takes a path to a file like
        apple 0.3410 0.24 0.4114
        orange 0.613 3.414 0.512
    Outputs a list like [apple, orange]
    and a np array like [[0.3410, 0.24, 0.4114],
                         [0.613, 3.414, 0.512]]
    """
    print 'Loading language from', path
    vocabulary = []
    vectors = []
    if path.endswith('.gz'):
        fi = gzip.open(path, 'rb')
    else:
        fi = open(path, 'r')
    for line in fi:
        # lines may be tab- or space-separated
        if '\t' in line:
            sl = line.strip('\n').split('\t')
        else:
            sl = line.strip('\n').split(' ')
        # strip stray null bytes from the word
        word = re.sub('\x00', '', sl[0])
        vocabulary.append(word)
        if len(sl) > 1:
            vector = map(float, sl[1:])
        else:
            # no vector on this line: fall back to a random one
            vector = np.random.normal(size=5)
        vectors.append(vector)
    fi.close()
    vectors = np.array(vectors)
    W = len(vocabulary)
    print 'Loaded', W, 'words from', path
    return vocabulary, vectors
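# Hedged usage sketch, assuming a whitespace-separated embedding file in the
# common word2vec/GloVe text format (path is illustrative):
#   vocabulary, vectors = get_language('embeddings.txt.gz')
#   vocabulary[0]   # first word in the file
#   vectors.shape   # (W, dimension)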
# --- distance metrics --- #
def distance_lookup_table():
    """
    Pairwise character similarity lookup table.
    Characters in base32:
        base32_chars = alphabet + '234567'
    Similarities:
        - i ~ l (1 is not a problem as it does not exist in base32)
        - b ~ d
        - p ~ q
        - m ~ n
        - v ~ w
        - c ~ e
        - a ~ o
    Note: this is largely arbitrary from me, partially influenced by:
    http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3541865/table/t1-ptj3712663/
    """
    base32_chars = ascii_lowercase + '234567'
    n_chars = len(base32_chars)
    # start off all ones
    distance_lookup = np.ones(shape=(n_chars, n_chars))
    # manually set the similar pairs listed above
    # (0.5 may not be the best value for this)
    similar_pairs = [('i', 'l'), ('b', 'd'), ('p', 'q'), ('m', 'n'),
                     ('v', 'w'), ('c', 'e'), ('a', 'o')]
    for (a, b) in similar_pairs:
        distance_lookup[base32_chars.index(a), base32_chars.index(b)] = 0.5
        distance_lookup[base32_chars.index(b), base32_chars.index(a)] = 0.5
    # zeros along the diagonal
    for i in xrange(n_chars):
        distance_lookup[i, i] = 0
    # dict version keyed by character pairs, for convenient lookups
    distance_lookup_dict = dict()
    for (i, a) in enumerate(base32_chars):
        for (j, b) in enumerate(base32_chars):
            distance_lookup_dict[(a, b)] = distance_lookup[i, j]
    return distance_lookup, distance_lookup_dict
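# Hedged example: looking up the bespoke distance between two characters.
#   table, table_dict = distance_lookup_table()
#   table_dict[('b', 'd')]  # -> 0.5 (visually confusable pair)
#   table_dict[('a', 'a')]  # -> 0.0 (identical)
#   table_dict[('x', 'z')]  # -> 1.0 (default)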
def bespoke_distance(nmer1, nmer2, offset_kappa=0.5):
    """
    Hand-crafted distance function, probably not a real metric.
    Thinking about what is 'hard to differentiate' for a human looking at strings.
    Properties:
        - adjacent swaps are hard to detect
        - m ~ rn
        - see pairwise_character_distance
    """
    # work in progress: the draft below is unreachable until it is finished
    raise NotImplementedError
    d = 0
    # if the nmers are identical, we don't need to do anything complicated
    if nmer1 == nmer2:
        return d
    # check exact correspondences (increment distances)
    for (a, b) in zip(nmer1, nmer2):
        d += pairwise_character_distance(a, b)  # this fn has bespoke distances
    # now check with a positive offset (rotate nmer1 left by one)
    for (a, b) in zip(nmer1[1:] + nmer1[0], nmer2):
        d += offset_kappa * pairwise_character_distance(a, b)
    # negative offset (rotate nmer1 right by one)
    for (a, b) in zip(nmer1[-1] + nmer1[:-1], nmer2):
        d += offset_kappa * pairwise_character_distance(a, b)
    # I can already feel how slow this is going to be.
    # placeholder until the remaining terms are implemented
    d = abs(np.random.normal())
    return d
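# Hedged sketch: bespoke_distance above calls a pairwise_character_distance
# helper that is not defined in this file. A minimal version, assuming it
# simply reads off the lookup table from distance_lookup_table(), might be:
_, _CHAR_DISTANCES = distance_lookup_table()

def pairwise_character_distance(a, b):
    # bespoke character-level distance: 0 identical, 0.5 confusable, 1 otherwise
    return _CHAR_DISTANCES[(a, b)]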
def base32_distances(base32_nmers, metric='levenshtein'):
    """
    Get pairwise distances (different metrics).
    This takes a little while.
    """
    N = len(base32_nmers)
    total = N * (N - 1.0) / 2
    print 'Calculating', N * (N - 1) / 2, 'pairwise distances.'
    d = np.empty(shape=(N, N), dtype=np.float)
    n = 0
    for i in xrange(N):
        for j in xrange(i, N):
            n += 1
            if n % 500000 == 0:
                sys.stdout.write('\r' + '%.4f' % (float(n * 100) / total) + '%')
                sys.stdout.flush()
            if metric == 'levenshtein':
                dij = editdistance.eval(base32_nmers[i], base32_nmers[j])
            elif metric == 'bespoke':
                dij = bespoke_distance(base32_nmers[i], base32_nmers[j])
            else:
                raise NotImplementedError
            d[i, j] = dij
            d[j, i] = dij
    print ''
    return d
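# Hedged usage sketch: the full 32768 nmers give ~5e8 pairs, so try a small
# subset first.
#   nmers = get_base32()[:100]
#   D = base32_distances(nmers)  # Levenshtein by default
#   D[0, 1]  # edit distance between 'aaa' and 'aab' -> 1.0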
# --- some optimisation stuff --- #
def get_proposal(A, B):
    """
    Everyone loves MH.
    Propose a new ordering of B: pick a pivot row (with probability
    proportional to its inner product with the corresponding row of A),
    then pair the highest-phi entries with the lowest-v entries.
    """
    n = A.shape[0]
    # get a pivot point: with A symmetric, this einsum gives, for each row k,
    # the inner product sum_i A[k, i] * B[k, i]
    inner_products = np.einsum('i...,...i', A, B)
    violated = np.random.choice(n, size=1, p=inner_products/np.sum(inner_products))[0]
    # get the rest (everything except the pivot row itself)
    v = np.array([x for (i, x) in enumerate(A[:, violated]) if not i == violated])
    phi = np.array([x for (i, x) in enumerate(B[:, violated]) if not i == violated])
    # now reorder... (argsort gives low to high, remember)
    v_order = np.argsort(v)
    phi_order = np.argsort(-phi)
    # we want to move the highest phi to the lowest v
    ordering_subset = v_order[phi_order]
    ordering_subset[np.where(ordering_subset >= violated)] += 1
    # reinsert the pivot into the ordering
    ordering = np.empty(shape=len(v) + 1, dtype=np.int)
    ordering[:violated] = ordering_subset[:violated]
    ordering[violated] = violated
    ordering[(violated+1):] = ordering_subset[violated:]
    assert len(set(ordering)) == len(ordering)
    return ordering
def find_ordering(A, B, eps=0.1):
    """
    Reorder the rows/columns of B to maximise its difference to A.
    ... possibly.
    Use Metropolis-Hastings for some reason.
    Returns (ordering, acceptance_rate, temperature).
    """
    assert A.shape[0] == A.shape[1]
    assert B.shape == A.shape
    diff = np.mean(abs(A - B))
    temperature = diff
    delta = 100
    cumulative_delta = 0
    print diff, delta
    accept, reject = 0, 0
    # identity ordering until a proposal is accepted
    ordering = np.arange(A.shape[0])
    while abs(delta) > eps:
        # get proposal ordering
        proposal_ordering = get_proposal(A, B)
        proposal_B = B[proposal_ordering, :][:, proposal_ordering]
        proposal_diff = np.mean(abs(A - proposal_B))
        # accept with some probability
        proposal_delta = diff - proposal_diff
        prob = min(1, np.exp(-proposal_delta / temperature))
        if np.random.random() <= prob:
            accept += 1
            # compose with the accepted proposal, so `ordering` always maps
            # back to the *original* B (not just the last accepted step)
            ordering = ordering[proposal_ordering]
            diff = proposal_diff
            delta = proposal_delta
            cumulative_delta -= delta
            B = proposal_B
            print diff, -delta, cumulative_delta
            if accept % 100 == 0:
                temperature /= 1.1
        else:
            reject += 1
            if reject % 100 == 0:
                temperature *= 1.15
        temperature *= 0.99999
    acceptance_rate = float(accept) / (accept + reject)
    return ordering, acceptance_rate, temperature
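# Hedged usage sketch on small random distance matrices (may take a while to
# converge; eps controls the stopping threshold):
#   A = squareform(pdist(np.random.rand(50, 5)))
#   B = squareform(pdist(np.random.rand(50, 5)))
#   ordering, acc_rate, temp = find_ordering(A, B, eps=0.01)
#   B_reordered = B[ordering, :][:, ordering]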
# --- maps --- #
def random_map(nmers, vocabulary):
    """
    Totally random map, totally unconstrained, totally boring.
    """
    forward_mapping = dict(zip(nmers, vocabulary))
    backward_mapping = dict(zip(vocabulary, nmers))
    return forward_mapping, backward_mapping
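# Hedged example (assumes len(vocabulary) == len(nmers)):
#   fwd, bwd = random_map(['aaa', 'aab'], ['apple', 'orange'])
#   fwd['aaa']     # -> 'apple'
#   bwd['orange']  # -> 'aab'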
def diverse_map(nmers, vocabulary, vectors):
    """
    Map which aims to map pairs of similar base32 nmers to pairs of dissimilar
    language words.
    """
    N = len(nmers)
    A = base32_distances(nmers)
    print A.shape
    B = squareform(pdist(vectors, 'cosine'))
    print B.shape
    # find_ordering returns (ordering, acceptance_rate, temperature)
    ordering, acceptance_rate, temperature = find_ordering(A, B)
    forward_mapping, backward_mapping = dict(), dict()
    for i in xrange(N):
        triple = nmers[i]
        word = vocabulary[ordering[i]]
        forward_mapping[triple] = word
        backward_mapping[word] = triple
    return forward_mapping, backward_mapping
def get_map(nmers, vocabulary, vectors=None, mapping='random'):
    """
    Prep and get a map.
    Note: vectors (if given) must correspond row-for-row to vocabulary.
    """
    N = len(nmers)
    W = len(vocabulary)
    if W < N:
        print 'ERROR: Not enough words.'
        return False
    if W > N:
        print 'There are', W, 'elements in the vocabulary and only', N,
        print 'nmers: subsetting.'
        # sample without replacement so the map stays a bijection
        # (NB: any vectors passed in are no longer aligned after this)
        vocabulary_subset = list(np.random.choice(vocabulary, N, replace=False))
        vocabulary = vocabulary_subset
    if mapping == 'random':
        print 'Using random map.'
        forward_mapping, backward_mapping = random_map(nmers, vocabulary)
    elif mapping == 'diverse':
        print 'Using diverse map.'
        if vectors is None:
            print 'ERROR: diverse map requires vectors.'
            return False
        forward_mapping, backward_mapping = diverse_map(nmers, vocabulary, vectors)
    else:
        print 'ERROR: mapping must be "random" or "diverse".'
        return False
    # sanity check: the two maps must be mutually consistent
    for (k, v) in forward_mapping.iteritems():
        if not backward_mapping[v] == k:
            print k, v
    return forward_mapping, backward_mapping
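# Hedged end-to-end sketch (no real word vectors needed for the random map;
# the vocabulary and example string below are illustrative, not from the repo):
if __name__ == '__main__':
    nmers = get_base32()
    # a stand-in vocabulary of exactly the right size
    vocabulary = ['word%d' % i for i in xrange(len(nmers))]
    forward_mapping, backward_mapping = get_map(nmers, vocabulary,
                                                mapping='random')
    # encode a base32 string nmer-by-nmer and check the round trip
    example = 'abc234xyz'
    triples = [example[i:i+NMER_SIZE] for i in xrange(0, len(example), NMER_SIZE)]
    words = [forward_mapping[t] for t in triples]
    assert [backward_mapping[w] for w in words] == triples
    print ' '.join(words)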