#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import io
import munkres
import ot
import numpy as np
import pandas as pd
import scipy as sp
from scipy.spatial.distance import directed_hausdorff
from pymfe.mfe import MFE
def compute_NN_list(x_src, x_tgt, idx_src, bsz=100, nn_size=10):
"""
Compute the list of the nearest neighboors in target space
for each word in source words
"""
nn_list = dict()
    # Note: x_src and x_tgt are L2-normalized in place, so the dot products
    # below are cosine similarities
    x_src /= np.linalg.norm(x_src, axis=1)[:, np.newaxis] + 1e-8
    x_tgt /= np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8
for i in range(0, len(idx_src), bsz):
e = min(i + bsz, len(idx_src))
scores = np.dot(x_tgt, x_src[idx_src[i:e]].T)
ind = np.argpartition(scores, -nn_size, axis=0)[-nn_size:]
        # scores has shape (n_tgt, batch); ind has shape (nn_size, batch)
for j in range(i, e):
nn_list[idx_src[j]] = list(
ind[:, j - i][np.argsort(scores[:, j - i][ind[:, j - i]])]
)[::-1]
return nn_list
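

# Illustrative usage sketch (not part of the original module): the embedding
# matrices, sizes, and indices below are made-up example values.
def _demo_compute_nn_list():
    rng = np.random.RandomState(0)
    x_src_demo = rng.randn(200, 50)   # 200 hypothetical source word vectors
    x_tgt_demo = rng.randn(300, 50)   # 300 hypothetical target word vectors
    nn_demo = compute_NN_list(x_src_demo, x_tgt_demo, idx_src=[3, 7, 42], nn_size=5)
    # nn_demo[3] lists the 5 target indices with the highest cosine similarity
    # to source word 3, best first
    return nn_demo
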
def compute_csls_coord(x_src, x_tgt, lexicon, lexicon_size=-1, k=10, bsz=1024, return_similarity=False):
    """
    Compute similarities and CSLS hubness penalties for all word vectors.
    Column l of a penalty matrix holds the average similarity to the top
    (k - l) nearest neighbors; column 0 is the standard CSLS penalty over
    the k nearest neighbors.
    """
if lexicon_size < 0:
lexicon_size = len(lexicon)
idx_src = list(lexicon.keys())
x_src /= np.linalg.norm(x_src, axis=1)[:, np.newaxis] + 1e-8
x_tgt /= np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8
sr = x_src[list(idx_src)]
similarities = 2 * np.dot(sr, x_tgt.T)
    # We compute the x_tgt penalty for k steps of nearest neighbors
x_tgt_penalty = np.zeros((x_tgt.shape[0], k))
for i in range(0, x_tgt.shape[0], bsz):
j = min(i + bsz, x_tgt.shape[0])
sc_batch = np.dot(x_tgt[i:j, :], x_src.T)
relevant_nn_dotprod = np.partition(sc_batch, -k, axis=1)[:, -k:]
relevant_nn_dotprod.sort()
for l in range(0, k):
x_tgt_penalty[i:j, l] = np.mean(relevant_nn_dotprod[:, l:], axis=1)
    # We compute the x_src penalty for k steps of nearest neighbors
x_src_penalty = np.zeros((sr.shape[0], k))
for i in range(0, sr.shape[0], bsz):
j = min(i + bsz, sr.shape[0])
sc_batch = np.dot(sr[i:j, :], x_tgt.T)
relevant_nn_dotprod = np.partition(sc_batch, -k, axis=1)[:, -k:]
relevant_nn_dotprod.sort()
for l in range(0, k):
x_src_penalty[i:j, l] = np.mean(relevant_nn_dotprod[:, l:], axis=1)
if return_similarity:
return similarities,x_src_penalty,x_tgt_penalty
else:
return x_src_penalty, x_tgt_penalty
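

# Illustrative sketch (not part of the original module) of how the outputs of
# compute_csls_coord combine into full CSLS scores,
# CSLS(x, y) = 2*cos(x, y) - r_src(x) - r_tgt(y).
# The lexicon and matrix sizes below are made-up example values.
def _demo_csls_scores():
    rng = np.random.RandomState(0)
    x_src_demo = rng.randn(200, 50)
    x_tgt_demo = rng.randn(300, 50)
    lexicon_demo = {3: {10}, 7: {25}, 42: {99}}   # hypothetical source id -> target ids
    sims, src_pen, tgt_pen = compute_csls_coord(
        x_src_demo, x_tgt_demo, lexicon_demo, k=10, return_similarity=True
    )
    # sims holds 2*cosine for each (lexicon source word, target word) pair;
    # subtracting the column-0 penalties gives the CSLS score matrix
    csls = sims - src_pen[:, 0][:, np.newaxis] - tgt_pen[:, 0][np.newaxis, :]
    return csls
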
def compute_csls_list(x_src, x_tgt, lexicon, idx_src, nn_size=10, lexicon_size=-1, k=10, bsz=1024):
similarities,x_src_penalty,x_tgt_penalty = compute_csls_coord(x_src, x_tgt,
lexicon,
lexicon_size=lexicon_size,
k=k,
bsz=bsz,
return_similarity=True)
    # x_src_penalty and x_tgt_penalty have shape [number of words, k].
    # We keep only the first column (the penalty for each word) and
    # subtract these penalties from the similarities.
    x_src_penalty = x_src_penalty[:, 0]
    x_tgt_penalty = x_tgt_penalty[:, 0]
    similarities -= x_src_penalty[:, np.newaxis]
    similarities -= x_tgt_penalty[np.newaxis, :]
    # We now have a matrix with similarities[i, j] = csls(src[i], tgt[j]).
    # Next, compute the list of the nearest neighbors (in terms of CSLS,
    # i.e. highest similarity) in target space for each source word.
csls_list = dict()
for i in range(len(idx_src)):
if (i % 1000 == 0):
print("Step {}".format(i))
# For the i-th source word, the csls similarity with each word from target :
scores = similarities[i,:]
ind = np.argpartition(scores, -nn_size, axis=0)[-nn_size:]
        # ind has shape (nn_size,); scores has shape (n_tgt,)
csls_list[idx_src[i]] = list(ind[np.argsort(scores[ind])])[::-1]
return csls_list
def compute_binary_distance(
x_src,
x_tgt,
file_name,
lexicon,
k_csls=10,
discard_empty_query=False,
testing_query=False,
add_csls_coord=True,
add_word_coord=False,
add_query_coord=False,
add_meta_features=True,
center_meta_features=False,
nn_size_meta_features=10,
min_relevance=1,
max_relevance=2,
bsz=100,
query_size=2,
use_csls = False
):
"""
Use a 0-1 loss for queries construction and save svmlight file
(or 1-2 as 0 is considered as not relevant and therefore, as they are many 0, the prediction
is nearly always 0)
To be modfied for exact query exctraction
"""
idx_src = list(lexicon.keys())
print("Expected query size: %d" % (query_size * len(idx_src)))
    # Compute the NN list or the CSLS-NN list
    if use_csls:
        nn_list = compute_csls_list(x_src, x_tgt, lexicon=lexicon, idx_src=idx_src, nn_size=query_size)
    else:
        nn_list = compute_NN_list(x_src, x_tgt, idx_src=idx_src, bsz=bsz, nn_size=query_size)
if add_meta_features == True:
target_nn_list=compute_target_NN_list(x_tgt,nn_size=nn_size_meta_features)
source_nn_list=compute_source_NN_list(x_src,idx_src,nn_size=nn_size_meta_features)
query_id = 0
file = open(file_name, "wb")
    # "wb": we are writing in binary mode
file.truncate(0)
if add_csls_coord == True:
x_src_penalty, x_tgt_penalty = compute_csls_coord(
x_src, x_tgt, lexicon, lexicon_size=-1, k=k_csls, bsz=1024
)
query_count = len(idx_src)
for ind_src, i in enumerate(idx_src):
# Whether or not to add ground truth words in each query.
# For the testing scenario, we do not want to force their
# presence whereas for the learning scenario, they are
# necessary
if discard_empty_query == True:
            # Do not write queries that have no relevant labels under (for now) the NN method
test = False
for e in lexicon[i]:
if e in nn_list[i]:
test = True
if test:
query_list = nn_list[i]
discard_curr_query = False
else:
discard_curr_query = True
query_count -=1
elif testing_query == True:
query_list = nn_list[i]
discard_curr_query = False
else:
            # We take the ground truth words and fill the rest with neighbors
            # so that the query has a fixed length of query_size
target = list(lexicon[i])
others_neigh = [elem for elem in nn_list[i] if elem not in lexicon[i]]
query_list = target + others_neigh[: query_size - len(lexicon[i])]
discard_curr_query = False
        # We compute the source word's neighborhood
if add_meta_features == True:
source_pt_neigh=source_nn_list[i]
source_pt_emb=x_src[source_pt_neigh]
if not discard_curr_query:
for ind, j in enumerate(query_list):
                # All of the below could be computed in parallel (not implemented)
total_coord = []
if add_csls_coord == True:
similarity = np.dot(x_src[i], x_tgt[j].T)
# We append NN similarity
total_coord.append(similarity)
# We append k CSLS similarity
total_coord = np.concatenate(
(
total_coord,
similarity - x_src_penalty[ind_src, :] - x_tgt_penalty[j, :],
),
axis=None,
)
if add_word_coord == True:
# We add the coord of the potential translation word
total_coord = np.concatenate((total_coord, x_tgt[j]), axis=None)
if add_query_coord == True:
# We add the coord of the query word
total_coord = np.concatenate((total_coord, x_src[i]), axis=None)
if add_meta_features == True:
target_pt_neigh=target_nn_list[j]
target_pt_emb=x_tgt[target_pt_neigh]
if center_meta_features==True:
                        # Align the target word on the source word by shifting
                        # the whole target neighborhood by their offset
                        target_pt_emb += source_pt_emb[0] - target_pt_emb[0]
#We compute mf for the two groups and for each single group
bigroup_mf=bigroup_meta_features(source_pt_emb,target_pt_emb)
target_mf=single_group_meta_features(target_pt_emb)
source_mf=single_group_meta_features(source_pt_emb)
total_coord = np.concatenate((total_coord, bigroup_mf,
target_mf, source_mf), axis=None)
                    # For a ground-truth translation, the relevance is max_relevance
if j in lexicon[i]:
line = svm_line(total_coord, query_id, max_relevance)
file.write(line)
                    # For the other nearest neighbors, the relevance is min_relevance
else:
line = svm_line(total_coord, query_id, min_relevance)
file.write(line)
query_id += 1
#To keep track of progress
if (query_id%100)==0:
print("Query number {} done out of {}".format(query_id,len(idx_src)))
print("Query kept {} out of {}".format(query_count,len(idx_src)))
file.close()
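

# Illustrative end-to-end sketch (not part of the original module): random
# embeddings, a tiny lexicon, and a hypothetical output path, used to write a
# small svmlight query file. All values below are made up for the example.
def _demo_compute_binary_distance():
    rng = np.random.RandomState(0)
    x_src_demo = rng.randn(200, 50)
    x_tgt_demo = rng.randn(300, 50)
    lexicon_demo = {3: {10}, 7: {25}, 42: {99}}   # hypothetical source id -> target ids
    compute_binary_distance(
        x_src_demo,
        x_tgt_demo,
        file_name="queries_demo.svmlight",   # hypothetical output path
        lexicon=lexicon_demo,
        query_size=4,
        add_meta_features=False,   # meta-features need pymfe and are slow; skipped here
    )
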
def svm_line(coord_list, query_id, relevance):
pairs = ["%d:%.16g" % (i, x) for i, x in enumerate(coord_list)]
sep_line = []
sep_line.append(str(relevance))
sep_line.append("qid:%d" % query_id)
sep_line.extend(pairs)
sep_line.append("\n")
line = " ".join(sep_line)
return line.encode("ascii")
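

# Illustrative sketch (not part of the original module) of the svmlight line
# produced by svm_line. Note that feature indices are written zero-based;
# some svmlight readers expect one-based indices.
def _demo_svm_line():
    line = svm_line([0.25, -1.5], query_id=7, relevance=2)
    # line == b"2 qid:7 0:0.25 1:-1.5 \n"
    return line
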
def bigroup_meta_features(source_pt_emb,target_pt_emb):
    # Label source rows with 0 and target rows with 1 so that MFE sees two groups
    y = [0] * source_pt_emb.shape[0] + [1] * target_pt_emb.shape[0]
    X = np.concatenate([source_pt_emb, target_pt_emb], axis=0)
# Extract several meta-features (more than for single group)
mfe=MFE(groups=["Statistical","complexity","concept","clustering"],suppress_warnings=True)
mfe.fit(X, y)
ft = mfe.extract()
feat_list =[]
interest_features = ['ch', 'cohesiveness.mean', 'cohesiveness.sd', 'conceptvar.mean',
'conceptvar.sd', 'cor.mean', 'cor.sd', 'cov.mean', 'cov.sd',
'eigenvalues.mean', 'eigenvalues.sd', 'f3.mean', 'f4.mean', 'gravity',
'impconceptvar.mean', 'impconceptvar.sd', 'int', 'iq_range.mean',
'iq_range.sd', 'kurtosis.mean', 'kurtosis.sd', 'mad.mean', 'mad.sd',
'max.mean', 'max.sd', 'mean.mean', 'mean.sd', 'median.mean',
'median.sd', 'min.mean', 'min.sd', 'nr_cor_attr', 'nr_norm',
'nr_outliers', 'pb', 'range.mean', 'range.sd', 'sd.mean', 'sd.sd',
'sil', 'skewness.mean', 'skewness.sd', 't4', 't_mean.mean', 't_mean.sd',
'var.mean', 'var.sd', 'vdb', 'vdu', 'wg_dist.mean', 'wg_dist.sd']
for feat,val in zip(ft[0],ft[1]):
if feat in interest_features:
feat_list.append(val)
#We add 3 extra "distances"
hung_dist=permutation_dist(source_pt_emb,target_pt_emb)
wass_dist=wasserstein_dist(source_pt_emb,target_pt_emb)
hauss_dist=hausdorff_dist(source_pt_emb,target_pt_emb)
feat_list.append(hung_dist)
feat_list.append(wass_dist)
feat_list.append(hauss_dist)
#return pd.Series(feat_list,index=interest_features+["hung_dist","wasser","hauss"])
return feat_list
def single_group_meta_features(X):
# Extract single group (source/target) features
features = ["cohesiveness","cor", "cov", "eigenvalues", "nr_cor_attr",
"min", "mean", "median", "max", "iq_range", "kurtosis",
"skewness", "t_mean", "var","sd", "range",
"nr_norm", "nr_outliers"]
mfe=MFE(features=features,suppress_warnings=True)
mfe.fit(X, [0]*X.shape[0])
ft = mfe.extract()
#return pd.Series(ft[1],index=ft[0])
return ft[1]
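

# Illustrative sketch (not part of the original module): meta-features for two
# small random neighborhoods. Sizes are made-up example values; pymfe, munkres
# and POT (ot) must be installed for this to run.
def _demo_meta_features():
    rng = np.random.RandomState(0)
    src_neigh = rng.randn(10, 50)   # hypothetical source word plus its 9 neighbors
    tgt_neigh = rng.randn(10, 50)   # hypothetical target word plus its 9 neighbors
    both = bigroup_meta_features(src_neigh, tgt_neigh)   # two-group features + 3 distances
    single = single_group_meta_features(tgt_neigh)       # one-group features
    return both, single
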
def hausdorff_dist(X1, X2):
    # Directed (asymmetric) Hausdorff distance from the points of X1 to those of X2
    return directed_hausdorff(X1, X2)[0]
def permutation_dist(X1, X2):
    # Minimum-cost one-to-one matching between the two point clouds
    # (Hungarian algorithm on squared Euclidean distances)
    cost = ((X1[np.newaxis, :] - X2[:, np.newaxis, :]) ** 2).sum(2)
matrix = cost.tolist()
m = munkres.Munkres()
newind = m.compute(matrix)
costnew = 0.
for (iold, inew) in newind:
costnew += cost[iold, inew]
return np.sqrt(costnew)
def wasserstein_dist(X1, X2):
    # Gromov-Wasserstein discrepancy between the two point clouds, computed from
    # their normalized intra-cloud distance matrices (POT library)
C1 = sp.spatial.distance.cdist(X1, X1)
C2 = sp.spatial.distance.cdist(X2, X2)
C1 /= C1.max()
C2 /= C2.max()
p = ot.unif(X1.shape[0])
q = ot.unif(X2.shape[0])
gw0, log0 = ot.gromov.gromov_wasserstein(
C1, C2, p, q, 'square_loss', verbose=False, log=True)
return log0['gw_dist']
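

# Illustrative sketch (not part of the original module): comparing the three
# point-cloud distances on tiny random sets. Shapes are made-up example values;
# munkres and POT (ot) must be installed.
def _demo_cloud_distances():
    rng = np.random.RandomState(0)
    A = rng.randn(8, 5)
    B = rng.randn(8, 5)
    return {
        "hungarian": permutation_dist(A, B),          # optimal one-to-one matching cost
        "gromov_wasserstein": wasserstein_dist(A, B),
        "hausdorff": hausdorff_dist(A, B),            # directed Hausdorff from A to B
    }
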
def compute_target_NN_list(x_tgt, bsz=100, nn_size=10):
"""
Compute the list of the nearest neighboors in target space
for each word in target words
Include the index of the origin word
"""
nn_list = dict()
x_tgt /= np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8
for i in range(0, len(x_tgt), bsz):
e = min(i + bsz, len(x_tgt))
scores = np.dot(x_tgt, x_tgt[i:e].T)
ind = np.argpartition(scores, -(nn_size+1), axis=0)[-(nn_size+1):]
for j in range(i, e):
nn_list[j] = list(
ind[:, j - i][np.argsort(scores[:, j - i][ind[:, j - i]])]
)[::-1]
return nn_list
def compute_source_NN_list(x_src, idx_src, bsz=100, nn_size=10):
"""
Compute the list of the nearest neighboors in source space
for each word in source words lexicon list
Include the index of the origin word
"""
nn_list = dict()
x_src /= np.linalg.norm(x_src, axis=1)[:, np.newaxis] + 1e-8
for i in range(0, len(idx_src), bsz):
e = min(i + bsz, len(idx_src))
scores = np.dot(x_src,x_src[idx_src[i:e]].T)
ind = np.argpartition(scores, -(nn_size+1), axis=0)[-(nn_size+1):]
for j in range(i, e):
nn_list[idx_src[j]] = list(
ind[:, j - i][np.argsort(scores[:, j - i][ind[:, j - i]])]
)[::-1]
return nn_list