-
Notifications
You must be signed in to change notification settings - Fork 3
/
fmsne_demo.py
402 lines (362 loc) · 21.9 KB
/
fmsne_demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
#! python3
# -*-coding:Utf-8 -*
import sklearn.datasets
import sklearn.manifold
import time
import fmsne
##############################
# Demo presenting how to use the main functions of the fmsne module.
####################
def split_data_and_labels(D_data, data_name, idx_data, n_data, module_name='fmsne'):
    """Split the object returned by a data set loader into samples and labels.

    Parameters
    ----------
    D_data : dict or tuple
        Loader output. A dict (or dict subclass) must provide the 'data'
        and 'target' keys; a tuple must unpack into exactly two elements
        (samples, labels).
    data_name : str
        Name of the data set, only used in the error message.
    idx_data : int
        Zero-based index of the data set, only used in the error message.
    n_data : int
        Total number of data sets, only used in the error message.
    module_name : str, optional
        Module name reported in the error message.

    Returns
    -------
    tuple
        (X_hds, labels).

    Raises
    ------
    ValueError
        If D_data is neither a dict nor a tuple (or is a tuple that does
        not hold exactly two elements).
    """
    if isinstance(D_data, dict):
        return D_data['data'], D_data['target']
    if isinstance(D_data, tuple):
        X_hds, labels = D_data
        return X_hds, labels
    raise ValueError("Error in the demo of module {module_name}: the data set '{data_name}' #{i}/{n_data} cannot be loaded.".format(module_name=module_name, data_name=data_name, i=idx_data+1, n_data=n_data))


def print_separator():
    """Print the three-line '===' separator closing each section of the demo."""
    for _ in range(3):
        print('===')


def print_runtime_hint(unit, duration):
    """Print the indicative running time of the next step.

    unit is 'seconds' or 'minutes'; duration is the measured time as text
    (e.g. '17 seconds'). The timings are the ones quoted by the demo for
    the processor named in the message.
    """
    print('This takes a few {unit} (i.e., around {duration} with a processor Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz 2.21GHz).'.format(unit=unit, duration=duration))


def run_timed(task):
    """Call task() without arguments, print the elapsed time and return its result.

    Everything evaluated inside task (including its arguments) is part of
    the measured time, so callers wrap the whole computation in a lambda.
    """
    t0 = time.time()
    result = task()
    t = time.time() - t0
    print('Done. It took {t} seconds.'.format(t=fmsne.rstr(t)))
    return result


def add_qa_curve(curves, rnx, marker, color, legend):
    """Register one quality curve and its plotting style for fmsne.viz_qa.

    curves is a dict of lists whose keys are the keyword names passed to
    fmsne.viz_qa (Ly, Lmarkers, Lcols, Lleg, Lls, Lmedw, Lsdots). All the
    curves of the demo share the same line style, line width and dot size.
    """
    curves['Ly'].append(rnx)
    curves['Lmarkers'].append(marker)
    curves['Lcols'].append(color)
    curves['Lleg'].append(legend)
    curves['Lls'].append('solid')
    curves['Lmedw'].append(0.5)
    curves['Lsdots'].append(10)


def assess_quality(method_name, X_ld, X_hds, dm_hd, Kup, reduced, hint=None):
    """Evaluate and report the DR quality of an LD embedding.

    Parameters
    ----------
    method_name : str
        Name of the DR method, as printed in the progress message.
    X_ld : LD embedding to assess.
    X_hds : HD data set (only used when reduced is True).
    dm_hd : pairwise HD distances (only used when reduced is False).
    Kup : largest neighborhood size (only used when reduced is True).
    reduced : bool
        If True, use fmsne.red_rnx_auc; otherwise use
        fmsne.eval_dr_quality on the HD and LD pairwise distances.
    hint : tuple or None
        Optional (unit, duration) pair forwarded to print_runtime_hint.

    Returns
    -------
    tuple
        (rnx, auc) as returned by the fmsne quality function.
    """
    if reduced:
        print('- Evaluating the reduced DR quality of the LD embedding obtained using {m}'.format(m=method_name))

        def task():
            return fmsne.red_rnx_auc(X_hds=X_hds, X_lds=X_ld, Kup=Kup)
    else:
        print('- Evaluating the DR quality of the LD embedding obtained using {m}'.format(m=method_name))

        def task():
            # The LD pairwise distances are computed inside the timed
            # section.
            return fmsne.eval_dr_quality(d_hd=dm_hd, d_ld=fmsne.eucl_dist_matr(X_ld))
    if hint is not None:
        print_runtime_hint(*hint)
    rnx, auc = run_timed(task)
    print('AUC: {v}'.format(v=fmsne.rstr(auc, 4)))
    return rnx, auc


def show_embedding(X_ld, labels, method_name, legend, data_name):
    """Plot a 2-D LD embedding colored by labels using fmsne.viz_2d_emb."""
    print('- Plotting the LD embedding obtained using {m}'.format(m=method_name))
    print('If a figure is shown, close it to continue.')
    # TIP: you can save the produced plot by specifying a path for the
    # figure in the fname parameter of the following line. The format of
    # the figure can be specified through the f_format parameter. Check
    # the documentation of the save_show_fig function for more
    # information.
    fmsne.viz_2d_emb(X=X_ld, vcol=labels, tit='LD embedding {legend} ({data_name} data set)'.format(legend=legend, data_name=data_name), fname=None, f_format=None)


if __name__ == '__main__':
    import inspect

    print("==============================================")
    print("===== Starting the demo of fast_ms_ne.py =====")
    print("==============================================")

    # List of tuples. There is one tuple per considered data set in
    # this demo. The first element of each tuple is a function
    # enabling to load the data set, while the second element of each
    # tuple is a string storing a name for the associated data set.
    L_data = [
        (sklearn.datasets.load_digits, 'Digits'),
        (lambda: sklearn.datasets.make_blobs(n_samples=11000, n_features=12, centers=22, cluster_std=1.0, center_box=(-10.0, 10.0), shuffle=True, random_state=3), 'Blobs'),
    ]
    n_data = len(L_data)

    # Boolean. Whether or not to plot the LD embeddings as they are
    # computed in the demo.
    plot_emb = True

    # Largest neighborhood size to consider when employing the
    # 'red_rnx_auc' function for the reduced quality assessment in the
    # demo.
    Kup = 10000

    # Recent scikit-learn releases name the iteration budget of TSNE
    # 'max_iter' while older ones name it 'n_iter'. Use whichever keyword
    # the installed version accepts so that the demo runs on both.
    tsne_iter_name = 'max_iter' if 'max_iter' in inspect.signature(sklearn.manifold.TSNE).parameters else 'n_iter'
    tsne_iter_kwargs = {tsne_iter_name: 1000}

    # For each data set
    for idx_data, (data_load, data_name) in enumerate(L_data):
        print("=====")
        print("==== Data set #{i}/{n_data}: {data_name}".format(i=idx_data+1, n_data=n_data, data_name=data_name))
        print("=====")

        print('- Loading the HD data set')
        # TIP: to change the employed data set, you just need to
        # modify the next code line to provide different values for
        # X_hds and labels. Afterwards, only X_hds is employed to
        # compute the LD embeddings. The labels are only used to plot
        # the obtained LD embeddings using colors.
        X_hds, labels = split_data_and_labels(data_load(), data_name, idx_data, n_data)

        # Number of samples and dimension of the HD data set
        N_samp, M_HD = X_hds.shape
        print("Number of data samples: {N_samp}".format(N_samp=N_samp))
        print("HDS dimension: {M_HD}".format(M_HD=M_HD))

        # Targeted dimension of the LD embeddings
        dim_LDS = 2
        print("Targeted LDS dimension: {dim_LDS}".format(dim_LDS=dim_LDS))

        # Whether the currently considered data set is big in terms of
        # its number of samples or not.
        big_data = (N_samp >= 10000)
        if big_data:
            print('The data set is big in terms of its number of samples.')
            print('Multi-scale SNE, multi-scale t-SNE and t-SNE are hence not applied; only their fast versions are employed (fast multi-scale SNE, fast multi-scale t-SNE and Barnes-Hut t-SNE).')
            print('The reduced DR quality is evaluated; it means that the R_{NX}(K) curve is computed only for K=1 to Kup={Kup}, and that the AUC refers to the area under this reduced curve, with a log scale for K, instead of the full one for K=1 to N-2={v}, with N being the number of data samples.'.format(Kup=Kup, v=N_samp-2, NX='{NX}'))
        else:
            print('The data set is moderate in terms of its number of samples.')
            print('Multi-scale SNE, multi-scale t-SNE and t-SNE are hence applied, as well as their fast versions (fast multi-scale SNE, fast multi-scale t-SNE and Barnes-Hut t-SNE).')
            print('The DR quality is completely evaluated; the R_{NX}(K) curve is computed for K=1 to N-2={v}, with N being the number of data samples, and the AUC refers to the area under this curve with a log scale for K.'.format(v=N_samp-2, NX='{NX}'))
        print_separator()

        # fmsne.eucl_dist_matr() is used to compute a 2-D numpy array
        # containing the pairwise distances in a data set, if it is not
        # too big in terms of its number of samples. This function is
        # used to compute the HD and LD distances for the DR quality
        # assessment when the data set is of moderate size.
        #
        # Note that in all DR methods employed in this code (multi-scale
        # SNE, multi-scale t-SNE, t-SNE, fast multi-scale SNE, fast
        # multi-scale t-SNE, Barnes-Hut t-SNE), the LD embedding is
        # computed using Euclidean distances in the LD space.

        # Lists to provide as parameters to viz_qa, to visualize the DR
        # quality assessment. The keys are the keyword names of viz_qa.
        qa_curves = {key: [] for key in ('Ly', 'Lmarkers', 'Lcols', 'Lleg', 'Lls', 'Lmedw', 'Lsdots')}

        # If the data set is not too big, we can compute all the
        # pairwise HD distances between its samples. Otherwise dm_hd
        # stays None and only the reduced quality assessment is used.
        dm_hd = None
        if not big_data:
            print('- Computing the pairwise Euclidean distances in the HD data set')
            dm_hd = run_timed(lambda: fmsne.eucl_dist_matr(X_hds))
            print_separator()

        # Initialization type of the LD embedding. Check the
        # 'init_ld_emb' function for details. Note that you can
        # provide the LD coordinates to use for the initialization by
        # setting init_ld_emb to a 2-D numpy.ndarray containing the
        # initial LD positions, with one example per row and one LD
        # dimension per column, init_ld_emb[i,:] containing the
        # initial LD coordinates related to the HD sample X_hds[i,:].
        init_ld_emb = 'pca'

        # Applying multi-scale t-SNE if the data set is not too big,
        # i.e. it is limited to a few thousands samples.
        if not big_data:
            print('- Applying multi-scale t-SNE on the data set to obtain a {dim_LDS}-D embedding'.format(dim_LDS=dim_LDS))
            if data_name == 'Digits':
                print_runtime_hint('seconds', '17 seconds')
            X_ld_mstsne = run_timed(lambda: fmsne.mstsne(X_hds=X_hds, n_components=dim_LDS, init=init_ld_emb, rand_state=fmsne.np.random.RandomState(2)))
            rnx_mstsne, auc_mstsne = assess_quality('multi-scale t-SNE', X_ld_mstsne, X_hds, dm_hd, Kup, reduced=False, hint=('seconds', '1 second') if data_name == 'Digits' else None)
            add_qa_curve(qa_curves, rnx_mstsne, '^', 'blue', 'Ms $t$-SNE')
            if plot_emb:
                show_embedding(X_ld_mstsne, labels, 'multi-scale t-SNE', 'Ms $t$-SNE', data_name)
            print_separator()

        # Applying multi-scale SNE if the data set is not too big, i.e.
        # it is limited to a few thousands samples.
        if not big_data:
            print('- Applying multi-scale SNE on the data set to obtain a {dim_LDS}-D embedding'.format(dim_LDS=dim_LDS))
            if data_name == 'Digits':
                print_runtime_hint('minutes', '2.5 minutes')
            X_ld_mssne = run_timed(lambda: fmsne.mssne(X_hds=X_hds, n_components=dim_LDS, init=init_ld_emb, rand_state=fmsne.np.random.RandomState(2)))
            rnx_mssne, auc_mssne = assess_quality('multi-scale SNE', X_ld_mssne, X_hds, dm_hd, Kup, reduced=False)
            add_qa_curve(qa_curves, rnx_mssne, 'x', 'red', 'Ms SNE')
            if plot_emb:
                show_embedding(X_ld_mssne, labels, 'multi-scale SNE', 'Ms SNE', data_name)
            print_separator()

        # Applying t-SNE [7] if the data set is not too big, i.e. it is
        # limited to a few thousands samples.
        if not big_data:
            print('- Applying t-SNE on the data set to obtain a {dim_LDS}-D embedding'.format(dim_LDS=dim_LDS))
            if data_name == 'Digits':
                print_runtime_hint('minutes', '2 minutes')
            X_ld_tsne = run_timed(lambda: sklearn.manifold.TSNE(n_components=dim_LDS, perplexity=50.0, early_exaggeration=4.0, learning_rate=100.0, min_grad_norm=10.0**(-5), random_state=fmsne.np.random.RandomState(2), metric='euclidean', init=init_ld_emb, method='exact', **tsne_iter_kwargs).fit_transform(X_hds))
            rnx_tsne, auc_tsne = assess_quality('t-SNE', X_ld_tsne, X_hds, dm_hd, Kup, reduced=False)
            add_qa_curve(qa_curves, rnx_tsne, '|', 'black', '$t$-SNE')
            if plot_emb:
                show_embedding(X_ld_tsne, labels, 't-SNE', '$t$-SNE', data_name)
            print_separator()

        # The running-time hint of the reduced quality assessment is only
        # known for the big 'Blobs' data set.
        is_big_blobs = big_data and data_name == 'Blobs'

        # Fast multi-scale t-SNE can be employed on very large-scale
        # databases.
        print('- Applying fast multi-scale t-SNE on the data set to obtain a {dim_LDS}-D embedding'.format(dim_LDS=dim_LDS))
        if data_name == 'Blobs':
            print_runtime_hint('seconds', '32 seconds')
        elif data_name == 'Digits':
            print_runtime_hint('seconds', '3 seconds')
        X_ld_fmstsne = run_timed(lambda: fmsne.fmstsne(X_hds=X_hds, n_components=dim_LDS, init=init_ld_emb, rand_state=fmsne.np.random.RandomState(2), bht=0.75, fseed=1))
        rnx_fmstsne, auc_fmstsne = assess_quality('fast multi-scale t-SNE', X_ld_fmstsne, X_hds, dm_hd, Kup, reduced=big_data, hint=('seconds', '34 seconds') if is_big_blobs else None)
        add_qa_curve(qa_curves, rnx_fmstsne, 's', 'cyan', 'FMs $t$-SNE')
        if plot_emb:
            show_embedding(X_ld_fmstsne, labels, 'fast multi-scale t-SNE', 'FMs $t$-SNE', data_name)
        print_separator()

        # Fast multi-scale SNE can be employed on very large-scale
        # databases.
        print('- Applying fast multi-scale SNE on the data set to obtain a {dim_LDS}-D embedding'.format(dim_LDS=dim_LDS))
        if data_name == 'Blobs':
            print_runtime_hint('minutes', '15 minutes')
        elif data_name == 'Digits':
            print_runtime_hint('minutes', '1.25 minutes')
        X_ld_fmssne = run_timed(lambda: fmsne.fmssne(X_hds=X_hds, n_components=dim_LDS, init=init_ld_emb, rand_state=fmsne.np.random.RandomState(2), bht=0.45, fseed=1))
        rnx_fmssne, auc_fmssne = assess_quality('fast multi-scale SNE', X_ld_fmssne, X_hds, dm_hd, Kup, reduced=big_data, hint=('seconds', '33 seconds') if is_big_blobs else None)
        add_qa_curve(qa_curves, rnx_fmssne, '$\\star$', 'magenta', 'FMs SNE')
        if plot_emb:
            show_embedding(X_ld_fmssne, labels, 'fast multi-scale SNE', 'FMs SNE', data_name)
        print_separator()

        # Barnes-Hut (BH) t-SNE [8] can be employed on very large-scale
        # databases.
        print('- Applying Barnes-Hut (BH) t-SNE on the data set to obtain a {dim_LDS}-D embedding'.format(dim_LDS=dim_LDS))
        if data_name == 'Blobs':
            print_runtime_hint('minutes', '5 minutes')
        elif data_name == 'Digits':
            print_runtime_hint('seconds', '39 seconds')
        X_ld_bhtsne = run_timed(lambda: sklearn.manifold.TSNE(n_components=dim_LDS, perplexity=50.0, early_exaggeration=12.0, learning_rate=200.0, min_grad_norm=10.0**(-5), random_state=fmsne.np.random.RandomState(2), metric='euclidean', init=init_ld_emb, method='barnes_hut', angle=0.5, **tsne_iter_kwargs).fit_transform(X_hds))
        rnx_bhtsne, auc_bhtsne = assess_quality('BH t-SNE', X_ld_bhtsne, X_hds, dm_hd, Kup, reduced=big_data, hint=('seconds', '33 seconds') if is_big_blobs else None)
        add_qa_curve(qa_curves, rnx_bhtsne, 'o', 'green', 'BH $t$-SNE')
        if plot_emb:
            show_embedding(X_ld_bhtsne, labels, 'BH t-SNE', 'BH $t$-SNE', data_name)
        print_separator()

        print('- Plotting the results of the DR quality assessment')
        print('If a figure is shown, close it to continue.')
        # TIP: you can save the produced plot by specifying a path for
        # the figure in the fname parameter of the following line. The
        # format of the figure can be specified through the f_format
        # parameter. Check the documentation of the save_show_fig
        # function for more information.
        fmsne.viz_qa(tit='DR quality', xlabel='Neighborhood size $K$', ylabel='$R_{\\mathrm{{NX}}}(K)$', fname=None, f_format=None, ncol_leg=2, **qa_curves)
        print_separator()

    print('*********************')
    print('***** Done! :-) *****')
    print('*********************')