-
Notifications
You must be signed in to change notification settings - Fork 0
/
Toy_experiment.py
430 lines (378 loc) · 18.9 KB
/
Toy_experiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
import get_data_from_101 as get_data
import Create_more_metrics
import copy
import os
import pickle
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn import neighbors
from sklearn import ensemble
from sklearn.tree import ExtraTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import r2_score
from scipy.stats import kendalltau
from tensorflow.python.keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
# Candidate regressors. Every estimator uses the scikit-learn defaults except
# the random forest, which is built with 230 trees.
model_decision_tree_regression = tree.DecisionTreeRegressor()
model_linear_regression = LinearRegression()
model_svm = svm.SVR()
model_k_neighbor = neighbors.KNeighborsRegressor()
model_random_forest_regressor = ensemble.RandomForestRegressor(n_estimators=230)
model_adaboost_regressor = ensemble.AdaBoostRegressor()
model_gradient_boosting_regressor = ensemble.GradientBoostingRegressor()
model_bagging_regressor = ensemble.BaggingRegressor()
model_extra_tree_regressor = ExtraTreeRegressor()
model_gaussian_process_regressor = GaussianProcessRegressor()
model_MLP_regressor = MLPRegressor()

# (display name, estimator) pairs; `method` and `model` are derived from this
# single table so that index i always refers to the same regressor in both.
_NAMED_REGRESSORS = (
    ('decision_tree', model_decision_tree_regression),
    ('linear_regression', model_linear_regression),
    ('svm', model_svm),
    ('knn', model_k_neighbor),
    ('random_forest', model_random_forest_regressor),
    ('adaboost', model_adaboost_regressor),
    ('GBRT', model_gradient_boosting_regressor),
    ('Bagging', model_bagging_regressor),
    ('ExtraTree', model_extra_tree_regressor),
    ('Gaussian_Process', model_gaussian_process_regressor),
    ('MLP', model_MLP_regressor),
)
model = [regressor for _, regressor in _NAMED_REGRESSORS]
method = [name for name, _ in _NAMED_REGRESSORS]

# MAX full training time: divisor used to normalize 'final_training_time'
MAX_FTT = 5521.803059895833
# MAX trainable parameters: divisor used to normalize 'trainable_parameters'
MAX_TP = 49979274
def Random_forestAndmore_test(method, x_train, y_train, x_test, y_test, more_x_test, more_y_test, parameter_scale,
                              step=10):
    """Run the ablation sweep on the original and the enlarged test set and compare them.

    The sweep is executed twice through Ablation_study: once on
    (x_test, y_test) and once on (more_x_test, more_y_test) with
    more_test_data=True. The four resulting lists are pickled to
    pkl/ktau_and_mse_list.pkl and plotted side by side (KTau left, MSE right).

    Relies on Ablation_study returning the pair (KTau_list, MSE_list).

    :param method: method name forwarded to Ablation_study
    :param x_train, y_train: training features / targets
    :param x_test, y_test: original test features / targets
    :param more_x_test, more_y_test: enlarged test features / targets
    :param parameter_scale: sequence whose first and last items bound the swept values
    :param step: spacing between swept values
    """
    print('First run without more test data')
    no_more_test_KTau_list, no_more_test_MSE_list = Ablation_study(method, x_train, y_train, x_test, y_test,
                                                                   parameter_scale, step, more_test_data=False,
                                                                   show_figure=False)
    print('Second run with more test data')
    more_test_KTau_list, more_test_MSE_list = Ablation_study(method, x_train, y_train, more_x_test, more_y_test,
                                                             parameter_scale, step, more_test_data=True,
                                                             show_figure=False)

    # save KTau and MSE list
    os.makedirs('pkl', exist_ok=True)
    # os.path.join keeps the file inside the 'pkl' directory on every platform;
    # a literal backslash is only a path separator on Windows.
    save_path = os.path.join('pkl', 'ktau_and_mse_list.pkl')
    save_dic = {'no_more_test_KTau_list': no_more_test_KTau_list, 'no_more_test_MSE_list': no_more_test_MSE_list,
                'more_test_KTau_list': more_test_KTau_list, 'more_test_MSE_list': more_test_MSE_list}
    with open(save_path, 'wb') as file:
        pickle.dump(save_dic, file)
    print('Save KTau and MSE list successfully!')

    # Because running the above code is a time-consuming process. To speed up the process,
    # we recommend using the function make_plot_for_KTau_and_MSE with the saving data.

    # The first four settings are left out of the figure. The x values must be
    # trimmed by the same amount as the y values, otherwise the two sequences
    # handed to plt.plot have different lengths.
    skipped = 4
    x_values = np.arange(parameter_scale[0], parameter_scale[-1], step)[skipped:]

    panels = (
        (1, "KTau", no_more_test_KTau_list, more_test_KTau_list),
        (2, "MSE", no_more_test_MSE_list, more_test_MSE_list),
    )
    plt.figure(figsize=(13, 5))
    for position, y_label, original_values, enlarged_values in panels:
        plt.subplot(1, 2, position)
        plt.plot(x_values, original_values[skipped:], "hotpink",
                 linestyle=':', marker='^', label="original test")
        plt.plot(x_values, enlarged_values[skipped:], "skyblue",
                 linestyle=':', marker='D', label="committee prediction")
        plt.xlabel("number of estimators")
        plt.ylabel(y_label)
        plt.legend(loc="best")
    plt.show()
def make_plot_for_KTau_and_MSE(KTau_list, MSE_list):
    """Draw previously computed KTau and MSE curves in two stacked panels.

    Both lists are plotted against np.arange(10, 310, 10), i.e. the settings
    10, 20, ..., 300.
    NOTE(review): an earlier comment described the range as 50..300; confirm
    which range the lists passed by callers actually cover.

    :param KTau_list: KTau values, one per setting (upper panel)
    :param MSE_list: MSE values, one per setting (lower panel)
    """
    # don't change the step
    first_setting, stop_setting, spacing = 10, 310, 10
    panels = (
        (1, KTau_list, "hotpink", '^', "KTau"),
        (2, MSE_list, "skyblue", 'D', "MSE"),
    )
    for position, values, colour, marker_shape, y_label in panels:
        plt.subplot(2, 1, position)
        plt.plot(np.arange(first_setting, stop_setting, spacing), values, colour, linestyle=':',
                 marker=marker_shape)
        plt.xlabel("number of estimators")
        plt.ylabel(y_label)
    plt.show()
def _rank_positions(values):
    """Return, for every element of *values*, its position in ascending sorted order."""
    order = np.argsort(values)
    ranks = np.zeros(len(order))
    ranks[order] = np.arange(len(order))
    return ranks


def Ablation_study(method, x_train, y_train, x_test, y_test, parameter_scale, step=1, more_test_data=False,
                   show_figure=True, upper_tri_num=120):
    """Sweep one integer hyper-parameter of a regressor and record KTau / MSE for each value.

    For every value in range(parameter_scale[0], parameter_scale[-1], step) a
    fresh model is fitted on the training data and evaluated on the test data.
    The swept parameter is n_neighbors for 'knn' and n_estimators for
    'random_forest', 'GBRT' and 'Bagging'.

    When more_test_data is true, the test rows are treated as consecutive
    groups of equivalent representations of one architecture: predictions are
    averaged per group and compared with the ground truth of the group's
    first row.

    The collected lists are pickled to pkl/ktau_and_mse_list.pkl and, if
    requested, plotted.

    :param method: one of 'random_forest', 'knn', 'GBRT', 'Bagging'
    :param x_train, y_train: training features / targets
    :param x_test, y_test: test features / targets
    :param parameter_scale: sequence whose first and last items bound the swept values
    :param step: spacing between swept values
    :param more_test_data: average predictions over groups of equivalent rows
    :param show_figure: plot KTau and MSE against the swept values
    :param upper_tri_num: list of per-architecture group sizes, or a single int
        used as the size of every group (default 120)
    :return: (KTau_list, MSE_list); each entry is a one-element list
    :raises ValueError: if *method* is not supported
    """
    builders = {
        'random_forest': lambda value: ensemble.RandomForestRegressor(n_estimators=value),
        'knn': lambda value: neighbors.KNeighborsRegressor(n_neighbors=value),
        'GBRT': lambda value: ensemble.GradientBoostingRegressor(n_estimators=value),
        'Bagging': lambda value: ensemble.BaggingRegressor(n_estimators=value),
    }
    KTau_list, MSE_list = [], []
    print(f'Ablation study, method: {method}')

    # The grouping of test rows does not depend on the swept value.
    group_sizes = None
    if more_test_data:
        if isinstance(upper_tri_num, list):
            group_sizes = upper_tri_num
        else:
            # one metric can produce 5! = 120 representations of the same
            # architecture, hence the default group size of 120
            group_sizes = [upper_tri_num] * (len(x_test) // upper_tri_num)

    for parameter in range(parameter_scale[0], parameter_scale[-1], step):
        if method not in builders:
            raise ValueError(f'Unsupported method for ablation study: {method!r}')
        Model = builders[method](parameter)
        Model.fit(x_train, y_train)
        result = Model.predict(x_test)
        targets = y_test

        if more_test_data:
            if isinstance(upper_tri_num, list):
                print('Select upper tri matrix!!!')
            mean_result, first_ground_truth = [], []
            start = 0
            for size in group_sizes:
                mean_result.append(np.mean(result[start:start + size]))
                # every row of a group shares the ground truth of its first row
                first_ground_truth.append(y_test[start])
                start += size
            result = mean_result
            targets = first_ground_truth

        result = list(result)
        score = r2_score(targets, result)
        KTau, _ = kendalltau(_rank_positions(result), _rank_positions(targets))
        MSE = calculate_MSE(targets, result)
        print(
            'parameter: {:}, KTau: {:}, MSE: {:}, R2score: {:}'.format(parameter, KTau, MSE, score))
        MSE_list.append([MSE])
        KTau_list.append([KTau])

    # save KTau and MSE list
    os.makedirs('pkl', exist_ok=True)
    # os.path.join keeps the file inside the 'pkl' directory on every platform;
    # a literal backslash is only a path separator on Windows.
    save_path = os.path.join('pkl', 'ktau_and_mse_list.pkl')
    save_dic = {'KTau_list': KTau_list, 'MSE_list': MSE_list}
    with open(save_path, 'wb') as file:
        pickle.dump(save_dic, file)
    print('Save KTau and MSE list successfully!')

    if show_figure:
        x_values = np.arange(parameter_scale[0], parameter_scale[-1], step)
        plt.subplot(2, 1, 1)
        plt.plot(x_values, KTau_list, "hotpink", linestyle=':', marker='^')
        plt.xlabel("number of estimators")
        plt.ylabel("KTau")
        plt.subplot(2, 1, 2)
        plt.plot(x_values, MSE_list, "skyblue", linestyle=':', marker='D')
        plt.xlabel("number of estimators")
        plt.ylabel("MSE")
        plt.title(f"Different number of estimators in {method}")
        plt.show()

    # Callers such as Random_forestAndmore_test unpack these two lists.
    return KTau_list, MSE_list
def _build_input_metrics(adjacent_matrix, module_integers, trainable_parameters, final_training_time,
                         integers2one_hot, additional_metrics, normalization):
    """Turn one architecture description into a flat feature list."""
    adjacent_matrix = get_data.delete_margin(adjacent_matrix)
    input_metrics = []
    input_metrics.extend(adjacent_matrix.flatten())
    if integers2one_hot:
        # 4 classes per operation, flattened into one vector
        module_integers = to_categorical(module_integers, 4, dtype='int8').flatten()
    input_metrics.extend(module_integers)
    if additional_metrics:
        if normalization:
            trainable_parameters = trainable_parameters / MAX_TP
            final_training_time = final_training_time / MAX_FTT
        input_metrics.extend([trainable_parameters, final_training_time])
    return input_metrics


def get_toy_data(important_metrics, create_more_metrics=True, select_upper_tri=False, max_creation=-1,
                 integers2one_hot=True, additional_metrics=False, normalization=True):
    """Build regression inputs and targets from architecture metrics.

    Each feature row is the flattened adjacency matrix (after
    get_data.delete_margin) followed by the operations (one-hot encoded when
    integers2one_hot is true) and, when additional_metrics is true, the
    trainable-parameter count and the final training time (divided by MAX_TP
    and MAX_FTT when normalization is true).

    With create_more_metrics, every architecture is expanded through
    Create_more_metrics.create_new_metrics and each produced variant becomes
    its own row carrying the accuracy of the source architecture.

    :param important_metrics: mapping from index to a record with the keys
        'fixed_metrics', 'final_training_time' and 'final_test_accuracy'
    :param create_more_metrics: expand every architecture into its variants
    :param select_upper_tri: forwarded to Create_more_metrics.create_new_metrics
    :param max_creation: forwarded to Create_more_metrics.create_new_metrics
    :param integers2one_hot: one-hot encode the operation integers
    :param additional_metrics: append trainable parameters and training time
    :param normalization: normalize the additional metrics
    :return: (X, y, more_metrics_num) where X is the list of feature rows,
        y the list of accuracies, and more_metrics_num the number of variants
        created per architecture (empty when create_more_metrics is false)
    """
    X = []
    y = []
    more_metrics_num = []
    for index in important_metrics:
        record = important_metrics[index]
        fixed_metrics = record['fixed_metrics']
        adjacent_matrix = fixed_metrics['module_adjacency']
        module_integers = fixed_metrics['module_integers']
        trainable_parameters = fixed_metrics['trainable_parameters']
        final_training_time = record['final_training_time']
        accuracy = record['final_test_accuracy']

        if create_more_metrics:
            more_metrics = Create_more_metrics.create_new_metrics(adjacent_matrix, module_integers, select_upper_tri,
                                                                  max_creation, InOut=False)
            more_metrics_num.append(len(more_metrics))
            representations = [(variant['module_adjacency'], variant['module_integers'])
                               for variant in more_metrics]
        else:
            representations = [(adjacent_matrix, module_integers)]

        for variant_adjacency, variant_integers in representations:
            X.append(_build_input_metrics(variant_adjacency, variant_integers, trainable_parameters,
                                          final_training_time, integers2one_hot, additional_metrics,
                                          normalization))
            y.append(accuracy)

    print('Input {:} metrics, obtain {:} metrics'.format(len(important_metrics), len(X)))
    return X, y, more_metrics_num
def get_upper_triangular_data(important_metrics, integers2one_hot=True, double_upper=True, additional_metrics=True,
                              normalization=True):
    """Build regression inputs from the upper-triangular part of each adjacency matrix.

    For every architecture the adjacency matrix is first passed through
    get_data.delete_margin. Row i then contributes its entries from column i
    onwards; with double_upper, column i additionally contributes its entries
    from row 0 up to and including row i. The operations (one-hot encoded when
    integers2one_hot is true) and, optionally, the trainable-parameter count
    and final training time are appended.
    NOTE(review): an earlier comment stated that the main diagonal is not
    included; the slices taken here start at the diagonal of the trimmed
    matrix - confirm against get_data.delete_margin.

    :param important_metrics: mapping from index to a record with the keys
        'fixed_metrics', 'final_training_time' and 'final_test_accuracy'
    :param integers2one_hot: one-hot encode the operation integers (4 classes)
    :param double_upper: also append the column-wise traversal
    :param additional_metrics: append trainable parameters and training time
    :param normalization: divide the additional metrics by MAX_TP / MAX_FTT
    :return: (X, y) lists of feature rows and accuracies
    """
    X, y = [], []
    for key in important_metrics:
        record = important_metrics[key]
        fixed = record['fixed_metrics']

        operations = fixed['module_integers']
        if integers2one_hot:
            operations = to_categorical(operations, 4, dtype='int8').flatten()

        param_count = fixed['trainable_parameters']
        training_time = record['final_training_time']

        trimmed = get_data.delete_margin(fixed['module_adjacency'])
        trimmed_array = np.array(trimmed)
        size = len(trimmed)

        features = []
        # row-wise traversal: row r from column r to the end
        for row in range(size):
            features.extend(trimmed[row][row:])
        if double_upper:
            # column-wise traversal: column c from the top down to row c
            for col in range(size):
                features.extend(trimmed_array[:col + 1, col].tolist())
        features.extend(operations)

        if additional_metrics:
            if normalization:
                param_count = param_count / MAX_TP
                training_time = training_time / MAX_FTT
            features.append(param_count)
            features.append(training_time)

        X.append(features)
        y.append(record['final_test_accuracy'])
    return X, y
def get_toy_metrics(num, type='train', train_num=2000):
    """Fetch architecture metrics through the get_data helpers and preprocess them.

    The selected indices are resolved to metrics, which are then passed
    through get_data.padding_zero_in_matrix and get_data.operations2integers.

    :param num: forwarded to get_data.get_data_index_from_101
    :param type: split selector forwarded to the get_data helpers
    :param train_num: forwarded to get_data.get_data_index_from_101
    :return: the preprocessed metrics
    """
    selected = get_data.get_data_index_from_101(num, type=type, train_num=train_num)
    raw_metrics = get_data.get_corresponding_metrics_by_index(selected, type=type)
    return get_data.operations2integers(get_data.padding_zero_in_matrix(raw_metrics))
def calculate_MSE(x, y):
    """Return the mean squared error between two equally shaped sequences.

    The measure is symmetric, so it does not matter which argument holds the
    predictions and which the ground truth.

    :param x: sequence of numbers
    :param y: sequence of numbers with the same shape as *x*
    :return: mean of the element-wise squared differences
    :raises ValueError: if the two inputs differ in shape (a pairwise loop
        would silently ignore the surplus elements)
    """
    first = np.asarray(x, dtype=float)
    second = np.asarray(y, dtype=float)
    if first.shape != second.shape:
        raise ValueError(
            'calculate_MSE needs inputs of equal shape, got {} and {}'.format(first.shape, second.shape))
    return np.mean((first - second) ** 2)
def try_different_method(x_train, y_train, x_test, y_test, model, method, show_fig=True, return_flag=False):
    """Fit one regressor, report KTau / MSE / R2 on the test set and optionally plot the result.

    :param x_train, y_train: training features / targets
    :param x_test, y_test: test features / targets
    :param model: estimator exposing fit() and predict()
    :param method: display name used in the printed line and the plot titles
    :param show_fig: draw a prediction-vs-truth scatter plot and a rank-vs-rank scatter plot
    :param return_flag: return (KTau, MSE) instead of None
    :return: (KTau, MSE) when return_flag is true, otherwise None
    """
    model.fit(x_train, y_train)
    result = list(model.predict(x_test))
    score = r2_score(y_test, result)

    # Convert values into their positions in ascending order.
    sample_count = len(y_test)
    result_rank = np.zeros(sample_count)
    y_test_rank = np.zeros(sample_count)
    result_rank[np.argsort(result)] = np.arange(sample_count)
    y_test_rank[np.argsort(y_test)] = np.arange(sample_count)

    KTau, _ = kendalltau(result_rank, y_test_rank)
    MSE = calculate_MSE(y_test, result)
    print('method: {:}, KTau: {:}, MSE: {:}, R2score: {:}'.format(method, KTau, MSE, score))
    print('--------------------try-end---------------------\n')

    if show_fig:
        # predicted value against ground truth, with the identity line as reference
        x = np.arange(0, 1, 0.01)
        y = x
        plt.figure(figsize=(5, 5))
        plt.plot(x, y, 'g', label='y_test = result')
        plt.scatter(result, y_test, s=1)
        plt.xlabel("predict_result")
        plt.ylabel("y_test")
        plt.title(f"method:{method}---score:{score}")
        plt.legend(loc="best")
        plt.show()

        # predicted rank against true rank
        x = np.arange(0, sample_count, 0.1)
        y = x
        plt.figure(figsize=(6, 6))
        line_color = '#1F77D0'
        plt.plot(x, y, c=line_color, linewidth=1)
        point_color = '#FF4400'
        plt.scatter(result_rank, y_test_rank, c=point_color, s=2)
        plt.xlabel("predict_result")
        plt.ylabel("y_test")
        plt.title(f"method:{method}---KTau:{KTau}")
        # ranks run from 0 to sample_count - 1, so the axes follow the test-set
        # size instead of a fixed 5000
        plt.xlim(0, sample_count)
        plt.ylim(0, sample_count)
        plt.show()

    if return_flag:
        return KTau, MSE
if __name__ == '__main__':
    # Experiment configuration.
    train_num = 1000
    test_num = 5000
    additional_metrics = False
    integers2one_hot = True
    more_train_data = True
    more_test_data = False
    # if run for Random_forestAndmore_test, please set more_test_data = False, and activate a piece of code below

    metrics = get_toy_metrics(train_num)
    print('----------------------train---------------------')
    # X, y = get_upper_triangular_data(metrics, integers2one_hot=True, double_upper=False, additional_metrics=True,
    #                                  normalization=True)
    X, y, _ = get_toy_data(metrics, create_more_metrics=more_train_data, select_upper_tri=False,
                           additional_metrics=additional_metrics, integers2one_hot=integers2one_hot)
    # print(X)
    # print(y)

    print('----------------------test----------------------')
    test_metrics = get_toy_metrics(test_num, type='fixed_test', train_num=train_num)
    # testX, testy = get_upper_triangular_data(test_metrics, integers2one_hot=True, double_upper=False,
    #                                          additional_metrics=True, normalization=True)
    testX, testy, num_new_metrics = get_toy_data(test_metrics, create_more_metrics=more_test_data,
                                                 select_upper_tri=False,
                                                 additional_metrics=additional_metrics,
                                                 integers2one_hot=integers2one_hot)
    # if run for Random_forestAndmore_test, please activate the following codes
    # more_testX, more_testy, _ = get_toy_data(test_metrics, create_more_metrics=True,
    #                                          select_upper_tri=False,
    #                                          additional_metrics=additional_metrics,
    #                                          integers2one_hot=integers2one_hot)
    # print(testX)
    # print(testy)

    # # This is to show different kinds of regressions with default settings
    # MLP is not good when some big data (such as final training time and trainable parameters) are added
    # range(4, 5) is random forest
    for i in range(4, 5):
        # try_different_method has no more_test_data parameter; passing it
        # raised a TypeError, so it is not forwarded.
        try_different_method(X, y, testX, testy, model[i], method[i], show_fig=False)

    # # Ablation study
    # # knn max KTau is below 0.6, and min MSE is around 0.0025
    # # GBRT max KTau is around 0.627, and min MSE is around 0.0023
    # # Bagging is almost the same as random forest
    method_name = 'random_forest'
    # # method name list: 'random_forest', 'knn', 'GBRT', 'Bagging'
    # Ablation_study(method_name, X, y, testX, testy, [10, 310], step=10, more_test_data=more_test_data)