forked from ntucllab/libact
-
Notifications
You must be signed in to change notification settings - Fork 0
/
alce_plot.py
135 lines (106 loc) · 4.5 KB
/
alce_plot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
"""
Cost-Sensitive Multi-Class Active Learning
"""
import copy
import os
import numpy as np
import matplotlib
matplotlib.use('tkAgg')
import matplotlib.pyplot as plt
try:
from sklearn.model_selection import train_test_split
except ImportError:
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
import sklearn.datasets
from sklearn.svm import SVR
# libact classes
from libact.base.dataset import Dataset, import_libsvm_sparse
from libact.models import SVM, LogisticRegression
from libact.query_strategies.multiclass import ActiveLearningWithCostEmbedding as ALCE
from libact.query_strategies import UncertaintySampling, RandomSampling
from libact.labelers import IdealLabeler
from libact.utils import calc_cost
def run(trn_ds, tst_ds, lbr, model, qs, quota, cost_matrix):
    """Run one active-learning experiment, recording the cost after each query.

    The model is trained once on the initially labeled entries and once more
    after each of the ``quota`` queries, so for ``quota >= 0`` both returned
    arrays hold ``quota + 1`` values.

    Parameters
    ----------
    trn_ds
        Training pool. Entries chosen by ``qs`` are labeled in place through
        ``trn_ds.update``.
    tst_ds
        Held-out dataset; its labeled entries are used for the
        out-of-sample cost.
    lbr
        Labeler; ``lbr.label(feature)`` supplies the label of a queried entry.
    model
        Object with ``train(dataset)`` and ``predict(X)``.
    qs
        Query strategy; ``qs.make_query()`` returns the id of the entry to
        label next.
    quota : int
        Number of queries to make.
    cost_matrix
        Passed to ``calc_cost`` together with the true and predicted labels.

    Returns
    -------
    C_in, C_out
        Cost on the currently labeled training entries and on the test set,
        one value per training round.
    """
    C_in, C_out = [], []
    if quota < 0:
        # No training round takes place; hand back the empty accumulators
        # without touching the datasets.
        return C_in, C_out

    # Loop-invariant work, done once instead of once per query.
    # ``trn_ds.update`` is only handed an id and a label, so the pool features
    # are assumed to stay fixed -- NOTE(review): confirm for custom datasets.
    pool_X, _ = zip(*trn_ds.data)
    # The test set is never modified in this function.
    tst_X, tst_y = zip(*tst_ds.get_labeled_entries())

    for i in range(quota + 1):
        # Standard usage of libact objects
        if i > 0:
            ask_id = qs.make_query()
            lb = lbr.label(pool_X[ask_id])
            trn_ds.update(ask_id, lb)

        model.train(trn_ds)
        # The labeled subset grows with every query, so re-read it each round.
        trn_X, trn_y = zip(*trn_ds.get_labeled_entries())
        C_in = np.append(
            C_in, calc_cost(trn_y, model.predict(trn_X), cost_matrix))
        C_out = np.append(
            C_out, calc_cost(tst_y, model.predict(tst_X), cost_matrix))

    return C_in, C_out
def split_train_test(test_size):
    """Load the 'vehicle' dataset and build the datasets for one experiment.

    Features are standardized, class labels are mapped to
    ``0 .. n_classes - 1`` and the data is split with stratification. The
    training pool is reordered so that one example of every class comes
    first; only those examples start out labeled.

    Parameters
    ----------
    test_size
        Forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    trn_ds : Dataset
        Training pool whose first ``n_classes`` entries are labeled and whose
        remaining labels are ``None``.
    tst_ds : Dataset
        Fully labeled test set.
    fully_labeled_trn_ds : Dataset
        Same entries and ordering as ``trn_ds`` but with every label present.
    cost_matrix : numpy.ndarray
        ``(n_classes, n_classes)`` matrix of random costs in ``[0, 2000)``
        with a zero diagonal.
    """
    # choose a dataset with unbalanced class instances
    # (the 'segment' dataset is an alternative)
    fetch_mldata = getattr(sklearn.datasets, 'fetch_mldata', None)
    if fetch_mldata is not None:
        data = fetch_mldata('vehicle')
    else:
        # fetch_mldata was removed from scikit-learn; fall back to OpenML.
        # NOTE(review): OpenML 'vehicle' version 1 is presumed to be the same
        # data set as the former mldata 'vehicle' -- confirm.
        data = sklearn.datasets.fetch_openml(
            'vehicle', version=1, as_frame=False)

    X = StandardScaler().fit_transform(data['data'])
    target = np.unique(data['target'])
    n_classes = len(target)
    # mapping the targets to 0 to n_classes-1
    y = np.array([np.where(target == i)[0][0] for i in data['target']])

    X_trn, X_tst, y_trn, y_tst = \
        train_test_split(X, y, test_size=test_size, stratify=y)

    # making sure each class appears once initially
    init_y_ind = np.array(
        [np.where(y_trn == i)[0][0] for i in range(n_classes)])
    # every other training index, in ascending order
    y_ind = np.setdiff1d(np.arange(len(X_trn)), init_y_ind)

    trn_ds = Dataset(
        np.vstack((X_trn[init_y_ind], X_trn[y_ind])),
        np.concatenate((y_trn[init_y_ind], [None] * len(y_ind))))
    tst_ds = Dataset(X_tst, y_tst)
    fully_labeled_trn_ds = Dataset(
        np.vstack((X_trn[init_y_ind], X_trn[y_ind])),
        np.concatenate((y_trn[init_y_ind], y_trn[y_ind])))

    cost_matrix = 2000. * np.random.rand(n_classes, n_classes)
    # a correct prediction costs nothing
    np.fill_diagonal(cost_matrix, 0)

    return trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix
def main():
    """Compare three query strategies on repeated random splits and plot them.

    For each of 20 repetitions a fresh split is drawn and uncertainty
    sampling, random sampling and ALCE are each run on their own copy of the
    training pool. The test-set costs are averaged over the repetitions,
    printed at every fifth query and plotted against the number of queries.
    """
    # the percentage of samples in the dataset that will be randomly selected
    # and assigned to the test set
    test_size = 0.25
    # number of samples to query
    quota = 100

    result = {'E1': [], 'E2': [], 'E3': []}

    for _ in range(20):
        trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix = \
            split_train_test(test_size)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = SVM(kernel='rbf', decision_function_shape='ovr')

        # Each strategy is built lazily, right before its own run, so the
        # strategies are constructed and executed strictly one after another.
        experiments = (
            ('E1', trn_ds,
             lambda pool: UncertaintySampling(
                 pool, method='sm',
                 model=SVM(decision_function_shape='ovr'))),
            ('E2', trn_ds2,
             lambda pool: RandomSampling(pool)),
            ('E3', trn_ds3,
             lambda pool: ALCE(pool, cost_matrix, SVR())),
        )
        for key, pool, make_strategy in experiments:
            strategy = make_strategy(pool)
            costs = run(pool, tst_ds, lbr, model, strategy, quota, cost_matrix)
            # keep only the test-set cost
            result[key].append(costs[1])

    # average over the repetitions, one value per query count
    averaged = {key: np.mean(runs, axis=0) for key, runs in result.items()}

    print("Uncertainty: ", averaged['E1'][::5].tolist())
    print("Random: ", averaged['E2'][::5].tolist())
    print("ALCE: ", averaged['E3'][::5].tolist())

    query_num = np.arange(0, quota + 1)
    curves = (
        ('E1', 'g', 'Uncertainty sampling'),
        ('E2', 'k', 'Random'),
        ('E3', 'r', 'ALCE'),
    )

    plt.figure(figsize=(10, 8))
    for key, colour, caption in curves:
        plt.plot(query_num, averaged[key], colour, label=caption)
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(
        loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True,
        ncol=5)
    plt.show()
# Run the experiment only when this file is executed as a script.
if __name__ == "__main__":
    main()