#!/usr/bin/env python3
"""
The script runs experiments to compare the performance of ALBL and other active
learning algorithms.
"""

import copy
import os

import numpy as np
import matplotlib.pyplot as plt
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# libact classes
from libact.base.dataset import Dataset, import_libsvm_sparse
from libact.models import SVM
from libact.query_strategies import QUIRE, UncertaintySampling, RandomSampling, \
    ActiveLearningByLearning, HintSVM
from libact.labelers import IdealLabeler


def run(trn_ds, tst_ds, lbr, model, qs, quota):
    """Query `quota` labels with strategy `qs`; return in/out-of-sample error curves."""
    E_in, E_out = [], []

    for _ in range(quota):
        # Standard libact loop: ask the strategy which entry to label,
        # query the (ideal) labeler, and update the training dataset.
        ask_id = qs.make_query()
        X, _ = zip(*trn_ds.data)
        lb = lbr.label(X[ask_id])
        trn_ds.update(ask_id, lb)

        # Retrain on the updated dataset and record the error rates.
        model.train(trn_ds)
        E_in = np.append(E_in, 1 - model.score(trn_ds))
        E_out = np.append(E_out, 1 - model.score(tst_ds))

    return E_in, E_out


def split_train_test(dataset_filepath, test_size, n_labeled):
    X, y = import_libsvm_sparse(dataset_filepath).format_sklearn()
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=test_size)
    # Re-split until both classes appear among the initially labeled samples,
    # otherwise the first model cannot be trained.
    while len(np.unique(y_train[:n_labeled])) != 2:
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=test_size)

    # Hide the labels of all but the first n_labeled training samples.
    trn_ds = Dataset(X_train, np.concatenate(
        [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)]))
    tst_ds = Dataset(X_test, y_test)
    fully_labeled_trn_ds = Dataset(X_train, y_train)

    return trn_ds, tst_ds, y_train, fully_labeled_trn_ds


def main():
    # Specify the parameters here:
    # path to your binary classification dataset
    ds_name = 'australian'
    dataset_filepath = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), '%s.txt' % ds_name)
    test_size = 0.33    # the fraction of samples in the dataset that will be
                        # randomly selected and assigned to the test set
    n_labeled = 10      # number of samples that are initially labeled

    results = []
    for T in range(20):  # repeat the experiment 20 times
        print("Experiment %d" % (T + 1))
        trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
            split_train_test(dataset_filepath, test_size, n_labeled)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        trn_ds4 = copy.deepcopy(trn_ds)
        trn_ds5 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)

        quota = len(y_train) - n_labeled    # number of samples to query

        # Compare UncertaintySampling with RandomSampling, QUIRE, HintSVM,
        # and ALBL. `model` is the base learner, e.g. LogisticRegression,
        # SVM, etc.
        qs = UncertaintySampling(trn_ds,
                                 model=SVM(decision_function_shape='ovr'))
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota)
        results.append(E_out_1.tolist())

        qs2 = RandomSampling(trn_ds2)
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota)
        results.append(E_out_2.tolist())

        qs3 = QUIRE(trn_ds3)
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_3 = run(trn_ds3, tst_ds, lbr, model, qs3, quota)
        results.append(E_out_3.tolist())

        qs4 = HintSVM(trn_ds4, cl=1.0, ch=1.0)
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_4 = run(trn_ds4, tst_ds, lbr, model, qs4, quota)
        results.append(E_out_4.tolist())

        # ALBL adaptively chooses among the three strategies listed below.
        qs5 = ActiveLearningByLearning(trn_ds5,
                                       query_strategies=[
                                           UncertaintySampling(
                                               trn_ds5,
                                               model=SVM(kernel='linear',
                                                         decision_function_shape='ovr')),
                                           QUIRE(trn_ds5),
                                           HintSVM(trn_ds5, cl=1.0, ch=1.0),
                                       ],
                                       T=quota,
                                       uniform_sampler=True,
                                       model=SVM(kernel='linear',
                                                 decision_function_shape='ovr'))
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_5 = run(trn_ds5, tst_ds, lbr, model, qs5, quota)
        results.append(E_out_5.tolist())

    # Average the error curves over the 20 trials. `results` stores, per
    # trial, the five curves in the order: uncertainty sampling, random,
    # QUIRE, HintSVM, ALBL.
    result = []
    for i in range(5):
        _temp = []
        for j in range(i, len(results), 5):
            _temp.append(results[j])
        result.append(np.mean(_temp, axis=0))

    # Plot the learning curves of the five query strategies.
    # The x-axis is the number of queries, and the y-axis is the
    # corresponding error rate.
    query_num = np.arange(1, quota + 1)
    plt.plot(query_num, result[0], 'g', label='uncertainty sampling')
    plt.plot(query_num, result[1], 'k', label='random')
    plt.plot(query_num, result[2], 'r', label='QUIRE')
    plt.plot(query_num, result[3], 'b', label='HintSVM')
    plt.plot(query_num, result[4], 'c', label='ALBL')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()


if __name__ == '__main__':
    main()