# nbsvm_model.py
# NBSVM training, hyperparameter search, and prediction-export script.
from sklearn_utils import *
import numpy as np
import glob
import joblib
import multiprocessing
from nbsvm import NBSVM
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
def model_gen():
    """Factory: return a fresh NBSVM classifier with the tuned hyperparameters.

    Passed to ``train_sklearn_model_cv`` so each CV fold trains its own
    independent model instance.
    """
    return NBSVM(C=0.3, beta=0.5)
def scoring(estimator, X, y):
    """Micro-averaged F1 scorer in the ``scoring(estimator, X, y)`` shape
    GridSearchCV expects.

    Returns a float in [0, 1]; higher is better.
    """
    predicted = estimator.predict(X)
    return f1_score(y, predicted, average='micro')
def param_search():
    """Grid-search the NBSVM ``C`` hyperparameter with cross-validation.

    Loads the full dataset via the project helpers (``load_both`` /
    ``tokenize`` / ``tfidf``), runs GridSearchCV over a linspace of ``C``
    values, then prints the best parameters and per-candidate mean/std scores.
    Side effects only (printing); returns ``None``.
    """
    # Seed up front so every random choice downstream (CV splitting, model
    # init) is reproducible. The original seeded mid-function, after the
    # grid object was built — equivalent, but easier to audit here.
    np.random.seed(1000)

    param_grid = {'C': np.linspace(0.25, 0.75, num=21),
                  # 'beta' : [0.25, 0.3, 0.35, 0.4, 0.45, 0.5],
                  }
    print('Params : ', param_grid)

    model = NBSVM()

    # use 1 less core than available, prevents locking up of laptop
    n_cores = multiprocessing.cpu_count() - 1

    # NOTE(review): cv=100 means 100 folds — very expensive; presumably
    # matches the 100-fold training in __main__. Confirm this is intended.
    g = GridSearchCV(model, param_grid=param_grid, scoring=scoring,
                     n_jobs=n_cores, cv=100, verbose=1)

    print('Loading data')
    texts, labels, label_map = load_both()
    print('Tokenizing texts')
    x_counts = tokenize(texts)
    print('Finished tokenizing texts')
    data = tfidf(x_counts)
    print('Finished computing TF-IDF')

    g.fit(data, labels)

    print("Best parameters set found on development set:")
    print()
    print(g.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = g.cv_results_['mean_test_score']
    stds = g.cv_results_['std_test_score']
    # Bug fix: the original loop variable was named `params`, shadowing the
    # grid dict — use a distinct name so the grid stays readable/reachable.
    for mean, std, cand_params in zip(means, stds, g.cv_results_['params']):
        print("%0.6f (+/-%0.06f) for %r"
              % (mean, std * 2, cand_params))
def write_predictions(model_dir='nbsvm/'):
    """Run every pickled fold model over the dataset and save the stacked
    class-probability predictions.

    Parameters
    ----------
    model_dir : str
        Subdirectory under ``models/`` holding the ``*.pkl`` fold models.

    Side effects: writes ``models/<model_dir>svm_predictions.npy`` with shape
    ``(n_models, n_samples, 3)``.

    Raises
    ------
    FileNotFoundError
        If no ``*.pkl`` files are found under the model directory.
    """
    basepath = 'models/' + model_dir
    path = basepath + "*.pkl"

    data, labels = prepare_data()

    # Bug fix: glob order is filesystem-dependent; sort so the model axis of
    # the saved array is deterministic across runs/machines.
    files = sorted(glob.glob(path))
    nb_models = len(files)
    if nb_models == 0:
        # Robustness: the original silently saved an empty array here.
        raise FileNotFoundError('No model pickles found matching %s' % path)

    # 3 = number of sentiment classes — TODO confirm against label_map.
    model_predictions = np.zeros((nb_models, data.shape[0], 3))

    for i, fn in enumerate(files):
        model = joblib.load(fn)  # type: NBSVM
        # _predict_proba_lr is sklearn's LinearClassifierMixin helper: turns
        # decision_function margins into calibrated probabilities.
        model_predictions[i, :, :] = model._predict_proba_lr(data)
        print('Finished prediction for model %d' % (i + 1))

    np.save(basepath + "svm_predictions.npy", model_predictions)
if __name__ == '__main__':
    # Train 100 cross-validated NBSVM fold models, pickled under models/nbsvm/.
    train_sklearn_model_cv(model_gen, 'nbsvm/svm-model', k_folds=100)
    # Hyperparameter search is run manually when needed, not on every invocation.
    #param_search()
    # Export each fold model's class probabilities to svm_predictions.npy,
    # then evaluate on the combined dataset and each candidate subset.
    write_predictions()
    evaluate_sklearn_model('nbsvm/')
    evaluate_sklearn_model('nbsvm/', dataset='obama')
    evaluate_sklearn_model('nbsvm/', dataset='romney')