import sklearn_crfsuite
import scipy.stats
from data_extraction import sent2features
from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV
from sklearn_crfsuite import metrics
from sklearn.metrics import make_scorer

RANDOMIZED_SEARCH_ITERATIONS = 50

def train_model(X_train, y_train, c1=0.1, c2=0.1):
    """Train a CRF tagger with fixed L1 (c1) and L2 (c2) regularization."""
    print(c1, c2)
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=c1,
        c2=c2,
        max_iterations=100,
        all_possible_transitions=True
    )
    crf.fit(X_train, y_train)
    return crf

def randomized_search(X_train, X_dev, y_train, y_dev, labels):
    """Tune c1/c2 with randomized search, validating on the dev set only."""
    # define fixed parameters and parameters to search
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        all_possible_transitions=True
    )
    params_space = {
        'c1': scipy.stats.loguniform(a=1e-7, b=1e2),
        'c2': scipy.stats.loguniform(a=1e-7, b=1e2),
    }
    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted', labels=labels)
    X = X_train + X_dev
    y = y_train + y_dev
    # Mark train indices with -1 (never used for validation) and dev indices
    # with 0, then hand the list to PredefinedSplit so every candidate is
    # scored on the dev set alone.
    indices = [-1 if i < len(X_train) else 0 for i in range(len(X))]
    pds = PredefinedSplit(test_fold=indices)
    rs = RandomizedSearchCV(
        crf, params_space,
        cv=pds,
        verbose=1,
        n_jobs=-1,
        n_iter=RANDOMIZED_SEARCH_ITERATIONS,
        scoring=f1_scorer,
        refit=False
    )
    rs.fit(X, y)
    print('best params:', rs.best_params_)
    print('best CV score:', rs.best_score_)
    # Refit on the training set alone with the best hyperparameters.
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=rs.best_params_['c1'],
        c2=rs.best_params_['c2'],
        all_possible_transitions=True
    )
    crf.fit(X_train, y_train)
    return crf

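# A minimal sketch of the PredefinedSplit convention used above, with
# illustrative values: indices marked -1 are kept in every training fold,
# while indices marked 0 form the single validation fold.
#
#   split = PredefinedSplit(test_fold=[-1, -1, -1, 0, 0])
#   train_idx, test_idx = next(split.split())
#   # train_idx -> [0 1 2], test_idx -> [3 4]
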
def get_reformatted_labels(crf):
    # Drop the 'O' (outside) tag so evaluation focuses on entity labels.
    labels = list(crf.classes_)
    labels.remove('O')
    return labels


def sort(labels):
    return sorted(
        labels,
        key=lambda name: (name[1:], name[0])
    )

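# For example, sort(['B-PER', 'I-PER', 'B-LOC', 'I-LOC']) returns
# ['B-LOC', 'I-LOC', 'B-PER', 'I-PER']: the key (name[1:], name[0]) orders
# labels by entity type first and by B/I prefix second, keeping each
# entity's B- and I- rows adjacent in the classification report.
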
def run_baseline(train_sents, test_sents, y_train, y_test):
    print('Running baseline model')
    X_train = [sent2features(s) for s in train_sents]
    crf = train_model(X_train, y_train)
    # Model evaluation
    labels = get_reformatted_labels(crf)
    sorted_labels = sort(labels)
    X_test = [sent2features(s) for s in test_sents]
    y_pred = crf.predict(X_test)
    print('F1 score: \n', metrics.flat_f1_score(
        y_test, y_pred, average='weighted', labels=labels))
    print('Classification Report: \n', metrics.flat_classification_report(
        y_test, y_pred, labels=sorted_labels, digits=3))
    print('\n')
    return crf, labels

def run_baseline_with_hparam_optimization(
    train_sents, dev_sents, test_sents, y_train, y_dev, y_test, labels
):
    print('Running randomized search on baseline model...')
    X_train = [sent2features(s) for s in train_sents]
    X_dev = [sent2features(s) for s in dev_sents]
    rs_crf = randomized_search(X_train, X_dev, y_train, y_dev, labels)
    # Model evaluation
    sorted_labels = sort(labels)
    X_test = [sent2features(s) for s in test_sents]
    y_pred = rs_crf.predict(X_test)
    print('F1 score: \n', metrics.flat_f1_score(
        y_test, y_pred, average='weighted', labels=labels))
    print('Classification Report: \n', metrics.flat_classification_report(
        y_test, y_pred, labels=sorted_labels, digits=3))
    print('\n')
    return rs_crf

def run_extra_features_with_hparam_optimization(
    train_sents, dev_sents, test_sents, y_train, y_dev, y_test, labels,
    extra_features=None, search_depth=1
):
    # Avoid a mutable default argument.
    extra_features = extra_features or []
    extra_features_log = (f'extra features {", ".join(extra_features)}'
                          if extra_features else 'no extra features')
    print(f'Running randomized search on model with search depth '
          f'{search_depth} and {extra_features_log}')
    X_train = [sent2features(s, extra_features, search_depth)
               for s in train_sents]
    X_dev = [sent2features(s, extra_features, search_depth)
             for s in dev_sents]
    rs_crf = randomized_search(X_train, X_dev, y_train, y_dev, labels)
    # Model evaluation
    sorted_labels = sort(labels)
    X_test = [sent2features(s, extra_features, search_depth)
              for s in test_sents]
    y_pred = rs_crf.predict(X_test)
    print('F1 score: \n', metrics.flat_f1_score(
        y_test, y_pred, average='weighted', labels=labels))
    print('Classification Report: \n', metrics.flat_classification_report(
        y_test, y_pred, labels=sorted_labels, digits=3))
    print('\n')
    return rs_crf
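
# A minimal usage sketch, commented out so the module stays import-safe.
# `load_split` is a hypothetical loader, not part of this project: it stands
# in for whatever produces (sentences, label sequences) pairs compatible
# with data_extraction.sent2features.
#
# if __name__ == '__main__':
#     train_sents, y_train = load_split('train')   # hypothetical helper
#     dev_sents, y_dev = load_split('dev')
#     test_sents, y_test = load_split('test')
#     baseline_crf, labels = run_baseline(
#         train_sents, test_sents, y_train, y_test)
#     tuned_crf = run_baseline_with_hparam_optimization(
#         train_sents, dev_sents, test_sents, y_train, y_dev, y_test, labels)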