-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtask2_train.py
204 lines (168 loc) · 6.51 KB
/
task2_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import pandas as pd
import numpy as np
import click
from joblib import dump, load
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score, auc, roc_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels
from constants import LABELS_DATA_PATH, ALL_COLS, ROOM_LABELS, TASK2_CLASS_COL, TASK2_CLASS_LABELS
def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
if not title:
if normalize:
title = "Normalized confusion matrix"
else:
title = "Confusion matrix, without normalization"
# Compute confusion matrix
# cm = confusion_matrix(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)
# Only use the labels that appear in the data
# classes = classes[unique_labels(y_true, y_pred)]
if normalize:
cm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print("Confusion matrix, without normalization")
print(cm)
fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation="nearest", cmap=cmap)
ax.figure.colorbar(im, ax=ax)
# We want to show all ticks...
ax.set(
xticks=np.arange(cm.shape[1]),
yticks=np.arange(cm.shape[0]),
# ... and label them with the respective list entries
xticklabels=classes,
yticklabels=classes,
title=title,
ylabel="True label",
xlabel="Predicted label",
)
# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
# Loop over data dimensions and create text annotations.
fmt = ".2f" if normalize else "d"
thresh = cm.max() / 2.0
for i in range(cm.shape[0]):
for j in range(cm.shape[1]):
ax.text(
j, i, format(cm[i, j], fmt), ha="center", va="center", color="white" if cm[i, j] > thresh else "black"
)
fig.tight_layout()
filename = f"cm_{'with_normalization' if normalize else 'without_normalization'}"
fig.savefig(filename)
plt.close()
def divide(df: pd.DataFrame, labels: list):
X = df.drop(labels, axis=1, inplace=False).copy()
y = df[labels].copy()
return X, y
def read_data(one):
X, y = 0.0, 0.0
# use task2_class
if one:
df = pd.read_csv(LABELS_DATA_PATH, usecols=ALL_COLS + [TASK2_CLASS_COL])
df = df.drop(ROOM_LABELS, axis=1)
one_hot = pd.get_dummies(df[TASK2_CLASS_COL])
df = df.drop(TASK2_CLASS_COL, axis=1)
df = df.join(one_hot)
df = df.drop("validation", axis=1)
X, y = divide(df, TASK2_CLASS_LABELS)
# respective cols
else:
df = pd.read_csv(LABELS_DATA_PATH, usecols=ALL_COLS)
X, y = divide(df, ROOM_LABELS)
df = df.drop("House", axis=1)
return X, y
@click.command()
@click.option("--label", default="Bathroom", help="")
@click.option("--normalize", is_flag=True, help="")
@click.option("--search", is_flag=True, help="")
@click.option(
"--one", is_flag=True, help="Use labels from task2_class; otherwise use data from cols respective to rom names"
)
@click.option("--test_size", default=0.3, help="test set size")
@click.option("--n_estimators", default=100, help="")
@click.option("--max_depth", default=15, help="")
@click.option("--max_features", default=3, help="")
@click.option("--min_samples_split", default=3, help="")
@click.option("--max_leaf_nodes", default=3, help="")
@click.option("--min_samples_leaf", default=3, help="")
@click.option("--min_weight_fraction_leaf", default=0.5, help="")
def main(
test_size,
label,
normalize,
one,
min_samples_leaf,
min_weight_fraction_leaf,
n_estimators,
max_depth,
search,
max_features,
min_samples_split,
max_leaf_nodes,
):
# data
# X, y = read_data(one)
df = pd.read_csv(LABELS_DATA_PATH, usecols=ALL_COLS + [TASK2_CLASS_COL])
df = df[df[TASK2_CLASS_COL] != "validation"]
X, y = divide(df, TASK2_CLASS_COL)
# y = y[label]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=True)
# model
# clf = RandomForestClassifier(
# class_weight="balanced",
# max_depth=max_depth,
# n_estimators=n_estimators,
# max_features=max_features,
# bootstrap=False,
# )
clf = GradientBoostingClassifier(
max_depth=max_depth,
max_leaf_nodes=max_leaf_nodes,
max_features=max_features,
n_estimators=n_estimators,
min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf,
min_weight_fraction_leaf=min_weight_fraction_leaf,
)
clf.fit(X_train, y_train)
# grid search
if search:
param_grid = {
"max_depth": [3, 5, 10],
"max_leaf_nodes": [3, 5, 10],
"max_features": [3, 5, 10],
"n_estimators": [64, 128],
"min_samples_split": [3, 5],
"min_samples_leaf": [3, 5],
"min_weight_fraction_leaf": [0.01, 0.05, 0.1, 0.2, 0.5],
}
grid = GridSearchCV(estimator=clf, param_grid=param_grid, verbose=1, n_jobs=-1)
grid.fit(X, y)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_params_)
# cross val
scores = cross_val_score(clf, X, y, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# evaluation
y_pred = clf.predict(X_test)
# print(classification_report(y_test['Bathroom'], y_pred['Bathroom']))
classes = TASK2_CLASS_LABELS if one else ROOM_LABELS
title = f"Confusion matrix, {'with normalization' if normalize else 'without normalization'}"
plot_confusion_matrix(y_test, y_pred, classes=classes, title=title, normalize=normalize)
# fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
# print(f"AUC: {round(auc(fpr, tpr),3)}")
print(f"Accuracy: {round(accuracy_score(y_test, y_pred),3)}")
print("F1-score:",f1_score(y_test, y_pred, average='weighted'))
dump(clf, f"clf_task2_xgboost_{'single' if one else 'multiple'}.joblib")
if __name__ == "__main__":
main()