# helper.py: helper functions for sampling, model evaluation, and hyperparameter tuning
import itertools

import matplotlib.pyplot as plt
import numpy as np


def sampled(X_train, y_train):
    """Oversamples the training set with SMOTE.

    Parameters:
        X_train (pd.DataFrame): training features to be resampled
        y_train (pd.Series): training labels to be resampled

    Returns:
        X_train_resampled (pd.DataFrame): SMOTE-resampled training features
        y_train_resampled (pd.Series): SMOTE-resampled training labels
    """
    from imblearn.over_sampling import SMOTE

    # Create a SMOTE instance and apply it to the training data only
    smote = SMOTE(sampling_strategy='auto', k_neighbors=3, random_state=33)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    return X_train_resampled, y_train_resampled
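
# Example usage (a minimal sketch; `X_tr` and `y_tr` are hypothetical names for a
# training split produced elsewhere, e.g. by train_test_split):
# X_tr_res, y_tr_res = sampled(X_tr, y_tr)
# print(y_tr_res.value_counts())  # the classes should now be balanced
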
def print_accuracy_report(X_train, X_test, y_train, y_test, model):
    """Prints the training accuracy, validation accuracy, F1 score, and
    classification report for a fitted classifier.

    Parameters:
        X_train (pd.DataFrame): training features
        X_test (pd.DataFrame): test features
        y_train (pd.Series): training labels
        y_test (pd.Series): test labels
        model: fitted classifier with a predict() method

    Returns:
        None. The metrics are printed to stdout.
    """
    from sklearn.metrics import accuracy_score, f1_score, classification_report

    training_preds = model.predict(X_train)
    y_pred = model.predict(X_test)
    training_accuracy = accuracy_score(y_train, training_preds)
    val_accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("\n\nTraining Accuracy: {:.4}%".format(training_accuracy * 100))
    print("Validation Accuracy: {:.4}%".format(val_accuracy * 100))
    print("F1 Score: {:.4}".format(f1))

    # Classification report
    print("\n\n\nClassification Report:")
    print("---------------------")
    print(classification_report(y_test, y_pred))
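
# Example usage (a sketch assuming an existing train/test split; the classifier and
# split names below are illustrative only):
# from sklearn.linear_model import LogisticRegression
# clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
# print_accuracy_report(X_tr, X_te, y_tr, y_te, clf)
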
# Plot a confusion matrix
def plot_con_matrix(cm, classes,
                    normalize=False,
                    title='Confusion matrix',
                    cmap=plt.cm.Blues):
    """Plots a confusion matrix, optionally normalized by row.

    Parameters:
        cm (np.ndarray): confusion matrix, e.g. from sklearn.metrics.confusion_matrix
        classes: names of the target classes, in label order
        normalize (bool): if True, normalize each row to sum to 1
        title (str): plot title
        cmap: matplotlib colormap for the matrix

    Returns:
        None. The matrix is printed and drawn on the current figure.
    """
    # Add normalization option
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print('Normalized confusion matrix')
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Annotate each cell with its count (or rate when normalized)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment='center',
                 color='white' if cm[i, j] > thresh else 'black')

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
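
# Example usage (a sketch; `y_te` and `y_pred` are hypothetical true labels and
# predictions, and the class names are placeholders):
# from sklearn.metrics import confusion_matrix
# cm = confusion_matrix(y_te, y_pred)
# plot_con_matrix(cm, classes=['Negative', 'Positive'], normalize=True)
# plt.show()
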
def plot_roc_curve(fpr, tpr):
    """Plots a ROC curve.

    Parameters:
        fpr: false positive rates
        tpr: true positive rates

    Returns:
        None. The ROC curve is displayed.
    """
    plt.style.use('ggplot')
    plt.figure(figsize=(10, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=4, label='ROC curve')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.yticks([i / 20.0 for i in range(21)])
    plt.xticks([i / 20.0 for i in range(21)])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc='lower right')
    plt.show()
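
# Example usage (a sketch assuming a fitted classifier `clf` with predict_proba and
# an existing test split; the names are illustrative):
# from sklearn.metrics import roc_curve
# fpr, tpr, _ = roc_curve(y_te, clf.predict_proba(X_te)[:, 1])
# plot_roc_curve(fpr, tpr)
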
def grid_search(X_train, X_test, y_train, y_test, model, model_name, params, cv=3):
    """Runs a grid search over the given parameter grid, prints the best
    parameters along with the cross-validation and test accuracy, and saves
    the best estimator to disk with joblib.

    Parameters:
        X_train (pd.DataFrame): training features
        X_test (pd.DataFrame): test features
        y_train (pd.Series): training labels
        y_test (pd.Series): test labels
        model: classifier to tune
        model_name (str): name used for the saved model file
        params (dict): parameter grid for GridSearchCV
        cv (int): number of cross-validation folds

    Returns:
        best_params (dict): the best parameter combination found
    """
    from sklearn.model_selection import GridSearchCV

    # Create grid search object
    gridsearch = GridSearchCV(
        model,
        param_grid=params,
        cv=cv,
        return_train_score=True
    )

    # Fit on data
    gridsearch.fit(X_train, y_train)
    best_params = gridsearch.best_params_
    print(f'Optimal parameters: {best_params}')
    print('\n')
    print(f"Cross-validation Accuracy: {gridsearch.best_score_ :.2%}")
    print('\n')
    print(f"Test Accuracy: {gridsearch.score(X_test, y_test) :.2%}")

    # Persist the best estimator to disk
    import joblib
    model_name = model_name.lower().replace(' ', '_')
    joblib.dump(gridsearch.best_estimator_, f'{model_name}_gridsearch_output_model.pkl', compress=1)
    print('\n')
    print('Model saved successfully!')
    return best_params
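
# Example usage (a sketch; the estimator, parameter grid, and split names below are
# hypothetical placeholders):
# from sklearn.ensemble import RandomForestClassifier
# rf_grid = {'n_estimators': [100, 200], 'max_depth': [None, 5, 10]}
# best = grid_search(X_tr, X_te, y_tr, y_te,
#                    RandomForestClassifier(random_state=33),
#                    'Random Forest', rf_grid, cv=5)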