# -*- coding: utf-8 -*-
'''Sample script for Kaggle "Energy Connections" competition
Author: Kyle Bradbury
Date: September 27, 2018
Organization: Duke University Energy Initiative
'''
'''
Import the packages needed for classification
'''
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics as metrics
from os import path
plt.close()
'''
Set directory parameters
'''
# Set the directories for the data and the CSV files that contain ids/labels
base_dir = '/Users/scottheng/Documents/Courses/Fall_2019/ENERGY_795/electricity-connections'
dir_train_images = path.join(base_dir, 'train/train')
dir_test_images = path.join(base_dir, 'test/test')
dir_train_labels = path.join(base_dir, 'train.csv')
dir_test_ids = path.join(base_dir, 'sample_submission.csv')
'''
Include the functions used for loading, preprocessing, feature extraction,
classification, and performance evaluation
'''

def load_data(dir_data, dir_labels, training=True):
    '''Load each of the image files into memory.
    While this is feasible with a smaller dataset, a larger dataset may not
    fit into memory all at once.
    When training=True, the labels are also loaded.
    '''
    labels_pd = pd.read_csv(dir_labels)
    ids = labels_pd.id.values
    data = []
    for identifier in ids:
        fname = path.join(dir_data, identifier)
        image = mpl.image.imread(fname)
        data.append(image)
    data = np.array(data)  # Convert to a NumPy array
    if training:
        labels = labels_pd.label.values
        return data, labels
    else:
        return data, ids
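
# The docstring in load_data notes that a larger dataset may not fit into
# memory all at once. As an illustrative sketch only (not part of the original
# script), a generator such as the one below could stream images in batches;
# the name load_data_in_batches and its batch_size parameter are hypothetical.
def load_data_in_batches(dir_data, dir_labels, batch_size=32):
    '''Yield (images, ids) batches so the full dataset never sits in memory.'''
    labels_pd = pd.read_csv(dir_labels)
    ids = labels_pd.id.values
    for start in range(0, len(ids), batch_size):
        batch_ids = ids[start:start + batch_size]
        batch_images = np.array([mpl.image.imread(path.join(dir_data, identifier))
                                 for identifier in batch_ids])
        yield batch_images, batch_ids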

def preprocess_and_extract_features(data):
    '''Preprocess data and extract features
    Preprocess: normalize, scale, repair
    Extract features: transformations and dimensionality reduction
    '''
    # Here, we do something trivially simple: we take the average of the RGB
    # values to produce a grayscale image, transform that into a vector, then
    # extract the mean and standard deviation as features.

    # Make the image grayscale
    data = np.mean(data, axis=3)

    # Vectorize the grayscale matrices
    vectorized_data = data.reshape(data.shape[0], -1)

    # Extract the mean and standard deviation of each sample as features
    feature_mean = np.mean(vectorized_data, axis=1)
    feature_std = np.std(vectorized_data, axis=1)

    # Combine the extracted features into a single feature vector
    features = np.stack((feature_mean, feature_std), axis=-1)

    return features
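
# A minimal sketch of a slightly richer feature extractor, assuming the images
# are RGB arrays of shape (N, height, width, 3) as loaded above. It keeps the
# per-channel means in addition to the grayscale statistics; the function name
# is hypothetical and it is not called anywhere by default.
def preprocess_and_extract_features_rgb(data):
    '''Extract per-channel means plus grayscale mean and standard deviation.'''
    channel_means = np.mean(data, axis=(1, 2))               # shape (N, 3)
    gray = np.mean(data, axis=3).reshape(data.shape[0], -1)  # flattened grayscale
    gray_stats = np.stack((np.mean(gray, axis=1), np.std(gray, axis=1)), axis=-1)
    return np.concatenate((channel_means, gray_stats), axis=1)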

def set_classifier():
    '''Shared function to select the classifier for both performance
    evaluation and testing.
    '''
    return KNeighborsClassifier(n_neighbors=10)
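
# For comparison only: a hedged sketch of swapping in a different scikit-learn
# classifier. Logistic regression is an assumption here, not the choice made in
# this script; to try it, return it from set_classifier() instead.
from sklearn.linear_model import LogisticRegression

def set_classifier_logistic():
    '''Alternative classifier choice (illustrative, not used by default).'''
    return LogisticRegression(solver='lbfgs', max_iter=1000)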

def cv_performance_assessment(X, y, k, clf):
    '''Cross-validated performance assessment
    X   = training data
    y   = training labels
    k   = number of folds for cross-validation
    clf = classifier to use

    Divide the training data into k folds of training and validation data.
    For each fold the classifier is trained on the training data and
    tested on the validation data. The classifier prediction scores are
    aggregated and returned.
    '''
    # Establish the k folds
    prediction_scores = np.empty(y.shape[0], dtype='object')
    kf = StratifiedKFold(n_splits=k, shuffle=True)
    i = 1
    for train_index, val_index in kf.split(X, y):
        # Extract the training and validation data for this fold
        X_train, X_val = X[train_index], X[val_index]
        y_train = y[train_index]

        # Train the classifier
        X_train_features = preprocess_and_extract_features(X_train)
        clf = clf.fit(X_train_features, y_train)

        # Test the classifier on the validation data for this fold
        X_val_features = preprocess_and_extract_features(X_val)
        cpred = clf.predict_proba(X_val_features)

        # Save the predictions for this fold
        prediction_scores[val_index] = cpred[:, 1]

        print('Training Fold {} of {} Complete'.format(i, k))
        i += 1
    return prediction_scores

def plot_roc(labels, prediction_scores):
    '''Plot the ROC curve and report the AUC for the given prediction scores.'''
    fpr, tpr, _ = metrics.roc_curve(labels, prediction_scores, pos_label=1)
    auc = metrics.roc_auc_score(labels, prediction_scores)
    legend_string = 'AUC = {:0.3f}'.format(auc)

    plt.plot([0, 1], [0, 1], '--', color='gray', label='Chance')
    plt.plot(fpr, tpr, label=legend_string)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.grid(True)
    plt.axis('square')
    plt.legend()
    plt.tight_layout()
'''
Sample script for running cross-validated performance assessment
'''
# Set parameters for the analysis
num_training_folds = 5
# Load the data
data, labels = load_data(dir_train_images, dir_train_labels, training=True)
print('Data Loaded')
# Choose which classifier to use
clf = set_classifier()
# Perform cross validated performance assessment
prediction_scores = cv_performance_assessment(data,labels,num_training_folds,clf)
# Compute and plot the ROC curves
plot_roc(labels, prediction_scores)
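
# As a quick numeric summary to accompany the ROC plot (an addition, not part
# of the original workflow), the overall cross-validated AUC can be printed:
cv_auc = metrics.roc_auc_score(labels, prediction_scores.astype(float))
print('Cross-validated AUC = {:0.3f}'.format(cv_auc))
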
'''
Sample script for producing a Kaggle submission
'''
produce_submission = True  # Set this flag to control whether a Kaggle submission file is written
if produce_submission:
    # Load data, extract features, and train the classifier on the training data
    training_data, training_labels = load_data(dir_train_images, dir_train_labels, training=True)
    training_features = preprocess_and_extract_features(training_data)
    clf = set_classifier()
    clf.fit(training_features, training_labels)

    # Load the test data and test the classifier
    test_data, ids = load_data(dir_test_images, dir_test_ids, training=False)
    test_features = preprocess_and_extract_features(test_data)
    test_scores = clf.predict_proba(test_features)[:, 1]

    # Save the predictions to a CSV file for upload to Kaggle
    submission_file = pd.DataFrame({'id': ids,
                                    'score': test_scores})
    submission_file.to_csv('submission.csv',
                           columns=['id', 'score'],
                           index=False)
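
    # A small sanity check (an addition, not in the original script): confirm
    # there is exactly one score per test id before uploading to Kaggle.
    assert len(submission_file) == len(ids)
    print('Wrote submission.csv with {} rows'.format(len(submission_file)))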