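"""Data utilities for the Planet: Understanding the Amazon from Space Kaggle
dataset: image loading (compressed .npz, .jpg, .tif), one-hot label handling,
stratified cross-validation splits, and F2-score evaluation.
"""
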
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from skimage import io
from sklearn.metrics import fbeta_score
from PIL import Image

import pathfinder  # project-local module that defines DATA_PATH
import utils  # project-local helpers for loading/saving compressed numpy arrays

rng = np.random.RandomState(37145)  # fixed seed so splits are reproducible

def read_compressed_image(dataset, idx):
    if dataset == 'train':
        prefix = 'train-compressed/train_'
    elif dataset == 'test':
        prefix = 'test-compressed/test_'
    elif dataset == 'test2':
        prefix = 'test-compressed/file_'
    elif dataset == 'train-jpg':
        prefix = 'train-jpg/train_'
    elif dataset == 'test-jpg':
        prefix = 'test-jpg/test_'
    elif dataset == 'test2-jpg':
        prefix = 'test-jpg/file_'
    else:
        raise ValueError('unknown dataset: %s' % dataset)
    path = pathfinder.DATA_PATH + prefix + str(idx)
    if 'jpg' in dataset:
        path = path + '.jpg'
        im = Image.open(path)
        return im
    else:
        path = path + '.npz'
        im = utils.load_np(path)
        if 'arr' not in im:
            print(path)  # flag archives that are missing the expected 'arr' key
        return im['arr']

def read_image_from_path(path):
    path = pathfinder.DATA_PATH + path
    if 'jpg' in path:
        path = path + '.jpg'
        im = Image.open(path)
        return im
    else:
        path = path + '.npz'
        im = utils.load_np(path)
        if 'arr' not in im:
            print(path)  # flag archives that are missing the expected 'arr' key
        return im['arr']

def get_image_paths(train_ids=[], test_ids=[], test2_ids=[]):
    image_paths = []
    for train_id in train_ids:
        image_paths.append('train-jpg/train_' + str(train_id))
    for test_id in test_ids:
        image_paths.append('test-jpg/test_' + str(test_id))
    for test2_id in test2_ids:
        image_paths.append('test-jpg/file_' + str(test2_id))
    return image_paths

def get_id_from_path(img_path):
    return int(img_path.split('_')[-1])
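
# e.g. get_id_from_path('train-jpg/train_123') returns 123 (hypothetical id).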

def read_image(dataset, idx):
    if dataset == 'train':
        prefix = 'train-tif-v2/train_'
    elif dataset == 'test':
        prefix = 'test-tif-v2/test_'
    elif dataset == 'test2':
        prefix = 'test-tif-v2/file_'
    else:
        raise ValueError('unknown dataset: %s' % dataset)
    path = pathfinder.DATA_PATH + prefix + str(idx) + '.tif'
    image = io.imread(path)
    image = np.swapaxes(image, 0, 2)
    return image
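
# Note: io.imread returns the tif as (H, W, C); the swapaxes call above makes it
# channels-first, (C, W, H), which the rest of the pipeline appears to expect.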

def save_image_compressed(dataset, idx):
    np_image = read_image(dataset, idx)
    if dataset == 'train':
        prefix = 'train-compressed/train_'
    elif dataset == 'test':
        prefix = 'test-compressed/test_'
    elif dataset == 'test2':
        prefix = 'test-compressed/file_'
    else:
        raise ValueError('unknown dataset: %s' % dataset)
    path = pathfinder.DATA_PATH + prefix + str(idx) + '.npz'
    utils.savez_compressed_np(np_image, path)

def get_labels(version=1):
    if version == 2:
        df = pd.read_csv(pathfinder.DATA_PATH + 'train_v2.csv')
    else:
        df = pd.read_csv(pathfinder.DATA_PATH + 'train.csv')
    # one-hot encode the space-separated tag string
    df = pd.concat([df['image_name'], df.tags.str.get_dummies(sep=' ')], axis=1)
    cols = list(df.columns.values)  # list of all columns in the df
    weather_labels = ['clear', 'partly_cloudy', 'haze', 'cloudy']
    rare_labels = ['slash_burn', 'conventional_mine', 'bare_ground', 'artisinal_mine',
                   'blooming', 'selective_logging', 'blow_down']
    for label in weather_labels:
        cols.pop(cols.index(label))  # remove the weather labels from the list
    for label in rare_labels:
        cols.pop(cols.index(label))  # remove the rare labels from the list
    df = df[weather_labels + rare_labels + cols]  # reorder: weather, rare, rest
    return df
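
# After the reordering above, the columns run: 4 weather labels, 7 rare labels,
# then 'image_name' and the remaining common tags. Downstream code that drops
# 'image_name' therefore sees the weather classes in positions 0-3.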

def get_headers(version=1):
    df = get_labels(version)
    only_labels = df.drop(['image_name'], axis=1, inplace=False)
    cols = list(only_labels.columns.values)
    return cols

def get_labels_array(version=1):
    df = get_labels(version)
    only_labels = df.drop(['image_name'], axis=1, inplace=False)
    # DataFrame.as_matrix() was removed from pandas; to_numpy() is the replacement
    only_labels = only_labels.to_numpy()
    return only_labels

def get_d_labels(version=1):
    # maps image_name -> its 17-dim binary label vector
    d_labels = {}
    df = get_labels(version)
    label_array = get_labels_array(version)
    for index, row in df.iterrows():
        d_labels[row['image_name']] = label_array[index]
    return d_labels

def chunkIt(seq, num):
    # split seq into num nearly equal chunks; the last chunk absorbs the remainder
    avg = len(seq) / float(num)
    out = []
    last = 0.0
    while last < len(seq):
        if int(last + avg + avg) > len(seq):
            out.append(seq[int(last):len(seq)])
            break
        else:
            out.append(seq[int(last):int(last + avg)])
            last += avg
    assert len(out) == num
    return out
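
# Quick illustration (hypothetical input): chunkIt(list(range(10)), 3) returns
# [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]] -- exactly `num` chunks come back, which
# the assert above enforces.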

def top_occ(feat_comb, n_top=5):
    # a method for printing top occurrences of feature combinations
    # built for checking if a split is stratified
    n_total_samples = len(feat_comb)
    feat_comb_occ = np.bincount(feat_comb)
    top = feat_comb_occ.argsort()[-n_top:][::-1]
    for idx, fc in enumerate(top):
        print(idx, fc, 1.0 * feat_comb_occ[fc] / n_total_samples)
    print('checksum', sum(feat_comb))

def make_stratified_split(no_folds=5, verbose=False, version=1):
    only_labels = get_labels_array(version)
    # Pack each binary label row into a single integer so identical label
    # combinations can be grouped and spread evenly across the folds.
    feat_comb = only_labels.dot(1 << np.arange(only_labels.shape[-1] - 1, -1, -1))
    feat_comb_occ = np.bincount(feat_comb)
    feat_comb_high = np.where(feat_comb_occ >= no_folds)[0]
    folds = [[] for _ in range(no_folds)]
    # combinations frequent enough to appear in every fold are chunked evenly
    for fc in feat_comb_high:
        idcs = np.where(feat_comb == fc)[0]
        chunks = chunkIt(idcs, no_folds)
        rng.shuffle(chunks)
        for idx, chunk in enumerate(chunks):
            folds[idx].extend(chunk)
    # rare combinations are pooled together, then divided over the folds
    feat_comb_low = np.where(np.logical_and(feat_comb_occ < no_folds, feat_comb_occ > 0))[0]
    low_idcs = []
    for fc in feat_comb_low:
        idcs = np.where(feat_comb == fc)[0]
        low_idcs.extend(idcs)
    chunks = chunkIt(low_idcs, no_folds)
    rng.shuffle(chunks)
    for idx, chunk in enumerate(chunks):
        folds[idx].extend(chunk)
    if verbose:
        n_samples_fold = 0
        for f in folds:
            n_samples_fold += len(f)
        print('n_samples_fold', n_samples_fold)
        top_occ(feat_comb)
        for f in folds:
            top_occ(feat_comb[f])
    return folds
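
# Assumed usage (inferred from generate_compressed_trainset below): each fold is
# a plain list of sample indices, e.g.
#   folds = make_stratified_split(no_folds=5)
#   valid_idcs = folds[4]
#   train_idcs = folds[0] + folds[1] + folds[2] + folds[3]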

def investigate_labels():
    only_labels_all = get_labels_array()
    feat_comb = only_labels_all.dot(1 << np.arange(only_labels_all.shape[-1] - 1, -1, -1))
    feat_comb_set = set(feat_comb)
    print('number of combinations when all labels are present')
    print(len(feat_comb_set))
    feat_comb_occ = np.bincount(feat_comb)
    for i in range(17):
        print('cutting out label', i)
        only_labels_sel = np.copy(only_labels_all)
        only_labels_sel = np.delete(only_labels_sel, i, axis=1)
        print(only_labels_sel.shape)
        feat_comb = only_labels_sel.dot(1 << np.arange(only_labels_sel.shape[-1] - 1, -1, -1))
        feat_comb_set = set(feat_comb)
        print('number of features', len(feat_comb_set))
        feat_comb_occ = np.bincount(feat_comb)

def get_pos_neg_ids(label_id):
    labels = get_labels_array()
    pos_ids = np.where(labels[:, label_id])
    neg_ids = np.where(labels[:, label_id] == 0)
    return pos_ids[0], neg_ids[0]

def get_biases():
    df = get_labels()
    df = df.drop(['image_name'], axis=1, inplace=False)
    label_list = list(df.columns.values)
    histo = df[label_list].sum()
    biases = 1. * np.int32(histo) / len(df.index)
    return biases
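
# biases[i] is the fraction of training images that carry tag i; presumably a
# prior, e.g. for initialising output-layer biases (an assumption -- the
# callers are not in this file).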

def generate_compressed(img_ids, dataset):
    for idx, img_id in enumerate(img_ids):
        if idx % 100 == 0:
            print(idx, '/', len(img_ids))
        save_image_compressed(dataset, img_id)

def generate_compressed_trainset():
    folds = make_stratified_split(no_folds=5)
    all_ids = folds[0] + folds[1] + folds[2] + folds[3] + folds[4]
    bad_ids = [18772, 28173, 5023]  # image ids excluded from the compressed set
    img_ids = [x for x in all_ids if x not in bad_ids]
    generate_compressed(img_ids, 'train')

def generate_compressed_testset():
    test_ids = np.arange(0, 40669)
    test2_ids = np.arange(0, 20522)
    generate_compressed(test_ids, 'test')
    #generate_compressed(test2_ids, 'test2')

def f2_score(y_true, y_pred, average='samples'):
    # fbeta_score throws a confusing error if inputs are not numpy arrays
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    # We need to use average='samples' here; any other average method will generate bogus results
    return fbeta_score(y_true, y_pred, beta=2, average=average)

def f2_score_arr(y_true, y_pred, threshold=.5, average='samples'):
    y_pred = np.array(y_pred)
    y_true = np.array(y_true)
    assert len(y_pred.shape) == 2
    assert len(y_true.shape) == 2
    assert y_pred.shape[0] == y_true.shape[0]
    y_pred_cutoff = np.digitize(y_pred, [-0.01, threshold, 1.01]) - 1
    return f2_score(y_true, y_pred_cutoff, average)
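
# np.digitize with bins [-0.01, threshold, 1.01] maps predictions below the
# threshold to bin 1 and the rest to bin 2; subtracting 1 yields hard 0/1
# labels, e.g. np.digitize([0.2, 0.7], [-0.01, 0.5, 1.01]) - 1 == [0, 1].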

def _test_get_pos_neg_ids():
    for i in range(17):
        pos_ids, neg_ids = get_pos_neg_ids(i)
        print(i, len(pos_ids), len(neg_ids), len(pos_ids) + len(neg_ids))
        print(get_labels_array()[pos_ids[0]])
        print(get_labels_array()[pos_ids[1]])
        print(get_labels_array()[neg_ids[0]])
        print(get_labels_array()[neg_ids[1]])
    print('test done')
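
# Note (inferred from get_labels's column ordering): the first four entries of a
# prediction are the weather classes ('clear', 'partly_cloudy', 'haze',
# 'cloudy'), which are mutually exclusive, so apply_argmax_threshold keeps only
# their argmax and thresholds the remaining 13 tags independently at 0.5.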

def apply_argmax_threshold(prediction):
    weather_class = np.argmax(prediction[:4])
    output = np.zeros((17,))
    output[weather_class] = 1
    output[4:] = prediction[4:] > 0.5
    return output

def apply_threshold(prediction, threshold=.5):
    return prediction > threshold

def calculate_relative_occurences():
    a_labels = get_labels_array()
    print(a_labels.shape)
    abs_occ = np.sum(a_labels, axis=0)
    print(abs_occ)
    rel_occ = 1. * abs_occ / a_labels.shape[0]
    print(rel_occ)
    print(get_headers())
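
# The loss below is a weighted binary cross-entropy: with skewing_factor s,
#   mean(-s * t * log(p) - (1 - t) * log(1 - p)),
# so s > 1 penalises false negatives (missed tags) more than false positives.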

def logloss(predictions, targets, epsilon=1.e-7, skewing_factor=1.):
    preds = np.clip(predictions, epsilon, 1. - epsilon)
    weighted_bce = -skewing_factor * targets * np.log(preds) - (1 - targets) * np.log(1 - preds)
    return np.mean(weighted_bce)

def get_ids_by_tag(tag):
    la = get_labels_array()
    return np.where(la[:, tag] == 1)[0]

def remove_tags(labels, d_l2i):
    # d_l2i: dict with label id as key and a list of img ids as value
    for label_id, img_ids in d_l2i.items():
        for img_id in img_ids:
            if labels[img_id, label_id]:
                labels[img_id, label_id] = 0
            else:
                print('WARNING in remove_tags: for image', img_id,
                      'the tag for label id', label_id, 'is already 0')
    return labels

def read_img_ids_from_lst_file(path):
    # expects lines like 'train_12345.jpg' and extracts the numeric image id
    with open(path, 'r') as f:
        lst = f.readlines()
    img_ids = []
    for filename in lst:
        img_ids.append(int(filename.split('_')[1].split('.')[0]))
    return img_ids

def _test_remove_tags():
    test_labels = np.ones((10, 17))
    d_l2i = {2: [2, 8], 5: [5], 10: [0, 4, 5, 9]}
    print(remove_tags(test_labels, d_l2i))
    test_labels = np.ones((10, 17))
    test_labels[5, 5] = 0
    print(remove_tags(test_labels, d_l2i))

if __name__ == "__main__":
    # maxs = []
    # mins = []
    # shapes = set()
    # for idx in range(40476):
    #     if idx % 1000 == 0:
    #         print(idx, '/', '40476')
    #     im = read_compressed_image('train-jpg', idx)
    #     imarray = np.array(im)
    #     shapes.add(imarray.shape)
    #     maxs.append(np.amax(im))
    #     mins.append(np.amin(im))
    # print(np.amax(np.array(maxs)))
    # print(np.amin(np.array(mins)))
    # print(shapes)
    # calculate_relative_occurences()
    print(read_img_ids_from_lst_file('manual_labeling/fn_cultivation_wrongly_labeled.lst'))