#!/usr/bin/env python3
__authors__ = 'Arun Manohar'
__license__ = '3-clause BSD'
__maintainer__ = 'Arun Manohar'
__email__ = 'arunmano121@outlook.com'

# IO modules to help with reading in Matlab data
import scipy.io
# plotting tools
import matplotlib.pyplot as plt
# numerical python library to perform array operations
import numpy as np
# perform statistical operations using scipy
import scipy.stats as sp
# used to pre-filter data using wavelets
import pywt
# scitkit-learn library tools to perform machine learning operations
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
# Point class object and function to evaluate if point lies within polygon
from point_in_convex_polygon import Point, is_within_polygon


def annotate_plots(ax, defs):
'''
Annotate charts with locations of defects
Parameters
----------
ax: axis object
plot axis
defs: dict
defect parameters
Returns
-------
None
'''
# iterate through number of defects, defect index starts at 1
for i in range(1, len(defs)+1):
# mark edges of the defects
ax.plot(defs[i]['s1_vertices'], defs[i]['s2_vertices'], color='red')
# add the defect title on the plot at the mean coordinate along
# s1 and s2 axis
ax.annotate(defs[i]['name'], xy=(np.mean(defs[i]['s1_vertices']),
np.mean(defs[i]['s2_vertices'])))
return


def defect_detection_metrics(mat, mah, iso, s1_2d, s2_2d,
defs, t_stamps, t, units, plot=True):
'''
    Quantify defect detection performance and plot the results

    True-Positive Rate (TPR), False-Positive Rate (FPR) and Receiver
    Operating Characteristic (ROC) curves are calculated for the raw data,
    the Mahalanobis distance and the result of the Isolation Forest
    method. In addition, the Area Under the Curve (AUC) is calculated to
    quantify performance. Since a higher TPR is typically desired at a
    lower FPR, TPR values are also reported at 2%, 5% and 10% FPR.
    Finally, the results are presented graphically if needed.
Parameters
----------
mat: ndarray
raw data - 3D `float` array
mah: ndarray
result of performing Mahalanobis distance - 3D `float` array
iso: ndarray
result of performing Isolation Forest algorithm - 3D
`float` array
s1_2d: ndarray
2D meshgrid representation of s1 axis
s2_2d: ndarray
2D meshgrid representation of s2 axis
defs: dict
defect parameters
t_stamps: list
time stamps at which features were calculated and where
results are desired
t: list
time coordinates
units: dict
units of the different dimensions
    plot: bool
        Boolean to indicate if plots are needed to visualize the results
Returns
-------
None
'''
# total number of defects
num_defs = len(defs)
# y_truth holds the true labels of the defective areas
# location of defects are labeled with 1
y_truth = np.zeros(mat.shape)
# defect index starts from 1
for i in range(1, num_defs+1):
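        # s1_sel and s2_sel are parallel lists of pixel indices built by
        # define_defects, so this paired fancy indexing marks the
        # individual defect pixels (not their bounding box) across all
        # time slices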
y_truth[:, defs[i]['s1_sel'], defs[i]['s2_sel']] = 1
    # counter holds the running index of the time stamp being processed
    # and t_idx contains the corresponding time index in the data
for counter, t_idx in enumerate(t_stamps):
# Compute ROC curve and ROC area for raw data
fpr_mat, tpr_mat, _ = \
roc_curve(
y_truth[t_idx, :, :].ravel(), mat[t_idx, :, :].ravel())
# calculate Area Under Curve (AUC) based on FPR and TPR of raw data
auc_mat = auc(fpr_mat, tpr_mat)
# index where fpr is closest to 0.02 (2%)
fpr_p2_mat_idx = abs(np.array(fpr_mat - 0.02)).argmin()
# index where fpr is closest to 0.05 (5%)
fpr_p5_mat_idx = abs(np.array(fpr_mat - 0.05)).argmin()
# index where fpr is closest to 0.10 (10%)
fpr_p10_mat_idx = abs(np.array(fpr_mat - 0.10)).argmin()
# find tpr at those indices
tpr_p2_mat = tpr_mat[fpr_p2_mat_idx]
tpr_p5_mat = tpr_mat[fpr_p5_mat_idx]
tpr_p10_mat = tpr_mat[fpr_p10_mat_idx]
# Compute ROC curve and ROC area for outliers computed using
# Mahalanobis distance
fpr_mah, tpr_mah, _ = \
roc_curve(
y_truth[t_idx, :, :].ravel(), mah[t_idx, :, :].ravel())
# calculate AUC based on FPR and TPR over results from
# Mahalanobis distance
auc_mah = auc(fpr_mah, tpr_mah)
# index where fpr is closest to 0.02 (2%)
fpr_p2_mah_idx = abs(np.array(fpr_mah - 0.02)).argmin()
# index where fpr is closest to 0.05 (5%)
fpr_p5_mah_idx = abs(np.array(fpr_mah - 0.05)).argmin()
# index where fpr is closest to 0.10 (10%)
fpr_p10_mah_idx = abs(np.array(fpr_mah - 0.10)).argmin()
# find tpr at those indices
tpr_p2_mah = tpr_mah[fpr_p2_mah_idx]
tpr_p5_mah = tpr_mah[fpr_p5_mah_idx]
tpr_p10_mah = tpr_mah[fpr_p10_mah_idx]
# Compute ROC curve and ROC area for outliers computed using
# Isolation Forest
fpr_iso, tpr_iso, _ = \
roc_curve(
y_truth[t_idx, :, :].ravel(), iso[t_idx, :, :].ravel())
# calculate AUC based on FPR and TPR over results from
# Isolation Forest algorithm
auc_iso = auc(fpr_iso, tpr_iso)
# index where fpr is closest to 0.02 (2%)
fpr_p2_iso_idx = abs(np.array(fpr_iso - 0.02)).argmin()
# index where fpr is closest to 0.05 (5%)
fpr_p5_iso_idx = abs(np.array(fpr_iso - 0.05)).argmin()
# index where fpr is closest to 0.10 (10%)
fpr_p10_iso_idx = abs(np.array(fpr_iso - 0.10)).argmin()
# find tpr at those indices
tpr_p2_iso = tpr_iso[fpr_p2_iso_idx]
tpr_p5_iso = tpr_iso[fpr_p5_iso_idx]
tpr_p10_iso = tpr_iso[fpr_p10_iso_idx]
# Summarize results
print('\n')
print('Time Index: %d' % (t_idx))
print('FPR range:0.00-1.00 AUC Raw: %0.2f'
' AUC Mah: %0.2f AUC Iso: %0.2f'
% (auc_mat, auc_mah, auc_iso))
print('FPR:0.02 TPR Raw: %0.2f TPR Mah: %0.2f TPR Iso: %0.2f'
% (tpr_p2_mat, tpr_p2_mah, tpr_p2_iso))
print('FPR:0.05 TPR Raw: %0.2f TPR Mah: %0.2f TPR Iso: %0.2f'
% (tpr_p5_mat, tpr_p5_mah, tpr_p5_iso))
print('FPR:0.10 TPR Raw: %0.2f TPR Mah: %0.2f TPR Iso: %0.2f'
% (tpr_p10_mat, tpr_p10_mah, tpr_p10_iso))
# plot results based on plot boolean flag
if plot:
fig, ax = plt.subplots(nrows=1, ncols=3)
fig.suptitle('Result of Outlier analysis at %0.1f%s'
% (t[t_idx], units['t_units']))
# raw data
cs = ax[0].contourf(s1_2d, s2_2d, mat[t_idx, :, :])
ax[0].plot(0, 0)
ax[0].set(title='Raw data')
annotate_plots(ax[0], defs)
fig.colorbar(cs, ax=ax[0], shrink=0.33)
# outliers computed using Mahalanobis distance
cs = ax[1].contourf(s1_2d, s2_2d, mah[t_idx, :, :])
ax[1].plot(0, 0)
ax[1].set(title='Outliers using Mahalanobis distance')
annotate_plots(ax[1], defs)
fig.colorbar(cs, ax=ax[1], shrink=0.33)
# outliers computed using Isolation Forest
cs = ax[2].contourf(s1_2d, s2_2d, iso[t_idx, :, :])
ax[2].plot(0, 0)
ax[2].set(title='Isolation Forest')
annotate_plots(ax[2], defs)
fig.colorbar(cs, ax=ax[2], shrink=0.33)
plt.tight_layout()
plt.show()
# plots of ROC curves
fig, ax = plt.subplots(nrows=1, ncols=2)
fig.suptitle('ROC curves')
# FPR range 0.00-1.00
ax[0].plot(fpr_mat, tpr_mat, color='darkorange',
lw=2, label='Raw ROC curve (area = %0.2f)'
% (auc_mat))
ax[0].plot(fpr_mah, tpr_mah, color='green',
lw=2, label='Mah ROC curve (area = %0.2f)'
% (auc_mah))
ax[0].plot(fpr_iso, tpr_iso, color='red',
lw=2, label='Iso ROC curve (area = %0.2f)'
% (auc_iso))
ax[0].plot([0, 1], [0, 1], color='navy', lw=2,
linestyle='--')
ax[0].set_xlim(0.0, 1.0)
ax[0].set_ylim(0.0, 1.05)
ax[0].legend(loc='lower right')
ax[0].set_xlabel('False Positive Rate')
ax[0].set_ylabel('True Positive Rate')
# FPR range 0.00-0.10
ax[1].plot(fpr_mat, tpr_mat, color='darkorange',
lw=2, label='Raw ROC curve')
ax[1].plot(fpr_mah, tpr_mah, color='green',
lw=2, label='Mah ROC curve')
ax[1].plot(fpr_iso, tpr_iso, color='red',
lw=2, label='Iso ROC curve')
ax[1].plot([0, 1], [0, 1], color='navy', lw=2,
linestyle='--')
ax[1].set_xlim(0.0, 0.10)
ax[1].legend(loc='lower right')
ax[1].set_ylim(0.0, 1.05)
ax[1].set_xlabel('False Positive Rate')
ax[1].set_ylabel('True Positive Rate')
plt.title('ROC curve - Time: %0.1f%s'
% (t[t_idx], units['t_units']))
plt.show()
return


def scale_frames(arr, t_stamps):
'''
Scale frames between 0-1 for better interpretability
Parameters
----------
arr: ndarray
input array that needs to be scaled
t_stamps: list
time stamps at which features were calculated and where
results are desired
Returns
-------
outarr: ndarray
scaled array where the elements lie between 0-1
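
    Examples
    --------
    Illustrative sketch only - the array shape (time, s1, s2) and the
    time stamps are hypothetical:

    >>> arr = np.random.rand(100, 8, 8)
    >>> out = scale_frames(arr, t_stamps=[50, 99])
    >>> float(out[50].min()), float(out[50].max())
    (0.0, 1.0)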
'''
outarr = np.zeros(arr.shape)
# iterate through time stamps
for t_idx in t_stamps:
# slice of data at time stamp
j_mat = arr[t_idx, :, :]
# min and max of slice to help with scaling
j_min = np.min(j_mat)
j_max = np.max(j_mat)
# scale data between 0-1
outarr[t_idx, :, :] = (j_mat - j_min)/(j_max - j_min)
return outarr


def fit_isolationforest_model(features, t_stamps, pca_var):
'''
Fit Isolation Forest model
Parameters
----------
features: dict
dictionary containing all input features
t_stamps: list
time stamps at which features were calculated and where
results are desired
pca_var: float
contains the desired explained variance parameter,
if less than 1.0, PCA will be performed
Returns
-------
iso: ndarray
result of Isolation Forest model over the data
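
    Examples
    --------
    Illustrative sketch only - feature names, shapes and time stamps
    are hypothetical; pca_var=1.0 skips the PCA step:

    >>> feats = {'f1': np.random.rand(100, 8, 8),
    ...          'f2': np.random.rand(100, 8, 8)}
    >>> iso = fit_isolationforest_model(feats, t_stamps=[99], pca_var=1.0)
    >>> iso.shape
    (100, 8, 8)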
'''
# initialize the model with 15% outliers
clf = IsolationForest(contamination=0.15)
# create an empty numpy array to hold results
shape = features[list(features.keys())[0]].shape
iso = np.zeros(shape)
# create an empty numpy array to hold input features data
X = np.zeros([shape[1] * shape[2] * len(t_stamps), len(features)])
# fill the array with features at different columns
for counter, feature in enumerate(features):
X[:, counter] = np.ravel(features[feature][t_stamps, :, :])
# if PCA is required the parameter will be less than 1
if pca_var < 1.0:
# Standardizing the features before performing PCA
X = StandardScaler().fit_transform(X)
# perform PCA to reduce dimensionality
pca = PCA(n_components=pca_var)
X = pca.fit_transform(X)
# if PCA is performed
# number of components needed to explain variance
print('PCA: Explained variance: %0.2f%% \nfeatures reqd: %d' %
(pca_var*100, X.shape[1]))
# fit the Isolation Forest model
clf.fit(X)
# predict outliers
# multiply by -1 to flip labels and be consistent with mah and mat
# does not affect any other quantitative results
val = -1 * clf.decision_function(X)
iso[t_stamps, :, :] = val.reshape((len(t_stamps), shape[1], shape[2]))
return iso


def outlier_mah(features, t_stamps, pca_var):
'''
Mahalanobis distance to identify outliers
Parameters
----------
features: dict
dictionary containing all input features
t_stamps: list
time stamps at which features were calculated and where
results are desired
pca_var: float
contains the desired explained variance parameter,
if less than 1.0, PCA will be performed
Returns
-------
mah: ndarray
contains the result of computing Mahalanobis distance over
the data
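
    Notes
    -----
    For a feature vector x with sample mean mu and sample covariance C,
    the Mahalanobis distance is sqrt((x - mu)^T C^{-1} (x - mu)); large
    distances flag observations that are outliers jointly across the
    features, even when no single feature is extreme.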
'''
# create an empty numpy array to hold results
shape = features[list(features.keys())[0]].shape
mah = np.zeros(shape)
# create an empty numpy array to hold input features data
X = np.zeros([shape[1] * shape[2] * len(t_stamps), len(features)])
# fill the array with features at different columns
for counter, feature in enumerate(features):
X[:, counter] = np.ravel(features[feature][t_stamps, :, :])
# if PCA is required the parameter will be less than 1
if pca_var < 1.0:
# Standardizing the features before performing PCA
X = StandardScaler().fit_transform(X)
# perform PCA to reduce dimensionality
pca = PCA(n_components=pca_var)
X = pca.fit_transform(X)
# if PCA is performed
# number of components needed to explain variance
print('PCA: Explained variance: %0.2f%% \nfeatures reqd: %d' %
(pca_var*100, X.shape[1]))
# compute mean, covariance and covariance inverse
mu = np.mean(X, axis=0)
cov = np.cov(X, rowvar=False)
cov_inv = np.linalg.inv(cov)
# regular method - but computationally intensive
# t = np.matmul(np.matmul((X_slice - mu), cov_inv),
# np.transpose((X_slice - mu)))
# val = np.sqrt(np.diag(t))
# alternate method to speed-up significantly ~ 150x
# source - https://stackoverflow.com/questions/27686240/
# calculate-mahalanobis-distance-using-numpy-only
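    # the einsum 'nj,jk,nk->n' contracts delta[n] @ cov_inv @ delta[n]
    # for every row n in one pass, yielding the squared distance
    # (x_n - mu)^T C^{-1} (x_n - mu) directly, without forming the full
    # NxN matrix product of the regular method above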
delta = X - mu
val = np.sqrt(np.einsum('nj,jk,nk->n', delta, cov_inv, delta))
# store the Mahalanobis distance
mah[t_stamps, :, :] = val.reshape((len(t_stamps), shape[1], shape[2]))
return mah


def normalize_features(features, t_stamps):
'''
Normalize features
Parameters
----------
features: dict
dictionary containing all input features
t_stamps: list
time stamps at which features were calculated and where
results are desired
Returns
-------
features: dict
dictionary containing all normalized features
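
    Examples
    --------
    Illustrative sketch only - the feature name, shape and time stamp
    are hypothetical; after capping and scaling, the frame spans
    exactly 0-1:

    >>> feats = {'f1': np.random.rand(100, 8, 8)}
    >>> feats = normalize_features(feats, t_stamps=[99])
    >>> float(feats['f1'][99].min()), float(feats['f1'][99].max())
    (0.0, 1.0)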
'''
# iterate through the features
for feature in features:
# iterate through the time stamps
for t_idx in t_stamps:
# extract feature at timestamp
j_feature = features[feature][t_idx, :, :]
# compute the 1st and 99th percentile
p1 = np.percentile(j_feature, 1)
p99 = np.percentile(j_feature, 99)
# capping at 1st and 99th percentile to avoid extreme values
j_feature[j_feature <= p1] = p1
j_feature[j_feature >= p99] = p99
# calculate min and max of the features to scale
j_min = np.min(j_feature)
j_max = np.max(j_feature)
features[feature][t_idx, :, :] = \
(j_feature - j_min) / (j_max - j_min)
return features


def combine_features(feature_list):
'''
Combine all features from different methods into one single dict
Parameters
----------
feature_list: list
list containing all entries of input features
that need to be concatenated
Returns
-------
features: dict
feature dictionary containing all the input features
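
    Examples
    --------
    Toy scalar values for illustration - real entries are 3D ndarrays.
    On duplicate keys, entries appearing earlier in the list win:

    >>> combine_features([{'a': 1}, {'b': 2}])
    {'b': 2, 'a': 1}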
'''
# placeholder dictionary to hold all the features
features = {}
for i in range(len(feature_list)):
features = {**feature_list[i], **features}
return features


def visualize_features(mat, features, s1_2d, s2_2d, feature,
t_idx, t, units):
'''
Visualize computed features
Parameters
----------
mat: ndarray
raw data
features: dict
dictionary containing input features
s1_2d: ndarray
2D meshgrid representation of s1 axis
s2_2d: ndarray
2D meshgrid representation of s2 axis
feature: str
desired feature that needs to be visualized
    t_idx: int
        time index at which visualization is needed
    t: list
        time coordinates
    units: dict
        units of the different dimensions
Returns
-------
None
'''
fig, ax = plt.subplots(nrows=1, ncols=2)
fig.suptitle('Comparison of raw signal and feature at %0.1f%s'
% (t[t_idx], units['t_units']))
# contour plot of raw signal
ax[0].contourf(s1_2d, s2_2d, mat[t_idx, :, :])
ax[0].set(title='Raw signal')
# contour plot of feature
ax[1].contourf(s1_2d, s2_2d, features[feature][t_idx, :, :])
ax[1].set(title='Feature - ' + feature)
plt.tight_layout()
plt.show()
return


def compute_features_wav(mat, t_stamps):
'''
Calculates wavelet transformed features at every location
Parameters
----------
mat: ndarray
raw data
t_stamps: list
time stamps at which wavelet features are calculated
Returns
-------
features_wav: dict
dictionary containing wavelet features
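
    Notes
    -----
    Each sub-band is reconstructed in isolation by passing None (treated
    as an all-zero coefficient array by pywt.idwt2) for the remaining
    coefficients; by linearity of the inverse transform, the four
    reconstructions sum back to the (trimmed) reconstruction of the
    input frame.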
'''
# initialize an empty dictionary to hold and return all features
features_wav = {}
# ll is reconstructed from approximation
# lh is reconstructed from s1 direction detail
# hl is reconstructed from s2 direction detail
# hh is reconstructed from diagonal detail
features_wav['ll'] = np.zeros(mat.shape)
features_wav['lh'] = np.zeros(mat.shape)
features_wav['hl'] = np.zeros(mat.shape)
features_wav['hh'] = np.zeros(mat.shape)
# iterating through the time stamps
for t_idx in t_stamps:
# wavelet decomposition
LL, (LH, HL, HH) = pywt.dwt2(mat[t_idx, :, :], 'bior1.3')
# inverse wavelet transform
features_wav['ll'][t_idx, :, :] = \
pywt.idwt2((LL, (None, None, None)), 'bior1.3',
mode='symmetric')[1:, :]
features_wav['lh'][t_idx, :, :] = \
pywt.idwt2((None, (LH, None, None)), 'bior1.3',
mode='symmetric')[1:, :]
features_wav['hl'][t_idx, :, :] = \
pywt.idwt2((None, (None, HL, None)), 'bior1.3',
mode='symmetric')[1:, :]
features_wav['hh'][t_idx, :, :] = \
pywt.idwt2((None, (None, None, HH)), 'bior1.3',
mode='symmetric')[1:, :]
return features_wav


def compute_features_td(mat, t_stamps):
'''
    Calculates temporal features at every spatial location
Parameters
----------
mat: ndarray
raw data
t_stamps: list
time stamps at which time domain features are calculated
Returns
-------
features_td: dict
dictionary containing time domain features
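
    Notes
    -----
    Entries in t_stamps should be at least 100 so that the trailing 10-
    and 100-observation windows are fully populated; a slice such as
    mat[t_idx-100:t_idx] with t_idx < 100 has a negative start that
    wraps around and silently yields an empty or misaligned window.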
'''
# all these time domain features are calculated at every spatial point
# in the time domain
# initialize an empty dictionary to hold and return all features
features_td = {}
    # a small constant to prevent division by zero
eps = 1e-7
# z val compares the distance of current obs from the mean of
# 10 prior or 100 prior obs
features_td['val_z10'] = np.zeros(mat.shape)
features_td['val_z100'] = np.zeros(mat.shape)
# ratio of current obs compared to median of 10 or 100 prior obs
features_td['val_med10'] = np.zeros(mat.shape)
features_td['val_med100'] = np.zeros(mat.shape)
# ratio of current obs compared to mean of 10 or 100 prior obs
features_td['val_mean10'] = np.zeros(mat.shape)
features_td['val_mean100'] = np.zeros(mat.shape)
# ratio of mean of 10 obs to mean of 100 obs
features_td['rat_mean_10_100'] = np.zeros(mat.shape)
# ratio of stdev of 10 obs to stdev of 100 obs
features_td['rat_std_10_100'] = np.zeros(mat.shape)
# ratio of median of 10 obs to median of 100 obs
features_td['rat_med_10_100'] = np.zeros(mat.shape)
# ratio of skew of 10 obs to skew of 100 obs
features_td['rat_skew_10_100'] = np.zeros(mat.shape)
# ratio of kurtosis of 10 obs to kurtosis of 100 obs
features_td['rat_kurtosis_10_100'] = np.zeros(mat.shape)
# ratio of median of 10 obs to mean of 10 obs
features_td['rat_med_mean_10'] = np.zeros(mat.shape)
# ratio of median of 100 obs to mean of 100 obs
features_td['rat_med_mean_100'] = np.zeros(mat.shape)
# ratio of skew of 10 obs to kurtosis of 10 obs
features_td['rat_skew_kurtosis_10'] = np.zeros(mat.shape)
# ratio of skew of 100 obs to kurtosis of 100 obs
features_td['rat_skew_kurtosis_100'] = np.zeros(mat.shape)
# uniform average value of the 5 most recent obs
features_td['smooth_uni_5'] = np.zeros(mat.shape)
    # weighted average value of the 5 most recent obs
features_td['smooth_wt_5'] = np.zeros(mat.shape)
# uniform average value of the 25 most recent obs
features_td['smooth_uni_25'] = np.zeros(mat.shape)
# iterating through the other time stamps
for t_idx in t_stamps:
j_slice = mat[t_idx, :, :] # current time slice
j_mean10 = np.mean(mat[t_idx-10:t_idx, :, :], axis=0)
j_std10 = np.std(mat[t_idx-10:t_idx, :, :], axis=0)
j_med10 = np.median(mat[t_idx-10:t_idx, :, :], axis=0)
j_mean100 = np.mean(mat[t_idx-100:t_idx, :, :], axis=0)
j_std100 = np.std(mat[t_idx-100:t_idx, :, :], axis=0)
j_med100 = np.median(mat[t_idx-100:t_idx, :, :], axis=0)
j_skew10 = sp.skew(mat[t_idx-10:t_idx, :, :], axis=0)
j_skew100 = sp.skew(mat[t_idx-100:t_idx, :, :], axis=0)
j_kurtosis10 = sp.kurtosis(mat[t_idx-10:t_idx, :, :], axis=0)
j_kurtosis100 = sp.kurtosis(mat[t_idx-100:t_idx, :, :], axis=0)
# adding eps to avoid division by 0
j_mean10 += eps
j_std10 += eps
j_med10 += eps
j_mean100 += eps
j_std100 += eps
j_med100 += eps
j_skew10 += eps
j_skew100 += eps
j_kurtosis10 += eps
j_kurtosis100 += eps
features_td['val_z10'][t_idx, :, :] = (j_slice - j_mean10) / j_std10
features_td['val_z100'][t_idx, :, :] = \
(j_slice - j_mean100) / j_std100
features_td['val_med10'][t_idx, :, :] = j_slice / j_med10
features_td['val_med100'][t_idx, :, :] = j_slice / j_med100
features_td['val_mean10'][t_idx, :, :] = j_slice / j_mean10
features_td['val_mean100'][t_idx, :, :] = j_slice / j_mean100
features_td['rat_mean_10_100'][t_idx, :, :] = j_mean10 / j_mean100
features_td['rat_std_10_100'][t_idx, :, :] = j_std10 / j_std100
features_td['rat_med_10_100'][t_idx, :, :] = j_med10 / j_med100
features_td['rat_skew_10_100'][t_idx, :, :] = j_skew10 / j_skew100
features_td['rat_kurtosis_10_100'][t_idx, :, :] = \
j_kurtosis10 / j_kurtosis100
features_td['rat_med_mean_10'][t_idx, :, :] = j_med10 / j_mean10
features_td['rat_med_mean_100'][t_idx, :, :] = j_med100 / j_mean100
features_td['rat_skew_kurtosis_10'][t_idx, :, :] = \
j_skew10 / j_kurtosis10
features_td['rat_skew_kurtosis_100'][t_idx, :, :] = \
j_skew100 / j_kurtosis100
features_td['smooth_uni_5'][t_idx, :, :] = \
np.mean(mat[t_idx-5:t_idx, :, :], axis=0)
        features_td['smooth_wt_5'][t_idx, :, :] = \
            0.4*mat[t_idx, :, :] + 0.25*mat[t_idx-1, :, :] + \
            0.15*mat[t_idx-2, :, :] + 0.1*mat[t_idx-3, :, :] + \
            0.1*mat[t_idx-4, :, :]
features_td['smooth_uni_25'][t_idx, :, :] = \
np.mean(mat[t_idx-25:t_idx, :, :], axis=0)
return features_td


def compute_features_sd(mat, t_stamps):
'''
Calculates spatial features at every location and time stamp
Parameters
----------
mat: ndarray
raw data
t_stamps: list
time stamps at which time domain features are calculated
Returns
-------
features_sd: dict
dictionary containing spatial domain features
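
    Notes
    -----
    The 3x3 and 5x5 medians are only computed where the full window fits
    inside the frame, so borders of 1 and 2 pixels respectively are left
    at zero.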
'''
# all these features are calculated in the spatial domain at every point
# initialize an empty dictionary to hold and return all features
features_sd = {}
# z val calculates the distance of obs from the mean of other obs
# present at same s1
features_sd['val_z_s1'] = np.zeros(mat.shape)
# z val calculates the distance of obs from the mean of other obs
# present at same s2
features_sd['val_z_s2'] = np.zeros(mat.shape)
    # median of obs in a surrounding 3x3 grid
    features_sd['med_3x3'] = np.zeros(mat.shape)
    # median of obs in a surrounding 5x5 grid
    features_sd['med_5x5'] = np.zeros(mat.shape)
# iterate through different time stamps
for t_idx in t_stamps:
j_mean_s1 = np.mean(mat[t_idx, :, :], axis=1)
j_mean_s2 = np.mean(mat[t_idx, :, :], axis=0)
j_std_s1 = np.std(mat[t_idx, :, :], axis=1)
j_std_s2 = np.std(mat[t_idx, :, :], axis=0)
# Add an extra dimension in the last axis
j_mean_s1 = np.expand_dims(j_mean_s1, axis=1)
j_mean_s2 = np.expand_dims(j_mean_s2, axis=0)
j_std_s1 = np.expand_dims(j_std_s1, axis=1)
j_std_s2 = np.expand_dims(j_std_s2, axis=0)
features_sd['val_z_s1'][t_idx, :, :] = \
(mat[t_idx, :, :] - j_mean_s1) / j_std_s1
features_sd['val_z_s2'][t_idx, :, :] = \
(mat[t_idx, :, :] - j_mean_s2) / j_std_s2
for s1_idx in range(1, mat.shape[1]-1):
for s2_idx in range(1, mat.shape[2]-1):
                features_sd['med_3x3'][t_idx, s1_idx, s2_idx] = \
                    np.median(mat[t_idx,
                                  s1_idx-1:s1_idx+2,
                                  s2_idx-1:s2_idx+2])
for s1_idx in range(2, mat.shape[1]-2):
for s2_idx in range(2, mat.shape[2]-2):
                features_sd['med_5x5'][t_idx, s1_idx, s2_idx] = \
                    np.median(mat[t_idx,
                                  s1_idx-2:s1_idx+3,
                                  s2_idx-2:s2_idx+3])
return features_sd


def compute_features_grad(mat):
'''
Calculates spatial and temporal gradients
Parameters
----------
mat: ndarray
raw data
Returns
-------
features_grad: dict
dictionary containing spatial and temporal gradient features
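
    Examples
    --------
    Illustrative sketch on a toy 3x1x1 array; note that np.roll wraps
    around, so the last index along each axis holds a boundary artifact:

    >>> g = compute_features_grad(np.arange(3.0).reshape(3, 1, 1))
    >>> g['t_grad'].ravel().tolist()
    [1.0, 1.0, -2.0]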
'''
# initialize an empty dictionary to hold and return all features
features_grad = {}
# first derivative
features_grad['t_grad'] = np.roll(mat, -1, axis=0) - mat
features_grad['s1_grad'] = np.roll(mat, -1, axis=1) - mat
features_grad['s2_grad'] = np.roll(mat, -1, axis=2) - mat
# second derivative
    features_grad['t_grad2'] = \
        np.roll(features_grad['t_grad'], -1, axis=0) - \
        features_grad['t_grad']
    features_grad['s1_grad2'] = \
        np.roll(features_grad['s1_grad'], -1, axis=1) - \
        features_grad['s1_grad']
    features_grad['s2_grad2'] = \
        np.roll(features_grad['s2_grad'], -1, axis=2) - \
        features_grad['s2_grad']
return features_grad


def define_defects(s1, s2, defs_coord, def_names):
'''
Define coordinates of defects
Parameters
----------
s1: list
spatial axis 1
s2: list
spatial axis 2
defs_coord: list
list containing all defects - each defect contains
a list of tuples containing the vertices of defect
def_names: dict
dictionary containing the names of defects
Returns
-------
defs: dict
dictionary containing all the necessary parameters of
all the defined defects
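
    Examples
    --------
    Hypothetical coordinates for a single triangular defect on a
    10x10 grid:

    >>> defs = define_defects(s1=list(range(10)), s2=list(range(10)),
    ...                       defs_coord=[[(2, 2), (5, 2), (5, 5)]],
    ...                       def_names=['D1'])
    >>> defs[1]['name']
    'D1'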
'''
# initialize empty dictionary to hold defect coordinates
defs = {}
    # any number of defects can be set up within the defects dictionary;
    # each defect key is a unique identifier
# defs[n:{...}] contains the parameters of defect 'n'
# defs[n:{'polygon':}] contains the defect's Point object definition
# defs[n:{'s1_sel':}] contains the selected s1 coordinates of defect
# defs[n:{'s2_sel':}] contains the selected s2 coordinates of defect
# defs[n:{'s1_vertices':}] contains the s1 vertices of defect
# defs[n:{'s2_vertices':}] contains the s2 vertices of defect
# defs[n:{'name':}] contains the names of defect
# iterate through the defects
for i in range(1, len(defs_coord) + 1):
defs[i] = {}
defs[i]['polygon'] = [Point(x, y) for x, y in defs_coord[i-1]]
defs[i]['s1_sel'] = []
defs[i]['s2_sel'] = []
# extract vertices of defects in s1 and s2 coordinates
defs[i]['s1_vertices'], defs[i]['s2_vertices'] = \
map(list, zip(*defs_coord[i-1]))
# add the first element to the tail of dictionary to aid with plots
defs[i]['s1_vertices'].append(defs[i]['s1_vertices'][0])
defs[i]['s2_vertices'].append(defs[i]['s2_vertices'][0])
defs[i]['name'] = def_names[i-1]
# iterate through the spatial coordinates to identify which pixels
    # belong to defects
for s1_idx in range(len(s1)):
for s2_idx in range(len(s2)):
p = Point(s1[s1_idx], s2[s2_idx])
# iterate through the defects
for i in range(1, len(defs)+1):
if is_within_polygon(defs[i]['polygon'], p):
defs[i]['s1_sel'].append(s1_idx)
defs[i]['s2_sel'].append(s2_idx)
return defs


def mean_filter(mat, t, s1, s2, units, size, plot_sample):
'''
Performs mean filtering at each location
Parameters
----------
mat: ndarray
raw data
t: list
time axis
s1: list
spatial axis 1
s2: list
spatial axis 2
units: dict
units of the different dimensions
size: int
number of elements to use in the mean filter. The higher,
        the more aggressive the filtering
    plot_sample: bool
        Boolean to indicate if time series plots are
        needed to compare raw and filtered data
Returns
-------
filt_mat: ndarray
mean filtered raw data based on kernel size
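
    Examples
    --------
    Illustrative sketch only - shapes, axes and kernel size are
    hypothetical; size=5 applies the uniform kernel [0.2] * 5 along time:

    >>> mat = np.random.rand(100, 4, 4)
    >>> filt = mean_filter(mat, t=list(range(100)), s1=[0.0] * 4,
    ...                    s2=[0.0] * 4, units={}, size=5,
    ...                    plot_sample=False)
    >>> filt.shape
    (100, 4, 4)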
'''
filt_mat = np.zeros(mat.shape)
kernel = [1/size for i in range(size)]
for s1_idx in range(mat.shape[1]):
for s2_idx in range(mat.shape[2]):
filt_mat[:, s1_idx, s2_idx] = \
np.convolve(mat[:, s1_idx, s2_idx], kernel, 'same')
# compares the effect of filtering
if plot_sample:
# pick random spatial coordinate along s1 and s2 axes
s1_coord = np.random.randint(low=0, high=len(s1), size=1)
s2_coord = np.random.randint(low=0, high=len(s2), size=1)
# extract time series signals at the random coordinate
sig1 = mat[:, s1_coord[0], s2_coord[0]]
sig2 = filt_mat[:, s1_coord[0], s2_coord[0]]
fig, ax = plt.subplots(nrows=1, ncols=1)
fig.suptitle('Time series')
ax.plot(t, sig1, label='Raw time series')
ax.plot(t, sig2, label='Filtered time series')
ax.legend()
ax.set(xlabel='Time[%s]' % (units['t_units']), ylabel='Signal',
title='s1=%0.1f%s, s2=%0.1f%s' %
(s1[s1_coord[0]], units['s1_units'],
s2[s2_coord[0]], units['s2_units']))
ax.grid()
plt.tight_layout()
plt.show()
return filt_mat


def visualize_spatial_data(mat, t, s1_2d, s2_2d,
t_min_idx, t_max_idx, del_t_idx, units):
'''
Visualize spatial slices of data at certain time stamps
Parameters
----------
mat: ndarray
raw data
t: list
time axis
s1_2d: ndarray
2D meshgrid representation of s1 axis
s2_2d: ndarray
2D meshgrid representation of s2 axis
t_min_idx: int
lower bound time index for visualization
t_max_idx: int
upper bound time index for visualization
del_t_idx: int
time index steps for visualization
units: dict