% anomaly-detection.bib
@article{bolton_statistical_2002,
title = {Statistical {Fraud} {Detection}: {A} {Review}},
volume = {17},
number = {3},
journal = {Statistical Science},
author = {Bolton, Richard J. and Hand, David J.},
year = {2002},
keywords = {survey},
pages = {235--255},
file = {Bolton_Hand_2002_Statistical Fraud Detection.pdf:/home/roland/Zotero/storage/7NSHS3NQ/Bolton_Hand_2002_Statistical Fraud Detection.pdf:application/pdf}
}
@article{chandola_anomaly_2009,
title = {Anomaly detection: {A} survey},
volume = {41},
number = {3},
journal = {ACM Computing Surveys},
author = {Chandola, Varun and Banerjee, Arindam and Kumar, Vipin},
year = {2009},
keywords = {survey},
pages = {15:1--15:58},
file = {Chandola et al_2009_Anomaly detection.pdf:/home/roland/Zotero/storage/UHCR792T/Chandola et al_2009_Anomaly detection.pdf:application/pdf}
}
@article{chandola_anomaly_2012,
title = {Anomaly {Detection} for {Discrete} {Sequences}: {A} {Survey}},
volume = {24},
number = {5},
journal = {IEEE Transactions on Knowledge and Data Engineering},
author = {Chandola, Varun and Banerjee, Arindam and Kumar, Vipin},
year = {2012},
keywords = {survey},
pages = {823--839},
file = {Chandola et al_2012_Anomaly Detection for Discrete Sequences.pdf:/home/roland/Zotero/storage/URQI7P8K/Chandola et al_2012_Anomaly Detection for Discrete Sequences.pdf:application/pdf}
}
@article{lane_sequence_nodate,
title = {Sequence {Matching} and {Learning} in {Anomaly} {Detection} for {Computer} {Security}},
abstract = {Two problems of importance in computer security are to 1) detect the presence of an intruder masquerading as the valid user and 2) detect the perpetration of abusive actions on the part of an otherwise innocuous user. We have developed an approach to these problems that examines sequences of user actions (UNIX commands) to classify behavior as normal or anomalous. In this paper we explore the matching function needed to compare a current behavioral sequence to a historical profile. We discuss the difficulties of performing matching in human-generated data and show that exact string matching is insufficient to this domain. We demonstrate a number of partial matching functions and examine their behavior on user command data. In particular, we explore two methods for weighting scores by adjacency of matches as well as two growth functions (polynomial and exponential) for scoring similarities. We find, empirically, that the optimal similarity measure is user dependent but that measures based on the assumption of causal linkage between user commands are superior for this domain.},
language = {en},
author = {Lane, Terran and Brodley, Carla E},
pages = {7},
file = {Lane_Brodley_Sequence Matching and Learning in Anomaly Detection for Computer Security.pdf:/home/roland/Zotero/storage/726N79DP/Lane_Brodley_Sequence Matching and Learning in Anomaly Detection for Computer Security.pdf:application/pdf}
}
@inproceedings{wang_statistical_2011,
address = {Dublin, Ireland},
title = {Statistical {Techniques} for {Online} {Anomaly} {Detection} in {Data} {Centers}},
isbn = {978-1-4244-9219-0},
url = {http://ieeexplore.ieee.org/document/5990537/},
doi = {10.1109/INM.2011.5990537},
abstract = {Online anomaly detection is an important step in data center management, requiring light-weight techniques that provide sufficient accuracy for subsequent diagnosis and management actions. This paper presents statistical techniques based on the Tukey and Relative Entropy statistics, and applies them to data collected from a production environment and to data captured from a testbed for multi-tier web applications running on server class machines. The proposed techniques are lightweight and improve over standard Gaussian assumptions in terms of performance.},
language = {en},
urldate = {2019-09-19},
booktitle = {12th {IFIP}/{IEEE} {International} {Symposium} on {Integrated} {Network} {Management} ({IM} 2011) and {Workshops}},
publisher = {IEEE},
author = {Wang, Chengwei and Viswanathan, Krishnamurthy and Choudur, Lakshminarayan and Talwar, Vanish and Satterfield, Wade and Schwan, Karsten},
month = may,
year = {2011},
pages = {385--392},
file = {Wang et al_2011_Statistical techniques for online anomaly detection in data centers.pdf:/home/roland/Zotero/storage/2T5XBMFF/Wang et al_2011_Statistical techniques for online anomaly detection in data centers.pdf:application/pdf}
}
@incollection{hutchison_anomalous_2004,
address = {Berlin, Heidelberg},
title = {Anomalous {Payload}-{Based} {Network} {Intrusion} {Detection}},
volume = {3224},
isbn = {978-3-540-23123-3 978-3-540-30143-1},
url = {http://link.springer.com/10.1007/978-3-540-30143-1_11},
abstract = {We present a payload-based anomaly detector, we call PAYL, for intrusion detection. PAYL models the normal application payload of network traffic in a fully automatic, unsupervised and very efficient fashion. We first compute during a training phase a profile byte frequency distribution and their standard deviation of the application payload flowing to a single host and port. We then use Mahalanobis distance during the detection phase to calculate the similarity of new data against the pre-computed profile. The detector compares this measure against a threshold and generates an alert when the distance of the new input exceeds this threshold. We demonstrate the surprising effectiveness of the method on the 1999 DARPA IDS dataset and a live dataset we collected on the Columbia CS department network. In one case nearly 100\% accuracy is achieved with 0.1\% false positive rate for port 80 traffic.},
language = {en},
urldate = {2019-09-19},
booktitle = {Recent {Advances} in {Intrusion} {Detection}},
publisher = {Springer Berlin Heidelberg},
author = {Wang, Ke and Stolfo, Salvatore J.},
editor = {Hutchison, David and Kanade, Takeo and Kittler, Josef and Kleinberg, Jon M. and Mattern, Friedemann and Mitchell, John C. and Naor, Moni and Nierstrasz, Oscar and Pandu Rangan, C. and Steffen, Bernhard and Sudan, Madhu and Terzopoulos, Demetri and Tygar, Doug and Vardi, Moshe Y. and Weikum, Gerhard and Jonsson, Erland and Valdes, Alfonso and Almgren, Magnus},
year = {2004},
doi = {10.1007/978-3-540-30143-1_11},
pages = {203--222},
file = {Wang_Stolfo_2004_Anomalous Payload-Based Network Intrusion Detection.pdf:/home/roland/Zotero/storage/KSEGUYLA/Wang_Stolfo_2004_Anomalous Payload-Based Network Intrusion Detection.pdf:application/pdf}
}
@article{talagala_anomaly_2019,
title = {Anomaly {Detection} in {Streaming} {Nonstationary} {Temporal} {Data}},
issn = {1061-8600, 1537-2715},
url = {https://www.tandfonline.com/doi/full/10.1080/10618600.2019.1617160},
doi = {10.1080/10618600.2019.1617160},
abstract = {This article proposes a framework that provides early detection of anomalous series within a large collection of nonstationary streaming time-series data. We define an anomaly as an observation that is very unlikely given the recent distribution of a given system. The proposed framework first calculates a boundary for the system’s typical behavior using extreme value theory. Then a sliding window is used to test for anomalous series within a newly arrived collection of series. The model uses time series features as inputs, and a density-based comparison to detect any significant changes in the distribution of the features. Using various synthetic and real world datasets, we demonstrate the wide applicability and usefulness of our proposed framework. We show that the proposed algorithm can work well in the presence of noisy nonstationary data within multiple classes of time series. This framework is implemented in the open source R package oddstream. R code and data are available in the online supplementary materials.},
language = {en},
urldate = {2019-09-19},
journal = {Journal of Computational and Graphical Statistics},
author = {Talagala, Priyanga Dilini and Hyndman, Rob J. and Smith-Miles, Kate and Kandanaarachchi, Sevvandi and Muñoz, Mario A.},
month = jun,
year = {2019},
pages = {1--21},
file = {Talagala et al_2019_Anomaly Detection in Streaming Nonstationary Temporal Data.pdf:/home/roland/Zotero/storage/FFG2GNDH/Talagala et al_2019_Anomaly Detection in Streaming Nonstationary Temporal Data.pdf:application/pdf}
}
@inproceedings{dos_santos_teixeira_data_2010,
address = {Sierre, Switzerland},
title = {Data {Stream} {Anomaly} {Detection} through {Principal} {Subspace} {Tracking}},
isbn = {978-1-60558-639-7},
url = {http://portal.acm.org/citation.cfm?doid=1774088.1774434},
doi = {10.1145/1774088.1774434},
abstract = {We consider the problem of anomaly detection in multiple co-evolving data streams. In this paper, we introduce FRAHST (Fast Rank-Adaptive row-Householder Subspace Tracking). It automatically learns the principal subspace from N numerical data streams and an anomaly is indicated by a change in the number of latent variables. Our technique provides state-of-the-art estimates for the subspace basis and has a true dominant complexity of only 5Nr operations while satisfying all desirable streaming constraints. FRAHST successfully detects subtle anomalous patterns and when compared against four other anomaly detection techniques, it is the only one with a consistent F1 ≥ 80\% in the Abilene datasets as well as in the ISP datasets introduced in this work.},
language = {en},
urldate = {2019-09-19},
booktitle = {Proceedings of the 2010 {ACM} {Symposium} on {Applied} {Computing} - {SAC} '10},
publisher = {ACM Press},
author = {dos Santos Teixeira, Pedro Henriques and Milidiú, Ruy Luiz},
year = {2010},
pages = {1609},
file = {dos Santos Teixeira_Milidiú_2010_Data stream anomaly detection through principal subspace tracking.pdf:/home/roland/Zotero/storage/D7ZHPBZ5/dos Santos Teixeira_Milidiú_2010_Data stream anomaly detection through principal subspace tracking.pdf:application/pdf}
}
@inproceedings{zheng_detecting_2015,
address = {Bellevue, Washington},
title = {Detecting collective anomalies from multiple spatio-temporal datasets across different domains},
isbn = {978-1-4503-3967-4},
url = {http://dl.acm.org/citation.cfm?doid=2820783.2820813},
doi = {10.1145/2820783.2820813},
abstract = {The collective anomaly denotes a collection of nearby locations that are anomalous during a few consecutive time intervals in terms of phenomena collectively witnessed by multiple datasets. The collective anomalies suggest there are underlying problems that may not be identified based on a single data source or in a single location. It also associates individual locations and time intervals, formulating a panoramic view of an event. To detect a collective anomaly is very challenging, however, as different datasets have different densities, distributions, and scales. Additionally, to find the spatio-temporal scope of a collective anomaly is time consuming as there are many ways to combine regions and time slots. Our method consists of three components: Multiple-Source Latent-Topic (MSLT) model, Spatio-Temporal Likelihood Ratio Test (ST\_LRT) model, and a candidate generation algorithm. MSLT combines multiple datasets to infer the latent functions of a geographic region in the framework of a topic model. This in turn helps estimate the underlying distribution of a sparse dataset generated in the region. ST\_LRT learns a proper underlying distribution for different datasets, and calculates an anomalous degree for each dataset based on a likelihood ratio test (LRT). It then aggregates the anomalous degrees of different datasets, using a skyline detection algorithm. We evaluate our method using five datasets related to New York City (NYC): 311 complaints, taxicab data, bike rental data, points of interest, and road network data, finding the anomalies that cannot be identified (or earlier than those detected) by a single dataset. Results show the advantages beyond six baseline methods.},
language = {en},
urldate = {2019-09-19},
booktitle = {Proceedings of the 23rd {SIGSPATIAL} {International} {Conference} on {Advances} in {Geographic} {Information} {Systems} - {GIS} '15},
publisher = {ACM Press},
author = {Zheng, Yu and Zhang, Huichu and Yu, Yong},
year = {2015},
pages = {1--10},
file = {Zheng et al_2015_Detecting collective anomalies from multiple spatio-temporal datasets across.pdf:/home/roland/Zotero/storage/ZEJQRRF8/Zheng et al_2015_Detecting collective anomalies from multiple spatio-temporal datasets across.pdf:application/pdf}
}
@article{schneider_expected_2016,
title = {Expected {Similarity} {Estimation} for {Large}-{Scale} {Batch} and {Streaming} {Anomaly} {Detection}},
volume = {105},
issn = {0885-6125, 1573-0565},
url = {http://arxiv.org/abs/1601.06602},
doi = {10.1007/s10994-016-5567-7},
abstract = {We present a novel algorithm for anomaly detection on very large datasets and data streams. The method, named EXPected Similarity Estimation (EXPoSE), is kernel-based and able to efficiently compute the similarity between new data points and the distribution of regular data. The estimator is formulated as an inner product with a reproducing kernel Hilbert space embedding and makes no assumption about the type or shape of the underlying data distribution. We show that offline (batch) learning with EXPoSE can be done in linear time and online (incremental) learning takes constant time per instance and model update. Furthermore, EXPoSE can make predictions in constant time, while it requires only constant memory. In addition, we propose different methodologies for concept drift adaptation on evolving data streams. On several real datasets we demonstrate that our approach can compete with state of the art algorithms for anomaly detection while being an order of magnitude faster than most other approaches.},
language = {en},
number = {3},
urldate = {2019-09-19},
journal = {Machine Learning},
author = {Schneider, Markus and Ertel, Wolfgang and Ramos, Fabio},
month = dec,
year = {2016},
note = {arXiv: 1601.06602},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning},
pages = {305--333},
file = {Schneider et al_2016_Expected Similarity Estimation for Large-Scale Batch and Streaming Anomaly.pdf:/home/roland/Zotero/storage/J53FKASB/Schneider et al_2016_Expected Similarity Estimation for Large-Scale Batch and Streaming Anomaly.pdf:application/pdf}
}
@inproceedings{lavin_evaluating_2015,
address = {Miami, FL, USA},
title = {Evaluating {Real}-{Time} {Anomaly} {Detection} {Algorithms} -- {The} {Numenta} {Anomaly} {Benchmark}},
isbn = {978-1-5090-0287-0},
url = {http://ieeexplore.ieee.org/document/7424283/},
doi = {10.1109/ICMLA.2015.141},
abstract = {Much of the world’s data is streaming, time-series data, where anomalies give significant information in critical situations; examples abound in domains such as finance, IT, security, medical, and energy. Yet detecting anomalies in streaming data is a difficult task, requiring detectors to process data in real-time, not batches, and learn while simultaneously making predictions. There are no benchmarks to adequately test and score the efficacy of real-time anomaly detectors. Here we propose the Numenta Anomaly Benchmark (NAB), which attempts to provide a controlled and repeatable environment of open-source tools to test and measure anomaly detection algorithms on streaming data. The perfect detector would detect all anomalies as soon as possible, trigger no false alarms, work with real-world time-series data across a variety of domains, and automatically adapt to changing statistics. Rewarding these characteristics is formalized in NAB, using a scoring algorithm designed for streaming data. NAB evaluates detectors on a benchmark dataset with labeled, real-world time-series data. We present these components, and give results and analyses for several open source, commercially-used algorithms. The goal for NAB is to provide a standard, open source framework with which the research community can compare and evaluate different algorithms for detecting anomalies in streaming data.},
language = {en},
urldate = {2019-09-19},
booktitle = {2015 {IEEE} 14th {International} {Conference} on {Machine} {Learning} and {Applications} ({ICMLA})},
publisher = {IEEE},
author = {Lavin, Alexander and Ahmad, Subutai},
month = dec,
year = {2015},
pages = {38--44},
file = {Lavin_Ahmad_2015_Evaluating Real-Time Anomaly Detection Algorithms -- The Numenta Anomaly.pdf:/home/roland/Zotero/storage/KDJRA5EU/Lavin_Ahmad_2015_Evaluating Real-Time Anomaly Detection Algorithms -- The Numenta Anomaly.pdf:application/pdf}
}
@article{adams_bayesian_2007,
title = {Bayesian {Online} {Changepoint} {Detection}},
url = {http://arxiv.org/abs/0710.3742},
abstract = {Changepoints are abrupt variations in the generative parameters of a data sequence. Online detection of changepoints is useful in modelling and prediction of time series in application areas such as finance, biometrics, and robotics. While frequentist methods have yielded online filtering and prediction techniques, most Bayesian papers have focused on the retrospective segmentation problem. Here we examine the case where the model parameters before and after the changepoint are independent and we derive an online algorithm for exact inference of the most recent changepoint. We compute the probability distribution of the length of the current “run,” or time since the last changepoint, using a simple message-passing algorithm. Our implementation is highly modular so that the algorithm may be applied to a variety of types of data. We illustrate this modularity by demonstrating the algorithm on three different real-world data sets.},
language = {en},
urldate = {2019-09-19},
journal = {arXiv:0710.3742 [stat]},
author = {Adams, Ryan Prescott and MacKay, David J. C.},
month = oct,
year = {2007},
note = {arXiv: 0710.3742},
keywords = {Statistics - Machine Learning},
file = {Adams_MacKay_2007_Bayesian Online Changepoint Detection.pdf:/home/roland/Zotero/storage/48AQDRCC/Adams_MacKay_2007_Bayesian Online Changepoint Detection.pdf:application/pdf}
}
@article{ding_anomaly_2013,
title = {An {Anomaly} {Detection} {Approach} {Based} on {Isolation} {Forest} {Algorithm} for {Streaming} {Data} using {Sliding} {Window}},
volume = {46},
issn = {14746670},
url = {https://linkinghub.elsevier.com/retrieve/pii/S1474667016314999},
doi = {10.3182/20130902-3-CN-3020.00044},
abstract = {Anomalous behavior detection in many applications is becoming more and more important, such as computer security, sensor network and so on. However, the inherent characteristics of streaming data, such as generated quickly, data infinite, tremendous volume and the phenomenon of concept drift, imply that anomaly detection in streaming data is a challenging task. In this paper, using the frame of sliding windows and taking into account the concept drift phenomenon, a novel anomaly detection framework is presented and an adapted streaming data anomaly detection algorithm based on the iForest algorithm, namely iForestASD, is proposed. The experiment results performed on four real-world datasets derived from the UCI repository demonstrate that the proposed algorithm can effectively detect anomalous instances for streaming data.},
language = {en},
number = {20},
urldate = {2019-09-19},
journal = {IFAC Proceedings Volumes},
author = {Ding, Zhiguo and Fei, Minrui},
year = {2013},
pages = {12--17},
file = {Ding_Fei_2013_An Anomaly Detection Approach Based on Isolation Forest Algorithm for Streaming.pdf:/home/roland/Zotero/storage/4SAKXG5K/Ding_Fei_2013_An Anomaly Detection Approach Based on Isolation Forest Algorithm for Streaming.pdf:application/pdf}
}
@article{hill_anomaly_2010,
title = {Anomaly detection in streaming environmental sensor data: {A} data-driven modeling approach},
volume = {25},
issn = {13648152},
shorttitle = {Anomaly detection in streaming environmental sensor data},
url = {https://linkinghub.elsevier.com/retrieve/pii/S1364815209002321},
doi = {10.1016/j.envsoft.2009.08.010},
abstract = {The deployment of environmental sensors has generated an interest in real-time applications of the data they collect. This research develops a real-time anomaly detection method for environmental data streams that can be used to identify data that deviate from historical patterns. The method is based on an autoregressive data-driven model of the data stream and its corresponding prediction interval. It performs fast, incremental evaluation of data as it becomes available, scales to large quantities of data, and requires no pre-classification of anomalies. Furthermore, this method can be easily deployed on a large heterogeneous sensor network. Sixteen instantiations of this method are compared based on their ability to identify measurement errors in a windspeed data stream from Corpus Christi, Texas. The results indicate that a multilayer perceptron model of the data stream, coupled with replacement of anomalous data points, performs well at identifying erroneous data in this data stream.},
language = {en},
number = {9},
urldate = {2019-09-19},
journal = {Environmental Modelling \& Software},
author = {Hill, David J. and Minsker, Barbara S.},
month = sep,
year = {2010},
pages = {1014--1022},
file = {Hill_Minsker_2010_Anomaly detection in streaming environmental sensor data.pdf:/home/roland/Zotero/storage/BG79P64V/Hill_Minsker_2010_Anomaly detection in streaming environmental sensor data.pdf:application/pdf}
}
@article{wu_hierarchical_2018,
title = {Hierarchical {Temporal} {Memory} method for time-series-based anomaly detection},
volume = {273},
issn = {09252312},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0925231217313887},
doi = {10.1016/j.neucom.2017.08.026},
abstract = {The time-series-based anomaly detection is a well-studied subject, and it is well-documented in the literature. Theories and techniques have been proposed and applied successfully for domain-specific applications. However, this subject has received renewed interest motivated by the increasing importance of continuously learning, tolerance to noise and generalization. This paper tackles these problems by applying Hierarchical Temporal Memory (HTM), a novel biological neural network. HTM is more suitable for dealing with the changing pattern of data since it is capable of incorporating contextual information from the past to make more accurate prediction. Both artificial and real datasets are tested with HTM for the time-series-based anomaly detection. The experiment results show that HTM can efficiently detect the anomalies in time series data.},
language = {en},
urldate = {2019-09-19},
journal = {Neurocomputing},
author = {Wu, Jia and Zeng, Weiru and Yan, Fei},
month = jan,
year = {2018},
pages = {535--546},
file = {Wu et al_2018_Hierarchical Temporal Memory method for time-series-based anomaly detection.pdf:/home/roland/Zotero/storage/I2EC22SU/Wu et al_2018_Hierarchical Temporal Memory method for time-series-based anomaly detection.pdf:application/pdf}
}
@article{miller_twitter_2014,
title = {Twitter spammer detection using data stream clustering},
volume = {260},
issn = {00200255},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0020025513008037},
doi = {10.1016/j.ins.2013.11.016},
abstract = {The rapid growth of Twitter has triggered a dramatic increase in spam volume and sophistication. The abuse of certain Twitter components such as “hashtags”, “mentions”, and shortened URLs enables spammers to operate efficiently. These same features, however, may be a key factor in identifying new spam accounts as shown in previous studies. Our study provides three novel contributions. Firstly, previous studies have approached spam detection as a classification problem, whereas we view it as an anomaly detection problem. Secondly, 95 one-gram features from tweet text were introduced alongside the user information analyzed in previous studies. Finally, to effectively handle the streaming nature of tweets, two stream clustering algorithms, StreamKM++ and DenStream, were modified to facilitate spam identification. Both algorithms clustered normal Twitter users, treating outliers as spammers. Each of these algorithms performed well individually, with StreamKM++ achieving 99\% recall and a 6.4\% false positive rate; and DenStream producing 99\% recall and a 2.8\% false positive rate. When used in conjunction, these algorithms reached 100\% recall and a 2.2\% false positive rate, meaning that our system was able to identify 100\% of the spammers in our test while incorrectly detecting only 2.2\% of normal users as spammers.},
language = {en},
urldate = {2019-09-19},
journal = {Information Sciences},
author = {Miller, Zachary and Dickinson, Brian and Deitrick, William and Hu, Wei and Wang, Alex Hai},
month = mar,
year = {2014},
keywords = {clustering},
pages = {64--73},
file = {Miller et al_2014_Twitter spammer detection using data stream clustering.pdf:/home/roland/Zotero/storage/UHU9X7M4/Miller et al_2014_Twitter spammer detection using data stream clustering.pdf:application/pdf}
}
@article{guha_robust_nodate,
title = {Robust {Random} {Cut} {Forest} {Based} {Anomaly} {Detection} {On} {Streams}},
abstract = {In this paper we focus on the anomaly detection problem for dynamic data streams through the lens of random cut forests. We investigate a robust random cut data structure that can be used as a sketch or synopsis of the input stream. We provide a plausible definition of non-parametric anomalies based on the influence of an unseen point on the remainder of the data, i.e., the externality imposed by that point. We show how the sketch can be efficiently updated in a dynamic data stream. We demonstrate the viability of the algorithm on publicly available real data.},
language = {en},
author = {Guha, Sudipto and Mishra, Nina and Roy, Gourav and Schrijvers, Okke},
keywords = {ensemble, random forest},
pages = {10},
file = {Guha et al_Robust Random Cut Forest Based Anomaly Detection On Streams.pdf:/home/roland/Zotero/storage/WX5CUPX7/Guha et al_Robust Random Cut Forest Based Anomaly Detection On Streams.pdf:application/pdf}
}
@article{hayes_contextual_2015,
title = {Contextual anomaly detection framework for big sensor data},
volume = {2},
issn = {2196-1115},
url = {http://www.journalofbigdata.com/content/2/1/2},
doi = {10.1186/s40537-014-0011-y},
abstract = {The ability to detect and process anomalies for Big Data in real-time is a difficult task. The volume and velocity of the data within many systems makes it difficult for typical algorithms to scale and retain their real-time characteristics. The pervasiveness of data combined with the problem that many existing algorithms only consider the content of the data source; e.g. a sensor reading itself without concern for its context, leaves room for potential improvement. The proposed work defines a contextual anomaly detection framework. It is composed of two distinct steps: content detection and context detection. The content detector is used to determine anomalies in real-time, while possibly, and likely, identifying false positives. The context detector is used to prune the output of the content detector, identifying those anomalies which are considered both content and contextually anomalous. The context detector utilizes the concept of profiles, which are groups of similarly grouped data points generated by a multivariate clustering algorithm. The research has been evaluated against two real-world sensor datasets provided by a local company in Brampton, Canada. Additionally, the framework has been evaluated against the open-source Dodgers dataset, available at the UCI machine learning repository, and against the R statistical toolbox.},
language = {en},
number = {1},
urldate = {2019-07-16},
journal = {Journal of Big Data},
author = {Hayes, Michael A and Capretz, Miriam AM},
month = dec,
year = {2015},
keywords = {clustering},
file = {Hayes_Capretz_2015_Contextual anomaly detection framework for big sensor data.pdf:/home/roland/Zotero/storage/SPQJN53F/Hayes_Capretz_2015_Contextual anomaly detection framework for big sensor data.pdf:application/pdf}
}
@article{akoglu_graph_2015,
title = {Graph based anomaly detection and description: a survey},
volume = {29},
issn = {1384-5810, 1573-756X},
shorttitle = {Graph based anomaly detection and description},
url = {http://link.springer.com/10.1007/s10618-014-0365-y},
doi = {10.1007/s10618-014-0365-y},
language = {en},
number = {3},
urldate = {2019-07-16},
journal = {Data Mining and Knowledge Discovery},
author = {Akoglu, Leman and Tong, Hanghang and Koutra, Danai},
month = may,
year = {2015},
keywords = {survey},
pages = {626--688},
file = {Akoglu et al_2015_Graph based anomaly detection and description.pdf:/home/roland/Zotero/storage/4FUYS6QY/Akoglu et al_2015_Graph based anomaly detection and description.pdf:application/pdf}
}
@inproceedings{rettig_online_2015,
address = {Santa Clara, CA, USA},
title = {Online {Anomaly} {Detection} over {Big} {Data} {Streams}},
isbn = {978-1-4799-9926-2},
url = {http://ieeexplore.ieee.org/document/7363865/},
doi = {10.1109/BigData.2015.7363865},
abstract = {Data quality is a challenging problem in many real world application domains. While a lot of attention has been given to detect anomalies for data at rest, detecting anomalies for streaming applications still largely remains an open problem. For applications involving several data streams, the challenge of detecting anomalies has become harder over time, as data can dynamically evolve in subtle ways following changes in the underlying infrastructure. In this paper, we describe and empirically evaluate an online anomaly detection pipeline that satisfies two key conditions: generality and scalability. Our technique works on numerical data as well as on categorical data and makes no assumption on the underlying data distributions. We implement two metrics, relative entropy and Pearson correlation, to dynamically detect anomalies. The two metrics we use provide an efficient and effective detection of anomalies over high velocity streams of events.},
language = {en},
urldate = {2019-07-16},
booktitle = {2015 {IEEE} {International} {Conference} on {Big} {Data} ({Big} {Data})},
publisher = {IEEE},
author = {Rettig, Laura and Khayati, Mourad and Cudre-Mauroux, Philippe and Piorkowski, Michal},
month = oct,
year = {2015},
pages = {1113--1122},
file = {Rettig et al_2015_Online anomaly detection over Big Data streams.pdf:/home/roland/Zotero/storage/8A6PHA4W/Rettig et al_2015_Online anomaly detection over Big Data streams.pdf:application/pdf}
}
@article{shipmon_time_2017,
title = {Time {Series} {Anomaly} {Detection}},
abstract = {Google uses continuous streams of data from industry partners in order to deliver accurate results to users. Unexpected drops in traffic can be an indication of an underlying issue and may be an early warning that remedial action may be necessary. Detecting such drops is non-trivial because streams are variable and noisy, with roughly regular spikes (in many different shapes) in traffic data. We investigated the question of whether or not we can predict anomalies in these data streams. Our goal is to utilize Machine Learning and statistical approaches to classify anomalous drops in periodic, but noisy, traffic patterns. Since we do not have a large body of labeled examples to directly apply supervised learning for anomaly classification, we approached the problem in two parts. First we used TensorFlow to train our various models including DNNs, RNNs, and LSTMs to perform regression and predict the expected value in the time series. Secondly we created anomaly detection rules that compared the actual values to predicted values. Since the problem requires finding sustained anomalies, rather than just short delays or momentary inactivity in the data, our two detection methods focused on continuous sections of activity rather than just single points. We tried multiple combinations of our models and rules and found that using the intersection of our two anomaly detection methods proved to be an effective method of detecting anomalies on almost all of our models. In the process we also found that not all data fell within our experimental assumptions, as one data stream had no periodicity, and therefore no time based model could predict it.},
language = {en},
journal = {arXiv:1708.03665},
author = {Shipmon, Dominique T and Gurevitch, Jason M and Piselli, Paolo M and Edwards, Steve},
year = {2017},
pages = {9},
file = {Shipmon et al_2017_Time series anomaly detection\; detection of anomalous drops with limited.pdf:/home/roland/Zotero/storage/AE5CGS2W/Shipmon et al_2017_Time series anomaly detection\; detection of anomalous drops with limited.pdf:application/pdf}
}
@article{ahmad_real-time_2016,
title = {Real-{Time} {Anomaly} {Detection} for {Streaming} {Analytics}},
url = {http://arxiv.org/abs/1607.02480},
abstract = {Much of the world’s data is streaming, time-series data, where anomalies give significant information in critical situations. Yet detecting anomalies in streaming data is a difficult task, requiring detectors to process data in real-time, and learn while simultaneously making predictions. We present a novel anomaly detection technique based on an on-line sequence memory algorithm called Hierarchical Temporal Memory (HTM). We show results from a live application that detects anomalies in financial metrics in realtime. We also test the algorithm on NAB, a published benchmark for real-time anomaly detection, where our algorithm achieves best-in-class results.},
language = {en},
urldate = {2019-07-16},
journal = {arXiv:1607.02480 [cs]},
author = {Ahmad, Subutai and Purdy, Scott},
month = jul,
year = {2016},
note = {arXiv: 1607.02480},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Distributed, Parallel, and Cluster Computing, Electrical Engineering and Systems Science - Systems and Control},
file = {Ahmad_Purdy_2016_Real-Time Anomaly Detection for Streaming Analytics.pdf:/home/roland/Zotero/storage/SPMVZ7SB/Ahmad_Purdy_2016_Real-Time Anomaly Detection for Streaming Analytics.pdf:application/pdf}
}
@article{ahmad_unsupervised_2017,
title = {Unsupervised real-time anomaly detection for streaming data},
volume = {262},
issn = {09252312},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0925231217309864},
doi = {10.1016/j.neucom.2017.04.070},
abstract = {We are seeing an enormous increase in the availability of streaming, time-series data. Largely driven by the rise of connected real-time data sources, this data presents technical challenges and opportunities. One fundamental capability for streaming analytics is to model each stream in an unsupervised fashion and detect unusual, anomalous behaviors in real-time. Early anomaly detection is valuable, yet it can be difficult to execute reliably in practice. Application constraints require systems to process data in realtime, not batches. Streaming data inherently exhibits concept drift, favoring algorithms that learn continuously. Furthermore, the massive number of independent streams in practice requires that anomaly detectors be fully automated. In this paper we propose a novel anomaly detection algorithm that meets these constraints. The technique is based on an online sequence memory algorithm called Hierarchical Temporal Memory (HTM). We also present results using the Numenta Anomaly Benchmark (NAB), a benchmark containing real-world data streams with labeled anomalies. The benchmark, the first of its kind, provides a controlled open-source environment for testing anomaly detection algorithms on streaming data. We present results and analysis for a wide range of algorithms on this benchmark, and discuss future challenges for the emerging field of streaming analytics.},
language = {en},
urldate = {2019-07-16},
journal = {Neurocomputing},
author = {Ahmad, Subutai and Lavin, Alexander and Purdy, Scott and Agha, Zuha},
month = nov,
year = {2017},
keywords = {benchmark, htm},
pages = {134--147},
file = {Ahmad et al_2017_Unsupervised real-time anomaly detection for streaming data.pdf:/home/roland/Zotero/storage/XIZQUAB5/Ahmad et al_2017_Unsupervised real-time anomaly detection for streaming data.pdf:application/pdf}
}
@inproceedings{bifet_new_2009,
address = {Paris, France},
title = {New {Ensemble} {Methods} {For} {Evolving} {Data} {Streams}},
isbn = {978-1-60558-495-9},
url = {http://portal.acm.org/citation.cfm?doid=1557019.1557041},
doi = {10.1145/1557019.1557041},
abstract = {Advanced analysis of data streams is quickly becoming a key area of data mining research as the number of applications demanding such processing increases. Online mining when such data streams evolve over time, that is when concepts drift or change completely, is becoming one of the core issues. When tackling non-stationary concepts, ensembles of classifiers have several advantages over single classifier methods: they are easy to scale and parallelize, they can adapt to change quickly by pruning under-performing parts of the ensemble, and they therefore usually also generate more accurate concept descriptions. This paper proposes a new experimental data stream framework for studying concept drift, and two new variants of Bagging: ADWIN Bagging and Adaptive-Size Hoeffding Tree (ASHT) Bagging. Using the new experimental framework, an evaluation study on synthetic and real-world datasets comprising up to ten million examples shows that the new ensemble methods perform very well compared to several known methods.},
language = {en},
urldate = {2019-07-16},
booktitle = {Proceedings of the 15th {ACM} {SIGKDD} international conference on {Knowledge} discovery and data mining - {KDD} '09},
publisher = {ACM Press},
author = {Bifet, Albert and Holmes, Geoff and Pfahringer, Bernhard and Kirkby, Richard and Gavaldà, Ricard},
year = {2009},
pages = {139},
file = {Bifet et al_2009_New ensemble methods for evolving data streams.pdf:/home/roland/Zotero/storage/9H2DFLNW/Bifet et al_2009_New ensemble methods for evolving data streams.pdf:application/pdf}
}
@inproceedings{domingos_mining_2000,
address = {Boston, Massachusetts, United States},
title = {Mining high-speed data streams},
isbn = {978-1-58113-233-5},
url = {http://portal.acm.org/citation.cfm?doid=347090.347107},
doi = {10.1145/347090.347107},
language = {en},
urldate = {2019-07-16},
booktitle = {Proceedings of the sixth {ACM} {SIGKDD} international conference on {Knowledge} discovery and data mining - {KDD} '00},
publisher = {ACM Press},
author = {Domingos, Pedro and Hulten, Geoff},
year = {2000},
pages = {71--80},
file = {Domingos_Hulten_2000_Mining high-speed data streams.pdf:/home/roland/Zotero/storage/GKU5MXHW/Domingos_Hulten_2000_Mining high-speed data streams.pdf:application/pdf}
}
@article{silva_data_2013,
title = {Data stream clustering: {A} survey},
volume = {46},
issn = {03600300},
shorttitle = {Data stream clustering},
url = {http://dl.acm.org/citation.cfm?doid=2522968.2522981},
doi = {10.1145/2522968.2522981},
language = {en},
number = {1},
urldate = {2019-07-16},
journal = {ACM Computing Surveys},
author = {Silva, Jonathan A. and Faria, Elaine R. and Barros, Rodrigo C. and Hruschka, Eduardo R. and Carvalho, André C. P. L. F. de and Gama, João},
month = oct,
year = {2013},
keywords = {survey},
pages = {1--31},
file = {Silva et al_2013_Data stream clustering.pdf:/home/roland/Zotero/storage/3ZDJ9MJQ/Silva et al_2013_Data stream clustering.pdf:application/pdf}
}
@article{akidau_evolution_nodate,
title = {The {Evolution} of {Massive}-{Scale} {Data} {Processing}},
language = {en},
author = {Akidau, Tyler},
pages = {112},
file = {Akidau_The Evolution of Massive-Scale Data Processing.pdf:/home/roland/Zotero/storage/7DSWZPYT/Akidau_The Evolution of Massive-Scale Data Processing.pdf:application/pdf}
}
@techreport{kitchenham_guidelines_2007,
title = {Guidelines for performing {Systematic} {Literature} {Reviews} in {Software} {Engineering}},
urldate = {2019-09-25},
author = {Kitchenham, Barbara and Charters, Stuart},
month = jul,
year = {2007},
file = {Kitchenham_Charters_2007_Guidelines for performing Systematic Literature Reviews in Software Engineering.pdf:/home/roland/Zotero/storage/F7Z4KCGS/Kitchenham_Charters_2007_Guidelines for performing Systematic Literature Reviews in Software Engineering.pdf:application/pdf}
}
@article{cugola_processing_2012,
title = {Processing flows of information: {From} data stream to complex event processing},
volume = {44},
issn = {03600300},
shorttitle = {Processing flows of information},
url = {http://dl.acm.org/citation.cfm?doid=2187671.2187677},
doi = {10.1145/2187671.2187677},
language = {en},
number = {3},
urldate = {2019-09-24},
journal = {ACM Computing Surveys},
author = {Cugola, Gianpaolo and Margara, Alessandro},
month = jun,
year = {2012},
pages = {1--62},
file = {Cugola_Margara_2012_Processing flows of information.pdf:/home/roland/Zotero/storage/G4P5D2VS/Cugola_Margara_2012_Processing flows of information.pdf:application/pdf}
}
@article{toliopoulos_continuous_2019,
title = {Continuous {Outlier} {Mining} of {Streaming} {Data} in {Flink}},
url = {http://arxiv.org/abs/1902.07901},
abstract = {In this work, we focus on distance-based outliers in a metric space, where the status of an entity as to whether it is an outlier is based on the number of other entities in its neighborhood. In recent years, several solutions have tackled the problem of distance-based outliers in data streams, where outliers must be mined continuously as new elements become available. An interesting research problem is to combine the streaming environment with massively parallel systems to provide scalable stream-based algorithms. However, none of the previously proposed techniques refer to a massively parallel setting. Our proposal fills this gap and investigates the challenges in transferring state-of-the-art techniques to Apache Flink, a modern platform for intensive streaming analytics. We thoroughly present the technical challenges encountered and the alternatives that may be applied. We show speed-ups of up to 117 (resp. 2076) times over a naive parallel (resp. non-parallel) solution in Flink, by using just an ordinary four-core machine and a real-world dataset. When moving to a three-machine cluster, due to less contention, we manage to achieve both better scalability in terms of the window slide size and the data dimensionality, and even higher speed-ups, e.g., by a factor of 510. Overall, our results demonstrate that outlier mining can be achieved in an efficient and scalable manner. The resulting techniques have been made publicly available as open-source software.},
language = {en},
urldate = {2019-09-24},
journal = {arXiv:1902.07901 [cs]},
author = {Toliopoulos, Theodoros and Gounaris, Anastasios and Tsichlas, Kostas and Papadopoulos, Apostolos and Sampaio, Sandra},
month = feb,
year = {2019},
note = {arXiv: 1902.07901},
keywords = {Computer Science - Distributed, Parallel, and Cluster Computing, Computer Science - Machine Learning, Computer Science - Databases},
file = {Toliopoulos et al_2019_Continuous Outlier Mining of Streaming Data in Flink.pdf:/home/roland/Zotero/storage/WP3RDJSC/Toliopoulos et al_2019_Continuous Outlier Mining of Streaming Data in Flink.pdf:application/pdf}
}
@article{kitchenham_systematic_2009,
title = {Systematic literature reviews in software engineering – {A} systematic literature review},
volume = {51},
issn = {09505849},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0950584908001390},
doi = {10.1016/j.infsof.2008.09.009},
abstract = {Background: In 2004 the concept of evidence-based software engineering (EBSE) was introduced at the ICSE04 conference. Aims: This study assesses the impact of systematic literature reviews (SLRs) which are the recommended EBSE method for aggregating evidence.
Method: We used the standard systematic literature review method employing a manual search of 10 journals and 4 conference proceedings.
Results: Of 20 relevant studies, eight addressed research trends rather than technique evaluation. Seven SLRs addressed cost estimation. The quality of SLRs was fair with only three scoring less than 2 out of 4.
Conclusions: Currently, the topic areas covered by SLRs are limited. European researchers, particularly those at the Simula Laboratory appear to be the leading exponents of systematic literature reviews. The series of cost estimation SLRs demonstrate the potential value of EBSE for synthesising evidence and making it available to practitioners.},
language = {en},
number = {1},
urldate = {2019-09-24},
journal = {Information and Software Technology},
author = {Kitchenham, Barbara and Pearl Brereton, O. and Budgen, David and Turner, Mark and Bailey, John and Linkman, Stephen},
month = jan,
year = {2009},
pages = {7--15},
file = {Kitchenham et al_2009_Systematic literature reviews in software engineering – A systematic literature.pdf:/home/roland/Zotero/storage/TNE9EAPR/Kitchenham et al_2009_Systematic literature reviews in software engineering – A systematic literature.pdf:application/pdf}
}
@article{xie_anomaly_2011,
title = {Anomaly detection in wireless sensor networks: {A} survey},
volume = {34},
issn = {10848045},
shorttitle = {Anomaly detection in wireless sensor networks},
url = {https://linkinghub.elsevier.com/retrieve/pii/S1084804511000580},
doi = {10.1016/j.jnca.2011.03.004},
abstract = {Since security threats to WSNs are increasingly being diversified and deliberate, prevention-based techniques alone can no longer provide WSNs with adequate security. However, detection-based techniques might be effective in collaboration with prevention-based techniques for securing WSNs. As a significant branch of detection-based techniques, the research of anomaly detection in wired networks and wireless ad hoc networks is already quite mature, but such solutions can be rarely applied to WSNs without any change, because WSNs are characterized by constrained resources, such as limited energy, weak computation capability, poor memory, short communication range, etc. The development of anomaly detection techniques suitable for WSNs is therefore regarded as an essential research area, which will enable WSNs to be much more secure and reliable. In this survey paper, a few of the key design principles relating to the development of anomaly detection techniques in WSNs are discussed in particular. Then, the state-of-the-art techniques of anomaly detection in WSNs are systematically introduced, according to WSNs’ architectures (Hierarchical/Flat) and detection technique categories (statistical techniques, rule based, data mining, computational intelligence, game theory, graph based, and hybrid, etc.). The analyses and comparisons of the approaches that belong to a similar technique category are represented technically, followed by a brief discussion towards the potential research areas in the near future and conclusion.},
language = {en},
number = {4},
urldate = {2019-09-24},
journal = {Journal of Network and Computer Applications},
author = {Xie, Miao and Han, Song and Tian, Biming and Parvin, Sazia},
month = jul,
year = {2011},
keywords = {survey},
pages = {1302--1325},
file = {Xie et al_2011_Anomaly detection in wireless sensor networks.pdf:/home/roland/Zotero/storage/ABVAWH9J/Xie et al_2011_Anomaly detection in wireless sensor networks.pdf:application/pdf}
}
@article{kanarachos_detecting_2017,
title = {Detecting anomalies in time series data via a deep learning algorithm combining wavelets, neural networks and {Hilbert} transform},
volume = {85},
issn = {09574174},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0957417417302737},
doi = {10.1016/j.eswa.2017.04.028},
abstract = {The quest for more efficient real-time detection of anomalies in time series data is critically important in numerous applications and systems ranging from intelligent transportation, structural health monitoring, heart disease, and earthquake prediction. Although the range of application is wide, anomaly detection algorithms are usually domain specific and build on experts’ knowledge. Here a new signal processing algorithm – inspired by the deep learning paradigm – is presented that combines wavelets, neural networks, and Hilbert transform. The algorithm performs robustly and is transferable. The proposed neural network structure facilitates learning short and long-term pattern interdependencies; a task usually hard to accomplish using standard neural network training algorithms. The paper provides guidelines for selecting the neural network’s buffer size, training algorithm, and anomaly detection features. The algorithm learns the system’s normal behavior and does not require the existence of anomalous data for assessing its statistical significance. This is an essential attribute in applications that require customization. Anomalies are detected by analysing hierarchically the instantaneous frequency and amplitude of the residual signal using probabilistic Receiver Operating Characteristics. The method is shown to be able to automatically detect anomalies in the Seismic Electrical Signal that could be used to predict earthquake activity. Furthermore, the method can be used in combination with crowdsourcing of smartphone data to locate road defects such as potholes and bumps for intervention and repair.},
language = {en},
urldate = {2019-09-24},
journal = {Expert Systems with Applications},
author = {Kanarachos, Stratis and Christopoulos, Stavros-Richard G. and Chroneos, Alexander and Fitzpatrick, Michael E.},
month = nov,
year = {2017},
pages = {292--304},
file = {Kanarachos et al_2017_Detecting anomalies in time series data via a deep learning algorithm combining.pdf:/home/roland/Zotero/storage/JUDJ28NQ/Kanarachos et al_2017_Detecting anomalies in time series data via a deep learning algorithm combining.pdf:application/pdf}
}
@article{serdio_fault_2014,
title = {Fault detection in multi-sensor networks based on multivariate time-series models and orthogonal transformations},
volume = {20},
issn = {15662535},
url = {https://linkinghub.elsevier.com/retrieve/pii/S1566253514000451},
doi = {10.1016/j.inffus.2014.03.006},
abstract = {We introduce the usage of multivariate orthogonal space transformations and vectorized time-series models in combination with data-driven system identification models to achieve an enhanced performance of residual-based fault detection in condition monitoring systems equipped with multi-sensor networks. Neither time-consuming annotated samples nor fault patterns/models need to be available, as our approach is solely based on on-line recorded data streams. The system identification step acts as a fusion operation by searching for relations and dependencies between sensor channels measuring the state of system variables. We therefore apply three different vectorized time-series variants: (i) non-linear finite impulse response models (NFIR) relying only on the lagged input variables, (ii) non-linear output error models (NOE), also including the lags of the own predictions and (iii) non-linear Box–Jenkins models (NBJ) which include the lags of the predictions errors as well. The use of multivariate orthogonal space transformations allows to produce more compact and accurate models due to an integrated dimensionality (noise) reduction step. Fault detection is conducted based on finding anomalies (untypical occurrences) in the temporal residual signal in incremental manner. Our experimental results achieved on four real-world condition monitoring scenarios employing multi-sensor network systems demonstrate that the Receiver Operating Characteristic (ROC) curves are improved over those ones achieved with native static models (w/o lags, w/o transformations) by about 20–30\%.},
language = {en},
urldate = {2019-09-24},
journal = {Information Fusion},
author = {Serdio, Francisco and Lughofer, Edwin and Pichler, Kurt and Buchegger, Thomas and Pichler, Markus and Efendic, Hajrudin},
month = nov,
year = {2014},
pages = {272--291},
file = {Serdio et al_2014_Fault detection in multi-sensor networks based on multivariate time-series.pdf:/home/roland/Zotero/storage/EVH4IYAX/Serdio et al_2014_Fault detection in multi-sensor networks based on multivariate time-series.pdf:application/pdf}
}
@article{bosman_spatial_2017,
title = {Spatial anomaly detection in sensor networks using neighborhood information},
volume = {33},
issn = {15662535},
url = {https://linkinghub.elsevier.com/retrieve/pii/S1566253516300252},
doi = {10.1016/j.inffus.2016.04.007},
abstract = {The field of wireless sensor networks (WSNs), embedded systems with sensing and networking capability, has now matured after a decade-long research effort and technological advances in electronics and networked systems. An important remaining challenge now is to extract meaningful information from the ever-increasing amount of sensor data collected by WSNs. In particular, there is strong interest in algorithms capable of automatic detection of patterns, events or other out-of-the order, anomalous system behavior. Data anomalies may indicate states of the system that require further analysis or prompt actions. Traditionally, anomaly detection techniques are executed in a central processing facility, which requires the collection of all measurement data at a central location, an obvious limitation for WSNs due to the high data communication costs involved. In this paper we explore the extent by which one may depart from this classical centralized paradigm, looking at decentralized anomaly detection based on unsupervised machine learning. Our aim is to detect anomalies at the sensor nodes, as opposed to centrally, to reduce energy and spectrum consumption. We study the information gain coming from aggregate neighborhood data, in comparison to performing simple, in-node anomaly detection. We evaluate the effects of neighborhood size and spatio-temporal correlation on the performance of our new neighborhood-based approach using a range of real-world network deployments and datasets. We find the conditions that make neighborhood data fusion advantageous, identifying also the cases in which this approach does not lead to detectable improvements. Improvements are linked to the diffusive properties of data (spatio-temporal correlations) but also to the type of sensors, anomalies and network topological features. Overall, when a dataset stems from a similar mixture of diffusive processes precision tends to benefit, particularly in terms of recall. Our work paves the way towards understanding how distributed data fusion methods may help managing the complexity of wireless sensor networks, for instance in massive Internet of Things scenarios.},
language = {en},
urldate = {2019-09-24},
journal = {Information Fusion},
author = {Bosman, Hedde HWJ and Iacca, Giovanni and Tejada, Arturo and Wörtche, Heinrich J. and Liotta, Antonio},
month = jan,
year = {2017},
pages = {41--56},
file = {Bosman et al_2017_Spatial anomaly detection in sensor networks using neighborhood information.pdf:/home/roland/Zotero/storage/WFC4ZH4Y/Bosman et al_2017_Spatial anomaly detection in sensor networks using neighborhood information.pdf:application/pdf}
}
@article{agrawal_survey_2015,
title = {Survey on {Anomaly} {Detection} using {Data} {Mining} {Techniques}},
volume = {60},
issn = {18770509},
url = {https://linkinghub.elsevier.com/retrieve/pii/S1877050915023479},
doi = {10.1016/j.procs.2015.08.220},
abstract = {In the present world huge amounts of data are stored and transferred from one location to another. The data when transferred or stored is primed exposed to attack. Although various techniques or applications are available to protect data, loopholes exist. Thus to analyze data and to determine various kind of attack data mining techniques have emerged to make it less vulnerable. Anomaly detection uses these data mining techniques to detect the surprising behaviour hidden within data increasing the chances of being intruded or attacked. Various hybrid approaches have also been made in order to detect known and unknown attacks more accurately. This paper reviews various data mining techniques for anomaly detection to provide better understanding among the existing techniques that may help interested researchers to work future in this direction.},
language = {en},
urldate = {2019-09-24},
journal = {Procedia Computer Science},
author = {Agrawal, Shikha and Agrawal, Jitendra},
year = {2015},
pages = {708--713},
file = {Agrawal_Agrawal_2015_Survey on Anomaly Detection using Data Mining Techniques.pdf:/home/roland/Zotero/storage/TK69WDSW/Agrawal_Agrawal_2015_Survey on Anomaly Detection using Data Mining Techniques.pdf:application/pdf}
}
@article{lane_application_nodate,
title = {An {Application} of {Machine} {Learning} to {Anomaly} {Detection}},
abstract = {The anomaly detection problem has been widely studied in the computer security literature. In this paper we present a machine learning approach to anomaly detection. Our system builds user profiles based on command sequences and compares current input sequences to the profile using a similarity measure. The system must learn to classify current behavior as consistent or anomalous with past behavior using only positive examples of the account's valid user. Our empirical results demonstrate that this is a promising approach to distinguishing the legitimate user from an intruder.},
language = {en},
author = {Lane, Terran and Brodley, Carla E},
pages = {13}
}
@inproceedings{singh_demystifying_2017,
address = {Anchorage, AK, USA},
title = {Demystifying {Numenta} anomaly benchmark},
isbn = {978-1-5090-6182-2},
url = {http://ieeexplore.ieee.org/document/7966038/},
doi = {10.1109/IJCNN.2017.7966038},
abstract = {Detecting anomalies in large-scale, streaming datasets has wide applicability in a myriad of domains like network intrusion detection for cyber-security, fraud detection for credit cards, system health monitoring, and fault detection in safety critical systems. Due to its wide applicability, the problem of anomaly detection has been well-studied by industry and academia alike, and many algorithms have been proposed for detecting anomalies in different problem settings. But until recently, there was no openly available, systematic dataset and/or framework using which the proposed anomaly detection algorithms could be compared and evaluated on a common ground. Numenta Anomaly Benchmark (NAB), made available by Numenta in 2015, addressed this gap by providing a set of openly-available, labeled data files and a common scoring system, using which different anomaly detection algorithms could be fairly evaluated and compared. In this paper, we provide an in-depth analysis of the key aspects of the NAB framework, and highlight inherent challenges therein, with the objective to provide insights about the gaps in the current framework that must be addressed so as to make it more robust and easy-to-use. Furthermore, we also provide additional evaluation of five state-of-the-art anomaly detection algorithms (including the ones proposed by Numenta) using the NAB datasets, and based on the evaluation results, we argue that the performance of these algorithms is not sufficient for practical, industry-scale applications, and must be improved upon so as to make them suitable for large-scale anomaly detection problems.},
language = {en},
urldate = {2019-11-22},
booktitle = {2017 {International} {Joint} {Conference} on {Neural} {Networks} ({IJCNN})},
publisher = {IEEE},
author = {Singh, Nidhi and Olinsky, Craig},
month = may,
year = {2017},
pages = {1570--1577},
file = {Singh_Olinsky_2017_Demystifying Numenta anomaly benchmark.pdf:/home/roland/Zotero/storage/HEPPWLPX/Singh_Olinsky_2017_Demystifying Numenta anomaly benchmark.pdf:application/pdf}
}
@article{pimentel_review_2014,
title = {A review of novelty detection},
volume = {99},
issn = {01651684},
url = {https://linkinghub.elsevier.com/retrieve/pii/S016516841300515X},
doi = {10.1016/j.sigpro.2013.12.026},
abstract = {Novelty detection is the task of classifying test data that differ in some respect from the data that are available during training. This may be seen as “one-class classification”, in which a model is constructed to describe “normal” training data. The novelty detection approach is typically used when the quantity of available “abnormal” data is insufficient to construct explicit models for non-normal classes. Application includes inference in datasets from critical systems, where the quantity of available normal data is very large, such that “normality” may be accurately modelled. In this review we aim to provide an updated and structured investigation of novelty detection research papers that have appeared in the machine learning literature during the last decade.},
language = {en},
urldate = {2019-11-20},
journal = {Signal Processing},
author = {Pimentel, Marco A.F. and Clifton, David A. and Clifton, Lei and Tarassenko, Lionel},
month = jun,
year = {2014},
pages = {215--249},
file = {Pimentel et al_2014_A review of novelty detection.pdf:/home/roland/Zotero/storage/5LGAFUKF/Pimentel et al_2014_A review of novelty detection.pdf:application/pdf}
}
@article{sarabadani_building_2017,
title = {Building automated vandalism detection tools for {Wikidata}},
url = {http://arxiv.org/abs/1703.03861},
doi = {10.1145/3041021.3053366},
abstract = {Wikidata, like Wikipedia, is a knowledge base that anyone can edit. This open collaboration model is powerful in that it reduces barriers to participation and allows a large number of people to contribute. However, it exposes the knowledge base to the risk of vandalism and low-quality contributions. In this work, we build on past work detecting vandalism in Wikipedia to detect vandalism in Wikidata. This work is novel in that identifying damaging changes in a structured knowledge-base requires substantially different feature engineering work than in a text-based wiki like Wikipedia. We also discuss the utility of these classifiers for reducing the overall workload of vandalism patrollers in Wikidata. We describe a machine classification strategy that is able to catch 89\% of vandalism while reducing patrollers’ workload by 98\%, by drawing lightly from contextual features of an edit and heavily from the characteristics of the user making the edit.},
language = {en},
urldate = {2019-11-05},
journal = {Proceedings of the 26th International Conference on World Wide Web Companion - WWW '17 Companion},
author = {Sarabadani, Amir and Halfaker, Aaron and Taraborelli, Dario},
year = {2017},
note = {arXiv: 1703.03861},
keywords = {Computer Science - Information Retrieval, Computer Science - Computers and Society},
pages = {1647--1654},
file = {Sarabadani et al_2017_Building automated vandalism detection tools for Wikidata.pdf:/home/roland/Zotero/storage/HDIUAPFX/Sarabadani et al_2017_Building automated vandalism detection tools for Wikidata.pdf:application/pdf}
}
@article{yoon_nets:_2019,
title = {{NETS}: {Extremely} {Fast} {Outlier} {Detection} from a {Data} {Stream} via {Set}-{Based} {Processing}},
volume = {12},
issn = {21508097},
shorttitle = {{NETS}},
url = {http://dl.acm.org/citation.cfm?doid=3342263.3360345},
doi = {10.14778/3342263.3342269},
abstract = {This paper addresses the problem of efficiently detecting outliers from a data stream as old data points expire from and new data points enter the window incrementally. The proposed method is based on a newly discovered characteristic of a data stream that the change in the locations of data points in the data space is typically very insignificant. This observation has led to the finding that the existing distance-based outlier detection algorithms perform excessive unnecessary computations that are repetitive and/or canceling out the effects. Thus, in this paper, we propose a novel set-based approach to detecting outliers, whereby data points at similar locations are grouped and the detection of outliers or inliers is handled at the group level. Specifically, a new algorithm NETS is proposed to achieve a remarkable performance improvement by realizing set-based early identification of outliers or inliers and taking advantage of the “net effect” between expired and new data points. Additionally, NETS is capable of achieving the same efficiency even for a high-dimensional data stream through two-level dimensional filtering. Comprehensive experiments using six real-world data streams show 5 to 25 times faster processing time than state-of-the-art algorithms with comparable memory consumption. We assert that NETS opens a new possibility to real-time data stream outlier detection.},
language = {en},
number = {11},
urldate = {2019-11-05},
journal = {Proceedings of the VLDB Endowment},
author = {Yoon, Susik and Lee, Jae-Gil and Lee, Byung Suk},
month = jul,
year = {2019},
pages = {1303--1315},
file = {Yoon et al_2019_NETS.pdf:/home/roland/Zotero/storage/M5IHWT7U/Yoon et al_2019_NETS.pdf:application/pdf}
}
@inproceedings{oza_experimental_2001,
address = {San Francisco, California},
title = {Experimental comparisons of online and batch versions of bagging and boosting},
isbn = {978-1-58113-391-2},
url = {http://portal.acm.org/citation.cfm?doid=502512.502565},
doi = {10.1145/502512.502565},
language = {en},
urldate = {2019-11-05},
booktitle = {Proceedings of the seventh {ACM} {SIGKDD} international conference on {Knowledge} discovery and data mining - {KDD} '01},
publisher = {ACM Press},
author = {Oza, Nikunj C. and Russell, Stuart},
year = {2001},
pages = {359--364},
file = {Oza_Russell_2001_Experimental comparisons of online and batch versions of bagging and boosting.pdf:/home/roland/Zotero/storage/27CS952P/Oza_Russell_2001_Experimental comparisons of online and batch versions of bagging and boosting.pdf:application/pdf}
}
@inproceedings{heindorf_vandalism_2016,
address = {Indianapolis, Indiana, USA},
title = {Vandalism {Detection} in {Wikidata}},
isbn = {978-1-4503-4073-1},
url = {http://dl.acm.org/citation.cfm?doid=2983323.2983740},
doi = {10.1145/2983323.2983740},
abstract = {Wikidata is the new, large-scale knowledge base of the Wikimedia Foundation. Its knowledge is increasingly used within Wikipedia itself and various other kinds of information systems, imposing high demands on its integrity. Wikidata can be edited by anyone and, unfortunately, it frequently gets vandalized, exposing all information systems using it to the risk of spreading vandalized and falsified information. In this paper, we present a new machine learning-based approach to detect vandalism in Wikidata. We propose a set of 47 features that exploit both content and context information, and we report on 4 classifiers of increasing effectiveness tailored to this learning task. Our approach is evaluated on the recently published Wikidata Vandalism Corpus WDVC-2015 and it achieves an area under curve value of the receiver operating characteristic, ROCAUC, of 0.991. It significantly outperforms the state of the art represented by the rule-based Wikidata Abuse Filter (0.865 ROCAUC) and a prototypical vandalism detector recently introduced by Wikimedia within the Objective Revision Evaluation Service (0.859 ROCAUC).},
language = {en},
urldate = {2019-11-05},
booktitle = {Proceedings of the 25th {ACM} {International} on {Conference} on {Information} and {Knowledge} {Management} - {CIKM} '16},
publisher = {ACM Press},
author = {Heindorf, Stefan and Potthast, Martin and Stein, Benno and Engels, Gregor},
year = {2016},
pages = {327--336},
file = {Heindorf et al_2016_Vandalism Detection in Wikidata.pdf:/home/roland/Zotero/storage/7MNDUCP6/Heindorf et al_2016_Vandalism Detection in Wikidata.pdf:application/pdf}
}
@inproceedings{costa_online_2015,
address = {Killarney, Ireland},
title = {Online fault detection based on {Typicality} and {Eccentricity} {Data} {Analytics}},
isbn = {978-1-4799-1960-4},
url = {http://ieeexplore.ieee.org/document/7280712/},
doi = {10.1109/IJCNN.2015.7280712},
abstract = {Fault detection is a task of major importance in industry nowadays, since that it can considerably reduce the risk of accidents involving human lives, in addition to production and, consequently, financial losses. Therefore, fault detection systems have been largely studied in the past few years, resulting in many different methods and approaches to solve such problem. This paper presents a detailed study on fault detection on industrial processes based on the recently introduced eccentricity and typicality data analytics (TEDA) approach. TEDA is a recursive and non-parametric method, firstly proposed to the general problem of anomaly detection on data streams. It is based on the measures of data density and proximity from each read data point to the analyzed data set. TEDA is an online autonomous learning algorithm that does not require a priori knowledge about the process, is completely free of user- and problem-defined parameters, requires very low computational effort and, thus, is very suitable for real-time applications. The results further presented were generated by the application of TEDA to the very well-known real data benchmark DAMADICS.},
language = {en},
urldate = {2019-12-20},
booktitle = {2015 {International} {Joint} {Conference} on {Neural} {Networks} ({IJCNN})},
publisher = {IEEE},
author = {Costa, Bruno Sielly Jales and Bezerra, Clauber Gomes and Guedes, Luiz Affonso and Angelov, Plamen Parvanov},
month = jul,
year = {2015},
pages = {1--6},
file = {Costa et al_2015_Online fault detection based on Typicality and Eccentricity Data Analytics.pdf:/home/roland/Zotero/storage/GTFD5T8N/Costa et al_2015_Online fault detection based on Typicality and Eccentricity Data Analytics.pdf:application/pdf}
}
@inproceedings{muter_entropy-based_2011,
address = {Baden-Baden, Germany},
title = {Entropy-{Based} {Anomaly} {Detection} for {In}-{Vehicle} {Networks}},
isbn = {978-1-4577-0890-9},
url = {http://ieeexplore.ieee.org/document/5940552/},
doi = {10.1109/IVS.2011.5940552},
abstract = {Due to an increased connectivity and seamless integration of information technology into modern vehicles, a trend of research in the automotive domain is the development of holistic IT security concepts. Within the scope of this development, vehicular attack detection is one concept which gains an increased attention, because of its reactive nature that allows to respond to threats during runtime. In this paper we explore the applicability of entropy-based attack detection for in-vehicle networks. We illustrate the crucial aspects for an adaptation of such an approach to the automotive domain. Moreover, we show first exemplary results by applying the approach to measurements derived from a standard vehicle’s CAN-Body network.},
language = {en},
urldate = {2019-12-20},
booktitle = {2011 {IEEE} {Intelligent} {Vehicles} {Symposium} ({IV})},
publisher = {IEEE},
author = {Muter, Michael and Asaj, Naim},
month = jun,
year = {2011},
pages = {1110--1115},
file = {Muter_Asaj_2011_Entropy-based anomaly detection for in-vehicle networks.pdf:/home/roland/Zotero/storage/M9YCYD4Y/Muter_Asaj_2011_Entropy-based anomaly detection for in-vehicle networks.pdf:application/pdf}
}
@article{koupaie_outlier_2013,
title = {Outlier {Detection} in {Stream} {Data} by {Machine} {Learning} and {Feature} {Selection} {Methods}},
volume = {2},
abstract = {In recent years, intrusion detection has emerged as an important technique for network security. Machine learning techniques have been applied to the field of intrusion detection. They can learn normal and anomalous patterns from training data and via Feature selection improving classification by searching for the subset of features which best classifies the training data to detect attacks on computer system. The quality of features directly affects the performance of classification. Many feature selection methods introduced to remove redundant and irrelevant features, because raw features may reduce accuracy or robustness of classification. Outlier detection in stream data is an important and active research issue in anomaly detection. Most of the existing outlier detection algorithms has less accurate because use some clustering method. Some data are so essential and secretary. Therefore, it needs to mine carefully even if spend cost. This paper presents a framework to detect outlier in stream data by machine learning method. Moreover, it is considered if data was high dimensional. This method is more accurate from other preferred models, because machine learning method is more accurate of other methods.},
language = {en},
number = {3},
journal = {International Journal of Advanced Computer Science and Information Technology},
author = {Koupaie, Hossein Moradi and Ibrahim, Suhaimi and Hosseinkhani, Javad},
year = {2013},
pages = {8},
file = {Koupaie et al_2013_Outlier Detection in Stream Data by Machine Learning and Feature Selection.pdf:/home/roland/Zotero/storage/WPJBV896/Koupaie et al_2013_Outlier Detection in Stream Data by Machine Learning and Feature Selection.pdf:application/pdf}
}
@inproceedings{cao_density-based_2006,
title = {Density-{Based} {Clustering} over an {Evolving} {Data} {Stream} with {Noise}},
isbn = {978-0-89871-611-5 978-1-61197-276-4},
url = {https://epubs.siam.org/doi/10.1137/1.9781611972764.29},
doi = {10.1137/1.9781611972764.29},
abstract = {Clustering is an important task in mining evolving data streams. Beside the limited memory and one-pass constraints, the nature of evolving data streams implies the following requirements for stream clustering: no assumption on the number of clusters, discovery of clusters with arbitrary shape and ability to handle outliers. While a lot of clustering algorithms for data streams have been proposed, they offer no solution to the combination of these requirements. In this paper, we present DenStream, a new approach for discovering clusters in an evolving data stream. The “dense” micro-cluster (named core-micro-cluster) is introduced to summarize the clusters with arbitrary shape, while the potential core-micro-cluster and outlier micro-cluster structures are proposed to maintain and distinguish the potential clusters and outliers. A novel pruning strategy is designed based on these concepts, which guarantees the precision of the weights of the micro-clusters with limited memory. Our performance study over a number of real and synthetic data sets demonstrates the effectiveness and efficiency of our method.},
language = {en},
urldate = {2019-12-20},
booktitle = {Proceedings of the 2006 {SIAM} {International} {Conference} on {Data} {Mining}},
publisher = {Society for Industrial and Applied Mathematics},
author = {Cao, Feng and Estert, Martin and Qian, Weining and Zhou, Aoying},
month = apr,
year = {2006},
pages = {328--339},
file = {Cao et al_2006_Density-Based Clustering over an Evolving Data Stream with Noise.pdf:/home/roland/Zotero/storage/GULBSUGA/Cao et al_2006_Density-Based Clustering over an Evolving Data Stream with Noise.pdf:application/pdf}
}
@misc{noauthor_anomaly_nodate,
title = {Anomaly {Detection} over {Noisy} {Data} using {Learned} {Probability} {Distributions}},
author = {Eskin, Eleazar},
year = {2000},
file = {Anomaly Detection over Noisy Data using Learned Probability Distributions.pdf:/home/roland/Zotero/storage/EZG2MFQQ/Anomaly Detection over Noisy Data using Learned Probability Distributions.pdf:application/pdf}
}
@misc{noauthor_handbook_nodate,
title = {Handbook of {Parametric} and {Nonparametric} {Statistical} {Procedures}},
author = {Sheskin, David J.},
file = {Handbook of Parametric and Nonparametric Statistical Procedures.pdf:/home/roland/Zotero/storage/XR92VNTY/Handbook of Parametric and Nonparametric Statistical Procedures.pdf:application/pdf}
}
@article{haibo_he_incremental_2011,
title = {Incremental {Learning} {From} {Stream} {Data}},
volume = {22},
issn = {1045-9227, 1941-0093},
url = {http://ieeexplore.ieee.org/document/6064897/},
doi = {10.1109/TNN.2011.2171713},
abstract = {Recent years have witnessed an incredibly increasing interest in the topic of incremental learning. Unlike conventional machine learning situations, data flow targeted by incremental learning becomes available continuously over time. Accordingly, it is desirable to be able to abandon the traditional assumption of the availability of representative training data during the training period to develop decision boundaries. Under scenarios of continuous data flow, the challenge is how to transform the vast amount of stream raw data into information and knowledge representation, and accumulate experience over time to support future decision-making process. In this paper, we propose a general adaptive incremental learning framework named ADAIN that is capable of learning from continuous raw data, accumulating experience over time, and using such knowledge to improve future learning and prediction performance. Detailed system level architecture and design strategies are presented in this paper. Simulation results over several real-world data sets are used to validate the effectiveness of this method.},
language = {en},
number = {12},
urldate = {2019-12-19},
journal = {IEEE Transactions on Neural Networks},
author = {He, Haibo and Chen, Sheng and Li, Kang and Xu, Xin},
month = dec,
year = {2011},
pages = {1901--1914},
file = {Haibo He et al_2011_Incremental Learning From Stream Data.pdf:/home/roland/Zotero/storage/L75MEEW3/Haibo He et al_2011_Incremental Learning From Stream Data.pdf:application/pdf}
}
@article{gama_survey_2012,
title = {A survey on learning from data streams: current and future trends},
volume = {1},
issn = {2192-6352, 2192-6360},
shorttitle = {A survey on learning from data streams},
url = {http://link.springer.com/10.1007/s13748-011-0002-6},
doi = {10.1007/s13748-011-0002-6},
abstract = {Nowadays, there are applications in which the data are modeled best not as persistent tables, but rather as transient data streams. In this article, we discuss the limitations of current machine learning and data mining algorithms. We discuss the fundamental issues in learning in dynamic environments like continuously maintaining learning models that evolve over time, learning and forgetting, concept drift and change detection. Data streams produce a huge amount of data that introduce new constraints in the design of learning algorithms: limited computational resources in terms of memory, CPU power, and communication bandwidth. We present some illustrative algorithms, designed to take these constraints into account, for decision-tree learning, hierarchical clustering and frequent pattern mining. We identify the main issues and current challenges that emerge in learning from data streams that open research lines for further developments.},
language = {en},
number = {1},
urldate = {2019-12-19},
journal = {Progress in Artificial Intelligence},
author = {Gama, João},
month = apr,
year = {2012},
pages = {45--55},
file = {Gama_2012_A survey on learning from data streams.pdf:/home/roland/Zotero/storage/HNUFEA4X/Gama_2012_A survey on learning from data streams.pdf:application/pdf}
}
@article{hong_entropy_2016,
title = {The {Entropy} and {PCA} {Based} {Anomaly} {Prediction} in {Data} {Streams}},
volume = {96},
issn = {18770509},
url = {https://linkinghub.elsevier.com/retrieve/pii/S1877050916319160},
doi = {10.1016/j.procs.2016.08.115},
abstract = {With the increase of data and information, anomaly management has been attracting much more attention and become an important research topic gradually. Previous literatures have advocated anomaly discovery and identification ignoring the fact that practice needs anomaly detection in advance (anomaly prediction) but anomaly detection with post-hoc analysis. Given this apparent gap, this research proposes a new approach for anomaly prediction based on PCA (principal component analysis) and information entropy theory, and support vector regression. The main idea of anomaly prediction is to train the historical data and to identify and recognize outlier data according to previous streams patterns and trends. The explorative results of SO2 concentration of exhaust gas in WFGD (Wet Flue Gas Desulfurization) demonstrate a good performance (efficient and accurate) of the target data prediction approach. This robust and novel method can be used to detect and predict the anomaly in data streams, and applied to fault prediction, credit card fraud prediction, intrusion prediction in cyber-security, malignant diagnosis, etc.},
language = {en},
urldate = {2019-12-19},
journal = {Procedia Computer Science},
author = {Hong, Daocheng and Zhao, Deshan and Zhang, Yanchun},
year = {2016},
pages = {139--146},
file = {Hong et al_2016_The Entropy and PCA Based Anomaly Prediction in Data Streams.pdf:/home/roland/Zotero/storage/9Y48T24D/Hong et al_2016_The Entropy and PCA Based Anomaly Prediction in Data Streams.pdf:application/pdf}
}
@inproceedings{angelov_anomaly_2014,
address = {Orlando, FL, USA},
title = {Anomaly detection based on eccentricity analysis},
isbn = {978-1-4799-4494-1},
url = {http://ieeexplore.ieee.org/document/7009497/},
doi = {10.1109/EALS.2014.7009497},
abstract = {In this paper, we propose a new eccentricity-based anomaly detection principle and algorithm. It is based on a further development of the recently introduced data analytics framework (TEDA – from typicality and eccentricity data analytics). We compare TEDA with the traditional statistical approach and prove that TEDA is a generalization of it in regards to the well-known “nσ” analysis (TEDA gives exactly the same result as the traditional “nσ” analysis but it does not require the restrictive prior assumptions that are made for the traditional approach to be in place). Moreover, it offers a nonparametric, closed form analytical descriptions (models of the data distribution) to be extracted from the real data realizations, not to be pre-assumed. In addition to that, for several types of proximity/similarity measures (such as Euclidean, cosine, Mahalanobis) it can be calculated recursively, thus, computationally very efficiently and is suitable for real time and online algorithms. Building on the per data sample, exact information about the data distribution in a closed analytical form, in this paper we propose a new less conservative and more sensitive condition for anomaly detection. It is quite different from the traditional “nσ” type conditions. We demonstrate example where traditional conditions would lead to an increased amount of false negatives or false positives in comparison with the proposed condition. The new condition is intuitive and easy to check for arbitrary data distribution and arbitrary small (but not less than 3) amount of data samples/points. Finally, because the anomaly/novelty/change detection is very important and basic data analysis operation which is in the fundament of such higher level tasks as fault detection, drift detection in data streams, clustering, outliers detection, autonomous video analytics, particle physics, etc. we point to some possible applications which will be the domain of future work.},
language = {en},
urldate = {2019-12-19},
booktitle = {2014 {IEEE} {Symposium} on {Evolving} and {Autonomous} {Learning} {Systems} ({EALS})},
publisher = {IEEE},
author = {Angelov, Plamen},
month = dec,
year = {2014},
pages = {1--8},
file = {Angelov_2014_Anomaly detection based on eccentricity analysis.pdf:/home/roland/Zotero/storage/4AKQWEVX/Angelov_2014_Anomaly detection based on eccentricity analysis.pdf:application/pdf}
}
@article{wang_statistical_2013,
title = {Statistical wavelet-based anomaly detection in big data with compressive sensing},
volume = {2013},
issn = {1687-1499},
url = {https://jwcn-eurasipjournals.springeropen.com/articles/10.1186/1687-1499-2013-269},
doi = {10.1186/1687-1499-2013-269},
abstract = {Anomaly detection in big data is a key problem in the big data analytics domain. In this paper, the definitions of anomaly detection and big data were presented. Due to the sampling and storage burden and the inadequacy of privacy protection of anomaly detection based on uncompressed data, compressive sensing theory was introduced and used in the anomaly detection algorithm. The anomaly detection criterion based on wavelet packet transform and statistic process control theory was deduced. The proposed anomaly detection technique was used for through-wall human detection to demonstrate the effectiveness. The experiments for detecting humans behind a brick wall and gypsum based on ultra-wideband radar signal were carried out. The results showed that the proposed anomaly detection algorithm could effectively detect the existence of a human being through compressed signals and uncompressed data.},
language = {en},
number = {1},
urldate = {2019-12-19},
journal = {EURASIP Journal on Wireless Communications and Networking},
author = {Wang, Wei and Lu, Dunqiang and Zhou, Xin and Zhang, Baoju and Mu, Jiasong},
month = dec,
year = {2013},
file = {Wang et al_2013_Statistical wavelet-based anomaly detection in big data with compressive sensing.pdf:/home/roland/Zotero/storage/7QT5IZAH/Wang et al_2013_Statistical wavelet-based anomaly detection in big data with compressive sensing.pdf:application/pdf}
}
@misc{noauthor_issues_nodate,
title = {Issues in {Evaluation} of {Stream} {Learning} {Algorithms}},
author = {Gama, João and Sebastião, Raquel and Rodrigues, Pedro Pereira},
year = {2009},
file = {Issues in Evaluation of Stream Learning Algorithms.pdf:/home/roland/Zotero/storage/23CNIT4X/Issues in Evaluation of Stream Learning Algorithms.pdf:application/pdf}
}
@techreport{tsymbal_problem_2004,
address = {Dublin},
title = {The problem of concept drift: definitions and related work},
abstract = {In the real world concepts are often not stable but change with time. Typical examples of this are weather prediction rules and customers’ preferences. The underlying data distribution may change as well. Often these changes make the model built on old data inconsistent with the new data, and regular updating of the model is necessary. This problem, known as concept drift, complicates the task of learning a model from data and requires special approaches, different from commonly used techniques, which treat arriving instances as equally important contributors to the final concept. This paper considers different types of concept drift, peculiarities of the problem, and gives a critical review of existing approaches to the problem.},
language = {en},
number = {106.2},
institution = {Computer Science Department, Trinity College Dublin},
author = {Tsymbal, Alexey},
year = {2004},
pages = {7},
file = {Tsymbal_2004_The problem of concept drift.pdf:/home/roland/Zotero/storage/HI85RK6Z/Tsymbal_2004_The problem of concept drift.pdf:application/pdf}
}
@article{cejnek_concept_2018,
title = {Concept drift robust adaptive novelty detection for data streams},
volume = {309},
issn = {09252312},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0925231218305253},
doi = {10.1016/j.neucom.2018.04.069},
abstract = {In this paper we study the performance of two original adaptive unsupervised novelty detection methods (NDMs) on data with concept drift. Newly, the concept drift is considered as a challenging data imbalance that should be ignored by the NDMs, and only system changes and outliers represent novelty. The field of application for such NDMs is broad. For example, the method can be used as a supportive method for real-time system fault detection, for onset detection of events in biomedical signals, in monitoring of nonlinearly controlled processes, for event driven automated trading, etc. The two newly studied methods are the error and learning based novelty detection (ELBND) and the learning entropy (LE) based detection. These methods use both the error and weight increments of a (supervised) learning model. Here, we study these methods with normalized least-mean squares (NLMS) adaptive filter, and while the NDMs were studied on various real life tasks, newly, we carry out the study on two types of data streams with concept drift to analyze the general ability for unsupervised novelty detection. The two data streams, one with system changes, second with outliers, represent different novelty scenarios to demonstrate the performance of the proposed NDMs with concept drifts in data. Both tested NDMs work as a feature extractor. Thus, a classification framework is used for the evaluation of the obtained features and NDM benchmarking, where two other NDMs, one based on the adaptive model plain error, second using the sample entropy (SE), are used as the reference for the comparison to the proposed methods. The results show that both newly studied NDMs are superior to the merely use of the plain error of adaptive model and also to the sample entropy based detection while they are robust against the concept drift occurrence.},
language = {en},
urldate = {2019-12-21},
journal = {Neurocomputing},
author = {Cejnek, Matous and Bukovsky, Ivo},
month = oct,
year = {2018},
pages = {46--53},
file = {Cejnek_Bukovsky_2018_Concept drift robust adaptive novelty detection for data streams.pdf:/home/roland/Zotero/storage/NZGUD45K/Cejnek_Bukovsky_2018_Concept drift robust adaptive novelty detection for data streams.pdf:application/pdf}
}
@inproceedings{jiang_real_2014,
title = {Real time contextual collective anomaly detection over multiple data streams},
abstract = {Anomaly detection has always been a critical and challenging problem in many application areas such as industry, healthcare, environment and finance. This problem becomes more difficult in the Big Data era as the data scale increases dramatically and the type of anomalies gets more complicated. In time sensitive applications like real time monitoring, data are often fed in streams and anomalies are required to be identified online across multiple streams with a short time delay. The new data characteristics and analysis requirements make existing solutions no longer suitable.},
language = {en},
booktitle = {Proceedings of the {ODD}},
author = {Jiang, Yexi and Zeng, Chunqiu and Xu, Jian and Li, Tao},
year = {2014},
pages = {8},
file = {Jiang et al_2014_Real time contextual collective anomaly detection over multiple data streams.pdf:/home/roland/Zotero/storage/L6MVAMMR/Jiang et al_2014_Real time contextual collective anomaly detection over multiple data streams.pdf:application/pdf}
}
@article{haidar_sharif_entropy_2012,
title = {An entropy approach for abnormal activities detection in video streams},
volume = {45},
issn = {00313203},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0031320311004778},
doi = {10.1016/j.patcog.2011.11.023},
abstract = {Detection of aberration in video surveillance is an important task for public safety. This paper puts forward a simple but effective framework to detect aberrations in video streams using Entropy, which is estimated on the statistical treatments of the spatiotemporal information of a set of interest points within a region of interest by measuring their degree of randomness of both directions and displacements. Entropy is a measure of the disorder/randomness in video frame. It has been showed that degree of randomness of the directions (circular variance) changes markedly in abnormal state of affairs and does change only direction variation but does not change with displacement variation of the interest point. Degree of randomness of the displacements has been put in for to counterbalance this deficiency. Simple simulations have been exercised to see the characteristics of these crude elements of entropy. Normalized entropy measure provides the knowledge of the state of anomalousness. Experiments have been conducted on various real world video datasets. Both simulation and experimental results report that entropy measures of the frames over time is an outstanding way to characterize anomalies in videos.},
language = {en},
number = {7},
urldate = {2020-01-06},
journal = {Pattern Recognition},
author = {Haidar Sharif, Md. and Djeraba, Chabane},
month = jul,
year = {2012},
pages = {2543--2561},
file = {Haidar Sharif_Djeraba_2012_An entropy approach for abnormal activities detection in video streams.pdf:/home/roland/Zotero/storage/G6SI8Q3M/Haidar Sharif_Djeraba_2012_An entropy approach for abnormal activities detection in video streams.pdf:application/pdf}
}
@article{li_incremental_2014,
title = {Incremental entropy-based clustering on categorical data streams with concept drift},
volume = {59},
issn = {09507051},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0950705114000446},
doi = {10.1016/j.knosys.2014.02.004},
abstract = {Clustering on categorical data streams is a relatively new field that has not received as much attention as static data and numerical data streams. One of the main difficulties in categorical data analysis is lacking in an appropriate way to define the similarity or dissimilarity measure on data. In this paper, we propose three dissimilarity measures: a point-cluster dissimilarity measure (based on incremental entropy), a cluster–cluster dissimilarity measure (based on incremental entropy) and a dissimilarity measure between two cluster distributions (based on sample standard deviation). We then propose an integrated framework for clustering categorical data streams with three algorithms: Minimal Dissimilarity Data Labeling (MDDL), Concept Drift Detection (CDD) and Cluster Evolving Analysis (CEA). We also make comparisons with other algorithms on several data streams synthesized from real data sets. Experiments show that the proposed algorithms are more effective in generating clustering results and detecting concept drift.},
language = {en},
urldate = {2020-01-06},
journal = {Knowledge-Based Systems},
author = {Li, Yanhong and Li, Deyu and Wang, Suge and Zhai, Yanhui},
month = mar,
year = {2014},
pages = {33--47},
file = {Li et al_2014_Incremental entropy-based clustering on categorical data streams with concept.pdf:/home/roland/Zotero/storage/G6389XR3/Li et al_2014_Incremental entropy-based clustering on categorical data streams with concept.pdf:application/pdf}
}
@article{tran_distance-based_2016,
title = {Distance-based outlier detection in data streams},
volume = {9},
issn = {21508097},
url = {http://dl.acm.org/citation.cfm?doid=2994509.2994526},
doi = {10.14778/2994509.2994526},
abstract = {Continuous outlier detection in data streams has important applications in fraud detection, network security, and public health. The arrival and departure of data objects in a streaming manner impose new challenges for outlier detection algorithms, especially in time and space efficiency. In the past decade, several studies have been performed to address the problem of distance-based outlier detection in data streams (DODDS), which adopts an unsupervised definition and does not have any distributional assumptions on data values. Our work is motivated by the lack of comparative evaluation among the state-of-the-art algorithms using the same datasets on the same platform. We systematically evaluate the most recent algorithms for DODDS under various stream settings and outlier rates. Our extensive results show that in most settings, the MCOD algorithm offers the superior performance among all the algorithms, including the most recent algorithm Thresh LEAP.},
language = {en},
number = {12},
urldate = {2020-01-06},
journal = {Proceedings of the VLDB Endowment},
author = {Tran, Luan and Fan, Liyue and Shahabi, Cyrus},
month = aug,
year = {2016},
pages = {1089--1100},
file = {Tran et al_2016_Distance-based outlier detection in data streams.pdf:/home/roland/Zotero/storage/6PMU4SR4/Tran et al_2016_Distance-based outlier detection in data streams.pdf:application/pdf}
}
@article{reunanen_unsupervised_2019,
title = {Unsupervised online detection and prediction of outliers in streams of sensor data},
issn = {2364-415X, 2364-4168},
url = {http://link.springer.com/10.1007/s41060-019-00191-3},
doi = {10.1007/s41060-019-00191-3},
abstract = {Outliers are unexpected observations, which deviate from the majority of observations. Outlier detection and prediction are challenging tasks, because outliers are rare by definition. A stream is an unbounded source of data, which has to be processed promptly. This article proposes novel methods for outlier detection and outlier prediction in streams of sensor data. The outlier detection is an independent, unsupervised process, which is implemented using an autoencoder. The outlier detection continuously evaluates if the latest data point xi from a stream is an inlier or an outlier. This distinction is based on the reconstruction cost accompanied with Chebyshev’s inequality and the EWMA (exponentially weighted moving average) model. The outlier prediction uses the results of the outlier detection to form the required training data. The outlier prediction utilizes LR (logistic regression), SGD (stochastic gradient descent) and the hidden representation provided by the autoencoder to predict outliers in streams. The results of the experiments show that the proposed methods (1) provide accurate results, (2) are calculated in reduced computation time and (3) use a low amount of memory. Our proposed methods are suitable for analyzing streams of sensor data and providing results with low latency. The experiments also indicated that the outlier prediction is able to anticipate the occurrence of outliers in streams of sensor data.},
language = {en},
urldate = {2020-01-06},
journal = {International Journal of Data Science and Analytics},
author = {Reunanen, Niko and Räty, Tomi and Jokinen, Juho J. and Hoyt, Tyler and Culler, David},
month = jun,
year = {2019},
file = {Reunanen et al_2019_Unsupervised online detection and prediction of outliers in streams of sensor.pdf:/home/roland/Zotero/storage/XIAUH85P/Reunanen et al_2019_Unsupervised online detection and prediction of outliers in streams of sensor.pdf:application/pdf}
}
@inproceedings{poonsirivong_rapid_2017,
address = {NakhonSiThammarat, Thailand},
title = {A rapid anomaly detection technique for big data curation},
isbn = {978-1-5090-4834-2},
url = {http://ieeexplore.ieee.org/document/8025900/},
doi = {10.1109/JCSSE.2017.8025900},
abstract = {Anomaly detection (outlier) using simulation helps us analyze the anomaly instances from big data source. As the hasty explosion of today’s data stream, outlier detection technique will be an analytical tool to be employed for evaluating massive unstructured datasets. In order to speed-up the processing time to handle enormous datasets, this research will conduct experiments of advanced distance-based outlier detection algorithms to investigate the most effective algorithms using MOA. The algorithms used in this study are Continuous Outlier Detection (COD), Micro-Cluster based COD or MCOD, and STream OutlieR Miner (STORM). The results demonstrate MCOD algorithm can outperform other two algorithms in terms of processing time and accurate anomalies.},
language = {en},
urldate = {2020-01-06},
booktitle = {2017 14th {International} {Joint} {Conference} on {Computer} {Science} and {Software} {Engineering} ({JCSSE})},
publisher = {IEEE},
author = {Poonsirivong, Korn and Jittawiriyanukoon, Chanintorn},
month = jul,
year = {2017},
pages = {1--6},
file = {Poonsirivong_Jittawiriyanukoon_2017_A rapid anomaly detection technique for big data curation.pdf:/home/roland/Zotero/storage/WTE9B23W/Poonsirivong_Jittawiriyanukoon_2017_A rapid anomaly detection technique for big data curation.pdf:application/pdf}
}
@inproceedings{cao_scalable_2014,
address = {Chicago, IL, USA},
title = {Scalable distance-based outlier detection over high-volume data streams},
isbn = {978-1-4799-2555-1},
url = {http://ieeexplore.ieee.org/document/6816641/},
doi = {10.1109/ICDE.2014.6816641},
abstract = {The discovery of distance-based outliers from huge volumes of streaming data is critical for modern applications ranging from credit card fraud detection to moving object monitoring. In this work, we propose the first general framework to handle the three major classes of distance-based outliers in streaming environments, including the traditional distance threshold based and the nearest-neighbor-based definitions. Our LEAP framework encompasses two general optimization principles applicable across all three outlier types. First, our "minimal probing" principle uses a lightweight probing operation to gather minimal yet sufficient evidence for outlier detection. This principle overturns the state-of-the-art methodology that requires routinely conducting expensive complete neighborhood searches to identify outliers. Second, our "lifespan-aware prioritization" principle leverages the temporal relationships among stream data points to prioritize the processing order among them during the probing process. Guided by these two principles, we design an outlier detection strategy which is proven to be optimal in CPU costs needed to determine the outlier status of any data point during its entire life. Our comprehensive experimental studies, using both synthetic as well as real streaming data, demonstrate that our methods are 3 orders of magnitude faster than state-of-the-art methods for a rich diversity of scenarios tested yet scale to high dimensional streaming data.},
language = {en},
urldate = {2020-01-06},
booktitle = {2014 {IEEE} 30th {International} {Conference} on {Data} {Engineering}},
publisher = {IEEE},
author = {Cao, Lei and Yang, Di and Wang, Qingyang and Yu, Yanwei and Wang, Jiayuan and Rundensteiner, Elke A.},
month = mar,
year = {2014},
pages = {76--87},
file = {Cao et al_2014_Scalable distance-based outlier detection over high-volume data streams.pdf:/home/roland/Zotero/storage/EVLZTYEN/Cao et al_2014_Scalable distance-based outlier detection over high-volume data streams.pdf:application/pdf}
}