% anomaly-detection.bib
@article{bolton_statistical_2002,
title = {Statistical {Fraud} {Detection}: {A} {Review}},
volume = {17},
number = {3},
journal = {Statistical Science},
author = {Bolton, Richard J. and Hand, David J.},
year = {2002},
keywords = {survey},
pages = {235--255},
file = {Bolton_Hand_2002_Statistical Fraud Detection.pdf:/home/roland/Zotero/storage/7NSHS3NQ/Bolton_Hand_2002_Statistical Fraud Detection.pdf:application/pdf}
}
@article{chandola_anomaly_2009,
title = {Anomaly detection: {A} survey},
volume = {41},
number = {3},
journal = {ACM Computing Surveys},
author = {Chandola, Varun and Banerjee, Arindam and Kumar, Vipin},
year = {2009},
keywords = {survey},
pages = {15:1--15:58},
file = {Chandola et al_2009_Anomaly detection.pdf:/home/roland/Zotero/storage/UHCR792T/Chandola et al_2009_Anomaly detection.pdf:application/pdf}
}
@article{chandola_anomaly_2012,
title = {Anomaly {Detection} for {Discrete} {Sequences}: {A} {Survey}},
volume = {24},
number = {5},
journal = {IEEE Transactions on Knowledge and Data Engineering},
author = {Chandola, Varun and Banerjee, Arindam and Kumar, Vipin},
year = {2012},
keywords = {survey},
pages = {823--839},
file = {Chandola et al_2012_Anomaly Detection for Discrete Sequences.pdf:/home/roland/Zotero/storage/URQI7P8K/Chandola et al_2012_Anomaly Detection for Discrete Sequences.pdf:application/pdf}
}
@article{lane_sequence_nodate,
title = {Sequence {Matching} and {Learning} in {Anomaly} {Detection} for {Computer} {Security}},
abstract = {Two problems of importance in computer security are to 1) detect the presence of an intruder masquerading as the valid user and 2) detect the perpetration of abusive actions on the part of an otherwise innocuous user. We have developed an approach to these problems that examines sequences of user actions (UNIX commands) to classify behavior as normal or anomalous. In this paper we explore the matching function needed to compare a current behavioral sequence to a historical profile. We discuss the difficulties of performing matching in human-generated data and show that exact string matching is insufficient to this domain. We demonstrate a number of partial matching functions and examine their behavior on user command data. In particular, we explore two methods for weighting scores by adjacency of matches as well as two growth functions (polynomial and exponential) for scoring similarities. We find, empirically, that the optimal similarity measure is user dependent but that measures based on the assumption of causal linkage between user commands are superior for this domain.},
language = {en},
author = {Lane, Terran and Brodley, Carla E},
pages = {7},
file = {Lane_Brodley_Sequence Matching and Learning in Anomaly Detection for Computer Security.pdf:/home/roland/Zotero/storage/726N79DP/Lane_Brodley_Sequence Matching and Learning in Anomaly Detection for Computer Security.pdf:application/pdf}
}
@inproceedings{wang_statistical_2011,
address = {Dublin, Ireland},
title = {Statistical {Techniques} for {Online} {Anomaly} {Detection} in {Data} {Centers}},
isbn = {978-1-4244-9219-0},
url = {http://ieeexplore.ieee.org/document/5990537/},
doi = {10.1109/INM.2011.5990537},
abstract = {Online anomaly detection is an important step in data center management, requiring light-weight techniques that provide sufficient accuracy for subsequent diagnosis and management actions. This paper presents statistical techniques based on the Tukey and Relative Entropy statistics, and applies them to data collected from a production environment and to data captured from a testbed for multi-tier web applications running on server class machines. The proposed techniques are lightweight and improve over standard Gaussian assumptions in terms of performance.},
language = {en},
urldate = {2019-09-19},
booktitle = {12th {IFIP}/{IEEE} {International} {Symposium} on {Integrated} {Network} {Management} ({IM} 2011) and {Workshops}},
publisher = {IEEE},
author = {Wang, Chengwei and Viswanathan, Krishnamurthy and Choudur, Lakshminarayan and Talwar, Vanish and Satterfield, Wade and Schwan, Karsten},
month = may,
year = {2011},
pages = {385--392},
file = {Wang et al_2011_Statistical techniques for online anomaly detection in data centers.pdf:/home/roland/Zotero/storage/2T5XBMFF/Wang et al_2011_Statistical techniques for online anomaly detection in data centers.pdf:application/pdf}
}
@incollection{hutchison_anomalous_2004,
address = {Berlin, Heidelberg},
title = {Anomalous {Payload}-{Based} {Network} {Intrusion} {Detection}},
volume = {3224},
isbn = {978-3-540-23123-3 978-3-540-30143-1},
url = {http://link.springer.com/10.1007/978-3-540-30143-1_11},
abstract = {We present a payload-based anomaly detector, we call PAYL, for intrusion detection. PAYL models the normal application payload of network traffic in a fully automatic, unsupervised and very efficient fashion. We first compute during a training phase a profile byte frequency distribution and their standard deviation of the application payload flowing to a single host and port. We then use Mahalanobis distance during the detection phase to calculate the similarity of new data against the pre-computed profile. The detector compares this measure against a threshold and generates an alert when the distance of the new input exceeds this threshold. We demonstrate the surprising effectiveness of the method on the 1999 DARPA IDS dataset and a live dataset we collected on the Columbia CS department network. In one case nearly 100\% accuracy is achieved with 0.1\% false positive rate for port 80 traffic.},
language = {en},
urldate = {2019-09-19},
booktitle = {Recent {Advances} in {Intrusion} {Detection}},
publisher = {Springer Berlin Heidelberg},
author = {Wang, Ke and Stolfo, Salvatore J.},
editor = {Hutchison, David and Kanade, Takeo and Kittler, Josef and Kleinberg, Jon M. and Mattern, Friedemann and Mitchell, John C. and Naor, Moni and Nierstrasz, Oscar and Pandu Rangan, C. and Steffen, Bernhard and Sudan, Madhu and Terzopoulos, Demetri and Tygar, Doug and Vardi, Moshe Y. and Weikum, Gerhard and Jonsson, Erland and Valdes, Alfonso and Almgren, Magnus},
year = {2004},
doi = {10.1007/978-3-540-30143-1_11},
pages = {203--222},
file = {Wang_Stolfo_2004_Anomalous Payload-Based Network Intrusion Detection.pdf:/home/roland/Zotero/storage/KSEGUYLA/Wang_Stolfo_2004_Anomalous Payload-Based Network Intrusion Detection.pdf:application/pdf}
}
@article{talagala_anomaly_2019,
title = {Anomaly {Detection} in {Streaming} {Nonstationary} {Temporal} {Data}},
issn = {1061-8600, 1537-2715},
url = {https://www.tandfonline.com/doi/full/10.1080/10618600.2019.1617160},
doi = {10.1080/10618600.2019.1617160},
abstract = {This article proposes a framework that provides early detection of anomalous series within a large collection of nonstationary streaming time-series data. We define an anomaly as an observation that is very unlikely given the recent distribution of a given system. The proposed framework first calculates a boundary for the system’s typical behavior using extreme value theory. Then a sliding window is used to test for anomalous series within a newly arrived collection of series. The model uses time series features as inputs, and a density-based comparison to detect any significant changes in the distribution of the features. Using various synthetic and real world datasets, we demonstrate the wide applicability and usefulness of our proposed framework. We show that the proposed algorithm can work well in the presence of noisy nonstationary data within multiple classes of time series. This framework is implemented in the open source R package oddstream. R code and data are available in the online supplementary materials.},
language = {en},
urldate = {2019-09-19},
journal = {Journal of Computational and Graphical Statistics},
author = {Talagala, Priyanga Dilini and Hyndman, Rob J. and Smith-Miles, Kate and Kandanaarachchi, Sevvandi and Muñoz, Mario A.},
month = jun,
year = {2019},
pages = {1--21},
file = {Talagala et al_2019_Anomaly Detection in Streaming Nonstationary Temporal Data.pdf:/home/roland/Zotero/storage/FFG2GNDH/Talagala et al_2019_Anomaly Detection in Streaming Nonstationary Temporal Data.pdf:application/pdf}
}
@inproceedings{dos_santos_teixeira_data_2010,
address = {Sierre, Switzerland},
title = {Data {Stream} {Anomaly} {Detection} through {Principal} {Subspace} {Tracking}},
isbn = {978-1-60558-639-7},
url = {http://portal.acm.org/citation.cfm?doid=1774088.1774434},
doi = {10.1145/1774088.1774434},
abstract = {We consider the problem of anomaly detection in multiple co-evolving data streams. In this paper, we introduce FRAHST (Fast Rank-Adaptive row-Householder Subspace Tracking). It automatically learns the principal subspace from N numerical data streams and an anomaly is indicated by a change in the number of latent variables. Our technique provides state-of-the-art estimates for the subspace basis and has a true dominant complexity of only 5Nr operations while satisfying all desirable streaming constraints. FRAHST successfully detects subtle anomalous patterns and when compared against four other anomaly detection techniques, it is the only one with a consistent F1 ≥ 80\% in the Abilene datasets as well as in the ISP datasets introduced in this work.},
language = {en},
urldate = {2019-09-19},
booktitle = {Proceedings of the 2010 {ACM} {Symposium} on {Applied} {Computing} - {SAC} '10},
publisher = {ACM Press},
author = {dos Santos Teixeira, Pedro Henriques and Milidiú, Ruy Luiz},
year = {2010},
pages = {1609},
file = {dos Santos Teixeira_Milidiú_2010_Data stream anomaly detection through principal subspace tracking.pdf:/home/roland/Zotero/storage/D7ZHPBZ5/dos Santos Teixeira_Milidiú_2010_Data stream anomaly detection through principal subspace tracking.pdf:application/pdf}
}
@inproceedings{zheng_detecting_2015,
address = {Bellevue, Washington},
title = {Detecting collective anomalies from multiple spatio-temporal datasets across different domains},
isbn = {978-1-4503-3967-4},
url = {http://dl.acm.org/citation.cfm?doid=2820783.2820813},
doi = {10.1145/2820783.2820813},
abstract = {The collective anomaly denotes a collection of nearby locations that are anomalous during a few consecutive time intervals in terms of phenomena collectively witnessed by multiple datasets. The collective anomalies suggest there are underlying problems that may not be identified based on a single data source or in a single location. It also associates individual locations and time intervals, formulating a panoramic view of an event. To detect a collective anomaly is very challenging, however, as different datasets have different densities, distributions, and scales. Additionally, to find the spatio-temporal scope of a collective anomaly is time consuming as there are many ways to combine regions and time slots. Our method consists of three components: Multiple-Source Latent-Topic (MSLT) model, Spatio-Temporal Likelihood Ratio Test (ST\_LRT) model, and a candidate generation algorithm. MSLT combines multiple datasets to infer the latent functions of a geographic region in the framework of a topic model. This in turn helps estimate the underlying distribution of a sparse dataset generated in the region. ST\_LRT learns a proper underlying distribution for different datasets, and calculates an anomalous degree for each dataset based on a likelihood ratio test (LRT). It then aggregates the anomalous degrees of different datasets, using a skyline detection algorithm. We evaluate our method using five datasets related to New York City (NYC): 311 complaints, taxicab data, bike rental data, points of interest, and road network data, finding the anomalies that cannot be identified (or earlier than those detected) by a single dataset. Results show the advantages beyond six baseline methods.},
language = {en},
urldate = {2019-09-19},
booktitle = {Proceedings of the 23rd {SIGSPATIAL} {International} {Conference} on {Advances} in {Geographic} {Information} {Systems} - {GIS} '15},
publisher = {ACM Press},
author = {Zheng, Yu and Zhang, Huichu and Yu, Yong},
year = {2015},
pages = {1--10},
file = {Zheng et al_2015_Detecting collective anomalies from multiple spatio-temporal datasets across.pdf:/home/roland/Zotero/storage/ZEJQRRF8/Zheng et al_2015_Detecting collective anomalies from multiple spatio-temporal datasets across.pdf:application/pdf}
}
@article{schneider_expected_2016,
title = {Expected {Similarity} {Estimation} for {Large}-{Scale} {Batch} and {Streaming} {Anomaly} {Detection}},
volume = {105},
issn = {0885-6125, 1573-0565},
url = {http://arxiv.org/abs/1601.06602},
doi = {10.1007/s10994-016-5567-7},
abstract = {We present a novel algorithm for anomaly detection on very large datasets and data streams. The method, named EXPected Similarity Estimation (EXPoSE), is kernel-based and able to efficiently compute the similarity between new data points and the distribution of regular data. The estimator is formulated as an inner product with a reproducing kernel Hilbert space embedding and makes no assumption about the type or shape of the underlying data distribution. We show that offline (batch) learning with EXPoSE can be done in linear time and online (incremental) learning takes constant time per instance and model update. Furthermore, EXPoSE can make predictions in constant time, while it requires only constant memory. In addition, we propose different methodologies for concept drift adaptation on evolving data streams. On several real datasets we demonstrate that our approach can compete with state of the art algorithms for anomaly detection while being an order of magnitude faster than most other approaches.},
language = {en},
number = {3},
urldate = {2019-09-19},
journal = {Machine Learning},
author = {Schneider, Markus and Ertel, Wolfgang and Ramos, Fabio},
month = dec,
year = {2016},
note = {arXiv: 1601.06602},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning},
pages = {305--333},
file = {Schneider et al_2016_Expected Similarity Estimation for Large-Scale Batch and Streaming Anomaly.pdf:/home/roland/Zotero/storage/J53FKASB/Schneider et al_2016_Expected Similarity Estimation for Large-Scale Batch and Streaming Anomaly.pdf:application/pdf}
}
@inproceedings{lavin_evaluating_2015,
address = {Miami, FL, USA},
title = {Evaluating {Real}-{Time} {Anomaly} {Detection} {Algorithms} -- {The} {Numenta} {Anomaly} {Benchmark}},
isbn = {978-1-5090-0287-0},
url = {http://ieeexplore.ieee.org/document/7424283/},
doi = {10.1109/ICMLA.2015.141},
abstract = {Much of the world’s data is streaming, time-series data, where anomalies give significant information in critical situations; examples abound in domains such as finance, IT, security, medical, and energy. Yet detecting anomalies in streaming data is a difficult task, requiring detectors to process data in real-time, not batches, and learn while simultaneously making predictions. There are no benchmarks to adequately test and score the efficacy of real-time anomaly detectors. Here we propose the Numenta Anomaly Benchmark (NAB), which attempts to provide a controlled and repeatable environment of open-source tools to test and measure anomaly detection algorithms on streaming data. The perfect detector would detect all anomalies as soon as possible, trigger no false alarms, work with real-world time-series data across a variety of domains, and automatically adapt to changing statistics. Rewarding these characteristics is formalized in NAB, using a scoring algorithm designed for streaming data. NAB evaluates detectors on a benchmark dataset with labeled, real-world time-series data. We present these components, and give results and analyses for several open source, commercially-used algorithms. The goal for NAB is to provide a standard, open source framework with which the research community can compare and evaluate different algorithms for detecting anomalies in streaming data.},
language = {en},
urldate = {2019-09-19},
booktitle = {2015 {IEEE} 14th {International} {Conference} on {Machine} {Learning} and {Applications} ({ICMLA})},
publisher = {IEEE},
author = {Lavin, Alexander and Ahmad, Subutai},
month = dec,
year = {2015},
pages = {38--44},
file = {Lavin_Ahmad_2015_Evaluating Real-Time Anomaly Detection Algorithms -- The Numenta Anomaly.pdf:/home/roland/Zotero/storage/KDJRA5EU/Lavin_Ahmad_2015_Evaluating Real-Time Anomaly Detection Algorithms -- The Numenta Anomaly.pdf:application/pdf}
}
@article{adams_bayesian_2007,
title = {Bayesian {Online} {Changepoint} {Detection}},
url = {http://arxiv.org/abs/0710.3742},
abstract = {Changepoints are abrupt variations in the generative parameters of a data sequence. Online detection of changepoints is useful in modelling and prediction of time series in application areas such as finance, biometrics, and robotics. While frequentist methods have yielded online filtering and prediction techniques, most Bayesian papers have focused on the retrospective segmentation problem. Here we examine the case where the model parameters before and after the changepoint are independent and we derive an online algorithm for exact inference of the most recent changepoint. We compute the probability distribution of the length of the current “run,” or time since the last changepoint, using a simple message-passing algorithm. Our implementation is highly modular so that the algorithm may be applied to a variety of types of data. We illustrate this modularity by demonstrating the algorithm on three different real-world data sets.},
language = {en},
urldate = {2019-09-19},
journal = {arXiv:0710.3742 [stat]},
author = {Adams, Ryan Prescott and MacKay, David J. C.},
month = oct,
year = {2007},
note = {arXiv: 0710.3742},
keywords = {Statistics - Machine Learning},
file = {Adams_MacKay_2007_Bayesian Online Changepoint Detection.pdf:/home/roland/Zotero/storage/48AQDRCC/Adams_MacKay_2007_Bayesian Online Changepoint Detection.pdf:application/pdf}
}
@article{ding_anomaly_2013,
title = {An {Anomaly} {Detection} {Approach} {Based} on {Isolation} {Forest} {Algorithm} for {Streaming} {Data} using {Sliding} {Window}},
volume = {46},
issn = {14746670},
url = {https://linkinghub.elsevier.com/retrieve/pii/S1474667016314999},
doi = {10.3182/20130902-3-CN-3020.00044},
abstract = {Anomalous behavior detection in many applications is becoming more and more important, such as computer security, sensor network and so on. However, the inherent characteristics of streaming data, such as generated quickly, data infinite, tremendous volume and the phenomenon of concept drift, imply that anomaly detection in streaming data is a challenging task. In this paper, using the frame of sliding windows and taking into account the concept drift phenomenon, a novel anomaly detection framework is presented and an adapted streaming data anomaly detection algorithm based on the iForest algorithm, namely iForestASD, is proposed. The experiment results performed on four real-world datasets derived from the UCI repository demonstrate that the proposed algorithm can effectively detect anomalous instances for streaming data.},
language = {en},
number = {20},
urldate = {2019-09-19},
journal = {IFAC Proceedings Volumes},
author = {Ding, Zhiguo and Fei, Minrui},
year = {2013},
pages = {12--17},
file = {Ding_Fei_2013_An Anomaly Detection Approach Based on Isolation Forest Algorithm for Streaming.pdf:/home/roland/Zotero/storage/4SAKXG5K/Ding_Fei_2013_An Anomaly Detection Approach Based on Isolation Forest Algorithm for Streaming.pdf:application/pdf}
}
@article{hill_anomaly_2010,
title = {Anomaly detection in streaming environmental sensor data: {A} data-driven modeling approach},
volume = {25},
issn = {13648152},
shorttitle = {Anomaly detection in streaming environmental sensor data},
url = {https://linkinghub.elsevier.com/retrieve/pii/S1364815209002321},
doi = {10.1016/j.envsoft.2009.08.010},
abstract = {The deployment of environmental sensors has generated an interest in real-time applications of the data they collect. This research develops a real-time anomaly detection method for environmental data streams that can be used to identify data that deviate from historical patterns. The method is based on an autoregressive data-driven model of the data stream and its corresponding prediction interval. It performs fast, incremental evaluation of data as it becomes available, scales to large quantities of data, and requires no pre-classification of anomalies. Furthermore, this method can be easily deployed on a large heterogeneous sensor network. Sixteen instantiations of this method are compared based on their ability to identify measurement errors in a windspeed data stream from Corpus Christi, Texas. The results indicate that a multilayer perceptron model of the data stream, coupled with replacement of anomalous data points, performs well at identifying erroneous data in this data stream.},
language = {en},
number = {9},
urldate = {2019-09-19},
journal = {Environmental Modelling \& Software},
author = {Hill, David J. and Minsker, Barbara S.},
month = sep,
year = {2010},
pages = {1014--1022},
file = {Hill_Minsker_2010_Anomaly detection in streaming environmental sensor data.pdf:/home/roland/Zotero/storage/BG79P64V/Hill_Minsker_2010_Anomaly detection in streaming environmental sensor data.pdf:application/pdf}
}
@article{wu_hierarchical_2018,
title = {Hierarchical {Temporal} {Memory} method for time-series-based anomaly detection},
volume = {273},
issn = {09252312},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0925231217313887},
doi = {10.1016/j.neucom.2017.08.026},
abstract = {The time-series-based anomaly detection is a well-studied subject, and it is well-documented in the literature. Theories and techniques have been proposed and applied successfully for domain-specific applications. However, this subject has received renewed interest motivated by the increasing importance of continuously learning, tolerance to noise and generalization. This paper tackles these problems by applying Hierarchical Temporal Memory (HTM), a novel biological neural network. HTM is more suitable for dealing with the changing pattern of data since it is capable of incorporating contextual information from the past to make more accurate prediction. Both artificial and real datasets are tested with HTM for the time-series-based anomaly detection. The experiment results show that HTM can efficiently detect the anomalies in time series data.},
language = {en},
urldate = {2019-09-19},
journal = {Neurocomputing},
author = {Wu, Jia and Zeng, Weiru and Yan, Fei},
month = jan,
year = {2018},
pages = {535--546},
file = {Wu et al_2018_Hierarchical Temporal Memory method for time-series-based anomaly detection.pdf:/home/roland/Zotero/storage/I2EC22SU/Wu et al_2018_Hierarchical Temporal Memory method for time-series-based anomaly detection.pdf:application/pdf}
}
@article{miller_twitter_2014,
title = {Twitter spammer detection using data stream clustering},
volume = {260},
issn = {00200255},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0020025513008037},
doi = {10.1016/j.ins.2013.11.016},
abstract = {The rapid growth of Twitter has triggered a dramatic increase in spam volume and sophistication. The abuse of certain Twitter components such as “hashtags”, “mentions”, and shortened URLs enables spammers to operate efficiently. These same features, however, may be a key factor in identifying new spam accounts as shown in previous studies. Our study provides three novel contributions. Firstly, previous studies have approached spam detection as a classification problem, whereas we view it as an anomaly detection problem. Secondly, 95 one-gram features from tweet text were introduced alongside the user information analyzed in previous studies. Finally, to effectively handle the streaming nature of tweets, two stream clustering algorithms, StreamKM++ and DenStream, were modified to facilitate spam identification. Both algorithms clustered normal Twitter users, treating outliers as spammers. Each of these algorithms performed well individually, with StreamKM++ achieving 99\% recall and a 6.4\% false positive rate; and DenStream producing 99\% recall and a 2.8\% false positive rate. When used in conjunction, these algorithms reached 100\% recall and a 2.2\% false positive rate, meaning that our system was able to identify 100\% of the spammers in our test while incorrectly detecting only 2.2\% of normal users as spammers.},
language = {en},
urldate = {2019-09-19},
journal = {Information Sciences},
author = {Miller, Zachary and Dickinson, Brian and Deitrick, William and Hu, Wei and Wang, Alex Hai},
month = mar,
year = {2014},
keywords = {clustering},
pages = {64--73},
file = {Miller et al_2014_Twitter spammer detection using data stream clustering.pdf:/home/roland/Zotero/storage/UHU9X7M4/Miller et al_2014_Twitter spammer detection using data stream clustering.pdf:application/pdf}
}
@article{guha_robust_nodate,
title = {Robust {Random} {Cut} {Forest} {Based} {Anomaly} {Detection} {On} {Streams}},
abstract = {In this paper we focus on the anomaly detection problem for dynamic data streams through the lens of random cut forests. We investigate a robust random cut data structure that can be used as a sketch or synopsis of the input stream. We provide a plausible definition of non-parametric anomalies based on the influence of an unseen point on the remainder of the data, i.e., the externality imposed by that point. We show how the sketch can be efficiently updated in a dynamic data stream. We demonstrate the viability of the algorithm on publicly available real data.},
language = {en},
author = {Guha, Sudipto and Mishra, Nina and Roy, Gourav and Schrijvers, Okke},
keywords = {ensemble, random forest},
pages = {10},
file = {Guha et al_Robust Random Cut Forest Based Anomaly Detection On Streams.pdf:/home/roland/Zotero/storage/WX5CUPX7/Guha et al_Robust Random Cut Forest Based Anomaly Detection On Streams.pdf:application/pdf}
}
@article{hayes_contextual_2015,
title = {Contextual anomaly detection framework for big sensor data},
volume = {2},
issn = {2196-1115},
url = {http://www.journalofbigdata.com/content/2/1/2},
doi = {10.1186/s40537-014-0011-y},
abstract = {The ability to detect and process anomalies for Big Data in real-time is a difficult task. The volume and velocity of the data within many systems makes it difficult for typical algorithms to scale and retain their real-time characteristics. The pervasiveness of data combined with the problem that many existing algorithms only consider the content of the data source; e.g. a sensor reading itself without concern for its context, leaves room for potential improvement. The proposed work defines a contextual anomaly detection framework. It is composed of two distinct steps: content detection and context detection. The content detector is used to determine anomalies in real-time, while possibly, and likely, identifying false positives. The context detector is used to prune the output of the content detector, identifying those anomalies which are considered both content and contextually anomalous. The context detector utilizes the concept of profiles, which are groups of similarly grouped data points generated by a multivariate clustering algorithm. The research has been evaluated against two real-world sensor datasets provided by a local company in Brampton, Canada. Additionally, the framework has been evaluated against the open-source Dodgers dataset, available at the UCI machine learning repository, and against the R statistical toolbox.},
language = {en},
number = {1},
urldate = {2019-07-16},
journal = {Journal of Big Data},
author = {Hayes, Michael A and Capretz, Miriam AM},
month = dec,
year = {2015},
keywords = {clustering},
file = {Hayes_Capretz_2015_Contextual anomaly detection framework for big sensor data.pdf:/home/roland/Zotero/storage/SPQJN53F/Hayes_Capretz_2015_Contextual anomaly detection framework for big sensor data.pdf:application/pdf}
}
@article{akoglu_graph_2015,
title = {Graph based anomaly detection and description: a survey},
volume = {29},
issn = {1384-5810, 1573-756X},
shorttitle = {Graph based anomaly detection and description},
url = {http://link.springer.com/10.1007/s10618-014-0365-y},
doi = {10.1007/s10618-014-0365-y},
language = {en},
number = {3},
urldate = {2019-07-16},
journal = {Data Mining and Knowledge Discovery},
author = {Akoglu, Leman and Tong, Hanghang and Koutra, Danai},
month = may,
year = {2015},
keywords = {survey},
pages = {626--688},
file = {Akoglu et al_2015_Graph based anomaly detection and description.pdf:/home/roland/Zotero/storage/4FUYS6QY/Akoglu et al_2015_Graph based anomaly detection and description.pdf:application/pdf}
}
@inproceedings{rettig_online_2015,
address = {Santa Clara, CA, USA},
title = {Online {Anomaly} {Detection} over {Big} {Data} {Streams}},
isbn = {978-1-4799-9926-2},
url = {http://ieeexplore.ieee.org/document/7363865/},
doi = {10.1109/BigData.2015.7363865},
abstract = {Data quality is a challenging problem in many real world application domains. While a lot of attention has been given to detect anomalies for data at rest, detecting anomalies for streaming applications still largely remains an open problem. For applications involving several data streams, the challenge of detecting anomalies has become harder over time, as data can dynamically evolve in subtle ways following changes in the underlying infrastructure. In this paper, we describe and empirically evaluate an online anomaly detection pipeline that satisfies two key conditions: generality and scalability. Our technique works on numerical data as well as on categorical data and makes no assumption on the underlying data distributions. We implement two metrics, relative entropy and Pearson correlation, to dynamically detect anomalies. The two metrics we use provide an efficient and effective detection of anomalies over high velocity streams of events.},
language = {en},
urldate = {2019-07-16},
booktitle = {2015 {IEEE} {International} {Conference} on {Big} {Data} ({Big} {Data})},
publisher = {IEEE},
author = {Rettig, Laura and Khayati, Mourad and Cudre-Mauroux, Philippe and Piorkowski, Michal},
month = oct,
year = {2015},
pages = {1113--1122},
file = {Rettig et al_2015_Online anomaly detection over Big Data streams.pdf:/home/roland/Zotero/storage/8A6PHA4W/Rettig et al_2015_Online anomaly detection over Big Data streams.pdf:application/pdf}
}
@article{shipmon_time_2017,
title = {Time {Series} {Anomaly} {Detection}},
abstract = {Google uses continuous streams of data from industry partners in order to deliver accurate results to users. Unexpected drops in traffic can be an indication of an underlying issue and may be an early warning that remedial action may be necessary. Detecting such drops is non-trivial because streams are variable and noisy, with roughly regular spikes (in many different shapes) in traffic data. We investigated the question of whether or not we can predict anomalies in these data streams. Our goal is to utilize Machine Learning and statistical approaches to classify anomalous drops in periodic, but noisy, traffic patterns. Since we do not have a large body of labeled examples to directly apply supervised learning for anomaly classification, we approached the problem in two parts. First we used TensorFlow to train our various models including DNNs, RNNs, and LSTMs to perform regression and predict the expected value in the time series. Secondly we created anomaly detection rules that compared the actual values to predicted values. Since the problem requires finding sustained anomalies, rather than just short delays or momentary inactivity in the data, our two detection methods focused on continuous sections of activity rather than just single points. We tried multiple combinations of our models and rules and found that using the intersection of our two anomaly detection methods proved to be an effective method of detecting anomalies on almost all of our models. In the process we also found that not all data fell within our experimental assumptions, as one data stream had no periodicity, and therefore no time based model could predict it.},
language = {en},
journal = {arXiv:1708.03665},
author = {Shipmon, Dominique T and Gurevitch, Jason M and Piselli, Paolo M and Edwards, Steve},
year = {2017},
pages = {9},
file = {Shipmon et al_2017_Time series anomaly detection\; detection of anomalous drops with limited.pdf:/home/roland/Zotero/storage/AE5CGS2W/Shipmon et al_2017_Time series anomaly detection\; detection of anomalous drops with limited.pdf:application/pdf}
}
@article{ahmad_real-time_2016,
title = {Real-{Time} {Anomaly} {Detection} for {Streaming} {Analytics}},
url = {http://arxiv.org/abs/1607.02480},
abstract = {Much of the world’s data is streaming, time-series data, where anomalies give significant information in critical situations. Yet detecting anomalies in streaming data is a difficult task, requiring detectors to process data in real-time, and learn while simultaneously making predictions. We present a novel anomaly detection technique based on an on-line sequence memory algorithm called Hierarchical Temporal Memory (HTM). We show results from a live application that detects anomalies in financial metrics in realtime. We also test the algorithm on NAB, a published benchmark for real-time anomaly detection, where our algorithm achieves best-in-class results.},
language = {en},
urldate = {2019-07-16},
journal = {arXiv:1607.02480 [cs]},
author = {Ahmad, Subutai and Purdy, Scott},
month = jul,
year = {2016},
note = {arXiv: 1607.02480},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Distributed, Parallel, and Cluster Computing, Electrical Engineering and Systems Science - Systems and Control},
file = {Ahmad_Purdy_2016_Real-Time Anomaly Detection for Streaming Analytics.pdf:/home/roland/Zotero/storage/SPMVZ7SB/Ahmad_Purdy_2016_Real-Time Anomaly Detection for Streaming Analytics.pdf:application/pdf}
}
@article{ahmad_unsupervised_2017,
title = {Unsupervised real-time anomaly detection for streaming data},
volume = {262},
issn = {09252312},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0925231217309864},
doi = {10.1016/j.neucom.2017.04.070},
abstract = {We are seeing an enormous increase in the availability of streaming, time-series data. Largely driven by the rise of connected real-time data sources, this data presents technical challenges and opportunities. One fundamental capability for streaming analytics is to model each stream in an unsupervised fashion and detect unusual, anomalous behaviors in real-time. Early anomaly detection is valuable, yet it can be difficult to execute reliably in practice. Application constraints require systems to process data in realtime, not batches. Streaming data inherently exhibits concept drift, favoring algorithms that learn continuously. Furthermore, the massive number of independent streams in practice requires that anomaly detectors be fully automated. In this paper we propose a novel anomaly detection algorithm that meets these constraints. The technique is based on an online sequence memory algorithm called Hierarchical Temporal Memory (HTM). We also present results using the Numenta Anomaly Benchmark (NAB), a benchmark containing real-world data streams with labeled anomalies. The benchmark, the first of its kind, provides a controlled open-source environment for testing anomaly detection algorithms on streaming data. We present results and analysis for a wide range of algorithms on this benchmark, and discuss future challenges for the emerging field of streaming analytics.},
language = {en},
urldate = {2019-07-16},
journal = {Neurocomputing},
author = {Ahmad, Subutai and Lavin, Alexander and Purdy, Scott and Agha, Zuha},
month = nov,
year = {2017},
keywords = {benchmark, htm},
pages = {134--147},
file = {Ahmad et al_2017_Unsupervised real-time anomaly detection for streaming data.pdf:/home/roland/Zotero/storage/XIZQUAB5/Ahmad et al_2017_Unsupervised real-time anomaly detection for streaming data.pdf:application/pdf}
}
@inproceedings{bifet_new_2009,
address = {Paris, France},
title = {New {Ensemble} {Methods} {For} {Evolving} {Data} {Streams}},
isbn = {978-1-60558-495-9},
url = {http://portal.acm.org/citation.cfm?doid=1557019.1557041},
doi = {10.1145/1557019.1557041},
abstract = {Advanced analysis of data streams is quickly becoming a key area of data mining research as the number of applications demanding such processing increases. Online mining when such data streams evolve over time, that is when concepts drift or change completely, is becoming one of the core issues. When tackling non-stationary concepts, ensembles of classifiers have several advantages over single classifier methods: they are easy to scale and parallelize, they can adapt to change quickly by pruning under-performing parts of the ensemble, and they therefore usually also generate more accurate concept descriptions. This paper proposes a new experimental data stream framework for studying concept drift, and two new variants of Bagging: ADWIN Bagging and Adaptive-Size Hoeffding Tree (ASHT) Bagging. Using the new experimental framework, an evaluation study on synthetic and real-world datasets comprising up to ten million examples shows that the new ensemble methods perform very well compared to several known methods.},
language = {en},
urldate = {2019-07-16},
booktitle = {Proceedings of the 15th {ACM} {SIGKDD} international conference on {Knowledge} discovery and data mining - {KDD} '09},
publisher = {ACM Press},
author = {Bifet, Albert and Holmes, Geoff and Pfahringer, Bernhard and Kirkby, Richard and Gavaldà, Ricard},
year = {2009},
pages = {139},
file = {Bifet et al_2009_New ensemble methods for evolving data streams.pdf:/home/roland/Zotero/storage/9H2DFLNW/Bifet et al_2009_New ensemble methods for evolving data streams.pdf:application/pdf}
}
@inproceedings{domingos_mining_2000,
address = {Boston, Massachusetts, United States},
title = {Mining high-speed data streams},
isbn = {978-1-58113-233-5},
url = {http://portal.acm.org/citation.cfm?doid=347090.347107},
doi = {10.1145/347090.347107},
language = {en},
urldate = {2019-07-16},
booktitle = {Proceedings of the sixth {ACM} {SIGKDD} international conference on {Knowledge} discovery and data mining - {KDD} '00},
publisher = {ACM Press},
author = {Domingos, Pedro and Hulten, Geoff},
year = {2000},
pages = {71--80},
file = {Domingos_Hulten_2000_Mining high-speed data streams.pdf:/home/roland/Zotero/storage/GKU5MXHW/Domingos_Hulten_2000_Mining high-speed data streams.pdf:application/pdf}
}
@article{silva_data_2013,
title = {Data stream clustering: {A} survey},
volume = {46},
issn = {03600300},
shorttitle = {Data stream clustering},
url = {http://dl.acm.org/citation.cfm?doid=2522968.2522981},
doi = {10.1145/2522968.2522981},
language = {en},
number = {1},
urldate = {2019-07-16},
journal = {ACM Computing Surveys},
author = {Silva, Jonathan A. and Faria, Elaine R. and Barros, Rodrigo C. and Hruschka, Eduardo R. and Carvalho, André C. P. L. F. de and Gama, João},
month = oct,
year = {2013},
keywords = {survey},
pages = {1--31},
file = {Silva et al_2013_Data stream clustering.pdf:/home/roland/Zotero/storage/3ZDJ9MJQ/Silva et al_2013_Data stream clustering.pdf:application/pdf}
}
@article{akidau_evolution_nodate,
title = {The {Evolution} of {Massive}-{Scale} {Data} {Processing}},
language = {en},
author = {Akidau, Tyler},
pages = {112},
file = {Akidau_The Evolution of Massive-Scale Data Processing.pdf:/home/roland/Zotero/storage/7DSWZPYT/Akidau_The Evolution of Massive-Scale Data Processing.pdf:application/pdf}
}
@techreport{kitchenham_guidelines_2007,
title = {Guidelines for performing {Systematic} {Literature} {Reviews} in {Software} {Engineering}},
urldate = {2019-09-25},
author = {Kitchenham, Barbara and Charters, Stuart},
month = jul,
year = {2007},
file = {Kitchenham_Charters_2007_Guidelines for performing Systematic Literature Reviews in Software Engineering.pdf:/home/roland/Zotero/storage/F7Z4KCGS/Kitchenham_Charters_2007_Guidelines for performing Systematic Literature Reviews in Software Engineering.pdf:application/pdf}
}
@article{cugola_processing_2012,
title = {Processing flows of information: {From} data stream to complex event processing},
volume = {44},
issn = {03600300},
shorttitle = {Processing flows of information},
url = {http://dl.acm.org/citation.cfm?doid=2187671.2187677},
doi = {10.1145/2187671.2187677},
language = {en},
number = {3},
urldate = {2019-09-24},
journal = {ACM Computing Surveys},
author = {Cugola, Gianpaolo and Margara, Alessandro},
month = jun,
year = {2012},
pages = {1--62},
file = {Cugola_Margara_2012_Processing flows of information.pdf:/home/roland/Zotero/storage/G4P5D2VS/Cugola_Margara_2012_Processing flows of information.pdf:application/pdf}
}
@article{toliopoulos_continuous_2019,
title = {Continuous {Outlier} {Mining} of {Streaming} {Data} in {Flink}},
url = {http://arxiv.org/abs/1902.07901},
abstract = {In this work, we focus on distance-based outliers in a metric space, where the status of an entity as to whether it is an outlier is based on the number of other entities in its neighborhood. In recent years, several solutions have tackled the problem of distance-based outliers in data streams, where outliers must be mined continuously as new elements become available. An interesting research problem is to combine the streaming environment with massively parallel systems to provide scalable stream-based algorithms. However, none of the previously proposed techniques refer to a massively parallel setting. Our proposal fills this gap and investigates the challenges in transferring state-of-the-art techniques to Apache Flink, a modern platform for intensive streaming analytics. We thoroughly present the technical challenges encountered and the alternatives that may be applied. We show speed-ups of up to 117 (resp. 2076) times over a naive parallel (resp. non-parallel) solution in Flink, by using just an ordinary four-core machine and a real-world dataset. When moving to a three-machine cluster, due to less contention, we manage to achieve both better scalability in terms of the window slide size and the data dimensionality, and even higher speed-ups, e.g., by a factor of 510. Overall, our results demonstrate that outlier mining can be achieved in an efficient and scalable manner. The resulting techniques have been made publicly available as open-source software.},
language = {en},
urldate = {2019-09-24},
journal = {arXiv:1902.07901 [cs]},
author = {Toliopoulos, Theodoros and Gounaris, Anastasios and Tsichlas, Kostas and Papadopoulos, Apostolos and Sampaio, Sandra},
month = feb,
year = {2019},
note = {arXiv: 1902.07901},
keywords = {Computer Science - Distributed, Parallel, and Cluster Computing, Computer Science - Machine Learning, Computer Science - Databases},
file = {Toliopoulos et al_2019_Continuous Outlier Mining of Streaming Data in Flink.pdf:/home/roland/Zotero/storage/WP3RDJSC/Toliopoulos et al_2019_Continuous Outlier Mining of Streaming Data in Flink.pdf:application/pdf}
}
@article{kitchenham_systematic_2009,
title = {Systematic literature reviews in software engineering – {A} systematic literature review},
volume = {51},
issn = {09505849},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0950584908001390},
doi = {10.1016/j.infsof.2008.09.009},
abstract = {Background: In 2004 the concept of evidence-based software engineering (EBSE) was introduced at the ICSE04 conference. Aims: This study assesses the impact of systematic literature reviews (SLRs) which are the recommended EBSE method for aggregating evidence.
Method: We used the standard systematic literature review method employing a manual search of 10 journals and 4 conference proceedings.
Results: Of 20 relevant studies, eight addressed research trends rather than technique evaluation. Seven SLRs addressed cost estimation. The quality of SLRs was fair with only three scoring less than 2 out of 4.
Conclusions: Currently, the topic areas covered by SLRs are limited. European researchers, particularly those at the Simula Laboratory appear to be the leading exponents of systematic literature reviews. The series of cost estimation SLRs demonstrate the potential value of EBSE for synthesising evidence and making it available to practitioners.},
language = {en},
number = {1},
urldate = {2019-09-24},
journal = {Information and Software Technology},
author = {Kitchenham, Barbara and Pearl Brereton, O. and Budgen, David and Turner, Mark and Bailey, John and Linkman, Stephen},
month = jan,
year = {2009},
pages = {7--15},
file = {Kitchenham et al_2009_Systematic literature reviews in software engineering – A systematic literature.pdf:/home/roland/Zotero/storage/TNE9EAPR/Kitchenham et al_2009_Systematic literature reviews in software engineering – A systematic literature.pdf:application/pdf}
}
@article{xie_anomaly_2011,
title = {Anomaly detection in wireless sensor networks: {A} survey},
volume = {34},
issn = {10848045},
shorttitle = {Anomaly detection in wireless sensor networks},
url = {https://linkinghub.elsevier.com/retrieve/pii/S1084804511000580},
doi = {10.1016/j.jnca.2011.03.004},
abstract = {Since security threats to WSNs are increasingly being diversified and deliberate, prevention-based techniques alone can no longer provide WSNs with adequate security. However, detection-based techniques might be effective in collaboration with prevention-based techniques for securing WSNs. As a significant branch of detection-based techniques, the research of anomaly detection in wired networks and wireless ad hoc networks is already quite mature, but such solutions can be rarely applied to WSNs without any change, because WSNs are characterized by constrained resources, such as limited energy, weak computation capability, poor memory, short communication range, etc. The development of anomaly detection techniques suitable for WSNs is therefore regarded as an essential research area, which will enable WSNs to be much more secure and reliable. In this survey paper, a few of the key design principles relating to the development of anomaly detection techniques in WSNs are discussed in particular. Then, the state-of-the-art techniques of anomaly detection in WSNs are systematically introduced, according to WSNs’ architectures (Hierarchical/Flat) and detection technique categories (statistical techniques, rule based, data mining, computational intelligence, game theory, graph based, and hybrid, etc.). The analyses and comparisons of the approaches that belong to a similar technique category are represented technically, followed by a brief discussion towards the potential research areas in the near future and conclusion.},
language = {en},
number = {4},
urldate = {2019-09-24},
journal = {Journal of Network and Computer Applications},
author = {Xie, Miao and Han, Song and Tian, Biming and Parvin, Sazia},
month = jul,
year = {2011},
keywords = {survey},
pages = {1302--1325},
file = {Xie et al_2011_Anomaly detection in wireless sensor networks.pdf:/home/roland/Zotero/storage/ABVAWH9J/Xie et al_2011_Anomaly detection in wireless sensor networks.pdf:application/pdf}
}
@article{kanarachos_detecting_2017,
title = {Detecting anomalies in time series data via a deep learning algorithm combining wavelets, neural networks and {Hilbert} transform},
volume = {85},
issn = {09574174},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0957417417302737},
doi = {10.1016/j.eswa.2017.04.028},
abstract = {The quest for more efficient real-time detection of anomalies in time series data is critically important in numerous applications and systems ranging from intelligent transportation, structural health monitoring, heart disease, and earthquake prediction. Although the range of application is wide, anomaly detection algorithms are usually domain specific and build on experts’ knowledge. Here a new signal processing algorithm – inspired by the deep learning paradigm – is presented that combines wavelets, neural networks, and Hilbert transform. The algorithm performs robustly and is transferable. The proposed neural network structure facilitates learning short and long-term pattern interdependencies; a task usually hard to accomplish using standard neural network training algorithms. The paper provides guidelines for selecting the neural network’s buffer size, training algorithm, and anomaly detection features. The algorithm learns the system’s normal behavior and does not require the existence of anomalous data for assessing its statistical significance. This is an essential attribute in applications that require customization. Anomalies are detected by analysing hierarchically the instantaneous frequency and amplitude of the residual signal using probabilistic Receiver Operating Characteristics. The method is shown to be able to automatically detect anomalies in the Seismic Electrical Signal that could be used to predict earthquake activity. Furthermore, the method can be used in combination with crowdsourcing of smartphone data to locate road defects such as potholes and bumps for intervention and repair.},
language = {en},
urldate = {2019-09-24},
journal = {Expert Systems with Applications},
author = {Kanarachos, Stratis and Christopoulos, Stavros-Richard G. and Chroneos, Alexander and Fitzpatrick, Michael E.},
month = nov,
year = {2017},
pages = {292--304},
file = {Kanarachos et al_2017_Detecting anomalies in time series data via a deep learning algorithm combining.pdf:/home/roland/Zotero/storage/JUDJ28NQ/Kanarachos et al_2017_Detecting anomalies in time series data via a deep learning algorithm combining.pdf:application/pdf}
}
@article{serdio_fault_2014,
title = {Fault detection in multi-sensor networks based on multivariate time-series models and orthogonal transformations},
volume = {20},
issn = {15662535},
url = {https://linkinghub.elsevier.com/retrieve/pii/S1566253514000451},
doi = {10.1016/j.inffus.2014.03.006},
abstract = {We introduce the usage of multivariate orthogonal space transformations and vectorized time-series models in combination with data-driven system identification models to achieve an enhanced performance of residual-based fault detection in condition monitoring systems equipped with multi-sensor networks. Neither time-consuming annotated samples nor fault patterns/models need to be available, as our approach is solely based on on-line recorded data streams. The system identification step acts as a fusion operation by searching for relations and dependencies between sensor channels measuring the state of system variables. We therefore apply three different vectorized time-series variants: (i) non-linear finite impulse response models (NFIR) relying only on the lagged input variables, (ii) non-linear output error models (NOE), also including the lags of the own predictions and (iii) non-linear Box–Jenkins models (NBJ) which include the lags of the predictions errors as well. The use of multivariate orthogonal space transformations allows to produce more compact and accurate models due to an integrated dimensionality (noise) reduction step. Fault detection is conducted based on finding anomalies (untypical occurrences) in the temporal residual signal in incremental manner. Our experimental results achieved on four real-world condition monitoring scenarios employing multi-sensor network systems demonstrate that the Receiver Operating Characteristic (ROC) curves are improved over those ones achieved with native static models (w/o lags, w/o transformations) by about 20–30\%.},
language = {en},
urldate = {2019-09-24},
journal = {Information Fusion},
author = {Serdio, Francisco and Lughofer, Edwin and Pichler, Kurt and Buchegger, Thomas and Pichler, Markus and Efendic, Hajrudin},
month = nov,
year = {2014},
pages = {272--291},
file = {Serdio et al_2014_Fault detection in multi-sensor networks based on multivariate time-series.pdf:/home/roland/Zotero/storage/EVH4IYAX/Serdio et al_2014_Fault detection in multi-sensor networks based on multivariate time-series.pdf:application/pdf}
}
@article{bosman_spatial_2017,
title = {Spatial anomaly detection in sensor networks using neighborhood information},
volume = {33},
issn = {15662535},
url = {https://linkinghub.elsevier.com/retrieve/pii/S1566253516300252},
doi = {10.1016/j.inffus.2016.04.007},
abstract = {The field of wireless sensor networks (WSNs), embedded systems with sensing and networking capability, has now matured after a decade-long research effort and technological advances in electronics and networked systems. An important remaining challenge now is to extract meaningful information from the ever-increasing amount of sensor data collected by WSNs. In particular, there is strong interest in algorithms capable of automatic detection of patterns, events or other out-of-the order, anomalous system behavior. Data anomalies may indicate states of the system that require further analysis or prompt actions. Traditionally, anomaly detection techniques are executed in a central processing facility, which requires the collection of all measurement data at a central location, an obvious limitation for WSNs due to the high data communication costs involved. In this paper we explore the extent by which one may depart from this classical centralized paradigm, looking at decentralized anomaly detection based on unsupervised machine learning. Our aim is to detect anomalies at the sensor nodes, as opposed to centrally, to reduce energy and spectrum consumption. We study the information gain coming from aggregate neighborhood data, in comparison to performing simple, in-node anomaly detection. We evaluate the effects of neighborhood size and spatio-temporal correlation on the performance of our new neighborhood-based approach using a range of real-world network deployments and datasets. We find the conditions that make neighborhood data fusion advantageous, identifying also the cases in which this approach does not lead to detectable improvements. Improvements are linked to the diffusive properties of data (spatio-temporal correlations) but also to the type of sensors, anomalies and network topological features. Overall, when a dataset stems from a similar mixture of diffusive processes precision tends to benefit, particularly in terms of recall. Our work paves the way towards understanding how distributed data fusion methods may help managing the complexity of wireless sensor networks, for instance in massive Internet of Things scenarios.},
language = {en},
urldate = {2019-09-24},
journal = {Information Fusion},
author = {Bosman, Hedde HWJ and Iacca, Giovanni and Tejada, Arturo and Wörtche, Heinrich J. and Liotta, Antonio},
month = jan,
year = {2017},
pages = {41--56},
file = {Bosman et al_2017_Spatial anomaly detection in sensor networks using neighborhood information.pdf:/home/roland/Zotero/storage/WFC4ZH4Y/Bosman et al_2017_Spatial anomaly detection in sensor networks using neighborhood information.pdf:application/pdf}
}
@article{agrawal_survey_2015,
title = {Survey on {Anomaly} {Detection} using {Data} {Mining} {Techniques}},
volume = {60},
issn = {18770509},
url = {https://linkinghub.elsevier.com/retrieve/pii/S1877050915023479},
doi = {10.1016/j.procs.2015.08.220},
abstract = {In the present world huge amounts of data are stored and transferred from one location to another. The data when transferred or stored is primed exposed to attack. Although various techniques or applications are available to protect data, loopholes exist. Thus to analyze data and to determine various kind of attack data mining techniques have emerged to make it less vulnerable. Anomaly detection uses these data mining techniques to detect the surprising behaviour hidden within data increasing the chances of being intruded or attacked. Various hybrid approaches have also been made in order to detect known and unknown attacks more accurately. This paper reviews various data mining techniques for anomaly detection to provide better understanding among the existing techniques that may help interested researchers to work future in this direction.},
language = {en},
urldate = {2019-09-24},
journal = {Procedia Computer Science},
author = {Agrawal, Shikha and Agrawal, Jitendra},
year = {2015},
pages = {708--713},
file = {Agrawal_Agrawal_2015_Survey on Anomaly Detection using Data Mining Techniques.pdf:/home/roland/Zotero/storage/TK69WDSW/Agrawal_Agrawal_2015_Survey on Anomaly Detection using Data Mining Techniques.pdf:application/pdf}
}
@article{lane_application_nodate,
title = {An {Application} of {Machine} {Learning} to {Anomaly} {Detection}},
abstract = {The anomaly detection problem has been widely studied in the computer security literature. In this paper we present a machine learning approach to anomaly detection. Our system builds user profiles based on command sequences and compares current input sequences to the profile using a similarity measure. The system must learn to classify current behavior as consistent or anomalous with past behavior using only positive examples of the account's valid user. Our empirical results demonstrate that this is a promising approach to distinguishing the legitimate user from an intruder.},
language = {en},
author = {Lane, Terran and Brodley, Carla E},
pages = {13}
}
@inproceedings{singh_demystifying_2017,
address = {Anchorage, AK, USA},
title = {Demystifying {Numenta} anomaly benchmark},
isbn = {978-1-5090-6182-2},
url = {http://ieeexplore.ieee.org/document/7966038/},
doi = {10.1109/IJCNN.2017.7966038},
abstract = {Detecting anomalies in large-scale, streaming datasets has wide applicability in a myriad of domains like network intrusion detection for cyber-security, fraud detection for credit cards, system health monitoring, and fault detection in safety critical systems. Due to its wide applicability, the problem of anomaly detection has been well-studied by industry and academia alike, and many algorithms have been proposed for detecting anomalies in different problem settings. But until recently, there was no openly available, systematic dataset and/or framework using which the proposed anomaly detection algorithms could be compared and evaluated on a common ground. Numenta Anomaly Benchmark (NAB), made available by Numenta in 2015, addressed this gap by providing a set of openly-available, labeled data files and a common scoring system, using which different anomaly detection algorithms could be fairly evaluated and compared. In this paper, we provide an in-depth analysis of the key aspects of the NAB framework, and highlight inherent challenges therein, with the objective to provide insights about the gaps in the current framework that must be addressed so as to make it more robust and easy-to-use. Furthermore, we also provide additional evaluation of five state-of-the-art anomaly detection algorithms (including the ones proposed by Numenta) using the NAB datasets, and based on the evaluation results, we argue that the performance of these algorithms is not sufficient for practical, industry-scale applications, and must be improved upon so as to make them suitable for large-scale anomaly detection problems.},
language = {en},
urldate = {2019-11-22},
booktitle = {2017 {International} {Joint} {Conference} on {Neural} {Networks} ({IJCNN})},
publisher = {IEEE},
author = {Singh, Nidhi and Olinsky, Craig},
month = may,
year = {2017},
pages = {1570--1577},
file = {Singh_Olinsky_2017_Demystifying Numenta anomaly benchmark.pdf:/home/roland/Zotero/storage/HEPPWLPX/Singh_Olinsky_2017_Demystifying Numenta anomaly benchmark.pdf:application/pdf}
}
@article{pimentel_review_2014,
title = {A review of novelty detection},
volume = {99},
issn = {01651684},
url = {https://linkinghub.elsevier.com/retrieve/pii/S016516841300515X},
doi = {10.1016/j.sigpro.2013.12.026},
abstract = {Novelty detection is the task of classifying test data that differ in some respect from the data that are available during training. This may be seen as “one-class classification”, in which a model is constructed to describe “normal” training data. The novelty detection approach is typically used when the quantity of available “abnormal” data is insufficient to construct explicit models for non-normal classes. Application includes inference in datasets from critical systems, where the quantity of available normal data is very large, such that “normality” may be accurately modelled. In this review we aim to provide an updated and structured investigation of novelty detection research papers that have appeared in the machine learning literature during the last decade.},
language = {en},
urldate = {2019-11-20},
journal = {Signal Processing},
author = {Pimentel, Marco A.F. and Clifton, David A. and Clifton, Lei and Tarassenko, Lionel},
month = jun,
year = {2014},
pages = {215--249},
file = {Pimentel et al_2014_A review of novelty detection.pdf:/home/roland/Zotero/storage/5LGAFUKF/Pimentel et al_2014_A review of novelty detection.pdf:application/pdf}
}
@article{sarabadani_building_2017,
title = {Building automated vandalism detection tools for {Wikidata}},
url = {http://arxiv.org/abs/1703.03861},
doi = {10.1145/3041021.3053366},
abstract = {Wikidata, like Wikipedia, is a knowledge base that anyone can edit. This open collaboration model is powerful in that it reduces barriers to participation and allows a large number of people to contribute. However, it exposes the knowledge base to the risk of vandalism and low-quality contributions. In this work, we build on past work detecting vandalism in Wikipedia to detect vandalism in Wikidata. This work is novel in that identifying damaging changes in a structured knowledge-base requires substantially different feature engineering work than in a text-based wiki like Wikipedia. We also discuss the utility of these classifiers for reducing the overall workload of vandalism patrollers in Wikidata. We describe a machine classification strategy that is able to catch 89\% of vandalism while reducing patrollers’ workload by 98\%, by drawing lightly from contextual features of an edit and heavily from the characteristics of the user making the edit.},
language = {en},
urldate = {2019-11-05},
journal = {Proceedings of the 26th International Conference on World Wide Web Companion - WWW '17 Companion},
author = {Sarabadani, Amir and Halfaker, Aaron and Taraborelli, Dario},
year = {2017},
note = {arXiv: 1703.03861},
keywords = {Computer Science - Information Retrieval, Computer Science - Computers and Society},
pages = {1647--1654},
file = {Sarabadani et al_2017_Building automated vandalism detection tools for Wikidata.pdf:/home/roland/Zotero/storage/HDIUAPFX/Sarabadani et al_2017_Building automated vandalism detection tools for Wikidata.pdf:application/pdf}
}
@article{yoon_nets:_2019,
title = {{NETS}: {Extremely} {Fast} {Outlier} {Detection} from a {Data} {Stream} via {Set}-{Based} {Processing}},
volume = {12},
issn = {21508097},
shorttitle = {{NETS}},
url = {http://dl.acm.org/citation.cfm?doid=3342263.3360345},
doi = {10.14778/3342263.3342269},
abstract = {This paper addresses the problem of efficiently detecting outliers from a data stream as old data points expire from and new data points enter the window incrementally. The proposed method is based on a newly discovered characteristic of a data stream that the change in the locations of data points in the data space is typically very insignificant. This observation has led to the finding that the existing distance-based outlier detection algorithms perform excessive unnecessary computations that are repetitive and/or canceling out the effects. Thus, in this paper, we propose a novel set-based approach to detecting outliers, whereby data points at similar locations are grouped and the detection of outliers or inliers is handled at the group level. Specifically, a new algorithm NETS is proposed to achieve a remarkable performance improvement by realizing set-based early identification of outliers or inliers and taking advantage of the “net effect” between expired and new data points. Additionally, NETS is capable of achieving the same efficiency even for a high-dimensional data stream through two-level dimensional filtering. Comprehensive experiments using six real-world data streams show 5 to 25 times faster processing time than state-of-the-art algorithms with comparable memory consumption. We assert that NETS opens a new possibility to real-time data stream outlier detection.},
language = {en},
number = {11},
urldate = {2019-11-05},
journal = {Proceedings of the VLDB Endowment},
author = {Yoon, Susik and Lee, Jae-Gil and Lee, Byung Suk},
month = jul,
year = {2019},
pages = {1303--1315},
file = {Yoon et al_2019_NETS.pdf:/home/roland/Zotero/storage/M5IHWT7U/Yoon et al_2019_NETS.pdf:application/pdf}
}
@inproceedings{oza_experimental_2001,
address = {San Francisco, California},
title = {Experimental comparisons of online and batch versions of bagging and boosting},
isbn = {978-1-58113-391-2},
url = {http://portal.acm.org/citation.cfm?doid=502512.502565},
doi = {10.1145/502512.502565},
language = {en},
urldate = {2019-11-05},
booktitle = {Proceedings of the seventh {ACM} {SIGKDD} international conference on {Knowledge} discovery and data mining - {KDD} '01},
publisher = {ACM Press},
author = {Oza, Nikunj C. and Russell, Stuart},
year = {2001},
pages = {359--364},
file = {Oza_Russell_2001_Experimental comparisons of online and batch versions of bagging and boosting.pdf:/home/roland/Zotero/storage/27CS952P/Oza_Russell_2001_Experimental comparisons of online and batch versions of bagging and boosting.pdf:application/pdf}
}
@inproceedings{heindorf_vandalism_2016,
address = {Indianapolis, Indiana, USA},
title = {Vandalism {Detection} in {Wikidata}},
isbn = {978-1-4503-4073-1},
url = {http://dl.acm.org/citation.cfm?doid=2983323.2983740},
doi = {10.1145/2983323.2983740},
abstract = {Wikidata is the new, large-scale knowledge base of the Wikimedia Foundation. Its knowledge is increasingly used within Wikipedia itself and various other kinds of information systems, imposing high demands on its integrity. Wikidata can be edited by anyone and, unfortunately, it frequently gets vandalized, exposing all information systems using it to the risk of spreading vandalized and falsified information. In this paper, we present a new machine learning-based approach to detect vandalism in Wikidata. We propose a set of 47 features that exploit both content and context information, and we report on 4 classifiers of increasing effectiveness tailored to this learning task. Our approach is evaluated on the recently published Wikidata Vandalism Corpus WDVC-2015 and it achieves an area under curve value of the receiver operating characteristic, ROCAUC, of 0.991. It significantly outperforms the state of the art represented by the rule-based Wikidata Abuse Filter (0.865 ROCAUC) and a prototypical vandalism detector recently introduced by Wikimedia within the Objective Revision Evaluation Service (0.859 ROCAUC).},
language = {en},
urldate = {2019-11-05},
booktitle = {Proceedings of the 25th {ACM} {International} on {Conference} on {Information} and {Knowledge} {Management} - {CIKM} '16},
publisher = {ACM Press},
author = {Heindorf, Stefan and Potthast, Martin and Stein, Benno and Engels, Gregor},
year = {2016},
pages = {327--336},
file = {Heindorf et al_2016_Vandalism Detection in Wikidata.pdf:/home/roland/Zotero/storage/7MNDUCP6/Heindorf et al_2016_Vandalism Detection in Wikidata.pdf:application/pdf}
}
@inproceedings{costa_online_2015,
address = {Killarney, Ireland},
title = {Online fault detection based on {Typicality} and {Eccentricity} {Data} {Analytics}},
isbn = {978-1-4799-1960-4},
url = {http://ieeexplore.ieee.org/document/7280712/},
doi = {10.1109/IJCNN.2015.7280712},
abstract = {Fault detection is a task of major importance in industry nowadays, since that it can considerably reduce the risk of accidents involving human lives, in addition to production and, consequently, financial losses. Therefore, fault detection systems have been largely studied in the past few years, resulting in many different methods and approaches to solve such problem. This paper presents a detailed study on fault detection on industrial processes based on the recently introduced eccentricity and typicality data analytics (TEDA) approach. TEDA is a recursive and non-parametric method, firstly proposed to the general problem of anomaly detection on data streams. It is based on the measures of data density and proximity from each read data point to the analyzed data set. TEDA is an online autonomous learning algorithm that does not require a priori knowledge about the process, is completely free of user- and problem-defined parameters, requires very low computational effort and, thus, is very suitable for real-time applications. The results further presented were generated by the application of TEDA to the very well-known real data benchmark DAMADICS.},
language = {en},
urldate = {2019-12-20},
booktitle = {2015 {International} {Joint} {Conference} on {Neural} {Networks} ({IJCNN})},
publisher = {IEEE},
author = {Costa, Bruno Sielly Jales and Bezerra, Clauber Gomes and Guedes, Luiz Affonso and Angelov, Plamen Parvanov},
month = jul,
year = {2015},
pages = {1--6},
file = {Costa et al_2015_Online fault detection based on Typicality and Eccentricity Data Analytics.pdf:/home/roland/Zotero/storage/GTFD5T8N/Costa et al_2015_Online fault detection based on Typicality and Eccentricity Data Analytics.pdf:application/pdf}
}
@inproceedings{muter_entropy-based_2011,
address = {Baden-Baden, Germany},
title = {Entropy-{Based} {Anomaly} {Detection} for {In}-{Vehicle} {Networks}},
isbn = {978-1-4577-0890-9},
url = {http://ieeexplore.ieee.org/document/5940552/},
doi = {10.1109/IVS.2011.5940552},
abstract = {Due to an increased connectivity and seamless integration of information technology into modern vehicles, a trend of research in the automotive domain is the development of holistic IT security concepts. Within the scope of this development, vehicular attack detection is one concept which gains an increased attention, because of its reactive nature that allows to respond to threats during runtime. In this paper we explore the applicability of entropy-based attack detection for in-vehicle networks. We illustrate the crucial aspects for an adaptation of such an approach to the automotive domain. Moreover, we show first exemplary results by applying the approach to measurements derived from a standard vehicle’s CAN-Body network.},
language = {en},
urldate = {2019-12-20},
booktitle = {2011 {IEEE} {Intelligent} {Vehicles} {Symposium} ({IV})},
publisher = {IEEE},
author = {Muter, Michael and Asaj, Naim},
month = jun,
year = {2011},
pages = {1110--1115},
file = {Muter_Asaj_2011_Entropy-based anomaly detection for in-vehicle networks.pdf:/home/roland/Zotero/storage/M9YCYD4Y/Muter_Asaj_2011_Entropy-based anomaly detection for in-vehicle networks.pdf:application/pdf}
}
@article{koupaie_outlier_2013,
title = {Outlier {Detection} in {Stream} {Data} by {Machine} {Learning} and {Feature} {Selection} {Methods}},
volume = {2},
abstract = {In recent years, intrusion detection has emerged as an important technique for network security. Machine learning techniques have been applied to the field of intrusion detection. They can learn normal and anomalous patterns from training data and via Feature selection improving classification by searching for the subset of features which best classifies the training data to detect attacks on computer system. The quality of features directly affects the performance of classification. Many feature selection methods introduced to remove redundant and irrelevant features, because raw features may reduce accuracy or robustness of classification. Outlier detection in stream data is an important and active research issue in anomaly detection. Most of the existing outlier detection algorithms has less accurate because use some clustering method. Some data are so essential and secretary. Therefore, it needs to mine carefully even if spend cost. This paper presents a framework to detect outlier in stream data by machine learning method. Moreover, it is considered if data was high dimensional. This method is more accurate from other preferred models, because machine learning method is more accurate of other methods.},
language = {en},
number = {3},
journal = {International Journal of Advanced Computer Science and Information Technology},
author = {Koupaie, Hossein Moradi and Ibrahim, Suhaimi and Hosseinkhani, Javad},
year = {2013},
pages = {8},
file = {Koupaie et al_2013_Outlier Detection in Stream Data by Machine Learning and Feature Selection.pdf:/home/roland/Zotero/storage/WPJBV896/Koupaie et al_2013_Outlier Detection in Stream Data by Machine Learning and Feature Selection.pdf:application/pdf}
}
@inproceedings{cao_density-based_2006,
title = {Density-{Based} {Clustering} over an {Evolving} {Data} {Stream} with {Noise}},
isbn = {978-0-89871-611-5 978-1-61197-276-4},
url = {https://epubs.siam.org/doi/10.1137/1.9781611972764.29},
doi = {10.1137/1.9781611972764.29},
abstract = {Clustering is an important task in mining evolving data streams. Beside the limited memory and one-pass constraints, the nature of evolving data streams implies the following requirements for stream clustering: no assumption on the number of clusters, discovery of clusters with arbitrary shape and ability to handle outliers. While a lot of clustering algorithms for data streams have been proposed, they offer no solution to the combination of these requirements. In this paper, we present DenStream, a new approach for discovering clusters in an evolving data stream. The “dense” micro-cluster (named core-micro-cluster) is introduced to summarize the clusters with arbitrary shape, while the potential core-micro-cluster and outlier micro-cluster structures are proposed to maintain and distinguish the potential clusters and outliers. A novel pruning strategy is designed based on these concepts, which guarantees the precision of the weights of the micro-clusters with limited memory. Our performance study over a number of real and synthetic data sets demonstrates the effectiveness and efficiency of our method.},
language = {en},
urldate = {2019-12-20},
booktitle = {Proceedings of the 2006 {SIAM} {International} {Conference} on {Data} {Mining}},
publisher = {Society for Industrial and Applied Mathematics},
author = {Cao, Feng and Estert, Martin and Qian, Weining and Zhou, Aoying},
month = apr,
year = {2006},
pages = {328--339},
file = {Cao et al_2006_Density-Based Clustering over an Evolving Data Stream with Noise.pdf:/home/roland/Zotero/storage/GULBSUGA/Cao et al_2006_Density-Based Clustering over an Evolving Data Stream with Noise.pdf:application/pdf}
}
@misc{noauthor_anomaly_nodate,
title = {Anomaly {Detection} over {Noisy} {Data} using {Learned} {Probability} {Distributions}},
author = {Eskin, Eleazar},
year = {2000},
file = {Anomaly Detection over Noisy Data using Learned Probability Distributions.pdf:/home/roland/Zotero/storage/EZG2MFQQ/Anomaly Detection over Noisy Data using Learned Probability Distributions.pdf:application/pdf}
}
@misc{noauthor_handbook_nodate,
title = {Handbook of {Parametric} and {Nonparametric} {Statistical} {Procedures}},
author = {Sheskin, David J.},
file = {Handbook of Parametric and Nonparametric Statistical Procedures.pdf:/home/roland/Zotero/storage/XR92VNTY/Handbook of Parametric and Nonparametric Statistical Procedures.pdf:application/pdf}
}
@article{haibo_he_incremental_2011,
title = {Incremental {Learning} {From} {Stream} {Data}},
volume = {22},
issn = {1045-9227, 1941-0093},
url = {http://ieeexplore.ieee.org/document/6064897/},
doi = {10.1109/TNN.2011.2171713},
abstract = {Recent years have witnessed an incredibly increasing interest in the topic of incremental learning. Unlike conventional machine learning situations, data flow targeted by incremental learning becomes available continuously over time. Accordingly, it is desirable to be able to abandon the traditional assumption of the availability of representative training data during the training period to develop decision boundaries. Under scenarios of continuous data flow, the challenge is how to transform the vast amount of stream raw data into information and knowledge representation, and accumulate experience over time to support future decision-making process. In this paper, we propose a general adaptive incremental learning framework named ADAIN that is capable of learning from continuous raw data, accumulating experience over time, and using such knowledge to improve future learning and prediction performance. Detailed system level architecture and design strategies are presented in this paper. Simulation results over several real-world data sets are used to validate the effectiveness of this method.},
language = {en},
number = {12},
urldate = {2019-12-19},
journal = {IEEE Transactions on Neural Networks},
author = {He, Haibo and Chen, Sheng and Li, Kang and Xu, Xin},
month = dec,
year = {2011},
pages = {1901--1914},
file = {Haibo He et al_2011_Incremental Learning From Stream Data.pdf:/home/roland/Zotero/storage/L75MEEW3/Haibo He et al_2011_Incremental Learning From Stream Data.pdf:application/pdf}
}
@article{gama_survey_2012,
title = {A survey on learning from data streams: current and future trends},
volume = {1},
issn = {2192-6352, 2192-6360},
shorttitle = {A survey on learning from data streams},
url = {http://link.springer.com/10.1007/s13748-011-0002-6},
doi = {10.1007/s13748-011-0002-6},
abstract = {Nowadays, there are applications in which the data are modeled best not as persistent tables, but rather as transient data streams. In this article, we discuss the limitations of current machine learning and data mining algorithms. We discuss the fundamental issues in learning in dynamic environments like continuously maintaining learning models that evolve over time, learning and forgetting, concept drift and change detection. Data streams produce a huge amount of data that introduce new constraints in the design of learning algorithms: limited computational resources in terms of memory, CPU power, and communication bandwidth. We present some illustrative algorithms, designed to take these constraints into account, for decision-tree learning, hierarchical clustering and frequent pattern mining. We identify the main issues and current challenges that emerge in learning from data streams that open research lines for further developments.},
language = {en},
number = {1},
urldate = {2019-12-19},
journal = {Progress in Artificial Intelligence},
author = {Gama, João},
month = apr,
year = {2012},
pages = {45--55},
file = {Gama_2012_A survey on learning from data streams.pdf:/home/roland/Zotero/storage/HNUFEA4X/Gama_2012_A survey on learning from data streams.pdf:application/pdf}
}
@article{hong_entropy_2016,
title = {The {Entropy} and {PCA} {Based} {Anomaly} {Prediction} in {Data} {Streams}},
volume = {96},
issn = {18770509},
url = {https://linkinghub.elsevier.com/retrieve/pii/S1877050916319160},
doi = {10.1016/j.procs.2016.08.115},
abstract = {With the increase of data and information, anomaly management has been attracting much more attention and become an important research topic gradually. Previous literatures have advocated anomaly discovery and identification ignoring the fact that practice needs anomaly detection in advance (anomaly prediction) but anomaly detection with post-hoc analysis. Given this apparent gap, this research proposes a new approach for anomaly prediction based on PCA (principal component analysis) and information entropy theory, and support vector regression. The main idea of anomaly prediction is to train the historical data and to identify and recognize outlier data according to previous streams patterns and trends. The explorative results of SO2 concentration of exhaust gas in WFGD (Wet Flue Gas Desulfurization) demonstrate a good performance (efficient and accurate) of the target data prediction approach. This robust and novel method can be used to detect and predict the anomaly in data streams, and applied to fault prediction, credit card fraud prediction, intrusion prediction in cyber-security, malignant diagnosis, etc.},
language = {en},
urldate = {2019-12-19},
journal = {Procedia Computer Science},
author = {Hong, Daocheng and Zhao, Deshan and Zhang, Yanchun},
year = {2016},
pages = {139--146},
file = {Hong et al_2016_The Entropy and PCA Based Anomaly Prediction in Data Streams.pdf:/home/roland/Zotero/storage/9Y48T24D/Hong et al_2016_The Entropy and PCA Based Anomaly Prediction in Data Streams.pdf:application/pdf}
}
@inproceedings{angelov_anomaly_2014,
address = {Orlando, FL, USA},
title = {Anomaly detection based on eccentricity analysis},
isbn = {978-1-4799-4494-1},
url = {http://ieeexplore.ieee.org/document/7009497/},
doi = {10.1109/EALS.2014.7009497},
abstract = {In this paper, we propose a new eccentricity-based anomaly detection principle and algorithm. It is based on a further development of the recently introduced data analytics framework (TEDA – from typicality and eccentricity data analytics). We compare TEDA with the traditional statistical approach and prove that TEDA is a generalization of it in regards to the well-known “nσ” analysis (TEDA gives exactly the same result as the traditional “nσ” analysis but it does not require the restrictive prior assumptions that are made for the traditional approach to be in place). Moreover, it offers a nonparametric, closed form analytical descriptions (models of the data distribution) to be extracted from the real data realizations, not to be pre-assumed. In addition to that, for several types of proximity/similarity measures (such as Euclidean, cosine, Mahalanobis) it can be calculated recursively, thus, computationally very efficiently and is suitable for real time and online algorithms. Building on the per data sample, exact information about the data distribution in a closed analytical form, in this paper we propose a new less conservative and more sensitive condition for anomaly detection. It is quite different from the traditional “nσ” type conditions. We demonstrate example where traditional conditions would lead to an increased amount of false negatives or false positives in comparison with the proposed condition. The new condition is intuitive and easy to check for arbitrary data distribution and arbitrary small (but not less than 3) amount of data samples/points. Finally, because the anomaly/novelty/change detection is very important and basic data analysis operation which is in the fundament of such higher level tasks as fault detection, drift detection in data streams, clustering, outliers detection, autonomous video analytics, particle physics, etc. we point to some possible applications which will be the domain of future work.},
language = {en},
urldate = {2019-12-19},
booktitle = {2014 {IEEE} {Symposium} on {Evolving} and {Autonomous} {Learning} {Systems} ({EALS})},
publisher = {IEEE},
author = {Angelov, Plamen},
month = dec,
year = {2014},
pages = {1--8},
file = {Angelov_2014_Anomaly detection based on eccentricity analysis.pdf:/home/roland/Zotero/storage/4AKQWEVX/Angelov_2014_Anomaly detection based on eccentricity analysis.pdf:application/pdf}
}
@article{wang_statistical_2013,
title = {Statistical wavelet-based anomaly detection in big data with compressive sensing},
volume = {2013},
issn = {1687-1499},
url = {https://jwcn-eurasipjournals.springeropen.com/articles/10.1186/1687-1499-2013-269},
doi = {10.1186/1687-1499-2013-269},
abstract = {Anomaly detection in big data is a key problem in the big data analytics domain. In this paper, the definitions of anomaly detection and big data were presented. Due to the sampling and storage burden and the inadequacy of privacy protection of anomaly detection based on uncompressed data, compressive sensing theory was introduced and used in the anomaly detection algorithm. The anomaly detection criterion based on wavelet packet transform and statistic process control theory was deduced. The proposed anomaly detection technique was used for through-wall human detection to demonstrate the effectiveness. The experiments for detecting humans behind a brick wall and gypsum based on ultra-wideband radar signal were carried out. The results showed that the proposed anomaly detection algorithm could effectively detect the existence of a human being through compressed signals and uncompressed data.},
language = {en},
number = {1},
urldate = {2019-12-19},
journal = {EURASIP Journal on Wireless Communications and Networking},
author = {Wang, Wei and Lu, Dunqiang and Zhou, Xin and Zhang, Baoju and Mu, Jiasong},
month = dec,
year = {2013},
file = {Wang et al_2013_Statistical wavelet-based anomaly detection in big data with compressive sensing.pdf:/home/roland/Zotero/storage/7QT5IZAH/Wang et al_2013_Statistical wavelet-based anomaly detection in big data with compressive sensing.pdf:application/pdf}
}
@misc{noauthor_issues_nodate,
title = {Issues in {Evaluation} of {Stream} {Learning} {Algorithms}},
author = {Gama, João and Sebastião, Raquel and Rodrigues, Pedro Pereira},
year = {2009},
file = {Issues in Evaluation of Stream Learning Algorithms.pdf:/home/roland/Zotero/storage/23CNIT4X/Issues in Evaluation of Stream Learning Algorithms.pdf:application/pdf}
}
@techreport{tsymbal_problem_2004,
address = {Dublin},
title = {The problem of concept drift: definitions and related work},
abstract = {In the real world concepts are often not stable but change with time. Typical examples of this are weather prediction rules and customers’ preferences. The underlying data distribution may change as well. Often these changes make the model built on old data inconsistent with the new data, and regular updating of the model is necessary. This problem, known as concept drift, complicates the task of learning a model from data and requires special approaches, different from commonly used techniques, which treat arriving instances as equally important contributors to the final concept. This paper considers different types of concept drift, peculiarities of the problem, and gives a critical review of existing approaches to the problem.},
language = {en},
number = {106.2},
institution = {Computer Science Department, Trinity College Dublin},
author = {Tsymbal, Alexey},
year = {2004},
pages = {7},
file = {Tsymbal_2004_The problem of concept drift.pdf:/home/roland/Zotero/storage/HI85RK6Z/Tsymbal_2004_The problem of concept drift.pdf:application/pdf}
}
@article{cejnek_concept_2018,
title = {Concept drift robust adaptive novelty detection for data streams},
volume = {309},
issn = {09252312},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0925231218305253},
doi = {10.1016/j.neucom.2018.04.069},
abstract = {In this paper we study the performance of two original adaptive unsupervised novelty detection methods (NDMs) on data with concept drift. Newly, the concept drift is considered as a challenging data imbalance that should be ignored by the NDMs, and only system changes and outliers represent novelty. The field of application for such NDMs is broad. For example, the method can be used as a supportive method for real-time system fault detection, for onset detection of events in biomedical signals, in monitoring of nonlinearly controlled processes, for event driven automated trading, etc. The two newly studied methods are the error and learning based novelty detection (ELBND) and the learning entropy (LE) based detection. These methods use both the error and weight increments of a (supervised) learning model. Here, we study these methods with normalized least-mean squares (NLMS) adaptive filter, and while the NDMs were studied on various real life tasks, newly, we carry out the study on two types of data streams with concept drift to analyze the general ability for unsupervised novelty detection. The two data streams, one with system changes, second with outliers, represent different novelty scenarios to demonstrate the performance of the proposed NDMs with concept drifts in data. Both tested NDMs work as a feature extractor. Thus, a classification framework is used for the evaluation of the obtained features and NDM benchmarking, where two other NDMs, one based on the adaptive model plain error, second using the sample entropy (SE), are used as the reference for the comparison to the proposed methods. The results show that both newly studied NDMs are superior to the merely use of the plain error of adaptive model and also to the sample entropy based detection while they are robust against the concept drift occurrence.},
language = {en},
urldate = {2019-12-21},
journal = {Neurocomputing},
author = {Cejnek, Matous and Bukovsky, Ivo},
month = oct,
year = {2018},
pages = {46--53},
file = {Cejnek_Bukovsky_2018_Concept drift robust adaptive novelty detection for data streams.pdf:/home/roland/Zotero/storage/NZGUD45K/Cejnek_Bukovsky_2018_Concept drift robust adaptive novelty detection for data streams.pdf:application/pdf}
}
@inproceedings{jiang_real_2014,
title = {Real time contextual collective anomaly detection over multiple data streams},
abstract = {Anomaly detection has always been a critical and challenging problem in many application areas such as industry, healthcare, environment and finance. This problem becomes more difficult in the Big Data era as the data scale increases dramatically and the type of anomalies gets more complicated. In time sensitive applications like real time monitoring, data are often fed in streams and anomalies are required to be identified online across multiple streams with a short time delay. The new data characteristics and analysis requirements make existing solutions no longer suitable.},
language = {en},
booktitle = {Proceedings of the {ODD}},
author = {Jiang, Yexi and Zeng, Chunqiu and Xu, Jian and Li, Tao},
year = {2014},
pages = {8},
file = {Jiang et al_2014_Real time contextual collective anomaly detection over multiple data streams.pdf:/home/roland/Zotero/storage/L6MVAMMR/Jiang et al_2014_Real time contextual collective anomaly detection over multiple data streams.pdf:application/pdf}
}
@article{haidar_sharif_entropy_2012,
title = {An entropy approach for abnormal activities detection in video streams},
volume = {45},
issn = {00313203},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0031320311004778},
doi = {10.1016/j.patcog.2011.11.023},
abstract = {Detection of aberration in video surveillance is an important task for public safety. This paper puts forward a simple but effective framework to detect aberrations in video streams using Entropy, which is estimated on the statistical treatments of the spatiotemporal information of a set of interest points within a region of interest by measuring their degree of randomness of both directions and displacements. Entropy is a measure of the disorder/randomness in video frame. It has been showed that degree of randomness of the directions (circular variance) changes markedly in abnormal state of affairs and does change only direction variation but does not change with displacement variation of the interest point. Degree of randomness of the displacements has been put in for to counterbalance this deficiency. Simple simulations have been exercised to see the characteristics of these crude elements of entropy. Normalized entropy measure provides the knowledge of the state of anomalousness. Experiments have been conducted on various real world video datasets. Both simulation and experimental results report that entropy measures of the frames over time is an outstanding way to characterize anomalies in videos.},
language = {en},
number = {7},
urldate = {2020-01-06},
journal = {Pattern Recognition},
author = {Haidar Sharif, Md. and Djeraba, Chabane},
month = jul,
year = {2012},
pages = {2543--2561},
file = {Haidar Sharif_Djeraba_2012_An entropy approach for abnormal activities detection in video streams.pdf:/home/roland/Zotero/storage/G6SI8Q3M/Haidar Sharif_Djeraba_2012_An entropy approach for abnormal activities detection in video streams.pdf:application/pdf}
}
@article{li_incremental_2014,
title = {Incremental entropy-based clustering on categorical data streams with concept drift},
volume = {59},
issn = {09507051},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0950705114000446},
doi = {10.1016/j.knosys.2014.02.004},
abstract = {Clustering on categorical data streams is a relatively new field that has not received as much attention as static data and numerical data streams. One of the main difficulties in categorical data analysis is lacking in an appropriate way to define the similarity or dissimilarity measure on data. In this paper, we propose three dissimilarity measures: a point-cluster dissimilarity measure (based on incremental entropy), a cluster–cluster dissimilarity measure (based on incremental entropy) and a dissimilarity measure between two cluster distributions (based on sample standard deviation). We then propose an integrated framework for clustering categorical data streams with three algorithms: Minimal Dissimilarity Data Labeling (MDDL), Concept Drift Detection (CDD) and Cluster Evolving Analysis (CEA). We also make comparisons with other algorithms on several data streams synthesized from real data sets. Experiments show that the proposed algorithms are more effective in generating clustering results and detecting concept drift.},
language = {en},
urldate = {2020-01-06},
journal = {Knowledge-Based Systems},
author = {Li, Yanhong and Li, Deyu and Wang, Suge and Zhai, Yanhui},
month = mar,
year = {2014},
pages = {33--47},
file = {Li et al_2014_Incremental entropy-based clustering on categorical data streams with concept.pdf:/home/roland/Zotero/storage/G6389XR3/Li et al_2014_Incremental entropy-based clustering on categorical data streams with concept.pdf:application/pdf}
}
@article{tran_distance-based_2016,
title = {Distance-based outlier detection in data streams},
volume = {9},
issn = {21508097},
url = {http://dl.acm.org/citation.cfm?doid=2994509.2994526},
doi = {10.14778/2994509.2994526},
abstract = {Continuous outlier detection in data streams has important applications in fraud detection, network security, and public health. The arrival and departure of data objects in a streaming manner impose new challenges for outlier detection algorithms, especially in time and space efficiency. In the past decade, several studies have been performed to address the problem of distance-based outlier detection in data streams (DODDS), which adopts an unsupervised definition and does not have any distributional assumptions on data values. Our work is motivated by the lack of comparative evaluation among the state-of-the-art algorithms using the same datasets on the same platform. We systematically evaluate the most recent algorithms for DODDS under various stream settings and outlier rates. Our extensive results show that in most settings, the MCOD algorithm offers the superior performance among all the algorithms, including the most recent algorithm Thresh LEAP.},
language = {en},
number = {12},
urldate = {2020-01-06},
journal = {Proceedings of the VLDB Endowment},
author = {Tran, Luan and Fan, Liyue and Shahabi, Cyrus},
month = aug,
year = {2016},
pages = {1089--1100},
file = {Tran et al_2016_Distance-based outlier detection in data streams.pdf:/home/roland/Zotero/storage/6PMU4SR4/Tran et al_2016_Distance-based outlier detection in data streams.pdf:application/pdf}
}
@article{reunanen_unsupervised_2019,
title = {Unsupervised online detection and prediction of outliers in streams of sensor data},
issn = {2364-415X, 2364-4168},
url = {http://link.springer.com/10.1007/s41060-019-00191-3},
doi = {10.1007/s41060-019-00191-3},
abstract = {Outliers are unexpected observations, which deviate from the majority of observations. Outlier detection and prediction are challenging tasks, because outliers are rare by definition. A stream is an unbounded source of data, which has to be processed promptly. This article proposes novel methods for outlier detection and outlier prediction in streams of sensor data. The outlier detection is an independent, unsupervised process, which is implemented using an autoencoder. The outlier detection continuously evaluates if the latest data point xi from a stream is an inlier or an outlier. This distinction is based on the reconstruction cost accompanied with Chebyshev’s inequality and the EWMA (exponentially weighted moving average) model. The outlier prediction uses the results of the outlier detection to form the required training data. The outlier prediction utilizes LR (logistic regression), SGD (stochastic gradient descent) and the hidden representation provided by the autoencoder to predict outliers in streams. The results of the experiments show that the proposed methods (1) provide accurate results, (2) are calculated in reduced computation time and (3) use a low amount of memory. Our proposed methods are suitable for analyzing streams of sensor data and providing results with low latency. The experiments also indicated that the outlier prediction is able to anticipate the occurrence of outliers in streams of sensor data.},
language = {en},
urldate = {2020-01-06},
journal = {International Journal of Data Science and Analytics},
author = {Reunanen, Niko and Räty, Tomi and Jokinen, Juho J. and Hoyt, Tyler and Culler, David},
month = jun,
year = {2019},
file = {Reunanen et al_2019_Unsupervised online detection and prediction of outliers in streams of sensor.pdf:/home/roland/Zotero/storage/XIAUH85P/Reunanen et al_2019_Unsupervised online detection and prediction of outliers in streams of sensor.pdf:application/pdf}
}
@inproceedings{poonsirivong_rapid_2017,
address = {NakhonSiThammarat, Thailand},
title = {A rapid anomaly detection technique for big data curation},
isbn = {978-1-5090-4834-2},
url = {http://ieeexplore.ieee.org/document/8025900/},
doi = {10.1109/JCSSE.2017.8025900},
abstract = {Anomaly detection (outlier) using simulation helps us analyze the anomaly instances from big data source. As the hasty explosion of today’s data stream, outlier detection technique will be an analytical tool to be employed for evaluating massive unstructured datasets. In order to speed-up the processing time to handle enormous datasets, this research will conduct experiments of advanced distance-based outlier detection algorithms to investigate the most effective algorithms using MOA. The algorithms used in this study are Continuous Outlier Detection (COD), Micro-Cluster based COD or MCOD, and STream OutlieR Miner (STORM). The results demonstrate MCOD algorithm can outperform other two algorithms in terms of processing time and accurate anomalies.},
language = {en},
urldate = {2020-01-06},
booktitle = {2017 14th {International} {Joint} {Conference} on {Computer} {Science} and {Software} {Engineering} ({JCSSE})},
publisher = {IEEE},
author = {Poonsirivong, Korn and Jittawiriyanukoon, Chanintorn},
month = jul,
year = {2017},
pages = {1--6},
file = {Poonsirivong_Jittawiriyanukoon_2017_A rapid anomaly detection technique for big data curation.pdf:/home/roland/Zotero/storage/WTE9B23W/Poonsirivong_Jittawiriyanukoon_2017_A rapid anomaly detection technique for big data curation.pdf:application/pdf}
}
@inproceedings{cao_scalable_2014,
address = {Chicago, IL, USA},
title = {Scalable distance-based outlier detection over high-volume data streams},
isbn = {978-1-4799-2555-1},
url = {http://ieeexplore.ieee.org/document/6816641/},
doi = {10.1109/ICDE.2014.6816641},
abstract = {The discovery of distance-based outliers from huge volumes of streaming data is critical for modern applications ranging from credit card fraud detection to moving object monitoring. In this work, we propose the first general framework to handle the three major classes of distance-based outliers in streaming environments, including the traditional distance threshold based and the nearest-neighbor-based definitions. Our LEAP framework encompasses two general optimization principles applicable across all three outlier types. First, our "minimal probing" principle uses a lightweight probing operation to gather minimal yet sufficient evidence for outlier detection. This principle overturns the state-of-the-art methodology that requires routinely conducting expensive complete neighborhood searches to identify outliers. Second, our "lifespan-aware prioritization" principle leverages the temporal relationships among stream data points to prioritize the processing order among them during the probing process. Guided by these two principles, we design an outlier detection strategy which is proven to be optimal in CPU costs needed to determine the outlier status of any data point during its entire life. Our comprehensive experimental studies, using both synthetic as well as real streaming data, demonstrate that our methods are 3 orders of magnitude faster than state-of-the-art methods for a rich diversity of scenarios tested yet scale to high dimensional streaming data.},
language = {en},
urldate = {2020-01-06},
booktitle = {2014 {IEEE} 30th {International} {Conference} on {Data} {Engineering}},
publisher = {IEEE},
author = {Cao, Lei and Yang, Di and Wang, Qingyang and Yu, Yanwei and Wang, Jiayuan and Rundensteiner, Elke A.},
month = mar,
year = {2014},
pages = {76--87},
file = {Cao et al_2014_Scalable distance-based outlier detection over high-volume data streams.pdf:/home/roland/Zotero/storage/EVLZTYEN/Cao et al_2014_Scalable distance-based outlier detection over high-volume data streams.pdf:application/pdf}
}