Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multivariate range #39

Merged
merged 27 commits into from
Feb 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
df3280a
Make error message more precise
Jan 31, 2022
b5ebb1b
Add quotes to signify docstring and use pass since error in case of n…
Jan 31, 2022
fab834d
Adjust methods in base to accept DataFrame input in addition to Serie…
Jan 31, 2022
12c5b77
Complete examples with example data
Jan 31, 2022
3c917ae
Add quotes to indicate docstring
Jan 31, 2022
194fa75
Improve PEP8 compliance and complete code for tests with unused varia…
Feb 1, 2022
70660e9
Use absolute path to test data to avoid errors from file not being found
Feb 1, 2022
0d90453
Use absolute path to folder with data for tests to avoid errors from …
Feb 1, 2022
d443bc3
Add newline at end of file
Feb 1, 2022
1bd86e1
Merge branch 'main' into multivariate_range
Feb 1, 2022
0e51e0f
Remove decorators to skip tests and decrease detection thresholds to …
Feb 1, 2022
7ae6468
Add multivariate range detector that checks if any time series value …
Feb 1, 2022
4ae1b31
Add tests for multivariaterange tests
Feb 2, 2022
d3582ef
Elaborate docstring
Feb 2, 2022
d42bef9
Add test for fitting and make tests for different initializing values…
Feb 2, 2022
96e9e75
Refactor to distinguish between probability and value quantiles and c…
Feb 2, 2022
f5f086d
Ensure that distribution limits occur in normal data
Feb 2, 2022
a80dc2d
Use nanquantile instead of dropna() to handle nans to avoid dropping …
Feb 2, 2022
08076e4
Let nanquantile handle nans appropriately instead of using dropna fro…
Feb 2, 2022
facaf0b
Add support for time series specific ranges
Feb 4, 2022
b84e222
Change return type to include DataFrame
Feb 4, 2022
5bc95a5
Add custom exception for wrong data input size
Feb 4, 2022
2bf4fb1
Add tests for time-series specific ranges
Feb 4, 2022
b920400
Replace assert statements with checks that raise errors if condition …
Feb 4, 2022
6b1da1d
Test that exceptions get raised
Feb 4, 2022
58d6a9c
Add Error suffix to custom exceptions for PEP8 alignment https://www.…
Feb 4, 2022
fba175a
Refactor naming of quantiles in multivariate range detector to be con…
Feb 4, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions tests/test_detectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
import pytest
import numpy as np
import pandas as pd
import os

from tsod.custom_exceptions import WrongInputDataType
from tsod.custom_exceptions import WrongInputDataTypeError
from tsod.detectors import (
RangeDetector,
DiffDetector,
Expand Down Expand Up @@ -89,7 +90,7 @@ def test_base_detector_exceptions(range_data, range_data_series):
data_series, _, _ = range_data_series

detector = RangeDetector()
pytest.raises(WrongInputDataType, detector.fit, data)
pytest.raises(WrongInputDataTypeError, detector.fit, data)


def test_range_detector(range_data_series):
Expand Down Expand Up @@ -175,7 +176,9 @@ def test_diff_detector_autoset(range_data_series):


def test_combined_detector():
df = pd.read_csv("tests/data/example.csv", parse_dates=True, index_col=0)
path_to_tests_super_folder = os.path.abspath(__file__).split('tests')[0]
df = pd.read_csv(os.path.join(path_to_tests_super_folder, 'tests', 'data', 'example.csv'),
parse_dates=True, index_col=0)
combined = CombinedDetector(
[
ConstantValueDetector(),
Expand Down Expand Up @@ -229,7 +232,7 @@ def test_hampel_detector(data_series):
assert all(i in expected_anomalies_indices for i in anomalies_indices)


def test_autoencoder_detector(data_series):
def test_auto_encoder_detector(data_series):
data_with_anomalies, expected_anomalies_indices, normal_data = data_series
detector = AutoEncoder(
hidden_neurons=[1, 1, 1, 1], epochs=1
Expand All @@ -239,15 +242,16 @@ def test_autoencoder_detector(data_series):
anomalies_indices = np.array(np.where(anomalies)).flatten()
# Validate if the found anomalies are also in the expected anomaly set
# NB Not necessarily all of them
# assert all(i in expected_anomalies_indices for i in anomalies_indices)
assert np.mean(np.array([i in expected_anomalies_indices for i in anomalies_indices])) > 0.4


def test_autoencoderlstm_detector(data_series):
def test_auto_encoder_lstm_detector(data_series):
data_with_anomalies, expected_anomalies_indices, normal_data = data_series
detector = AutoEncoderLSTM()
detector.fit(data_with_anomalies)
anomalies = detector.detect(data_with_anomalies)
anomalies_indices = np.array(np.where(anomalies)).flatten()
assert np.mean(np.array([i in expected_anomalies_indices for i in anomalies_indices])) > 0.01


def test_constant_value_detector(constant_data_series):
Expand Down Expand Up @@ -371,7 +375,7 @@ def test_create_dataset(data_series):
data_with_anomalies.name = "y"
data = data_with_anomalies.to_frame()
time_steps = 2
X, y = create_dataset(data[["y"]], data.y, time_steps)
predictors, y = create_dataset(data[["y"]], data.y, time_steps)
assert len(y) == len(data) - time_steps
assert X.shape[0] == len(data) - time_steps
assert X.shape[1] == time_steps
assert predictors.shape[0] == len(data) - time_steps
assert predictors.shape[1] == time_steps
161 changes: 161 additions & 0 deletions tests/test_mvdetectors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
import pytest
import pandas as pd
import numpy as np

from tsod.custom_exceptions import InvalidArgumentError
from tsod.mvdetectors import MVRangeDetector


@pytest.fixture
def range_data():
    """Return a ``(normal, abnormal)`` pair of 3 x 15 random data frames.

    Both frames are filled with draws from ``np.random.uniform`` on
    [0, 1); the normal frame is drawn first, the abnormal frame second.

    * Normal frame: NaN at row 2, columns 2 and 8; column 13 is set to 1
      and column 14 to 0 in every row.
    * Abnormal frame: the value 5 at row 0, columns 2, 3 and 7; the value
      -2 at row 1, columns 2 and 12; NaN at row 0, column 8 and at row 2,
      columns 8 and 9.
    """
    frame_shape = (3, 15)
    every_row = slice(None)

    normal = pd.DataFrame(np.random.uniform(size=frame_shape))
    for rows, cols, value in (
        (2, [2, 8], np.nan),
        (every_row, 13, 1),
        (every_row, 14, 0),
    ):
        normal.iloc[rows, cols] = value

    abnormal = pd.DataFrame(np.random.uniform(size=frame_shape))
    for rows, cols, value in (
        (0, [2, 3, 7], 5),
        (1, [2, 12], -2),
        (0, [8], np.nan),
        (2, [8, 9], np.nan),
    ):
        abnormal.iloc[rows, cols] = value

    return normal, abnormal


@pytest.fixture
def range_data_time_series_specific_ranges():
    """Return ``(normal, abnormal)`` 3 x 15 frames with per-row value ranges.

    Row ``i`` of each frame is drawn uniformly from
    ``[ts_mins[i], ts_maxs[i])``; the normal frame is drawn first, the
    abnormal frame second.

    * Normal frame: NaN at row 2, columns 2 and 8; column 13 holds the
      per-row minima and column 14 the per-row maxima.
    * Abnormal frame: the value 5 at row 0, columns 2, 3 and 7; the value
      -2 at row 1, columns 2 and 12; NaN at row 0, column 8 and at row 2,
      columns 8 and 9.
    """
    n_obs = 15
    ts_mins = [-1, -0.5, 0]
    ts_maxs = [2, 3, 4]

    def draw_frame():
        # low/high are matched against the last axis of `size`, so sample
        # as (observations, series) and transpose to one series per row.
        samples = np.random.uniform(
            low=ts_mins, high=ts_maxs, size=(n_obs, len(ts_mins))
        )
        return pd.DataFrame(samples.T)

    every_row = slice(None)

    normal = draw_frame()
    normal.iloc[2, [2, 8]] = np.nan
    normal.iloc[every_row, 13] = ts_mins
    normal.iloc[every_row, 14] = ts_maxs

    abnormal = draw_frame()
    for rows, cols, value in (
        (0, [2, 3, 7], 5),
        (1, [2, 12], -2),
        (0, [8], np.nan),
        (2, [8, 9], np.nan),
    ):
        abnormal.iloc[rows, cols] = value

    return normal, abnormal


@pytest.mark.parametrize(
    ("detector", "expected_anomalies_list"),
    [
        # Both limits set: row 0 is flagged at columns 2, 3, 7 and
        # row 1 at columns 2, 12.
        (
            MVRangeDetector(min_value=0.0, max_value=1.0),
            [
                [col in (2, 3, 7) for col in range(15)],
                [col in (2, 12) for col in range(15)],
                [False] * 15,
            ],
        ),
        # Upper limit only: only row 0 is flagged.
        (
            MVRangeDetector(max_value=1.0),
            [
                [col in (2, 3, 7) for col in range(15)],
                [False] * 15,
                [False] * 15,
            ],
        ),
        # Lower limit only: only row 1 is flagged.
        (
            MVRangeDetector(min_value=0.0),
            [
                [False] * 15,
                [col in (2, 12) for col in range(15)],
                [False] * 15,
            ],
        ),
    ],
)
def test_single_range_detector_detection(range_data, detector, expected_anomalies_list):
    """Check detection with scalar limits shared by all time series.

    The detector output for the abnormal frame must equal the expected
    boolean frame exactly, and nothing may be flagged in the normal frame.
    """
    clean_frame, faulty_frame = range_data

    flagged = detector.detect(faulty_frame)
    expected_mask = pd.DataFrame(
        expected_anomalies_list,
        index=faulty_frame.index,
        columns=faulty_frame.columns,
    )
    pd.testing.assert_frame_equal(expected_mask, flagged)

    false_alarms = detector.detect(clean_frame)
    assert not false_alarms.to_numpy().any()


def test_single_range_detector_fitting(range_data):
    """Check detection after fitting a default detector on the normal frame.

    The fitted detector must flag row 0 at columns 2, 3, 7 and row 1 at
    columns 2, 12 of the abnormal frame, and nothing in the normal frame.
    """
    clean_frame, faulty_frame = range_data

    fitted_detector = MVRangeDetector()
    fitted_detector.fit(clean_frame)

    flagged = fitted_detector.detect(faulty_frame)
    expected_mask = pd.DataFrame(
        [
            [col in (2, 3, 7) for col in range(15)],
            [col in (2, 12) for col in range(15)],
            [False] * 15,
        ],
        index=faulty_frame.index,
        columns=faulty_frame.columns,
    )
    pd.testing.assert_frame_equal(expected_mask, flagged)

    false_alarms = fitted_detector.detect(clean_frame)
    assert not false_alarms.to_numpy().any()


@pytest.mark.parametrize(
    ("detector", "expected_anomalies_list"),
    [
        # Per-series minima combined with a scalar maximum.
        (
            MVRangeDetector(min_value=[0.0, 0.0, 0.0], max_value=1.0),
            [
                [col in (2, 3, 7) for col in range(15)],
                [col in (2, 12) for col in range(15)],
                [False] * 15,
            ],
        ),
        # Scalar minimum combined with per-series maxima.
        (
            MVRangeDetector(min_value=0.0, max_value=[1.0, 1.0, 1.0]),
            [
                [col in (2, 3, 7) for col in range(15)],
                [col in (2, 12) for col in range(15)],
                [False] * 15,
            ],
        ),
        # Per-series minima and per-series maxima.
        (
            MVRangeDetector(min_value=[0.0, 0.0, 0.0], max_value=[1.0, 1.0, 1.0]),
            [
                [col in (2, 3, 7) for col in range(15)],
                [col in (2, 12) for col in range(15)],
                [False] * 15,
            ],
        ),
    ],
)
def test_multi_range_detector_detection(range_data, detector, expected_anomalies_list):
    """Check detection when limits are given as lists with identical entries.

    Every parameter set uses 0 as minimum and 1 as maximum for all three
    series, so all of them must produce the same expected boolean frame
    for the abnormal data and flag nothing in the normal data.
    """
    clean_frame, faulty_frame = range_data

    flagged = detector.detect(faulty_frame)
    expected_mask = pd.DataFrame(
        expected_anomalies_list,
        index=faulty_frame.index,
        columns=faulty_frame.columns,
    )
    pd.testing.assert_frame_equal(expected_mask, flagged)

    false_alarms = detector.detect(clean_frame)
    assert not false_alarms.to_numpy().any()


@pytest.mark.parametrize(
    ("detector", "expected_anomalies_list"),
    [
        # Per-series minima and maxima: row 0 is flagged at columns
        # 2, 3, 7 and row 1 at columns 2, 12.
        (
            MVRangeDetector(min_value=[-1, -0.5, 0], max_value=[2, 3, 4]),
            [
                [col in (2, 3, 7) for col in range(15)],
                [col in (2, 12) for col in range(15)],
                [False] * 15,
            ],
        ),
        # Per-series maxima only: only row 0 is flagged.
        (
            MVRangeDetector(max_value=[2, 3, 4]),
            [
                [col in (2, 3, 7) for col in range(15)],
                [False] * 15,
                [False] * 15,
            ],
        ),
        # Per-series minima only: only row 1 is flagged.
        (
            MVRangeDetector(min_value=[-1, -0.5, 0]),
            [
                [False] * 15,
                [col in (2, 12) for col in range(15)],
                [False] * 15,
            ],
        ),
    ],
)
def test_multiple_ranges_detector_detection(range_data_time_series_specific_ranges, detector, expected_anomalies_list):
    """Check detection with a different value range for each time series.

    The detector output for the abnormal frame must equal the expected
    boolean frame exactly, and nothing may be flagged in the normal frame.
    """
    clean_frame, faulty_frame = range_data_time_series_specific_ranges

    flagged = detector.detect(faulty_frame)
    expected_mask = pd.DataFrame(
        expected_anomalies_list,
        index=faulty_frame.index,
        columns=faulty_frame.columns,
    )
    pd.testing.assert_frame_equal(expected_mask, flagged)

    false_alarms = detector.detect(clean_frame)
    assert not false_alarms.to_numpy().any()


def test_multiple_ranges_detector_fitting(range_data_time_series_specific_ranges):
    """Check detection after fitting on data with per-series value ranges.

    A default detector is fitted on the normal frame. It must then flag
    row 0 at columns 2, 3, 7 and row 1 at columns 2, 12 of the abnormal
    frame, and nothing in the normal frame.
    """
    clean_frame, faulty_frame = range_data_time_series_specific_ranges

    fitted_detector = MVRangeDetector()
    fitted_detector.fit(clean_frame)

    flagged = fitted_detector.detect(faulty_frame)
    expected_mask = pd.DataFrame(
        [
            [col in (2, 3, 7) for col in range(15)],
            [col in (2, 12) for col in range(15)],
            [False] * 15,
        ],
        index=faulty_frame.index,
        columns=faulty_frame.columns,
    )
    pd.testing.assert_frame_equal(expected_mask, flagged)

    false_alarms = fitted_detector.detect(clean_frame)
    assert not false_alarms.to_numpy().any()


@pytest.mark.parametrize(
    ("min_value", "max_value"),
    [
        # Scalar minimum above the scalar maximum.
        (3, 2),
        # One per-series minimum above the scalar maximum.
        ([0, 0, 3], 2),
        # Nested lists as limits -- presumably rejected because of their
        # shape; confirm against MVRangeDetector.
        ([[0], [0], [0]], 1),
        (-1, [[0], [0], [0]]),
    ],
)
def test_invalid_argument_raised_min_max(min_value, max_value):
    """Constructing a detector with invalid limits raises InvalidArgumentError."""
    limits = {"min_value": min_value, "max_value": max_value}
    with pytest.raises(InvalidArgumentError):
        MVRangeDetector(**limits)


@pytest.mark.parametrize(
    "quantile_prob_cut_offs",
    [
        [0.5, 1.1],   # upper cut-off above 1
        [-0.5, 1.1],  # lower cut-off below 0 and upper cut-off above 1
        [-0.5, 0.9],  # lower cut-off below 0
    ],
)
def test_invalid_argument_raised_quantiles(quantile_prob_cut_offs):
    """Quantile cut-offs with a value outside [0, 1] raise InvalidArgumentError."""
    detector_kwargs = {"quantiles": quantile_prob_cut_offs}
    with pytest.raises(InvalidArgumentError):
        MVRangeDetector(**detector_kwargs)
6 changes: 3 additions & 3 deletions tests/test_persistence.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ def test_save_and_load(tmp_path):


def test_load():

filename = os.path.join("tests", "data", "combined.joblib")
path_to_tests_super_folder = os.path.abspath(__file__).split('tests')[0]
filename = os.path.join(path_to_tests_super_folder, "tests", "data", "combined.joblib")

loaded = tsod.load(filename)

Expand All @@ -43,4 +43,4 @@ def test_save_and_load_filename(tmpdir):

loaded = tsod.load(filename)

assert isinstance(loaded, CombinedDetector)
assert isinstance(loaded, CombinedDetector)
27 changes: 13 additions & 14 deletions tsod/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,9 @@
import joblib

import pandas as pd
import numpy as np


from .custom_exceptions import WrongInputDataType
from .custom_exceptions import WrongInputDataTypeError


def load(path: Union[str, Path]):
Expand All @@ -29,7 +28,7 @@ class Detector(ABC):
def __init__(self):
pass

def fit(self, data: pd.Series):
def fit(self, data: Union[pd.Series, pd.DataFrame]):
"""Set detector parameters based on data.

Parameters
Expand All @@ -41,11 +40,11 @@ def fit(self, data: pd.Series):
self._fit(data)
return self

def _fit(self, data: pd.Series):
def _fit(self, data: Union[pd.Series, pd.DataFrame]):
# Default implementation is a NoOp
return self

def detect(self, data: pd.Series) -> pd.Series:
def detect(self, data: Union[pd.Series, pd.DataFrame]) -> Union[pd.Series, pd.DataFrame]:
"""Detect anomalies

Parameters
Expand All @@ -63,25 +62,25 @@ def detect(self, data: pd.Series) -> pd.Series:
pred = self._detect(data)
return self._postprocess(pred)

def _postprocess(self, pred: pd.Series) -> pd.Series:
def _postprocess(self, pred: Union[pd.Series, pd.DataFrame]) -> pd.Series:
# TODO implement
return pred

@abstractmethod
def _detect(self, data: pd.Series) -> pd.Series:
"Detect anomalies"
NotImplementedError()
def _detect(self, data: Union[pd.Series, pd.DataFrame]) -> pd.Series:
"""Detect anomalies"""
pass

def validate(self, data: pd.Series) -> pd.Series:
def validate(self, data: Union[pd.Series, pd.DataFrame]) -> Union[pd.Series, pd.DataFrame]:
"""Check that input data is in correct format and possibly adjust"""
if not isinstance(data, pd.Series):
raise WrongInputDataType()
if not (isinstance(data, pd.Series) or isinstance(data, pd.DataFrame)):
raise WrongInputDataTypeError()
return data

def _gradient(self, data: pd.Series, periods: int = 1) -> pd.Series:
def _gradient(self, data: Union[pd.Series, pd.DataFrame], periods: int = 1) -> pd.Series:
dt = data.index.to_series().diff().dt.total_seconds()
if dt.min() < 1e-15:
raise ValueError("Input must be monotonic increasing")
raise ValueError("Index must be monotonically increasing")

gradient = data.diff(periods=periods) / dt
return gradient
Expand Down
12 changes: 8 additions & 4 deletions tsod/custom_exceptions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
class WrongInputDataType(Exception):
class WrongInputDataTypeError(Exception):
def __init__(self, message="Input data must be a pandas.Series."):
self.message = message
super().__init__(self.message)
Expand All @@ -15,18 +15,22 @@ def __init__(self, message="Or specify min/max range when instantiating detector
super().__init__(message)


class InvalidArgument(Exception):
class InvalidArgumentError(Exception):
def __init__(self, argument_name, requirement):
self.message = f"{argument_name} must be {requirement}."
super().__init__(self.message)


class NotInteger(InvalidArgument):
class NotIntegerError(InvalidArgumentError):
def __init__(self, argument_name):
super().__init__(argument_name, "an integer")


class NonUniqueTimeStamps(Exception):
class NonUniqueTimeStampsError(Exception):
def __init__(self, message="Found multiple values at the same time stamp."):
self.message = message
super().__init__(self.message)


class WrongInputSizeError(ValueError):
    """Raised when input data has the wrong size.

    Subclasses ``ValueError``, so handlers that catch ``ValueError`` also
    catch this exception.
    """
Loading