diff --git a/tests/test_detectors.py b/tests/test_detectors.py index 55dbc1b..a411abf 100644 --- a/tests/test_detectors.py +++ b/tests/test_detectors.py @@ -2,8 +2,9 @@ import pytest import numpy as np import pandas as pd +import os -from tsod.custom_exceptions import WrongInputDataType +from tsod.custom_exceptions import WrongInputDataTypeError from tsod.detectors import ( RangeDetector, DiffDetector, @@ -89,7 +90,7 @@ def test_base_detector_exceptions(range_data, range_data_series): data_series, _, _ = range_data_series detector = RangeDetector() - pytest.raises(WrongInputDataType, detector.fit, data) + pytest.raises(WrongInputDataTypeError, detector.fit, data) def test_range_detector(range_data_series): @@ -175,7 +176,9 @@ def test_diff_detector_autoset(range_data_series): def test_combined_detector(): - df = pd.read_csv("tests/data/example.csv", parse_dates=True, index_col=0) + path_to_tests_super_folder = os.path.abspath(__file__).split('tests')[0] + df = pd.read_csv(os.path.join(path_to_tests_super_folder, 'tests', 'data', 'example.csv'), + parse_dates=True, index_col=0) combined = CombinedDetector( [ ConstantValueDetector(), @@ -229,7 +232,7 @@ def test_hampel_detector(data_series): assert all(i in expected_anomalies_indices for i in anomalies_indices) -def test_autoencoder_detector(data_series): +def test_auto_encoder_detector(data_series): data_with_anomalies, expected_anomalies_indices, normal_data = data_series detector = AutoEncoder( hidden_neurons=[1, 1, 1, 1], epochs=1 @@ -239,15 +242,16 @@ def test_autoencoder_detector(data_series): anomalies_indices = np.array(np.where(anomalies)).flatten() # Validate if the found anomalies are also in the expected anomaly set # NB Not necessarily all of them - # assert all(i in expected_anomalies_indices for i in anomalies_indices) + assert np.mean(np.array([i in expected_anomalies_indices for i in anomalies_indices])) > 0.4 -def test_autoencoderlstm_detector(data_series): +def 
import numpy as np
import pandas as pd
import pytest

from tsod.custom_exceptions import InvalidArgumentError
from tsod.mvdetectors import MVRangeDetector

# Shape of every frame built by the fixtures: one row per time series,
# one column per observation.
N_SERIES = 3
N_OBS = 15

# Row -> column positions that the fixtures push above / below the valid range.
ABOVE_RANGE = {0: [2, 3, 7]}
BELOW_RANGE = {1: [2, 12]}
OUT_OF_RANGE = {**ABOVE_RANGE, **BELOW_RANGE}


def _expected_flags(flagged, n_rows=N_SERIES, n_obs=N_OBS):
    """Return an n_rows x n_obs nested list of bools, True at the flagged cells.

    ``flagged`` maps a row position to the column positions that the detector
    is expected to report as anomalous in that row.
    """
    flags = np.zeros((n_rows, n_obs), dtype=bool)
    for row, columns in flagged.items():
        flags[row, columns] = True
    return flags.tolist()


def _inject_anomalies(abnormal_data):
    """Plant the out-of-range values and NaNs shared by both fixtures."""
    abnormal_data.iloc[0, [2, 3, 7]] = 5
    abnormal_data.iloc[1, [2, 12]] = -2
    abnormal_data.iloc[0, [8]] = np.nan
    abnormal_data.iloc[2, [8, 9]] = np.nan
    return abnormal_data


def _assert_detection(detector, normal_data, abnormal_data, expected_anomalies_list):
    """Check the flags on abnormal data and that normal data is never flagged."""
    detected_anomalies = detector.detect(abnormal_data)
    expected_anomalies = pd.DataFrame(
        expected_anomalies_list,
        columns=abnormal_data.columns,
        index=abnormal_data.index,
    )
    pd.testing.assert_frame_equal(expected_anomalies, detected_anomalies)

    detected_anomalies = detector.detect(normal_data)
    assert not detected_anomalies.to_numpy().any()


@pytest.fixture
def range_data():
    """Three series in [0, 1); the normal frame touches both bounds exactly."""
    normal_data = pd.DataFrame(np.random.uniform(size=[N_SERIES, N_OBS]))
    normal_data.iloc[2, [2, 8]] = np.nan
    # Pin the extremes so that fitting recovers exactly min=0 and max=1.
    normal_data.iloc[:, 13] = 1
    normal_data.iloc[:, 14] = 0
    abnormal_data = _inject_anomalies(
        pd.DataFrame(np.random.uniform(size=[N_SERIES, N_OBS]))
    )
    return normal_data, abnormal_data


@pytest.fixture
def range_data_time_series_specific_ranges():
    """Three series, each drawn from its own [min, max) interval."""
    ts_mins = [-1, -0.5, 0]
    ts_maxs = [2, 3, 4]

    def draw():
        # Sampled as (n_obs, n_series) so low/high broadcast per series, then
        # transposed to the one-row-per-series layout.
        return pd.DataFrame(
            np.random.uniform(low=ts_mins, high=ts_maxs, size=(N_OBS, len(ts_mins))).T
        )

    normal_data = draw()
    normal_data.iloc[2, [2, 8]] = np.nan
    # Pin the per-series extremes so that fitting recovers ts_mins / ts_maxs.
    normal_data.iloc[:, 13] = ts_mins
    normal_data.iloc[:, 14] = ts_maxs
    abnormal_data = _inject_anomalies(draw())
    return normal_data, abnormal_data


@pytest.mark.parametrize("detector, expected_anomalies_list", [
    (MVRangeDetector(min_value=0.0, max_value=1.0), _expected_flags(OUT_OF_RANGE)),
    (MVRangeDetector(max_value=1.0), _expected_flags(ABOVE_RANGE)),
    (MVRangeDetector(min_value=0.0), _expected_flags(BELOW_RANGE)),
])
def test_single_range_detector_detection(range_data, detector, expected_anomalies_list):
    normal_data, abnormal_data = range_data
    _assert_detection(detector, normal_data, abnormal_data, expected_anomalies_list)


def test_single_range_detector_fitting(range_data):
    normal_data, abnormal_data = range_data
    detector = MVRangeDetector()
    detector.fit(normal_data)
    _assert_detection(detector, normal_data, abnormal_data, _expected_flags(OUT_OF_RANGE))


@pytest.mark.parametrize("detector, expected_anomalies_list", [
    (MVRangeDetector(min_value=[0.0, 0.0, 0.0], max_value=1.0), _expected_flags(OUT_OF_RANGE)),
    (MVRangeDetector(min_value=0.0, max_value=[1.0, 1.0, 1.0]), _expected_flags(OUT_OF_RANGE)),
    (MVRangeDetector(min_value=[0.0, 0.0, 0.0], max_value=[1.0, 1.0, 1.0]), _expected_flags(OUT_OF_RANGE)),
])
def test_multi_range_detector_detection(range_data, detector, expected_anomalies_list):
    normal_data, abnormal_data = range_data
    _assert_detection(detector, normal_data, abnormal_data, expected_anomalies_list)


@pytest.mark.parametrize("detector, expected_anomalies_list", [
    (MVRangeDetector(min_value=[-1, -0.5, 0], max_value=[2, 3, 4]), _expected_flags(OUT_OF_RANGE)),
    (MVRangeDetector(max_value=[2, 3, 4]), _expected_flags(ABOVE_RANGE)),
    (MVRangeDetector(min_value=[-1, -0.5, 0]), _expected_flags(BELOW_RANGE)),
])
def test_multiple_ranges_detector_detection(range_data_time_series_specific_ranges, detector, expected_anomalies_list):
    normal_data, abnormal_data = range_data_time_series_specific_ranges
    _assert_detection(detector, normal_data, abnormal_data, expected_anomalies_list)


def test_multiple_ranges_detector_fitting(range_data_time_series_specific_ranges):
    normal_data, abnormal_data = range_data_time_series_specific_ranges
    detector = MVRangeDetector()
    detector.fit(normal_data)
    _assert_detection(detector, normal_data, abnormal_data, _expected_flags(OUT_OF_RANGE))


@pytest.mark.parametrize("min_value, max_value", [
    (3, 2),                     # scalar min above scalar max
    ([0, 0, 3], 2),             # one per-series min above the shared max
    ([[0], [0], [0]], 1),       # 2D min_value
    (-1, [[0], [0], [0]]),      # 2D max_value
])
def test_invalid_argument_raised_min_max(min_value, max_value):
    with pytest.raises(InvalidArgumentError):
        MVRangeDetector(min_value=min_value, max_value=max_value)


@pytest.mark.parametrize("quantile_prob_cut_offs", [
    [0.5, 1.1],
    [-0.5, 1.1],
    [-0.5, 0.9],
])
def test_invalid_argument_raised_quantiles(quantile_prob_cut_offs):
    with pytest.raises(InvalidArgumentError):
        MVRangeDetector(quantiles=quantile_prob_cut_offs)
- - filename = os.path.join("tests", "data", "combined.joblib") + path_to_tests_super_folder = os.path.abspath(__file__).split('tests')[0] + filename = os.path.join(path_to_tests_super_folder, "tests", "data", "combined.joblib") loaded = tsod.load(filename) @@ -43,4 +43,4 @@ def test_save_and_load_filename(tmpdir): loaded = tsod.load(filename) - assert isinstance(loaded, CombinedDetector) \ No newline at end of file + assert isinstance(loaded, CombinedDetector) diff --git a/tsod/base.py b/tsod/base.py index 98c45e3..f706bd1 100644 --- a/tsod/base.py +++ b/tsod/base.py @@ -5,10 +5,9 @@ import joblib import pandas as pd -import numpy as np -from .custom_exceptions import WrongInputDataType +from .custom_exceptions import WrongInputDataTypeError def load(path: Union[str, Path]): @@ -29,7 +28,7 @@ class Detector(ABC): def __init__(self): pass - def fit(self, data: pd.Series): + def fit(self, data: Union[pd.Series, pd.DataFrame]): """Set detector parameters based on data. Parameters @@ -41,11 +40,11 @@ def fit(self, data: pd.Series): self._fit(data) return self - def _fit(self, data: pd.Series): + def _fit(self, data: Union[pd.Series, pd.DataFrame]): # Default implementation is a NoOp return self - def detect(self, data: pd.Series) -> pd.Series: + def detect(self, data: Union[pd.Series, pd.DataFrame]) -> Union[pd.Series, pd.DataFrame]: """Detect anomalies Parameters @@ -63,25 +62,25 @@ def detect(self, data: pd.Series) -> pd.Series: pred = self._detect(data) return self._postprocess(pred) - def _postprocess(self, pred: pd.Series) -> pd.Series: + def _postprocess(self, pred: Union[pd.Series, pd.DataFrame]) -> pd.Series: # TODO implement return pred @abstractmethod - def _detect(self, data: pd.Series) -> pd.Series: - "Detect anomalies" - NotImplementedError() + def _detect(self, data: Union[pd.Series, pd.DataFrame]) -> pd.Series: + """Detect anomalies""" + pass - def validate(self, data: pd.Series) -> pd.Series: + def validate(self, data: Union[pd.Series, pd.DataFrame]) 
class WrongInputDataTypeError(Exception):
    """Raised when input data is neither a pandas Series nor a pandas DataFrame."""

    def __init__(self, message="Input data must be a pandas.Series or pandas.DataFrame."):
        # The default mentions both types because Detector.validate accepts both.
        self.message = message
        super().__init__(self.message)


class InvalidArgumentError(Exception):
    """Raised when an argument violates a stated requirement.

    Parameters
    ----------
    argument_name : str
        Name (or short description) of the offending argument.
    requirement : str
        What the argument must be, phrased to follow "must be".
    """

    def __init__(self, argument_name, requirement):
        # Normalise padding and a trailing period supplied by the caller so the
        # message never contains doubled spaces or "..".
        name = str(argument_name).strip()
        rule = str(requirement).strip().rstrip(".")
        self.message = f"{name} must be {rule}."
        super().__init__(self.message)


class NotIntegerError(InvalidArgumentError):
    """Raised when an argument that must be an integer is not one."""

    def __init__(self, argument_name):
        super().__init__(argument_name, "an integer")


class NonUniqueTimeStampsError(Exception):
    """Raised when several values share the same time stamp."""

    def __init__(self, message="Found multiple values at the same time stamp."):
        self.message = message
        super().__init__(self.message)


class WrongInputSizeError(ValueError):
    """Raised when the size of an input does not match what is expected."""
import typing

import numpy as np
import pandas as pd

from .base import Detector
from .custom_exceptions import NoRangeDefinedError, WrongInputSizeError, InvalidArgumentError


def make_vector_broadcastable(function_input, n_data_rows):
    """Reshape a threshold so that it can be compared against row-wise data.

    Parameters
    ----------
    function_input : array_like or None
        A scalar (0-d) threshold, a 1D vector with one threshold per data row,
        or None when no threshold is set.
    n_data_rows : int
        Number of rows in the data the threshold will be compared with.

    Returns
    -------
    np.ndarray or None
        None for None input; the 0-d array unchanged for a scalar; a column
        vector of shape (n_data_rows, 1) for a 1D input.

    Raises
    ------
    WrongInputSizeError
        If a non-scalar threshold does not have exactly n_data_rows entries.
    """
    if function_input is None:
        return None

    thresholds = np.asarray(function_input)
    if thresholds.ndim == 0:
        # A single threshold broadcasts against any data shape as it is.
        return thresholds

    if len(thresholds) != n_data_rows:
        raise WrongInputSizeError(
            "The number of rows in the input data must match the number of "
            "values specified for min and max if more than one value is given for min/max.")

    if thresholds.ndim == 1:
        # Turn into a column so that entry i is compared with every value of row i.
        return thresholds[:, np.newaxis]
    return thresholds


class MVRangeDetector(Detector):
    """
    Detect values outside range.

    Each row of the input is treated as one time series. Thresholds are either
    shared by all rows (scalar) or given per row (1D vector).

    NaN values are not marked as anomalies.

    Parameters
    ----------
    min_value : float, List, np.array
        Minimum value threshold. None means no lower bound is checked.
    max_value : float, List, np.array
        Maximum value threshold. None means no upper bound is checked.
    quantiles : list[2]
        Default quantiles [0, 1]. Same as min and max value.

    Raises
    ------
    InvalidArgumentError
        If a threshold has more than one dimension, if any minimum exceeds
        its maximum, or if a quantile lies outside [0, 1].

    Examples
    ---------
    >>> n_obs = 100
    >>> normal_data = pd.DataFrame(np.random.normal(size=[3, n_obs]))
    >>> abnormal_data = pd.DataFrame(np.random.normal(size=[3, n_obs]))
    >>> abnormal_data.iloc[0, [2, 6, 15, 57, 60, 73]] = 5
    >>> normal_data_with_some_outliers = pd.DataFrame(np.random.normal(size=[3, n_obs]))
    >>> normal_data_with_some_outliers.iloc[0, [12, 13, 20, 90]] = 7

    >>> detector = MVRangeDetector(min_value=0.0, max_value=2.0)
    >>> anomalies = detector.detect(abnormal_data)

    >>> detector = MVRangeDetector()
    >>> detector.fit(normal_data)  # min, max inferred from normal data
    >>> anomalies = detector.detect(abnormal_data)

    >>> detector = MVRangeDetector(quantiles=[0.001,0.999])
    >>> detector.fit(normal_data_with_some_outliers)
    >>> anomalies = detector.detect(normal_data_with_some_outliers)"""

    def __init__(self, min_value=-np.inf, max_value=np.inf, quantiles=None):
        super().__init__()

        self._min = self._as_threshold(min_value, "min_value")
        self._max = self._as_threshold(max_value, "max_value")

        # Only compare the bounds when both are set; None means "unbounded".
        if self._min is not None and self._max is not None:
            if np.any(self._min > self._max):
                raise InvalidArgumentError(
                    "Each value in min_value",
                    "less than or equal to the corresponding value in max_value")

        if quantiles is None:
            self.quantiles = [0.0, 1.0]
        else:
            # Negated form so that NaN is rejected as well.
            if not all(0.0 <= q <= 1.0 for q in quantiles):
                raise InvalidArgumentError("Values in quantiles", "between 0 and 1, both inclusive")
            self.quantiles = [np.min(quantiles), np.max(quantiles)]

    @staticmethod
    def _as_threshold(value, argument_name):
        """Convert a threshold argument to a 0-d/1D array, keeping None as None."""
        if value is None:
            return None
        threshold = np.array(value)
        if threshold.ndim > 1:
            raise InvalidArgumentError(argument_name, "a float or 1D array_like")
        return threshold

    @staticmethod
    def _format_bound(bound):
        """Format a scalar or vector bound with one-decimal scientific notation."""
        if bound is None:
            return "None"
        values = np.atleast_1d(np.asarray(bound, dtype=float))
        text = ", ".join(f"{value:.1e}" for value in values)
        # Scalars print bare (as before); vectors are wrapped in brackets.
        return text if np.ndim(bound) == 0 else f"[{text}]"

    def _fit(self, data):
        """Set min and max based on data.

        Parameters
        ----------
        data : pd.DataFrame
            Time series data with time over columns.
        """
        super().validate(data)

        # axis=-1 is the time axis for a DataFrame (one quantile pair per row)
        # and also works for a single Series.
        values_at_quantiles = np.nanquantile(data, self.quantiles, axis=-1)
        self._min = values_at_quantiles[0]
        self._max = values_at_quantiles[1]

        return self

    def _detect(self, data: typing.Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
        """Detect anomalies outside range.

        Returns a boolean object shaped like ``data``; comparisons with NaN
        are False, so NaN values are never flagged.
        """
        if (self._min is None) and (self._max is None):
            raise NoRangeDefinedError("Both min and max are None. At least one of them must be set.")

        n_data_rows = 1 if len(data.shape) == 1 else data.shape[0]

        min_comparison = make_vector_broadcastable(self._min, n_data_rows)
        max_comparison = make_vector_broadcastable(self._max, n_data_rows)

        if max_comparison is None:
            return data < min_comparison

        if min_comparison is None:
            return data > max_comparison

        return (data < min_comparison) | (data > max_comparison)

    def __str__(self):
        return f"{self.__class__.__name__}({self._min}, {self._max})"

    def __repr__(self):
        return (f"{self.__class__.__name__}"
                f"(min: {self._format_bound(self._min)}, max: {self._format_bound(self._max)})")