diff --git a/docs/source/api_reference/anomaly_detectors.rst b/docs/source/api_reference/anomaly_detectors.rst index d4f63ca..9012e17 100644 --- a/docs/source/api_reference/anomaly_detectors.rst +++ b/docs/source/api_reference/anomaly_detectors.rst @@ -11,9 +11,9 @@ Base :toctree: auto_generated/ :template: class.rst - BaseCollectiveAnomalyDetector + BaseSegmentAnomalyDetector -Collective anomaly detectors +Segment anomaly detectors ---------------------------- .. currentmodule:: skchange.anomaly_detectors @@ -25,7 +25,7 @@ Collective anomaly detectors CircularBinarySegmentation StatThresholdAnomaliser -Collective anomaly detectors with variable identification +Segment anomaly detectors with variable identification --------------------------------------------------------- .. currentmodule:: skchange.anomaly_detectors diff --git a/docs/source/index.rst b/docs/source/index.rst index e4c7cbd..75fb1ee 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -4,7 +4,7 @@ Welcome to skchange =================== -A python library for fast collective anomaly and changepoint detection. +A python library for fast change point and segment anomaly detection. The library is designed to be compatible with `sktime `_. `Numba `_ is used for computational speed. @@ -34,8 +34,8 @@ Key features - **Fast**: `Numba `_ is used for performance. - **Easy to use**: Follows the conventions of `sktime `_ and `scikit-learn `_. - **Easy to extend**: Make your own detectors by inheriting from the base class templates. Create custom detection scores and cost functions. -- **Collective anomaly detection**: Detect intervals of anomalous behaviour in time series data. -- **Subset collective anomaly detection**: Detect intervals of anomalous behaviour in time series data, and infer the subset of variables that are responsible for the anomaly. +- **Segment anomaly detection**: Detect intervals of anomalous behaviour in time series data. +- **Subset anomaly detection**: Detect intervals of anomalous behaviour in time series data, and infer the subset of variables that are responsible for the anomaly. 
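To make the renamed feature list concrete, a minimal quick-start sketch follows. It uses only names that appear elsewhere in this diff (`CAPA`, `fit_predict`, `transform`, `generate_alternating_data`); the data-generation values are illustrative, and the `skchange.datasets.generate` import path is assumed from the test imports below.

```python
# Quick-start sketch: detect segment anomalies with CAPA defaults.
from skchange.anomaly_detectors import CAPA
from skchange.datasets.generate import generate_alternating_data

# Alternating-mean segments give the detector something to find.
df = generate_alternating_data(
    n_segments=5, segment_length=10, p=1, mean=10, random_state=2
)

detector = CAPA()
anomalies = detector.fit_predict(df)  # sparse: one row per detected segment anomaly
labels = detector.transform(df)       # dense: one label per time point
print(anomalies)
print(labels)
```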
Mission ------- diff --git a/interactive/compare_detector_outputs.py b/interactive/compare_detector_outputs.py index e400c16..24317db 100644 --- a/interactive/compare_detector_outputs.py +++ b/interactive/compare_detector_outputs.py @@ -29,14 +29,14 @@ print(changepoints) print(changepoint_labels) -# Collective anomaly detector +# Segment anomaly detector anomaly_detector = CAPA() anomalies = anomaly_detector.fit_predict(df) anomaly_labels = anomaly_detector.transform(df) print(anomalies) print(anomaly_labels) -# Subset collective anomaly detector +# Subset segment anomaly detector subset_anomaly_detector = MVCAPA() subset_anomalies = subset_anomaly_detector.fit_predict(df) subset_anomaly_labels = subset_anomaly_detector.transform(df) diff --git a/interactive/explore_capa.py b/interactive/explore_capa.py index 2513cdf..2ab18a4 100644 --- a/interactive/explore_capa.py +++ b/interactive/explore_capa.py @@ -27,7 +27,7 @@ df = generate_alternating_data( 5, 10, p=10, mean=10, affected_proportion=0.2, random_state=2 ) -detector = MVCAPA(collective_penalty="sparse") +detector = MVCAPA(segment_penalty="sparse") anomalies = detector.fit_predict(df) print(anomalies) @@ -56,13 +56,11 @@ # Profiling n = int(1e5) df = generate_alternating_data(n_segments=1, mean=0, segment_length=n, p=1) -detector = CAPA( - max_segment_length=100, collective_penalty_scale=5, point_penalty_scale=5 -) +detector = CAPA(max_segment_length=100, segment_penalty_scale=5, point_penalty_scale=5) detector = MVCAPA( max_segment_length=1000, - collective_penalty="sparse", - collective_penalty_scale=5, + segment_penalty="sparse", + segment_penalty_scale=5, point_penalty_scale=5, ) profiler = Profiler().start() diff --git a/skchange/anomaly_detectors/__init__.py b/skchange/anomaly_detectors/__init__.py index 1d83c45..0a8945e 100644 --- a/skchange/anomaly_detectors/__init__.py +++ b/skchange/anomaly_detectors/__init__.py @@ -1,13 +1,13 @@ """Anomaly detection algorithms.""" from skchange.anomaly_detectors.anomalisers import StatThresholdAnomaliser -from skchange.anomaly_detectors.base import BaseCollectiveAnomalyDetector +from skchange.anomaly_detectors.base import BaseSegmentAnomalyDetector from skchange.anomaly_detectors.capa import CAPA from skchange.anomaly_detectors.circular_binseg import CircularBinarySegmentation from skchange.anomaly_detectors.mvcapa import MVCAPA BASE_ANOMALY_DETECTORS = [ - BaseCollectiveAnomalyDetector, + BaseSegmentAnomalyDetector, ] COLLECTIVE_ANOMALY_DETECTORS = [ CAPA, diff --git a/skchange/anomaly_detectors/anomalisers.py b/skchange/anomaly_detectors/anomalisers.py index a4871aa..8ca2834 100644 --- a/skchange/anomaly_detectors/anomalisers.py +++ b/skchange/anomaly_detectors/anomalisers.py @@ -5,11 +5,11 @@ import numpy as np import pandas as pd -from skchange.anomaly_detectors.base import BaseCollectiveAnomalyDetector +from skchange.anomaly_detectors.base import BaseSegmentAnomalyDetector from skchange.change_detectors.base import BaseChangeDetector -class StatThresholdAnomaliser(BaseCollectiveAnomalyDetector): +class StatThresholdAnomaliser(BaseSegmentAnomalyDetector): """Anomaly detection based on thresholding the values of segment statistics. Parameters diff --git a/skchange/anomaly_detectors/base.py b/skchange/anomaly_detectors/base.py index 877a196..1c2b6fd 100644 --- a/skchange/anomaly_detectors/base.py +++ b/skchange/anomaly_detectors/base.py @@ -1,7 +1,7 @@ """Base classes for anomaly detectors. 
classes:
-    BaseCollectiveAnomalyDetector
+    BaseSegmentAnomalyDetector

By inheriting from these classes, the remaining methods of the BaseDetector class
that must be implemented to obtain a fully functional anomaly detector are given
below.
@@ -23,10 +23,10 @@ from skchange.base import BaseDetector


-class BaseCollectiveAnomalyDetector(BaseDetector):
-    """Base class for collective anomaly detectors.
+class BaseSegmentAnomalyDetector(BaseDetector):
+    """Base class for segment anomaly detectors.

-    Collective anomaly detectors detect segments of data points that are considered
+    Segment anomaly detectors detect segments of data points that are considered
     anomalous.

     Output format of the `predict` method: See the `dense_to_sparse` method.
@@ -68,10 +68,10 @@ def sparse_to_dense(
             0 is reserved for the normal instances.
         """
         if "icolumns" in y_sparse:
-            return BaseCollectiveAnomalyDetector._sparse_to_dense_icolumns(
+            return BaseSegmentAnomalyDetector._sparse_to_dense_icolumns(
                 y_sparse, index, columns
             )
-        return BaseCollectiveAnomalyDetector._sparse_to_dense_ilocs(y_sparse, index)
+        return BaseSegmentAnomalyDetector._sparse_to_dense_ilocs(y_sparse, index)

     @staticmethod
     def dense_to_sparse(y_dense: pd.DataFrame) -> pd.DataFrame:
@@ -104,9 +104,9 @@ def dense_to_sparse(y_dense: pd.DataFrame) -> pd.DataFrame:
             `output["ilocs"].array.left` and `output["ilocs"].array.right`, respectively.
         """
         if "labels" in y_dense.columns:
-            return BaseCollectiveAnomalyDetector._dense_to_sparse_ilocs(y_dense)
+            return BaseSegmentAnomalyDetector._dense_to_sparse_ilocs(y_dense)
         elif y_dense.columns.str.startswith("labels_").all():
-            return BaseCollectiveAnomalyDetector._dense_to_sparse_icolumns(y_dense)
+            return BaseSegmentAnomalyDetector._dense_to_sparse_icolumns(y_dense)
         raise ValueError(
             "Invalid columns in `y_dense`. Expected 'labels' or 'labels_*'."
             f" Got: {y_dense.columns}"
@@ -114,19 +114,19 @@ def dense_to_sparse(y_dense: pd.DataFrame) -> pd.DataFrame:

     def _format_sparse_output(
         self,
-        collective_anomalies: Union[
+        segment_anomalies: Union[
             list[tuple[int, int]], list[tuple[int, int, np.ndarray]]
         ],
         closed: str = "left",
     ) -> pd.DataFrame:
-        """Format the sparse output of collective anomaly detectors.
+        """Format the sparse output of segment anomaly detectors.

         Can be reused by subclasses to format the output of the `_predict` method.

         Parameters
         ----------
-        collective_anomalies : list
-            List of tuples containing start and end indices of collective anomalies,
+        segment_anomalies : list
+            List of tuples containing start and end indices of segment anomalies,
             and optionally a np.array of the identified variables/components/columns.
         closed : str
             Whether the (start, end) tuples correspond to intervals that are closed
@@ -144,11 +144,11 @@ def _format_sparse_output(
         The start and end points of the intervals can be accessed by
         `output["ilocs"].array.left` and `output["ilocs"].array.right`, respectively.
         """
-        # Cannot extract this from collective_anomalies as it may be an empty list.
+        # Cannot extract this from segment_anomalies as it may be an empty list.
if self.capability_variable_identification: - return self._format_sparse_output_icolumns(collective_anomalies, closed) + return self._format_sparse_output_icolumns(segment_anomalies, closed) else: - return self._format_sparse_output_ilocs(collective_anomalies, closed) + return self._format_sparse_output_ilocs(segment_anomalies, closed) @staticmethod def _sparse_to_dense_ilocs( @@ -221,7 +221,7 @@ def _dense_to_sparse_ilocs(y_dense: pd.DataFrame) -> pd.DataFrame: anomaly_ends = np.insert(anomaly_ends, len(anomaly_ends), last_anomaly_end) anomaly_intervals = list(zip(anomaly_starts, anomaly_ends)) - return BaseCollectiveAnomalyDetector._format_sparse_output_ilocs( + return BaseSegmentAnomalyDetector._format_sparse_output_ilocs( anomaly_intervals, closed="left" ) @@ -229,14 +229,14 @@ def _dense_to_sparse_ilocs(y_dense: pd.DataFrame) -> pd.DataFrame: def _format_sparse_output_ilocs( anomaly_intervals: list[tuple[int, int]], closed: str = "left" ) -> pd.DataFrame: - """Format the sparse output of collective anomaly detectors. + """Format the sparse output of segment anomaly detectors. Can be reused by subclasses to format the output of the `_predict` method. Parameters ---------- anomaly_intervals : list - List of tuples containing start and end indices of collective anomalies. + List of tuples containing start and end indices of segment anomalies. Returns ------- @@ -337,23 +337,23 @@ def _dense_to_sparse_icolumns(y_dense: pd.DataFrame): anomaly_end = anomaly_mask.index[which_rows][-1] anomaly_intervals.append((anomaly_start, anomaly_end + 1, anomaly_columns)) - return BaseCollectiveAnomalyDetector._format_sparse_output_icolumns( + return BaseSegmentAnomalyDetector._format_sparse_output_icolumns( anomaly_intervals, closed="left" ) @staticmethod def _format_sparse_output_icolumns( - collective_anomalies: list[tuple[int, int, np.ndarray]], + segment_anomalies: list[tuple[int, int, np.ndarray]], closed: str = "left", ) -> pd.DataFrame: - """Format the sparse output of subset collective anomaly detectors. + """Format the sparse output of subset segment anomaly detectors. Can be reused by subclasses to format the output of the `_predict` method. Parameters ---------- - collective_anomalies : list - List of tuples containing start and end indices of collective + segment_anomalies : list + List of tuples containing start and end indices of segment anomalies and a np.array of the affected components/columns. closed : str Whether the (start, end) tuple correspond to intervals that are closed @@ -367,10 +367,10 @@ def _format_sparse_output_icolumns( * ``"labels"`` - integer labels 1, ..., K for each segment anomaly. * ``"icolumns"`` - list of affected columns for each anomaly. 
""" - ilocs = [(int(start), int(end)) for start, end, _ in collective_anomalies] + ilocs = [(int(start), int(end)) for start, end, _ in segment_anomalies] icolumns = [ np.array(components, dtype="int64") - for _, _, components in collective_anomalies + for _, _, components in segment_anomalies ] return pd.DataFrame( { diff --git a/skchange/anomaly_detectors/capa.py b/skchange/anomaly_detectors/capa.py index ff16049..cf8cc71 100644 --- a/skchange/anomaly_detectors/capa.py +++ b/skchange/anomaly_detectors/capa.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from skchange.anomaly_detectors.base import BaseCollectiveAnomalyDetector +from skchange.anomaly_detectors.base import BaseSegmentAnomalyDetector from skchange.anomaly_detectors.mvcapa import capa_penalty, run_base_capa from skchange.anomaly_scores import BaseSaving, L2Saving, to_saving from skchange.costs import BaseCost @@ -18,22 +18,22 @@ def run_capa( X: np.ndarray, - collective_saving: BaseSaving, + segment_saving: BaseSaving, point_saving: BaseSaving, - collective_alpha: float, + segment_alpha: float, point_alpha: float, min_segment_length: int, max_segment_length: int, ) -> tuple[np.ndarray, list[tuple[int, int]], list[tuple[int, int]]]: - collective_betas = np.zeros(1) + segment_betas = np.zeros(1) point_betas = np.zeros(1) - collective_saving.fit(X) + segment_saving.fit(X) point_saving.fit(X) return run_base_capa( - collective_saving, + segment_saving, point_saving, - collective_alpha, - collective_betas, + segment_alpha, + segment_betas, point_alpha, point_betas, min_segment_length, @@ -41,8 +41,8 @@ def run_capa( ) -class CAPA(BaseCollectiveAnomalyDetector): - """Collective and point anomaly detection. +class CAPA(BaseSegmentAnomalyDetector): + """The collective and point anomaly (CAPA) detection algorithm. An efficient implementation of the CAPA algorithm [1]_ for anomaly detection. It is implemented using the 'savings' formulation of the problem given in [2]_ and @@ -54,8 +54,8 @@ class CAPA(BaseCollectiveAnomalyDetector): Parameters ---------- - collective_saving : BaseSaving or BaseCost, optional, default=L2Saving() - The saving function to use for collective anomaly detection. + segment_saving : BaseSaving or BaseCost, optional, default=L2Saving() + The saving function to use for segment anomaly detection. If a `BaseCost` is given, the saving function is constructed from the cost. The cost must have a fixed parameter that represents the baseline cost. point_saving : BaseSaving or BaseCost, optional, default=L2Saving() @@ -63,8 +63,8 @@ class CAPA(BaseCollectiveAnomalyDetector): minimum size of 1 are permitted. If a `BaseCost` is given, the saving function is constructed from the cost. The cost must have a fixed parameter that represents the baseline cost. - collective_penalty_scale : float, optional, default=2.0 - Scaling factor for the collective penalty. + segment_penalty_scale : float, optional, default=2.0 + Scaling factor for the segment penalty. point_penalty_scale : float, optional, default=2.0 Scaling factor for the point penalty. min_segment_length : int, optional, default=2 @@ -73,8 +73,8 @@ class CAPA(BaseCollectiveAnomalyDetector): Maximum length of a segment. ignore_point_anomalies : bool, optional, default=False If True, detected point anomalies are not returned by `predict`. I.e., only - collective anomalies are returned. If False, point anomalies are included in the - output as collective anomalies of length 1. + segment anomalies are returned. 
If False, point anomalies are included in the + output as segment anomalies of length 1. See Also -------- @@ -114,34 +114,32 @@ class CAPA(BaseCollectiveAnomalyDetector): def __init__( self, - collective_saving: Optional[Union[BaseSaving, BaseCost]] = None, + segment_saving: Optional[Union[BaseSaving, BaseCost]] = None, point_saving: Optional[Union[BaseSaving, BaseCost]] = None, - collective_penalty_scale: float = 2.0, + segment_penalty_scale: float = 2.0, point_penalty_scale: float = 2.0, min_segment_length: int = 2, max_segment_length: int = 1000, ignore_point_anomalies: bool = False, ): - self.collective_saving = collective_saving + self.segment_saving = segment_saving self.point_saving = point_saving - self.collective_penalty_scale = collective_penalty_scale + self.segment_penalty_scale = segment_penalty_scale self.point_penalty_scale = point_penalty_scale self.min_segment_length = min_segment_length self.max_segment_length = max_segment_length self.ignore_point_anomalies = ignore_point_anomalies super().__init__() - _collective_saving = ( - L2Saving() if collective_saving is None else collective_saving - ) - self._collective_saving = to_saving(_collective_saving) + _segment_saving = L2Saving() if segment_saving is None else segment_saving + self._segment_saving = to_saving(_segment_saving) _point_saving = L2Saving() if point_saving is None else point_saving if _point_saving.min_size is not None and _point_saving.min_size > 1: raise ValueError("Point saving must have a minimum size of 1.") self._point_saving = to_saving(_point_saving) - check_larger_than(0, collective_penalty_scale, "collective_penalty_scale") + check_larger_than(0, segment_penalty_scale, "segment_penalty_scale") check_larger_than(0, point_penalty_scale, "point_penalty_scale") check_larger_than(2, min_segment_length, "min_segment_length") check_larger_than(min_segment_length, max_segment_length, "max_segment_length") @@ -152,10 +150,10 @@ def _get_penalty_components(self, X: pd.DataFrame) -> tuple[np.ndarray, float]: # return self._tune_threshold(X) n = X.shape[0] p = X.shape[1] - n_params = self._collective_saving.get_param_size(p) - collective_penalty = capa_penalty(n, n_params, self.collective_penalty_scale) + n_params = self._segment_saving.get_param_size(p) + segment_penalty = capa_penalty(n, n_params, self.segment_penalty_scale) point_penalty = self.point_penalty_scale * n_params * p * np.log(n) - return collective_penalty, point_penalty + return segment_penalty, point_penalty def _fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None): """Fit to training data. @@ -189,7 +187,7 @@ def _fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None): min_length=self.min_segment_length, min_length_name="min_segment_length", ) - self.collective_penalty_, self.point_penalty_ = self._get_penalty_components(X) + self.segment_penalty_, self.point_penalty_ = self._get_penalty_components(X) return self def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: @@ -203,7 +201,7 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: Returns ------- pd.Series[pd.Interval] - Containing the collective anomaly intervals. + Containing the segment anomaly intervals. 
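Since every `collective_*` keyword is renamed here, a configuration sketch may help when migrating call sites. The values mirror the renamed `get_test_params` entries below; the `L2Cost`-as-saving construction is the one described in the class docstring.

```python
# Sketch of the renamed CAPA keywords; values mirror get_test_params below.
from skchange.anomaly_detectors import CAPA
from skchange.costs import L2Cost

detector = CAPA(
    segment_saving=L2Cost(param=0.0),  # cost with a fixed baseline, used as a saving
    point_saving=L2Cost(param=0.0),
    segment_penalty_scale=2.0,  # was collective_penalty_scale
    min_segment_length=5,
    max_segment_length=100,
)
```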
        Notes
        -----
@@ -215,18 +213,18 @@
            min_length=self.min_segment_length,
            min_length_name="min_segment_length",
        )
-        opt_savings, collective_anomalies, point_anomalies = run_capa(
+        opt_savings, segment_anomalies, point_anomalies = run_capa(
            X.values,
-            self._collective_saving,
+            self._segment_saving,
            self._point_saving,
-            self.collective_penalty_,
+            self.segment_penalty_,
            self.point_penalty_,
            self.min_segment_length,
            self.max_segment_length,
        )
        self.scores = pd.Series(opt_savings, index=X.index, name="score")

-        anomalies = collective_anomalies
+        anomalies = segment_anomalies
        if not self.ignore_point_anomalies:
            anomalies += point_anomalies
        anomalies = sorted(anomalies)
@@ -275,13 +273,13 @@ def get_test_params(cls, parameter_set="default"):
        params = [
            {
-                "collective_saving": L2Cost(param=0.0),
+                "segment_saving": L2Cost(param=0.0),
                "point_saving": L2Cost(param=0.0),
                "min_segment_length": 5,
                "max_segment_length": 100,
            },
            {
-                "collective_saving": L2Cost(param=0.0),
+                "segment_saving": L2Cost(param=0.0),
                "point_saving": L2Cost(param=0.0),
                "min_segment_length": 2,
                "max_segment_length": 20,
diff --git a/skchange/anomaly_detectors/circular_binseg.py b/skchange/anomaly_detectors/circular_binseg.py
index 89ef3a0..7aa68a7 100644
--- a/skchange/anomaly_detectors/circular_binseg.py
+++ b/skchange/anomaly_detectors/circular_binseg.py
@@ -8,7 +8,7 @@
 import numpy as np
 import pandas as pd

-from skchange.anomaly_detectors.base import BaseCollectiveAnomalyDetector
+from skchange.anomaly_detectors.base import BaseSegmentAnomalyDetector
 from skchange.anomaly_scores import BaseLocalAnomalyScore, to_local_anomaly_score
 from skchange.change_detectors.seeded_binseg import make_seeded_intervals
 from skchange.costs import BaseCost, L2Cost
@@ -102,8 +102,8 @@ def run_circular_binseg(
     return anomalies, anomaly_scores, maximizers, starts, ends


-class CircularBinarySegmentation(BaseCollectiveAnomalyDetector):
-    """Circular binary segmentation algorithm for multiple collective anomaly detection.
+class CircularBinarySegmentation(BaseSegmentAnomalyDetector):
+    """Circular binary segmentation algorithm for multiple segment anomaly detection.

     Binary segmentation type changepoint detection algorithms recursively split the data
     into two segments, and test whether the two segments are different. Circular binary
@@ -111,7 +111,7 @@ class CircularBinarySegmentation(BaseCollectiveAnomalyDetector):
     (anomaly score) is applied to compare the data behaviour of an inner interval
     subset with the surrounding data contained in an outer interval.
     In other words, the null hypothesis within each outer interval is that the data
-    is stationary, while the alternative hypothesis is that there is a collective
+    is stationary, while the alternative hypothesis is that there is a segment
     anomaly within the outer interval.

     Efficiently implemented using numba.
@@ -303,7 +303,7 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series:

         Returns
         -------
-        pd.Series[pd.Interval] containing the collective anomaly intervals.
+        pd.Series[pd.Interval] containing the segment anomaly intervals.
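The same sparse `predict` format applies to `CircularBinarySegmentation`. A default-parameter sketch; only the class name, the base `fit_predict` API and the `"ilocs"` output column are taken from this diff, and default-constructibility is assumed as for the other detectors.

```python
# Default-parameter sketch; output columns follow the base class format.
from skchange.anomaly_detectors import CircularBinarySegmentation
from skchange.datasets.generate import generate_alternating_data

df = generate_alternating_data(
    n_segments=5, segment_length=20, p=1, mean=10, random_state=1
)
detector = CircularBinarySegmentation()
anomalies = detector.fit_predict(df)
print(anomalies["ilocs"])  # IntervalIndex of detected segment anomalies
```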
Notes ----- diff --git a/skchange/anomaly_detectors/mvcapa.py b/skchange/anomaly_detectors/mvcapa.py index 15442c3..9d1c356 100644 --- a/skchange/anomaly_detectors/mvcapa.py +++ b/skchange/anomaly_detectors/mvcapa.py @@ -9,7 +9,7 @@ import pandas as pd from scipy.stats import chi2 -from skchange.anomaly_detectors.base import BaseCollectiveAnomalyDetector +from skchange.anomaly_detectors.base import BaseSegmentAnomalyDetector from skchange.anomaly_scores import BaseSaving, L2Saving, to_saving from skchange.costs import BaseCost from skchange.utils.numba import njit @@ -214,19 +214,19 @@ def capa_penalty_factory(penalty: Union[str, Callable] = "combined") -> Callable def get_anomalies( anomaly_starts: np.ndarray, ) -> tuple[list[tuple[int, int]], list[tuple[int, int]]]: - collective_anomalies = [] + segment_anomalies = [] point_anomalies = [] i = anomaly_starts.size - 1 while i >= 0: start_i = anomaly_starts[i] size = i - start_i + 1 if size > 1: - collective_anomalies.append((int(start_i), i + 1)) + segment_anomalies.append((int(start_i), i + 1)) i = int(start_i) elif size == 1: point_anomalies.append((i, i)) i -= 1 - return collective_anomalies, point_anomalies + return segment_anomalies, point_anomalies @njit @@ -283,19 +283,19 @@ def optimise_savings( def run_base_capa( - collective_saving: BaseSaving, + segment_saving: BaseSaving, point_saving: BaseSaving, - collective_alpha: float, - collective_betas: np.ndarray, + segment_alpha: float, + segment_betas: np.ndarray, point_alpha: float, point_betas: np.ndarray, min_segment_length: int, max_segment_length: int, ) -> tuple[np.ndarray, list[tuple[int, int]], list[tuple[int, int]]]: - collective_saving.check_is_fitted() + segment_saving.check_is_fitted() point_saving.check_is_fitted() - n = collective_saving._X.shape[0] + n = segment_saving._X.shape[0] opt_savings = np.zeros(n + 1) # Store the optimal start and affected components of an anomaly for each t. # Used to get the final set of anomalies after the loop. 
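An aside on the penalty-factory contract documented above and consumed by `run_mvcapa` below: a penalty name maps to a function of `(n, p, n_params, scale)` that returns a global penalty and per-component penalties. A sketch with illustrative values, assuming the module path from this diff:

```python
# The factory maps a penalty name to a function returning a global penalty
# (alpha) and per-component penalties (betas). Argument values are illustrative.
from skchange.anomaly_detectors.mvcapa import capa_penalty_factory

penalty_func = capa_penalty_factory("sparse")  # or "dense", "intermediate", "combined"
alpha, betas = penalty_func(1000, 10, 1, scale=2.0)  # n, p, n_params, scale
```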
@@ -304,13 +304,13 @@ def run_base_capa( ts = np.arange(min_segment_length - 1, n) for t in ts: - # Collective anomalies + # Segment anomalies t_array = np.array([t]) starts = np.concatenate((starts, t_array - min_segment_length + 1)) ends = np.repeat(t + 1, len(starts)) - collective_savings = collective_saving.evaluate(np.column_stack((starts, ends))) - opt_collective_saving, opt_start, candidate_savings = optimise_savings( - starts, opt_savings, collective_savings, collective_alpha, collective_betas + segment_savings = segment_saving.evaluate(np.column_stack((starts, ends))) + opt_segment_saving, opt_start, candidate_savings = optimise_savings( + starts, opt_savings, segment_savings, segment_alpha, segment_betas ) # Point anomalies @@ -320,7 +320,7 @@ def run_base_capa( ) # Combine and store results - savings = np.array([opt_savings[t], opt_collective_saving, opt_point_saving]) + savings = np.array([opt_savings[t], opt_segment_saving, opt_point_saving]) argmax = np.argmax(savings) opt_savings[t + 1] = savings[argmax] if argmax == 1: @@ -329,22 +329,22 @@ def run_base_capa( opt_anomaly_starts[t] = t # Pruning the admissible starts - penalty_sum = collective_alpha + collective_betas.sum() + penalty_sum = segment_alpha + segment_betas.sum() saving_too_low = candidate_savings + penalty_sum < opt_savings[t + 1] too_long_segment = starts < t - max_segment_length + 2 prune = saving_too_low | too_long_segment starts = starts[~prune] - collective_anomalies, point_anomalies = get_anomalies(opt_anomaly_starts) - return opt_savings[1:], collective_anomalies, point_anomalies + segment_anomalies, point_anomalies = get_anomalies(opt_anomaly_starts) + return opt_savings[1:], segment_anomalies, point_anomalies def run_mvcapa( X: np.ndarray, - collective_saving: BaseSaving, + segment_saving: BaseSaving, point_saving: BaseSaving, - collective_penalty: str, - collective_penalty_scale: float, + segment_penalty: str, + segment_penalty_scale: float, point_penalty: str, point_penalty_scale: float, min_segment_length: int, @@ -354,23 +354,23 @@ def run_mvcapa( ]: n = X.shape[0] p = X.shape[1] - collective_n_params_per_variable = collective_saving.get_param_size(1) - collective_penalty_func = capa_penalty_factory(collective_penalty) - collective_alpha, collective_betas = collective_penalty_func( - n, p, collective_n_params_per_variable, scale=collective_penalty_scale + segment_n_params_per_variable = segment_saving.get_param_size(1) + segment_penalty_func = capa_penalty_factory(segment_penalty) + segment_alpha, segment_betas = segment_penalty_func( + n, p, segment_n_params_per_variable, scale=segment_penalty_scale ) point_penalty_func = capa_penalty_factory(point_penalty) point_n_params_per_variable = point_saving.get_param_size(1) point_alpha, point_betas = point_penalty_func( n, p, point_n_params_per_variable, scale=point_penalty_scale ) - collective_saving.fit(X) + segment_saving.fit(X) point_saving.fit(X) - opt_savings, collective_anomalies, point_anomalies = run_base_capa( - collective_saving, + opt_savings, segment_anomalies, point_anomalies = run_base_capa( + segment_saving, point_saving, - collective_alpha, - collective_betas, + segment_alpha, + segment_betas, point_alpha, point_betas, min_segment_length, @@ -379,29 +379,29 @@ def run_mvcapa( sparse_penalty_func = capa_penalty_factory("sparse") sparse_alpha, sparse_betas = sparse_penalty_func( - n, p, collective_n_params_per_variable, scale=collective_penalty_scale + n, p, segment_n_params_per_variable, scale=segment_penalty_scale ) - collective_anomalies 
= find_affected_components(
-        collective_saving,
-        collective_anomalies,
+    segment_anomalies = find_affected_components(
+        segment_saving,
+        segment_anomalies,
         sparse_alpha,
         sparse_betas,
     )
     point_anomalies = find_affected_components(
         point_saving, point_anomalies, point_alpha, point_betas
     )
-    return opt_savings, collective_anomalies, point_anomalies
+    return opt_savings, segment_anomalies, point_anomalies


-class MVCAPA(BaseCollectiveAnomalyDetector):
+class MVCAPA(BaseSegmentAnomalyDetector):
     """Subset multivariate collective and point anomaly detection.

     An efficient implementation of the MVCAPA algorithm [1]_ for anomaly detection.

     Parameters
     ----------
-    collective_saving : BaseSaving or BaseCost, optional, default=L2Saving()
-        The saving function to use for collective anomaly detection.
+    segment_saving : BaseSaving or BaseCost, optional, default=L2Saving()
+        The saving function to use for segment anomaly detection.
         Only univariate savings are permitted (see the `evaluation_type` attribute).
         If a `BaseCost` is given, the saving function is constructed from the cost.
         The cost must have a fixed parameter that represents the baseline cost.
@@ -410,15 +410,15 @@ class MVCAPA(BaseCollectiveAnomalyDetector):
         minimum size of 1 are permitted. If a `BaseCost` is given, the saving function
         is constructed from the cost. The cost must have a fixed parameter that
         represents the baseline cost.
-    collective_penalty : str or Callable, optional, default="combined"
-        Penalty function to use for collective anomalies. If a string, must be one of
+    segment_penalty : str or Callable, optional, default="combined"
+        Penalty function to use for segment anomalies. If a string, must be one of
         "dense", "sparse", "intermediate" or "combined". If a Callable, must be a
         function returning a penalty and per-component penalties, given n, p, n_params
         and scale.
-    collective_penalty_scale : float, optional, default=1.0
-        Scaling factor for the collective penalty.
+    segment_penalty_scale : float, optional, default=1.0
+        Scaling factor for the segment penalty.
     point_penalty : str or Callable, optional, default="sparse"
-        Penalty function to use for point anomalies. See `collective_penalty`.
+        Penalty function to use for point anomalies. See `segment_penalty`.
     point_penalty_scale : float, optional, default=1.0
         Scaling factor for the point penalty.
     min_segment_length : int, optional, default=2
@@ -427,12 +427,12 @@
         Maximum length of a segment.
     ignore_point_anomalies : bool, optional, default=False
         If True, detected point anomalies are not returned by `predict`. I.e., only
-        collective anomalies are returned.
+        segment anomalies are returned.

     References
     ----------
     .. [1] Fisch, A. T., Eckley, I. A., & Fearnhead, P. (2022). Subset multivariate
        collective and point anomaly detection. Journal of Computational and Graphical
        Statistics, 31(2), 574-585.
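Ahead of the docstring's own Examples section, a migration-oriented sketch of the renamed penalty arguments, mirroring `interactive/explore_capa.py`; the data-generation values are taken from that script.

```python
# Sketch of the renamed MVCAPA keywords, mirroring interactive/explore_capa.py.
from skchange.anomaly_detectors import MVCAPA
from skchange.datasets.generate import generate_alternating_data

df = generate_alternating_data(
    5, 10, p=10, mean=10, affected_proportion=0.2, random_state=2
)
detector = MVCAPA(segment_penalty="sparse")  # was collective_penalty="sparse"
anomalies = detector.fit_predict(df)  # includes "icolumns" with affected variables
print(anomalies)
```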
Examples @@ -466,20 +466,20 @@ class MVCAPA(BaseCollectiveAnomalyDetector): def __init__( self, - collective_saving: Optional[Union[BaseSaving, BaseCost]] = None, + segment_saving: Optional[Union[BaseSaving, BaseCost]] = None, point_saving: Optional[Union[BaseSaving, BaseCost]] = None, - collective_penalty: Union[str, Callable] = "combined", - collective_penalty_scale: float = 2.0, + segment_penalty: Union[str, Callable] = "combined", + segment_penalty_scale: float = 2.0, point_penalty: Union[str, Callable] = "sparse", point_penalty_scale: float = 2.0, min_segment_length: int = 2, max_segment_length: int = 1000, ignore_point_anomalies: bool = False, ): - self.collective_saving = collective_saving + self.segment_saving = segment_saving self.point_saving = point_saving - self.collective_penalty = collective_penalty - self.collective_penalty_scale = collective_penalty_scale + self.segment_penalty = segment_penalty + self.segment_penalty_scale = segment_penalty_scale self.point_penalty = point_penalty self.point_penalty_scale = point_penalty_scale self.min_segment_length = min_segment_length @@ -487,19 +487,17 @@ def __init__( self.ignore_point_anomalies = ignore_point_anomalies super().__init__() - _collective_saving = ( - L2Saving() if collective_saving is None else collective_saving - ) - if _collective_saving.evaluation_type == "multivariate": - raise ValueError("Collective saving must be univariate.") - self._collective_saving = to_saving(_collective_saving) + _segment_saving = L2Saving() if segment_saving is None else segment_saving + if _segment_saving.evaluation_type == "multivariate": + raise ValueError("Segment saving must be univariate.") + self._segment_saving = to_saving(_segment_saving) _point_saving = L2Saving() if point_saving is None else point_saving if _point_saving.min_size != 1: raise ValueError("Point saving must have a minimum size of 1.") self._point_saving = to_saving(_point_saving) - check_larger_than(0, collective_penalty_scale, "collective_penalty_scale") + check_larger_than(0, segment_penalty_scale, "segment_penalty_scale") check_larger_than(0, point_penalty_scale, "point_penalty_scale") check_larger_than(2, min_segment_length, "min_segment_length") check_larger_than(min_segment_length, max_segment_length, "max_segment_length") @@ -557,12 +555,12 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: min_length=self.min_segment_length, min_length_name="min_segment_length", ) - opt_savings, collective_anomalies, point_anomalies = run_mvcapa( + opt_savings, segment_anomalies, point_anomalies = run_mvcapa( X.values, - self._collective_saving, + self._segment_saving, self._point_saving, - self.collective_penalty, - self.collective_penalty_scale, + self.segment_penalty, + self.segment_penalty_scale, self.point_penalty, self.point_penalty_scale, self.min_segment_length, @@ -570,7 +568,7 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: ) self.scores = pd.Series(opt_savings, index=X.index, name="score") - anomalies = collective_anomalies + anomalies = segment_anomalies if not self.ignore_point_anomalies: anomalies += point_anomalies anomalies = sorted(anomalies) @@ -619,13 +617,13 @@ def get_test_params(cls, parameter_set="default"): params = [ { - "collective_saving": L2Cost(param=0.0), + "segment_saving": L2Cost(param=0.0), "point_saving": L2Cost(param=0.0), "min_segment_length": 5, "max_segment_length": 100, }, { - "collective_saving": L2Cost(param=0.0), + "segment_saving": L2Cost(param=0.0), "point_saving": L2Cost(param=0.0), 
"min_segment_length": 2, "max_segment_length": 20, diff --git a/skchange/anomaly_detectors/tests/test_anomaly_detectors.py b/skchange/anomaly_detectors/tests/test_anomaly_detectors.py index 2c59a95..619db49 100644 --- a/skchange/anomaly_detectors/tests/test_anomaly_detectors.py +++ b/skchange/anomaly_detectors/tests/test_anomaly_detectors.py @@ -4,7 +4,7 @@ import pytest from skchange.anomaly_detectors import COLLECTIVE_ANOMALY_DETECTORS -from skchange.anomaly_detectors.base import BaseCollectiveAnomalyDetector +from skchange.anomaly_detectors.base import BaseSegmentAnomalyDetector from skchange.datasets.generate import generate_anomalous_data true_anomalies = [(30, 35), (70, 75)] @@ -15,8 +15,8 @@ @pytest.mark.parametrize("Estimator", COLLECTIVE_ANOMALY_DETECTORS) -def test_collective_anomaly_detector_predict(Estimator: BaseCollectiveAnomalyDetector): - """Test collective anomaly detector's predict method (sparse output).""" +def test_segment_anomaly_detector_predict(Estimator: BaseSegmentAnomalyDetector): + """Test segment anomaly detector's predict method (sparse output).""" detector = Estimator.create_test_instance() detector.fit(anomaly_free_data) anomalies = detector.predict(anomaly_data)["ilocs"] @@ -27,18 +27,18 @@ def test_collective_anomaly_detector_predict(Estimator: BaseCollectiveAnomalyDet @pytest.mark.parametrize("Estimator", COLLECTIVE_ANOMALY_DETECTORS) -def test_collective_anomaly_detector_transform( - Estimator: BaseCollectiveAnomalyDetector, +def test_segment_anomaly_detector_transform( + Estimator: BaseSegmentAnomalyDetector, ): - """Test collective anomaly detector's transform method (dense output).""" + """Test segment anomaly detector's transform method (dense output).""" detector = Estimator.create_test_instance() detector.fit(anomaly_free_data) labels = detector.transform(anomaly_data) - true_collective_anomalies = pd.DataFrame( + true_segment_anomalies = pd.DataFrame( {"ilocs": pd.IntervalIndex.from_tuples(true_anomalies, closed="left")} ) - true_anomaly_labels = BaseCollectiveAnomalyDetector.sparse_to_dense( - true_collective_anomalies, anomaly_data.index + true_anomaly_labels = BaseSegmentAnomalyDetector.sparse_to_dense( + true_segment_anomalies, anomaly_data.index ) labels.equals(true_anomaly_labels) @@ -53,4 +53,4 @@ def test_dense_to_sparse_invalid_columns(): """Test dense_to_sparse method with invalid DataFrame input columns.""" invalid_df = pd.DataFrame({"invalid_column": [0, 1, 0, 1]}) with pytest.raises(ValueError): - BaseCollectiveAnomalyDetector.dense_to_sparse(invalid_df) + BaseSegmentAnomalyDetector.dense_to_sparse(invalid_df) diff --git a/skchange/anomaly_detectors/tests/test_capa.py b/skchange/anomaly_detectors/tests/test_capa.py index bb5081f..7fd90e1 100644 --- a/skchange/anomaly_detectors/tests/test_capa.py +++ b/skchange/anomaly_detectors/tests/test_capa.py @@ -46,8 +46,8 @@ def test_capa_anomalies(Detector, Saving): random_state=8, ) detector = Detector( - collective_saving=saving, - collective_penalty_scale=2.0, + segment_saving=saving, + segment_penalty_scale=2.0, min_segment_length=p + 1, ignore_point_anomalies=True, # To get test coverage. 
    )
@@ -68,9 +68,9 @@ def test_mvcapa_errors():
     cost = MultivariateGaussianCost([0.0, cov_mat])
     saving = Saving(cost)

-    # Test collective saving must be univariate
+    # Test segment saving must be univariate
     with pytest.raises(ValueError):
-        MVCAPA(collective_saving=saving)
+        MVCAPA(segment_saving=saving)

     # Test point saving must have a minimum size of 1
     with pytest.raises(ValueError):
diff --git a/skchange/anomaly_scores/from_cost.py b/skchange/anomaly_scores/from_cost.py
index 4134381..9f2707a 100644
--- a/skchange/anomaly_scores/from_cost.py
+++ b/skchange/anomaly_scores/from_cost.py
@@ -185,7 +185,7 @@ class LocalAnomalyScore(BaseLocalAnomalyScore):
     Local anomaly scores compare the data behaviour of an inner interval with the
     surrounding data contained in an outer interval. In other words, the null
     hypothesis within each outer interval is that the data is stationary, while the
-    alternative hypothesis is that there is a collective anomaly within the
+    alternative hypothesis is that there is a segment anomaly within the
     outer interval.

     Parameters
diff --git a/skchange/base/base_detector.py b/skchange/base/base_detector.py
index 3998139..cf035d3 100644
--- a/skchange/base/base_detector.py
+++ b/skchange/base/base_detector.py
@@ -11,7 +11,7 @@ class name: BaseDetector
     detection scores, dense - transform_scores(self, X) [optional]
     updating (temporal)     - update(self, X, y=None) [optional]

-Each detector type (e.g. point anomaly detector, collective anomaly detector,
+Each detector type (e.g. point anomaly detector, segment anomaly detector,
 changepoint detector) is a subclass of BaseDetector (task tag in sktime).
 A detector type is defined by the content and format of the output of the predict
 method. Each detector type therefore has the following methods for converting between
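A sketch of that sparse/dense conversion, using the same static calls as the renamed tests above; the 100-point range index is illustrative.

```python
# Round trip between the sparse predict format and the dense transform format.
import pandas as pd

from skchange.anomaly_detectors.base import BaseSegmentAnomalyDetector

sparse = pd.DataFrame(
    {"ilocs": pd.IntervalIndex.from_tuples([(30, 35), (70, 75)], closed="left")}
)
dense = BaseSegmentAnomalyDetector.sparse_to_dense(sparse, pd.RangeIndex(100))
print(dense["labels"].value_counts())  # 0 is reserved for the normal instances

round_trip = BaseSegmentAnomalyDetector.dense_to_sparse(dense)
print(round_trip["ilocs"])
```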