Commit
Merge pull request #51 from NorskRegnesentral/collective-to-segment-anomaly

[MNT] Use "segment anomalies" rather than "collective anomalies"
Tveten authored Dec 11, 2024
2 parents 8f08b83 + def6b0d commit b1d0826
Showing 14 changed files with 158 additions and 164 deletions.
6 changes: 3 additions & 3 deletions docs/source/api_reference/anomaly_detectors.rst
@@ -11,9 +11,9 @@ Base
    :toctree: auto_generated/
    :template: class.rst

-    BaseCollectiveAnomalyDetector
+    BaseSegmentAnomalyDetector

-Collective anomaly detectors
+Segment anomaly detectors
 ----------------------------
 .. currentmodule:: skchange.anomaly_detectors

@@ -25,7 +25,7 @@ Collective anomaly detectors
     CircularBinarySegmentation
     StatThresholdAnomaliser

-Collective anomaly detectors with variable identification
+Segment anomaly detectors with variable identification
 ---------------------------------------------------------
 .. currentmodule:: skchange.anomaly_detectors
6 changes: 3 additions & 3 deletions docs/source/index.rst
@@ -4,7 +4,7 @@
 Welcome to skchange
 ===================

-A python library for fast collective anomaly and changepoint detection.
+A python library for fast change point and segment anomaly detection.
 The library is designed to be compatible with `sktime <https://www.sktime.net>`_.
 `Numba <https://numba.readthedocs.io>`_ is used for computational speed.

@@ -34,8 +34,8 @@ Key features
 - **Fast**: `Numba <https://numba.readthedocs.io>`_ is used for performance.
 - **Easy to use**: Follows the conventions of `sktime <https://www.sktime.net>`_ and `scikit-learn <https://scikit-learn.org>`_.
 - **Easy to extend**: Make your own detectors by inheriting from the base class templates. Create custom detection scores and cost functions.
-- **Collective anomaly detection**: Detect intervals of anomalous behaviour in time series data.
-- **Subset collective anomaly detection**: Detect intervals of anomalous behaviour in time series data, and infer the subset of variables that are responsible for the anomaly.
+- **Segment anomaly detection**: Detect intervals of anomalous behaviour in time series data.
+- **Subset anomaly detection**: Detect intervals of anomalous behaviour in time series data, and infer the subset of variables that are responsible for the anomaly.

 Mission
 -------
4 changes: 2 additions & 2 deletions interactive/compare_detector_outputs.py
@@ -29,14 +29,14 @@
 print(changepoints)
 print(changepoint_labels)

-# Collective anomaly detector
+# Segment anomaly detector
 anomaly_detector = CAPA()
 anomalies = anomaly_detector.fit_predict(df)
 anomaly_labels = anomaly_detector.transform(df)
 print(anomalies)
 print(anomaly_labels)

-# Subset collective anomaly detector
+# Subset segment anomaly detector
 subset_anomaly_detector = MVCAPA()
 subset_anomalies = subset_anomaly_detector.fit_predict(df)
 subset_anomaly_labels = subset_anomaly_detector.transform(df)
10 changes: 4 additions & 6 deletions interactive/explore_capa.py
@@ -27,7 +27,7 @@
 df = generate_alternating_data(
     5, 10, p=10, mean=10, affected_proportion=0.2, random_state=2
 )
-detector = MVCAPA(collective_penalty="sparse")
+detector = MVCAPA(segment_penalty="sparse")

 anomalies = detector.fit_predict(df)
 print(anomalies)
@@ -56,13 +56,11 @@
 # Profiling
 n = int(1e5)
 df = generate_alternating_data(n_segments=1, mean=0, segment_length=n, p=1)
-detector = CAPA(
-    max_segment_length=100, collective_penalty_scale=5, point_penalty_scale=5
-)
+detector = CAPA(max_segment_length=100, segment_penalty_scale=5, point_penalty_scale=5)
 detector = MVCAPA(
     max_segment_length=1000,
-    collective_penalty="sparse",
-    collective_penalty_scale=5,
+    segment_penalty="sparse",
+    segment_penalty_scale=5,
     point_penalty_scale=5,
 )
 profiler = Profiler().start()
4 changes: 2 additions & 2 deletions skchange/anomaly_detectors/__init__.py
@@ -1,13 +1,13 @@
 """Anomaly detection algorithms."""

 from skchange.anomaly_detectors.anomalisers import StatThresholdAnomaliser
-from skchange.anomaly_detectors.base import BaseCollectiveAnomalyDetector
+from skchange.anomaly_detectors.base import BaseSegmentAnomalyDetector
 from skchange.anomaly_detectors.capa import CAPA
 from skchange.anomaly_detectors.circular_binseg import CircularBinarySegmentation
 from skchange.anomaly_detectors.mvcapa import MVCAPA

 BASE_ANOMALY_DETECTORS = [
-    BaseCollectiveAnomalyDetector,
+    BaseSegmentAnomalyDetector,
 ]
 COLLECTIVE_ANOMALY_DETECTORS = [
     CAPA,
4 changes: 2 additions & 2 deletions skchange/anomaly_detectors/anomalisers.py
@@ -5,11 +5,11 @@
 import numpy as np
 import pandas as pd

-from skchange.anomaly_detectors.base import BaseCollectiveAnomalyDetector
+from skchange.anomaly_detectors.base import BaseSegmentAnomalyDetector
 from skchange.change_detectors.base import BaseChangeDetector


-class StatThresholdAnomaliser(BaseCollectiveAnomalyDetector):
+class StatThresholdAnomaliser(BaseSegmentAnomalyDetector):
     """Anomaly detection based on thresholding the values of segment statistics.

     Parameters
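The idea behind `StatThresholdAnomaliser` can be sketched in a few lines of plain numpy: compute a statistic per segment and flag the segments whose statistic exceeds a threshold. This is an illustrative toy (the function name and signature are hypothetical), not the skchange implementation:

```python
import numpy as np


def stat_threshold_anomalies(x, segment_ends, stat=np.mean, threshold=2.0):
    """Flag segments whose statistic exceeds a threshold.

    Hypothetical illustration of the thresholding idea, not skchange code.
    Segments are half-open intervals [start, end) ending at `segment_ends`.
    """
    starts = [0] + list(segment_ends[:-1])
    anomalies = []
    for start, end in zip(starts, segment_ends):
        if stat(x[start:end]) > threshold:
            anomalies.append((start, end))
    return anomalies


# A flat series with one shifted segment in the middle.
x = np.concatenate([np.zeros(5), np.full(5, 10.0), np.zeros(5)])
print(stat_threshold_anomalies(x, [5, 10, 15]))  # [(5, 10)]
```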
50 changes: 25 additions & 25 deletions skchange/anomaly_detectors/base.py
@@ -1,7 +1,7 @@
 """Base classes for anomaly detectors.

     classes:
-        BaseCollectiveAnomalyDetector
+        BaseSegmentAnomalyDetector

 By inheriting from these classes the remaining methods of the BaseDetector class to
 implement to obtain a fully functional anomaly detector are given below.
@@ -23,10 +23,10 @@
 from skchange.base import BaseDetector


-class BaseCollectiveAnomalyDetector(BaseDetector):
-    """Base class for collective anomaly detectors.
+class BaseSegmentAnomalyDetector(BaseDetector):
+    """Base class for segment anomaly detectors.

-    Collective anomaly detectors detect segments of data points that are considered
+    Segment anomaly detectors detect segments of data points that are considered
     anomalous.

     Output format of the `predict` method: See the `dense_to_sparse` method.
@@ -68,10 +68,10 @@ def sparse_to_dense(
         0 is reserved for the normal instances.
         """
         if "icolumns" in y_sparse:
-            return BaseCollectiveAnomalyDetector._sparse_to_dense_icolumns(
+            return BaseSegmentAnomalyDetector._sparse_to_dense_icolumns(
                 y_sparse, index, columns
             )
-        return BaseCollectiveAnomalyDetector._sparse_to_dense_ilocs(y_sparse, index)
+        return BaseSegmentAnomalyDetector._sparse_to_dense_ilocs(y_sparse, index)

     @staticmethod
     def dense_to_sparse(y_dense: pd.DataFrame) -> pd.DataFrame:
@@ -104,29 +104,29 @@ def dense_to_sparse(y_dense: pd.DataFrame) -> pd.DataFrame:
         `output["ilocs"].array.left` and `output["ilocs"].array.right`, respectively.
         """
         if "labels" in y_dense.columns:
-            return BaseCollectiveAnomalyDetector._dense_to_sparse_ilocs(y_dense)
+            return BaseSegmentAnomalyDetector._dense_to_sparse_ilocs(y_dense)
         elif y_dense.columns.str.startswith("labels_").all():
-            return BaseCollectiveAnomalyDetector._dense_to_sparse_icolumns(y_dense)
+            return BaseSegmentAnomalyDetector._dense_to_sparse_icolumns(y_dense)
         raise ValueError(
             "Invalid columns in `y_dense`. Expected 'labels' or 'labels_*'."
             f" Got: {y_dense.columns}"
         )

     def _format_sparse_output(
         self,
-        collective_anomalies: Union[
+        segment_anomalies: Union[
             list[tuple[int, int]], list[tuple[int, int, np.ndarray]]
         ],
         closed: str = "left",
     ) -> pd.DataFrame:
-        """Format the sparse output of collective anomaly detectors.
+        """Format the sparse output of segment anomaly detectors.

         Can be reused by subclasses to format the output of the `_predict` method.

         Parameters
         ----------
-        collective_anomalies : list
-            List of tuples containing start and end indices of collective anomalies,
+        segment_anomalies : list
+            List of tuples containing start and end indices of segment anomalies,
             and optionally a np.array of the identified variables/components/columns.
         closed : str
             Whether the (start, end) tuple correspond to intervals that are closed
@@ -144,11 +144,11 @@ def _format_sparse_output(
         The start and end points of the intervals can be accessed by
         `output["ilocs"].array.left` and `output["ilocs"].array.right`, respectively.
         """
-        # Cannot extract this from collective_anomalies as it may be an empty list.
+        # Cannot extract this from segment_anomalies as it may be an empty list.
         if self.capability_variable_identification:
-            return self._format_sparse_output_icolumns(collective_anomalies, closed)
+            return self._format_sparse_output_icolumns(segment_anomalies, closed)
         else:
-            return self._format_sparse_output_ilocs(collective_anomalies, closed)
+            return self._format_sparse_output_ilocs(segment_anomalies, closed)

     @staticmethod
     def _sparse_to_dense_ilocs(
@@ -221,22 +221,22 @@ def _dense_to_sparse_ilocs(y_dense: pd.DataFrame) -> pd.DataFrame:
         anomaly_ends = np.insert(anomaly_ends, len(anomaly_ends), last_anomaly_end)

         anomaly_intervals = list(zip(anomaly_starts, anomaly_ends))
-        return BaseCollectiveAnomalyDetector._format_sparse_output_ilocs(
+        return BaseSegmentAnomalyDetector._format_sparse_output_ilocs(
             anomaly_intervals, closed="left"
         )

     @staticmethod
     def _format_sparse_output_ilocs(
         anomaly_intervals: list[tuple[int, int]], closed: str = "left"
     ) -> pd.DataFrame:
-        """Format the sparse output of collective anomaly detectors.
+        """Format the sparse output of segment anomaly detectors.

         Can be reused by subclasses to format the output of the `_predict` method.

         Parameters
         ----------
         anomaly_intervals : list
-            List of tuples containing start and end indices of collective anomalies.
+            List of tuples containing start and end indices of segment anomalies.

         Returns
         -------
@@ -337,23 +337,23 @@ def _dense_to_sparse_icolumns(y_dense: pd.DataFrame):
             anomaly_end = anomaly_mask.index[which_rows][-1]
             anomaly_intervals.append((anomaly_start, anomaly_end + 1, anomaly_columns))

-        return BaseCollectiveAnomalyDetector._format_sparse_output_icolumns(
+        return BaseSegmentAnomalyDetector._format_sparse_output_icolumns(
             anomaly_intervals, closed="left"
         )

     @staticmethod
     def _format_sparse_output_icolumns(
-        collective_anomalies: list[tuple[int, int, np.ndarray]],
+        segment_anomalies: list[tuple[int, int, np.ndarray]],
         closed: str = "left",
     ) -> pd.DataFrame:
-        """Format the sparse output of subset collective anomaly detectors.
+        """Format the sparse output of subset segment anomaly detectors.

         Can be reused by subclasses to format the output of the `_predict` method.

         Parameters
         ----------
-        collective_anomalies : list
-            List of tuples containing start and end indices of collective
+        segment_anomalies : list
+            List of tuples containing start and end indices of segment
             anomalies and a np.array of the affected components/columns.
         closed : str
             Whether the (start, end) tuple correspond to intervals that are closed
@@ -367,10 +367,10 @@ def _format_sparse_output_icolumns(
         * ``"labels"`` - integer labels 1, ..., K for each segment anomaly.
         * ``"icolumns"`` - list of affected columns for each anomaly.
         """
-        ilocs = [(int(start), int(end)) for start, end, _ in collective_anomalies]
+        ilocs = [(int(start), int(end)) for start, end, _ in segment_anomalies]
         icolumns = [
             np.array(components, dtype="int64")
-            for _, _, components in collective_anomalies
+            for _, _, components in segment_anomalies
         ]
         return pd.DataFrame(
             {
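The sparse and dense label formats described in the docstrings above can be illustrated with plain pandas. This is a sketch of the documented format (half-open `[start, end)` intervals in an `"ilocs"` column, integer labels 1, ..., K, and 0 reserved for normal instances), not a call into skchange; the variable names are illustrative:

```python
import numpy as np
import pandas as pd

# Sparse format: one row per segment anomaly, with half-open [start, end)
# intervals in an "ilocs" column and integer labels 1, ..., K.
segment_anomalies = [(2, 5), (8, 10)]
ilocs = pd.IntervalIndex.from_tuples(segment_anomalies, closed="left")
y_sparse = pd.DataFrame({"ilocs": ilocs, "labels": np.arange(1, len(ilocs) + 1)})

# Start and end points are recovered as described in the docstrings.
starts = y_sparse["ilocs"].array.left
ends = y_sparse["ilocs"].array.right

# Dense format: one integer label per time point, 0 for normal instances.
n = 12
y_dense = pd.Series(0, index=range(n), name="labels")
for label, (start, end) in zip(y_sparse["labels"], segment_anomalies):
    y_dense.iloc[start:end] = label

print(list(y_dense))  # [0, 0, 1, 1, 1, 0, 0, 0, 2, 2, 0, 0]
```

The half-open `closed="left"` convention matches the iloc-style slicing used throughout the diff, so `y_dense.iloc[start:end]` covers exactly the anomalous points.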