Skip to content

Commit

Permalink
Merge pull request #50 from NorskRegnesentral/fix-scorer-inner-datatype
Browse files (browse the repository at this point in the history)
Fix inner data type in interval scorer and unify cost names
  • Loading branch information
Tveten authored Dec 11, 2024
2 parents a7c2d2c + 37603a0 commit 8f08b83
Show file tree
Hide file tree
Showing 14 changed files with 62 additions and 86 deletions.
2 changes: 1 addition & 1 deletion docs/source/api_reference/costs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Costs

BaseCost
L2Cost
GaussianVarCost
GaussianCost
GaussianCovCost

Utility functions
Expand Down
3 changes: 1 addition & 2 deletions skchange/anomaly_detectors/anomalisers.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,7 @@ def _fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None):
-------
self : returns a reference to self
"""
self.change_detector_: ChangeDetector
self.change_detector_ = self.change_detector.clone()
self.change_detector_: BaseChangeDetector = self.change_detector.clone()
self.change_detector_.fit(X, y)
return self

Expand Down
17 changes: 8 additions & 9 deletions skchange/anomaly_scores/from_cost.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from typing import Union

import numpy as np
from numpy.typing import ArrayLike

from skchange.anomaly_scores.base import BaseLocalAnomalyScore, BaseSaving
from skchange.costs import BaseCost, L2Cost
Expand Down Expand Up @@ -81,13 +80,13 @@ def get_param_size(self, p: int) -> int:
"""
return self.optimised_cost.get_param_size(p)

def _fit(self, X: ArrayLike, y=None):
def _fit(self, X: np.ndarray, y=None):
"""Fit the saving scorer.
Parameters
----------
X : array-like
Input data.
X : np.ndarray
Data to evaluate. Must be a 2D array.
y : None
Ignored. Included for API consistency by convention.
Expand Down Expand Up @@ -218,13 +217,13 @@ def min_size(self) -> int:
"""Minimum valid size of the interval to evaluate."""
return self.cost.min_size

def _fit(self, X: ArrayLike, y=None):
def _fit(self, X: np.ndarray, y=None):
"""Fit the saving scorer.
Parameters
----------
X : array-like
Input data.
X : np.ndarray
Data to evaluate. Must be a 2D array.
y : None
Ignored. Included for API consistency by convention.
Expand Down Expand Up @@ -339,10 +338,10 @@ def get_test_params(cls, parameter_set="default"):
`MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
`create_test_instance` uses the first (or only) dictionary in `params`
"""
from skchange.costs import GaussianVarCost
from skchange.costs import GaussianCost

params = [
{"cost": L2Cost()},
{"cost": GaussianVarCost()},
{"cost": GaussianCost()},
]
return params
13 changes: 5 additions & 8 deletions skchange/anomaly_scores/l2_saving.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,10 @@
__author__ = ["Tveten"]

import numpy as np
from numpy.typing import ArrayLike

from skchange.anomaly_scores.base import BaseSaving
from skchange.utils.numba import njit
from skchange.utils.numba.stats import col_cumsum
from skchange.utils.validation.data import as_2d_array


@njit
Expand Down Expand Up @@ -80,13 +78,13 @@ def get_param_size(self, p: int) -> int:
"""
return p

def _fit(self, X: ArrayLike, y=None):
def _fit(self, X: np.ndarray, y=None):
"""Fit the saving evaluator.
Parameters
----------
X : array-like
Input data.
X : np.ndarray
Data to evaluate. Must be a 2D array.
y : None
Ignored. Included for API consistency by convention.
Expand All @@ -95,8 +93,7 @@ def _fit(self, X: ArrayLike, y=None):
self :
Reference to self.
"""
X = as_2d_array(X)
self.sums_ = col_cumsum(X, init_zero=True)
self._sums = col_cumsum(X, init_zero=True)
return self

def _evaluate(self, cuts: np.ndarray) -> np.ndarray:
Expand All @@ -120,7 +117,7 @@ def _evaluate(self, cuts: np.ndarray) -> np.ndarray:
"""
starts = cuts[:, 0]
ends = cuts[:, 1]
return l2_saving(starts, ends, self.sums_)
return l2_saving(starts, ends, self._sums)

@classmethod
def get_test_params(cls, parameter_set="default"):
Expand Down
12 changes: 6 additions & 6 deletions skchange/base/base_interval_scorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def fit(self, X, y=None):
Parameters
----------
X : pd.Series, pd.DataFrame, or np.ndarray
Data to score.
Data to evaluate.
y : None
Ignored. Included for API consistency by convention.
Expand All @@ -86,22 +86,22 @@ def fit(self, X, y=None):
Updates the fitted model and sets attributes ending in "_".
"""
X = check_series(X, allow_index_names=True)
self._X = X
self._X = as_2d_array(X)

self._fit(X=X, y=y)
self._fit(X=self._X, y=y)
self._is_fitted = True
return self

def _fit(self, X, y=None):
def _fit(self, X: np.ndarray, y=None):
"""Fit the interval scorer to training data.
The core logic of fitting an interval scorer to training data is implemented
here.
Parameters
----------
X : pd.Series, pd.DataFrame or np.ndarray
Data to score.
X : np.ndarray
Data to evaluate. Must be a 2D array.
y : None
Ignored. Included for API consistency by convention.
Expand Down
2 changes: 1 addition & 1 deletion skchange/base/tests/test_base_interval_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def _evaluate(self, cuts):

def test_fit():
evaluator = ConcreteIntervalEvaluator()
X = np.array([1, 2, 3, 4, 5])
X = np.array([1, 2, 3, 4, 5]).reshape(-1, 1)
evaluator.fit(X)
assert evaluator._is_fitted
assert np.array_equal(evaluator._X, X)
Expand Down
13 changes: 5 additions & 8 deletions skchange/change_scores/cusum.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,10 @@
__author__ = ["Tveten"]

import numpy as np
from numpy.typing import ArrayLike

from skchange.change_scores.base import BaseChangeScore
from skchange.utils.numba import njit
from skchange.utils.numba.stats import col_cumsum
from skchange.utils.validation.data import as_2d_array


@njit
Expand Down Expand Up @@ -74,13 +72,13 @@ def min_size(self) -> int:
"""Minimum size of the interval to evaluate."""
return 1

def _fit(self, X: ArrayLike, y=None):
def _fit(self, X: np.ndarray, y=None):
"""Fit the change score evaluator.
Parameters
----------
X : array-like
Input data.
X : np.ndarray
Data to evaluate. Must be a 2D array.
y : None
Ignored. Included for API consistency by convention.
Expand All @@ -89,8 +87,7 @@ def _fit(self, X: ArrayLike, y=None):
self :
Reference to self.
"""
X = as_2d_array(X)
self.sums_ = col_cumsum(X, init_zero=True)
self._sums = col_cumsum(X, init_zero=True)
return self

def _evaluate(self, cuts: np.ndarray):
Expand All @@ -117,7 +114,7 @@ def _evaluate(self, cuts: np.ndarray):
starts = cuts[:, 0]
splits = cuts[:, 1]
ends = cuts[:, 2]
return cusum_score(starts, ends, splits, self.sums_)
return cusum_score(starts, ends, splits, self._sums)

@classmethod
def get_test_params(cls, parameter_set="default"):
Expand Down
11 changes: 5 additions & 6 deletions skchange/change_scores/from_cost.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from typing import Union

import numpy as np
from numpy.typing import ArrayLike

from skchange.change_scores.base import BaseChangeScore
from skchange.costs.base import BaseCost
Expand Down Expand Up @@ -57,13 +56,13 @@ def min_size(self) -> int:
"""Minimum valid size of an interval to evaluate."""
return self.cost.min_size

def _fit(self, X: ArrayLike, y=None):
def _fit(self, X: np.ndarray, y=None):
"""Fit the change score.
Parameters
----------
X : array-like
Input data.
X : np.ndarray
Data to evaluate. Must be a 2D array.
y : None
Ignored. Included for API consistency by convention.
Expand Down Expand Up @@ -124,10 +123,10 @@ def get_test_params(cls, parameter_set="default"):
`MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
`create_test_instance` uses the first (or only) dictionary in `params`
"""
from skchange.costs import GaussianVarCost, L2Cost
from skchange.costs import GaussianCost, L2Cost

params = [
{"cost": L2Cost()},
{"cost": GaussianVarCost()},
{"cost": GaussianCost()},
]
return params
12 changes: 5 additions & 7 deletions skchange/change_scores/multivariate_gaussian_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,10 @@
__all__ = ["MultivariateGaussianScore"]

import numpy as np
from numpy.typing import ArrayLike

from skchange.change_scores.base import BaseChangeScore
from skchange.costs.multivariate_gaussian_cost import MultivariateGaussianCost
from skchange.utils.numba import njit
from skchange.utils.validation.data import as_2d_array


@njit
Expand Down Expand Up @@ -176,13 +174,13 @@ def min_size(self) -> int:
else:
return None

def _fit(self, X: ArrayLike, y=None):
def _fit(self, X: np.ndarray, y=None):
"""Fit the multivariate Gaussian change score evaluator.
Parameters
----------
X : array-like
Input data.
X : np.ndarray
Data to evaluate. Must be a 2D array.
y : None
Ignored. Included for API consistency by convention.
Expand All @@ -191,7 +189,7 @@ def _fit(self, X: ArrayLike, y=None):
self :
Reference to self.
"""
self._cost.fit(as_2d_array(X))
self._cost.fit(X)
return self

def _evaluate(self, cuts: np.ndarray):
Expand Down Expand Up @@ -229,7 +227,7 @@ def _evaluate(self, cuts: np.ndarray):
bartlett_corrections = compute_bartlett_corrections(
sequence_lengths=segment_lengths,
cut_points=segment_splits,
dimension=self._cost.data_dimension_,
dimension=self._X.shape[1],
)
return bartlett_corrections * raw_scores
else:
Expand Down
4 changes: 2 additions & 2 deletions skchange/costs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Cost functions for cost-based change and anomaly detection."""

from skchange.costs.base import BaseCost
from skchange.costs.gaussian_var_cost import GaussianVarCost
from skchange.costs.gaussian_cost import GaussianCost
from skchange.costs.l2_cost import L2Cost
from skchange.costs.multivariate_gaussian_cost import MultivariateGaussianCost

Expand All @@ -10,7 +10,7 @@
]
COSTS = [
MultivariateGaussianCost,
GaussianVarCost,
GaussianCost,
L2Cost,
]

Expand Down
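skchange/costs/gaussian_var_cost.py → skchange/costs/gaussian_cost.py (path inferred from the import change in skchange/costs/__init__.py; file header missing from this capture)
Original file line number Diff line number Diff line change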
Expand Up @@ -5,14 +5,12 @@
from typing import Union

import numpy as np
from numpy.typing import ArrayLike

from skchange.costs.base import BaseCost
from skchange.costs.utils import MeanType, VarType, check_mean, check_var
from skchange.utils.numba import njit
from skchange.utils.numba.general import truncate_below
from skchange.utils.numba.stats import col_cumsum
from skchange.utils.validation.data import as_2d_array


@njit
Expand Down Expand Up @@ -119,7 +117,7 @@ def gaussian_var_cost_fixed(
return -log_likelihood


class GaussianVarCost(BaseCost):
class GaussianCost(BaseCost):
"""Univariate Gaussian likelihood cost.
Parameters
Expand Down Expand Up @@ -172,23 +170,22 @@ def get_param_size(self, p: int) -> int:
"""
return 2 * p

def _fit(self, X: ArrayLike, y=None):
def _fit(self, X: np.ndarray, y=None):
"""Fit the cost.
This method precomputes quantities that speed up the cost evaluation.
Parameters
----------
X : array-like
Input data.
X : np.ndarray
Data to evaluate. Must be a 2D array.
y: None
Ignored. Included for API consistency by convention.
"""
X = as_2d_array(X)
self._param = self._check_param(self.param, X)

self.sums_ = col_cumsum(X, init_zero=True)
self.sums2_ = col_cumsum(X**2, init_zero=True)
self._sums = col_cumsum(X, init_zero=True)
self._sums2 = col_cumsum(X**2, init_zero=True)
return self

def _evaluate_optim_param(self, starts: np.ndarray, ends: np.ndarray) -> np.ndarray:
Expand All @@ -207,7 +204,7 @@ def _evaluate_optim_param(self, starts: np.ndarray, ends: np.ndarray) -> np.ndar
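A 2D array of costs. One row for each interval.
NOTE(review): the text here states that the number of columns is 1 "since the
GaussianCovCost is inherently multivariate". That wording names a different
class and appears to be copied from the multivariate cost; this class is the
univariate GaussianCost, so confirm the actual number of columns.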
"""
return gaussian_var_cost_optim(starts, ends, self.sums_, self.sums2_)
return gaussian_var_cost_optim(starts, ends, self._sums, self._sums2)

def _evaluate_fixed_param(self, starts, ends):
"""Evaluate the cost for the fixed parameter.
Expand All @@ -226,7 +223,7 @@ def _evaluate_fixed_param(self, starts, ends):
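columns is 1 since the GaussianCovCost is inherently multivariate.
NOTE(review): this sentence names GaussianCovCost but sits in the univariate
GaussianCost class; it looks copied from the multivariate cost, so confirm the
actual number of columns.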
"""
mean, var = self._param
return gaussian_var_cost_fixed(starts, ends, self.sums_, self.sums2_, mean, var)
return gaussian_var_cost_fixed(starts, ends, self._sums, self._sums2, mean, var)

@classmethod
def get_test_params(cls, parameter_set="default"):
Expand Down
Loading

0 comments on commit 8f08b83

Please sign in to comment.