Skip to content

Commit

Permalink
[BUG, ENH] SFA fix: Std-Normalization, as used in BOSS and WEASEL mod…
Browse files Browse the repository at this point in the history
…els, is potentially harmful for lower bounding (#2461)

This fixes an issue with std normalization in SFA, which is commonly used in the BOSS and WEASEL models.

For lower-bounding computations, however, std normalization can violate the lower-bounding property of the distance. It is therefore removed when whole-series matching is applied.
  • Loading branch information
patrickzib authored Jan 2, 2025
1 parent 66128de commit 26dfdd2
Show file tree
Hide file tree
Showing 7 changed files with 379 additions and 105 deletions.
15 changes: 5 additions & 10 deletions aeon/distances/mindist/_dft_sfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,22 +44,17 @@ def mindist_dft_sfa_distance(
--------
>>> import numpy as np
>>> from aeon.distances import mindist_dft_sfa_distance
>>> from aeon.transformations.collection.dictionary_based import SFAFast
>>> from aeon.transformations.collection.dictionary_based import SFAWhole
>>> x = np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
>>> y = np.array([[11, 12, 13, 14, 15, 16, 17, 18, 19, 20]])
>>> transform = SFAFast(
>>> transform = SFAWhole(
... word_length=8,
... alphabet_size=8,
... window_size=x.shape[-1],
... norm=True,
... lower_bounding_distances=True # This must be set!
... )
>>> transform.fit(x)
SFAFast(...)
>>> x_sfa = transform.transform_words(x).squeeze()
>>> y_sfa = transform.transform_words(y).squeeze()
>>> x_dft = transform.transform_mft(x).squeeze()
>>> dist = mindist_dft_sfa_distance(x_dft, y_sfa, transform.breakpoints)
>>> x_sfa, _ = transform.fit_transform(x)
>>> _, y_dft = transform.transform(y)
>>> dist = mindist_dft_sfa_distance(y_dft, x_sfa, transform.breakpoints)
"""
if x_dft.ndim == 1 and y_sfa.ndim == 1:
return _univariate_dft_sfa_distance(x_dft, y_sfa, breakpoints)
Expand Down
14 changes: 5 additions & 9 deletions aeon/distances/mindist/_sfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,20 +44,16 @@ def mindist_sfa_distance(
--------
>>> import numpy as np
>>> from aeon.distances import mindist_sfa_distance
>>> from aeon.transformations.collection.dictionary_based import SFAFast
>>> from aeon.transformations.collection.dictionary_based import SFAWhole
>>> x = np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
>>> y = np.array([[11, 12, 13, 14, 15, 16, 17, 18, 19, 20]])
>>> transform = SFAFast(
>>> transform = SFAWhole(
... word_length=8,
... alphabet_size=8,
... window_size=x.shape[-1],
... norm=True,
... lower_bounding_distances=True # This must be set!
... norm=True
... )
>>> transform.fit(x)
SFAFast(...)
>>> x_sfa = transform.transform_words(x).squeeze()
>>> y_sfa = transform.transform_words(y).squeeze()
>>> x_sfa, _ = transform.fit_transform(x)
>>> y_sfa, _ = transform.transform(y)
>>> dist = mindist_sfa_distance(x_sfa, y_sfa, transform.breakpoints)
"""
if x.ndim == 1 and y.ndim == 1:
Expand Down
80 changes: 70 additions & 10 deletions aeon/distances/tests/test_symbolic_mindist.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from aeon.distances.mindist._paa_sax import mindist_paa_sax_distance
from aeon.distances.mindist._sax import mindist_sax_distance
from aeon.distances.mindist._sfa import mindist_sfa_distance
from aeon.transformations.collection.dictionary_based import SAX, SFA, SFAFast
from aeon.transformations.collection.dictionary_based import SAX, SFA, SFAFast, SFAWhole


def test_sax_mindist():
Expand Down Expand Up @@ -69,7 +69,7 @@ def test_sfa_mindist():
window_size=n,
binning_method=histogram_type,
norm=True,
variance=True,
variance=False, # True gives a tighter lower bound
lower_bounding_distances=True, # This must be set!
)

Expand All @@ -81,27 +81,39 @@ def test_sfa_mindist():
norm=True,
lower_bounding_distances=True, # This must be set!
)
transforms = [sfa_old, sfa_fast]

for sfa in transforms:
sfa.fit(X_train)
X_train_words = sfa.transform_words(X_train).squeeze()
Y_train_words = sfa.transform_words(X_test).squeeze()
sfa_whole = SFAWhole(
word_length=n_segments,
alphabet_size=alphabet_size,
binning_method=histogram_type,
variance=False, # True gives a tighter lower bound
norm=True,
)

SFA_train_dfts = sfa.transform_mft(X_train).squeeze()
transforms = [sfa_fast, sfa_old, sfa_whole]
dists = np.zeros(
(min(X_train.shape[0], X_test.shape[0]), len(transforms)), dtype=np.float32
)

for j, sfa in enumerate(transforms):
sfa.fit(X_train)
X_train_words, X_train_dfts = sfa.transform_words(X_train)
X_test_words, _ = sfa.transform_words(X_test)

for i in range(min(X_train.shape[0], X_test.shape[0])):
X = X_train[i].reshape(1, -1)
Y = X_test[i].reshape(1, -1)

# SFA Min-Distance
mindist_sfa = mindist_sfa_distance(
X_train_words[i], Y_train_words[i], sfa.breakpoints
X_train_words[i], X_test_words[i], sfa.breakpoints
)

dists[i, j] = mindist_sfa

# DFT-SFA Min-Distance
mindist_dft_sfa = mindist_dft_sfa_distance(
SFA_train_dfts[i], Y_train_words[i], sfa.breakpoints
X_train_dfts[i], X_test_words[i], sfa.breakpoints
)

# Euclidean Distance
Expand All @@ -110,3 +122,51 @@ def test_sfa_mindist():
assert mindist_sfa <= ed
assert mindist_dft_sfa >= mindist_sfa # a tighter lower bound
assert mindist_dft_sfa <= ed

for i in range(min(X_train.shape[0], X_test.shape[0])):
assert np.allclose(*dists[i])


def test_sfa_whole_mindist():
    """Check the SFA lower bounds computed from SFAWhole words and DFTs.

    For every paired train/test series, the word-to-word SFA min-distance
    must not exceed the DFT-to-word min-distance, and neither of them may
    exceed the Euclidean distance between the z-scored series.
    """
    word_length = 16
    n_symbols = 8
    binning = "equi-width"

    train_raw, _ = load_unit_test("TRAIN")
    test_raw, _ = load_unit_test("TEST")

    # z-score each series along its time axis before transforming
    train = zscore(train_raw.squeeze(), axis=1)
    test = zscore(test_raw.squeeze(), axis=1)

    transformer = SFAWhole(
        word_length=word_length,
        alphabet_size=n_symbols,
        binning_method=binning,
        norm=True,
    )

    train_words, train_dfts = transformer.fit_transform(train)
    test_words, _ = transformer.transform(test)

    # only compare as many pairs as both splits provide
    n_pairs = min(train.shape[0], test.shape[0])
    for idx in range(n_pairs):
        # word vs. word lower bound
        lb_words = mindist_sfa_distance(
            train_words[idx], test_words[idx], transformer.breakpoints
        )

        # DFT vs. word lower bound
        lb_dft = mindist_dft_sfa_distance(
            train_dfts[idx], test_words[idx], transformer.breakpoints
        )

        # reference Euclidean distance between the two series
        euclidean = np.linalg.norm(train[idx] - test[idx])

        assert lb_words <= euclidean
        assert lb_dft >= lb_words  # a tighter lower bound
        assert lb_dft <= euclidean
3 changes: 2 additions & 1 deletion aeon/transformations/collection/dictionary_based/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""Transformers."""

__all__ = ["PAA", "SFA", "SFAFast", "SAX", "BORF"]
__all__ = ["PAA", "SFA", "SFAFast", "SFAWhole", "SAX", "BORF"]

from aeon.transformations.collection.dictionary_based._borf import BORF
from aeon.transformations.collection.dictionary_based._paa import PAA
from aeon.transformations.collection.dictionary_based._sax import SAX
from aeon.transformations.collection.dictionary_based._sfa import SFA
from aeon.transformations.collection.dictionary_based._sfa_fast import SFAFast
from aeon.transformations.collection.dictionary_based._sfa_whole import SFAWhole
6 changes: 3 additions & 3 deletions aeon/transformations/collection/dictionary_based/_sfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,7 @@ def _transform_words_case(self, X):
self.letter_bits,
)

return words
return words, dfts

def transform_words(self, X):
"""Return the words generated for each series.
Expand All @@ -463,8 +463,8 @@ def transform_words(self, X):
delayed(self._transform_words_case)(X[i, :]) for i in range(X.shape[0])
)

words = zip(*transform)
return np.array(list(words))
words = list(zip(*transform)) # words and dfts
return np.array(words[0]).squeeze(), np.array(words[1]).squeeze()

def get_words(self):
"""Return the words generated for each series.
Expand Down
Loading

0 comments on commit 26dfdd2

Please sign in to comment.