Skip to content

Commit

Permalink
Added new query similarity counter feature (#798)
Browse files Browse the repository at this point in the history
* Added new query similarity counter feature

* Removed trailing whitespace

* Fixed code style formatting

* Fixed closing bracket indentation

* Fixed line continuation and remove whitespace

* Fixed indentation and removed brackets

* Added non-normalized Euclidean distance option

* Removed trailing whitespace

* Removed whitespace in blank line

* Added additional test

* Fixed unit test

* Changes based on first review

* Fixed merge-conflict failure

Co-authored-by: Nils Braun <nilslennartbraun@gmail.com>
  • Loading branch information
seanlaw and nils-braun authored Jan 25, 2021
1 parent 04b473f commit 1559aef
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 3 deletions.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ tqdm>=4.10.0
dask[dataframe]>=2.9.0
distributed>=2.11.0
matrixprofile>=1.1.10<2.0.0
stumpy>=1.7.2
22 changes: 22 additions & 0 deletions tests/units/feature_extraction/test_feature_calculations.py
Original file line number Diff line number Diff line change
Expand Up @@ -1302,6 +1302,28 @@ def test_benford_correlation(self):
self.assertAlmostEqual(benford_correlation(list_with_nan), 0.10357511)
self.assertIsNaN(benford_correlation(equal_list))

def test_query_similarity_count(self):
    """
    Check the match counts reported by `query_similarity_count` for a
    fixed random query/series pair, with and without an explicit
    threshold, under both z-normalized and non-normalized distances.
    """
    # The draw order (query first, then the series) must stay fixed so the
    # seeded values, and therefore the expected counts, are reproducible.
    np.random.seed(42)
    query = np.random.uniform(size=10)
    x = np.random.uniform(size=100)
    threshold = 3.0

    cases = [
        # z-normalized Euclidean distances
        ({"query": query}, 0.0),
        ({"query": query, "threshold": threshold}, 6.0),
        # non-normalized Euclidean distances
        ({"query": query, "normalize": False}, 0.0),
        ({"query": query, "threshold": threshold, "normalize": False}, 91.0),
    ]

    for settings, expected in cases:
        result = query_similarity_count(x, param=[settings])
        self.assertAlmostEqual(result[0][1], expected)

def test_matrix_profile_window(self):
# Test matrix profile output with specified window
np.random.seed(9999)
Expand Down
54 changes: 52 additions & 2 deletions tsfresh/feature_extraction/feature_calculators.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@

import itertools
import functools
from tsfresh.utilities.string_manipulation import convert_to_output_format
import warnings
from tsfresh.utilities.string_manipulation import convert_to_output_format
from builtins import range
from collections import defaultdict

Expand All @@ -32,7 +32,7 @@
from statsmodels.tools.sm_exceptions import MissingDataError
from matrixprofile.exceptions import NoSolutionPossible
import matrixprofile as mp

import stumpy

with warnings.catch_warnings():
# Ignore warnings of the patsy package
Expand Down Expand Up @@ -2297,3 +2297,53 @@ def _calculate_mp(**kwargs):
raise ValueError(f"Unknown feature {feature} for the matrix profile")

return [(key, value) for key, value in res.items()]


@set_property("fctype", "combiner")
def query_similarity_count(x, param):
    """
    This feature calculator accepts an input query subsequence parameter,
    compares the query (under z-normalized Euclidean distance) to all
    subsequences within the time series, and returns a count of the number
    of times the query was found in the time series (within some predefined
    maximum distance threshold). Note that this feature will always return
    `np.nan` when no query subsequence is provided and so users will need
    to enable this feature themselves.

    :param x: the time series to calculate the feature of
    :type x: numpy.ndarray
    :param param: contains dictionaries
                  {"query": Q, "threshold": thr, "normalize": norm}
                  with `Q` (numpy.ndarray), the query subsequence to compare the
                  time series against. If `Q` is omitted, is `None`, or has
                  fewer than three data points then `np.nan` is returned
                  for that parameter set. Additionally, `thr` (float), the
                  maximum Euclidean distance threshold for which to
                  increment the query similarity count. If `thr` is omitted
                  then a default threshold of `thr=0.0` is used, which
                  corresponds to finding exact matches to `Q`. Finally, for
                  non-normalized (i.e., without z-normalization) Euclidean
                  distances set `norm` (bool) to `False`; it defaults to
                  `True`.
    :type param: list
    :return: the different feature values, one `(key, count)` pair per
             parameter dictionary
    :return type: list of tuples
    """
    res = {}
    T = np.asarray(x).astype(float)

    for kwargs in param:
        key = convert_to_output_format(kwargs)
        normalize = kwargs.get("normalize", True)
        threshold = kwargs.get("threshold", 0.0)
        Q = kwargs.get("query", None)

        # NaN marks "feature not computable" (no query, or query too short).
        count = np.nan

        # Test for a missing query *before* converting it: np.asarray(None)
        # is a non-None 0-d array, which would make a later None check
        # meaningless.
        if Q is not None:
            Q = np.asarray(Q).astype(float)
            if Q.size >= 3:
                if normalize:
                    distance_profile = stumpy.core.mass(Q, T)
                else:
                    distance_profile = stumpy.core.mass_absolute(Q, T)
                # Number of subsequences within the distance threshold.
                count = np.sum(distance_profile <= threshold)

        res[key] = count

    return [(key, value) for key, value in res.items()]
2 changes: 1 addition & 1 deletion tsfresh/feature_extraction/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,8 @@ def __init__(self):
"lempel_ziv_complexity": [{"bins": x} for x in [2, 3, 5, 10, 100]],
"fourier_entropy": [{"bins": x} for x in [2, 3, 5, 10, 100]],
"permutation_entropy": [{"tau": 1, "dimension": x} for x in [3, 4, 5, 6, 7]],
"query_similarity_count": [{"query": None, "threshold": 0.0}],
"matrix_profile": [{"threshold": 0.98, "feature": f} for f in ["min", "max", "mean", "median", "25", "75"]]

})

super().__init__(name_to_param)
Expand Down

0 comments on commit 1559aef

Please sign in to comment.