Added new query similarity counter feature (#798)

* Added new query similarity counter feature
* Removed trailing whitespace
* Fixed code style formatting
* Fixed closing bracket indentation
* Fixed line continuation and removed whitespace
* Fixed indentation and removed brackets
* Added non-normalized Euclidean distance option
* Removed trailing whitespace
* Removed whitespace in blank line
* Added additional test
* Fixed unit test
* Changes based on first review
* Fixed merge-conflict failure

Co-authored-by: Nils Braun <nilslennartbraun@gmail.com>
blue-yonder · Jan 25, 2021 · 1559aef · 1559aef
1 parent 04b473f
commit 1559aef
Show file tree

Hide file tree

Showing 4 changed files with 76 additions and 3 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -9,3 +9,4 @@ tqdm>=4.10.0
 dask[dataframe]>=2.9.0
 distributed>=2.11.0
 matrixprofile>=1.1.10,<2.0.0
+stumpy>=1.7.2
diff --git a/tests/units/feature_extraction/test_feature_calculations.py b/tests/units/feature_extraction/test_feature_calculations.py
@@ -1302,6 +1302,28 @@ def test_benford_correlation(self):
         self.assertAlmostEqual(benford_correlation(list_with_nan), 0.10357511)
         self.assertIsNaN(benford_correlation(equal_list))
 
+    def test_query_similarity_count(self):
+        """Check the match counts of ``query_similarity_count`` on seeded random data."""
+        # The query is drawn before the series: the expected counts below
+        # depend on the exact order in which the seeded generator is consumed.
+        np.random.seed(42)
+        query = np.random.uniform(size=10)
+        x = np.random.uniform(size=100)
+        threshold = 3.0
+
+        expected_counts = [
+            # z-normalized Euclidean distances
+            ({"query": query}, 0.0),
+            ({"query": query, "threshold": threshold}, 6.0),
+            # non-normalized Euclidean distances
+            ({"query": query, "normalize": False}, 0.0),
+            ({"query": query, "threshold": threshold, "normalize": False}, 91.0),
+        ]
+
+        for settings, expected in expected_counts:
+            features = query_similarity_count(x, param=[settings])
+            self.assertAlmostEqual(features[0][1], expected)
+
     def test_matrix_profile_window(self):
         # Test matrix profile output with specified window
         np.random.seed(9999)

diff --git a/tsfresh/feature_extraction/feature_calculators.py b/tsfresh/feature_extraction/feature_calculators.py
@@ -19,8 +19,8 @@
 
 import itertools
 import functools
-from tsfresh.utilities.string_manipulation import convert_to_output_format
 import warnings
+from tsfresh.utilities.string_manipulation import convert_to_output_format
 from builtins import range
 from collections import defaultdict
 
@@ -32,7 +32,7 @@
 from statsmodels.tools.sm_exceptions import MissingDataError
 from matrixprofile.exceptions import NoSolutionPossible
 import matrixprofile as mp
-
+import stumpy
 
 with warnings.catch_warnings():
     # Ignore warnings of the patsy package
@@ -2297,3 +2297,53 @@ def _calculate_mp(**kwargs):
                 raise ValueError(f"Unknown feature {feature} for the matrix profile")
 
     return [(key, value) for key, value in res.items()]
+
+
+@set_property("fctype", "combiner")
+def query_similarity_count(x, param):
+    """
+    This feature calculator accepts an input query subsequence parameter,
+    compares the query (by default under z-normalized Euclidean distance) to
+    all subsequences within the time series, and returns a count of the number
+    of times the query was found in the time series (within some predefined
+    maximum distance threshold). Note that this feature will always return
+    `np.nan` when no query subsequence is provided and so users will need
+    to enable this feature themselves.
+
+    :param x: the time series to calculate the feature of
+    :type x: numpy.ndarray
+    :param param: contains dictionaries
+                  {"query": Q, "threshold": thr, "normalize": norm}
+                  with `Q` (numpy.ndarray), the query subsequence to compare the
+                  time series against. If `Q` is omitted, is `None` or holds
+                  fewer than three values then `np.nan` is returned.
+                  Additionally, `thr` (float), the maximum Euclidean distance
+                  (z-normalized unless `norm` is `False`) for which to
+                  increment the query similarity count. If `thr` is omitted
+                  then a default threshold of `thr=0.0` is used, which
+                  corresponds to finding exact matches to `Q`. Finally, for
+                  non-normalized (i.e., without z-normalization) Euclidean
+                  distances set `norm` (bool) to `False`. If `norm` is omitted
+                  then z-normalization is applied.
+    :type param: list
+    :return: the different feature values as (name, value) pairs, one per
+             distinct parameter dictionary
+    :return type: list
+    """
+    res = {}
+    T = np.asarray(x).astype(float)
+
+    for kwargs in param:
+        key = convert_to_output_format(kwargs)
+        normalize = kwargs.get("normalize", True)
+        threshold = kwargs.get("threshold", 0.0)
+        Q = kwargs.get("query", None)
+
+        # Stays NaN unless a usable query was supplied.
+        count = np.nan
+
+        # Check for a missing query *before* the array conversion: once it
+        # went through `np.asarray` the value can never be `None` anymore.
+        if Q is not None:
+            Q = np.asarray(Q).astype(float)
+            if Q.size >= 3:
+                if normalize:
+                    distance_profile = stumpy.core.mass(Q, T)
+                else:
+                    distance_profile = stumpy.core.mass_absolute(Q, T)
+                # Every subsequence within the threshold counts as one match.
+                count = np.sum(distance_profile <= threshold)
+
+        res[key] = count
+
+    return list(res.items())
diff --git a/tsfresh/feature_extraction/settings.py b/tsfresh/feature_extraction/settings.py
@@ -152,8 +152,8 @@ def __init__(self):
             "lempel_ziv_complexity": [{"bins": x} for x in [2, 3, 5, 10, 100]],
             "fourier_entropy":  [{"bins": x} for x in [2, 3, 5, 10, 100]],
             "permutation_entropy":  [{"tau": 1, "dimension": x} for x in [3, 4, 5, 6, 7]],
+            "query_similarity_count": [{"query": None, "threshold": 0.0}],
             "matrix_profile": [{"threshold": 0.98, "feature": f} for f in ["min", "max", "mean", "median", "25", "75"]]
-
         })
 
         super().__init__(name_to_param)