Implemented two new feature calculators: Lempel Ziv complexity and Fourier Entropy (#688)

* Implemented a new feature calculator: Lempel Ziv complexity * Added to changelog * Added an additional parameter and fixed a typo * pep8ify * Implemented a new feature calculator: Fourier Entropy * Added to changelog Co-authored-by: Nils Braun <nilslennartbraun@gmail.com>
blue-yonder · May 14, 2020 · 8dd3f72 · 8dd3f72
1 parent b73cd09
commit 8dd3f72
Show file tree

Hide file tree

Showing 4 changed files with 100 additions and 1 deletion.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -4,6 +4,11 @@ Changelog
 
 tsfresh uses `Semantic Versioning <http://semver.org/>`_
 
+Unreleased
+==========
+- Added Features
+    - Implemented the Lempel-Ziv-Complexity and the Fourier Entropy (#688)
+
 Version 0.16.0
 ==============
 

diff --git a/tests/units/feature_extraction/test_feature_calculations.py b/tests/units/feature_extraction/test_feature_calculations.py
@@ -322,6 +322,38 @@ def test_cid_ce(self):
         self.assertEqualOnAllArrayTypes(cid_ce, [0.5, 3.5, 7.5], 5, normalize=False)
         self.assertEqualOnAllArrayTypes(cid_ce, [-4.33, -1.33, 2.67], 5, normalize=False)
 
+    def test_lempel_ziv_complexity(self):
+        self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity, [1, 1, 1], 2./3, bins=2)
+        self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity, [1, 1, 1], 2./3, bins=5)
+
+        self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity, [1, 1, 1, 1, 1, 1, 1],
+                                              0.4285714285, bins=2)
+        self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity, [1, 1, 1, 2, 1, 1, 1],
+                                              0.5714285714, bins=2)
+
+        self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity,
+                                              [-1, 4.3, 5, 1, -4.5, 1, 5, 7, -3.4, 6],
+                                              0.8, bins=10)
+        self.assertAlmostEqualOnAllArrayTypes(lempel_ziv_complexity,
+                                              [-1, np.nan, 5, 1, -4.5, 1, 5, 7, -3.4, 6],
+                                              0.4, bins=10)
+
+    def test_fourier_entropy(self):
+        self.assertAlmostEqualOnAllArrayTypes(fourier_entropy, [1, 2, 1], 0.693147180, bins=2)
+        self.assertAlmostEqualOnAllArrayTypes(fourier_entropy, [1, 2, 1], 0.693147180, bins=5)
+
+        self.assertAlmostEqualOnAllArrayTypes(fourier_entropy, [1, 1, 2, 1, 1, 1, 1],
+                                              0.5623351446188083, bins=5)
+        self.assertAlmostEqualOnAllArrayTypes(fourier_entropy, [1, 1, 1, 1, 2, 1, 1],
+                                              1.0397207708399179, bins=5)
+
+        self.assertAlmostEqualOnAllArrayTypes(fourier_entropy,
+                                              [-1, 4.3, 5, 1, -4.5, 1, 5, 7, -3.4, 6],
+                                              1.5607104090414063, bins=10)
+        self.assertIsNanOnAllArrayTypes(fourier_entropy,
+                                        [-1, np.nan, 5, 1, -4.5, 1, 5, 7, -3.4, 6],
+                                        bins=10)
+
     def test_ratio_beyond_r_sigma(self):
 
         x = [0, 1] * 10 + [10, 20, -30]  # std of x is 7.21, mean 3.04

diff --git a/tsfresh/feature_extraction/feature_calculators.py b/tsfresh/feature_extraction/feature_calculators.py
@@ -524,6 +524,61 @@ def cid_ce(x, normalize):
     return np.sqrt(np.dot(x, x))
 
 
+@set_property("fctype", "simple")
+def fourier_entropy(x, bins):
+    """
+    Calculate the binned entropy of the power spectral density of the time series
+    (estimated with Welch's method).
+
+    Ref: https://hackaday.io/project/707-complexity-of-a-time-series/details
+    Ref: https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.signal.welch.html
+
+    """
+    _, pxx = welch(x, nperseg=min(len(x), 256))
+    return binned_entropy(pxx / np.max(pxx), bins)
+
+
+@set_property("fctype", "simple")
+def lempel_ziv_complexity(x, bins):
+    """
+    Calculate a complexity estimate based on the Lempel-Ziv compression
+    algorithm.
+
+    The complexity is defined as the number of dictionary entries (or sub-words) needed
+    to encode the time series when viewed from left to right.
+    For this, the time series is first binned into the given number of bins.
+    Then it is converted into sub-words with different prefixes.
+    The number of sub-words needed for this divided by the length of the time
+    series is the complexity estimate.
+
+    For example, if the time series (after binning into only 2 bins) looks like "100111",
+    the different sub-words are 1, 0, 01 and 11, and therefore the result is 4/6, i.e. about 0.67.
+
+    Ref: https://github.com/Naereen/Lempel-Ziv_Complexity/blob/master/src/lempel_ziv_complexity.py
+
+    """
+    x = np.asarray(x)
+
+    bins = np.linspace(np.min(x), np.max(x), bins)
+    sequence = np.searchsorted(bins, x, side='left')
+
+    sub_strings = set()
+    n = len(sequence)
+
+    ind = 0
+    inc = 1
+    while ind + inc <= n:
+        # convert to tuple to make it hashable
+        sub_str = tuple(sequence[ind:ind + inc])
+        if sub_str in sub_strings:
+            inc += 1
+        else:
+            sub_strings.add(sub_str)
+            ind += inc
+            inc = 1
+    return len(sub_strings) / n
+
+
 @set_property("fctype", "simple")
 def mean_abs_change(x):
     """
@@ -1512,6 +1567,11 @@ def binned_entropy(x, max_bins):
     """
     if not isinstance(x, (np.ndarray, pd.Series)):
         x = np.asarray(x)
+
+    # nan makes no sense here
+    if np.isnan(x).any():
+        return np.nan
+
     hist, bin_edges = np.histogram(x, bins=max_bins)
     probs = hist / x.size
     probs[probs == 0] = 1.0

diff --git a/tsfresh/feature_extraction/settings.py b/tsfresh/feature_extraction/settings.py
@@ -148,7 +148,9 @@ def __init__(self):
             "linear_trend_timewise": [{"attr": "pvalue"}, {"attr": "rvalue"}, {"attr": "intercept"},
                                       {"attr": "slope"}, {"attr": "stderr"}],
             "count_above": [{"t": 0}],
-            "count_below": [{"t": 0}]
+            "count_below": [{"t": 0}],
+            "lempel_ziv_complexity": [{"bins": x} for x in [2, 3, 5, 10, 100]],
+            "fourier_entropy":  [{"bins": x} for x in [2, 3, 5, 10, 100]],
 
         })