
Improved speed #681

Merged (24 commits, May 11, 2020)
Commits (24):
52c1a69  Started refactoring the rolling function (nils-braun, Apr 13, 2020)
0d5fd4e  Unfinished test fix (nils-braun, Apr 13, 2020)
b0006cd  Fixed remaining rolling tests (nils-braun, Apr 13, 2020)
c4e90a8  Correct the shifted values and the index (nils-braun, Apr 13, 2020)
3c3c0e9  Added test for min timeshift (nils-braun, Apr 13, 2020)
ffae951  Added parallelization to the rolling function (nils-braun, Apr 13, 2020)
c480005  Be python 3.5 compatible... (nils-braun, Apr 14, 2020)
304f450  Reworked the rolling documentation (nils-braun, Apr 16, 2020)
7bf7b9e  Copy-paste error (nils-braun, Apr 16, 2020)
08e7343  Improved forecasting documentation (Apr 16, 2020)
c6e41e2  pep8ify (nils-braun, Apr 16, 2020)
ab24c93  Speed up quantile calculation (nils-braun, Apr 20, 2020)
ce493e5  Replace sample_entropy by corrected and faster reference implementati… (nils-braun, Apr 20, 2020)
c44080c  Added task file for testing (nils-braun, Apr 22, 2020)
d0465c6  Bump the requirement for numpy to use the quantile function (nils-braun, Apr 29, 2020)
bc59a0d  Move to correct location (nils-braun, May 1, 2020)
9c79823  Merge remote-tracking branch 'origin/master' into feature/improved-speed (nils-braun, May 1, 2020)
b209b93  pep8ify (nils-braun, May 1, 2020)
85ad39b  Does not make sense to include script in coverage (nils-braun, May 2, 2020)
51e0ccd  The linear_trend_timewise is not a high computation costs calculator … (nils-braun, May 3, 2020)
b1499da  Remove unneeded import (nils-braun, May 3, 2020)
ce15784  Make the random-seed seeting more explicit (nils-braun, May 9, 2020)
888fb98  Added more tests, more documentation and fixed another bug in the imp… (nils-braun, May 9, 2020)
b3fa7e8  Added changelog (nils-braun, May 11, 2020)
1 change: 1 addition & 0 deletions .coveragerc
@@ -14,6 +14,7 @@ omit = tsfresh/utilities/profiling.py
tsfresh/examples/driftbif_simulation.py
tsfresh/examples/test_tsfresh_baseline_dataset.py
tsfresh/scripts/test_timing.py
tsfresh/scripts/measure_execution_time.py

[report]
# Regexes for lines to exclude from consideration
4 changes: 2 additions & 2 deletions .travis.yml
@@ -69,10 +69,10 @@ jobs:
# python 3.7 requires pandas >= 0.23.2
python: 3.7

- env: NUMPY="1.12.0" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0" STATSMODELS="0.9.0"
- env: NUMPY="1.15.1" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0" STATSMODELS="0.9.0"
python: 3.6

- env: NUMPY="1.12.0" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0" STATSMODELS="0.9.0"
- env: NUMPY="1.15.1" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0" STATSMODELS="0.9.0"
python: 3.5.3


4 changes: 4 additions & 0 deletions CHANGES.rst
@@ -23,9 +23,13 @@ Unreleased
- Optimize RelevantFeatureAugmenter to avoid re-extraction (#669)
- Added a function `add_sub_time_series_index` (#666)
- Added Dockerfile
- Speed optimizations and speed testing script (#681)
- Bugfixes
- Increase the extracted `ar` coefficients to the full parameter range. (#662)
- Documentation fixes (#663, #664, #665)
- Rewrote the `sample_entropy` feature calculator (#681)
  It is now faster and (hopefully) more correct.
  But your results will change!


Version 0.15.1
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,5 +1,5 @@
requests>=2.9.1
numpy>=1.12.0
numpy>=1.15.1
pandas>=0.20.3,!=0.24.* # pandas dropna is buggy in 0.24.0, see https://github.com/blue-yonder/tsfresh/issues/485 and https://github.com/pandas-dev/pandas/issues/25087
scipy>=1.2.0
statsmodels>=0.8.0
21 changes: 20 additions & 1 deletion tests/units/feature_extraction/test_feature_calculations.py
@@ -791,10 +791,29 @@ def test_binned_entropy(self):
self.assertAlmostEqualOnAllArrayTypes(binned_entropy, list(range(100)), - np.math.log(1 / 2), 2)

def test_sample_entropy(self):
# "random" list -> large entropy
ts = [1, 4, 5, 1, 7, 3, 1, 2, 5, 8, 9, 7, 3, 7, 9, 5, 4, 3, 9, 1, 2, 3, 4, 2, 9, 6, 7, 4, 9, 2, 9, 9, 6, 5, 1,
Review comment (Collaborator):

Can you add some more unit tests for the sample_entropy?
I am missing:

  • short time series
  • negative values in time series
  • only negative values in time series
  • np.NaN in time series
  • etc.

Reply (Collaborator Author):

I added more tests and some documentation on the tests in my last commit.

3, 8, 1, 5, 3, 8, 4, 1, 2, 2, 1, 6, 5, 3, 6, 5, 4, 8, 9, 6, 7, 5, 3, 2, 5, 4, 2, 5, 1, 6, 5, 3, 5, 6, 7,
8, 5, 2, 8, 6, 3, 8, 2, 7, 1, 7, 3, 5, 6, 2, 1, 3, 7, 3, 5, 3, 7, 6, 7, 7, 2, 3, 1, 7, 8]
self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 2.21187685)
self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 2.38262780)
# This is not very complex, so it gives a small value
ts = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 0.25131442)
# however adding a 2 increases complexity
ts = [1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 0.74193734)
# and it does not matter where
ts = [1, 1, 1, 2, 1, 1, 1, 1, 1, 1]
self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 0.74193734)
# negative numbers also work
ts = [1, -1, 1, -1, 1, -1]
self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 0.69314718)
# nan gives nan
ts = [1, -1, 1, np.nan, 1, -1]
self.assertIsNanOnAllArrayTypes(sample_entropy, ts)
# this is not a very "random" list, so it should give a small entropy
ts = list(range(1000))
self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 0.0010314596066622707)

def test_autocorrelation(self):
self.assertAlmostEqualOnAllArrayTypes(autocorrelation, [1, 2, 1, 2, 1, 2], -1, 1)
72 changes: 36 additions & 36 deletions tsfresh/feature_extraction/feature_calculators.py
@@ -1539,40 +1539,42 @@ def sample_entropy(x):
"""
x = np.array(x)

sample_length = 1 # number of sequential points of the time series
tolerance = 0.2 * np.std(x) # 0.2 is a common value for r - why?
# if one of the values is NaN, we can not compute anything meaningful
if np.isnan(x).any():
return np.nan

n = len(x)
prev = np.zeros(n)
curr = np.zeros(n)
A = np.zeros((1, 1)) # number of matches for m = [1,...,template_length - 1]
B = np.zeros((1, 1)) # number of matches for m = [1,...,template_length]

for i in range(n - 1):
nj = n - i - 1
ts1 = x[i]
for jj in range(nj):
j = jj + i + 1
if abs(x[j] - ts1) < tolerance: # distance between two vectors
curr[jj] = prev[jj] + 1
temp_ts_length = min(sample_length, curr[jj])
for m in range(int(temp_ts_length)):
A[m] += 1
if j < n - 1:
B[m] += 1
else:
curr[jj] = 0
for j in range(nj):
prev[j] = curr[j]
m = 2 # common value for m, according to wikipedia...
tolerance = 0.2 * np.std(x) # 0.2 is a common value for r, according to wikipedia...

N = len(x)

# Split time series and save all templates of length m
# Basically we turn [1, 2, 3, 4] into [1, 2], [2, 3], [3, 4]
xm = np.array([x[i:i + m] for i in range(N - m + 1)])

# Now calculate the maximum distance between each of those pairs
# np.abs(xmi - xm).max(axis=1)
# and check how many are below the tolerance.
# For speed reasons, we are not doing this in a nested for loop,
# but with numpy magic.
# Example:
# if x = [1, 2, 3]
# then xm = [[1, 2], [2, 3]]
# so we will subtract xm from [1, 2] => [[0, 0], [-1, -1]]
# and from [2, 3] => [[1, 1], [0, 0]]
# taking the abs and max gives us:
# [0, 1] and [1, 0]
# as the diagonal elements are always 0, we subtract 1.
B = np.sum([np.sum(np.abs(xmi - xm).max(axis=1) <= tolerance) - 1 for xmi in xm])

# Similar for computing A
m += 1
xmp1 = np.array([x[i:i + m] for i in range(N - m + 1)])

N = n * (n - 1) / 2
B = np.vstack(([N], B[0]))
A = np.sum([np.sum(np.abs(xmi - xmp1).max(axis=1) <= tolerance) - 1 for xmi in xmp1])

# sample entropy = -1 * (log (A/B))
similarity_ratio = A / B
se = -1 * np.log(similarity_ratio)
se = np.reshape(se, -1)
return se[0]
# Return SampEn
return -np.log(A / B)
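
To see what the vectorised counting above computes, here is a standalone sketch (not part of the changed file) that reproduces one of the new test values: with m = 2 and tolerance = 0.2 * std, the series [1, 1, 2, 1, 1, 1, 1, 1, 1, 1] gives B = 42 matching template pairs of length m and A = 20 of length m + 1, so SampEn = -log(A / B) = log(42 / 20) ≈ 0.7419, the value asserted in test_sample_entropy.

import numpy as np

x = np.array([1., 1., 2., 1., 1., 1., 1., 1., 1., 1.])
m = 2
tolerance = 0.2 * np.std(x)

# all overlapping templates of length m and of length m + 1
xm = np.array([x[i:i + m] for i in range(len(x) - m + 1)])
xmp1 = np.array([x[i:i + m + 1] for i in range(len(x) - m)])

# count pairs of templates whose Chebyshev distance is within the tolerance;
# the "- 1" drops the trivial self-match of every template with itself
B = np.sum([np.sum(np.abs(t - xm).max(axis=1) <= tolerance) - 1 for t in xm])
A = np.sum([np.sum(np.abs(t - xmp1).max(axis=1) <= tolerance) - 1 for t in xmp1])

print(B, A, -np.log(A / B))  # 42 20 0.7419373447293773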


@set_property("fctype", "simple")
@@ -1620,7 +1622,6 @@ def autocorrelation(x, lag):


@set_property("fctype", "simple")
@set_property("input", "pd.Series")
def quantile(x, q):
"""
Calculates the q quantile of x. This is the value of x greater than q% of the ordered values from x.
@@ -1632,9 +1633,9 @@
:return: the value of this feature
:return type: float
"""
if not isinstance(x, pd.Series):
x = pd.Series(x)
return pd.Series.quantile(x, q)
if len(x) == 0:
return np.NaN
return np.quantile(x, q)
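
The switch from pd.Series.quantile to np.quantile is also why the numpy requirement is bumped to 1.15.1 in requirements.txt and .travis.yml: np.quantile only exists from numpy 1.15 on. A small illustrative sketch (not part of the diff) of what changes for the caller: both paths agree on non-empty input, and the new len(x) == 0 guard keeps the empty case well defined.

import numpy as np
import pandas as pd

x = [1.0, 2.0, 3.0, 4.0]
# old and new code path agree on non-empty input
# (both default to linear interpolation between the order statistics)
print(pd.Series(x).quantile(0.25), np.quantile(x, 0.25))  # 1.75 1.75

# the explicit guard in the new calculator short-circuits empty input to NaN
empty = []
print(np.NaN if len(empty) == 0 else np.quantile(empty, 0.25))  # nan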


@set_property("fctype", "simple")
@@ -1961,7 +1962,6 @@ def energy_ratio_by_chunks(x, param):
@set_property("fctype", "combiner")
@set_property("input", "pd.Series")
@set_property("index_type", pd.DatetimeIndex)
@set_property("high_comp_cost", True)
def linear_trend_timewise(x, param):
"""
Calculate a linear least-squares regression for the values of the time series versus the sequence from 0 to
167 changes: 167 additions & 0 deletions tsfresh/scripts/measure_execution_time.py
@@ -0,0 +1,167 @@
# This script extracts the execution time for
# various different settings of tsfresh
# using different input data
# Attention: it will run for ~half a day
Review comment (Collaborator):

on how many cores?

Reply (Collaborator Author):

On two. I have described the machine setup on my blog-post: https://nils-braun.github.io/execution-time/

Follow-up reply (Collaborator Author):

Sorry, maybe to be more clear:
it does not matter on how many cores, as the number of cores is a parameter which is set to 0, 1 and 4 for the tests (to see the scaling).
But I did my studies on a google cloud 2-core (4 threads) virtual machine.

# Do these calculations in a controlled environment
# (e.g. a cloud provider VM)
# You will need to have b2luigi installed.
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters, extract_features

import pandas as pd
import numpy as np
from time import time
import b2luigi as luigi
import json


class DataCreationTask(luigi.Task):
    """Create random data for testing"""
    num_ids = luigi.IntParameter(default=100)
    time_series_length = luigi.IntParameter()
    random_seed = luigi.IntParameter()

    def output(self):
        yield self.add_to_output("data.csv")

    def run(self):
        np.random.seed(self.random_seed)

        df = pd.concat([
            pd.DataFrame({
                "id": [i] * self.time_series_length,
                "time": range(self.time_series_length),
                "value": np.random.randn(self.time_series_length)
            })
            for i in range(self.num_ids)
        ])

        with self._get_output_target("data.csv").open("w") as f:
            df.to_csv(f)
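
For orientation, the task above writes the "long" frame layout that extract_features expects, one row per (id, time) pair. A tiny illustration with assumed parameters (num_ids=2, time_series_length=3), not part of the script:

import numpy as np
import pandas as pd

np.random.seed(42)
df = pd.concat([
    pd.DataFrame({"id": [i] * 3, "time": range(3), "value": np.random.randn(3)})
    for i in range(2)
])
print(df)  # columns id, time, value; three rows for id 0 followed by three rows for id 1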


@luigi.requires(DataCreationTask)
class TimingTask(luigi.Task):
    """Run tsfresh with the given parameters"""
    feature_parameter = luigi.DictParameter(hashed=True)
    n_jobs = luigi.IntParameter()
    try_number = luigi.IntParameter()

    def output(self):
        yield self.add_to_output("result.json")

    def run(self):
        input_file = self._get_input_targets("data.csv")[0]

        with input_file.open("r") as f:
            df = pd.read_csv(f)

        start_time = time()
        extract_features(df, column_id="id", column_sort="time", n_jobs=self.n_jobs,
                         default_fc_parameters=self.feature_parameter,
                         disable_progressbar=True)
        end_time = time()

        single_parameter_name = list(self.feature_parameter.keys())[0]
        single_parameter_params = self.feature_parameter[single_parameter_name]

        result_json = {
            "time": end_time - start_time,
            "n_ids": self.num_ids,
            "n_jobs": self.n_jobs,
            "feature": single_parameter_name,
            "number_parameters": len(single_parameter_params) if single_parameter_params else 0,
            "time_series_length": int((df["id"] == 0).sum()),
            "try_number": self.try_number,
        }

        with self._get_output_target("result.json").open("w") as f:
            json.dump(result_json, f)


@luigi.requires(DataCreationTask)
class FullTimingTask(luigi.Task):
    """Run tsfresh with all calculators for comparison"""
    n_jobs = luigi.IntParameter()

    def output(self):
        yield self.add_to_output("result.json")

    def run(self):
        input_file = self._get_input_targets("data.csv")[0]

        with input_file.open("r") as f:
            df = pd.read_csv(f)

        start_time = time()
        extract_features(df, column_id="id", column_sort="time", n_jobs=self.n_jobs,
                         disable_progressbar=True)
        end_time = time()

        result_json = {
            "time": end_time - start_time,
            "n_ids": self.num_ids,
            "n_jobs": self.n_jobs,
            "time_series_length": int((df["id"] == 0).sum()),
        }

        with self._get_output_target("result.json").open("w") as f:
            json.dump(result_json, f)


class CombinerTask(luigi.Task):
    """Collect all tasks into a single result.csv file"""
    def complete(self):
        return False

    def requires(self):
        settings = ComprehensiveFCParameters()
        for job in [0, 1, 4]:
            for time_series_length in [100, 500, 1000, 5000]:
                yield FullTimingTask(time_series_length=time_series_length,
                                     n_jobs=job,
                                     num_ids=10,
                                     random_seed=42)
                yield FullTimingTask(time_series_length=time_series_length,
                                     n_jobs=job,
                                     num_ids=100,
                                     random_seed=42)

                for feature_name in settings:
                    yield TimingTask(
                        feature_parameter={feature_name: settings[feature_name]},
                        time_series_length=time_series_length,
                        n_jobs=job,
                        num_ids=100,
                        try_number=0,
                        random_seed=42
                    )

                    for try_number in range(3):
                        yield TimingTask(
                            feature_parameter={feature_name: settings[feature_name]},
                            n_jobs=job,
                            try_number=try_number,
                            num_ids=10,
                            time_series_length=time_series_length,
                            random_seed=42
                        )

    def output(self):
        yield self.add_to_output("results.csv")

    def run(self):
        results = []

        for input_file in self._get_input_targets("result.json"):
            with input_file.open("r") as f:
                results.append(json.load(f))

        df = pd.DataFrame(results)

        with self._get_output_target("results.csv").open("w") as f:
            df.to_csv(f)


if __name__ == "__main__":
    luigi.set_setting("result_path", "results")
    luigi.process(CombinerTask())
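
Once CombinerTask has run, the collected results.csv can be summarised with a few lines of pandas, for example to rank the calculators by mean runtime. The file path below is an assumption (b2luigi places outputs below the configured result_path), so adjust it to wherever your run writes the file:

import pandas as pd

df = pd.read_csv("results/results.csv")  # assumed location, see result_path above

# mean execution time per feature calculator and per n_jobs setting,
# slowest calculators first (FullTimingTask rows have no "feature" and drop out)
summary = (df.groupby(["feature", "n_jobs"])["time"]
             .mean()
             .unstack("n_jobs")
             .sort_values(by=1, ascending=False))
print(summary.head(10))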