Improved speed #681

Merged: 24 commits, May 11, 2020

Commits
52c1a69
Started refactoring the rolling function
nils-braun Apr 13, 2020
0d5fd4e
Unfinished test fix
nils-braun Apr 13, 2020
b0006cd
Fixed remaining rolling tests
nils-braun Apr 13, 2020
c4e90a8
Correct the shifted values and the index
nils-braun Apr 13, 2020
3c3c0e9
Added test for min timeshift
nils-braun Apr 13, 2020
ffae951
Added parallelization to the rolling function
nils-braun Apr 13, 2020
c480005
Be python 3.5 compatible...
nils-braun Apr 14, 2020
304f450
Reworked the rolling documentation
nils-braun Apr 16, 2020
7bf7b9e
Copy-paste error
nils-braun Apr 16, 2020
08e7343
Improved forecasting documentation
Apr 16, 2020
c6e41e2
pep8ify
nils-braun Apr 16, 2020
ab24c93
Speed up quantile calculation
nils-braun Apr 20, 2020
ce493e5
Replace sample_entropy by corrected and faster reference implementati…
nils-braun Apr 20, 2020
c44080c
Added task file for testing
nils-braun Apr 22, 2020
d0465c6
Bump the requirement for numpy to use the quantile function
nils-braun Apr 29, 2020
bc59a0d
Move to correct location
nils-braun May 1, 2020
9c79823
Merge remote-tracking branch 'origin/master' into feature/improved-speed
nils-braun May 1, 2020
b209b93
pep8ify
nils-braun May 1, 2020
85ad39b
Does not make sense to include script in coverage
nils-braun May 2, 2020
51e0ccd
The linear_trend_timewise is not a high computation costs calculator …
nils-braun May 3, 2020
b1499da
Remove unneeded import
nils-braun May 3, 2020
ce15784
Make the random-seed setting more explicit
nils-braun May 9, 2020
888fb98
Added more tests, more documentation and fixed another bug in the imp…
nils-braun May 9, 2020
b3fa7e8
Added changelog
nils-braun May 11, 2020
1 change: 1 addition & 0 deletions .coveragerc
@@ -14,6 +14,7 @@ omit = tsfresh/utilities/profiling.py
     tsfresh/examples/driftbif_simulation.py
     tsfresh/examples/test_tsfresh_baseline_dataset.py
     tsfresh/scripts/test_timing.py
+    tsfresh/scripts/measure_execution_time.py

[report]
# Regexes for lines to exclude from consideration
4 changes: 2 additions & 2 deletions .travis.yml
@@ -69,10 +69,10 @@ jobs:
       # python 3.7 requires pandas >= 0.23.2
       python: 3.7

-  - env: NUMPY="1.12.0" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0" STATSMODELS="0.9.0"
+  - env: NUMPY="1.15.1" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0" STATSMODELS="0.9.0"
    python: 3.6

-  - env: NUMPY="1.12.0" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0" STATSMODELS="0.9.0"
+  - env: NUMPY="1.15.1" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0" STATSMODELS="0.9.0"
    python: 3.5.3
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,5 +1,5 @@
 requests>=2.9.1
-numpy>=1.12.0
+numpy>=1.15.1
 pandas>=0.20.3,!=0.24.*  # pandas dropna is buggy in 0.24.0, see https://github.com/blue-yonder/tsfresh/issues/485 and https://github.com/pandas-dev/pandas/issues/25087
 scipy>=1.2.0
 statsmodels>=0.8.0
@@ -794,7 +794,7 @@ def test_sample_entropy(self):
ts = [1, 4, 5, 1, 7, 3, 1, 2, 5, 8, 9, 7, 3, 7, 9, 5, 4, 3, 9, 1, 2, 3, 4, 2, 9, 6, 7, 4, 9, 2, 9, 9, 6, 5, 1,
      3, 8, 1, 5, 3, 8, 4, 1, 2, 2, 1, 6, 5, 3, 6, 5, 4, 8, 9, 6, 7, 5, 3, 2, 5, 4, 2, 5, 1, 6, 5, 3, 5, 6, 7,
      8, 5, 2, 8, 6, 3, 8, 2, 7, 1, 7, 3, 5, 6, 2, 1, 3, 7, 3, 5, 3, 7, 6, 7, 7, 2, 3, 1, 7, 8]
-        self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 2.21187685)
+        self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 2.374905754573672)

Collaborator:
Can you add some more unit tests for sample_entropy? I am missing:

  • short time series
  • negative values in the time series
  • only negative values in the time series
  • np.NaN in the time series
  • etc.

Collaborator (Author):
I added more tests and some documentation on the tests in my last commit.

def test_autocorrelation(self):
self.assertAlmostEqualOnAllArrayTypes(autocorrelation, [1, 2, 1, 2, 1, 2], -1, 1)
56 changes: 20 additions & 36 deletions tsfresh/feature_extraction/feature_calculators.py
@@ -1539,40 +1539,26 @@ def sample_entropy(x):
"""
x = np.array(x)

sample_length = 1 # number of sequential points of the time series
tolerance = 0.2 * np.std(x) # 0.2 is a common value for r - why?
m = 2 # common value for m, according to wikipedia...
tolerance = 0.2 * np.std(x) # 0.2 is a common value for r, according to wikipedia...

n = len(x)
prev = np.zeros(n)
curr = np.zeros(n)
A = np.zeros((1, 1)) # number of matches for m = [1,...,template_length - 1]
B = np.zeros((1, 1)) # number of matches for m = [1,...,template_length]

for i in range(n - 1):
nj = n - i - 1
ts1 = x[i]
for jj in range(nj):
j = jj + i + 1
if abs(x[j] - ts1) < tolerance: # distance between two vectors
curr[jj] = prev[jj] + 1
temp_ts_length = min(sample_length, curr[jj])
for m in range(int(temp_ts_length)):
A[m] += 1
if j < n - 1:
B[m] += 1
else:
curr[jj] = 0
for j in range(nj):
prev[j] = curr[j]
N = len(x)

# Split time series and save all templates of length m
xmi = np.array([x[i:i + m] for i in range(N - m)])
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. np.split splits without overlap.
We want to turn the array [1, 2, 3, 4] into [1, 2], [2, 3], [3, 4] but np.split would only give [1, 2], [3, 4].

xmj = np.array([x[i:i + m] for i in range(N - m + 1)])

N = n * (n - 1) / 2
B = np.vstack(([N], B[0]))
# Save all matches minus the self-match, compute B
B = np.sum([np.sum(np.abs(xmii - xmj).max(axis=1) <= tolerance) - 1 for xmii in xmi])

# sample entropy = -1 * (log (A/B))
similarity_ratio = A / B
se = -1 * np.log(similarity_ratio)
se = np.reshape(se, -1)
return se[0]
# Similar for computing A
m += 1
xm = np.array([x[i:i + m] for i in range(N - m + 1)])

A = np.sum([np.sum(np.abs(xmi - xm).max(axis=1) <= tolerance) - 1 for xmi in xm])

# Return SampEn
return -np.log(A / B)


@set_property("fctype", "simple")
Expand Down Expand Up @@ -1620,7 +1606,6 @@ def autocorrelation(x, lag):


@set_property("fctype", "simple")
@set_property("input", "pd.Series")
def quantile(x, q):
"""
Calculates the q quantile of x. This is the value of x greater than q% of the ordered values from x.
@@ -1632,9 +1617,9 @@ def quantile(x, q):
:return: the value of this feature
:return type: float
"""
-    if not isinstance(x, pd.Series):
-        x = pd.Series(x)
-    return pd.Series.quantile(x, q)
+    if len(x) == 0:
+        return np.NaN
+    return np.quantile(x, q)


@set_property("fctype", "simple")
@@ -1961,7 +1946,6 @@ def energy_ratio_by_chunks(x, param):
@set_property("fctype", "combiner")
@set_property("input", "pd.Series")
@set_property("index_type", pd.DatetimeIndex)
@set_property("high_comp_cost", True)
def linear_trend_timewise(x, param):
"""
Calculate a linear least-squares regression for the values of the time series versus the sequence from 0 to
162 changes: 162 additions & 0 deletions tsfresh/scripts/measure_execution_time.py
@@ -0,0 +1,162 @@
# This script extracts the execution time for
# various different settings of tsfresh
# using different input data
# Attention: it will run for ~half a day
Collaborator:
On how many cores?

Collaborator (Author):
On two. I have described the machine setup in my blog post: https://nils-braun.github.io/execution-time/

Collaborator (Author):
Sorry, maybe to be more clear: it does not matter on how many cores, as the number of cores is a parameter, which is set to 0, 1 and 4 for the tests (to see the scaling). But I did my studies on a Google Cloud 2-core (4-thread) virtual machine.

# Do these calculations in a controlled environment
# (e.g. a cloud provider VM)
# You will need to have b2luigi installed.
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters, extract_features

import pandas as pd
import numpy as np
from time import time
import b2luigi as luigi
import json

np.random.seed(42)


class DataCreationTask(luigi.Task):
"""Create random data for testing"""
num_ids = luigi.IntParameter(default=100)
time_series_length = luigi.IntParameter()

def output(self):
yield self.add_to_output("data.csv")

def run(self):
df = pd.concat([
pd.DataFrame({
"id": [i] * self.time_series_length,
"time": range(self.time_series_length),
"value": np.random.randn(self.time_series_length)
})
for i in range(self.num_ids)
])

with self._get_output_target("data.csv").open("w") as f:
df.to_csv(f)


@luigi.requires(DataCreationTask)
class TimingTask(luigi.Task):
"""Run tsfresh with the given parameters"""
feature_parameter = luigi.DictParameter(hashed=True)
n_jobs = luigi.IntParameter()
try_number = luigi.IntParameter()

def output(self):
yield self.add_to_output("result.json")

def run(self):
input_file = self._get_input_targets("data.csv")[0]

with input_file.open("r") as f:
df = pd.read_csv(f)

start_time = time()
extract_features(df, column_id="id", column_sort="time", n_jobs=self.n_jobs,
default_fc_parameters=self.feature_parameter,
disable_progressbar=True)
end_time = time()

single_parameter_name = list(self.feature_parameter.keys())[0]
single_parameter_params = self.feature_parameter[single_parameter_name]

result_json = {
"time": end_time - start_time,
"n_ids": self.num_ids,
"n_jobs": self.n_jobs,
"feature": single_parameter_name,
"number_parameters": len(single_parameter_params) if single_parameter_params else 0,
"time_series_length": int((df["id"] == 0).sum()),
"try_number": self.try_number,
}

with self._get_output_target("result.json").open("w") as f:
json.dump(result_json, f)


@luigi.requires(DataCreationTask)
class FullTimingTask(luigi.Task):
"""Run tsfresh with all calculators for comparison"""
n_jobs = luigi.IntParameter()

def output(self):
yield self.add_to_output("result.json")

def run(self):
input_file = self._get_input_targets("data.csv")[0]

with input_file.open("r") as f:
df = pd.read_csv(f)

start_time = time()
extract_features(df, column_id="id", column_sort="time", n_jobs=self.n_jobs,
disable_progressbar=True)
end_time = time()

result_json = {
"time": end_time - start_time,
"n_ids": self.num_ids,
"n_jobs": self.n_jobs,
"time_series_length": int((df["id"] == 0).sum()),
}

with self._get_output_target("result.json").open("w") as f:
json.dump(result_json, f)


class CombinerTask(luigi.Task):
"""Collect all tasks into a single result.csv file"""
def complete(self):
return False

def requires(self):
settings = ComprehensiveFCParameters()
for job in [0, 1, 4]:
for time_series_length in [100, 500, 1000, 5000]:
yield FullTimingTask(time_series_length=time_series_length,
n_jobs=job,
num_ids=10)
yield FullTimingTask(time_series_length=time_series_length,
n_jobs=job,
num_ids=100)

for feature_name in settings:
yield TimingTask(
feature_parameter={feature_name: settings[feature_name]},
time_series_length=time_series_length,
n_jobs=job,
num_ids=100,
try_number=0,
)

for try_number in range(3):
yield TimingTask(
feature_parameter={feature_name: settings[feature_name]},
n_jobs=job,
try_number=try_number,
num_ids=10,
time_series_length=time_series_length
)

def output(self):
yield self.add_to_output("results.csv")

def run(self):
results = []

for input_file in self._get_input_targets("result.json"):
with input_file.open("r") as f:
results.append(json.load(f))

df = pd.DataFrame(results)

with self._get_output_target("results.csv").open("w") as f:
df.to_csv(f)


if __name__ == "__main__":
luigi.set_setting("result_path", "results")
luigi.process(CombinerTask())
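As a possible follow-up (not part of this PR), the collected timings can be loaded back for analysis. The exact file location depends on how b2luigi lays out the "results" folder, so the path below is an assumption:

import pandas as pd

# Hypothetical analysis snippet; adjust the path to wherever b2luigi
# placed the results.csv written by CombinerTask above.
# Rows from FullTimingTask carry no "feature" value and are
# dropped by the groupby below.
df = pd.read_csv("results/results.csv", index_col=0)
print(df.groupby(["feature", "n_jobs"])["time"].mean())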