Improved speed (blue-yonder#681)
* Started refactoring the rolling function

* Unfinished test fix

* Fixed remaining rolling tests

* Correct the shifted values and the index

* Added test for min timeshift

* Added parallelization to the rolling function

* Be python 3.5 compatible...

* Reworked the rolling documentation

* Copy-paste error

* Improved forecasting documentation

* pep8ify

* Speed up quantile calculation

* Replace sample_entropy with a corrected and faster reference implementation from Wikipedia

* Added task file for testing

* Bump the requirement for numpy to use the quantile function

* Move to correct location

* pep8ify

* Does not make sense to include script in coverage

* linear_trend_timewise is no longer flagged as a high-computation-cost calculator

* Remove unneeded import

* Make the random-seed setting more explicit

* Added more tests, more documentation and fixed another bug in the implementation

* Added changelog

Co-authored-by: Nils Braun <nilslennartbraun@gmail.com>
Co-authored-by: Sophie Walther <sophie.n.walther@googlemail.com>
3 people authored May 11, 2020
1 parent 5bdbbcf commit 8106334
Showing 7 changed files with 231 additions and 40 deletions.
1 change: 1 addition & 0 deletions .coveragerc
@@ -14,6 +14,7 @@ omit = tsfresh/utilities/profiling.py
tsfresh/examples/driftbif_simulation.py
tsfresh/examples/test_tsfresh_baseline_dataset.py
tsfresh/scripts/test_timing.py
tsfresh/scripts/measure_execution_time.py

[report]
# Regexes for lines to exclude from consideration
4 changes: 2 additions & 2 deletions .travis.yml
@@ -69,10 +69,10 @@ jobs:
# python 3.7 requires pandas >= 0.23.2
python: 3.7

- env: NUMPY="1.12.0" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0" STATSMODELS="0.9.0"
- env: NUMPY="1.15.1" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0" STATSMODELS="0.9.0"
python: 3.6

- env: NUMPY="1.12.0" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0" STATSMODELS="0.9.0"
- env: NUMPY="1.15.1" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0" STATSMODELS="0.9.0"
python: 3.5.3


4 changes: 4 additions & 0 deletions CHANGES.rst
@@ -23,9 +23,13 @@ Unreleased
- Optimize RelevantFeatureAugmenter to avoid re-extraction (#669)
- Added a function `add_sub_time_series_index` (#666)
- Added Dockerfile
- Speed optimizations and speed testing script (#681)
- Bugfixes
- Increase the extracted `ar` coefficients to the full parameter range. (#662)
- Documentation fixes (#663, #664, #665)
- Rewrote the `sample_entropy` feature calculator (#681)
It is now faster and (hopefully) more correct.
But your results will change!


Version 0.15.1
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,5 +1,5 @@
requests>=2.9.1
numpy>=1.12.0
numpy>=1.15.1
pandas>=0.20.3,!=0.24.* # pandas dropna is buggy in 0.24.0, see https://github.com/blue-yonder/tsfresh/issues/485 and https://github.com/pandas-dev/pandas/issues/25087
scipy>=1.2.0
statsmodels>=0.8.0
21 changes: 20 additions & 1 deletion tests/units/feature_extraction/test_feature_calculations.py
@@ -791,10 +791,29 @@ def test_binned_entropy(self):
self.assertAlmostEqualOnAllArrayTypes(binned_entropy, list(range(100)), - np.math.log(1 / 2), 2)

def test_sample_entropy(self):
# "random" list -> large entropy
ts = [1, 4, 5, 1, 7, 3, 1, 2, 5, 8, 9, 7, 3, 7, 9, 5, 4, 3, 9, 1, 2, 3, 4, 2, 9, 6, 7, 4, 9, 2, 9, 9, 6, 5, 1,
3, 8, 1, 5, 3, 8, 4, 1, 2, 2, 1, 6, 5, 3, 6, 5, 4, 8, 9, 6, 7, 5, 3, 2, 5, 4, 2, 5, 1, 6, 5, 3, 5, 6, 7,
8, 5, 2, 8, 6, 3, 8, 2, 7, 1, 7, 3, 5, 6, 2, 1, 3, 7, 3, 5, 3, 7, 6, 7, 7, 2, 3, 1, 7, 8]
self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 2.21187685)
self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 2.38262780)
# This is not very complex, so it gives a small value
ts = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 0.25131442)
# however adding a 2 increases complexity
ts = [1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 0.74193734)
# and it does not matter where
ts = [1, 1, 1, 2, 1, 1, 1, 1, 1, 1]
self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 0.74193734)
# negative numbers also work
ts = [1, -1, 1, -1, 1, -1]
self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 0.69314718)
# nan gives nan
ts = [1, -1, 1, np.nan, 1, -1]
self.assertIsNanOnAllArrayTypes(sample_entropy, ts)
# this is not a very "random" list, so it should give a small entropy
ts = list(range(1000))
self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 0.0010314596066622707)
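
For reference (not part of the diff), the constant-series value above can be checked by hand: with tolerance = 0.2 * std = 0, each of the 9 length-2 templates of [1] * 10 matches the other 8 exactly (B = 9 * 8 = 72) and each of the 8 length-3 templates matches the other 7 (A = 8 * 7 = 56), so SampEn = -log(56 / 72) = log(9 / 7) ≈ 0.2513, the value asserted for the all-ones series.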

def test_autocorrelation(self):
self.assertAlmostEqualOnAllArrayTypes(autocorrelation, [1, 2, 1, 2, 1, 2], -1, 1)
72 changes: 36 additions & 36 deletions tsfresh/feature_extraction/feature_calculators.py
@@ -1539,40 +1539,42 @@ def sample_entropy(x):
"""
x = np.array(x)

sample_length = 1 # number of sequential points of the time series
tolerance = 0.2 * np.std(x) # 0.2 is a common value for r - why?
# if one of the values is NaN, we can not compute anything meaningful
if np.isnan(x).any():
return np.nan

n = len(x)
prev = np.zeros(n)
curr = np.zeros(n)
A = np.zeros((1, 1)) # number of matches for m = [1,...,template_length - 1]
B = np.zeros((1, 1)) # number of matches for m = [1,...,template_length]

for i in range(n - 1):
nj = n - i - 1
ts1 = x[i]
for jj in range(nj):
j = jj + i + 1
if abs(x[j] - ts1) < tolerance: # distance between two vectors
curr[jj] = prev[jj] + 1
temp_ts_length = min(sample_length, curr[jj])
for m in range(int(temp_ts_length)):
A[m] += 1
if j < n - 1:
B[m] += 1
else:
curr[jj] = 0
for j in range(nj):
prev[j] = curr[j]
m = 2 # common value for m, according to wikipedia...
tolerance = 0.2 * np.std(x) # 0.2 is a common value for r, according to wikipedia...

N = len(x)

# Split time series and save all templates of length m
# Basically we turn [1, 2, 3, 4] into [1, 2], [2, 3], [3, 4]
xm = np.array([x[i:i + m] for i in range(N - m + 1)])

# Now calculate the maximum distance between each of those pairs
# np.abs(xmi - xm).max(axis=1)
# and check how many are below the tolerance.
# For speed reasons, we are not doing this in a nested for loop,
# but with numpy magic.
# Example:
# if x = [1, 2, 3]
# then xm = [[1, 2], [2, 3]]
# so we will subtract xm from [1, 2] => [[0, 0], [-1, -1]]
# and from [2, 3] => [[1, 1], [0, 0]]
# taking the abs and max gives us:
# [0, 1] and [1, 0]
# as the diagonal elements are always 0, we subtract 1.
B = np.sum([np.sum(np.abs(xmi - xm).max(axis=1) <= tolerance) - 1 for xmi in xm])

# Similar for computing A
m += 1
xmp1 = np.array([x[i:i + m] for i in range(N - m + 1)])

N = n * (n - 1) / 2
B = np.vstack(([N], B[0]))
A = np.sum([np.sum(np.abs(xmi - xmp1).max(axis=1) <= tolerance) - 1 for xmi in xmp1])

# sample entropy = -1 * (log (A/B))
similarity_ratio = A / B
se = -1 * np.log(similarity_ratio)
se = np.reshape(se, -1)
return se[0]
# Return SampEn
return -np.log(A / B)
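
For illustration (not part of the diff): a minimal, numpy-only sketch of the vectorized matching step described in the comments of the new implementation above, using the x = [1, 2, 3] example from those comments; the tolerance here is an arbitrary illustrative value, not the 0.2 * np.std(x) used by the calculator.

    import numpy as np

    x = np.array([1, 2, 3])
    m = 2
    tolerance = 0.5  # illustrative only

    # all templates of length m: [[1, 2], [2, 3]]
    xm = np.array([x[i:i + m] for i in range(len(x) - m + 1)])

    # Chebyshev distance of each template to all templates:
    # |[1, 2] - xm| -> [[0, 0], [1, 1]] -> max over axis 1 -> [0, 1]
    # |[2, 3] - xm| -> [[1, 1], [0, 0]] -> max over axis 1 -> [1, 0]
    # count distances <= tolerance and subtract 1 for the self-match (the diagonal zero)
    B = np.sum([np.sum(np.abs(xmi - xm).max(axis=1) <= tolerance) - 1 for xmi in xm])
    print(B)  # 0: no two distinct templates lie within the tolerance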


@set_property("fctype", "simple")
@@ -1620,7 +1622,6 @@ def autocorrelation(x, lag):


@set_property("fctype", "simple")
@set_property("input", "pd.Series")
def quantile(x, q):
"""
Calculates the q quantile of x. This is the value of x greater than q% of the ordered values from x.
Expand All @@ -1632,9 +1633,9 @@ def quantile(x, q):
:return: the value of this feature
:return type: float
"""
if not isinstance(x, pd.Series):
x = pd.Series(x)
return pd.Series.quantile(x, q)
if len(x) == 0:
return np.NaN
return np.quantile(x, q)
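
As a side note (not part of the diff): np.quantile and pd.Series.quantile both default to linear interpolation, so the switch should not change results; a quick sanity check, assuming numpy >= 1.15 and pandas are installed:

    import numpy as np
    import pandas as pd

    x = [1, 2, 3, 4]
    # both default to linear interpolation, so the values agree
    assert np.quantile(x, 0.25) == pd.Series(x).quantile(0.25) == 1.75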


@set_property("fctype", "simple")
@@ -1961,7 +1962,6 @@ def energy_ratio_by_chunks(x, param):
@set_property("fctype", "combiner")
@set_property("input", "pd.Series")
@set_property("index_type", pd.DatetimeIndex)
@set_property("high_comp_cost", True)
def linear_trend_timewise(x, param):
"""
Calculate a linear least-squares regression for the values of the time series versus the sequence from 0 to
167 changes: 167 additions & 0 deletions tsfresh/scripts/measure_execution_time.py
@@ -0,0 +1,167 @@
# This script extracts the execution time for
# various different settings of tsfresh
# using different input data
# Attention: it will run for ~half a day
# Do these calculations in a controlled environment
# (e.g. a cloud provider VM)
# You will need to have b2luigi installed.
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters, extract_features

import pandas as pd
import numpy as np
from time import time
import b2luigi as luigi
import json


class DataCreationTask(luigi.Task):
"""Create random data for testing"""
num_ids = luigi.IntParameter(default=100)
time_series_length = luigi.IntParameter()
random_seed = luigi.IntParameter()

def output(self):
yield self.add_to_output("data.csv")

def run(self):
np.random.seed(self.random_seed)

df = pd.concat([
pd.DataFrame({
"id": [i] * self.time_series_length,
"time": range(self.time_series_length),
"value": np.random.randn(self.time_series_length)
})
for i in range(self.num_ids)
])

with self._get_output_target("data.csv").open("w") as f:
df.to_csv(f)


@luigi.requires(DataCreationTask)
class TimingTask(luigi.Task):
"""Run tsfresh with the given parameters"""
feature_parameter = luigi.DictParameter(hashed=True)
n_jobs = luigi.IntParameter()
try_number = luigi.IntParameter()

def output(self):
yield self.add_to_output("result.json")

def run(self):
input_file = self._get_input_targets("data.csv")[0]

with input_file.open("r") as f:
df = pd.read_csv(f)

start_time = time()
extract_features(df, column_id="id", column_sort="time", n_jobs=self.n_jobs,
default_fc_parameters=self.feature_parameter,
disable_progressbar=True)
end_time = time()

single_parameter_name = list(self.feature_parameter.keys())[0]
single_parameter_params = self.feature_parameter[single_parameter_name]

result_json = {
"time": end_time - start_time,
"n_ids": self.num_ids,
"n_jobs": self.n_jobs,
"feature": single_parameter_name,
"number_parameters": len(single_parameter_params) if single_parameter_params else 0,
"time_series_length": int((df["id"] == 0).sum()),
"try_number": self.try_number,
}

with self._get_output_target("result.json").open("w") as f:
json.dump(result_json, f)


@luigi.requires(DataCreationTask)
class FullTimingTask(luigi.Task):
"""Run tsfresh with all calculators for comparison"""
n_jobs = luigi.IntParameter()

def output(self):
yield self.add_to_output("result.json")

def run(self):
input_file = self._get_input_targets("data.csv")[0]

with input_file.open("r") as f:
df = pd.read_csv(f)

start_time = time()
extract_features(df, column_id="id", column_sort="time", n_jobs=self.n_jobs,
disable_progressbar=True)
end_time = time()

result_json = {
"time": end_time - start_time,
"n_ids": self.num_ids,
"n_jobs": self.n_jobs,
"time_series_length": int((df["id"] == 0).sum()),
}

with self._get_output_target("result.json").open("w") as f:
json.dump(result_json, f)


class CombinerTask(luigi.Task):
"""Collect all tasks into a single result.csv file"""
def complete(self):
return False

def requires(self):
settings = ComprehensiveFCParameters()
for job in [0, 1, 4]:
for time_series_length in [100, 500, 1000, 5000]:
yield FullTimingTask(time_series_length=time_series_length,
n_jobs=job,
num_ids=10,
random_seed=42)
yield FullTimingTask(time_series_length=time_series_length,
n_jobs=job,
num_ids=100,
random_seed=42)

for feature_name in settings:
yield TimingTask(
feature_parameter={feature_name: settings[feature_name]},
time_series_length=time_series_length,
n_jobs=job,
num_ids=100,
try_number=0,
random_seed=42
)

for try_number in range(3):
yield TimingTask(
feature_parameter={feature_name: settings[feature_name]},
n_jobs=job,
try_number=try_number,
num_ids=10,
time_series_length=time_series_length,
random_seed=42
)

def output(self):
yield self.add_to_output("results.csv")

def run(self):
results = []

for input_file in self._get_input_targets("result.json"):
with input_file.open("r") as f:
results.append(json.load(f))

df = pd.DataFrame(results)

with self._get_output_target("results.csv").open("w") as f:
df.to_csv(f)


if __name__ == "__main__":
luigi.set_setting("result_path", "results")
luigi.process(CombinerTask())
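
The script is presumably launched directly (python tsfresh/scripts/measure_execution_time.py, with b2luigi installed). A possible way to summarize the collected timings afterwards (a sketch, assuming the combined results.csv written by CombinerTask; the exact location depends on how b2luigi nests outputs under the configured result_path):

    import pandas as pd

    # adjust the path to wherever b2luigi placed the CombinerTask output
    df = pd.read_csv("results/results.csv")

    # mean execution time per feature calculator, slowest first
    # (rows produced by FullTimingTask carry no "feature" entry and are dropped)
    per_feature = (df.dropna(subset=["feature"])
                     .groupby("feature")["time"]
                     .mean()
                     .sort_values(ascending=False))
    print(per_feature.head(10))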
