Improved speed #681
The diff touches two files. In the first, `sample_entropy` is rewritten in vectorized numpy form:

```diff
@@ -1539,40 +1539,26 @@ def sample_entropy(x):
     """
     x = np.array(x)

-    sample_length = 1  # number of sequential points of the time series
-    tolerance = 0.2 * np.std(x)  # 0.2 is a common value for r - why?
+    m = 2  # common value for m, according to wikipedia...
+    tolerance = 0.2 * np.std(x)  # 0.2 is a common value for r, according to wikipedia...

-    n = len(x)
-    prev = np.zeros(n)
-    curr = np.zeros(n)
-    A = np.zeros((1, 1))  # number of matches for m = [1,...,template_length - 1]
-    B = np.zeros((1, 1))  # number of matches for m = [1,...,template_length]
-
-    for i in range(n - 1):
-        nj = n - i - 1
-        ts1 = x[i]
-        for jj in range(nj):
-            j = jj + i + 1
-            if abs(x[j] - ts1) < tolerance:  # distance between two vectors
-                curr[jj] = prev[jj] + 1
-                temp_ts_length = min(sample_length, curr[jj])
-                for m in range(int(temp_ts_length)):
-                    A[m] += 1
-                    if j < n - 1:
-                        B[m] += 1
-            else:
-                curr[jj] = 0
-        for j in range(nj):
-            prev[j] = curr[j]
+    N = len(x)

-    N = n * (n - 1) / 2
-    B = np.vstack(([N], B[0]))
+    # Split time series and save all templates of length m
+    xmi = np.array([x[i:i + m] for i in range(N - m)])
+    xmj = np.array([x[i:i + m] for i in range(N - m + 1)])
+
+    # Save all matches minus the self-match, compute B
+    B = np.sum([np.sum(np.abs(xmii - xmj).max(axis=1) <= tolerance) - 1 for xmii in xmi])

-    # sample entropy = -1 * (log (A/B))
-    similarity_ratio = A / B
-    se = -1 * np.log(similarity_ratio)
-    se = np.reshape(se, -1)
-    return se[0]
+    # Similar for computing A
+    m += 1
+    xm = np.array([x[i:i + m] for i in range(N - m + 1)])
+
+    A = np.sum([np.sum(np.abs(xmi - xm).max(axis=1) <= tolerance) - 1 for xmi in xm])
+
+    # Return SampEn
+    return -np.log(A / B)


 @set_property("fctype", "simple")
```

An inline review thread on the template-construction lines (`xmi` / `xmj`):

Reviewer: Can't we replace those lines with https://docs.scipy.org/doc/numpy/reference/generated/numpy.split.html?
Author: No.

(`np.split` cuts an array into non-overlapping sub-arrays, while the templates here are overlapping sliding windows, so it cannot express this construction.)
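For reference (not part of the diff): both versions compute the standard sample entropy, and the vectorized code maps directly onto its definition. With embedding length $m$ and tolerance $r$,

$$\operatorname{SampEn}(m, r) = -\ln\frac{A}{B},$$

where $B$ counts pairs of distinct length-$m$ templates whose Chebyshev distance is at most $r$ (the `np.abs(...).max(axis=1) <= tolerance` check, with the `- 1` removing each self-match), and $A$ is the same count for length-$(m+1)$ templates.

A quick usage sketch of the calculator, using the standard tsfresh import path:

```python
import numpy as np
from tsfresh.feature_extraction.feature_calculators import sample_entropy

x = np.random.randn(500)
print(sample_entropy(x))  # a single float, -log(A / B)
```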
Two hunks in the same file switch `quantile` from pandas to numpy:

```diff
@@ -1620,7 +1606,6 @@ def autocorrelation(x, lag):


 @set_property("fctype", "simple")
-@set_property("input", "pd.Series")
 def quantile(x, q):
     """
     Calculates the q quantile of x. This is the value of x greater than q% of the ordered values from x.
```

```diff
@@ -1632,9 +1617,9 @@ def quantile(x, q):
     :return: the value of this feature
     :return type: float
     """
-    if not isinstance(x, pd.Series):
-        x = pd.Series(x)
-    return pd.Series.quantile(x, q)
+    if len(x) == 0:
+        return np.NaN
+    return np.quantile(x, q)


 @set_property("fctype", "simple")
```
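A short equivalence sketch (not part of the diff): for non-empty input the numpy and pandas quantiles agree, since both default to linear interpolation, but `np.quantile` skips the `pd.Series` construction overhead. The explicit empty check is needed because `np.quantile` does not return NaN on empty input the way `pd.Series.quantile` does.

```python
import numpy as np
import pandas as pd

x = [1, 2, 3, 4]

# Both default to linear interpolation, so the results agree for non-empty input.
assert np.isclose(np.quantile(x, 0.6), pd.Series(x).quantile(0.6))

# pd.Series([]).quantile(0.6) returns NaN, while np.quantile does not handle
# empty input gracefully, hence the len(x) == 0 guard in the new version.
```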
And `linear_trend_timewise` loses its `high_comp_cost` tag:

```diff
@@ -1961,7 +1946,6 @@ def energy_ratio_by_chunks(x, param):
 @set_property("fctype", "combiner")
 @set_property("input", "pd.Series")
 @set_property("index_type", pd.DatetimeIndex)
-@set_property("high_comp_cost", True)
 def linear_trend_timewise(x, param):
     """
     Calculate a linear least-squares regression for the values of the time series versus the sequence from 0 to
```
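For context: `high_comp_cost` is the tag that `EfficientFCParameters` uses to exclude expensive calculators, so removing the decorator means `linear_trend_timewise` is no longer filtered out there, presumably because the speedups in this PR make it cheap enough. A small check (assuming this branch is installed):

```python
from tsfresh.feature_extraction import EfficientFCParameters

# EfficientFCParameters drops all calculators tagged high_comp_cost; with
# the tag removed, linear_trend_timewise should show up again.
print("linear_trend_timewise" in EfficientFCParameters())  # expected: True
```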
The second file is new (`@@ -0,0 +1,162 @@`): a script that benchmarks tsfresh's execution time.
A review thread on the script's "~half a day" note:

Reviewer: On how many cores?
Author: On two. I have described the machine setup in my blog post: https://nils-braun.github.io/execution-time/
Reviewer: Sorry, maybe to be more clear:

```python
# This script extracts the execution time for
# various different settings of tsfresh
# using different input data
# Attention: it will run for ~half a day
# Do these calculations in a controlled environment
# (e.g. a cloud provider VM)
# You will need to have b2luigi installed.
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters, extract_features

import pandas as pd
import numpy as np
from time import time
import b2luigi as luigi
import json

np.random.seed(42)


class DataCreationTask(luigi.Task):
    """Create random data for testing"""
    num_ids = luigi.IntParameter(default=100)
    time_series_length = luigi.IntParameter()

    def output(self):
        yield self.add_to_output("data.csv")

    def run(self):
        df = pd.concat([
            pd.DataFrame({
                "id": [i] * self.time_series_length,
                "time": range(self.time_series_length),
                "value": np.random.randn(self.time_series_length)
            })
            for i in range(self.num_ids)
        ])

        with self._get_output_target("data.csv").open("w") as f:
            df.to_csv(f)


@luigi.requires(DataCreationTask)
class TimingTask(luigi.Task):
    """Run tsfresh with the given parameters"""
    feature_parameter = luigi.DictParameter(hashed=True)
    n_jobs = luigi.IntParameter()
    try_number = luigi.IntParameter()

    def output(self):
        yield self.add_to_output("result.json")

    def run(self):
        input_file = self._get_input_targets("data.csv")[0]

        with input_file.open("r") as f:
            df = pd.read_csv(f)

        start_time = time()
        extract_features(df, column_id="id", column_sort="time", n_jobs=self.n_jobs,
                         default_fc_parameters=self.feature_parameter,
                         disable_progressbar=True)
        end_time = time()

        single_parameter_name = list(self.feature_parameter.keys())[0]
        single_parameter_params = self.feature_parameter[single_parameter_name]

        result_json = {
            "time": end_time - start_time,
            "n_ids": self.num_ids,
            "n_jobs": self.n_jobs,
            "feature": single_parameter_name,
            "number_parameters": len(single_parameter_params) if single_parameter_params else 0,
            "time_series_length": int((df["id"] == 0).sum()),
            "try_number": self.try_number,
        }

        with self._get_output_target("result.json").open("w") as f:
            json.dump(result_json, f)


@luigi.requires(DataCreationTask)
class FullTimingTask(luigi.Task):
    """Run tsfresh with all calculators for comparison"""
    n_jobs = luigi.IntParameter()

    def output(self):
        yield self.add_to_output("result.json")

    def run(self):
        input_file = self._get_input_targets("data.csv")[0]

        with input_file.open("r") as f:
            df = pd.read_csv(f)

        start_time = time()
        extract_features(df, column_id="id", column_sort="time", n_jobs=self.n_jobs,
                         disable_progressbar=True)
        end_time = time()

        result_json = {
            "time": end_time - start_time,
            "n_ids": self.num_ids,
            "n_jobs": self.n_jobs,
            "time_series_length": int((df["id"] == 0).sum()),
        }

        with self._get_output_target("result.json").open("w") as f:
            json.dump(result_json, f)


class CombinerTask(luigi.Task):
    """Collect all tasks into a single result.csv file"""
    def complete(self):
        return False

    def requires(self):
        settings = ComprehensiveFCParameters()
        for job in [0, 1, 4]:
            for time_series_length in [100, 500, 1000, 5000]:
                yield FullTimingTask(time_series_length=time_series_length,
                                     n_jobs=job,
                                     num_ids=10)
                yield FullTimingTask(time_series_length=time_series_length,
                                     n_jobs=job,
                                     num_ids=100)

                for feature_name in settings:
                    yield TimingTask(
                        feature_parameter={feature_name: settings[feature_name]},
                        time_series_length=time_series_length,
                        n_jobs=job,
                        num_ids=100,
                        try_number=0,
                    )

                    for try_number in range(3):
                        yield TimingTask(
                            feature_parameter={feature_name: settings[feature_name]},
                            n_jobs=job,
                            try_number=try_number,
                            num_ids=10,
                            time_series_length=time_series_length
                        )

    def output(self):
        yield self.add_to_output("results.csv")

    def run(self):
        results = []

        for input_file in self._get_input_targets("result.json"):
            with input_file.open("r") as f:
                results.append(json.load(f))

        df = pd.DataFrame(results)

        with self._get_output_target("results.csv").open("w") as f:
            df.to_csv(f)


if __name__ == "__main__":
    luigi.set_setting("result_path", "results")
    luigi.process(CombinerTask())
```
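Once the pipeline has run, the combined timings end up in a single CSV. A hypothetical follow-up analysis (the `results/results.csv` path is an assumption based on the `result_path` setting above; b2luigi's actual output layout may nest it deeper):

```python
import pandas as pd

df = pd.read_csv("results/results.csv")

# Median runtime per calculator and parallelism level, taken over the
# repeated tries; rows without a "feature" value come from FullTimingTask.
per_feature = df.dropna(subset=["feature"])
summary = (per_feature
           .groupby(["feature", "n_jobs"])["time"]
           .median()
           .unstack("n_jobs"))

# Slowest calculators in the single-process (n_jobs=1) configuration.
print(summary.sort_values(1, ascending=False).head(10))
```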
A closing review thread on the PR:

Reviewer: Can you add some more unit tests for the sample_entropy? I am missing: etc.
Author: I added more tests and some documentation on the tests in my last commit.