Skip to content

Commit

Permalink
Fix parameter sorting (#656)
Browse files Browse the repository at this point in the history
* Fixed parameter sorting for combiner calculators

* Fixed parameter sorting for combiner calculators.

Added hint for parameter sorting

* Alphabetically sorted features also in the tests

Additionally added a test to ensure that this ordering stays the same.

* pep8ify

Co-authored-by: Tim Rueckel <tim.rueckel@iav.de>
  • Loading branch information
nils-braun and Tim Rueckel authored Apr 11, 2020
1 parent 52a37e0 commit 2a50ba7
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 86 deletions.
2 changes: 1 addition & 1 deletion tests/integrations/test_full_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def test_relevant_extraction(self):

some_expected_features = {'F_x__abs_energy',
'F_x__absolute_sum_of_changes',
'F_x__ar_coefficient__k_10__coeff_0',
'F_x__ar_coefficient__coeff_0__k_10',
'F_x__autocorrelation__lag_1',
'F_x__binned_entropy__max_bins_10',
'F_x__count_above_mean',
Expand Down
14 changes: 14 additions & 0 deletions tests/units/feature_extraction/test_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,20 @@ def test_extract_index_preservation(self):
self.assertIsInstance(extracted_features, pd.DataFrame)
self.assertEqual(set(df["id"]), set(extracted_features.index))

def test_extract_features_alphabetically_sorted(self):
    """Check that the parameter chunks of every extracted column name are sorted.

    Each column name is split on "__". The first two chunks (the kind of the
    column and the name of the feature calculator) are dropped; the remaining
    chunks are the configuration of the feature calculator and must already be
    in ascending alphabetical order.
    """
    df = self.create_test_data_sample()

    features = extract_features(df, column_id="id", column_sort="sort",
                                column_kind="kind", column_value="val")

    # Guard against a vacuous pass: with no columns the loop below checks nothing.
    self.assertGreater(len(features.columns), 0)

    for col_name in features.columns:
        # Split out the configuration of the feature calculator and drop the
        # two leading chunks (kind of the column, name of the calculator).
        param_chunks = col_name.split("__")[2:]

        # sorted() already returns a new list, so no extra list() is needed.
        # Passing the column name as msg makes a failure point at the offender.
        self.assertEqual(param_chunks, sorted(param_chunks), msg=col_name)


class ParallelExtractionTestCase(DataTestCase):
def setUp(self):
Expand Down
156 changes: 78 additions & 78 deletions tests/units/feature_extraction/test_feature_calculations.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,16 +257,16 @@ def test_augmented_dickey_fuller(self):
{"autolag": "BIC", "attr": "usedlag"}
]
expected_index = [
'autolag_"BIC"__attr_"teststat"',
'autolag_"BIC"__attr_"pvalue"',
'autolag_"BIC"__attr_"usedlag"',
'attr_"teststat"__autolag_"BIC"',
'attr_"pvalue"__autolag_"BIC"',
'attr_"usedlag"__autolag_"BIC"',
]

res = augmented_dickey_fuller(x=x, param=param)
res = pd.Series(dict(res))
self.assertCountEqual(list(res.index), expected_index)
self.assertGreater(res['autolag_"BIC"__attr_"pvalue"'], 0.10)
self.assertEqual(res['autolag_"BIC"__attr_"usedlag"'], 0)
self.assertGreater(res['attr_"pvalue"__autolag_"BIC"'], 0.10)
self.assertEqual(res['attr_"usedlag"__autolag_"BIC"'], 0)

# H0 should be rejected for AR(1) model with x_{t} = 1/2 x_{t-1} + e_{t}
np.random.seed(seed=42)
Expand All @@ -282,16 +282,16 @@ def test_augmented_dickey_fuller(self):
{"autolag": "AIC", "attr": "usedlag"}
]
expected_index = [
'autolag_"AIC"__attr_"teststat"',
'autolag_"AIC"__attr_"pvalue"',
'autolag_"AIC"__attr_"usedlag"',
'attr_"teststat"__autolag_"AIC"',
'attr_"pvalue"__autolag_"AIC"',
'attr_"usedlag"__autolag_"AIC"',
]

res = augmented_dickey_fuller(x=x, param=param)
res = pd.Series(dict(res))
self.assertCountEqual(list(res.index), expected_index)
self.assertLessEqual(res['autolag_"AIC"__attr_"pvalue"'], 0.05)
self.assertEqual(res['autolag_"AIC"__attr_"usedlag"'], 0)
self.assertLessEqual(res['attr_"pvalue"__autolag_"AIC"'], 0.05)
self.assertEqual(res['attr_"usedlag"__autolag_"AIC"'], 0)

# Check if LinAlgError and ValueError are caught
res_linalg_error = augmented_dickey_fuller(x=np.repeat(np.nan, 100), param=param)
Expand Down Expand Up @@ -501,38 +501,38 @@ def test_fft_coefficient(self):
{"coeff": 0, "attr": "imag"}, {"coeff": 1, "attr": "imag"}, {"coeff": 2, "attr": "imag"},
{"coeff": 0, "attr": "angle"}, {"coeff": 1, "attr": "angle"}, {"coeff": 2, "attr": "angle"},
{"coeff": 0, "attr": "abs"}, {"coeff": 1, "attr": "abs"}, {"coeff": 2, "attr": "abs"}]
expected_index = ['coeff_0__attr_"real"', 'coeff_1__attr_"real"', 'coeff_2__attr_"real"',
'coeff_0__attr_"imag"', 'coeff_1__attr_"imag"', 'coeff_2__attr_"imag"',
'coeff_0__attr_"angle"', 'coeff_1__attr_"angle"', 'coeff_2__attr_"angle"',
'coeff_0__attr_"abs"', 'coeff_1__attr_"abs"', 'coeff_2__attr_"abs"']
expected_index = ['attr_"real"__coeff_0', 'attr_"real"__coeff_1', 'attr_"real"__coeff_2',
'attr_"imag"__coeff_0', 'attr_"imag"__coeff_1', 'attr_"imag"__coeff_2',
'attr_"angle"__coeff_0', 'attr_"angle"__coeff_1', 'attr_"angle"__coeff_2',
'attr_"abs"__coeff_0', 'attr_"abs"__coeff_1', 'attr_"abs"__coeff_2']

res = pd.Series(dict(fft_coefficient(x, param)))
self.assertCountEqual(list(res.index), expected_index)
self.assertAlmostEqual(res['coeff_0__attr_"imag"'], 0, places=6)
self.assertAlmostEqual(res['coeff_0__attr_"real"'], sum(x), places=6)
self.assertAlmostEqual(res['coeff_0__attr_"angle"'], 0, places=6)
self.assertAlmostEqual(res['coeff_0__attr_"abs"'], sum(x), places=6)
self.assertAlmostEqual(res['attr_"imag"__coeff_0'], 0, places=6)
self.assertAlmostEqual(res['attr_"real"__coeff_0'], sum(x), places=6)
self.assertAlmostEqual(res['attr_"angle"__coeff_0'], 0, places=6)
self.assertAlmostEqual(res['attr_"abs"__coeff_0'], sum(x), places=6)

x = [0, 1, 0, 0]
res = pd.Series(dict(fft_coefficient(x, param)))
# see documentation of fft in numpy
# should return array([1. + 0.j, 0. - 1.j, -1. + 0.j])
self.assertAlmostEqual(res['coeff_0__attr_"imag"'], 0, places=6)
self.assertAlmostEqual(res['coeff_0__attr_"real"'], 1, places=6)
self.assertAlmostEqual(res['coeff_1__attr_"imag"'], -1, places=6)
self.assertAlmostEqual(res['coeff_1__attr_"angle"'], -90, places=6)
self.assertAlmostEqual(res['coeff_1__attr_"real"'], 0, places=6)
self.assertAlmostEqual(res['coeff_2__attr_"imag"'], 0, places=6)
self.assertAlmostEqual(res['coeff_2__attr_"real"'], -1, places=6)
self.assertAlmostEqual(res['attr_"imag"__coeff_0'], 0, places=6)
self.assertAlmostEqual(res['attr_"real"__coeff_0'], 1, places=6)
self.assertAlmostEqual(res['attr_"imag"__coeff_1'], -1, places=6)
self.assertAlmostEqual(res['attr_"angle"__coeff_1'], -90, places=6)
self.assertAlmostEqual(res['attr_"real"__coeff_1'], 0, places=6)
self.assertAlmostEqual(res['attr_"imag"__coeff_2'], 0, places=6)
self.assertAlmostEqual(res['attr_"real"__coeff_2'], -1, places=6)

# test what happens if coeff is bigger than time series length
x = range(5)
param = [{"coeff": 10, "attr": "real"}]
expected_index = ['coeff_10__attr_"real"']
expected_index = ['attr_"real"__coeff_10']

res = pd.Series(dict(fft_coefficient(x, param)))
self.assertCountEqual(list(res.index), expected_index)
self.assertIsNaN(res['coeff_10__attr_"real"'])
self.assertIsNaN(res['attr_"real"__coeff_10'])

def test_fft_aggregated(self):
param = [
Expand Down Expand Up @@ -692,16 +692,16 @@ def test_cwt_coefficients(self):
{"widths": (1, 3), "coeff": 5, "w": 3}]
shuffle(param)

expected_index = ["widths_(1, 2, 3)__coeff_2__w_1",
"widths_(1, 3)__coeff_2__w_3",
"widths_(1, 3)__coeff_5__w_3"]
expected_index = ["coeff_2__w_1__widths_(1, 2, 3)",
"coeff_2__w_3__widths_(1, 3)",
"coeff_5__w_3__widths_(1, 3)"]

res = cwt_coefficients(x, param)
res = pd.Series(dict(res))

# todo: add unit test for the values
self.assertCountEqual(list(res.index), expected_index)
self.assertTrue(math.isnan(res["widths_(1, 3)__coeff_5__w_3"]))
self.assertTrue(math.isnan(res["coeff_5__w_3__widths_(1, 3)"]))

def test_ar_coefficient(self):

Expand All @@ -714,12 +714,12 @@ def test_ar_coefficient(self):
x[i] = 2.5 * x[i - 1] + 1

res = ar_coefficient(x, param)
expected_index = ["k_1__coeff_0", "k_1__coeff_1"]
expected_index = ["coeff_0__k_1", "coeff_1__k_1"]

res = pd.Series(dict(res))
self.assertCountEqual(list(res.index), expected_index)
self.assertAlmostEqual(res["k_1__coeff_0"], 1, places=2)
self.assertAlmostEqual(res["k_1__coeff_1"], 2.5, places=2)
self.assertAlmostEqual(res["coeff_0__k_1"], 1, places=2)
self.assertAlmostEqual(res["coeff_1__k_1"], 2.5, places=2)

# Test for X_i = 1.4 * X_{i-1} - 1 X_{i-2} + 1
param = [{"k": 1, "coeff": 0}, {"k": 1, "coeff": 1},
Expand All @@ -731,18 +731,18 @@ def test_ar_coefficient(self):
x[i] = (-2) * x[i - 2] + 3.5 * x[i - 1] + 1

res = ar_coefficient(x, param)
expected_index = ["k_1__coeff_0", "k_1__coeff_1",
"k_2__coeff_0", "k_2__coeff_1",
"k_2__coeff_2", "k_2__coeff_3"]
expected_index = ["coeff_0__k_1", "coeff_1__k_1",
"coeff_0__k_2", "coeff_1__k_2",
"coeff_2__k_2", "coeff_3__k_2"]

res = pd.Series(dict(res))

self.assertIsInstance(res, pd.Series)
self.assertCountEqual(list(res.index), expected_index)
self.assertAlmostEqual(res["k_2__coeff_0"], 1, places=2)
self.assertAlmostEqual(res["k_2__coeff_1"], 3.5, places=2)
self.assertAlmostEqual(res["k_2__coeff_2"], -2, places=2)
self.assertTrue(np.isnan(res["k_2__coeff_3"]))
self.assertAlmostEqual(res["coeff_0__k_2"], 1, places=2)
self.assertAlmostEqual(res["coeff_1__k_2"], 3.5, places=2)
self.assertAlmostEqual(res["coeff_2__k_2"], -2, places=2)
self.assertTrue(np.isnan(res["coeff_3__k_2"]))

def test_time_reversal_asymmetry_statistic(self):
x = [1] * 10
Expand Down Expand Up @@ -980,57 +980,57 @@ def test_agg_linear_trend(self):
{"attr": "slope", "chunk_len": 3, "f_agg": "mean"},
{"attr": "intercept", "chunk_len": 3, "f_agg": "median"},
{"attr": "slope", "chunk_len": 3, "f_agg": "median"}]
expected_index = ['f_agg_"max"__chunk_len_3__attr_"intercept"',
'f_agg_"max"__chunk_len_3__attr_"slope"',
'f_agg_"min"__chunk_len_3__attr_"intercept"',
'f_agg_"min"__chunk_len_3__attr_"slope"',
'f_agg_"mean"__chunk_len_3__attr_"intercept"',
'f_agg_"mean"__chunk_len_3__attr_"slope"',
'f_agg_"median"__chunk_len_3__attr_"intercept"',
'f_agg_"median"__chunk_len_3__attr_"slope"']
expected_index = ['attr_"intercept"__chunk_len_3__f_agg_"max"',
'attr_"slope"__chunk_len_3__f_agg_"max"',
'attr_"intercept"__chunk_len_3__f_agg_"min"',
'attr_"slope"__chunk_len_3__f_agg_"min"',
'attr_"intercept"__chunk_len_3__f_agg_"mean"',
'attr_"slope"__chunk_len_3__f_agg_"mean"',
'attr_"intercept"__chunk_len_3__f_agg_"median"',
'attr_"slope"__chunk_len_3__f_agg_"median"']

res = agg_linear_trend(x=x, param=param)

res = pd.Series(dict(res))
self.assertEqual(len(res), 8)
self.maxDiff = 2000
self.assertCountEqual(list(res.index), expected_index)
self.assertAlmostEqual(res['f_agg_"max"__chunk_len_3__attr_"intercept"'], 2)
self.assertAlmostEqual(res['f_agg_"max"__chunk_len_3__attr_"slope"'], 3)
self.assertAlmostEqual(res['f_agg_"min"__chunk_len_3__attr_"intercept"'], 0)
self.assertAlmostEqual(res['f_agg_"min"__chunk_len_3__attr_"slope"'], 3)
self.assertAlmostEqual(res['f_agg_"mean"__chunk_len_3__attr_"intercept"'], 1)
self.assertAlmostEqual(res['f_agg_"mean"__chunk_len_3__attr_"slope"'], 3)
self.assertAlmostEqual(res['f_agg_"median"__chunk_len_3__attr_"intercept"'], 1)
self.assertAlmostEqual(res['f_agg_"median"__chunk_len_3__attr_"slope"'], 3)
self.assertAlmostEqual(res['attr_"intercept"__chunk_len_3__f_agg_"max"'], 2)
self.assertAlmostEqual(res['attr_"slope"__chunk_len_3__f_agg_"max"'], 3)
self.assertAlmostEqual(res['attr_"intercept"__chunk_len_3__f_agg_"min"'], 0)
self.assertAlmostEqual(res['attr_"slope"__chunk_len_3__f_agg_"min"'], 3)
self.assertAlmostEqual(res['attr_"intercept"__chunk_len_3__f_agg_"mean"'], 1)
self.assertAlmostEqual(res['attr_"slope"__chunk_len_3__f_agg_"mean"'], 3)
self.assertAlmostEqual(res['attr_"intercept"__chunk_len_3__f_agg_"median"'], 1)
self.assertAlmostEqual(res['attr_"slope"__chunk_len_3__f_agg_"median"'], 3)

x = pd.Series([np.NaN, np.NaN, np.NaN, -3, -3, -3])
res = agg_linear_trend(x=x, param=param)

res = pd.Series(dict(res))

self.assertIsNaN(res['f_agg_"max"__chunk_len_3__attr_"intercept"'])
self.assertIsNaN(res['f_agg_"max"__chunk_len_3__attr_"slope"'])
self.assertIsNaN(res['f_agg_"min"__chunk_len_3__attr_"intercept"'])
self.assertIsNaN(res['f_agg_"min"__chunk_len_3__attr_"slope"'])
self.assertIsNaN(res['f_agg_"mean"__chunk_len_3__attr_"intercept"'])
self.assertIsNaN(res['f_agg_"mean"__chunk_len_3__attr_"slope"'])
self.assertIsNaN(res['f_agg_"median"__chunk_len_3__attr_"intercept"'])
self.assertIsNaN(res['f_agg_"median"__chunk_len_3__attr_"slope"'])
self.assertIsNaN(res['attr_"intercept"__chunk_len_3__f_agg_"max"'])
self.assertIsNaN(res['attr_"slope"__chunk_len_3__f_agg_"max"'])
self.assertIsNaN(res['attr_"intercept"__chunk_len_3__f_agg_"min"'])
self.assertIsNaN(res['attr_"slope"__chunk_len_3__f_agg_"min"'])
self.assertIsNaN(res['attr_"intercept"__chunk_len_3__f_agg_"mean"'])
self.assertIsNaN(res['attr_"slope"__chunk_len_3__f_agg_"mean"'])
self.assertIsNaN(res['attr_"intercept"__chunk_len_3__f_agg_"median"'])
self.assertIsNaN(res['attr_"slope"__chunk_len_3__f_agg_"median"'])

x = pd.Series([np.NaN, np.NaN, -3, -3, -3, -3])
res = agg_linear_trend(x=x, param=param)

res = pd.Series(dict(res))

self.assertAlmostEqual(res['f_agg_"max"__chunk_len_3__attr_"intercept"'], -3)
self.assertAlmostEqual(res['f_agg_"max"__chunk_len_3__attr_"slope"'], 0)
self.assertAlmostEqual(res['f_agg_"min"__chunk_len_3__attr_"intercept"'], -3)
self.assertAlmostEqual(res['f_agg_"min"__chunk_len_3__attr_"slope"'], 0)
self.assertAlmostEqual(res['f_agg_"mean"__chunk_len_3__attr_"intercept"'], -3)
self.assertAlmostEqual(res['f_agg_"mean"__chunk_len_3__attr_"slope"'], 0)
self.assertAlmostEqual(res['f_agg_"median"__chunk_len_3__attr_"intercept"'], -3)
self.assertAlmostEqual(res['f_agg_"median"__chunk_len_3__attr_"slope"'], 0)
self.assertAlmostEqual(res['attr_"intercept"__chunk_len_3__f_agg_"max"'], -3)
self.assertAlmostEqual(res['attr_"slope"__chunk_len_3__f_agg_"max"'], 0)
self.assertAlmostEqual(res['attr_"intercept"__chunk_len_3__f_agg_"min"'], -3)
self.assertAlmostEqual(res['attr_"slope"__chunk_len_3__f_agg_"min"'], 0)
self.assertAlmostEqual(res['attr_"intercept"__chunk_len_3__f_agg_"mean"'], -3)
self.assertAlmostEqual(res['attr_"slope"__chunk_len_3__f_agg_"mean"'], 0)
self.assertAlmostEqual(res['attr_"intercept"__chunk_len_3__f_agg_"median"'], -3)
self.assertAlmostEqual(res['attr_"slope"__chunk_len_3__f_agg_"median"'], 0)

def test_energy_ratio_by_chunks(self):
x = pd.Series(range(90), index=range(90))
Expand Down Expand Up @@ -1217,7 +1217,7 @@ def test_friedrich_coefficients(self):
x = np.zeros(100)
res = pd.Series(dict(friedrich_coefficients(x, param)))

expected_index = ["m_2__r_30__coeff_0", "m_2__r_30__coeff_1", "m_2__r_30__coeff_2", "m_2__r_30__coeff_3"]
expected_index = ["coeff_0__m_2__r_30", "coeff_1__m_2__r_30", "coeff_2__m_2__r_30", "coeff_3__m_2__r_30"]
self.assertCountEqual(list(res.index), expected_index)
self.assertTrue(np.sum(np.isnan(res)), 3)

Expand All @@ -1227,7 +1227,7 @@ def test_friedrich_number_of_returned_features_is_equal_to_number_of_parameters(
x = np.zeros(100)
res = pd.Series(dict(friedrich_coefficients(x, param)))

expected_index = ["m_3__r_5__coeff_2", "m_3__r_5__coeff_3", "m_3__r_2__coeff_3"]
expected_index = ["coeff_2__m_3__r_5", "coeff_3__m_3__r_5", "coeff_3__m_3__r_2"]
self.assertCountEqual(list(res.index), expected_index)
self.assertTrue(np.sum(np.isnan(res)), 3)

Expand All @@ -1240,6 +1240,6 @@ def test_friedrich_equal_to_snapshot(self):

res = pd.Series(dict(friedrich_coefficients(x, param)))

self.assertAlmostEqual(res['m_2__r_30__coeff_0'], -0.24536975738843042)
self.assertAlmostEqual(res['m_2__r_30__coeff_1'], -0.533309548662685)
self.assertAlmostEqual(res['m_2__r_30__coeff_2'], 0.2759399238199404)
self.assertAlmostEqual(res['coeff_0__m_2__r_30'], -0.24536975738843042)
self.assertAlmostEqual(res['coeff_1__m_2__r_30'], -0.533309548662685)
self.assertAlmostEqual(res['coeff_2__m_2__r_30'], 0.2759399238199404)
17 changes: 10 additions & 7 deletions tsfresh/feature_extraction/feature_calculators.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
They are specified using the "fctype" parameter of each feature calculator, which is added using the
set_property function. Only functions in this python module, which have a parameter called "fctype" are
seen by tsfresh as a feature calculator. Others will not be calculated.
Feature calculators of type combiner should return names in which the concatenated parameters are sorted
in ascending alphabetical order.
"""

import itertools
Expand Down Expand Up @@ -452,7 +455,7 @@ def compute_adf(autolag):
autolag = config.get("autolag", "AIC")

adf = compute_adf(autolag)
index = 'autolag_"{}"__attr_"{}"'.format(autolag, config["attr"])
index = 'attr_"{}"__autolag_"{}"'.format(config["attr"], autolag)

if config["attr"] == "teststat":
res.append((index, adf[0]))
Expand Down Expand Up @@ -987,7 +990,7 @@ def complex_agg(x, agg):

res = [complex_agg(fft[config["coeff"]], config["attr"]) if config["coeff"] < len(fft)
else np.NaN for config in param]
index = ['coeff_{}__attr_"{}"'.format(config["coeff"], config["attr"]) for config in param]
index = ['attr_"{}"__coeff_{}'.format(config["attr"], config["coeff"]) for config in param]
return zip(index, res)


Expand Down Expand Up @@ -1244,7 +1247,7 @@ def cwt_coefficients(x, param):

calculated_cwt_for_widths = calculated_cwt[widths]

indices += ["widths_{}__coeff_{}__w_{}".format(widths, coeff, w)]
indices += ["coeff_{}__w_{}__widths_{}".format(coeff, w, widths)]

i = widths.index(w)
if calculated_cwt_for_widths.shape[1] <= coeff:
Expand Down Expand Up @@ -1319,7 +1322,7 @@ def ar_coefficient(x, param):
k = parameter_combination["k"]
p = parameter_combination["coeff"]

column_name = "k_{}__coeff_{}".format(k, p)
column_name = "coeff_{}__k_{}".format(p, k)

if k not in calculated_ar_params:
try:
Expand Down Expand Up @@ -1796,9 +1799,9 @@ def friedrich_coefficients(x, param):
calculated[m][r] = _estimate_friedrich_coefficients(x, m, r)

try:
res["m_{}__r_{}__coeff_{}".format(m, r, coeff)] = calculated[m][r][coeff]
res["coeff_{}__m_{}__r_{}".format(coeff, m, r)] = calculated[m][r][coeff]
except IndexError:
res["m_{}__r_{}__coeff_{}".format(m, r, coeff)] = np.NaN
res["coeff_{}__m_{}__r_{}".format(coeff, m, r)] = np.NaN
return [(key, value) for key, value in res.items()]


Expand Down Expand Up @@ -1887,7 +1890,7 @@ def agg_linear_trend(x, param):
else:
res_data.append(getattr(calculated_agg[f_agg][chunk_len], attr))

res_index.append("f_agg_\"{}\"__chunk_len_{}__attr_\"{}\"".format(f_agg, chunk_len, attr))
res_index.append("attr_\"{}\"__chunk_len_{}__f_agg_\"{}\"".format(attr, chunk_len, f_agg))

return zip(res_index, res_data)

Expand Down

0 comments on commit 2a50ba7

Please sign in to comment.