From 52c1a6984b72dc8aeb28b0bf8a6ee12a961898bf Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Mon, 13 Apr 2020 08:19:11 +0200 Subject: [PATCH 01/23] Started refactoring the rolling function --- tsfresh/utilities/dataframe_functions.py | 74 +++++++++++++++--------- 1 file changed, 46 insertions(+), 28 deletions(-) diff --git a/tsfresh/utilities/dataframe_functions.py b/tsfresh/utilities/dataframe_functions.py index 015c7dc59..6a8052a2a 100644 --- a/tsfresh/utilities/dataframe_functions.py +++ b/tsfresh/utilities/dataframe_functions.py @@ -372,7 +372,7 @@ def _normalize_input_to_internal_representation(timeseries_container, column_id, return timeseries_container, column_id, column_kind, column_value -def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_direction, max_timeshift=None): +def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_direction, max_timeshift=None, min_timeshift=1): """ This method creates sub windows of the time series. It rolls the (sorted) data frames for each kind and each id separately in the "time" domain (which is represented by the sort order of the sort column given by `column_sort`). @@ -406,6 +406,8 @@ def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_di :type rolling_direction: int :param max_timeshift: If not None, shift only up to max_timeshift. If None, shift as often as possible. :type max_timeshift: int + :param min_timeshift: Throw away all extracted forecast windows smaller than this. Must be larger than 0. + :type min_timeshift: int :return: The rolled data frame or dictionary of data frames :rtype: the one from df_or_dict @@ -414,6 +416,12 @@ def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_di if rolling_direction == 0: raise ValueError("Rolling direction of 0 is not possible") + if max_timeshift is not None and max_timeshift <= 0: + raise ValueError("max_timeshift needs to be positive!") + + if min_timeshift <= 0: + raise ValueError("min_timeshift needs to be positive!") + if isinstance(df_or_dict, dict): if column_kind is not None: raise ValueError("You passed in a dictionary and gave a column name for the kind. 
Both are not possible.") @@ -423,7 +431,8 @@ def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_di column_sort=column_sort, column_kind=column_kind, rolling_direction=rolling_direction, - max_timeshift=max_timeshift) + max_timeshift=max_timeshift, + min_timeshift=min_timeshift) for key in df_or_dict} # Now we know that this is a pandas data frame @@ -440,7 +449,7 @@ def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_di else: grouper = [column_id, ] - if column_sort is not None and df[column_sort].dtype != np.object: + if column_sort is not None: # Require no Nans in column if df[column_sort].isnull().any(): @@ -448,32 +457,29 @@ def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_di df = df.sort_values(column_sort) - # if rolling is enabled, the data should be uniformly sampled in this column - # Build the differences between consecutive time sort values + if df[column_sort].dtype != np.object: + # if rolling is enabled, the data should be uniformly sampled in this column + # Build the differences between consecutive time sort values - differences = df.groupby(grouper)[column_sort].apply( - lambda x: x.values[:-1] - x.values[1:]) - # Write all of them into one big list - differences = sum(map(list, differences), []) - # Test if all differences are the same - if differences and min(differences) != max(differences): - warnings.warn("Your time stamps are not uniformly sampled, which makes rolling " - "nonsensical in some domains.") + differences = df.groupby(grouper)[column_sort].apply( + lambda x: x.values[:-1] - x.values[1:]) + # Write all of them into one big list + differences = sum(map(list, differences), []) + # Test if all differences are the same + if differences and min(differences) != max(differences): + warnings.warn("Your time stamps are not uniformly sampled, which makes rolling " + "nonsensical in some domains.") # Roll the data frames if requested rolling_direction = np.sign(rolling_direction) grouped_data = df.groupby(grouper) - max_timeshift = max_timeshift or grouped_data.count().max().max() + prediction_steps = grouped_data.count().max().max() - if np.isnan(max_timeshift): + if np.isnan(prediction_steps): raise ValueError("Somehow the maximum length of your time series is NaN (Does your time series container have " "only one row?). Can not perform rolling.") - - if rolling_direction > 0: - range_of_shifts = range(max_timeshift, -1, -1) - else: - range_of_shifts = range(-max_timeshift, 1) + max_timeshift = max_timeshift or prediction_steps # Todo: not default for columns_sort to be None if column_sort is None: @@ -481,14 +487,26 @@ def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_di df[column_sort] = range(df.shape[0]) def roll_out_time_series(time_shift): - # Shift out only the first "time_shift" rows - df_temp = grouped_data.shift(time_shift) - df_temp[column_id] = df[column_sort] - if column_kind: - df_temp[column_kind] = df[column_kind] - return df_temp.dropna() - - df_shift = pd.concat([roll_out_time_series(time_shift) for time_shift in range_of_shifts], ignore_index=True) + # Now comes the fun part. + # This function has the task to extract the rolled forecast data frame of the number `time_shift`. + # This means it is `time_shift` in the future - starting counting from the first row + # for each id (and kind). + # This means we cut out the data until `time_shift`. 
+ # The first row we cut out is either 0 or given by the maximal allowed length of `max_timeshift`. + shift_until = time_shift + shift_from = max(shift_until - max_timeshift - 1, 0) + df_temp = grouped_data.apply(lambda x: x.iloc[shift_from:shift_until] if shift_until <= len(x) else None) + + # Make sure we keep the old column id values + old_column_id = df_temp[column_id] + # and now create new ones out of the old ones + df_temp[column_id] = df_temp.apply(lambda row: f"id={row[column_id]},shift={time_shift - 1}", axis=1) + + return df_temp + + range_of_shifts = range(min_timeshift, prediction_steps + 1) + shifted_chunks = map(lambda time_shift: roll_out_time_series(time_shift), range_of_shifts) + df_shift = pd.concat(shifted_chunks, ignore_index=True) return df_shift.sort_values(by=[column_id, column_sort]) From 0d5fd4e0655bcdd1dbb31666bee3be5da9c427b2 Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Mon, 13 Apr 2020 08:19:37 +0200 Subject: [PATCH 02/23] Unfinished test fix --- .../utilities/test_dataframe_functions.py | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/tests/units/utilities/test_dataframe_functions.py b/tests/units/utilities/test_dataframe_functions.py index 72646d6db..c47063f46 100644 --- a/tests/units/utilities/test_dataframe_functions.py +++ b/tests/units/utilities/test_dataframe_functions.py @@ -277,7 +277,21 @@ def test_positive_rolling(self): 4 10 12 20 2 5 11 13 21 2 """ - correct_indices = [0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 20, 21, 21] + correct_indices = [ + 'id=1,shift=0', + 'id=1,shift=1', + 'id=1,shift=1', + 'id=1,shift=2', + 'id=1,shift=2', + 'id=1,shift=2', + 'id=1,shift=3', + 'id=1,shift=3', + 'id=1,shift=3', + 'id=1,shift=3', + 'id=2,shift=0', + 'id=2,shift=1', + 'id=2,shift=1' + ] correct_values_a = [1.0, 1.0, 2.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 10.0, 10.0, 11.0] correct_values_b = [5.0, 5.0, 6.0, 5.0, 6.0, 7.0, 5.0, 6.0, 7.0, 8.0, 12.0, 12.0, 13.0] @@ -300,7 +314,20 @@ def test_positive_rolling(self): column_kind=None, rolling_direction=1, max_timeshift=2) - correct_indices = [0, 1, 1, 2, 2, 2, 3, 3, 3, 20, 21, 21] + correct_indices = [ + 'id=1,shift=0', + 'id=1,shift=1', + 'id=1,shift=1', + 'id=1,shift=2', + 'id=1,shift=2', + 'id=1,shift=2', + 'id=1,shift=3', + 'id=1,shift=3', + 'id=1,shift=3', + 'id=2,shift=0', + 'id=2,shift=1', + 'id=2,shift=1' + ] correct_values_a = [1.0, 1.0, 2.0, 1.0, 2.0, 3.0, 2.0, 3.0, 4.0, 10.0, 10.0, 11.0] correct_values_b = [5.0, 5.0, 6.0, 5.0, 6.0, 7.0, 6.0, 7.0, 8.0, 12.0, 12.0, 13.0] From b0006cdffdc979a664ef1ba3a2478bdd38d34941 Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Mon, 13 Apr 2020 10:51:36 +0200 Subject: [PATCH 03/23] Fixed remaining rolling tests --- .../utilities/test_dataframe_functions.py | 187 ++++++++++++------ tsfresh/utilities/dataframe_functions.py | 18 +- 2 files changed, 139 insertions(+), 66 deletions(-) diff --git a/tests/units/utilities/test_dataframe_functions.py b/tests/units/utilities/test_dataframe_functions.py index c47063f46..90e4805e2 100644 --- a/tests/units/utilities/test_dataframe_functions.py +++ b/tests/units/utilities/test_dataframe_functions.py @@ -353,7 +353,21 @@ def test_negative_rolling(self): 5 11 13 21 2 """ - correct_indices = ([0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 20, 20, 21]) + correct_indices = [ + 'id=1,shift=0', + 'id=1,shift=0', + 'id=1,shift=0', + 'id=1,shift=0', + 'id=1,shift=1', + 'id=1,shift=1', + 'id=1,shift=1', + 'id=1,shift=2', + 'id=1,shift=2', + 'id=1,shift=3', + 'id=2,shift=0', + 'id=2,shift=0', + 'id=2,shift=1' + ] correct_values_a = 
[1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 4.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0] correct_values_b = [5.0, 6.0, 7.0, 8.0, 6.0, 7.0, 8.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0] @@ -376,7 +390,18 @@ def test_negative_rolling(self): column_kind=None, rolling_direction=-1, max_timeshift=1) - correct_indices = ([0, 0, 1, 1, 2, 2, 3, 20, 20, 21]) + correct_indices = [ + 'id=1,shift=0', + 'id=1,shift=0', + 'id=1,shift=1', + 'id=1,shift=1', + 'id=1,shift=2', + 'id=1,shift=2', + 'id=1,shift=3', + 'id=2,shift=0', + 'id=2,shift=0', + 'id=2,shift=1' + ] correct_values_a = [1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0] correct_values_b = [5.0, 6.0, 6.0, 7.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0] @@ -388,7 +413,20 @@ def test_negative_rolling(self): column_kind=None, rolling_direction=-1, max_timeshift=2) - correct_indices = ([0, 0, 0, 1, 1, 1, 2, 2, 3, 20, 20, 21]) + correct_indices = [ + 'id=1,shift=0', + 'id=1,shift=0', + 'id=1,shift=0', + 'id=1,shift=1', + 'id=1,shift=1', + 'id=1,shift=1', + 'id=1,shift=2', + 'id=1,shift=2', + 'id=1,shift=3', + 'id=2,shift=0', + 'id=2,shift=0', + 'id=2,shift=1' + ] correct_values_a = [1.0, 2.0, 3.0, 2.0, 3.0, 4.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0] correct_values_b = [5.0, 6.0, 7.0, 6.0, 7.0, 8.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0] @@ -400,7 +438,21 @@ def test_negative_rolling(self): column_kind=None, rolling_direction=-1, max_timeshift=4) - correct_indices = ([0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 20, 20, 21]) + correct_indices = [ + 'id=1,shift=0', + 'id=1,shift=0', + 'id=1,shift=0', + 'id=1,shift=0', + 'id=1,shift=1', + 'id=1,shift=1', + 'id=1,shift=1', + 'id=1,shift=2', + 'id=1,shift=2', + 'id=1,shift=3', + 'id=2,shift=0', + 'id=2,shift=0', + 'id=2,shift=1' + ] correct_values_a = [1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 4.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0] correct_values_b = [5.0, 6.0, 7.0, 8.0, 6.0, 7.0, 8.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0] @@ -440,10 +492,16 @@ def test_stacked_rolling(self): df = dataframe_functions.roll_time_series(df_stacked, column_id="id", column_sort="time", column_kind="kind", rolling_direction=-1) - correct_indices = ([0] * 2 * 4 + [1] * 2 * 3 + [2] * 2 * 2 + [3] * 2 * 1 + [20] * 4 + [21] * 2) + correct_indices = ( + ['id=1,shift=0'] * 2 * 4 + + ['id=1,shift=1'] * 2 * 3 + + ['id=1,shift=2'] * 2 * 2 + + ['id=1,shift=3'] * 2 * 1 + + ['id=2,shift=0'] * 2 * 2 + + ['id=2,shift=1'] * 2 * 1 + ) self.assertListEqual(list(df["id"].values), correct_indices) - print(df["_value"].values) self.assertListEqual(list(df["kind"].values), ["a", "b"] * 13) self.assertListEqual(list(df["_value"].values), [1., 5., 2., 6., 3., 7., 4., 8., 2., 6., 3., 7., 4., 8., 3., 7., 4., 8., 4., 8., 10., 12., @@ -457,40 +515,41 @@ def test_dict_rolling(self): df = dataframe_functions.roll_time_series(df_dict, column_id="id", column_sort=None, column_kind=None, rolling_direction=-1) """ df is - {a: _value sort id - 7 1.0 0.0 0 - 3 2.0 1.0 0 - 1 3.0 2.0 0 - 0 4.0 3.0 0 - 8 2.0 1.0 1 - 4 3.0 2.0 1 - 2 4.0 3.0 1 - 9 3.0 2.0 2 - 5 4.0 3.0 2 - 10 4.0 3.0 3 - 11 10.0 4.0 4 - 6 11.0 5.0 4 - 12 11.0 5.0 5, - - b: _value sort id - 7 5.0 0.0 0 - 3 6.0 1.0 0 - 1 7.0 2.0 0 - 0 8.0 3.0 0 - 8 6.0 1.0 1 - 4 7.0 2.0 1 - 2 8.0 3.0 1 - 9 7.0 2.0 2 - 5 8.0 3.0 2 - 10 8.0 3.0 3 - 11 12.0 4.0 4 - 6 13.0 5.0 4 - 12 13.0 5.0 5} + {a: _value id + 1.0 1 + 2.0 1 + 3.0 1 + 4.0 1 + 10.0 2 + 11.0 2, + + b: _value id + 5.0 1 + 6.0 1 + 7.0 1 + 8.0 1 + 12.0 2 + 13.0 2 + } """ - correct_indices = [0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 4, 4, 5] - + correct_indices = [ + 'id=1,shift=0', + 'id=1,shift=0', + 'id=1,shift=0', + 'id=1,shift=0', + 
'id=1,shift=1', + 'id=1,shift=1', + 'id=1,shift=1', + 'id=1,shift=2', + 'id=1,shift=2', + 'id=1,shift=3', + 'id=2,shift=0', + 'id=2,shift=0', + 'id=2,shift=1' + ] self.assertListEqual(list(df["a"]["id"].values), correct_indices) + self.assertListEqual(list(df["b"]["id"].values), correct_indices) self.assertListEqual(list(df["a"]["_value"].values), @@ -506,32 +565,36 @@ def test_dict_rolling_maxshift_1(self): df = dataframe_functions.roll_time_series(df_dict, column_id="id", column_sort=None, column_kind=None, rolling_direction=-1, max_timeshift=1) """ df is - {a: _value sort id - 7 1.0 0.0 0 - 3 2.0 1.0 0 - 8 2.0 1.0 1 - 4 3.0 2.0 1 - 9 3.0 2.0 2 - 5 4.0 3.0 2 - 10 4.0 3.0 3 - 11 10.0 4.0 4 - 6 11.0 5.0 4 - 12 11.0 5.0 5, - - b: _value sort id - 7 5.0 0.0 0 - 3 6.0 1.0 0 - 8 6.0 1.0 1 - 4 7.0 2.0 1 - 9 7.0 2.0 2 - 5 8.0 3.0 2 - 10 8.0 3.0 3 - 11 12.0 4.0 4 - 6 13.0 5.0 4 - 12 13.0 5.0 5} + {a: _value id + 1.0 1 + 2.0 1 + 3.0 1 + 4.0 1 + 10.0 2 + 11.0 2, + + b: _value id + 5.0 1 + 6.0 1 + 7.0 1 + 8.0 1 + 12.0 2 + 13.0 2 + } """ - correct_indices = [0, 0, 1, 1, 2, 2, 3, 4, 4, 5] + correct_indices = [ + 'id=1,shift=0', + 'id=1,shift=0', + 'id=1,shift=1', + 'id=1,shift=1', + 'id=1,shift=2', + 'id=1,shift=2', + 'id=1,shift=3', + 'id=2,shift=0', + 'id=2,shift=0', + 'id=2,shift=1' + ] self.assertListEqual(list(df["a"]["id"].values), correct_indices) self.assertListEqual(list(df["b"]["id"].values), correct_indices) @@ -542,7 +605,7 @@ def test_dict_rolling_maxshift_1(self): def test_warning_on_non_uniform_time_steps(self): with warnings.catch_warnings(record=True) as w: first_class = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": [1, 2, 4, 5]}) - second_class = pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": range(20, 22)}) + second_class = pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": list(range(20, 22))}) first_class["id"] = 1 second_class["id"] = 2 diff --git a/tsfresh/utilities/dataframe_functions.py b/tsfresh/utilities/dataframe_functions.py index 6a8052a2a..1a8a85c4a 100644 --- a/tsfresh/utilities/dataframe_functions.py +++ b/tsfresh/utilities/dataframe_functions.py @@ -450,7 +450,6 @@ def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_di grouper = [column_id, ] if column_sort is not None: - # Require no Nans in column if df[column_sort].isnull().any(): raise ValueError("You have NaN values in your sort column.") @@ -493,9 +492,20 @@ def roll_out_time_series(time_shift): # for each id (and kind). # This means we cut out the data until `time_shift`. # The first row we cut out is either 0 or given by the maximal allowed length of `max_timeshift`. 
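+        # A worked example with purely illustrative numbers: when rolling
+        # forward with max_timeshift=2 and time_shift=4, we get
+        # shift_from = max(4 - 2 - 1, 0) = 1 and shift_until = 4, so we keep
+        # iloc[1:4] of each group - a window of max_timeshift + 1 = 3 rows.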
- shift_until = time_shift - shift_from = max(shift_until - max_timeshift - 1, 0) - df_temp = grouped_data.apply(lambda x: x.iloc[shift_from:shift_until] if shift_until <= len(x) else None) + # for a negative rolling direction it is reversed + if rolling_direction > 0: + shift_until = time_shift + shift_from = max(shift_until - max_timeshift - 1, 0) + + df_temp = grouped_data.apply(lambda x: x.iloc[shift_from:shift_until] if shift_until <= len(x) else None) + else: + shift_from = max(time_shift - 1, 0) + shift_until = shift_from + max_timeshift + 1 + + df_temp = grouped_data.apply(lambda x: x.iloc[shift_from:shift_until]) + + if len(df_temp) == 0: + return # Make sure we keep the old column id values old_column_id = df_temp[column_id] From c4e90a8fb334a728bd6c5470256ee55bff25bab6 Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Mon, 13 Apr 2020 16:43:18 +0200 Subject: [PATCH 04/23] Correct the shifted values and the index --- .../utilities/test_dataframe_functions.py | 226 +++++++++--------- tsfresh/utilities/dataframe_functions.py | 45 ++-- 2 files changed, 143 insertions(+), 128 deletions(-) diff --git a/tests/units/utilities/test_dataframe_functions.py b/tests/units/utilities/test_dataframe_functions.py index 90e4805e2..91cdac8fa 100644 --- a/tests/units/utilities/test_dataframe_functions.py +++ b/tests/units/utilities/test_dataframe_functions.py @@ -278,19 +278,19 @@ def test_positive_rolling(self): 5 11 13 21 2 """ correct_indices = [ - 'id=1,shift=0', - 'id=1,shift=1', - 'id=1,shift=1', - 'id=1,shift=2', - 'id=1,shift=2', - 'id=1,shift=2', - 'id=1,shift=3', - 'id=1,shift=3', - 'id=1,shift=3', - 'id=1,shift=3', - 'id=2,shift=0', - 'id=2,shift=1', - 'id=2,shift=1' + 'id=1,timeshift=0', + 'id=1,timeshift=1', + 'id=1,timeshift=1', + 'id=1,timeshift=2', + 'id=1,timeshift=2', + 'id=1,timeshift=2', + 'id=1,timeshift=3', + 'id=1,timeshift=3', + 'id=1,timeshift=3', + 'id=1,timeshift=3', + 'id=2,timeshift=20', + 'id=2,timeshift=21', + 'id=2,timeshift=21' ] correct_values_a = [1.0, 1.0, 2.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 10.0, 10.0, 11.0] correct_values_b = [5.0, 5.0, 6.0, 5.0, 6.0, 7.0, 5.0, 6.0, 7.0, 8.0, 12.0, 12.0, 13.0] @@ -315,18 +315,18 @@ def test_positive_rolling(self): max_timeshift=2) correct_indices = [ - 'id=1,shift=0', - 'id=1,shift=1', - 'id=1,shift=1', - 'id=1,shift=2', - 'id=1,shift=2', - 'id=1,shift=2', - 'id=1,shift=3', - 'id=1,shift=3', - 'id=1,shift=3', - 'id=2,shift=0', - 'id=2,shift=1', - 'id=2,shift=1' + 'id=1,timeshift=0', + 'id=1,timeshift=1', + 'id=1,timeshift=1', + 'id=1,timeshift=2', + 'id=1,timeshift=2', + 'id=1,timeshift=2', + 'id=1,timeshift=3', + 'id=1,timeshift=3', + 'id=1,timeshift=3', + 'id=2,timeshift=20', + 'id=2,timeshift=21', + 'id=2,timeshift=21' ] correct_values_a = [1.0, 1.0, 2.0, 1.0, 2.0, 3.0, 2.0, 3.0, 4.0, 10.0, 10.0, 11.0] correct_values_b = [5.0, 5.0, 6.0, 5.0, 6.0, 7.0, 6.0, 7.0, 8.0, 12.0, 12.0, 13.0] @@ -354,19 +354,19 @@ def test_negative_rolling(self): """ correct_indices = [ - 'id=1,shift=0', - 'id=1,shift=0', - 'id=1,shift=0', - 'id=1,shift=0', - 'id=1,shift=1', - 'id=1,shift=1', - 'id=1,shift=1', - 'id=1,shift=2', - 'id=1,shift=2', - 'id=1,shift=3', - 'id=2,shift=0', - 'id=2,shift=0', - 'id=2,shift=1' + 'id=1,timeshift=0', + 'id=1,timeshift=0', + 'id=1,timeshift=0', + 'id=1,timeshift=0', + 'id=1,timeshift=1', + 'id=1,timeshift=1', + 'id=1,timeshift=1', + 'id=1,timeshift=2', + 'id=1,timeshift=2', + 'id=1,timeshift=3', + 'id=2,timeshift=20', + 'id=2,timeshift=20', + 'id=2,timeshift=21' ] correct_values_a = [1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 
4.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0] correct_values_b = [5.0, 6.0, 7.0, 8.0, 6.0, 7.0, 8.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0] @@ -391,16 +391,16 @@ def test_negative_rolling(self): max_timeshift=1) correct_indices = [ - 'id=1,shift=0', - 'id=1,shift=0', - 'id=1,shift=1', - 'id=1,shift=1', - 'id=1,shift=2', - 'id=1,shift=2', - 'id=1,shift=3', - 'id=2,shift=0', - 'id=2,shift=0', - 'id=2,shift=1' + 'id=1,timeshift=0', + 'id=1,timeshift=0', + 'id=1,timeshift=1', + 'id=1,timeshift=1', + 'id=1,timeshift=2', + 'id=1,timeshift=2', + 'id=1,timeshift=3', + 'id=2,timeshift=20', + 'id=2,timeshift=20', + 'id=2,timeshift=21' ] correct_values_a = [1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0] correct_values_b = [5.0, 6.0, 6.0, 7.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0] @@ -414,18 +414,18 @@ def test_negative_rolling(self): max_timeshift=2) correct_indices = [ - 'id=1,shift=0', - 'id=1,shift=0', - 'id=1,shift=0', - 'id=1,shift=1', - 'id=1,shift=1', - 'id=1,shift=1', - 'id=1,shift=2', - 'id=1,shift=2', - 'id=1,shift=3', - 'id=2,shift=0', - 'id=2,shift=0', - 'id=2,shift=1' + 'id=1,timeshift=0', + 'id=1,timeshift=0', + 'id=1,timeshift=0', + 'id=1,timeshift=1', + 'id=1,timeshift=1', + 'id=1,timeshift=1', + 'id=1,timeshift=2', + 'id=1,timeshift=2', + 'id=1,timeshift=3', + 'id=2,timeshift=20', + 'id=2,timeshift=20', + 'id=2,timeshift=21' ] correct_values_a = [1.0, 2.0, 3.0, 2.0, 3.0, 4.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0] correct_values_b = [5.0, 6.0, 7.0, 6.0, 7.0, 8.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0] @@ -439,19 +439,19 @@ def test_negative_rolling(self): max_timeshift=4) correct_indices = [ - 'id=1,shift=0', - 'id=1,shift=0', - 'id=1,shift=0', - 'id=1,shift=0', - 'id=1,shift=1', - 'id=1,shift=1', - 'id=1,shift=1', - 'id=1,shift=2', - 'id=1,shift=2', - 'id=1,shift=3', - 'id=2,shift=0', - 'id=2,shift=0', - 'id=2,shift=1' + 'id=1,timeshift=0', + 'id=1,timeshift=0', + 'id=1,timeshift=0', + 'id=1,timeshift=0', + 'id=1,timeshift=1', + 'id=1,timeshift=1', + 'id=1,timeshift=1', + 'id=1,timeshift=2', + 'id=1,timeshift=2', + 'id=1,timeshift=3', + 'id=2,timeshift=20', + 'id=2,timeshift=20', + 'id=2,timeshift=21' ] correct_values_a = [1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 4.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0] correct_values_b = [5.0, 6.0, 7.0, 8.0, 6.0, 7.0, 8.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0] @@ -493,12 +493,12 @@ def test_stacked_rolling(self): column_kind="kind", rolling_direction=-1) correct_indices = ( - ['id=1,shift=0'] * 2 * 4 + - ['id=1,shift=1'] * 2 * 3 + - ['id=1,shift=2'] * 2 * 2 + - ['id=1,shift=3'] * 2 * 1 + - ['id=2,shift=0'] * 2 * 2 + - ['id=2,shift=1'] * 2 * 1 + ['id=1,timeshift=0'] * 2 * 4 + + ['id=1,timeshift=1'] * 2 * 3 + + ['id=1,timeshift=2'] * 2 * 2 + + ['id=1,timeshift=3'] * 2 * 1 + + ['id=2,timeshift=20'] * 2 * 2 + + ['id=2,timeshift=21'] * 2 * 1 ) self.assertListEqual(list(df["id"].values), correct_indices) @@ -534,19 +534,19 @@ def test_dict_rolling(self): """ correct_indices = [ - 'id=1,shift=0', - 'id=1,shift=0', - 'id=1,shift=0', - 'id=1,shift=0', - 'id=1,shift=1', - 'id=1,shift=1', - 'id=1,shift=1', - 'id=1,shift=2', - 'id=1,shift=2', - 'id=1,shift=3', - 'id=2,shift=0', - 'id=2,shift=0', - 'id=2,shift=1' + 'id=1,timeshift=0', + 'id=1,timeshift=0', + 'id=1,timeshift=0', + 'id=1,timeshift=0', + 'id=1,timeshift=1', + 'id=1,timeshift=1', + 'id=1,timeshift=1', + 'id=1,timeshift=2', + 'id=1,timeshift=2', + 'id=1,timeshift=3', + 'id=2,timeshift=0', + 'id=2,timeshift=0', + 'id=2,timeshift=1' ] self.assertListEqual(list(df["a"]["id"].values), correct_indices) @@ -584,16 +584,16 @@ def 
test_dict_rolling_maxshift_1(self): """ correct_indices = [ - 'id=1,shift=0', - 'id=1,shift=0', - 'id=1,shift=1', - 'id=1,shift=1', - 'id=1,shift=2', - 'id=1,shift=2', - 'id=1,shift=3', - 'id=2,shift=0', - 'id=2,shift=0', - 'id=2,shift=1' + 'id=1,timeshift=0', + 'id=1,timeshift=0', + 'id=1,timeshift=1', + 'id=1,timeshift=1', + 'id=1,timeshift=2', + 'id=1,timeshift=2', + 'id=1,timeshift=3', + 'id=2,timeshift=0', + 'id=2,timeshift=0', + 'id=2,timeshift=1' ] self.assertListEqual(list(df["a"]["id"].values), correct_indices) @@ -831,17 +831,23 @@ class MakeForecastingFrameTestCase(TestCase): def test_make_forecasting_frame_list(self): df, y = dataframe_functions.make_forecasting_frame(x=range(4), kind="test", max_timeshift=1, rolling_direction=1) - expected_df = pd.DataFrame({"id": [1, 2, 3], "kind": ["test"] * 3, "value": [0., 1., 2.], "time": [0., 1., 2.]}) + expected_df = pd.DataFrame({"id": ["id=id,timeshift=1", "id=id,timeshift=2", "id=id,timeshift=3"], + "kind": ["test"] * 3, + "value": [0, 1, 2], + "time": [0, 1, 2]}) expected_y = pd.Series(data=[1, 2, 3], index=[1, 2, 3], name="value") - assert_frame_equal(df.sort_index(axis=1), expected_df.sort_index(axis=1)) + assert_frame_equal(df.sort_index(axis=1).reset_index(drop=True), expected_df.sort_index(axis=1)) assert_series_equal(y, expected_y) def test_make_forecasting_frame_range(self): df, y = dataframe_functions.make_forecasting_frame(x=np.arange(4), kind="test", max_timeshift=1, rolling_direction=1) - expected_df = pd.DataFrame({"id": [1, 2, 3], "kind": ["test"] * 3, "value": [0., 1., 2.], "time": [0., 1., 2.]}) - assert_frame_equal(df.sort_index(axis=1), expected_df.sort_index(axis=1)) + expected_df = pd.DataFrame({"id": ["id=id,timeshift=1", "id=id,timeshift=2", "id=id,timeshift=3"], + "kind": ["test"] * 3, + "value": [0, 1, 2], + "time": [0, 1, 2]}) + assert_frame_equal(df.sort_index(axis=1).reset_index(drop=True), expected_df.sort_index(axis=1)) def test_make_forecasting_frame_pdSeries(self): @@ -851,13 +857,13 @@ def test_make_forecasting_frame_pdSeries(self): expected_y = pd.Series(data=[1, 2, 3], index=pd.DatetimeIndex(["2011-01-01 01:00:00", "2011-01-01 02:00:00", "2011-01-01 03:00:00"]), name="value") - expected_df = pd.DataFrame({"id": pd.DatetimeIndex(["2011-01-01 01:00:00", "2011-01-01 02:00:00", - "2011-01-01 03:00:00"]), - "kind": ["test"] * 3, "value": [0., 1., 2.], + expected_df = pd.DataFrame({"id": ["id=id,timeshift=2011-01-01 01:00:00", "id=id,timeshift=2011-01-01 02:00:00", + "id=id,timeshift=2011-01-01 03:00:00"], + "kind": ["test"] * 3, "value": [0, 1, 2], "time": pd.DatetimeIndex(["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"]) }) - assert_frame_equal(df.sort_index(axis=1), expected_df.sort_index(axis=1)) + assert_frame_equal(df.sort_index(axis=1).reset_index(drop=True), expected_df.sort_index(axis=1)) assert_series_equal(y, expected_y) diff --git a/tsfresh/utilities/dataframe_functions.py b/tsfresh/utilities/dataframe_functions.py index 1a8a85c4a..2624693bb 100644 --- a/tsfresh/utilities/dataframe_functions.py +++ b/tsfresh/utilities/dataframe_functions.py @@ -482,8 +482,7 @@ def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_di # Todo: not default for columns_sort to be None if column_sort is None: - column_sort = "sort" - df[column_sort] = range(df.shape[0]) + df["sort"] = range(df.shape[0]) def roll_out_time_series(time_shift): # Now comes the fun part. 
@@ -493,32 +492,42 @@ def roll_out_time_series(time_shift): # This means we cut out the data until `time_shift`. # The first row we cut out is either 0 or given by the maximal allowed length of `max_timeshift`. # for a negative rolling direction it is reversed - if rolling_direction > 0: - shift_until = time_shift - shift_from = max(shift_until - max_timeshift - 1, 0) + def _f(x): + if rolling_direction > 0: + shift_until = time_shift + shift_from = max(shift_until - max_timeshift - 1, 0) - df_temp = grouped_data.apply(lambda x: x.iloc[shift_from:shift_until] if shift_until <= len(x) else None) - else: - shift_from = max(time_shift - 1, 0) - shift_until = shift_from + max_timeshift + 1 + df_temp = x.iloc[shift_from:shift_until] if shift_until <= len(x) else None + else: + shift_from = max(time_shift - 1, 0) + shift_until = shift_from + max_timeshift + 1 + + df_temp = x.iloc[shift_from:shift_until] + + if df_temp is None or len(df_temp) == 0: + return - df_temp = grouped_data.apply(lambda x: x.iloc[shift_from:shift_until]) + df_temp = df_temp.copy() - if len(df_temp) == 0: - return + # and set the shift correctly + if column_sort and rolling_direction > 0: + shift_string = f"timeshift={df_temp[column_sort].iloc[-1]}" + elif column_sort and rolling_direction < 0: + shift_string = f"timeshift={df_temp[column_sort].iloc[0]}" + else: + shift_string = f"timeshift={time_shift - 1}" + # and now create new ones ids out of the old ones + df_temp[column_id] = df_temp.apply(lambda row: f"id={row[column_id]},{shift_string}", axis=1) - # Make sure we keep the old column id values - old_column_id = df_temp[column_id] - # and now create new ones out of the old ones - df_temp[column_id] = df_temp.apply(lambda row: f"id={row[column_id]},shift={time_shift - 1}", axis=1) + return df_temp - return df_temp + return grouped_data.apply(_f) range_of_shifts = range(min_timeshift, prediction_steps + 1) shifted_chunks = map(lambda time_shift: roll_out_time_series(time_shift), range_of_shifts) df_shift = pd.concat(shifted_chunks, ignore_index=True) - return df_shift.sort_values(by=[column_id, column_sort]) + return df_shift.sort_values(by=[column_id, column_sort or "sort"]) def make_forecasting_frame(x, kind, max_timeshift, rolling_direction): From 3c3c0e933460a7786b9995916274a9b62097e6fc Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Mon, 13 Apr 2020 17:28:27 +0200 Subject: [PATCH 05/23] Added test for min timeshift --- .../utilities/test_dataframe_functions.py | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/units/utilities/test_dataframe_functions.py b/tests/units/utilities/test_dataframe_functions.py index 91cdac8fa..66fba8637 100644 --- a/tests/units/utilities/test_dataframe_functions.py +++ b/tests/units/utilities/test_dataframe_functions.py @@ -335,6 +335,25 @@ def test_positive_rolling(self): self.assertListEqual(list(df["a"].values), correct_values_a) self.assertListEqual(list(df["b"].values), correct_values_b) + df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time", + column_kind=None, rolling_direction=1, + max_timeshift=2, min_timeshift=2) + + correct_indices = [ + 'id=1,timeshift=2', + 'id=1,timeshift=2', + 'id=1,timeshift=2', + 'id=1,timeshift=3', + 'id=1,timeshift=3', + 'id=1,timeshift=3', + ] + correct_values_a = [1.0, 2.0, 3.0, 2.0, 3.0, 4.0] + correct_values_b = [5.0, 6.0, 7.0, 6.0, 7.0, 8.0] + + self.assertListEqual(list(df["id"]), correct_indices) + self.assertListEqual(list(df["a"].values), correct_values_a) + 
self.assertListEqual(list(df["b"].values), correct_values_b) + def test_negative_rolling(self): first_class = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)}) second_class = pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": range(20, 22)}) @@ -460,6 +479,28 @@ def test_negative_rolling(self): self.assertListEqual(list(df["a"].values), correct_values_a) self.assertListEqual(list(df["b"].values), correct_values_b) + + df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time", + column_kind=None, rolling_direction=-1, + min_timeshift=2, + max_timeshift=3) + + correct_indices = [ + 'id=1,timeshift=0', + 'id=1,timeshift=0', + 'id=1,timeshift=0', + 'id=1,timeshift=0', + 'id=1,timeshift=1', + 'id=1,timeshift=1', + 'id=1,timeshift=1' + ] + correct_values_a = [1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 4.0] + correct_values_b = [5.0, 6.0, 7.0, 8.0, 6.0, 7.0, 8.0] + + self.assertListEqual(list(df["id"].values), correct_indices) + self.assertListEqual(list(df["a"].values), correct_values_a) + self.assertListEqual(list(df["b"].values), correct_values_b) + def test_stacked_rolling(self): first_class = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)}) second_class = pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": range(20, 22)}) From ffae951db6b01556d1d82684853cb56009cbc2c4 Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Mon, 13 Apr 2020 17:31:01 +0200 Subject: [PATCH 06/23] Added parallelization to the rolling function --- tsfresh/utilities/dataframe_functions.py | 163 ++++++++++++++++------- 1 file changed, 116 insertions(+), 47 deletions(-) diff --git a/tsfresh/utilities/dataframe_functions.py b/tsfresh/utilities/dataframe_functions.py index 2624693bb..f9a6b0e4a 100644 --- a/tsfresh/utilities/dataframe_functions.py +++ b/tsfresh/utilities/dataframe_functions.py @@ -8,6 +8,9 @@ import gc import warnings +from tsfresh import defaults +from tsfresh.utilities.distribution import MapDistributor, MultiprocessingDistributor, DistributorBaseClass + import numpy as np import pandas as pd @@ -372,7 +375,54 @@ def _normalize_input_to_internal_representation(timeseries_container, column_id, return timeseries_container, column_id, column_kind, column_value -def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_direction, max_timeshift=None, min_timeshift=1): +def _roll_out_time_series(time_shift, grouped_data, rolling_direction, max_timeshift, min_timeshift, column_sort, column_id): + """ + Internal helper function for roll_time_series. + This function has the task to extract the rolled forecast data frame of the number `time_shift`. + This means it is `time_shift` in the future - starting counting from the first row + for each id (and kind). + This means we cut out the data until `time_shift`. + The first row we cut out is either 0 or given by the maximal allowed length of `max_timeshift`. 
+ for a negative rolling direction it is reversed + """ + def _f(x): + if rolling_direction > 0: + shift_until = time_shift + shift_from = max(shift_until - max_timeshift - 1, 0) + + df_temp = x.iloc[shift_from:shift_until] if shift_until <= len(x) else None + else: + shift_from = max(time_shift - 1, 0) + shift_until = shift_from + max_timeshift + 1 + + df_temp = x.iloc[shift_from:shift_until] + + if df_temp is None or len(df_temp) < min_timeshift + 1: + return + + df_temp = df_temp.copy() + + # and set the shift correctly + if column_sort and rolling_direction > 0: + shift_string = f"timeshift={df_temp[column_sort].iloc[-1]}" + elif column_sort and rolling_direction < 0: + shift_string = f"timeshift={df_temp[column_sort].iloc[0]}" + else: + shift_string = f"timeshift={time_shift - 1}" + # and now create new ones ids out of the old ones + df_temp["id"] = df_temp.apply(lambda row: f"id={row[column_id]},{shift_string}", axis=1) + + return df_temp + + return [grouped_data.apply(_f)] + + +def roll_time_series(df_or_dict, column_id, column_sort=None, column_kind=None, + rolling_direction=1, max_timeshift=None, min_timeshift=0, + chunksize=defaults.CHUNKSIZE, + n_jobs=defaults.N_PROCESSES, show_warnings=defaults.SHOW_WARNINGS, + disable_progressbar=defaults.DISABLE_PROGRESSBAR, + distributor=None): """ This method creates sub windows of the time series. It rolls the (sorted) data frames for each kind and each id separately in the "time" domain (which is represented by the sort order of the sort column given by `column_sort`). @@ -391,24 +441,47 @@ def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_di :param df_or_dict: a pandas DataFrame or a dictionary. The required shape/form of the object depends on the rest of the passed arguments. :type df_or_dict: pandas.DataFrame or dict + :param column_id: it must be present in the pandas DataFrame or in all DataFrames in the dictionary. It is not allowed to have NaN values in this column. :type column_id: basestring or None + :param column_sort: if not None, sort the rows by this column. It is not allowed to - have NaN values in this column. + have NaN values in this column. If not given, will be filled by an increasing number. :type column_sort: basestring or None + :param column_kind: It can only be used when passing a pandas DataFrame (the dictionary is already assumed to be grouped by the kind). Is must be present in the DataFrame and no NaN values are allowed. If the kind column is not passed, it is assumed that each column in the pandas DataFrame (except the id or sort column) is a possible kind. :type column_kind: basestring or None + :param rolling_direction: The sign decides, if to roll backwards or forwards in "time" :type rolling_direction: int + :param max_timeshift: If not None, shift only up to max_timeshift. If None, shift as often as possible. :type max_timeshift: int - :param min_timeshift: Throw away all extracted forecast windows smaller than this. Must be larger than 0. + + :param min_timeshift: Throw away all extracted forecast windows smaller than this. Must be larger than or equal 0. :type min_timeshift: int + :param n_jobs: The number of processes to use for parallelization. If zero, no parallelization is used. + :type n_jobs: int + + :param chunksize: How many shifts per job should be calculated. + :type chunksize: None or int + + :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators). 
+ :type show_warnings: bool + + :param disable_progressbar: Do not show a progressbar while doing the calculation. + :type disable_progressbar: bool + + :param distributor: Advanced parameter: set this to a class name that you want to use as a + distributor. See the utilities/distribution.py for more information. Leave to None, if you want + TSFresh to choose the best distributor. + :type distributor: class + :return: The rolled data frame or dictionary of data frames :rtype: the one from df_or_dict """ @@ -419,8 +492,8 @@ def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_di if max_timeshift is not None and max_timeshift <= 0: raise ValueError("max_timeshift needs to be positive!") - if min_timeshift <= 0: - raise ValueError("min_timeshift needs to be positive!") + if min_timeshift < 0: + raise ValueError("min_timeshift needs to be positive or zero!") if isinstance(df_or_dict, dict): if column_kind is not None: @@ -432,7 +505,12 @@ def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_di column_kind=column_kind, rolling_direction=rolling_direction, max_timeshift=max_timeshift, - min_timeshift=min_timeshift) + min_timeshift=min_timeshift, + chunksize=chunksize, + n_jobs=n_jobs, + show_warnings=show_warnings, + disable_progressbar=disable_progressbar, + distributor=distributor) for key in df_or_dict} # Now we know that this is a pandas data frame @@ -484,47 +562,38 @@ def roll_time_series(df_or_dict, column_id, column_sort, column_kind, rolling_di if column_sort is None: df["sort"] = range(df.shape[0]) - def roll_out_time_series(time_shift): - # Now comes the fun part. - # This function has the task to extract the rolled forecast data frame of the number `time_shift`. - # This means it is `time_shift` in the future - starting counting from the first row - # for each id (and kind). - # This means we cut out the data until `time_shift`. - # The first row we cut out is either 0 or given by the maximal allowed length of `max_timeshift`. 
- # for a negative rolling direction it is reversed - def _f(x): - if rolling_direction > 0: - shift_until = time_shift - shift_from = max(shift_until - max_timeshift - 1, 0) - - df_temp = x.iloc[shift_from:shift_until] if shift_until <= len(x) else None - else: - shift_from = max(time_shift - 1, 0) - shift_until = shift_from + max_timeshift + 1 - - df_temp = x.iloc[shift_from:shift_until] - - if df_temp is None or len(df_temp) == 0: - return - - df_temp = df_temp.copy() - - # and set the shift correctly - if column_sort and rolling_direction > 0: - shift_string = f"timeshift={df_temp[column_sort].iloc[-1]}" - elif column_sort and rolling_direction < 0: - shift_string = f"timeshift={df_temp[column_sort].iloc[0]}" - else: - shift_string = f"timeshift={time_shift - 1}" - # and now create new ones ids out of the old ones - df_temp[column_id] = df_temp.apply(lambda row: f"id={row[column_id]},{shift_string}", axis=1) - - return df_temp - - return grouped_data.apply(_f) - - range_of_shifts = range(min_timeshift, prediction_steps + 1) - shifted_chunks = map(lambda time_shift: roll_out_time_series(time_shift), range_of_shifts) + if rolling_direction > 0: + range_of_shifts = range(1, prediction_steps + 1) + else: + range_of_shifts = range(1, prediction_steps + 1) + + if distributor is None: + if n_jobs == 0: + distributor = MapDistributor(disable_progressbar=disable_progressbar, + progressbar_title="Feature Extraction") + else: + distributor = MultiprocessingDistributor(n_workers=n_jobs, + disable_progressbar=disable_progressbar, + progressbar_title="Feature Extraction", + show_warnings=show_warnings) + + if not isinstance(distributor, DistributorBaseClass): + raise ValueError("the passed distributor is not an DistributorBaseClass object") + + kwargs = { + "grouped_data": grouped_data, + "rolling_direction": rolling_direction, + "max_timeshift": max_timeshift, + "min_timeshift": min_timeshift, + "column_sort": column_sort, + "column_id": column_id, + } + + shifted_chunks = distributor.map_reduce(_roll_out_time_series, data=range_of_shifts, + chunk_size=chunksize, function_kwargs=kwargs) + + distributor.close() + df_shift = pd.concat(shifted_chunks, ignore_index=True) return df_shift.sort_values(by=[column_id, column_sort or "sort"]) From c48000553723a4b68390fea9c92db691bc3a3753 Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Tue, 14 Apr 2020 21:51:45 +0200 Subject: [PATCH 07/23] Be python 3.5 compatible... 
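
The f-string syntax used to build the new ids, e.g. f"timeshift={time_shift - 1}",
was only introduced with PEP 498 in Python 3.6. Building the ids with plain
string concatenation instead, "timeshift=" + str(time_shift - 1), keeps the
code working on Python 3.5 as well.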
--- tsfresh/utilities/dataframe_functions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tsfresh/utilities/dataframe_functions.py b/tsfresh/utilities/dataframe_functions.py index f9a6b0e4a..5aa87dd6a 100644 --- a/tsfresh/utilities/dataframe_functions.py +++ b/tsfresh/utilities/dataframe_functions.py @@ -404,13 +404,13 @@ def _f(x): # and set the shift correctly if column_sort and rolling_direction > 0: - shift_string = f"timeshift={df_temp[column_sort].iloc[-1]}" + shift_string = "timeshift=" + str(df_temp[column_sort].iloc[-1]) elif column_sort and rolling_direction < 0: - shift_string = f"timeshift={df_temp[column_sort].iloc[0]}" + shift_string = "timeshift=" + str(df_temp[column_sort].iloc[0]) else: - shift_string = f"timeshift={time_shift - 1}" + shift_string = "timeshift=" + str(time_shift - 1) # and now create new ones ids out of the old ones - df_temp["id"] = df_temp.apply(lambda row: f"id={row[column_id]},{shift_string}", axis=1) + df_temp["id"] = df_temp.apply(lambda row: "id=" + str(row[column_id]) + "," + str(shift_string), axis=1) return df_temp From 304f450d8901425b2395b80a3d85d7aaf3e4b5dd Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Thu, 16 Apr 2020 21:27:32 +0200 Subject: [PATCH 08/23] Reworked the rolling documentation --- docs/text/forecasting.rst | 420 ++++++++++++++++---------------------- 1 file changed, 172 insertions(+), 248 deletions(-) diff --git a/docs/text/forecasting.rst b/docs/text/forecasting.rst index d11725a20..444da12c7 100644 --- a/docs/text/forecasting.rst +++ b/docs/text/forecasting.rst @@ -1,22 +1,27 @@ .. _forecasting-label: -Time series forecasting -======================= +Time series forecasting/Rolling +=============================== Features that are extracted with *tsfresh* can be used for many different tasks, such as time series classification, compression or forecasting. This section explains how one can use the features for time series forecasting tasks. -The "sort" column of a DataFrame in the supported :ref:`data-formats-label` gives a sequential state to the -individual measurements. In the case of time series this can be the *time* dimension while in the case of spectra the -order is given by the *wavelength* or *frequency* dimensions. -We can exploit this sequence to generate more input data out of a single time series, by *rolling* over the data. - Lets say you have the price of a certain stock, e.g. Apple, for 100 time steps. Now, you want to build a feature-based model to forecast future prices of the Apple stock. -So you will have to extract features in every time step of the original time series while looking at -a certain number of past values. -A rolling mechanism will give you the sub time series of last *m* time steps to construct the features. +You could remove the last price value (of today) and extract features from the time series until today to predict the price of today. +But this would only give you a single example to train. +However, you can repeat this process: for every day in your stock price time series, remove the current value, extract features for the time until this value and train to predict the value of the day (which you removed). +In `tsfresh`, this is called *rolling*. + +Rolling is a way, to turn a single time series into multiple time series, each of them ending (or starting, depending on the roll direction) one time step later than the one before. 
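+For instance, rolling the values 1, 2, 3, 4 of a single id forward produces the
+sub-series [1], [1, 2], [1, 2, 3] and [1, 2, 3, 4].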
+The rolling utilities implemented in `tsfresh` help you in this process of reshaping (and rolling) your data into a form, so that you can apply the usual :func:`tsfresh.extract_features` method. + +Please note that "time" does not necessarily mean clock time here. +The "sort" column of a DataFrame in the supported :ref:`data-formats-label` gives a sequential state to the +individual measurements. +In the case of time series this can be the *time* dimension while in the case of spectra the +order is given by the *wavelength* or *frequency* dimensions. The following image illustrates the process: @@ -26,21 +31,17 @@ The following image illustrates the process: :align: center - -So, we move the window that extract the features and then predict the next time step (which was not used to extract features) forward. -In the above image, the window moves from left to right. - Another example can be found in streaming data, e.g. in Industry 4.0 applications. Here you typically get one new data row at a time and use this to for example predict machine failures. To train your model, you could act as if you would stream the data, by feeding your classifier the data after one time step, the data after the first two time steps etc. -Both examples imply, that you extract the features not only on the full data set, but also -on all temporal coherent subsets of data, which is the process of *rolling*. In tsfresh, this is implemented in the -function :func:`tsfresh.utilities.dataframe_functions.roll_time_series`. +In tsfresh, rolling is implemented via the helper function :func:`tsfresh.utilities.dataframe_functions.roll_time_series`. Further, we provide the :func:`tsfresh.utilities.dataframe_functions.make_forecasting_frame` method as a convenient wrapper to fast construct the container and target vector for a given sequence. +Let's walk through an example to see how it works: + The rolling mechanism --------------------- @@ -56,260 +57,183 @@ To see what this does in real-world applications, we look into the following exa +----+------+----+----+ | id | time | x | y | +====+======+====+====+ -| 1 | t1 | 1 | 5 | +| 1 | 1 | 1 | 5 | +----+------+----+----+ -| 1 | t2 | 2 | 6 | +| 1 | 2 | 2 | 6 | +----+------+----+----+ -| 1 | t3 | 3 | 7 | +| 1 | 3 | 3 | 7 | +----+------+----+----+ -| 1 | t4 | 4 | 8 | +| 1 | 4 | 4 | 8 | +----+------+----+----+ -| 2 | t8 | 10 | 12 | +| 2 | 8 | 10 | 12 | +----+------+----+----+ -| 2 | t9 | 11 | 13 | +| 2 | 9 | 11 | 13 | +----+------+----+----+ where you have measured the values from two sensors x and y for two different entities (id 1 and 2) in 4 or 2 time steps (t1 to t9). -Now, we can use :func:`tsfresh.utilities.dataframe_functions.roll_time_series` to get consecutive sub-time series. -E.g. if you set `rolling` to 0, the feature extraction works on the original time series without any rolling. +If you want to follow along, here is the python code to generate this data: -So it extracts 2 set of features, +.. 
code:: python -+----+------+----+----+ -| id | time | x | y | -+====+======+====+====+ -| 1 | t1 | 1 | 5 | -+----+------+----+----+ -| 1 | t2 | 2 | 6 | -+----+------+----+----+ -| 1 | t3 | 3 | 7 | -+----+------+----+----+ -| 1 | t4 | 4 | 8 | -+----+------+----+----+ - -and - -+----+------+----+----+ -| id | time | x | y | -+====+======+====+====+ -| 2 | t8 | 10 | 12 | -+----+------+----+----+ -| 2 | t9 | 11 | 13 | -+----+------+----+----+ - -If you set rolling to 1, the feature extraction works with all of the following time series: - -+----+------+----+----+ -| id | time | x | y | -+====+======+====+====+ -| 1 | t1 | 1 | 5 | -+----+------+----+----+ - -+----+------+----+----+ -| id | time | x | y | -+====+======+====+====+ -| 1 | t1 | 1 | 5 | -+----+------+----+----+ -| 1 | t2 | 2 | 6 | -+----+------+----+----+ - -+----+------+----+----+ -| id | time | x | y | -+====+======+====+====+ -| 1 | t1 | 1 | 5 | -+----+------+----+----+ -| 1 | t2 | 2 | 6 | -+----+------+----+----+ -| 1 | t3 | 3 | 7 | -+----+------+----+----+ -| 2 | t8 | 10 | 12 | -+----+------+----+----+ - -+----+------+----+----+ -| id | time | x | y | -+====+======+====+====+ -| 1 | t1 | 1 | 5 | -+----+------+----+----+ -| 1 | t2 | 2 | 6 | -+----+------+----+----+ -| 1 | t3 | 3 | 7 | -+----+------+----+----+ -| 1 | t4 | 4 | 8 | -+----+------+----+----+ -| 2 | t8 | 10 | 12 | -+----+------+----+----+ -| 2 | t9 | 11 | 13 | -+----+------+----+----+ - -If you set rolling to -1, you end up with features for the time series, rolled in the other direction - -+----+------+----+----+ -| id | time | x | y | -+====+======+====+====+ -| 1 | t4 | 4 | 8 | -+----+------+----+----+ - -+----+------+----+----+ -| id | time | x | y | -+====+======+====+====+ -| 1 | t3 | 3 | 7 | -+----+------+----+----+ -| 1 | t4 | 4 | 8 | -+----+------+----+----+ - -+----+------+----+----+ -| id | time | x | y | -+====+======+====+====+ -| 1 | t2 | 2 | 6 | -+----+------+----+----+ -| 1 | t3 | 3 | 7 | -+----+------+----+----+ -| 1 | t4 | 4 | 8 | -+----+------+----+----+ -| 2 | t9 | 11 | 13 | -+----+------+----+----+ - -+----+------+----+----+ -| id | time | x | y | -+====+======+====+====+ -| 1 | t1 | 1 | 5 | -+----+------+----+----+ -| 1 | t2 | 2 | 6 | -+----+------+----+----+ -| 1 | t3 | 3 | 7 | -+----+------+----+----+ -| 1 | t4 | 4 | 8 | -+----+------+----+----+ -| 2 | t8 | 10 | 12 | -+----+------+----+----+ -| 2 | t9 | 11 | 13 | -+----+------+----+----+ - -We only gave an example for the flat DataFrame format, but rolling actually works on all 3 :ref:`data-formats-label` -that are supported by tsfresh. + import pandas as pd + df = pd.DataFrame({ + "id": [1, 1, 1, 1, 2, 2], + "time": [1, 2, 3, 4, 8, 9], + "x": [1, 2, 3, 4, 10, 11], + "y": [5, 6, 7, 8, 12, 13], + }) +Now, we can use :func:`tsfresh.utilities.dataframe_functions.roll_time_series` to get consecutive sub-time series. +You could think of having a window sliding over your time series data and extracting out every data you can see through this window. +There are three parameters to tune the window: +* `rolling_direction`: if you want to slide in positive (increasing sort) or negative (decreasing sort) direction. Default is positive. +* `max_timeshift` defines, how large the window size will grow. This means the extracted time series will have at maximum `max_timeshift + 1` steps in the past (or future). Default is infinite. +* `min_timeshift` defines the minimal size. Defaults to 0. + +The column parameters are the same as in the usual :ref:`data-formats-label`. 
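
As a sketch (the parameter values are chosen purely for illustration), a bounded
forward rolling could look like this:

.. code:: python

    from tsfresh.utilities.dataframe_functions import roll_time_series

    # illustrative values: roll forward, keep at most max_timeshift + 1 = 4
    # rows per window and drop windows with fewer than min_timeshift + 1 = 2 rows
    df_rolled_bounded = roll_time_series(df, column_id="id", column_sort="time",
                                         rolling_direction=1,
                                         max_timeshift=3, min_timeshift=1)

In the example below we keep the defaults, i.e. an unbounded forward window.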
+ +Let's see what will happen with our data sample: + +.. code:: python + + from tsfresh.utilities.dataframe_functions import roll_time_series + df_rolled = roll_time_series(df, column_id="id", column_sort="time") + +The new data set consists only of values from the old data set, but with new indices. +If you group by index, you will end up with the following parts: + ++-----------------+-------+---+----+ +|id | time | x | y | ++=================+=======+===+====+ +|id=1,timeshift=1 | 1 | 1 | 5 | ++-----------------+-------+---+----+ + ++-----------------+-------+---+----+ +|id | time | x | y | ++=================+=======+===+====+ +|id=1,timeshift=2 | 1 | 1 | 5 | ++-----------------+-------+---+----+ +|id=1,timeshift=2 | 2 | 2 | 6 | ++-----------------+-------+---+----+ + ++-----------------+-------+---+----+ +|id | time | x | y | ++=================+=======+===+====+ +|id=1,timeshift=3 | 1 | 1 | 5 | ++-----------------+-------+---+----+ +|id=1,timeshift=3 | 2 | 2 | 6 | ++-----------------+-------+---+----+ +|id=1,timeshift=3 | 3 | 3 | 7 | ++-----------------+-------+---+----+ + ++-----------------+-------+---+----+ +|id | time | x | y | ++=================+=======+===+====+ +|id=1,timeshift=4 | 1 | 1 | 5 | ++-----------------+-------+---+----+ +|id=1,timeshift=4 | 2 | 2 | 6 | ++-----------------+-------+---+----+ +|id=1,timeshift=4 | 3 | 3 | 7 | ++-----------------+-------+---+----+ +|id=1,timeshift=4 | 4 | 4 | 8 | ++-----------------+-------+---+----+ + ++-----------------+-------+---+----+ +|id | time | x | y | ++=================+=======+===+====+ +|id=2,timeshift=8 | 8 |10 | 12 | ++-----------------+-------+---+----+ + ++-----------------+-------+---+----+ +|id | time | x | y | ++=================+=======+===+====+ +|id=2,timeshift=9 | 8 |10 | 12 | ++-----------------+-------+---+----+ +|id=2,timeshift=9 | 9 |11 | 13 | ++-----------------+-------+---+----+ + +Each of those parts can now be treated independently. +For example, you could run the usual feature extraction on them: + +.. code:: python + + from tsfresh import extract_features + df_features = extract_features(df_rolled, column_id="id", column_sort="time") + +You will end up with features generated for each of the parts above, which you can then use for training your forecasting model. + ++------------------+----------------+-----------------------------+-----+ +| variable | x__abs_energy | x__absolute_sum_of_changes | ... | ++==================+================+=============================+=====+ +| id | | | ... | ++------------------+----------------+-----------------------------+-----+ +| id=1,timeshift=1 | 1.0 | 0.0 | ... | ++------------------+----------------+-----------------------------+-----+ +| id=1,timeshift=2 | 5.0 | 1.0 | ... | ++------------------+----------------+-----------------------------+-----+ +| id=1,timeshift=3 | 14.0 | 2.0 | ... | ++------------------+----------------+-----------------------------+-----+ +| id=1,timeshift=4 | 30.0 | 3.0 | ... | ++------------------+----------------+-----------------------------+-----+ +| id=2,timeshift=8 | 100.0 | 0.0 | ... | ++------------------+----------------+-----------------------------+-----+ +| id=2,timeshift=9 | 221.0 | 1.0 | ... | ++------------------+----------------+-----------------------------+-----+ + +If you want to train for a forecasting, `tsfresh` also offers the function :func:`tsfresh.utilities.dataframe_functions.make_forecasting_frame`, which will also help you match the target vector properly. 
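
A minimal sketch of how this could be used for a univariate series (the price
values and parameters below are made up for illustration):

.. code:: python

    import pandas as pd
    from tsfresh.utilities.dataframe_functions import make_forecasting_frame

    # a toy series of prices (illustrative values)
    x = pd.Series([223.5, 224.0, 226.1, 225.3, 227.8])

    # df_shift is the already rolled container for extract_features,
    # y contains for every window id the next value to be predicted
    df_shift, y = make_forecasting_frame(x, kind="price",
                                         max_timeshift=3,
                                         rolling_direction=1)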
This process is also visualized by the following figure. It shows how the purple, rolled sub-timeseries are used as base for the construction of the feature matrix *X* -(after calculation of the features by *f*). +(if *f* is the `extract_features` function). The green data points need to be predicted by the model and are used as rows in the target vector *y*. +Be aware that this only works for a one-dimensional time series of a single `id` and `kind`. .. image:: ../images/rolling_mechanism_2.png :scale: 100 % :alt: The rolling mechanism :align: center - - Parameters and Implementation Notes ----------------------------------- The above example demonstrates the overall rolling mechanism, which creates new time series. -Now we discuss the naming convention for such new time series: - -For identifying every subsequence, tsfresh uses the time stamp of the point that will be predicted as new "id". -The above example with rolling set to 1 yields the following sub-time series: - -+-----------+------+----+----+ -| id | time | x | y | -+===========+======+====+====+ -| t1 | t1 | 1 | 5 | -+-----------+------+----+----+ - -+-----------+------+----+----+ -| id | time | x | y | -+===========+======+====+====+ -| t2 | t1 | 1 | 5 | -+-----------+------+----+----+ -| t2 | t2 | 2 | 6 | -+-----------+------+----+----+ - -+-----------+------+----+----+ -| id | time | x | y | -+===========+======+====+====+ -| t3 | t1 | 1 | 5 | -+-----------+------+----+----+ -| t3 | t2 | 2 | 6 | -+-----------+------+----+----+ -| t3 | t3 | 3 | 7 | -+-----------+------+----+----+ - -+-----------+------+----+----+ -| id | time | x | y | -+===========+======+====+====+ -| t4 | t1 | 1 | 5 | -+-----------+------+----+----+ -| t4 | t2 | 2 | 6 | -+-----------+------+----+----+ -| t4 | t3 | 3 | 7 | -+-----------+------+----+----+ -| t4 | t4 | 4 | 8 | -+-----------+------+----+----+ - -+-----------+------+----+----+ -| id | time | x | y | -+===========+======+====+====+ -| t8 | t8 | 10 | 12 | -+-----------+------+----+----+ - -+-----------+------+----+----+ -| id | time | x | y | -+===========+======+====+====+ -| t9 | t8 | 10 | 12 | -+-----------+------+----+----+ -| t9 | t9 | 11 | 13 | -+-----------+------+----+----+ - -The new id is the time stamp where the shift ended. -So above, every table represents a sub-time series. -The higher the shift value, the more steps the time series was moved into the specified direction (into the past in -this example). - -If you want to limit how far the time series shall be shifted into the specified direction, you can set the -*max_timeshift* parameter to the maximum time steps to be shifted. 
-In our example, setting *max_timeshift* to 1 yields the following result (setting it to 0 will create all possible shifts): - -+-----------+------+----+----+ -| id | time | x | y | -+===========+======+====+====+ -| t1 | t1 | 1 | 5 | -+-----------+------+----+----+ - -+-----------+------+----+----+ -| id | time | x | y | -+===========+======+====+====+ -| t2 | t1 | 1 | 5 | -+-----------+------+----+----+ -| t2 | t2 | 2 | 6 | -+-----------+------+----+----+ - -+-----------+------+----+----+ -| id | time | x | y | -+===========+======+====+====+ -| t3 | t2 | 2 | 6 | -+-----------+------+----+----+ -| t3 | t3 | 3 | 7 | -+-----------+------+----+----+ - -+-----------+------+----+----+ -| id | time | x | y | -+===========+======+====+====+ -| t4 | t3 | 3 | 7 | -+-----------+------+----+----+ -| t4 | t4 | 4 | 8 | -+-----------+------+----+----+ - -+-----------+------+----+----+ -| id | time | x | y | -+===========+======+====+====+ -| t8 | t8 | 10 | 12 | -+-----------+------+----+----+ - -+-----------+------+----+----+ -| id | time | x | y | -+===========+======+====+====+ -| t9 | t8 | 10 | 12 | -+-----------+------+----+----+ -| t9 | t9 | 11 | 13 | -+-----------+------+----+----+ \ No newline at end of file +Now we discuss the naming convention for such new time series. + +For identifying every subsequence, `tsfresh` uses the time stamp of the point that will be predicted together with the old identifier as "id". +For positive rolling, this `timeshift` is the last time stamp in the subsequence. +For negative rolling, it is the first one, for example the above dataframe rolled in negative direction gives us: + ++------------------+------+----+----+ +|id | time | x | y | ++==================+======+====+====+ +|id=1,timeshift=1 | 1 | 1 | 5 | ++------------------+------+----+----+ +|id=1,timeshift=1 | 2 | 2 | 6 | ++------------------+------+----+----+ +|id=1,timeshift=1 | 3 | 3 | 7 | ++------------------+------+----+----+ +|id=1,timeshift=1 | 4 | 4 | 8 | ++------------------+------+----+----+ +|id=1,timeshift=2 | 2 | 2 | 6 | ++------------------+------+----+----+ +|id=1,timeshift=2 | 3 | 3 | 7 | ++------------------+------+----+----+ +|id=1,timeshift=2 | 4 | 4 | 8 | ++------------------+------+----+----+ +|id=1,timeshift=3 | 3 | 3 | 7 | ++------------------+------+----+----+ +|id=1,timeshift=3 | 4 | 4 | 8 | ++------------------+------+----+----+ +|id=1,timeshift=4 | 4 | 4 | 8 | ++------------------+------+----+----+ +|id=2,timeshift=8 | 8 | 10 | 12 | ++------------------+------+----+----+ +|id=2,timeshift=8 | 9 | 11 | 13 | ++------------------+------+----+----+ +|id=2,timeshift=9 | 9 | 11 | 13 | ++------------------+------+----+----+ + +which you could use to predict the current value using the future time series values (if that makes sense in your case). + +Choosing a non-default `max_timeshift` or `min_timeshift` would make the extracted sub-time-series smaller or even remove them completely (e.g. with `min_timeshift = 1` the `id=1,timeshift=1` of the positive rolling case would disappear). 
\ No newline at end of file From 7bf7b9ee853f26dc9744cf43af61aa2c213fd91e Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Thu, 16 Apr 2020 21:28:07 +0200 Subject: [PATCH 09/23] Copy-paste error --- tsfresh/utilities/dataframe_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tsfresh/utilities/dataframe_functions.py b/tsfresh/utilities/dataframe_functions.py index 5aa87dd6a..97464be9e 100644 --- a/tsfresh/utilities/dataframe_functions.py +++ b/tsfresh/utilities/dataframe_functions.py @@ -570,11 +570,11 @@ def roll_time_series(df_or_dict, column_id, column_sort=None, column_kind=None, if distributor is None: if n_jobs == 0: distributor = MapDistributor(disable_progressbar=disable_progressbar, - progressbar_title="Feature Extraction") + progressbar_title="Rolling") else: distributor = MultiprocessingDistributor(n_workers=n_jobs, disable_progressbar=disable_progressbar, - progressbar_title="Feature Extraction", + progressbar_title="Rolling", show_warnings=show_warnings) if not isinstance(distributor, DistributorBaseClass): From 08e7343de9f4cda4f0014dca274acbec04c2a335 Mon Sep 17 00:00:00 2001 From: Sophie Walther Date: Thu, 16 Apr 2020 21:50:52 +0200 Subject: [PATCH 10/23] Improved forecasting documentation --- docs/text/forecasting.rst | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/docs/text/forecasting.rst b/docs/text/forecasting.rst index 444da12c7..ce6306572 100644 --- a/docs/text/forecasting.rst +++ b/docs/text/forecasting.rst @@ -32,7 +32,7 @@ The following image illustrates the process: Another example can be found in streaming data, e.g. in Industry 4.0 applications. -Here you typically get one new data row at a time and use this to for example predict machine failures. To train your model, +Here you typically get one new data row at a time and use this to, for example, predict machine failures. To train your model, you could act as if you would stream the data, by feeding your classifier the data after one time step, the data after the first two time steps etc. @@ -45,14 +45,7 @@ Let's walk through an example to see how it works: The rolling mechanism --------------------- -The rolling mechanism takes a time series :math:`x` with its data rows :math:`[x_1, x_2, x_3, ..., x_n]` -and creates :math:`n` new time series :math:`\hat x^k`, each of them with a different consecutive part -of :math:`x`: - -.. math:: - \hat x^k = [x_k, x_{k-1}, x_{k-2}, ..., x_1] - -To see what this does in real-world applications, we look into the following example flat DataFrame in tsfresh format +We look into the following example flat DataFrame in tsfresh format +----+------+----+----+ | id | time | x | y | @@ -88,6 +81,7 @@ If you want to follow along, here is the python code to generate this data: Now, we can use :func:`tsfresh.utilities.dataframe_functions.roll_time_series` to get consecutive sub-time series. You could think of having a window sliding over your time series data and extracting out every data you can see through this window. There are three parameters to tune the window: + * `rolling_direction`: if you want to slide in positive (increasing sort) or negative (decreasing sort) direction. Default is positive. * `max_timeshift` defines, how large the window size will grow. This means the extracted time series will have at maximum `max_timeshift + 1` steps in the past (or future). Default is infinite. * `min_timeshift` defines the minimal size. Defaults to 0. 
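To make the three window parameters above concrete, here is a minimal sketch (not part of the patch) that reproduces the rolling on the example dataframe from the documentation; it assumes the keyword defaults described in this patch series:

    import pandas as pd
    from tsfresh.utilities.dataframe_functions import roll_time_series

    # the flat example dataframe from the forecasting documentation
    df = pd.DataFrame({
        "id": [1, 1, 1, 1, 2, 2],
        "time": [1, 2, 3, 4, 8, 9],
        "x": [1, 2, 3, 4, 10, 11],
        "y": [5, 6, 7, 8, 12, 13],
    })

    # positive rolling with a window reaching at most one step into the past,
    # i.e. each extracted sub-series has at most max_timeshift + 1 rows
    rolled = roll_time_series(df, column_id="id", column_sort="time",
                              rolling_direction=1, max_timeshift=1)
    print(rolled)

Setting `rolling_direction=-1` instead produces the negative-rolling windows shown in the tables above.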
From c6e41e220b409d81af14b09b89713e1fa3c25246 Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Thu, 16 Apr 2020 21:56:47 +0200 Subject: [PATCH 11/23] pep8ify --- tests/units/utilities/test_dataframe_functions.py | 1 - tsfresh/utilities/dataframe_functions.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/units/utilities/test_dataframe_functions.py b/tests/units/utilities/test_dataframe_functions.py index 66fba8637..35725aa66 100644 --- a/tests/units/utilities/test_dataframe_functions.py +++ b/tests/units/utilities/test_dataframe_functions.py @@ -479,7 +479,6 @@ def test_negative_rolling(self): self.assertListEqual(list(df["a"].values), correct_values_a) self.assertListEqual(list(df["b"].values), correct_values_b) - df = dataframe_functions.roll_time_series(df_full, column_id="id", column_sort="time", column_kind=None, rolling_direction=-1, min_timeshift=2, diff --git a/tsfresh/utilities/dataframe_functions.py b/tsfresh/utilities/dataframe_functions.py index 97464be9e..9aa6be5db 100644 --- a/tsfresh/utilities/dataframe_functions.py +++ b/tsfresh/utilities/dataframe_functions.py @@ -375,7 +375,8 @@ def _normalize_input_to_internal_representation(timeseries_container, column_id, return timeseries_container, column_id, column_kind, column_value -def _roll_out_time_series(time_shift, grouped_data, rolling_direction, max_timeshift, min_timeshift, column_sort, column_id): +def _roll_out_time_series(time_shift, grouped_data, rolling_direction, max_timeshift, min_timeshift, + column_sort, column_id): """ Internal helper function for roll_time_series. This function has the task to extract the rolled forecast data frame of the number `time_shift`. @@ -545,7 +546,7 @@ def roll_time_series(df_or_dict, column_id, column_sort=None, column_kind=None, # Test if all differences are the same if differences and min(differences) != max(differences): warnings.warn("Your time stamps are not uniformly sampled, which makes rolling " - "nonsensical in some domains.") + "nonsensical in some domains.") # Roll the data frames if requested rolling_direction = np.sign(rolling_direction) From ab24c93c22199f8204be3db35cecd3a00401fe71 Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Mon, 20 Apr 2020 22:32:59 +0200 Subject: [PATCH 12/23] Speed up quantile calculation --- tsfresh/feature_extraction/feature_calculators.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tsfresh/feature_extraction/feature_calculators.py b/tsfresh/feature_extraction/feature_calculators.py index 42f2039de..7617d076f 100644 --- a/tsfresh/feature_extraction/feature_calculators.py +++ b/tsfresh/feature_extraction/feature_calculators.py @@ -1597,7 +1597,6 @@ def autocorrelation(x, lag): @set_property("fctype", "simple") -@set_property("input", "pd.Series") def quantile(x, q): """ Calculates the q quantile of x. This is the value of x greater than q% of the ordered values from x. 
@@ -1609,9 +1608,9 @@ def quantile(x, q): :return: the value of this feature :return type: float """ - if not isinstance(x, pd.Series): - x = pd.Series(x) - return pd.Series.quantile(x, q) + if len(x) == 0: + return np.NaN + return np.quantile(x, q) @set_property("fctype", "simple") From ce493e5735bea79424f8bb357a4e357caebdb369 Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Mon, 20 Apr 2020 22:37:20 +0200 Subject: [PATCH 13/23] Replace sample_entropy by corrected and faster reference implementation from Wikipedia --- .../test_feature_calculations.py | 2 +- .../feature_extraction/feature_calculators.py | 52 +++++++------------ 2 files changed, 20 insertions(+), 34 deletions(-) diff --git a/tests/units/feature_extraction/test_feature_calculations.py b/tests/units/feature_extraction/test_feature_calculations.py index e48050df3..bb5db351f 100644 --- a/tests/units/feature_extraction/test_feature_calculations.py +++ b/tests/units/feature_extraction/test_feature_calculations.py @@ -794,7 +794,7 @@ def test_sample_entropy(self): ts = [1, 4, 5, 1, 7, 3, 1, 2, 5, 8, 9, 7, 3, 7, 9, 5, 4, 3, 9, 1, 2, 3, 4, 2, 9, 6, 7, 4, 9, 2, 9, 9, 6, 5, 1, 3, 8, 1, 5, 3, 8, 4, 1, 2, 2, 1, 6, 5, 3, 6, 5, 4, 8, 9, 6, 7, 5, 3, 2, 5, 4, 2, 5, 1, 6, 5, 3, 5, 6, 7, 8, 5, 2, 8, 6, 3, 8, 2, 7, 1, 7, 3, 5, 6, 2, 1, 3, 7, 3, 5, 3, 7, 6, 7, 7, 2, 3, 1, 7, 8] - self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 2.21187685) + self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 2.374905754573672) def test_autocorrelation(self): self.assertAlmostEqualOnAllArrayTypes(autocorrelation, [1, 2, 1, 2, 1, 2], -1, 1) diff --git a/tsfresh/feature_extraction/feature_calculators.py b/tsfresh/feature_extraction/feature_calculators.py index 7617d076f..b58288d0a 100644 --- a/tsfresh/feature_extraction/feature_calculators.py +++ b/tsfresh/feature_extraction/feature_calculators.py @@ -1516,40 +1516,26 @@ def sample_entropy(x): """ x = np.array(x) - sample_length = 1 # number of sequential points of the time series - tolerance = 0.2 * np.std(x) # 0.2 is a common value for r - why? + m = 2 # common value for m, according to wikipedia... + tolerance = 0.2 * np.std(x) # 0.2 is a common value for r, according to wikipedia... 
- n = len(x) - prev = np.zeros(n) - curr = np.zeros(n) - A = np.zeros((1, 1)) # number of matches for m = [1,...,template_length - 1] - B = np.zeros((1, 1)) # number of matches for m = [1,...,template_length] - - for i in range(n - 1): - nj = n - i - 1 - ts1 = x[i] - for jj in range(nj): - j = jj + i + 1 - if abs(x[j] - ts1) < tolerance: # distance between two vectors - curr[jj] = prev[jj] + 1 - temp_ts_length = min(sample_length, curr[jj]) - for m in range(int(temp_ts_length)): - A[m] += 1 - if j < n - 1: - B[m] += 1 - else: - curr[jj] = 0 - for j in range(nj): - prev[j] = curr[j] - - N = n * (n - 1) / 2 - B = np.vstack(([N], B[0])) - - # sample entropy = -1 * (log (A/B)) - similarity_ratio = A / B - se = -1 * np.log(similarity_ratio) - se = np.reshape(se, -1) - return se[0] + N = len(x) + + # Split time series and save all templates of length m + xmi = np.array([x[i:i + m] for i in range(N - m)]) + xmj = np.array([x[i:i + m] for i in range(N - m + 1)]) + + # Save all matches minus the self-match, compute B + B = np.sum([np.sum(np.abs(xmii - xmj).max(axis=1) <= tolerance) - 1 for xmii in xmi]) + + # Similar for computing A + m += 1 + xm = np.array([x[i:i + m] for i in range(N - m + 1)]) + + A = np.sum([np.sum(np.abs(xmi - xm).max(axis=1) <= tolerance) - 1 for xmi in xm]) + + # Return SampEn + return -np.log(A / B) @set_property("fctype", "simple") From c44080c241db556d45f1bc0c9e1b8cceea9e269c Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Wed, 22 Apr 2020 21:45:00 +0200 Subject: [PATCH 14/23] Added task file for testing --- issues/205/tasks.py | 148 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 issues/205/tasks.py diff --git a/issues/205/tasks.py b/issues/205/tasks.py new file mode 100644 index 000000000..4971277a6 --- /dev/null +++ b/issues/205/tasks.py @@ -0,0 +1,148 @@ +from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters, extract_features + +import pandas as pd +import numpy as np +from time import time +from tqdm import tqdm +import matplotlib.pylab as plt +import b2luigi as luigi +import json + +np.random.seed(42) + + +class DataCreationTask(luigi.Task): + num_ids = luigi.IntParameter(default=100) + time_series_length = luigi.IntParameter() + + def output(self): + yield self.add_to_output("data.csv") + + def run(self): + df = pd.concat([ + pd.DataFrame({ + "id": [i] * self.time_series_length, + "time": range(self.time_series_length), + "value": np.random.randn(self.time_series_length) + }) + for i in range(self.num_ids) + ]) + + df.to_csv(self.get_output_file_name("data.csv")) + + +@luigi.requires(DataCreationTask) +class TimingTask(luigi.Task): + feature_parameter = luigi.DictParameter(hashed=True) + n_jobs = luigi.IntParameter() + try_number = luigi.IntParameter() + + def output(self): + yield self.add_to_output("result.json") + + def run(self): + input_file = self.get_input_file_names("data.csv")[0] + + df = pd.read_csv(input_file) + + start_time = time() + extract_features(df, column_id="id", column_sort="time", n_jobs=self.n_jobs, + default_fc_parameters=self.feature_parameter, + disable_progressbar=True) + end_time = time() + + single_parameter_name = list(self.feature_parameter.keys())[0] + single_parameter_params = self.feature_parameter[single_parameter_name] + + result_json = { + "time": end_time - start_time, + "n_ids": self.num_ids, + "n_jobs": self.n_jobs, + "feature": single_parameter_name, + "number_parameters": len(single_parameter_params) if single_parameter_params else 0, + 
"time_series_length": int((df["id"] == 0).sum()), + "try_number": self.try_number, + } + + with open(self.get_output_file_name("result.json"), "w") as f: + json.dump(result_json, f) + + +@luigi.requires(DataCreationTask) +class FullTimingTask(luigi.Task): + n_jobs = luigi.IntParameter() + + def output(self): + yield self.add_to_output("result.json") + + def run(self): + input_file = self.get_input_file_names("data.csv")[0] + + df = pd.read_csv(input_file) + + start_time = time() + extract_features(df, column_id="id", column_sort="time", n_jobs=self.n_jobs, + disable_progressbar=True) + end_time = time() + + result_json = { + "time": end_time - start_time, + "n_ids": self.num_ids, + "n_jobs": self.n_jobs, + "time_series_length": int((df["id"] == 0).sum()), + } + + with open(self.get_output_file_name("result.json"), "w") as f: + json.dump(result_json, f) + + +class CombinerTask(luigi.Task): + def complete(self): + return False + + def requires(self): + settings = ComprehensiveFCParameters() + for job in [0, 1, 4]: + for time_series_length in [100, 500, 1000, 5000]: + yield FullTimingTask(time_series_length=time_series_length, + n_jobs=job, + num_ids=10) + yield FullTimingTask(time_series_length=time_series_length, + n_jobs=job, + num_ids=100) + + for feature_name in settings: + yield TimingTask( + feature_parameter={feature_name: settings[feature_name]}, + time_series_length=time_series_length, + n_jobs=job, + num_ids=100, + try_number=0, + ) + + for try_number in range(3): + yield TimingTask( + feature_parameter={feature_name: settings[feature_name]}, + n_jobs=job, + try_number=try_number, + num_ids=10, + time_series_length=time_series_length + ) + + def output(self): + yield self.add_to_output("results.csv") + + def run(self): + results = [] + + for input_file in self.get_input_file_names("result.json"): + with open(input_file, "r") as f: + results.append(json.load(f)) + + df = pd.DataFrame(results) + df.to_csv(self.get_output_file_name("results.csv")) + + +if __name__ == "__main__": + luigi.set_setting("result_path", "results") + luigi.process(CombinerTask()) \ No newline at end of file From d0465c68ab7a0921dfedbf22302c6b55249d06e4 Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Wed, 29 Apr 2020 22:16:35 +0200 Subject: [PATCH 15/23] Bump the requirement for numpy to use the quantile function --- .travis.yml | 4 ++-- requirements.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index b9a7de796..47be05bd7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -69,10 +69,10 @@ jobs: # python 3.7 requires pandas >= 0.23.2 python: 3.7 - - env: NUMPY="1.12.0" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0" STATSMODELS="0.9.0" + - env: NUMPY="1.15.1" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0" STATSMODELS="0.9.0" python: 3.6 - - env: NUMPY="1.12.0" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0" STATSMODELS="0.9.0" + - env: NUMPY="1.15.1" PANDAS="0.20.3" SCIKITLEARN="0.19.0" DASK="0.15.2" DISTRIBUTED="1.18.3" SCIPY="1.2.0" STATSMODELS="0.9.0" python: 3.5.3 diff --git a/requirements.txt b/requirements.txt index a9ee6d8f6..72cd2e5ba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ requests>=2.9.1 -numpy>=1.12.0 +numpy>=1.15.1 pandas>=0.20.3,!=0.24.* # pandas dropna is buggy in 0.24.0, see https://github.com/blue-yonder/tsfresh/issues/485 and https://github.com/pandas-dev/pandas/issues/25087 scipy>=1.2.0 statsmodels>=0.8.0 
From bc59a0d7110b4e7cdc6de6f6a9c84760cfed6952 Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Fri, 1 May 2020 16:00:31 +0200 Subject: [PATCH 16/23] Move to correct location --- .../scripts/measure_execution_time.py | 39 +++++++++++++------ 1 file changed, 27 insertions(+), 12 deletions(-) rename issues/205/tasks.py => tsfresh/scripts/measure_execution_time.py (77%) diff --git a/issues/205/tasks.py b/tsfresh/scripts/measure_execution_time.py similarity index 77% rename from issues/205/tasks.py rename to tsfresh/scripts/measure_execution_time.py index 4971277a6..137f9113c 100644 --- a/issues/205/tasks.py +++ b/tsfresh/scripts/measure_execution_time.py @@ -1,10 +1,16 @@ +# This script extracts the execution time for +# various different settings of tsfresh +# using different input data +# Attention: it will run for ~half a day +# Do these calculations in a controlled environment +# (e.g. a cloud provider VM) +# You will need to have b2luigi installed. from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters, extract_features import pandas as pd import numpy as np from time import time -from tqdm import tqdm -import matplotlib.pylab as plt +from luigi.contrib import gcs import b2luigi as luigi import json @@ -12,6 +18,7 @@ class DataCreationTask(luigi.Task): + """Create random data for testing""" num_ids = luigi.IntParameter(default=100) time_series_length = luigi.IntParameter() @@ -28,11 +35,13 @@ def run(self): for i in range(self.num_ids) ]) - df.to_csv(self.get_output_file_name("data.csv")) + with self._get_output_target("data.csv").open("w") as f: + df.to_csv(f) @luigi.requires(DataCreationTask) class TimingTask(luigi.Task): + """Run tsfresh with the given parameters""" feature_parameter = luigi.DictParameter(hashed=True) n_jobs = luigi.IntParameter() try_number = luigi.IntParameter() @@ -41,9 +50,10 @@ def output(self): yield self.add_to_output("result.json") def run(self): - input_file = self.get_input_file_names("data.csv")[0] + input_file = self._get_input_targets("data.csv")[0] - df = pd.read_csv(input_file) + with input_file.open("r") as f: + df = pd.read_csv(f) start_time = time() extract_features(df, column_id="id", column_sort="time", n_jobs=self.n_jobs, @@ -64,21 +74,23 @@ def run(self): "try_number": self.try_number, } - with open(self.get_output_file_name("result.json"), "w") as f: + with self._get_output_target("result.json").open("w") as f: json.dump(result_json, f) @luigi.requires(DataCreationTask) class FullTimingTask(luigi.Task): + """Run tsfresh with all calculators for comparison""" n_jobs = luigi.IntParameter() def output(self): yield self.add_to_output("result.json") def run(self): - input_file = self.get_input_file_names("data.csv")[0] + input_file = self._get_input_targets("data.csv")[0] - df = pd.read_csv(input_file) + with input_file.open("r") as f: + df = pd.read_csv(f) start_time = time() extract_features(df, column_id="id", column_sort="time", n_jobs=self.n_jobs, @@ -92,11 +104,12 @@ def run(self): "time_series_length": int((df["id"] == 0).sum()), } - with open(self.get_output_file_name("result.json"), "w") as f: + with self._get_output_target("result.json").open("w") as f: json.dump(result_json, f) class CombinerTask(luigi.Task): + """Collect all tasks into a single result.csv file""" def complete(self): return False @@ -135,12 +148,14 @@ def output(self): def run(self): results = [] - for input_file in self.get_input_file_names("result.json"): - with open(input_file, "r") as f: + for input_file in 
self._get_input_targets("result.json"):
+            with input_file.open("r") as f:
                 results.append(json.load(f))
 
         df = pd.DataFrame(results)
-        df.to_csv(self.get_output_file_name("results.csv"))
+
+        with self._get_output_target("results.csv").open("w") as f:
+            df.to_csv(f)
 
 
 if __name__ == "__main__":

From b209b931db6b7d177355e56ebd28238ab84d64a1 Mon Sep 17 00:00:00 2001
From: Nils Braun
Date: Fri, 1 May 2020 16:18:00 +0200
Subject: [PATCH 17/23] pep8ify
---
 tsfresh/feature_extraction/feature_calculators.py | 2 +-
 tsfresh/scripts/measure_execution_time.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tsfresh/feature_extraction/feature_calculators.py b/tsfresh/feature_extraction/feature_calculators.py
index f05f78820..24a9aded6 100644
--- a/tsfresh/feature_extraction/feature_calculators.py
+++ b/tsfresh/feature_extraction/feature_calculators.py
@@ -1539,7 +1539,7 @@ def sample_entropy(x):
     """
     x = np.array(x)
 
-    m = 2 # common value for m, according to wikipedia...
+    m = 2  # common value for m, according to wikipedia...
     tolerance = 0.2 * np.std(x)  # 0.2 is a common value for r, according to wikipedia...
 
     N = len(x)
diff --git a/tsfresh/scripts/measure_execution_time.py b/tsfresh/scripts/measure_execution_time.py
index 137f9113c..3d35ebb01 100644
--- a/tsfresh/scripts/measure_execution_time.py
+++ b/tsfresh/scripts/measure_execution_time.py
@@ -160,4 +160,4 @@ def run(self):
 
 if __name__ == "__main__":
     luigi.set_setting("result_path", "results")
-    luigi.process(CombinerTask())
\ No newline at end of file
+    luigi.process(CombinerTask())

From 85ad39b3e139153cc6ed9bc1c8d4011f26b765f1 Mon Sep 17 00:00:00 2001
From: Nils Braun
Date: Sat, 2 May 2020 11:45:55 +0200
Subject: [PATCH 18/23] Does not make sense to include script in coverage
---
 .coveragerc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.coveragerc b/.coveragerc
index 3f0d406bb..69c9f5fb1 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -14,6 +14,7 @@ omit = tsfresh/utilities/profiling.py
     tsfresh/examples/driftbif_simulation.py
     tsfresh/examples/test_tsfresh_baseline_dataset.py
     tsfresh/scripts/test_timing.py
+    tsfresh/scripts/measure_execution_time.py
 
 [report]
 # Regexes for lines to exclude from consideration

From 51e0ccd463d29afc3ab8d3ec472dd3e3573ff1e0 Mon Sep 17 00:00:00 2001
From: Nils Braun
Date: Sun, 3 May 2020 11:56:49 +0200
Subject: [PATCH 19/23] The linear_trend_timewise is not a high-computation-cost
 calculator anymore
---
 tsfresh/feature_extraction/feature_calculators.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tsfresh/feature_extraction/feature_calculators.py b/tsfresh/feature_extraction/feature_calculators.py
index 24a9aded6..c36c32049 100644
--- a/tsfresh/feature_extraction/feature_calculators.py
+++ b/tsfresh/feature_extraction/feature_calculators.py
@@ -1946,7 +1946,6 @@ def energy_ratio_by_chunks(x, param):
 @set_property("fctype", "combiner")
 @set_property("input", "pd.Series")
 @set_property("index_type", pd.DatetimeIndex)
-@set_property("high_comp_cost", True)
 def linear_trend_timewise(x, param):
     """
     Calculate a linear least-squares regression for the values of the time series versus the sequence from 0 to

From b1499da8f7f5f0036cc68631dde19f10cfe0cc15 Mon Sep 17 00:00:00 2001
From: Nils Braun
Date: Sun, 3 May 2020 11:58:10 +0200
Subject: [PATCH 20/23] Remove unneeded import
---
 tsfresh/scripts/measure_execution_time.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tsfresh/scripts/measure_execution_time.py b/tsfresh/scripts/measure_execution_time.py
index 3d35ebb01..5fcb8e35d 100644
--- a/tsfresh/scripts/measure_execution_time.py
+++ b/tsfresh/scripts/measure_execution_time.py
@@ -10,7 +10,6 @@
 import pandas as pd
 import numpy as np
 from time import time
-from luigi.contrib import gcs
 import b2luigi as luigi
 import json
 

From ce1578497f2bc3aa7cb101d9846836f80a97d1b6 Mon Sep 17 00:00:00 2001
From: Nils Braun
Date: Sat, 9 May 2020 22:16:35 +0200
Subject: [PATCH 21/23] Make the random-seed setting more explicit
---
 tsfresh/scripts/measure_execution_time.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/tsfresh/scripts/measure_execution_time.py b/tsfresh/scripts/measure_execution_time.py
index 5fcb8e35d..72948a660 100644
--- a/tsfresh/scripts/measure_execution_time.py
+++ b/tsfresh/scripts/measure_execution_time.py
@@ -13,18 +13,19 @@
 import b2luigi as luigi
 import json
 
-np.random.seed(42)
-
 
 class DataCreationTask(luigi.Task):
     """Create random data for testing"""
     num_ids = luigi.IntParameter(default=100)
     time_series_length = luigi.IntParameter()
+    random_seed = luigi.IntParameter()
 
     def output(self):
         yield self.add_to_output("data.csv")
 
     def run(self):
+        np.random.seed(self.random_seed)
+
         df = pd.concat([
             pd.DataFrame({
                 "id": [i] * self.time_series_length,
@@ -118,10 +119,12 @@ def requires(self):
         for time_series_length in [100, 500, 1000, 5000]:
             yield FullTimingTask(time_series_length=time_series_length,
                                  n_jobs=job,
-                                 num_ids=10)
+                                 num_ids=10,
+                                 random_seed=42)
             yield FullTimingTask(time_series_length=time_series_length,
                                  n_jobs=job,
-                                 num_ids=100)
+                                 num_ids=100,
+                                 random_seed=42)
 
             for feature_name in settings:
                 yield TimingTask(
@@ -130,6 +133,7 @@ def requires(self):
                     n_jobs=job,
                     num_ids=100,
                     try_number=0,
+                    random_seed=42
                 )
 
                 for try_number in range(3):
@@ -138,7 +142,8 @@ def requires(self):
                         n_jobs=job,
                         try_number=try_number,
                         num_ids=10,
-                        time_series_length=time_series_length
+                        time_series_length=time_series_length,
+                        random_seed=42
                     )
 
     def output(self):

From 888fb9884551d70a67536b4f09033409a339afda Mon Sep 17 00:00:00 2001
From: Nils Braun
Date: Sat, 9 May 2020 23:14:58 +0200
Subject: [PATCH 22/23] Added more tests, more documentation and fixed another
 bug in the implementation
---
 .../test_feature_calculations.py              | 21 +++++++++++++-
 .../feature_extraction/feature_calculators.py | 28 +++++++++++++++----
 2 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/tests/units/feature_extraction/test_feature_calculations.py b/tests/units/feature_extraction/test_feature_calculations.py
index 3803901aa..413eca0b7 100644
--- a/tests/units/feature_extraction/test_feature_calculations.py
+++ b/tests/units/feature_extraction/test_feature_calculations.py
@@ -791,10 +791,29 @@ def test_binned_entropy(self):
         self.assertAlmostEqualOnAllArrayTypes(binned_entropy, list(range(100)), - np.math.log(1 / 2), 2)
 
     def test_sample_entropy(self):
+        # "random" list -> large entropy
         ts = [1, 4, 5, 1, 7, 3, 1, 2, 5, 8, 9, 7, 3, 7, 9, 5, 4, 3, 9, 1, 2, 3, 4, 2, 9, 6, 7, 4, 9, 2, 9, 9, 6, 5, 1,
               3, 8, 1, 5, 3, 8, 4, 1, 2, 2, 1, 6, 5, 3, 6, 5, 4, 8, 9, 6, 7, 5, 3, 2, 5, 4, 2, 5, 1, 6, 5, 3, 5, 6, 7,
               8, 5, 2, 8, 6, 3, 8, 2, 7, 1, 7, 3, 5, 6, 2, 1, 3, 7, 3, 5, 3, 7, 6, 7, 7, 2, 3, 1, 7, 8]
-        self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 2.374905754573672)
+        self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 2.38262780)
+        # This is not very complex, so it gives a small value
+        ts = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+        self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 0.25131442)
+        # however, adding a 2 increases complexity
+        ts = [1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
+        self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 0.74193734)
+        # and it does not matter where
+        ts = [1, 1, 1, 2, 1, 1, 1, 1, 1, 1]
+        self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 0.74193734)
+        # negative numbers also work
+        ts = [1, -1, 1, -1, 1, -1]
+        self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 0.69314718)
+        # nan gives nan
+        ts = [1, -1, 1, np.nan, 1, -1]
+        self.assertIsNanOnAllArrayTypes(sample_entropy, ts)
+        # this is not a very "random" list, so it should give a small entropy
+        ts = list(range(1000))
+        self.assertAlmostEqualOnAllArrayTypes(sample_entropy, ts, 0.0010314596066622707)
 
     def test_autocorrelation(self):
         self.assertAlmostEqualOnAllArrayTypes(autocorrelation, [1, 2, 1, 2, 1, 2], -1, 1)
diff --git a/tsfresh/feature_extraction/feature_calculators.py b/tsfresh/feature_extraction/feature_calculators.py
index c36c32049..25b5b1d3d 100644
--- a/tsfresh/feature_extraction/feature_calculators.py
+++ b/tsfresh/feature_extraction/feature_calculators.py
@@ -1539,23 +1539,39 @@ def sample_entropy(x):
     """
     x = np.array(x)
 
+    # if one of the values is NaN, we cannot compute anything meaningful
+    if np.isnan(x).any():
+        return np.nan
+
     m = 2  # common value for m, according to wikipedia...
     tolerance = 0.2 * np.std(x)  # 0.2 is a common value for r, according to wikipedia...
 
     N = len(x)
 
     # Split time series and save all templates of length m
-    xmi = np.array([x[i:i + m] for i in range(N - m)])
-    xmj = np.array([x[i:i + m] for i in range(N - m + 1)])
+    # Basically we turn [1, 2, 3, 4] into [1, 2], [2, 3], [3, 4]
+    xm = np.array([x[i:i + m] for i in range(N - m + 1)])
 
-    # Save all matches minus the self-match, compute B
-    B = np.sum([np.sum(np.abs(xmii - xmj).max(axis=1) <= tolerance) - 1 for xmii in xmi])
+    # Now calculate the maximum distance between each of those pairs
+    #   np.abs(xmi - xm).max(axis=1)
+    # and check how many are below the tolerance.
+    # For speed reasons, we are not doing this in a nested for loop,
+    # but with numpy magic.
+    # Example:
+    # if x = [1, 2, 3]
+    # then xm = [[1, 2], [2, 3]]
+    # so we will subtract xm from [1, 2] => [[0, 0], [-1, -1]]
+    # and from [2, 3] => [[1, 1], [0, 0]]
+    # taking the abs and max gives us:
+    # [0, 1] and [1, 0]
+    # as the diagonal elements are always 0, we subtract 1.
+    B = np.sum([np.sum(np.abs(xmi - xm).max(axis=1) <= tolerance) - 1 for xmi in xm])
 
     # Similar for computing A
     m += 1
-    xm = np.array([x[i:i + m] for i in range(N - m + 1)])
+    xmp1 = np.array([x[i:i + m] for i in range(N - m + 1)])
 
-    A = np.sum([np.sum(np.abs(xmi - xm).max(axis=1) <= tolerance) - 1 for xmi in xm])
+    A = np.sum([np.sum(np.abs(xmi - xmp1).max(axis=1) <= tolerance) - 1 for xmi in xmp1])
 
     # Return SampEn
     return -np.log(A / B)

From b3fa7e870f966babb9dc1ea6442e875fde2e7ae5 Mon Sep 17 00:00:00 2001
From: Nils Braun
Date: Mon, 11 May 2020 22:57:56 +0200
Subject: [PATCH 23/23] Added changelog
---
 CHANGES.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGES.rst b/CHANGES.rst
index d6c86185c..daecdda58 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -23,9 +23,13 @@ Unreleased
     - Optimize RelevantFeatureAugmenter to avoid re-extraction (#669)
     - Added a function `add_sub_time_series_index` (#666)
     - Added Dockerfile
+    - Speed optimizations and speed testing script (#681)
 - Bugfixes
     - Increase the extracted `ar` coefficients to the full parameter range. (#662)
     - Documentation fixes (#663, #664, #665)
+    - Rewrote the `sample_entropy` feature calculator (#681)
+      It is now faster and (hopefully) more correct.
+ But your results will change! Version 0.15.1
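As a usage note for the changelog entry above, the behavioural change of `sample_entropy` can be checked directly; this is a sketch, with the expected numbers taken from the updated unit tests in this patch series:

    import numpy as np
    from tsfresh.feature_extraction.feature_calculators import sample_entropy

    # low-complexity series give small values ...
    print(sample_entropy(np.ones(10)))                               # ~0.2513
    # ... and a single outlier already increases the entropy
    print(sample_entropy(np.array([1, 1, 2, 1, 1, 1, 1, 1, 1, 1])))  # ~0.7419

    # for the 100-value "random" test series, the previous implementation
    # returned 2.21187685; the rewritten one returns 2.38262780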