From 5f1dadb1401e7e3ca8d6dbc9497ce2a2fe2c515d Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Sat, 25 Apr 2020 13:19:42 +0200 Subject: [PATCH] Add add_sub_time_series_index (#666) * Add add_sub_time_series_index This fixes #633. * Pep8ify * Add a test for the sort parameter * Add a test for dict input * Be python 3.5 compatible... * Return a cleaned dataframe * Changelog Co-authored-by: Nils Braun --- CHANGES.rst | 1 + .../utilities/test_dataframe_functions.py | 76 +++++++++++++++-- tsfresh/utilities/dataframe_functions.py | 82 +++++++++++++++++++ 3 files changed, 154 insertions(+), 5 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index e1141325f..e09199582 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -16,6 +16,7 @@ Unreleased - Added variation coefficient (#654) - Added the datetimeindex explanation from the notebook to the docs (#661) - Optimize RelevantFeatureAugmenter to avoid re-extraction (#669) + - Added a function `add_sub_time_series_index` (#666) - Bugfixes - Increase the extracted `ar` coefficients to the full parameter range. 
(#662) - Documentation fixes (#663, #664, #665) diff --git a/tests/units/utilities/test_dataframe_functions.py b/tests/units/utilities/test_dataframe_functions.py index 72646d6db..a7a5705ba 100644 --- a/tests/units/utilities/test_dataframe_functions.py +++ b/tests/units/utilities/test_dataframe_functions.py @@ -9,8 +9,6 @@ import numpy as np from pandas.testing import assert_frame_equal, assert_series_equal -from tsfresh.utilities.dataframe_functions import get_ids - class NormalizeTestCase(TestCase): def test_with_dictionaries_one_row(self): @@ -775,13 +773,81 @@ class GetIDsTestCase(TestCase): def test_get_id__correct_DataFrame(self): df = pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}) - self.assertEqual(get_ids(df, "id"), {1, 2}) + self.assertEqual(dataframe_functions.get_ids(df, "id"), {1, 2}) def test_get_id__correct_dict(self): df_dict = {"a": pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}), "b": pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})} - self.assertEqual(get_ids(df_dict, "id"), {1, 2, 3, 4}) + self.assertEqual(dataframe_functions.get_ids(df_dict, "id"), {1, 2, 3, 4}) def test_get_id_wrong(self): other_type = np.array([1, 2, 3]) - self.assertRaises(TypeError, get_ids, other_type, "id") + self.assertRaises(TypeError, dataframe_functions.get_ids, other_type, "id") + + +class AddSubIdTestCase(TestCase): + def test_no_parameters(self): + dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2) + + self.assertEqual(list(extended_dataframe["id"]), [0, 0, 1, 1, 2, 2, 3, 3, 4]) + assert_series_equal(dataframe["value"], extended_dataframe["value"]) + + def test_id_parameters(self): + dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "id": [1, 1, 1, 1, 2, 2, 2, 2, 2]}) + + extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2, column_id="id") + + 
self.assertEqual(list(extended_dataframe["id"]), + ["0,1", "0,1", "1,1", "1,1", "0,2", "0,2", "1,2", "1,2", "2,2"]) + assert_series_equal(dataframe["value"], extended_dataframe["value"]) + + def test_kind_parameters(self): + dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "id": [1, 1, 1, 1, 2, 2, 2, 2, 2], + "kind": [0, 1, 0, 1, 0, 1, 0, 1, 0]}) + + extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2, + column_id="id", + column_kind="kind") + + self.assertEqual(list(extended_dataframe["id"]), + ["0,1", "0,1", "0,1", "0,1", "0,2", "0,2", "0,2", "0,2", "1,2"]) + assert_series_equal(dataframe["value"], extended_dataframe["value"]) + assert_series_equal(dataframe["kind"], extended_dataframe["kind"]) + + def test_sort_parameters(self): + dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "id": [1, 1, 1, 1, 2, 2, 2, 2, 2], + "kind": [0, 1, 0, 1, 0, 1, 0, 1, 0], + "sort": [9, 8, 7, 6, 5, 4, 3, 2, 1]}) + + extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2, + column_id="id", + column_kind="kind", + column_sort="sort") + + self.assertEqual(list(extended_dataframe["id"]), + ["0,2", "0,2", "0,2", "0,2", "1,2", "0,1", "0,1", "0,1", "0,1"]) + self.assertEqual(list(extended_dataframe["value"]), + [9, 8, 7, 6, 5, 4, 3, 2, 1]) + self.assertEqual(list(extended_dataframe["kind"]), + [0, 1, 0, 1, 0, 1, 0, 1, 0]) + self.assertEqual(list(extended_dataframe["sort"]), + [1, 2, 3, 4, 5, 6, 7, 8, 9]) + + def test_dict_input(self): + dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "id": [1, 1, 1, 1, 2, 2, 2, 2, 2]}) + + extended_dataframe = dataframe_functions.add_sub_time_series_index({"1": dataframe}, 2, + column_id="id") + + self.assertIn("1", extended_dataframe) + + extended_dataframe = extended_dataframe["1"] + + self.assertEqual(list(extended_dataframe["id"]), + ["0,1", "0,1", "1,1", "1,1", "0,2", "0,2", "1,2", "1,2", "2,2"]) + assert_series_equal(dataframe["value"], 
extended_dataframe["value"]) diff --git a/tsfresh/utilities/dataframe_functions.py b/tsfresh/utilities/dataframe_functions.py index 015c7dc59..c039f10fc 100644 --- a/tsfresh/utilities/dataframe_functions.py +++ b/tsfresh/utilities/dataframe_functions.py @@ -553,3 +553,85 @@ def mask_first(x): df_shift = df_shift[mask] return df_shift, df["value"][1:] + + +def add_sub_time_series_index(df_or_dict, sub_length, column_id=None, column_sort=None, column_kind=None): + """ + Add a column "id" which contains: + 1. if column_id is None: for each kind (or if column_kind is None for the full dataframe) a new index built by + "sub-packaging" the data in packages of length "sub_length". For example if you have data with the + length of 11 and sub_length is 2, you will get 6 new packages: 0, 0; 1, 1; 2, 2; 3, 3; 4, 4; 5. + 2. if column_id is not None: the same as before, just for each id separately. The old column_id values are added + to the new "id" column after a comma + + You can use this function to turn a long measurement into sub-packages, on which you want to extract features. + + :param df_or_dict: a pandas DataFrame or a dictionary. The required shape/form of the object depends on the rest of + the passed arguments. + :type df_or_dict: pandas.DataFrame or dict + :param column_id: it must be present in the pandas DataFrame or in all DataFrames in the dictionary. + It is not allowed to have NaN values in this column. + :type column_id: basestring or None + :param column_sort: if not None, sort the rows by this column. It is not allowed to + have NaN values in this column. + :type column_sort: basestring or None + :param column_kind: It can only be used when passing a pandas DataFrame (the dictionary is already assumed to be + grouped by the kind). It must be present in the DataFrame and no NaN values are allowed. + If the kind column is not passed, it is assumed that each column in the pandas DataFrame (except the id or + sort column) is a possible kind. 
+ :type column_kind: basestring or None + + :return: The data frame or dictionary of data frames with a column "id" added + :rtype: the one from df_or_dict + """ + + if isinstance(df_or_dict, dict): + if column_kind is not None: + raise ValueError("You passed in a dictionary and gave a column name for the kind. Both are not possible.") + + return {key: add_sub_time_series_index(df_or_dict=df_or_dict[key], + sub_length=sub_length, + column_id=column_id, + column_sort=column_sort, + column_kind=column_kind) + for key in df_or_dict} + + df = df_or_dict + + grouper = [] + + if column_id is not None: + grouper.append(column_id) + if column_kind is not None: + grouper.append(column_kind) + + def _add_id_column(df_chunk): + chunk_length = len(df_chunk) + last_chunk_number = chunk_length // sub_length + reminder = chunk_length % sub_length + + indices = np.concatenate([np.repeat(np.arange(last_chunk_number), sub_length), + np.repeat(last_chunk_number, reminder)]) + assert(len(indices) == chunk_length) + + if column_id: + indices = [str(id) + "," + str(old_id) for id, old_id in zip(indices, df_chunk[column_id])] + + if column_sort: + df_chunk = df_chunk.sort_values(column_sort) + + df_chunk["id"] = indices + + return df_chunk + + if grouper: + df = df.groupby(grouper).apply(_add_id_column) + else: + df = _add_id_column(df) + + if column_sort: + df = df.sort_values(column_sort) + + df = df.set_index(df.index.get_level_values(-1)) + + return df