From 5f1dadb1401e7e3ca8d6dbc9497ce2a2fe2c515d Mon Sep 17 00:00:00 2001 From: Nils Braun Date: Sat, 25 Apr 2020 13:19:42 +0200 Subject: [PATCH] Add add_sub_time_series_index (#666) * Add add_sub_time_series_index This fixes #633. * Pep8ify * Add a test for the sort parameter * Add a test for dict input * Be python 3.5 compatible... * Return a cleaned dataframe * Changelog Co-authored-by: Nils Braun --- CHANGES.rst | 1 + .../utilities/test_dataframe_functions.py | 76 +++++++++++++++-- tsfresh/utilities/dataframe_functions.py | 82 +++++++++++++++++++ 3 files changed, 154 insertions(+), 5 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index e1141325f..e09199582 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -16,6 +16,7 @@ Unreleased - Added variation coefficient (#654) - Added the datetimeindex explanation from the notebook to the docs (#661) - Optimize RelevantFeatureAugmenter to avoid re-extraction (#669) + - Added a function `add_sub_time_series_index` (#666) - Bugfixes - Increase the extracted `ar` coefficients to the full parameter range. 
(#662) - Documentation fixes (#663, #664, #665) diff --git a/tests/units/utilities/test_dataframe_functions.py b/tests/units/utilities/test_dataframe_functions.py index 72646d6db..a7a5705ba 100644 --- a/tests/units/utilities/test_dataframe_functions.py +++ b/tests/units/utilities/test_dataframe_functions.py @@ -9,8 +9,6 @@ import numpy as np from pandas.testing import assert_frame_equal, assert_series_equal -from tsfresh.utilities.dataframe_functions import get_ids - class NormalizeTestCase(TestCase): def test_with_dictionaries_one_row(self): @@ -775,13 +773,81 @@ class GetIDsTestCase(TestCase): def test_get_id__correct_DataFrame(self): df = pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}) - self.assertEqual(get_ids(df, "id"), {1, 2}) + self.assertEqual(dataframe_functions.get_ids(df, "id"), {1, 2}) def test_get_id__correct_dict(self): df_dict = {"a": pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}), "b": pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})} - self.assertEqual(get_ids(df_dict, "id"), {1, 2, 3, 4}) + self.assertEqual(dataframe_functions.get_ids(df_dict, "id"), {1, 2, 3, 4}) def test_get_id_wrong(self): other_type = np.array([1, 2, 3]) - self.assertRaises(TypeError, get_ids, other_type, "id") + self.assertRaises(TypeError, dataframe_functions.get_ids, other_type, "id") + + +class AddSubIdTestCase(TestCase): + def test_no_parameters(self): + dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2) + + self.assertEqual(list(extended_dataframe["id"]), [0, 0, 1, 1, 2, 2, 3, 3, 4]) + assert_series_equal(dataframe["value"], extended_dataframe["value"]) + + def test_id_parameters(self): + dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "id": [1, 1, 1, 1, 2, 2, 2, 2, 2]}) + + extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2, column_id="id") + + 
self.assertEqual(list(extended_dataframe["id"]), + ["0,1", "0,1", "1,1", "1,1", "0,2", "0,2", "1,2", "1,2", "2,2"]) + assert_series_equal(dataframe["value"], extended_dataframe["value"]) + + def test_kind_parameters(self): + dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "id": [1, 1, 1, 1, 2, 2, 2, 2, 2], + "kind": [0, 1, 0, 1, 0, 1, 0, 1, 0]}) + + extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2, + column_id="id", + column_kind="kind") + + self.assertEqual(list(extended_dataframe["id"]), + ["0,1", "0,1", "0,1", "0,1", "0,2", "0,2", "0,2", "0,2", "1,2"]) + assert_series_equal(dataframe["value"], extended_dataframe["value"]) + assert_series_equal(dataframe["kind"], extended_dataframe["kind"]) + + def test_sort_parameters(self): + dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "id": [1, 1, 1, 1, 2, 2, 2, 2, 2], + "kind": [0, 1, 0, 1, 0, 1, 0, 1, 0], + "sort": [9, 8, 7, 6, 5, 4, 3, 2, 1]}) + + extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2, + column_id="id", + column_kind="kind", + column_sort="sort") + + self.assertEqual(list(extended_dataframe["id"]), + ["0,2", "0,2", "0,2", "0,2", "1,2", "0,1", "0,1", "0,1", "0,1"]) + self.assertEqual(list(extended_dataframe["value"]), + [9, 8, 7, 6, 5, 4, 3, 2, 1]) + self.assertEqual(list(extended_dataframe["kind"]), + [0, 1, 0, 1, 0, 1, 0, 1, 0]) + self.assertEqual(list(extended_dataframe["sort"]), + [1, 2, 3, 4, 5, 6, 7, 8, 9]) + + def test_dict_input(self): + dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "id": [1, 1, 1, 1, 2, 2, 2, 2, 2]}) + + extended_dataframe = dataframe_functions.add_sub_time_series_index({"1": dataframe}, 2, + column_id="id") + + self.assertIn("1", extended_dataframe) + + extended_dataframe = extended_dataframe["1"] + + self.assertEqual(list(extended_dataframe["id"]), + ["0,1", "0,1", "1,1", "1,1", "0,2", "0,2", "1,2", "1,2", "2,2"]) + assert_series_equal(dataframe["value"], 
extended_dataframe["value"]) diff --git a/tsfresh/utilities/dataframe_functions.py b/tsfresh/utilities/dataframe_functions.py index 015c7dc59..c039f10fc 100644 --- a/tsfresh/utilities/dataframe_functions.py +++ b/tsfresh/utilities/dataframe_functions.py @@ -553,3 +553,85 @@ def mask_first(x): df_shift = df_shift[mask] return df_shift, df["value"][1:] + + +def add_sub_time_series_index(df_or_dict, sub_length, column_id=None, column_sort=None, column_kind=None): + """ + Add a column "id" which contains: + 1. if column_id is None: for each kind (or if column_kind is None for the full dataframe) a new index built by + "sub-packaging" the data in packages of length "sub_length". For example if you have data with the + length of 11 and sub_length is 2, you will get 6 new packages: 0, 0; 1, 1; 2, 2; 3, 3; 4, 4; 5. + 2. if column_id is not None: the same as before, just for each id separately. The old column_id values are added + to the new "id" column after a comma + + You can use this function to turn a long measurement into sub-packages, on which you want to extract features. + + :param df_or_dict: a pandas DataFrame or a dictionary. The required shape/form of the object depends on the rest of + the passed arguments. + :type df_or_dict: pandas.DataFrame or dict + :param column_id: it must be present in the pandas DataFrame or in all DataFrames in the dictionary. + It is not allowed to have NaN values in this column. + :type column_id: basestring or None + :param column_sort: if not None, sort the rows by this column. It is not allowed to + have NaN values in this column. + :type column_sort: basestring or None + :param column_kind: It can only be used when passing a pandas DataFrame (the dictionary is already assumed to be + grouped by the kind). It must be present in the DataFrame and no NaN values are allowed. + If the kind column is not passed, it is assumed that each column in the pandas DataFrame (except the id or + sort column) is a possible kind. 
+ :type column_kind: basestring or None + + :return: The data frame or dictionary of data frames with a column "id" added + :rtype: the one from df_or_dict + """ + + if isinstance(df_or_dict, dict): + if column_kind is not None: + raise ValueError("You passed in a dictionary and gave a column name for the kind. Both are not possible.") + + return {key: add_sub_time_series_index(df_or_dict=df_or_dict[key], + sub_length=sub_length, + column_id=column_id, + column_sort=column_sort, + column_kind=column_kind) + for key in df_or_dict} + + df = df_or_dict + + grouper = [] + + if column_id is not None: + grouper.append(column_id) + if column_kind is not None: + grouper.append(column_kind) + + def _add_id_column(df_chunk): + chunk_length = len(df_chunk) + last_chunk_number = chunk_length // sub_length + reminder = chunk_length % sub_length + + indices = np.concatenate([np.repeat(np.arange(last_chunk_number), sub_length), + np.repeat(last_chunk_number, reminder)]) + assert(len(indices) == chunk_length) + + if column_id: + indices = [str(id) + "," + str(old_id) for id, old_id in zip(indices, df_chunk[column_id])] + + if column_sort: + df_chunk = df_chunk.sort_values(column_sort) + + df_chunk["id"] = indices + + return df_chunk + + if grouper: + df = df.groupby(grouper).apply(_add_id_column) + else: + df = _add_id_column(df) + + if column_sort: + df = df.sort_values(column_sort) + + df = df.set_index(df.index.get_level_values(-1)) + + return df