Add add_sub_time_series_index (#666)
* Add add_sub_time_series_index

This fixes #633.

* Pep8ify

* Add a test for the sort parameter

* Add a test for dict input

* Be python 3.5 compatible...

* Return a cleaned dataframe

* Changelog

Co-authored-by: Nils Braun <nilslennartbraun@gmail.com>
nils-braun authored Apr 25, 2020
1 parent c819de8 commit 5f1dadb
Showing 3 changed files with 154 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGES.rst
@@ -16,6 +16,7 @@ Unreleased
- Added variation coefficient (#654)
- Added the datetimeindex explanation from the notebook to the docs (#661)
- Optimize RelevantFeatureAugmenter to avoid re-extraction (#669)
- Added a function `add_sub_time_series_index` (#666)
- Bugfixes
- Increase the extracted `ar` coefficients to the full parameter range. (#662)
- Documentation fixes (#663, #664, #665)
76 changes: 71 additions & 5 deletions tests/units/utilities/test_dataframe_functions.py
@@ -9,8 +9,6 @@
import numpy as np
from pandas.testing import assert_frame_equal, assert_series_equal

from tsfresh.utilities.dataframe_functions import get_ids


class NormalizeTestCase(TestCase):
    def test_with_dictionaries_one_row(self):
@@ -775,13 +773,81 @@ class GetIDsTestCase(TestCase):

    def test_get_id__correct_DataFrame(self):
        df = pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]})
        self.assertEqual(get_ids(df, "id"), {1, 2})
        self.assertEqual(dataframe_functions.get_ids(df, "id"), {1, 2})

    def test_get_id__correct_dict(self):
        df_dict = {"a": pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}),
                   "b": pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})}
        self.assertEqual(get_ids(df_dict, "id"), {1, 2, 3, 4})
        self.assertEqual(dataframe_functions.get_ids(df_dict, "id"), {1, 2, 3, 4})

    def test_get_id_wrong(self):
        other_type = np.array([1, 2, 3])
        self.assertRaises(TypeError, get_ids, other_type, "id")
        self.assertRaises(TypeError, dataframe_functions.get_ids, other_type, "id")


class AddSubIdTestCase(TestCase):
    def test_no_parameters(self):
        dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9]})
        extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2)

        self.assertEqual(list(extended_dataframe["id"]), [0, 0, 1, 1, 2, 2, 3, 3, 4])
        assert_series_equal(dataframe["value"], extended_dataframe["value"])

    def test_id_parameters(self):
        dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                                  "id": [1, 1, 1, 1, 2, 2, 2, 2, 2]})

        extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2, column_id="id")

        self.assertEqual(list(extended_dataframe["id"]),
                         ["0,1", "0,1", "1,1", "1,1", "0,2", "0,2", "1,2", "1,2", "2,2"])
        assert_series_equal(dataframe["value"], extended_dataframe["value"])

    def test_kind_parameters(self):
        dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                                  "id": [1, 1, 1, 1, 2, 2, 2, 2, 2],
                                  "kind": [0, 1, 0, 1, 0, 1, 0, 1, 0]})

        extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2,
                                                                           column_id="id",
                                                                           column_kind="kind")

        self.assertEqual(list(extended_dataframe["id"]),
                         ["0,1", "0,1", "0,1", "0,1", "0,2", "0,2", "0,2", "0,2", "1,2"])
        assert_series_equal(dataframe["value"], extended_dataframe["value"])
        assert_series_equal(dataframe["kind"], extended_dataframe["kind"])

    def test_sort_parameters(self):
        dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                                  "id": [1, 1, 1, 1, 2, 2, 2, 2, 2],
                                  "kind": [0, 1, 0, 1, 0, 1, 0, 1, 0],
                                  "sort": [9, 8, 7, 6, 5, 4, 3, 2, 1]})

        extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2,
                                                                           column_id="id",
                                                                           column_kind="kind",
                                                                           column_sort="sort")

        self.assertEqual(list(extended_dataframe["id"]),
                         ["0,2", "0,2", "0,2", "0,2", "1,2", "0,1", "0,1", "0,1", "0,1"])
        self.assertEqual(list(extended_dataframe["value"]),
                         [9, 8, 7, 6, 5, 4, 3, 2, 1])
        self.assertEqual(list(extended_dataframe["kind"]),
                         [0, 1, 0, 1, 0, 1, 0, 1, 0])
        self.assertEqual(list(extended_dataframe["sort"]),
                         [1, 2, 3, 4, 5, 6, 7, 8, 9])

    def test_dict_input(self):
        dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                                  "id": [1, 1, 1, 1, 2, 2, 2, 2, 2]})

        extended_dataframe = dataframe_functions.add_sub_time_series_index({"1": dataframe}, 2,
                                                                           column_id="id")

        self.assertIn("1", extended_dataframe)

        extended_dataframe = extended_dataframe["1"]

        self.assertEqual(list(extended_dataframe["id"]),
                         ["0,1", "0,1", "1,1", "1,1", "0,2", "0,2", "1,2", "1,2", "2,2"])
        assert_series_equal(dataframe["value"], extended_dataframe["value"])
82 changes: 82 additions & 0 deletions tsfresh/utilities/dataframe_functions.py
@@ -553,3 +553,85 @@ def mask_first(x):
    df_shift = df_shift[mask]

    return df_shift, df["value"][1:]


def add_sub_time_series_index(df_or_dict, sub_length, column_id=None, column_sort=None, column_kind=None):
"""
Add a column "id" which contains:
1. if column_id is None: for each kind (or if column_kind is None for the full dataframe) a new index built by
"sub-packaging" the data in packages of length "sub_length". For example if you have data with the
length of 11 and sub_length is 2, you will get 6 new packages: 0, 0; 1, 1; 2, 2; 3, 3; 4, 4; 5.
2. if column_id is not None: the same as before, just for each id seperately. The old column_id values are added
to the new "id" column after a comma
You can use this functions to turn a long measurement into sub-packages, where you want to extract features on.
:param df_or_dict: a pandas DataFrame or a dictionary. The required shape/form of the object depends on the rest of
the passed arguments.
:type df_or_dict: pandas.DataFrame or dict
:param column_id: it must be present in the pandas DataFrame or in all DataFrames in the dictionary.
It is not allowed to have NaN values in this column.
:type column_id: basestring or None
:param column_sort: if not None, sort the rows by this column. It is not allowed to
have NaN values in this column.
:type column_sort: basestring or None
:param column_kind: It can only be used when passing a pandas DataFrame (the dictionary is already assumed to be
grouped by the kind). Is must be present in the DataFrame and no NaN values are allowed.
If the kind column is not passed, it is assumed that each column in the pandas DataFrame (except the id or
sort column) is a possible kind.
:type column_kind: basestring or None
:return: The data frame or dictionary of data frames with a column "id" added
:rtype: the one from df_or_dict
"""

    if isinstance(df_or_dict, dict):
        if column_kind is not None:
            raise ValueError("You passed in a dictionary and gave a column name for the kind. Both are not possible.")

        return {key: add_sub_time_series_index(df_or_dict=df_or_dict[key],
                                               sub_length=sub_length,
                                               column_id=column_id,
                                               column_sort=column_sort,
                                               column_kind=column_kind)
                for key in df_or_dict}

    df = df_or_dict

    grouper = []

    if column_id is not None:
        grouper.append(column_id)
    if column_kind is not None:
        grouper.append(column_kind)

    def _add_id_column(df_chunk):
        chunk_length = len(df_chunk)
        last_chunk_number = chunk_length // sub_length
        remainder = chunk_length % sub_length

        # Every full package of sub_length rows gets its own index 0, 1, ...;
        # the remaining rows (if any) all end up in one last, shorter package.
        indices = np.concatenate([np.repeat(np.arange(last_chunk_number), sub_length),
                                  np.repeat(last_chunk_number, remainder)])
        assert len(indices) == chunk_length

        if column_id:
            indices = [str(id) + "," + str(old_id) for id, old_id in zip(indices, df_chunk[column_id])]

        if column_sort:
            df_chunk = df_chunk.sort_values(column_sort)

        df_chunk["id"] = indices

        return df_chunk

    if grouper:
        df = df.groupby(grouper).apply(_add_id_column)
    else:
        df = _add_id_column(df)

    if column_sort:
        df = df.sort_values(column_sort)

    df = df.set_index(df.index.get_level_values(-1))

    return df
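
For reference, a minimal usage sketch of the new helper (not part of the commit); the asserted "id" columns are copied from test_no_parameters and test_id_parameters above:

import pandas as pd

from tsfresh.utilities.dataframe_functions import add_sub_time_series_index

# A single long measurement of nine values, cut into packages of length 2;
# the leftover ninth row forms a last, shorter package with index 4.
df = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9]})
extended = add_sub_time_series_index(df, 2)
assert list(extended["id"]) == [0, 0, 1, 1, 2, 2, 3, 3, 4]

# With an existing id column, each id is packaged separately and the old id
# is kept after a comma in the new "id" column.
df = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                   "id": [1, 1, 1, 1, 2, 2, 2, 2, 2]})
extended = add_sub_time_series_index(df, 2, column_id="id")
assert list(extended["id"]) == ["0,1", "0,1", "1,1", "1,1", "0,2", "0,2", "1,2", "1,2", "2,2"]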
