Add add_sub_time_series_index #666
Merged: 8 commits, Apr 25, 2020
1 change: 1 addition & 0 deletions CHANGES.rst
@@ -16,6 +16,7 @@ Unreleased
- Added variation coefficient (#654)
- Added the datetimeindex explanation from the notebook to the docs (#661)
- Optimize RelevantFeatureAugmenter to avoid re-extraction (#669)
- Added a function `add_sub_time_series_index` (#666)
- Bugfixes
- Increase the extracted `ar` coefficients to the full parameter range. (#662)
- Documentation fixes (#663, #664, #665)
76 changes: 71 additions & 5 deletions tests/units/utilities/test_dataframe_functions.py
@@ -9,8 +9,6 @@
import numpy as np
from pandas.testing import assert_frame_equal, assert_series_equal

-from tsfresh.utilities.dataframe_functions import get_ids


class NormalizeTestCase(TestCase):
    def test_with_dictionaries_one_row(self):
@@ -775,13 +773,81 @@ class GetIDsTestCase(TestCase):

    def test_get_id__correct_DataFrame(self):
        df = pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]})
-        self.assertEqual(get_ids(df, "id"), {1, 2})
+        self.assertEqual(dataframe_functions.get_ids(df, "id"), {1, 2})

    def test_get_id__correct_dict(self):
        df_dict = {"a": pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}),
                   "b": pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": [4, 4, 3, 3, 2, 2]})}
-        self.assertEqual(get_ids(df_dict, "id"), {1, 2, 3, 4})
+        self.assertEqual(dataframe_functions.get_ids(df_dict, "id"), {1, 2, 3, 4})

    def test_get_id_wrong(self):
        other_type = np.array([1, 2, 3])
-        self.assertRaises(TypeError, get_ids, other_type, "id")
+        self.assertRaises(TypeError, dataframe_functions.get_ids, other_type, "id")


class AddSubIdTestCase(TestCase):
    def test_no_parameters(self):
        dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9]})
        extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2)

        self.assertEqual(list(extended_dataframe["id"]), [0, 0, 1, 1, 2, 2, 3, 3, 4])
        assert_series_equal(dataframe["value"], extended_dataframe["value"])

    def test_id_parameters(self):
        dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                                  "id": [1, 1, 1, 1, 2, 2, 2, 2, 2]})

        extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2, column_id="id")

        self.assertEqual(list(extended_dataframe["id"]),
                         ["0,1", "0,1", "1,1", "1,1", "0,2", "0,2", "1,2", "1,2", "2,2"])
        assert_series_equal(dataframe["value"], extended_dataframe["value"])

    def test_kind_parameters(self):
        dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                                  "id": [1, 1, 1, 1, 2, 2, 2, 2, 2],
                                  "kind": [0, 1, 0, 1, 0, 1, 0, 1, 0]})

        extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2,
                                                                           column_id="id",
                                                                           column_kind="kind")

        self.assertEqual(list(extended_dataframe["id"]),
                         ["0,1", "0,1", "0,1", "0,1", "0,2", "0,2", "0,2", "0,2", "1,2"])
        assert_series_equal(dataframe["value"], extended_dataframe["value"])
        assert_series_equal(dataframe["kind"], extended_dataframe["kind"])

    def test_sort_parameters(self):
        dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                                  "id": [1, 1, 1, 1, 2, 2, 2, 2, 2],
                                  "kind": [0, 1, 0, 1, 0, 1, 0, 1, 0],
                                  "sort": [9, 8, 7, 6, 5, 4, 3, 2, 1]})

        extended_dataframe = dataframe_functions.add_sub_time_series_index(dataframe, 2,
                                                                           column_id="id",
                                                                           column_kind="kind",
                                                                           column_sort="sort")

        self.assertEqual(list(extended_dataframe["id"]),
                         ["0,2", "0,2", "0,2", "0,2", "1,2", "0,1", "0,1", "0,1", "0,1"])
        self.assertEqual(list(extended_dataframe["value"]),
                         [9, 8, 7, 6, 5, 4, 3, 2, 1])
        self.assertEqual(list(extended_dataframe["kind"]),
                         [0, 1, 0, 1, 0, 1, 0, 1, 0])
        self.assertEqual(list(extended_dataframe["sort"]),
                         [1, 2, 3, 4, 5, 6, 7, 8, 9])

    def test_dict_input(self):
        dataframe = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                                  "id": [1, 1, 1, 1, 2, 2, 2, 2, 2]})

        extended_dataframe = dataframe_functions.add_sub_time_series_index({"1": dataframe}, 2,
                                                                           column_id="id")

        self.assertIn("1", extended_dataframe)

        extended_dataframe = extended_dataframe["1"]

        self.assertEqual(list(extended_dataframe["id"]),
                         ["0,1", "0,1", "1,1", "1,1", "0,2", "0,2", "1,2", "1,2", "2,2"])
        assert_series_equal(dataframe["value"], extended_dataframe["value"])
82 changes: 82 additions & 0 deletions tsfresh/utilities/dataframe_functions.py
@@ -553,3 +553,85 @@ def mask_first(x):
    df_shift = df_shift[mask]

    return df_shift, df["value"][1:]


def add_sub_time_series_index(df_or_dict, sub_length, column_id=None, column_sort=None, column_kind=None):
"""
Add a column "id" which contains:
1. if column_id is None: for each kind (or if column_kind is None for the full dataframe) a new index built by
"sub-packaging" the data in packages of length "sub_length". For example if you have data with the
length of 11 and sub_length is 2, you will get 6 new packages: 0, 0; 1, 1; 2, 2; 3, 3; 4, 4; 5.
2. if column_id is not None: the same as before, just for each id seperately. The old column_id values are added
to the new "id" column after a comma

You can use this functions to turn a long measurement into sub-packages, where you want to extract features on.

:param df_or_dict: a pandas DataFrame or a dictionary. The required shape/form of the object depends on the rest of
the passed arguments.
:type df_or_dict: pandas.DataFrame or dict
:param column_id: it must be present in the pandas DataFrame or in all DataFrames in the dictionary.
It is not allowed to have NaN values in this column.
:type column_id: basestring or None
:param column_sort: if not None, sort the rows by this column. It is not allowed to
have NaN values in this column.
:type column_sort: basestring or None
:param column_kind: It can only be used when passing a pandas DataFrame (the dictionary is already assumed to be
grouped by the kind). Is must be present in the DataFrame and no NaN values are allowed.
If the kind column is not passed, it is assumed that each column in the pandas DataFrame (except the id or
sort column) is a possible kind.
:type column_kind: basestring or None

:return: The data frame or dictionary of data frames with a column "id" added
:rtype: the one from df_or_dict
"""

    if isinstance(df_or_dict, dict):
        if column_kind is not None:
            raise ValueError("You passed in a dictionary and gave a column name for the kind. Both are not possible.")

        return {key: add_sub_time_series_index(df_or_dict=df_or_dict[key],
                                               sub_length=sub_length,
                                               column_id=column_id,
                                               column_sort=column_sort,
                                               column_kind=column_kind)
                for key in df_or_dict}

    df = df_or_dict

    grouper = []

    if column_id is not None:
        grouper.append(column_id)
    if column_kind is not None:
        grouper.append(column_kind)

    def _add_id_column(df_chunk):
        chunk_length = len(df_chunk)
        last_chunk_number = chunk_length // sub_length
        remainder = chunk_length % sub_length

        indices = np.concatenate([np.repeat(np.arange(last_chunk_number), sub_length),
                                  np.repeat(last_chunk_number, remainder)])
        assert len(indices) == chunk_length

        if column_id:
            indices = [str(id) + "," + str(old_id) for id, old_id in zip(indices, df_chunk[column_id])]

        if column_sort:
            df_chunk = df_chunk.sort_values(column_sort)

        df_chunk["id"] = indices

        return df_chunk

    if grouper:
        df = df.groupby(grouper).apply(_add_id_column)
    else:
        df = _add_id_column(df)

    if column_sort:
        df = df.sort_values(column_sort)

    df = df.set_index(df.index.get_level_values(-1))

    return df
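

For orientation, here is a minimal usage sketch of the new helper outside the test suite. The input values and column names ("value", "id") are made up for illustration; the expected "id" outputs simply mirror the behaviour exercised by test_no_parameters and test_id_parameters above.

import pandas as pd

from tsfresh.utilities.dataframe_functions import add_sub_time_series_index

# A single long measurement of length 9, cut into sub-series of length 2.
# The last sub-series keeps the single leftover row.
df = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8, 9]})
print(list(add_sub_time_series_index(df, 2)["id"]))
# -> [0, 0, 1, 1, 2, 2, 3, 3, 4]

# With an existing id column, each id is cut separately and the old id
# is appended to the new sub-index after a comma.
df = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6],
                   "id": [1, 1, 1, 2, 2, 2]})
print(list(add_sub_time_series_index(df, 2, column_id="id")["id"]))
# -> ['0,1', '0,1', '1,1', '0,2', '0,2', '1,2']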