Skip to content

Commit

Permalink
🧹
Browse files Browse the repository at this point in the history
  • Loading branch information
jvdd committed Oct 12, 2023
1 parent 80b28e6 commit b35c77b
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 30 deletions.
22 changes: 4 additions & 18 deletions tests/benchmarks/test_featurecollection.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,29 +52,15 @@ def test_single_series_feature_collection_multiple_descriptors(
benchmark(fc.calculate, dummy_data, n_jobs=n_cores)


@pytest.mark.benchmark(group="group_by_consecutive collection")
@pytest.mark.benchmark(group="group_by collection")
@pytest.mark.parametrize("n_cores", NB_CORES)
@pytest.mark.parametrize("func", FUNCS)
@pytest.mark.parametrize("group_by", ["group_by_all", "group_by_consecutive"])
def test_single_series_feature_collection_group_by_consecutive(
benchmark, n_cores, func, dummy_group_data # noqa: F811
benchmark, n_cores, func, group_by, dummy_group_data # noqa: F811
):
fd = FeatureDescriptor(function=func, series_name="number_sold")

fc = FeatureCollection(feature_descriptors=fd)

benchmark(
fc.calculate, dummy_group_data, group_by_consecutive="store", n_jobs=n_cores
)


@pytest.mark.benchmark(group="group_by_all collection")
@pytest.mark.parametrize("n_cores", NB_CORES)
@pytest.mark.parametrize("func", FUNCS)
def test_single_series_feature_collection_group_by_all(
benchmark, n_cores, func, dummy_group_data # noqa: F811
):
fd = FeatureDescriptor(function=func, series_name="number_sold")

fc = FeatureCollection(feature_descriptors=fd)

benchmark(fc.calculate, dummy_group_data, group_by_all="store", n_jobs=n_cores)
benchmark(fc.calculate, dummy_group_data, n_jobs=n_cores, **{group_by: "store"})
34 changes: 22 additions & 12 deletions tsflex/features/feature_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,9 +294,9 @@ def _executor_grouped(idx: int) -> pd.DataFrame:
executor function (since we calculate the segment indices for the consecutive
groups).
"""
# Uses the global get_group_func, group_indices, and group_idx_name
# Uses the global get_group_func, group_indices, and group_id_name
data, function = get_group_func(idx)
index = group_indices.keys()
group_ids = group_indices.keys() # group_ids are the keys of the group_indices
cols = data.columns.values

t_start = time.perf_counter()
Expand All @@ -321,17 +321,17 @@ def f(x: pd.DataFrame):
return function(*[x[c] for c in cols])

# Function execution over the grouped data (accessed by using the group_indices)
out = np.array(list(map(f, [data.iloc[idx] for idx in index])))
out = np.array(list(map(f, [data.iloc[idx] for idx in group_indices.values()])))

# Aggregate function output in a dictionary
output_names = ["|".join(cols) + "__" + o for o in function.output_names]
feat_out = _process_func_output(out, index, output_names, str(function))
feat_out = _process_func_output(out, group_ids, output_names, str(function))
# Log the function execution time
_log_func_execution(
t_start, function, tuple(cols), "groupby_all", "groupby_all", output_names
)

return pd.DataFrame(feat_out, index=index).rename_axis(index=group_idx_name)
return pd.DataFrame(feat_out, index=group_ids).rename_axis(index=group_id_name)

# def _get_stroll(self, kwargs):
# return StridedRollingFactory.get_segmenter(**kwargs)
Expand Down Expand Up @@ -510,9 +510,9 @@ def _calculate_group_by_all(
where `func` is the FeatureDescriptor function and `x` is the name
on which the FeatureDescriptor operates.
"""
global group_indices, group_idx_name, get_group_func
group_indices = grouped_data.indices
group_idx_name = grouped_data.grouper.names
global group_indices, group_id_name, get_group_func
group_indices = grouped_data.indices # dict - group_id as key; indices as value
group_id_name = grouped_data.grouper.names # name of the group col(s)
get_group_func = self._group_feat_generator(grouped_data)

return self._calculate_feature_list(
Expand Down Expand Up @@ -739,7 +739,12 @@ def _calculate_feature_list(

def calculate(
self,
data: Union[pd.Series, pd.DataFrame, List[Union[pd.Series, pd.DataFrame]]],
data: Union[
pd.Series,
pd.DataFrame,
List[Union[pd.Series, pd.DataFrame]],
        pd.core.groupby.DataFrameGroupBy,
],
stride: Optional[Union[float, str, pd.Timedelta, List, None]] = None,
segment_start_idxs: Optional[
Union[list, np.ndarray, pd.Series, pd.Index]
Expand All @@ -760,7 +765,7 @@ def calculate(
Parameters
----------
data : Union[pd.Series, pd.DataFrame, List[Union[pd.Series, pd.DataFrame]]]
        data : Union[pd.Series, pd.DataFrame, List[Union[pd.Series, pd.DataFrame]], pd.core.groupby.DataFrameGroupBy]
Dataframe or Series or list thereof, with all the required data for the
feature calculation. \n
**Assumptions**: \n
Expand All @@ -769,6 +774,8 @@ def calculate(
numeric or a ``pd.DatetimeIndex``.
* each Series / DataFrame index must be comparable with all others
* we assume that each series-name / dataframe-column-name is unique.
Can also be a `DataFrameGroupBy` object, in which case the expected
behaviour is similar to grouping by all values in `group_by_all`.
stride: Union[float, str, pd.Timedelta, List[Union[float, str, pd.Timedelta], None], optional
The stride size. By default None. This argument supports multiple types: \n
* If None, the stride of the `FeatureDescriptor` objects will be used.
Expand Down Expand Up @@ -864,6 +871,9 @@ def calculate(
will be calculated. The output that is returned contains this `group_by`
column as index to allow identifying the groups, and also contains all
corresponding fields of used `FeatureDescriptor`s.
Rows with NaN values for this column will not be considered. This means that
no NaN values will be present for calculation of any of the
`FeatureDescriptor`s or for dividing in groups.
.. note::
                This is similar to passing a `DataFrameGroupBy` object as `data`
argument to the `calculate` method.
Expand All @@ -877,8 +887,8 @@ def calculate(
identifying the groups, and also contains fields [`__start`, "__end"] which
contain start and end time range for each result row. Also contains all
corresponding fields of used `FeatureDescriptor`s.
Rows with NaN values will be dropped from input data before grouping. This
means that no NaN values will be present for calculation of any of the
Rows with NaN values for this column will not be considered. This means that
no NaN values will be present for calculation of any of the
`FeatureDescriptor`s or for dividing in groups.
Grouping column values will be grouped on exact matches. Groups can appear
            multiple times if they appear in different time-gaps.
Expand Down

0 comments on commit b35c77b

Please sign in to comment.