Skip to content

Commit

Permalink
🧹
Browse files Browse the repository at this point in the history
  • Loading branch information
jvdd committed Oct 12, 2023
1 parent 80b28e6 commit b35c77b
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 30 deletions.
22 changes: 4 additions & 18 deletions tests/benchmarks/test_featurecollection.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,29 +52,15 @@ def test_single_series_feature_collection_multiple_descriptors(
benchmark(fc.calculate, dummy_data, n_jobs=n_cores)


@pytest.mark.benchmark(group="group_by_consecutive collection")
@pytest.mark.benchmark(group="group_by collection")
@pytest.mark.parametrize("n_cores", NB_CORES)
@pytest.mark.parametrize("func", FUNCS)
@pytest.mark.parametrize("group_by", ["group_by_all", "group_by_consecutive"])
def test_single_series_feature_collection_group_by_consecutive(
benchmark, n_cores, func, dummy_group_data # noqa: F811
benchmark, n_cores, func, group_by, dummy_group_data # noqa: F811
):
fd = FeatureDescriptor(function=func, series_name="number_sold")

fc = FeatureCollection(feature_descriptors=fd)

benchmark(
fc.calculate, dummy_group_data, group_by_consecutive="store", n_jobs=n_cores
)


@pytest.mark.benchmark(group="group_by_all collection")
@pytest.mark.parametrize("n_cores", NB_CORES)
@pytest.mark.parametrize("func", FUNCS)
def test_single_series_feature_collection_group_by_all(
benchmark, n_cores, func, dummy_group_data # noqa: F811
):
fd = FeatureDescriptor(function=func, series_name="number_sold")

fc = FeatureCollection(feature_descriptors=fd)

benchmark(fc.calculate, dummy_group_data, group_by_all="store", n_jobs=n_cores)
benchmark(fc.calculate, dummy_group_data, n_jobs=n_cores, **{group_by: "store"})
34 changes: 22 additions & 12 deletions tsflex/features/feature_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,9 +294,9 @@ def _executor_grouped(idx: int) -> pd.DataFrame:
executor function (since we calculate the segment indices for the consecutive
groups).
"""
# Uses the global get_group_func, group_indices, and group_idx_name
# Uses the global get_group_func, group_indices, and group_id_name
data, function = get_group_func(idx)
index = group_indices.keys()
group_ids = group_indices.keys() # group_ids are the keys of the group_indices
cols = data.columns.values

t_start = time.perf_counter()
Expand All @@ -321,17 +321,17 @@ def f(x: pd.DataFrame):
return function(*[x[c] for c in cols])

# Function execution over the grouped data (accessed by using the group_indices)
out = np.array(list(map(f, [data.iloc[idx] for idx in index])))
out = np.array(list(map(f, [data.iloc[idx] for idx in group_indices.values()])))

# Aggregate function output in a dictionary
output_names = ["|".join(cols) + "__" + o for o in function.output_names]
feat_out = _process_func_output(out, index, output_names, str(function))
feat_out = _process_func_output(out, group_ids, output_names, str(function))
# Log the function execution time
_log_func_execution(
t_start, function, tuple(cols), "groupby_all", "groupby_all", output_names
)

return pd.DataFrame(feat_out, index=index).rename_axis(index=group_idx_name)
return pd.DataFrame(feat_out, index=group_ids).rename_axis(index=group_id_name)

# def _get_stroll(self, kwargs):
# return StridedRollingFactory.get_segmenter(**kwargs)
Expand Down Expand Up @@ -510,9 +510,9 @@ def _calculate_group_by_all(
where `func` is the FeatureDescriptor function and `x` is the name
on which the FeatureDescriptor operates.
"""
global group_indices, group_idx_name, get_group_func
group_indices = grouped_data.indices
group_idx_name = grouped_data.grouper.names
global group_indices, group_id_name, get_group_func
group_indices = grouped_data.indices # dict - group_id as key; indices as value
group_id_name = grouped_data.grouper.names # name of the group col(s)
get_group_func = self._group_feat_generator(grouped_data)

return self._calculate_feature_list(
Expand Down Expand Up @@ -739,7 +739,12 @@ def _calculate_feature_list(

def calculate(
self,
data: Union[pd.Series, pd.DataFrame, List[Union[pd.Series, pd.DataFrame]]],
data: Union[
pd.Series,
pd.DataFrame,
List[Union[pd.Series, pd.DataFrame]],
        pd.core.groupby.DataFrameGroupBy,
],
stride: Optional[Union[float, str, pd.Timedelta, List, None]] = None,
segment_start_idxs: Optional[
Union[list, np.ndarray, pd.Series, pd.Index]
Expand All @@ -760,7 +765,7 @@ def calculate(
Parameters
----------
data : Union[pd.Series, pd.DataFrame, List[Union[pd.Series, pd.DataFrame]]]
        data : Union[pd.Series, pd.DataFrame, List[Union[pd.Series, pd.DataFrame]], pd.core.groupby.DataFrameGroupBy]
Dataframe or Series or list thereof, with all the required data for the
feature calculation. \n
**Assumptions**: \n
Expand All @@ -769,6 +774,8 @@ def calculate(
numeric or a ``pd.DatetimeIndex``.
* each Series / DataFrame index must be comparable with all others
* we assume that each series-name / dataframe-column-name is unique.
Can also be a `DataFrameGroupBy` object, in which case the expected
behaviour is similar to grouping by all values in `group_by_all`.
stride: Union[float, str, pd.Timedelta, List[Union[float, str, pd.Timedelta], None], optional
The stride size. By default None. This argument supports multiple types: \n
* If None, the stride of the `FeatureDescriptor` objects will be used.
Expand Down Expand Up @@ -864,6 +871,9 @@ def calculate(
will be calculated. The output that is returned contains this `group_by`
column as index to allow identifying the groups, and also contains all
corresponding fields of used `FeatureDescriptor`s.
Rows with NaN values for this column will not be considered. This means that
no NaN values will be present for calculation of any of the
`FeatureDescriptor`s or for dividing in groups.
.. note::
                This is similar to passing a `DataFrameGroupBy` object as `data`
argument to the `calculate` method.
Expand All @@ -877,8 +887,8 @@ def calculate(
identifying the groups, and also contains fields [`__start`, "__end"] which
contain start and end time range for each result row. Also contains all
corresponding fields of used `FeatureDescriptor`s.
Rows with NaN values will be dropped from input data before grouping. This
means that no NaN values will be present for calculation of any of the
Rows with NaN values for this column will not be considered. This means that
no NaN values will be present for calculation of any of the
`FeatureDescriptor`s or for dividing in groups.
Grouping column values will be grouped on exact matches. Groups can appear
            multiple times if they appear in different time-gaps.
Expand Down

0 comments on commit b35c77b

Please sign in to comment.