
New feature: Lag or windows features grouped by #727

Open
wants to merge 24 commits into base: main
Changes from 15 commits
Commits (24)
4d653a9
add group by variables to base forecast transformer
Ezzaldin97 Feb 23, 2024
4e9d849
add group by variables to lag_features
Ezzaldin97 Feb 23, 2024
7f40391
add group by window features
Ezzaldin97 Feb 25, 2024
b476748
add group by expanding window features
Ezzaldin97 Feb 25, 2024
02c59bd
add test cases of groupby timeseries features
Ezzaldin97 Feb 25, 2024
0dd92cc
ensure code style tests
Ezzaldin97 Feb 25, 2024
47de2d6
fixing typehint errors
Ezzaldin97 Feb 25, 2024
dd43c27
fixing docs indentation issue
Ezzaldin97 Feb 25, 2024
7459811
fixing docs indentation issue in lag_features
Ezzaldin97 Feb 25, 2024
12aa825
adjust formatting and code style in tests
Ezzaldin97 Feb 29, 2024
c3bee66
refactoring timeseries & reformatting the code
Ezzaldin97 Feb 29, 2024
67725dc
adjust code formatting & style in tests
Ezzaldin97 Mar 2, 2024
9cb01ea
fix create lag features using groupby & freq parameters
Ezzaldin97 Mar 2, 2024
72ce43c
adjust code style
Ezzaldin97 Mar 2, 2024
9d999b0
add test cases to ensure code coverage
Ezzaldin97 Mar 2, 2024
b7b8bc9
add group_by docstring to _docstring
Ezzaldin97 Apr 1, 2024
ba375a4
remove check input of group_by
Ezzaldin97 Apr 1, 2024
90f08f4
enhance performance of group_by window features operations
Ezzaldin97 Apr 1, 2024
66baa75
enhance performance of group_by expanding window features operations
Ezzaldin97 Apr 1, 2024
92f996d
fix reindexing to original index after grouping bug
Ezzaldin97 Apr 1, 2024
152c037
fix reindexing to original index after grouping operation bug
Ezzaldin97 Apr 1, 2024
5343e50
replacing group_by docstring with group_by_docstring
Ezzaldin97 Apr 1, 2024
ef1eaa8
adjust code-style and formatting
Ezzaldin97 Apr 1, 2024
09db782
remove white spaces
Ezzaldin97 Apr 2, 2024
4 changes: 2 additions & 2 deletions feature_engine/selection/drop_psi_features.py
@@ -1,5 +1,5 @@
import datetime
from typing import List, Union
from typing import Dict, List, Union

import numpy as np
import pandas as pd
@@ -475,7 +475,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None):
threshold_cat = self.threshold

# Compute the PSI by looping over the features
self.psi_values_ = {}
solegalli marked this conversation as resolved.
self.psi_values_: Dict = {}
Collaborator: We resolved this in a different PR. Could we remove this change from here please?

self.features_to_drop_ = []

# Compute PSI for numerical features
@@ -51,6 +51,9 @@ class BaseForecastTransformer(BaseEstimator, TransformerMixin, GetFeatureNamesOu

{drop_original}

group_by: int, str, or list of strings or integers, default=None
Collaborator: we are using pandas groupby under the hood, so the docs here should probably be identical or just a summary of what we see here: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html and then refer the user to pandas groupby's documentation for more details

Author: I have updated it using a summary of pandas groupby

variable or list of variables to group the data by before creating the features.

Attributes
----------
{feature_names_in_}
@@ -64,6 +67,7 @@ def __init__(
variables: Union[None, int, str, List[Union[str, int]]] = None,
missing_values: str = "raise",
drop_original: bool = False,
group_by: Union[None, int, str, List[Union[str, int]]] = None,
) -> None:

if missing_values not in ["raise", "ignore"]:
@@ -81,6 +85,7 @@ def __init__(
self.variables = _check_variables_input_value(variables)
self.missing_values = missing_values
self.drop_original = drop_original
self.group_by = _check_variables_input_value(group_by)
Collaborator: if we defer the functionality to pandas, then we don't need this check. We just assign and let pandas handle the rest.

Author: That's right, pandas will handle it. Thanks for your help 🙏
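For illustration, a minimal sketch on a toy frame (not feature_engine code) of the point above — pandas validates the group_by key on its own, so assigning it without an extra check is enough:

import pandas as pd

df = pd.DataFrame({"x1": [1, 2, 3], "x3": ["a", "a", "b"]})

lagged = df.groupby("x3")["x1"].shift(1)  # valid key: per-group lag, no extra check needed
try:
    df.groupby("not_a_column").mean()     # invalid key: pandas raises KeyError itself
except KeyError as err:
    print(err)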


def _check_index(self, X: pd.DataFrame):
"""
82 changes: 74 additions & 8 deletions feature_engine/timeseries/forecasting/expanding_window_features.py
@@ -3,7 +3,7 @@

from __future__ import annotations

from typing import List
from typing import List, Union

import pandas as pd

@@ -93,6 +93,9 @@ class ExpandingWindowFeatures(BaseForecastTransformer):

{drop_original}

group_by: int, str, or list of strings or integers, default=None
Collaborator: if we are repeating the same string over and over, instead of writing it multiple times, we'd create a single text in the _docstrings module, and import it instead. Like we do with fit_transform for example.

Author: added to _docstring, Thanks for the clarification.
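As a rough sketch of the shared-docstring pattern the reviewer describes — the module text, decorator name, and class below are hypothetical, not feature_engine's actual _docstrings API:

# hypothetical module-level text, e.g. in a _docstrings module
_group_by_docstring = (
    "group_by: int, str, or list of strings or integers, default=None\n"
    "    Variable or list of variables to group the data by before creating\n"
    "    the features. See pandas.DataFrame.groupby for details."
)

def substitute_group_by_doc(cls):
    # class decorator that fills the {group_by} placeholder in the class docstring
    cls.__doc__ = cls.__doc__.format(group_by=_group_by_docstring)
    return cls

@substitute_group_by_doc
class ExampleTransformer:
    """Example transformer.

    Parameters
    ----------
    {group_by}
    """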

variable or list of variables to group the data by before creating the expanding window features.

Attributes
----------
variables_:
@@ -139,6 +142,36 @@ class ExpandingWindowFeatures(BaseForecastTransformer):
2 2022-09-20 3 8 1.5 6.5
3 2022-09-21 4 9 2.0 7.0
4 2022-09-22 5 10 2.5 7.5
Create expanding window features grouped by another variable.
Collaborator: the example in the class' docstrings is just meant for the user to "copy and paste" a simple example, not a full blown demo. For that we have the user guide. Could we please keep the original example?

>>> import pandas as pd
>>> from feature_engine.timeseries.forecasting import ExpandingWindowFeatures
>>> X = pd.DataFrame(dict(date = ["2022-09-18",
>>> "2022-09-19",
>>> "2022-09-20",
>>> "2022-09-21",
>>> "2022-09-22",
>>> "2022-09-18",
>>> "2022-09-19",
>>> "2022-09-20",
>>> "2022-09-21",
>>> "2022-09-22"],
>>> x1 = [1,2,3,4,5, 3,5,6,8,11],
>>> x2 = [6,7,8,9,10, 2,9,10,15,2],
>>> x3=['a','a','a','a','a', 'b','b','b','b','b']
>>> ))
>>> ewf = ExpandingWindowFeatures(group_by='x3')
>>> ewf.fit_transform(X)
date x1 x2 x3 x1_expanding_mean x2_expanding_mean
0 2022-09-18 1 6 a NaN NaN
1 2022-09-19 2 7 a 1.000000 6.0
2 2022-09-20 3 8 a 1.500000 6.5
3 2022-09-21 4 9 a 2.000000 7.0
4 2022-09-22 5 10 a 2.500000 7.5
5 2022-09-18 3 2 b NaN NaN
6 2022-09-19 5 9 b 3.000000 2.0
7 2022-09-20 6 10 b 4.000000 5.5
8 2022-09-21 8 15 b 4.666667 7.0
9 2022-09-22 11 2 b 5.500000 9.0
"""

def __init__(
@@ -151,6 +184,7 @@
sort_index: bool = True,
missing_values: str = "raise",
drop_original: bool = False,
group_by: Union[None, int, str, List[Union[str, int]]] = None,
) -> None:

if not isinstance(functions, (str, list)) or not all(
@@ -168,7 +202,7 @@
f"periods must be a non-negative integer. Got {periods} instead."
)

super().__init__(variables, missing_values, drop_original)
super().__init__(variables, missing_values, drop_original, group_by)

self.min_periods = min_periods
self.functions = functions
@@ -193,12 +227,17 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
# Common dataframe checks and setting up.
X = self._check_transform_input_and_state(X)

tmp = (
X[self.variables_]
.expanding(min_periods=self.min_periods)
.agg(self.functions)
.shift(periods=self.periods, freq=self.freq)
)
if self.group_by:
tmp = self._agg_expanding_window_features(
grouped_df=X.groupby(self.group_by)
)
else:
tmp = (
X[self.variables_]
.expanding(min_periods=self.min_periods)
.agg(self.functions)
.shift(periods=self.periods, freq=self.freq)
)

tmp.columns = self._get_new_features_name()

@@ -224,3 +263,30 @@ def _get_new_features_name(self) -> List:
]

return feature_names

def _agg_expanding_window_features(
self,
grouped_df: pd.core.groupby.generic.DataFrameGroupBy,
) -> Union[pd.Series, pd.DataFrame]:
"""generate expanding window features based on groups
Parameters
----------
grouped_df : pd.core.groupby.generic.DataFrameGroupBy
dataframe of groups
solegalli marked this conversation as resolved.

Returns
-------
Union[pd.Series, pd.DataFrame]
returned expanding window features
"""
tmp_data = []
for _, group in grouped_df:
Collaborator: Why do we need to loop?

Are we creating a grouped df for every variable passed to group_by_variables?

And is this the desired functionality? For time series forecasting, would we not have all ts in 1 col and then we would group by one or more variables that identify the ts, but we would not create many groups?

When would we need to create many groups?

Author: let me explain what I need to do here. The reason behind adding group_by_variables to the time series transformers is issue #668: we sometimes need to create lag, rolling window, or expanding window features based on a set of groups. The code above loops over the set of groups to create the features for every group, then concatenates them and sorts by index to return the dataframe to its original order. Let me explain it with the following code:

X = pd.DataFrame(dict(date = ["2022-09-18",
                             "2022-09-19",
                             "2022-09-20",
                             "2022-09-21",
                             "2022-09-22",
                             "2022-09-18",
                             "2022-09-19",
                             "2022-09-20",
                             "2022-09-21",
                             "2022-09-22"],
                     x1 = [1,2,3,4,5, 3,5,6,8,11],
                     x2 = [6,7,8,9,10, 2,9,10,15,2],
                     x3=['a','a','a','a','a', 'b','b','b','b','b'],
                     x4=['c','c','c','w','w','c','c','w','w','w']
))

X_grouped = X.groupby(['x3', 'x4'])
for _, group in X_grouped:
    print(group)

the result is the dataframes of every group of ('x3', 'x4')

date  x1  x2 x3 x4
0  2022-09-18   1   6  a  c
1  2022-09-19   2   7  a  c
2  2022-09-20   3   8  a  c
         date  x1  x2 x3 x4
3  2022-09-21   4   9  a  w
4  2022-09-22   5  10  a  w
         date  x1  x2 x3 x4
5  2022-09-18   3   2  b  c
6  2022-09-19   5   9  b  c
         date  x1  x2 x3 x4
7  2022-09-20   6  10  b  w
8  2022-09-21   8  15  b  w
9  2022-09-22  11   2  b  w

Collaborator: I see. Thank you for the explanation. Pandas should apply shift and rolling and expanding to the groups out of the box, there is no need to loop, as far as I understand. See for example these resources: https://www.statology.org/pandas-lag-by-group/

tmp = (
Collaborator: I don't think we need to loop over each group. Pandas does that under the hood if I recall correctly. So we'd just add groupby before .expanding. Check these resources:

https://www.statology.org/pandas-lag-by-group/
https://stackoverflow.com/questions/37231844/pandas-creating-a-lagged-column-with-grouped-data

Author: I found a simple way to perform the group_by operation to calculate expanding window features using the .apply() method in pandas (see the sketch after this method).

group[self.variables_]
.expanding(min_periods=self.min_periods)
.agg(self.functions)
.shift(periods=self.periods, freq=self.freq)
)
tmp_data.append(tmp)
tmp = pd.concat(tmp_data).sort_index()
return tmp
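To complement the thread above, a loop-free sketch on toy data — an illustration of the reviewer's suggestion, not the PR's actual implementation — of how pandas can apply the lag and the expanding window per group directly on the groupby object:

import pandas as pd

X = pd.DataFrame({
    "x1": [1, 2, 3, 4, 5, 3, 5, 6, 8, 11],
    "x3": ["a"] * 5 + ["b"] * 5,
})

# per-group lag: groupby().shift() keeps the original index, so it aligns with X
x1_lag_1 = X.groupby("x3")["x1"].shift(1)

# per-group expanding mean followed by a one-step shift; group_keys=False keeps
# the original index, so no droplevel or manual re-sorting is required
x1_expanding_mean = (
    X.groupby("x3", group_keys=False)["x1"]
    .apply(lambda g: g.expanding(min_periods=1).mean().shift(1))
)

X.assign(x1_lag_1=x1_lag_1, x1_expanding_mean=x1_expanding_mean)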
107 changes: 90 additions & 17 deletions feature_engine/timeseries/forecasting/lag_features.py
@@ -74,6 +74,9 @@ class LagFeatures(BaseForecastTransformer):

{drop_original}

group_by: int, str, or list of strings or integers, default=None
variable or list of variables to group the data by before creating the lag features.

Attributes
----------
variables_:
@@ -117,6 +120,26 @@ class LagFeatures(BaseForecastTransformer):
2 2022-09-20 3 8 2.0 7.0 1.0 6.0
3 2022-09-21 4 9 3.0 8.0 2.0 7.0
4 2022-09-22 5 10 4.0 9.0 3.0 8.0
Create lags grouped by another variable.
>>> import pandas as pd
Collaborator: Could we please keep the original example? Demos go in the user-guide :)

>>> from feature_engine.timeseries.forecasting import LagFeatures
>>> X = pd.DataFrame(dict(date = ["2022-09-18",
>>> "2022-09-19",
>>> "2022-09-20",
>>> "2022-09-21",
>>> "2022-09-22"],
>>> x1 = [1,2,3,4,5],
>>> x2 = [6,7,8,9,10],
>>> x3 = ['a','b','a','b','a']
>>> ))
>>> lf = LagFeatures(periods=[1,2], group_by='x3')
>>> lf.fit_transform(X)
date x1 x2 x3 x1_lag_1 x2_lag_1 x1_lag_2 x2_lag_2
0 2022-09-18 1 6 a NaN NaN NaN NaN
1 2022-09-19 2 7 b NaN NaN NaN NaN
2 2022-09-20 3 8 a 1.0 6.0 NaN NaN
3 2022-09-21 4 9 b 2.0 7.0 NaN NaN
4 2022-09-22 5 10 a 3.0 8.0 1.0 6.0
"""

def __init__(
@@ -127,6 +150,7 @@
sort_index: bool = True,
missing_values: str = "raise",
drop_original: bool = False,
group_by: Union[None, int, str, List[Union[str, int]]] = None,
) -> None:

if not (
@@ -151,7 +175,7 @@
"sort_index takes values True and False." f"Got {sort_index} instead."
)

super().__init__(variables, missing_values, drop_original)
super().__init__(variables, missing_values, drop_original, group_by)

self.periods = periods
self.freq = freq
@@ -180,35 +204,57 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
if isinstance(self.freq, list):
df_ls = []
for fr in self.freq:
tmp = X[self.variables_].shift(
freq=fr,
axis=0,
)
if self.group_by:
tmp = self._agg_freq_lags(
grouped_df=X.groupby(self.group_by),
freq=fr,
)
else:
tmp = X[self.variables_].shift(
freq=fr,
axis=0,
)
df_ls.append(tmp)
tmp = pd.concat(df_ls, axis=1)

else:
tmp = X[self.variables_].shift(
freq=self.freq,
axis=0,
)
if self.group_by:
tmp = self._agg_freq_lags(
grouped_df=X.groupby(self.group_by),
freq=self.freq,
)
else:
tmp = X[self.variables_].shift(
freq=self.freq,
axis=0,
)

else:
if isinstance(self.periods, list):
df_ls = []
for pr in self.periods:
tmp = X[self.variables_].shift(
periods=pr,
axis=0,
)
if self.group_by:
tmp = X.groupby(self.group_by)[self.variables_].shift(
periods=pr,
)
else:
tmp = X[self.variables_].shift(
periods=pr,
axis=0,
)
df_ls.append(tmp)
tmp = pd.concat(df_ls, axis=1)

else:
tmp = X[self.variables_].shift(
periods=self.periods,
axis=0,
)
if self.group_by:
tmp = X.groupby(self.group_by)[self.variables_].shift(
periods=self.periods,
)
else:
tmp = X[self.variables_].shift(
periods=self.periods,
axis=0,
)

tmp.columns = self._get_new_features_name()

@@ -243,3 +289,30 @@ def _get_new_features_name(self) -> List:
]

return feature_names

def _agg_freq_lags(
self,
grouped_df: pd.core.groupby.generic.DataFrameGroupBy,
freq: Union[str, List[str]],
) -> Union[pd.Series, pd.DataFrame]:
"""_summary_

Parameters
----------
grouped_df : pd.core.groupby.generic.DataFrameGroupBy
dataframe of groups
freq : Union[str, List[str]]
Offset to use from the tseries module or time rule.

Returns
-------
Union[pd.Series, pd.DataFrame]
lag feature or dataframe of lag features
"""
tmp_data = []
for _, group in grouped_df:
Collaborator: Do we need to loop over the groups to apply the lags? pandas does the lags per group automatically.

Author: I tried many approaches to simplify this, but it only works when using the periods argument with the shift() method, like the line in 231; however, when using the freq argument with shift() it doesn't work, so I used a loop to make it work. Kindly advise if we can simplify it (a short illustration of the freq behaviour follows the method below).

original_idx = group.index
tmp = group[self.variables_].shift(freq=freq).reindex(original_idx)
tmp_data.append(tmp)
tmp = pd.concat(tmp_data).sort_index()
return tmp
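As a minimal sketch on a toy series (illustration only, not feature_engine code) of the freq behaviour discussed above: shift(freq=...) moves the index rather than the values, which is why the method reindexes back to the original index.

import pandas as pd

s = pd.Series(
    [1.0, 2.0, 3.0],
    index=pd.to_datetime(["2022-09-18", "2022-09-19", "2022-09-20"]),
)

shifted = s.shift(freq="1D")        # index moves to 09-19 .. 09-21, values unchanged
aligned = shifted.reindex(s.index)  # NaN on 09-18, then the one-day-lagged values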