[BUG] Fix Imputer bugs (#24)

* [BUG] Fix Imputer bug
* [BUG] Fix Imputer bug
* Update sktime/transformations/series/impute.py
* linting
* linting
aeon-toolkit · Feb 20, 2023 · e1e3b75 · e1e3b75
1 parent 5f7dcc0
commit e1e3b75
Show file tree

Hide file tree

Showing 2 changed files with 63 additions and 30 deletions.
diff --git a/sktime/transformations/series/impute.py b/sktime/transformations/series/impute.py
@@ -54,14 +54,16 @@ class Imputer(BaseTransformer):
         The placeholder for the missing values. All occurrences of
         missing_values will be imputed, in addition to np.nan.
         If None, then only np.nan values are imputed.
-    value : int/float, default=None
-        Value to use to fill missing values when method="constant".
+    value : int/float, default=0
+        Value to use to fill missing values when method="constant" or
+        for other methods in case a column of X contains only NaN values.
     forecaster : Any Forecaster based on sktime.BaseForecaster, default=None
         Use a given Forecaster to impute by insample predictions when
         method="forecaster". Before fitting, missing data is imputed with
         method="ffill" or "bfill" as heuristic. in case of multivariate X,
         the forecaster is applied separete to each column like a
-        ColumnEnsembleForecaster.
+        ColumnEnsembleForecaster. The forecaster is only used if
+        method="forecaster"; otherwise it is ignored.
     random_state : int/float/str, optional
         Value to set random.seed() if method="random", default None
 
@@ -102,14 +104,14 @@ def __init__(
         self,
         method="drift",
         random_state=None,
-        value=None,
+        value=0,
         forecaster=None,
         missing_values=None,
     ):
-
         self.method = method
         self.missing_values = missing_values
         self.value = value
+
         self.forecaster = forecaster
         self.random_state = random_state
         super(Imputer, self).__init__()
@@ -175,6 +177,11 @@ def _transform(self, X, y=None):
         if self.missing_values:
             X = X.replace(to_replace=self.missing_values, value=np.nan)
 
+        # in case a column only contains missing values, fill with value
+        for col in X.columns:
+            if all(X[col].isna()):
+                X[col] = self.value
+
         if not _has_missing_values(X):
             return X
 
@@ -221,25 +228,15 @@ def _check_method(self):
             "forecaster",
         ]:
             raise ValueError(f"Given method {method} is not an allowed method.")
-        if (
-            self.value is not None
-            and method != "constant"
-            or method == "constant"
-            and self.value is None
-        ):
+        if method == "constant" and self.value is None:
             raise ValueError(
-                """Imputing with a value can only be
-                used if method="constant" and if parameter "value" is not None"""
+                """Imputing with method=\"constant\" can only be used if parameter
+                value" is not None"""
             )
-        elif (
-            self.forecaster is not None
-            and method != "forecaster"
-            or method == "forecaster"
-            and self.forecaster is None
-        ):
+        elif method == "forecaster" and self.forecaster is None:
             raise ValueError(
-                """Imputing with a forecaster can only be used if
-                method=\"forecaster\" and if arg forecaster is not None"""
+                """Imputing with method=\"forecaster\" can only be
+                used if param forecaster is not None"""
             )
         else:
             pass
@@ -286,10 +283,15 @@ def _impute_with_forecaster(self, X, y):
                 fh = ForecastingHorizon(values=na_index, is_relative=False)
 
                 # fill NaN before fitting with ffill and backfill (heuristic)
-
                 self._forecaster.fit(
-                    y=self._X[col].fillna(method="ffill").fillna(method="backfill"),
-                    X=self._y[col].fillna(method="ffill").fillna(method="backfill")
+                    y=self._X[col]
+                    .fillna(method="ffill")
+                    .fillna(method="backfill")
+                    .fillna(self.value),
+                    X=self._y[col]
+                    .fillna(method="ffill")
+                    .fillna(method="backfill")
+                    .fillna(self.value)
                     if self._y is not None
                     else None,
                 )

diff --git a/sktime/transformations/series/tests/test_imputer.py b/sktime/transformations/series/tests/test_imputer.py
@@ -7,8 +7,10 @@
 __all__ = []
 
 import numpy as np
+import pandas as pd
 import pytest
 
+from sktime.forecasting.model_selection import temporal_train_test_split
 from sktime.forecasting.naive import NaiveForecaster
 from sktime.transformations.series.impute import Imputer
 from sktime.utils._testing.forecasting import make_forecasting_problem
@@ -25,7 +27,8 @@
 y.iloc[-1] = np.nan
 
 
-@pytest.mark.parametrize("Z", [y, X])
+@pytest.mark.parametrize("forecaster", [None, NaiveForecaster()])
+@pytest.mark.parametrize("X", [y, X])
 @pytest.mark.parametrize(
     "method",
     [
@@ -41,10 +44,38 @@
         "forecaster",
     ],
 )
-def test_imputer(method, Z):
+def test_imputer(method, X, forecaster):
     """Test univariate and multivariate Imputer with all methods."""
-    forecaster = NaiveForecaster() if method == "forecaster" else None
-    value = 3 if method == "constant" else None
-    t = Imputer(method=method, forecaster=forecaster, value=value)
-    y_hat = t.fit_transform(Z)
+    forecaster = NaiveForecaster() if method == "forecaster" else forecaster
+
+    t = Imputer(method=method, forecaster=forecaster)
+    y_hat = t.fit_transform(X)
     assert not y_hat.isnull().to_numpy().any()
+
+    # test that transform works on data different from the data used in fit
+    X_train, X_test = temporal_train_test_split(X, test_size=5)
+
+    t = Imputer(method=method, forecaster=forecaster)
+    t = t.fit(X_train)
+    y_hat = t.transform(X_test)
+    assert not y_hat.isnull().to_numpy().any()
+
+    # test some columns only contain NaN, either in fit or transform
+    t = Imputer(method=method, forecaster=forecaster)
+    # one column only contains NaN
+    if isinstance(X, pd.Series):
+        X = X.to_frame()
+        X.iloc[:, 0] = np.nan
+        X = pd.Series(X.iloc[:, 0])
+
+    y_hat = t.fit_transform(X)
+    assert not y_hat.isnull().to_numpy().any()
+
+    if isinstance(X, pd.DataFrame):
+        X_train.iloc[:, 0] = np.nan
+        X_test.iloc[:, 1] = np.nan
+
+        t = Imputer(method=method, forecaster=NaiveForecaster())
+        t = t.fit(X_train)
+        y_hat = t.transform(X_test)
+        assert not y_hat.isnull().to_numpy().any()