Skip to content

Commit

Permalink
[BUG] Fix Imputer bugs (#24)
Browse files Browse the repository at this point in the history
* [BUG] Fix Imputer bug

* [BUG] Fix Imputer bug

* Update sktime/transformations/series/impute.py

* linting

* linting
  • Loading branch information
aiwalter authored Feb 20, 2023
1 parent 5f7dcc0 commit e1e3b75
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 30 deletions.
50 changes: 26 additions & 24 deletions sktime/transformations/series/impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,16 @@ class Imputer(BaseTransformer):
The placeholder for the missing values. All occurrences of
missing_values will be imputed, in addition to np.nan.
If None, then only np.nan values are imputed.
value : int/float, default=None
Value to use to fill missing values when method="constant".
value : int/float, default=0
Value to use to fill missing values when method="constant" or
for other methods in case a column of X contains only NaN values.
forecaster : Any Forecaster based on sktime.BaseForecaster, default=None
Use a given Forecaster to impute by insample predictions when
method="forecaster". Before fitting, missing data is imputed with
method="ffill" or "bfill" as heuristic. In case of multivariate X,
the forecaster is applied separately to each column like a
ColumnEnsembleForecaster.
ColumnEnsembleForecaster. Forecaster is only applied if the param
method="forecaster" is set, otherwise forecaster is ignored.
random_state : int/float/str, optional
Value to set random.seed() if method="random", default None
Expand Down Expand Up @@ -102,14 +104,14 @@ def __init__(
self,
method="drift",
random_state=None,
value=None,
value=0,
forecaster=None,
missing_values=None,
):

self.method = method
self.missing_values = missing_values
self.value = value

self.forecaster = forecaster
self.random_state = random_state
super(Imputer, self).__init__()
Expand Down Expand Up @@ -175,6 +177,11 @@ def _transform(self, X, y=None):
if self.missing_values:
X = X.replace(to_replace=self.missing_values, value=np.nan)

# in case a column only contains missing values, fill with value
for col in X.columns:
if all(X[col].isna()):
X[col] = self.value

if not _has_missing_values(X):
return X

Expand Down Expand Up @@ -221,25 +228,15 @@ def _check_method(self):
"forecaster",
]:
raise ValueError(f"Given method {method} is not an allowed method.")
if (
self.value is not None
and method != "constant"
or method == "constant"
and self.value is None
):
if method == "constant" and self.value is None:
raise ValueError(
"""Imputing with a value can only be
used if method="constant" and if parameter "value" is not None"""
"""Imputing with method=\"constant\" can only be used if parameter
value" is not None"""
)
elif (
self.forecaster is not None
and method != "forecaster"
or method == "forecaster"
and self.forecaster is None
):
elif method == "forecaster" and self.forecaster is None:
raise ValueError(
"""Imputing with a forecaster can only be used if
method=\"forecaster\" and if arg forecaster is not None"""
"""Imputing with method=\"forecaster\" can only be
used if param forecaster is not None"""
)
else:
pass
Expand Down Expand Up @@ -286,10 +283,15 @@ def _impute_with_forecaster(self, X, y):
fh = ForecastingHorizon(values=na_index, is_relative=False)

# fill NaN before fitting with ffill and backfill (heuristic)

self._forecaster.fit(
y=self._X[col].fillna(method="ffill").fillna(method="backfill"),
X=self._y[col].fillna(method="ffill").fillna(method="backfill")
y=self._X[col]
.fillna(method="ffill")
.fillna(method="backfill")
.fillna(self.value),
X=self._y[col]
.fillna(method="ffill")
.fillna(method="backfill")
.fillna(self.value)
if self._y is not None
else None,
)
Expand Down
43 changes: 37 additions & 6 deletions sktime/transformations/series/tests/test_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
__all__ = []

import numpy as np
import pandas as pd
import pytest

from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.naive import NaiveForecaster
from sktime.transformations.series.impute import Imputer
from sktime.utils._testing.forecasting import make_forecasting_problem
Expand All @@ -25,7 +27,8 @@
y.iloc[-1] = np.nan


@pytest.mark.parametrize("Z", [y, X])
@pytest.mark.parametrize("forecaster", [None, NaiveForecaster()])
@pytest.mark.parametrize("X", [y, X])
@pytest.mark.parametrize(
"method",
[
Expand All @@ -41,10 +44,38 @@
"forecaster",
],
)
def test_imputer(method, Z):
def test_imputer(method, X, forecaster):
"""Test univariate and multivariate Imputer with all methods."""
forecaster = NaiveForecaster() if method == "forecaster" else None
value = 3 if method == "constant" else None
t = Imputer(method=method, forecaster=forecaster, value=value)
y_hat = t.fit_transform(Z)
forecaster = NaiveForecaster() if method == "forecaster" else forecaster

t = Imputer(method=method, forecaster=forecaster)
y_hat = t.fit_transform(X)
assert not y_hat.isnull().to_numpy().any()

# test train and transform data is different
X_train, X_test = temporal_train_test_split(X, test_size=5)

t = Imputer(method=method, forecaster=forecaster)
t = t.fit(X_train)
y_hat = t.transform(X_test)
assert not y_hat.isnull().to_numpy().any()

# test some columns only contain NaN, either in fit or transform
t = Imputer(method=method, forecaster=forecaster)
# one column only contains NaN
if isinstance(X, pd.Series):
X = X.to_frame()
X.iloc[:, 0] = np.nan
X = pd.Series(X.iloc[:, 0])

y_hat = t.fit_transform(X)
assert not y_hat.isnull().to_numpy().any()

if isinstance(X, pd.DataFrame):
X_train.iloc[:, 0] = np.nan
X_test.iloc[:, 1] = np.nan

t = Imputer(method=method, forecaster=NaiveForecaster())
t = t.fit(X_train)
y_hat = t.transform(X_test)
assert not y_hat.isnull().to_numpy().any()

0 comments on commit e1e3b75

Please sign in to comment.