Skip to content

Commit

Permalink
Extend TimeSeriesImputer to handle multiple series (#4291)
Browse files Browse the repository at this point in the history
* Multiseries timeseries imputer support
  • Loading branch information
MichaelFu512 committed Sep 5, 2023
1 parent 69344b2 commit 93e7b97
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 20 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Release Notes
* Enhancements
* Added support for prediction intervals for VARMAX regressor :pr:`4267`
* Integrated multiseries time series into AutoMLSearch :pr:`4270`
* Extended TimeSeriesImputer to handle multiple series :pr:`4291`
* Fixes
* Fixed error when stacking data with no exogenous variables :pr:`4275`
* Changes
Expand Down
5 changes: 3 additions & 2 deletions evalml/pipelines/components/component_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,9 @@ def _handle_nullable_types(self, X=None, y=None):
Args:
X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features].
May contain nullable types.
y (pd.Series, optional): The target of length [n_samples]. May contain nullable types.
y (pd.Series or pd.DataFrame, optional): The target of length [n_samples] or the
unstacked target for a multiseries problem of shape [n_samples, n_features*n_series].
May contain nullable types.
Returns:
X, y with any incompatible nullable types downcasted to compatible equivalents.
Expand All @@ -278,5 +280,4 @@ def _handle_nullable_types(self, X=None, y=None):
handle_boolean_nullable=y_bool_incompatible,
handle_integer_nullable=y_int_incompatible,
)

return X, y
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def __init__(
self._backwards_cols = None
self._interpolate_cols = None
self._impute_target = None
self._y_all_null_cols = None
super().__init__(
parameters=parameters,
component_obj=None,
Expand Down Expand Up @@ -137,11 +138,17 @@ def _filter_cols(impute_strat, X):
self._backwards_cols = _filter_cols("backwards_fill", X)
self._interpolate_cols = _filter_cols("interpolate", X)

if y is not None:
if isinstance(y, pd.Series):
y = infer_feature_types(y)
if y.isnull().any():
self._impute_target = self.parameters["target_impute_strategy"]

elif isinstance(y, pd.DataFrame):
y = infer_feature_types(y)
y_nan_ratio = y.isna().sum() / y.shape[0]
self._y_all_null_cols = y_nan_ratio[y_nan_ratio == 1].index.tolist()
if y.isnull().values.any():
self._impute_target = self.parameters["target_impute_strategy"]
return self

def transform(self, X, y=None):
Expand Down Expand Up @@ -212,19 +219,33 @@ def transform(self, X, y=None):
new_ltypes.update(new_int_ltypes)
X_not_all_null.ww.init(schema=original_schema, logical_types=new_ltypes)

y_imputed = pd.Series(y)
y_imputed = (
y.ww.drop(self._y_all_null_cols)
if isinstance(y, pd.DataFrame)
else pd.Series(y)
)
if y is not None and len(y) > 0:
if self._impute_target == "forwards_fill":
y_imputed = y.pad()
y_imputed = y_imputed.pad()
y_imputed.bfill(inplace=True)
elif self._impute_target == "backwards_fill":
y_imputed = y.bfill()
y_imputed = y_imputed.bfill()
y_imputed.pad(inplace=True)
elif self._impute_target == "interpolate":
y_imputed = y.interpolate()
y_imputed = y_imputed.interpolate()
y_imputed.bfill(inplace=True)
# Re-initialize woodwork with the downcast logical type
y_imputed = ww.init_series(y_imputed, logical_type=y.ww.logical_type)
if isinstance(y, pd.Series):
y_imputed = ww.init_series(y_imputed, logical_type=y.ww.logical_type)
else:
y_original_schema = y.ww.schema.get_subset_schema(
list(y_imputed.columns),
)
y_new_ltypes = {
col: _determine_non_nullable_equivalent(ltype)
for col, ltype in y_original_schema.logical_types.items()
}
y_imputed.ww.init(schema=y_original_schema, logical_types=y_new_ltypes)

return X_not_all_null, y_imputed

Expand All @@ -234,15 +255,22 @@ def _handle_nullable_types(self, X=None, y=None):
Args:
X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features].
May contain nullable types.
y (pd.Series, optional): The target of length [n_samples]. May contain nullable types.
y (pd.Series or pd.DataFrame, optional): The target of length [n_samples] or the
unstacked target for a multiseries problem of shape [n_samples, n_features*n_series].
May contain nullable types.
Returns:
X, y with any incompatible nullable types downcasted to compatible equivalents when interpolate is used. Is NoOp otherwise.
"""
if self._impute_target == "interpolate":
# For BooleanNullable, we have to avoid Categorical columns
# since the category dtype also has incompatibilities with linear interpolate, which is expected
if isinstance(y.ww.logical_type, BooleanNullable):
# TODO: Avoid categorical columns for BooleanNullable in multiseries when
# multiseries timeseries supports categorical
if isinstance(y, pd.Series) and isinstance(
y.ww.logical_type,
BooleanNullable,
):
y = ww.init_series(y, Double)
else:
_, y = super()._handle_nullable_types(None, y)
Expand Down
75 changes: 75 additions & 0 deletions evalml/tests/component_tests/test_time_series_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -722,3 +722,78 @@ def test_time_series_imputer_nullable_type_incompatibility(
_, nullable_series = imputer._handle_nullable_types(None, nullable_series)

nullable_series.interpolate()


@pytest.mark.parametrize(
    "nans_present",
    [True, False],
)
def test_time_series_imputer_multiseries(
    multiseries_ts_data_unstacked,
    nans_present,
):
    """Imputer interpolates a multiseries (DataFrame) target and returns a DataFrame.

    With `nans_present`, one NaN is planted per target column (at row index equal
    to the 1-based column position) to exercise interpolation; either way the
    imputed result must match the original NaN-free series values.
    """
    X, y = multiseries_ts_data_unstacked
    imputer = TimeSeriesImputer(target_impute_strategy="interpolate")

    if nans_present:
        for count, col in enumerate(y, start=1):
            # Use .loc instead of chained assignment (y[col][count] = ...),
            # which does not write through under pandas copy-on-write and is
            # removed in pandas 3.0. Assumes a default RangeIndex, so the
            # label `count` matches the row position — TODO confirm fixture.
            y.loc[count, col] = np.nan

    imputer.fit(X, y)
    # No column is entirely NaN, so nothing should be scheduled for dropping.
    assert imputer._y_all_null_cols == []

    _, y_imputed = imputer.transform(X, y)
    assert isinstance(y_imputed, pd.DataFrame)

    # Fixture targets are arithmetic sequences: target_i = i, i+5, i+10, ...
    y_expected = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)})
    assert_frame_equal(y_imputed, y_expected, check_dtype=False)


@pytest.mark.parametrize(
    "num_nan_cols",
    [1, 2, 3],
)
@pytest.mark.parametrize(
    "nan_in_other_cols",
    [True, False],
)
def test_time_series_imputer_multiseries_some_columns_all_nan(
    multiseries_ts_data_unstacked,
    num_nan_cols,
    nan_in_other_cols,
):
    """All-NaN target columns are dropped; remaining columns are imputed.

    The first `num_nan_cols` columns are set entirely to NaN. When
    `nan_in_other_cols` is True, the remaining columns each get a single NaN
    as well (at the row matching the 1-based column position).
    """
    X, y = multiseries_ts_data_unstacked
    imputer = TimeSeriesImputer(target_impute_strategy="interpolate")

    for count, col in enumerate(y, start=1):
        if count <= num_nan_cols:
            y[col] = np.nan
        if count == num_nan_cols and not nan_in_other_cols:
            break
        else:
            # Use .loc instead of chained assignment (y[col][count] = ...),
            # which does not write through under pandas copy-on-write and is
            # removed in pandas 3.0. Assumes a default RangeIndex, so the
            # label `count` matches the row position — TODO confirm fixture.
            y.loc[count, col] = np.nan

    imputer.fit(X, y)
    _, y_imputed = imputer.transform(X, y)

    # Only the columns that were not entirely NaN survive the transform.
    y_expected = pd.DataFrame(
        {f"target_{i}": range(i, 100, 5) for i in range(num_nan_cols, 5)},
    )
    assert_frame_equal(y_imputed, y_expected, check_dtype=False)


def test_imputer_multiseries_drops_columns_with_all_nan(multiseries_ts_data_unstacked):
    """When every target column is entirely NaN, all of them are dropped."""
    X, y = multiseries_ts_data_unstacked

    # Blank out every series in the unstacked target.
    for column in y.columns:
        y[column] = np.nan

    imputer = TimeSeriesImputer(target_impute_strategy="interpolate")
    imputer.fit(X, y)
    # fit must have recorded every column as all-null.
    assert imputer._y_all_null_cols == y.columns.tolist()

    _, y_imputed = imputer.transform(X, y)
    # Expected result: the same frame with every column removed (index kept).
    expected = y.drop(columns=y.columns.tolist())
    assert_frame_equal(
        y_imputed,
        expected,
        check_column_type=False,
        check_index_type=False,
    )
21 changes: 17 additions & 4 deletions evalml/tests/utils_tests/test_nullable_type_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,24 @@ def test_determine_downcast_type(nullable_type_target, nullable_ltype, has_nans)


@pytest.mark.parametrize(
"downcast_util, data_type",
[(_downcast_nullable_X, "X"), (_downcast_nullable_y, "y")],
"downcast_util, data_type, y_type",
[
(_downcast_nullable_X, "X", "series"),
(_downcast_nullable_y, "y", "series"),
(_downcast_nullable_y, "y", "dataframe"),
],
)
def test_downcast_utils_handle_woodwork_not_init(X_y_binary, downcast_util, data_type):
X, y = X_y_binary
def test_downcast_utils_handle_woodwork_not_init(
X_y_binary,
multiseries_ts_data_unstacked,
downcast_util,
data_type,
y_type,
):
if y_type == "series":
X, y = X_y_binary
else:
X, y = multiseries_ts_data_unstacked
# Remove woodwork types
if data_type == "X":
data = X.copy()
Expand Down
16 changes: 10 additions & 6 deletions evalml/utils/nullable_type_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import woodwork as ww
import pandas as pd
from woodwork.logical_types import AgeNullable, BooleanNullable, IntegerNullable

DOWNCAST_TYPE_DICT = {
Expand Down Expand Up @@ -48,7 +48,7 @@ def _downcast_nullable_y(y, handle_boolean_nullable=True, handle_integer_nullabl
to other dtypes via Woodwork logical type transformations.
Args:
y (pd.Series): Target data of shape [n_samples] whose nullable types will be changed.
y (pd.Series or pd.DataFrame): Target data of shape [n_samples] or [n_samples, n_features*n_series] whose nullable types will be changed.
handle_boolean_nullable (bool, optional): Whether or not to downcast data with BooleanNullable logical types.
handle_integer_nullable (bool, optional): Whether or not to downcast data with IntegerNullable or AgeNullable logical types.
Expand All @@ -57,16 +57,20 @@ def _downcast_nullable_y(y, handle_boolean_nullable=True, handle_integer_nullabl
y with any incompatible nullable types downcasted to compatible equivalents.
"""
if y.ww.schema is None:
y = ww.init_series(y)
y.ww.init()

incompatible_logical_types = _get_incompatible_nullable_types(
handle_boolean_nullable,
handle_integer_nullable,
)

if isinstance(y.ww.logical_type, tuple(incompatible_logical_types)):
new_ltype = _determine_downcast_type(y)
return y.ww.set_logical_type(new_ltype)
if isinstance(y, pd.DataFrame):
y = _downcast_nullable_X(y)

else:
if isinstance(y.ww.logical_type, tuple(incompatible_logical_types)):
new_ltype = _determine_downcast_type(y)
return y.ww.set_logical_type(new_ltype)

return y

Expand Down

0 comments on commit 93e7b97

Please sign in to comment.