Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend TimeSeriesImputer to handle multiple series #4291

Merged
merged 19 commits into from
Sep 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Release Notes
* Enhancements
* Added support for prediction intervals for VARMAX regressor :pr:`4267`
* Integrated multiseries time series into AutoMLSearch :pr:`4270`
* Extended TimeSeriesImputer to handle multiple series :pr:`4291`
* Fixes
* Fixed error when stacking data with no exogenous variables :pr:`4275`
* Changes
Expand Down
5 changes: 3 additions & 2 deletions evalml/pipelines/components/component_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,9 @@ def _handle_nullable_types(self, X=None, y=None):
Args:
X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features].
May contain nullable types.
y (pd.Series, optional): The target of length [n_samples]. May contain nullable types.
y (pd.Series or pd.DataFrame, optional): The target of length [n_samples] or the
unstacked target for a multiseries problem of shape [n_samples, n_features*n_series].
May contain nullable types.

Returns:
X, y with any incompatible nullable types downcasted to compatible equivalents.
Expand All @@ -278,5 +280,4 @@ def _handle_nullable_types(self, X=None, y=None):
handle_boolean_nullable=y_bool_incompatible,
handle_integer_nullable=y_int_incompatible,
)

return X, y
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@
self._backwards_cols = None
self._interpolate_cols = None
self._impute_target = None
self._y_all_null_cols = None
super().__init__(
parameters=parameters,
component_obj=None,
Expand Down Expand Up @@ -137,11 +138,17 @@
self._backwards_cols = _filter_cols("backwards_fill", X)
self._interpolate_cols = _filter_cols("interpolate", X)

if y is not None:
if isinstance(y, pd.Series):
y = infer_feature_types(y)
if y.isnull().any():
self._impute_target = self.parameters["target_impute_strategy"]

elif isinstance(y, pd.DataFrame):
y = infer_feature_types(y)
y_nan_ratio = y.isna().sum() / y.shape[0]
self._y_all_null_cols = y_nan_ratio[y_nan_ratio == 1].index.tolist()
if y.isnull().values.any():
self._impute_target = self.parameters["target_impute_strategy"]

Check warning on line 151 in evalml/pipelines/components/transformers/imputers/time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/pipelines/components/transformers/imputers/time_series_imputer.py#L147-L151

Added lines #L147 - L151 were not covered by tests
return self

def transform(self, X, y=None):
Expand Down Expand Up @@ -212,19 +219,33 @@
new_ltypes.update(new_int_ltypes)
X_not_all_null.ww.init(schema=original_schema, logical_types=new_ltypes)

y_imputed = pd.Series(y)
y_imputed = (
y.ww.drop(self._y_all_null_cols)
if isinstance(y, pd.DataFrame)
else pd.Series(y)
)
if y is not None and len(y) > 0:
if self._impute_target == "forwards_fill":
y_imputed = y.pad()
y_imputed = y_imputed.pad()

Check warning on line 229 in evalml/pipelines/components/transformers/imputers/time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/pipelines/components/transformers/imputers/time_series_imputer.py#L229

Added line #L229 was not covered by tests
y_imputed.bfill(inplace=True)
elif self._impute_target == "backwards_fill":
y_imputed = y.bfill()
y_imputed = y_imputed.bfill()

Check warning on line 232 in evalml/pipelines/components/transformers/imputers/time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/pipelines/components/transformers/imputers/time_series_imputer.py#L232

Added line #L232 was not covered by tests
y_imputed.pad(inplace=True)
elif self._impute_target == "interpolate":
y_imputed = y.interpolate()
y_imputed = y_imputed.interpolate()

Check warning on line 235 in evalml/pipelines/components/transformers/imputers/time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/pipelines/components/transformers/imputers/time_series_imputer.py#L235

Added line #L235 was not covered by tests
y_imputed.bfill(inplace=True)
# Re-initialize woodwork with the downcast logical type
y_imputed = ww.init_series(y_imputed, logical_type=y.ww.logical_type)
if isinstance(y, pd.Series):
y_imputed = ww.init_series(y_imputed, logical_type=y.ww.logical_type)
else:
y_original_schema = y.ww.schema.get_subset_schema(

Check warning on line 241 in evalml/pipelines/components/transformers/imputers/time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/pipelines/components/transformers/imputers/time_series_imputer.py#L241

Added line #L241 was not covered by tests
list(y_imputed.columns),
)
y_new_ltypes = {

Check warning on line 244 in evalml/pipelines/components/transformers/imputers/time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/pipelines/components/transformers/imputers/time_series_imputer.py#L244

Added line #L244 was not covered by tests
col: _determine_non_nullable_equivalent(ltype)
for col, ltype in y_original_schema.logical_types.items()
}
y_imputed.ww.init(schema=y_original_schema, logical_types=y_new_ltypes)

Check warning on line 248 in evalml/pipelines/components/transformers/imputers/time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/pipelines/components/transformers/imputers/time_series_imputer.py#L248

Added line #L248 was not covered by tests

return X_not_all_null, y_imputed

Expand All @@ -234,15 +255,22 @@
Args:
X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features].
May contain nullable types.
y (pd.Series, optional): The target of length [n_samples]. May contain nullable types.
y (pd.Series or pd.DataFrame, optional): The target of length [n_samples] or the
unstacked target for a multiseries problem of shape [n_samples, n_features*n_series].
May contain nullable types.

Returns:
X, y with any incompatible nullable types downcasted to compatible equivalents when interpolate is used. Is NoOp otherwise.
"""
if self._impute_target == "interpolate":
# For BooleanNullable, we have to avoid Categorical columns
# since the category dtype also has incompatibilities with linear interpolate, which is expected
if isinstance(y.ww.logical_type, BooleanNullable):
# TODO: Avoid categorical columns for BooleanNullable in multiseries when
# multiseries timeseries supports categorical
if isinstance(y, pd.Series) and isinstance(

Check warning on line 270 in evalml/pipelines/components/transformers/imputers/time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/pipelines/components/transformers/imputers/time_series_imputer.py#L270

Added line #L270 was not covered by tests
y.ww.logical_type,
BooleanNullable,
):
y = ww.init_series(y, Double)
else:
_, y = super()._handle_nullable_types(None, y)
Expand Down
75 changes: 75 additions & 0 deletions evalml/tests/component_tests/test_time_series_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -722,3 +722,78 @@
_, nullable_series = imputer._handle_nullable_types(None, nullable_series)

nullable_series.interpolate()


@pytest.mark.parametrize(

Check warning on line 727 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L727

Added line #L727 was not covered by tests
"nans_present",
[True, False],
)
def test_time_series_imputer_multiseries(

Check warning on line 731 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L731

Added line #L731 was not covered by tests
multiseries_ts_data_unstacked,
nans_present,
):
X, y = multiseries_ts_data_unstacked
imputer = TimeSeriesImputer(target_impute_strategy="interpolate")

Check warning on line 736 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L735-L736

Added lines #L735 - L736 were not covered by tests

if nans_present:
for count, col in enumerate(y, start=1):
y[col][count] = np.nan

Check warning on line 740 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L738-L740

Added lines #L738 - L740 were not covered by tests

imputer.fit(X, y)
assert imputer._y_all_null_cols == []

Check warning on line 743 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L742-L743

Added lines #L742 - L743 were not covered by tests

_, y_imputed = imputer.transform(X, y)
assert isinstance(y_imputed, pd.DataFrame)

Check warning on line 746 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L745-L746

Added lines #L745 - L746 were not covered by tests

y_expected = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)})
assert_frame_equal(y_imputed, y_expected, check_dtype=False)

Check warning on line 749 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L748-L749

Added lines #L748 - L749 were not covered by tests


@pytest.mark.parametrize(

Check warning on line 752 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L752

Added line #L752 was not covered by tests
"num_nan_cols",
[1, 2, 3],
)
@pytest.mark.parametrize(

Check warning on line 756 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L756

Added line #L756 was not covered by tests
"nan_in_other_cols",
[True, False],
)
def test_time_series_imputer_multiseries_some_columns_all_nan(

Check warning on line 760 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L760

Added line #L760 was not covered by tests
multiseries_ts_data_unstacked,
num_nan_cols,
nan_in_other_cols,
):
X, y = multiseries_ts_data_unstacked
imputer = TimeSeriesImputer(target_impute_strategy="interpolate")

Check warning on line 766 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L765-L766

Added lines #L765 - L766 were not covered by tests

for count, col in enumerate(y, start=1):
if count <= num_nan_cols:
y[col] = np.nan
if count == num_nan_cols and not nan_in_other_cols:
break

Check warning on line 772 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L768-L772

Added lines #L768 - L772 were not covered by tests
else:
y[col][count] = np.nan

Check warning on line 774 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L774

Added line #L774 was not covered by tests

imputer.fit(X, y)
_, y_imputed = imputer.transform(X, y)

Check warning on line 777 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L776-L777

Added lines #L776 - L777 were not covered by tests

y_expected = pd.DataFrame(

Check warning on line 779 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L779

Added line #L779 was not covered by tests
{f"target_{i}": range(i, 100, 5) for i in range(num_nan_cols, 5)},
)
assert_frame_equal(y_imputed, y_expected, check_dtype=False)

Check warning on line 782 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L782

Added line #L782 was not covered by tests


def test_imputer_multiseries_drops_columns_with_all_nan(multiseries_ts_data_unstacked):
X, y = multiseries_ts_data_unstacked
for col in y:
y[col] = np.nan
Comment on lines +787 to +788
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we would benefit from another test (parametrized here) where only some of the columns are NaN, but not all of them!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

^ Sorry, should have clarified 😅 I meant a test where some of the columns are all NaN, so we drop some columns and impute or pass through others!

imputer = TimeSeriesImputer(target_impute_strategy="interpolate")
imputer.fit(X, y)
assert imputer._y_all_null_cols == y.columns.tolist()
_, y_imputed = imputer.transform(X, y)
expected = y.drop(y.columns.tolist(), axis=1)
assert_frame_equal(

Check warning on line 794 in evalml/tests/component_tests/test_time_series_imputer.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/component_tests/test_time_series_imputer.py#L785-L794

Added lines #L785 - L794 were not covered by tests
y_imputed,
expected,
check_column_type=False,
check_index_type=False,
)
21 changes: 17 additions & 4 deletions evalml/tests/utils_tests/test_nullable_type_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,24 @@


@pytest.mark.parametrize(
"downcast_util, data_type",
[(_downcast_nullable_X, "X"), (_downcast_nullable_y, "y")],
"downcast_util, data_type, y_type",
[
(_downcast_nullable_X, "X", "series"),
(_downcast_nullable_y, "y", "series"),
(_downcast_nullable_y, "y", "dataframe"),
],
)
def test_downcast_utils_handle_woodwork_not_init(X_y_binary, downcast_util, data_type):
X, y = X_y_binary
def test_downcast_utils_handle_woodwork_not_init(

Check warning on line 63 in evalml/tests/utils_tests/test_nullable_type_utils.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/utils_tests/test_nullable_type_utils.py#L63

Added line #L63 was not covered by tests
X_y_binary,
multiseries_ts_data_unstacked,
downcast_util,
data_type,
y_type,
):
if y_type == "series":
X, y = X_y_binary

Check warning on line 71 in evalml/tests/utils_tests/test_nullable_type_utils.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/utils_tests/test_nullable_type_utils.py#L70-L71

Added lines #L70 - L71 were not covered by tests
else:
X, y = multiseries_ts_data_unstacked

Check warning on line 73 in evalml/tests/utils_tests/test_nullable_type_utils.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/utils_tests/test_nullable_type_utils.py#L73

Added line #L73 was not covered by tests
# Remove woodwork types
if data_type == "X":
data = X.copy()
Expand Down
16 changes: 10 additions & 6 deletions evalml/utils/nullable_type_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import woodwork as ww
import pandas as pd
from woodwork.logical_types import AgeNullable, BooleanNullable, IntegerNullable

DOWNCAST_TYPE_DICT = {
Expand Down Expand Up @@ -48,7 +48,7 @@
to other dtypes via Woodwork logical type transformations.

Args:
y (pd.Series): Target data of shape [n_samples] whose nullable types will be changed.
y (pd.Series or pd.DataFrame): Target data of shape [n_samples] or [n_samples, n_features*n_series] whose nullable types will be changed.
handle_boolean_nullable (bool, optional): Whether or not to downcast data with BooleanNullable logical types.
handle_integer_nullable (bool, optional): Whether or not to downcast data with IntegerNullable or AgeNullable logical types.

Expand All @@ -57,16 +57,20 @@
y with any incompatible nullable types downcasted to compatible equivalents.
"""
if y.ww.schema is None:
y = ww.init_series(y)
y.ww.init()

Check warning on line 60 in evalml/utils/nullable_type_utils.py

View check run for this annotation

Codecov / codecov/patch

evalml/utils/nullable_type_utils.py#L60

Added line #L60 was not covered by tests

incompatible_logical_types = _get_incompatible_nullable_types(
handle_boolean_nullable,
handle_integer_nullable,
)

if isinstance(y.ww.logical_type, tuple(incompatible_logical_types)):
new_ltype = _determine_downcast_type(y)
return y.ww.set_logical_type(new_ltype)
if isinstance(y, pd.DataFrame):
MichaelFu512 marked this conversation as resolved.
Show resolved Hide resolved
y = _downcast_nullable_X(y)

Check warning on line 68 in evalml/utils/nullable_type_utils.py

View check run for this annotation

Codecov / codecov/patch

evalml/utils/nullable_type_utils.py#L68

Added line #L68 was not covered by tests

else:
if isinstance(y.ww.logical_type, tuple(incompatible_logical_types)):
new_ltype = _determine_downcast_type(y)
return y.ww.set_logical_type(new_ltype)

return y

Expand Down
Loading