diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 881451efa2..7e07c8be10 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -17,6 +17,7 @@ Release Notes * Enhancements * Added support for prediction intervals for VARMAX regressor :pr:`4267` * Integrated multiseries time series into AutoMLSearch :pr:`4270` + * Extended TimeSeriesImputer to handle multiple series :pr:`4291` * Fixes * Fixed error when stacking data with no exogenous variables :pr:`4275` * Changes diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py index 12b6603bb4..3375ffd233 100644 --- a/evalml/pipelines/components/component_base.py +++ b/evalml/pipelines/components/component_base.py @@ -256,7 +256,9 @@ def _handle_nullable_types(self, X=None, y=None): Args: X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features]. May contain nullable types. - y (pd.Series, optional): The target of length [n_samples]. May contain nullable types. + y (pd.Series or pd.DataFrame, optional): The target of length [n_samples] or the + unstacked target for a multiseries problem of length [n_samples, n_features*n_series]. + May contain nullable types. Returns: X, y with any incompatible nullable types downcasted to compatible equivalents. @@ -278,5 +280,4 @@ def _handle_nullable_types(self, X=None, y=None): handle_boolean_nullable=y_bool_incompatible, handle_integer_nullable=y_int_incompatible, ) - return X, y diff --git a/evalml/pipelines/components/transformers/imputers/time_series_imputer.py b/evalml/pipelines/components/transformers/imputers/time_series_imputer.py index 35e9ba8193..e8d15f89d6 100644 --- a/evalml/pipelines/components/transformers/imputers/time_series_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/time_series_imputer.py @@ -93,6 +93,7 @@ def __init__( self._backwards_cols = None self._interpolate_cols = None self._impute_target = None + self._y_all_null_cols = None super().__init__( parameters=parameters, component_obj=None, @@ -137,11 +138,17 @@ def _filter_cols(impute_strat, X): self._backwards_cols = _filter_cols("backwards_fill", X) self._interpolate_cols = _filter_cols("interpolate", X) - if y is not None: + if isinstance(y, pd.Series): y = infer_feature_types(y) if y.isnull().any(): self._impute_target = self.parameters["target_impute_strategy"] + elif isinstance(y, pd.DataFrame): + y = infer_feature_types(y) + y_nan_ratio = y.isna().sum() / y.shape[0] + self._y_all_null_cols = y_nan_ratio[y_nan_ratio == 1].index.tolist() + if y.isnull().values.any(): + self._impute_target = self.parameters["target_impute_strategy"] return self def transform(self, X, y=None): @@ -212,19 +219,33 @@ def transform(self, X, y=None): new_ltypes.update(new_int_ltypes) X_not_all_null.ww.init(schema=original_schema, logical_types=new_ltypes) - y_imputed = pd.Series(y) + y_imputed = ( + y.ww.drop(self._y_all_null_cols) + if isinstance(y, pd.DataFrame) + else pd.Series(y) + ) if y is not None and len(y) > 0: if self._impute_target == "forwards_fill": - y_imputed = y.pad() + y_imputed = y_imputed.pad() y_imputed.bfill(inplace=True) elif self._impute_target == "backwards_fill": - y_imputed = y.bfill() + y_imputed = y_imputed.bfill() y_imputed.pad(inplace=True) elif self._impute_target == "interpolate": - y_imputed = y.interpolate() + y_imputed = y_imputed.interpolate() y_imputed.bfill(inplace=True) # Re-initialize woodwork with the downcast logical type - y_imputed = ww.init_series(y_imputed, logical_type=y.ww.logical_type) + if isinstance(y, pd.Series): + y_imputed = ww.init_series(y_imputed, logical_type=y.ww.logical_type) + else: + y_original_schema = y.ww.schema.get_subset_schema( + list(y_imputed.columns), + ) + y_new_ltypes = { + col: _determine_non_nullable_equivalent(ltype) + for col, ltype in y_original_schema.logical_types.items() + } + y_imputed.ww.init(schema=y_original_schema, logical_types=y_new_ltypes) return X_not_all_null, y_imputed @@ -234,7 +255,9 @@ def _handle_nullable_types(self, X=None, y=None): Args: X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features]. May contain nullable types. - y (pd.Series, optional): The target of length [n_samples]. May contain nullable types. + y (pd.Series or pd.DataFrame, optional): The target of length [n_samples] or the + unstacked target for a multiseries problem of length [n_samples, n_features*n_series]. + May contain nullable types. Returns: X, y with any incompatible nullable types downcasted to compatible equivalents when interpolate is used. Is NoOp otherwise. @@ -242,7 +265,12 @@ def _handle_nullable_types(self, X=None, y=None): if self._impute_target == "interpolate": # For BooleanNullable, we have to avoid Categorical columns # since the category dtype also has incompatibilities with linear interpolate, which is expected - if isinstance(y.ww.logical_type, BooleanNullable): + # TODO: Avoid categorical columns for BooleanNullable in multiseries when + # multiseries timeseries supports categorical + if isinstance(y, pd.Series) and isinstance( + y.ww.logical_type, + BooleanNullable, + ): y = ww.init_series(y, Double) else: _, y = super()._handle_nullable_types(None, y) diff --git a/evalml/tests/component_tests/test_time_series_imputer.py b/evalml/tests/component_tests/test_time_series_imputer.py index fc3e1d4f3c..20ba00823b 100644 --- a/evalml/tests/component_tests/test_time_series_imputer.py +++ b/evalml/tests/component_tests/test_time_series_imputer.py @@ -722,3 +722,78 @@ def test_time_series_imputer_nullable_type_incompatibility( _, nullable_series = imputer._handle_nullable_types(None, nullable_series) nullable_series.interpolate() + + +@pytest.mark.parametrize( + "nans_present", + [True, False], +) +def test_time_series_imputer_multiseries( + multiseries_ts_data_unstacked, + nans_present, +): + X, y = multiseries_ts_data_unstacked + imputer = TimeSeriesImputer(target_impute_strategy="interpolate") + + if nans_present: + for count, col in enumerate(y, start=1): + y[col][count] = np.nan + + imputer.fit(X, y) + assert imputer._y_all_null_cols == [] + + _, y_imputed = imputer.transform(X, y) + assert isinstance(y_imputed, pd.DataFrame) + + y_expected = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)}) + assert_frame_equal(y_imputed, y_expected, check_dtype=False) + + +@pytest.mark.parametrize( + "num_nan_cols", + [1, 2, 3], +) +@pytest.mark.parametrize( + "nan_in_other_cols", + [True, False], +) +def test_time_series_imputer_multiseries_some_columns_all_nan( + multiseries_ts_data_unstacked, + num_nan_cols, + nan_in_other_cols, +): + X, y = multiseries_ts_data_unstacked + imputer = TimeSeriesImputer(target_impute_strategy="interpolate") + + for count, col in enumerate(y, start=1): + if count <= num_nan_cols: + y[col] = np.nan + if count == num_nan_cols and not nan_in_other_cols: + break + else: + y[col][count] = np.nan + + imputer.fit(X, y) + _, y_imputed = imputer.transform(X, y) + + y_expected = pd.DataFrame( + {f"target_{i}": range(i, 100, 5) for i in range(num_nan_cols, 5)}, + ) + assert_frame_equal(y_imputed, y_expected, check_dtype=False) + + +def test_imputer_multiseries_drops_columns_with_all_nan(multiseries_ts_data_unstacked): + X, y = multiseries_ts_data_unstacked + for col in y: + y[col] = np.nan + imputer = TimeSeriesImputer(target_impute_strategy="interpolate") + imputer.fit(X, y) + assert imputer._y_all_null_cols == y.columns.tolist() + _, y_imputed = imputer.transform(X, y) + expected = y.drop(y.columns.tolist(), axis=1) + assert_frame_equal( + y_imputed, + expected, + check_column_type=False, + check_index_type=False, + ) diff --git a/evalml/tests/utils_tests/test_nullable_type_utils.py b/evalml/tests/utils_tests/test_nullable_type_utils.py index 05afce3bf2..ebbb10cbb3 100644 --- a/evalml/tests/utils_tests/test_nullable_type_utils.py +++ b/evalml/tests/utils_tests/test_nullable_type_utils.py @@ -53,11 +53,24 @@ def test_determine_downcast_type(nullable_type_target, nullable_ltype, has_nans) @pytest.mark.parametrize( - "downcast_util, data_type", - [(_downcast_nullable_X, "X"), (_downcast_nullable_y, "y")], + "downcast_util, data_type, y_type", + [ + (_downcast_nullable_X, "X", "series"), + (_downcast_nullable_y, "y", "series"), + (_downcast_nullable_y, "y", "dataframe"), + ], ) -def test_downcast_utils_handle_woodwork_not_init(X_y_binary, downcast_util, data_type): - X, y = X_y_binary +def test_downcast_utils_handle_woodwork_not_init( + X_y_binary, + multiseries_ts_data_unstacked, + downcast_util, + data_type, + y_type, +): + if y_type == "series": + X, y = X_y_binary + else: + X, y = multiseries_ts_data_unstacked # Remove woodwork types if data_type == "X": data = X.copy() diff --git a/evalml/utils/nullable_type_utils.py b/evalml/utils/nullable_type_utils.py index 0cfa6515ff..afe84e1a93 100644 --- a/evalml/utils/nullable_type_utils.py +++ b/evalml/utils/nullable_type_utils.py @@ -1,4 +1,4 @@ -import woodwork as ww +import pandas as pd from woodwork.logical_types import AgeNullable, BooleanNullable, IntegerNullable DOWNCAST_TYPE_DICT = { @@ -48,7 +48,7 @@ def _downcast_nullable_y(y, handle_boolean_nullable=True, handle_integer_nullabl to other dtypes via Woodwork logical type transformations. Args: - y (pd.Series): Target data of shape [n_samples] whose nullable types will be changed. + y (pd.Series or pd.DataFrame): Target data of shape [n_samples] or [n_samples, n_features*n_series] whose nullable types will be changed. handle_boolean_nullable (bool, optional): Whether or not to downcast data with BooleanNullable logical types. handle_integer_nullable (bool, optional): Whether or not to downcast data with IntegerNullable or AgeNullable logical types. @@ -57,16 +57,20 @@ def _downcast_nullable_y(y, handle_boolean_nullable=True, handle_integer_nullabl y with any incompatible nullable types downcasted to compatible equivalents. """ if y.ww.schema is None: - y = ww.init_series(y) + y.ww.init() incompatible_logical_types = _get_incompatible_nullable_types( handle_boolean_nullable, handle_integer_nullable, ) - if isinstance(y.ww.logical_type, tuple(incompatible_logical_types)): - new_ltype = _determine_downcast_type(y) - return y.ww.set_logical_type(new_ltype) + if isinstance(y, pd.DataFrame): + y = _downcast_nullable_X(y) + + else: + if isinstance(y.ww.logical_type, tuple(incompatible_logical_types)): + new_ltype = _determine_downcast_type(y) + return y.ww.set_logical_type(new_ltype) return y