From c8addb494985a5c6d9c7a28abe9b855b754fc0b1 Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Mon, 28 Aug 2023 15:21:26 -0700 Subject: [PATCH 01/15] time series imputer multiseries support --- .../imputers/time_series_imputer.py | 43 ++++++++++++++----- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/evalml/pipelines/components/transformers/imputers/time_series_imputer.py b/evalml/pipelines/components/transformers/imputers/time_series_imputer.py index 35e9ba8193..5f53296c85 100644 --- a/evalml/pipelines/components/transformers/imputers/time_series_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/time_series_imputer.py @@ -93,6 +93,7 @@ def __init__( self._backwards_cols = None self._interpolate_cols = None self._impute_target = None + self._y_all_null_cols = None super().__init__( parameters=parameters, component_obj=None, @@ -137,11 +138,17 @@ def _filter_cols(impute_strat, X): self._backwards_cols = _filter_cols("backwards_fill", X) self._interpolate_cols = _filter_cols("interpolate", X) - if y is not None: + if isinstance(y, pd.Series): y = infer_feature_types(y) if y.isnull().any(): self._impute_target = self.parameters["target_impute_strategy"] + elif isinstance(y, pd.DataFrame): + y = infer_feature_types(y) + y_nan_ratio = X.isna().sum() / X.shape[0] + self._y_all_null_cols = y_nan_ratio[nan_ratio == 1].index.tolist() + if y.isnull().values.any(): + self._impute_target = self.parameters["target_impute_strategy"] return self def transform(self, X, y=None): @@ -212,19 +219,33 @@ def transform(self, X, y=None): new_ltypes.update(new_int_ltypes) X_not_all_null.ww.init(schema=original_schema, logical_types=new_ltypes) - y_imputed = pd.Series(y) + y_imputed = ( + y.ww.drop(self._y_all_null_cols) + if isinstance(y, pd.DataFrame) + else pd.Series(y) + ) if y is not None and len(y) > 0: if self._impute_target == "forwards_fill": - y_imputed = y.pad() + y_imputed = y_imputed.pad() y_imputed.bfill(inplace=True) elif self._impute_target == "backwards_fill": - y_imputed = y.bfill() + y_imputed = y_imputed.bfill() y_imputed.pad(inplace=True) elif self._impute_target == "interpolate": - y_imputed = y.interpolate() + y_imputed = y_imputed.interpolate() y_imputed.bfill(inplace=True) # Re-initialize woodwork with the downcast logical type - y_imputed = ww.init_series(y_imputed, logical_type=y.ww.logical_type) + if isinstance(y, pd.Series): + y_imputed = ww.init_series(y_imputed, logical_type=y.ww.logical_type) + else: + y_original_schema = y.ww.schema.get_subset_schema( + list(y_imputed.columns), + ) + y_new_ltypes = { + col: _determine_non_nullable_equivalent(ltype) + for col, ltype in y_original_schema.logical_types.items() + } + y_imputed.ww.init(schema=y_original_schema, logical_types=y_new_ltypes) return X_not_all_null, y_imputed @@ -242,10 +263,12 @@ def _handle_nullable_types(self, X=None, y=None): if self._impute_target == "interpolate": # For BooleanNullable, we have to avoid Categorical columns # since the category dtype also has incompatibilities with linear interpolate, which is expected - if isinstance(y.ww.logical_type, BooleanNullable): - y = ww.init_series(y, Double) - else: - _, y = super()._handle_nullable_types(None, y) + # TODO: Avoid categorical columns for multiseries when multiseries timeseries supports categorical + if isinstance(y, pd.Series): + if isinstance(y.ww.logical_type, BooleanNullable): + y = ww.init_series(y, Double) + else: + _, y = super()._handle_nullable_types(None, y) if self._interpolate_cols is not None: X, _ = super()._handle_nullable_types(X, None) From eb96bca9853bb296733eb061840b873c57c0afca Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Tue, 29 Aug 2023 14:08:16 -0700 Subject: [PATCH 02/15] Fixed interpolate error --- evalml/pipelines/components/component_base.py | 23 +++++++++++++------ .../imputers/time_series_imputer.py | 22 ++++++++++-------- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py index 12b6603bb4..80fb4f9451 100644 --- a/evalml/pipelines/components/component_base.py +++ b/evalml/pipelines/components/component_base.py @@ -3,6 +3,7 @@ from abc import ABC, abstractmethod import cloudpickle +import pandas as pd from evalml.exceptions import MethodPropertyNotFoundError from evalml.pipelines.components.component_base_meta import ComponentBaseMeta @@ -256,7 +257,8 @@ def _handle_nullable_types(self, X=None, y=None): Args: X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features]. May contain nullable types. - y (pd.Series, optional): The target of length [n_samples]. May contain nullable types. + y (pd.Series or pd.DataFrame, optional): The target of length [n_samples] or the unstacked target for a multiseries problem. + May contain nullable types. Returns: X, y with any incompatible nullable types downcasted to compatible equivalents. @@ -273,10 +275,17 @@ def _handle_nullable_types(self, X=None, y=None): y_bool_incompatible = "y" in self._boolean_nullable_incompatibilities y_int_incompatible = "y" in self._integer_nullable_incompatibilities if y is not None and (y_bool_incompatible or y_int_incompatible): - y = _downcast_nullable_y( - y, - handle_boolean_nullable=y_bool_incompatible, - handle_integer_nullable=y_int_incompatible, - ) - + if isinstance(y, pd.Series): + y = _downcast_nullable_y( + y, + handle_boolean_nullable=y_bool_incompatible, + handle_integer_nullable=y_int_incompatible, + ) + # if y is a dataframe (from unstacked multiseries) use _downcast_nullable_X since downcast_nullable_y is for series + else: + y = _downcast_nullable_X( + y, + handle_boolean_nullable=y_bool_incompatible, + handle_integer_nullable=y_int_incompatible, + ) return X, y diff --git a/evalml/pipelines/components/transformers/imputers/time_series_imputer.py b/evalml/pipelines/components/transformers/imputers/time_series_imputer.py index 5f53296c85..7c864432ff 100644 --- a/evalml/pipelines/components/transformers/imputers/time_series_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/time_series_imputer.py @@ -145,8 +145,8 @@ def _filter_cols(impute_strat, X): elif isinstance(y, pd.DataFrame): y = infer_feature_types(y) - y_nan_ratio = X.isna().sum() / X.shape[0] - self._y_all_null_cols = y_nan_ratio[nan_ratio == 1].index.tolist() + y_nan_ratio = y.isna().sum() / y.shape[0] + self._y_all_null_cols = y_nan_ratio[y_nan_ratio == 1].index.tolist() if y.isnull().values.any(): self._impute_target = self.parameters["target_impute_strategy"] return self @@ -255,7 +255,8 @@ def _handle_nullable_types(self, X=None, y=None): Args: X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features]. May contain nullable types. - y (pd.Series, optional): The target of length [n_samples]. May contain nullable types. + y (pd.Series or pd.DataFrame, optional): The target of length [n_samples] or the unstacked target for a multiseries problem. + May contain nullable types. Returns: X, y with any incompatible nullable types downcasted to compatible equivalents when interpolate is used. Is NoOp otherwise. @@ -263,12 +264,15 @@ def _handle_nullable_types(self, X=None, y=None): if self._impute_target == "interpolate": # For BooleanNullable, we have to avoid Categorical columns # since the category dtype also has incompatibilities with linear interpolate, which is expected - # TODO: Avoid categorical columns for multiseries when multiseries timeseries supports categorical - if isinstance(y, pd.Series): - if isinstance(y.ww.logical_type, BooleanNullable): - y = ww.init_series(y, Double) - else: - _, y = super()._handle_nullable_types(None, y) + # TODO: Avoid categorical columns for BooleanNullable in multiseries when + # multiseries timeseries supports categorical + if isinstance(y, pd.Series) and isinstance( + y.ww.logical_type, + BooleanNullable, + ): + y = ww.init_series(y, Double) + else: + _, y = super()._handle_nullable_types(None, y) if self._interpolate_cols is not None: X, _ = super()._handle_nullable_types(X, None) From f4d532d83d6c9fa4206bbb736658d9e888a1a4a0 Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Tue, 29 Aug 2023 14:08:34 -0700 Subject: [PATCH 03/15] testing --- .../test_time_series_imputer.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/evalml/tests/component_tests/test_time_series_imputer.py b/evalml/tests/component_tests/test_time_series_imputer.py index fc3e1d4f3c..45354cb76a 100644 --- a/evalml/tests/component_tests/test_time_series_imputer.py +++ b/evalml/tests/component_tests/test_time_series_imputer.py @@ -722,3 +722,40 @@ def test_time_series_imputer_nullable_type_incompatibility( _, nullable_series = imputer._handle_nullable_types(None, nullable_series) nullable_series.interpolate() + + +@pytest.mark.parametrize( + "nans_present", + [True, False], +) +def test_time_series_imputer_multiseries(multiseries_ts_data_unstacked, nans_present): + X, y = multiseries_ts_data_unstacked + imputer = TimeSeriesImputer(target_impute_strategy="interpolate") + if nans_present: + c = 1 + for x in y: + y[x][c] = np.nan + c += 1 + imputer.fit(X, y) + assert imputer._y_all_null_cols == [] + _, y_imputed = imputer.transform(X, y) + assert isinstance(y_imputed, pd.DataFrame) + y_expected = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)}) + assert_frame_equal(y_imputed, y_expected, check_dtype=False) + + +def test_imputer_multiseries_drops_columns_with_all_nan(multiseries_ts_data_unstacked): + X, y = multiseries_ts_data_unstacked + for col in y: + y[col] = np.nan + imputer = TimeSeriesImputer(target_impute_strategy="interpolate") + imputer.fit(X, y) + assert imputer._y_all_null_cols == y.columns.tolist() + _, y_imputed = imputer.transform(X, y) + expected = y.drop(y.columns.tolist(), axis=1) + assert_frame_equal( + y_imputed, + expected, + check_column_type=False, + check_index_type=False, + ) From 278d95309d54165d911a5483f1a679297d37af5c Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Tue, 29 Aug 2023 14:19:08 -0700 Subject: [PATCH 04/15] updated release notes --- docs/source/release_notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index ed6c545a19..2124a0d6b1 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -4,6 +4,7 @@ Release Notes * Enhancements * Added support for prediction intervals for VARMAX regressor :pr:`4267` * Integrated multiseries time series into AutoMLSearch :pr:`4270` + * Extended TimeSeriesImputer to handle multiple series :pr:`4291` * Fixes * Fixed error when stacking data with no exogenous variables :pr:`4275` * Changes From 021dfec6878d2ad70d09e948f21597f8875e5deb Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Wed, 30 Aug 2023 13:36:45 -0700 Subject: [PATCH 05/15] added new test --- .../test_time_series_imputer.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/evalml/tests/component_tests/test_time_series_imputer.py b/evalml/tests/component_tests/test_time_series_imputer.py index 45354cb76a..b50cc92d46 100644 --- a/evalml/tests/component_tests/test_time_series_imputer.py +++ b/evalml/tests/component_tests/test_time_series_imputer.py @@ -725,17 +725,23 @@ def test_time_series_imputer_nullable_type_incompatibility( @pytest.mark.parametrize( - "nans_present", - [True, False], + "nans_present, nan_in_every_col", + [(True, True), (True, False), (False, False)], ) -def test_time_series_imputer_multiseries(multiseries_ts_data_unstacked, nans_present): +def test_time_series_imputer_multiseries( + multiseries_ts_data_unstacked, + nans_present, + nan_in_every_col, +): X, y = multiseries_ts_data_unstacked imputer = TimeSeriesImputer(target_impute_strategy="interpolate") if nans_present: - c = 1 + counter = 1 for x in y: - y[x][c] = np.nan - c += 1 + y[x][counter] = np.nan + counter += 1 + if not nan_in_every_col and counter > 3: + break imputer.fit(X, y) assert imputer._y_all_null_cols == [] _, y_imputed = imputer.transform(X, y) From c1acd1967d460749fcf3707aa3822b9688145751 Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Wed, 30 Aug 2023 13:37:35 -0700 Subject: [PATCH 06/15] fixed comment and reworked downcast y --- evalml/pipelines/components/component_base.py | 5 ++-- evalml/utils/nullable_type_utils.py | 26 +++++++++++++++---- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py index 80fb4f9451..7a52cec647 100644 --- a/evalml/pipelines/components/component_base.py +++ b/evalml/pipelines/components/component_base.py @@ -257,7 +257,8 @@ def _handle_nullable_types(self, X=None, y=None): Args: X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features]. May contain nullable types. - y (pd.Series or pd.DataFrame, optional): The target of length [n_samples] or the unstacked target for a multiseries problem. + y (pd.Series or pd.DataFrame, optional): The target of length [n_samples] or the + unstacked target for a multiseries problem of length [n_samples, n_features*n_series]. May contain nullable types. Returns: @@ -283,7 +284,7 @@ def _handle_nullable_types(self, X=None, y=None): ) # if y is a dataframe (from unstacked multiseries) use _downcast_nullable_X since downcast_nullable_y is for series else: - y = _downcast_nullable_X( + y = _downcast_nullable_y( y, handle_boolean_nullable=y_bool_incompatible, handle_integer_nullable=y_int_incompatible, diff --git a/evalml/utils/nullable_type_utils.py b/evalml/utils/nullable_type_utils.py index 0cfa6515ff..102c4e92c2 100644 --- a/evalml/utils/nullable_type_utils.py +++ b/evalml/utils/nullable_type_utils.py @@ -1,3 +1,4 @@ +import pandas as pd import woodwork as ww from woodwork.logical_types import AgeNullable, BooleanNullable, IntegerNullable @@ -48,7 +49,7 @@ def _downcast_nullable_y(y, handle_boolean_nullable=True, handle_integer_nullabl to other dtypes via Woodwork logical type transformations. Args: - y (pd.Series): Target data of shape [n_samples] whose nullable types will be changed. + y (pd.Series or pd.DataFrame): Target data of shape [n_samples] or [n_samples, n_features*n_series] whose nullable types will be changed. handle_boolean_nullable (bool, optional): Whether or not to downcast data with BooleanNullable logical types. handle_integer_nullable (bool, optional): Whether or not to downcast data with IntegerNullable or AgeNullable logical types. @@ -57,16 +58,31 @@ def _downcast_nullable_y(y, handle_boolean_nullable=True, handle_integer_nullabl y with any incompatible nullable types downcasted to compatible equivalents. """ if y.ww.schema is None: - y = ww.init_series(y) + if isinstance(y, pd.DataFrame): + y.ww.init() + else: + y = ww.init_series(y) incompatible_logical_types = _get_incompatible_nullable_types( handle_boolean_nullable, handle_integer_nullable, ) - if isinstance(y.ww.logical_type, tuple(incompatible_logical_types)): - new_ltype = _determine_downcast_type(y) - return y.ww.set_logical_type(new_ltype) + if isinstance(y, pd.DataFrame): + data_to_downcast = y.ww.select(incompatible_logical_types) + # If no incompatible types are present, no downcasting is needed + if not len(data_to_downcast.columns): + return y + new_ltypes = { + col: _determine_downcast_type(data_to_downcast.ww[col]) + for col in data_to_downcast.columns + } + y.ww.set_types(logical_types=new_ltypes) + + else: + if isinstance(y.ww.logical_type, tuple(incompatible_logical_types)): + new_ltype = _determine_downcast_type(y) + return y.ww.set_logical_type(new_ltype) return y From 24d5e056432ddb821d31090d648733fb5c39aaa5 Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Wed, 30 Aug 2023 13:43:33 -0700 Subject: [PATCH 07/15] removed stray comment --- evalml/pipelines/components/component_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py index 7a52cec647..6fa945c007 100644 --- a/evalml/pipelines/components/component_base.py +++ b/evalml/pipelines/components/component_base.py @@ -282,7 +282,6 @@ def _handle_nullable_types(self, X=None, y=None): handle_boolean_nullable=y_bool_incompatible, handle_integer_nullable=y_int_incompatible, ) - # if y is a dataframe (from unstacked multiseries) use _downcast_nullable_X since downcast_nullable_y is for series else: y = _downcast_nullable_y( y, From d5b465204dd18eb23105d56ed6403cb6ece338f6 Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Wed, 30 Aug 2023 17:32:01 -0700 Subject: [PATCH 08/15] codecov --- .../utils_tests/test_nullable_type_utils.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/evalml/tests/utils_tests/test_nullable_type_utils.py b/evalml/tests/utils_tests/test_nullable_type_utils.py index 05afce3bf2..ebbb10cbb3 100644 --- a/evalml/tests/utils_tests/test_nullable_type_utils.py +++ b/evalml/tests/utils_tests/test_nullable_type_utils.py @@ -53,11 +53,24 @@ def test_determine_downcast_type(nullable_type_target, nullable_ltype, has_nans) @pytest.mark.parametrize( - "downcast_util, data_type", - [(_downcast_nullable_X, "X"), (_downcast_nullable_y, "y")], + "downcast_util, data_type, y_type", + [ + (_downcast_nullable_X, "X", "series"), + (_downcast_nullable_y, "y", "series"), + (_downcast_nullable_y, "y", "dataframe"), + ], ) -def test_downcast_utils_handle_woodwork_not_init(X_y_binary, downcast_util, data_type): - X, y = X_y_binary +def test_downcast_utils_handle_woodwork_not_init( + X_y_binary, + multiseries_ts_data_unstacked, + downcast_util, + data_type, + y_type, +): + if y_type == "series": + X, y = X_y_binary + else: + X, y = multiseries_ts_data_unstacked # Remove woodwork types if data_type == "X": data = X.copy() From 96f43a1a26142d322f31ee41c8866a37f9120ed5 Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Thu, 31 Aug 2023 10:23:09 -0700 Subject: [PATCH 09/15] removed redundant code --- evalml/pipelines/components/component_base.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py index 6fa945c007..3375ffd233 100644 --- a/evalml/pipelines/components/component_base.py +++ b/evalml/pipelines/components/component_base.py @@ -3,7 +3,6 @@ from abc import ABC, abstractmethod import cloudpickle -import pandas as pd from evalml.exceptions import MethodPropertyNotFoundError from evalml.pipelines.components.component_base_meta import ComponentBaseMeta @@ -276,16 +275,9 @@ def _handle_nullable_types(self, X=None, y=None): y_bool_incompatible = "y" in self._boolean_nullable_incompatibilities y_int_incompatible = "y" in self._integer_nullable_incompatibilities if y is not None and (y_bool_incompatible or y_int_incompatible): - if isinstance(y, pd.Series): - y = _downcast_nullable_y( - y, - handle_boolean_nullable=y_bool_incompatible, - handle_integer_nullable=y_int_incompatible, - ) - else: - y = _downcast_nullable_y( - y, - handle_boolean_nullable=y_bool_incompatible, - handle_integer_nullable=y_int_incompatible, - ) + y = _downcast_nullable_y( + y, + handle_boolean_nullable=y_bool_incompatible, + handle_integer_nullable=y_int_incompatible, + ) return X, y From a12d92d79787606f50a6472bc73b46834c74e372 Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Thu, 31 Aug 2023 10:26:17 -0700 Subject: [PATCH 10/15] updated comments --- .../components/transformers/imputers/time_series_imputer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evalml/pipelines/components/transformers/imputers/time_series_imputer.py b/evalml/pipelines/components/transformers/imputers/time_series_imputer.py index 7c864432ff..e8d15f89d6 100644 --- a/evalml/pipelines/components/transformers/imputers/time_series_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/time_series_imputer.py @@ -255,7 +255,8 @@ def _handle_nullable_types(self, X=None, y=None): Args: X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features]. May contain nullable types. - y (pd.Series or pd.DataFrame, optional): The target of length [n_samples] or the unstacked target for a multiseries problem. + y (pd.Series or pd.DataFrame, optional): The target of length [n_samples] or the + unstacked target for a multiseries problem of length [n_samples, n_features*n_series]. May contain nullable types. Returns: From 39380012bb111994ca42210598286db689a90958 Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Thu, 31 Aug 2023 12:59:58 -0700 Subject: [PATCH 11/15] removed redundant code --- evalml/utils/nullable_type_utils.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/evalml/utils/nullable_type_utils.py b/evalml/utils/nullable_type_utils.py index 102c4e92c2..afe84e1a93 100644 --- a/evalml/utils/nullable_type_utils.py +++ b/evalml/utils/nullable_type_utils.py @@ -1,5 +1,4 @@ import pandas as pd -import woodwork as ww from woodwork.logical_types import AgeNullable, BooleanNullable, IntegerNullable DOWNCAST_TYPE_DICT = { @@ -58,10 +57,7 @@ def _downcast_nullable_y(y, handle_boolean_nullable=True, handle_integer_nullabl y with any incompatible nullable types downcasted to compatible equivalents. """ if y.ww.schema is None: - if isinstance(y, pd.DataFrame): - y.ww.init() - else: - y = ww.init_series(y) + y.ww.init() incompatible_logical_types = _get_incompatible_nullable_types( handle_boolean_nullable, @@ -69,15 +65,7 @@ def _downcast_nullable_y(y, handle_boolean_nullable=True, handle_integer_nullabl ) if isinstance(y, pd.DataFrame): - data_to_downcast = y.ww.select(incompatible_logical_types) - # If no incompatible types are present, no downcasting is needed - if not len(data_to_downcast.columns): - return y - new_ltypes = { - col: _determine_downcast_type(data_to_downcast.ww[col]) - for col in data_to_downcast.columns - } - y.ww.set_types(logical_types=new_ltypes) + y = _downcast_nullable_X(y) else: if isinstance(y.ww.logical_type, tuple(incompatible_logical_types)): From b3c839296e8f276a7de23b4d25aece9e331051fb Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Thu, 31 Aug 2023 13:05:10 -0700 Subject: [PATCH 12/15] tests --- .../test_time_series_imputer.py | 35 ++++++++++++++++--- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/evalml/tests/component_tests/test_time_series_imputer.py b/evalml/tests/component_tests/test_time_series_imputer.py index b50cc92d46..9fb0b2d737 100644 --- a/evalml/tests/component_tests/test_time_series_imputer.py +++ b/evalml/tests/component_tests/test_time_series_imputer.py @@ -735,21 +735,46 @@ def test_time_series_imputer_multiseries( ): X, y = multiseries_ts_data_unstacked imputer = TimeSeriesImputer(target_impute_strategy="interpolate") + if nans_present: - counter = 1 - for x in y: - y[x][counter] = np.nan - counter += 1 - if not nan_in_every_col and counter > 3: + for count, col in enumerate(y, start=1): + y[col][count] = np.nan + if not nan_in_every_col and count > len(y) // 2: break imputer.fit(X, y) assert imputer._y_all_null_cols == [] + _, y_imputed = imputer.transform(X, y) assert isinstance(y_imputed, pd.DataFrame) + y_expected = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)}) assert_frame_equal(y_imputed, y_expected, check_dtype=False) +@pytest.mark.parametrize( + "num_nan_cols", + [1, 2, 3], +) +def test_time_series_imputer_multiseries_some_columns_all_nan( + multiseries_ts_data_unstacked, + num_nan_cols, +): + X, y = multiseries_ts_data_unstacked + imputer = TimeSeriesImputer(target_impute_strategy="interpolate") + + for count, col in enumerate(y, start=1): + y[col] = np.nan + if count == num_nan_cols: + break + imputer.fit(X, y) + _, y_imputed = imputer.transform(X, y) + + y_expected = pd.DataFrame( + {f"target_{i}": range(i, 100, 5) for i in range(num_nan_cols, 5)}, + ) + assert_frame_equal(y_imputed, y_expected, check_dtype=False) + + def test_imputer_multiseries_drops_columns_with_all_nan(multiseries_ts_data_unstacked): X, y = multiseries_ts_data_unstacked for col in y: From 3f8118abe76dbb80fbe6841fa5bdce74a486cff3 Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Thu, 31 Aug 2023 15:33:02 -0700 Subject: [PATCH 13/15] added some nans --- .../component_tests/test_time_series_imputer.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/evalml/tests/component_tests/test_time_series_imputer.py b/evalml/tests/component_tests/test_time_series_imputer.py index 9fb0b2d737..887a85da4d 100644 --- a/evalml/tests/component_tests/test_time_series_imputer.py +++ b/evalml/tests/component_tests/test_time_series_imputer.py @@ -739,7 +739,7 @@ def test_time_series_imputer_multiseries( if nans_present: for count, col in enumerate(y, start=1): y[col][count] = np.nan - if not nan_in_every_col and count > len(y) // 2: + if not nan_in_every_col and count >= len(y) // 2: break imputer.fit(X, y) assert imputer._y_all_null_cols == [] @@ -755,17 +755,26 @@ def test_time_series_imputer_multiseries( "num_nan_cols", [1, 2, 3], ) +@pytest.mark.parametrize( + "nan_in_other_cols", + [True, False], +) def test_time_series_imputer_multiseries_some_columns_all_nan( multiseries_ts_data_unstacked, num_nan_cols, + nan_in_other_cols, ): X, y = multiseries_ts_data_unstacked imputer = TimeSeriesImputer(target_impute_strategy="interpolate") for count, col in enumerate(y, start=1): - y[col] = np.nan - if count == num_nan_cols: + if count <= num_nan_cols: + y[col] = np.nan + if count == num_nan_cols and not nan_in_other_cols: break + else: + y[col][count] = np.nan + imputer.fit(X, y) _, y_imputed = imputer.transform(X, y) From 5435b192d9926d7ae34810fbdef9a044e2946e8e Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Thu, 31 Aug 2023 15:57:03 -0700 Subject: [PATCH 14/15] maybe codecov will pass now --- evalml/tests/component_tests/test_time_series_imputer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/evalml/tests/component_tests/test_time_series_imputer.py b/evalml/tests/component_tests/test_time_series_imputer.py index 887a85da4d..e7763a63b6 100644 --- a/evalml/tests/component_tests/test_time_series_imputer.py +++ b/evalml/tests/component_tests/test_time_series_imputer.py @@ -725,8 +725,8 @@ def test_time_series_imputer_nullable_type_incompatibility( @pytest.mark.parametrize( - "nans_present, nan_in_every_col", - [(True, True), (True, False), (False, False)], + "nans_present", + [True, False], ) def test_time_series_imputer_multiseries( multiseries_ts_data_unstacked, @@ -739,8 +739,7 @@ def test_time_series_imputer_multiseries( if nans_present: for count, col in enumerate(y, start=1): y[col][count] = np.nan - if not nan_in_every_col and count >= len(y) // 2: - break + imputer.fit(X, y) assert imputer._y_all_null_cols == [] From 41c277e02d836146c4bbaa8d936ab44872fb0381 Mon Sep 17 00:00:00 2001 From: MichaelFu512 Date: Thu, 31 Aug 2023 16:21:57 -0700 Subject: [PATCH 15/15] fix test --- evalml/tests/component_tests/test_time_series_imputer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/evalml/tests/component_tests/test_time_series_imputer.py b/evalml/tests/component_tests/test_time_series_imputer.py index e7763a63b6..20ba00823b 100644 --- a/evalml/tests/component_tests/test_time_series_imputer.py +++ b/evalml/tests/component_tests/test_time_series_imputer.py @@ -731,7 +731,6 @@ def test_time_series_imputer_nullable_type_incompatibility( def test_time_series_imputer_multiseries( multiseries_ts_data_unstacked, nans_present, - nan_in_every_col, ): X, y = multiseries_ts_data_unstacked imputer = TimeSeriesImputer(target_impute_strategy="interpolate")