alteryx · MichaelFu512 · Sep 5, 2023 · Aug 28, 2023 · Aug 29, 2023 · Aug 29, 2023
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -4,6 +4,7 @@ Release Notes
  * Enhancements
  * Added support for prediction intervals for VARMAX regressor :pr:`4267`
  * Integrated multiseries time series into AutoMLSearch :pr:`4270`
+ * Extended TimeSeriesImputer to handle multiple series :pr:`4291`
  * Fixes
  * Fixed error when stacking data with no exogenous variables :pr:`4275`
  * Changes

diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py
@@ -3,6 +3,7 @@
 from abc import ABC, abstractmethod
 
 import cloudpickle
+import pandas as pd
 
 from evalml.exceptions import MethodPropertyNotFoundError
 from evalml.pipelines.components.component_base_meta import ComponentBaseMeta
@@ -256,7 +257,8 @@
  Args:
  X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features].
  May contain nullable types.
- y (pd.Series, optional): The target of length [n_samples]. May contain nullable types.
+ y (pd.Series or pd.DataFrame, optional): The target of length [n_samples] or the unstacked target for a multiseries problem.
+ May contain nullable types.
 
  Returns:
  X, y with any incompatible nullable types downcasted to compatible equivalents.
@@ -273,10 +275,17 @@
  y_bool_incompatible = "y" in self._boolean_nullable_incompatibilities
  y_int_incompatible = "y" in self._integer_nullable_incompatibilities
  if y is not None and (y_bool_incompatible or y_int_incompatible):
- y = _downcast_nullable_y(
- y,
- handle_boolean_nullable=y_bool_incompatible,
- handle_integer_nullable=y_int_incompatible,
- )
-
+ if isinstance(y, pd.Series):
+ y = _downcast_nullable_y(
+ y,
+ handle_boolean_nullable=y_bool_incompatible,
+ handle_integer_nullable=y_int_incompatible,
+ )
+ # If y is a DataFrame (unstacked multiseries target), use _downcast_nullable_X, since _downcast_nullable_y only handles a Series.
+ else:
+ y = _downcast_nullable_X(
+ y,
+ handle_boolean_nullable=y_bool_incompatible,
+ handle_integer_nullable=y_int_incompatible,
+ )
  return X, y
diff --git a/evalml/pipelines/components/transformers/imputers/time_series_imputer.py b/evalml/pipelines/components/transformers/imputers/time_series_imputer.py
@@ -93,6 +93,7 @@
  self._backwards_cols = None
  self._interpolate_cols = None
  self._impute_target = None
+ self._y_all_null_cols = None
  super().__init__(
  parameters=parameters,
  component_obj=None,
@@ -137,11 +138,17 @@
  self._backwards_cols = _filter_cols("backwards_fill", X)
  self._interpolate_cols = _filter_cols("interpolate", X)
 
- if y is not None:
+ if isinstance(y, pd.Series):
  y = infer_feature_types(y)
  if y.isnull().any():
  self._impute_target = self.parameters["target_impute_strategy"]
 
+ elif isinstance(y, pd.DataFrame):
+ y = infer_feature_types(y)
+ y_nan_ratio = y.isna().sum() / y.shape[0]
+ self._y_all_null_cols = y_nan_ratio[y_nan_ratio == 1].index.tolist()
+ if y.isnull().values.any():
+ self._impute_target = self.parameters["target_impute_strategy"]
  return self
 
  def transform(self, X, y=None):
@@ -212,19 +219,33 @@
  new_ltypes.update(new_int_ltypes)
  X_not_all_null.ww.init(schema=original_schema, logical_types=new_ltypes)
 
- y_imputed = pd.Series(y)
+ y_imputed = (
+ y.ww.drop(self._y_all_null_cols)
+ if isinstance(y, pd.DataFrame)
+ else pd.Series(y)
+ )
  if y is not None and len(y) > 0:
  if self._impute_target == "forwards_fill":
- y_imputed = y.pad()
+ y_imputed = y_imputed.pad()
  y_imputed.bfill(inplace=True)
  elif self._impute_target == "backwards_fill":
- y_imputed = y.bfill()
+ y_imputed = y_imputed.bfill()
  y_imputed.pad(inplace=True)
  elif self._impute_target == "interpolate":
- y_imputed = y.interpolate()
+ y_imputed = y_imputed.interpolate()
  y_imputed.bfill(inplace=True)
  # Re-initialize woodwork with the downcast logical type
- y_imputed = ww.init_series(y_imputed, logical_type=y.ww.logical_type)
+ if isinstance(y, pd.Series):
+ y_imputed = ww.init_series(y_imputed, logical_type=y.ww.logical_type)
+ else:
+ y_original_schema = y.ww.schema.get_subset_schema(
+ list(y_imputed.columns),
+ )
+ y_new_ltypes = {
+ col: _determine_non_nullable_equivalent(ltype)
+ for col, ltype in y_original_schema.logical_types.items()
+ }
+ y_imputed.ww.init(schema=y_original_schema, logical_types=y_new_ltypes)
 
  return X_not_all_null, y_imputed
 
@@ -234,15 +255,21 @@
  Args:
  X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features].
  May contain nullable types.
- y (pd.Series, optional): The target of length [n_samples]. May contain nullable types.
+ y (pd.Series or pd.DataFrame, optional): The target of length [n_samples] or the unstacked target for a multiseries problem.
+ May contain nullable types.
 
  Returns:
  X, y with any incompatible nullable types downcasted to compatible equivalents when interpolate is used. Is NoOp otherwise.
  """
  if self._impute_target == "interpolate":
  # For BooleanNullable, we have to avoid Categorical columns
  # since the category dtype also has incompatibilities with linear interpolate, which is expected
- if isinstance(y.ww.logical_type, BooleanNullable):
+ # TODO: Avoid categorical columns for BooleanNullable in multiseries when
+ # multiseries timeseries supports categorical
+ if isinstance(y, pd.Series) and isinstance(
+ y.ww.logical_type,
+ BooleanNullable,
+ ):
  y = ww.init_series(y, Double)
  else:
  _, y = super()._handle_nullable_types(None, y)

diff --git a/evalml/tests/component_tests/test_time_series_imputer.py b/evalml/tests/component_tests/test_time_series_imputer.py
@@ -722,3 +722,40 @@
  _, nullable_series = imputer._handle_nullable_types(None, nullable_series)
 
  nullable_series.interpolate()
+
+
+@pytest.mark.parametrize(
+    "nans_present",
+    [True, False],
+)
+def test_time_series_imputer_multiseries(multiseries_ts_data_unstacked, nans_present):
+    """Interpolation restores every series of an unstacked multiseries target."""
+    X, y = multiseries_ts_data_unstacked
+    imputer = TimeSeriesImputer(target_impute_strategy="interpolate")
+    if nans_present:
+        # One gap per series, on a different row each; .loc avoids chained assignment.
+        for row, col in enumerate(y.columns, start=1):
+            y.loc[row, col] = np.nan
+    imputer.fit(X, y)
+    assert imputer._y_all_null_cols == []
+    _, y_imputed = imputer.transform(X, y)
+    assert isinstance(y_imputed, pd.DataFrame)
+    y_expected = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)})
+    assert_frame_equal(y_imputed, y_expected, check_dtype=False)
+
+
+def test_imputer_multiseries_drops_columns_with_all_nan(multiseries_ts_data_unstacked):
+    """Target columns that are entirely NaN at fit time are dropped by transform."""
+    X, y = multiseries_ts_data_unstacked
+    for series_name in y:
+        y[series_name] = np.nan
+    all_series = y.columns.tolist()
+
+    imputer = TimeSeriesImputer(target_impute_strategy="interpolate")
+    imputer.fit(X, y)
+    assert imputer._y_all_null_cols == all_series
+    _, y_imputed = imputer.transform(X, y)
+    expected = y.drop(columns=all_series)
+    assert_frame_equal(
+        y_imputed, expected, check_column_type=False, check_index_type=False
+    )