Skip to content

Commit

Permalink
Extend TimeSeriesImputer to handle multiple series (#4291)
Browse files Browse the repository at this point in the history
* Multiseries timeseries imputer support
  • Loading branch information
MichaelFu512 committed Sep 5, 2023
1 parent 69344b2 commit 93e7b97
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 20 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Release Notes
* Enhancements
* Added support for prediction intervals for VARMAX regressor :pr:`4267`
* Integrated multiseries time series into AutoMLSearch :pr:`4270`
* Extended TimeSeriesImputer to handle multiple series :pr:`4291`
* Fixes
* Fixed error when stacking data with no exogenous variables :pr:`4275`
* Changes
Expand Down
5 changes: 3 additions & 2 deletions evalml/pipelines/components/component_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,9 @@ def _handle_nullable_types(self, X=None, y=None):
Args:
X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features].
May contain nullable types.
y (pd.Series, optional): The target of length [n_samples]. May contain nullable types.
y (pd.Series or pd.DataFrame, optional): The target of length [n_samples] or the
unstacked target for a multiseries problem of shape [n_samples, n_features*n_series].
May contain nullable types.
Returns:
X, y with any incompatible nullable types downcasted to compatible equivalents.
Expand All @@ -278,5 +280,4 @@ def _handle_nullable_types(self, X=None, y=None):
handle_boolean_nullable=y_bool_incompatible,
handle_integer_nullable=y_int_incompatible,
)

return X, y
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def __init__(
self._backwards_cols = None
self._interpolate_cols = None
self._impute_target = None
self._y_all_null_cols = None
super().__init__(
parameters=parameters,
component_obj=None,
Expand Down Expand Up @@ -137,11 +138,17 @@ def _filter_cols(impute_strat, X):
self._backwards_cols = _filter_cols("backwards_fill", X)
self._interpolate_cols = _filter_cols("interpolate", X)

if y is not None:
if isinstance(y, pd.Series):
y = infer_feature_types(y)
if y.isnull().any():
self._impute_target = self.parameters["target_impute_strategy"]

elif isinstance(y, pd.DataFrame):
y = infer_feature_types(y)
y_nan_ratio = y.isna().sum() / y.shape[0]
self._y_all_null_cols = y_nan_ratio[y_nan_ratio == 1].index.tolist()
if y.isnull().values.any():
self._impute_target = self.parameters["target_impute_strategy"]
return self

def transform(self, X, y=None):
Expand Down Expand Up @@ -212,19 +219,33 @@ def transform(self, X, y=None):
new_ltypes.update(new_int_ltypes)
X_not_all_null.ww.init(schema=original_schema, logical_types=new_ltypes)

y_imputed = pd.Series(y)
y_imputed = (
y.ww.drop(self._y_all_null_cols)
if isinstance(y, pd.DataFrame)
else pd.Series(y)
)
if y is not None and len(y) > 0:
if self._impute_target == "forwards_fill":
y_imputed = y.pad()
y_imputed = y_imputed.pad()
y_imputed.bfill(inplace=True)
elif self._impute_target == "backwards_fill":
y_imputed = y.bfill()
y_imputed = y_imputed.bfill()
y_imputed.pad(inplace=True)
elif self._impute_target == "interpolate":
y_imputed = y.interpolate()
y_imputed = y_imputed.interpolate()
y_imputed.bfill(inplace=True)
# Re-initialize woodwork with the downcast logical type
y_imputed = ww.init_series(y_imputed, logical_type=y.ww.logical_type)
if isinstance(y, pd.Series):
y_imputed = ww.init_series(y_imputed, logical_type=y.ww.logical_type)
else:
y_original_schema = y.ww.schema.get_subset_schema(
list(y_imputed.columns),
)
y_new_ltypes = {
col: _determine_non_nullable_equivalent(ltype)
for col, ltype in y_original_schema.logical_types.items()
}
y_imputed.ww.init(schema=y_original_schema, logical_types=y_new_ltypes)

return X_not_all_null, y_imputed

Expand All @@ -234,15 +255,22 @@ def _handle_nullable_types(self, X=None, y=None):
Args:
X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features].
May contain nullable types.
y (pd.Series, optional): The target of length [n_samples]. May contain nullable types.
y (pd.Series or pd.DataFrame, optional): The target of length [n_samples] or the
unstacked target for a multiseries problem of shape [n_samples, n_features*n_series].
May contain nullable types.
Returns:
X, y with any incompatible nullable types downcasted to compatible equivalents when interpolate is used. Is NoOp otherwise.
"""
if self._impute_target == "interpolate":
# For BooleanNullable, we have to avoid Categorical columns
# since the category dtype also has incompatibilities with linear interpolate, which is expected
if isinstance(y.ww.logical_type, BooleanNullable):
# TODO: Avoid categorical columns for BooleanNullable in multiseries when
# multiseries timeseries supports categorical
if isinstance(y, pd.Series) and isinstance(
y.ww.logical_type,
BooleanNullable,
):
y = ww.init_series(y, Double)
else:
_, y = super()._handle_nullable_types(None, y)
Expand Down
75 changes: 75 additions & 0 deletions evalml/tests/component_tests/test_time_series_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -722,3 +722,78 @@ def test_time_series_imputer_nullable_type_incompatibility(
_, nullable_series = imputer._handle_nullable_types(None, nullable_series)

nullable_series.interpolate()


@pytest.mark.parametrize(
    "nans_present",
    [True, False],
)
def test_time_series_imputer_multiseries(
    multiseries_ts_data_unstacked,
    nans_present,
):
    """Imputer interpolates a multiseries (DataFrame) target and returns a DataFrame.

    With `nans_present`, one NaN is planted per target column (at row index equal
    to the 1-based column position) to exercise interpolation; either way the
    imputed result must match the original NaN-free series values.
    """
    X, y = multiseries_ts_data_unstacked
    imputer = TimeSeriesImputer(target_impute_strategy="interpolate")

    if nans_present:
        for count, col in enumerate(y, start=1):
            # Use .loc instead of chained assignment (y[col][count] = ...),
            # which does not write through under pandas copy-on-write and is
            # removed in pandas 3.0. Assumes a default RangeIndex, so the
            # label `count` matches the row position — TODO confirm fixture.
            y.loc[count, col] = np.nan

    imputer.fit(X, y)
    # No column is entirely NaN, so nothing should be scheduled for dropping.
    assert imputer._y_all_null_cols == []

    _, y_imputed = imputer.transform(X, y)
    assert isinstance(y_imputed, pd.DataFrame)

    # Fixture targets are arithmetic sequences: target_i = i, i+5, i+10, ...
    y_expected = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)})
    assert_frame_equal(y_imputed, y_expected, check_dtype=False)


@pytest.mark.parametrize(
    "num_nan_cols",
    [1, 2, 3],
)
@pytest.mark.parametrize(
    "nan_in_other_cols",
    [True, False],
)
def test_time_series_imputer_multiseries_some_columns_all_nan(
    multiseries_ts_data_unstacked,
    num_nan_cols,
    nan_in_other_cols,
):
    """All-NaN target columns are dropped; remaining columns are imputed.

    The first `num_nan_cols` columns are set entirely to NaN. When
    `nan_in_other_cols` is True, the remaining columns each get a single NaN
    as well (at the row matching the 1-based column position).
    """
    X, y = multiseries_ts_data_unstacked
    imputer = TimeSeriesImputer(target_impute_strategy="interpolate")

    for count, col in enumerate(y, start=1):
        if count <= num_nan_cols:
            y[col] = np.nan
        if count == num_nan_cols and not nan_in_other_cols:
            break
        else:
            # Use .loc instead of chained assignment (y[col][count] = ...),
            # which does not write through under pandas copy-on-write and is
            # removed in pandas 3.0. Assumes a default RangeIndex, so the
            # label `count` matches the row position — TODO confirm fixture.
            y.loc[count, col] = np.nan

    imputer.fit(X, y)
    _, y_imputed = imputer.transform(X, y)

    # Only the columns that were not entirely NaN survive the transform.
    y_expected = pd.DataFrame(
        {f"target_{i}": range(i, 100, 5) for i in range(num_nan_cols, 5)},
    )
    assert_frame_equal(y_imputed, y_expected, check_dtype=False)


def test_imputer_multiseries_drops_columns_with_all_nan(multiseries_ts_data_unstacked):
    """When every target column is entirely NaN, all of them are dropped."""
    X, y = multiseries_ts_data_unstacked

    # Blank out every series in the unstacked target.
    for column in y.columns:
        y[column] = np.nan

    imputer = TimeSeriesImputer(target_impute_strategy="interpolate")
    imputer.fit(X, y)
    # fit must have recorded every column as all-null.
    assert imputer._y_all_null_cols == y.columns.tolist()

    _, y_imputed = imputer.transform(X, y)
    # Expected result: the same frame with every column removed (index kept).
    expected = y.drop(columns=y.columns.tolist())
    assert_frame_equal(
        y_imputed,
        expected,
        check_column_type=False,
        check_index_type=False,
    )
21 changes: 17 additions & 4 deletions evalml/tests/utils_tests/test_nullable_type_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,24 @@ def test_determine_downcast_type(nullable_type_target, nullable_ltype, has_nans)


@pytest.mark.parametrize(
"downcast_util, data_type",
[(_downcast_nullable_X, "X"), (_downcast_nullable_y, "y")],
"downcast_util, data_type, y_type",
[
(_downcast_nullable_X, "X", "series"),
(_downcast_nullable_y, "y", "series"),
(_downcast_nullable_y, "y", "dataframe"),
],
)
def test_downcast_utils_handle_woodwork_not_init(X_y_binary, downcast_util, data_type):
X, y = X_y_binary
def test_downcast_utils_handle_woodwork_not_init(
X_y_binary,
multiseries_ts_data_unstacked,
downcast_util,
data_type,
y_type,
):
if y_type == "series":
X, y = X_y_binary
else:
X, y = multiseries_ts_data_unstacked
# Remove woodwork types
if data_type == "X":
data = X.copy()
Expand Down
16 changes: 10 additions & 6 deletions evalml/utils/nullable_type_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import woodwork as ww
import pandas as pd
from woodwork.logical_types import AgeNullable, BooleanNullable, IntegerNullable

DOWNCAST_TYPE_DICT = {
Expand Down Expand Up @@ -48,7 +48,7 @@ def _downcast_nullable_y(y, handle_boolean_nullable=True, handle_integer_nullabl
to other dtypes via Woodwork logical type transformations.
Args:
y (pd.Series): Target data of shape [n_samples] whose nullable types will be changed.
y (pd.Series or pd.DataFrame): Target data of shape [n_samples] or [n_samples, n_features*n_series] whose nullable types will be changed.
handle_boolean_nullable (bool, optional): Whether or not to downcast data with BooleanNullable logical types.
handle_integer_nullable (bool, optional): Whether or not to downcast data with IntegerNullable or AgeNullable logical types.
Expand All @@ -57,16 +57,20 @@ def _downcast_nullable_y(y, handle_boolean_nullable=True, handle_integer_nullabl
y with any incompatible nullable types downcasted to compatible equivalents.
"""
if y.ww.schema is None:
y = ww.init_series(y)
y.ww.init()

incompatible_logical_types = _get_incompatible_nullable_types(
handle_boolean_nullable,
handle_integer_nullable,
)

if isinstance(y.ww.logical_type, tuple(incompatible_logical_types)):
new_ltype = _determine_downcast_type(y)
return y.ww.set_logical_type(new_ltype)
if isinstance(y, pd.DataFrame):
y = _downcast_nullable_X(y)

else:
if isinstance(y.ww.logical_type, tuple(incompatible_logical_types)):
new_ltype = _determine_downcast_type(y)
return y.ww.set_logical_type(new_ltype)

return y

Expand Down

0 comments on commit 93e7b97

Please sign in to comment.