Fixed error when stacking data with no exogenous variables (#4275)

* Initial commit
* Updated release notes
* Refactored code structure
* Updated error message and docstring
* Final nits
alteryx · Aug 21, 2023 · 53bd61b
1 parent 7781c77
commit 53bd61b
Show file tree

Hide file tree

Showing 5 changed files with 96 additions and 29 deletions.
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -5,6 +5,7 @@ Release Notes
  * Added support for prediction intervals for VARMAX regressor :pr:`4267`
  * Integrated multiseries time series into AutoMLSearch :pr:`4270`
  * Fixes
+ * Fixed error when stacking data with no exogenous variables :pr:`4275`
  * Changes
  * Updated ``ARIMARegressor`` to be compatible with sktime v0.22.0 and beyond :pr:`4283`
  * Documentation Changes

diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py
@@ -1417,8 +1417,11 @@ def unstack_multiseries(
  X_unstacked_cols.append(new_column)
 
  # Concatenate all the single series to reform dataframes
- X_unstacked = pd.concat(X_unstacked_cols, axis=1)
  y_unstacked = pd.concat(y_unstacked_cols, axis=1)
+ if len(X_unstacked_cols) == 0:
+ X_unstacked = pd.DataFrame(index=y_unstacked.index)
+ else:
+ X_unstacked = pd.concat(X_unstacked_cols, axis=1)
 
  # Reset the axes now that they've been unstacked, keep time info in X
  X_unstacked = X_unstacked.reset_index()
@@ -1477,7 +1480,7 @@ def stack_data(data, include_series_id=False, series_id_name=None, starting_inde
  return stacked_series
 
 
-def stack_X(X, series_id_name, time_index, starting_index=None):
+def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values=None):
  """Restacks the unstacked features into a single DataFrame.
 
  Args:
@@ -1486,37 +1489,61 @@ def stack_X(X, series_id_name, time_index, starting_index=None):
  time_index (str): The name of the time index column.
  starting_index (int): The starting index to use for the stacked DataFrame. If None, the starting index
  will match that of the input data. Defaults to None.
+ series_id_values (set, list): The unique values of a series ID, used to generate the index. If None, values will
+ be generated from X column values. Required if X only has time index values and no exogenous values.
+ Defaults to None.
 
  Returns:
  pd.DataFrame: The restacked features.
  """
  original_columns = set()
- series_ids = set()
- for col in X.columns:
- if col == time_index:
- continue
- separated_name = col.split("_")
- original_columns.add("_".join(separated_name[:-1]))
- series_ids.add(separated_name[-1])
-
- restacked_X = []
-
- for i, original_col in enumerate(original_columns):
- # Only include the series id once (for the first column)
- include_series_id = i == 0
- subset_X = [col for col in X.columns if original_col in col]
- restacked_X.append(
- stack_data(
- X[subset_X],
- include_series_id=include_series_id,
- series_id_name=series_id_name,
- starting_index=starting_index,
- ),
+ series_ids = series_id_values or set()
+ if series_id_values is None:
+ for col in X.columns:
+ if col == time_index:
+ continue
+ separated_name = col.split("_")
+ original_columns.add("_".join(separated_name[:-1]))
+ series_ids.add(separated_name[-1])
+
+ if len(series_ids) == 0:
+ raise ValueError(
+ "Series ID values need to be passed in X column values or as a set with the `series_id_values` parameter.",
  )
- restacked_X = pd.concat(restacked_X, axis=1)
 
  time_index_col = X[time_index].repeat(len(series_ids)).reset_index(drop=True)
- time_index_col.index = restacked_X.index
- restacked_X[time_index] = time_index_col
+
+ if len(original_columns) == 0:
+ start_index = starting_index or X.index[0]
+ stacked_index = pd.RangeIndex(
+ start=start_index,
+ stop=start_index + len(time_index_col),
+ )
+ time_index_col.index = stacked_index
+ restacked_X = pd.DataFrame(
+ {
+ time_index: time_index_col,
+ series_id_name: sorted(list(series_ids)) * len(X),
+ },
+ index=stacked_index,
+ )
+ else:
+ restacked_X = []
+ for i, original_col in enumerate(original_columns):
+ # Only include the series id once (for the first column)
+ include_series_id = i == 0
+ subset_X = [col for col in X.columns if original_col in col]
+ restacked_X.append(
+ stack_data(
+ X[subset_X],
+ include_series_id=include_series_id,
+ series_id_name=series_id_name,
+ starting_index=starting_index,
+ ),
+ )
+
+ restacked_X = pd.concat(restacked_X, axis=1)
+ time_index_col.index = restacked_X.index
+ restacked_X[time_index] = time_index_col
 
  return restacked_X
diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py
@@ -72,12 +72,22 @@ def split_multiseries_data(X, y, series_id, time_index, **kwargs):
  X_unstacked, y_unstacked, problem_type="time series regression", **kwargs
  )
 
- X_train = stack_X(X_train_unstacked, series_id, time_index)
+ # Get unique series value from X if there is only the time_index column
+ # Otherwise, this information is generated in `stack_X` from the column values
+ series_id_values = set(X[series_id]) if len(X_unstacked.columns) == 1 else None
+
+ X_train = stack_X(
+ X_train_unstacked,
+ series_id,
+ time_index,
+ series_id_values=series_id_values,
+ )
  X_holdout = stack_X(
  X_holdout_unstacked,
  series_id,
  time_index,
  starting_index=X_train.index[-1] + 1,
+ series_id_values=series_id_values,
  )
  y_train = stack_data(y_train_unstacked)
  y_holdout = stack_data(y_holdout_unstacked, starting_index=y_train.index[-1] + 1)

diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py
@@ -1472,9 +1472,13 @@ def test_stack_data_noop():
  pd.testing.assert_series_equal(stack_data(series_y), series_y)
 
 
+@pytest.mark.parametrize("series_id_values_type", [set, list])
+@pytest.mark.parametrize("no_features", [True, False])
 @pytest.mark.parametrize("starting_index", [None, 1, 132])
 def test_stack_X(
  starting_index,
+ no_features,
+ series_id_values_type,
  multiseries_ts_data_stacked,
  multiseries_ts_data_unstacked,
 ):
@@ -1484,7 +1488,28 @@ def test_stack_X(
  if starting_index is not None:
  X_expected.index = X_expected.index + starting_index
 
- X_transformed = stack_X(X, "series_id", "date", starting_index=starting_index)
+ if no_features:
+ series_id_values = series_id_values_type(str(i) for i in range(0, 5))
+ X = pd.DataFrame(X["date"])
+ X_expected = X_expected[["date", "series_id"]]
+
+ with pytest.raises(
+ ValueError,
+ match="Series ID values need to be passed in X column values or as a set with the `series_id_values` parameter.",
+ ):
+ stack_X(X, "series_id", "date", starting_index=starting_index)
+
+ X_transformed = stack_X(
+ X,
+ "series_id",
+ "date",
+ starting_index=starting_index,
+ series_id_values=series_id_values,
+ )
+
+ else:
+ X_transformed = stack_X(X, "series_id", "date", starting_index=starting_index)
+
  pd.testing.assert_frame_equal(
  X_expected.sort_index(axis=1),
  X_transformed.sort_index(axis=1),

diff --git a/evalml/tests/preprocessing_tests/test_split_data.py b/evalml/tests/preprocessing_tests/test_split_data.py
@@ -127,9 +127,13 @@ def test_split_data_ts(test, X_y_regression):
  assert len(y_test) == test_size
 
 
-def test_split_multiseries_data(multiseries_ts_data_stacked):
+@pytest.mark.parametrize("no_features", [True, False])
+def test_split_multiseries_data(no_features, multiseries_ts_data_stacked):
  X, y = multiseries_ts_data_stacked
 
+ if no_features:
+ X = X[["date", "series_id"]]
+
  X_train_expected, X_holdout_expected = X[:-10], X[-10:]
  y_train_expected, y_holdout_expected = y[:-10], y[-10:]