diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index b727366c55..17b965565b 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -5,6 +5,7 @@ Release Notes
         * Added support for prediction intervals for VARMAX regressor :pr:`4267`
         * Integrated multiseries time series into AutoMLSearch :pr:`4270`
     * Fixes
+        * Fixed error when stacking data with no exogenous variables :pr:`4275`
     * Changes
         * Updated ``ARIMARegressor`` to be compatible with sktime v0.22.0 and beyond :pr:`4283`
     * Documentation Changes
diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py
index 7e61999f25..dbc51abee8 100644
--- a/evalml/pipelines/utils.py
+++ b/evalml/pipelines/utils.py
@@ -1417,8 +1417,11 @@ def unstack_multiseries(
             X_unstacked_cols.append(new_column)
 
     # Concatenate all the single series to reform dataframes
-    X_unstacked = pd.concat(X_unstacked_cols, axis=1)
     y_unstacked = pd.concat(y_unstacked_cols, axis=1)
+    if len(X_unstacked_cols) == 0:
+        X_unstacked = pd.DataFrame(index=y_unstacked.index)
+    else:
+        X_unstacked = pd.concat(X_unstacked_cols, axis=1)
 
     # Reset the axes now that they've been unstacked, keep time info in X
     X_unstacked = X_unstacked.reset_index()
@@ -1477,7 +1480,7 @@ def stack_data(data, include_series_id=False, series_id_name=None, starting_inde
     return stacked_series
 
 
-def stack_X(X, series_id_name, time_index, starting_index=None):
+def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values=None):
     """Restacks the unstacked features into a single DataFrame.
 
     Args:
@@ -1486,37 +1489,61 @@ def stack_X(X, series_id_name, time_index, starting_index=None):
         time_index (str): The name of the time index column.
         starting_index (int): The starting index to use for the stacked DataFrame. If None, the starting index
             will match that of the input data. Defaults to None.
+        series_id_values (set, list): The unique values of a series ID, used to generate the index. If None, values will
+            be generated from X column values. Required if X only has time index values and no exogenous values.
+            Defaults to None.
 
     Returns:
         pd.DataFrame: The restacked features.
     """
     original_columns = set()
-    series_ids = set()
-    for col in X.columns:
-        if col == time_index:
-            continue
-        separated_name = col.split("_")
-        original_columns.add("_".join(separated_name[:-1]))
-        series_ids.add(separated_name[-1])
-
-    restacked_X = []
-
-    for i, original_col in enumerate(original_columns):
-        # Only include the series id once (for the first column)
-        include_series_id = i == 0
-        subset_X = [col for col in X.columns if original_col in col]
-        restacked_X.append(
-            stack_data(
-                X[subset_X],
-                include_series_id=include_series_id,
-                series_id_name=series_id_name,
-                starting_index=starting_index,
-            ),
+    series_ids = series_id_values or set()
+    if series_id_values is None:
+        for col in X.columns:
+            if col == time_index:
+                continue
+            separated_name = col.split("_")
+            original_columns.add("_".join(separated_name[:-1]))
+            series_ids.add(separated_name[-1])
+
+    if len(series_ids) == 0:
+        raise ValueError(
+            "Series ID values need to be passed in X column values or as a set with the `series_id_values` parameter.",
         )
-    restacked_X = pd.concat(restacked_X, axis=1)
 
     time_index_col = X[time_index].repeat(len(series_ids)).reset_index(drop=True)
-    time_index_col.index = restacked_X.index
-    restacked_X[time_index] = time_index_col
+
+    if len(original_columns) == 0:
+        start_index = starting_index or X.index[0]
+        stacked_index = pd.RangeIndex(
+            start=start_index,
+            stop=start_index + len(time_index_col),
+        )
+        time_index_col.index = stacked_index
+        restacked_X = pd.DataFrame(
+            {
+                time_index: time_index_col,
+                series_id_name: sorted(list(series_ids)) * len(X),
+            },
+            index=stacked_index,
+        )
+    else:
+        restacked_X = []
+        for i, original_col in enumerate(original_columns):
+            # Only include the series id once (for the first column)
+            include_series_id = i == 0
+            subset_X = [col for col in X.columns if original_col in col]
+            restacked_X.append(
+                stack_data(
+                    X[subset_X],
+                    include_series_id=include_series_id,
+                    series_id_name=series_id_name,
+                    starting_index=starting_index,
+                ),
+            )
+
+        restacked_X = pd.concat(restacked_X, axis=1)
+        time_index_col.index = restacked_X.index
+        restacked_X[time_index] = time_index_col
     return restacked_X
diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py
index 6b3d656d4c..6e7c203611 100644
--- a/evalml/preprocessing/utils.py
+++ b/evalml/preprocessing/utils.py
@@ -72,12 +72,22 @@ def split_multiseries_data(X, y, series_id, time_index, **kwargs):
         X_unstacked, y_unstacked, problem_type="time series regression", **kwargs
     )
 
-    X_train = stack_X(X_train_unstacked, series_id, time_index)
+    # Get unique series value from X if there is only the time_index column
+    # Otherwise, this information is generated in `stack_X` from the column values
+    series_id_values = set(X[series_id]) if len(X_unstacked.columns) == 1 else None
+
+    X_train = stack_X(
+        X_train_unstacked,
+        series_id,
+        time_index,
+        series_id_values=series_id_values,
+    )
     X_holdout = stack_X(
         X_holdout_unstacked,
         series_id,
         time_index,
         starting_index=X_train.index[-1] + 1,
+        series_id_values=series_id_values,
     )
     y_train = stack_data(y_train_unstacked)
     y_holdout = stack_data(y_holdout_unstacked, starting_index=y_train.index[-1] + 1)
diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py
index cbc963aa57..92eb95cc0e 100644
--- a/evalml/tests/pipeline_tests/test_pipeline_utils.py
+++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py
@@ -1472,9 +1472,13 @@ def test_stack_data_noop():
     pd.testing.assert_series_equal(stack_data(series_y), series_y)
 
 
+@pytest.mark.parametrize("series_id_values_type", [set, list])
+@pytest.mark.parametrize("no_features", [True, False])
 @pytest.mark.parametrize("starting_index", [None, 1, 132])
 def test_stack_X(
     starting_index,
+    no_features,
+    series_id_values_type,
     multiseries_ts_data_stacked,
     multiseries_ts_data_unstacked,
 ):
@@ -1484,7 +1488,28 @@ def test_stack_X(
     if starting_index is not None:
         X_expected.index = X_expected.index + starting_index
 
-    X_transformed = stack_X(X, "series_id", "date", starting_index=starting_index)
+    if no_features:
+        series_id_values = series_id_values_type(str(i) for i in range(0, 5))
+        X = pd.DataFrame(X["date"])
+        X_expected = X_expected[["date", "series_id"]]
+
+        with pytest.raises(
+            ValueError,
+            match="Series ID values need to be passed in X column values or as a set with the `series_id_values` parameter.",
+        ):
+            stack_X(X, "series_id", "date", starting_index=starting_index)
+
+        X_transformed = stack_X(
+            X,
+            "series_id",
+            "date",
+            starting_index=starting_index,
+            series_id_values=series_id_values,
+        )
+
+    else:
+        X_transformed = stack_X(X, "series_id", "date", starting_index=starting_index)
+
     pd.testing.assert_frame_equal(
         X_expected.sort_index(axis=1),
         X_transformed.sort_index(axis=1),
diff --git a/evalml/tests/preprocessing_tests/test_split_data.py b/evalml/tests/preprocessing_tests/test_split_data.py
index 9403862ef4..cbb8c941ed 100644
--- a/evalml/tests/preprocessing_tests/test_split_data.py
+++ b/evalml/tests/preprocessing_tests/test_split_data.py
@@ -127,9 +127,13 @@ def test_split_data_ts(test, X_y_regression):
     assert len(y_test) == test_size
 
 
-def test_split_multiseries_data(multiseries_ts_data_stacked):
+@pytest.mark.parametrize("no_features", [True, False])
+def test_split_multiseries_data(no_features, multiseries_ts_data_stacked):
     X, y = multiseries_ts_data_stacked
 
+    if no_features:
+        X = X[["date", "series_id"]]
+
     X_train_expected, X_holdout_expected = X[:-10], X[-10:]
     y_train_expected, y_holdout_expected = y[:-10], y[-10:]
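
For review context, a minimal sketch (not part of the patch; the column names and id values are illustrative) of the case this change addresses: restacking an unstacked X that carries only the time index, which previously failed because there were no per-series feature columns to concatenate. It assumes the patched stack_X from evalml/pipelines/utils.py.

    import pandas as pd

    from evalml.pipelines.utils import stack_X

    # Unstacked X with no exogenous features: only the time index remains
    # (three timestamps, two series with ids "0" and "1" -- illustrative values).
    X_unstacked = pd.DataFrame({"date": pd.date_range("2023-01-01", periods=3)})

    # Without series_id_values the column names carry no series ids, so the
    # patched stack_X raises a ValueError; passing the ids explicitly lets it
    # rebuild the stacked frame with `date` and `series_id` columns.
    X_stacked = stack_X(
        X_unstacked,
        series_id_name="series_id",
        time_index="date",
        series_id_values={"0", "1"},
    )
    print(X_stacked)  # 6 rows (3 timestamps x 2 series), columns: date, series_id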