From dbcc2938336411fb50135fbf7b321d1c2e3557f9 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Mon, 18 Sep 2023 16:25:42 -0400 Subject: [PATCH 1/3] Update split_data to call split_multiseries_data --- evalml/preprocessing/utils.py | 24 +++++++++- .../preprocessing_tests/test_split_data.py | 48 ++++++++++++++++--- 2 files changed, 64 insertions(+), 8 deletions(-) diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py index 6e7c203611..db711f068a 100644 --- a/evalml/preprocessing/utils.py +++ b/evalml/preprocessing/utils.py @@ -4,7 +4,12 @@ from evalml.pipelines.utils import stack_data, stack_X, unstack_multiseries from evalml.preprocessing.data_splitters import TrainingValidationSplit -from evalml.problem_types import is_classification, is_regression, is_time_series +from evalml.problem_types import ( + is_classification, + is_multiseries, + is_regression, + is_time_series, +) from evalml.utils import infer_feature_types @@ -144,6 +149,23 @@ def split_data( 1 9 dtype: int64 """ + if is_multiseries(problem_type) and isinstance(y, pd.Series): + series_id = problem_configuration.get("series_id") + time_index = problem_configuration.get("time_index") + if series_id is None or time_index is None: + raise ValueError( + "split_data needs both series_id and time_index values in the problem_configuration to split multiseries data", + ) + return split_multiseries_data( + X, + y, + series_id, + time_index, + problem_configuration=problem_configuration, + test_size=test_size, + random_seed=random_seed, + ) + X = infer_feature_types(X) y = infer_feature_types(y) diff --git a/evalml/tests/preprocessing_tests/test_split_data.py b/evalml/tests/preprocessing_tests/test_split_data.py index cbb8c941ed..3a06d17b89 100644 --- a/evalml/tests/preprocessing_tests/test_split_data.py +++ b/evalml/tests/preprocessing_tests/test_split_data.py @@ -6,6 +6,7 @@ ProblemTypes, is_binary, is_multiclass, + is_multiseries, is_regression, is_time_series, ) @@ -29,6 +30,8 @@ def test_split_data( X, y = X_y_regression problem_configuration = None if is_time_series(problem_type): + if is_multiseries(problem_type): + pytest.skip("Multiseries time series is tested separately") problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "date"} X = make_data_type(data_type, X) @@ -70,6 +73,8 @@ def test_split_data_defaults(problem_type, data_type, get_test_data_from_configu problem_configuration = None if is_time_series(problem_type): + if is_multiseries(problem_type): + pytest.skip("Multiseries time series is tested separately") problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "date"} test_pct = 0.1 else: @@ -127,8 +132,27 @@ def test_split_data_ts(test, X_y_regression): assert len(y_test) == test_size +def test_split_data_calls_multiseries_error(multiseries_ts_data_stacked): + X, y = multiseries_ts_data_stacked + match_str = ( + "needs both series_id and time_index values in the problem_configuration" + ) + with pytest.raises(ValueError, match=match_str): + split_data( + X, + y, + problem_type="multiseries time series regression", + problem_configuration={"time_index": "date"}, + ) + + @pytest.mark.parametrize("no_features", [True, False]) -def test_split_multiseries_data(no_features, multiseries_ts_data_stacked): +@pytest.mark.parametrize("splitting_function", ["split_data", "split_multiseries_data"]) +def test_split_multiseries_data( + no_features, + splitting_function, + multiseries_ts_data_stacked, +): X, y = multiseries_ts_data_stacked if no_features: @@ -137,12 +161,22 @@ def 
test_split_multiseries_data(no_features, multiseries_ts_data_stacked): X_train_expected, X_holdout_expected = X[:-10], X[-10:] y_train_expected, y_holdout_expected = y[:-10], y[-10:] - X_train, X_holdout, y_train, y_holdout = split_multiseries_data( - X, - y, - "series_id", - "date", - ) + # Results should be identical whether split_multiseries_data is called through + # split_data or directly + if splitting_function == "split_data": + X_train, X_holdout, y_train, y_holdout = split_data( + X, + y, + problem_type="multiseries time series regression", + problem_configuration={"time_index": "date", "series_id": "series_id"}, + ) + else: + X_train, X_holdout, y_train, y_holdout = split_multiseries_data( + X, + y, + "series_id", + "date", + ) pd.testing.assert_frame_equal( X_train.sort_index(axis=1), From 068a2b56c3c2b8be37bb7eb1f7e16251b10bb57d Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Mon, 18 Sep 2023 16:28:03 -0400 Subject: [PATCH 2/3] Release notes --- docs/source/release_notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 40174fd6df..37d1ce74d2 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -10,6 +10,7 @@ Release Notes * Extended TimeSeriesRegularizer to support multiseries :pr:`4303` * Fixes * Changes + * Updated ``split_data`` to call ``split_multiseries_data`` when passed stacked multiseries data :pr:`4312` * Documentation Changes * Testing Changes From d9bf2ce52da83636526073becc324e1264304950 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Tue, 19 Sep 2023 08:53:55 -0400 Subject: [PATCH 3/3] PR comments --- evalml/preprocessing/utils.py | 7 +++ .../preprocessing_tests/test_split_data.py | 55 ++++++++++++++----- 2 files changed, 49 insertions(+), 13 deletions(-) diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py index db711f068a..dc17e75ee8 100644 --- a/evalml/preprocessing/utils.py +++ b/evalml/preprocessing/utils.py @@ -123,6 +123,9 @@ def split_data( Returns: pd.DataFrame, pd.DataFrame, pd.Series, pd.Series: Feature and target data each split into train and test sets. + Raises: + ValueError: If the problem_configuration is missing or does not contain both a time_index and series_id for multiseries problems. 
+ Examples: >>> X = pd.DataFrame([1, 2, 3, 4, 5, 6], columns=["First"]) >>> y = pd.Series([8, 9, 10, 11, 12, 13]) @@ -150,6 +153,10 @@ def split_data( dtype: int64 """ if is_multiseries(problem_type) and isinstance(y, pd.Series): + if problem_configuration is None: + raise ValueError( + "split_data requires problem_configuration for multiseries problems", + ) series_id = problem_configuration.get("series_id") time_index = problem_configuration.get("time_index") if series_id is None or time_index is None: diff --git a/evalml/tests/preprocessing_tests/test_split_data.py b/evalml/tests/preprocessing_tests/test_split_data.py index 3a06d17b89..f5e494d57c 100644 --- a/evalml/tests/preprocessing_tests/test_split_data.py +++ b/evalml/tests/preprocessing_tests/test_split_data.py @@ -20,6 +20,7 @@ def test_split_data( X_y_binary, X_y_multi, X_y_regression, + multiseries_ts_data_unstacked, make_data_type, ): if is_binary(problem_type): @@ -30,9 +31,9 @@ def test_split_data( X, y = X_y_regression problem_configuration = None if is_time_series(problem_type): - if is_multiseries(problem_type): - pytest.skip("Multiseries time series is tested separately") problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "date"} + if is_multiseries(problem_type): + X, y = multiseries_ts_data_unstacked X = make_data_type(data_type, X) y = make_data_type(data_type, y) @@ -53,17 +54,28 @@ def test_split_data( assert len(y_test) == test_size assert isinstance(X_train, pd.DataFrame) assert isinstance(X_test, pd.DataFrame) - assert isinstance(y_train, pd.Series) - assert isinstance(y_test, pd.Series) + if not is_multiseries(problem_type): + assert isinstance(y_train, pd.Series) + assert isinstance(y_test, pd.Series) + else: + assert isinstance(y_train, pd.DataFrame) + assert isinstance(y_test, pd.DataFrame) + pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False) + pd.testing.assert_frame_equal(y_test, y[int(train_size) :], check_dtype=False) - if is_time_series(problem_type): + if is_time_series(problem_type) and not is_multiseries(problem_type): pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False) pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False) @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) @pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) -def test_split_data_defaults(problem_type, data_type, get_test_data_from_configuration): +def test_split_data_defaults( + problem_type, + data_type, + get_test_data_from_configuration, + multiseries_ts_data_unstacked, +): X, y = get_test_data_from_configuration( data_type, problem_type, @@ -73,9 +85,9 @@ def test_split_data_defaults(problem_type, data_type, get_test_data_from_configu problem_configuration = None if is_time_series(problem_type): - if is_multiseries(problem_type): - pytest.skip("Multiseries time series is tested separately") problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "date"} + if is_multiseries(problem_type): + X, y = multiseries_ts_data_unstacked test_pct = 0.1 else: test_pct = 0.2 @@ -97,7 +109,18 @@ def test_split_data_defaults(problem_type, data_type, get_test_data_from_configu X = pd.DataFrame(X) y = pd.Series(y) pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False) - pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False) + if not is_multiseries(problem_type): + pd.testing.assert_series_equal( + y_test, + y[int(train_size) :], + check_dtype=False, + ) + else: + 
pd.testing.assert_frame_equal( + y_test, + y[int(train_size) :], + check_dtype=False, + ) @pytest.mark.parametrize("test", ["fh_limitation", "no_fh_limitation"]) @@ -134,10 +157,16 @@ def test_split_data_ts(test, X_y_regression): def test_split_data_calls_multiseries_error(multiseries_ts_data_stacked): X, y = multiseries_ts_data_stacked - match_str = ( - "needs both series_id and time_index values in the problem_configuration" - ) - with pytest.raises(ValueError, match=match_str): + with pytest.raises( + ValueError, + match="requires problem_configuration for multiseries", + ): + split_data(X, y, problem_type="multiseries time series regression") + + with pytest.raises( + ValueError, + match="needs both series_id and time_index values in the problem_configuration", + ): split_data( X, y,
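
---

Usage note (editorial, not part of the patch series): a minimal sketch of the behavior these patches add, assuming an evalml build that includes this change. The toy two-series frame below is illustrative only and is not taken from the repository's fixtures; the `problem_type` string and the `problem_configuration` keys (`time_index`, `series_id`) are exactly those exercised in the patched tests.

    import pandas as pd

    from evalml.preprocessing import split_data

    # Stacked multiseries data: one row per (date, series) pair.
    dates = pd.date_range("2023-01-01", periods=20).repeat(2)
    X = pd.DataFrame(
        {
            "date": dates,
            "series_id": [0, 1] * 20,
            "feature": range(40),
        },
    )
    y = pd.Series(range(40), name="target")

    # split_data now routes stacked multiseries input through
    # split_multiseries_data, so problem_configuration must supply both
    # the time_index and series_id column names.
    X_train, X_holdout, y_train, y_holdout = split_data(
        X,
        y,
        problem_type="multiseries time series regression",
        problem_configuration={"time_index": "date", "series_id": "series_id"},
    )

    # Omitting problem_configuration entirely, or leaving out either
    # series_id or time_index, raises the ValueError added in PATCH 3/3.

As the patched tests assert, the four-way split returned this way is identical to calling ``split_multiseries_data(X, y, "series_id", "date")`` directly.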