Update split_data to call split_multiseries_data #4312

Merged 4 commits on Sep 19, 2023

Changes from 2 commits

1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -10,6 +10,7 @@ Release Notes
         * Extended TimeSeriesRegularizer to support multiseries :pr:`4303`
     * Fixes
     * Changes
+        * Updated ``split_data`` to call ``split_multiseries_data`` when passed stacked multiseries data :pr:`4312`
     * Documentation Changes
     * Testing Changes

24 changes: 23 additions & 1 deletion evalml/preprocessing/utils.py
@@ -4,7 +4,12 @@

 from evalml.pipelines.utils import stack_data, stack_X, unstack_multiseries
 from evalml.preprocessing.data_splitters import TrainingValidationSplit
-from evalml.problem_types import is_classification, is_regression, is_time_series
+from evalml.problem_types import (
+    is_classification,
+    is_multiseries,
+    is_regression,
+    is_time_series,
+)
 from evalml.utils import infer_feature_types

@@ -144,6 +149,23 @@
     1    9
     dtype: int64
     """
+    if is_multiseries(problem_type) and isinstance(y, pd.Series):
+        series_id = problem_configuration.get("series_id")
+        time_index = problem_configuration.get("time_index")
+        if series_id is None or time_index is None:
+            raise ValueError(
+                "split_data needs both series_id and time_index values in the problem_configuration to split multiseries data",
+            )
+        return split_multiseries_data(
+            X,
+            y,
+            series_id,
+            time_index,
+            problem_configuration=problem_configuration,
+            test_size=test_size,
+            random_seed=random_seed,
+        )
+
     X = infer_feature_types(X)
     y = infer_feature_types(y)

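For context (not part of the diff): a minimal sketch of the new dispatch path. The frame below is illustrative; the "date" and "series_id" column names simply mirror the tests further down.

import pandas as pd

from evalml.preprocessing import split_data

# Illustrative stacked multiseries data: two series observed over the
# same ten dates, identified by a "series_id" column.
dates = pd.date_range("2023-01-01", periods=10)
X = pd.DataFrame(
    {
        "date": list(dates) * 2,
        "series_id": ["a"] * 10 + ["b"] * 10,
        "feature": range(20),
    },
)
y = pd.Series(range(20), name="target")

# With stacked multiseries input, split_data now routes to
# split_multiseries_data, which splits each series chronologically so
# the holdout set holds the most recent rows of every series.
X_train, X_holdout, y_train, y_holdout = split_data(
    X,
    y,
    problem_type="multiseries time series regression",
    problem_configuration={"time_index": "date", "series_id": "series_id"},
)
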
48 changes: 41 additions & 7 deletions evalml/tests/preprocessing_tests/test_split_data.py
@@ -6,6 +6,7 @@
     ProblemTypes,
     is_binary,
     is_multiclass,
+    is_multiseries,
     is_regression,
     is_time_series,
 )

@@ -29,6 +30,8 @@
     X, y = X_y_regression
     problem_configuration = None
     if is_time_series(problem_type):
+        if is_multiseries(problem_type):
+            pytest.skip("Multiseries time series is tested separately")
         problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "date"}

     X = make_data_type(data_type, X)

Collaborator (commenting on the pytest.skip line above): Instead of skipping, could we mock out the multiseries split and assert that it's called?
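
A sketch of what that suggestion could look like (hypothetical test, not part of this PR): patch split_multiseries_data in the module where split_data resolves it, then assert that stacked multiseries input is routed through it.

from unittest.mock import patch

from evalml.preprocessing import split_data

def test_split_data_dispatches_to_multiseries(multiseries_ts_data_stacked):
    X, y = multiseries_ts_data_stacked
    # Replace the real splitter with a mock so only the dispatch logic
    # in split_data is exercised.
    with patch("evalml.preprocessing.utils.split_multiseries_data") as mock_split:
        split_data(
            X,
            y,
            problem_type="multiseries time series regression",
            problem_configuration={"time_index": "date", "series_id": "series_id"},
        )
    mock_split.assert_called_once()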

@@ -70,6 +73,8 @@

     problem_configuration = None
     if is_time_series(problem_type):
+        if is_multiseries(problem_type):
+            pytest.skip("Multiseries time series is tested separately")
         problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "date"}
         test_pct = 0.1
     else:

@@ -127,8 +132,27 @@
     assert len(y_test) == test_size


+def test_split_data_calls_multiseries_error(multiseries_ts_data_stacked):
+    X, y = multiseries_ts_data_stacked
+    match_str = (
+        "needs both series_id and time_index values in the problem_configuration"
+    )
+    with pytest.raises(ValueError, match=match_str):
+        split_data(
+            X,
+            y,
+            problem_type="multiseries time series regression",
+            problem_configuration={"time_index": "date"},
+        )
+
+
 @pytest.mark.parametrize("no_features", [True, False])
-def test_split_multiseries_data(no_features, multiseries_ts_data_stacked):
+@pytest.mark.parametrize("splitting_function", ["split_data", "split_multiseries_data"])
+def test_split_multiseries_data(
+    no_features,
+    splitting_function,
+    multiseries_ts_data_stacked,
+):
     X, y = multiseries_ts_data_stacked

     if no_features:

@@ -137,12 +161,22 @@
     X_train_expected, X_holdout_expected = X[:-10], X[-10:]
     y_train_expected, y_holdout_expected = y[:-10], y[-10:]

-    X_train, X_holdout, y_train, y_holdout = split_multiseries_data(
-        X,
-        y,
-        "series_id",
-        "date",
-    )
+    # Results should be identical whether split_multiseries_data is called
+    # through split_data or directly
+    if splitting_function == "split_data":
+        X_train, X_holdout, y_train, y_holdout = split_data(
+            X,
+            y,
+            problem_type="multiseries time series regression",
+            problem_configuration={"time_index": "date", "series_id": "series_id"},
+        )
+    else:
+        X_train, X_holdout, y_train, y_holdout = split_multiseries_data(
+            X,
+            y,
+            "series_id",
+            "date",
+        )

     pd.testing.assert_frame_equal(
         X_train.sort_index(axis=1),
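
Design note: parametrizing the test over both entry points keeps the new dispatch in split_data from drifting out of sync with split_multiseries_data's direct behavior, since both paths must produce identical splits.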