
Update split_data to call split_multiseries_data #4312

Merged
merged 4 commits on Sep 19, 2023
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -10,6 +10,7 @@ Release Notes
* Extended TimeSeriesRegularizer to support multiseries :pr:`4303`
* Fixes
* Changes
* Updated ``split_data`` to call ``split_multiseries_data`` when passed stacked multiseries data :pr:`4312`
* Documentation Changes
* Removed LightGBM's excessive amount of warnings :pr:`4308`
* Testing Changes
31 changes: 30 additions & 1 deletion evalml/preprocessing/utils.py
@@ -4,7 +4,12 @@

from evalml.pipelines.utils import stack_data, stack_X, unstack_multiseries
from evalml.preprocessing.data_splitters import TrainingValidationSplit
from evalml.problem_types import is_classification, is_regression, is_time_series
from evalml.problem_types import (
is_classification,
is_multiseries,
is_regression,
is_time_series,
)
from evalml.utils import infer_feature_types


@@ -118,6 +123,9 @@
Returns:
pd.DataFrame, pd.DataFrame, pd.Series, pd.Series: Feature and target data each split into train and test sets.

Raises:
ValueError: If the problem_configuration is missing or does not contain both a time_index and series_id for multiseries problems.

Examples:
>>> X = pd.DataFrame([1, 2, 3, 4, 5, 6], columns=["First"])
>>> y = pd.Series([8, 9, 10, 11, 12, 13])
@@ -144,6 +152,27 @@
1 9
dtype: int64
"""
if is_multiseries(problem_type) and isinstance(y, pd.Series):
if problem_configuration is None:
raise ValueError(
"split_data requires problem_configuration for multiseries problems",
)
series_id = problem_configuration.get("series_id")
time_index = problem_configuration.get("time_index")
if series_id is None or time_index is None:
raise ValueError(
"split_data needs both series_id and time_index values in the problem_configuration to split multiseries data",
)
return split_multiseries_data(
X,
y,
series_id,
time_index,
problem_configuration=problem_configuration,
test_size=test_size,
random_seed=random_seed,
)

X = infer_feature_types(X)
y = infer_feature_types(y)

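For context, here is a minimal sketch of the new dispatch from the caller's side. The data and column names below (`date`, `series_id`, `feature`, `target`) are hypothetical, modeled on the test fixtures; the `split_data` signature and the `problem_configuration` keys mirror the diff above.

```python
import pandas as pd

from evalml.preprocessing import split_data

# Hypothetical stacked multiseries frame: two series ("a" and "b") observed
# over the same ten dates, one row per (date, series) pair.
dates = pd.date_range("2023-01-01", periods=10).repeat(2)
X = pd.DataFrame({
    "date": dates,
    "series_id": ["a", "b"] * 10,
    "feature": range(20),
})
y = pd.Series(range(20), name="target")

# Because y is a stacked pd.Series and the problem type is multiseries,
# split_data now delegates to split_multiseries_data instead of performing
# a plain row-wise time series split.
X_train, X_test, y_train, y_test = split_data(
    X,
    y,
    problem_type="multiseries time series regression",
    problem_configuration={"time_index": "date", "series_id": "series_id"},
)
```

Per the updated test assertions, `y_train`/`y_test` come back as DataFrames on this path (the target is unstacked into one column per series), while single-series problems still return Series.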
87 changes: 75 additions & 12 deletions evalml/tests/preprocessing_tests/test_split_data.py
@@ -6,6 +6,7 @@
ProblemTypes,
is_binary,
is_multiclass,
is_multiseries,
is_regression,
is_time_series,
)
@@ -19,6 +20,7 @@
X_y_binary,
X_y_multi,
X_y_regression,
multiseries_ts_data_unstacked,
make_data_type,
):
if is_binary(problem_type):
@@ -30,6 +32,8 @@
problem_configuration = None
if is_time_series(problem_type):
problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "date"}
if is_multiseries(problem_type):
X, y = multiseries_ts_data_unstacked

X = make_data_type(data_type, X)
y = make_data_type(data_type, y)
@@ -50,17 +54,28 @@
assert len(y_test) == test_size
assert isinstance(X_train, pd.DataFrame)
assert isinstance(X_test, pd.DataFrame)
assert isinstance(y_train, pd.Series)
assert isinstance(y_test, pd.Series)
if not is_multiseries(problem_type):
assert isinstance(y_train, pd.Series)
assert isinstance(y_test, pd.Series)
else:
assert isinstance(y_train, pd.DataFrame)
assert isinstance(y_test, pd.DataFrame)
pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False)
pd.testing.assert_frame_equal(y_test, y[int(train_size) :], check_dtype=False)

if is_time_series(problem_type):
if is_time_series(problem_type) and not is_multiseries(problem_type):
pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False)
pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False)


@pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types)
@pytest.mark.parametrize("data_type", ["np", "pd", "ww"])
def test_split_data_defaults(problem_type, data_type, get_test_data_from_configuration):
def test_split_data_defaults(
problem_type,
data_type,
get_test_data_from_configuration,
multiseries_ts_data_unstacked,
):
X, y = get_test_data_from_configuration(
data_type,
problem_type,
@@ -71,6 +86,8 @@
problem_configuration = None
if is_time_series(problem_type):
problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "date"}
if is_multiseries(problem_type):
X, y = multiseries_ts_data_unstacked
test_pct = 0.1
else:
test_pct = 0.2
@@ -92,7 +109,18 @@
X = pd.DataFrame(X)
y = pd.Series(y)
pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False)
pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False)
if not is_multiseries(problem_type):
pd.testing.assert_series_equal(
y_test,
y[int(train_size) :],
check_dtype=False,
)
else:
pd.testing.assert_frame_equal(
y_test,
y[int(train_size) :],
check_dtype=False,
)


@pytest.mark.parametrize("test", ["fh_limitation", "no_fh_limitation"])
@@ -127,8 +155,33 @@
assert len(y_test) == test_size


def test_split_data_calls_multiseries_error(multiseries_ts_data_stacked):
X, y = multiseries_ts_data_stacked
with pytest.raises(
ValueError,
match="requires problem_configuration for multiseries",
):
split_data(X, y, problem_type="multiseries time series regression")

with pytest.raises(
ValueError,
match="needs both series_id and time_index values in the problem_configuration",
):
split_data(
X,
y,
problem_type="multiseries time series regression",
problem_configuration={"time_index": "date"},
)


@pytest.mark.parametrize("no_features", [True, False])
def test_split_multiseries_data(no_features, multiseries_ts_data_stacked):
@pytest.mark.parametrize("splitting_function", ["split_data", "split_multiseries_data"])
def test_split_multiseries_data(
no_features,
splitting_function,
multiseries_ts_data_stacked,
):
X, y = multiseries_ts_data_stacked

if no_features:
Expand All @@ -137,12 +190,22 @@
X_train_expected, X_holdout_expected = X[:-10], X[-10:]
y_train_expected, y_holdout_expected = y[:-10], y[-10:]

X_train, X_holdout, y_train, y_holdout = split_multiseries_data(
X,
y,
"series_id",
"date",
)
# Results should be identical whether split_multiseries_data is called through
# split_data or directly
if splitting_function == "split_data":
X_train, X_holdout, y_train, y_holdout = split_data(
X,
y,
problem_type="multiseries time series regression",
problem_configuration={"time_index": "date", "series_id": "series_id"},
)
else:
X_train, X_holdout, y_train, y_holdout = split_multiseries_data(
X,
y,
"series_id",
"date",
)

pd.testing.assert_frame_equal(
X_train.sort_index(axis=1),