Add multiseries support to graph_prediction_vs_actual_over_time #4284

Merged (15 commits) on Aug 21, 2023
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -7,6 +7,7 @@ Release Notes
* Fixes
* Changes
* Updated ``ARIMARegressor`` to be compatible with sktime v0.22.0 and beyond :pr:`4283`
* Updated ``graph_prediction_vs_actual_over_time()`` to be compatible with multiseries time series :pr:`4284`
* Documentation Changes
* Removed erroneous warnings from Data Checks User Guide page and removed ``tqdm`` warning in all notebooks :pr:`4274`
* Testing Changes
96 changes: 85 additions & 11 deletions evalml/model_understanding/visualizations.py
@@ -12,6 +12,7 @@
from evalml.model_family import ModelFamily
from evalml.objectives.utils import get_objective
from evalml.problem_types import ProblemTypes
from evalml.problem_types.utils import is_multiseries
from evalml.utils import import_or_raise, infer_feature_types, jupyter_check


@@ -373,25 +374,44 @@
dates = infer_feature_types(dates)
prediction = pipeline.predict_in_sample(X, y, X_train=X_train, y_train=y_train)

return pd.DataFrame(
{
"dates": dates.reset_index(drop=True),
"target": y.reset_index(drop=True),
"prediction": prediction.reset_index(drop=True),
},
)
if is_multiseries(pipeline.problem_type):
return pd.DataFrame(
{
"dates": dates.reset_index(drop=True),
"target": y.reset_index(drop=True),
"prediction": prediction.reset_index(drop=True),
"series_id": X[pipeline.series_id].reset_index(drop=True),
},
)
else:
return pd.DataFrame(
{
"dates": dates.reset_index(drop=True),
"target": y.reset_index(drop=True),
"prediction": prediction.reset_index(drop=True),
},
)
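
For orientation, a minimal sketch of the kind of stacked frame the multiseries branch above returns: one row per (date, series) pair. The column names come from the code above; the sample values and series IDs below are purely hypothetical.

import pandas as pd

# Hypothetical illustration of the multiseries result frame; in practice the
# values come from the pipeline's in-sample predictions and the stacked target.
example = pd.DataFrame(
    {
        "dates": pd.to_datetime(["2020-01-01", "2020-01-01", "2020-01-02", "2020-01-02"]),
        "target": [1.2, 10.4, 1.5, 10.1],
        "prediction": [1.1, 10.6, 1.4, 10.3],
        "series_id": ["0", "1", "0", "1"],
    },
)
print(example)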


def graph_prediction_vs_actual_over_time(pipeline, X, y, X_train, y_train, dates):
def graph_prediction_vs_actual_over_time(
pipeline,
X,
y,
X_train,
y_train,
dates,
single_series=None,
):
"""Plot the target values and predictions against time on the x-axis.

Args:
pipeline (TimeSeriesRegressionPipeline): Fitted time series regression pipeline.
X (pd.DataFrame): Features used to generate new predictions.
y (pd.Series): Target values to compare predictions against.
X (pd.DataFrame): Features used to generate new predictions. If the problem is multiseries, X should be stacked.
y (pd.Series): Target values to compare predictions against. If the problem is multiseries, y should be stacked.
X_train (pd.DataFrame): Data the pipeline was trained on.
y_train (pd.Series): Target values for training data.
dates (pd.Series): Dates corresponding to target values and predictions.
single_series (str): The single series to plot from the multiseries data. Defaults to None.

Returns:
plotly.Figure: Showing the prediction vs actual over time.
@@ -403,8 +423,15 @@
"plotly.graph_objects",
error_msg="Cannot find dependency plotly.graph_objects",
)
subplots = import_or_raise(
"plotly.subplots",
error_msg="Cannot find dependency plotly.subplots",
)

if pipeline.problem_type != ProblemTypes.TIME_SERIES_REGRESSION:
if (
pipeline.problem_type != ProblemTypes.TIME_SERIES_REGRESSION
and pipeline.problem_type != ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION
):
raise ValueError(
"graph_prediction_vs_actual_over_time only supports time series regression pipelines! "
f"Received {str(pipeline.problem_type)}.",
@@ -419,6 +446,53 @@
dates,
)

if is_multiseries(pipeline.problem_type):
id_list = (
[single_series] if single_series is not None else data["series_id"].unique()
)
fig = subplots.make_subplots(
rows=len(id_list),
cols=1,
subplot_titles=[f"Series: {id}" for id in id_list],
)
for curr_count, id in enumerate(id_list):
curr_df = data[data["series_id"] == id]
fig.append_trace(
_go.Scatter(
x=curr_df["dates"],
y=curr_df["target"],
mode="lines+markers",
name=f"Series {id}: Target",
),
row=curr_count + 1,
col=1,
)
fig.append_trace(
_go.Scatter(
x=curr_df["dates"],
y=curr_df["prediction"],
mode="lines+markers",
name=f"Series {id}: Prediction",
),
row=curr_count + 1,
col=1,
)
fig.update_xaxes(title_text="Time")
fig.update_yaxes(title_text=y.name)
if single_series is not None:
fig.update_layout(
height=600,

Collaborator:
is there any autosizing we can take advantage of? Also not sure if we need the single_series case: can it just match what we already had before for time series?

Contributor (author):
Is there any autosizing we can take advantage of?

  • For python plotly, it doesn't look like there's any autosizing besides doing what I did in line 490 (though correct me if I'm wrong). If I left the parameters blank for the multiseries graph (i.e. don't define x and y) and let it use the default values the graph looks really squished.
[screenshot: multiseries graph rendered with default sizing, which looks squished]
  • For single series though I can probably leave the parameters blank since it is only one graph.

Also not sure if we need the single_series case: can it just match what we already had before for time series?

  • The issue is that for single_series case I want the data that corresponds to that single series.
    • In the before case for time series, y is taken from data["target"] whereas for single series I want to take it from curr_df["target"] where curr_df = data[data["series_id"] == single_series]
    • If I wanted to do that with the before code, I'd probably have to add more if statements and such so it made more sense to me to have it included with the multiseries case.
    • Also I'd like for the single series plot to have a different title than the time series plot.

Collaborator:
cool - thanks for the explanation. Works for me!

width=1000,
title_text=f"Graph for Series {single_series}",
)
else:
fig.update_layout(
height=600 + (len(id_list)) * 200,
width=1500,
title_text="Graph for Multiseries",
)
return fig

data = [
_go.Scatter(
x=data["dates"],
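
For reference, a minimal usage sketch of the updated plotting function. It assumes a stacked multiseries X/y with "date" and "series_id" columns and a component graph and parameter dict like the fixtures in the tests below; the variable names and import locations are inferred from the files touched by this PR, not guaranteed by it.

from evalml.model_understanding.visualizations import graph_prediction_vs_actual_over_time
from evalml.pipelines import MultiseriesRegressionPipeline
from evalml.preprocessing import split_multiseries_data

# X, y: stacked multiseries features and target, assumed to exist already.
X_train, _, y_train, _ = split_multiseries_data(X, y, "series_id", "date")

pipeline = MultiseriesRegressionPipeline(component_graph, parameters)
pipeline.fit(X_train, y_train)

# Default behavior: one subplot per series id.
fig_all = graph_prediction_vs_actual_over_time(pipeline, X, y, X_train, y_train, X["date"])

# New single_series argument: plot only series "1" with its own title.
fig_one = graph_prediction_vs_actual_over_time(
    pipeline,
    X,
    y,
    X_train,
    y_train,
    X["date"],
    single_series="1",
)

Passing single_series keeps just that series' target and prediction traces; omitting it produces one subplot per series id, with the figure height scaled by the number of series.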
160 changes: 160 additions & 0 deletions evalml/tests/model_understanding_tests/test_visualizations.py
@@ -28,13 +28,49 @@
ElasticNetRegressor,
LinearRegressor,
MulticlassClassificationPipeline,
MultiseriesRegressionPipeline,
RegressionPipeline,
TimeSeriesRegressionPipeline,
)
from evalml.preprocessing import split_multiseries_data
from evalml.problem_types import ProblemTypes
from evalml.utils import get_random_state, infer_feature_types


@pytest.fixture(scope="module")
def component_graph_multiseries():
return {
"Time Series Featurizer": ["Time Series Featurizer", "X", "y"],
"Baseline Multiseries": [
"Multiseries Time Series Baseline Regressor",
"Time Series Featurizer.x",
"y",
],
}


@pytest.fixture(scope="module")
def pipeline_parameters_multiseries():
return {
"pipeline": {
"time_index": "date",
"max_delay": 10,
"forecast_horizon": 7,
"gap": 0,
"series_id": "series_id",
},
"Time Series Featurizer": {
"time_index": "date",
"max_delay": 10,
"forecast_horizon": 7,
"gap": 0,
"delay_features": False,
"delay_target": True,
},
"Baseline Multiseries": {"gap": 0, "forecast_horizon": 7},
}


@pytest.mark.parametrize("data_type", ["np", "pd", "ww"])
def test_cost_benefit_matrix_vs_threshold(
data_type,
@@ -346,6 +382,35 @@
assert list(results.columns) == ["dates", "target", "prediction"]


def test_get_prediction_vs_actual_over_time_data_multiseries(
multiseries_ts_data_stacked,
component_graph_multiseries,
pipeline_parameters_multiseries,
):
X, y = multiseries_ts_data_stacked
X_train, _, y_train, _ = split_multiseries_data(
X,
y,
"series_id",
"date",
)
pipeline = MultiseriesRegressionPipeline(
component_graph_multiseries,
pipeline_parameters_multiseries,
)
pipeline.fit(X_train, y_train)
results = get_prediction_vs_actual_over_time_data(
pipeline,
X,
y,
X_train,
y_train,
pd.Series(X["date"]),
)
assert isinstance(results, pd.DataFrame)
assert list(results.columns) == ["dates", "target", "prediction", "series_id"]


def test_graph_prediction_vs_actual_over_time(ts_data, go):
X, _, y = ts_data()
X_train, y_train = X.iloc[:30], y.iloc[:30]
@@ -407,6 +472,101 @@
)


def test_graph_prediction_vs_actual_over_time_multiseries_single(
multiseries_ts_data_stacked,
go,
component_graph_multiseries,
pipeline_parameters_multiseries,
):
X, y = multiseries_ts_data_stacked
X_train, _, y_train, _ = split_multiseries_data(
X,
y,
"series_id",
"date",
)
pipeline = MultiseriesRegressionPipeline(
component_graph_multiseries,
pipeline_parameters_multiseries,
)
pipeline.fit(X_train, y_train)
fig = graph_prediction_vs_actual_over_time(
pipeline,
X,
y,
X_train,
y_train,
X["date"],
"1",
)
assert isinstance(fig, go.Figure)
fig_dict = fig.to_dict()

assert fig_dict["layout"]["title"]["text"] == "Graph for Series 1"
assert fig_dict["layout"]["xaxis"]["title"]["text"] == "Time"
assert fig_dict["layout"]["yaxis"]["title"]["text"] == "target"
assert len(fig_dict["data"]) == 2

assert len(fig_dict["data"][0]["x"]) == len(X["date"].unique())
assert len(fig_dict["data"][0]["y"]) == len(X["date"].unique())
assert not np.isnan(fig_dict["data"][0]["y"]).all()
assert fig_dict["data"][0]["name"] == "Series 1: Target"

Check warning on line 513 in evalml/tests/model_understanding_tests/test_visualizations.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/model_understanding_tests/test_visualizations.py#L510-L513

Added lines #L510 - L513 were not covered by tests

assert len(fig_dict["data"][1]["x"]) == len(X["date"].unique())
assert len(fig_dict["data"][1]["y"]) == len(X["date"].unique())
assert not np.isnan(fig_dict["data"][1]["y"]).all()
assert fig_dict["data"][1]["name"] == "Series 1: Prediction"

Check warning on line 518 in evalml/tests/model_understanding_tests/test_visualizations.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/model_understanding_tests/test_visualizations.py#L515-L518

Added lines #L515 - L518 were not covered by tests


def test_graph_prediction_vs_actual_over_time_multiseries(
multiseries_ts_data_stacked,
go,
component_graph_multiseries,
pipeline_parameters_multiseries,
):
X, y = multiseries_ts_data_stacked
X_train, _, y_train, _ = split_multiseries_data(
X,
y,
"series_id",
"date",
)
pipeline = MultiseriesRegressionPipeline(
component_graph_multiseries,
pipeline_parameters_multiseries,
)
pipeline.fit(X_train, y_train)
fig = graph_prediction_vs_actual_over_time(
pipeline,
X,
y,
X_train,
y_train,
X["date"],
)
assert isinstance(fig, go.Figure)

fig_dict = fig.to_dict()
assert fig_dict["layout"]["title"]["text"] == "Graph for Multiseries"
assert fig_dict["layout"]["xaxis"]["title"]["text"] == "Time"
assert fig_dict["layout"]["yaxis"]["title"]["text"] == "target"

Check warning on line 552 in evalml/tests/model_understanding_tests/test_visualizations.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/model_understanding_tests/test_visualizations.py#L549-L552

Added lines #L549 - L552 were not covered by tests

# there are 5 series, and each series has two lines (one for target, one for prediction)
assert len(fig_dict["data"]) == 10

curr_series = 0
for i in range(len(fig_dict["data"])):
assert len(fig_dict["data"][i]["x"]) == len(X["date"].unique())
assert len(fig_dict["data"][i]["y"]) == len(X["date"].unique())
assert not np.isnan(fig_dict["data"][i]["y"]).all()

if i % 2 == 0:
assert fig_dict["data"][i]["name"] == f"Series {curr_series}: Target"

Check warning on line 564 in evalml/tests/model_understanding_tests/test_visualizations.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/model_understanding_tests/test_visualizations.py#L563-L564

Added lines #L563 - L564 were not covered by tests
else:
assert fig_dict["data"][i]["name"] == f"Series {curr_series}: Prediction"
curr_series += 1


def test_decision_tree_data_from_estimator_not_fitted(tree_estimators):
est_class, _ = tree_estimators
with pytest.raises(