Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add STLDecomposer to multiseries pipelines #4299

Merged
merged 13 commits into from
Sep 8, 2023
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Release Notes
* Extended STLDecomposer to Support Multiseries :pr:`4253`
* Extended TimeSeriesImputer to handle multiseries :pr:`4291`
* Added datacheck to check for mismatched series length in multiseries :pr:`4296`
* Added STLDecomposer to multiseries pipelines :pr:`4299`
* Fixes
* Changes
* Documentation Changes
Expand Down
4 changes: 3 additions & 1 deletion evalml/pipelines/component_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -802,14 +802,16 @@ def graph(self, name=None, graph_format=None):
for component_name, component_class in self.component_instances.items():
label = "%s\l" % (component_name) # noqa: W605
if isinstance(component_class, ComponentBase):
# Reformat labels for nodes: cast values to strings, format floats to 2 decimal places, and strip curly braces from dictionary values so Digraph can parse the label
parameters = "\\l".join(
[
key + " : " + "{:0.2f}".format(val)
if (isinstance(val, float))
else key + " : " + str(val)
else key + " : " + str(val).replace("{", "").replace("}", "")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is hard to follow 😅 can you add an explanatory comment?

for key, val in component_class.parameters.items()
],
) # noqa: W605

label = "%s |%s\l" % (component_name, parameters) # noqa: W605
graph.node(component_name, shape="record", label=label, nodesep="0.03")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,7 @@ def inverse_transform(
y.append(y_series)
y_df = pd.DataFrame(y).T
y_df.index = original_index
y_df.columns = y_t.columns
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of curiosity, why is this necessary? What was the situation where the columns weren't the same?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The predictions weren't getting the corresponding series ID values as their column names, which is needed because the decomposer uses those names to select the correct values. Before this change, the mismatch caused the decomposer to return NaN values. @christopherbunn figured that out, so he might have more info.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The predictions that are generated do not have the series ID values as their column names. Copying these names over is required so we can inverse_transform from the decomposer.

return y_df

def get_trend_dataframe(self, X, y):
Expand Down
6 changes: 6 additions & 0 deletions evalml/pipelines/time_series_pipeline_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,12 @@ def predict_in_sample(
calculating_residuals=calculating_residuals,
)
predictions = self._estimator_predict(features)
if isinstance(predictions, pd.Series):
predictions = predictions.rename(self.input_target_name)
elif isinstance(predictions, pd.DataFrame):
predictions = predictions.ww.rename(
dict(zip(predictions.columns, y.columns)),
)
if len(predictions) == len(y):
predictions.index = y.index
predictions = self.inverse_transform(predictions)
Expand Down
43 changes: 26 additions & 17 deletions evalml/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,21 +233,27 @@
def _get_decomposer(X, y, problem_type, estimator_class, sampler_name=None):
components = []
if is_time_series(problem_type) and is_regression(problem_type):
time_index = get_time_index(X, y, None)
# If the time index frequency is uninferrable, STL will fail
if time_index.freq is None:
return components
freq = time_index.freq.name
if STLDecomposer.is_freq_valid(freq):
# Make sure there's a seasonal period
order = 3 if "Q" in freq else 5
seasonal_period = STLDecomposer.determine_periodicity(
X,
y,
rel_max_order=order,
)
if seasonal_period is not None and seasonal_period <= DECOMPOSER_PERIOD_CAP:
components.append(STLDecomposer)
if is_multiseries(problem_type):
components.append(STLDecomposer)
else:
time_index = get_time_index(X, y, None)
# If the time index frequency is uninferrable, STL will fail
if time_index.freq is None:
return components

Check warning on line 242 in evalml/pipelines/utils.py

View check run for this annotation

Codecov / codecov/patch

evalml/pipelines/utils.py#L242

Added line #L242 was not covered by tests
freq = time_index.freq.name
if STLDecomposer.is_freq_valid(freq):
# Make sure there's a seasonal period
order = 3 if "Q" in freq else 5
seasonal_period = STLDecomposer.determine_periodicity(
X,
y,
rel_max_order=order,
)
if (
seasonal_period is not None
and seasonal_period <= DECOMPOSER_PERIOD_CAP
):
components.append(STLDecomposer)
return components


Expand Down Expand Up @@ -292,9 +298,12 @@
list[Transformer]: A list of applicable preprocessing components to use with the estimator.
"""
if is_multiseries(problem_type):
return []
if include_decomposer:
components_functions = [_get_decomposer]
else:
return []

if is_time_series(problem_type):
elif is_time_series(problem_type):
components_functions = [
_get_label_encoder,
_get_drop_all_null,
Expand Down
4 changes: 2 additions & 2 deletions evalml/tests/automl_tests/test_default_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -670,7 +670,7 @@
)

first_batch = algo.next_batch()
assert len(first_batch) == 1
assert len(first_batch) == 2

Check warning on line 673 in evalml/tests/automl_tests/test_default_algorithm.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/automl_tests/test_default_algorithm.py#L673

Added line #L673 was not covered by tests
pipeline = first_batch[0]
assert pipeline.model_family == ModelFamily.VARMAX
assert pipeline.parameters["pipeline"] == search_parameters["pipeline"]
Expand All @@ -679,7 +679,7 @@

long_explore = algo.next_batch()
long_estimators = set([pipeline.estimator.name for pipeline in long_explore])
assert len(long_explore) == 50
assert len(long_explore) == 100

Check warning on line 682 in evalml/tests/automl_tests/test_default_algorithm.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/automl_tests/test_default_algorithm.py#L682

Added line #L682 was not covered by tests
assert len(long_estimators) == 1


Expand Down
7 changes: 5 additions & 2 deletions evalml/tests/automl_tests/test_iterative_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,12 @@
DateTimeFeaturizer,
EmailFeaturizer,
NaturalLanguageFeaturizer,
STLDecomposer,
TimeSeriesFeaturizer,
URLFeaturizer,
)
from evalml.pipelines.components.utils import get_estimators
from evalml.pipelines.utils import make_pipeline
from evalml.pipelines.utils import is_regression, make_pipeline

Check warning on line 26 in evalml/tests/automl_tests/test_iterative_algorithm.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/automl_tests/test_iterative_algorithm.py#L26

Added line #L26 was not covered by tests
from evalml.problem_types import ProblemTypes, is_multiseries, is_time_series


Expand Down Expand Up @@ -97,6 +98,7 @@
assert algo.batch_number == 0
assert algo.default_max_batches == 1
estimators = get_estimators(problem_type)
decomposer = [STLDecomposer] if is_regression(problem_type) else []

Check warning on line 101 in evalml/tests/automl_tests/test_iterative_algorithm.py

View check run for this annotation

Codecov / codecov/patch

evalml/tests/automl_tests/test_iterative_algorithm.py#L101

Added line #L101 was not covered by tests
assert len(algo.allowed_pipelines) == len(
[
make_pipeline(
Expand All @@ -107,7 +109,8 @@
parameters=search_parameters,
)
for estimator in estimators
],
]
+ decomposer,
)


Expand Down
2 changes: 1 addition & 1 deletion evalml/tests/pipeline_tests/test_pipeline_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def test_make_pipeline(

if is_time_series(problem_type):
if is_multiseries(problem_type):
expected_components = dfs + [estimator_class]
expected_components = dfs + decomposer + [estimator_class]
else:
expected_components = (
dfs
Expand Down
Loading