Skip to content

Commit

Permalink
Add STLDecomposer to multiseries pipelines (#4299)
Browse files Browse the repository at this point in the history
* Add decomposer to tests

* Remove NaN values

* Handle Series and DataFrame

* Fix STL graph

* Fix condition for adding decomposer

---------

Co-authored-by: christopherbunn <chris.l.bunn@gmail.com>
  • Loading branch information
remyogasawara and christopherbunn authored Sep 8, 2023
1 parent 1329988 commit 81abfca
Show file tree
Hide file tree
Showing 8 changed files with 45 additions and 23 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Release Notes
* Extended STLDecomposer to Support Multiseries :pr:`4253`
* Extended TimeSeriesImputer to handle multiseries :pr:`4291`
* Added datacheck to check for mismatched series length in multiseries :pr:`4296`
* Added STLDecomposer to multiseries pipelines :pr:`4299`
* Fixes
* Changes
* Documentation Changes
Expand Down
4 changes: 3 additions & 1 deletion evalml/pipelines/component_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -802,14 +802,16 @@ def graph(self, name=None, graph_format=None):
for component_name, component_class in self.component_instances.items():
label = "%s\l" % (component_name) # noqa: W605
if isinstance(component_class, ComponentBase):
# Reformat node labels: cast values to strings, format floats to 2 decimal places, and strip curly braces from dictionary values so Digraph can parse them
parameters = "\\l".join(
[
key + " : " + "{:0.2f}".format(val)
if (isinstance(val, float))
else key + " : " + str(val)
else key + " : " + str(val).replace("{", "").replace("}", "")
for key, val in component_class.parameters.items()
],
) # noqa: W605

label = "%s |%s\l" % (component_name, parameters) # noqa: W605
graph.node(component_name, shape="record", label=label, nodesep="0.03")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,7 @@ def inverse_transform(
y.append(y_series)
y_df = pd.DataFrame(y).T
y_df.index = original_index
y_df.columns = y_t.columns
return y_df

def get_trend_dataframe(self, X, y):
Expand Down
6 changes: 6 additions & 0 deletions evalml/pipelines/time_series_pipeline_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,12 @@ def predict_in_sample(
calculating_residuals=calculating_residuals,
)
predictions = self._estimator_predict(features)
if isinstance(predictions, pd.Series):
predictions = predictions.rename(self.input_target_name)
elif isinstance(predictions, pd.DataFrame):
predictions = predictions.ww.rename(
dict(zip(predictions.columns, y.columns)),
)
if len(predictions) == len(y):
predictions.index = y.index
predictions = self.inverse_transform(predictions)
Expand Down
43 changes: 26 additions & 17 deletions evalml/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,21 +233,27 @@ def _get_time_series_featurizer(X, y, problem_type, estimator_class, sampler_nam
def _get_decomposer(X, y, problem_type, estimator_class, sampler_name=None):
components = []
if is_time_series(problem_type) and is_regression(problem_type):
time_index = get_time_index(X, y, None)
# If the time index frequency is uninferrable, STL will fail
if time_index.freq is None:
return components
freq = time_index.freq.name
if STLDecomposer.is_freq_valid(freq):
# Make sure there's a seasonal period
order = 3 if "Q" in freq else 5
seasonal_period = STLDecomposer.determine_periodicity(
X,
y,
rel_max_order=order,
)
if seasonal_period is not None and seasonal_period <= DECOMPOSER_PERIOD_CAP:
components.append(STLDecomposer)
if is_multiseries(problem_type):
components.append(STLDecomposer)
else:
time_index = get_time_index(X, y, None)
# If the time index frequency is uninferrable, STL will fail
if time_index.freq is None:
return components
freq = time_index.freq.name
if STLDecomposer.is_freq_valid(freq):
# Make sure there's a seasonal period
order = 3 if "Q" in freq else 5
seasonal_period = STLDecomposer.determine_periodicity(
X,
y,
rel_max_order=order,
)
if (
seasonal_period is not None
and seasonal_period <= DECOMPOSER_PERIOD_CAP
):
components.append(STLDecomposer)
return components


Expand Down Expand Up @@ -292,9 +298,12 @@ def _get_preprocessing_components(
list[Transformer]: A list of applicable preprocessing components to use with the estimator.
"""
if is_multiseries(problem_type):
return []
if include_decomposer:
components_functions = [_get_decomposer]
else:
return []

if is_time_series(problem_type):
elif is_time_series(problem_type):
components_functions = [
_get_label_encoder,
_get_drop_all_null,
Expand Down
4 changes: 2 additions & 2 deletions evalml/tests/automl_tests/test_default_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -670,7 +670,7 @@ def test_default_algorithm_multiseries_time_series(
)

first_batch = algo.next_batch()
assert len(first_batch) == 1
assert len(first_batch) == 2
pipeline = first_batch[0]
assert pipeline.model_family == ModelFamily.VARMAX
assert pipeline.parameters["pipeline"] == search_parameters["pipeline"]
Expand All @@ -679,7 +679,7 @@ def test_default_algorithm_multiseries_time_series(

long_explore = algo.next_batch()
long_estimators = set([pipeline.estimator.name for pipeline in long_explore])
assert len(long_explore) == 50
assert len(long_explore) == 100
assert len(long_estimators) == 1


Expand Down
7 changes: 5 additions & 2 deletions evalml/tests/automl_tests/test_iterative_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,12 @@
DateTimeFeaturizer,
EmailFeaturizer,
NaturalLanguageFeaturizer,
STLDecomposer,
TimeSeriesFeaturizer,
URLFeaturizer,
)
from evalml.pipelines.components.utils import get_estimators
from evalml.pipelines.utils import make_pipeline
from evalml.pipelines.utils import is_regression, make_pipeline
from evalml.problem_types import ProblemTypes, is_multiseries, is_time_series


Expand Down Expand Up @@ -97,6 +98,7 @@ def test_iterative_algorithm_init(
assert algo.batch_number == 0
assert algo.default_max_batches == 1
estimators = get_estimators(problem_type)
decomposer = [STLDecomposer] if is_regression(problem_type) else []
assert len(algo.allowed_pipelines) == len(
[
make_pipeline(
Expand All @@ -107,7 +109,8 @@ def test_iterative_algorithm_init(
parameters=search_parameters,
)
for estimator in estimators
],
]
+ decomposer,
)


Expand Down
2 changes: 1 addition & 1 deletion evalml/tests/pipeline_tests/test_pipeline_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def test_make_pipeline(

if is_time_series(problem_type):
if is_multiseries(problem_type):
expected_components = dfs + [estimator_class]
expected_components = dfs + decomposer + [estimator_class]
else:
expected_components = (
dfs
Expand Down

0 comments on commit 81abfca

Please sign in to comment.