Add STLDecomposer to multiseries pipelines (#4299)

* add decomposer to tests
* Remove nan values
* handle series and df
* fix stl graph
* fix condition for adding decomposer

---------

Co-authored-by: christopherbunn <chris.l.bunn@gmail.com>
alteryx · Sep 8, 2023 · 81abfca · 81abfca
1 parent 1329988
commit 81abfca
Show file tree

Hide file tree

Showing 8 changed files with 45 additions and 23 deletions.
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -5,6 +5,7 @@ Release Notes
         * Extended STLDecomposer to Support Multiseries :pr:`4253`
         * Extended TimeSeriesImputer to handle multiseries :pr:`4291`
         * Added datacheck to check for mismatched series length in multiseries :pr:`4296`
+        * Added STLDecomposer to multiseries pipelines :pr:`4299`
     * Fixes
     * Changes
     * Documentation Changes

diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py
@@ -802,14 +802,16 @@ def graph(self, name=None, graph_format=None):
         for component_name, component_class in self.component_instances.items():
             label = "%s\l" % (component_name)  # noqa: W605
             if isinstance(component_class, ComponentBase):
+                # Reformat node labels: cast values to strings, format floats to 2 decimal places, and strip curly braces from the stringified values (e.g. dictionaries) so Digraph can parse the label
                 parameters = "\\l".join(
                     [
                         key + " : " + "{:0.2f}".format(val)
                         if (isinstance(val, float))
-                        else key + " : " + str(val)
+                        else key + " : " + str(val).replace("{", "").replace("}", "")
                         for key, val in component_class.parameters.items()
                     ],
                 )  # noqa: W605
+
                 label = "%s |%s\l" % (component_name, parameters)  # noqa: W605
             graph.node(component_name, shape="record", label=label, nodesep="0.03")
 

diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py
@@ -442,6 +442,7 @@ def inverse_transform(
             y.append(y_series)
         y_df = pd.DataFrame(y).T
         y_df.index = original_index
+        y_df.columns = y_t.columns
         return y_df
 
     def get_trend_dataframe(self, X, y):

diff --git a/evalml/pipelines/time_series_pipeline_base.py b/evalml/pipelines/time_series_pipeline_base.py
@@ -265,6 +265,12 @@ def predict_in_sample(
             calculating_residuals=calculating_residuals,
         )
         predictions = self._estimator_predict(features)
+        if isinstance(predictions, pd.Series):
+            predictions = predictions.rename(self.input_target_name)
+        elif isinstance(predictions, pd.DataFrame):
+            predictions = predictions.ww.rename(
+                dict(zip(predictions.columns, y.columns)),
+            )
         if len(predictions) == len(y):
             predictions.index = y.index
         predictions = self.inverse_transform(predictions)

diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py
@@ -233,21 +233,27 @@ def _get_time_series_featurizer(X, y, problem_type, estimator_class, sampler_nam
 def _get_decomposer(X, y, problem_type, estimator_class, sampler_name=None):
     components = []
     if is_time_series(problem_type) and is_regression(problem_type):
-        time_index = get_time_index(X, y, None)
-        # If the time index frequency is uninferrable, STL will fail
-        if time_index.freq is None:
-            return components
-        freq = time_index.freq.name
-        if STLDecomposer.is_freq_valid(freq):
-            # Make sure there's a seasonal period
-            order = 3 if "Q" in freq else 5
-            seasonal_period = STLDecomposer.determine_periodicity(
-                X,
-                y,
-                rel_max_order=order,
-            )
-            if seasonal_period is not None and seasonal_period <= DECOMPOSER_PERIOD_CAP:
-                components.append(STLDecomposer)
+        if is_multiseries(problem_type):
+            components.append(STLDecomposer)
+        else:
+            time_index = get_time_index(X, y, None)
+            # If the time index frequency is uninferrable, STL will fail
+            if time_index.freq is None:
+                return components
+            freq = time_index.freq.name
+            if STLDecomposer.is_freq_valid(freq):
+                # Make sure there's a seasonal period
+                order = 3 if "Q" in freq else 5
+                seasonal_period = STLDecomposer.determine_periodicity(
+                    X,
+                    y,
+                    rel_max_order=order,
+                )
+                if (
+                    seasonal_period is not None
+                    and seasonal_period <= DECOMPOSER_PERIOD_CAP
+                ):
+                    components.append(STLDecomposer)
     return components
 
 
@@ -292,9 +298,12 @@ def _get_preprocessing_components(
         list[Transformer]: A list of applicable preprocessing components to use with the estimator.
     """
     if is_multiseries(problem_type):
-        return []
+        if include_decomposer:
+            components_functions = [_get_decomposer]
+        else:
+            return []
 
-    if is_time_series(problem_type):
+    elif is_time_series(problem_type):
         components_functions = [
             _get_label_encoder,
             _get_drop_all_null,

diff --git a/evalml/tests/automl_tests/test_default_algorithm.py b/evalml/tests/automl_tests/test_default_algorithm.py
@@ -670,7 +670,7 @@ def test_default_algorithm_multiseries_time_series(
     )
 
     first_batch = algo.next_batch()
-    assert len(first_batch) == 1
+    assert len(first_batch) == 2
     pipeline = first_batch[0]
     assert pipeline.model_family == ModelFamily.VARMAX
     assert pipeline.parameters["pipeline"] == search_parameters["pipeline"]
@@ -679,7 +679,7 @@ def test_default_algorithm_multiseries_time_series(
 
     long_explore = algo.next_batch()
     long_estimators = set([pipeline.estimator.name for pipeline in long_explore])
-    assert len(long_explore) == 50
+    assert len(long_explore) == 100
     assert len(long_estimators) == 1
 
 

diff --git a/evalml/tests/automl_tests/test_iterative_algorithm.py b/evalml/tests/automl_tests/test_iterative_algorithm.py
@@ -18,11 +18,12 @@
     DateTimeFeaturizer,
     EmailFeaturizer,
     NaturalLanguageFeaturizer,
+    STLDecomposer,
     TimeSeriesFeaturizer,
     URLFeaturizer,
 )
 from evalml.pipelines.components.utils import get_estimators
-from evalml.pipelines.utils import make_pipeline
+from evalml.pipelines.utils import is_regression, make_pipeline
 from evalml.problem_types import ProblemTypes, is_multiseries, is_time_series
 
 
@@ -97,6 +98,7 @@ def test_iterative_algorithm_init(
     assert algo.batch_number == 0
     assert algo.default_max_batches == 1
     estimators = get_estimators(problem_type)
+    decomposer = [STLDecomposer] if is_regression(problem_type) else []
     assert len(algo.allowed_pipelines) == len(
         [
             make_pipeline(
@@ -107,7 +109,8 @@ def test_iterative_algorithm_init(
                 parameters=search_parameters,
             )
             for estimator in estimators
-        ],
+        ]
+        + decomposer,
     )
 
 

diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py
@@ -170,7 +170,7 @@ def test_make_pipeline(
 
             if is_time_series(problem_type):
                 if is_multiseries(problem_type):
-                    expected_components = dfs + [estimator_class]
+                    expected_components = dfs + decomposer + [estimator_class]
                 else:
                     expected_components = (
                         dfs