From a7f704a41714afedb02f132322d009996f4aa12e Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Fri, 1 Sep 2023 17:45:22 -0700 Subject: [PATCH 01/12] init commit --- evalml/pipelines/utils.py | 7 +++++-- evalml/utils/gen_utils.py | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index dbc51abee8..1f4779ad07 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -292,9 +292,12 @@ def _get_preprocessing_components( list[Transformer]: A list of applicable preprocessing components to use with the estimator. """ if is_multiseries(problem_type): - return [] + if include_decomposer: + components_functions = [_get_decomposer] + else: + return [] - if is_time_series(problem_type): + elif is_time_series(problem_type): components_functions = [ _get_label_encoder, _get_drop_all_null, diff --git a/evalml/utils/gen_utils.py b/evalml/utils/gen_utils.py index af253fa021..7128b6b2ba 100644 --- a/evalml/utils/gen_utils.py +++ b/evalml/utils/gen_utils.py @@ -679,7 +679,8 @@ def get_time_index(X: pd.DataFrame, y: pd.Series, time_index_name: str): raise ValueError( f"Too many Datetime features provided in data and provided time_index column {time_index_name} not present in data.", ) - + if dt_col.duplicated().any(): + dt_col = dt_col.drop_duplicates() if not isinstance(dt_col, pd.DatetimeIndex) or dt_col.freq is None: dt_col = pd.DatetimeIndex(dt_col, freq="infer") time_index = dt_col.rename(y.index.name) From 39f3e80e61a206b2e235ba140744fc1124b0f76f Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Fri, 1 Sep 2023 17:54:58 -0700 Subject: [PATCH 02/12] update release notes --- docs/source/release_notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 881451efa2..eda28b6b9a 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -3,6 +3,7 @@ Release Notes **Future Releases** * Enhancements * Extended STLDecomposer to Support Multiseries :pr:`4253` + * Added STLDecomposer to multiseries pipelines :pr:`4299` * Fixes * Changes * Documentation Changes From 4689a95c4a7f7156adc7f8f91c791f80db1b20e6 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Tue, 5 Sep 2023 17:12:14 -0700 Subject: [PATCH 03/12] add decomposer to tests --- evalml/tests/pipeline_tests/test_pipeline_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index 92eb95cc0e..db6de1a9d0 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -170,7 +170,7 @@ def test_make_pipeline( if is_time_series(problem_type): if is_multiseries(problem_type): - expected_components = dfs + [estimator_class] + expected_components = dfs + decomposer + [estimator_class] else: expected_components = ( dfs From 6aa8a259425bce6ae384697c3a57d505fa3a911d Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Wed, 6 Sep 2023 12:34:55 -0700 Subject: [PATCH 04/12] handle duplicates --- evalml/utils/gen_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/evalml/utils/gen_utils.py b/evalml/utils/gen_utils.py index 7128b6b2ba..827930b8b6 100644 --- a/evalml/utils/gen_utils.py +++ b/evalml/utils/gen_utils.py @@ -679,10 +679,11 @@ def get_time_index(X: pd.DataFrame, y: pd.Series, time_index_name: str): raise ValueError( f"Too many Datetime features provided in data and provided time_index column {time_index_name} not present in data.", ) - if dt_col.duplicated().any(): - dt_col = dt_col.drop_duplicates() if not isinstance(dt_col, pd.DatetimeIndex) or dt_col.freq is None: dt_col = pd.DatetimeIndex(dt_col, freq="infer") + if dt_col.duplicated().any(): + temp_dt_col = pd.DatetimeIndex(dt_col.copy().drop_duplicates(), freq="infer") + dt_col.freq = temp_dt_col.freq time_index = dt_col.rename(y.index.name) return time_index From 6bea453bc0bfc65bd11997638b4817902bb3bf1f Mon Sep 17 00:00:00 2001 From: christopherbunn Date: Thu, 7 Sep 2023 11:32:21 -0400 Subject: [PATCH 05/12] Remove nan values - NOT FINISHED --- .../preprocessing/stl_decomposer.py | 1 + evalml/pipelines/time_series_pipeline_base.py | 5 ++++ evalml/pipelines/utils.py | 28 ++++++++++--------- evalml/utils/gen_utils.py | 6 ++-- 4 files changed, 24 insertions(+), 16 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index b4bcfdd029..503be35da3 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -442,6 +442,7 @@ def inverse_transform( y.append(y_series) y_df = pd.DataFrame(y).T y_df.index = original_index + y_df.columns = y_t.columns return y_df def get_trend_dataframe(self, X, y): diff --git a/evalml/pipelines/time_series_pipeline_base.py b/evalml/pipelines/time_series_pipeline_base.py index 3badb6dc09..0b82a86dcf 100644 --- a/evalml/pipelines/time_series_pipeline_base.py +++ b/evalml/pipelines/time_series_pipeline_base.py @@ -265,6 +265,11 @@ def predict_in_sample( calculating_residuals=calculating_residuals, ) predictions = self._estimator_predict(features) + if len(predictions.columns) == len(y.columns): + # predictions.columns = y.columns + predictions = predictions.ww.rename( + dict(zip(predictions.columns, y.columns)), + ) if len(predictions) == len(y): predictions.index = y.index predictions = self.inverse_transform(predictions) diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index 1f4779ad07..897c3ce61d 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -235,19 +235,21 @@ def _get_decomposer(X, y, problem_type, estimator_class, sampler_name=None): if is_time_series(problem_type) and is_regression(problem_type): time_index = get_time_index(X, y, None) # If the time index frequency is uninferrable, STL will fail - if time_index.freq is None: - return components - freq = time_index.freq.name - if STLDecomposer.is_freq_valid(freq): - # Make sure there's a seasonal period - order = 3 if "Q" in freq else 5 - seasonal_period = STLDecomposer.determine_periodicity( - X, - y, - rel_max_order=order, - ) - if seasonal_period is not None and seasonal_period <= DECOMPOSER_PERIOD_CAP: - components.append(STLDecomposer) + # if time_index.freq is None: + # return components + if time_index.freq is not None: + order = 3 if "Q" in time_index.freq.name else 5 + else: + order = 5 + # if STLDecomposer.is_freq_valid(freq): + # Make sure there's a seasonal period + seasonal_period = STLDecomposer.determine_periodicity( + X, + y, + rel_max_order=order, + ) + if seasonal_period is not None and seasonal_period <= DECOMPOSER_PERIOD_CAP: + components.append(STLDecomposer) return components diff --git a/evalml/utils/gen_utils.py b/evalml/utils/gen_utils.py index 827930b8b6..e53142f44d 100644 --- a/evalml/utils/gen_utils.py +++ b/evalml/utils/gen_utils.py @@ -681,9 +681,9 @@ def get_time_index(X: pd.DataFrame, y: pd.Series, time_index_name: str): ) if not isinstance(dt_col, pd.DatetimeIndex) or dt_col.freq is None: dt_col = pd.DatetimeIndex(dt_col, freq="infer") - if dt_col.duplicated().any(): - temp_dt_col = pd.DatetimeIndex(dt_col.copy().drop_duplicates(), freq="infer") - dt_col.freq = temp_dt_col.freq + # if dt_col.duplicated().any(): + # temp_dt_col = pd.DatetimeIndex(dt_col.copy().drop_duplicates(), freq="infer") + # dt_col.freq = temp_dt_col.freq time_index = dt_col.rename(y.index.name) return time_index From 189eb4943f161882c53fef444775315d0dc9718b Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Thu, 7 Sep 2023 10:43:01 -0700 Subject: [PATCH 06/12] handle series and df --- evalml/pipelines/time_series_pipeline_base.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/evalml/pipelines/time_series_pipeline_base.py b/evalml/pipelines/time_series_pipeline_base.py index 0b82a86dcf..5a0a23e2af 100644 --- a/evalml/pipelines/time_series_pipeline_base.py +++ b/evalml/pipelines/time_series_pipeline_base.py @@ -265,8 +265,11 @@ def predict_in_sample( calculating_residuals=calculating_residuals, ) predictions = self._estimator_predict(features) - if len(predictions.columns) == len(y.columns): - # predictions.columns = y.columns + if isinstance(predictions, pd.Series) and len(predictions) == len(y): + predictions = predictions.rename(self.input_target_name) + elif isinstance(predictions, pd.DataFrame) and len(predictions.columns) == len( + y.columns, + ): predictions = predictions.ww.rename( dict(zip(predictions.columns, y.columns)), ) From 4a9ab0e973172fa9d2e4cf935962fbff3c5d0f82 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Thu, 7 Sep 2023 16:15:16 -0700 Subject: [PATCH 07/12] fix stl graph --- evalml/pipelines/component_graph.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py index 48e83807c1..ac85488ea2 100644 --- a/evalml/pipelines/component_graph.py +++ b/evalml/pipelines/component_graph.py @@ -806,10 +806,11 @@ def graph(self, name=None, graph_format=None): [ key + " : " + "{:0.2f}".format(val) if (isinstance(val, float)) - else key + " : " + str(val) + else key + " : " + str(val).replace("{", "").replace("}", "") for key, val in component_class.parameters.items() ], ) # noqa: W605 + label = "%s |%s\l" % (component_name, parameters) # noqa: W605 graph.node(component_name, shape="record", label=label, nodesep="0.03") From 2fd85fe9b4b27771a09b2f0645fd3522027d9299 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Thu, 7 Sep 2023 17:04:06 -0700 Subject: [PATCH 08/12] fix if statements --- evalml/pipelines/time_series_pipeline_base.py | 6 ++---- evalml/utils/gen_utils.py | 3 --- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/evalml/pipelines/time_series_pipeline_base.py b/evalml/pipelines/time_series_pipeline_base.py index 5a0a23e2af..37f163cd90 100644 --- a/evalml/pipelines/time_series_pipeline_base.py +++ b/evalml/pipelines/time_series_pipeline_base.py @@ -265,11 +265,9 @@ def predict_in_sample( calculating_residuals=calculating_residuals, ) predictions = self._estimator_predict(features) - if isinstance(predictions, pd.Series) and len(predictions) == len(y): + if isinstance(predictions, pd.Series): predictions = predictions.rename(self.input_target_name) - elif isinstance(predictions, pd.DataFrame) and len(predictions.columns) == len( - y.columns, - ): + elif isinstance(predictions, pd.DataFrame): predictions = predictions.ww.rename( dict(zip(predictions.columns, y.columns)), ) diff --git a/evalml/utils/gen_utils.py b/evalml/utils/gen_utils.py index e53142f44d..ae7b444654 100644 --- a/evalml/utils/gen_utils.py +++ b/evalml/utils/gen_utils.py @@ -681,9 +681,6 @@ def get_time_index(X: pd.DataFrame, y: pd.Series, time_index_name: str): ) if not isinstance(dt_col, pd.DatetimeIndex) or dt_col.freq is None: dt_col = pd.DatetimeIndex(dt_col, freq="infer") - # if dt_col.duplicated().any(): - # temp_dt_col = pd.DatetimeIndex(dt_col.copy().drop_duplicates(), freq="infer") - # dt_col.freq = temp_dt_col.freq time_index = dt_col.rename(y.index.name) return time_index From 639408ab22ca40d3c65b9df9e6d05585d8d7e158 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Thu, 7 Sep 2023 17:06:11 -0700 Subject: [PATCH 09/12] revert utils --- evalml/utils/gen_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/evalml/utils/gen_utils.py b/evalml/utils/gen_utils.py index ae7b444654..af253fa021 100644 --- a/evalml/utils/gen_utils.py +++ b/evalml/utils/gen_utils.py @@ -679,6 +679,7 @@ def get_time_index(X: pd.DataFrame, y: pd.Series, time_index_name: str): raise ValueError( f"Too many Datetime features provided in data and provided time_index column {time_index_name} not present in data.", ) + if not isinstance(dt_col, pd.DatetimeIndex) or dt_col.freq is None: dt_col = pd.DatetimeIndex(dt_col, freq="infer") time_index = dt_col.rename(y.index.name) From 12e1771bbc6bdab49c2324dd335c1872e25cab70 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Thu, 7 Sep 2023 17:07:46 -0700 Subject: [PATCH 10/12] comments --- evalml/pipelines/utils.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index 897c3ce61d..255065b053 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -234,14 +234,10 @@ def _get_decomposer(X, y, problem_type, estimator_class, sampler_name=None): components = [] if is_time_series(problem_type) and is_regression(problem_type): time_index = get_time_index(X, y, None) - # If the time index frequency is uninferrable, STL will fail - # if time_index.freq is None: - # return components if time_index.freq is not None: order = 3 if "Q" in time_index.freq.name else 5 else: order = 5 - # if STLDecomposer.is_freq_valid(freq): # Make sure there's a seasonal period seasonal_period = STLDecomposer.determine_periodicity( X, From 8ab7fdf1762e061052efe6bded8b694fafe018cd Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Fri, 8 Sep 2023 09:34:44 -0700 Subject: [PATCH 11/12] add comments and conditional branch --- evalml/pipelines/component_graph.py | 1 + evalml/pipelines/utils.py | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py index ac85488ea2..0f3f4e5810 100644 --- a/evalml/pipelines/component_graph.py +++ b/evalml/pipelines/component_graph.py @@ -802,6 +802,7 @@ def graph(self, name=None, graph_format=None): for component_name, component_class in self.component_instances.items(): label = "%s\l" % (component_name) # noqa: W605 if isinstance(component_class, ComponentBase): + # Reformat labels for nodes: cast values as strings, reformat floats to 2 decimal points and remove brackets from dictionary values so Digraph can parse it parameters = "\\l".join( [ key + " : " + "{:0.2f}".format(val) diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index 255065b053..d2ce680105 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -233,19 +233,19 @@ def _get_time_series_featurizer(X, y, problem_type, estimator_class, sampler_nam def _get_decomposer(X, y, problem_type, estimator_class, sampler_name=None): components = [] if is_time_series(problem_type) and is_regression(problem_type): - time_index = get_time_index(X, y, None) - if time_index.freq is not None: - order = 3 if "Q" in time_index.freq.name else 5 - else: - order = 5 - # Make sure there's a seasonal period - seasonal_period = STLDecomposer.determine_periodicity( - X, - y, - rel_max_order=order, - ) - if seasonal_period is not None and seasonal_period <= DECOMPOSER_PERIOD_CAP: + if is_multiseries(problem_type): components.append(STLDecomposer) + else: + time_index = get_time_index(X, y, None) + order = 3 if "Q" in time_index.freq.name else 5 + # Make sure there's a seasonal period + seasonal_period = STLDecomposer.determine_periodicity( + X, + y, + rel_max_order=order, + ) + if seasonal_period is not None and seasonal_period <= DECOMPOSER_PERIOD_CAP: + components.append(STLDecomposer) return components From 28e2cdb8985993833607ce080c5eea53ab6cb3a6 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Fri, 8 Sep 2023 10:51:03 -0700 Subject: [PATCH 12/12] fix condition for adding decomposer --- evalml/pipelines/utils.py | 26 ++++++++++++------- .../automl_tests/test_default_algorithm.py | 4 +-- .../automl_tests/test_iterative_algorithm.py | 7 +++-- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index d2ce680105..e23998096d 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -237,15 +237,23 @@ def _get_decomposer(X, y, problem_type, estimator_class, sampler_name=None): components.append(STLDecomposer) else: time_index = get_time_index(X, y, None) - order = 3 if "Q" in time_index.freq.name else 5 - # Make sure there's a seasonal period - seasonal_period = STLDecomposer.determine_periodicity( - X, - y, - rel_max_order=order, - ) - if seasonal_period is not None and seasonal_period <= DECOMPOSER_PERIOD_CAP: - components.append(STLDecomposer) + # If the time index frequency is uninferrable, STL will fail + if time_index.freq is None: + return components + freq = time_index.freq.name + if STLDecomposer.is_freq_valid(freq): + # Make sure there's a seasonal period + order = 3 if "Q" in freq else 5 + seasonal_period = STLDecomposer.determine_periodicity( + X, + y, + rel_max_order=order, + ) + if ( + seasonal_period is not None + and seasonal_period <= DECOMPOSER_PERIOD_CAP + ): + components.append(STLDecomposer) return components diff --git a/evalml/tests/automl_tests/test_default_algorithm.py b/evalml/tests/automl_tests/test_default_algorithm.py index b21cc452cb..31b8a166f7 100644 --- a/evalml/tests/automl_tests/test_default_algorithm.py +++ b/evalml/tests/automl_tests/test_default_algorithm.py @@ -670,7 +670,7 @@ def test_default_algorithm_multiseries_time_series( ) first_batch = algo.next_batch() - assert len(first_batch) == 1 + assert len(first_batch) == 2 pipeline = first_batch[0] assert pipeline.model_family == ModelFamily.VARMAX assert pipeline.parameters["pipeline"] == search_parameters["pipeline"] @@ -679,7 +679,7 @@ def test_default_algorithm_multiseries_time_series( long_explore = algo.next_batch() long_estimators = set([pipeline.estimator.name for pipeline in long_explore]) - assert len(long_explore) == 50 + assert len(long_explore) == 100 assert len(long_estimators) == 1 diff --git a/evalml/tests/automl_tests/test_iterative_algorithm.py b/evalml/tests/automl_tests/test_iterative_algorithm.py index f5ed9b73ac..3030c09909 100644 --- a/evalml/tests/automl_tests/test_iterative_algorithm.py +++ b/evalml/tests/automl_tests/test_iterative_algorithm.py @@ -18,11 +18,12 @@ DateTimeFeaturizer, EmailFeaturizer, NaturalLanguageFeaturizer, + STLDecomposer, TimeSeriesFeaturizer, URLFeaturizer, ) from evalml.pipelines.components.utils import get_estimators -from evalml.pipelines.utils import make_pipeline +from evalml.pipelines.utils import is_regression, make_pipeline from evalml.problem_types import ProblemTypes, is_multiseries, is_time_series @@ -97,6 +98,7 @@ def test_iterative_algorithm_init( assert algo.batch_number == 0 assert algo.default_max_batches == 1 estimators = get_estimators(problem_type) + decomposer = [STLDecomposer] if is_regression(problem_type) else [] assert len(algo.allowed_pipelines) == len( [ make_pipeline( @@ -107,7 +109,8 @@ def test_iterative_algorithm_init( parameters=search_parameters, ) for estimator in estimators - ], + ] + + decomposer, )