diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index fc1c580c3b..ee0f2c1f70 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -3,6 +3,7 @@ Release Notes **Future Releases** * Enhancements * Updated regression metrics to handle multioutput dataframes as well as single output series :pr:`4233` + * Added baseline regressor for multiseries time series problems :pr:`4246` * Added stacking and unstacking utility functions to work with multiseries data :pr:`4250` * Fixes * Added support for pandas 2 :pr:`4216` diff --git a/evalml/pipelines/components/__init__.py b/evalml/pipelines/components/__init__.py index 1d00a850cf..30a200256b 100644 --- a/evalml/pipelines/components/__init__.py +++ b/evalml/pipelines/components/__init__.py @@ -21,6 +21,7 @@ DecisionTreeClassifier, DecisionTreeRegressor, TimeSeriesBaselineEstimator, + MultiseriesTimeSeriesBaselineRegressor, KNeighborsClassifier, ProphetRegressor, SVMClassifier, diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py index 12b6603bb4..5c00a053e1 100644 --- a/evalml/pipelines/components/component_base.py +++ b/evalml/pipelines/components/component_base.py @@ -31,6 +31,7 @@ class ComponentBase(ABC, metaclass=ComponentBaseMeta): # Referring to the pandas nullable dtypes; not just woodwork logical types _integer_nullable_incompatibilities = [] _boolean_nullable_incompatibilities = [] + is_multiseries = False def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs): """Base class for all components. 
diff --git a/evalml/pipelines/components/estimators/__init__.py b/evalml/pipelines/components/estimators/__init__.py index 1528742106..ce9dc742a1 100644 --- a/evalml/pipelines/components/estimators/__init__.py +++ b/evalml/pipelines/components/estimators/__init__.py @@ -25,6 +25,7 @@ ExtraTreesRegressor, BaselineRegressor, TimeSeriesBaselineEstimator, + MultiseriesTimeSeriesBaselineRegressor, DecisionTreeRegressor, SVMRegressor, ExponentialSmoothingRegressor, diff --git a/evalml/pipelines/components/estimators/regressors/__init__.py b/evalml/pipelines/components/estimators/regressors/__init__.py index a35167d54b..b98e3a7fdb 100644 --- a/evalml/pipelines/components/estimators/regressors/__init__.py +++ b/evalml/pipelines/components/estimators/regressors/__init__.py @@ -29,6 +29,9 @@ from evalml.pipelines.components.estimators.regressors.time_series_baseline_estimator import ( TimeSeriesBaselineEstimator, ) +from evalml.pipelines.components.estimators.regressors.multiseries_time_series_baseline_regressor import ( + MultiseriesTimeSeriesBaselineRegressor, +) from evalml.pipelines.components.estimators.regressors.prophet_regressor import ( ProphetRegressor, ) diff --git a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py new file mode 100644 index 0000000000..f01132a608 --- /dev/null +++ b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py @@ -0,0 +1,114 @@ +"""Time series estimator that predicts using the naive forecasting approach.""" +import numpy as np +import pandas as pd + +from evalml.model_family import ModelFamily +from evalml.pipelines.components.estimators import Estimator +from evalml.pipelines.components.transformers import TimeSeriesFeaturizer +from evalml.problem_types import ProblemTypes +from evalml.utils import infer_feature_types + + +class 
MultiseriesTimeSeriesBaselineRegressor(Estimator): + """Multiseries time series regressor that predicts using the naive forecasting approach. + + This is useful as a simple baseline estimator for multiseries time series problems. + + Args: + gap (int): Gap between prediction date and target date and must be a non-negative integer. If gap is 0, target date will be shifted ahead by 1 time period. Defaults to 1. + forecast_horizon (int): Number of time steps the model is expected to predict. Defaults to 1. + random_seed (int): Seed for the random number generator. Defaults to 0. + """ + + name = "Multiseries Time Series Baseline Regressor" + hyperparameter_ranges = {} + """{}""" + model_family = ModelFamily.BASELINE + """ModelFamily.BASELINE""" + is_multiseries = True + supported_problem_types = [ + ProblemTypes.TIME_SERIES_REGRESSION, + ] + """[ + ProblemTypes.TIME_SERIES_REGRESSION, + ]""" + + def __init__(self, gap=1, forecast_horizon=1, random_seed=0, **kwargs): + self._prediction_value = None + self.start_delay = forecast_horizon + gap + self._num_features = None + + if gap < 0: + raise ValueError( + f"gap value must be a positive integer. {gap} was provided.", + ) + + parameters = {"gap": gap, "forecast_horizon": forecast_horizon} + parameters.update(kwargs) + super().__init__( + parameters=parameters, + component_obj=None, + random_seed=random_seed, + ) + + def fit(self, X, y=None): + """Fits multiseries time series baseline regressor to data. + + Args: + X (pd.DataFrame): The input training data of shape [n_samples, n_features * n_series]. + y (pd.DataFrame): The target training data of shape [n_samples, n_series]. + + Returns: + self + + Raises: + ValueError: If input y is None or if y is not a DataFrame with multiple columns. 
+ """ + if y is None: + raise ValueError( + "Cannot train Multiseries Time Series Baseline Regressor if y is None", + ) + if isinstance(y, pd.Series): + raise ValueError( + "y must be a DataFrame with multiple columns for Multiseries Time Series Baseline Regressor", + ) + self._target_column_names = list(y.columns) + self._num_features = X.shape[1] + + return self + + def predict(self, X): + """Make predictions using fitted multiseries time series baseline regressor. + + Args: + X (pd.DataFrame): Data of shape [n_samples, n_features]. + + Returns: + pd.DataFrame: Predicted values. + + Raises: + ValueError: If the lagged columns are not present in X. + """ + X = infer_feature_types(X) + feature_names = [ + TimeSeriesFeaturizer.df_colname_prefix.format(col, self.start_delay) + for col in self._target_column_names + ] + if not set(feature_names).issubset(set(X.columns)): + raise ValueError( + "Multiseries Time Series Baseline Regressor is meant to be used in a pipeline with " + "a Time Series Featurizer", + ) + return X.ww[feature_names] + + @property + def feature_importance(self): + """Returns importance associated with each feature. + + Since baseline estimators do not use input features to calculate predictions, returns an array of zeroes. + + Returns: + np.ndarray (float): An array of zeroes. 
+ """ + importance = np.array([0] * self._num_features) + return importance diff --git a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py index c966dc3162..f812471090 100644 --- a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py @@ -61,6 +61,8 @@ class TimeSeriesFeaturizer(Transformer): needs_fitting = True target_colname_prefix = "target_delay_{}" """target_delay_{}""" + df_colname_prefix = "{}_delay_{}" + """{}_delay_{}""" def __init__( self, @@ -124,12 +126,17 @@ def fit(self, X, y=None): """ if self.time_index is None: raise ValueError("time_index cannot be None!") - self.statistically_significant_lags = self._find_significant_lags( - y, - conf_level=self.conf_level, - start_delay=self.start_delay, - max_delay=self.max_delay, - ) + + # For the multiseries case, where we only want the start delay lag for the baseline + if isinstance(y, pd.DataFrame): + self.statistically_significant_lags = [self.start_delay] + else: + self.statistically_significant_lags = self._find_significant_lags( + y, + conf_level=self.conf_level, + start_delay=self.start_delay, + max_delay=self.max_delay, + ) return self @staticmethod @@ -215,6 +222,24 @@ def _compute_rolling_transforms(self, X, y, original_features): ) return data + def _delay_df( + self, + data, + cols_to_delay, + categorical_columns=None, + X_categorical=None, + ): + lagged_features = {} + for col_name in cols_to_delay: + col = data[col_name] + if categorical_columns and col_name in categorical_columns: + col = X_categorical[col_name] + for t in self.statistically_significant_lags: + lagged_features[self.df_colname_prefix.format(col_name, t)] = col.shift( + t, + ) + return lagged_features + def _compute_delays(self, X_ww, y): """Computes the delayed features for numeric/categorical features 
in X and y. @@ -234,33 +259,28 @@ def _compute_delays(self, X_ww, y): ).columns, ) categorical_columns = self._get_categorical_columns(X_ww) - cols_derived_from_categoricals = [] lagged_features = {} if self.delay_features and len(X_ww) > 0: X_categorical = self._encode_X_while_preserving_index( X_ww[categorical_columns], ) - for col_name in cols_to_delay: - col = X_ww[col_name] - if col_name in categorical_columns: - col = X_categorical[col_name] - for t in self.statistically_significant_lags: - feature_name = f"{col_name}_delay_{t}" - lagged_features[f"{col_name}_delay_{t}"] = col.shift(t) - if col_name in categorical_columns: - cols_derived_from_categoricals.append(feature_name) + lagged_features.update( + self._delay_df(X_ww, cols_to_delay, categorical_columns, X_categorical), + ) # Handle cases where the target was passed in if self.delay_target and y is not None: - if type(y.ww.logical_type) == logical_types.Categorical: - y = self._encode_y_while_preserving_index(y) - for t in self.statistically_significant_lags: - lagged_features[self.target_colname_prefix.format(t)] = y.shift(t) + if isinstance(y, pd.DataFrame): + lagged_features.update(self._delay_df(y, y.columns)) + else: + if type(y.ww.logical_type) == logical_types.Categorical: + y = self._encode_y_while_preserving_index(y) + for t in self.statistically_significant_lags: + lagged_features[self.target_colname_prefix.format(t)] = y.shift(t) # Features created from categorical columns should no longer be categorical - lagged_features = pd.DataFrame(lagged_features) + lagged_features = pd.DataFrame(lagged_features, index=X_ww.index) lagged_features.ww.init( logical_types={col: "Double" for col in lagged_features.columns}, ) - lagged_features.index = X_ww.index return ww.concat_columns([X_ww, lagged_features]) def transform(self, X, y=None): diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index c95a3bbc51..750d9e4b5c 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py 
@@ -1357,7 +1357,6 @@ def unstack_multiseries( series_id, time_index, target_name, - keep_time_in_index=True, ): """Converts multiseries data with one series_id column and one target column to one target column per series id. @@ -1367,8 +1366,6 @@ def unstack_multiseries( series_id (str): The column which identifies which series each row belongs to. time_index (str): Specifies the name of the column in X that provides the datetime objects. target_name (str): The name of the target column. - keep_time_in_index (bool): Whether to maintain the time index as the index of the returned dataframes. Defaults to True. - If set to false, will discard the time index information entirely. Returns: pd.DataFrame, pd.DataFrame: The unstacked X and y data. @@ -1401,10 +1398,9 @@ def unstack_multiseries( X_unstacked = pd.concat(X_unstacked_cols, axis=1) y_unstacked = pd.concat(y_unstacked_cols, axis=1) - # Reset the axis if need be - if not keep_time_in_index: - X_unstacked.reset_index(drop=True, inplace=True) - y_unstacked.reset_index(drop=True, inplace=True) + # Reset the axes now that they've been unstacked, keep time info in X + X_unstacked = X_unstacked.reset_index() + y_unstacked = y_unstacked.reset_index(drop=True) return X_unstacked, y_unstacked diff --git a/evalml/tests/component_tests/test_components.py b/evalml/tests/component_tests/test_components.py index 3245a57185..918a237d18 100644 --- a/evalml/tests/component_tests/test_components.py +++ b/evalml/tests/component_tests/test_components.py @@ -40,6 +40,7 @@ LinearDiscriminantAnalysis, LinearRegressor, LogisticRegressionClassifier, + MultiseriesTimeSeriesBaselineRegressor, NaturalLanguageFeaturizer, OneHotEncoder, Oversampler, @@ -1015,9 +1016,9 @@ def test_components_can_be_used_for_partial_dependence_fast_mode(): # Expected number is hardcoded so that this test will fail when new components are added # It should be len(all_native_components) - num_invalid_for_pd_fast_mode if ProphetRegressor not in 
all_native_components: - expected_num_valid_for_pd_fast_mode = 63 - else: expected_num_valid_for_pd_fast_mode = 64 + else: + expected_num_valid_for_pd_fast_mode = 65 assert num_valid_for_pd_fast_mode == expected_num_valid_for_pd_fast_mode @@ -1210,6 +1211,7 @@ def test_all_estimators_check_fit( StackedEnsembleClassifier, StackedEnsembleRegressor, TimeSeriesBaselineEstimator, + MultiseriesTimeSeriesBaselineRegressor, VowpalWabbitBinaryClassifier, VowpalWabbitMulticlassClassifier, VowpalWabbitRegressor, @@ -1367,6 +1369,9 @@ def test_serialization( else: X, y = X_y_binary + if component_class.is_multiseries: + y = pd.DataFrame({"target_a": y, "target_b": y}) + component.fit(X, y) for pickle_protocol in range(cloudpickle.DEFAULT_PROTOCOL + 1): @@ -1740,6 +1745,9 @@ def test_estimator_fit_respects_custom_indices( X = pd.DataFrame(X) y = pd.Series(y) + if estimator_class.is_multiseries: + y = pd.DataFrame({"target_a": y, "target_b": y}) + if use_custom_index and ts_problem: X.index = pd.date_range("2020-10-01", periods=40) y.index = pd.date_range("2020-10-01", periods=40) @@ -1915,7 +1923,10 @@ def test_components_support_nullable_types( component is added that has nullable type incompatibilities, this should fail.""" cannot_handle_boolean_target = [CatBoostRegressor] - if component_class == TimeSeriesBaselineEstimator: + if ( + component_class == TimeSeriesBaselineEstimator + or component_class == MultiseriesTimeSeriesBaselineRegressor + ): pytest.skip( "Time Series Baseline Estimator can only be used within a Pipeline.", ) diff --git a/evalml/tests/component_tests/test_multiseries_baseline_regressor.py b/evalml/tests/component_tests/test_multiseries_baseline_regressor.py new file mode 100644 index 0000000000..0b63eca318 --- /dev/null +++ b/evalml/tests/component_tests/test_multiseries_baseline_regressor.py @@ -0,0 +1,62 @@ +import pandas as pd +import pytest + +from evalml.model_family import ModelFamily +from evalml.pipelines.components import ( + 
MultiseriesTimeSeriesBaselineRegressor, + TimeSeriesFeaturizer, +) + + +def test_multiseries_time_series_baseline_regressor_init(): + baseline = MultiseriesTimeSeriesBaselineRegressor() + assert baseline.model_family == ModelFamily.BASELINE + assert baseline.is_multiseries + assert baseline.start_delay == 2 + + baseline = MultiseriesTimeSeriesBaselineRegressor(gap=2, forecast_horizon=5) + assert baseline.start_delay == 7 + + +def test_multiseries_time_series_baseline_gap_negative(): + with pytest.raises(ValueError, match="gap value must be a positive integer."): + MultiseriesTimeSeriesBaselineRegressor(gap=-1) + + +def test_multiseries_time_series_baseline_estimator_invalid_y( + multiseries_ts_data_unstacked, +): + X, _ = multiseries_ts_data_unstacked + + estimator = MultiseriesTimeSeriesBaselineRegressor(gap=0, forecast_horizon=2) + + with pytest.raises(ValueError, match="if y is None"): + estimator.fit(X, None) + with pytest.raises(ValueError, match="y must be a DataFrame"): + estimator.fit(X, pd.Series(range(100))) + + +def test_multiseries_baseline_no_featurizer(multiseries_ts_data_unstacked): + X, y = multiseries_ts_data_unstacked + + estimator = MultiseriesTimeSeriesBaselineRegressor(gap=0, forecast_horizon=2) + estimator.fit(X, y) + + with pytest.raises(ValueError, match="is meant to be used in a pipeline with "): + estimator.predict(X) + + +def test_multiseries_time_series_baseline_lags(multiseries_ts_data_unstacked): + X, y = multiseries_ts_data_unstacked + + feat = TimeSeriesFeaturizer(time_index="date", gap=0, forecast_horizon=2) + feat.fit(X, y) + X_t = feat.transform(X, y) + + estimator = MultiseriesTimeSeriesBaselineRegressor(gap=0, forecast_horizon=2) + estimator.fit(X_t, y) + + pred = estimator.predict(X_t) + expected = y.shift(2) + expected.columns = [f"{col}_delay_2" for col in expected.columns] + pd.testing.assert_frame_equal(pred, expected) diff --git a/evalml/tests/component_tests/test_time_series_featurizer.py 
b/evalml/tests/component_tests/test_time_series_featurizer.py index db206495aa..0458d8cfd0 100644 --- a/evalml/tests/component_tests/test_time_series_featurizer.py +++ b/evalml/tests/component_tests/test_time_series_featurizer.py @@ -981,3 +981,17 @@ def test_delay_feature_transformer_works_for_non_numeric_ordinal_categories(ts_d output.fit(X, y) X_t = output.transform(X, y) assert set(X_t["cats_delay_1"].value_counts().to_dict().keys()) == {2.0, 0.0, 1.0} + + +def test_featurizer_y_dataframe(multiseries_ts_data_unstacked): + X, y = multiseries_ts_data_unstacked + + featurizer = TimeSeriesFeaturizer(time_index="date", gap=1, forecast_horizon=5) + featurizer.fit(X, y) + + assert featurizer.statistically_significant_lags == [6] + + expected_y_cols = [f"target_{i}_delay_6" for i in range(y.shape[1])] + X_t = featurizer.transform(X, y) + for expected_y_col in expected_y_cols: + assert expected_y_col in X_t.columns diff --git a/evalml/tests/component_tests/test_utils.py b/evalml/tests/component_tests/test_utils.py index 4d59202d3a..3e4d0f6c56 100644 --- a/evalml/tests/component_tests/test_utils.py +++ b/evalml/tests/component_tests/test_utils.py @@ -75,6 +75,7 @@ "Target Imputer", "Natural Language Featurizer", "Time Series Baseline Estimator", + "Multiseries Time Series Baseline Regressor", "Time Series Imputer", "Time Series Regularizer", "URL Featurizer", diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 2d508bd53d..1882f7c05e 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -1034,11 +1034,7 @@ def multiseries_ts_data_unstacked(): y = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)}) - X.index = pd.date_range(start="1/1/2018", periods=20) - X.index.name = "date" - y.index = pd.date_range(start="1/1/2018", periods=20) - y.index.name = "date" - + X["date"] = pd.date_range(start="1/1/2018", periods=20) return X, y diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py 
b/evalml/tests/pipeline_tests/test_pipeline_utils.py index d1b74d5283..2064dcc835 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -1379,10 +1379,8 @@ def test_make_pipeline_features_and_dfs(X_y_binary): @pytest.mark.parametrize("target_name", ["target", "Target_Data"]) -@pytest.mark.parametrize("keep_time_in_index", [True, False]) def test_unstack_multiseries( target_name, - keep_time_in_index, multiseries_ts_data_stacked, multiseries_ts_data_unstacked, ): @@ -1392,9 +1390,6 @@ def test_unstack_multiseries( y_unstacked.columns = [ f"{target_name}_{i}" for i in range(len(y_unstacked.columns)) ] - if not keep_time_in_index: - X_unstacked.reset_index(drop=True, inplace=True) - y_unstacked.reset_index(drop=True, inplace=True) X_unstacked_transformed, y_unstacked_transformed = unstack_multiseries( X, @@ -1402,7 +1397,6 @@ def test_unstack_multiseries( "series_id", "date", target_name=target_name, - keep_time_in_index=keep_time_in_index, ) pd.testing.assert_frame_equal( X_unstacked.sort_index(axis=1), @@ -1418,11 +1412,9 @@ def test_unstack_multiseries( @pytest.mark.parametrize("include_series_id", [True, False]) @pytest.mark.parametrize("series_id_name", [None, "SERIES"]) -@pytest.mark.parametrize("index_type", ["datetime", "int"]) def test_stack_data( include_series_id, series_id_name, - index_type, multiseries_ts_data_stacked, multiseries_ts_data_unstacked, ): @@ -1430,13 +1422,6 @@ def test_stack_data( _, y_stacked = multiseries_ts_data_stacked y_stacked.name = "target" - - if index_type == "datetime": - y_stacked.index = pd.date_range(start="1/1/2018", periods=20).repeat(5) - y_stacked.index.name = "date" - else: - y = y.reset_index(drop=True) - y_stacked_transformed = stack_data( y, include_series_id=include_series_id, diff --git a/evalml/utils/gen_utils.py b/evalml/utils/gen_utils.py index c0aabf7424..af253fa021 100644 --- a/evalml/utils/gen_utils.py +++ b/evalml/utils/gen_utils.py @@ 
-209,6 +209,7 @@ def _get_subclasses(base_class): "BaselineClassifier", "BaselineRegressor", "TimeSeriesBaselineEstimator", + "MultiseriesTimeSeriesBaselineRegressor", "StackedEnsembleClassifier", "StackedEnsembleRegressor", "KNeighborsClassifier",