Skip to content

Commit

Permalink
Swap is_multiseries logic to problem type (#4278)
Browse files Browse the repository at this point in the history
* Add multiseries time series regression as problem type

* Completely revamp to multiseries based on problem type
  • Loading branch information
eccabay authored Aug 17, 2023
1 parent 9f3cb79 commit 41f8e87
Show file tree
Hide file tree
Showing 28 changed files with 192 additions and 171 deletions.
18 changes: 3 additions & 15 deletions evalml/automl/automl_algorithm/default_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
_make_pipeline_from_multiple_graphs,
make_pipeline,
)
from evalml.problem_types import is_regression, is_time_series
from evalml.problem_types import is_multiseries, is_regression, is_time_series
from evalml.utils import infer_feature_types
from evalml.utils.logger import get_logger

Expand Down Expand Up @@ -81,7 +81,6 @@ class DefaultAlgorithm(AutoMLAlgorithm):
model families. Run evalml.pipelines.components.utils.allowed_model_families("binary") to see options. Change `binary`
to `multiclass` or `regression` depending on the problem type.
excluded_model_families (list[ModelFamily]): A list of model families to exclude from the estimators used when building pipelines. For default algorithm, this only excludes estimators in the non-naive batches.
is_multiseries (bool): Whether or not the problem is a multiseries time series problem. Defaults to False.
"""

def __init__(
Expand All @@ -106,7 +105,6 @@ def __init__(
run_feature_selection=True,
verbose=False,
exclude_featurizers=None,
is_multiseries=False,
):
super().__init__(
allowed_pipelines=[],
Expand Down Expand Up @@ -140,7 +138,6 @@ def __init__(
self.run_feature_selection = run_feature_selection
self.ensembling = ensembling
self.exclude_featurizers = exclude_featurizers or []
self.is_multiseries = is_multiseries

if allowed_model_families is not None and excluded_model_families is not None:
raise ValueError(
Expand Down Expand Up @@ -173,7 +170,7 @@ def default_max_batches(self):
"""Returns the number of max batches AutoMLSearch should run by default."""
if self.ensembling:
return 3
elif self.is_multiseries:
elif is_multiseries(self.problem_type):
return 1
else:
return 2
Expand Down Expand Up @@ -222,7 +219,6 @@ def _non_naive_estimators(self):
self.problem_type,
model_families=self.allowed_model_families,
excluded_model_families=self.excluded_model_families,
is_multiseries=self.is_multiseries,
)
if est not in self._naive_estimators()
]
Expand Down Expand Up @@ -271,7 +267,6 @@ def _create_naive_pipelines(self, use_features=False):
),
features=self.features,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
for estimator in estimators
]
Expand Down Expand Up @@ -300,7 +295,6 @@ def _add_without_pipelines(self, pipelines, estimators, feature_selector=[]):
features=self.features,
exclude_featurizers=self.exclude_featurizers,
include_decomposer=False,
is_multiseries=self.is_multiseries,
)
for estimator in estimators
]
Expand Down Expand Up @@ -440,7 +434,6 @@ def _make_pipelines_helper(self, estimators):
),
features=self.features,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
for estimator in estimators
]
Expand Down Expand Up @@ -484,7 +477,7 @@ def next_batch(self):
# Skip the naive batch for multiseries time series
batch = (
self._batch_number
if not self.is_multiseries
if not is_multiseries(self.problem_type)
else self._batch_number + 1
)
if batch == 0:
Expand Down Expand Up @@ -679,7 +672,6 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
extra_components_before=[SelectColumns],
use_estimator=False,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)

numeric_pipeline = make_pipeline(
Expand All @@ -693,7 +685,6 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
extra_components_after=[SelectColumns],
use_estimator=False,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
pre_pipeline_components = (
{"DFS Transformer": ["DFS Transformer", "X", "y"]}
Expand Down Expand Up @@ -745,7 +736,6 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
extra_components_before=[SelectColumns],
features=self.features,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
return categorical_pipeline
elif self.run_feature_selection:
Expand All @@ -762,7 +752,6 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
extra_components_after=[SelectColumns],
features=self.features,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
return numeric_pipeline

Expand All @@ -774,6 +763,5 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
self.problem_type,
sampler_name=self.sampler_name,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
return pipeline
6 changes: 0 additions & 6 deletions evalml/automl/automl_algorithm/iterative_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ class IterativeAlgorithm(AutoMLAlgorithm):
verbose (boolean): Whether or not to display logging information regarding pipeline building. Defaults to False.
exclude_featurizers (list[str]): A list of featurizer components to exclude from the pipelines built by IterativeAlgorithm.
Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer"
is_multiseries (bool): Whether or not the problem is a multiseries time series problem. Defaults to False.
"""

def __init__(
Expand All @@ -96,7 +95,6 @@ def __init__(
features=None,
verbose=False,
exclude_featurizers=None,
is_multiseries=False,
):
self.X = infer_feature_types(X)
self.y = infer_feature_types(y)
Expand Down Expand Up @@ -131,7 +129,6 @@ def __init__(
self.features = features
self._set_additional_pipeline_params()
self.exclude_featurizers = exclude_featurizers
self.is_multiseries = is_multiseries

super().__init__(
allowed_pipelines=self.allowed_pipelines,
Expand Down Expand Up @@ -159,7 +156,6 @@ def _create_pipelines(self):
self.problem_type,
model_families=self.allowed_model_families,
excluded_model_families=self.excluded_model_families,
is_multiseries=self.is_multiseries,
)
allowed_estimators = self._filter_estimators(
allowed_estimators,
Expand Down Expand Up @@ -188,7 +184,6 @@ def _create_pipelines(self):
).get("known_in_advance", None),
features=self.features,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
for estimator in allowed_estimators
]
Expand All @@ -212,7 +207,6 @@ def _create_pipelines(self):
features=self.features,
exclude_featurizers=self.exclude_featurizers,
include_decomposer=False,
is_multiseries=self.is_multiseries,
)
for estimator in allowed_estimators
]
Expand Down
17 changes: 9 additions & 8 deletions evalml/automl/automl_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
handle_problem_types,
is_binary,
is_classification,
is_multiseries,
is_time_series,
)
from evalml.tuners import SKOptTuner
Expand Down Expand Up @@ -625,10 +626,6 @@ def __init__(
self.problem_configuration = self._validate_problem_configuration(
problem_configuration,
)
self.is_multiseries = (
is_time_series(self.problem_type)
and self.problem_configuration.get("series_id") is not None
)
self._train_best_pipeline = train_best_pipeline
self._best_pipeline = None
self._searched = False
Expand Down Expand Up @@ -657,7 +654,7 @@ def __init__(
)

# For multiseries problems, we need to make sure that the data is primarily ordered by the time_index rather than the series_id
if self.is_multiseries:
if is_multiseries(self.problem_type):
time_index = self.problem_configuration.get("time_index")
series_id = self.problem_configuration.get("series_id")
X_train = X_train.sort_values([time_index, series_id])
Expand Down Expand Up @@ -946,7 +943,6 @@ def _is_imbalanced(X, y, problem_type):
features=features,
verbose=self.verbose,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
elif automl_algorithm == "default":
self.automl_algorithm = DefaultAlgorithm(
Expand All @@ -967,7 +963,6 @@ def _is_imbalanced(X, y, problem_type):
verbose=self.verbose,
n_jobs=self.n_jobs,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
else:
raise ValueError("Please specify a valid automl algorithm.")
Expand Down Expand Up @@ -1068,6 +1063,13 @@ def _validate_problem_configuration(self, problem_configuration=None):
is_valid, msg = contains_all_ts_parameters(problem_configuration)
if not is_valid:
raise ValueError(msg)
if (
is_multiseries(self.problem_type)
and "series_id" not in problem_configuration
):
raise ValueError(
"Must provide 'series_id' column in problem_configuration for multiseries time series problems.",
)
return problem_configuration or {}

def _handle_keyboard_interrupt(self):
Expand Down Expand Up @@ -1380,7 +1382,6 @@ def _get_baseline_pipeline(self):
forecast_horizon,
time_index,
exclude_timeseries_featurizer,
self.is_multiseries,
series_id,
)
return baseline
Expand Down
1 change: 1 addition & 0 deletions evalml/automl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def get_default_primary_search_objective(problem_type):
"time series regression": "MedianAE",
"time series binary": "Log Loss Binary",
"time series multiclass": "Log Loss Multiclass",
"multiseries time series regression": "MedianAE",
}[problem_type.value]
return get_objective(objective_name, return_instance=True)

Expand Down
8 changes: 6 additions & 2 deletions evalml/objectives/regression_objective.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,9 @@
class RegressionObjective(ObjectiveBase):
"""Base class for all regression objectives."""

problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]
"""[ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]"""
problem_types = [
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]
"""[ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]"""
1 change: 0 additions & 1 deletion evalml/pipelines/components/component_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ class ComponentBase(ABC, metaclass=ComponentBaseMeta):
# Referring to the pandas nullable dtypes; not just woodwork logical types
_integer_nullable_incompatibilities = []
_boolean_nullable_incompatibilities = []
is_multiseries = False

def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs):
"""Base class for all components.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,11 @@ class MultiseriesTimeSeriesBaselineRegressor(Estimator):
"""{}"""
model_family = ModelFamily.BASELINE
"""ModelFamily.BASELINE"""
is_multiseries = True
supported_problem_types = [
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]
"""[
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]"""

def __init__(self, gap=1, forecast_horizon=1, random_seed=0, **kwargs):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,9 @@ class VARMAXRegressor(Estimator):
"trend": Categorical(['n', 'c', 't', 'ct']),
}"""
model_family = ModelFamily.VARMAX
is_multiseries = True
"""ModelFamily.VARMAX"""
supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION]
"""[ProblemTypes.TIME_SERIES_REGRESSION]"""
supported_problem_types = [ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]
"""[ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]"""

def __init__(
self,
Expand Down
16 changes: 1 addition & 15 deletions evalml/pipelines/components/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from evalml.pipelines.components.component_base import ComponentBase
from evalml.pipelines.components.estimators.estimator import Estimator
from evalml.pipelines.components.transformers.transformer import Transformer
from evalml.problem_types import ProblemTypes, handle_problem_types, is_time_series
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.utils import get_importable_subclasses


Expand Down Expand Up @@ -56,18 +56,10 @@ def allowed_model_families(problem_type):
return list(set([e.model_family for e in estimators]))


def _filter_multiseries_estimators(estimators, is_multiseries):
if is_multiseries:
return [estimator for estimator in estimators if estimator.is_multiseries]
else:
return [estimator for estimator in estimators if not estimator.is_multiseries]


def get_estimators(
problem_type,
model_families=None,
excluded_model_families=None,
is_multiseries=False,
):
"""Returns the estimators allowed for a particular problem type.
Expand All @@ -77,7 +69,6 @@ def get_estimators(
problem_type (ProblemTypes or str): Problem type to filter for.
model_families (list[ModelFamily] or list[str]): Model families to filter for.
excluded_model_families (list[ModelFamily]): A list of model families to exclude from the results.
is_multiseries (bool): Whether to return only estimators that support multiseries data.
Returns:
list[class]: A list of estimator subclasses.
Expand Down Expand Up @@ -124,11 +115,6 @@ def get_estimators(
if estimator_class.model_family not in model_families:
continue
estimator_classes.append(estimator_class)
if is_time_series(problem_type):
estimator_classes = _filter_multiseries_estimators(
estimator_classes,
is_multiseries,
)
return estimator_classes


Expand Down
4 changes: 2 additions & 2 deletions evalml/pipelines/multiseries_regression_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ class MultiseriesRegressionPipeline(TimeSeriesRegressionPipeline):
"""

problem_type = ProblemTypes.TIME_SERIES_REGRESSION
problem_type = ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION

"""ProblemTypes.TIME_SERIES_REGRESSION"""
"""ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION"""

def __init__(
self,
Expand Down
Loading

0 comments on commit 41f8e87

Please sign in to comment.