Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Swap is_multiseries logic to problem type #4278

Merged
merged 4 commits into from
Aug 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 3 additions & 15 deletions evalml/automl/automl_algorithm/default_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
_make_pipeline_from_multiple_graphs,
make_pipeline,
)
from evalml.problem_types import is_regression, is_time_series
from evalml.problem_types import is_multiseries, is_regression, is_time_series
from evalml.utils import infer_feature_types
from evalml.utils.logger import get_logger

Expand Down Expand Up @@ -81,7 +81,6 @@ class DefaultAlgorithm(AutoMLAlgorithm):
model families. Run evalml.pipelines.components.utils.allowed_model_families("binary") to see options. Change `binary`
to `multiclass` or `regression` depending on the problem type.
excluded_model_families (list[ModelFamily]): A list of model families to exclude from the estimators used when building pipelines. For default algorithm, this only excludes estimators in the non-naive batches.
is_multiseries (bool): Whether or not the problem is a multiseries time series problem. Defaults to False.
"""

def __init__(
Expand All @@ -106,7 +105,6 @@ def __init__(
run_feature_selection=True,
verbose=False,
exclude_featurizers=None,
is_multiseries=False,
):
super().__init__(
allowed_pipelines=[],
Expand Down Expand Up @@ -140,7 +138,6 @@ def __init__(
self.run_feature_selection = run_feature_selection
self.ensembling = ensembling
self.exclude_featurizers = exclude_featurizers or []
self.is_multiseries = is_multiseries

if allowed_model_families is not None and excluded_model_families is not None:
raise ValueError(
Expand Down Expand Up @@ -173,7 +170,7 @@ def default_max_batches(self):
"""Returns the number of max batches AutoMLSearch should run by default."""
if self.ensembling:
return 3
elif self.is_multiseries:
elif is_multiseries(self.problem_type):
return 1
else:
return 2
Expand Down Expand Up @@ -222,7 +219,6 @@ def _non_naive_estimators(self):
self.problem_type,
model_families=self.allowed_model_families,
excluded_model_families=self.excluded_model_families,
is_multiseries=self.is_multiseries,
)
if est not in self._naive_estimators()
]
Expand Down Expand Up @@ -271,7 +267,6 @@ def _create_naive_pipelines(self, use_features=False):
),
features=self.features,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
for estimator in estimators
]
Expand Down Expand Up @@ -300,7 +295,6 @@ def _add_without_pipelines(self, pipelines, estimators, feature_selector=[]):
features=self.features,
exclude_featurizers=self.exclude_featurizers,
include_decomposer=False,
is_multiseries=self.is_multiseries,
)
for estimator in estimators
]
Expand Down Expand Up @@ -440,7 +434,6 @@ def _make_pipelines_helper(self, estimators):
),
features=self.features,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
for estimator in estimators
]
Expand Down Expand Up @@ -484,7 +477,7 @@ def next_batch(self):
# Skip the naive batch for multiseries time series
batch = (
self._batch_number
if not self.is_multiseries
if not is_multiseries(self.problem_type)
else self._batch_number + 1
)
if batch == 0:
Expand Down Expand Up @@ -679,7 +672,6 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
extra_components_before=[SelectColumns],
use_estimator=False,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)

numeric_pipeline = make_pipeline(
Expand All @@ -693,7 +685,6 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
extra_components_after=[SelectColumns],
use_estimator=False,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
pre_pipeline_components = (
{"DFS Transformer": ["DFS Transformer", "X", "y"]}
Expand Down Expand Up @@ -745,7 +736,6 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
extra_components_before=[SelectColumns],
features=self.features,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
return categorical_pipeline
elif self.run_feature_selection:
Expand All @@ -762,7 +752,6 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
extra_components_after=[SelectColumns],
features=self.features,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
return numeric_pipeline

Expand All @@ -774,6 +763,5 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
self.problem_type,
sampler_name=self.sampler_name,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
return pipeline
6 changes: 0 additions & 6 deletions evalml/automl/automl_algorithm/iterative_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ class IterativeAlgorithm(AutoMLAlgorithm):
verbose (boolean): Whether or not to display logging information regarding pipeline building. Defaults to False.
exclude_featurizers (list[str]): A list of featurizer components to exclude from the pipelines built by IterativeAlgorithm.
Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", and "TimeSeriesFeaturizer".
is_multiseries (bool): Whether or not the problem is a multiseries time series problem. Defaults to False.
"""

def __init__(
Expand All @@ -96,7 +95,6 @@ def __init__(
features=None,
verbose=False,
exclude_featurizers=None,
is_multiseries=False,
):
self.X = infer_feature_types(X)
self.y = infer_feature_types(y)
Expand Down Expand Up @@ -131,7 +129,6 @@ def __init__(
self.features = features
self._set_additional_pipeline_params()
self.exclude_featurizers = exclude_featurizers
self.is_multiseries = is_multiseries

super().__init__(
allowed_pipelines=self.allowed_pipelines,
Expand Down Expand Up @@ -159,7 +156,6 @@ def _create_pipelines(self):
self.problem_type,
model_families=self.allowed_model_families,
excluded_model_families=self.excluded_model_families,
is_multiseries=self.is_multiseries,
)
allowed_estimators = self._filter_estimators(
allowed_estimators,
Expand Down Expand Up @@ -188,7 +184,6 @@ def _create_pipelines(self):
).get("known_in_advance", None),
features=self.features,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
for estimator in allowed_estimators
]
Expand All @@ -212,7 +207,6 @@ def _create_pipelines(self):
features=self.features,
exclude_featurizers=self.exclude_featurizers,
include_decomposer=False,
is_multiseries=self.is_multiseries,
)
for estimator in allowed_estimators
]
Expand Down
17 changes: 9 additions & 8 deletions evalml/automl/automl_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
handle_problem_types,
is_binary,
is_classification,
is_multiseries,
is_time_series,
)
from evalml.tuners import SKOptTuner
Expand Down Expand Up @@ -625,10 +626,6 @@
self.problem_configuration = self._validate_problem_configuration(
problem_configuration,
)
self.is_multiseries = (
is_time_series(self.problem_type)
and self.problem_configuration.get("series_id") is not None
)
self._train_best_pipeline = train_best_pipeline
self._best_pipeline = None
self._searched = False
Expand Down Expand Up @@ -657,7 +654,7 @@
)

# For multiseries problems, we need to make sure that the data is primarily ordered by the time_index rather than the series_id
if self.is_multiseries:
if is_multiseries(self.problem_type):
time_index = self.problem_configuration.get("time_index")
series_id = self.problem_configuration.get("series_id")
X_train = X_train.sort_values([time_index, series_id])
Expand Down Expand Up @@ -946,7 +943,6 @@
features=features,
verbose=self.verbose,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
elif automl_algorithm == "default":
self.automl_algorithm = DefaultAlgorithm(
Expand All @@ -967,7 +963,6 @@
verbose=self.verbose,
n_jobs=self.n_jobs,
exclude_featurizers=self.exclude_featurizers,
is_multiseries=self.is_multiseries,
)
else:
raise ValueError("Please specify a valid automl algorithm.")
Expand Down Expand Up @@ -1068,6 +1063,13 @@
is_valid, msg = contains_all_ts_parameters(problem_configuration)
if not is_valid:
raise ValueError(msg)
if (
is_multiseries(self.problem_type)
and "series_id" not in problem_configuration
):
raise ValueError(

Check warning on line 1070 in evalml/automl/automl_search.py

View check run for this annotation

Codecov / codecov/patch

evalml/automl/automl_search.py#L1070

Added line #L1070 was not covered by tests
"Must provide 'series_id' column in problem_configuration for multiseries time series problems.",
)
return problem_configuration or {}

def _handle_keyboard_interrupt(self):
Expand Down Expand Up @@ -1380,7 +1382,6 @@
forecast_horizon,
time_index,
exclude_timeseries_featurizer,
self.is_multiseries,
series_id,
)
return baseline
Expand Down
1 change: 1 addition & 0 deletions evalml/automl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def get_default_primary_search_objective(problem_type):
"time series regression": "MedianAE",
"time series binary": "Log Loss Binary",
"time series multiclass": "Log Loss Multiclass",
"multiseries time series regression": "MedianAE",
}[problem_type.value]
return get_objective(objective_name, return_instance=True)

Expand Down
8 changes: 6 additions & 2 deletions evalml/objectives/regression_objective.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,9 @@
class RegressionObjective(ObjectiveBase):
"""Base class for all regression objectives."""

problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]
"""[ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]"""
problem_types = [
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]
"""[ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]"""
1 change: 0 additions & 1 deletion evalml/pipelines/components/component_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ class ComponentBase(ABC, metaclass=ComponentBaseMeta):
# Referring to the pandas nullable dtypes; not just woodwork logical types
_integer_nullable_incompatibilities = []
_boolean_nullable_incompatibilities = []
is_multiseries = False

def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs):
"""Base class for all components.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,11 @@ class MultiseriesTimeSeriesBaselineRegressor(Estimator):
"""{}"""
model_family = ModelFamily.BASELINE
"""ModelFamily.BASELINE"""
is_multiseries = True
supported_problem_types = [
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]
"""[
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]"""

def __init__(self, gap=1, forecast_horizon=1, random_seed=0, **kwargs):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,9 @@ class VARMAXRegressor(Estimator):
"trend": Categorical(['n', 'c', 't', 'ct']),
}"""
model_family = ModelFamily.VARMAX
is_multiseries = True
"""ModelFamily.VARMAX"""
supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION]
"""[ProblemTypes.TIME_SERIES_REGRESSION]"""
supported_problem_types = [ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]
"""[ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]"""

def __init__(
self,
Expand Down
16 changes: 1 addition & 15 deletions evalml/pipelines/components/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from evalml.pipelines.components.component_base import ComponentBase
from evalml.pipelines.components.estimators.estimator import Estimator
from evalml.pipelines.components.transformers.transformer import Transformer
from evalml.problem_types import ProblemTypes, handle_problem_types, is_time_series
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.utils import get_importable_subclasses


Expand Down Expand Up @@ -56,18 +56,10 @@ def allowed_model_families(problem_type):
return list(set([e.model_family for e in estimators]))


def _filter_multiseries_estimators(estimators, is_multiseries):
if is_multiseries:
return [estimator for estimator in estimators if estimator.is_multiseries]
else:
return [estimator for estimator in estimators if not estimator.is_multiseries]


def get_estimators(
problem_type,
model_families=None,
excluded_model_families=None,
is_multiseries=False,
):
"""Returns the estimators allowed for a particular problem type.

Expand All @@ -77,7 +69,6 @@ def get_estimators(
problem_type (ProblemTypes or str): Problem type to filter for.
model_families (list[ModelFamily] or list[str]): Model families to filter for.
excluded_model_families (list[ModelFamily]): A list of model families to exclude from the results.
is_multiseries (bool): Whether to return only estimators that support multiseries data.

Returns:
list[class]: A list of estimator subclasses.
Expand Down Expand Up @@ -124,11 +115,6 @@ def get_estimators(
if estimator_class.model_family not in model_families:
continue
estimator_classes.append(estimator_class)
if is_time_series(problem_type):
estimator_classes = _filter_multiseries_estimators(
estimator_classes,
is_multiseries,
)
return estimator_classes


Expand Down
4 changes: 2 additions & 2 deletions evalml/pipelines/multiseries_regression_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ class MultiseriesRegressionPipeline(TimeSeriesRegressionPipeline):

"""

problem_type = ProblemTypes.TIME_SERIES_REGRESSION
problem_type = ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION

"""ProblemTypes.TIME_SERIES_REGRESSION"""
"""ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION"""

def __init__(
self,
Expand Down
Loading
Loading