diff --git a/ax/benchmark/benchmark.py b/ax/benchmark/benchmark.py index 14dca5c998e..0503c009e0b 100644 --- a/ax/benchmark/benchmark.py +++ b/ax/benchmark/benchmark.py @@ -29,20 +29,11 @@ from ax.benchmark.benchmark_method import BenchmarkMethod from ax.benchmark.benchmark_problem import BenchmarkProblem from ax.benchmark.benchmark_result import AggregatedBenchmarkResult, BenchmarkResult -from ax.benchmark.metrics.base import BenchmarkMetricBase, GroundTruthMetricMixin from ax.core.experiment import Experiment -from ax.core.metric import Metric -from ax.core.objective import MultiObjective, Objective -from ax.core.optimization_config import ( - MultiObjectiveOptimizationConfig, - OptimizationConfig, -) -from ax.core.outcome_constraint import ObjectiveThreshold, OutcomeConstraint from ax.core.utils import get_model_times from ax.service.scheduler import Scheduler from ax.utils.common.logger import get_logger from ax.utils.common.random import with_rng_seed -from ax.utils.common.typeutils import checked_cast, not_none logger: Logger = get_logger(__name__) @@ -88,25 +79,10 @@ def _create_benchmark_experiment( Returns: The Experiment object to be used for benchmarking. """ - tracking_metrics = problem.tracking_metrics - if not problem.is_noiseless and problem.has_ground_truth: - # Make the ground truth counterparts for each metric defined on the problem, - # which will be added as tracking metrics on the Experiment object below. - # In the analysis, a modified OptimziationConfig referencing those metrics - # will be passed to the `Scheduler.get_trace()` method, which allows to extract - # the optimziation trace based on the ground truth outcomes (without noise). - # If the problem is known to be noiseless, this is unneccesary and we can just - # use the observations made during the optimization loop directly. - gt_metric_dict = make_ground_truth_metrics(problem=problem) - tracking_metrics = tracking_metrics + list(gt_metric_dict.values()) return Experiment( name=f"{problem.name}|{method_name}_{int(time())}", search_space=problem.search_space, optimization_config=problem.optimization_config, - tracking_metrics=tracking_metrics, # pyre-ignore [6]: Incompatible - # parameter type: In call `Experiment.__init__`, for argument - # `tracking_metrics`, expected `Optional[List[Metric]]` but got - # `Union[List[Union[BenchmarkMetricBase, Metric]], List[BenchmarkMetricBase]]`. runner=problem.runner, ) @@ -124,7 +100,12 @@ def benchmark_replication( seed: The seed to use for this replication. """ - experiment = _create_benchmark_experiment(problem=problem, method_name=method.name) + experiment = Experiment( + name=f"{problem.name}|{method.name}_{int(time())}", + search_space=problem.search_space, + optimization_config=problem.optimization_config, + runner=problem.runner, + ) scheduler = Scheduler( experiment=experiment, @@ -135,24 +116,7 @@ def benchmark_replication( with with_rng_seed(seed=seed): scheduler.run_n_trials(max_trials=problem.num_trials) - if not problem.is_noiseless and problem.has_ground_truth: - # We modify the optimization config so we can use `Scheduler.get_trace()` - # to use the true (not corrupted by noise) observations that were logged - # as tracking metrics on the Experiment object. If the problem is known to - # be noiseless, this is unnecssary and we can just use the observations - # made during the optimization loop directly. 
- analysis_opt_config = make_ground_truth_optimization_config( - experiment=experiment - ) - else: - analysis_opt_config = experiment.optimization_config - - optimization_trace = np.asarray( - scheduler.get_trace(optimization_config=analysis_opt_config) - ) - - new_optimization_trace = problem.get_opt_trace(experiment=experiment) - np.testing.assert_allclose(optimization_trace, new_optimization_trace) + optimization_trace = problem.get_opt_trace(experiment=experiment) try: # Catch any errors that may occur during score computation, such as errors @@ -217,125 +181,3 @@ def benchmark_multiple_problems_methods( benchmark_one_method_problem(problem=p, method=m, seeds=seeds) for p, m in product(problems, methods) ] - - -def make_ground_truth_metrics( - problem: BenchmarkProblem, - include_tracking_metrics: bool = True, -) -> dict[str, Metric]: - """Makes a ground truth version for each metric defined on the problem. - - Args: - problem: The BenchmarkProblem to test against (can be synthetic or real). - include_tracking_metrics: Whether or not to include tracking metrics. - - Returns: - A dict mapping (original) metric names to their respective ground truth metric. - """ - if not problem.has_ground_truth: - raise ValueError( - "Cannot create ground truth metrics for problems that " - "do not have a ground truth." - ) - metrics: list[BenchmarkMetricBase] = [ - checked_cast(BenchmarkMetricBase, metric) - for metric in problem.optimization_config.metrics.values() - ] - if include_tracking_metrics: - metrics = metrics + problem.tracking_metrics - return {metric.name: metric.make_ground_truth_metric() for metric in metrics} - - -def make_ground_truth_optimization_config( - experiment: Experiment, -) -> OptimizationConfig: - """Makes a clone of the OptimizationConfig on the experiment in which each metric - is replaced by its respective "ground truth" counterpart, which has been added to - the experiment's tracking metrics in `_create_benchmark_experiment` and which - returns the ground truth (i.e., uncorrupted by noise) observations. - """ - optimization_config = not_none(experiment.optimization_config) - - if optimization_config.risk_measure is not None: - raise NotImplementedError("Support for risk measures is not yet implemented.") - - # dict for caching metric lookup - gt_metric_dict: dict[str, BenchmarkMetricBase] = {} - - def get_gt_metric(metric: Metric) -> BenchmarkMetricBase: - """Look up corresponding ground truth metric of the experiment. Will error - out if no corresponding ground truth metric exists.""" - if not isinstance(metric, BenchmarkMetricBase): - raise ValueError( - "Only BenchmarkMetricBase metrics are supported for ground truth " - f"metrics. Got {type(metric)}." - ) - - if metric.name in gt_metric_dict: - return gt_metric_dict[metric.name] - - for tracking_metric in experiment.tracking_metrics: - if getattr(tracking_metric, "is_ground_truth", False): - # TODO: Figure out if there is a better way to match the ground truth - # metric and the original metric. 
- ground_truth_name = tracking_metric.name - orig_name = checked_cast( - GroundTruthMetricMixin, tracking_metric - ).get_original_name(ground_truth_name) - if orig_name == metric.name: - tracking_metric = checked_cast(BenchmarkMetricBase, tracking_metric) - gt_metric_dict[metric.name] = tracking_metric - return tracking_metric - raise ValueError(f"Ground truth metric for metric {metric.name} not found!") - - # convert outcome constraints - if optimization_config.outcome_constraints is not None: - gt_outcome_constraints = [ - OutcomeConstraint( - metric=get_gt_metric(oc.metric), - op=oc.op, - bound=oc.bound, - relative=oc.relative, - ) - for oc in optimization_config.outcome_constraints - ] - else: - gt_outcome_constraints = None - - # we need to distinguish MOO and non-MOO problems - if not optimization_config.is_moo_problem: - gt_objective = Objective( - metric=get_gt_metric(optimization_config.objective.metric) - ) - - return OptimizationConfig( - objective=gt_objective, outcome_constraints=gt_outcome_constraints - ) - - gt_objective = MultiObjective( - metrics=[ - get_gt_metric(metric) for metric in optimization_config.objective.metrics - ] - ) - # there may be objective thresholds to also convert - objective_thresholds = checked_cast( - MultiObjectiveOptimizationConfig, optimization_config - ).objective_thresholds - if objective_thresholds is not None: - gt_objective_thresholds = [ - ObjectiveThreshold( - metric=get_gt_metric(ot.metric), - bound=ot.bound, - relative=ot.relative, - op=ot.op, - ) - for ot in objective_thresholds - ] - else: - gt_objective_thresholds = None - - return MultiObjectiveOptimizationConfig( - objective=gt_objective, - outcome_constraints=gt_outcome_constraints, - objective_thresholds=gt_objective_thresholds, - ) diff --git a/ax/benchmark/benchmark_method.py b/ax/benchmark/benchmark_method.py index 8bfee47415f..2b71dcadc22 100644 --- a/ax/benchmark/benchmark_method.py +++ b/ax/benchmark/benchmark_method.py @@ -7,13 +7,11 @@ import logging from dataclasses import dataclass -from typing import Any -from ax.modelbridge.generation_strategy import GenerationStep, GenerationStrategy +from ax.modelbridge.generation_strategy import GenerationStrategy from ax.service.utils.scheduler_options import SchedulerOptions, TrialType from ax.utils.common.base import Base from ax.utils.common.logger import get_logger -from ax.utils.common.typeutils import not_none logger: logging.Logger = get_logger("BenchmarkMethod") @@ -28,9 +26,6 @@ class BenchmarkMethod(Base): Note: If `BenchmarkMethod.scheduler_options.total_trials` is less than `BenchmarkProblem.num_trials` then only the number of trials specified in the former will be run. - - Note: The `generation_strategy` passed in is assumed to be in its "base state", - as it will be cloned and reset. """ name: str @@ -38,42 +33,6 @@ class BenchmarkMethod(Base): scheduler_options: SchedulerOptions distribute_replications: bool = False - def __post_init__(self) -> None: - # We (I think?) in general don't want to fit tracking metrics during our - # benchmarks. Further, not setting `fit_tracking_metrics=False`causes - # issues with the ground truth metrics created automatically when running - # the benchmark - in fact, things will error out deep inside the modeling - # stack since the model gets both noisy (benchmark) and noiseless (ground - # truth) observations. While support for this is something we shold add - # for models, in the context of benchmarking we actually want to avoid - # fitting the ground truth metrics at all. 
- - # Clone the GS so as to not modify the original one in-place below. - # Note that this assumes that the GS passed in is in its base state. - gs_cloned = self.generation_strategy.clone_reset() - - for node in gs_cloned._nodes: - if isinstance(node, GenerationStep): - if node.model_kwargs is None: - node.model_kwargs = {} - if node.model_kwargs.get("fit_tracking_metrics", True): - logger.info( - "Setting `fit_tracking_metrics` in a GenerationStep to False.", - ) - not_none(node.model_kwargs)["fit_tracking_metrics"] = False - for model_spec in node.model_specs: - if model_spec.model_kwargs is None: - model_spec.model_kwargs = {} - elif model_spec.model_kwargs.get("fit_tracking_metrics", True): - logger.info( - "Setting `fit_tracking_metrics` in a GenerationNode's " - "model_spec to False." - ) - not_none(model_spec.model_kwargs)["fit_tracking_metrics"] = False - - # hack around not being able to update frozen attribute of a dataclass - _assign_frozen_attr(self, name="generation_strategy", value=gs_cloned) - def get_benchmark_scheduler_options( timeout_hours: int = 4, @@ -103,10 +62,3 @@ def get_benchmark_scheduler_options( trial_type=TrialType.TRIAL if batch_size == 1 else TrialType.BATCH_TRIAL, batch_size=batch_size, ) - - -def _assign_frozen_attr(obj: Any, name: str, value: Any) -> None: # pyre-ignore [2] - """Assign a new value to an attribute of a frozen dataclass. - This is an ugly hack and shouldn't be used broadly. - """ - object.__setattr__(obj, name, value) diff --git a/ax/benchmark/benchmark_problem.py b/ax/benchmark/benchmark_problem.py index 77f5dabee69..e71e6b71f39 100644 --- a/ax/benchmark/benchmark_problem.py +++ b/ax/benchmark/benchmark_problem.py @@ -11,8 +11,6 @@ import numpy as np import pandas as pd -from ax.benchmark.metrics.base import BenchmarkMetricBase - from ax.benchmark.metrics.benchmark import BenchmarkMetric from ax.benchmark.runners.base import BenchmarkRunner from ax.benchmark.runners.botorch_test import BotorchTestProblemRunner @@ -72,12 +70,6 @@ class BenchmarkProblem(Base): observe_noise_stds: If boolean, whether the standard deviation of the observation noise is observed for all metrics. If a dictionary, whether noise levels are observed on a per-metric basis. - has_ground_truth: Whether the Runner produces underlying ground truth - values, which are not observed in real noisy problems but may be - known in benchmarks. - tracking_metrics: Tracking metrics are not optimized, and for the - purpose of benchmarking, they will not be fit. The ground truth may - be provided as `tracking_metrics`. optimal_value: The best ground-truth objective value. Hypervolume for multi-objective problems. 
If the best value is not known, it is conventional to set it to a value that is almost certainly better @@ -91,13 +83,10 @@ class BenchmarkProblem(Base): optimization_config: OptimizationConfig num_trials: int observe_noise_stds: Union[bool, dict[str, bool]] = False - has_ground_truth: bool = True - tracking_metrics: list[BenchmarkMetricBase] = field(default_factory=list) optimal_value: float search_space: SearchSpace = field(repr=False) runner: BenchmarkRunner = field(repr=False) - is_noiseless: bool def get_oracle_experiment(self, experiment: Experiment) -> Experiment: records = [] @@ -263,8 +252,6 @@ def create_single_objective_problem_from_botorch( ), num_trials=num_trials, observe_noise_stds=observe_noise_sd, - is_noiseless=test_problem.noise_std in (None, 0.0), - has_ground_truth=True, # all synthetic problems have ground truth optimal_value=optimal_value, ) @@ -356,8 +343,6 @@ def create_multi_objective_problem_from_botorch( optimization_config=optimization_config, runner=runner, num_trials=num_trials, - is_noiseless=test_problem.noise_std in (None, 0.0), observe_noise_stds=observe_noise_sd, - has_ground_truth=True, optimal_value=test_problem.max_hv, ) diff --git a/ax/benchmark/metrics/base.py b/ax/benchmark/metrics/base.py deleted file mode 100644 index 59bf5951e56..00000000000 --- a/ax/benchmark/metrics/base.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -""" -Module containing the metric base classes for benchmarks. The key property of -a benchmark metric is whether it has a ground truth or not, which is indicated -by a `has_ground_truth` attribute of `BenchmarkMetricBase`. All mnetrics used -in Ax bechmarks need to be subclassed from `BenchmarkMetricBase`. - -For metrics that do have a ground truth, we can compute the performance of the -optimization directly in terms of the ground truth observations (or the ground -truth of the out-of-sample model-suggested best point). For metrics that do not -have a ground truth, this is not possible. - -The benchmarks are designed in a way so that (unless the metric is noiseless) -no ground truth observations are available to the optimziation algorithm. -Instead, we use separate "ground truth metrics" attached as tracking metrics -to the experiment that are used to evaluate the performance after the -optimization is complete. `GroundTruthMetricMixin` can be used to construct -such ground truth metrics (with the `is_ground_truth` property indicating -that the metric provides the ground truth) and implements naming conventions -and helpers for associating a the ground truth metric to the respective metric -used during the optimization. -""" - -from __future__ import annotations - -from abc import ABC, abstractmethod - -from ax.core.metric import Metric - - -class BenchmarkMetricBase(Metric, ABC): - """A generic metric used for Ax Benchmarks. - - Attributes: - has_ground_truth: Whether or not there exists a ground truth for this - metric, i.e. whether each observation has an associated ground - truth value. This is trivially true for deterministic metrics, and - is also true for metrics where synthetic observation noise is added - to its (deterministic) values. This is not true for metrics that - are inherently noisy. 
- """ - - has_ground_truth: bool - - @abstractmethod - def make_ground_truth_metric(self) -> BenchmarkMetricBase: - """Create a ground truth version of this metric. If metric observations - are noisy, the ground truth would be the underlying noiseless values.""" - - -class GroundTruthMetricMixin(ABC): - """A mixin for metrics that defines a naming convention and associated helper - methods that allow mapping from a ground truth metric to its original metric - and vice versa.""" - - is_ground_truth: bool = True - _GROUND_TRUTH_SUFFIX = "__GROUND_TRUTH" - - @classmethod - def get_ground_truth_name(cls, metric: Metric) -> str: - return f"{metric.name}{cls._GROUND_TRUTH_SUFFIX}" - - @classmethod - def get_original_name(cls, full_name: str) -> str: - if not full_name.endswith(cls._GROUND_TRUTH_SUFFIX): - raise ValueError("full_name does not end with ground truth suffix.") - return full_name.replace(cls._GROUND_TRUTH_SUFFIX, "") diff --git a/ax/benchmark/metrics/benchmark.py b/ax/benchmark/metrics/benchmark.py index b759ae5935b..5e854ff29e7 100644 --- a/ax/benchmark/metrics/benchmark.py +++ b/ax/benchmark/metrics/benchmark.py @@ -9,29 +9,17 @@ from typing import Any, Optional -from ax.benchmark.metrics.base import BenchmarkMetricBase, GroundTruthMetricMixin from ax.benchmark.metrics.utils import _fetch_trial_data from ax.core.base_trial import BaseTrial -from ax.core.metric import MetricFetchResult +from ax.core.metric import Metric, MetricFetchResult -class BenchmarkMetric(BenchmarkMetricBase): +class BenchmarkMetric(Metric): """A generic metric used for observed values produced by Ax Benchmarks. - Compatible e.g. with results generated by `BotorchTestProblemRunner` and - `SurrogateRunner`. - - Attributes: - has_ground_truth: Whether or not there exists a ground truth for this - metric, i.e. whether each observation has an associated ground - truth value. This is trivially true for deterministic metrics, and - is also true for metrics where synthetic observation noise is added - to its (deterministic) values. This is not true for metrics that - are inherently noisy. + Compatible with results generated by `BenchmarkRunner`. """ - has_ground_truth: bool = True - def __init__( self, name: str, @@ -70,43 +58,4 @@ def fetch_trial_data(self, trial: BaseTrial, **kwargs: Any) -> MetricFetchResult metric_name=self.name, outcome_index=self.outcome_index, include_noise_sd=self.observe_noise_sd, - ground_truth=False, - ) - - def make_ground_truth_metric(self) -> BenchmarkMetricBase: - """Create a ground truth version of this metric.""" - return GroundTruthBenchmarkMetric(original_metric=self) - - -class GroundTruthBenchmarkMetric(BenchmarkMetric, GroundTruthMetricMixin): - def __init__(self, original_metric: BenchmarkMetric) -> None: - """ - Args: - original_metric: The original BenchmarkMetric to which this metric - corresponds. - """ - super().__init__( - name=self.get_ground_truth_name(original_metric), - lower_is_better=original_metric.lower_is_better, - observe_noise_sd=False, - outcome_index=original_metric.outcome_index, - ) - self.original_metric = original_metric - - def fetch_trial_data(self, trial: BaseTrial, **kwargs: Any) -> MetricFetchResult: - if len(kwargs) > 0: - raise NotImplementedError( - f"Arguments {set(kwargs)} are not supported in " - f"{self.__class__.__name__}.fetch_trial_data." 
- ) - return _fetch_trial_data( - trial=trial, - metric_name=self.name, - outcome_index=self.outcome_index, - include_noise_sd=False, - ground_truth=True, ) - - def make_ground_truth_metric(self) -> BenchmarkMetricBase: - """Create a ground truth version of this metric.""" - return self diff --git a/ax/benchmark/metrics/utils.py b/ax/benchmark/metrics/utils.py index 67aa4c3edf6..0b55df0aca0 100644 --- a/ax/benchmark/metrics/utils.py +++ b/ax/benchmark/metrics/utils.py @@ -8,11 +8,9 @@ from typing import Optional import pandas as pd -from ax.benchmark.metrics.base import GroundTruthMetricMixin from ax.core.base_trial import BaseTrial from ax.core.data import Data from ax.core.metric import MetricFetchE, MetricFetchResult -from ax.exceptions.core import UnsupportedError from ax.utils.common.result import Err, Ok @@ -21,7 +19,6 @@ def _fetch_trial_data( metric_name: str, outcome_index: Optional[int] = None, include_noise_sd: bool = True, - ground_truth: bool = False, ) -> MetricFetchResult: """ Args: @@ -30,26 +27,17 @@ def _fetch_trial_data( this is used to retrieve the index (of the outcomes) from the `outcome_names` dict in a trial's `run_metadata`. If `metric_index` is specified, this is simply the name of the metric. - outcome_index: The index (in the last dimension) of the `Ys`, `Ys_true`, and + outcome_index: The index (in the last dimension) of the `Ys` and `Ystds` lists of outcomes stored by the respective runner in the trial's `run_metadata`. If omitted, `run_metadata` must contain a `outcome_names` list of names in the same order as the outcomes that will be used to determine the index. include_noise_sd: Whether to include noise standard deviation in the returned - data. Must be `False` if `ground_truth` is set to `True`. - ground_truth: If True, return the ground truth values instead of the actual - (noisy) observations. In this case, the noise standard deviations will - be reported as zero. + data. Returns: A MetricFetchResult containing the data for the requested metric. """ - if include_noise_sd and ground_truth: - raise UnsupportedError( - "Cannot include noise standard deviation when extracting ground truth " - "data. Will be set to zero for ground truth observations." - ) - if outcome_index is None: # Look up the index based on the outcome name under which we track the data # as part of `run_metadata`. @@ -59,15 +47,11 @@ def _fetch_trial_data( "Trials' `run_metadata` must contain `outcome_names` if " "no `outcome_index` is provided." ) - outcome_index = outcome_names.index( - GroundTruthMetricMixin.get_original_name(metric_name) - if ground_truth - else metric_name - ) + outcome_index = outcome_names.index(metric_name) try: arm_names = list(trial.arms_by_name.keys()) - all_Ys = trial.run_metadata["Ys_true" if ground_truth else "Ys"] + all_Ys = trial.run_metadata["Ys"] Ys = [all_Ys[arm_name][outcome_index] for arm_name in arm_names] if include_noise_sd: @@ -75,11 +59,6 @@ def _fetch_trial_data( trial.run_metadata["Ystds"][arm_name][outcome_index] for arm_name in arm_names ] - elif ground_truth: - # Ground truth observations are noiseless (note that at least currently - # this information is not being used as we only use the ground truth - # observations for analysis but not for modeling). 
- stdvs = [0.0] * len(Ys) else: stdvs = [float("nan")] * len(Ys) diff --git a/ax/benchmark/problems/hpo/torchvision.py b/ax/benchmark/problems/hpo/torchvision.py index 993376acfeb..bcf8cd5248d 100644 --- a/ax/benchmark/problems/hpo/torchvision.py +++ b/ax/benchmark/problems/hpo/torchvision.py @@ -224,8 +224,6 @@ def get_pytorch_cnn_torchvision_benchmark_problem( optimization_config=optimization_config, num_trials=num_trials, observe_noise_stds=False, - is_noiseless=True, - has_ground_truth=True, optimal_value=base_problem.optimal_value, runner=runner, ) diff --git a/ax/benchmark/problems/synthetic/discretized/mixed_integer.py b/ax/benchmark/problems/synthetic/discretized/mixed_integer.py index 15b966fe3e8..769b7d2b698 100644 --- a/ax/benchmark/problems/synthetic/discretized/mixed_integer.py +++ b/ax/benchmark/problems/synthetic/discretized/mixed_integer.py @@ -118,9 +118,7 @@ def _get_problem_from_common_inputs( runner=runner, num_trials=num_trials, optimal_value=optimal_value, - is_noiseless=True, observe_noise_stds=observe_noise_sd, - has_ground_truth=True, ) diff --git a/ax/benchmark/problems/synthetic/hss/jenatton.py b/ax/benchmark/problems/synthetic/hss/jenatton.py index f277ceddd38..16cbb93d89b 100644 --- a/ax/benchmark/problems/synthetic/hss/jenatton.py +++ b/ax/benchmark/problems/synthetic/hss/jenatton.py @@ -130,8 +130,6 @@ def get_jenatton_benchmark_problem( outcome_names=[name], ), num_trials=num_trials, - is_noiseless=noise_std == 0.0, observe_noise_stds=observe_noise_sd, - has_ground_truth=True, optimal_value=Jenatton.optimal_value, ) diff --git a/ax/benchmark/runners/base.py b/ax/benchmark/runners/base.py index b4fb19eabf4..af9d0784628 100644 --- a/ax/benchmark/runners/base.py +++ b/ax/benchmark/runners/base.py @@ -89,14 +89,9 @@ def run(self, trial: BaseTrial) -> dict[str, Any]: noise standard deviations (possibly nan if the noise level is unobserved), where the order of the outcomes is the same as in `outcome_names`. - - Ys_true: A dict mapping arm names to lists of corresponding ground - truth outcomes, where the order of the outcomes is the same as - in `outcome_names`. If the benchmark problem does not provide a - ground truth, this key will not be present in the dict returned - by this function. - "outcome_names": A list of metric names. """ - Ys, Ys_true, Ystds = {}, {}, {} + Ys, Ystds = {}, {} noise_stds = self.get_noise_stds() if noise_stds is not None: @@ -126,7 +121,6 @@ def run(self, trial: BaseTrial) -> dict[str, Any]: for arm in trial.arms: # Case where we do have a ground truth Y_true = self.get_Y_true(arm) - Ys_true[arm.name] = Y_true.tolist() if noise_stds is None: # No noise, so just return the true outcome. 
Ystds[arm.name] = [0.0] * len(Y_true) @@ -144,7 +138,6 @@ def run(self, trial: BaseTrial) -> dict[str, Any]: "Ys": Ys, "Ystds": Ystds, "outcome_names": self.outcome_names, - "Ys_true": Ys_true, } return run_metadata diff --git a/ax/benchmark/tests/metrics/test_benchmark_metric.py b/ax/benchmark/tests/metrics/test_benchmark_metric.py index c98c538b22f..715b19b68df 100644 --- a/ax/benchmark/tests/metrics/test_benchmark_metric.py +++ b/ax/benchmark/tests/metrics/test_benchmark_metric.py @@ -5,7 +5,7 @@ # pyre-strict -from ax.benchmark.metrics.benchmark import BenchmarkMetric, GroundTruthBenchmarkMetric +from ax.benchmark.metrics.benchmark import BenchmarkMetric from ax.core.arm import Arm from ax.core.batch_trial import BatchTrial from ax.core.trial import Trial @@ -110,38 +110,3 @@ def test_fetch_trial_data_batch_trial(self) -> None: "trial_index": {0: 0, 1: 0}, }, ) - - def test_make_ground_truth_metric(self) -> None: - metric = BenchmarkMetric(name="test_metric1", lower_is_better=True) - gt_metric = metric.make_ground_truth_metric() - self.assertIsInstance(gt_metric, GroundTruthBenchmarkMetric) - self.assertEqual(gt_metric.name, "test_metric1__GROUND_TRUTH") - self.assertEqual(gt_metric.lower_is_better, metric.lower_is_better) - self.assertFalse(gt_metric.observe_noise_sd) # pyre-ignore [16] - self.assertEqual( - gt_metric.outcome_index, metric.outcome_index # pyre-ignore [16] - ) - self.assertIs(gt_metric.original_metric, metric) # pyre-ignore [16] - - trial = get_test_trial() - - with self.assertRaisesRegex( - NotImplementedError, - "Arguments {'foo'} are not supported in GroundTruthBenchmarkMetric", - ): - gt_metric.fetch_trial_data(trial, foo="bar") - - df = gt_metric.fetch_trial_data(trial=trial).value.df # pyre-ignore [16] - self.assertEqual(len(df), 1) - self.assertDictEqual( - df.iloc[0].to_dict(), - { - "arm_name": "0_0", - "metric_name": "test_metric1__GROUND_TRUTH", - "mean": 1.1, - "sem": 0.0, - "trial_index": 0, - }, - ) - - self.assertIs(gt_metric.make_ground_truth_metric(), gt_metric) diff --git a/ax/benchmark/tests/problems/hpo/test_torchvision.py b/ax/benchmark/tests/problems/hpo/test_torchvision.py index bed29b8da0c..5f019c24843 100644 --- a/ax/benchmark/tests/problems/hpo/test_torchvision.py +++ b/ax/benchmark/tests/problems/hpo/test_torchvision.py @@ -54,9 +54,7 @@ def test_problem_properties(self) -> None: ) self.assertFalse(problem.optimization_config.objective.minimize) self.assertEqual(problem.num_trials, num_trials) - self.assertTrue(problem.is_noiseless) self.assertFalse(problem.observe_noise_stds) - self.assertTrue(problem.has_ground_truth) def test_deterministic(self) -> None: problem_name = choice(["MNIST", "FashionMNIST"]) @@ -77,7 +75,6 @@ def test_deterministic(self) -> None: { "Ys": {"0": [expected]}, "Ystds": {"0": [0.0]}, - "Ys_true": {"0": [expected]}, "outcome_names": ["accuracy"], }, ) diff --git a/ax/benchmark/tests/problems/synthetic/hss/test_jenatton.py b/ax/benchmark/tests/problems/synthetic/hss/test_jenatton.py index 5f8ee63db30..172e80a64c3 100644 --- a/ax/benchmark/tests/problems/synthetic/hss/test_jenatton.py +++ b/ax/benchmark/tests/problems/synthetic/hss/test_jenatton.py @@ -8,7 +8,7 @@ import math from random import random -from ax.benchmark.metrics.benchmark import BenchmarkMetric, GroundTruthBenchmarkMetric +from ax.benchmark.metrics.benchmark import BenchmarkMetric from ax.benchmark.problems.synthetic.hss.jenatton import ( get_jenatton_benchmark_problem, @@ -114,7 +114,6 @@ def test_create_problem(self) -> None: ).test_problem.noise_std, 
0.0, ) - self.assertTrue(problem.is_noiseless) self.assertFalse(assert_is_instance(metric, BenchmarkMetric).observe_noise_sd) problem = get_jenatton_benchmark_problem( @@ -129,7 +128,6 @@ def test_create_problem(self) -> None: ).test_problem.noise_std, 0.1, ) - self.assertFalse(problem.is_noiseless) self.assertTrue(assert_is_instance(metric, BenchmarkMetric).observe_noise_sd) def test_fetch_trial_data(self) -> None: @@ -151,7 +149,6 @@ def test_fetch_trial_data(self) -> None: "Ys": {"0_0": [4.25]}, "Ystds": {"0_0": [0.0]}, "outcome_names": ["Jenatton"], - "Ys_true": {"0_0": [4.25]}, } self.assertEqual(metadata, expected_metadata) @@ -186,36 +183,3 @@ def test_fetch_trial_data(self) -> None: self.assertNotEqual(res_dict["mean"], 4.25) self.assertAlmostEqual(res_dict["sem"], 0.1) self.assertEqual(res_dict["trial_index"], 0) - - def test_make_ground_truth_metric(self) -> None: - problem = get_jenatton_benchmark_problem() - - arm = Arm(parameters={"x1": 0, "x2": 1, "x5": 2.0, "r8": 0.05}, name="0_0") - - experiment = Experiment( - search_space=problem.search_space, - name="Jenatton", - optimization_config=problem.optimization_config, - ) - - trial = Trial(experiment=experiment) - trial.add_arm(arm) - problem.runner.run(trial=trial) - metadata = problem.runner.run(trial=trial) - trial.update_run_metadata(metadata) - - metric = assert_is_instance( - problem.optimization_config.objective.metric, BenchmarkMetric - ) - gt_metric = metric.make_ground_truth_metric() - self.assertIsInstance(gt_metric, GroundTruthBenchmarkMetric) - runner = assert_is_instance(problem.runner, ParamBasedTestProblemRunner) - self.assertEqual(runner.test_problem.noise_std, 0.0) - self.assertFalse( - assert_is_instance(gt_metric, BenchmarkMetric).observe_noise_sd - ) - - self.assertIsInstance(metric, BenchmarkMetric) - self.assertNotIsInstance(metric, GroundTruthBenchmarkMetric) - self.assertEqual(runner.test_problem.noise_std, 0.0) - self.assertFalse(metric.observe_noise_sd) diff --git a/ax/benchmark/tests/problems/test_surrogate_problems.py b/ax/benchmark/tests/problems/test_surrogate_problems.py index 7295d617f5f..e901d88b87c 100644 --- a/ax/benchmark/tests/problems/test_surrogate_problems.py +++ b/ax/benchmark/tests/problems/test_surrogate_problems.py @@ -36,8 +36,8 @@ def test_repr(self) -> None: '"branin", ' "minimize=True), " "outcome_constraints=[]), num_trials=6, " - "observe_noise_stds=True, has_ground_truth=True, " - "tracking_metrics=[], optimal_value=0.0, is_noiseless=True)" + "observe_noise_stds=True, " + "optimal_value=0.0)" ) self.assertEqual(repr(sbp), expected_repr) diff --git a/ax/benchmark/tests/runners/test_botorch_test_problem.py b/ax/benchmark/tests/runners/test_botorch_test_problem.py index f6787e812fa..57cdb60e3a4 100644 --- a/ax/benchmark/tests/runners/test_botorch_test_problem.py +++ b/ax/benchmark/tests/runners/test_botorch_test_problem.py @@ -159,11 +159,8 @@ def test_synthetic_runner(self) -> None: trial.arm = arm trial.index = 0 res = runner.run(trial=trial) - self.assertSetEqual( - set(res.keys()), {"Ys", "Ys_true", "Ystds", "outcome_names"} - ) - self.assertSetEqual(set(res["Ys"].keys()), {"0_0"}) - self.assertEqual(res["Ys_true"]["0_0"], Y.tolist()) + self.assertEqual({"Ys", "Ystds", "outcome_names"}, res.keys()) + self.assertEqual({"0_0"}, res["Ys"].keys()) if noise_std is not None: self.assertEqual(res["Ystds"]["0_0"], [noise_std] * len(Y)) else: @@ -227,9 +224,7 @@ def test_botorch_test_problem_runner_heterogeneous_noise(self) -> None: trial.arm = arm trial.index = 0 res = 
runner.run(trial=trial) - self.assertSetEqual( - set(res.keys()), {"Ys", "Ys_true", "Ystds", "outcome_names"} - ) + self.assertSetEqual(set(res.keys()), {"Ys", "Ystds", "outcome_names"}) self.assertSetEqual(set(res["Ys"].keys()), {"0_0"}) self.assertEqual(res["Ystds"]["0_0"], [0.1, 0.05]) self.assertEqual(res["outcome_names"], ["objective", "constraint"]) diff --git a/ax/benchmark/tests/test_benchmark.py b/ax/benchmark/tests/test_benchmark.py index fb95644c786..8fd7f987204 100644 --- a/ax/benchmark/tests/test_benchmark.py +++ b/ax/benchmark/tests/test_benchmark.py @@ -14,8 +14,6 @@ benchmark_multiple_problems_methods, benchmark_one_method_problem, benchmark_replication, - make_ground_truth_metrics, - make_ground_truth_optimization_config, ) from ax.benchmark.benchmark_method import ( BenchmarkMethod, @@ -24,10 +22,7 @@ from ax.benchmark.benchmark_problem import create_single_objective_problem_from_botorch from ax.benchmark.benchmark_result import BenchmarkResult from ax.benchmark.methods.modular_botorch import get_sobol_botorch_modular_acquisition -from ax.benchmark.metrics.base import GroundTruthMetricMixin -from ax.benchmark.metrics.benchmark import BenchmarkMetric, GroundTruthBenchmarkMetric from ax.benchmark.problems.registry import get_problem -from ax.core.optimization_config import MultiObjectiveOptimizationConfig from ax.modelbridge.generation_strategy import GenerationNode, GenerationStrategy from ax.modelbridge.model_spec import ModelSpec from ax.modelbridge.registry import Models @@ -35,7 +30,7 @@ from ax.storage.json_store.load import load_experiment from ax.storage.json_store.save import save_experiment from ax.utils.common.testutils import TestCase -from ax.utils.common.typeutils import checked_cast, not_none +from ax.utils.common.typeutils import not_none from ax.utils.testing.benchmark_stubs import ( get_moo_surrogate, get_multi_objective_benchmark_problem, @@ -113,79 +108,6 @@ def test_storage(self) -> None: experiment = load_experiment(f.name) self.assertEqual(experiment, experiment) - def test_make_ground_truth_metrics(self) -> None: - problem = get_single_objective_benchmark_problem(observe_noise_sd=False) - metric = problem.optimization_config.objective.metric - - # basic setup - gt_metrics = make_ground_truth_metrics(problem=problem) - self.assertEqual(len(gt_metrics), 1) - gt_metric = checked_cast(GroundTruthBenchmarkMetric, gt_metrics[metric.name]) - self.assertIs(gt_metric.original_metric, metric) - - # add a tracking metric - tracking_metric = BenchmarkMetric(name="test_track", lower_is_better=True) - problem.tracking_metrics = [tracking_metric] - gt_metrics = make_ground_truth_metrics(problem=problem) - self.assertEqual(len(gt_metrics), 2) - gt_tracking_metric = checked_cast( - GroundTruthBenchmarkMetric, gt_metrics["test_track"] - ) - self.assertIs(gt_tracking_metric.original_metric, tracking_metric) - - # set include_tracking_metrics=False - gt_metrics = make_ground_truth_metrics( - problem=problem, include_tracking_metrics=False - ) - self.assertEqual(len(gt_metrics), 1) - - # error out if the problem does not have ground truth - problem.has_ground_truth = False - with self.assertRaisesRegex(ValueError, "do not have a ground truth"): - make_ground_truth_metrics(problem=problem) - - def test_make_ground_truth_optimization_config(self) -> None: - problem = get_single_objective_benchmark_problem(observe_noise_sd=False) - metric = problem.optimization_config.objective.metric - experiment = _create_benchmark_experiment( - problem=problem, method_name="test_method" 
- ) - - # A vanilla experiment w/o ground truth metrics attached should error - with self.assertRaisesRegex( - ValueError, f"Ground truth metric for metric {metric.name} not found!" - ): - make_ground_truth_optimization_config(experiment) - - # Add the ground truth metric and check basic behavior - gt_metric = make_ground_truth_metrics(problem)[metric.name] - experiment.add_tracking_metric(gt_metric) - gt_opt_cfg = make_ground_truth_optimization_config(experiment) - self.assertIs(gt_opt_cfg.objective.metric, gt_metric) - - # Test behavior with MOO problem - problem = get_multi_objective_benchmark_problem(observe_noise_sd=False) - self.assertIsInstance( - problem.optimization_config, MultiObjectiveOptimizationConfig - ) - experiment = _create_benchmark_experiment( - problem=problem, method_name="test_method" - ) - gt_metrics = make_ground_truth_metrics(problem) - for metric in problem.optimization_config.objective.metrics: - experiment.add_tracking_metric(gt_metrics[metric.name]) - gt_opt_cfg = make_ground_truth_optimization_config(experiment) - - for metric in gt_opt_cfg.objective.metrics: - gt_name = metric.name - metric = checked_cast(GroundTruthMetricMixin, metric) - self.assertIs(metric, gt_metrics[metric.get_original_name(gt_name)]) - - for metric in gt_opt_cfg.outcome_constraints: - gt_name = metric.metric.name - metric = checked_cast(GroundTruthMetricMixin, metric.metric) - self.assertIs(metric, gt_metrics[metric.get_original_name(gt_name)]) - def test_benchmark_result_invalid_inputs(self) -> None: """ Test that a BenchmarkResult cannot be specified with both an `experiment` @@ -244,14 +166,6 @@ def test_create_benchmark_experiment(self) -> None: self.assertEqual( experiment.optimization_config, problem.optimization_config ) - self.assertEqual(len(experiment.tracking_metrics), 1) - gt_metric = checked_cast( - GroundTruthBenchmarkMetric, experiment.tracking_metrics[0] - ) - self.assertIs( - gt_metric.original_metric, - problem.optimization_config.objective.metric, - ) self.assertEqual(experiment.runner, problem.runner) with self.subTest("noisy, observed noise std"): @@ -267,14 +181,6 @@ def test_create_benchmark_experiment(self) -> None: self.assertEqual( experiment.optimization_config, problem.optimization_config ) - self.assertEqual(len(experiment.tracking_metrics), 1) - gt_metric = checked_cast( - GroundTruthBenchmarkMetric, experiment.tracking_metrics[0] - ) - self.assertIs( - gt_metric.original_metric, - problem.optimization_config.objective.metric, - ) self.assertEqual(experiment.runner, problem.runner) def test_replication_sobol_synthetic(self) -> None: diff --git a/ax/benchmark/tests/test_benchmark_problem.py b/ax/benchmark/tests/test_benchmark_problem.py index 155d2eee8f0..0f794f9b583 100644 --- a/ax/benchmark/tests/test_benchmark_problem.py +++ b/ax/benchmark/tests/test_benchmark_problem.py @@ -91,10 +91,7 @@ def test_single_objective_from_botorch(self) -> None: "minimize=True), outcome_constraints=[]), " "num_trials=1, " "observe_noise_stds=False, " - "has_ground_truth=True, " - "tracking_metrics=[], " - "optimal_value=0.0, " - "is_noiseless=True)" + "optimal_value=0.0)" ) else: outcome_constraint = ( @@ -112,10 +109,7 @@ def test_single_objective_from_botorch(self) -> None: " >= 0.0)]), " "num_trials=1, " "observe_noise_stds=False, " - "has_ground_truth=True, " - "tracking_metrics=[], " - "optimal_value=-3.32237, " - "is_noiseless=True)" + "optimal_value=-3.32237)" ) self.assertEqual(repr(test_problem), expected_repr) diff --git a/ax/storage/json_store/registry.py 
b/ax/storage/json_store/registry.py index 48ef289edf7..e0f3123904e 100644 --- a/ax/storage/json_store/registry.py +++ b/ax/storage/json_store/registry.py @@ -16,7 +16,7 @@ MultiObjectiveBenchmarkProblem, ) from ax.benchmark.benchmark_result import AggregatedBenchmarkResult, BenchmarkResult -from ax.benchmark.metrics.benchmark import BenchmarkMetric, GroundTruthBenchmarkMetric +from ax.benchmark.metrics.benchmark import BenchmarkMetric from ax.benchmark.problems.hpo.torchvision import PyTorchCNNTorchvisionParamBasedProblem from ax.benchmark.runners.botorch_test import ( BotorchTestProblemRunner, @@ -201,7 +201,6 @@ GenerationNode: generation_node_to_dict, GenerationStrategy: generation_strategy_to_dict, GeneratorRun: generator_run_to_dict, - GroundTruthBenchmarkMetric: metric_to_dict, Hartmann6Metric: metric_to_dict, ImprovementGlobalStoppingStrategy: improvement_global_stopping_strategy_to_dict, Interval: botorch_component_to_dict, @@ -318,8 +317,6 @@ "GenerationStep": GenerationStep, "GeneratorRun": GeneratorRun, "GeneratorRunStruct": GeneratorRunStruct, - "GroundTruthBenchmarkMetric": GroundTruthBenchmarkMetric, - "GroundTruthBotorchTestProblemMetric": GroundTruthBenchmarkMetric, # for BC "Hartmann6Metric": Hartmann6Metric, "HierarchicalSearchSpace": HierarchicalSearchSpace, "ImprovementGlobalStoppingStrategy": ImprovementGlobalStoppingStrategy, diff --git a/ax/utils/testing/benchmark_stubs.py b/ax/utils/testing/benchmark_stubs.py index fa681ea61a9..100d56a18d6 100644 --- a/ax/utils/testing/benchmark_stubs.py +++ b/ax/utils/testing/benchmark_stubs.py @@ -131,7 +131,6 @@ def get_soo_surrogate() -> SOOSurrogateBenchmarkProblem: observe_noise_stds=observe_noise_sd, optimal_value=0.0, runner=runner, - is_noiseless=runner.is_noiseless, ) @@ -180,7 +179,6 @@ def get_moo_surrogate() -> MOOSurrogateBenchmarkProblem: observe_noise_stds=True, optimal_value=1.0, runner=runner, - is_noiseless=runner.is_noiseless, )
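
Note: the following is a minimal, self-contained sketch (separate from the patch above) of how the simplified `run_metadata` layout written by `BenchmarkRunner.run` — the `Ys`, `Ystds`, and `outcome_names` entries — can be read back per arm and metric, mirroring the lookup performed in `_fetch_trial_data`. The helper name `lookup_outcome` and the literal example values are illustrative only and are not part of the Ax API.

import math


def lookup_outcome(
    run_metadata: dict,
    metric_name: str,
    arm_name: str,
    include_noise_sd: bool = True,
) -> tuple[float, float]:
    """Return (mean, sem) for one arm and metric from benchmark run metadata."""
    # Each arm's outcomes are ordered as in `outcome_names`.
    outcome_index = run_metadata["outcome_names"].index(metric_name)
    mean = run_metadata["Ys"][arm_name][outcome_index]
    # When the noise level is not observed, the standard error is reported as NaN.
    sem = (
        run_metadata["Ystds"][arm_name][outcome_index]
        if include_noise_sd
        else math.nan
    )
    return mean, sem


# Metadata in the shape asserted by the Jenatton runner test above.
metadata = {
    "Ys": {"0_0": [4.25]},
    "Ystds": {"0_0": [0.0]},
    "outcome_names": ["Jenatton"],
}
print(lookup_outcome(metadata, metric_name="Jenatton", arm_name="0_0"))  # (4.25, 0.0)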