Remove tracking metrics setup for noisy benchmarks (#2706)
Summary:
Pull Request resolved: #2706

No longer needed after the changes in the previous diff.

Differential Revision: D61415525

Reviewed By: Balandat
esantorella authored and facebook-github-bot committed Aug 24, 2024
1 parent 3b75095 commit 0099295
Showing 19 changed files with 27 additions and 590 deletions.
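At a high level, the change leaves `benchmark_replication` building a plain Experiment and reading the optimization trace from the problem itself, rather than attaching ground-truth tracking metrics and re-deriving the trace through a modified optimization config. The following is a minimal sketch of that simplified flow; it mirrors the lines added in ax/benchmark/benchmark.py further down, and the Scheduler arguments other than `experiment` (which the diff elides) are assumptions.

from time import time

from ax.core.experiment import Experiment
from ax.service.scheduler import Scheduler
from ax.utils.common.random import with_rng_seed


def replication_sketch(problem, method, seed: int):
    # Build the experiment directly from the problem; no ground-truth
    # tracking metrics are attached any more.
    experiment = Experiment(
        name=f"{problem.name}|{method.name}_{int(time())}",
        search_space=problem.search_space,
        optimization_config=problem.optimization_config,
        runner=problem.runner,
    )
    # The generation_strategy/options arguments are assumed; the diff below
    # only shows the `experiment` argument of the Scheduler call.
    scheduler = Scheduler(
        experiment=experiment,
        generation_strategy=method.generation_strategy.clone_reset(),
        options=method.scheduler_options,
    )
    with with_rng_seed(seed=seed):
        scheduler.run_n_trials(max_trials=problem.num_trials)
    # The trace now comes straight from the problem's oracle values.
    return problem.get_opt_trace(experiment=experiment)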
172 changes: 7 additions & 165 deletions ax/benchmark/benchmark.py
@@ -29,20 +29,11 @@
from ax.benchmark.benchmark_method import BenchmarkMethod
from ax.benchmark.benchmark_problem import BenchmarkProblem
from ax.benchmark.benchmark_result import AggregatedBenchmarkResult, BenchmarkResult
from ax.benchmark.metrics.base import BenchmarkMetricBase, GroundTruthMetricMixin
from ax.core.experiment import Experiment
from ax.core.metric import Metric
from ax.core.objective import MultiObjective, Objective
from ax.core.optimization_config import (
MultiObjectiveOptimizationConfig,
OptimizationConfig,
)
from ax.core.outcome_constraint import ObjectiveThreshold, OutcomeConstraint
from ax.core.utils import get_model_times
from ax.service.scheduler import Scheduler
from ax.utils.common.logger import get_logger
from ax.utils.common.random import with_rng_seed
from ax.utils.common.typeutils import checked_cast, not_none

logger: Logger = get_logger(__name__)

@@ -88,25 +79,10 @@ def _create_benchmark_experiment(
Returns:
The Experiment object to be used for benchmarking.
"""
tracking_metrics = problem.tracking_metrics
if not problem.is_noiseless and problem.has_ground_truth:
# Make the ground truth counterparts for each metric defined on the problem,
# which will be added as tracking metrics on the Experiment object below.
# In the analysis, a modified OptimizationConfig referencing those metrics
# will be passed to the `Scheduler.get_trace()` method, which allows extracting
# the optimization trace based on the ground truth outcomes (without noise).
# If the problem is known to be noiseless, this is unnecessary and we can just
# use the observations made during the optimization loop directly.
gt_metric_dict = make_ground_truth_metrics(problem=problem)
tracking_metrics = tracking_metrics + list(gt_metric_dict.values())
return Experiment(
name=f"{problem.name}|{method_name}_{int(time())}",
search_space=problem.search_space,
optimization_config=problem.optimization_config,
tracking_metrics=tracking_metrics, # pyre-ignore [6]: Incompatible
# parameter type: In call `Experiment.__init__`, for argument
# `tracking_metrics`, expected `Optional[List[Metric]]` but got
# `Union[List[Union[BenchmarkMetricBase, Metric]], List[BenchmarkMetricBase]]`.
runner=problem.runner,
)

@@ -124,7 +100,12 @@ def benchmark_replication(
seed: The seed to use for this replication.
"""

experiment = _create_benchmark_experiment(problem=problem, method_name=method.name)
experiment = Experiment(
name=f"{problem.name}|{method.name}_{int(time())}",
search_space=problem.search_space,
optimization_config=problem.optimization_config,
runner=problem.runner,
)

scheduler = Scheduler(
experiment=experiment,
@@ -135,24 +116,7 @@
with with_rng_seed(seed=seed):
scheduler.run_n_trials(max_trials=problem.num_trials)

if not problem.is_noiseless and problem.has_ground_truth:
# We modify the optimization config so we can use `Scheduler.get_trace()`
# to use the true (not corrupted by noise) observations that were logged
# as tracking metrics on the Experiment object. If the problem is known to
# be noiseless, this is unnecessary and we can just use the observations
# made during the optimization loop directly.
analysis_opt_config = make_ground_truth_optimization_config(
experiment=experiment
)
else:
analysis_opt_config = experiment.optimization_config

optimization_trace = np.asarray(
scheduler.get_trace(optimization_config=analysis_opt_config)
)

new_optimization_trace = problem.get_opt_trace(experiment=experiment)
np.testing.assert_allclose(optimization_trace, new_optimization_trace)
optimization_trace = problem.get_opt_trace(experiment=experiment)

try:
# Catch any errors that may occur during score computation, such as errors
@@ -217,125 +181,3 @@ def benchmark_multiple_problems_methods(
benchmark_one_method_problem(problem=p, method=m, seeds=seeds)
for p, m in product(problems, methods)
]


def make_ground_truth_metrics(
problem: BenchmarkProblem,
include_tracking_metrics: bool = True,
) -> dict[str, Metric]:
"""Makes a ground truth version for each metric defined on the problem.
Args:
problem: The BenchmarkProblem to test against (can be synthetic or real).
include_tracking_metrics: Whether or not to include tracking metrics.
Returns:
A dict mapping (original) metric names to their respective ground truth metric.
"""
if not problem.has_ground_truth:
raise ValueError(
"Cannot create ground truth metrics for problems that "
"do not have a ground truth."
)
metrics: list[BenchmarkMetricBase] = [
checked_cast(BenchmarkMetricBase, metric)
for metric in problem.optimization_config.metrics.values()
]
if include_tracking_metrics:
metrics = metrics + problem.tracking_metrics
return {metric.name: metric.make_ground_truth_metric() for metric in metrics}


def make_ground_truth_optimization_config(
experiment: Experiment,
) -> OptimizationConfig:
"""Makes a clone of the OptimizationConfig on the experiment in which each metric
is replaced by its respective "ground truth" counterpart, which has been added to
the experiment's tracking metrics in `_create_benchmark_experiment` and which
returns the ground truth (i.e., uncorrupted by noise) observations.
"""
optimization_config = not_none(experiment.optimization_config)

if optimization_config.risk_measure is not None:
raise NotImplementedError("Support for risk measures is not yet implemented.")

# dict for caching metric lookup
gt_metric_dict: dict[str, BenchmarkMetricBase] = {}

def get_gt_metric(metric: Metric) -> BenchmarkMetricBase:
"""Look up corresponding ground truth metric of the experiment. Will error
out if no corresponding ground truth metric exists."""
if not isinstance(metric, BenchmarkMetricBase):
raise ValueError(
"Only BenchmarkMetricBase metrics are supported for ground truth "
f"metrics. Got {type(metric)}."
)

if metric.name in gt_metric_dict:
return gt_metric_dict[metric.name]

for tracking_metric in experiment.tracking_metrics:
if getattr(tracking_metric, "is_ground_truth", False):
# TODO: Figure out if there is a better way to match the ground truth
# metric and the original metric.
ground_truth_name = tracking_metric.name
orig_name = checked_cast(
GroundTruthMetricMixin, tracking_metric
).get_original_name(ground_truth_name)
if orig_name == metric.name:
tracking_metric = checked_cast(BenchmarkMetricBase, tracking_metric)
gt_metric_dict[metric.name] = tracking_metric
return tracking_metric
raise ValueError(f"Ground truth metric for metric {metric.name} not found!")

# convert outcome constraints
if optimization_config.outcome_constraints is not None:
gt_outcome_constraints = [
OutcomeConstraint(
metric=get_gt_metric(oc.metric),
op=oc.op,
bound=oc.bound,
relative=oc.relative,
)
for oc in optimization_config.outcome_constraints
]
else:
gt_outcome_constraints = None

# we need to distinguish MOO and non-MOO problems
if not optimization_config.is_moo_problem:
gt_objective = Objective(
metric=get_gt_metric(optimization_config.objective.metric)
)

return OptimizationConfig(
objective=gt_objective, outcome_constraints=gt_outcome_constraints
)

gt_objective = MultiObjective(
metrics=[
get_gt_metric(metric) for metric in optimization_config.objective.metrics
]
)
# there may be objective thresholds to also convert
objective_thresholds = checked_cast(
MultiObjectiveOptimizationConfig, optimization_config
).objective_thresholds
if objective_thresholds is not None:
gt_objective_thresholds = [
ObjectiveThreshold(
metric=get_gt_metric(ot.metric),
bound=ot.bound,
relative=ot.relative,
op=ot.op,
)
for ot in objective_thresholds
]
else:
gt_objective_thresholds = None

return MultiObjectiveOptimizationConfig(
objective=gt_objective,
outcome_constraints=gt_outcome_constraints,
objective_thresholds=gt_objective_thresholds,
)
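The public entry points at the top of this file are untouched by the cleanup, so existing callers keep working. A hedged usage sketch follows; the keyword names are taken from the signatures visible above, and the seed values are arbitrary.

from ax.benchmark.benchmark import benchmark_multiple_problems_methods


def run_benchmark_grid(problems, methods, seeds=(0, 1, 2)):
    # Runs every (problem, method) combination for each seed and returns
    # the list of AggregatedBenchmarkResults produced by the entry point.
    return benchmark_multiple_problems_methods(
        problems=problems, methods=methods, seeds=seeds
    )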
50 changes: 1 addition & 49 deletions ax/benchmark/benchmark_method.py
@@ -7,13 +7,11 @@

import logging
from dataclasses import dataclass
from typing import Any

from ax.modelbridge.generation_strategy import GenerationStep, GenerationStrategy
from ax.modelbridge.generation_strategy import GenerationStrategy
from ax.service.utils.scheduler_options import SchedulerOptions, TrialType
from ax.utils.common.base import Base
from ax.utils.common.logger import get_logger
from ax.utils.common.typeutils import not_none


logger: logging.Logger = get_logger("BenchmarkMethod")
@@ -28,52 +26,13 @@ class BenchmarkMethod(Base):
Note: If `BenchmarkMethod.scheduler_options.total_trials` is less than
`BenchmarkProblem.num_trials` then only the number of trials specified in the
former will be run.
Note: The `generation_strategy` passed in is assumed to be in its "base state",
as it will be cloned and reset.
"""

name: str
generation_strategy: GenerationStrategy
scheduler_options: SchedulerOptions
distribute_replications: bool = False

def __post_init__(self) -> None:
# We (I think?) in general don't want to fit tracking metrics during our
# benchmarks. Further, not setting `fit_tracking_metrics=False` causes
# issues with the ground truth metrics created automatically when running
# the benchmark - in fact, things will error out deep inside the modeling
# stack since the model gets both noisy (benchmark) and noiseless (ground
# truth) observations. While support for this is something we should add
# for models, in the context of benchmarking we actually want to avoid
# fitting the ground truth metrics at all.

# Clone the GS so as to not modify the original one in-place below.
# Note that this assumes that the GS passed in is in its base state.
gs_cloned = self.generation_strategy.clone_reset()

for node in gs_cloned._nodes:
if isinstance(node, GenerationStep):
if node.model_kwargs is None:
node.model_kwargs = {}
if node.model_kwargs.get("fit_tracking_metrics", True):
logger.info(
"Setting `fit_tracking_metrics` in a GenerationStep to False.",
)
not_none(node.model_kwargs)["fit_tracking_metrics"] = False
for model_spec in node.model_specs:
if model_spec.model_kwargs is None:
model_spec.model_kwargs = {}
elif model_spec.model_kwargs.get("fit_tracking_metrics", True):
logger.info(
"Setting `fit_tracking_metrics` in a GenerationNode's "
"model_spec to False."
)
not_none(model_spec.model_kwargs)["fit_tracking_metrics"] = False

# hack around not being able to update frozen attribute of a dataclass
_assign_frozen_attr(self, name="generation_strategy", value=gs_cloned)


def get_benchmark_scheduler_options(
timeout_hours: int = 4,
@@ -103,10 +62,3 @@ def get_benchmark_scheduler_options(
trial_type=TrialType.TRIAL if batch_size == 1 else TrialType.BATCH_TRIAL,
batch_size=batch_size,
)


def _assign_frozen_attr(obj: Any, name: str, value: Any) -> None: # pyre-ignore [2]
"""Assign a new value to an attribute of a frozen dataclass.
This is an ugly hack and shouldn't be used broadly.
"""
object.__setattr__(obj, name, value)
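With the `__post_init__` hook and the frozen-attribute workaround gone, `BenchmarkMethod` is now a plain container for a name, a GenerationStrategy, and SchedulerOptions. A minimal sketch, assuming the defaults of `get_benchmark_scheduler_options` and using a purely illustrative Sobol-only strategy:

from ax.benchmark.benchmark_method import (
    BenchmarkMethod,
    get_benchmark_scheduler_options,
)
from ax.modelbridge.generation_strategy import GenerationStep, GenerationStrategy
from ax.modelbridge.registry import Models

# Illustrative Sobol-only strategy; real benchmarks would use a full strategy.
sobol_strategy = GenerationStrategy(
    name="Sobol", steps=[GenerationStep(model=Models.SOBOL, num_trials=-1)]
)

method = BenchmarkMethod(
    name="Sobol",
    generation_strategy=sobol_strategy,
    scheduler_options=get_benchmark_scheduler_options(),
)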
15 changes: 0 additions & 15 deletions ax/benchmark/benchmark_problem.py
@@ -11,8 +11,6 @@
import numpy as np
import pandas as pd

from ax.benchmark.metrics.base import BenchmarkMetricBase

from ax.benchmark.metrics.benchmark import BenchmarkMetric
from ax.benchmark.runners.base import BenchmarkRunner
from ax.benchmark.runners.botorch_test import BotorchTestProblemRunner
@@ -72,12 +70,6 @@ class BenchmarkProblem(Base):
observe_noise_stds: If boolean, whether the standard deviation of the
observation noise is observed for all metrics. If a dictionary,
whether noise levels are observed on a per-metric basis.
has_ground_truth: Whether the Runner produces underlying ground truth
values, which are not observed in real noisy problems but may be
known in benchmarks.
tracking_metrics: Tracking metrics are not optimized, and for the
purpose of benchmarking, they will not be fit. The ground truth may
be provided as `tracking_metrics`.
optimal_value: The best ground-truth objective value. Hypervolume for
multi-objective problems. If the best value is not known, it is
conventional to set it to a value that is almost certainly better
@@ -91,13 +83,10 @@
optimization_config: OptimizationConfig
num_trials: int
observe_noise_stds: Union[bool, dict[str, bool]] = False
has_ground_truth: bool = True
tracking_metrics: list[BenchmarkMetricBase] = field(default_factory=list)
optimal_value: float

search_space: SearchSpace = field(repr=False)
runner: BenchmarkRunner = field(repr=False)
is_noiseless: bool

def get_oracle_experiment(self, experiment: Experiment) -> Experiment:
records = []
@@ -263,8 +252,6 @@ def create_single_objective_problem_from_botorch(
),
num_trials=num_trials,
observe_noise_stds=observe_noise_sd,
is_noiseless=test_problem.noise_std in (None, 0.0),
has_ground_truth=True, # all synthetic problems have ground truth
optimal_value=optimal_value,
)

@@ -356,8 +343,6 @@ def create_multi_objective_problem_from_botorch(
optimization_config=optimization_config,
runner=runner,
num_trials=num_trials,
is_noiseless=test_problem.noise_std in (None, 0.0),
observe_noise_stds=observe_noise_sd,
has_ground_truth=True,
optimal_value=test_problem.max_hv,
)