diff --git a/ax/benchmark/benchmark.py b/ax/benchmark/benchmark.py
index 2412f98d19f..33df2c1ec5c 100644
--- a/ax/benchmark/benchmark.py
+++ b/ax/benchmark/benchmark.py
@@ -22,7 +22,7 @@
 from collections.abc import Iterable
 from itertools import product
 from logging import Logger
-from time import time
+from time import monotonic, time
 
 import numpy as np
 
@@ -30,6 +30,7 @@
 from ax.benchmark.benchmark_problem import BenchmarkProblem
 from ax.benchmark.benchmark_result import AggregatedBenchmarkResult, BenchmarkResult
 from ax.core.experiment import Experiment
+from ax.core.types import TParameterization
 from ax.core.utils import get_model_times
 from ax.service.scheduler import Scheduler
 from ax.service.utils.best_point_mixin import BestPointMixin
@@ -93,12 +94,23 @@ def benchmark_replication(
     method: BenchmarkMethod,
     seed: int,
 ) -> BenchmarkResult:
-    """Runs one benchmarking replication (equivalent to one optimization loop).
+    """
+    Run one benchmarking replication (equivalent to one optimization loop).
+
+    After each trial, the `method` gets the best parameter(s) found so far, as
+    evaluated based on empirical data. After all trials are run, the `problem`
+    gets the oracle values of each "best" parameter; this yields the
+    ``inference_trace``. The cumulative maximum of the oracle value of each
+    parameterization tested is the ``oracle_trace``.
+
     Args:
         problem: The BenchmarkProblem to test against (can be synthetic or real)
         method: The BenchmarkMethod to test
         seed: The seed to use for this replication.
+
+    Returns:
+        A ``BenchmarkResult`` object.
     """
 
     experiment = Experiment(
@@ -113,19 +125,67 @@ def benchmark_replication(
         generation_strategy=method.generation_strategy.clone_reset(),
         options=method.scheduler_options,
     )
+    timeout_hours = scheduler.options.timeout_hours
+
+    # Best parameters recorded after each trial (one list per trial).
+    best_params_by_trial: list[list[TParameterization]] = []
+
+    is_mf_or_mt = len(problem.runner.target_fidelity_and_task) > 0
 
+    # Run the optimization loop.
     with with_rng_seed(seed=seed):
-        scheduler.run_n_trials(max_trials=problem.num_trials)
+        for _ in range(problem.num_trials):
+            start = monotonic()
+            next(
+                scheduler.run_trials_and_yield_results(
+                    max_trials=1, timeout_hours=timeout_hours
+                )
+            )
+            if timeout_hours is not None:
+                elapsed_hours = (monotonic() - start) / 3600
+                timeout_hours = timeout_hours - elapsed_hours
+                if timeout_hours <= 0:
+                    break
+
+            if problem.is_moo or is_mf_or_mt:
+                # Inference trace is not supported for MOO.
+                # It's also not supported for multi-fidelity or multi-task
+                # problems, because Ax's best-point functionality doesn't know
+                # to predict at the target task or fidelity.
+                continue
+
+            best_params = method.get_best_parameters(
+                experiment=experiment,
+                optimization_config=problem.optimization_config,
+                n_points=problem.n_best_points,
+            )
+            best_params_by_trial.append(best_params)
+
+    # Construct the inference trace from the best parameters.
+    inference_trace = np.full(problem.num_trials, np.nan)
+    for trial_index, best_params in enumerate(best_params_by_trial):
+        # Construct an experiment with one BatchTrial.
+        best_params_oracle_experiment = problem.get_oracle_experiment_from_params(
+            {0: {str(i): p for i, p in enumerate(best_params)}}
+        )
+        # Get the optimization trace. It will have only one point.
+        inference_trace[trial_index] = BestPointMixin._get_trace(
+            experiment=best_params_oracle_experiment,
+            optimization_config=problem.optimization_config,
+        )[0]
 
-    oracle_experiment = problem.get_oracle_experiment_from_experiment(
+    actual_params_oracle_experiment = problem.get_oracle_experiment_from_experiment(
         experiment=experiment
     )
-    optimization_trace = np.array(
+    oracle_trace = np.array(
         BestPointMixin._get_trace(
-            experiment=oracle_experiment,
+            experiment=actual_params_oracle_experiment,
             optimization_config=problem.optimization_config,
         )
     )
+    optimization_trace = (
+        inference_trace if problem.report_inference_value_as_trace else oracle_trace
+    )
 
     try:
         # Catch any errors that may occur during score computation, such as errors
@@ -155,6 +215,8 @@
         name=scheduler.experiment.name,
         seed=seed,
         experiment=scheduler.experiment,
+        oracle_trace=oracle_trace,
+        inference_trace=inference_trace,
         optimization_trace=optimization_trace,
         score_trace=score_trace,
         fit_time=fit_time,
diff --git a/ax/benchmark/benchmark_method.py b/ax/benchmark/benchmark_method.py
index 95983d569d8..01f962af4de 100644
--- a/ax/benchmark/benchmark_method.py
+++ b/ax/benchmark/benchmark_method.py
@@ -5,16 +5,20 @@
 
 # pyre-strict
 
-import logging
-from dataclasses import dataclass
+from dataclasses import dataclass, field
+
+from ax.core.experiment import Experiment
+from ax.core.optimization_config import (
+    MultiObjectiveOptimizationConfig,
+    OptimizationConfig,
+)
+from ax.core.types import TParameterization
 from ax.modelbridge.generation_strategy import GenerationStrategy
+from ax.service.utils.best_point_mixin import BestPointMixin
 from ax.service.utils.scheduler_options import SchedulerOptions, TrialType
 from ax.utils.common.base import Base
-from ax.utils.common.logger import get_logger
-
-
-logger: logging.Logger = get_logger("BenchmarkMethod")
+from pyre_extensions import none_throws
 
 
 @dataclass(frozen=True)
@@ -36,12 +40,74 @@ class BenchmarkMethod(Base):
             `get_benchmark_scheduler_options`.
         distribute_replications: Indicates whether the replications should be
             run in a distributed manner. Ax itself does not use this attribute.
+        best_point_kwargs: Arguments passed to `get_pareto_optimal_parameters`
+            (if multi-objective) or `BestPointMixin._get_best_trial` (if
+            single-objective). Currently, the only supported argument is
+            `use_model_predictions`. Note, however, that best-point selection
+            is not currently supported for multi-objective problems, and
+            `get_best_parameters` will raise a `NotImplementedError`.
     """
 
     name: str
     generation_strategy: GenerationStrategy
     scheduler_options: SchedulerOptions
     distribute_replications: bool = False
+    best_point_kwargs: dict[str, bool] = field(
+        default_factory=lambda: {"use_model_predictions": False}
+    )
+
+    def get_best_parameters(
+        self,
+        experiment: Experiment,
+        optimization_config: OptimizationConfig,
+        n_points: int,
+    ) -> list[TParameterization]:
+        """
+        Get ``n_points`` promising points. NOTE: Only SOO with ``n_points=1``
+        is currently supported.
+
+        The expected use case is that these points will be evaluated against
+        an oracle for hypervolume (if multi-objective) or for the value of the
+        best parameter (if single-objective).
+
+        For multi-objective cases, ``n_points > 1`` is needed. For SOO,
+        ``n_points > 1`` reflects setups where we can choose several points,
+        evaluate them noiselessly or at high fidelity, and then use the best
+        one.
+
+        Args:
+            experiment: The experiment to get the data from.
+                This should contain values that would be observed in a
+                realistic setting and not contain oracle values.
+            optimization_config: The ``optimization_config`` for the
+                corresponding ``BenchmarkProblem``.
+            n_points: The number of points to return.
+        """
+        if isinstance(optimization_config, MultiObjectiveOptimizationConfig):
+            raise NotImplementedError(
+                "BenchmarkMethod.get_pareto_optimal_parameters is not currently "
+                "supported for multi-objective problems."
+            )
+
+        if n_points != 1:
+            raise NotImplementedError(
+                f"Currently only n_points=1 is supported. Got {n_points=}."
+            )
+
+        # SOO, n=1 case.
+        # Note: This has the same effect as Scheduler.get_best_parameters.
+        result = BestPointMixin._get_best_trial(
+            experiment=experiment,
+            generation_strategy=self.generation_strategy,
+            optimization_config=optimization_config,
+            # pyre-fixme: Incompatible parameter type [6]: In call
+            # `get_pareto_optimal_parameters`, for 4th positional argument,
+            # expected `Optional[Iterable[int]]` but got `bool`.
+            **self.best_point_kwargs,
+        )
+
+        i, params, prediction = none_throws(result)
+        return [params]
 
 
 def get_benchmark_scheduler_options(
diff --git a/ax/benchmark/benchmark_problem.py b/ax/benchmark/benchmark_problem.py
index ea8e21313bb..a39d90c8406 100644
--- a/ax/benchmark/benchmark_problem.py
+++ b/ax/benchmark/benchmark_problem.py
@@ -74,6 +74,13 @@ class BenchmarkProblem(Base):
         search_space: The search space.
         runner: The Runner that will be used to generate data for the problem,
             including any ground-truth data stored as tracking metrics.
+        report_inference_value_as_trace: Whether the ``optimization_trace`` on
+            a ``BenchmarkResult`` should use the ``oracle_trace`` (if False,
+            default) or the ``inference_trace``. See ``BenchmarkResult`` for
+            more information. Currently, this is only supported for
+            single-objective problems.
+        n_best_points: Number of points for a best-point selector to recommend.
+            Currently, only ``n_best_points=1`` is supported.
     """
 
     name: str
@@ -84,6 +91,17 @@ class BenchmarkProblem(Base):
     search_space: SearchSpace = field(repr=False)
     runner: BenchmarkRunner = field(repr=False)
+    report_inference_value_as_trace: bool = False
+    n_best_points: int = 1
+
+    def __post_init__(self) -> None:
+        if self.n_best_points != 1:
+            raise NotImplementedError("Only `n_best_points=1` is currently supported.")
+        if self.report_inference_value_as_trace and self.is_moo:
+            raise NotImplementedError(
+                "Inference trace is not supported for MOO. Please set "
+                "`report_inference_value_as_trace` to False."
+            )
 
     def get_oracle_experiment_from_params(
         self,
@@ -285,6 +303,7 @@ def create_problem_from_botorch(
     lower_is_better: bool = True,
     observe_noise_sd: bool = False,
     search_space: SearchSpace | None = None,
+    report_inference_value_as_trace: bool = False,
 ) -> BenchmarkProblem:
     """
     Create a `BenchmarkProblem` from a BoTorch `BaseTestProblem`.
@@ -308,6 +327,10 @@ def create_problem_from_botorch(
         search_space: If provided, the `search_space` of the `BenchmarkProblem`.
             Otherwise, a `SearchSpace` with all `RangeParameter`s is created
            from the bounds of the test problem.
+        report_inference_value_as_trace: If True, indicates that the
+            ``optimization_trace`` on a ``BenchmarkResult`` ought to be the
+            ``inference_trace``; otherwise, it will be the ``oracle_trace``.
+            See ``BenchmarkResult`` for more information.
""" # pyre-fixme [45]: Invalid class instantiation test_problem = test_problem_class(**test_problem_kwargs) @@ -364,4 +387,5 @@ def create_problem_from_botorch( num_trials=num_trials, observe_noise_stds=observe_noise_sd, optimal_value=optimal_value, + report_inference_value_as_trace=report_inference_value_as_trace, ) diff --git a/ax/benchmark/benchmark_result.py b/ax/benchmark/benchmark_result.py index afa74e77ee4..52bcf06f994 100644 --- a/ax/benchmark/benchmark_result.py +++ b/ax/benchmark/benchmark_result.py @@ -33,15 +33,38 @@ class BenchmarkResult(Base): name: Name of the benchmark. Should make it possible to determine the problem and the method. seed: Seed used for determinism. - optimization_trace: For single-objective problems, element i of the - optimization trace is the oracle value of the "best" point, computed - after the first i trials have been run. For multi-objective - problems, element i of the optimization trace is the hypervolume of - oracle values at a set of points, also computed after the first i - trials (even if these were ``BatchTrials``). Oracle values are - typically ground-truth (rather than noisy) and evaluated at the - target task and fidelity. - + oracle_trace: For single-objective problems, element i of the + optimization trace is the best oracle value of the arms evaluated + after the first i trials. For multi-objective problems, element i + of the optimization trace is the hypervolume of the oracle values of + the arms in the first i trials (which may be ``BatchTrial``s). + Oracle values are typically ground-truth (rather than noisy) and + evaluated at the target task and fidelity. + inference_trace: Inference trace comes from choosing a "best" point + based only on data that would be observable in realistic settings + and then evaluating the oracle value of that point. For + multi-objective problems, we find a Pareto set and evaluate its + hypervolume. + + There are several ways of specifying the "best" point: One could + pick the point with the best observed value, or the point with the + best model prediction, and could consider the whole search space, + the set of trials completed so far, etc. How the inference trace is + computed is specified by a best-point selector, which is an + attribute of the `BenchmarkMethod`. + + Note: This is not "inference regret", which is a lower-is-better value + that is relative to the best possible value. The inference value + trace is higher-is-better if the problem is a maximization problem + or if the problem is multi-objective (in which case hypervolume is + used). Hence, it is signed the same as ``oracle_trace`` and + ``optimization_trace``. ``score_trace`` is higher-is-better and + relative to the optimum. + optimization_trace: Either the ``oracle_trace`` or the + ``inference_trace``, depending on whether the ``BenchmarkProblem`` + specifies ``report_inference_value``. Having ``optimization_trace`` + specified separately is useful when we need just one value to + evaluate how well the benchmark went. score_trace: The scores associated with the problem, typically either the optimization_trace or inference_value_trace normalized to a 0-100 scale for comparability between problems. 
@@ -56,6 +79,8 @@ class BenchmarkResult(Base):
 
     name: str
     seed: int
+    oracle_trace: ndarray
+    inference_trace: ndarray
     optimization_trace: ndarray
     score_trace: ndarray
 
diff --git a/ax/benchmark/methods/modular_botorch.py b/ax/benchmark/methods/modular_botorch.py
index 7df32f7f2d1..a55aeb1c598 100644
--- a/ax/benchmark/methods/modular_botorch.py
+++ b/ax/benchmark/methods/modular_botorch.py
@@ -48,6 +48,7 @@ def get_sobol_botorch_modular_acquisition(
     name: Optional[str] = None,
     num_sobol_trials: int = 5,
     model_gen_kwargs: Optional[dict[str, Any]] = None,
+    best_point_kwargs: dict[str, bool] | None = None,
 ) -> BenchmarkMethod:
     """Get a `BenchmarkMethod` that uses Sobol followed by MBM.
 
@@ -64,6 +65,7 @@ def get_sobol_botorch_modular_acquisition(
             `BatchTrial`s.
         model_gen_kwargs: Passed to the BoTorch `GenerationStep` and ultimately
             to the BoTorch `Model`.
+        best_point_kwargs: Passed to the created `BenchmarkMethod`.
 
     Example:
         >>> # A simple example
@@ -138,4 +140,5 @@ def get_sobol_botorch_modular_acquisition(
         generation_strategy=generation_strategy,
         scheduler_options=scheduler_options or get_benchmark_scheduler_options(),
         distribute_replications=distribute_replications,
+        best_point_kwargs={} if best_point_kwargs is None else best_point_kwargs,
     )
diff --git a/ax/benchmark/tests/methods/test_methods.py b/ax/benchmark/tests/methods/test_methods.py
index 740213f8af7..27ae23844f6 100644
--- a/ax/benchmark/tests/methods/test_methods.py
+++ b/ax/benchmark/tests/methods/test_methods.py
@@ -6,14 +6,24 @@
 
 # pyre-strict
 
+from itertools import product
+from unittest.mock import patch
+
 import numpy as np
 from ax.benchmark.benchmark import benchmark_replication
 from ax.benchmark.benchmark_method import get_benchmark_scheduler_options
 from ax.benchmark.methods.modular_botorch import get_sobol_botorch_modular_acquisition
 from ax.benchmark.methods.sobol import get_sobol_benchmark_method
 from ax.benchmark.problems.registry import get_problem
+from ax.core.experiment import Experiment
 from ax.modelbridge.registry import Models
+from ax.service.scheduler import Scheduler
+from ax.service.utils.best_point import (
+    get_best_by_raw_objective_with_trial_index,
+    get_best_parameters_from_model_predictions_with_trial_index,
+)
 from ax.service.utils.scheduler_options import SchedulerOptions
+from ax.utils.common.random import with_rng_seed
 from ax.utils.common.testutils import TestCase
 from ax.utils.testing.mock import fast_botorch_optimize
 from botorch.acquisition.acquisition import AcquisitionFunction
@@ -124,3 +134,68 @@ def test_sobol(self) -> None:
         problem = get_problem(problem_name="ackley4", num_trials=3)
         result = benchmark_replication(problem=problem, method=method, seed=0)
         self.assertTrue(np.isfinite(result.score_trace).all())
+
+    def _test_get_best_parameters(
+        self, use_model_predictions: bool, as_batch: bool
+    ) -> None:
+        problem = get_problem(
+            problem_name="ackley4", num_trials=2, test_problem_kwargs={"noise_std": 1.0}
+        )
+
+        method = get_sobol_botorch_modular_acquisition(
+            model_cls=SingleTaskGP,
+            acquisition_cls=qLogExpectedImprovement,
+            distribute_replications=False,
+            best_point_kwargs={"use_model_predictions": use_model_predictions},
+            num_sobol_trials=1,
+        )
+
+        experiment = Experiment(
+            name="test",
+            search_space=problem.search_space,
+            optimization_config=problem.optimization_config,
+            runner=problem.runner,
+        )
+
+        scheduler = Scheduler(
+            experiment=experiment,
+            generation_strategy=method.generation_strategy.clone_reset(),
+            options=method.scheduler_options,
+        )
+
+        with with_rng_seed(seed=0):
+            scheduler.run_n_trials(max_trials=problem.num_trials)
+
+        # Because the second trial is a BoTorch trial, the model should be used.
+        best_point_mixin_path = "ax.service.utils.best_point_mixin.best_point_utils."
+        with patch(
+            best_point_mixin_path
+            + "get_best_parameters_from_model_predictions_with_trial_index",
+            wraps=get_best_parameters_from_model_predictions_with_trial_index,
+        ) as mock_get_best_parameters_from_predictions, patch(
+            best_point_mixin_path + "get_best_by_raw_objective_with_trial_index",
+            wraps=get_best_by_raw_objective_with_trial_index,
+        ) as mock_get_best_by_raw_objective_with_trial_index:
+            best_params = method.get_best_parameters(
+                experiment=experiment,
+                optimization_config=problem.optimization_config,
+                n_points=1,
+            )
+        if use_model_predictions:
+            mock_get_best_parameters_from_predictions.assert_called_once()
+            # get_best_by_raw_objective_with_trial_index might be used as a
+            # fallback
+        else:
+            mock_get_best_parameters_from_predictions.assert_not_called()
+            mock_get_best_by_raw_objective_with_trial_index.assert_called_once()
+        self.assertEqual(len(best_params), 1)
+
+    def test_get_best_parameters(self) -> None:
+        for use_model_predictions, as_batch in product(
+            [False, True],
+            [False, True],
+        ):
+            with self.subTest(f"{use_model_predictions=}, {as_batch=}"):
+                self._test_get_best_parameters(
+                    use_model_predictions=use_model_predictions, as_batch=as_batch
+                )
diff --git a/ax/benchmark/tests/test_benchmark.py b/ax/benchmark/tests/test_benchmark.py
index 5e2447b9236..3c3294114f7 100644
--- a/ax/benchmark/tests/test_benchmark.py
+++ b/ax/benchmark/tests/test_benchmark.py
@@ -6,6 +6,7 @@
 # pyre-strict
 
 import tempfile
+from itertools import product
 from unittest.mock import patch
 
 import numpy as np
@@ -119,6 +120,8 @@ def test_benchmark_result_invalid_inputs(self) -> None:
             BenchmarkResult(
                 name="name",
                 seed=0,
+                inference_trace=np.array([]),
+                oracle_trace=np.array([]),
                 optimization_trace=np.array([]),
                 score_trace=np.array([]),
                 fit_time=0.0,
@@ -133,6 +136,8 @@ def test_benchmark_result_invalid_inputs(self) -> None:
             BenchmarkResult(
                 name="name",
                 seed=0,
+                inference_trace=np.array([]),
+                oracle_trace=np.array([]),
                 optimization_trace=np.array([]),
                 score_trace=np.array([]),
                 fit_time=0.0,
@@ -220,6 +225,73 @@ def test_replication_sobol_surrogate(self) -> None:
         self.assertTrue(np.isfinite(res.score_trace).all())
         self.assertTrue(np.all(res.score_trace <= 100))
 
+    @fast_botorch_optimize
+    def _test_replication_with_inference_value(
+        self,
+        batch_size: int,
+        use_model_predictions: bool,
+        report_inference_value_as_trace: bool,
+    ) -> None:
+        seed = 1
+        method = get_sobol_botorch_modular_acquisition(
+            model_cls=SingleTaskGP,
+            acquisition_cls=qLogNoisyExpectedImprovement,
+            distribute_replications=False,
+            best_point_kwargs={"use_model_predictions": use_model_predictions},
+            num_sobol_trials=3,
+        )
+
+        test_problem_kwargs = {"noise_std": 100.0}
+        num_trials = 4
+        problem = get_single_objective_benchmark_problem(
+            test_problem_kwargs=test_problem_kwargs,
+            num_trials=num_trials,
+            report_inference_value_as_trace=report_inference_value_as_trace,
+        )
+        res = benchmark_replication(problem=problem, method=method, seed=seed)
+        # The inference trace could coincide with the oracle trace, but that
+        # does not happen in this example, given the high noise and this seed.
+        self.assertEqual(
+            np.equal(res.inference_trace, res.optimization_trace).all(),
+            report_inference_value_as_trace,
+        )
+        self.assertEqual(
+            np.equal(res.oracle_trace, res.optimization_trace).all(),
+            not report_inference_value_as_trace,
+        )
+
+        self.assertEqual(res.optimization_trace.shape, (problem.num_trials,))
+        self.assertTrue((res.inference_trace >= res.oracle_trace).all())
+        self.assertTrue((res.score_trace >= 0).all())
+        self.assertTrue((res.score_trace <= 100).all())
+
+    def test_replication_with_inference_value(self) -> None:
+        for (
+            use_model_predictions,
+            batch_size,
+            report_inference_value_as_trace,
+        ) in product(
+            [False, True],
+            [1, 2],
+            [False, True],
+        ):
+            with self.subTest(
+                batch_size=batch_size,
+                use_model_predictions=use_model_predictions,
+                report_inference_value_as_trace=report_inference_value_as_trace,
+            ):
+                self._test_replication_with_inference_value(
+                    batch_size=batch_size,
+                    use_model_predictions=use_model_predictions,
+                    report_inference_value_as_trace=report_inference_value_as_trace,
+                )
+
+        with self.assertRaisesRegex(
+            NotImplementedError,
+            "Inference trace is not supported for MOO",
+        ):
+            get_multi_objective_benchmark_problem(report_inference_value_as_trace=True)
+
     @fast_botorch_optimize
     def test_replication_mbm(self) -> None:
         with patch.dict(
diff --git a/ax/benchmark/tests/test_benchmark_problem.py b/ax/benchmark/tests/test_benchmark_problem.py
index 9e1218a8070..887b49fc9d0 100644
--- a/ax/benchmark/tests/test_benchmark_problem.py
+++ b/ax/benchmark/tests/test_benchmark_problem.py
@@ -14,11 +14,14 @@
 
 from ax.benchmark.benchmark_metric import BenchmarkMetric
 
-from ax.benchmark.benchmark_problem import create_problem_from_botorch
+from ax.benchmark.benchmark_problem import BenchmarkProblem, create_problem_from_botorch
 from ax.benchmark.runners.botorch_test import BotorchTestProblemRunner
 from ax.core.arm import Arm
-from ax.core.objective import MultiObjective
-from ax.core.optimization_config import MultiObjectiveOptimizationConfig
+from ax.core.objective import MultiObjective, Objective
+from ax.core.optimization_config import (
+    MultiObjectiveOptimizationConfig,
+    OptimizationConfig,
+)
 from ax.core.parameter import ChoiceParameter, ParameterType, RangeParameter
 from ax.core.search_space import SearchSpace
 from ax.core.types import ComparisonOp
@@ -44,6 +47,46 @@ def setUp(self) -> None:
         self.maxDiff = None
         super().setUp()
 
+    def test_inference_value_not_implemented(self) -> None:
+        objectives = [
+            Objective(metric=BenchmarkMetric(name, lower_is_better=True))
+            for name in ["Branin", "Currin"]
+        ]
+        optimization_config = OptimizationConfig(objective=objectives[0])
+        runner = BotorchTestProblemRunner(
+            test_problem_class=Branin,
+            outcome_names=["foo"],
+            test_problem_kwargs={},
+        )
+        with self.assertRaisesRegex(NotImplementedError, "Only `n_best_points=1`"):
+            BenchmarkProblem(
+                name="foo",
+                optimization_config=optimization_config,
+                num_trials=1,
+                observe_noise_stds=False,
+                optimal_value=0.0,
+                search_space=SearchSpace(parameters=[]),
+                runner=runner,
+                n_best_points=2,
+            )
+
+        with self.assertRaisesRegex(
+            NotImplementedError, "Inference trace is not supported for MOO"
+        ):
+            BenchmarkProblem(
+                name="foo",
+                optimization_config=MultiObjectiveOptimizationConfig(
+                    objective=MultiObjective(objectives)
+                ),
+                num_trials=1,
+                observe_noise_stds=False,
+                optimal_value=0.0,
+                search_space=SearchSpace(parameters=[]),
+                runner=runner,
+                n_best_points=1,
+                report_inference_value_as_trace=True,
+            )
+
     def _test_multi_fidelity_or_multi_task(self, fidelity_or_task: str) -> None:
         """
         Args:
diff --git a/ax/service/scheduler.py b/ax/service/scheduler.py
index 47aa7f3f624..9494f825861 100644
--- a/ax/service/scheduler.py
+++ b/ax/service/scheduler.py
@@ -850,7 +850,7 @@ def run_trials_and_yield_results(
         self,
         max_trials: int,
         ignore_global_stopping_strategy: bool = False,
-        timeout_hours: Optional[int] = None,
+        timeout_hours: int | float | None = None,
         idle_callback: Optional[Callable[[Scheduler], None]] = None,
     ) -> Generator[dict[str, Any], None, None]:
         """Make continuous calls to `run` and `process_results` to run up to
diff --git a/ax/utils/testing/benchmark_stubs.py b/ax/utils/testing/benchmark_stubs.py
index a87e1f9e171..7f3bfd7b524 100644
--- a/ax/utils/testing/benchmark_stubs.py
+++ b/ax/utils/testing/benchmark_stubs.py
@@ -46,12 +46,14 @@ def get_single_objective_benchmark_problem(
     observe_noise_sd: bool = False,
     num_trials: int = 4,
     test_problem_kwargs: Optional[dict[str, Any]] = None,
+    report_inference_value_as_trace: bool = False,
 ) -> BenchmarkProblem:
     return create_problem_from_botorch(
         test_problem_class=Branin,
         test_problem_kwargs=test_problem_kwargs or {},
         num_trials=num_trials,
         observe_noise_sd=observe_noise_sd,
+        report_inference_value_as_trace=report_inference_value_as_trace,
     )
 
 
@@ -59,12 +61,14 @@ def get_multi_objective_benchmark_problem(
     observe_noise_sd: bool = False,
     num_trials: int = 4,
     test_problem_class: type[BraninCurrin] = BraninCurrin,
+    report_inference_value_as_trace: bool = False,
 ) -> BenchmarkProblem:
     return create_problem_from_botorch(
         test_problem_class=test_problem_class,
         test_problem_kwargs={},
         num_trials=num_trials,
         observe_noise_sd=observe_noise_sd,
+        report_inference_value_as_trace=report_inference_value_as_trace,
     )
 
 
@@ -206,6 +210,8 @@ def get_benchmark_result() -> BenchmarkResult:
             runner=problem.runner,
             is_test=True,
         ),
+        inference_trace=np.ones(4),
+        oracle_trace=np.zeros(4),
        optimization_trace=np.array([3, 2, 1, 0.1]),
        score_trace=np.array([3, 2, 1, 0.1]),
        fit_time=0.1,
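
A minimal usage sketch of the options introduced above, using only the call signatures that appear in this patch's test files; the BoTorch import paths for `Branin`, `SingleTaskGP`, and `qLogNoisyExpectedImprovement` are assumed, and the specific argument values are illustrative rather than recommended defaults.

# Sketch: exercise report_inference_value_as_trace and best_point_kwargs
# together, assuming the APIs added in this patch.
from botorch.acquisition.logei import qLogNoisyExpectedImprovement
from botorch.models import SingleTaskGP
from botorch.test_functions import Branin

from ax.benchmark.benchmark import benchmark_replication
from ax.benchmark.benchmark_problem import create_problem_from_botorch
from ax.benchmark.methods.modular_botorch import get_sobol_botorch_modular_acquisition

# Report the inference trace (best point chosen from observed data, then
# scored by the oracle) as the optimization trace.
problem = create_problem_from_botorch(
    test_problem_class=Branin,
    test_problem_kwargs={"noise_std": 1.0},
    num_trials=6,
    report_inference_value_as_trace=True,
)

# Select the best point from raw observations rather than model predictions.
method = get_sobol_botorch_modular_acquisition(
    model_cls=SingleTaskGP,
    acquisition_cls=qLogNoisyExpectedImprovement,
    distribute_replications=False,
    num_sobol_trials=3,
    best_point_kwargs={"use_model_predictions": False},
)

result = benchmark_replication(problem=problem, method=method, seed=0)
print(result.oracle_trace)        # cumulative best oracle value per trial
print(result.inference_trace)     # oracle value of the point selected after each trial
print(result.optimization_trace)  # equals inference_trace for this problem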