diff --git a/ax/benchmark/benchmark.py b/ax/benchmark/benchmark.py index 14dca5c998e..0503c009e0b 100644 --- a/ax/benchmark/benchmark.py +++ b/ax/benchmark/benchmark.py @@ -29,20 +29,11 @@ from ax.benchmark.benchmark_method import BenchmarkMethod from ax.benchmark.benchmark_problem import BenchmarkProblem from ax.benchmark.benchmark_result import AggregatedBenchmarkResult, BenchmarkResult -from ax.benchmark.metrics.base import BenchmarkMetricBase, GroundTruthMetricMixin from ax.core.experiment import Experiment -from ax.core.metric import Metric -from ax.core.objective import MultiObjective, Objective -from ax.core.optimization_config import ( - MultiObjectiveOptimizationConfig, - OptimizationConfig, -) -from ax.core.outcome_constraint import ObjectiveThreshold, OutcomeConstraint from ax.core.utils import get_model_times from ax.service.scheduler import Scheduler from ax.utils.common.logger import get_logger from ax.utils.common.random import with_rng_seed -from ax.utils.common.typeutils import checked_cast, not_none logger: Logger = get_logger(__name__) @@ -88,25 +79,10 @@ def _create_benchmark_experiment( Returns: The Experiment object to be used for benchmarking. """ - tracking_metrics = problem.tracking_metrics - if not problem.is_noiseless and problem.has_ground_truth: - # Make the ground truth counterparts for each metric defined on the problem, - # which will be added as tracking metrics on the Experiment object below. - # In the analysis, a modified OptimziationConfig referencing those metrics - # will be passed to the `Scheduler.get_trace()` method, which allows to extract - # the optimziation trace based on the ground truth outcomes (without noise). - # If the problem is known to be noiseless, this is unneccesary and we can just - # use the observations made during the optimization loop directly. - gt_metric_dict = make_ground_truth_metrics(problem=problem) - tracking_metrics = tracking_metrics + list(gt_metric_dict.values()) return Experiment( name=f"{problem.name}|{method_name}_{int(time())}", search_space=problem.search_space, optimization_config=problem.optimization_config, - tracking_metrics=tracking_metrics, # pyre-ignore [6]: Incompatible - # parameter type: In call `Experiment.__init__`, for argument - # `tracking_metrics`, expected `Optional[List[Metric]]` but got - # `Union[List[Union[BenchmarkMetricBase, Metric]], List[BenchmarkMetricBase]]`. runner=problem.runner, ) @@ -124,7 +100,12 @@ def benchmark_replication( seed: The seed to use for this replication. """ - experiment = _create_benchmark_experiment(problem=problem, method_name=method.name) + experiment = Experiment( + name=f"{problem.name}|{method.name}_{int(time())}", + search_space=problem.search_space, + optimization_config=problem.optimization_config, + runner=problem.runner, + ) scheduler = Scheduler( experiment=experiment, @@ -135,24 +116,7 @@ def benchmark_replication( with with_rng_seed(seed=seed): scheduler.run_n_trials(max_trials=problem.num_trials) - if not problem.is_noiseless and problem.has_ground_truth: - # We modify the optimization config so we can use `Scheduler.get_trace()` - # to use the true (not corrupted by noise) observations that were logged - # as tracking metrics on the Experiment object. If the problem is known to - # be noiseless, this is unnecssary and we can just use the observations - # made during the optimization loop directly. 
- analysis_opt_config = make_ground_truth_optimization_config( - experiment=experiment - ) - else: - analysis_opt_config = experiment.optimization_config - - optimization_trace = np.asarray( - scheduler.get_trace(optimization_config=analysis_opt_config) - ) - - new_optimization_trace = problem.get_opt_trace(experiment=experiment) - np.testing.assert_allclose(optimization_trace, new_optimization_trace) + optimization_trace = problem.get_opt_trace(experiment=experiment) try: # Catch any errors that may occur during score computation, such as errors @@ -217,125 +181,3 @@ def benchmark_multiple_problems_methods( benchmark_one_method_problem(problem=p, method=m, seeds=seeds) for p, m in product(problems, methods) ] - - -def make_ground_truth_metrics( - problem: BenchmarkProblem, - include_tracking_metrics: bool = True, -) -> dict[str, Metric]: - """Makes a ground truth version for each metric defined on the problem. - - Args: - problem: The BenchmarkProblem to test against (can be synthetic or real). - include_tracking_metrics: Whether or not to include tracking metrics. - - Returns: - A dict mapping (original) metric names to their respective ground truth metric. - """ - if not problem.has_ground_truth: - raise ValueError( - "Cannot create ground truth metrics for problems that " - "do not have a ground truth." - ) - metrics: list[BenchmarkMetricBase] = [ - checked_cast(BenchmarkMetricBase, metric) - for metric in problem.optimization_config.metrics.values() - ] - if include_tracking_metrics: - metrics = metrics + problem.tracking_metrics - return {metric.name: metric.make_ground_truth_metric() for metric in metrics} - - -def make_ground_truth_optimization_config( - experiment: Experiment, -) -> OptimizationConfig: - """Makes a clone of the OptimizationConfig on the experiment in which each metric - is replaced by its respective "ground truth" counterpart, which has been added to - the experiment's tracking metrics in `_create_benchmark_experiment` and which - returns the ground truth (i.e., uncorrupted by noise) observations. - """ - optimization_config = not_none(experiment.optimization_config) - - if optimization_config.risk_measure is not None: - raise NotImplementedError("Support for risk measures is not yet implemented.") - - # dict for caching metric lookup - gt_metric_dict: dict[str, BenchmarkMetricBase] = {} - - def get_gt_metric(metric: Metric) -> BenchmarkMetricBase: - """Look up corresponding ground truth metric of the experiment. Will error - out if no corresponding ground truth metric exists.""" - if not isinstance(metric, BenchmarkMetricBase): - raise ValueError( - "Only BenchmarkMetricBase metrics are supported for ground truth " - f"metrics. Got {type(metric)}." - ) - - if metric.name in gt_metric_dict: - return gt_metric_dict[metric.name] - - for tracking_metric in experiment.tracking_metrics: - if getattr(tracking_metric, "is_ground_truth", False): - # TODO: Figure out if there is a better way to match the ground truth - # metric and the original metric. 
- ground_truth_name = tracking_metric.name - orig_name = checked_cast( - GroundTruthMetricMixin, tracking_metric - ).get_original_name(ground_truth_name) - if orig_name == metric.name: - tracking_metric = checked_cast(BenchmarkMetricBase, tracking_metric) - gt_metric_dict[metric.name] = tracking_metric - return tracking_metric - raise ValueError(f"Ground truth metric for metric {metric.name} not found!") - - # convert outcome constraints - if optimization_config.outcome_constraints is not None: - gt_outcome_constraints = [ - OutcomeConstraint( - metric=get_gt_metric(oc.metric), - op=oc.op, - bound=oc.bound, - relative=oc.relative, - ) - for oc in optimization_config.outcome_constraints - ] - else: - gt_outcome_constraints = None - - # we need to distinguish MOO and non-MOO problems - if not optimization_config.is_moo_problem: - gt_objective = Objective( - metric=get_gt_metric(optimization_config.objective.metric) - ) - - return OptimizationConfig( - objective=gt_objective, outcome_constraints=gt_outcome_constraints - ) - - gt_objective = MultiObjective( - metrics=[ - get_gt_metric(metric) for metric in optimization_config.objective.metrics - ] - ) - # there may be objective thresholds to also convert - objective_thresholds = checked_cast( - MultiObjectiveOptimizationConfig, optimization_config - ).objective_thresholds - if objective_thresholds is not None: - gt_objective_thresholds = [ - ObjectiveThreshold( - metric=get_gt_metric(ot.metric), - bound=ot.bound, - relative=ot.relative, - op=ot.op, - ) - for ot in objective_thresholds - ] - else: - gt_objective_thresholds = None - - return MultiObjectiveOptimizationConfig( - objective=gt_objective, - outcome_constraints=gt_outcome_constraints, - objective_thresholds=gt_objective_thresholds, - ) diff --git a/ax/benchmark/benchmark_method.py b/ax/benchmark/benchmark_method.py index 8bfee47415f..2b71dcadc22 100644 --- a/ax/benchmark/benchmark_method.py +++ b/ax/benchmark/benchmark_method.py @@ -7,13 +7,11 @@ import logging from dataclasses import dataclass -from typing import Any -from ax.modelbridge.generation_strategy import GenerationStep, GenerationStrategy +from ax.modelbridge.generation_strategy import GenerationStrategy from ax.service.utils.scheduler_options import SchedulerOptions, TrialType from ax.utils.common.base import Base from ax.utils.common.logger import get_logger -from ax.utils.common.typeutils import not_none logger: logging.Logger = get_logger("BenchmarkMethod") @@ -28,9 +26,6 @@ class BenchmarkMethod(Base): Note: If `BenchmarkMethod.scheduler_options.total_trials` is less than `BenchmarkProblem.num_trials` then only the number of trials specified in the former will be run. - - Note: The `generation_strategy` passed in is assumed to be in its "base state", - as it will be cloned and reset. """ name: str @@ -38,42 +33,6 @@ class BenchmarkMethod(Base): scheduler_options: SchedulerOptions distribute_replications: bool = False - def __post_init__(self) -> None: - # We (I think?) in general don't want to fit tracking metrics during our - # benchmarks. Further, not setting `fit_tracking_metrics=False`causes - # issues with the ground truth metrics created automatically when running - # the benchmark - in fact, things will error out deep inside the modeling - # stack since the model gets both noisy (benchmark) and noiseless (ground - # truth) observations. While support for this is something we shold add - # for models, in the context of benchmarking we actually want to avoid - # fitting the ground truth metrics at all. 
- - # Clone the GS so as to not modify the original one in-place below. - # Note that this assumes that the GS passed in is in its base state. - gs_cloned = self.generation_strategy.clone_reset() - - for node in gs_cloned._nodes: - if isinstance(node, GenerationStep): - if node.model_kwargs is None: - node.model_kwargs = {} - if node.model_kwargs.get("fit_tracking_metrics", True): - logger.info( - "Setting `fit_tracking_metrics` in a GenerationStep to False.", - ) - not_none(node.model_kwargs)["fit_tracking_metrics"] = False - for model_spec in node.model_specs: - if model_spec.model_kwargs is None: - model_spec.model_kwargs = {} - elif model_spec.model_kwargs.get("fit_tracking_metrics", True): - logger.info( - "Setting `fit_tracking_metrics` in a GenerationNode's " - "model_spec to False." - ) - not_none(model_spec.model_kwargs)["fit_tracking_metrics"] = False - - # hack around not being able to update frozen attribute of a dataclass - _assign_frozen_attr(self, name="generation_strategy", value=gs_cloned) - def get_benchmark_scheduler_options( timeout_hours: int = 4, @@ -103,10 +62,3 @@ def get_benchmark_scheduler_options( trial_type=TrialType.TRIAL if batch_size == 1 else TrialType.BATCH_TRIAL, batch_size=batch_size, ) - - -def _assign_frozen_attr(obj: Any, name: str, value: Any) -> None: # pyre-ignore [2] - """Assign a new value to an attribute of a frozen dataclass. - This is an ugly hack and shouldn't be used broadly. - """ - object.__setattr__(obj, name, value) diff --git a/ax/benchmark/benchmark_problem.py b/ax/benchmark/benchmark_problem.py index 77f5dabee69..e71e6b71f39 100644 --- a/ax/benchmark/benchmark_problem.py +++ b/ax/benchmark/benchmark_problem.py @@ -11,8 +11,6 @@ import numpy as np import pandas as pd -from ax.benchmark.metrics.base import BenchmarkMetricBase - from ax.benchmark.metrics.benchmark import BenchmarkMetric from ax.benchmark.runners.base import BenchmarkRunner from ax.benchmark.runners.botorch_test import BotorchTestProblemRunner @@ -72,12 +70,6 @@ class BenchmarkProblem(Base): observe_noise_stds: If boolean, whether the standard deviation of the observation noise is observed for all metrics. If a dictionary, whether noise levels are observed on a per-metric basis. - has_ground_truth: Whether the Runner produces underlying ground truth - values, which are not observed in real noisy problems but may be - known in benchmarks. - tracking_metrics: Tracking metrics are not optimized, and for the - purpose of benchmarking, they will not be fit. The ground truth may - be provided as `tracking_metrics`. optimal_value: The best ground-truth objective value. Hypervolume for multi-objective problems. 
If the best value is not known, it is conventional to set it to a value that is almost certainly better @@ -91,13 +83,10 @@ class BenchmarkProblem(Base): optimization_config: OptimizationConfig num_trials: int observe_noise_stds: Union[bool, dict[str, bool]] = False - has_ground_truth: bool = True - tracking_metrics: list[BenchmarkMetricBase] = field(default_factory=list) optimal_value: float search_space: SearchSpace = field(repr=False) runner: BenchmarkRunner = field(repr=False) - is_noiseless: bool def get_oracle_experiment(self, experiment: Experiment) -> Experiment: records = [] @@ -263,8 +252,6 @@ def create_single_objective_problem_from_botorch( ), num_trials=num_trials, observe_noise_stds=observe_noise_sd, - is_noiseless=test_problem.noise_std in (None, 0.0), - has_ground_truth=True, # all synthetic problems have ground truth optimal_value=optimal_value, ) @@ -356,8 +343,6 @@ def create_multi_objective_problem_from_botorch( optimization_config=optimization_config, runner=runner, num_trials=num_trials, - is_noiseless=test_problem.noise_std in (None, 0.0), observe_noise_stds=observe_noise_sd, - has_ground_truth=True, optimal_value=test_problem.max_hv, ) diff --git a/ax/benchmark/metrics/base.py b/ax/benchmark/metrics/base.py deleted file mode 100644 index 59bf5951e56..00000000000 --- a/ax/benchmark/metrics/base.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -""" -Module containing the metric base classes for benchmarks. The key property of -a benchmark metric is whether it has a ground truth or not, which is indicated -by a `has_ground_truth` attribute of `BenchmarkMetricBase`. All mnetrics used -in Ax bechmarks need to be subclassed from `BenchmarkMetricBase`. - -For metrics that do have a ground truth, we can compute the performance of the -optimization directly in terms of the ground truth observations (or the ground -truth of the out-of-sample model-suggested best point). For metrics that do not -have a ground truth, this is not possible. - -The benchmarks are designed in a way so that (unless the metric is noiseless) -no ground truth observations are available to the optimziation algorithm. -Instead, we use separate "ground truth metrics" attached as tracking metrics -to the experiment that are used to evaluate the performance after the -optimization is complete. `GroundTruthMetricMixin` can be used to construct -such ground truth metrics (with the `is_ground_truth` property indicating -that the metric provides the ground truth) and implements naming conventions -and helpers for associating a the ground truth metric to the respective metric -used during the optimization. -""" - -from __future__ import annotations - -from abc import ABC, abstractmethod - -from ax.core.metric import Metric - - -class BenchmarkMetricBase(Metric, ABC): - """A generic metric used for Ax Benchmarks. - - Attributes: - has_ground_truth: Whether or not there exists a ground truth for this - metric, i.e. whether each observation has an associated ground - truth value. This is trivially true for deterministic metrics, and - is also true for metrics where synthetic observation noise is added - to its (deterministic) values. This is not true for metrics that - are inherently noisy. 
- """ - - has_ground_truth: bool - - @abstractmethod - def make_ground_truth_metric(self) -> BenchmarkMetricBase: - """Create a ground truth version of this metric. If metric observations - are noisy, the ground truth would be the underlying noiseless values.""" - - -class GroundTruthMetricMixin(ABC): - """A mixin for metrics that defines a naming convention and associated helper - methods that allow mapping from a ground truth metric to its original metric - and vice versa.""" - - is_ground_truth: bool = True - _GROUND_TRUTH_SUFFIX = "__GROUND_TRUTH" - - @classmethod - def get_ground_truth_name(cls, metric: Metric) -> str: - return f"{metric.name}{cls._GROUND_TRUTH_SUFFIX}" - - @classmethod - def get_original_name(cls, full_name: str) -> str: - if not full_name.endswith(cls._GROUND_TRUTH_SUFFIX): - raise ValueError("full_name does not end with ground truth suffix.") - return full_name.replace(cls._GROUND_TRUTH_SUFFIX, "") diff --git a/ax/benchmark/metrics/benchmark.py b/ax/benchmark/metrics/benchmark.py index b759ae5935b..5e854ff29e7 100644 --- a/ax/benchmark/metrics/benchmark.py +++ b/ax/benchmark/metrics/benchmark.py @@ -9,29 +9,17 @@ from typing import Any, Optional -from ax.benchmark.metrics.base import BenchmarkMetricBase, GroundTruthMetricMixin from ax.benchmark.metrics.utils import _fetch_trial_data from ax.core.base_trial import BaseTrial -from ax.core.metric import MetricFetchResult +from ax.core.metric import Metric, MetricFetchResult -class BenchmarkMetric(BenchmarkMetricBase): +class BenchmarkMetric(Metric): """A generic metric used for observed values produced by Ax Benchmarks. - Compatible e.g. with results generated by `BotorchTestProblemRunner` and - `SurrogateRunner`. - - Attributes: - has_ground_truth: Whether or not there exists a ground truth for this - metric, i.e. whether each observation has an associated ground - truth value. This is trivially true for deterministic metrics, and - is also true for metrics where synthetic observation noise is added - to its (deterministic) values. This is not true for metrics that - are inherently noisy. + Compatible with results generated by `BenchmarkRunner`. """ - has_ground_truth: bool = True - def __init__( self, name: str, @@ -70,43 +58,4 @@ def fetch_trial_data(self, trial: BaseTrial, **kwargs: Any) -> MetricFetchResult metric_name=self.name, outcome_index=self.outcome_index, include_noise_sd=self.observe_noise_sd, - ground_truth=False, - ) - - def make_ground_truth_metric(self) -> BenchmarkMetricBase: - """Create a ground truth version of this metric.""" - return GroundTruthBenchmarkMetric(original_metric=self) - - -class GroundTruthBenchmarkMetric(BenchmarkMetric, GroundTruthMetricMixin): - def __init__(self, original_metric: BenchmarkMetric) -> None: - """ - Args: - original_metric: The original BenchmarkMetric to which this metric - corresponds. - """ - super().__init__( - name=self.get_ground_truth_name(original_metric), - lower_is_better=original_metric.lower_is_better, - observe_noise_sd=False, - outcome_index=original_metric.outcome_index, - ) - self.original_metric = original_metric - - def fetch_trial_data(self, trial: BaseTrial, **kwargs: Any) -> MetricFetchResult: - if len(kwargs) > 0: - raise NotImplementedError( - f"Arguments {set(kwargs)} are not supported in " - f"{self.__class__.__name__}.fetch_trial_data." 
- ) - return _fetch_trial_data( - trial=trial, - metric_name=self.name, - outcome_index=self.outcome_index, - include_noise_sd=False, - ground_truth=True, ) - - def make_ground_truth_metric(self) -> BenchmarkMetricBase: - """Create a ground truth version of this metric.""" - return self diff --git a/ax/benchmark/metrics/utils.py b/ax/benchmark/metrics/utils.py index 67aa4c3edf6..0b55df0aca0 100644 --- a/ax/benchmark/metrics/utils.py +++ b/ax/benchmark/metrics/utils.py @@ -8,11 +8,9 @@ from typing import Optional import pandas as pd -from ax.benchmark.metrics.base import GroundTruthMetricMixin from ax.core.base_trial import BaseTrial from ax.core.data import Data from ax.core.metric import MetricFetchE, MetricFetchResult -from ax.exceptions.core import UnsupportedError from ax.utils.common.result import Err, Ok @@ -21,7 +19,6 @@ def _fetch_trial_data( metric_name: str, outcome_index: Optional[int] = None, include_noise_sd: bool = True, - ground_truth: bool = False, ) -> MetricFetchResult: """ Args: @@ -30,26 +27,17 @@ def _fetch_trial_data( this is used to retrieve the index (of the outcomes) from the `outcome_names` dict in a trial's `run_metadata`. If `metric_index` is specified, this is simply the name of the metric. - outcome_index: The index (in the last dimension) of the `Ys`, `Ys_true`, and + outcome_index: The index (in the last dimension) of the `Ys` and `Ystds` lists of outcomes stored by the respective runner in the trial's `run_metadata`. If omitted, `run_metadata` must contain a `outcome_names` list of names in the same order as the outcomes that will be used to determine the index. include_noise_sd: Whether to include noise standard deviation in the returned - data. Must be `False` if `ground_truth` is set to `True`. - ground_truth: If True, return the ground truth values instead of the actual - (noisy) observations. In this case, the noise standard deviations will - be reported as zero. + data. Returns: A MetricFetchResult containing the data for the requested metric. """ - if include_noise_sd and ground_truth: - raise UnsupportedError( - "Cannot include noise standard deviation when extracting ground truth " - "data. Will be set to zero for ground truth observations." - ) - if outcome_index is None: # Look up the index based on the outcome name under which we track the data # as part of `run_metadata`. @@ -59,15 +47,11 @@ def _fetch_trial_data( "Trials' `run_metadata` must contain `outcome_names` if " "no `outcome_index` is provided." ) - outcome_index = outcome_names.index( - GroundTruthMetricMixin.get_original_name(metric_name) - if ground_truth - else metric_name - ) + outcome_index = outcome_names.index(metric_name) try: arm_names = list(trial.arms_by_name.keys()) - all_Ys = trial.run_metadata["Ys_true" if ground_truth else "Ys"] + all_Ys = trial.run_metadata["Ys"] Ys = [all_Ys[arm_name][outcome_index] for arm_name in arm_names] if include_noise_sd: @@ -75,11 +59,6 @@ def _fetch_trial_data( trial.run_metadata["Ystds"][arm_name][outcome_index] for arm_name in arm_names ] - elif ground_truth: - # Ground truth observations are noiseless (note that at least currently - # this information is not being used as we only use the ground truth - # observations for analysis but not for modeling). 
- stdvs = [0.0] * len(Ys) else: stdvs = [float("nan")] * len(Ys) diff --git a/ax/benchmark/problems/hpo/torchvision.py b/ax/benchmark/problems/hpo/torchvision.py index 993376acfeb..bcf8cd5248d 100644 --- a/ax/benchmark/problems/hpo/torchvision.py +++ b/ax/benchmark/problems/hpo/torchvision.py @@ -224,8 +224,6 @@ def get_pytorch_cnn_torchvision_benchmark_problem( optimization_config=optimization_config, num_trials=num_trials, observe_noise_stds=False, - is_noiseless=True, - has_ground_truth=True, optimal_value=base_problem.optimal_value, runner=runner, ) diff --git a/ax/benchmark/problems/synthetic/discretized/mixed_integer.py b/ax/benchmark/problems/synthetic/discretized/mixed_integer.py index 15b966fe3e8..769b7d2b698 100644 --- a/ax/benchmark/problems/synthetic/discretized/mixed_integer.py +++ b/ax/benchmark/problems/synthetic/discretized/mixed_integer.py @@ -118,9 +118,7 @@ def _get_problem_from_common_inputs( runner=runner, num_trials=num_trials, optimal_value=optimal_value, - is_noiseless=True, observe_noise_stds=observe_noise_sd, - has_ground_truth=True, ) diff --git a/ax/benchmark/problems/synthetic/hss/jenatton.py b/ax/benchmark/problems/synthetic/hss/jenatton.py index f277ceddd38..16cbb93d89b 100644 --- a/ax/benchmark/problems/synthetic/hss/jenatton.py +++ b/ax/benchmark/problems/synthetic/hss/jenatton.py @@ -130,8 +130,6 @@ def get_jenatton_benchmark_problem( outcome_names=[name], ), num_trials=num_trials, - is_noiseless=noise_std == 0.0, observe_noise_stds=observe_noise_sd, - has_ground_truth=True, optimal_value=Jenatton.optimal_value, ) diff --git a/ax/benchmark/runners/base.py b/ax/benchmark/runners/base.py index b4fb19eabf4..af9d0784628 100644 --- a/ax/benchmark/runners/base.py +++ b/ax/benchmark/runners/base.py @@ -89,14 +89,9 @@ def run(self, trial: BaseTrial) -> dict[str, Any]: noise standard deviations (possibly nan if the noise level is unobserved), where the order of the outcomes is the same as in `outcome_names`. - - Ys_true: A dict mapping arm names to lists of corresponding ground - truth outcomes, where the order of the outcomes is the same as - in `outcome_names`. If the benchmark problem does not provide a - ground truth, this key will not be present in the dict returned - by this function. - "outcome_names": A list of metric names. """ - Ys, Ys_true, Ystds = {}, {}, {} + Ys, Ystds = {}, {} noise_stds = self.get_noise_stds() if noise_stds is not None: @@ -126,7 +121,6 @@ def run(self, trial: BaseTrial) -> dict[str, Any]: for arm in trial.arms: # Case where we do have a ground truth Y_true = self.get_Y_true(arm) - Ys_true[arm.name] = Y_true.tolist() if noise_stds is None: # No noise, so just return the true outcome. 
Ystds[arm.name] = [0.0] * len(Y_true) @@ -144,7 +138,6 @@ def run(self, trial: BaseTrial) -> dict[str, Any]: "Ys": Ys, "Ystds": Ystds, "outcome_names": self.outcome_names, - "Ys_true": Ys_true, } return run_metadata diff --git a/ax/benchmark/tests/metrics/test_benchmark_metric.py b/ax/benchmark/tests/metrics/test_benchmark_metric.py index c98c538b22f..715b19b68df 100644 --- a/ax/benchmark/tests/metrics/test_benchmark_metric.py +++ b/ax/benchmark/tests/metrics/test_benchmark_metric.py @@ -5,7 +5,7 @@ # pyre-strict -from ax.benchmark.metrics.benchmark import BenchmarkMetric, GroundTruthBenchmarkMetric +from ax.benchmark.metrics.benchmark import BenchmarkMetric from ax.core.arm import Arm from ax.core.batch_trial import BatchTrial from ax.core.trial import Trial @@ -110,38 +110,3 @@ def test_fetch_trial_data_batch_trial(self) -> None: "trial_index": {0: 0, 1: 0}, }, ) - - def test_make_ground_truth_metric(self) -> None: - metric = BenchmarkMetric(name="test_metric1", lower_is_better=True) - gt_metric = metric.make_ground_truth_metric() - self.assertIsInstance(gt_metric, GroundTruthBenchmarkMetric) - self.assertEqual(gt_metric.name, "test_metric1__GROUND_TRUTH") - self.assertEqual(gt_metric.lower_is_better, metric.lower_is_better) - self.assertFalse(gt_metric.observe_noise_sd) # pyre-ignore [16] - self.assertEqual( - gt_metric.outcome_index, metric.outcome_index # pyre-ignore [16] - ) - self.assertIs(gt_metric.original_metric, metric) # pyre-ignore [16] - - trial = get_test_trial() - - with self.assertRaisesRegex( - NotImplementedError, - "Arguments {'foo'} are not supported in GroundTruthBenchmarkMetric", - ): - gt_metric.fetch_trial_data(trial, foo="bar") - - df = gt_metric.fetch_trial_data(trial=trial).value.df # pyre-ignore [16] - self.assertEqual(len(df), 1) - self.assertDictEqual( - df.iloc[0].to_dict(), - { - "arm_name": "0_0", - "metric_name": "test_metric1__GROUND_TRUTH", - "mean": 1.1, - "sem": 0.0, - "trial_index": 0, - }, - ) - - self.assertIs(gt_metric.make_ground_truth_metric(), gt_metric) diff --git a/ax/benchmark/tests/problems/hpo/test_torchvision.py b/ax/benchmark/tests/problems/hpo/test_torchvision.py index bed29b8da0c..5f019c24843 100644 --- a/ax/benchmark/tests/problems/hpo/test_torchvision.py +++ b/ax/benchmark/tests/problems/hpo/test_torchvision.py @@ -54,9 +54,7 @@ def test_problem_properties(self) -> None: ) self.assertFalse(problem.optimization_config.objective.minimize) self.assertEqual(problem.num_trials, num_trials) - self.assertTrue(problem.is_noiseless) self.assertFalse(problem.observe_noise_stds) - self.assertTrue(problem.has_ground_truth) def test_deterministic(self) -> None: problem_name = choice(["MNIST", "FashionMNIST"]) @@ -77,7 +75,6 @@ def test_deterministic(self) -> None: { "Ys": {"0": [expected]}, "Ystds": {"0": [0.0]}, - "Ys_true": {"0": [expected]}, "outcome_names": ["accuracy"], }, ) diff --git a/ax/benchmark/tests/problems/synthetic/hss/test_jenatton.py b/ax/benchmark/tests/problems/synthetic/hss/test_jenatton.py index 5f8ee63db30..172e80a64c3 100644 --- a/ax/benchmark/tests/problems/synthetic/hss/test_jenatton.py +++ b/ax/benchmark/tests/problems/synthetic/hss/test_jenatton.py @@ -8,7 +8,7 @@ import math from random import random -from ax.benchmark.metrics.benchmark import BenchmarkMetric, GroundTruthBenchmarkMetric +from ax.benchmark.metrics.benchmark import BenchmarkMetric from ax.benchmark.problems.synthetic.hss.jenatton import ( get_jenatton_benchmark_problem, @@ -114,7 +114,6 @@ def test_create_problem(self) -> None: ).test_problem.noise_std, 
0.0, ) - self.assertTrue(problem.is_noiseless) self.assertFalse(assert_is_instance(metric, BenchmarkMetric).observe_noise_sd) problem = get_jenatton_benchmark_problem( @@ -129,7 +128,6 @@ def test_create_problem(self) -> None: ).test_problem.noise_std, 0.1, ) - self.assertFalse(problem.is_noiseless) self.assertTrue(assert_is_instance(metric, BenchmarkMetric).observe_noise_sd) def test_fetch_trial_data(self) -> None: @@ -151,7 +149,6 @@ def test_fetch_trial_data(self) -> None: "Ys": {"0_0": [4.25]}, "Ystds": {"0_0": [0.0]}, "outcome_names": ["Jenatton"], - "Ys_true": {"0_0": [4.25]}, } self.assertEqual(metadata, expected_metadata) @@ -186,36 +183,3 @@ def test_fetch_trial_data(self) -> None: self.assertNotEqual(res_dict["mean"], 4.25) self.assertAlmostEqual(res_dict["sem"], 0.1) self.assertEqual(res_dict["trial_index"], 0) - - def test_make_ground_truth_metric(self) -> None: - problem = get_jenatton_benchmark_problem() - - arm = Arm(parameters={"x1": 0, "x2": 1, "x5": 2.0, "r8": 0.05}, name="0_0") - - experiment = Experiment( - search_space=problem.search_space, - name="Jenatton", - optimization_config=problem.optimization_config, - ) - - trial = Trial(experiment=experiment) - trial.add_arm(arm) - problem.runner.run(trial=trial) - metadata = problem.runner.run(trial=trial) - trial.update_run_metadata(metadata) - - metric = assert_is_instance( - problem.optimization_config.objective.metric, BenchmarkMetric - ) - gt_metric = metric.make_ground_truth_metric() - self.assertIsInstance(gt_metric, GroundTruthBenchmarkMetric) - runner = assert_is_instance(problem.runner, ParamBasedTestProblemRunner) - self.assertEqual(runner.test_problem.noise_std, 0.0) - self.assertFalse( - assert_is_instance(gt_metric, BenchmarkMetric).observe_noise_sd - ) - - self.assertIsInstance(metric, BenchmarkMetric) - self.assertNotIsInstance(metric, GroundTruthBenchmarkMetric) - self.assertEqual(runner.test_problem.noise_std, 0.0) - self.assertFalse(metric.observe_noise_sd) diff --git a/ax/benchmark/tests/problems/test_surrogate_problems.py b/ax/benchmark/tests/problems/test_surrogate_problems.py index 7295d617f5f..e901d88b87c 100644 --- a/ax/benchmark/tests/problems/test_surrogate_problems.py +++ b/ax/benchmark/tests/problems/test_surrogate_problems.py @@ -36,8 +36,8 @@ def test_repr(self) -> None: '"branin", ' "minimize=True), " "outcome_constraints=[]), num_trials=6, " - "observe_noise_stds=True, has_ground_truth=True, " - "tracking_metrics=[], optimal_value=0.0, is_noiseless=True)" + "observe_noise_stds=True, " + "optimal_value=0.0)" ) self.assertEqual(repr(sbp), expected_repr) diff --git a/ax/benchmark/tests/runners/test_botorch_test_problem.py b/ax/benchmark/tests/runners/test_botorch_test_problem.py index f6787e812fa..57cdb60e3a4 100644 --- a/ax/benchmark/tests/runners/test_botorch_test_problem.py +++ b/ax/benchmark/tests/runners/test_botorch_test_problem.py @@ -159,11 +159,8 @@ def test_synthetic_runner(self) -> None: trial.arm = arm trial.index = 0 res = runner.run(trial=trial) - self.assertSetEqual( - set(res.keys()), {"Ys", "Ys_true", "Ystds", "outcome_names"} - ) - self.assertSetEqual(set(res["Ys"].keys()), {"0_0"}) - self.assertEqual(res["Ys_true"]["0_0"], Y.tolist()) + self.assertEqual({"Ys", "Ystds", "outcome_names"}, res.keys()) + self.assertEqual({"0_0"}, res["Ys"].keys()) if noise_std is not None: self.assertEqual(res["Ystds"]["0_0"], [noise_std] * len(Y)) else: @@ -227,9 +224,7 @@ def test_botorch_test_problem_runner_heterogeneous_noise(self) -> None: trial.arm = arm trial.index = 0 res = 
runner.run(trial=trial) - self.assertSetEqual( - set(res.keys()), {"Ys", "Ys_true", "Ystds", "outcome_names"} - ) + self.assertSetEqual(set(res.keys()), {"Ys", "Ystds", "outcome_names"}) self.assertSetEqual(set(res["Ys"].keys()), {"0_0"}) self.assertEqual(res["Ystds"]["0_0"], [0.1, 0.05]) self.assertEqual(res["outcome_names"], ["objective", "constraint"]) diff --git a/ax/benchmark/tests/test_benchmark.py b/ax/benchmark/tests/test_benchmark.py index fb95644c786..8fd7f987204 100644 --- a/ax/benchmark/tests/test_benchmark.py +++ b/ax/benchmark/tests/test_benchmark.py @@ -14,8 +14,6 @@ benchmark_multiple_problems_methods, benchmark_one_method_problem, benchmark_replication, - make_ground_truth_metrics, - make_ground_truth_optimization_config, ) from ax.benchmark.benchmark_method import ( BenchmarkMethod, @@ -24,10 +22,7 @@ from ax.benchmark.benchmark_problem import create_single_objective_problem_from_botorch from ax.benchmark.benchmark_result import BenchmarkResult from ax.benchmark.methods.modular_botorch import get_sobol_botorch_modular_acquisition -from ax.benchmark.metrics.base import GroundTruthMetricMixin -from ax.benchmark.metrics.benchmark import BenchmarkMetric, GroundTruthBenchmarkMetric from ax.benchmark.problems.registry import get_problem -from ax.core.optimization_config import MultiObjectiveOptimizationConfig from ax.modelbridge.generation_strategy import GenerationNode, GenerationStrategy from ax.modelbridge.model_spec import ModelSpec from ax.modelbridge.registry import Models @@ -35,7 +30,7 @@ from ax.storage.json_store.load import load_experiment from ax.storage.json_store.save import save_experiment from ax.utils.common.testutils import TestCase -from ax.utils.common.typeutils import checked_cast, not_none +from ax.utils.common.typeutils import not_none from ax.utils.testing.benchmark_stubs import ( get_moo_surrogate, get_multi_objective_benchmark_problem, @@ -113,79 +108,6 @@ def test_storage(self) -> None: experiment = load_experiment(f.name) self.assertEqual(experiment, experiment) - def test_make_ground_truth_metrics(self) -> None: - problem = get_single_objective_benchmark_problem(observe_noise_sd=False) - metric = problem.optimization_config.objective.metric - - # basic setup - gt_metrics = make_ground_truth_metrics(problem=problem) - self.assertEqual(len(gt_metrics), 1) - gt_metric = checked_cast(GroundTruthBenchmarkMetric, gt_metrics[metric.name]) - self.assertIs(gt_metric.original_metric, metric) - - # add a tracking metric - tracking_metric = BenchmarkMetric(name="test_track", lower_is_better=True) - problem.tracking_metrics = [tracking_metric] - gt_metrics = make_ground_truth_metrics(problem=problem) - self.assertEqual(len(gt_metrics), 2) - gt_tracking_metric = checked_cast( - GroundTruthBenchmarkMetric, gt_metrics["test_track"] - ) - self.assertIs(gt_tracking_metric.original_metric, tracking_metric) - - # set include_tracking_metrics=False - gt_metrics = make_ground_truth_metrics( - problem=problem, include_tracking_metrics=False - ) - self.assertEqual(len(gt_metrics), 1) - - # error out if the problem does not have ground truth - problem.has_ground_truth = False - with self.assertRaisesRegex(ValueError, "do not have a ground truth"): - make_ground_truth_metrics(problem=problem) - - def test_make_ground_truth_optimization_config(self) -> None: - problem = get_single_objective_benchmark_problem(observe_noise_sd=False) - metric = problem.optimization_config.objective.metric - experiment = _create_benchmark_experiment( - problem=problem, method_name="test_method" 
- ) - - # A vanilla experiment w/o ground truth metrics attached should error - with self.assertRaisesRegex( - ValueError, f"Ground truth metric for metric {metric.name} not found!" - ): - make_ground_truth_optimization_config(experiment) - - # Add the ground truth metric and check basic behavior - gt_metric = make_ground_truth_metrics(problem)[metric.name] - experiment.add_tracking_metric(gt_metric) - gt_opt_cfg = make_ground_truth_optimization_config(experiment) - self.assertIs(gt_opt_cfg.objective.metric, gt_metric) - - # Test behavior with MOO problem - problem = get_multi_objective_benchmark_problem(observe_noise_sd=False) - self.assertIsInstance( - problem.optimization_config, MultiObjectiveOptimizationConfig - ) - experiment = _create_benchmark_experiment( - problem=problem, method_name="test_method" - ) - gt_metrics = make_ground_truth_metrics(problem) - for metric in problem.optimization_config.objective.metrics: - experiment.add_tracking_metric(gt_metrics[metric.name]) - gt_opt_cfg = make_ground_truth_optimization_config(experiment) - - for metric in gt_opt_cfg.objective.metrics: - gt_name = metric.name - metric = checked_cast(GroundTruthMetricMixin, metric) - self.assertIs(metric, gt_metrics[metric.get_original_name(gt_name)]) - - for metric in gt_opt_cfg.outcome_constraints: - gt_name = metric.metric.name - metric = checked_cast(GroundTruthMetricMixin, metric.metric) - self.assertIs(metric, gt_metrics[metric.get_original_name(gt_name)]) - def test_benchmark_result_invalid_inputs(self) -> None: """ Test that a BenchmarkResult cannot be specified with both an `experiment` @@ -244,14 +166,6 @@ def test_create_benchmark_experiment(self) -> None: self.assertEqual( experiment.optimization_config, problem.optimization_config ) - self.assertEqual(len(experiment.tracking_metrics), 1) - gt_metric = checked_cast( - GroundTruthBenchmarkMetric, experiment.tracking_metrics[0] - ) - self.assertIs( - gt_metric.original_metric, - problem.optimization_config.objective.metric, - ) self.assertEqual(experiment.runner, problem.runner) with self.subTest("noisy, observed noise std"): @@ -267,14 +181,6 @@ def test_create_benchmark_experiment(self) -> None: self.assertEqual( experiment.optimization_config, problem.optimization_config ) - self.assertEqual(len(experiment.tracking_metrics), 1) - gt_metric = checked_cast( - GroundTruthBenchmarkMetric, experiment.tracking_metrics[0] - ) - self.assertIs( - gt_metric.original_metric, - problem.optimization_config.objective.metric, - ) self.assertEqual(experiment.runner, problem.runner) def test_replication_sobol_synthetic(self) -> None: diff --git a/ax/benchmark/tests/test_benchmark_problem.py b/ax/benchmark/tests/test_benchmark_problem.py index 155d2eee8f0..0f794f9b583 100644 --- a/ax/benchmark/tests/test_benchmark_problem.py +++ b/ax/benchmark/tests/test_benchmark_problem.py @@ -91,10 +91,7 @@ def test_single_objective_from_botorch(self) -> None: "minimize=True), outcome_constraints=[]), " "num_trials=1, " "observe_noise_stds=False, " - "has_ground_truth=True, " - "tracking_metrics=[], " - "optimal_value=0.0, " - "is_noiseless=True)" + "optimal_value=0.0)" ) else: outcome_constraint = ( @@ -112,10 +109,7 @@ def test_single_objective_from_botorch(self) -> None: " >= 0.0)]), " "num_trials=1, " "observe_noise_stds=False, " - "has_ground_truth=True, " - "tracking_metrics=[], " - "optimal_value=-3.32237, " - "is_noiseless=True)" + "optimal_value=-3.32237)" ) self.assertEqual(repr(test_problem), expected_repr) diff --git a/ax/storage/json_store/registry.py 
b/ax/storage/json_store/registry.py index 48ef289edf7..e0f3123904e 100644 --- a/ax/storage/json_store/registry.py +++ b/ax/storage/json_store/registry.py @@ -16,7 +16,7 @@ MultiObjectiveBenchmarkProblem, ) from ax.benchmark.benchmark_result import AggregatedBenchmarkResult, BenchmarkResult -from ax.benchmark.metrics.benchmark import BenchmarkMetric, GroundTruthBenchmarkMetric +from ax.benchmark.metrics.benchmark import BenchmarkMetric from ax.benchmark.problems.hpo.torchvision import PyTorchCNNTorchvisionParamBasedProblem from ax.benchmark.runners.botorch_test import ( BotorchTestProblemRunner, @@ -201,7 +201,6 @@ GenerationNode: generation_node_to_dict, GenerationStrategy: generation_strategy_to_dict, GeneratorRun: generator_run_to_dict, - GroundTruthBenchmarkMetric: metric_to_dict, Hartmann6Metric: metric_to_dict, ImprovementGlobalStoppingStrategy: improvement_global_stopping_strategy_to_dict, Interval: botorch_component_to_dict, @@ -318,8 +317,6 @@ "GenerationStep": GenerationStep, "GeneratorRun": GeneratorRun, "GeneratorRunStruct": GeneratorRunStruct, - "GroundTruthBenchmarkMetric": GroundTruthBenchmarkMetric, - "GroundTruthBotorchTestProblemMetric": GroundTruthBenchmarkMetric, # for BC "Hartmann6Metric": Hartmann6Metric, "HierarchicalSearchSpace": HierarchicalSearchSpace, "ImprovementGlobalStoppingStrategy": ImprovementGlobalStoppingStrategy, diff --git a/ax/utils/testing/benchmark_stubs.py b/ax/utils/testing/benchmark_stubs.py index fa681ea61a9..100d56a18d6 100644 --- a/ax/utils/testing/benchmark_stubs.py +++ b/ax/utils/testing/benchmark_stubs.py @@ -131,7 +131,6 @@ def get_soo_surrogate() -> SOOSurrogateBenchmarkProblem: observe_noise_stds=observe_noise_sd, optimal_value=0.0, runner=runner, - is_noiseless=runner.is_noiseless, ) @@ -180,7 +179,6 @@ def get_moo_surrogate() -> MOOSurrogateBenchmarkProblem: observe_noise_stds=True, optimal_value=1.0, runner=runner, - is_noiseless=runner.is_noiseless, )
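
Note: the following is a minimal, self-contained sketch (separate from the patch above) of how the simplified `run_metadata` layout written by `BenchmarkRunner.run` — the `Ys`, `Ystds`, and `outcome_names` entries — can be read back per arm and metric, mirroring the lookup performed in `_fetch_trial_data`. The helper name `lookup_outcome` and the literal example values are illustrative only and are not part of the Ax API.

import math


def lookup_outcome(
    run_metadata: dict,
    metric_name: str,
    arm_name: str,
    include_noise_sd: bool = True,
) -> tuple[float, float]:
    """Return (mean, sem) for one arm and metric from benchmark run metadata."""
    # Each arm's outcomes are ordered as in `outcome_names`.
    outcome_index = run_metadata["outcome_names"].index(metric_name)
    mean = run_metadata["Ys"][arm_name][outcome_index]
    # When the noise level is not observed, the standard error is reported as NaN.
    sem = (
        run_metadata["Ystds"][arm_name][outcome_index]
        if include_noise_sd
        else math.nan
    )
    return mean, sem


# Metadata in the shape asserted by the Jenatton runner test above.
metadata = {
    "Ys": {"0_0": [4.25]},
    "Ystds": {"0_0": [0.0]},
    "outcome_names": ["Jenatton"],
}
print(lookup_outcome(metadata, metric_name="Jenatton", arm_name="0_0"))  # (4.25, 0.0)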