diff --git a/demo/guide-python/cross_validation.py b/demo/guide-python/cross_validation.py index 4e537108aa1a..a33a16c36f04 100644 --- a/demo/guide-python/cross_validation.py +++ b/demo/guide-python/cross_validation.py @@ -2,6 +2,7 @@ Demo for using cross validation =============================== """ + import os import numpy as np @@ -83,9 +84,12 @@ def logregobj(preds, dtrain): def evalerror(preds, dtrain): labels = dtrain.get_label() + preds = 1.0 / (1.0 + np.exp(-preds)) return "error", float(sum(labels != (preds > 0.0))) / len(labels) param = {"max_depth": 2, "eta": 1} # train with customized objective -xgb.cv(param, dtrain, num_round, nfold=5, seed=0, obj=logregobj, feval=evalerror) +xgb.cv( + param, dtrain, num_round, nfold=5, seed=0, obj=logregobj, custom_metric=evalerror +) diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst index a8999e119ab4..5398fb5d091f 100644 --- a/doc/python/python_api.rst +++ b/doc/python/python_api.rst @@ -37,6 +37,7 @@ Core Data Structure .. autoclass:: xgboost.Booster :members: :show-inheritance: + :special-members: __getitem__ .. autoclass:: xgboost.DataIter :members: diff --git a/python-package/xgboost/dask/__init__.py b/python-package/xgboost/dask/__init__.py index 6c92e9205dc9..b2fc191f1c02 100644 --- a/python-package/xgboost/dask/__init__.py +++ b/python-package/xgboost/dask/__init__.py @@ -766,7 +766,6 @@ async def _train_async( num_boost_round: int, evals: Optional[Sequence[Tuple[DaskDMatrix, str]]], obj: Optional[Objective], - feval: Optional[Metric], early_stopping_rounds: Optional[int], verbose_eval: Union[int, bool], xgb_model: Optional[Booster], @@ -816,7 +815,6 @@ def do_train( # pylint: disable=too-many-positional-arguments evals_result=local_history, evals=evals if len(evals) != 0 else None, obj=obj, - feval=feval, custom_metric=custom_metric, early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose_eval, @@ -870,7 +868,6 @@ def train( # pylint: disable=unused-argument *, evals: Optional[Sequence[Tuple[DaskDMatrix, str]]] = None, obj: Optional[Objective] = None, - feval: Optional[Metric] = None, early_stopping_rounds: Optional[int] = None, xgb_model: Optional[Booster] = None, verbose_eval: Union[int, bool] = True, @@ -1675,7 +1672,6 @@ async def _fit_async( num_boost_round=self.get_num_boosting_rounds(), evals=evals, obj=obj, - feval=None, custom_metric=metric, verbose_eval=verbose, early_stopping_rounds=self.early_stopping_rounds, @@ -1784,7 +1780,6 @@ async def _fit_async( num_boost_round=self.get_num_boosting_rounds(), evals=evals, obj=obj, - feval=None, custom_metric=metric, verbose_eval=verbose, early_stopping_rounds=self.early_stopping_rounds, @@ -1986,7 +1981,6 @@ async def _fit_async( num_boost_round=self.get_num_boosting_rounds(), evals=evals, obj=None, - feval=None, custom_metric=metric, verbose_eval=verbose, early_stopping_rounds=self.early_stopping_rounds, diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index c337505f7641..b197539bfc1f 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -426,7 +426,7 @@ def task(i: int) -> float: Metric used for monitoring the training result and early stopping. It can be a string or list of strings as names of predefined metric in XGBoost (See - doc/parameter.rst), one of the metrics in :py:mod:`sklearn.metrics`, or any + :doc:`/parameter`), one of the metrics in :py:mod:`sklearn.metrics`, or any other user defined metric that looks like `sklearn.metrics`. 
If custom objective is also provided, then custom metric should implement the diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 80e0ad2db1f5..0821aee913c3 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -662,9 +662,29 @@ def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool: M = TypeVar("M", xgb.Booster, xgb.XGBModel) -def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]: - """Evaluation metric for xgb.train""" +def logregobj(preds: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[np.ndarray, np.ndarray]: + """Binary regression custom objective.""" + labels = dtrain.get_label() + preds = 1.0 / (1.0 + np.exp(-preds)) + grad = preds - labels + hess = preds * (1.0 - preds) + return grad, hess + + +def eval_error_metric( + predt: np.ndarray, dtrain: xgb.DMatrix, rev_link: bool +) -> Tuple[str, np.float64]: + """Evaluation metric for xgb.train. + + Parameters + ---------- + rev_link : Whether the metric needs to apply the reverse link function (activation). + + """ label = dtrain.get_label() + if rev_link: + predt = 1.0 / (1.0 + np.exp(-predt)) + assert (0.0 <= predt).all() and (predt <= 1.0).all() r = np.zeros(predt.shape) gt = predt > 0.5 if predt.size == 0: @@ -675,8 +695,15 @@ def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.f return "CustomErr", np.sum(r) -def eval_error_metric_skl(y_true: np.ndarray, y_score: np.ndarray) -> np.float64: +def eval_error_metric_skl( + y_true: np.ndarray, y_score: np.ndarray, rev_link: bool = False +) -> np.float64: """Evaluation metric that looks like metrics provided by sklearn.""" + + if rev_link: + y_score = 1.0 / (1.0 + np.exp(-y_score)) + assert (0.0 <= y_score).all() and (y_score <= 1.0).all() + r = np.zeros(y_score.shape) gt = y_score > 0.5 r[gt] = 1 - y_true[gt] diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py index 86370469a400..29a516e81e24 100644 --- a/python-package/xgboost/training.py +++ b/python-package/xgboost/training.py @@ -3,7 +3,6 @@ """Training Library containing training routines.""" import copy import os -import warnings from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union, cast import numpy as np @@ -28,26 +27,6 @@ _CVFolds = Sequence["CVPack"] -def _configure_custom_metric( - feval: Optional[Metric], custom_metric: Optional[Metric] -) -> Optional[Metric]: - if feval is not None: - link = ( - "https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html" - ) - warnings.warn( - "`feval` is deprecated, use `custom_metric` instead. They have " - "different behavior when custom objective is also used." - f"See {link} for details on the `custom_metric`." - ) - if feval is not None and custom_metric is not None: - raise ValueError( - "Both `feval` and `custom_metric` are supplied. Use `custom_metric` instead." - ) - eval_metric = custom_metric if custom_metric is not None else feval - return eval_metric - - @_deprecate_positional_args def train( params: Dict[str, Any], @@ -56,7 +35,6 @@ def train( *, evals: Optional[Sequence[Tuple[DMatrix, str]]] = None, obj: Optional[Objective] = None, - feval: Optional[Metric] = None, maximize: Optional[bool] = None, early_stopping_rounds: Optional[int] = None, evals_result: Optional[TrainingCallback.EvalsLog] = None, @@ -81,23 +59,27 @@ def train( obj Custom objective function. See :doc:`Custom Objective ` for details. - feval : - .. 
deprecated:: 1.6.0 - Use `custom_metric` instead. maximize : - Whether to maximize feval. + Whether to maximize custom_metric. + early_stopping_rounds : + Activates early stopping. Validation metric needs to improve at least once in every **early_stopping_rounds** round(s) to continue training. + Requires at least one item in **evals**. + The method returns the model from the last iteration (not the best one). Use - custom callback or model slicing if the best model is desired. - If there's more than one item in **evals**, the last entry will be used for early - stopping. + custom callback :py:class:`~xgboost.callback.EarlyStopping` or :py:meth:`model + slicing ` if the best model is desired. If there's + more than one item in **evals**, the last entry will be used for early stopping. + If there's more than one metric in the **eval_metric** parameter given in **params**, the last metric will be used for early stopping. + If early stopping occurs, the model will have two additional fields: ``bst.best_score``, ``bst.best_iteration``. + evals_result : This dictionary stores the evaluation results of all the items in watchlist. @@ -113,15 +95,22 @@ def train( verbose_eval : Requires at least one item in **evals**. + If **verbose_eval** is True then the evaluation metric on the validation set is printed at each boosting stage. - If **verbose_eval** is an integer then the evaluation metric on the validation set - is printed at every given **verbose_eval** boosting stage. The last boosting stage - / the boosting stage found by using **early_stopping_rounds** is also printed. - Example: with ``verbose_eval=4`` and at least one item in **evals**, an evaluation metric - is printed every 4 boosting stages, instead of every boosting stage. + + If **verbose_eval** is an integer then the evaluation metric on the validation + set is printed at every given **verbose_eval** boosting stage. The last boosting + stage / the boosting stage found by using **early_stopping_rounds** is also + printed. + + Example: with ``verbose_eval=4`` and at least one item in **evals**, an + evaluation metric is printed every 4 boosting stages, instead of every boosting + stage. + xgb_model : Xgb model to be loaded before training (allows training continuation). + callbacks : List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using @@ -145,15 +134,17 @@ def train( .. versionadded 1.6.0 Custom metric function. See :doc:`Custom Metric ` - for details. + for details. The metric receives transformed prediction (after applying the + reverse link function) when using a builtin objective, and raw output when using + a custom objective. Returns ------- Booster : a trained booster model + """ callbacks = [] if callbacks is None else copy.copy(list(callbacks)) - metric_fn = _configure_custom_metric(feval, custom_metric) evals = list(evals) if evals else [] bst = Booster(params, [dtrain] + [d[0] for d in evals], model_file=xgb_model) @@ -165,12 +156,7 @@ def train( if early_stopping_rounds: callbacks.append(EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)) cb_container = CallbackContainer( - callbacks, - metric=metric_fn, - # For old `feval` parameter, the behavior is unchanged. For the new - # `custom_metric`, it will receive proper prediction result when custom objective - # is not used. 
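# Editorial aside, not part of the patch: the updated ``train`` docstring above
# points users at Booster slicing (now documented through ``__getitem__`` in
# python_api.rst) to recover the model at the best iteration after early
# stopping. A minimal sketch, assuming a synthetic binary classification task:
import numpy as np
import xgboost as xgb

X = np.random.rand(256, 8)
y = np.random.randint(0, 2, size=256)
booster = xgb.train(
    {"objective": "binary:logistic"}, xgb.DMatrix(X, y), num_boost_round=10
)
# booster[a:b] returns a new Booster containing only boosting rounds [a, b).
first_half = booster[:5]
assert len(first_half.get_dump()) == 5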
-        output_margin=callable(obj) or metric_fn is feval,
+        callbacks, metric=custom_metric, output_margin=callable(obj)
     )

     bst = cb_container.before_training(bst)
@@ -423,7 +409,6 @@ def cv(
     folds: XGBStratifiedKFold = None,
     metrics: Sequence[str] = (),
     obj: Optional[Objective] = None,
-    feval: Optional[Metric] = None,
     maximize: Optional[bool] = None,
     early_stopping_rounds: Optional[int] = None,
     fpreproc: Optional[FPreProcCallable] = None,
@@ -464,11 +449,9 @@ def cv(
         Custom objective function. See :doc:`Custom Objective
        ` for details.

-    feval : function
-        .. deprecated:: 1.6.0
-            Use `custom_metric` instead.
     maximize : bool
-        Whether to maximize feval.
+        Whether to maximize the evaluation metric (score or error).
+
     early_stopping_rounds: int
         Activates early stopping. Cross-Validation metric (average of validation
         metric computed over CV folds) needs to improve at least once in
@@ -559,8 +542,6 @@ def cv(
         shuffle=shuffle,
     )

-    metric_fn = _configure_custom_metric(feval, custom_metric)
-
     # setup callbacks
     callbacks = [] if callbacks is None else copy.copy(list(callbacks))

@@ -570,10 +551,7 @@ def cv(
     if early_stopping_rounds:
         callbacks.append(EarlyStopping(rounds=early_stopping_rounds, maximize=maximize))
     callbacks_container = CallbackContainer(
-        callbacks,
-        metric=metric_fn,
-        is_cv=True,
-        output_margin=callable(obj) or metric_fn is feval,
+        callbacks, metric=custom_metric, is_cv=True, output_margin=callable(obj)
     )

     booster = _PackedBooster(cvfolds)
diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py
index 8ee0b4e8e692..d0ef625fa008 100644
--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -27,6 +27,7 @@ class LintersPaths:
        "tests/python/test_dt.py",
        "tests/python/test_demos.py",
        "tests/python/test_eval_metrics.py",
+        "tests/python/test_early_stopping.py",
        "tests/python/test_multi_target.py",
        "tests/python/test_objectives.py",
        "tests/python/test_predict.py",
@@ -54,6 +55,7 @@ class LintersPaths:
        "demo/guide-python/callbacks.py",
        "demo/guide-python/categorical.py",
        "demo/guide-python/cat_pipeline.py",
+        "demo/guide-python/cross_validation.py",
        "demo/guide-python/feature_weights.py",
        "demo/guide-python/model_parser.py",
        "demo/guide-python/sklearn_parallel.py",
diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py
index 3e945546e13b..b24152e5dc9a 100644
--- a/tests/python/test_basic_models.py
+++ b/tests/python/test_basic_models.py
@@ -1,6 +1,7 @@
 import json
 import os
 import tempfile
+from typing import Optional

 import numpy as np
 import pytest
@@ -17,38 +18,49 @@ class TestModels:
     def test_glm(self):
-        param = {'objective': 'binary:logistic',
-                 'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1,
-                 'nthread': 1}
+        param = {
+            "objective": "binary:logistic",
+            "booster": "gblinear",
+            "alpha": 0.0001,
+            "lambda": 1,
+            "nthread": 1,
+        }
         dtrain, dtest = tm.load_agaricus(__file__)
-        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
+        watchlist = [(dtest, "eval"), (dtrain, "train")]
         num_round = 4
         bst = xgb.train(param, dtrain, num_round, watchlist)
         assert isinstance(bst, xgb.core.Booster)
         preds = bst.predict(dtest)
         labels = dtest.get_label()
-        err = sum(1 for i in range(len(preds))
-                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
+        err = sum(
+            1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]
+        ) / float(len(preds))
         assert err < 0.2

     def test_dart(self):
         dtrain, dtest = tm.load_agaricus(__file__)
-        param = {'max_depth': 5, 'objective': 'binary:logistic',
-                 'eval_metric': 'logloss', 'booster': 'dart',
'verbosity': 1} + param = { + "max_depth": 5, + "objective": "binary:logistic", + "eval_metric": "logloss", + "booster": "dart", + "verbosity": 1, + } # specify validations set to watch performance - watchlist = [(dtest, 'eval'), (dtrain, 'train')] + watchlist = [(dtest, "eval"), (dtrain, "train")] num_round = 2 bst = xgb.train(param, dtrain, num_round, watchlist) # this is prediction preds = bst.predict(dtest, iteration_range=(0, num_round)) labels = dtest.get_label() - err = sum(1 for i in range(len(preds)) - if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + err = sum( + 1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i] + ) / float(len(preds)) # error must be smaller than 10% assert err < 0.1 with tempfile.TemporaryDirectory() as tmpdir: - dtest_path = os.path.join(tmpdir, 'dtest.dmatrix') + dtest_path = os.path.join(tmpdir, "dtest.dmatrix") model_path = os.path.join(tmpdir, "xgboost.model.dart.ubj") # save dmatrix into binary buffer dtest.save_binary(dtest_path) @@ -66,28 +78,30 @@ def test_dart(self): def my_logloss(preds, dtrain): labels = dtrain.get_label() - return 'logloss', np.sum( - np.log(np.where(labels, preds, 1 - preds))) + return "logloss", np.sum(np.log(np.where(labels, preds, 1 - preds))) # check whether custom evaluation metrics work - bst = xgb.train(param, dtrain, num_round, watchlist, - feval=my_logloss) + bst = xgb.train( + param, dtrain, num_round, evals=watchlist, custom_metric=my_logloss + ) preds3 = bst.predict(dtest, iteration_range=(0, num_round)) assert all(preds3 == preds) # check whether sample_type and normalize_type work num_round = 50 - param['learning_rate'] = 0.1 - param['rate_drop'] = 0.1 + param["learning_rate"] = 0.1 + param["rate_drop"] = 0.1 preds_list = [] - for p in [[p0, p1] for p0 in ['uniform', 'weighted'] - for p1 in ['tree', 'forest']]: - param['sample_type'] = p[0] - param['normalize_type'] = p[1] - bst = xgb.train(param, dtrain, num_round, watchlist) + for p in [ + [p0, p1] for p0 in ["uniform", "weighted"] for p1 in ["tree", "forest"] + ]: + param["sample_type"] = p[0] + param["normalize_type"] = p[1] + bst = xgb.train(param, dtrain, num_round, evals=watchlist) preds = bst.predict(dtest, iteration_range=(0, num_round)) - err = sum(1 for i in range(len(preds)) - if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + err = sum( + 1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i] + ) / float(len(preds)) assert err < 0.1 preds_list.append(preds) @@ -143,53 +157,67 @@ def test_boost_from_existing_model(self) -> None: ) assert booster.num_boosted_rounds() == 8 - def run_custom_objective(self, tree_method=None): + def run_custom_objective(self, tree_method: Optional[str] = None): param = { - 'max_depth': 2, - 'eta': 1, - 'objective': 'reg:logistic', - "tree_method": tree_method + "max_depth": 2, + "eta": 1, + "objective": "reg:logistic", + "tree_method": tree_method, } dtrain, dtest = tm.load_agaricus(__file__) - watchlist = [(dtest, 'eval'), (dtrain, 'train')] + watchlist = [(dtest, "eval"), (dtrain, "train")] num_round = 10 - def logregobj(preds, dtrain): - labels = dtrain.get_label() - preds = 1.0 / (1.0 + np.exp(-preds)) - grad = preds - labels - hess = preds * (1.0 - preds) - return grad, hess - - def evalerror(preds, dtrain): - labels = dtrain.get_label() - preds = 1.0 / (1.0 + np.exp(-preds)) - return 'error', float(sum(labels != (preds > 0.5))) / len(labels) + def evalerror(preds: np.ndarray, dtrain: xgb.DMatrix): + return tm.eval_error_metric(preds, dtrain, rev_link=True) # test custom_objective 
in training - bst = xgb.train(param, dtrain, num_round, watchlist, obj=logregobj, - feval=evalerror) - assert isinstance(bst, xgb.core.Booster) + bst = xgb.train( + param, + dtrain, + num_round, + watchlist, + obj=tm.logregobj, + custom_metric=evalerror, + ) + assert isinstance(bst, xgb.Booster) preds = bst.predict(dtest) labels = dtest.get_label() - err = sum(1 for i in range(len(preds)) - if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + err = sum( + 1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i] + ) / float(len(preds)) assert err < 0.1 # test custom_objective in cross-validation - xgb.cv(param, dtrain, num_round, nfold=5, seed=0, - obj=logregobj, feval=evalerror) + xgb.cv( + param, + dtrain, + num_round, + nfold=5, + seed=0, + obj=tm.logregobj, + custom_metric=evalerror, + ) # test maximize parameter def neg_evalerror(preds, dtrain): labels = dtrain.get_label() - return 'error', float(sum(labels == (preds > 0.0))) / len(labels) - - bst2 = xgb.train(param, dtrain, num_round, watchlist, logregobj, - neg_evalerror, maximize=True) + preds = 1.0 / (1.0 + np.exp(-preds)) + return "error", float(sum(labels == (preds > 0.0))) / len(labels) + + bst2 = xgb.train( + param, + dtrain, + num_round, + evals=watchlist, + obj=tm.logregobj, + custom_metric=neg_evalerror, + maximize=True, + ) preds2 = bst2.predict(dtest) - err2 = sum(1 for i in range(len(preds2)) - if int(preds2[i] > 0.5) != labels[i]) / float(len(preds2)) + err2 = sum( + 1 for i in range(len(preds2)) if int(preds2[i] > 0.5) != labels[i] + ) / float(len(preds2)) assert err == err2 def test_custom_objective(self): @@ -197,36 +225,54 @@ def test_custom_objective(self): def test_multi_eval_metric(self): dtrain, dtest = tm.load_agaricus(__file__) - watchlist = [(dtest, 'eval'), (dtrain, 'train')] - param = {'max_depth': 2, 'eta': 0.2, 'verbosity': 1, - 'objective': 'binary:logistic'} - param['eval_metric'] = ["auc", "logloss", 'error'] + watchlist = [(dtest, "eval"), (dtrain, "train")] + param = { + "max_depth": 2, + "eta": 0.2, + "verbosity": 1, + "objective": "binary:logistic", + } + param["eval_metric"] = ["auc", "logloss", "error"] evals_result = {} - bst = xgb.train(param, dtrain, 4, watchlist, evals_result=evals_result) + bst = xgb.train(param, dtrain, 4, evals=watchlist, evals_result=evals_result) assert isinstance(bst, xgb.core.Booster) - assert len(evals_result['eval']) == 3 - assert set(evals_result['eval'].keys()) == {'auc', 'error', 'logloss'} + assert len(evals_result["eval"]) == 3 + assert set(evals_result["eval"].keys()) == {"auc", "error", "logloss"} def test_fpreproc(self): - param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} + param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} num_round = 2 def fpreproc(dtrain, dtest, param): label = dtrain.get_label() ratio = float(np.sum(label == 0)) / np.sum(label == 1) - param['scale_pos_weight'] = ratio + param["scale_pos_weight"] = ratio return (dtrain, dtest, param) dtrain, _ = tm.load_agaricus(__file__) - xgb.cv(param, dtrain, num_round, nfold=5, - metrics={'auc'}, seed=0, fpreproc=fpreproc) + xgb.cv( + param, + dtrain, + num_round, + nfold=5, + metrics={"auc"}, + seed=0, + fpreproc=fpreproc, + ) def test_show_stdv(self): - param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} + param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} num_round = 2 dtrain, _ = tm.load_agaricus(__file__) - xgb.cv(param, dtrain, num_round, nfold=5, - metrics={'error'}, seed=0, show_stdv=False) + xgb.cv( + param, + dtrain, 
+            num_round,
+            nfold=5,
+            metrics={"error"},
+            seed=0,
+            show_stdv=False,
+        )

     def test_prediction_cache(self) -> None:
         X, y = tm.make_sparse_regression(512, 4, 0.5, as_dense=False)
@@ -273,28 +319,34 @@ def validate_model(parameters):
             X = np.random.random((100, 30))
             y = np.random.randint(0, 4, size=(100,))

-            parameters['num_class'] = 4
+            parameters["num_class"] = 4
             m = xgb.DMatrix(X, y)
             booster = xgb.train(parameters, m)
-            dump = booster.get_dump(dump_format='json')
+            dump = booster.get_dump(dump_format="json")

             for i in range(len(dump)):
-                jsonschema.validate(instance=json.loads(dump[i]),
-                                    schema=schema)
+                jsonschema.validate(instance=json.loads(dump[i]), schema=schema)

         path = os.path.dirname(
-            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-        doc = os.path.join(path, 'doc', 'dump.schema')
-        with open(doc, 'r') as fd:
+            os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        )
+        doc = os.path.join(path, "doc", "dump.schema")
+        with open(doc, "r") as fd:
             schema = json.load(fd)

-        parameters = {'tree_method': 'hist', 'booster': 'gbtree',
-                      'objective': 'multi:softmax'}
+        parameters = {
+            "tree_method": "hist",
+            "booster": "gbtree",
+            "objective": "multi:softmax",
+        }
         validate_model(parameters)

-        parameters = {'tree_method': 'hist', 'booster': 'dart',
-                      'objective': 'multi:softmax'}
+        parameters = {
+            "tree_method": "hist",
+            "booster": "dart",
+            "objective": "multi:softmax",
+        }
         validate_model(parameters)

     def test_special_model_dump_characters(self) -> None:
@@ -363,7 +415,7 @@ def run_slice(
             sliced_trees = end * num_parallel_tree * num_classes
             assert sliced_trees == len(sliced.get_dump())

-            sliced = booster[: end]
+            sliced = booster[:end]
             sliced_trees = end * num_parallel_tree * num_classes
             assert sliced_trees == len(sliced.get_dump())
diff --git a/tests/python/test_callback.py b/tests/python/test_callback.py
index d2e7cb5c4b8e..1ee31d6610c1 100644
--- a/tests/python/test_callback.py
+++ b/tests/python/test_callback.py
@@ -1,8 +1,10 @@
 import json
 import os
 import tempfile
-from typing import Union
+from collections import namedtuple
+from typing import Tuple, Union

+import numpy as np
 import pytest

 import xgboost as xgb
@@ -12,21 +14,29 @@ pytestmark = pytest.mark.skipif(**tm.no_sklearn())


-class TestCallbacks:
-    @classmethod
-    def setup_class(cls):
-        from sklearn.datasets import load_breast_cancer
+BreastCancer = namedtuple("BreastCancer", ["full", "tr", "va"])
+
+
+@pytest.fixture
+def breast_cancer() -> BreastCancer:
+    from sklearn.datasets import load_breast_cancer
+
+    X, y = load_breast_cancer(return_X_y=True)
+
+    split = int(X.shape[0] * 0.8)
+    return BreastCancer(
+        full=(X, y),
+        tr=(X[:split, ...], y[:split, ...]),
+        va=(X[split:, ...], y[split:, ...]),
+    )
+

-        X, y = load_breast_cancer(return_X_y=True)
-        cls.X = X
-        cls.y = y
+def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]:
+    # No custom objective, receive transformed output
+    return tm.eval_error_metric(predt, dtrain, rev_link=False)

-        split = int(X.shape[0] * 0.8)
-        cls.X_train = X[:split, ...]
-        cls.y_train = y[:split, ...]
-        cls.X_valid = X[split:, ...]
-        cls.y_valid = y[split:, ...]
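# Editorial aside, not part of the patch: the module-level metric above encodes
# the `custom_metric` contract this change standardizes on. With a builtin
# objective the metric receives transformed predictions (probabilities here);
# with a custom objective such as `tm.logregobj` it receives raw margins and
# must apply the sigmoid itself, hence the `rev_link` switch in
# `tm.eval_error_metric`. A minimal sketch of both modes:
import numpy as np
import xgboost as xgb
from xgboost import testing as tm

X = np.random.rand(128, 4)
y = np.random.randint(0, 2, size=128)
Xy = xgb.DMatrix(X, y)

# Builtin objective: the metric already sees probabilities.
xgb.train(
    {"objective": "binary:logistic"},
    Xy,
    num_boost_round=4,
    evals=[(Xy, "Train")],
    custom_metric=lambda p, d: tm.eval_error_metric(p, d, rev_link=False),
)

# Custom objective: the metric sees raw margins and applies the sigmoid itself.
xgb.train(
    {"max_depth": 2},
    Xy,
    num_boost_round=4,
    evals=[(Xy, "Train")],
    obj=tm.logregobj,
    custom_metric=lambda p, d: tm.eval_error_metric(p, d, rev_link=True),
)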
+class TestCallbacks: def run_evaluation_monitor( self, D_train: xgb.DMatrix, @@ -70,9 +80,9 @@ def check_output(output: str) -> None: output = out.getvalue().strip() check_output(output) - def test_evaluation_monitor(self): - D_train = xgb.DMatrix(self.X_train, self.y_train) - D_valid = xgb.DMatrix(self.X_valid, self.y_valid) + def test_evaluation_monitor(self, breast_cancer: BreastCancer) -> None: + D_train = xgb.DMatrix(breast_cancer.tr[0], breast_cancer.tr[1]) + D_valid = xgb.DMatrix(breast_cancer.va[0], breast_cancer.va[1]) evals_result = {} rounds = 10 xgb.train( @@ -91,9 +101,9 @@ def test_evaluation_monitor(self): self.run_evaluation_monitor(D_train, D_valid, rounds, 4) self.run_evaluation_monitor(D_train, D_valid, rounds, rounds + 1) - def test_early_stopping(self): - D_train = xgb.DMatrix(self.X_train, self.y_train) - D_valid = xgb.DMatrix(self.X_valid, self.y_valid) + def test_early_stopping(self, breast_cancer: BreastCancer) -> None: + D_train = xgb.DMatrix(breast_cancer.tr[0], breast_cancer.tr[1]) + D_valid = xgb.DMatrix(breast_cancer.va[0], breast_cancer.va[1]) evals_result = {} rounds = 30 early_stopping_rounds = 5 @@ -109,9 +119,9 @@ def test_early_stopping(self): dump = booster.get_dump(dump_format="json") assert len(dump) - booster.best_iteration == early_stopping_rounds + 1 - def test_early_stopping_custom_eval(self): - D_train = xgb.DMatrix(self.X_train, self.y_train) - D_valid = xgb.DMatrix(self.X_valid, self.y_valid) + def test_early_stopping_custom_eval(self, breast_cancer: BreastCancer) -> None: + D_train = xgb.DMatrix(breast_cancer.tr[0], breast_cancer.tr[1]) + D_valid = xgb.DMatrix(breast_cancer.va[0], breast_cancer.va[1]) early_stopping_rounds = 5 booster = xgb.train( { @@ -121,7 +131,7 @@ def test_early_stopping_custom_eval(self): }, D_train, evals=[(D_train, "Train"), (D_valid, "Valid")], - feval=tm.eval_error_metric, + custom_metric=eval_error_metric, num_boost_round=1000, early_stopping_rounds=early_stopping_rounds, verbose_eval=False, @@ -129,9 +139,9 @@ def test_early_stopping_custom_eval(self): dump = booster.get_dump(dump_format="json") assert len(dump) - booster.best_iteration == early_stopping_rounds + 1 - def test_early_stopping_customize(self): - D_train = xgb.DMatrix(self.X_train, self.y_train) - D_valid = xgb.DMatrix(self.X_valid, self.y_valid) + def test_early_stopping_customize(self, breast_cancer: BreastCancer) -> None: + D_train = xgb.DMatrix(breast_cancer.tr[0], breast_cancer.tr[1]) + D_valid = xgb.DMatrix(breast_cancer.va[0], breast_cancer.va[1]) early_stopping_rounds = 5 early_stop = xgb.callback.EarlyStopping( rounds=early_stopping_rounds, metric_name="CustomErr", data_name="Train" @@ -145,7 +155,7 @@ def test_early_stopping_customize(self): }, D_train, evals=[(D_train, "Train"), (D_valid, "Valid")], - feval=tm.eval_error_metric, + custom_metric=eval_error_metric, num_boost_round=1000, callbacks=[early_stop], verbose_eval=False, @@ -170,7 +180,8 @@ def test_early_stopping_customize(self): }, D_train, evals=[(D_train, "Train"), (D_valid, "Valid")], - feval=tm.eval_error_metric, + # No custom objective, transformed output + custom_metric=eval_error_metric, num_boost_round=rounds, callbacks=[early_stop], verbose_eval=False, @@ -179,10 +190,8 @@ def test_early_stopping_customize(self): assert booster.best_iteration == 0 assert booster.num_boosted_rounds() == 1 - def test_early_stopping_skl(self): - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True) + def test_early_stopping_skl(self, breast_cancer: 
BreastCancer) -> None: + X, y = breast_cancer.full early_stopping_rounds = 5 cls = xgb.XGBClassifier( early_stopping_rounds=early_stopping_rounds, eval_metric="error" @@ -192,10 +201,8 @@ def test_early_stopping_skl(self): dump = booster.get_dump(dump_format="json") assert len(dump) - booster.best_iteration == early_stopping_rounds + 1 - def test_early_stopping_custom_eval_skl(self): - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True) + def test_early_stopping_custom_eval_skl(self, breast_cancer: BreastCancer) -> None: + X, y = breast_cancer.full early_stopping_rounds = 5 early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds) cls = xgb.XGBClassifier( @@ -206,10 +213,8 @@ def test_early_stopping_custom_eval_skl(self): dump = booster.get_dump(dump_format="json") assert len(dump) - booster.best_iteration == early_stopping_rounds + 1 - def test_early_stopping_save_best_model(self): - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True) + def test_early_stopping_save_best_model(self, breast_cancer: BreastCancer) -> None: + X, y = breast_cancer.full n_estimators = 100 early_stopping_rounds = 5 early_stop = xgb.callback.EarlyStopping( @@ -248,10 +253,8 @@ def test_early_stopping_save_best_model(self): callbacks=[early_stop], ).fit(X, y, eval_set=[(X, y)]) - def test_early_stopping_continuation(self): - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True) + def test_early_stopping_continuation(self, breast_cancer: BreastCancer) -> None: + X, y = breast_cancer.full early_stopping_rounds = 5 early_stop = xgb.callback.EarlyStopping( @@ -283,7 +286,23 @@ def test_early_stopping_continuation(self): == booster.best_iteration + early_stopping_rounds + 1 ) - def run_eta_decay(self, tree_method): + def test_early_stopping_multiple_metrics(self): + from sklearn.datasets import make_classification + + X, y = make_classification(random_state=1994) + # AUC approaches 1.0 real quick. 
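# Editorial note: early stopping tracks only the *last* metric in `eval_metric`.
# Here "auc" comes last and saturates within a few rounds, so training stops
# early; the second classifier below puts "logloss" last and keeps improving for
# far longer before stopping.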
+ clf = xgb.XGBClassifier(eval_metric=["logloss", "auc"], early_stopping_rounds=2) + clf.fit(X, y, eval_set=[(X, y)]) + assert clf.best_iteration < 8 + assert clf.evals_result()["validation_0"]["auc"][-1] > 0.99 + + clf = xgb.XGBClassifier(eval_metric=["auc", "logloss"], early_stopping_rounds=2) + clf.fit(X, y, eval_set=[(X, y)]) + + assert clf.best_iteration > 50 + assert clf.evals_result()["validation_0"]["auc"][-1] > 0.99 + + def run_eta_decay(self, tree_method: str) -> None: """Test learning rate scheduler, used by both CPU and GPU tests.""" scheduler = xgb.callback.LearningRateScheduler @@ -457,10 +476,8 @@ def test_eta_decay(self, tree_method: str) -> None: def test_eta_decay_leaf_output(self, tree_method: str, objective: str) -> None: self.run_eta_decay_leaf_output(tree_method, objective) - def test_check_point(self) -> None: - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True) + def test_check_point(self, breast_cancer: BreastCancer) -> None: + X, y = breast_cancer.full m = xgb.DMatrix(X, y) with tempfile.TemporaryDirectory() as tmpdir: check_point = xgb.callback.TrainingCheckPoint( @@ -509,10 +526,8 @@ def test_callback_list(self) -> None: ) assert len(callbacks) == 1 - def test_attribute_error(self) -> None: - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True) + def test_attribute_error(self, breast_cancer: BreastCancer) -> None: + X, y = breast_cancer.full clf = xgb.XGBClassifier(n_estimators=8) clf.fit(X, y, eval_set=[(X, y)]) diff --git a/tests/python/test_early_stopping.py b/tests/python/test_early_stopping.py index a275a8077b71..32afb5f75f51 100644 --- a/tests/python/test_early_stopping.py +++ b/tests/python/test_early_stopping.py @@ -1,3 +1,5 @@ +from typing import Tuple + import numpy as np import pytest @@ -14,9 +16,7 @@ def test_early_stopping_nonparallel(self): from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split - digits = load_digits(n_class=2) - X = digits["data"] - y = digits["target"] + X, y = load_digits(n_class=2, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf1 = xgb.XGBClassifier( learning_rate=0.1, early_stopping_rounds=5, eval_metric="auc" @@ -47,50 +47,64 @@ def test_early_stopping_nonparallel(self): assert clf3.best_score == 1 - def evalerror(self, preds, dtrain): - from sklearn.metrics import mean_squared_error - - labels = dtrain.get_label() - preds = 1.0 / (1.0 + np.exp(-preds)) - return 'rmse', mean_squared_error(labels, preds) - @staticmethod def assert_metrics_length(cv, expected_length): for key, value in cv.items(): assert len(value) == expected_length @pytest.mark.skipif(**tm.no_sklearn()) - def test_cv_early_stopping(self): + def test_cv_early_stopping(self) -> None: from sklearn.datasets import load_digits - digits = load_digits(n_class=2) - X = digits['data'] - y = digits['target'] + X, y = load_digits(n_class=2, return_X_y=True) dm = xgb.DMatrix(X, label=y) params = { - 'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic', - 'eval_metric': 'error' + "max_depth": 2, + "eta": 1, + "objective": "binary:logistic", + "eval_metric": "error", } - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - early_stopping_rounds=10) + def evalerror(preds: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]: + from sklearn.metrics import mean_squared_error + + labels = dtrain.get_label() + return "rmse", mean_squared_error(labels, preds) + + cv = xgb.cv(params, dm, 
num_boost_round=10, nfold=10, early_stopping_rounds=10) self.assert_metrics_length(cv, 10) - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - early_stopping_rounds=5) + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, early_stopping_rounds=5) self.assert_metrics_length(cv, 3) - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - early_stopping_rounds=1) + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, early_stopping_rounds=1) self.assert_metrics_length(cv, 1) - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - feval=self.evalerror, early_stopping_rounds=10) + cv = xgb.cv( + params, + dm, + num_boost_round=10, + nfold=10, + custom_metric=evalerror, + early_stopping_rounds=10, + ) self.assert_metrics_length(cv, 10) - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - feval=self.evalerror, early_stopping_rounds=1) + cv = xgb.cv( + params, + dm, + num_boost_round=10, + nfold=10, + custom_metric=evalerror, + early_stopping_rounds=1, + ) self.assert_metrics_length(cv, 5) - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - feval=self.evalerror, maximize=True, - early_stopping_rounds=1) + cv = xgb.cv( + params, + dm, + num_boost_round=10, + nfold=10, + custom_metric=evalerror, + maximize=True, + early_stopping_rounds=1, + ) self.assert_metrics_length(cv, 1) @pytest.mark.skipif(**tm.no_sklearn()) @@ -100,21 +114,35 @@ def test_cv_early_stopping_with_multiple_eval_sets_and_metrics(self): X, y = load_breast_cancer(return_X_y=True) dm = xgb.DMatrix(X, label=y) - params = {'objective':'binary:logistic'} + params = {"objective": "binary:logistic"} - metrics = [['auc'], ['error'], ['logloss'], - ['logloss', 'auc'], ['logloss', 'error'], ['error', 'logloss']] + metrics = [ + ["auc"], + ["error"], + ["logloss"], + ["logloss", "auc"], + ["logloss", "error"], + ["error", "logloss"], + ] num_iteration_history = [] # If more than one metrics is given, early stopping should use the last metric for i, m in enumerate(metrics): - result = xgb.cv(params, dm, num_boost_round=1000, nfold=5, stratified=True, - metrics=m, early_stopping_rounds=20, seed=42) + result = xgb.cv( + params, + dm, + num_boost_round=1000, + nfold=5, + stratified=True, + metrics=m, + early_stopping_rounds=20, + seed=42, + ) num_iteration_history.append(len(result)) - df = result['test-{}-mean'.format(m[-1])] + df = result["test-{}-mean".format(m[-1])] # When early stopping is invoked, the last metric should be as best it can be. 
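# Editorial note: "as best it can be" depends on the metric's direction. AUC is
# maximized, so every earlier value is at most the final one; error and logloss
# are minimized, so every earlier value is at least the final one, which is what
# the two branches below assert.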
- if m[-1] == 'auc': + if m[-1] == "auc": assert np.all(df <= df.iloc[-1]) else: assert np.all(df >= df.iloc[-1]) diff --git a/tests/python/test_eval_metrics.py b/tests/python/test_eval_metrics.py index 2ee8c02cc2b5..b02f348013fb 100644 --- a/tests/python/test_eval_metrics.py +++ b/tests/python/test_eval_metrics.py @@ -92,7 +92,7 @@ def test_eval_metrics(self): 10, watchlist, early_stopping_rounds=2, - feval=self.evalerror_01, + custom_metric=self.evalerror_01, ) gbdt_02 = xgb.train( self.xgb_params_02, @@ -100,7 +100,7 @@ def test_eval_metrics(self): 10, watchlist, early_stopping_rounds=2, - feval=self.evalerror_02, + custom_metric=self.evalerror_02, ) gbdt_03 = xgb.train( self.xgb_params_03, @@ -108,7 +108,7 @@ def test_eval_metrics(self): 10, watchlist, early_stopping_rounds=2, - feval=self.evalerror_03, + custom_metric=self.evalerror_03, ) gbdt_04 = xgb.train( self.xgb_params_04, @@ -116,7 +116,7 @@ def test_eval_metrics(self): 10, watchlist, early_stopping_rounds=2, - feval=self.evalerror_04, + custom_metric=self.evalerror_04, ) assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0] assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0] diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py index 53e263b5e06e..680ed025f15b 100644 --- a/tests/test_distributed/test_with_dask/test_with_dask.py +++ b/tests/test_distributed/test_with_dask/test_with_dask.py @@ -2153,6 +2153,9 @@ def test_early_stopping_custom_eval(self, client: "Client") -> None: X, y = da.from_array(X), da.from_array(y) m = dxgb.DaskDMatrix(client, X, y) + def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix): + return tm.eval_error_metric(predt, dtrain, rev_link=False) + valid = dxgb.DaskDMatrix(client, X, y) early_stopping_rounds = 5 booster = dxgb.train( @@ -2164,7 +2167,7 @@ def test_early_stopping_custom_eval(self, client: "Client") -> None: }, m, evals=[(m, "Train"), (valid, "Valid")], - feval=tm.eval_error_metric, + custom_metric=eval_error_metric, num_boost_round=1000, early_stopping_rounds=early_stopping_rounds, )["booster"] diff --git a/tests/test_distributed/test_with_spark/test_spark_local.py b/tests/test_distributed/test_with_spark/test_spark_local.py index 79569c7fd373..5f0dafd9d6be 100644 --- a/tests/test_distributed/test_with_spark/test_spark_local.py +++ b/tests/test_distributed/test_with_spark/test_spark_local.py @@ -9,14 +9,6 @@ import numpy as np import pytest from pyspark import SparkConf - -import xgboost as xgb -from xgboost import testing as tm -from xgboost.collective import Config -from xgboost.spark.data import pred_contribs - -pytestmark = [tm.timeout(60), pytest.mark.skipif(**tm.no_spark())] - from pyspark.ml import Pipeline, PipelineModel from pyspark.ml.evaluation import BinaryClassificationEvaluator from pyspark.ml.feature import VectorAssembler @@ -26,7 +18,10 @@ from pyspark.sql import SparkSession from pyspark.sql import functions as spark_sql_func +import xgboost as xgb from xgboost import XGBClassifier, XGBModel, XGBRegressor +from xgboost import testing as tm +from xgboost.collective import Config from xgboost.spark import ( SparkXGBClassifier, SparkXGBClassifierModel, @@ -35,11 +30,14 @@ SparkXGBRegressorModel, ) from xgboost.spark.core import _non_booster_params +from xgboost.spark.data import pred_contribs from .utils import SparkTestCase logging.getLogger("py4j").setLevel(logging.INFO) +pytestmark = [tm.timeout(60), pytest.mark.skipif(**tm.no_spark())] + def no_sparse_unwrap() -> 
tm.PytestSkip: try: