From 3a56a6024e0d8b239801cce558381807c24ba3d0 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 19 Mar 2021 22:48:52 +0100 Subject: [PATCH] Prune metrics: other classification 7/n (#6584) * confusion_matrix * iou * f_beta * hamming_distance * stat_scores * tests * flake8 * chlog --- CHANGELOG.md | 2 + .../classification/confusion_matrix.py | 90 +---- .../metrics/classification/f_beta.py | 180 +--------- .../classification/hamming_distance.py | 85 +---- .../metrics/classification/iou.py | 83 +---- .../metrics/classification/stat_scores.py | 239 +------------- .../metrics/functional/confusion_matrix.py | 75 +---- .../metrics/functional/f_beta.py | 120 +------ .../metrics/functional/hamming_distance.py | 58 +--- pytorch_lightning/metrics/functional/iou.py | 88 +---- .../metrics/functional/stat_scores.py | 271 +-------------- tests/metrics/classification/__init__.py | 0 tests/metrics/classification/inputs.py | 66 ---- .../classification/test_confusion_matrix.py | 128 ------- tests/metrics/classification/test_f_beta.py | 153 --------- .../classification/test_hamming_distance.py | 80 ----- tests/metrics/classification/test_inputs.py | 312 ------------------ tests/metrics/classification/test_iou.py | 216 ------------ .../classification/test_stat_scores.py | 255 -------------- tests/metrics/test_remove_1-5_metrics.py | 75 +++++ 20 files changed, 155 insertions(+), 2421 deletions(-) delete mode 100644 tests/metrics/classification/__init__.py delete mode 100644 tests/metrics/classification/inputs.py delete mode 100644 tests/metrics/classification/test_confusion_matrix.py delete mode 100644 tests/metrics/classification/test_f_beta.py delete mode 100644 tests/metrics/classification/test_hamming_distance.py delete mode 100644 tests/metrics/classification/test_inputs.py delete mode 100644 tests/metrics/classification/test_iou.py delete mode 100644 tests/metrics/classification/test_stat_scores.py diff --git a/CHANGELOG.md b/CHANGELOG.md index bd8f5e31770d2..01c7ae193555a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -78,6 +78,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). [#6573](https://github.com/PyTorchLightning/pytorch-lightning/pull/6573), + [#6584](https://github.com/PyTorchLightning/pytorch-lightning/pull/6584), + ) diff --git a/pytorch_lightning/metrics/classification/confusion_matrix.py b/pytorch_lightning/metrics/classification/confusion_matrix.py index 112fb4940e6e2..aacd8dcf3b498 100644 --- a/pytorch_lightning/metrics/classification/confusion_matrix.py +++ b/pytorch_lightning/metrics/classification/confusion_matrix.py @@ -13,64 +13,14 @@ # limitations under the License. from typing import Any, Optional -import torch -from torchmetrics import Metric +from torchmetrics import ConfusionMatrix as _ConfusionMatrix -from pytorch_lightning.metrics.functional.confusion_matrix import _confusion_matrix_compute, _confusion_matrix_update +from pytorch_lightning.utilities.deprecation import deprecated -class ConfusionMatrix(Metric): - """ - Computes the `confusion matrix - `_. Works with binary, - multiclass, and multilabel data. Accepts probabilities from a model output or - integer class values in prediction. Works with multi-dimensional preds and - target. - - Note: - This metric produces a multi-dimensional output, so it can not be directly logged. 
- - Forward accepts - - - ``preds`` (float or long tensor): ``(N, ...)`` or ``(N, C, ...)`` where C is the number of classes - - ``target`` (long tensor): ``(N, ...)`` - - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument - to convert into integer labels. This is the case for binary and multi-label probabilities. - - If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. - - Args: - num_classes: Number of classes in the dataset. - normalize: Normalization mode for confusion matrix. Choose from - - - ``None`` or ``'none'``: no normalization (default) - - ``'true'``: normalization over the targets (most commonly used) - - ``'pred'``: normalization over the predictions - - ``'all'``: normalization over the whole matrix - - threshold: - Threshold value for binary or multi-label probabilites. default: 0.5 - compute_on_step: - Forward only calls ``update()`` and return None if this is set to False. default: True - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. default: False - process_group: - Specify the process group on which synchronization is called. default: None (which selects the entire world) - - Example: - - >>> from pytorch_lightning.metrics import ConfusionMatrix - >>> target = torch.tensor([1, 1, 0, 0]) - >>> preds = torch.tensor([0, 1, 0, 0]) - >>> confmat = ConfusionMatrix(num_classes=2) - >>> confmat(preds, target) - tensor([[2., 0.], - [1., 1.]]) - - """ +class ConfusionMatrix(_ConfusionMatrix): + @deprecated(target=_ConfusionMatrix, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, num_classes: int, @@ -80,35 +30,9 @@ def __init__( dist_sync_on_step: bool = False, process_group: Optional[Any] = None, ): - - super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - ) - self.num_classes = num_classes - self.normalize = normalize - self.threshold = threshold - - allowed_normalize = ('true', 'pred', 'all', 'none', None) - assert self.normalize in allowed_normalize, \ - f"Argument average needs to one of the following: {allowed_normalize}" - - self.add_state("confmat", default=torch.zeros(num_classes, num_classes), dist_reduce_fx="sum") - - def update(self, preds: torch.Tensor, target: torch.Tensor): - """ - Update state with predictions and targets. - - Args: - preds: Predictions from model - target: Ground truth values """ - confmat = _confusion_matrix_update(preds, target, self.num_classes, self.threshold) - self.confmat += confmat + This implementation refers to :class:`~torchmetrics.ConfusionMatrix`. - def compute(self) -> torch.Tensor: - """ - Computes confusion matrix + .. deprecated:: + Use :class:`~torchmetrics.ConfusionMatrix`. Will be removed in v1.5.0. """ - return _confusion_matrix_compute(self.confmat, self.normalize) diff --git a/pytorch_lightning/metrics/classification/f_beta.py b/pytorch_lightning/metrics/classification/f_beta.py index a46b01a1aa8b7..bac3cc3e99c4e 100644 --- a/pytorch_lightning/metrics/classification/f_beta.py +++ b/pytorch_lightning/metrics/classification/f_beta.py @@ -13,72 +13,15 @@ # limitations under the License. 
from typing import Any, Optional -import torch -from torchmetrics import Metric +from torchmetrics import F1 as _F1 +from torchmetrics import FBeta as _FBeta -from pytorch_lightning.metrics.functional.f_beta import _fbeta_compute, _fbeta_update -from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities.deprecation import deprecated -class FBeta(Metric): - r""" - Computes `F-score `_, specifically: - - .. math:: - F_\beta = (1 + \beta^2) * \frac{\text{precision} * \text{recall}} - {(\beta^2 * \text{precision}) + \text{recall}} - - Where :math:`\beta` is some positive real factor. Works with binary, multiclass, and multilabel data. - Accepts probabilities from a model output or integer class values in prediction. - Works with multi-dimensional preds and target. - - Forward accepts - - - ``preds`` (float or long tensor): ``(N, ...)`` or ``(N, C, ...)`` where C is the number of classes - - ``target`` (long tensor): ``(N, ...)`` - - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument - to convert into integer labels. This is the case for binary and multi-label probabilities. - - If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. - - Args: - num_classes: Number of classes in the dataset. - beta: Beta coefficient in the F measure. - threshold: - Threshold value for binary or multi-label probabilities. default: 0.5 - - average: - - ``'micro'`` computes metric globally - - ``'macro'`` computes metric for each class and uniformly averages them - - ``'weighted'`` computes metric for each class and does a weighted-average, - where each class is weighted by their support (accounts for class imbalance) - - ``'none'`` or ``None`` computes and returns the metric per class - - multilabel: If predictions are from multilabel classification. - compute_on_step: - Forward only calls ``update()`` and return None if this is set to False. default: True - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. default: False - process_group: - Specify the process group on which synchronization is called. default: None (which selects the entire world) - - Raises: - ValueError: - If ``average`` is none of ``"micro"``, ``"macro"``, ``"weighted"``, ``"none"``, ``None``. 
- - Example: - - >>> from pytorch_lightning.metrics import FBeta - >>> target = torch.tensor([0, 1, 2, 0, 1, 2]) - >>> preds = torch.tensor([0, 2, 1, 0, 0, 1]) - >>> f_beta = FBeta(num_classes=3, beta=0.5) - >>> f_beta(preds, target) - tensor(0.3333) - - """ +class FBeta(_FBeta): + @deprecated(target=_FBeta, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, num_classes: int, @@ -90,103 +33,17 @@ def __init__( dist_sync_on_step: bool = False, process_group: Optional[Any] = None, ): - super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - ) - - self.num_classes = num_classes - self.beta = beta - self.threshold = threshold - self.average = average - self.multilabel = multilabel - - allowed_average = ("micro", "macro", "weighted", "none", None) - if self.average not in allowed_average: - raise ValueError( - 'Argument `average` expected to be one of the following:' - f' {allowed_average} but got {self.average}' - ) - - self.add_state("true_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") - self.add_state("predicted_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") - self.add_state("actual_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") - - def update(self, preds: torch.Tensor, target: torch.Tensor): - """ - Update state with predictions and targets. - - Args: - preds: Predictions from model - target: Ground truth values """ - true_positives, predicted_positives, actual_positives = _fbeta_update( - preds, target, self.num_classes, self.threshold, self.multilabel - ) - - self.true_positives += true_positives - self.predicted_positives += predicted_positives - self.actual_positives += actual_positives + This implementation refers to :class:`~torchmetrics.FBeta`. - def compute(self) -> torch.Tensor: + .. deprecated:: + Use :class:`~torchmetrics.FBeta`. Will be removed in v1.5.0. """ - Computes fbeta over state. - """ - return _fbeta_compute( - self.true_positives, self.predicted_positives, self.actual_positives, self.beta, self.average - ) - - -class F1(FBeta): - """ - Computes F1 metric. F1 metrics correspond to a harmonic mean of the - precision and recall scores. - - Works with binary, multiclass, and multilabel data. - Accepts logits from a model output or integer class values in prediction. - Works with multi-dimensional preds and target. - Forward accepts - - ``preds`` (float or long tensor): ``(N, ...)`` or ``(N, C, ...)`` where C is the number of classes - - ``target`` (long tensor): ``(N, ...)`` - - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument. - This is the case for binary and multi-label logits. - - If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. - - Args: - num_classes: Number of classes in the dataset. - threshold: - Threshold value for binary or multi-label logits. default: 0.5 - - average: - - ``'micro'`` computes metric globally - - ``'macro'`` computes metric for each class and uniformly averages them - - ``'weighted'`` computes metric for each class and does a weighted-average, - where each class is weighted by their support (accounts for class imbalance) - - ``'none'`` or ``None`` computes and returns the metric per class - - multilabel: If predictions are from multilabel classification. - compute_on_step: - Forward only calls ``update()`` and returns None if this is set to False. 
default: True - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. default: False - process_group: - Specify the process group on which synchronization is called. default: None (which selects the entire world) - - Example: - >>> from pytorch_lightning.metrics import F1 - >>> target = torch.tensor([0, 1, 2, 0, 1, 2]) - >>> preds = torch.tensor([0, 2, 1, 0, 0, 1]) - >>> f1 = F1(num_classes=3) - >>> f1(preds, target) - tensor(0.3333) - """ +class F1(_F1): + @deprecated(target=_F1, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, num_classes: int, @@ -197,16 +54,9 @@ def __init__( dist_sync_on_step: bool = False, process_group: Optional[Any] = None, ): - if multilabel is not False: - rank_zero_warn(f'The `multilabel={multilabel}` parameter is unused and will not have any effect.') + """ + This implementation refers to :class:`~torchmetrics.F1`. - super().__init__( - num_classes=num_classes, - beta=1.0, - threshold=threshold, - average=average, - multilabel=multilabel, - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - ) + .. deprecated:: + Use :class:`~torchmetrics.F1`. Will be removed in v1.5.0. + """ diff --git a/pytorch_lightning/metrics/classification/hamming_distance.py b/pytorch_lightning/metrics/classification/hamming_distance.py index dceb90c0a4ca9..b59c3e1053ab8 100644 --- a/pytorch_lightning/metrics/classification/hamming_distance.py +++ b/pytorch_lightning/metrics/classification/hamming_distance.py @@ -13,59 +13,14 @@ # limitations under the License. from typing import Any, Callable, Optional -import torch -from torchmetrics import Metric +from torchmetrics import HammingDistance as _HammingDistance -from pytorch_lightning.metrics.functional.hamming_distance import _hamming_distance_compute, _hamming_distance_update +from pytorch_lightning.utilities.deprecation import deprecated -class HammingDistance(Metric): - r""" - Computes the average `Hamming distance `_ (also - known as Hamming loss) between targets and predictions: - - .. math:: - \text{Hamming distance} = \frac{1}{N \cdot L}\sum_i^N \sum_l^L 1(y_{il} \neq \hat{y_{il}}) - - Where :math:`y` is a tensor of target values, :math:`\hat{y}` is a tensor of predictions, - and :math:`\bullet_{il}` refers to the :math:`l`-th label of the :math:`i`-th sample of that - tensor. - - This is the same as ``1-accuracy`` for binary data, while for all other types of inputs it - treats each possible label separately - meaning that, for example, multi-class data is - treated as if it were multi-label. - - Args: - threshold: - Threshold probability value for transforming probability predictions to binary - (0 or 1) predictions, in the case of binary or multi-label inputs. - compute_on_step: - Forward only calls ``update()`` and return ``None`` if this is set to ``False``. - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. - process_group: - Specify the process group on which synchronization is called. - default: ``None`` (which selects the entire world) - dist_sync_fn: - Callback that performs the allgather operation on the metric state. When ``None``, DDP - will be used to perform the all gather. - - Raises: - ValueError: - If ``threshold`` is not between ``0`` and ``1``. 
- - Example: - - >>> from pytorch_lightning.metrics import HammingDistance - >>> target = torch.tensor([[0, 1], [1, 1]]) - >>> preds = torch.tensor([[0, 1], [0, 1]]) - >>> hamming_distance = HammingDistance() - >>> hamming_distance(preds, target) - tensor(0.2500) - - """ +class HammingDistance(_HammingDistance): + @deprecated(target=_HammingDistance, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, threshold: float = 0.5, @@ -74,35 +29,9 @@ def __init__( process_group: Optional[Any] = None, dist_sync_fn: Callable = None, ): - super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - dist_sync_fn=dist_sync_fn, - ) - - self.add_state("correct", default=torch.tensor(0), dist_reduce_fx="sum") - self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum") - - if not 0 < threshold < 1: - raise ValueError("The `threshold` should lie in the (0,1) interval.") - self.threshold = threshold - - def update(self, preds: torch.Tensor, target: torch.Tensor): """ - Update state with predictions and targets. + This implementation refers to :class:`~torchmetrics.HammingDistance`. - Args: - preds: Predictions from model (probabilities, or labels) - target: Ground truth labels - """ - correct, total = _hamming_distance_update(preds, target, self.threshold) - - self.correct += correct - self.total += total - - def compute(self) -> torch.Tensor: - """ - Computes hamming distance based on inputs passed in to ``update`` previously. + .. deprecated:: + Use :class:`~torchmetrics.HammingDistance`. Will be removed in v1.5.0. """ - return _hamming_distance_compute(self.correct, self.total) diff --git a/pytorch_lightning/metrics/classification/iou.py b/pytorch_lightning/metrics/classification/iou.py index a261b767a8190..d5b5d8eeb47e2 100644 --- a/pytorch_lightning/metrics/classification/iou.py +++ b/pytorch_lightning/metrics/classification/iou.py @@ -13,70 +13,14 @@ # limitations under the License. from typing import Any, Optional -import torch +from torchmetrics import IoU as _IoU -from pytorch_lightning.metrics.classification.confusion_matrix import ConfusionMatrix -from pytorch_lightning.metrics.functional.iou import _iou_from_confmat +from pytorch_lightning.utilities.deprecation import deprecated -class IoU(ConfusionMatrix): - r""" - Computes `Intersection over union, or Jaccard index calculation `_: - - .. math:: J(A,B) = \frac{|A\cap B|}{|A\cup B|} - - Where: :math:`A` and :math:`B` are both tensors of the same size, containing integer class values. - They may be subject to conversion from input data (see description below). Note that it is different from box IoU. - - Works with binary, multiclass and multi-label data. - Accepts probabilities from a model output or integer class values in prediction. - Works with multi-dimensional preds and target. - - Forward accepts - - - ``preds`` (float or long tensor): ``(N, ...)`` or ``(N, C, ...)`` where C is the number of classes - - ``target`` (long tensor): ``(N, ...)`` - - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument - to convert into integer labels. This is the case for binary and multi-label probabilities. - - If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. - - Args: - num_classes: Number of classes in the dataset. - ignore_index: optional int specifying a target class to ignore. 
If given, this class index does not contribute - to the returned score, regardless of reduction method. Has no effect if given an int that is not in the - range [0, num_classes-1]. By default, no index is ignored, and all classes are used. - absent_score: score to use for an individual class, if no instances of the class index were present in - `pred` AND no instances of the class index were present in `target`. For example, if we have 3 classes, - [0, 0] for `pred`, and [0, 2] for `target`, then class 1 would be assigned the `absent_score`. - threshold: - Threshold value for binary or multi-label probabilities. - reduction: a method to reduce metric score over labels. - - - ``'elementwise_mean'``: takes the mean (default) - - ``'sum'``: takes the sum - - ``'none'``: no reduction will be applied - - compute_on_step: - Forward only calls ``update()`` and return None if this is set to False. - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. - process_group: - Specify the process group on which synchronization is called. default: None (which selects the entire world) - - Example: - >>> from pytorch_lightning.metrics import IoU - >>> target = torch.randint(0, 2, (10, 25, 25)) - >>> pred = torch.tensor(target) - >>> pred[2:5, 7:13, 9:15] = 1 - pred[2:5, 7:13, 9:15] - >>> iou = IoU(num_classes=2) - >>> iou(pred, target) - tensor(0.9660) - - """ +class IoU(_IoU): + @deprecated(target=_IoU, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, num_classes: int, @@ -88,20 +32,9 @@ def __init__( dist_sync_on_step: bool = False, process_group: Optional[Any] = None, ): - super().__init__( - num_classes=num_classes, - normalize=None, - threshold=threshold, - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - ) - self.reduction = reduction - self.ignore_index = ignore_index - self.absent_score = absent_score - - def compute(self) -> torch.Tensor: """ - Computes intersection over union (IoU) + This implementation refers to :class:`~torchmetrics.IoU`. + + .. deprecated:: + Use :class:`~torchmetrics.IoU`. Will be removed in v1.5.0. """ - return _iou_from_confmat(self.confmat, self.num_classes, self.ignore_index, self.absent_score, self.reduction) diff --git a/pytorch_lightning/metrics/classification/stat_scores.py b/pytorch_lightning/metrics/classification/stat_scores.py index 672b0f41c6fc5..2c4764477b262 100644 --- a/pytorch_lightning/metrics/classification/stat_scores.py +++ b/pytorch_lightning/metrics/classification/stat_scores.py @@ -11,120 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Callable, Optional, Tuple +from typing import Any, Callable, Optional -import torch -from torchmetrics import Metric +from torchmetrics import StatScores as _StatScores -from pytorch_lightning.metrics.functional.stat_scores import _stat_scores_compute, _stat_scores_update +from pytorch_lightning.utilities.deprecation import deprecated -class StatScores(Metric): - """Computes the number of true positives, false positives, true negatives, false negatives. - Related to `Type I and Type II errors `__ - and the `confusion matrix `__. 
- - The reduction method (how the statistics are aggregated) is controlled by the - ``reduce`` parameter, and additionally by the ``mdmc_reduce`` parameter in the - multi-dimensional multi-class case. - - Args: - threshold: - Threshold probability value for transforming probability predictions to binary - (0 or 1) predictions, in the case of binary or multi-label inputs. - - top_k: - Number of highest probability entries for each sample to convert to 1s - relevant - only for inputs with probability predictions. If this parameter is set for multi-label - inputs, it will take precedence over ``threshold``. For (multi-dim) multi-class inputs, - this parameter defaults to 1. - - Should be left unset (``None``) for inputs with label predictions. - - reduce: - Defines the reduction that is applied. Should be one of the following: - - - ``'micro'`` [default]: Counts the statistics by summing over all [sample, class] - combinations (globally). Each statistic is represented by a single integer. - - ``'macro'``: Counts the statistics for each class separately (over all samples). - Each statistic is represented by a ``(C,)`` tensor. Requires ``num_classes`` - to be set. - - ``'samples'``: Counts the statistics for each sample separately (over all classes). - Each statistic is represented by a ``(N, )`` 1d tensor. - - Note that what is considered a sample in the multi-dimensional multi-class case - depends on the value of ``mdmc_reduce``. - - num_classes: - Number of classes. Necessary for (multi-dimensional) multi-class or multi-label data. - - ignore_index: - Specify a class (label) to ignore. If given, this class index does not contribute - to the returned score, regardless of reduction method. If an index is ignored, and - ``reduce='macro'``, the class statistics for the ignored class will all be returned - as ``-1``. - - mdmc_reduce: - Defines how the multi-dimensional multi-class inputs are handeled. Should be - one of the following: - - - ``None`` [default]: Should be left unchanged if your data is not multi-dimensional multi-class. - - - ``'samplewise'``: In this case, the statistics are computed separately for each - sample on the ``N`` axis, and then the outputs are concatenated together. In each - sample the extra axes ``...`` are flattened to become the sub-sample axis, and - statistics for each sample are computed by treating the sub-sample axis as the - ``N`` axis for that sample. - - - ``'global'``: In this case the ``N`` and ``...`` dimensions of the inputs are - flattened into a new ``N_X`` sample axis, i.e. the inputs are treated as if they - were ``(N_X, C)``. From here on the ``reduce`` parameter applies as usual. - - is_multiclass: - Used only in certain special cases, where you want to treat inputs as a different type - than what they appear to be. - - compute_on_step: - Forward only calls ``update()`` and return ``None`` if this is set to ``False``. - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step - process_group: - Specify the process group on which synchronization is called. - default: ``None`` (which selects the entire world) - dist_sync_fn: - Callback that performs the allgather operation on the metric state. When ``None``, DDP - will be used to perform the allgather. - - Raises: - ValueError: - If ``threshold`` is not a ``float`` between ``0`` and ``1``. - ValueError: - If ``reduce`` is none of ``"micro"``, ``"macro"`` or ``"samples"``. 
- ValueError: - If ``mdmc_reduce`` is none of ``None``, ``"samplewise"``, ``"global"``. - ValueError: - If ``reduce`` is set to ``"macro"`` and ``num_classes`` is not provided. - ValueError: - If ``num_classes`` is set - and ``ignore_index`` is not in the range ``0`` <= ``ignore_index`` < ``num_classes``. - - Example: - - >>> from pytorch_lightning.metrics.classification import StatScores - >>> preds = torch.tensor([1, 0, 2, 1]) - >>> target = torch.tensor([1, 1, 2, 0]) - >>> stat_scores = StatScores(reduce='macro', num_classes=3) - >>> stat_scores(preds, target) - tensor([[0, 1, 2, 1, 1], - [1, 1, 1, 1, 2], - [1, 0, 3, 0, 1]]) - >>> stat_scores = StatScores(reduce='micro') - >>> stat_scores(preds, target) - tensor([2, 2, 6, 2, 4]) - - """ +class StatScores(_StatScores): + @deprecated(target=_StatScores, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, threshold: float = 0.5, @@ -139,128 +35,9 @@ def __init__( process_group: Optional[Any] = None, dist_sync_fn: Callable = None, ): - super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - dist_sync_fn=dist_sync_fn, - ) - - self.reduce = reduce - self.mdmc_reduce = mdmc_reduce - self.num_classes = num_classes - self.threshold = threshold - self.is_multiclass = is_multiclass - self.ignore_index = ignore_index - self.top_k = top_k - - if not 0 < threshold < 1: - raise ValueError(f"The `threshold` should be a float in the (0,1) interval, got {threshold}") - - if reduce not in ["micro", "macro", "samples"]: - raise ValueError(f"The `reduce` {reduce} is not valid.") - - if mdmc_reduce not in [None, "samplewise", "global"]: - raise ValueError(f"The `mdmc_reduce` {mdmc_reduce} is not valid.") - - if reduce == "macro" and (not num_classes or num_classes < 1): - raise ValueError("When you set `reduce` as 'macro', you have to provide the number of classes.") - - if num_classes and ignore_index is not None and (not 0 <= ignore_index < num_classes or num_classes == 1): - raise ValueError(f"The `ignore_index` {ignore_index} is not valid for inputs with {num_classes} classes") - - if mdmc_reduce != "samplewise" and reduce != "samples": - if reduce == "micro": - zeros_shape = [] - elif reduce == "macro": - zeros_shape = (num_classes, ) - default, reduce_fn = lambda: torch.zeros(zeros_shape, dtype=torch.long), "sum" - else: - default, reduce_fn = lambda: [], None - - for s in ("tp", "fp", "tn", "fn"): - self.add_state(s, default=default(), dist_reduce_fx=reduce_fn) - - def update(self, preds: torch.Tensor, target: torch.Tensor): - """ - Update state with predictions and targets. - - Args: - preds: Predictions from model (probabilities or labels) - target: Ground truth values - """ - - tp, fp, tn, fn = _stat_scores_update( - preds, - target, - reduce=self.reduce, - mdmc_reduce=self.mdmc_reduce, - threshold=self.threshold, - num_classes=self.num_classes, - top_k=self.top_k, - is_multiclass=self.is_multiclass, - ignore_index=self.ignore_index, - ) - - # Update states - if self.reduce != "samples" and self.mdmc_reduce != "samplewise": - self.tp += tp - self.fp += fp - self.tn += tn - self.fn += fn - else: - self.tp.append(tp) - self.fp.append(fp) - self.tn.append(tn) - self.fn.append(fn) - - def _get_final_stats(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Performs concatenation on the stat scores if neccesary, - before passing them to a compute function. """ + This implementation refers to :class:`~torchmetrics.StatScores`. 
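+
+        Example (migration sketch; this deprecated class only forwards its
+        arguments to :class:`~torchmetrics.StatScores`, so instantiating the
+        torchmetrics class directly is equivalent):
+
+            >>> import torch
+            >>> from torchmetrics import StatScores
+            >>> preds = torch.tensor([1, 0, 2, 1])
+            >>> target = torch.tensor([1, 1, 2, 0])
+            >>> stat_scores = StatScores(reduce='macro', num_classes=3)
+            >>> stat_scores(preds, target)
+            tensor([[0, 1, 2, 1, 1],
+                    [1, 1, 1, 1, 2],
+                    [1, 0, 3, 0, 1]])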
- if isinstance(self.tp, list): - tp = torch.cat(self.tp) - fp = torch.cat(self.fp) - tn = torch.cat(self.tn) - fn = torch.cat(self.fn) - else: - tp, fp, tn, fn = self.tp, self.fp, self.tn, self.fn - - return tp, fp, tn, fn - - def compute(self) -> torch.Tensor: - """ - Computes the stat scores based on inputs passed in to ``update`` previously. - - Return: - The metric returns a tensor of shape ``(..., 5)``, where the last dimension corresponds - to ``[tp, fp, tn, fn, sup]`` (``sup`` stands for support and equals ``tp + fn``). The - shape depends on the ``reduce`` and ``mdmc_reduce`` (in case of multi-dimensional - multi-class data) parameters: - - - If the data is not multi-dimensional multi-class, then - - - If ``reduce='micro'``, the shape will be ``(5, )`` - - If ``reduce='macro'``, the shape will be ``(C, 5)``, - where ``C`` stands for the number of classes - - If ``reduce='samples'``, the shape will be ``(N, 5)``, where ``N`` stands for - the number of samples - - - If the data is multi-dimensional multi-class and ``mdmc_reduce='global'``, then - - - If ``reduce='micro'``, the shape will be ``(5, )`` - - If ``reduce='macro'``, the shape will be ``(C, 5)`` - - If ``reduce='samples'``, the shape will be ``(N*X, 5)``, where ``X`` stands for - the product of sizes of all "extra" dimensions of the data (i.e. all dimensions - except for ``C`` and ``N``) - - - If the data is multi-dimensional multi-class and ``mdmc_reduce='samplewise'``, then - - - If ``reduce='micro'``, the shape will be ``(N, 5)`` - - If ``reduce='macro'``, the shape will be ``(N, C, 5)`` - - If ``reduce='samples'``, the shape will be ``(N, X, 5)`` - + .. deprecated:: + Use :class:`~torchmetrics.StatScores`. Will be removed in v1.5.0. """ - tp, fp, tn, fn = self._get_final_stats() - return _stat_scores_compute(tp, fp, tn, fn) diff --git a/pytorch_lightning/metrics/functional/confusion_matrix.py b/pytorch_lightning/metrics/functional/confusion_matrix.py index e77fc4224d25e..5cf8818176696 100644 --- a/pytorch_lightning/metrics/functional/confusion_matrix.py +++ b/pytorch_lightning/metrics/functional/confusion_matrix.py @@ -14,45 +14,12 @@ from typing import Optional import torch -from torchmetrics.classification.checks import _input_format_classification -from torchmetrics.utilities.enums import DataType +from torchmetrics.functional import confusion_matrix as _confusion_matrix -from pytorch_lightning.utilities import rank_zero_warn - - -def _confusion_matrix_update( - preds: torch.Tensor, target: torch.Tensor, num_classes: int, threshold: float = 0.5 -) -> torch.Tensor: - preds, target, mode = _input_format_classification(preds, target, threshold) - if mode not in (DataType.BINARY, DataType.MULTILABEL): - preds = preds.argmax(dim=1) - target = target.argmax(dim=1) - unique_mapping = (target.view(-1) * num_classes + preds.view(-1)).to(torch.long) - bins = torch.bincount(unique_mapping, minlength=num_classes**2) - confmat = bins.reshape(num_classes, num_classes) - return confmat - - -def _confusion_matrix_compute(confmat: torch.Tensor, normalize: Optional[str] = None) -> torch.Tensor: - allowed_normalize = ('true', 'pred', 'all', 'none', None) - assert normalize in allowed_normalize, \ - f"Argument average needs to one of the following: {allowed_normalize}" - confmat = confmat.float() - if normalize is not None and normalize != 'none': - if normalize == 'true': - cm = confmat / confmat.sum(axis=1, keepdim=True) - elif normalize == 'pred': - cm = confmat / confmat.sum(axis=0, keepdim=True) - elif normalize == 'all': - cm = 
confmat / confmat.sum() - nan_elements = cm[torch.isnan(cm)].nelement() - if nan_elements != 0: - cm[torch.isnan(cm)] = 0 - rank_zero_warn(f'{nan_elements} nan values found in confusion matrix have been replaced with zeros.') - return cm - return confmat +from pytorch_lightning.utilities.deprecation import deprecated +@deprecated(target=_confusion_matrix, ver_deprecate="1.3.0", ver_remove="1.5.0") def confusion_matrix( preds: torch.Tensor, target: torch.Tensor, @@ -61,38 +28,6 @@ def confusion_matrix( threshold: float = 0.5 ) -> torch.Tensor: """ - Computes the confusion matrix. Works with binary, multiclass, and multilabel data. - Accepts probabilities from a model output or integer class values in prediction. - Works with multi-dimensional preds and target. - - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument - to convert into integer labels. This is the case for binary and multi-label probabilities. - - If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. - - Args: - preds: (float or long tensor), Either a ``(N, ...)`` tensor with labels or - ``(N, C, ...)`` where C is the number of classes, tensor with labels/probabilities - target: ``target`` (long tensor), tensor with shape ``(N, ...)`` with ground true labels - num_classes: Number of classes in the dataset. - normalize: Normalization mode for confusion matrix. Choose from - - - ``None`` or ``'none'``: no normalization (default) - - ``'true'``: normalization over the targets (most commonly used) - - ``'pred'``: normalization over the predictions - - ``'all'``: normalization over the whole matrix - - threshold: - Threshold value for binary or multi-label probabilities. default: 0.5 - - Example: - - >>> from pytorch_lightning.metrics.functional import confusion_matrix - >>> target = torch.tensor([1, 1, 0, 0]) - >>> preds = torch.tensor([0, 1, 0, 0]) - >>> confusion_matrix(preds, target, num_classes=2) - tensor([[2., 0.], - [1., 1.]]) + .. deprecated:: + Use :func:`torchmetrics.functional.confusion_matrix`. Will be removed in v1.5.0. """ - confmat = _confusion_matrix_update(preds, target, num_classes, threshold) - return _confusion_matrix_compute(confmat, normalize) diff --git a/pytorch_lightning/metrics/functional/f_beta.py b/pytorch_lightning/metrics/functional/f_beta.py index 5be4786297b65..e4d926e0ab8bf 100644 --- a/pytorch_lightning/metrics/functional/f_beta.py +++ b/pytorch_lightning/metrics/functional/f_beta.py @@ -11,46 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Tuple
-
 import torch
-from torchmetrics.utilities import class_reduce
-from torchmetrics.utilities.checks import _input_format_classification_one_hot
-
-
-def _fbeta_update(
-    preds: torch.Tensor,
-    target: torch.Tensor,
-    num_classes: int,
-    threshold: float = 0.5,
-    multilabel: bool = False
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    preds, target = _input_format_classification_one_hot(num_classes, preds, target, threshold, multilabel)
-    true_positives = torch.sum(preds * target, dim=1)
-    predicted_positives = torch.sum(preds, dim=1)
-    actual_positives = torch.sum(target, dim=1)
-    return true_positives, predicted_positives, actual_positives
-
+from torchmetrics.functional import f1 as _f1
+from torchmetrics.functional import fbeta as _fbeta
 
-def _fbeta_compute(
-    true_positives: torch.Tensor,
-    predicted_positives: torch.Tensor,
-    actual_positives: torch.Tensor,
-    beta: float = 1.0,
-    average: str = "micro"
-) -> torch.Tensor:
-    if average == "micro":
-        precision = true_positives.sum().float() / predicted_positives.sum()
-        recall = true_positives.sum().float() / actual_positives.sum()
-    else:
-        precision = true_positives.float() / predicted_positives
-        recall = true_positives.float() / actual_positives
-
-    num = (1 + beta**2) * precision * recall
-    denom = beta**2 * precision + recall
-    return class_reduce(num, denom, weights=actual_positives, class_reduction=average)
+from pytorch_lightning.utilities.deprecation import deprecated
 
 
+@deprecated(target=_fbeta, ver_deprecate="1.3.0", ver_remove="1.5.0")
 def fbeta(
     preds: torch.Tensor,
     target: torch.Tensor,
@@ -61,49 +29,12 @@ def fbeta(
     multilabel: bool = False
 ) -> torch.Tensor:
     """
-    Computes f_beta metric.
-
-    Works with binary, multiclass, and multilabel data.
-    Accepts probabilities from a model output or integer class values in prediction.
-    Works with multi-dimensional preds and target.
-
-    If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument
-    to convert into integer labels. This is the case for binary and multi-label probabilities.
-
-    If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``.
-
-    Args:
-        preds: predictions from model (probabilities, or labels)
-        target: ground truth labels
-        num_classes: Number of classes in the dataset.
-        beta: Beta coefficient in the F measure.
-        threshold:
-            Threshold value for binary or multi-label probabilities. default: 0.5
-
-        average:
-            - ``'micro'`` computes metric globally
-            - ``'macro'`` computes metric for each class and uniformly averages them
-            - ``'weighted'`` computes metric for each class and does a weighted-average,
-              where each class is weighted by their support (accounts for class imbalance)
-            - ``'none'`` or ``None`` computes and returns the metric per class
-
-        multilabel: If predictions are from multilabel classification.
-
-    Example:
-
-        >>> from pytorch_lightning.metrics.functional import fbeta
-        >>> target = torch.tensor([0, 1, 2, 0, 1, 2])
-        >>> preds = torch.tensor([0, 2, 1, 0, 0, 1])
-        >>> fbeta(preds, target, num_classes=3, beta=0.5)
-        tensor(0.3333)
-
+    .. deprecated::
+        Use :func:`torchmetrics.functional.fbeta`. Will be removed in v1.5.0.
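+
+    Example (migration sketch; this deprecated wrapper only forwards its
+    arguments to :func:`torchmetrics.functional.fbeta`, so importing and
+    calling the torchmetrics function directly is equivalent):
+
+        >>> import torch
+        >>> from torchmetrics.functional import fbeta
+        >>> target = torch.tensor([0, 1, 2, 0, 1, 2])
+        >>> preds = torch.tensor([0, 2, 1, 0, 0, 1])
+        >>> fbeta(preds, target, num_classes=3, beta=0.5)
+        tensor(0.3333)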
""" - true_positives, predicted_positives, actual_positives = _fbeta_update( - preds, target, num_classes, threshold, multilabel - ) - return _fbeta_compute(true_positives, predicted_positives, actual_positives, beta, average) +@deprecated(target=_f1, ver_deprecate="1.3.0", ver_remove="1.5.0") def f1( preds: torch.Tensor, target: torch.Tensor, @@ -113,39 +44,6 @@ def f1( multilabel: bool = False ) -> torch.Tensor: """ - Computes F1 metric. F1 metrics correspond to a equally weighted average of the - precision and recall scores. - - Works with binary, multiclass, and multilabel data. - Accepts probabilities from a model output or integer class values in prediction. - Works with multi-dimensional preds and target. - - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument - to convert into integer labels. This is the case for binary and multi-label probabilities. - - If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. - - Args: - preds: predictions from model (probabilities, or labels) - target: ground truth labels - num_classes: Number of classes in the dataset. - threshold: - Threshold value for binary or multi-label probabilities. default: 0.5 - - average: - - ``'micro'`` computes metric globally - - ``'macro'`` computes metric for each class and uniformly averages them - - ``'weighted'`` computes metric for each class and does a weighted-average, - where each class is weighted by their support (accounts for class imbalance) - - ``'none'`` or ``None`` computes and returns the metric per class - - multilabel: If predictions are from multilabel classification. - - Example: - >>> from pytorch_lightning.metrics.functional import f1 - >>> target = torch.tensor([0, 1, 2, 0, 1, 2]) - >>> preds = torch.tensor([0, 2, 1, 0, 0, 1]) - >>> f1(preds, target, num_classes=3) - tensor(0.3333) + .. deprecated:: + Use :func:`torchmetrics.functional.f1`. Will be removed in v1.5.0. """ - return fbeta(preds, target, num_classes, 1.0, threshold, average, multilabel) diff --git a/pytorch_lightning/metrics/functional/hamming_distance.py b/pytorch_lightning/metrics/functional/hamming_distance.py index 3254dcbf8badb..ef6bb3277fef2 100644 --- a/pytorch_lightning/metrics/functional/hamming_distance.py +++ b/pytorch_lightning/metrics/functional/hamming_distance.py @@ -11,61 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Tuple, Union - import torch -from torchmetrics.classification.checks import _input_format_classification - - -def _hamming_distance_update( - preds: torch.Tensor, - target: torch.Tensor, - threshold: float = 0.5, -) -> Tuple[torch.Tensor, int]: - preds, target, _ = _input_format_classification(preds, target, threshold=threshold) - - correct = (preds == target).sum() - total = preds.numel() +from torchmetrics.functional import hamming_distance as _hamming_distance - return correct, total - - -def _hamming_distance_compute(correct: torch.Tensor, total: Union[int, torch.Tensor]) -> torch.Tensor: - return 1 - correct.float() / total +from pytorch_lightning.utilities.deprecation import deprecated +@deprecated(target=_hamming_distance, ver_deprecate="1.3.0", ver_remove="1.5.0") def hamming_distance(preds: torch.Tensor, target: torch.Tensor, threshold: float = 0.5) -> torch.Tensor: - r""" - Computes the average `Hamming distance `_ (also - known as Hamming loss) between targets and predictions: - - .. math:: - \text{Hamming distance} = \frac{1}{N \cdot L} \sum_i^N \sum_l^L 1(y_{il} \neq \hat{y}_{il}) - - Where :math:`y` is a tensor of target values, :math:`\hat{y}` is a tensor of predictions, - and :math:`\bullet_{il}` refers to the :math:`l`-th label of the :math:`i`-th sample of that - tensor. - - This is the same as ``1-accuracy`` for binary data, while for all other types of inputs it - treats each possible label separately - meaning that, for example, multi-class data is - treated as if it were multi-label. - - Args: - preds: Predictions from model - target: Ground truth - threshold: - Threshold probability value for transforming probability predictions to binary - (0 or 1) predictions, in the case of binary or multi-label inputs. - - Example: - - >>> from pytorch_lightning.metrics.functional import hamming_distance - >>> target = torch.tensor([[0, 1], [1, 1]]) - >>> preds = torch.tensor([[0, 1], [0, 1]]) - >>> hamming_distance(preds, target) - tensor(0.2500) - """ - - correct, total = _hamming_distance_update(preds, target, threshold) - return _hamming_distance_compute(correct, total) + .. deprecated:: + Use :func:`torchmetrics.functional.hamming_distance`. Will be removed in v1.5.0. + """ diff --git a/pytorch_lightning/metrics/functional/iou.py b/pytorch_lightning/metrics/functional/iou.py index 0f8152d314848..7ae520eb25dee 100644 --- a/pytorch_lightning/metrics/functional/iou.py +++ b/pytorch_lightning/metrics/functional/iou.py @@ -14,35 +14,12 @@ from typing import Optional import torch -from torchmetrics.utilities import reduce -from torchmetrics.utilities.data import get_num_classes +from torchmetrics.functional import iou as _iou -from pytorch_lightning.metrics.functional.confusion_matrix import _confusion_matrix_update - - -def _iou_from_confmat( - confmat: torch.Tensor, - num_classes: int, - ignore_index: Optional[int] = None, - absent_score: float = 0.0, - reduction: str = 'elementwise_mean', -): - intersection = torch.diag(confmat) - union = confmat.sum(0) + confmat.sum(1) - intersection - - # If this class is absent in both target AND pred (union == 0), then use the absent_score for this class. - scores = intersection.float() / union.float() - scores[union == 0] = absent_score - - # Remove the ignored class index from the scores. 
- if ignore_index is not None and ignore_index >= 0 and ignore_index < num_classes: - scores = torch.cat([ - scores[:ignore_index], - scores[ignore_index + 1:], - ]) - return reduce(scores, reduction=reduction) +from pytorch_lightning.utilities.deprecation import deprecated +@deprecated(target=_iou, ver_deprecate="1.3.0", ver_remove="1.5.0") def iou( pred: torch.Tensor, target: torch.Tensor, @@ -52,60 +29,7 @@ def iou( num_classes: Optional[int] = None, reduction: str = 'elementwise_mean', ) -> torch.Tensor: - r""" - Computes `Intersection over union, or Jaccard index calculation `_: - - .. math:: J(A,B) = \frac{|A\cap B|}{|A\cup B|} - - Where: :math:`A` and :math:`B` are both tensors of the same size, - containing integer class values. They may be subject to conversion from - input data (see description below). - - Note that it is different from box IoU. - - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument - to convert into integer labels. This is the case for binary and multi-label probabilities. - - If pred has an extra dimension as in the case of multi-class scores we - perform an argmax on ``dim=1``. - - Args: - preds: tensor containing predictions from model (probabilities, or labels) with shape ``[N, d1, d2, ...]`` - target: tensor containing ground truth labels with shape ``[N, d1, d2, ...]`` - ignore_index: optional int specifying a target class to ignore. If given, - this class index does not contribute to the returned score, regardless - of reduction method. Has no effect if given an int that is not in the - range [0, num_classes-1], where num_classes is either given or derived - from pred and target. By default, no index is ignored, and all classes are used. - absent_score: score to use for an individual class, if no instances of - the class index were present in `pred` AND no instances of the class - index were present in `target`. For example, if we have 3 classes, - [0, 0] for `pred`, and [0, 2] for `target`, then class 1 would be - assigned the `absent_score`. - threshold: - Threshold value for binary or multi-label probabilities. default: 0.5 - num_classes: - Optionally specify the number of classes - reduction: a method to reduce metric score over labels. - - - ``'elementwise_mean'``: takes the mean (default) - - ``'sum'``: takes the sum - - ``'none'``: no reduction will be applied - - Return: - IoU score : Tensor containing single value if reduction is - 'elementwise_mean', or number of classes if reduction is 'none' - - Example: - - >>> from pytorch_lightning.metrics.functional import iou - >>> target = torch.randint(0, 2, (10, 25, 25)) - >>> pred = torch.tensor(target) - >>> pred[2:5, 7:13, 9:15] = 1 - pred[2:5, 7:13, 9:15] - >>> iou(pred, target) - tensor(0.9660) """ - - num_classes = get_num_classes(pred=pred, target=target, num_classes=num_classes) - confmat = _confusion_matrix_update(pred, target, num_classes, threshold) - return _iou_from_confmat(confmat, num_classes, ignore_index, absent_score, reduction) + .. deprecated:: + Use :func:`torchmetrics.functional.iou`. Will be removed in v1.5.0. + """ diff --git a/pytorch_lightning/metrics/functional/stat_scores.py b/pytorch_lightning/metrics/functional/stat_scores.py index fb1849d3805b2..6f234e84d9aab 100644 --- a/pytorch_lightning/metrics/functional/stat_scores.py +++ b/pytorch_lightning/metrics/functional/stat_scores.py @@ -11,130 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple +from typing import Optional import torch -from torchmetrics.classification.checks import _input_format_classification +from torchmetrics.functional import stat_scores as _stat_scores - -def _del_column(tensor: torch.Tensor, index: int): - """ Delete the column at index.""" - - return torch.cat([tensor[:, :index], tensor[:, (index + 1):]], 1) - - -def _stat_scores( - preds: torch.Tensor, - target: torch.Tensor, - reduce: str = "micro", -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Calculate the number of tp, fp, tn, fn. - - Args: - preds: - An ``(N, C)`` or ``(N, C, X)`` tensor of predictions (0 or 1) - target: - An ``(N, C)`` or ``(N, C, X)`` tensor of true labels (0 or 1) - reduce: - One of ``'micro'``, ``'macro'``, ``'samples'`` - - Return: - Returns a list of 4 tensors; tp, fp, tn, fn. - The shape of the returned tensors depnds on the shape of the inputs - and the ``reduce`` parameter: - - If inputs are of the shape ``(N, C)``, then - - If ``reduce='micro'``, the returned tensors are 1 element tensors - - If ``reduce='macro'``, the returned tensors are ``(C,)`` tensors - - If ``reduce'samples'``, the returned tensors are ``(N,)`` tensors - - If inputs are of the shape ``(N, C, X)``, then - - If ``reduce='micro'``, the returned tensors are ``(N,)`` tensors - - If ``reduce='macro'``, the returned tensors are ``(N,C)`` tensors - - If ``reduce='samples'``, the returned tensors are ``(N,X)`` tensors - """ - if reduce == "micro": - dim = [0, 1] if preds.ndim == 2 else [1, 2] - elif reduce == "macro": - dim = 0 if preds.ndim == 2 else 2 - elif reduce == "samples": - dim = 1 - - true_pred, false_pred = target == preds, target != preds - pos_pred, neg_pred = preds == 1, preds == 0 - - tp = (true_pred * pos_pred).sum(dim=dim) - fp = (false_pred * pos_pred).sum(dim=dim) - - tn = (true_pred * neg_pred).sum(dim=dim) - fn = (false_pred * neg_pred).sum(dim=dim) - - return tp.long(), fp.long(), tn.long(), fn.long() - - -def _stat_scores_update( - preds: torch.Tensor, - target: torch.Tensor, - reduce: str = "micro", - mdmc_reduce: Optional[str] = None, - num_classes: Optional[int] = None, - top_k: Optional[int] = None, - threshold: float = 0.5, - is_multiclass: Optional[bool] = None, - ignore_index: Optional[int] = None, -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - - preds, target, _ = _input_format_classification( - preds, target, threshold=threshold, num_classes=num_classes, is_multiclass=is_multiclass, top_k=top_k - ) - - if ignore_index is not None and not 0 <= ignore_index < preds.shape[1]: - raise ValueError(f"The `ignore_index` {ignore_index} is not valid for inputs with {preds.shape[0]} classes") - - if ignore_index is not None and preds.shape[1] == 1: - raise ValueError("You can not use `ignore_index` with binary data.") - - if preds.ndim == 3: - if not mdmc_reduce: - raise ValueError( - "When your inputs are multi-dimensional multi-class, you have to set the `mdmc_reduce` parameter" - ) - if mdmc_reduce == "global": - preds = torch.transpose(preds, 1, 2).reshape(-1, preds.shape[1]) - target = torch.transpose(target, 1, 2).reshape(-1, target.shape[1]) - - # Delete what is in ignore_index, if applicable (and classes don't matter): - if ignore_index is not None and reduce != "macro": - preds = _del_column(preds, ignore_index) - target = _del_column(target, ignore_index) - - tp, fp, tn, fn = 
_stat_scores(preds, target, reduce=reduce) - - # Take care of ignore_index - if ignore_index is not None and reduce == "macro": - tp[..., ignore_index] = -1 - fp[..., ignore_index] = -1 - tn[..., ignore_index] = -1 - fn[..., ignore_index] = -1 - - return tp, fp, tn, fn - - -def _stat_scores_compute(tp: torch.Tensor, fp: torch.Tensor, tn: torch.Tensor, fn: torch.Tensor) -> torch.Tensor: - - outputs = [ - tp.unsqueeze(-1), - fp.unsqueeze(-1), - tn.unsqueeze(-1), - fn.unsqueeze(-1), - tp.unsqueeze(-1) + fn.unsqueeze(-1), # support - ] - outputs = torch.cat(outputs, -1) - outputs = torch.where(outputs < 0, torch.tensor(-1, device=outputs.device), outputs) - - return outputs +from pytorch_lightning.utilities.deprecation import deprecated +@deprecated(target=_stat_scores, ver_deprecate="1.3.0", ver_remove="1.5.0") def stat_scores( preds: torch.Tensor, target: torch.Tensor, @@ -146,149 +31,7 @@ def stat_scores( is_multiclass: Optional[bool] = None, ignore_index: Optional[int] = None, ) -> torch.Tensor: - """Computes the number of true positives, false positives, true negatives, false negatives. - Related to `Type I and Type II errors `__ - and the `confusion matrix `__. - - The reduction method (how the statistics are aggregated) is controlled by the - ``reduce`` parameter, and additionally by the ``mdmc_reduce`` parameter in the - multi-dimensional multi-class case. - - Args: - preds: Predictions from model (probabilities or labels) - target: Ground truth values - threshold: - Threshold probability value for transforming probability predictions to binary - (0 or 1) predictions, in the case of binary or multi-label inputs. - - top_k: - Number of highest probability entries for each sample to convert to 1s - relevant - only for inputs with probability predictions. If this parameter is set for multi-label - inputs, it will take precedence over ``threshold``. For (multi-dim) multi-class inputs, - this parameter defaults to 1. - - Should be left unset (``None``) for inputs with label predictions. - - reduce: - Defines the reduction that is applied. Should be one of the following: - - - ``'micro'`` [default]: Counts the statistics by summing over all [sample, class] - combinations (globally). Each statistic is represented by a single integer. - - ``'macro'``: Counts the statistics for each class separately (over all samples). - Each statistic is represented by a ``(C,)`` tensor. Requires ``num_classes`` - to be set. - - ``'samples'``: Counts the statistics for each sample separately (over all classes). - Each statistic is represented by a ``(N, )`` 1d tensor. - - Note that what is considered a sample in the multi-dimensional multi-class case - depends on the value of ``mdmc_reduce``. - - num_classes: - Number of classes. Necessary for (multi-dimensional) multi-class or multi-label data. - - ignore_index: - Specify a class (label) to ignore. If given, this class index does not contribute - to the returned score, regardless of reduction method. If an index is ignored, and - ``reduce='macro'``, the class statistics for the ignored class will all be returned - as ``-1``. - - mdmc_reduce: - Defines how the multi-dimensional multi-class inputs are handeled. Should be - one of the following: - - - ``None`` [default]: Should be left unchanged if your data is not multi-dimensional multi-class. - - - ``'samplewise'``: In this case, the statistics are computed separately for each - sample on the ``N`` axis, and then the outputs are concatenated together. 
In each - sample the extra axes ``...`` are flattened to become the sub-sample axis, and - statistics for each sample are computed by treating the sub-sample axis as the - ``N`` axis for that sample. - - - ``'global'``: In this case the ``N`` and ``...`` dimensions of the inputs are - flattened into a new ``N_X`` sample axis, i.e. the inputs are treated as if they - were ``(N_X, C)``. From here on the ``reduce`` parameter applies as usual. - - is_multiclass: - Used only in certain special cases, where you want to treat inputs as a different type - than what they appear to be. - - Return: - The metric returns a tensor of shape ``(..., 5)``, where the last dimension corresponds - to ``[tp, fp, tn, fn, sup]`` (``sup`` stands for support and equals ``tp + fn``). The - shape depends on the ``reduce`` and ``mdmc_reduce`` (in case of multi-dimensional - multi-class data) parameters: - - - If the data is not multi-dimensional multi-class, then - - - If ``reduce='micro'``, the shape will be ``(5, )`` - - If ``reduce='macro'``, the shape will be ``(C, 5)``, - where ``C`` stands for the number of classes - - If ``reduce='samples'``, the shape will be ``(N, 5)``, where ``N`` stands for - the number of samples - - - If the data is multi-dimensional multi-class and ``mdmc_reduce='global'``, then - - - If ``reduce='micro'``, the shape will be ``(5, )`` - - If ``reduce='macro'``, the shape will be ``(C, 5)`` - - If ``reduce='samples'``, the shape will be ``(N*X, 5)``, where ``X`` stands for - the product of sizes of all "extra" dimensions of the data (i.e. all dimensions - except for ``C`` and ``N``) - - - If the data is multi-dimensional multi-class and ``mdmc_reduce='samplewise'``, then - - - If ``reduce='micro'``, the shape will be ``(N, 5)`` - - If ``reduce='macro'``, the shape will be ``(N, C, 5)`` - - If ``reduce='samples'``, the shape will be ``(N, X, 5)`` - - Raises: - ValueError: - If ``reduce`` is none of ``"micro"``, ``"macro"`` or ``"samples"``. - ValueError: - If ``mdmc_reduce`` is none of ``None``, ``"samplewise"``, ``"global"``. - ValueError: - If ``reduce`` is set to ``"macro"`` and ``num_classes`` is not provided. - ValueError: - If ``num_classes`` is set - and ``ignore_index`` is not in the range ``[0, num_classes)``. - ValueError: - If ``ignore_index`` is used with ``binary data``. - ValueError: - If inputs are ``multi-dimensional multi-class`` and ``mdmc_reduce`` is not provided. 
-
-    Example:
-
-        >>> from pytorch_lightning.metrics.functional import stat_scores
-        >>> preds = torch.tensor([1, 0, 2, 1])
-        >>> target = torch.tensor([1, 1, 2, 0])
-        >>> stat_scores(preds, target, reduce='macro', num_classes=3)
-        tensor([[0, 1, 2, 1, 1],
-                [1, 1, 1, 1, 2],
-                [1, 0, 3, 0, 1]])
-        >>> stat_scores(preds, target, reduce='micro')
-        tensor([2, 2, 6, 2, 4])
     """
-
-    if reduce not in ["micro", "macro", "samples"]:
-        raise ValueError(f"The `reduce` {reduce} is not valid.")
-
-    if mdmc_reduce not in [None, "samplewise", "global"]:
-        raise ValueError(f"The `mdmc_reduce` {mdmc_reduce} is not valid.")
-
-    if reduce == "macro" and (not num_classes or num_classes < 1):
-        raise ValueError("When you set `reduce` as 'macro', you have to provide the number of classes.")
-
-    if num_classes and ignore_index is not None and (not 0 <= ignore_index < num_classes or num_classes == 1):
-        raise ValueError(f"The `ignore_index` {ignore_index} is not valid for inputs with {num_classes} classes")
-
-    tp, fp, tn, fn = _stat_scores_update(
-        preds,
-        target,
-        reduce=reduce,
-        mdmc_reduce=mdmc_reduce,
-        top_k=top_k,
-        threshold=threshold,
-        num_classes=num_classes,
-        is_multiclass=is_multiclass,
-        ignore_index=ignore_index,
-    )
-    return _stat_scores_compute(tp, fp, tn, fn)
+    .. deprecated::
+        Use :func:`torchmetrics.functional.stat_scores`. Will be removed in v1.5.0.
+    """
diff --git a/tests/metrics/classification/__init__.py b/tests/metrics/classification/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/tests/metrics/classification/inputs.py b/tests/metrics/classification/inputs.py
deleted file mode 100644
index 7f2ac450385fe..0000000000000
--- a/tests/metrics/classification/inputs.py
+++ /dev/null
@@ -1,66 +0,0 @@
-from collections import namedtuple
-
-import torch
-
-from tests.metrics.utils import BATCH_SIZE, EXTRA_DIM, NUM_BATCHES, NUM_CLASSES
-
-Input = namedtuple('Input', ["preds", "target"])
-
-_input_binary_prob = Input(
-    preds=torch.rand(NUM_BATCHES, BATCH_SIZE), target=torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE))
-)
-
-_input_binary = Input(
-    preds=torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE)),
-    target=torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE))
-)
-
-_input_multilabel_prob = Input(
-    preds=torch.rand(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES),
-    target=torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES))
-)
-
-_input_multilabel_multidim_prob = Input(
-    preds=torch.rand(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES, EXTRA_DIM),
-    target=torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES, EXTRA_DIM))
-)
-
-_input_multilabel = Input(
-    preds=torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES)),
-    target=torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES))
-)
-
-_input_multilabel_multidim = Input(
-    preds=torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES, EXTRA_DIM)),
-    target=torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES, EXTRA_DIM))
-)
-
-# Generate a multilabel edge case, where nothing matches (scores are undefined)
-__temp_preds = torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES))
-__temp_target = abs(__temp_preds - 1)
-
-_input_multilabel_no_match = Input(preds=__temp_preds, target=__temp_target)
-
-__mc_prob_preds = torch.rand(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES)
-__mc_prob_preds = __mc_prob_preds / __mc_prob_preds.sum(dim=2, keepdim=True)
-
-_input_multiclass_prob = Input(
-    preds=__mc_prob_preds,
target=torch.randint(high=NUM_CLASSES, size=(NUM_BATCHES, BATCH_SIZE)) -) - -_input_multiclass = Input( - preds=torch.randint(high=NUM_CLASSES, size=(NUM_BATCHES, BATCH_SIZE)), - target=torch.randint(high=NUM_CLASSES, size=(NUM_BATCHES, BATCH_SIZE)) -) - -__mdmc_prob_preds = torch.rand(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES, EXTRA_DIM) -__mdmc_prob_preds = __mdmc_prob_preds / __mdmc_prob_preds.sum(dim=2, keepdim=True) - -_input_multidim_multiclass_prob = Input( - preds=__mdmc_prob_preds, target=torch.randint(high=NUM_CLASSES, size=(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM)) -) - -_input_multidim_multiclass = Input( - preds=torch.randint(high=NUM_CLASSES, size=(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM)), - target=torch.randint(high=NUM_CLASSES, size=(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM)) -) diff --git a/tests/metrics/classification/test_confusion_matrix.py b/tests/metrics/classification/test_confusion_matrix.py deleted file mode 100644 index 5371044d6d4b0..0000000000000 --- a/tests/metrics/classification/test_confusion_matrix.py +++ /dev/null @@ -1,128 +0,0 @@ -from functools import partial - -import numpy as np -import pytest -import torch -from sklearn.metrics import confusion_matrix as sk_confusion_matrix - -from pytorch_lightning.metrics.classification.confusion_matrix import ConfusionMatrix -from pytorch_lightning.metrics.functional.confusion_matrix import confusion_matrix -from tests.metrics.classification.inputs import _input_binary, _input_binary_prob -from tests.metrics.classification.inputs import _input_multiclass as _input_mcls -from tests.metrics.classification.inputs import _input_multiclass_prob as _input_mcls_prob -from tests.metrics.classification.inputs import _input_multidim_multiclass as _input_mdmc -from tests.metrics.classification.inputs import _input_multidim_multiclass_prob as _input_mdmc_prob -from tests.metrics.classification.inputs import _input_multilabel as _input_mlb -from tests.metrics.classification.inputs import _input_multilabel_prob as _input_mlb_prob -from tests.metrics.utils import MetricTester, NUM_CLASSES, THRESHOLD - -torch.manual_seed(42) - - -def _sk_cm_binary_prob(preds, target, normalize=None): - sk_preds = (preds.view(-1).numpy() >= THRESHOLD).astype(np.uint8) - sk_target = target.view(-1).numpy() - - return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) - - -def _sk_cm_binary(preds, target, normalize=None): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) - - -def _sk_cm_multilabel_prob(preds, target, normalize=None): - sk_preds = (preds.view(-1).numpy() >= THRESHOLD).astype(np.uint8) - sk_target = target.view(-1).numpy() - - return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) - - -def _sk_cm_multilabel(preds, target, normalize=None): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) - - -def _sk_cm_multiclass_prob(preds, target, normalize=None): - sk_preds = torch.argmax(preds, dim=len(preds.shape) - 1).view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) - - -def _sk_cm_multiclass(preds, target, normalize=None): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) - - -def 
_sk_cm_multidim_multiclass_prob(preds, target, normalize=None): - sk_preds = torch.argmax(preds, dim=len(preds.shape) - 2).view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) - - -def _sk_cm_multidim_multiclass(preds, target, normalize=None): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) - - -@pytest.mark.parametrize("normalize", ['true', 'pred', 'all', None]) -@pytest.mark.parametrize( - "preds, target, sk_metric, num_classes", - [(_input_binary_prob.preds, _input_binary_prob.target, _sk_cm_binary_prob, 2), - (_input_binary.preds, _input_binary.target, _sk_cm_binary, 2), - (_input_mlb_prob.preds, _input_mlb_prob.target, _sk_cm_multilabel_prob, 2), - (_input_mlb.preds, _input_mlb.target, _sk_cm_multilabel, 2), - (_input_mcls_prob.preds, _input_mcls_prob.target, _sk_cm_multiclass_prob, NUM_CLASSES), - (_input_mcls.preds, _input_mcls.target, _sk_cm_multiclass, NUM_CLASSES), - (_input_mdmc_prob.preds, _input_mdmc_prob.target, _sk_cm_multidim_multiclass_prob, NUM_CLASSES), - (_input_mdmc.preds, _input_mdmc.target, _sk_cm_multidim_multiclass, NUM_CLASSES)] -) -class TestConfusionMatrix(MetricTester): - - @pytest.mark.parametrize("ddp", [True, False]) - @pytest.mark.parametrize("dist_sync_on_step", [True, False]) - def test_confusion_matrix(self, normalize, preds, target, sk_metric, num_classes, ddp, dist_sync_on_step): - self.run_class_metric_test( - ddp=ddp, - preds=preds, - target=target, - metric_class=ConfusionMatrix, - sk_metric=partial(sk_metric, normalize=normalize), - dist_sync_on_step=dist_sync_on_step, - metric_args={ - "num_classes": num_classes, - "threshold": THRESHOLD, - "normalize": normalize - } - ) - - def test_confusion_matrix_functional(self, normalize, preds, target, sk_metric, num_classes): - self.run_functional_metric_test( - preds, - target, - metric_functional=confusion_matrix, - sk_metric=partial(sk_metric, normalize=normalize), - metric_args={ - "num_classes": num_classes, - "threshold": THRESHOLD, - "normalize": normalize - } - ) - - -def test_warning_on_nan(tmpdir): - preds = torch.randint(3, size=(20, )) - target = torch.randint(3, size=(20, )) - - with pytest.warns(UserWarning, match='.* nan values found in confusion matrix have been replaced with zeros.'): - confusion_matrix(preds, target, num_classes=5, normalize='true') diff --git a/tests/metrics/classification/test_f_beta.py b/tests/metrics/classification/test_f_beta.py deleted file mode 100644 index b9458fb6c530c..0000000000000 --- a/tests/metrics/classification/test_f_beta.py +++ /dev/null @@ -1,153 +0,0 @@ -from functools import partial - -import numpy as np -import pytest -import torch -from sklearn.metrics import fbeta_score - -from pytorch_lightning.metrics import F1, FBeta -from pytorch_lightning.metrics.functional import f1, fbeta -from tests.metrics.classification.inputs import _input_binary, _input_binary_prob -from tests.metrics.classification.inputs import _input_multiclass as _input_mcls -from tests.metrics.classification.inputs import _input_multiclass_prob as _input_mcls_prob -from tests.metrics.classification.inputs import _input_multidim_multiclass as _input_mdmc -from tests.metrics.classification.inputs import _input_multidim_multiclass_prob as _input_mdmc_prob -from tests.metrics.classification.inputs import _input_multilabel as _input_mlb -from tests.metrics.classification.inputs import 
_input_multilabel_no_match as _input_mlb_nomatch -from tests.metrics.classification.inputs import _input_multilabel_prob as _mlb_prob_inputs -from tests.metrics.utils import MetricTester, NUM_CLASSES, THRESHOLD - -torch.manual_seed(42) - - -def _sk_fbeta_binary_prob(preds, target, average='micro', beta=1.0): - sk_preds = (preds.view(-1).numpy() >= THRESHOLD).astype(np.uint8) - sk_target = target.view(-1).numpy() - - return fbeta_score(y_true=sk_target, y_pred=sk_preds, average='binary', beta=beta) - - -def _sk_fbeta_binary(preds, target, average='micro', beta=1.0): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return fbeta_score(y_true=sk_target, y_pred=sk_preds, average='binary', beta=beta) - - -def _sk_fbeta_multilabel_prob(preds, target, average='micro', beta=1.0): - sk_preds = (preds.view(-1, NUM_CLASSES).numpy() >= THRESHOLD).astype(np.uint8) - sk_target = target.view(-1, NUM_CLASSES).numpy() - - return fbeta_score(y_true=sk_target, y_pred=sk_preds, average=average, beta=beta) - - -def _sk_fbeta_multilabel(preds, target, average='micro', beta=1.0): - sk_preds = preds.view(-1, NUM_CLASSES).numpy() - sk_target = target.view(-1, NUM_CLASSES).numpy() - - return fbeta_score(y_true=sk_target, y_pred=sk_preds, average=average, beta=beta) - - -def _sk_fbeta_multiclass_prob(preds, target, average='micro', beta=1.0): - sk_preds = torch.argmax(preds, dim=len(preds.shape) - 1).view(-1).numpy() - sk_target = target.view(-1).numpy() - - return fbeta_score(y_true=sk_target, y_pred=sk_preds, average=average, beta=beta) - - -def _sk_fbeta_multiclass(preds, target, average='micro', beta=1.0): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return fbeta_score(y_true=sk_target, y_pred=sk_preds, average=average, beta=beta) - - -def _sk_fbeta_multidim_multiclass_prob(preds, target, average='micro', beta=1.0): - sk_preds = torch.argmax(preds, dim=len(preds.shape) - 2).view(-1).numpy() - sk_target = target.view(-1).numpy() - - return fbeta_score(y_true=sk_target, y_pred=sk_preds, average=average, beta=beta) - - -def _sk_fbeta_multidim_multiclass(preds, target, average='micro', beta=1.0): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return fbeta_score(y_true=sk_target, y_pred=sk_preds, average=average, beta=beta) - - -@pytest.mark.parametrize( - "preds, target, sk_metric, num_classes, multilabel", - [ - (_input_binary_prob.preds, _input_binary_prob.target, _sk_fbeta_binary_prob, 1, False), - (_input_binary.preds, _input_binary.target, _sk_fbeta_binary, 1, False), - (_mlb_prob_inputs.preds, _mlb_prob_inputs.target, _sk_fbeta_multilabel_prob, NUM_CLASSES, True), - (_input_mlb.preds, _input_mlb.target, _sk_fbeta_multilabel, NUM_CLASSES, True), - (_input_mlb_nomatch.preds, _input_mlb_nomatch.target, _sk_fbeta_multilabel, NUM_CLASSES, True), - (_input_mcls_prob.preds, _input_mcls_prob.target, _sk_fbeta_multiclass_prob, NUM_CLASSES, False), - (_input_mcls.preds, _input_mcls.target, _sk_fbeta_multiclass, NUM_CLASSES, False), - (_input_mdmc_prob.preds, _input_mdmc_prob.target, _sk_fbeta_multidim_multiclass_prob, NUM_CLASSES, False), - (_input_mdmc.preds, _input_mdmc.target, _sk_fbeta_multidim_multiclass, NUM_CLASSES, False), - ], -) -@pytest.mark.parametrize("average", ['micro', 'macro', 'weighted', None]) -@pytest.mark.parametrize("beta", [0.5, 1.0, 2.0]) -class TestFBeta(MetricTester): - - @pytest.mark.parametrize("ddp", [True, False]) - @pytest.mark.parametrize("dist_sync_on_step", [True, False]) - def test_fbeta(self, 
preds, target, sk_metric, num_classes, multilabel, average, beta, ddp, dist_sync_on_step): - metric_class = F1 if beta == 1.0 else partial(FBeta, beta=beta) - - self.run_class_metric_test( - ddp=ddp, - preds=preds, - target=target, - metric_class=metric_class, - sk_metric=partial(sk_metric, average=average, beta=beta), - dist_sync_on_step=dist_sync_on_step, - metric_args={ - "num_classes": num_classes, - "average": average, - "multilabel": multilabel, - "threshold": THRESHOLD, - }, - check_dist_sync_on_step=False, - check_batch=False, - ) - - def test_fbeta_functional(self, preds, target, sk_metric, num_classes, multilabel, average, beta): - metric_functional = f1 if beta == 1.0 else partial(fbeta, beta=beta) - - self.run_functional_metric_test( - preds=preds, - target=target, - metric_functional=metric_functional, - sk_metric=partial(sk_metric, average=average, beta=beta), - metric_args={ - "num_classes": num_classes, - "average": average, - "multilabel": multilabel, - "threshold": THRESHOLD - } - ) - - -@pytest.mark.parametrize(['pred', 'target', 'beta', 'exp_score'], [ - pytest.param([1., 0., 1., 0.], [0., 1., 1., 0.], 0.5, [0.5, 0.5]), - pytest.param([1., 0., 1., 0.], [0., 1., 1., 0.], 1, [0.5, 0.5]), - pytest.param([1., 0., 1., 0.], [0., 1., 1., 0.], 2, [0.5, 0.5]), -]) -def test_fbeta_score(pred, target, beta, exp_score): - score = fbeta(torch.tensor(pred), torch.tensor(target), num_classes=1, beta=beta, average='none') - assert torch.allclose(score, torch.tensor(exp_score)) - - -@pytest.mark.parametrize(['pred', 'target', 'exp_score'], [ - pytest.param([0., 0., 0., 0.], [1., 1., 1., 1.], [0.0, 0.0]), - pytest.param([1., 0., 1., 0.], [0., 1., 1., 0.], [0.5, 0.5]), - pytest.param([1., 0., 1., 0.], [1., 0., 1., 0.], [1.0, 1.0]), -]) -def test_f1_score(pred, target, exp_score): - score = f1(torch.tensor(pred), torch.tensor(target), num_classes=1, average='none') - assert torch.allclose(score, torch.tensor(exp_score)) diff --git a/tests/metrics/classification/test_hamming_distance.py b/tests/metrics/classification/test_hamming_distance.py deleted file mode 100644 index a4db9c7f339b2..0000000000000 --- a/tests/metrics/classification/test_hamming_distance.py +++ /dev/null @@ -1,80 +0,0 @@ -import pytest -import torch -from sklearn.metrics import hamming_loss as sk_hamming_loss -from torchmetrics.classification.checks import _input_format_classification - -from pytorch_lightning.metrics import HammingDistance -from pytorch_lightning.metrics.functional import hamming_distance -from tests.metrics.classification.inputs import _input_binary, _input_binary_prob -from tests.metrics.classification.inputs import _input_multiclass as _input_mcls -from tests.metrics.classification.inputs import _input_multiclass_prob as _input_mcls_prob -from tests.metrics.classification.inputs import _input_multidim_multiclass as _input_mdmc -from tests.metrics.classification.inputs import _input_multidim_multiclass_prob as _input_mdmc_prob -from tests.metrics.classification.inputs import _input_multilabel as _input_mlb -from tests.metrics.classification.inputs import _input_multilabel_multidim as _input_mlmd -from tests.metrics.classification.inputs import _input_multilabel_multidim_prob as _input_mlmd_prob -from tests.metrics.classification.inputs import _input_multilabel_prob as _input_mlb_prob -from tests.metrics.utils import MetricTester, THRESHOLD - -torch.manual_seed(42) - - -def _sk_hamming_loss(preds, target): - sk_preds, sk_target, _ = _input_format_classification(preds, target, threshold=THRESHOLD) - 
sk_preds, sk_target = sk_preds.numpy(), sk_target.numpy() - sk_preds, sk_target = sk_preds.reshape(sk_preds.shape[0], -1), sk_target.reshape(sk_target.shape[0], -1) - - return sk_hamming_loss(y_true=sk_target, y_pred=sk_preds) - - -@pytest.mark.parametrize( - "preds, target", - [ - (_input_binary_prob.preds, _input_binary_prob.target), - (_input_binary.preds, _input_binary.target), - (_input_mlb_prob.preds, _input_mlb_prob.target), - (_input_mlb.preds, _input_mlb.target), - (_input_mcls_prob.preds, _input_mcls_prob.target), - (_input_mcls.preds, _input_mcls.target), - (_input_mdmc_prob.preds, _input_mdmc_prob.target), - (_input_mdmc.preds, _input_mdmc.target), - (_input_mlmd_prob.preds, _input_mlmd_prob.target), - (_input_mlmd.preds, _input_mlmd.target), - ], -) -class TestHammingDistance(MetricTester): - - @pytest.mark.parametrize("ddp", [True, False]) - @pytest.mark.parametrize("dist_sync_on_step", [False, True]) - def test_hamming_distance_class(self, ddp, dist_sync_on_step, preds, target): - self.run_class_metric_test( - ddp=ddp, - preds=preds, - target=target, - metric_class=HammingDistance, - sk_metric=_sk_hamming_loss, - dist_sync_on_step=dist_sync_on_step, - metric_args={"threshold": THRESHOLD}, - ) - - def test_hamming_distance_fn(self, preds, target): - self.run_functional_metric_test( - preds, - target, - metric_functional=hamming_distance, - sk_metric=_sk_hamming_loss, - metric_args={"threshold": THRESHOLD}, - ) - - -@pytest.mark.parametrize("threshold", [1.5]) -def test_wrong_params(threshold): - preds, target = _input_mcls_prob.preds, _input_mcls_prob.target - - with pytest.raises(ValueError): - ham_dist = HammingDistance(threshold=threshold) - ham_dist(preds, target) - ham_dist.compute() - - with pytest.raises(ValueError): - hamming_distance(preds, target, threshold=threshold) diff --git a/tests/metrics/classification/test_inputs.py b/tests/metrics/classification/test_inputs.py deleted file mode 100644 index f07a9c2821f56..0000000000000 --- a/tests/metrics/classification/test_inputs.py +++ /dev/null @@ -1,312 +0,0 @@ -import pytest -import torch -from torch import rand, randint -from torchmetrics.classification.checks import _input_format_classification -from torchmetrics.utilities.data import select_topk, to_onehot -from torchmetrics.utilities.enums import DataType - -from tests.metrics.classification.inputs import _input_binary as _bin -from tests.metrics.classification.inputs import _input_binary_prob as _bin_prob -from tests.metrics.classification.inputs import _input_multiclass as _mc -from tests.metrics.classification.inputs import _input_multiclass_prob as _mc_prob -from tests.metrics.classification.inputs import _input_multidim_multiclass as _mdmc -from tests.metrics.classification.inputs import _input_multidim_multiclass_prob as _mdmc_prob -from tests.metrics.classification.inputs import _input_multilabel as _ml -from tests.metrics.classification.inputs import _input_multilabel_multidim as _mlmd -from tests.metrics.classification.inputs import _input_multilabel_multidim_prob as _mlmd_prob -from tests.metrics.classification.inputs import _input_multilabel_prob as _ml_prob -from tests.metrics.classification.inputs import Input -from tests.metrics.utils import BATCH_SIZE, EXTRA_DIM, NUM_BATCHES, NUM_CLASSES, THRESHOLD - -torch.manual_seed(42) - -# Some additional inputs to test on -_ml_prob_half = Input(_ml_prob.preds.half(), _ml_prob.target) - -_mc_prob_2cls_preds = rand(NUM_BATCHES, BATCH_SIZE, 2) -_mc_prob_2cls_preds /= _mc_prob_2cls_preds.sum(dim=2, keepdim=True) 
-_mc_prob_2cls = Input(_mc_prob_2cls_preds, randint(high=2, size=(NUM_BATCHES, BATCH_SIZE))) - -_mdmc_prob_many_dims_preds = rand(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES, EXTRA_DIM, EXTRA_DIM) -_mdmc_prob_many_dims_preds /= _mdmc_prob_many_dims_preds.sum(dim=2, keepdim=True) -_mdmc_prob_many_dims = Input( - _mdmc_prob_many_dims_preds, - randint(high=2, size=(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM, EXTRA_DIM)), -) - -_mdmc_prob_2cls_preds = rand(NUM_BATCHES, BATCH_SIZE, 2, EXTRA_DIM) -_mdmc_prob_2cls_preds /= _mdmc_prob_2cls_preds.sum(dim=2, keepdim=True) -_mdmc_prob_2cls = Input(_mdmc_prob_2cls_preds, randint(high=2, size=(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM))) - -# Some utils -T = torch.Tensor - - -def _idn(x): - return x - - -def _usq(x): - return x.unsqueeze(-1) - - -def _thrs(x): - return x >= THRESHOLD - - -def _rshp1(x): - return x.reshape(x.shape[0], -1) - - -def _rshp2(x): - return x.reshape(x.shape[0], x.shape[1], -1) - - -def _onehot(x): - return to_onehot(x, NUM_CLASSES) - - -def _onehot2(x): - return to_onehot(x, 2) - - -def _top1(x): - return select_topk(x, 1) - - -def _top2(x): - return select_topk(x, 2) - - -# To avoid ugly black line wrapping -def _ml_preds_tr(x): - return _rshp1(_thrs(x)) - - -def _onehot_rshp1(x): - return _onehot(_rshp1(x)) - - -def _onehot2_rshp1(x): - return _onehot2(_rshp1(x)) - - -def _top1_rshp2(x): - return _top1(_rshp2(x)) - - -def _top2_rshp2(x): - return _top2(_rshp2(x)) - - -def _probs_to_mc_preds_tr(x): - return _onehot2(_thrs(x)) - - -def _mlmd_prob_to_mc_preds_tr(x): - return _onehot2(_rshp1(_thrs(x))) - - -######################## -# Test correct inputs -######################## - - -@pytest.mark.parametrize( - "inputs, num_classes, is_multiclass, top_k, exp_mode, post_preds, post_target", - [ - ############################# - # Test usual expected cases - (_bin, None, False, None, "multi-class", _usq, _usq), - (_bin, 1, False, None, "multi-class", _usq, _usq), - (_bin_prob, None, None, None, "binary", lambda x: _usq(_thrs(x)), _usq), - (_ml_prob, None, None, None, "multi-label", _thrs, _idn), - (_ml, None, False, None, "multi-dim multi-class", _idn, _idn), - (_ml_prob, None, None, None, "multi-label", _ml_preds_tr, _rshp1), - (_ml_prob, None, None, 2, "multi-label", _top2, _rshp1), - (_mlmd, None, False, None, "multi-dim multi-class", _rshp1, _rshp1), - (_mc, NUM_CLASSES, None, None, "multi-class", _onehot, _onehot), - (_mc_prob, None, None, None, "multi-class", _top1, _onehot), - (_mc_prob, None, None, 2, "multi-class", _top2, _onehot), - (_mdmc, NUM_CLASSES, None, None, "multi-dim multi-class", _onehot, _onehot), - (_mdmc_prob, None, None, None, "multi-dim multi-class", _top1_rshp2, _onehot), - (_mdmc_prob, None, None, 2, "multi-dim multi-class", _top2_rshp2, _onehot), - (_mdmc_prob_many_dims, None, None, None, "multi-dim multi-class", _top1_rshp2, _onehot_rshp1), - (_mdmc_prob_many_dims, None, None, 2, "multi-dim multi-class", _top2_rshp2, _onehot_rshp1), - ########################### - # Test some special cases - # Make sure that half precision works, i.e. 
is converted to full precision - (_ml_prob_half, None, None, None, "multi-label", lambda x: _ml_preds_tr(x.float()), _rshp1), - # Binary as multiclass - (_bin, None, None, None, "multi-class", _onehot2, _onehot2), - # Binary probs as multiclass - (_bin_prob, None, True, None, "binary", _probs_to_mc_preds_tr, _onehot2), - # Multilabel as multiclass - (_ml, None, True, None, "multi-dim multi-class", _onehot2, _onehot2), - # Multilabel probs as multiclass - (_ml_prob, None, True, None, "multi-label", _probs_to_mc_preds_tr, _onehot2), - # Multidim multilabel as multiclass - (_mlmd, None, True, None, "multi-dim multi-class", _onehot2_rshp1, _onehot2_rshp1), - # Multidim multilabel probs as multiclass - (_mlmd_prob, None, True, None, "multi-label", _mlmd_prob_to_mc_preds_tr, _onehot2_rshp1), - # Multiclass prob with 2 classes as binary - (_mc_prob_2cls, None, False, None, "multi-class", lambda x: _top1(x)[:, [1]], _usq), - # Multi-dim multi-class with 2 classes as multi-label - (_mdmc_prob_2cls, None, False, None, "multi-dim multi-class", lambda x: _top1(x)[:, 1], _idn), - ], -) -def test_usual_cases(inputs, num_classes, is_multiclass, top_k, exp_mode, post_preds, post_target): - - def __get_data_type_enum(str_exp_mode): - return next(DataType[n] for n in dir(DataType) if DataType[n] == str_exp_mode) - - for exp_mode in (exp_mode, __get_data_type_enum(exp_mode)): - preds_out, target_out, mode = _input_format_classification( - preds=inputs.preds[0], - target=inputs.target[0], - threshold=THRESHOLD, - num_classes=num_classes, - is_multiclass=is_multiclass, - top_k=top_k, - ) - - assert mode == exp_mode - assert torch.equal(preds_out, post_preds(inputs.preds[0]).int()) - assert torch.equal(target_out, post_target(inputs.target[0]).int()) - - # Test that things work when batch_size = 1 - preds_out, target_out, mode = _input_format_classification( - preds=inputs.preds[0][[0], ...], - target=inputs.target[0][[0], ...], - threshold=THRESHOLD, - num_classes=num_classes, - is_multiclass=is_multiclass, - top_k=top_k, - ) - - assert mode == exp_mode - assert torch.equal(preds_out, post_preds(inputs.preds[0][[0], ...]).int()) - assert torch.equal(target_out, post_target(inputs.target[0][[0], ...]).int()) - - -# Test that threshold is correctly applied -def test_threshold(): - target = T([1, 1, 1]).int() - preds_probs = T([0.5 - 1e-5, 0.5, 0.5 + 1e-5]) - - preds_probs_out, _, _ = _input_format_classification(preds_probs, target, threshold=0.5) - - assert torch.equal(torch.tensor([0, 1, 1], dtype=torch.int), preds_probs_out.squeeze().int()) - - -######################################################################## -# Test incorrect inputs -######################################################################## - - -@pytest.mark.parametrize("threshold", [-0.5, 0.0, 1.0, 1.5]) -def test_incorrect_threshold(threshold): - preds, target = rand(size=(7, )), randint(high=2, size=(7, )) - with pytest.raises(ValueError): - _input_format_classification(preds, target, threshold=threshold) - - -@pytest.mark.parametrize( - "preds, target, num_classes, is_multiclass", - [ - # Target not integer - (randint(high=2, size=(7, )), randint(high=2, size=(7, )).float(), None, None), - # Target negative - (randint(high=2, size=(7, )), -randint(high=2, size=(7, )), None, None), - # Preds negative integers - (-randint(high=2, size=(7, )), randint(high=2, size=(7, )), None, None), - # Negative probabilities - (-rand(size=(7, )), randint(high=2, size=(7, )), None, None), - # is_multiclass=False and target > 1 - (rand(size=(7, )), 
randint(low=2, high=4, size=(7, )), None, False), - # is_multiclass=False and preds integers with > 1 - (randint(low=2, high=4, size=(7, )), randint(high=2, size=(7, )), None, False), - # Wrong batch size - (randint(high=2, size=(8, )), randint(high=2, size=(7, )), None, None), - # Completely wrong shape - (randint(high=2, size=(7, )), randint(high=2, size=(7, 4)), None, None), - # Same #dims, different shape - (randint(high=2, size=(7, 3)), randint(high=2, size=(7, 4)), None, None), - # Same shape and preds floats, target not binary - (rand(size=(7, 3)), randint(low=2, high=4, size=(7, 3)), None, None), - # #dims in preds = 1 + #dims in target, C shape not second or last - (rand(size=(7, 3, 4, 3)), randint(high=4, size=(7, 3, 3)), None, None), - # #dims in preds = 1 + #dims in target, preds not float - (randint(high=2, size=(7, 3, 3, 4)), randint(high=4, size=(7, 3, 3)), None, None), - # is_multiclass=False, with C dimension > 2 - (_mc_prob.preds[0], randint(high=2, size=(BATCH_SIZE, )), None, False), - # Probs of multiclass preds do not sum up to 1 - (rand(size=(7, 3, 5)), randint(high=2, size=(7, 5)), None, None), - # Max target larger or equal to C dimension - (_mc_prob.preds[0], randint(low=NUM_CLASSES + 1, high=100, size=(BATCH_SIZE, )), None, None), - # C dimension not equal to num_classes - (_mc_prob.preds[0], _mc_prob.target[0], NUM_CLASSES + 1, None), - # Max target larger than num_classes (with #dim preds = 1 + #dims target) - (_mc_prob.preds[0], randint(low=NUM_CLASSES + 1, high=100, size=(BATCH_SIZE, NUM_CLASSES)), 4, None), - # Max target larger than num_classes (with #dim preds = #dims target) - (randint(high=4, size=(7, 3)), randint(low=5, high=7, size=(7, 3)), 4, None), - # Max preds larger than num_classes (with #dim preds = #dims target) - (randint(low=5, high=7, size=(7, 3)), randint(high=4, size=(7, 3)), 4, None), - # Num_classes=1, but is_multiclass not false - (randint(high=2, size=(7, )), randint(high=2, size=(7, )), 1, None), - # is_multiclass=False, but implied class dimension (for multi-label, from shape) != num_classes - (randint(high=2, size=(7, 3, 3)), randint(high=2, size=(7, 3, 3)), 4, False), - # Multilabel input with implied class dimension != num_classes - (rand(size=(7, 3, 3)), randint(high=2, size=(7, 3, 3)), 4, False), - # Multilabel input with is_multiclass=True, but num_classes != 2 (or None) - (rand(size=(7, 3)), randint(high=2, size=(7, 3)), 4, True), - # Binary input, num_classes > 2 - (rand(size=(7, )), randint(high=2, size=(7, )), 4, None), - # Binary input, num_classes == 2 and is_multiclass not True - (rand(size=(7, )), randint(high=2, size=(7, )), 2, None), - (rand(size=(7, )), randint(high=2, size=(7, )), 2, False), - # Binary input, num_classes == 1 and is_multiclass=True - (rand(size=(7, )), randint(high=2, size=(7, )), 1, True), - ], -) -def test_incorrect_inputs(preds, target, num_classes, is_multiclass): - with pytest.raises(ValueError): - _input_format_classification( - preds=preds, target=target, threshold=THRESHOLD, num_classes=num_classes, is_multiclass=is_multiclass - ) - - -@pytest.mark.parametrize( - "preds, target, num_classes, is_multiclass, top_k", - [ - # Topk set with non (md)mc or ml prob data - (_bin.preds[0], _bin.target[0], None, None, 2), - (_bin_prob.preds[0], _bin_prob.target[0], None, None, 2), - (_mc.preds[0], _mc.target[0], None, None, 2), - (_ml.preds[0], _ml.target[0], None, None, 2), - (_mlmd.preds[0], _mlmd.target[0], None, None, 2), - (_mdmc.preds[0], _mdmc.target[0], None, None, 2), - # top_k = 0 - 
(_mc_prob_2cls.preds[0], _mc_prob_2cls.target[0], None, None, 0), - # top_k = float - (_mc_prob_2cls.preds[0], _mc_prob_2cls.target[0], None, None, 0.123), - # top_k =2 with 2 classes, is_multiclass=False - (_mc_prob_2cls.preds[0], _mc_prob_2cls.target[0], None, False, 2), - # top_k = number of classes (C dimension) - (_mc_prob.preds[0], _mc_prob.target[0], None, None, NUM_CLASSES), - # is_multiclass = True for ml prob inputs, top_k set - (_ml_prob.preds[0], _ml_prob.target[0], None, True, 2), - # top_k = num_classes for ml prob inputs - (_ml_prob.preds[0], _ml_prob.target[0], None, True, NUM_CLASSES), - ], -) -def test_incorrect_inputs_topk(preds, target, num_classes, is_multiclass, top_k): - with pytest.raises(ValueError): - _input_format_classification( - preds=preds, - target=target, - threshold=THRESHOLD, - num_classes=num_classes, - is_multiclass=is_multiclass, - top_k=top_k, - ) diff --git a/tests/metrics/classification/test_iou.py b/tests/metrics/classification/test_iou.py deleted file mode 100644 index 6bb100f68165a..0000000000000 --- a/tests/metrics/classification/test_iou.py +++ /dev/null @@ -1,216 +0,0 @@ -from functools import partial - -import numpy as np -import pytest -import torch -from sklearn.metrics import jaccard_score as sk_jaccard_score - -from pytorch_lightning.metrics.classification.iou import IoU -from pytorch_lightning.metrics.functional.iou import iou -from tests.metrics.classification.inputs import _input_binary, _input_binary_prob -from tests.metrics.classification.inputs import _input_multiclass as _input_mcls -from tests.metrics.classification.inputs import _input_multiclass_prob as _input_mcls_prob -from tests.metrics.classification.inputs import _input_multidim_multiclass as _input_mdmc -from tests.metrics.classification.inputs import _input_multidim_multiclass_prob as _input_mdmc_prob -from tests.metrics.classification.inputs import _input_multilabel as _input_mlb -from tests.metrics.classification.inputs import _input_multilabel_prob as _input_mlb_prob -from tests.metrics.utils import MetricTester, NUM_CLASSES, THRESHOLD - - -def _sk_iou_binary_prob(preds, target, average=None): - sk_preds = (preds.view(-1).numpy() >= THRESHOLD).astype(np.uint8) - sk_target = target.view(-1).numpy() - - return sk_jaccard_score(y_true=sk_target, y_pred=sk_preds, average=average) - - -def _sk_iou_binary(preds, target, average=None): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_jaccard_score(y_true=sk_target, y_pred=sk_preds, average=average) - - -def _sk_iou_multilabel_prob(preds, target, average=None): - sk_preds = (preds.view(-1).numpy() >= THRESHOLD).astype(np.uint8) - sk_target = target.view(-1).numpy() - - return sk_jaccard_score(y_true=sk_target, y_pred=sk_preds, average=average) - - -def _sk_iou_multilabel(preds, target, average=None): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_jaccard_score(y_true=sk_target, y_pred=sk_preds, average=average) - - -def _sk_iou_multiclass_prob(preds, target, average=None): - sk_preds = torch.argmax(preds, dim=len(preds.shape) - 1).view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_jaccard_score(y_true=sk_target, y_pred=sk_preds, average=average) - - -def _sk_iou_multiclass(preds, target, average=None): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_jaccard_score(y_true=sk_target, y_pred=sk_preds, average=average) - - -def _sk_iou_multidim_multiclass_prob(preds, target, average=None): - 
sk_preds = torch.argmax(preds, dim=len(preds.shape) - 2).view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_jaccard_score(y_true=sk_target, y_pred=sk_preds, average=average) - - -def _sk_iou_multidim_multiclass(preds, target, average=None): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_jaccard_score(y_true=sk_target, y_pred=sk_preds, average=average) - - -@pytest.mark.parametrize("reduction", ['elementwise_mean', 'none']) -@pytest.mark.parametrize( - "preds, target, sk_metric, num_classes", - [(_input_binary_prob.preds, _input_binary_prob.target, _sk_iou_binary_prob, 2), - (_input_binary.preds, _input_binary.target, _sk_iou_binary, 2), - (_input_mlb_prob.preds, _input_mlb_prob.target, _sk_iou_multilabel_prob, 2), - (_input_mlb.preds, _input_mlb.target, _sk_iou_multilabel, 2), - (_input_mcls_prob.preds, _input_mcls_prob.target, _sk_iou_multiclass_prob, NUM_CLASSES), - (_input_mcls.preds, _input_mcls.target, _sk_iou_multiclass, NUM_CLASSES), - (_input_mdmc_prob.preds, _input_mdmc_prob.target, _sk_iou_multidim_multiclass_prob, NUM_CLASSES), - (_input_mdmc.preds, _input_mdmc.target, _sk_iou_multidim_multiclass, NUM_CLASSES)] -) -class TestIoU(MetricTester): - - @pytest.mark.parametrize("ddp", [True, False]) - @pytest.mark.parametrize("dist_sync_on_step", [True, False]) - def test_confusion_matrix(self, reduction, preds, target, sk_metric, num_classes, ddp, dist_sync_on_step): - average = 'macro' if reduction == 'elementwise_mean' else None # convert tags - self.run_class_metric_test( - ddp=ddp, - preds=preds, - target=target, - metric_class=IoU, - sk_metric=partial(sk_metric, average=average), - dist_sync_on_step=dist_sync_on_step, - metric_args={ - "num_classes": num_classes, - "threshold": THRESHOLD, - "reduction": reduction - } - ) - - def test_confusion_matrix_functional(self, reduction, preds, target, sk_metric, num_classes): - average = 'macro' if reduction == 'elementwise_mean' else None # convert tags - self.run_functional_metric_test( - preds, - target, - metric_functional=iou, - sk_metric=partial(sk_metric, average=average), - metric_args={ - "num_classes": num_classes, - "threshold": THRESHOLD, - "reduction": reduction - } - ) - - -@pytest.mark.parametrize(['half_ones', 'reduction', 'ignore_index', 'expected'], [ - pytest.param(False, 'none', None, torch.Tensor([1, 1, 1])), - pytest.param(False, 'elementwise_mean', None, torch.Tensor([1])), - pytest.param(False, 'none', 0, torch.Tensor([1, 1])), - pytest.param(True, 'none', None, torch.Tensor([0.5, 0.5, 0.5])), - pytest.param(True, 'elementwise_mean', None, torch.Tensor([0.5])), - pytest.param(True, 'none', 0, torch.Tensor([0.5, 0.5])), -]) -def test_iou(half_ones, reduction, ignore_index, expected): - pred = (torch.arange(120) % 3).view(-1, 1) - target = (torch.arange(120) % 3).view(-1, 1) - if half_ones: - pred[:60] = 1 - iou_val = iou( - pred=pred, - target=target, - ignore_index=ignore_index, - reduction=reduction, - ) - assert torch.allclose(iou_val, expected, atol=1e-9) - - -# test `absent_score` -@pytest.mark.parametrize( - ['pred', 'target', 'ignore_index', 'absent_score', 'num_classes', 'expected'], - [ - # Note that -1 is used as the absent_score in almost all tests here to distinguish it from the range of valid - # scores the function can return ([0., 1.] range, inclusive). - # 2 classes, class 0 is correct everywhere, class 1 is absent. 
- pytest.param([0], [0], None, -1., 2, [1., -1.]), - pytest.param([0, 0], [0, 0], None, -1., 2, [1., -1.]), - # absent_score not applied if only class 0 is present and it's the only class. - pytest.param([0], [0], None, -1., 1, [1.]), - # 2 classes, class 1 is correct everywhere, class 0 is absent. - pytest.param([1], [1], None, -1., 2, [-1., 1.]), - pytest.param([1, 1], [1, 1], None, -1., 2, [-1., 1.]), - # When 0 index ignored, class 0 does not get a score (not even the absent_score). - pytest.param([1], [1], 0, -1., 2, [1.0]), - # 3 classes. Only 0 and 2 are present, and are perfectly predicted. 1 should get absent_score. - pytest.param([0, 2], [0, 2], None, -1., 3, [1., -1., 1.]), - pytest.param([2, 0], [2, 0], None, -1., 3, [1., -1., 1.]), - # 3 classes. Only 0 and 1 are present, and are perfectly predicted. 2 should get absent_score. - pytest.param([0, 1], [0, 1], None, -1., 3, [1., 1., -1.]), - pytest.param([1, 0], [1, 0], None, -1., 3, [1., 1., -1.]), - # 3 classes, class 0 is 0.5 IoU, class 1 is 0 IoU (in pred but not target; should not get absent_score), class - # 2 is absent. - pytest.param([0, 1], [0, 0], None, -1., 3, [0.5, 0., -1.]), - # 3 classes, class 0 is 0.5 IoU, class 1 is 0 IoU (in target but not pred; should not get absent_score), class - # 2 is absent. - pytest.param([0, 0], [0, 1], None, -1., 3, [0.5, 0., -1.]), - # Sanity checks with absent_score of 1.0. - pytest.param([0, 2], [0, 2], None, 1.0, 3, [1., 1., 1.]), - pytest.param([0, 2], [0, 2], 0, 1.0, 3, [1., 1.]), - ] -) -def test_iou_absent_score(pred, target, ignore_index, absent_score, num_classes, expected): - iou_val = iou( - pred=torch.tensor(pred), - target=torch.tensor(target), - ignore_index=ignore_index, - absent_score=absent_score, - num_classes=num_classes, - reduction='none', - ) - assert torch.allclose(iou_val, torch.tensor(expected).to(iou_val)) - - -# example data taken from -# https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/metrics/tests/test_ranking.py -@pytest.mark.parametrize( - ['pred', 'target', 'ignore_index', 'num_classes', 'reduction', 'expected'], - [ - # Ignoring an index outside of [0, num_classes-1] should have no effect. - pytest.param([0, 1, 1, 2, 2], [0, 1, 2, 2, 2], None, 3, 'none', [1, 1 / 2, 2 / 3]), - pytest.param([0, 1, 1, 2, 2], [0, 1, 2, 2, 2], -1, 3, 'none', [1, 1 / 2, 2 / 3]), - pytest.param([0, 1, 1, 2, 2], [0, 1, 2, 2, 2], 255, 3, 'none', [1, 1 / 2, 2 / 3]), - # Ignoring a valid index drops only that index from the result. - pytest.param([0, 1, 1, 2, 2], [0, 1, 2, 2, 2], 0, 3, 'none', [1 / 2, 2 / 3]), - pytest.param([0, 1, 1, 2, 2], [0, 1, 2, 2, 2], 1, 3, 'none', [1, 2 / 3]), - pytest.param([0, 1, 1, 2, 2], [0, 1, 2, 2, 2], 2, 3, 'none', [1, 1 / 2]), - # When reducing to mean or sum, the ignored index does not contribute to the output. 
- pytest.param([0, 1, 1, 2, 2], [0, 1, 2, 2, 2], 0, 3, 'elementwise_mean', [7 / 12]), - pytest.param([0, 1, 1, 2, 2], [0, 1, 2, 2, 2], 0, 3, 'sum', [7 / 6]), - ] -) -def test_iou_ignore_index(pred, target, ignore_index, num_classes, reduction, expected): - iou_val = iou( - pred=torch.tensor(pred), - target=torch.tensor(target), - ignore_index=ignore_index, - num_classes=num_classes, - reduction=reduction, - ) - assert torch.allclose(iou_val, torch.tensor(expected).to(iou_val)) diff --git a/tests/metrics/classification/test_stat_scores.py b/tests/metrics/classification/test_stat_scores.py deleted file mode 100644 index 6ccb5abed6711..0000000000000 --- a/tests/metrics/classification/test_stat_scores.py +++ /dev/null @@ -1,255 +0,0 @@ -from functools import partial -from typing import Callable, Optional - -import numpy as np -import pytest -import torch -from sklearn.metrics import multilabel_confusion_matrix -from torchmetrics.classification.checks import _input_format_classification - -from pytorch_lightning.metrics import StatScores -from pytorch_lightning.metrics.functional import stat_scores -from tests.metrics.classification.inputs import _input_binary, _input_binary_prob, _input_multiclass -from tests.metrics.classification.inputs import _input_multiclass_prob as _input_mccls_prob -from tests.metrics.classification.inputs import _input_multidim_multiclass as _input_mdmc -from tests.metrics.classification.inputs import _input_multidim_multiclass_prob as _input_mdmc_prob -from tests.metrics.classification.inputs import _input_multilabel as _input_mcls -from tests.metrics.classification.inputs import _input_multilabel_prob as _input_mlb_prob -from tests.metrics.utils import MetricTester, NUM_CLASSES, THRESHOLD - -torch.manual_seed(42) - - -def _sk_stat_scores(preds, target, reduce, num_classes, is_multiclass, ignore_index, top_k, mdmc_reduce=None): - preds, target, _ = _input_format_classification( - preds, target, threshold=THRESHOLD, num_classes=num_classes, is_multiclass=is_multiclass, top_k=top_k - ) - sk_preds, sk_target = preds.numpy(), target.numpy() - - if reduce != "macro" and ignore_index is not None and preds.shape[1] > 1: - sk_preds = np.delete(sk_preds, ignore_index, 1) - sk_target = np.delete(sk_target, ignore_index, 1) - - if preds.shape[1] == 1 and reduce == "samples": - sk_target = sk_target.T - sk_preds = sk_preds.T - - sk_stats = multilabel_confusion_matrix( - sk_target, sk_preds, samplewise=(reduce == "samples") and preds.shape[1] != 1 - ) - - if preds.shape[1] == 1 and reduce != "samples": - sk_stats = sk_stats[[1]].reshape(-1, 4)[:, [3, 1, 0, 2]] - else: - sk_stats = sk_stats.reshape(-1, 4)[:, [3, 1, 0, 2]] - - if reduce == "micro": - sk_stats = sk_stats.sum(axis=0, keepdims=True) - - sk_stats = np.concatenate([sk_stats, sk_stats[:, [3]] + sk_stats[:, [0]]], 1) - - if reduce == "micro": - sk_stats = sk_stats[0] - - if reduce == "macro" and ignore_index is not None and preds.shape[1]: - sk_stats[ignore_index, :] = -1 - - return sk_stats - - -def _sk_stat_scores_mdim_mcls(preds, target, reduce, mdmc_reduce, num_classes, is_multiclass, ignore_index, top_k): - preds, target, _ = _input_format_classification( - preds, target, threshold=THRESHOLD, num_classes=num_classes, is_multiclass=is_multiclass, top_k=top_k - ) - - if mdmc_reduce == "global": - preds = torch.transpose(preds, 1, 2).reshape(-1, preds.shape[1]) - target = torch.transpose(target, 1, 2).reshape(-1, target.shape[1]) - - return _sk_stat_scores(preds, target, reduce, None, False, ignore_index, top_k) - elif 
mdmc_reduce == "samplewise":
-        scores = []
-
-        for i in range(preds.shape[0]):
-            pred_i = preds[i, ...].T
-            target_i = target[i, ...].T
-            scores_i = _sk_stat_scores(pred_i, target_i, reduce, None, False, ignore_index, top_k)
-
-            scores.append(np.expand_dims(scores_i, 0))
-
-        return np.concatenate(scores)
-
-
-@pytest.mark.parametrize(
-    "reduce, mdmc_reduce, num_classes, inputs, ignore_index",
-    [
-        ["unknown", None, None, _input_binary, None],
-        ["micro", "unknown", None, _input_binary, None],
-        ["macro", None, None, _input_binary, None],
-        ["micro", None, None, _input_mdmc_prob, None],
-        ["micro", None, None, _input_binary_prob, 0],
-        ["micro", None, None, _input_mccls_prob, NUM_CLASSES],
-        ["micro", None, NUM_CLASSES, _input_mccls_prob, NUM_CLASSES],
-    ],
-)
-def test_wrong_params(reduce, mdmc_reduce, num_classes, inputs, ignore_index):
-    """Test a combination of parameters that are invalid and should raise an error.
-
-    This includes invalid ``reduce`` and ``mdmc_reduce`` parameter values, not setting
-    ``num_classes`` when ``reduce='macro'``, not setting ``mdmc_reduce`` when inputs
-    are ``multi-dim multi-class``, setting ``ignore_index`` when inputs are binary, as well
-    as setting ``ignore_index`` to a value higher than the number of classes.
-    """
-    with pytest.raises(ValueError):
-        stat_scores(
-            inputs.preds[0], inputs.target[0], reduce, mdmc_reduce, num_classes=num_classes, ignore_index=ignore_index
-        )
-
-    with pytest.raises(ValueError):
-        sts = StatScores(reduce=reduce, mdmc_reduce=mdmc_reduce, num_classes=num_classes, ignore_index=ignore_index)
-        sts(inputs.preds[0], inputs.target[0])
-
-
-def test_wrong_threshold():
-    with pytest.raises(ValueError):
-        StatScores(threshold=1.5)
-
-
-@pytest.mark.parametrize("ignore_index", [None, 0])
-@pytest.mark.parametrize("reduce", ["micro", "macro", "samples"])
-@pytest.mark.parametrize(
-    "preds, target, sk_fn, mdmc_reduce, num_classes, is_multiclass, top_k",
-    [
-        (_input_binary_prob.preds, _input_binary_prob.target, _sk_stat_scores, None, 1, None, None),
-        (_input_binary.preds, _input_binary.target, _sk_stat_scores, None, 1, False, None),
-        (_input_mlb_prob.preds, _input_mlb_prob.target, _sk_stat_scores, None, NUM_CLASSES, None, None),
-        (_input_mlb_prob.preds, _input_mlb_prob.target, _sk_stat_scores, None, NUM_CLASSES, None, 2),
-        (_input_mcls.preds, _input_mcls.target, _sk_stat_scores, None, NUM_CLASSES, False, None),
-        (_input_mccls_prob.preds, _input_mccls_prob.target, _sk_stat_scores, None, NUM_CLASSES, None, None),
-        (_input_mccls_prob.preds, _input_mccls_prob.target, _sk_stat_scores, None, NUM_CLASSES, None, 2),
-        (_input_multiclass.preds, _input_multiclass.target, _sk_stat_scores, None, NUM_CLASSES, None, None),
-        (_input_mdmc.preds, _input_mdmc.target, _sk_stat_scores_mdim_mcls, "samplewise", NUM_CLASSES, None, None),
-        (
-            _input_mdmc_prob.preds, _input_mdmc_prob.target, _sk_stat_scores_mdim_mcls, "samplewise", NUM_CLASSES, None,
-            None
-        ),
-        (_input_mdmc.preds, _input_mdmc.target, _sk_stat_scores_mdim_mcls, "global", NUM_CLASSES, None, None),
-        (_input_mdmc_prob.preds, _input_mdmc_prob.target, _sk_stat_scores_mdim_mcls, "global", NUM_CLASSES, None, None),
    ],
-)
-class TestStatScores(MetricTester):
-    # DDP tests temporarily disabled due to hanging issues
-    @pytest.mark.parametrize("ddp", [False])
-    @pytest.mark.parametrize("dist_sync_on_step", [True, False])
-    def test_stat_scores_class(
-        self,
-        ddp: bool,
-        dist_sync_on_step: bool,
-        sk_fn: Callable,
-        preds: torch.Tensor,
-        target: torch.Tensor,
-        reduce: str,
-
mdmc_reduce: Optional[str], - num_classes: Optional[int], - is_multiclass: Optional[bool], - ignore_index: Optional[int], - top_k: Optional[int], - ): - if ignore_index is not None and preds.ndim == 2: - pytest.skip("Skipping ignore_index test with binary inputs.") - - self.run_class_metric_test( - ddp=ddp, - preds=preds, - target=target, - metric_class=StatScores, - sk_metric=partial( - sk_fn, - reduce=reduce, - mdmc_reduce=mdmc_reduce, - num_classes=num_classes, - is_multiclass=is_multiclass, - ignore_index=ignore_index, - top_k=top_k, - ), - dist_sync_on_step=dist_sync_on_step, - metric_args={ - "num_classes": num_classes, - "reduce": reduce, - "mdmc_reduce": mdmc_reduce, - "threshold": THRESHOLD, - "is_multiclass": is_multiclass, - "ignore_index": ignore_index, - "top_k": top_k, - }, - check_dist_sync_on_step=True, - check_batch=True, - ) - - def test_stat_scores_fn( - self, - sk_fn: Callable, - preds: torch.Tensor, - target: torch.Tensor, - reduce: str, - mdmc_reduce: Optional[str], - num_classes: Optional[int], - is_multiclass: Optional[bool], - ignore_index: Optional[int], - top_k: Optional[int], - ): - if ignore_index is not None and preds.ndim == 2: - pytest.skip("Skipping ignore_index test with binary inputs.") - - self.run_functional_metric_test( - preds, - target, - metric_functional=stat_scores, - sk_metric=partial( - sk_fn, - reduce=reduce, - mdmc_reduce=mdmc_reduce, - num_classes=num_classes, - is_multiclass=is_multiclass, - ignore_index=ignore_index, - top_k=top_k, - ), - metric_args={ - "num_classes": num_classes, - "reduce": reduce, - "mdmc_reduce": mdmc_reduce, - "threshold": THRESHOLD, - "is_multiclass": is_multiclass, - "ignore_index": ignore_index, - "top_k": top_k, - }, - ) - - -_mc_k_target = torch.tensor([0, 1, 2]) -_mc_k_preds = torch.tensor([[0.35, 0.4, 0.25], [0.1, 0.5, 0.4], [0.2, 0.1, 0.7]]) -_ml_k_target = torch.tensor([[0, 1, 0], [1, 1, 0], [0, 0, 0]]) -_ml_k_preds = torch.tensor([[0.9, 0.2, 0.75], [0.1, 0.7, 0.8], [0.6, 0.1, 0.7]]) - - -@pytest.mark.parametrize( - "k, preds, target, reduce, expected", - [ - (1, _mc_k_preds, _mc_k_target, "micro", torch.tensor([2, 1, 5, 1, 3])), - (2, _mc_k_preds, _mc_k_target, "micro", torch.tensor([3, 3, 3, 0, 3])), - (1, _ml_k_preds, _ml_k_target, "micro", torch.tensor([0, 3, 3, 3, 3])), - (2, _ml_k_preds, _ml_k_target, "micro", torch.tensor([1, 5, 1, 2, 3])), - (1, _mc_k_preds, _mc_k_target, "macro", torch.tensor([[0, 1, 1], [0, 1, 0], [2, 1, 2], [1, 0, 0], [1, 1, 1]])), - (2, _mc_k_preds, _mc_k_target, "macro", torch.tensor([[1, 1, 1], [1, 1, 1], [1, 1, 1], [0, 0, 0], [1, 1, 1]])), - (1, _ml_k_preds, _ml_k_target, "macro", torch.tensor([[0, 0, 0], [1, 0, 2], [1, 1, 1], [1, 2, 0], [1, 2, 0]])), - (2, _ml_k_preds, _ml_k_target, "macro", torch.tensor([[0, 1, 0], [2, 0, 3], [0, 1, 0], [1, 1, 0], [1, 2, 0]])), - ], -) -def test_top_k(k: int, preds: torch.Tensor, target: torch.Tensor, reduce: str, expected: torch.Tensor): - """ A simple test to check that top_k works as expected """ - - class_metric = StatScores(top_k=k, reduce=reduce, num_classes=3) - class_metric.update(preds, target) - - assert torch.equal(class_metric.compute(), expected.T) - assert torch.equal(stat_scores(preds, target, top_k=k, reduce=reduce, num_classes=3), expected.T) diff --git a/tests/metrics/test_remove_1-5_metrics.py b/tests/metrics/test_remove_1-5_metrics.py index 41ccfb6da8015..339d07b163632 100644 --- a/tests/metrics/test_remove_1-5_metrics.py +++ b/tests/metrics/test_remove_1-5_metrics.py @@ -21,21 +21,33 @@ AUC, AUROC, AveragePrecision, + 
ConfusionMatrix, + F1, + FBeta, + HammingDistance, + IoU, MetricCollection, Precision, PrecisionRecallCurve, Recall, ROC, + StatScores, ) from pytorch_lightning.metrics.functional import ( auc, auroc, average_precision, + confusion_matrix, + f1, + fbeta, + hamming_distance, + iou, precision, precision_recall, precision_recall_curve, recall, roc, + stat_scores, ) from pytorch_lightning.metrics.functional.accuracy import accuracy from pytorch_lightning.metrics.utils import get_num_classes, select_topk, to_categorical, to_onehot @@ -162,3 +174,66 @@ def test_v1_5_metric_precision_recall(): assert torch.equal(prec, torch.tensor([1., 1., 1., 1.])) assert torch.allclose(rc, torch.tensor([1., 0.6667, 0.3333, 0.]), atol=1e-4) assert torch.equal(thrs, torch.tensor([1, 2, 3])) + + +def test_v1_5_metric_classif_mix(): + ConfusionMatrix.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + ConfusionMatrix(num_classes=1) + + FBeta.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + FBeta(num_classes=1) + + F1.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + F1(num_classes=1) + + HammingDistance.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + HammingDistance() + + StatScores.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + StatScores() + + target = torch.tensor([1, 1, 0, 0]) + preds = torch.tensor([0, 1, 0, 0]) + confusion_matrix.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + assert torch.equal(confusion_matrix(preds, target, num_classes=2), torch.tensor([[2., 0.], [1., 1.]])) + + target = torch.tensor([0, 1, 2, 0, 1, 2]) + preds = torch.tensor([0, 2, 1, 0, 0, 1]) + fbeta.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + assert torch.allclose(fbeta(preds, target, num_classes=3, beta=0.5), torch.tensor(0.3333), atol=1e-4) + + f1.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + assert torch.allclose(f1(preds, target, num_classes=3), torch.tensor(0.3333), atol=1e-4) + + target = torch.tensor([[0, 1], [1, 1]]) + preds = torch.tensor([[0, 1], [0, 1]]) + hamming_distance.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + assert hamming_distance(preds, target) == torch.tensor(0.25) + + preds = torch.tensor([1, 0, 2, 1]) + target = torch.tensor([1, 1, 2, 0]) + stat_scores.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + assert torch.equal(stat_scores(preds, target, reduce='micro'), torch.tensor([2, 2, 6, 2, 4])) + + +def test_v1_5_metric_detect(): + IoU.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + IoU(num_classes=1) + + target = torch.randint(0, 2, (10, 25, 25)) + pred = torch.tensor(target) + pred[2:5, 7:13, 9:15] = 1 - pred[2:5, 7:13, 9:15] + iou.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + assert torch.allclose(iou(pred, target), torch.tensor(0.9660), atol=1e-4)
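
Editor's note: every shim in this patch funnels through the `deprecated` decorator from pytorch_lightning.utilities.deprecation, whose implementation is not part of this diff. The sketch below is only a minimal illustration of the forwarding idea, under stated assumptions: the name `_deprecated_sketch` and the exact message format are hypothetical, and the real decorator additionally knows how to apply to `__init__` methods and map their arguments onto the target class, as the class shims above use it. The `warned` attribute mirrors the flag the tests reset via e.g. `stat_scores.warned = False`.

    # Minimal sketch of the forwarding pattern; NOT the actual implementation
    # of pytorch_lightning.utilities.deprecation.deprecated. All names here
    # are illustrative assumptions based only on the behaviour visible above.
    import functools
    import warnings
    from typing import Any, Callable


    def _deprecated_sketch(target: Callable, ver_deprecate: str, ver_remove: str) -> Callable:
        """Warn once, then forward every call to ``target``."""

        def decorator(fn: Callable) -> Callable:

            @functools.wraps(fn)
            def wrapper(*args: Any, **kwargs: Any) -> Any:
                if not wrapper.warned:
                    warnings.warn(
                        f"`{fn.__qualname__}` was deprecated in v{ver_deprecate} in favor of"
                        f" `{target.__module__}.{target.__qualname__}`."
                        f" It will be removed in v{ver_remove}.",
                        DeprecationWarning,
                    )
                    wrapper.warned = True
                # Delegate the actual computation to the torchmetrics target;
                # the wrapped shim body is never executed.
                return target(*args, **kwargs)

            wrapper.warned = False
            return wrapper

        return decorator

With such a decorator the call never reaches the wrapped body, which is why the shims in this patch reduce to an empty function carrying only a docstring, and why warning once per wrapped callable keeps repeated metric updates from flooding logs while still letting the tests above assert the deprecation message.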
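
For callers migrating off these aliases, the torchmetrics replacements accept the same arguments. A short usage sketch, reusing the exact values asserted in test_v1_5_metric_classif_mix above and assuming a torchmetrics release contemporary with this patch, where the functional `stat_scores` still takes `reduce=`:

    # Calling torchmetrics directly instead of the deprecated
    # pytorch_lightning.metrics aliases; values mirror the tests above.
    import torch
    from torchmetrics import ConfusionMatrix
    from torchmetrics.functional import stat_scores

    preds = torch.tensor([1, 0, 2, 1])
    target = torch.tensor([1, 1, 2, 0])
    # [tp, fp, tn, fn, support] summed over all classes -> tensor([2, 2, 6, 2, 4])
    print(stat_scores(preds, target, reduce="micro"))

    confmat = ConfusionMatrix(num_classes=2)
    # Matches the expected tensor([[2., 0.], [1., 1.]]) asserted above.
    print(confmat(torch.tensor([0, 1, 0, 0]), torch.tensor([1, 1, 0, 0])))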