From c25bc659a542db2619a8ceffae8e247000e7e582 Mon Sep 17 00:00:00 2001 From: hassiahk Date: Thu, 13 May 2021 17:22:23 +0530 Subject: [PATCH 01/12] Added Minimal Implementation --- .../functional/classification/kldivergence.py | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 torchmetrics/functional/classification/kldivergence.py diff --git a/torchmetrics/functional/classification/kldivergence.py b/torchmetrics/functional/classification/kldivergence.py new file mode 100644 index 00000000000..11fb6cd0137 --- /dev/null +++ b/torchmetrics/functional/classification/kldivergence.py @@ -0,0 +1,38 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from torch import Tensor +import torch + +from torchmetrics.utilities.data import METRIC_EPS + + +def _kld_update(preds: Tensor, target: Tensor): + preds = torch.clamp(preds, METRIC_EPS) + target = torch.clamp(target, METRIC_EPS) + + total = preds.numel() + + measures = torch.sum(target * torch.log(target / preds), axis=-1) + + return measures, total + + +def _kld_compute(measures: Tensor, total: Tensor): + return measures / total + + +def kldivergence(preds: Tensor, target: Tensor): + measures, total = _kld_update(preds, target) + return _kld_compute(measures, total) From 2fd3eade3f761042b23ebd06d68f8a3637077c7a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 13 May 2021 11:55:14 +0000 Subject: [PATCH 02/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- torchmetrics/functional/classification/kldivergence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchmetrics/functional/classification/kldivergence.py b/torchmetrics/functional/classification/kldivergence.py index 11fb6cd0137..56b5cfc72b6 100644 --- a/torchmetrics/functional/classification/kldivergence.py +++ b/torchmetrics/functional/classification/kldivergence.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from torch import Tensor import torch +from torch import Tensor from torchmetrics.utilities.data import METRIC_EPS From 987db8272b0a36b0b36b5c37a4be2e85745839a4 Mon Sep 17 00:00:00 2001 From: hassiahk Date: Mon, 17 May 2021 17:03:23 +0530 Subject: [PATCH 03/12] Requested Changes --- torchmetrics/functional/classification/__init__.py | 1 + .../functional/classification/kldivergence.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/torchmetrics/functional/classification/__init__.py b/torchmetrics/functional/classification/__init__.py index ddccef949b5..205f67b8b67 100644 --- a/torchmetrics/functional/classification/__init__.py +++ b/torchmetrics/functional/classification/__init__.py @@ -22,6 +22,7 @@ from torchmetrics.functional.classification.hamming_distance import hamming_distance # noqa: F401 from torchmetrics.functional.classification.hinge import hinge # noqa: F401 from torchmetrics.functional.classification.iou import iou # noqa: F401 +from torchmetrics.functional.classification.kldivergence import kldivergence # noqa: F401 from torchmetrics.functional.classification.matthews_corrcoef import matthews_corrcoef # noqa: F401 from torchmetrics.functional.classification.precision_recall import precision, precision_recall, recall # noqa: F401 from torchmetrics.functional.classification.precision_recall_curve import precision_recall_curve # noqa: F401 diff --git a/torchmetrics/functional/classification/kldivergence.py b/torchmetrics/functional/classification/kldivergence.py index 56b5cfc72b6..f61834c4db5 100644 --- a/torchmetrics/functional/classification/kldivergence.py +++ b/torchmetrics/functional/classification/kldivergence.py @@ -12,13 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import Tuple + import torch from torch import Tensor +from torchmetrics.utilities.checks import _check_same_shape from torchmetrics.utilities.data import METRIC_EPS -def _kld_update(preds: Tensor, target: Tensor): +def _kld_update(preds: Tensor, target: Tensor) -> Tuple[Tensor, int]: + _check_same_shape(preds, target) + + preds = preds / preds.sum(axis=-1) + target = target / target.sum(axis=-1) + preds = torch.clamp(preds, METRIC_EPS) target = torch.clamp(target, METRIC_EPS) @@ -29,10 +37,10 @@ def _kld_update(preds: Tensor, target: Tensor): return measures, total -def _kld_compute(measures: Tensor, total: Tensor): +def _kld_compute(measures: Tensor, total: Tensor) -> Tensor: return measures / total -def kldivergence(preds: Tensor, target: Tensor): +def kldivergence(preds: Tensor, target: Tensor) -> Tensor: measures, total = _kld_update(preds, target) return _kld_compute(measures, total) From 03b7e2dfcae604bfcef0a59b588622ece7ac3b5a Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Sat, 26 Jun 2021 14:31:15 +0200 Subject: [PATCH 04/12] add class interface --- torchmetrics/__init__.py | 1 + torchmetrics/classification/__init__.py | 1 + torchmetrics/classification/kldivergence.py | 100 ++++++++++++++++++ torchmetrics/functional/__init__.py | 1 + .../functional/classification/kldivergence.py | 66 +++++++++--- 5 files changed, 154 insertions(+), 15 deletions(-) create mode 100644 torchmetrics/classification/kldivergence.py diff --git a/torchmetrics/__init__.py b/torchmetrics/__init__.py index 56e2c990dc9..2dd858fd850 100644 --- a/torchmetrics/__init__.py +++ b/torchmetrics/__init__.py @@ -28,6 +28,7 @@ HammingDistance, Hinge, IoU, + KLDivergence, MatthewsCorrcoef, Precision, PrecisionRecallCurve, diff --git a/torchmetrics/classification/__init__.py b/torchmetrics/classification/__init__.py index 0088fb6ecf5..36ebd20781b 100644 --- a/torchmetrics/classification/__init__.py +++ b/torchmetrics/classification/__init__.py @@ -24,6 +24,7 @@ from torchmetrics.classification.hamming_distance import HammingDistance # noqa: F401 from torchmetrics.classification.hinge import Hinge # noqa: F401 from torchmetrics.classification.iou import IoU # noqa: F401 +from torchmetrics.classification.kldivergence import KLDivergence # noqa: F401 from torchmetrics.classification.matthews_corrcoef import MatthewsCorrcoef # noqa: F401 from torchmetrics.classification.precision_recall import Precision, Recall # noqa: F401 from torchmetrics.classification.precision_recall_curve import PrecisionRecallCurve # noqa: F401 diff --git a/torchmetrics/classification/kldivergence.py b/torchmetrics/classification/kldivergence.py new file mode 100644 index 00000000000..6425bb3de4a --- /dev/null +++ b/torchmetrics/classification/kldivergence.py @@ -0,0 +1,100 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Any, Callable, Optional + +import torch +from torch import Tensor, tensor + +from torchmetrics.functional.classification.kldivergence import _kld_update, _kld_compute +from torchmetrics.metric import Metric + + +class KLDivergence(Metric): + r"""Computes the `KL divergence `_: + + .. math:: + D_{KL}(P||Q) = \sum_{x\in\mathcal{X}} P(x) \log\frac{P(x)}{Q(x)} + + Where :math:`P` and :math:`Q` are probability distributions where :math:`P` usually represents a distribution + over data and :math:`Q` is often a prior or approximation of :math:`P`. It should be noted that the KL divergence + is a none symetrical measure. + + Args: + p: data distribution with shape ``[N, d]`` + q: prior or approximate distribution with shape ``[N, d]`` + log_prob: bool indicating if input is log-probabilities or probabilities. If given as probabilities, + will normalize to make sure the distributes sum to 1 + reduction: + Determines how to reduce over the ``N``/batch dimension: + + - ``'mean'`` [default]: Averages score across samples + - ``'sum'``: Sum score across samples + - ``'none'`` or ``None``: Returns score per sample + + Raises: + TypeError: + If ``log_prob`` is not a ``bool`` + ValueError: + If ``reduction`` is not one of ``'mean'``, ``'sum'``, ``'none'`` or ``None`` + + + Example: + >>> import torch + >>> from torchmetrics.functional import kldivergence + >>> p = torch.randn(([[0.36, 0.48, 0.16]]) + >>> q = torch.tensor([[1/3, 1/3, 1/3]]) + >>> kldivergence(p, q) + tensor(0.085) + """ + + def __init__( + self, + log_prob: bool = False, + reduction: Optional[str] = 'mean', + compute_on_step: bool = True, + dist_sync_on_step: bool = False, + process_group: Optional[Any] = None, + dist_sync_fn: Callable = None, + ): + super().__init__( + compute_on_step=compute_on_step, + dist_sync_on_step=dist_sync_on_step, + process_group=process_group, + dist_sync_fn=dist_sync_fn, + ) + if not isinstance(log_prob, bool): + raise TypeError(f'Expected argument `log_prob` to be bool but got {log_prob}') + self.log_prob = log_prob + + allowed_reduction = ['mean', 'sum', 'none', None] + if reduction not in allowed_reduction: + raise ValueError(f"Expected argument `reduction` to be one of {allowed_reduction} but got {reduction}") + self.recduction = reduction + + if self.reduction in ['mean', 'sum']: + self.add_state('measures', torch.zeros(1), dist_reduce_fx='sum') + self.add_state('total', torch.zeros(1), dist_reduce_fx='sum') + else: + self.add_state('measures', [], dist_reduce_fx='cat') + + def update(self, p: Tensor, q: Tensor) -> None: # type: ignore + measures, total = _kld_update(p, q, self.log_prob) + if self.reduction is None or self.reduction == 'none': + self.measures.append(measures) + else: + self.measures += measures + self.total += total + + def compute(self) -> Tensor: + return _kld_compute(self.measures, self.total, self.reduction) \ No newline at end of file diff --git a/torchmetrics/functional/__init__.py b/torchmetrics/functional/__init__.py index 939987268b5..be71df17864 100644 --- a/torchmetrics/functional/__init__.py +++ b/torchmetrics/functional/__init__.py @@ -22,6 +22,7 @@ from torchmetrics.functional.classification.hamming_distance import hamming_distance # noqa: F401 from torchmetrics.functional.classification.hinge import hinge # noqa: F401 from torchmetrics.functional.classification.iou import iou # noqa: F401 +from torchmetrics.functional.classification.kldivergence import kldivergence # noqa: F401 from torchmetrics.functional.classification.matthews_corrcoef import 
matthews_corrcoef # noqa: F401 from torchmetrics.functional.classification.precision_recall import precision, precision_recall, recall # noqa: F401 from torchmetrics.functional.classification.precision_recall_curve import precision_recall_curve # noqa: F401 diff --git a/torchmetrics/functional/classification/kldivergence.py b/torchmetrics/functional/classification/kldivergence.py index f61834c4db5..39fb08a054b 100644 --- a/torchmetrics/functional/classification/kldivergence.py +++ b/torchmetrics/functional/classification/kldivergence.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Tuple +from typing import Tuple, Optional import torch from torch import Tensor @@ -21,26 +21,62 @@ from torchmetrics.utilities.data import METRIC_EPS -def _kld_update(preds: Tensor, target: Tensor) -> Tuple[Tensor, int]: - _check_same_shape(preds, target) +def _kld_update(p: Tensor, q: Tensor, log_prob: bool) -> Tuple[Tensor, int]: + _check_same_shape(p, q) + if p.ndim != 2 or q.ndim != 2: + raise ValueError(f"Expected both p and q distribution to be 2D but got {p.ndim} and {q.ndim} respectively") - preds = preds / preds.sum(axis=-1) - target = target / target.sum(axis=-1) + total = p.shape[0] + if log_prob: + measures = torch.sum(p.exp() * (p - q), axis=-1) + else: + p = p / p.sum(axis=-1) + q = q / q.sum(axis=-1) + q = torch.clamp(q, METRIC_EPS) + measures = torch.sum(p * torch.log(p / q), axis=-1) - preds = torch.clamp(preds, METRIC_EPS) - target = torch.clamp(target, METRIC_EPS) + return measures, total - total = preds.numel() - measures = torch.sum(target * torch.log(target / preds), axis=-1) +def _kld_compute(measures: Tensor, total: Tensor, reduction: Optional[str] = 'mean') -> Tensor: + if reduction == 'sum': + return measures.sum() + elif reduction == 'mean': + return measures.sum() / total + elif reduction is None or reduction == 'none': + return measures + return measures / total - return measures, total +def kldivergence(p: Tensor, q: Tensor, log_prob: bool = False, reduction: Optional[str] = 'mean') -> Tensor: + r"""Computes the `KL divergence `_: -def _kld_compute(measures: Tensor, total: Tensor) -> Tensor: - return measures / total + .. math:: + D_{KL}(P||Q) = \sum_{x\in\mathcal{X}} P(x) \log\frac{P(x)}{Q(x)} + + Where :math:`P` and :math:`Q` are probability distributions where :math:`P` usually represents a distribution + over data and :math:`Q` is often a prior or approximation of :math:`P`. It should be noted that the KL divergence + is a none symetrical measure. + + Args: + p: data distribution with shape ``[N, d]`` + q: prior or approximate distribution with shape ``[N, d]`` + log_prob: bool indicating if input is log-probabilities or probabilities. 
If given as probabilities, + will normalize to make sure the distributes sum to 1 + reduction: + Determines how to reduce over the ``N``/batch dimension: + - ``'mean'`` [default]: Averages score across samples + - ``'sum'``: Sum score across samples + - ``'none'`` or ``None``: Returns score per sample -def kldivergence(preds: Tensor, target: Tensor) -> Tensor: - measures, total = _kld_update(preds, target) - return _kld_compute(measures, total) + Example: + >>> import torch + >>> from torchmetrics.functional import kldivergence + >>> p = torch.randn(([[0.36, 0.48, 0.16]]) + >>> q = torch.tensor([[1/3, 1/3, 1/3]]) + >>> kldivergence(p, q) + tensor(0.085) + """ + measures, total = _kld_update(p, q, log_prob) + return _kld_compute(measures, total, reduction) From 181865b76ced9e1f10125b12d350fc0d1dac332a Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Sat, 26 Jun 2021 14:34:05 +0200 Subject: [PATCH 05/12] docs --- docs/source/references/functional.rst | 6 ++++++ docs/source/references/modules.rst | 6 ++++++ torchmetrics/classification/kldivergence.py | 6 +++++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/docs/source/references/functional.rst b/docs/source/references/functional.rst index 7de8d14c065..f183bd3bd5d 100644 --- a/docs/source/references/functional.rst +++ b/docs/source/references/functional.rst @@ -89,6 +89,12 @@ iou [func] .. autofunction:: torchmetrics.functional.iou :noindex: +kldivergence [func] +~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: torchmetrics.functional.kldivergence + :noindex: + matthews_corrcoef [func] ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/references/modules.rst b/docs/source/references/modules.rst index 88772aa9698..590ff491dfa 100644 --- a/docs/source/references/modules.rst +++ b/docs/source/references/modules.rst @@ -204,6 +204,12 @@ IoU .. autoclass:: torchmetrics.IoU :noindex: +KLDivergence +~~~~~~~~~~~~ + +.. autoclass:: torchmetrics.KLDivergence + :noindex: + MatthewsCorrcoef ~~~~~~~~~~~~~~~~ diff --git a/torchmetrics/classification/kldivergence.py b/torchmetrics/classification/kldivergence.py index 6425bb3de4a..a1b902eae45 100644 --- a/torchmetrics/classification/kldivergence.py +++ b/torchmetrics/classification/kldivergence.py @@ -97,4 +97,8 @@ def update(self, p: Tensor, q: Tensor) -> None: # type: ignore self.total += total def compute(self) -> Tensor: - return _kld_compute(self.measures, self.total, self.reduction) \ No newline at end of file + return _kld_compute(self.measures, self.total, self.reduction) + + @property + def is_differentiable(self) -> bool: + return True From 932981ca386d3f90352fa7f717c4f7050613fe7a Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Sat, 26 Jun 2021 15:03:22 +0200 Subject: [PATCH 06/12] add test --- tests/classification/test_kldivergence.py | 122 ++++++++++++++++++ torchmetrics/classification/kldivergence.py | 4 +- .../functional/classification/kldivergence.py | 2 +- 3 files changed, 125 insertions(+), 3 deletions(-) create mode 100644 tests/classification/test_kldivergence.py diff --git a/tests/classification/test_kldivergence.py b/tests/classification/test_kldivergence.py new file mode 100644 index 00000000000..8081bf7ae36 --- /dev/null +++ b/tests/classification/test_kldivergence.py @@ -0,0 +1,122 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from collections import namedtuple +from functools import partial +from typing import Optional + +import numpy as np +import pytest +import torch +from scipy.stats import entropy +from torch import Tensor + +from tests.helpers import seed_all +from tests.helpers.testers import BATCH_SIZE, EXTRA_DIM, NUM_BATCHES, MetricTester +from torchmetrics.functional import kldivergence +from torchmetrics.regression import KLDivergence +from torchmetrics.utilities.imports import _TORCH_GREATER_EQUAL_1_6 + +seed_all(42) + +num_targets = 5 + +Input = namedtuple('Input', ["preds", "target"]) + +_probs_inputs = Input( + p=torch.rand(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM), + q=torch.rand(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM), +) + +_log_probs_inputs = Input( + p=torch.rand(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM).log(), + q=torch.rand(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM).log(), +) + + +def _sk_metric(p: Tensor, q: Tensor, log_prob: bool, reduction: Optional[str] = 'mean'): + if log_prob: + p = p.softmax(dim=-1) + q = q.softmax(dim=-1) + res = entropy(p, q, axis=1) + if reduction == 'mean': + return np.mean(res) + elif reduction == 'sum': + return np.sum(res) + else: + return res + + +@pytest.mark.parametrize("reduction", ['mean', 'sum', 'none', None]) +@pytest.mark.parametrize( + "p, q, log_prob", [(_probs_inputs.p, _probs_inputs.q, False), (_log_probs_inputs.p, _log_probs_inputs.q, True)] +) +class TestKLDivergence(MetricTester): + + @pytest.mark.parametrize("ddp", [True, False]) + @pytest.mark.parametrize("dist_sync_on_step", [True, False]) + def test_kldivergence(self, reduction, p, q, log_prob, ddp, dist_sync_on_step): + self.run_class_metric_test( + ddp, + p, + q, + KLDivergence, + partial(_sk_metric, log_prob=log_prob, reduction=reduction), + dist_sync_on_step, + metric_args=dict(log_prob=log_prob, reduction=reduction), + ) + + def test_kldivergence_functional(self, reduction, p, q, log_prob): + # todo: `num_outputs` is unused + self.run_functional_metric_test( + p, + q, + kldivergence, + partial(_sk_metric, log_prob=log_prob, reduction=reduction), + metric_args=dict(log_prob=log_prob, reduction=reduction), + ) + + def test_kldivergence_differentiabilit(self, reduction, p, q, log_prob): + self.run_differentiability_test( + preds=p, + target=q, + metric_module=KLDivergence, + metric_functional=kldivergence, + metric_args=dict(log_prob=log_prob, reduction=reduction) + ) + + @pytest.mark.skipif( + not _TORCH_GREATER_EQUAL_1_6, reason='half support of core operations on not support before pytorch v1.6' + ) + def test_kldivergence_half_cpu(self, reduction, p, q, log_prob): + self.run_precision_test_cpu(p, q, KLDivergence, kldivergence, {'log_prob': log_prob, 'reduction': reduction}) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires cuda') + def test_r2_half_gpu(self, reduction, p, q, log_prob): + self.run_precision_test_gpu(p, q, KLDivergence, kldivergence, {'log_prob': log_prob, 'reduction': reduction}) + + +def test_error_on_different_shape(): + metric = KLDivergence() + with pytest.raises(RuntimeError, match='Predictions and targets are expected to have the same shape'): + 
metric(torch.randn(100, ), torch.randn(50, )) + + +def test_error_on_multidim_tensors(): + metric = KLDivergence() + with pytest.raises( + ValueError, + match=r'Expected both prediction and target to be 1D or 2D tensors,' + r' but received tensors with dimension .' + ): + metric(torch.randn(10, 20, 5), torch.randn(10, 20, 5)) diff --git a/torchmetrics/classification/kldivergence.py b/torchmetrics/classification/kldivergence.py index a1b902eae45..3fe29d4bdb1 100644 --- a/torchmetrics/classification/kldivergence.py +++ b/torchmetrics/classification/kldivergence.py @@ -14,7 +14,7 @@ from typing import Any, Callable, Optional import torch -from torch import Tensor, tensor +from torch import Tensor from torchmetrics.functional.classification.kldivergence import _kld_update, _kld_compute from torchmetrics.metric import Metric @@ -52,7 +52,7 @@ class KLDivergence(Metric): Example: >>> import torch >>> from torchmetrics.functional import kldivergence - >>> p = torch.randn(([[0.36, 0.48, 0.16]]) + >>> p = torch.tensor([[0.36, 0.48, 0.16]]) >>> q = torch.tensor([[1/3, 1/3, 1/3]]) >>> kldivergence(p, q) tensor(0.085) diff --git a/torchmetrics/functional/classification/kldivergence.py b/torchmetrics/functional/classification/kldivergence.py index 39fb08a054b..e1c9d2013b5 100644 --- a/torchmetrics/functional/classification/kldivergence.py +++ b/torchmetrics/functional/classification/kldivergence.py @@ -73,7 +73,7 @@ def kldivergence(p: Tensor, q: Tensor, log_prob: bool = False, reduction: Option Example: >>> import torch >>> from torchmetrics.functional import kldivergence - >>> p = torch.randn(([[0.36, 0.48, 0.16]]) + >>> p = torch.tensor([[0.36, 0.48, 0.16]]) >>> q = torch.tensor([[1/3, 1/3, 1/3]]) >>> kldivergence(p, q) tensor(0.085) From 435ec1982d67c244e79fe3338ab11f67c5b9e97b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 26 Jun 2021 13:22:41 +0000 Subject: [PATCH 07/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- torchmetrics/classification/kldivergence.py | 6 +++--- torchmetrics/functional/classification/kldivergence.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/torchmetrics/classification/kldivergence.py b/torchmetrics/classification/kldivergence.py index 3fe29d4bdb1..5f697905693 100644 --- a/torchmetrics/classification/kldivergence.py +++ b/torchmetrics/classification/kldivergence.py @@ -16,7 +16,7 @@ import torch from torch import Tensor -from torchmetrics.functional.classification.kldivergence import _kld_update, _kld_compute +from torchmetrics.functional.classification.kldivergence import _kld_compute, _kld_update from torchmetrics.metric import Metric @@ -33,9 +33,9 @@ class KLDivergence(Metric): Args: p: data distribution with shape ``[N, d]`` q: prior or approximate distribution with shape ``[N, d]`` - log_prob: bool indicating if input is log-probabilities or probabilities. If given as probabilities, + log_prob: bool indicating if input is log-probabilities or probabilities. 
If given as probabilities, will normalize to make sure the distributes sum to 1 - reduction: + reduction: Determines how to reduce over the ``N``/batch dimension: - ``'mean'`` [default]: Averages score across samples diff --git a/torchmetrics/functional/classification/kldivergence.py b/torchmetrics/functional/classification/kldivergence.py index e1c9d2013b5..0673a1ff57b 100644 --- a/torchmetrics/functional/classification/kldivergence.py +++ b/torchmetrics/functional/classification/kldivergence.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Tuple, Optional +from typing import Optional, Tuple import torch from torch import Tensor @@ -61,9 +61,9 @@ def kldivergence(p: Tensor, q: Tensor, log_prob: bool = False, reduction: Option Args: p: data distribution with shape ``[N, d]`` q: prior or approximate distribution with shape ``[N, d]`` - log_prob: bool indicating if input is log-probabilities or probabilities. If given as probabilities, + log_prob: bool indicating if input is log-probabilities or probabilities. If given as probabilities, will normalize to make sure the distributes sum to 1 - reduction: + reduction: Determines how to reduce over the ``N``/batch dimension: - ``'mean'`` [default]: Averages score across samples From 39b41846054987ecfe3376c0e0779a760e263faf Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Sat, 26 Jun 2021 15:23:15 +0200 Subject: [PATCH 08/12] changelog --- CHANGELOG.md | 1 + tests/classification/test_kldivergence.py | 25 ++++++++----------- torchmetrics/classification/kldivergence.py | 16 +++++++----- .../functional/classification/kldivergence.py | 10 ++++---- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4052f817ab2..9a381ee38ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added Inception Score metric to image module ([#299](https://github.com/PyTorchLightning/metrics/pull/299)) - Added KID metric to image module ([#301](https://github.com/PyTorchLightning/metrics/pull/301)) - Added `sync` and `sync_context` methods for manually controlling when metric states are synced ([#302](https://github.com/PyTorchLightning/metrics/pull/302)) +- Added `KLDivergence` metric ([#247](https://github.com/PyTorchLightning/metrics/pull/247)) ### Changed diff --git a/tests/classification/test_kldivergence.py b/tests/classification/test_kldivergence.py index 8081bf7ae36..ce8d6b5f4ca 100644 --- a/tests/classification/test_kldivergence.py +++ b/tests/classification/test_kldivergence.py @@ -23,15 +23,14 @@ from tests.helpers import seed_all from tests.helpers.testers import BATCH_SIZE, EXTRA_DIM, NUM_BATCHES, MetricTester +from torchmetrics.classification import KLDivergence from torchmetrics.functional import kldivergence -from torchmetrics.regression import KLDivergence -from torchmetrics.utilities.imports import _TORCH_GREATER_EQUAL_1_6 seed_all(42) num_targets = 5 -Input = namedtuple('Input', ["preds", "target"]) +Input = namedtuple('Input', ["p", "q"]) _probs_inputs = Input( p=torch.rand(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM), @@ -39,8 +38,8 @@ ) _log_probs_inputs = Input( - p=torch.rand(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM).log(), - q=torch.rand(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM).log(), + p=torch.rand(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM).softmax(dim=-1).log(), + q=torch.rand(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM).softmax(dim=-1).log(), ) @@ 
-62,6 +61,7 @@ def _sk_metric(p: Tensor, q: Tensor, log_prob: bool, reduction: Optional[str] = "p, q, log_prob", [(_probs_inputs.p, _probs_inputs.q, False), (_log_probs_inputs.p, _log_probs_inputs.q, True)] ) class TestKLDivergence(MetricTester): + atol = 1e-6 @pytest.mark.parametrize("ddp", [True, False]) @pytest.mark.parametrize("dist_sync_on_step", [True, False]) @@ -88,16 +88,15 @@ def test_kldivergence_functional(self, reduction, p, q, log_prob): def test_kldivergence_differentiabilit(self, reduction, p, q, log_prob): self.run_differentiability_test( - preds=p, - target=q, + p, + q, metric_module=KLDivergence, metric_functional=kldivergence, metric_args=dict(log_prob=log_prob, reduction=reduction) ) - @pytest.mark.skipif( - not _TORCH_GREATER_EQUAL_1_6, reason='half support of core operations on not support before pytorch v1.6' - ) + # KLDivergence half + cpu does not work due to missing support in torch.clamp + @pytest.mark.xfail(reason="KLDivergence metric does not support cpu + half precision") def test_kldivergence_half_cpu(self, reduction, p, q, log_prob): self.run_precision_test_cpu(p, q, KLDivergence, kldivergence, {'log_prob': log_prob, 'reduction': reduction}) @@ -114,9 +113,5 @@ def test_error_on_different_shape(): def test_error_on_multidim_tensors(): metric = KLDivergence() - with pytest.raises( - ValueError, - match=r'Expected both prediction and target to be 1D or 2D tensors,' - r' but received tensors with dimension .' - ): + with pytest.raises(ValueError, match='Expected both p and q distribution to be 2D but got 3 and 3 respectively'): metric(torch.randn(10, 20, 5), torch.randn(10, 20, 5)) diff --git a/torchmetrics/classification/kldivergence.py b/torchmetrics/classification/kldivergence.py index 3fe29d4bdb1..2b83a4ce7ba 100644 --- a/torchmetrics/classification/kldivergence.py +++ b/torchmetrics/classification/kldivergence.py @@ -16,8 +16,9 @@ import torch from torch import Tensor -from torchmetrics.functional.classification.kldivergence import _kld_update, _kld_compute +from torchmetrics.functional.classification.kldivergence import _kld_compute, _kld_update from torchmetrics.metric import Metric +from torchmetrics.utilities.data import dim_zero_cat class KLDivergence(Metric): @@ -33,9 +34,9 @@ class KLDivergence(Metric): Args: p: data distribution with shape ``[N, d]`` q: prior or approximate distribution with shape ``[N, d]`` - log_prob: bool indicating if input is log-probabilities or probabilities. If given as probabilities, + log_prob: bool indicating if input is log-probabilities or probabilities. 
If given as probabilities will normalize to make sure the distributes sum to 1 - reduction: + reduction: Determines how to reduce over the ``N``/batch dimension: - ``'mean'`` [default]: Averages score across samples @@ -80,23 +81,26 @@ def __init__( allowed_reduction = ['mean', 'sum', 'none', None] if reduction not in allowed_reduction: raise ValueError(f"Expected argument `reduction` to be one of {allowed_reduction} but got {reduction}") - self.recduction = reduction + self.reduction = reduction if self.reduction in ['mean', 'sum']: self.add_state('measures', torch.zeros(1), dist_reduce_fx='sum') - self.add_state('total', torch.zeros(1), dist_reduce_fx='sum') else: self.add_state('measures', [], dist_reduce_fx='cat') + self.add_state('total', torch.zeros(1), dist_reduce_fx='sum') def update(self, p: Tensor, q: Tensor) -> None: # type: ignore measures, total = _kld_update(p, q, self.log_prob) if self.reduction is None or self.reduction == 'none': self.measures.append(measures) else: - self.measures += measures + self.measures += measures.sum() self.total += total def compute(self) -> Tensor: + if self.reduction is None or self.reduction == 'none': + self.measures = dim_zero_cat(self.measures) + return _kld_compute(self.measures, self.total, self.reduction) @property diff --git a/torchmetrics/functional/classification/kldivergence.py b/torchmetrics/functional/classification/kldivergence.py index e1c9d2013b5..83d097aa2da 100644 --- a/torchmetrics/functional/classification/kldivergence.py +++ b/torchmetrics/functional/classification/kldivergence.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Tuple, Optional +from typing import Optional, Tuple import torch from torch import Tensor @@ -30,8 +30,8 @@ def _kld_update(p: Tensor, q: Tensor, log_prob: bool) -> Tuple[Tensor, int]: if log_prob: measures = torch.sum(p.exp() * (p - q), axis=-1) else: - p = p / p.sum(axis=-1) - q = q / q.sum(axis=-1) + p = p / p.sum(axis=-1, keepdim=True) + q = q / q.sum(axis=-1, keepdim=True) q = torch.clamp(q, METRIC_EPS) measures = torch.sum(p * torch.log(p / q), axis=-1) @@ -61,9 +61,9 @@ def kldivergence(p: Tensor, q: Tensor, log_prob: bool = False, reduction: Option Args: p: data distribution with shape ``[N, d]`` q: prior or approximate distribution with shape ``[N, d]`` - log_prob: bool indicating if input is log-probabilities or probabilities. If given as probabilities, + log_prob: bool indicating if input is log-probabilities or probabilities. 
If given as probabilities will normalize to make sure the distributes sum to 1 - reduction: + reduction: Determines how to reduce over the ``N``/batch dimension: - ``'mean'`` [default]: Averages score across samples From b233fb31e2bb902bbf0e4c27de9fcba66eb02c0a Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Sat, 26 Jun 2021 15:28:34 +0200 Subject: [PATCH 09/12] fix doctest --- tests/classification/test_kldivergence.py | 2 -- torchmetrics/classification/kldivergence.py | 4 ++-- torchmetrics/functional/classification/kldivergence.py | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/classification/test_kldivergence.py b/tests/classification/test_kldivergence.py index ce8d6b5f4ca..d1e89e1564f 100644 --- a/tests/classification/test_kldivergence.py +++ b/tests/classification/test_kldivergence.py @@ -28,8 +28,6 @@ seed_all(42) -num_targets = 5 - Input = namedtuple('Input', ["p", "q"]) _probs_inputs = Input( diff --git a/torchmetrics/classification/kldivergence.py b/torchmetrics/classification/kldivergence.py index b3101130190..3ae2cc2d47e 100644 --- a/torchmetrics/classification/kldivergence.py +++ b/torchmetrics/classification/kldivergence.py @@ -29,7 +29,7 @@ class KLDivergence(Metric): Where :math:`P` and :math:`Q` are probability distributions where :math:`P` usually represents a distribution over data and :math:`Q` is often a prior or approximation of :math:`P`. It should be noted that the KL divergence - is a none symetrical measure. + is a non-symetrical metric i.e. :math:`D_{KL}(P||Q) \neq D_{KL}(Q||P)`. Args: p: data distribution with shape ``[N, d]`` @@ -56,7 +56,7 @@ class KLDivergence(Metric): >>> p = torch.tensor([[0.36, 0.48, 0.16]]) >>> q = torch.tensor([[1/3, 1/3, 1/3]]) >>> kldivergence(p, q) - tensor(0.085) + tensor(0.0853) """ def __init__( diff --git a/torchmetrics/functional/classification/kldivergence.py b/torchmetrics/functional/classification/kldivergence.py index 156904f2ad3..796f462bc3b 100644 --- a/torchmetrics/functional/classification/kldivergence.py +++ b/torchmetrics/functional/classification/kldivergence.py @@ -56,7 +56,7 @@ def kldivergence(p: Tensor, q: Tensor, log_prob: bool = False, reduction: Option Where :math:`P` and :math:`Q` are probability distributions where :math:`P` usually represents a distribution over data and :math:`Q` is often a prior or approximation of :math:`P`. It should be noted that the KL divergence - is a none symetrical measure. + is a non-symetrical metric i.e. :math:`D_{KL}(P||Q) \neq D_{KL}(Q||P)`. 
Args: p: data distribution with shape ``[N, d]`` @@ -76,7 +76,7 @@ def kldivergence(p: Tensor, q: Tensor, log_prob: bool = False, reduction: Option >>> p = torch.tensor([[0.36, 0.48, 0.16]]) >>> q = torch.tensor([[1/3, 1/3, 1/3]]) >>> kldivergence(p, q) - tensor(0.085) + tensor(0.0853) """ measures, total = _kld_update(p, q, log_prob) return _kld_compute(measures, total, reduction) From de4502732d4fc6ac2bd01019fb752e2864eaaa78 Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Sun, 27 Jun 2021 15:48:44 +0200 Subject: [PATCH 10/12] missing note --- docs/source/pages/overview.rst | 1 + torchmetrics/classification/kldivergence.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/docs/source/pages/overview.rst b/docs/source/pages/overview.rst index 55081391de5..cbe131629a6 100644 --- a/docs/source/pages/overview.rst +++ b/docs/source/pages/overview.rst @@ -167,6 +167,7 @@ the following limitations: - :ref:`references/modules:PSNR` and :ref:`references/functional:psnr [func]` - :ref:`references/modules:SSIM` and :ref:`references/functional:ssim [func]` + - :ref:`references/modules:KLDivergence` and :ref:`references/functional:kldivergence [func]` ****************** Metric Arithmetics diff --git a/torchmetrics/classification/kldivergence.py b/torchmetrics/classification/kldivergence.py index 3ae2cc2d47e..82285352b85 100644 --- a/torchmetrics/classification/kldivergence.py +++ b/torchmetrics/classification/kldivergence.py @@ -49,6 +49,8 @@ class KLDivergence(Metric): ValueError: If ``reduction`` is not one of ``'mean'``, ``'sum'``, ``'none'`` or ``None`` + .. note:: + Half precision is only supported on GPU for this metric Example: >>> import torch From 81f41be587dd924076f62786a7e7c8ccb086ae5e Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Sun, 27 Jun 2021 19:00:25 +0200 Subject: [PATCH 11/12] fix tests --- tests/helpers/testers.py | 3 ++- torchmetrics/classification/kldivergence.py | 6 ++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/helpers/testers.py b/tests/helpers/testers.py index 3da6f68133a..57087f95e28 100644 --- a/tests/helpers/testers.py +++ b/tests/helpers/testers.py @@ -151,7 +151,8 @@ def _class_test( pickled_metric = pickle.dumps(metric) metric = pickle.loads(pickled_metric) - for i in range(rank, NUM_BATCHES, worldsize): + batches_per_rank = int(BATCH_SIZE / worldsize) + for i in range(rank * batches_per_rank, (rank + 1) * batches_per_rank): batch_kwargs_update = {k: v[i] if isinstance(v, Tensor) else v for k, v in kwargs_update.items()} batch_result = metric(preds[i], target[i], **batch_kwargs_update) diff --git a/torchmetrics/classification/kldivergence.py b/torchmetrics/classification/kldivergence.py index 82285352b85..cb62c627246 100644 --- a/torchmetrics/classification/kldivergence.py +++ b/torchmetrics/classification/kldivergence.py @@ -100,10 +100,8 @@ def update(self, p: Tensor, q: Tensor) -> None: # type: ignore self.total += total def compute(self) -> Tensor: - if self.reduction is None or self.reduction == 'none': - self.measures = dim_zero_cat(self.measures) - - return _kld_compute(self.measures, self.total, self.reduction) + measures = dim_zero_cat(self.measures) if self.reduction is None or self.reduction == 'none' else self.measures + return _kld_compute(measures, self.total, self.reduction) @property def is_differentiable(self) -> bool: return True From 9367a94c228c0720c874f40889c7f83aac5ba1da Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Mon, 28 Jun 2021 11:18:12 +0200 Subject: [PATCH 12/12] fix tests --- tests/classification/test_kldivergence.py 
| 2 +- tests/helpers/testers.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/classification/test_kldivergence.py b/tests/classification/test_kldivergence.py index d1e89e1564f..ba3b574d833 100644 --- a/tests/classification/test_kldivergence.py +++ b/tests/classification/test_kldivergence.py @@ -54,7 +54,7 @@ def _sk_metric(p: Tensor, q: Tensor, log_prob: bool, reduction: Optional[str] = return res -@pytest.mark.parametrize("reduction", ['mean', 'sum', 'none', None]) +@pytest.mark.parametrize("reduction", ['mean', 'sum']) @pytest.mark.parametrize( "p, q, log_prob", [(_probs_inputs.p, _probs_inputs.q, False), (_log_probs_inputs.p, _log_probs_inputs.q, True)] ) diff --git a/tests/helpers/testers.py b/tests/helpers/testers.py index 57087f95e28..3da6f68133a 100644 --- a/tests/helpers/testers.py +++ b/tests/helpers/testers.py @@ -151,8 +151,7 @@ def _class_test( pickled_metric = pickle.dumps(metric) metric = pickle.loads(pickled_metric) - batches_per_rank = int(BATCH_SIZE / worldsize) - for i in range(rank * batches_per_rank, (rank + 1) * batches_per_rank): + for i in range(rank, NUM_BATCHES, worldsize): batch_kwargs_update = {k: v[i] if isinstance(v, Tensor) else v for k, v in kwargs_update.items()} batch_result = metric(preds[i], target[i], **batch_kwargs_update)
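
After patch 12 the series leaves ``KLDivergence`` exported from ``torchmetrics`` and ``kldivergence`` from ``torchmetrics.functional``. The snippet below is a minimal usage sketch of that final API, written against the signatures and doctest values shown in the patches above (defaults ``log_prob=False``, ``reduction='mean'``); it is an illustration of the intended call pattern, not part of the committed code.

import torch

from torchmetrics import KLDivergence
from torchmetrics.functional import kldivergence

# Distributions taken from the doctest in the patches: each row is one sample.
p = torch.tensor([[0.36, 0.48, 0.16]])     # data distribution, shape [N, d]
q = torch.tensor([[1 / 3, 1 / 3, 1 / 3]])  # prior / approximation, shape [N, d]

# Functional interface: with the default reduction='mean' this yields tensor(0.0853).
print(kldivergence(p, q))

# Module interface: update() accumulates `measures` and `total`,
# compute() applies the chosen reduction.
metric = KLDivergence(log_prob=False, reduction='mean')
metric.update(p, q)
print(metric.compute())  # tensor(0.0853)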