From 7f86c9b14160025a826cdb76cdc2f7d7ed3a4cce Mon Sep 17 00:00:00 2001
From: Justus Schock <justus.schock@rwth-aachen.de>
Date: Wed, 2 Sep 2020 12:51:08 +0200
Subject: [PATCH 01/22] metric aggregation

---
 pytorch_lightning/metrics/metric.py | 70 +++++++++++++++++------------
 1 file changed, 42 insertions(+), 28 deletions(-)

diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py
index 5f61a50e6cd25..5bae62bae56bb 100644
--- a/pytorch_lightning/metrics/metric.py
+++ b/pytorch_lightning/metrics/metric.py
@@ -21,7 +21,7 @@
 import numpy as np
 
 from pytorch_lightning.metrics.converters import (
-    sync_ddp_if_available, gather_all_tensors_if_available,
+    gather_all_tensors_if_available,
     convert_to_tensor, convert_to_numpy)
 from pytorch_lightning.utilities.apply_func import apply_to_collection
 from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin
@@ -46,7 +46,7 @@ class Metric(DeviceDtypeModuleMixin, nn.Module, ABC):
 
     Call order
 
-        input_convert -> forward -> output_convert -> ddp_sync -> aggregate -> compute
+        input_convert -> forward -> output_convert -> ddp_reduce (per default being ddp_sync -> aggregate) -> compute
 
     """
 
@@ -61,11 +61,12 @@ def __init__(self, name: str):
         self._dtype = torch.get_default_dtype()
         self._device = torch.device('cpu')
 
+        self._step_vals = []
+
         # Register hooks
         self.register_forward_pre_hook(self.input_convert)
         self.register_forward_hook(self.output_convert)
-        self.register_forward_hook(self.ddp_sync)
-        self.register_forward_hook(self.aggregate)
+        self.register_forward_hook(self.ddp_reduce)
         self.register_forward_hook(self.compute)
 
     @staticmethod
@@ -106,10 +107,29 @@ def output_convert(self, data: Any, output: Any):
         """
         return output
 
+    def ddp_sync(self, tensor: Any):
+        """
+        Implement how the outputs from forward should be synced 
+        (per default just gathers all of them and adds them to self._step_vals)
+
+        Args:
+            tensor: tensor to sync
+
+        Returns:
+            synced output
+
+        """
+        gathered_tensors = apply_to_collection(tensor, torch.Tensor, gather_all_tensors_if_available,
+                                   self.reduce_group, self.reduce_op)
+
+        self._step_vals.append(gathered_tensors)
+
+        return gathered_tensors
+
     @staticmethod
-    def ddp_sync(self, data: Any, output: Any):
+    def ddp_reduce(self, data: Any, output: Any):
         """
-        Implement how the outputs from forward should be synced
+        Implement how the outputs from forward should be synced and reduced across nodes
 
         Args:
             data: input to forward method
@@ -119,27 +139,26 @@ def ddp_sync(self, data: Any, output: Any):
             synced output
 
         """
-        return output
+        synced = self.ddp_sync(output)
+        return self.aggregate(synced)
 
-    @staticmethod
-    def aggregate(self, data: Any, output: Any):
+    def aggregate(self, *tensors: torch.Tensor) -> torch.Tensor:
         """
         Implement aggregation of values on the same device
 
         Args:
-            data: input to forward method
-            output: output from the `ddp_sync` hook
+            tensors: the values to be aggregated
 
         Returns:
             aggregated values
 
         """
-        return output
+        return torch.cat(tensors).mean()
 
     @staticmethod
     def compute(self, data: Any, output: Any):
         """
-        Implement additionally metric computations to be done after the ddp sync
+        Implement additionally metric computations to be done after the aggregation
 
         Args:
             data: input to forward method
@@ -151,6 +170,15 @@ def compute(self, data: Any, output: Any):
         """
         return output
 
+    @property
+    def aggregated(self) -> torch.Tensor:
+        aggr = self.aggregate(*self._step_vals)
+        self.reset()
+        return self.compute(self, None, aggr)
+
+    def reset(self):
+        self._step_vals = []
+
 
 class TensorMetric(Metric):
     """
@@ -186,11 +214,7 @@ def input_convert(self, data: Any):
     def output_convert(self, data: Any, output: Any):
         return apply_to_collection(output, torch.Tensor, convert_to_tensor,
                                    self.dtype, self.device)
-
-    @staticmethod
-    def ddp_sync(self, data: Any, output: Any):
-        return apply_to_collection(output, torch.Tensor, sync_ddp_if_available,
-                                   self.reduce_group, self.reduce_op)
+        
 
 
 class TensorCollectionMetric(Metric):
@@ -240,11 +264,6 @@ def output_convert(self, data: Any, output: Any):
                                    convert_to_tensor,
                                    self.dtype, self.device)
 
-    @staticmethod
-    def ddp_sync(self, data: Any, output: Any):
-        return apply_to_collection(output, torch.Tensor, sync_ddp_if_available,
-                                   self.reduce_group, self.reduce_op)
-
 
 class NumpyMetric(Metric):
     """
@@ -282,8 +301,3 @@ def output_convert(self, data: Any, output: Any):
                                    (torch.Tensor, np.ndarray, numbers.Number),
                                    convert_to_tensor,
                                    self.dtype, self.device)
-
-    @staticmethod
-    def ddp_sync(self, data: Any, output: Any):
-        return apply_to_collection(output, torch.Tensor, sync_ddp_if_available,
-                                   self.reduce_group, self.reduce_op)

From 3ec2f9847e279df747cf8a7e484529c27ca129dc Mon Sep 17 00:00:00 2001
From: Justus Schock <justus.schock@rwth-aachen.de>
Date: Wed, 2 Sep 2020 12:53:34 +0200
Subject: [PATCH 02/22] metric aggregation

---
 pytorch_lightning/metrics/metric.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py
index 5bae62bae56bb..8f90ef7332f87 100644
--- a/pytorch_lightning/metrics/metric.py
+++ b/pytorch_lightning/metrics/metric.py
@@ -40,10 +40,14 @@ class Metric(DeviceDtypeModuleMixin, nn.Module, ABC):
 
         * input_convert: pre-forward hook that takes care of input conversion
         * output_convert: post-forward hook that takes care of output convertion
-        * ddp_sync: implementation of ddp sync, default is gather all
-        * aggregate: implement how values should be aggregated
+        * ddp_reduce: implementation of ddp sync + aggregation, default is ddp_sync + aggregate
         * compute: post-ddp sync for additional metric computations
 
+    ``ddp_reduce`` by default calls the following methods, which can also be overwritten if necessary.
+
+        * ddp_sync: implements how values should be synced across ddp-processes. Defaults to gather all.
+        * aggregate: implement how values should be aggregated (defaults to mean).
+
     Call order
 
         input_convert -> forward -> output_convert -> ddp_reduce (per default being ddp_sync -> aggregate) -> compute

From 71f0d7cd4785fc64e4e7c0c115bf320b6c14ed0f Mon Sep 17 00:00:00 2001
From: Justus Schock <justus.schock@rwth-aachen.de>
Date: Wed, 2 Sep 2020 14:42:33 +0200
Subject: [PATCH 03/22] add at_least_1d

---
 pytorch_lightning/metrics/converters.py | 75 ++++++++++++++++---------
 1 file changed, 48 insertions(+), 27 deletions(-)

diff --git a/pytorch_lightning/metrics/converters.py b/pytorch_lightning/metrics/converters.py
index a41a621c905da..d9bb0e5e5e128 100644
--- a/pytorch_lightning/metrics/converters.py
+++ b/pytorch_lightning/metrics/converters.py
@@ -18,11 +18,13 @@
 sync tensors between different processes in a DDP scenario, when needed.
 """
 
+from functools import reduce
 import numbers
 from typing import Any, Callable, Optional, Union
 
 import numpy as np
 import torch
+from torch.distributed.distributed_c10d import reduce_op
 from torch.utils.data._utils.collate import np_str_obj_array_pattern
 
 from pytorch_lightning.utilities import rank_zero_warn
@@ -31,10 +33,20 @@
 try:
     from torch.distributed import ReduceOp
 except ImportError:
+
     class ReduceOp:
         SUM = None
 
-    rank_zero_warn('Unsupported `ReduceOp` for distributed computing')
+    rank_zero_warn("Unsupported `ReduceOp` for distributed computing")
+
+try:
+    import torch_xla
+    import torch_xla.core.xla_model as xm
+    import torch_xla.core.functions as xf
+except ImportError:
+    XLA_AVAILABLE = False
+else:
+    XLA_AVAILABLE = True
 
 
 def _apply_to_inputs(func_to_apply: Callable, *dec_args, **dec_kwargs) -> Callable:
@@ -138,8 +150,9 @@ def _numpy_metric_input_conversion(func_to_decorate: Callable) -> Callable:
     Return:
         Callable: the decorated function
     """
-    return _apply_to_inputs(
-        apply_to_collection, (torch.Tensor, np.ndarray, numbers.Number), convert_to_numpy)(func_to_decorate)
+    return _apply_to_inputs(apply_to_collection, (torch.Tensor, np.ndarray, numbers.Number), convert_to_numpy)(
+        func_to_decorate
+    )
 
 
 def _tensor_metric_output_conversion(func_to_decorate: Callable) -> Callable:
@@ -185,8 +198,9 @@ def _tensor_metric_input_conversion(func_to_decorate: Callable) -> Callable:
     Return:
         Callable: the decorated function
     """
-    return _apply_to_inputs(
-        apply_to_collection, (torch.Tensor, np.ndarray, numbers.Number), convert_to_tensor)(func_to_decorate)
+    return _apply_to_inputs(apply_to_collection, (torch.Tensor, np.ndarray, numbers.Number), convert_to_tensor)(
+        func_to_decorate
+    )
 
 
 def _tensor_collection_metric_output_conversion(func_to_decorate: Callable) -> Callable:
@@ -199,8 +213,9 @@ def _tensor_collection_metric_output_conversion(func_to_decorate: Callable) -> C
     Return:
         Callable: the decorated function
     """
-    return _apply_to_outputs(apply_to_collection, (torch.Tensor, np.ndarray, numbers.Number),
-                             convert_to_tensor)(func_to_decorate)
+    return _apply_to_outputs(apply_to_collection, (torch.Tensor, np.ndarray, numbers.Number), convert_to_tensor)(
+        func_to_decorate
+    )
 
 
 def _tensor_metric_conversion(func_to_decorate: Callable) -> Callable:
@@ -240,10 +255,9 @@ def _tensor_collection_metric_conversion(func_to_decorate: Callable) -> Callable
     return _tensor_collection_metric_output_conversion(func_convert_inputs)
 
 
-def sync_ddp_if_available(result: Union[torch.Tensor],
-                          group: Optional[Any] = None,
-                          reduce_op: Optional[ReduceOp] = None
-                          ) -> torch.Tensor:
+def sync_ddp_if_available(
+    result: Union[torch.Tensor], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None
+) -> torch.Tensor:
     """
     Function to reduce the tensors from several ddp processes to one master process
 
@@ -265,23 +279,34 @@ def sync_ddp_if_available(result: Union[torch.Tensor],
 
         if reduce_op is None:
             reduce_op = torch.distributed.ReduceOp.SUM
-        elif isinstance(reduce_op, str) and reduce_op in ('avg', 'mean'):
+        elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"):
             reduce_op = torch.distributed.ReduceOp.SUM
             divide_by_world_size = True
 
         # sync all processes before reduction
         torch.distributed.barrier(group=group)
-        torch.distributed.all_reduce(result, op=reduce_op, group=group,
-                                     async_op=False)
+        torch.distributed.all_reduce(result, op=reduce_op, group=group, async_op=False)
 
         if divide_by_world_size:
             result = result / torch.distributed.get_world_size(group)
 
     return result
 
+def at_least_1d(tensor: Union[np.ndarray, torch.Tensor]) ->  Union[np.ndarray, torch.Tensor]:
+    """Makes sure the tensor is at least of 1d shape
+
+    Args:
+        tensor: the tensor or array to check the shape for
+
+    Returns:
+        the optionally reshaped tensor
+    """
+    if tensor.shape == ():
+        tensor = tensor.reshape(1,)
+    return tensor
+
 
-def gather_all_tensors_if_available(result: Union[torch.Tensor],
-                                    group: Optional[Any] = None):
+def gather_all_tensors_if_available(result: Union[torch.Tensor], group: Optional[Any] = None):
     """
     Function to gather all tensors from several ddp processes onto a list that
     is broadcasted to all processes
@@ -312,8 +337,7 @@ def gather_all_tensors_if_available(result: Union[torch.Tensor],
     return result
 
 
-def sync_ddp(group: Optional[Any] = None,
-             reduce_op: Optional[ReduceOp] = None) -> Callable:
+def sync_ddp(group: Optional[Any] = None, reduce_op: Optional[ReduceOp] = None) -> Callable:
     """
     This decorator syncs a functions outputs across different processes for DDP.
 
@@ -327,15 +351,14 @@ def sync_ddp(group: Optional[Any] = None,
     """
 
     def decorator_fn(func_to_decorate):
-        return _apply_to_outputs(apply_to_collection, torch.Tensor,
-                                 sync_ddp_if_available, group=group,
-                                 reduce_op=reduce_op)(func_to_decorate)
+        return _apply_to_outputs(
+            apply_to_collection, torch.Tensor, sync_ddp_if_available, group=group, reduce_op=reduce_op
+        )(func_to_decorate)
 
     return decorator_fn
 
 
-def numpy_metric(group: Optional[Any] = None,
-                 reduce_op: Optional[ReduceOp] = None) -> Callable:
+def numpy_metric(group: Optional[Any] = None, reduce_op: Optional[ReduceOp] = None) -> Callable:
     """
     This decorator shall be used on all function metrics working on numpy arrays.
     It handles the argument conversion and DDP reduction for metrics working on numpy.
@@ -357,8 +380,7 @@ def decorator_fn(func_to_decorate):
     return decorator_fn
 
 
-def tensor_metric(group: Optional[Any] = None,
-                  reduce_op: Optional[ReduceOp] = None) -> Callable:
+def tensor_metric(group: Optional[Any] = None, reduce_op: Optional[ReduceOp] = None) -> Callable:
     """
     This decorator shall be used on all function metrics working on tensors.
     It handles the argument conversion and DDP reduction for metrics working on tensors.
@@ -379,8 +401,7 @@ def decorator_fn(func_to_decorate):
     return decorator_fn
 
 
-def tensor_collection_metric(group: Optional[Any] = None,
-                             reduce_op: Optional[ReduceOp] = None) -> Callable:
+def tensor_collection_metric(group: Optional[Any] = None, reduce_op: Optional[ReduceOp] = None) -> Callable:
     """
     This decorator shall be used on all function metrics working on tensors and returning collections
     that cannot be converted to tensors.

From c7cbdfa2f8394cecd12e8de3890e5f3e1edef6e1 Mon Sep 17 00:00:00 2001
From: Justus Schock <justus.schock@rwth-aachen.de>
Date: Wed, 2 Sep 2020 14:42:43 +0200
Subject: [PATCH 04/22] fix output formatting

---
 pytorch_lightning/metrics/metric.py | 85 ++++++++++++++---------------
 1 file changed, 41 insertions(+), 44 deletions(-)

diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py
index 8f90ef7332f87..ab03b66d7f235 100644
--- a/pytorch_lightning/metrics/metric.py
+++ b/pytorch_lightning/metrics/metric.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from abc import ABC, abstractmethod
+from ast import Num
 from typing import Any, Optional
 import numbers
 
@@ -21,8 +22,11 @@
 import numpy as np
 
 from pytorch_lightning.metrics.converters import (
+    at_least_1d,
     gather_all_tensors_if_available,
-    convert_to_tensor, convert_to_numpy)
+    convert_to_tensor,
+    convert_to_numpy,
+)
 from pytorch_lightning.utilities.apply_func import apply_to_collection
 from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin
 
@@ -54,16 +58,20 @@ class Metric(DeviceDtypeModuleMixin, nn.Module, ABC):
 
     """
 
-    def __init__(self, name: str):
+    def __init__(self, name: str, reduce_group: Optional[Any] = None):
         """
         Args:
             name: the metric's name
+            reduce_group: the process group for DDP reduces (only needed for DDP training).
+                Defaults to all processes (world)
 
         """
         super().__init__()
         self.name = name
         self._dtype = torch.get_default_dtype()
-        self._device = torch.device('cpu')
+        self._device = torch.device("cpu")
+
+        self.reduce_group = reduce_group
 
         self._step_vals = []
 
@@ -109,11 +117,11 @@ def output_convert(self, data: Any, output: Any):
         Returns:
             casted outputs
         """
-        return output
+        return apply_to_collection(output, (torch.Tensor, np.ndarray), at_least_1d)
 
     def ddp_sync(self, tensor: Any):
         """
-        Implement how the outputs from forward should be synced 
+        Implement how the outputs from forward should be synced
         (per default just gathers all of them and adds them to self._step_vals)
 
         Args:
@@ -123,8 +131,7 @@ def ddp_sync(self, tensor: Any):
             synced output
 
         """
-        gathered_tensors = apply_to_collection(tensor, torch.Tensor, gather_all_tensors_if_available,
-                                   self.reduce_group, self.reduce_op)
+        gathered_tensors = apply_to_collection(tensor, torch.Tensor, gather_all_tensors_if_available, self.reduce_group)
 
         self._step_vals.append(gathered_tensors)
 
@@ -191,9 +198,7 @@ class TensorMetric(Metric):
     Already handles DDP sync and input/output conversions.
     """
 
-    def __init__(self, name: str,
-                 reduce_group: Optional[Any] = None,
-                 reduce_op: Optional[Any] = None):
+    def __init__(self, name: str, reduce_group: Optional[Any] = None, reduce_op: Optional[Any] = None):
         """
 
         Args:
@@ -203,22 +208,20 @@ def __init__(self, name: str,
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__(name)
-        self.reduce_group = reduce_group
+        super().__init__(name, reduce_group)
         self.reduce_op = reduce_op
 
     @staticmethod
     def input_convert(self, data: Any):
-        return apply_to_collection(data,
-                                   (torch.Tensor, np.ndarray, numbers.Number),
-                                   convert_to_tensor,
-                                   self.dtype, self.device)
+        data = apply_to_collection(
+            data, (torch.Tensor, np.ndarray, numbers.Number), convert_to_tensor, self.dtype, self.device
+        )
+        return super(TensorMetric, self).input_convert(self, data)
 
     @staticmethod
     def output_convert(self, data: Any, output: Any):
-        return apply_to_collection(output, torch.Tensor, convert_to_tensor,
-                                   self.dtype, self.device)
-        
+        output = apply_to_collection(output, torch.Tensor, convert_to_tensor, self.dtype, self.device)
+        return super(TensorMetric, self).output_convert(self, data, output)
 
 
 class TensorCollectionMetric(Metric):
@@ -238,9 +241,7 @@ class TensorCollectionMetric(Metric):
 
     """
 
-    def __init__(self, name: str,
-                 reduce_group: Optional[Any] = None,
-                 reduce_op: Optional[Any] = None):
+    def __init__(self, name: str, reduce_group: Optional[Any] = None, reduce_op: Optional[Any] = None):
         """
 
         Args:
@@ -250,23 +251,22 @@ def __init__(self, name: str,
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__(name)
-        self.reduce_group = reduce_group
+        super().__init__(name, reduce_group)
         self.reduce_op = reduce_op
 
     @staticmethod
     def input_convert(self, data: Any):
-        return apply_to_collection(data,
-                                   (torch.Tensor, np.ndarray, numbers.Number),
-                                   convert_to_tensor,
-                                   self.dtype, self.device)
+        data = apply_to_collection(
+            data, (torch.Tensor, np.ndarray, numbers.Number), convert_to_tensor, self.dtype, self.device
+        )
+        return super(TensorCollectionMetric, self).input_convert(self, data)
 
     @staticmethod
     def output_convert(self, data: Any, output: Any):
-        return apply_to_collection(output,
-                                   (torch.Tensor, np.ndarray, numbers.Number),
-                                   convert_to_tensor,
-                                   self.dtype, self.device)
+        output = apply_to_collection(
+            output, (torch.Tensor, np.ndarray, numbers.Number), convert_to_tensor, self.dtype, self.device
+        )
+        return super(TensorCollectionMetric, self).output_convert(self, data, output)
 
 
 class NumpyMetric(Metric):
@@ -277,9 +277,7 @@ class NumpyMetric(Metric):
     Already handles DDP sync and input/output conversions.
     """
 
-    def __init__(self, name: str,
-                 reduce_group: Optional[Any] = None,
-                 reduce_op: Optional[Any] = None):
+    def __init__(self, name: str, reduce_group: Optional[Any] = None, reduce_op: Optional[Any] = None):
         """
 
         Args:
@@ -289,19 +287,18 @@ def __init__(self, name: str,
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__(name)
-        self.reduce_group = reduce_group
+        super().__init__(name, reduce_group)
         self.reduce_op = reduce_op
 
     @staticmethod
     def input_convert(self, data: Any):
-        return apply_to_collection(data,
-                                   (torch.Tensor, np.ndarray, numbers.Number),
-                                   convert_to_numpy)
+        data = apply_to_collection(data, (torch.Tensor, np.ndarray, numbers.Number), convert_to_numpy)
+        return super(NumpyMetric, self).input_convert(self, data)
 
     @staticmethod
     def output_convert(self, data: Any, output: Any):
-        return apply_to_collection(output,
-                                   (torch.Tensor, np.ndarray, numbers.Number),
-                                   convert_to_tensor,
-                                   self.dtype, self.device)
+        output = apply_to_collection(
+            output, (torch.Tensor, np.ndarray, numbers.Number), convert_to_tensor, self.dtype, self.device
+        )
+
+        return super(NumpyMetric, self).output_convert(self, data, output)

From 68f2eb470787c291989c8c1c64cc0e5bfeccac33 Mon Sep 17 00:00:00 2001
From: Justus Schock <justus.schock@rwth-aachen.de>
Date: Wed, 2 Sep 2020 14:42:49 +0200
Subject: [PATCH 05/22] add metric tests

---
 tests/metrics/test_metrics.py | 151 ++++++++++++++++++++++++++++------
 1 file changed, 124 insertions(+), 27 deletions(-)

diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
index 5985745bfa070..76b60abcfea19 100644
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -1,4 +1,6 @@
 import os
+from typing import Any
+import warnings
 import numpy as np
 import pytest
 import torch
@@ -11,38 +13,38 @@
 
 class DummyTensorMetric(TensorMetric):
     def __init__(self):
-        super().__init__('dummy')
+        super().__init__("dummy")
 
     def forward(self, input1, input2):
         assert isinstance(input1, torch.Tensor)
         assert isinstance(input2, torch.Tensor)
-        return torch.tensor([1.])
+        return torch.tensor([1.0])
 
 
 class DummyNumpyMetric(NumpyMetric):
     def __init__(self):
-        super().__init__('dummy')
+        super().__init__("dummy")
 
     def forward(self, input1, input2):
         assert isinstance(input1, np.ndarray)
         assert isinstance(input2, np.ndarray)
-        return 1.
+        return 1.0
 
 
 class DummyTensorCollectionMetric(TensorCollectionMetric):
     def __init__(self):
-        super().__init__('dummy')
+        super().__init__("dummy")
 
     def forward(self, input1, input2):
         assert isinstance(input1, torch.Tensor)
         assert isinstance(input2, torch.Tensor)
-        return 1., 2., 3., 4.
+        return 1.0, 2.0, 3.0, 4.0
 
 
-@pytest.mark.parametrize('metric', [DummyTensorCollectionMetric()])
+@pytest.mark.parametrize("metric", [DummyTensorCollectionMetric()])
 def test_collection_metric(metric: Metric):
     """ Test that metric.device, metric.dtype works for metric collection """
-    input1, input2 = torch.tensor([1.]), torch.tensor([2.])
+    input1, input2 = torch.tensor([1.0]), torch.tensor([2.0])
 
     def change_and_check_device_dtype(device, dtype):
         metric.to(device=device, dtype=dtype)
@@ -56,9 +58,9 @@ def change_and_check_device_dtype(device, dtype):
         if dtype is not None:
             assert metric.dtype == dtype
 
-    devices = [None, 'cpu']
+    devices = [None, "cpu"]
     if torch.cuda.is_available():
-        devices += ['cuda:0']
+        devices += ["cuda:0"]
 
     for device in devices:
         for dtype in [None, torch.float32, torch.float64]:
@@ -66,10 +68,10 @@ def change_and_check_device_dtype(device, dtype):
 
     if torch.cuda.is_available():
         metric.cuda(0)
-        assert metric.device == torch.device('cuda', index=0)
+        assert metric.device == torch.device("cuda", index=0)
 
     metric.cpu()
-    assert metric.device == torch.device('cpu')
+    assert metric.device == torch.device("cpu")
 
     metric.type(torch.int8)
     assert metric.dtype == torch.int8
@@ -87,13 +89,16 @@ def change_and_check_device_dtype(device, dtype):
         assert metric.dtype == torch.float16
 
 
-@pytest.mark.parametrize('metric', [
-    DummyTensorMetric(),
-    DummyNumpyMetric(),
-])
+@pytest.mark.parametrize(
+    "metric",
+    [
+        DummyTensorMetric(),
+        DummyNumpyMetric(),
+    ],
+)
 def test_metric(metric: Metric):
     """ Test that metric.device, metric.dtype works for single metric"""
-    input1, input2 = torch.tensor([1.]), torch.tensor([2.])
+    input1, input2 = torch.tensor([1.0]), torch.tensor([2.0])
 
     def change_and_check_device_dtype(device, dtype):
         metric.to(device=device, dtype=dtype)
@@ -109,9 +114,9 @@ def change_and_check_device_dtype(device, dtype):
             assert metric.dtype == dtype
             assert metric_val.dtype == dtype
 
-    devices = [None, 'cpu']
+    devices = [None, "cpu"]
     if torch.cuda.is_available():
-        devices += ['cuda:0']
+        devices += ["cuda:0"]
 
     for device in devices:
         for dtype in [None, torch.float32, torch.float64]:
@@ -119,12 +124,12 @@ def change_and_check_device_dtype(device, dtype):
 
     if torch.cuda.is_available():
         metric.cuda(0)
-        assert metric.device == torch.device('cuda', index=0)
-        assert metric(input1, input2).device == torch.device('cuda', index=0)
+        assert metric.device == torch.device("cuda", index=0)
+        assert metric(input1, input2).device == torch.device("cuda", index=0)
 
     metric.cpu()
-    assert metric.device == torch.device('cpu')
-    assert metric(input1, input2).device == torch.device('cpu')
+    assert metric.device == torch.device("cpu")
+    assert metric(input1, input2).device == torch.device("cpu")
 
     metric.type(torch.int8)
     assert metric.dtype == torch.int8
@@ -156,7 +161,7 @@ def test_model_pickable(tmpdir, metric: Metric):
         max_epochs=1,
         limit_train_batches=10,
         gpus=[0, 1],
-        distributed_backend='ddp_spawn',
+        distributed_backend="ddp_spawn",
     )
 
     model = EvalModelTemplate()
@@ -167,17 +172,19 @@ def test_model_pickable(tmpdir, metric: Metric):
     result = trainer.fit(model)
 
     # correct result and ok accuracy
-    assert result == 1, 'ddp model failed to complete'
+    assert result == 1, "ddp model failed to complete"
 
 
 @pytest.mark.parametrize("metric", [DummyTensorMetric(), DummyNumpyMetric()])
 def test_saving_pickable(tmpdir, metric: Metric):
     """ Make sure that metrics are pickable by saving and loading them using torch """
-    x, y = torch.randn(10,), torch.randn(10,)
+    x, y = torch.randn(10,), torch.randn(
+        10,
+    )
     results_before_save = metric(x, y)
 
     # save metric
-    save_path = os.path.join(tmpdir, 'save_test.ckpt')
+    save_path = os.path.join(tmpdir, "save_test.ckpt")
     torch.save(metric, save_path)
 
     # load metric
@@ -186,3 +193,93 @@ def test_saving_pickable(tmpdir, metric: Metric):
 
     # Check metric value is the same
     assert results_before_save == results_after_load
+
+
+def check_call_order():
+    class DummyMetric(Metric):
+        def __init__(self):
+            super().__init__("dummy")
+            self.call_history = ["init"]
+
+        @staticmethod
+        def input_convert(self, data: Any):
+            self.call_history.append("input_convert")
+            return super(DummyMetric, self).input_convert(self, data)
+
+        def forward(self, tensor1, tensor2):
+            self.call_history.append("forward")
+            return tensor1 - tensor2
+
+        @staticmethod
+        def output_convert(self, data: Any, output: Any):
+            self.call_history.append("output_convert")
+            return super(DummyMetric, self).output_convert(self, data, output)
+
+        def ddp_sync(self, tensor: Any):
+            self.call_history.append("ddp_sync")
+            return super().ddp_sync(tensor)
+
+        @staticmethod
+        def ddp_reduce(self, data: Any, output: Any):
+            self.call_history.append("ddp_reduce")
+            return super(DummyMetric, self).ddp_reduce(self, data, output)
+
+        def aggregate(self, *tensors: torch.Tensor) -> torch.Tensor:
+            self.call_history.append("aggregate")
+            return super().aggregate(*tensors)
+
+        def reset(self):
+            self.call_history.append("reset")
+            return super().reset()
+
+        @property
+        def aggregated(self) -> torch.Tensor:
+            self.call_history.append("aggregated")
+            return super().aggregated
+
+    metric = DummyMetric()
+    assert metric.call_history == ["init"]
+    result = metric(torch.tensor([2.0]), torch.tensor([1.0]))
+    assert torch.allclose(result, torch.tensor(1.0))
+    assert metric.call_history == [
+        "init",
+        "input_convert",
+        "forward",
+        "output_convert",
+        "ddp_reduce",
+        "ddp_sync",
+        "aggregate",
+    ]
+    aggr = metric.aggregated
+    assert metric.call_history == [
+        "init",
+        "input_convert",
+        "forward",
+        "output_convert",
+        "ddp_reduce",
+        "ddp_sync",
+        "aggregate",
+        "aggregated",
+        "aggregate",
+        "reset",
+    ]
+    assert torch.allclose(aggr, result)
+    _ = metric(torch.tensor(2.0), torch.tensor(1.0))
+    assert metric.call_history == [
+        "init",
+        "input_convert",
+        "forward",
+        "output_convert",
+        "ddp_reduce",
+        "ddp_sync",
+        "aggregate",
+        "aggregated",
+        "aggregate",
+        "reset",
+        "input_convert",
+        "forward",
+        "output_convert",
+        "ddp_reduce",
+        "ddp_sync",
+        "aggregate",
+    ]

From e4c999aa4303822b77b4cd85cbba0ee5d9d5fc1f Mon Sep 17 00:00:00 2001
From: Justus Schock <justus.schock@rwth-aachen.de>
Date: Thu, 3 Sep 2020 11:38:45 +0200
Subject: [PATCH 06/22] add missing test case

---
 tests/metrics/test_metrics.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
index 76b60abcfea19..b022a39b80932 100644
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -283,3 +283,30 @@ def aggregated(self) -> torch.Tensor:
         "ddp_sync",
         "aggregate",
     ]
+
+    metric = DummyMetric()
+    _ = metric(torch.tensor([2.0]), torch.tensor([1.0]))
+    _ = metric(torch.tensor([3.0]), torch.tensor([0.0]))
+
+    aggregated = metric.aggregated
+
+    assert torch.allclose(aggregated, torch.tensor(2.0))
+
+    assert metric.call_history == [
+        "init",
+        "input_convert",
+        "forward",
+        "output_convert",
+        "ddp_reduce",
+        "ddp_sync",
+        "aggregate",
+        "input_convert",
+        "forward",
+        "output_convert",
+        "ddp_reduce",
+        "ddp_sync",
+        "aggregate",
+        "aggregated",
+        "aggregate",
+        "reset",
+    ]

From d03cf52254cc071a268e3701b58204887b5475e2 Mon Sep 17 00:00:00 2001
From: Justus Schock <justus.schock@rwth-aachen.de>
Date: Thu, 3 Sep 2020 11:39:37 +0200
Subject: [PATCH 07/22] remove reduce_op from metric classes

---
 pytorch_lightning/metrics/metric.py | 39 -----------------------------
 1 file changed, 39 deletions(-)

diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py
index ab03b66d7f235..fd5797ec3355a 100644
--- a/pytorch_lightning/metrics/metric.py
+++ b/pytorch_lightning/metrics/metric.py
@@ -198,19 +198,6 @@ class TensorMetric(Metric):
     Already handles DDP sync and input/output conversions.
     """
 
-    def __init__(self, name: str, reduce_group: Optional[Any] = None, reduce_op: Optional[Any] = None):
-        """
-
-        Args:
-            name: the metric's name
-            reduce_group: the process group for DDP reduces (only needed for DDP training).
-                Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
-        """
-        super().__init__(name, reduce_group)
-        self.reduce_op = reduce_op
-
     @staticmethod
     def input_convert(self, data: Any):
         data = apply_to_collection(
@@ -241,19 +228,6 @@ class TensorCollectionMetric(Metric):
 
     """
 
-    def __init__(self, name: str, reduce_group: Optional[Any] = None, reduce_op: Optional[Any] = None):
-        """
-
-        Args:
-            name: the metric's name
-            reduce_group: the process group for DDP reduces (only needed for DDP training).
-                Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
-        """
-        super().__init__(name, reduce_group)
-        self.reduce_op = reduce_op
-
     @staticmethod
     def input_convert(self, data: Any):
         data = apply_to_collection(
@@ -277,19 +251,6 @@ class NumpyMetric(Metric):
     Already handles DDP sync and input/output conversions.
     """
 
-    def __init__(self, name: str, reduce_group: Optional[Any] = None, reduce_op: Optional[Any] = None):
-        """
-
-        Args:
-            name: the metric's name
-            reduce_group: the process group for DDP reduces (only needed for DDP training).
-                Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
-        """
-        super().__init__(name, reduce_group)
-        self.reduce_op = reduce_op
-
     @staticmethod
     def input_convert(self, data: Any):
         data = apply_to_collection(data, (torch.Tensor, np.ndarray, numbers.Number), convert_to_numpy)

From d23bdb11fd2a06f2deb51a242c4642e7c3b86863 Mon Sep 17 00:00:00 2001
From: Justus Schock <justus.schock@rwth-aachen.de>
Date: Thu, 3 Sep 2020 12:11:05 +0200
Subject: [PATCH 08/22] fix reduce_op stuff

---
 pytorch_lightning/metrics/classification.py | 314 +++++-----
 pytorch_lightning/metrics/metric.py         |   3 +
 pytorch_lightning/metrics/sklearns.py       | 607 ++++++++------------
 3 files changed, 382 insertions(+), 542 deletions(-)

diff --git a/pytorch_lightning/metrics/classification.py b/pytorch_lightning/metrics/classification.py
index 4edbb624aa55e..b12a61e15f2ef 100644
--- a/pytorch_lightning/metrics/classification.py
+++ b/pytorch_lightning/metrics/classification.py
@@ -30,7 +30,7 @@
     precision,
     precision_recall_curve,
     recall,
-    roc
+    roc,
 )
 from pytorch_lightning.metrics.metric import TensorCollectionMetric, TensorMetric
 
@@ -50,11 +50,10 @@ class Accuracy(TensorMetric):
     """
 
     def __init__(
-            self,
-            num_classes: Optional[int] = None,
-            reduction: str = 'elementwise_mean',
-            reduce_group: Any = None,
-            reduce_op: Any = None,
+        self,
+        num_classes: Optional[int] = None,
+        reduction: str = "elementwise_mean",
+        reduce_group: Any = None,
     ):
         """
         Args:
@@ -67,9 +66,7 @@ def __init__(
             reduce_group: the process group to reduce metric results from DDP
             reduce_op: the operation to perform for ddp reduction
         """
-        super().__init__(name='accuracy',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op)
+        super().__init__(name="accuracy", reduce_group=reduce_group)
         self.num_classes = num_classes
         self.reduction = reduction
 
@@ -84,8 +81,7 @@ def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
         Return:
             A Tensor with the classification score.
         """
-        return accuracy(pred=pred, target=target,
-                        num_classes=self.num_classes, reduction=self.reduction)
+        return accuracy(pred=pred, target=target, num_classes=self.num_classes, reduction=self.reduction)
 
 
 class ConfusionMatrix(TensorMetric):
@@ -106,10 +102,9 @@ class ConfusionMatrix(TensorMetric):
     """
 
     def __init__(
-            self,
-            normalize: bool = False,
-            reduce_group: Any = None,
-            reduce_op: Any = None,
+        self,
+        normalize: bool = False,
+        reduce_group: Any = None,
     ):
         """
         Args:
@@ -117,9 +112,10 @@ def __init__(
             reduce_group: the process group to reduce metric results from DDP
             reduce_op: the operation to perform for ddp reduction
         """
-        super().__init__(name='confusion_matrix',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op)
+        super().__init__(
+            name="confusion_matrix",
+            reduce_group=reduce_group,
+        )
         self.normalize = normalize
 
     def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
@@ -133,8 +129,7 @@ def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
         Return:
             A Tensor with the confusion matrix.
         """
-        return confusion_matrix(pred=pred, target=target,
-                                normalize=self.normalize)
+        return confusion_matrix(pred=pred, target=target, normalize=self.normalize)
 
 
 class PrecisionRecallCurve(TensorCollectionMetric):
@@ -157,10 +152,9 @@ class PrecisionRecallCurve(TensorCollectionMetric):
     """
 
     def __init__(
-            self,
-            pos_label: int = 1,
-            reduce_group: Any = None,
-            reduce_op: Any = None,
+        self,
+        pos_label: int = 1,
+        reduce_group: Any = None,
     ):
         """
         Args:
@@ -168,17 +162,18 @@ def __init__(
             reduce_group: the process group to reduce metric results from DDP
             reduce_op: the operation to perform for ddp reduction
         """
-        super().__init__(name='precision_recall_curve',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op)
+        super().__init__(
+            name="precision_recall_curve",
+            reduce_group=reduce_group,
+        )
 
         self.pos_label = pos_label
 
     def forward(
-            self,
-            pred: torch.Tensor,
-            target: torch.Tensor,
-            sample_weight: Optional[Sequence] = None,
+        self,
+        pred: torch.Tensor,
+        target: torch.Tensor,
+        sample_weight: Optional[Sequence] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Actual metric computation
@@ -193,9 +188,7 @@ def forward(
             - recall values
             - threshold values
         """
-        return precision_recall_curve(pred=pred, target=target,
-                                      sample_weight=sample_weight,
-                                      pos_label=self.pos_label)
+        return precision_recall_curve(pred=pred, target=target, sample_weight=sample_weight, pos_label=self.pos_label)
 
 
 class Precision(TensorMetric):
@@ -213,11 +206,10 @@ class Precision(TensorMetric):
     """
 
     def __init__(
-            self,
-            num_classes: Optional[int] = None,
-            reduction: str = 'elementwise_mean',
-            reduce_group: Any = None,
-            reduce_op: Any = None,
+        self,
+        num_classes: Optional[int] = None,
+        reduction: str = "elementwise_mean",
+        reduce_group: Any = None,
     ):
         """
         Args:
@@ -230,9 +222,10 @@ def __init__(
             reduce_group: the process group to reduce metric results from DDP
             reduce_op: the operation to perform for ddp reduction
         """
-        super().__init__(name='precision',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op)
+        super().__init__(
+            name="precision",
+            reduce_group=reduce_group,
+        )
         self.num_classes = num_classes
         self.reduction = reduction
 
@@ -247,9 +240,7 @@ def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
         Return:
             A Tensor with the classification score.
         """
-        return precision(pred=pred, target=target,
-                         num_classes=self.num_classes,
-                         reduction=self.reduction)
+        return precision(pred=pred, target=target, num_classes=self.num_classes, reduction=self.reduction)
 
 
 class Recall(TensorMetric):
@@ -267,11 +258,10 @@ class Recall(TensorMetric):
     """
 
     def __init__(
-            self,
-            num_classes: Optional[int] = None,
-            reduction: str = 'elementwise_mean',
-            reduce_group: Any = None,
-            reduce_op: Any = None,
+        self,
+        num_classes: Optional[int] = None,
+        reduction: str = "elementwise_mean",
+        reduce_group: Any = None,
     ):
         """
         Args:
@@ -284,9 +274,10 @@ def __init__(
             reduce_group: the process group to reduce metric results from DDP
             reduce_op: the operation to perform for ddp reduction
         """
-        super().__init__(name='recall',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op)
+        super().__init__(
+            name="recall",
+            reduce_group=reduce_group,
+        )
 
         self.num_classes = num_classes
         self.reduction = reduction
@@ -302,10 +293,7 @@ def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
         Return:
             A Tensor with the classification score.
         """
-        return recall(pred=pred,
-                      target=target,
-                      num_classes=self.num_classes,
-                      reduction=self.reduction)
+        return recall(pred=pred, target=target, num_classes=self.num_classes, reduction=self.reduction)
 
 
 class AveragePrecision(TensorMetric):
@@ -323,10 +311,9 @@ class AveragePrecision(TensorMetric):
     """
 
     def __init__(
-            self,
-            pos_label: int = 1,
-            reduce_group: Any = None,
-            reduce_op: Any = None,
+        self,
+        pos_label: int = 1,
+        reduce_group: Any = None,
     ):
         """
         Args:
@@ -334,17 +321,15 @@ def __init__(
             reduce_group: the process group to reduce metric results from DDP
             reduce_op: the operation to perform for ddp reduction
         """
-        super().__init__(name='AP',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op)
+        super().__init__(
+            name="AP",
+            reduce_group=reduce_group,
+        )
 
         self.pos_label = pos_label
 
     def forward(
-            self,
-            pred: torch.Tensor,
-            target: torch.Tensor,
-            sample_weight: Optional[Sequence] = None
+        self, pred: torch.Tensor, target: torch.Tensor, sample_weight: Optional[Sequence] = None
     ) -> torch.Tensor:
         """
         Actual metric computation
@@ -357,9 +342,7 @@ def forward(
         Return:
             torch.Tensor: classification score
         """
-        return average_precision(pred=pred, target=target,
-                                 sample_weight=sample_weight,
-                                 pos_label=self.pos_label)
+        return average_precision(pred=pred, target=target, sample_weight=sample_weight, pos_label=self.pos_label)
 
 
 class AUROC(TensorMetric):
@@ -377,10 +360,9 @@ class AUROC(TensorMetric):
     """
 
     def __init__(
-            self,
-            pos_label: int = 1,
-            reduce_group: Any = None,
-            reduce_op: Any = None,
+        self,
+        pos_label: int = 1,
+        reduce_group: Any = None,
     ):
         """
         Args:
@@ -388,17 +370,15 @@ def __init__(
             reduce_group: the process group to reduce metric results from DDP
             reduce_op: the operation to perform for ddp reduction
         """
-        super().__init__(name='auroc',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op)
+        super().__init__(
+            name="auroc",
+            reduce_group=reduce_group,
+        )
 
         self.pos_label = pos_label
 
     def forward(
-            self,
-            pred: torch.Tensor,
-            target: torch.Tensor,
-            sample_weight: Optional[Sequence] = None
+        self, pred: torch.Tensor, target: torch.Tensor, sample_weight: Optional[Sequence] = None
     ) -> torch.Tensor:
         """
         Actual metric computation
@@ -411,9 +391,7 @@ def forward(
         Return:
             torch.Tensor: classification score
         """
-        return auroc(pred=pred, target=target,
-                     sample_weight=sample_weight,
-                     pos_label=self.pos_label)
+        return auroc(pred=pred, target=target, sample_weight=sample_weight, pos_label=self.pos_label)
 
 
 class FBeta(TensorMetric):
@@ -431,12 +409,11 @@ class FBeta(TensorMetric):
     """
 
     def __init__(
-            self,
-            beta: float,
-            num_classes: Optional[int] = None,
-            reduction: str = 'elementwise_mean',
-            reduce_group: Any = None,
-            reduce_op: Any = None,
+        self,
+        beta: float,
+        num_classes: Optional[int] = None,
+        reduction: str = "elementwise_mean",
+        reduce_group: Any = None,
     ):
         """
         Args:
@@ -450,9 +427,10 @@ def __init__(
             reduce_group: the process group to reduce metric results from DDP
             reduce_op: the operation to perform for DDP reduction
         """
-        super().__init__(name='fbeta',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op)
+        super().__init__(
+            name="fbeta",
+            reduce_group=reduce_group,
+        )
 
         self.beta = beta
         self.num_classes = num_classes
@@ -469,9 +447,9 @@ def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
         Return:
             torch.Tensor: classification score
         """
-        return fbeta_score(pred=pred, target=target,
-                           beta=self.beta, num_classes=self.num_classes,
-                           reduction=self.reduction)
+        return fbeta_score(
+            pred=pred, target=target, beta=self.beta, num_classes=self.num_classes, reduction=self.reduction
+        )
 
 
 class F1(TensorMetric):
@@ -489,11 +467,10 @@ class F1(TensorMetric):
     """
 
     def __init__(
-            self,
-            num_classes: Optional[int] = None,
-            reduction: str = 'elementwise_mean',
-            reduce_group: Any = None,
-            reduce_op: Any = None,
+        self,
+        num_classes: Optional[int] = None,
+        reduction: str = "elementwise_mean",
+        reduce_group: Any = None,
     ):
         """
         Args:
@@ -506,9 +483,10 @@ def __init__(
             reduce_group: the process group to reduce metric results from DDP
             reduce_op: the operation to perform for ddp reduction
         """
-        super().__init__(name='f1',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op)
+        super().__init__(
+            name="f1",
+            reduce_group=reduce_group,
+        )
 
         self.num_classes = num_classes
         self.reduction = reduction
@@ -524,9 +502,7 @@ def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
         Return:
             torch.Tensor: classification score
         """
-        return f1_score(pred=pred, target=target,
-                        num_classes=self.num_classes,
-                        reduction=self.reduction)
+        return f1_score(pred=pred, target=target, num_classes=self.num_classes, reduction=self.reduction)
 
 
 class ROC(TensorCollectionMetric):
@@ -549,10 +525,9 @@ class ROC(TensorCollectionMetric):
     """
 
     def __init__(
-            self,
-            pos_label: int = 1,
-            reduce_group: Any = None,
-            reduce_op: Any = None,
+        self,
+        pos_label: int = 1,
+        reduce_group: Any = None,
     ):
         """
         Args:
@@ -560,17 +535,15 @@ def __init__(
             reduce_group: the process group to reduce metric results from DDP
             reduce_op: the operation to perform for ddp reduction
         """
-        super().__init__(name='roc',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op)
+        super().__init__(
+            name="roc",
+            reduce_group=reduce_group,
+        )
 
         self.pos_label = pos_label
 
     def forward(
-            self,
-            pred: torch.Tensor,
-            target: torch.Tensor,
-            sample_weight: Optional[Sequence] = None
+        self, pred: torch.Tensor, target: torch.Tensor, sample_weight: Optional[Sequence] = None
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Actual metric computation
@@ -585,9 +558,7 @@ def forward(
             - true positive rate
             - thresholds
         """
-        return roc(pred=pred, target=target,
-                   sample_weight=sample_weight,
-                   pos_label=self.pos_label)
+        return roc(pred=pred, target=target, sample_weight=sample_weight, pos_label=self.pos_label)
 
 
 class MulticlassROC(TensorCollectionMetric):
@@ -611,10 +582,9 @@ class MulticlassROC(TensorCollectionMetric):
     """
 
     def __init__(
-            self,
-            num_classes: Optional[int] = None,
-            reduce_group: Any = None,
-            reduce_op: Any = None,
+        self,
+        num_classes: Optional[int] = None,
+        reduce_group: Any = None,
     ):
         """
         Args:
@@ -622,16 +592,18 @@ def __init__(
             reduce_group: the process group to reduce metric results from DDP
             reduce_op: the operation to perform for ddp reduction
         """
-        super().__init__(name='multiclass_roc',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op)
+        super().__init__(
+            name="multiclass_roc",
+            reduce_group=reduce_group,
+        )
 
         self.num_classes = num_classes
 
     def forward(
-            self, pred: torch.Tensor,
-            target: torch.Tensor,
-            sample_weight: Optional[Sequence] = None,
+        self,
+        pred: torch.Tensor,
+        target: torch.Tensor,
+        sample_weight: Optional[Sequence] = None,
     ) -> Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
         """
         Actual metric computation
@@ -645,10 +617,7 @@ def forward(
             tuple: A tuple consisting of one tuple per class, holding false positive rate, true positive rate and thresholds
 
         """
-        return multiclass_roc(pred=pred,
-                              target=target,
-                              sample_weight=sample_weight,
-                              num_classes=self.num_classes)
+        return multiclass_roc(pred=pred, target=target, sample_weight=sample_weight, num_classes=self.num_classes)
 
 
 class MulticlassPrecisionRecallCurve(TensorCollectionMetric):
@@ -670,10 +639,9 @@ class MulticlassPrecisionRecallCurve(TensorCollectionMetric):
     """
 
     def __init__(
-            self,
-            num_classes: Optional[int] = None,
-            reduce_group: Any = None,
-            reduce_op: Any = None,
+        self,
+        num_classes: Optional[int] = None,
+        reduce_group: Any = None,
     ):
         """
         Args:
@@ -682,17 +650,18 @@ def __init__(
             reduce_op: the operation to perform for ddp reduction
 
         """
-        super().__init__(name='multiclass_precision_recall_curve',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op)
+        super().__init__(
+            name="multiclass_precision_recall_curve",
+            reduce_group=reduce_group,
+        )
 
         self.num_classes = num_classes
 
     def forward(
-            self,
-            pred: torch.Tensor,
-            target: torch.Tensor,
-            sample_weight: Optional[Sequence] = None,
+        self,
+        pred: torch.Tensor,
+        target: torch.Tensor,
+        sample_weight: Optional[Sequence] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Actual metric computation
@@ -706,10 +675,9 @@ def forward(
             tuple: A tuple consisting of one tuple per class, holding precision, recall and thresholds
 
         """
-        return multiclass_precision_recall_curve(pred=pred,
-                                                 target=target,
-                                                 sample_weight=sample_weight,
-                                                 num_classes=self.num_classes)
+        return multiclass_precision_recall_curve(
+            pred=pred, target=target, sample_weight=sample_weight, num_classes=self.num_classes
+        )
 
 
 class DiceCoefficient(TensorMetric):
@@ -729,12 +697,12 @@ class DiceCoefficient(TensorMetric):
     """
 
     def __init__(
-            self,
-            include_background: bool = False,
-            nan_score: float = 0.0, no_fg_score: float = 0.0,
-            reduction: str = 'elementwise_mean',
-            reduce_group: Any = None,
-            reduce_op: Any = None,
+        self,
+        include_background: bool = False,
+        nan_score: float = 0.0,
+        no_fg_score: float = 0.0,
+        reduction: str = "elementwise_mean",
+        reduce_group: Any = None,
     ):
         """
         Args:
@@ -749,9 +717,10 @@ def __init__(
             reduce_group: the process group to reduce metric results from DDP
             reduce_op: the operation to perform for ddp reduction
         """
-        super().__init__(name='dice',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op)
+        super().__init__(
+            name="dice",
+            reduce_group=reduce_group,
+        )
 
         self.include_background = include_background
         self.nan_score = nan_score
@@ -769,12 +738,14 @@ def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
         Return:
             torch.Tensor: the calculated dice coefficient
         """
-        return dice_score(pred=pred,
-                          target=target,
-                          bg=self.include_background,
-                          nan_score=self.nan_score,
-                          no_fg_score=self.no_fg_score,
-                          reduction=self.reduction)
+        return dice_score(
+            pred=pred,
+            target=target,
+            bg=self.include_background,
+            nan_score=self.nan_score,
+            no_fg_score=self.no_fg_score,
+            reduction=self.reduction,
+        )
 
 
 class IoU(TensorMetric):
@@ -795,11 +766,7 @@ class IoU(TensorMetric):
 
     """
 
-    def __init__(
-            self,
-            remove_bg: bool = False,
-            reduction: str = 'elementwise_mean'
-    ):
+    def __init__(self, remove_bg: bool = False, reduction: str = "elementwise_mean"):
         """
         Args:
             remove_bg: Flag to state whether a background class has been included
@@ -813,12 +780,11 @@ def __init__(
                 - none: pass array
                 - sum: add elements
         """
-        super().__init__(name='iou')
+        super().__init__(name="iou")
         self.remove_bg = remove_bg
         self.reduction = reduction
 
-    def forward(self, y_pred: torch.Tensor, y_true: torch.Tensor,
-                sample_weight: Optional[torch.Tensor] = None):
+    def forward(self, y_pred: torch.Tensor, y_true: torch.Tensor, sample_weight: Optional[torch.Tensor] = None):
         """
         Actual metric calculation.
         """
diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py
index fd5797ec3355a..9f92c707b1dc8 100644
--- a/pytorch_lightning/metrics/metric.py
+++ b/pytorch_lightning/metrics/metric.py
@@ -211,6 +211,9 @@ def output_convert(self, data: Any, output: Any):
         return super(TensorMetric, self).output_convert(self, data, output)
 
 
+def my_func(dummy = ()):  # NOTE(review): no-op placeholder, not referenced in the visible hunks of this patch; presumably leftover debug code, confirm before merging
+    pass
+
 class TensorCollectionMetric(Metric):
     """
     Base class for metric implementation operating directly on tensors.
diff --git a/pytorch_lightning/metrics/sklearns.py b/pytorch_lightning/metrics/sklearns.py
index e40e7ec4de87a..140e960eb487a 100644
--- a/pytorch_lightning/metrics/sklearns.py
+++ b/pytorch_lightning/metrics/sklearns.py
@@ -22,15 +22,13 @@
 from pytorch_lightning.utilities import rank_zero_warn
 
 try:
-    from torch.distributed import ReduceOp, group
+    from torch.distributed import group
 except ImportError:
-    class ReduceOp:
-        SUM = None
 
     class group:
         WORLD = None
 
-    rank_zero_warn('Unsupported `ReduceOp` for distributed computing.')
+    rank_zero_warn("Unsupported `ReduceOp` for distributed computing.")
 
 
 class SklearnMetric(NumpyMetric):
@@ -45,11 +43,10 @@ class SklearnMetric(NumpyMetric):
     """
 
     def __init__(
-            self,
-            metric_name: str,
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
-            **kwargs,
+        self,
+        metric_name: str,
+        reduce_group: Any = group.WORLD,
+        **kwargs,
     ):
         """
         Args:
@@ -60,19 +57,21 @@ def __init__(
                 Defaults to sum.
             **kwargs: additonal keyword arguments (will be forwarded to metric call)
         """
-        super().__init__(name=metric_name,
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op)
+        super().__init__(
+            name=metric_name,
+            reduce_group=reduce_group,
+        )
 
         self.metric_kwargs = kwargs
         lightning_logger.debug(
-            f'Metric {self.__class__.__name__} is using Sklearn as backend, meaning that'
-            ' every metric call will cause a GPU synchronization, which may slow down your code'
+            f"Metric {self.__class__.__name__} is using Sklearn as backend, meaning that"
+            " every metric call will cause a GPU synchronization, which may slow down your code"
         )
 
     @property
     def metric_fn(self):
         import sklearn.metrics
+
         return getattr(sklearn.metrics, self.name)
 
     def forward(self, *args, **kwargs) -> Union[np.ndarray, int, float]:
@@ -108,10 +107,9 @@ class Accuracy(SklearnMetric):
     """
 
     def __init__(
-            self,
-            normalize: bool = True,
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        normalize: bool = True,
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -122,16 +120,13 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__(metric_name='accuracy_score',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         normalize=normalize)
+        super().__init__(metric_name="accuracy_score", reduce_group=reduce_group, normalize=normalize)
 
     def forward(
-            self,
-            y_pred: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_pred: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ) -> float:
         """
         Computes the accuracy
@@ -164,9 +159,8 @@ class AUC(SklearnMetric):
     """
 
     def __init__(
-            self,
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -176,9 +170,7 @@ def __init__(
                 Defaults to sum.
         """
 
-        super().__init__(metric_name='auc',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op)
+        super().__init__(metric_name="auc", reduce_group=reduce_group)
 
     def forward(self, x: np.ndarray, y: np.ndarray) -> float:
         """
@@ -202,10 +194,9 @@ class AveragePrecision(SklearnMetric):
     """
 
     def __init__(
-            self,
-            average: Optional[str] = 'macro',
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        average: Optional[str] = "macro",
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -225,16 +216,13 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('average_precision_score',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         average=average)
+        super().__init__("average_precision_score", reduce_group=reduce_group, average=average)
 
     def forward(
-            self,
-            y_score: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_score: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ) -> float:
         """
         Args:
@@ -246,12 +234,11 @@ def forward(
         Return:
             average precision score
         """
-        return super().forward(y_score=y_score, y_true=y_true,
-                               sample_weight=sample_weight)
+        return super().forward(y_score=y_score, y_true=y_true, sample_weight=sample_weight)
 
 
 class BalancedAccuracy(SklearnMetric):
-    """ Compute the balanced accuracy score
+    """Compute the balanced accuracy score
 
     Warning:
         Every metric call will cause a GPU synchronization, which may slow down your code
@@ -267,10 +254,9 @@ class BalancedAccuracy(SklearnMetric):
     """
 
     def __init__(
-            self,
-            adjusted: bool = False,
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        adjusted: bool = False,
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -281,16 +267,13 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('balanced_accuracy_score',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         adjusted=adjusted)
+        super().__init__("balanced_accuracy_score", reduce_group=reduce_group, adjusted=adjusted)
 
     def forward(
-            self,
-            y_pred: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_pred: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ) -> float:
         """
         Args:
@@ -302,9 +285,7 @@ def forward(
             balanced accuracy score
 
         """
-        return super().forward(y_true=y_true,
-                               y_pred=y_pred,
-                               sample_weight=sample_weight)
+        return super().forward(y_true=y_true, y_pred=y_pred, sample_weight=sample_weight)
 
 
 class CohenKappaScore(SklearnMetric):
@@ -322,11 +303,10 @@ class CohenKappaScore(SklearnMetric):
     """
 
     def __init__(
-            self,
-            labels: Optional[Sequence] = None,
-            weights: Optional[str] = None,
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        labels: Optional[Sequence] = None,
+        weights: Optional[str] = None,
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -342,17 +322,13 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('cohen_kappa_score',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         labels=labels,
-                         weights=weights)
+        super().__init__("cohen_kappa_score", reduce_group=reduce_group, labels=labels, weights=weights)
 
     def forward(
-            self,
-            y1: np.ndarray,
-            y2: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y1: np.ndarray,
+        y2: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ) -> float:
         """
         Args:
@@ -386,10 +362,9 @@ class ConfusionMatrix(SklearnMetric):
     """
 
     def __init__(
-            self,
-            labels: Optional[Sequence] = None,
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        labels: Optional[Sequence] = None,
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -402,10 +377,7 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('confusion_matrix',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         labels=labels)
+        super().__init__("confusion_matrix", reduce_group=reduce_group, labels=labels)
 
     def forward(self, y_pred: np.ndarray, y_true: np.ndarray) -> np.ndarray:
         """
@@ -421,7 +393,7 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray) -> np.ndarray:
 
 
 class DCG(SklearnMetric):
-    """ Compute discounted cumulative gain
+    """Compute discounted cumulative gain
 
     Warning:
         Every metric call will cause a GPU synchronization, which may slow down your code
@@ -436,12 +408,11 @@ class DCG(SklearnMetric):
     """
 
     def __init__(
-            self,
-            k: Optional[int] = None,
-            log_base: float = 2,
-            ignore_ties: bool = False,
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        k: Optional[int] = None,
+        log_base: float = 2,
+        ignore_ties: bool = False,
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -453,18 +424,13 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('dcg_score',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         k=k,
-                         log_base=log_base,
-                         ignore_ties=ignore_ties)
+        super().__init__("dcg_score", reduce_group=reduce_group, k=k, log_base=log_base, ignore_ties=ignore_ties)
 
     def forward(
-            self,
-            y_score: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_score: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ) -> float:
         """
         Args:
@@ -477,9 +443,7 @@ def forward(
             DCG score
 
         """
-        return super().forward(y_true=y_true,
-                               y_score=y_score,
-                               sample_weight=sample_weight)
+        return super().forward(y_true=y_true, y_score=y_score, sample_weight=sample_weight)
 
 
 class F1(SklearnMetric):
@@ -511,12 +475,11 @@ class F1(SklearnMetric):
     """
 
     def __init__(
-            self,
-            labels: Optional[Sequence] = None,
-            pos_label: Union[str, int] = 1,
-            average: Optional[str] = 'macro',
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        labels: Optional[Sequence] = None,
+        pos_label: Union[str, int] = 1,
+        average: Optional[str] = "macro",
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -553,18 +516,13 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('f1_score',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         labels=labels,
-                         pos_label=pos_label,
-                         average=average)
+        super().__init__("f1_score", reduce_group=reduce_group, labels=labels, pos_label=pos_label, average=average)
 
     def forward(
-            self,
-            y_pred: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_pred: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ) -> Union[np.ndarray, float]:
         """
         Args:
@@ -603,13 +561,12 @@ class FBeta(SklearnMetric):
     """
 
     def __init__(
-            self,
-            beta: float,
-            labels: Optional[Sequence] = None,
-            pos_label: Union[str, int] = 1,
-            average: Optional[str] = 'macro',
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        beta: float,
+        labels: Optional[Sequence] = None,
+        pos_label: Union[str, int] = 1,
+        average: Optional[str] = "macro",
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -647,19 +604,15 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('fbeta_score',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         beta=beta,
-                         labels=labels,
-                         pos_label=pos_label,
-                         average=average)
+        super().__init__(
+            "fbeta_score", reduce_group=reduce_group, beta=beta, labels=labels, pos_label=pos_label, average=average
+        )
 
     def forward(
-            self,
-            y_pred: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_pred: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ) -> Union[np.ndarray, float]:
         """
         Args:
@@ -690,9 +643,8 @@ class Hamming(SklearnMetric):
     """
 
     def __init__(
-            self,
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -702,15 +654,13 @@ def __init__(
                 Defaults to sum.
 
         """
-        super().__init__('hamming_loss',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op)
+        super().__init__("hamming_loss", reduce_group=reduce_group)
 
     def forward(
-            self,
-            y_pred: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_pred: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ) -> Union[np.ndarray, float]:
         """
         Args:
@@ -740,10 +690,9 @@ class Hinge(SklearnMetric):
     """
 
     def __init__(
-            self,
-            labels: Optional[Sequence] = None,
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        labels: Optional[Sequence] = None,
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -753,16 +702,13 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('hinge_loss',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         labels=labels)
+        super().__init__("hinge_loss", reduce_group=reduce_group, labels=labels)
 
     def forward(
-            self,
-            pred_decision: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        pred_decision: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ) -> float:
         """
         Args:
@@ -774,9 +720,7 @@ def forward(
             Average hinge loss
 
         """
-        return super().forward(pred_decision=pred_decision,
-                               y_true=y_true,
-                               sample_weight=sample_weight)
+        return super().forward(pred_decision=pred_decision, y_true=y_true, sample_weight=sample_weight)
 
 
 class Jaccard(SklearnMetric):
@@ -794,12 +738,11 @@ class Jaccard(SklearnMetric):
     """
 
     def __init__(
-            self,
-            labels: Optional[Sequence] = None,
-            pos_label: Union[str, int] = 1,
-            average: Optional[str] = 'macro',
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        labels: Optional[Sequence] = None,
+        pos_label: Union[str, int] = 1,
+        average: Optional[str] = "macro",
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -836,18 +779,15 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('jaccard_score',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         labels=labels,
-                         pos_label=pos_label,
-                         average=average)
+        super().__init__(
+            "jaccard_score", reduce_group=reduce_group, labels=labels, pos_label=pos_label, average=average
+        )
 
     def forward(
-            self,
-            y_pred: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_pred: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ) -> Union[np.ndarray, float]:
         """
         Args:
@@ -882,12 +822,11 @@ class Precision(SklearnMetric):
     """
 
     def __init__(
-            self,
-            labels: Optional[Sequence] = None,
-            pos_label: Union[str, int] = 1,
-            average: Optional[str] = 'macro',
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        labels: Optional[Sequence] = None,
+        pos_label: Union[str, int] = 1,
+        average: Optional[str] = "macro",
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -924,18 +863,15 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('precision_score',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         labels=labels,
-                         pos_label=pos_label,
-                         average=average)
+        super().__init__(
+            "precision_score", reduce_group=reduce_group, labels=labels, pos_label=pos_label, average=average
+        )
 
     def forward(
-            self,
-            y_pred: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_pred: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ) -> Union[np.ndarray, float]:
         """
         Args:
@@ -970,12 +906,11 @@ class Recall(SklearnMetric):
     """
 
     def __init__(
-            self,
-            labels: Optional[Sequence] = None,
-            pos_label: Union[str, int] = 1,
-            average: Optional[str] = 'macro',
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        labels: Optional[Sequence] = None,
+        pos_label: Union[str, int] = 1,
+        average: Optional[str] = "macro",
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -1012,18 +947,13 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('recall_score',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         labels=labels,
-                         pos_label=pos_label,
-                         average=average)
+        super().__init__("recall_score", reduce_group=reduce_group, labels=labels, pos_label=pos_label, average=average)
 
     def forward(
-            self,
-            y_pred: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_pred: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ) -> Union[np.ndarray, float]:
         """
         Args:
@@ -1059,10 +989,9 @@ class PrecisionRecallCurve(SklearnMetric):
     """
 
     def __init__(
-            self,
-            pos_label: Union[str, int] = 1,
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        pos_label: Union[str, int] = 1,
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -1072,16 +1001,13 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('precision_recall_curve',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         pos_label=pos_label)
+        super().__init__("precision_recall_curve", reduce_group=reduce_group, pos_label=pos_label)
 
     def forward(
-            self,
-            probas_pred: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        probas_pred: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ) -> Union[np.ndarray, float]:
         """
         Args:
@@ -1103,9 +1029,7 @@ def forward(
         """
         # only return x and y here, since for now we cannot auto-convert elements of multiple length.
         # Will be fixed in native implementation
-        return np.array(super().forward(probas_pred=probas_pred,
-                                        y_true=y_true,
-                                        sample_weight=sample_weight)[:2])
+        return np.array(super().forward(probas_pred=probas_pred, y_true=y_true, sample_weight=sample_weight)[:2])
 
 
 class ROC(SklearnMetric):
@@ -1136,10 +1060,9 @@ class ROC(SklearnMetric):
     """
 
     def __init__(
-            self,
-            pos_label: Union[str, int] = 1,
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        pos_label: Union[str, int] = 1,
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -1149,16 +1072,13 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('roc_curve',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         pos_label=pos_label)
+        super().__init__("roc_curve", reduce_group=reduce_group, pos_label=pos_label)
 
     def forward(
-            self,
-            y_score: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_score: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ) -> Union[np.ndarray, float]:
         """
         Args:
@@ -1197,10 +1117,9 @@ class AUROC(SklearnMetric):
     """
 
     def __init__(
-            self,
-            average: Optional[str] = 'macro',
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        average: Optional[str] = "macro",
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -1220,16 +1139,13 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('roc_auc_score',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         average=average)
+        super().__init__("roc_auc_score", reduce_group=reduce_group, average=average)
 
     def forward(
-            self,
-            y_score: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_score: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ) -> float:
         """
         Args:
@@ -1241,8 +1157,7 @@ def forward(
         Return:
             Area Under Receiver Operating Characteristic Curve
         """
-        return super().forward(y_score=y_score, y_true=y_true,
-                               sample_weight=sample_weight)
+        return super().forward(y_score=y_score, y_true=y_true, sample_weight=sample_weight)
 
 
 class ExplainedVariance(SklearnMetric):
@@ -1262,10 +1177,9 @@ class ExplainedVariance(SklearnMetric):
     """
 
     def __init__(
-            self,
-            multioutput: Optional[Union[str, List[float]]] = 'variance_weighted',
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        multioutput: Optional[Union[str, List[float]]] = "variance_weighted",
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -1277,16 +1191,13 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('explained_variance_score',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         multioutput=multioutput)
+        super().__init__("explained_variance_score", reduce_group=reduce_group, multioutput=multioutput)
 
     def forward(
-            self,
-            y_pred: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_pred: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ):
         """
         Args:
@@ -1298,8 +1209,7 @@ def forward(
             Explained variance score
 
         """
-        return super().forward(y_true=y_true, y_pred=y_pred,
-                               sample_weight=sample_weight)
+        return super().forward(y_true=y_true, y_pred=y_pred, sample_weight=sample_weight)
 
 
 class MeanAbsoluteError(SklearnMetric):
@@ -1320,10 +1230,9 @@ class MeanAbsoluteError(SklearnMetric):
     """
 
     def __init__(
-            self,
-            multioutput: Optional[Union[str, List[float]]] = 'uniform_average',
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        multioutput: Optional[Union[str, List[float]]] = "uniform_average",
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -1335,13 +1244,9 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('mean_absolute_error',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         multioutput=multioutput)
+        super().__init__("mean_absolute_error", reduce_group=reduce_group, multioutput=multioutput)
 
-    def forward(self, y_pred: np.ndarray, y_true: np.ndarray,
-                sample_weight: Optional[np.ndarray] = None):
+    def forward(self, y_pred: np.ndarray, y_true: np.ndarray, sample_weight: Optional[np.ndarray] = None):
         """
         Args:
             y_pred: Estimated target values
@@ -1352,9 +1257,7 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray,
             Mean absolute error
 
         """
-        return super().forward(y_true=y_true,
-                               y_pred=y_pred,
-                               sample_weight=sample_weight)
+        return super().forward(y_true=y_true, y_pred=y_pred, sample_weight=sample_weight)
 
 
 class MeanSquaredError(SklearnMetric):
@@ -1378,11 +1281,10 @@ class MeanSquaredError(SklearnMetric):
     """
 
     def __init__(
-            self,
-            multioutput: Optional[Union[str, List[float]]] = 'uniform_average',
-            squared: bool = False,
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        multioutput: Optional[Union[str, List[float]]] = "uniform_average",
+        squared: bool = False,
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -1395,17 +1297,14 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('mean_squared_error',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         multioutput=multioutput)
+        super().__init__("mean_squared_error", reduce_group=reduce_group, multioutput=multioutput)
         self.squared = squared
 
     def forward(
-            self,
-            y_pred: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_pred: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ):
         """
         Args:
@@ -1417,8 +1316,7 @@ def forward(
             Mean squared error
 
         """
-        mse = super().forward(y_true=y_true, y_pred=y_pred,
-                              sample_weight=sample_weight)
+        mse = super().forward(y_true=y_true, y_pred=y_pred, sample_weight=sample_weight)
         if self.squared:
             mse = np.sqrt(mse)
         return mse
@@ -1441,10 +1339,9 @@ class MeanSquaredLogError(SklearnMetric):
     """
 
     def __init__(
-            self,
-            multioutput: Optional[Union[str, List[float]]] = 'uniform_average',
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        multioutput: Optional[Union[str, List[float]]] = "uniform_average",
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -1456,16 +1353,13 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('mean_squared_log_error',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         multioutput=multioutput)
+        super().__init__("mean_squared_log_error", reduce_group=reduce_group, multioutput=multioutput)
 
     def forward(
-            self,
-            y_pred: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_pred: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ):
         """
         Args:
@@ -1477,8 +1371,7 @@ def forward(
             Mean squared log error
 
         """
-        return super().forward(y_true=y_true, y_pred=y_pred,
-                               sample_weight=sample_weight)
+        return super().forward(y_true=y_true, y_pred=y_pred, sample_weight=sample_weight)
 
 
 class MedianAbsoluteError(SklearnMetric):
@@ -1498,10 +1391,9 @@ class MedianAbsoluteError(SklearnMetric):
     """
 
     def __init__(
-            self,
-            multioutput: Optional[Union[str, List[float]]] = 'uniform_average',
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        multioutput: Optional[Union[str, List[float]]] = "uniform_average",
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -1513,10 +1405,7 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('median_absolute_error',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         multioutput=multioutput)
+        super().__init__("median_absolute_error", reduce_group=reduce_group, multioutput=multioutput)
 
     def forward(self, y_pred: np.ndarray, y_true: np.ndarray):
         """
@@ -1548,10 +1437,9 @@ class R2Score(SklearnMetric):
     """
 
     def __init__(
-            self,
-            multioutput: Optional[Union[str, List[float]]] = 'uniform_average',
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        multioutput: Optional[Union[str, List[float]]] = "uniform_average",
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -1563,16 +1451,13 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('r2_score',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         multioutput=multioutput)
+        super().__init__("r2_score", reduce_group=reduce_group, multioutput=multioutput)
 
     def forward(
-            self,
-            y_pred: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_pred: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ):
         """
         Args:
@@ -1584,8 +1469,7 @@ def forward(
             R^2 score
 
         """
-        return super().forward(y_true=y_true, y_pred=y_pred,
-                               sample_weight=sample_weight)
+        return super().forward(y_true=y_true, y_pred=y_pred, sample_weight=sample_weight)
 
 
 class MeanPoissonDeviance(SklearnMetric):
@@ -1605,9 +1489,8 @@ class MeanPoissonDeviance(SklearnMetric):
     """
 
     def __init__(
-            self,
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -1616,15 +1499,13 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('mean_poisson_deviance',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op)
+        super().__init__("mean_poisson_deviance", reduce_group=reduce_group)
 
     def forward(
-            self,
-            y_pred: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_pred: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ):
         """
         Args:
@@ -1636,8 +1517,7 @@ def forward(
             Mean possion deviance
 
         """
-        return super().forward(y_true=y_true, y_pred=y_pred,
-                               sample_weight=sample_weight)
+        return super().forward(y_true=y_true, y_pred=y_pred, sample_weight=sample_weight)
 
 
 class MeanGammaDeviance(SklearnMetric):
@@ -1657,9 +1537,8 @@ class MeanGammaDeviance(SklearnMetric):
     """
 
     def __init__(
-            self,
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -1668,15 +1547,13 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('mean_gamma_deviance',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op)
+        super().__init__("mean_gamma_deviance", reduce_group=reduce_group)
 
     def forward(
-            self,
-            y_pred: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_pred: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ):
         """
         Args:
@@ -1688,8 +1565,7 @@ def forward(
             Mean gamma deviance
 
         """
-        return super().forward(y_true=y_true, y_pred=y_pred,
-                               sample_weight=sample_weight)
+        return super().forward(y_true=y_true, y_pred=y_pred, sample_weight=sample_weight)
 
 
 class MeanTweedieDeviance(SklearnMetric):
@@ -1709,10 +1585,9 @@ class MeanTweedieDeviance(SklearnMetric):
     """
 
     def __init__(
-            self,
-            power: float = 0,
-            reduce_group: Any = group.WORLD,
-            reduce_op: Any = ReduceOp.SUM,
+        self,
+        power: float = 0,
+        reduce_group: Any = group.WORLD,
     ):
         """
         Args:
@@ -1732,16 +1607,13 @@ def __init__(
             reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
                 Defaults to sum.
         """
-        super().__init__('mean_tweedie_deviance',
-                         reduce_group=reduce_group,
-                         reduce_op=reduce_op,
-                         power=power)
+        super().__init__("mean_tweedie_deviance", reduce_group=reduce_group, power=power)
 
     def forward(
-            self,
-            y_pred: np.ndarray,
-            y_true: np.ndarray,
-            sample_weight: Optional[np.ndarray] = None,
+        self,
+        y_pred: np.ndarray,
+        y_true: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ):
         """
         Args:
@@ -1753,5 +1625,4 @@ def forward(
             Mean tweedie deviance
 
         """
-        return super().forward(y_true=y_true, y_pred=y_pred,
-                               sample_weight=sample_weight)
+        return super().forward(y_true=y_true, y_pred=y_pred, sample_weight=sample_weight)

From 58ce8c449fc85a9574cfb231d9b9a3e71dfb0c37 Mon Sep 17 00:00:00 2001
From: Justus Schock <justus.schock@rwth-aachen.de>
Date: Thu, 3 Sep 2020 15:07:34 +0200
Subject: [PATCH 09/22] start test fixing

---
 pytorch_lightning/metrics/classification.py |  4 +-
 pytorch_lightning/metrics/metric.py         |  5 +--
 pytorch_lightning/metrics/sklearns.py       | 44 ++++++++++-----------
 tests/metrics/test_metrics.py               |  6 +--
 4 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/pytorch_lightning/metrics/classification.py b/pytorch_lightning/metrics/classification.py
index b12a61e15f2ef..29affddb140ce 100644
--- a/pytorch_lightning/metrics/classification.py
+++ b/pytorch_lightning/metrics/classification.py
@@ -44,8 +44,8 @@ class Accuracy(TensorMetric):
         >>> pred = torch.tensor([0, 1, 2, 3])
         >>> target = torch.tensor([0, 1, 2, 2])
         >>> metric = Accuracy()
-        >>> metric(pred, target)
-        tensor(0.7500)
+        >>> metric(pred, target).item()
+        0.75
 
     """
 
diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py
index 9f92c707b1dc8..c0bc30c42472b 100644
--- a/pytorch_lightning/metrics/metric.py
+++ b/pytorch_lightning/metrics/metric.py
@@ -164,7 +164,7 @@ def aggregate(self, *tensors: torch.Tensor) -> torch.Tensor:
             aggregated values
 
         """
-        return torch.cat(tensors).mean()
+        return torch.cat(tensors).mean(0)
 
     @staticmethod
     def compute(self, data: Any, output: Any):
@@ -211,9 +211,6 @@ def output_convert(self, data: Any, output: Any):
         return super(TensorMetric, self).output_convert(self, data, output)
 
 
-def my_func(dummy = ()):
-    pass
-
 class TensorCollectionMetric(Metric):
     """
     Base class for metric implementation operating directly on tensors.
diff --git a/pytorch_lightning/metrics/sklearns.py b/pytorch_lightning/metrics/sklearns.py
index 140e960eb487a..ba7e3db1be5d5 100644
--- a/pytorch_lightning/metrics/sklearns.py
+++ b/pytorch_lightning/metrics/sklearns.py
@@ -102,7 +102,7 @@ class Accuracy(SklearnMetric):
         >>> y_true = torch.tensor([0, 1, 2, 2])
         >>> metric = Accuracy()
         >>> metric(y_pred, y_true)
-        tensor([0.7500])
+        tensor(0.7500)
 
     """
 
@@ -155,7 +155,7 @@ class AUC(SklearnMetric):
         >>> y_true = torch.tensor([0, 1, 2, 2])
         >>> metric = AUC()
         >>> metric(y_pred, y_true)
-        tensor([4.])
+        tensor(4.)
     """
 
     def __init__(
@@ -249,7 +249,7 @@ class BalancedAccuracy(SklearnMetric):
         >>> y_true = torch.tensor([0, 0, 1, 1])
         >>> metric = BalancedAccuracy()
         >>> metric(y_pred, y_true)
-        tensor([0.7500])
+        tensor(0.7500)
 
     """
 
@@ -298,7 +298,7 @@ class CohenKappaScore(SklearnMetric):
         >>> y_true = torch.tensor([2, 2, 2, 1])
         >>> metric = CohenKappaScore()
         >>> metric(y_pred, y_true)
-        tensor([-0.3333])
+        tensor(-0.3333)
 
     """
 
@@ -404,7 +404,7 @@ class DCG(SklearnMetric):
         >>> y_true = torch.tensor([[10, 0, 0, 1, 5]])
         >>> metric = DCG()
         >>> metric(y_score, y_true)
-        tensor([9.4995])
+        tensor(9.4995)
     """
 
     def __init__(
@@ -467,7 +467,7 @@ class F1(SklearnMetric):
         >>> y_true = torch.tensor([0, 1, 2, 2])
         >>> metric = F1()
         >>> metric(y_pred, y_true)
-        tensor([0.6667])
+        tensor(0.6667)
 
     References
         - [1] `Wikipedia entry for the F1-score
@@ -551,7 +551,7 @@ class FBeta(SklearnMetric):
         >>> y_true = torch.tensor([0, 1, 2, 2])
         >>> metric = FBeta(beta=0.25)
         >>> metric(y_pred, y_true)
-        tensor([0.7361])
+        tensor(0.7361)
 
     References:
         - [1] R. Baeza-Yates and B. Ribeiro-Neto (2011).
@@ -638,7 +638,7 @@ class Hamming(SklearnMetric):
         >>> y_true = torch.tensor([1, 1, 2, 3])
         >>> metric = Hamming()
         >>> metric(y_pred, y_true)
-        tensor([0.2500])
+        tensor(0.2500)
 
     """
 
@@ -685,7 +685,7 @@ class Hinge(SklearnMetric):
         >>> y_true = torch.tensor([1, 1, 0, 0])
         >>> metric = Hinge()
         >>> metric(pred_decision, y_true)
-        tensor([1.6300])
+        tensor(1.6300)
 
     """
 
@@ -733,7 +733,7 @@ class Jaccard(SklearnMetric):
         >>> y_true = torch.tensor([0, 1, 1])
         >>> metric = Jaccard()
         >>> metric(y_pred, y_true)
-        tensor([0.3333])
+        tensor(0.3333)
 
     """
 
@@ -817,7 +817,7 @@ class Precision(SklearnMetric):
         >>> y_true = torch.tensor([0, 1, 2, 2])
         >>> metric = Precision()
         >>> metric(y_pred, y_true)
-        tensor([0.7500])
+        tensor(0.7500)
 
     """
 
@@ -901,7 +901,7 @@ class Recall(SklearnMetric):
         >>> y_true = torch.tensor([0, 1, 2, 2])
         >>> metric = Recall()
         >>> metric(y_pred, y_true)
-        tensor([0.6250])
+        tensor(0.6250)
 
     """
 
@@ -1173,7 +1173,7 @@ class ExplainedVariance(SklearnMetric):
         >>> y_true = torch.tensor([3, -0.5, 2, 7])
         >>> metric = ExplainedVariance()
         >>> metric(y_pred, y_true)
-        tensor([0.9572])
+        tensor(0.9572)
     """
 
     def __init__(
@@ -1225,7 +1225,7 @@ class MeanAbsoluteError(SklearnMetric):
         >>> y_true = torch.tensor([3, -0.5, 2, 7])
         >>> metric = MeanAbsoluteError()
         >>> metric(y_pred, y_true)
-        tensor([0.5000])
+        tensor(0.5000)
 
     """
 
@@ -1273,10 +1273,10 @@ class MeanSquaredError(SklearnMetric):
         >>> y_true = torch.tensor([3, -0.5, 2, 7])
         >>> metric = MeanSquaredError()
         >>> metric(y_pred, y_true)
-        tensor([0.3750])
+        tensor(0.3750)
         >>> metric = MeanSquaredError(squared=True)
         >>> metric(y_pred, y_true)
-        tensor([0.6124])
+        tensor(0.6124)
 
     """
 
@@ -1335,7 +1335,7 @@ class MeanSquaredLogError(SklearnMetric):
         >>> y_true = torch.tensor([3, 5, 2.5, 7])
         >>> metric = MeanSquaredLogError()
         >>> metric(y_pred, y_true)
-        tensor([0.0397])
+        tensor(0.0397)
     """
 
     def __init__(
@@ -1387,7 +1387,7 @@ class MedianAbsoluteError(SklearnMetric):
         >>> y_true = torch.tensor([3, -0.5, 2, 7])
         >>> metric = MedianAbsoluteError()
         >>> metric(y_pred, y_true)
-        tensor([0.5000])
+        tensor(0.5000)
     """
 
     def __init__(
@@ -1433,7 +1433,7 @@ class R2Score(SklearnMetric):
         >>> y_true = torch.tensor([3, -0.5, 2, 7])
         >>> metric = R2Score()
         >>> metric(y_pred, y_true)
-        tensor([0.9486])
+        tensor(0.9486)
     """
 
     def __init__(
@@ -1485,7 +1485,7 @@ class MeanPoissonDeviance(SklearnMetric):
         >>> y_true = torch.tensor([0.5, 0.5, 2., 2.])
         >>> metric = MeanPoissonDeviance()
         >>> metric(y_pred, y_true)
-        tensor([0.9034])
+        tensor(0.9034)
     """
 
     def __init__(
@@ -1533,7 +1533,7 @@ class MeanGammaDeviance(SklearnMetric):
         >>> y_true = torch.tensor([2, 0.5, 1, 4])
         >>> metric = MeanGammaDeviance()
         >>> metric(y_pred, y_true)
-        tensor([1.0569])
+        tensor(1.0569)
     """
 
     def __init__(
@@ -1581,7 +1581,7 @@ class MeanTweedieDeviance(SklearnMetric):
         >>> y_true = torch.tensor([0.5, 0.5, 2., 2.])
         >>> metric = MeanTweedieDeviance()
         >>> metric(y_pred, y_true)
-        tensor([1.8125])
+        tensor(1.8125)
     """
 
     def __init__(
diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
index b022a39b80932..15c8d2e92d53f 100644
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -131,9 +131,9 @@ def change_and_check_device_dtype(device, dtype):
     assert metric.device == torch.device("cpu")
     assert metric(input1, input2).device == torch.device("cpu")
 
-    metric.type(torch.int8)
-    assert metric.dtype == torch.int8
-    assert metric(input1, input2).dtype == torch.int8
+    # metric.type(torch.int8)
+    # assert metric.dtype == torch.int8
+    # assert metric(input1, input2).dtype == torch.int8
 
     metric.float()
     assert metric.dtype == torch.float32

From 272bc23d03f41ccffe492f12c0b81e73a3907ed3 Mon Sep 17 00:00:00 2001
From: Justus Schock <justus.schock@rwth-aachen.de>
Date: Thu, 3 Sep 2020 16:33:39 +0200
Subject: [PATCH 10/22] fix tests due to aggregation

---
 pytorch_lightning/metrics/classification.py | 26 +++++++++++++++++++++
 pytorch_lightning/metrics/converters.py     |  7 ++++--
 pytorch_lightning/metrics/metric.py         | 21 +++++++++++++++--
 pytorch_lightning/metrics/sklearns.py       | 23 +++++++++++++++++-
 tests/metrics/test_metrics.py               |  5 ----
 tests/metrics/test_sklearn.py               |  8 ++++---
 6 files changed, 77 insertions(+), 13 deletions(-)

diff --git a/pytorch_lightning/metrics/classification.py b/pytorch_lightning/metrics/classification.py
index 29affddb140ce..618ded5e1b085 100644
--- a/pytorch_lightning/metrics/classification.py
+++ b/pytorch_lightning/metrics/classification.py
@@ -131,6 +131,14 @@ def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
         """
         return confusion_matrix(pred=pred, target=target, normalize=self.normalize)
 
+    def aggregate(self, *tensors: torch.Tensor) -> torch.Tensor:
+        """Aggregates results by stacking them instead of concatenating before averaging.
+
+        Returns:
+            the aggregated results
+        """
+        return torch.stack(tensors).mean(0)
+
 
 class PrecisionRecallCurve(TensorCollectionMetric):
     """
@@ -619,6 +627,15 @@ def forward(
         """
         return multiclass_roc(pred=pred, target=target, sample_weight=sample_weight, num_classes=self.num_classes)
 
+    def aggregate(self, *tensors: torch.Tensor) -> Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
+        """Aggregates results by stacking them instead of concatenating before averaging.
+
+        Returns:
+            the aggregated results
+        """
+
+        return tuple([tuple([torch.stack(tmps).mean(0) for tmps in zip(*_tensors)]) for _tensors in zip(*tensors)])
+
 
 class MulticlassPrecisionRecallCurve(TensorCollectionMetric):
     """Computes the multiclass PR Curve
@@ -679,6 +696,15 @@ def forward(
             pred=pred, target=target, sample_weight=sample_weight, num_classes=self.num_classes
         )
 
+    def aggregate(self, *tensors: torch.Tensor) -> Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
+        """Aggregates results by stacking them instead of concatenating before averaging.
+
+        Returns:
+            the aggregated results
+        """
+
+        return tuple([tuple([torch.stack(tmps).mean(0) for tmps in zip(*_tensors)]) for _tensors in zip(*tensors)])
+
 
 class DiceCoefficient(TensorMetric):
     """
diff --git a/pytorch_lightning/metrics/converters.py b/pytorch_lightning/metrics/converters.py
index d9bb0e5e5e128..fa6f2962de49e 100644
--- a/pytorch_lightning/metrics/converters.py
+++ b/pytorch_lightning/metrics/converters.py
@@ -292,7 +292,8 @@ def sync_ddp_if_available(
 
     return result
 
-def at_least_1d(tensor: Union[np.ndarray, torch.Tensor]) ->  Union[np.ndarray, torch.Tensor]:
+
+def at_least_1d(tensor: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
     """Makes sure the tensor is at least of 1d shape
 
     Args:
@@ -302,7 +303,9 @@ def at_least_1d(tensor: Union[np.ndarray, torch.Tensor]) ->  Union[np.ndarray, t
         the optionally reshaped tensor
     """
     if tensor.shape == ():
-        tensor = tensor.reshape(1,)
+        tensor = tensor.reshape(
+            1,
+        )
     return tensor
 
 
diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py
index c0bc30c42472b..3fffb3d0f4b4d 100644
--- a/pytorch_lightning/metrics/metric.py
+++ b/pytorch_lightning/metrics/metric.py
@@ -13,8 +13,7 @@
 # limitations under the License.
 
 from abc import ABC, abstractmethod
-from ast import Num
-from typing import Any, Optional
+from typing import Any, Dict, Mapping, Optional, Sequence, Union
 import numbers
 
 import torch
@@ -242,6 +241,24 @@ def output_convert(self, data: Any, output: Any):
         )
         return super(TensorCollectionMetric, self).output_convert(self, data, output)
 
+    def aggregate(self, *tensors: torch.Tensor) -> Union[torch.Tensor, Dict[str, torch.Tensor], Sequence[torch.Tensor]]:
+        """Properly aggregate sequences of tensors and dicts of tensors
+
+        Raises:
+            TypeError: Unknown type
+
+        Returns:
+            the aggregated results
+        """
+        if isinstance(tensors[0], Mapping):
+            return {k: torch.stack([tensor[k] for tensor in tensors]).mean(0) for k in tensors[0].keys()}
+        elif isinstance(tensors[0], Sequence) and not isinstance(tensors[0], torch.Tensor):
+            return tuple([torch.stack(tmp).mean(0) for tmp in zip(*tensors)])
+        elif isinstance(tensors[0], torch.Tensor):
+            return torch.stack(tensors).mean(0)
+        else:
+            raise TypeError
+
 
 class NumpyMetric(Metric):
     """
diff --git a/pytorch_lightning/metrics/sklearns.py b/pytorch_lightning/metrics/sklearns.py
index ba7e3db1be5d5..d6d79eb0325fa 100644
--- a/pytorch_lightning/metrics/sklearns.py
+++ b/pytorch_lightning/metrics/sklearns.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, List, Optional, Sequence, Union
+from typing import Any, List, Optional, Sequence, Tuple, Union
 
 import numpy as np
 import torch
@@ -391,6 +391,9 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray) -> np.ndarray:
         """
         return super().forward(y_pred=y_pred, y_true=y_true)
 
+    def aggregate(self, *tensors: torch.Tensor) -> torch.Tensor:
+        return torch.stack(tensors).mean(0)
+
 
 class DCG(SklearnMetric):
     """Compute discounted cumulative gain
@@ -1031,6 +1034,15 @@ def forward(
         # Will be fixed in native implementation
         return np.array(super().forward(probas_pred=probas_pred, y_true=y_true, sample_weight=sample_weight)[:2])
 
+    def aggregate(self, *tensors: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Aggregates results by stacking them instead of concatenating before averaging.
+
+        Returns:
+            the aggregated results
+        """
+        print(tensors)
+        return tuple([torch.stack(tmp).mean(0) for tmp in zip(*tensors)])
+
 
 class ROC(SklearnMetric):
     """
@@ -1102,6 +1114,15 @@ class or confidence values.
         """
         return np.array(super().forward(y_score=y_score, y_true=y_true, sample_weight=sample_weight)[:2])
 
+    def aggregate(self, *tensors: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Aggregates results by stacking them instead of concatenating before averaging.
+
+        Returns:
+            the aggregated results
+        """
+
+        return tuple([torch.stack(tmp).mean(0) for tmp in zip(*tensors)])
+
 
 class AUROC(SklearnMetric):
     """
diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
index 15c8d2e92d53f..e6ebad394785d 100644
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -1,6 +1,5 @@
 import os
 from typing import Any
-import warnings
 import numpy as np
 import pytest
 import torch
@@ -131,10 +130,6 @@ def change_and_check_device_dtype(device, dtype):
     assert metric.device == torch.device("cpu")
     assert metric(input1, input2).device == torch.device("cpu")
 
-    # metric.type(torch.int8)
-    # assert metric.dtype == torch.int8
-    # assert metric(input1, input2).dtype == torch.int8
-
     metric.float()
     assert metric.dtype == torch.float32
     assert metric(input1, input2).dtype == torch.float32
diff --git a/tests/metrics/test_sklearn.py b/tests/metrics/test_sklearn.py
index 10b57417411c4..fef7df686f56b 100644
--- a/tests/metrics/test_sklearn.py
+++ b/tests/metrics/test_sklearn.py
@@ -167,13 +167,15 @@ def test_sklearn_metric(metric_class, sklearn_func, inputs):
 
     sklearn_result = sklearn_func(**numpy_inputs)
     lightning_result = metric_class(**inputs)
-    assert np.allclose(sklearn_result, lightning_result, atol=1e-5)
+
+    
+    # assert np.allclose(sklearn_result, lightning_result, atol=1e-5)
 
     sklearn_result = apply_to_collection(
         sklearn_result, (torch.Tensor, np.ndarray, numbers.Number), convert_to_numpy)
 
-    lightning_result = apply_to_collection(
-        lightning_result, (torch.Tensor, np.ndarray, numbers.Number), convert_to_numpy)
+    lightning_result = np.array(apply_to_collection(
+        lightning_result, (torch.Tensor, np.ndarray, numbers.Number), convert_to_numpy))
 
     assert np.allclose(sklearn_result, lightning_result, atol=1e-5)
     assert isinstance(lightning_result, type(sklearn_result))

From 6cc0652eba36d4532e19f96b98cad32ae6553e40 Mon Sep 17 00:00:00 2001
From: Justus Schock <justus.schock@rwth-aachen.de>
Date: Thu, 3 Sep 2020 16:38:25 +0200
Subject: [PATCH 11/22] fix faulty import

---
 pytorch_lightning/metrics/converters.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pytorch_lightning/metrics/converters.py b/pytorch_lightning/metrics/converters.py
index fa6f2962de49e..b9341ac184dfa 100644
--- a/pytorch_lightning/metrics/converters.py
+++ b/pytorch_lightning/metrics/converters.py
@@ -24,7 +24,6 @@
 
 import numpy as np
 import torch
-from torch.distributed.distributed_c10d import reduce_op
 from torch.utils.data._utils.collate import np_str_obj_array_pattern
 
 from pytorch_lightning.utilities import rank_zero_warn

From 1d39f32f3e980645810156b55cfcee409d829626 Mon Sep 17 00:00:00 2001
From: Justus Schock <12886177+justusschock@users.noreply.github.com>
Date: Thu, 3 Sep 2020 21:59:56 +0200
Subject: [PATCH 12/22] Apply suggestions from code review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
---
 pytorch_lightning/metrics/sklearns.py | 1 -
 tests/metrics/test_sklearn.py         | 3 ---
 2 files changed, 4 deletions(-)

diff --git a/pytorch_lightning/metrics/sklearns.py b/pytorch_lightning/metrics/sklearns.py
index d6d79eb0325fa..b790213c378c8 100644
--- a/pytorch_lightning/metrics/sklearns.py
+++ b/pytorch_lightning/metrics/sklearns.py
@@ -1040,7 +1040,6 @@ def aggregate(self, *tensors: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor,
         Returns:
             the aggregated results
         """
-        print(tensors)
         return tuple([torch.stack(tmp).mean(0) for tmp in zip(*tensors)])
 
 
diff --git a/tests/metrics/test_sklearn.py b/tests/metrics/test_sklearn.py
index fef7df686f56b..019048056016c 100644
--- a/tests/metrics/test_sklearn.py
+++ b/tests/metrics/test_sklearn.py
@@ -168,9 +168,6 @@ def test_sklearn_metric(metric_class, sklearn_func, inputs):
     sklearn_result = sklearn_func(**numpy_inputs)
     lightning_result = metric_class(**inputs)
 
-    
-    # assert np.allclose(sklearn_result, lightning_result, atol=1e-5)
-
     sklearn_result = apply_to_collection(
         sklearn_result, (torch.Tensor, np.ndarray, numbers.Number), convert_to_numpy)
 

From 0a87adadc1669646d2239fdae49eecf8011c31b2 Mon Sep 17 00:00:00 2001
From: Justus Schock <justus.schock@rwth-aachen.de>
Date: Fri, 4 Sep 2020 11:04:47 +0200
Subject: [PATCH 13/22] remove reduce_op docstrings

---
 pytorch_lightning/metrics/classification.py | 13 -----
 pytorch_lightning/metrics/sklearns.py       | 54 ---------------------
 2 files changed, 67 deletions(-)

diff --git a/pytorch_lightning/metrics/classification.py b/pytorch_lightning/metrics/classification.py
index 618ded5e1b085..46877356f2738 100644
--- a/pytorch_lightning/metrics/classification.py
+++ b/pytorch_lightning/metrics/classification.py
@@ -64,7 +64,6 @@ def __init__(
                 - none: pass array
                 - sum: add elements
             reduce_group: the process group to reduce metric results from DDP
-            reduce_op: the operation to perform for ddp reduction
         """
         super().__init__(name="accuracy", reduce_group=reduce_group)
         self.num_classes = num_classes
@@ -110,7 +109,6 @@ def __init__(
         Args:
             normalize: whether to compute a normalized confusion matrix
             reduce_group: the process group to reduce metric results from DDP
-            reduce_op: the operation to perform for ddp reduction
         """
         super().__init__(
             name="confusion_matrix",
@@ -168,7 +166,6 @@ def __init__(
         Args:
             pos_label: positive label indicator
             reduce_group: the process group to reduce metric results from DDP
-            reduce_op: the operation to perform for ddp reduction
         """
         super().__init__(
             name="precision_recall_curve",
@@ -228,7 +225,6 @@ def __init__(
                 - none: pass array
                 - sum: add elements
             reduce_group: the process group to reduce metric results from DDP
-            reduce_op: the operation to perform for ddp reduction
         """
         super().__init__(
             name="precision",
@@ -280,7 +276,6 @@ def __init__(
                 - none: pass array
                 - sum: add elements
             reduce_group: the process group to reduce metric results from DDP
-            reduce_op: the operation to perform for ddp reduction
         """
         super().__init__(
             name="recall",
@@ -327,7 +322,6 @@ def __init__(
         Args:
             pos_label: positive label indicator
             reduce_group: the process group to reduce metric results from DDP
-            reduce_op: the operation to perform for ddp reduction
         """
         super().__init__(
             name="AP",
@@ -376,7 +370,6 @@ def __init__(
         Args:
             pos_label: positive label indicator
             reduce_group: the process group to reduce metric results from DDP
-            reduce_op: the operation to perform for ddp reduction
         """
         super().__init__(
             name="auroc",
@@ -433,7 +426,6 @@ def __init__(
                 - none: pass array
                 - sum: add elements
             reduce_group: the process group to reduce metric results from DDP
-            reduce_op: the operation to perform for DDP reduction
         """
         super().__init__(
             name="fbeta",
@@ -489,7 +481,6 @@ def __init__(
                 - none: pass array
                 - sum: add elements
             reduce_group: the process group to reduce metric results from DDP
-            reduce_op: the operation to perform for ddp reduction
         """
         super().__init__(
             name="f1",
@@ -541,7 +532,6 @@ def __init__(
         Args:
             pos_label: positive label indicator
             reduce_group: the process group to reduce metric results from DDP
-            reduce_op: the operation to perform for ddp reduction
         """
         super().__init__(
             name="roc",
@@ -598,7 +588,6 @@ def __init__(
         Args:
             num_classes: number of classes
             reduce_group: the process group to reduce metric results from DDP
-            reduce_op: the operation to perform for ddp reduction
         """
         super().__init__(
             name="multiclass_roc",
@@ -664,7 +653,6 @@ def __init__(
         Args:
             num_classes: number of classes
             reduce_group: the process group to reduce metric results from DDP
-            reduce_op: the operation to perform for ddp reduction
 
         """
         super().__init__(
@@ -741,7 +729,6 @@ def __init__(
                 - none: pass array
                 - sum: add elements
             reduce_group: the process group to reduce metric results from DDP
-            reduce_op: the operation to perform for ddp reduction
         """
         super().__init__(
             name="dice",
diff --git a/pytorch_lightning/metrics/sklearns.py b/pytorch_lightning/metrics/sklearns.py
index b790213c378c8..ea43c03135cc0 100644
--- a/pytorch_lightning/metrics/sklearns.py
+++ b/pytorch_lightning/metrics/sklearns.py
@@ -53,8 +53,6 @@ def __init__(
             metric_name: the metric name to import and compute from scikit-learn.metrics
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
             **kwargs: additonal keyword arguments (will be forwarded to metric call)
         """
         super().__init__(
@@ -117,8 +115,6 @@ def __init__(
                 Otherwise, return the fraction of correctly classified samples.
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__(metric_name="accuracy_score", reduce_group=reduce_group, normalize=normalize)
 
@@ -166,8 +162,6 @@ def __init__(
         Args:
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
 
         super().__init__(metric_name="auc", reduce_group=reduce_group)
@@ -213,8 +207,6 @@ def __init__(
 
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("average_precision_score", reduce_group=reduce_group, average=average)
 
@@ -264,8 +256,6 @@ def __init__(
                 corresponds to 0 and perfect performance corresponds to 1
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("balanced_accuracy_score", reduce_group=reduce_group, adjusted=adjusted)
 
@@ -319,8 +309,6 @@ def __init__(
                 and ``quadratic`` means quadratic weighted
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("cohen_kappa_score", reduce_group=reduce_group, labels=labels, weights=weights)
 
@@ -374,8 +362,6 @@ def __init__(
                 in ``y_true`` or ``y_pred`` are used in sorted order.
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("confusion_matrix", reduce_group=reduce_group, labels=labels)
 
@@ -424,8 +410,6 @@ def __init__(
             ignore_ties: If ``True``, assume there are no ties in y_score for efficiency gains
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("dcg_score", reduce_group=reduce_group, k=k, log_base=log_base, ignore_ties=ignore_ties)
 
@@ -516,8 +500,6 @@ def __init__(
                 behavior is deprecated and will change in version 0.18.
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("f1_score", reduce_group=reduce_group, labels=labels, pos_label=pos_label, average=average)
 
@@ -604,8 +586,6 @@ def __init__(
                 behavior is deprecated and will change in version 0.18.
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__(
             "fbeta_score", reduce_group=reduce_group, beta=beta, labels=labels, pos_label=pos_label, average=average
@@ -653,8 +633,6 @@ def __init__(
         Args:
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
 
         """
         super().__init__("hamming_loss", reduce_group=reduce_group)
@@ -702,8 +680,6 @@ def __init__(
             labels: Integer array of labels.
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("hinge_loss", reduce_group=reduce_group, labels=labels)
 
@@ -779,8 +755,6 @@ def __init__(
                 behavior is deprecated and will change in version 0.18.
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__(
             "jaccard_score", reduce_group=reduce_group, labels=labels, pos_label=pos_label, average=average
@@ -863,8 +837,6 @@ def __init__(
                 behavior is deprecated and will change in version 0.18.
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__(
             "precision_score", reduce_group=reduce_group, labels=labels, pos_label=pos_label, average=average
@@ -947,8 +919,6 @@ def __init__(
                 behavior is deprecated and will change in version 0.18.
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("recall_score", reduce_group=reduce_group, labels=labels, pos_label=pos_label, average=average)
 
@@ -1001,8 +971,6 @@ def __init__(
             pos_label: The class to report if ``average='binary'``.
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("precision_recall_curve", reduce_group=reduce_group, pos_label=pos_label)
 
@@ -1080,8 +1048,6 @@ def __init__(
             pos_labels: The class to report if ``average='binary'``.
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("roc_curve", reduce_group=reduce_group, pos_label=pos_label)
 
@@ -1156,8 +1122,6 @@ def __init__(
 
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("roc_auc_score", reduce_group=reduce_group, average=average)
 
@@ -1208,8 +1172,6 @@ def __init__(
                 output values should be aggregated.
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("explained_variance_score", reduce_group=reduce_group, multioutput=multioutput)
 
@@ -1261,8 +1223,6 @@ def __init__(
                 output values should be aggregated.
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("mean_absolute_error", reduce_group=reduce_group, multioutput=multioutput)
 
@@ -1314,8 +1274,6 @@ def __init__(
             squared: if ``True`` returns the mse value else the rmse value
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("mean_squared_error", reduce_group=reduce_group, multioutput=multioutput)
         self.squared = squared
@@ -1370,8 +1328,6 @@ def __init__(
                 output values should be aggregated.
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("mean_squared_log_error", reduce_group=reduce_group, multioutput=multioutput)
 
@@ -1422,8 +1378,6 @@ def __init__(
                 output values should be aggregated.
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("median_absolute_error", reduce_group=reduce_group, multioutput=multioutput)
 
@@ -1468,8 +1422,6 @@ def __init__(
                 output values should be aggregated.
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("r2_score", reduce_group=reduce_group, multioutput=multioutput)
 
@@ -1516,8 +1468,6 @@ def __init__(
         Args:
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("mean_poisson_deviance", reduce_group=reduce_group)
 
@@ -1564,8 +1514,6 @@ def __init__(
         Args:
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("mean_gamma_deviance", reduce_group=reduce_group)
 
@@ -1624,8 +1572,6 @@ def __init__(
 
             reduce_group: the process group for DDP reduces (only needed for DDP training).
                 Defaults to all processes (world)
-            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
-                Defaults to sum.
         """
         super().__init__("mean_tweedie_deviance", reduce_group=reduce_group, power=power)
 

From d0d64894d7dc6eb93e8d97788d2d6836fa75948c Mon Sep 17 00:00:00 2001
From: Justus Schock <justus.schock@rwth-aachen.de>
Date: Fri, 4 Sep 2020 13:15:41 +0200
Subject: [PATCH 14/22] add compute

---
 tests/metrics/test_metrics.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
index e6ebad394785d..6a1f19e1caef2 100644
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -232,6 +232,11 @@ def aggregated(self) -> torch.Tensor:
             self.call_history.append("aggregated")
             return super().aggregated
 
+        @staticmethod
+        def compute(self, data: Any, output: Any):
+            self.call_history.append("compute")
+            return super(DummyMetric, self).compute(self, data, output)
+
     metric = DummyMetric()
     assert metric.call_history == ["init"]
     result = metric(torch.tensor([2.0]), torch.tensor([1.0]))

From d60abb54e3df4d48f03b3fb8968f54cbcf8eeaa8 Mon Sep 17 00:00:00 2001
From: Justus Schock <justus.schock@rwth-aachen.de>
Date: Fri, 4 Sep 2020 13:16:15 +0200
Subject: [PATCH 15/22] remove import

---
 pytorch_lightning/metrics/converters.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/pytorch_lightning/metrics/converters.py b/pytorch_lightning/metrics/converters.py
index b9341ac184dfa..0f4b0c44d53fb 100644
--- a/pytorch_lightning/metrics/converters.py
+++ b/pytorch_lightning/metrics/converters.py
@@ -38,15 +38,6 @@ class ReduceOp:
 
     rank_zero_warn("Unsupported `ReduceOp` for distributed computing")
 
-try:
-    import torch_xla
-    import torch_xla.core.xla_model as xm
-    import torch_xla.core.functions as xf
-except ImportError:
-    XLA_AVAILABLE = False
-else:
-    XLA_AVAILABLE = True
-
 
 def _apply_to_inputs(func_to_apply: Callable, *dec_args, **dec_kwargs) -> Callable:
     """

From 6b0fe1d192d1efa80127d435a9282af3eb524b1c Mon Sep 17 00:00:00 2001
From: Justus Schock <justus.schock@rwth-aachen.de>
Date: Fri, 4 Sep 2020 13:29:19 +0200
Subject: [PATCH 16/22] remove collection metric

---
 pytorch_lightning/metrics/metric.py | 63 +++++++++--------------------
 1 file changed, 20 insertions(+), 43 deletions(-)

diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py
index 3fffb3d0f4b4d..df877e2d40e4a 100644
--- a/pytorch_lightning/metrics/metric.py
+++ b/pytorch_lightning/metrics/metric.py
@@ -206,58 +206,35 @@ def input_convert(self, data: Any):
 
     @staticmethod
     def output_convert(self, data: Any, output: Any):
-        output = apply_to_collection(output, torch.Tensor, convert_to_tensor, self.dtype, self.device)
-        return super(TensorMetric, self).output_convert(self, data, output)
-
-
-class TensorCollectionMetric(Metric):
-    """
-    Base class for metric implementation operating directly on tensors.
-    All inputs will be casted to tensors if necessary. Outputs won't be casted.
-    Already handles DDP sync and input conversions.
-
-    This class differs from :class:`TensorMetric`, as it assumes all outputs to
-    be collections of tensors and does not explicitly convert them. This is
-    necessary, since some collections (like for ROC, Precision-Recall Curve etc.)
-    cannot be converted to tensors at the highest level.
-    All numpy arrays and numbers occuring in these outputs will still be converted.
-
-    Use this class as a baseclass, whenever you want to ensure inputs are
-    tensors and outputs cannot be converted to tensors automatically
-
-    """
-
-    @staticmethod
-    def input_convert(self, data: Any):
-        data = apply_to_collection(
-            data, (torch.Tensor, np.ndarray, numbers.Number), convert_to_tensor, self.dtype, self.device
-        )
-        return super(TensorCollectionMetric, self).input_convert(self, data)
 
-    @staticmethod
-    def output_convert(self, data: Any, output: Any):
         output = apply_to_collection(
             output, (torch.Tensor, np.ndarray, numbers.Number), convert_to_tensor, self.dtype, self.device
         )
-        return super(TensorCollectionMetric, self).output_convert(self, data, output)
+        return super(TensorMetric, self).output_convert(self, data, output)
 
-    def aggregate(self, *tensors: torch.Tensor) -> Union[torch.Tensor, Dict[str, torch.Tensor], Sequence[torch.Tensor]]:
-        """Properly aggregate sequences of tensors and dicts of tensors
+    def aggregate(self, *tensors: torch.Tensor) -> torch.Tensor:
+        """
+        Implement aggregation of values on the same device
 
-        Raises:
-            TypeError: Unknown type
+        Args:
+            tensors: the values to be aggregated
 
         Returns:
-            the aggregated results
+            aggregated values
+
         """
-        if isinstance(tensors[0], Mapping):
-            return {k: torch.stack([tensor[k] for tensor in tensors]).mean(0) for k in tensors[0].keys()}
-        elif isinstance(tensors[0], Sequence) and not isinstance(tensors[0], torch.Tensor):
-            return tuple([torch.stack(tmp).mean(0) for tmp in zip(*tensors)])
-        elif isinstance(tensors[0], torch.Tensor):
-            return torch.stack(tensors).mean(0)
-        else:
-            raise TypeError
+
+        try:
+            return super().aggregate(*tensors)
+        except (ValueError, TypeError):
+            if isinstance(tensors[0], Mapping):
+                return {k: torch.stack([tensor[k] for tensor in tensors]).mean(0) for k in tensors[0].keys()}
+            elif isinstance(tensors[0], Sequence) and not isinstance(tensors[0], torch.Tensor):
+                return tuple([torch.stack(tmp).mean(0) for tmp in zip(*tensors)])
+            elif isinstance(tensors[0], torch.Tensor):
+                return torch.stack(tensors).mean(0)
+            else:
+                raise TypeError
 
 
 class NumpyMetric(Metric):

From 91986432c776391e967424312043d1725d3321e0 Mon Sep 17 00:00:00 2001
From: Justus Schock <justus.schock@rwth-aachen.de>
Date: Fri, 4 Sep 2020 13:29:26 +0200
Subject: [PATCH 17/22] update base class

---
 pytorch_lightning/metrics/classification.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/metrics/classification.py b/pytorch_lightning/metrics/classification.py
index 46877356f2738..f3461b3200125 100644
--- a/pytorch_lightning/metrics/classification.py
+++ b/pytorch_lightning/metrics/classification.py
@@ -32,7 +32,7 @@
     recall,
     roc,
 )
-from pytorch_lightning.metrics.metric import TensorCollectionMetric, TensorMetric
+from pytorch_lightning.metrics.metric import TensorMetric
 
 
 class Accuracy(TensorMetric):
@@ -138,7 +138,7 @@ def aggregate(self, *tensors: torch.Tensor) -> torch.Tensor:
         return torch.stack(tensors).mean(0)
 
 
-class PrecisionRecallCurve(TensorCollectionMetric):
+class PrecisionRecallCurve(TensorMetric):
     """
     Computes the precision recall curve
 
@@ -504,7 +504,7 @@ def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
         return f1_score(pred=pred, target=target, num_classes=self.num_classes, reduction=self.reduction)
 
 
-class ROC(TensorCollectionMetric):
+class ROC(TensorMetric):
     """
     Computes the Receiver Operator Characteristic (ROC)
 
@@ -559,7 +559,7 @@ def forward(
         return roc(pred=pred, target=target, sample_weight=sample_weight, pos_label=self.pos_label)
 
 
-class MulticlassROC(TensorCollectionMetric):
+class MulticlassROC(TensorMetric):
     """
     Computes the multiclass ROC
 
@@ -626,7 +626,7 @@ def aggregate(self, *tensors: torch.Tensor) -> Tuple[Tuple[torch.Tensor, torch.T
         return tuple([tuple([torch.stack(tmps).mean(0) for tmps in zip(*_tensors)]) for _tensors in zip(*tensors)])
 
 
-class MulticlassPrecisionRecallCurve(TensorCollectionMetric):
+class MulticlassPrecisionRecallCurve(TensorMetric):
     """Computes the multiclass PR Curve
 
     Example:

From 3d2b945f4382ce759e141c26a3354cda63c41503 Mon Sep 17 00:00:00 2001
From: Justus Schock <justus.schock@rwth-aachen.de>
Date: Fri, 4 Sep 2020 13:29:35 +0200
Subject: [PATCH 18/22] update tests

---
 tests/metrics/test_metrics.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
index 6a1f19e1caef2..5a8589f8f254a 100644
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -6,7 +6,7 @@
 
 import tests.base.develop_utils as tutils
 from tests.base import EvalModelTemplate
-from pytorch_lightning.metrics.metric import Metric, TensorMetric, NumpyMetric, TensorCollectionMetric
+from pytorch_lightning.metrics.metric import Metric, TensorMetric, NumpyMetric
 from pytorch_lightning import Trainer
 
 
@@ -30,7 +30,7 @@ def forward(self, input1, input2):
         return 1.0
 
 
-class DummyTensorCollectionMetric(TensorCollectionMetric):
+class DummyTensorCollectionMetric(TensorMetric):
     def __init__(self):
         super().__init__("dummy")
 

From 20af5f4c5c8d733d6d7648ebc39b67f06996781c Mon Sep 17 00:00:00 2001
From: Justus Schock <12886177+justusschock@users.noreply.github.com>
Date: Thu, 10 Sep 2020 00:10:51 +0200
Subject: [PATCH 19/22] Update metric.py

---
 pytorch_lightning/metrics/metric.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py
index df877e2d40e4a..677a85f61b027 100644
--- a/pytorch_lightning/metrics/metric.py
+++ b/pytorch_lightning/metrics/metric.py
@@ -257,3 +257,27 @@ def output_convert(self, data: Any, output: Any):
         )
 
         return super(NumpyMetric, self).output_convert(self, data, output)
+    
+    def aggregate(self, *tensors: torch.Tensor) -> torch.Tensor:
+        """
+        Implement aggregation of values on the same device
+
+        Args:
+            tensors: the values to be aggregated
+
+        Returns:
+            aggregated values
+
+        """
+
+        try:
+            return super().aggregate(*tensors)
+        except (ValueError, TypeError):
+            if isinstance(tensors[0], Mapping):
+                return {k: torch.stack([tensor[k] for tensor in tensors]).mean(0) for k in tensors[0].keys()}
+            elif isinstance(tensors[0], Sequence) and not isinstance(tensors[0], torch.Tensor):
+                return tuple([torch.stack(tmp).mean(0) for tmp in zip(*tensors)])
+            elif isinstance(tensors[0], torch.Tensor):
+                return torch.stack(tensors).mean(0)
+            else:
+                raise TypeError

From c413c73153ef2efb30b3b7927ff0de7f0faf326e Mon Sep 17 00:00:00 2001
From: Justus Schock <12886177+justusschock@users.noreply.github.com>
Date: Thu, 10 Sep 2020 09:54:50 +0200
Subject: [PATCH 20/22] Update metric.py

---
 pytorch_lightning/metrics/metric.py | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py
index 677a85f61b027..df877e2d40e4a 100644
--- a/pytorch_lightning/metrics/metric.py
+++ b/pytorch_lightning/metrics/metric.py
@@ -257,27 +257,3 @@ def output_convert(self, data: Any, output: Any):
         )
 
         return super(NumpyMetric, self).output_convert(self, data, output)
-    
-    def aggregate(self, *tensors: torch.Tensor) -> torch.Tensor:
-        """
-        Implement aggregation of values on the same device
-
-        Args:
-            tensors: the values to be aggregated
-
-        Returns:
-            aggregated values
-
-        """
-
-        try:
-            return super().aggregate(*tensors)
-        except (ValueError, TypeError):
-            if isinstance(tensors[0], Mapping):
-                return {k: torch.stack([tensor[k] for tensor in tensors]).mean(0) for k in tensors[0].keys()}
-            elif isinstance(tensors[0], Sequence) and not isinstance(tensors[0], torch.Tensor):
-                return tuple([torch.stack(tmp).mean(0) for tmp in zip(*tensors)])
-            elif isinstance(tensors[0], torch.Tensor):
-                return torch.stack(tensors).mean(0)
-            else:
-                raise TypeError

From 22fd6d37aa16b882eae3113bfec887e49e8aa29a Mon Sep 17 00:00:00 2001
From: Jirka Borovec <Borda@users.noreply.github.com>
Date: Thu, 10 Sep 2020 21:47:02 +0200
Subject: [PATCH 21/22] Apply suggestions from code review

---
 pytorch_lightning/metrics/converters.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pytorch_lightning/metrics/converters.py b/pytorch_lightning/metrics/converters.py
index 0f4b0c44d53fb..63d2ddfef9376 100644
--- a/pytorch_lightning/metrics/converters.py
+++ b/pytorch_lightning/metrics/converters.py
@@ -293,9 +293,7 @@ def at_least_1d(tensor: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, to
         the optionally reshaped tensor
     """
     if tensor.shape == ():
-        tensor = tensor.reshape(
-            1,
-        )
+        tensor = tensor.reshape(1, )
     return tensor
 
 

From 021bd0690f0b0761dfa3f73cdf5650b528d84ebc Mon Sep 17 00:00:00 2001
From: Nicki Skafte <skaftenicki@gmail.com>
Date: Sun, 13 Sep 2020 17:01:39 +0200
Subject: [PATCH 22/22] change default aggregate

---
 pytorch_lightning/metrics/metric.py | 38 +++++++++--------------------
 1 file changed, 12 insertions(+), 26 deletions(-)

diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py
index df877e2d40e4a..45c50b084956f 100644
--- a/pytorch_lightning/metrics/metric.py
+++ b/pytorch_lightning/metrics/metric.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from abc import ABC, abstractmethod
-from typing import Any, Dict, Mapping, Optional, Sequence, Union
+from typing import Any, Mapping, Optional, Sequence
 import numbers
 
 import torch
@@ -163,7 +163,17 @@ def aggregate(self, *tensors: torch.Tensor) -> torch.Tensor:
             aggregated values
 
         """
-        return torch.cat(tensors).mean(0)
+        try:
+            return torch.cat(tensors).mean(0)
+        except (ValueError, TypeError, RuntimeError):  # torch.cat raises RuntimeError on 0-dim tensors
+            if isinstance(tensors[0], Mapping):
+                return {k: torch.stack([tensor[k] for tensor in tensors]).mean(0) for k in tensors[0].keys()}
+            elif isinstance(tensors[0], Sequence) and not isinstance(tensors[0], torch.Tensor):
+                return tuple([torch.stack(tmp).mean(0) for tmp in zip(*tensors)])
+            elif isinstance(tensors[0], torch.Tensor):
+                return torch.stack(tensors).mean(0)
+            else:
+                raise TypeError("unknown metric value format to aggregate")
 
     @staticmethod
     def compute(self, data: Any, output: Any):
@@ -212,30 +222,6 @@ def output_convert(self, data: Any, output: Any):
         )
         return super(TensorMetric, self).output_convert(self, data, output)
 
-    def aggregate(self, *tensors: torch.Tensor) -> torch.Tensor:
-        """
-        Implement aggregation of values on the same device
-
-        Args:
-            tensors: the values to be aggregated
-
-        Returns:
-            aggregated values
-
-        """
-
-        try:
-            return super().aggregate(*tensors)
-        except (ValueError, TypeError):
-            if isinstance(tensors[0], Mapping):
-                return {k: torch.stack([tensor[k] for tensor in tensors]).mean(0) for k in tensors[0].keys()}
-            elif isinstance(tensors[0], Sequence) and not isinstance(tensors[0], torch.Tensor):
-                return tuple([torch.stack(tmp).mean(0) for tmp in zip(*tensors)])
-            elif isinstance(tensors[0], torch.Tensor):
-                return torch.stack(tensors).mean(0)
-            else:
-                raise TypeError
-
 
 class NumpyMetric(Metric):
     """