From 9f7daa1041afedcd5b8345cc47dd3f9554d912bb Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Fri, 7 Aug 2020 16:52:59 -0400 Subject: [PATCH 01/32] update accuracy to accumulate _num_correct in a tensor on the right device --- ignite/metrics/accuracy.py | 6 +++--- tests/ignite/metrics/test_accuracy.py | 19 +++++++++++++++++++ tests/ignite/metrics/test_running_average.py | 2 +- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/ignite/metrics/accuracy.py b/ignite/metrics/accuracy.py index 8ac5a25f083f..c41053ef5b99 100644 --- a/ignite/metrics/accuracy.py +++ b/ignite/metrics/accuracy.py @@ -138,7 +138,7 @@ def __init__( @reinit__is_reduced def reset(self) -> None: - self._num_correct = 0 + self._num_correct = torch.tensor(0, device=self._device) self._num_examples = 0 super(Accuracy, self).reset() @@ -161,11 +161,11 @@ def update(self, output: Sequence[torch.Tensor]) -> None: y = torch.transpose(y, 1, last_dim - 1).reshape(-1, num_classes) correct = torch.all(y == y_pred.type_as(y), dim=-1) - self._num_correct += torch.sum(correct).item() + self._num_correct += torch.sum(correct).to(self._device) self._num_examples += correct.shape[0] @sync_all_reduce("_num_examples", "_num_correct") def compute(self) -> torch.Tensor: if self._num_examples == 0: raise NotComputableError("Accuracy must have at least one example before it can be computed.") - return self._num_correct / self._num_examples + return self._num_correct.item() / self._num_examples diff --git a/tests/ignite/metrics/test_accuracy.py b/tests/ignite/metrics/test_accuracy.py index 3ca32257d7bf..bd1a8901c848 100644 --- a/tests/ignite/metrics/test_accuracy.py +++ b/tests/ignite/metrics/test_accuracy.py @@ -778,6 +778,18 @@ def update(engine, i): _test(n_epochs=2) +def _test_distrib_accumulator_device(device): + device = torch.device(device) + acc = Accuracy(device=device) + assert acc._device == device + + y_pred = torch.randint(0, 2, size=(10,)).long() + y = torch.randint(0, 2, size=(10,)).long() + acc.update((y_pred, y)) + + assert acc._num_correct.device == device + + @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") @@ -786,6 +798,7 @@ def test_distrib_gpu(distributed_context_single_node_nccl): _test_distrib_multilabel_input_NHW(device) _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) @pytest.mark.distributed @@ -796,6 +809,7 @@ def test_distrib_cpu(distributed_context_single_node_gloo): _test_distrib_multilabel_input_NHW(device) _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) @pytest.mark.distributed @@ -809,6 +823,7 @@ def test_distrib_hvd(gloo_hvd_executor): gloo_hvd_executor(_test_distrib_multilabel_input_NHW, (device,), np=nproc, do_init=True) gloo_hvd_executor(_test_distrib_integration_multiclass, (device,), np=nproc, do_init=True) gloo_hvd_executor(_test_distrib_integration_multilabel, (device,), np=nproc, do_init=True) + gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True) @pytest.mark.multinode_distributed @@ -819,6 +834,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): _test_distrib_multilabel_input_NHW(device) _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) + 
_test_distrib_accumulator_device(device) @pytest.mark.multinode_distributed @@ -829,6 +845,7 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): device = "cuda:{}".format(distributed_context_multi_node_nccl["local_rank"]) _test_distrib_multilabel_input_NHW(device) _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) @pytest.mark.tpu @@ -839,6 +856,7 @@ def test_distrib_single_device_xla(): _test_distrib_multilabel_input_NHW(device) _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) def _test_distrib_xla_nprocs(index): @@ -846,6 +864,7 @@ def _test_distrib_xla_nprocs(index): _test_distrib_multilabel_input_NHW(device) _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) @pytest.mark.tpu diff --git a/tests/ignite/metrics/test_running_average.py b/tests/ignite/metrics/test_running_average.py index c66fdfabdc5e..ab1528e10596 100644 --- a/tests/ignite/metrics/test_running_average.py +++ b/tests/ignite/metrics/test_running_average.py @@ -345,7 +345,7 @@ def manual_running_avg_acc(engine): ) true_acc_metric.update(output) - batch_acc = true_acc_metric._num_correct * 1.0 / true_acc_metric._num_examples + batch_acc = true_acc_metric._num_correct.item() * 1.0 / true_acc_metric._num_examples if running_avg_acc[0] is None: running_avg_acc[0] = batch_acc From a87f93de6594a66721d1c454070eedb2eff80d6d Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Fri, 7 Aug 2020 17:14:21 -0400 Subject: [PATCH 02/32] update loss metric to accumulate _sum in a tensor on the right device --- ignite/metrics/loss.py | 6 +++--- tests/ignite/metrics/test_loss.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/ignite/metrics/loss.py b/ignite/metrics/loss.py index 5a4133c84d95..f44ce3f6f193 100644 --- a/ignite/metrics/loss.py +++ b/ignite/metrics/loss.py @@ -45,7 +45,7 @@ def __init__( @reinit__is_reduced def reset(self) -> None: - self._sum = 0 + self._sum = torch.tensor(0.0, device=self._device) self._num_examples = 0 @reinit__is_reduced @@ -61,11 +61,11 @@ def update(self, output: Sequence[Union[torch.Tensor, dict]]) -> None: raise ValueError("loss_fn did not return the average loss.") n = self._batch_size(y) - self._sum += average_loss.item() * n + self._sum += average_loss.detach().to(self._device) * n self._num_examples += n @sync_all_reduce("_sum", "_num_examples") def compute(self) -> float: if self._num_examples == 0: raise NotComputableError("Loss must have at least one example before it can be computed.") - return self._sum / self._num_examples + return self._sum.item() / self._num_examples diff --git a/tests/ignite/metrics/test_loss.py b/tests/ignite/metrics/test_loss.py index 5244e7991d7d..4739bca16d1f 100644 --- a/tests/ignite/metrics/test_loss.py +++ b/tests/ignite/metrics/test_loss.py @@ -107,6 +107,28 @@ def _test_distrib_compute_on_criterion(device): assert_almost_equal(res, true_loss_value.item()) +def _test_distrib_sum_device(device): + device = torch.device(device) + loss = Loss(nll_loss, device=device) + assert loss._device == device + + y_pred = torch.tensor([[0.1, 0.4, 0.5], [0.1, 0.7, 0.2]]).log() + y = torch.tensor([2, 2]).long() + loss.update((y_pred, y)) + + assert loss._sum.device == device + + +def test_sum_detached(): + loss = Loss(nll_loss) + + y_pred = torch.tensor([[0.1, 0.4, 0.5], [0.1, 0.7, 0.2]], requires_grad=True).log() + y = torch.tensor([2, 2]).long() + 
loss.update((y_pred, y)) + + assert not loss._sum.requires_grad + + @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") @@ -114,6 +136,7 @@ def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): device = "cuda:{}".format(local_rank) _test_distrib_compute_on_criterion(device) + _test_distrib_sum_device(device) @pytest.mark.distributed @@ -122,6 +145,7 @@ def test_distrib_cpu(distributed_context_single_node_gloo): device = "cpu" _test_distrib_compute_on_criterion(device) + _test_distrib_sum_device(device) @pytest.mark.distributed @@ -133,6 +157,7 @@ def test_distrib_hvd(gloo_hvd_executor): nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() gloo_hvd_executor(_test_distrib_compute_on_criterion, (device,), np=nproc, do_init=True) + gloo_hvd_executor(_test_distrib_sum_device, (device,), np=nproc, do_init=True) @pytest.mark.multinode_distributed @@ -141,6 +166,7 @@ def test_distrib_hvd(gloo_hvd_executor): def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): device = "cpu" _test_distrib_compute_on_criterion(device) + _test_distrib_sum_device(device) @pytest.mark.multinode_distributed @@ -149,6 +175,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): device = "cuda:{}".format(distributed_context_multi_node_nccl["local_rank"]) _test_distrib_compute_on_criterion(device) + _test_distrib_sum_device(device) @pytest.mark.tpu @@ -157,6 +184,7 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): def test_distrib_single_device_xla(): device = idist.device() _test_distrib_compute_on_criterion(device) + _test_distrib_sum_device(device) def _test_distrib_xla_nprocs(index): From 30b2e19e400aca12cc0b8e07dc6bc72f50df5ab1 Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Fri, 7 Aug 2020 17:25:27 -0400 Subject: [PATCH 03/32] update mae metric to accumulate in a tensor on the right device --- ignite/metrics/mean_absolute_error.py | 6 ++-- .../metrics/test_mean_absolute_error.py | 28 +++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/ignite/metrics/mean_absolute_error.py b/ignite/metrics/mean_absolute_error.py index 86e699be096f..c3c9a9a9ed2f 100644 --- a/ignite/metrics/mean_absolute_error.py +++ b/ignite/metrics/mean_absolute_error.py @@ -17,18 +17,18 @@ class MeanAbsoluteError(Metric): @reinit__is_reduced def reset(self) -> None: - self._sum_of_absolute_errors = 0.0 + self._sum_of_absolute_errors = torch.tensor(0.0, device=self._device) self._num_examples = 0 @reinit__is_reduced def update(self, output: Sequence[torch.Tensor]) -> None: y_pred, y = output absolute_errors = torch.abs(y_pred - y.view_as(y_pred)) - self._sum_of_absolute_errors += torch.sum(absolute_errors).item() + self._sum_of_absolute_errors += torch.sum(absolute_errors).detach().to(self._device) self._num_examples += y.shape[0] @sync_all_reduce("_sum_of_absolute_errors", "_num_examples") def compute(self) -> Union[float, torch.Tensor]: if self._num_examples == 0: raise NotComputableError("MeanAbsoluteError must have at least one example before it can be computed.") - return self._sum_of_absolute_errors / self._num_examples + return self._sum_of_absolute_errors.item() / self._num_examples diff --git a/tests/ignite/metrics/test_mean_absolute_error.py b/tests/ignite/metrics/test_mean_absolute_error.py index 
8279470dd07e..557c4182f3b4 100644 --- a/tests/ignite/metrics/test_mean_absolute_error.py +++ b/tests/ignite/metrics/test_mean_absolute_error.py @@ -65,12 +65,34 @@ def update(engine, i): assert pytest.approx(res) == true_res +def _test_distrib_accumulator_device(device): + device = torch.device(device) + mae = MeanAbsoluteError(device=device) + assert mae._device == device + + y_pred = torch.tensor([[2.0], [-2.0]]) + y = torch.zeros(2) + mae.update((y_pred, y)) + assert mae._sum_of_absolute_errors.device == device + + +def test_accumulator_detached(): + mae = MeanAbsoluteError() + + y_pred = torch.tensor([[2.0], [-2.0]], requires_grad=True) + y = torch.zeros(2) + mae.update((y_pred, y)) + + assert not mae._sum_of_absolute_errors.requires_grad + + @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): device = "cuda:{}".format(local_rank) _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.distributed @@ -78,6 +100,7 @@ def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): def test_distrib_cpu(distributed_context_single_node_gloo): device = "cpu" _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.distributed @@ -89,6 +112,7 @@ def test_distrib_hvd(gloo_hvd_executor): nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() gloo_hvd_executor(_test_distrib_integration, (device,), np=nproc, do_init=True) + gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True) @pytest.mark.multinode_distributed @@ -97,6 +121,7 @@ def test_distrib_hvd(gloo_hvd_executor): def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): device = "cpu" _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.multinode_distributed @@ -105,6 +130,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): device = "cuda:{}".format(distributed_context_multi_node_nccl["local_rank"]) _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.tpu @@ -113,11 +139,13 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): def test_distrib_single_device_xla(): device = idist.device() _test_distrib_integration(device) + _test_distrib_accumulator_device(device) def _test_distrib_xla_nprocs(index): device = idist.device() _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.tpu From a3e237c42ecd1860ea31d6f539c9678cdbb36267 Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Fri, 7 Aug 2020 17:37:40 -0400 Subject: [PATCH 04/32] update mpd metric to accumulate in a tensor on the right device --- ignite/metrics/mean_pairwise_distance.py | 6 ++-- .../metrics/test_mean_pairwise_distance.py | 29 +++++++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/ignite/metrics/mean_pairwise_distance.py b/ignite/metrics/mean_pairwise_distance.py index 9e9239ee6553..b1f1c9a52c26 100644 --- a/ignite/metrics/mean_pairwise_distance.py +++ b/ignite/metrics/mean_pairwise_distance.py @@ -29,18 +29,18 @@ def __init__( @reinit__is_reduced def reset(self): - self._sum_of_distances = 0.0 + self._sum_of_distances = torch.tensor(0.0, device=self._device) self._num_examples = 0 
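+ # Keeping the running sum in a tensor on self._device (rather than a Python float built with .item()) avoids a host-device synchronization on every update and lets sync_all_reduce operate on the accumulator directly.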
@reinit__is_reduced def update(self, output: Sequence[torch.Tensor]) -> None: y_pred, y = output distances = pairwise_distance(y_pred, y, p=self._p, eps=self._eps) - self._sum_of_distances += torch.sum(distances).item() + self._sum_of_distances += torch.sum(distances).detach().to(self._device) self._num_examples += y.shape[0] @sync_all_reduce("_sum_of_distances", "_num_examples") def compute(self) -> Union[float, torch.Tensor]: if self._num_examples == 0: raise NotComputableError("MeanPairwiseDistance must have at least one example before it can be computed.") - return self._sum_of_distances / self._num_examples + return self._sum_of_distances.item() / self._num_examples diff --git a/tests/ignite/metrics/test_mean_pairwise_distance.py b/tests/ignite/metrics/test_mean_pairwise_distance.py index 7ada0b5474a5..f5ea4e731ef7 100644 --- a/tests/ignite/metrics/test_mean_pairwise_distance.py +++ b/tests/ignite/metrics/test_mean_pairwise_distance.py @@ -78,12 +78,35 @@ def update(engine, i): assert pytest.approx(res) == true_res +def _test_distrib_accumulator_device(device): + device = torch.device(device) + mpd = MeanPairwiseDistance(device=device) + assert mpd._device == device + + y_pred = torch.Tensor([[3.0, 4.0], [-3.0, -4.0]]) + y = torch.zeros(2, 2) + mpd.update((y_pred, y)) + + assert mpd._sum_of_distances.device == device + + +def test_accumulator_detached(): + mpd = MeanPairwiseDistance() + + y_pred = torch.tensor([[3.0, 4.0], [-3.0, -4.0]], requires_grad=True) + y = torch.zeros(2, 2) + mpd.update((y_pred, y)) + + assert not mpd._sum_of_distances.requires_grad + + @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): device = "cuda:{}".format(local_rank) _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.distributed @@ -91,6 +114,7 @@ def test_distrib_cpu(distributed_context_single_node_gloo): device = "cpu" _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.distributed @@ -102,6 +126,7 @@ def test_distrib_hvd(gloo_hvd_executor): nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() gloo_hvd_executor(_test_distrib_integration, (device,), np=nproc, do_init=True) + gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True) @pytest.mark.multinode_distributed @@ -110,6 +135,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): device = "cpu" _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.multinode_distributed @@ -118,6 +144,7 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): device = "cuda:{}".format(distributed_context_multi_node_nccl["local_rank"]) _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.tpu @@ -126,11 +153,13 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): def test_distrib_single_device_xla(): device = idist.device() _test_distrib_integration(device) + _test_distrib_accumulator_device(device) def _test_distrib_xla_nprocs(index): device = idist.device() _test_distrib_integration(device) + 
_test_distrib_accumulator_device(device) @pytest.mark.tpu From 71001760f91c5cb390ded84216461402e2f48557 Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Fri, 7 Aug 2020 17:41:45 -0400 Subject: [PATCH 05/32] update mse metric to accumulate in a tensor on the right device --- ignite/metrics/mean_squared_error.py | 6 ++-- .../ignite/metrics/test_mean_squared_error.py | 28 +++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/ignite/metrics/mean_squared_error.py b/ignite/metrics/mean_squared_error.py index 4c5a9ee3371c..29d1dc07639c 100644 --- a/ignite/metrics/mean_squared_error.py +++ b/ignite/metrics/mean_squared_error.py @@ -17,18 +17,18 @@ class MeanSquaredError(Metric): @reinit__is_reduced def reset(self) -> None: - self._sum_of_squared_errors = 0.0 + self._sum_of_squared_errors = torch.tensor(0.0, device=self._device) self._num_examples = 0 @reinit__is_reduced def update(self, output: Sequence[torch.Tensor]) -> None: y_pred, y = output squared_errors = torch.pow(y_pred - y.view_as(y_pred), 2) - self._sum_of_squared_errors += torch.sum(squared_errors).item() + self._sum_of_squared_errors += torch.sum(squared_errors).detach().to(self._device) self._num_examples += y.shape[0] @sync_all_reduce("_sum_of_squared_errors", "_num_examples") def compute(self) -> Union[float, torch.Tensor]: if self._num_examples == 0: raise NotComputableError("MeanSquaredError must have at least one example before it can be computed.") - return self._sum_of_squared_errors / self._num_examples + return self._sum_of_squared_errors.item() / self._num_examples diff --git a/tests/ignite/metrics/test_mean_squared_error.py b/tests/ignite/metrics/test_mean_squared_error.py index 59ce1fdc567d..08552836531f 100644 --- a/tests/ignite/metrics/test_mean_squared_error.py +++ b/tests/ignite/metrics/test_mean_squared_error.py @@ -65,6 +65,27 @@ def update(engine, i): assert pytest.approx(res, rel=tol) == true_res +def _test_distrib_accumulator_device(device): + device = torch.device(device) + mse = MeanSquaredError(device=device) + assert mse._device == device + + y_pred = torch.tensor([[2.0], [-2.0]]) + y = torch.zeros(2) + mse.update((y_pred, y)) + assert mse._sum_of_squared_errors.device == device + + +def test_accumulator_detached(): + mse = MeanSquaredError() + + y_pred = torch.tensor([[2.0], [-2.0]], requires_grad=True) + y = torch.zeros(2) + mse.update((y_pred, y)) + + assert not mse._sum_of_squared_errors.requires_grad + + @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") @@ -72,6 +93,7 @@ def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): device = "cuda:{}".format(local_rank) _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.distributed @@ -79,6 +101,7 @@ def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): def test_distrib_cpu(distributed_context_single_node_gloo): device = "cpu" _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.distributed @@ -90,6 +113,7 @@ def test_distrib_hvd(gloo_hvd_executor): nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() gloo_hvd_executor(_test_distrib_integration, (device,), np=nproc, do_init=True) + gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True) @pytest.mark.multinode_distributed @@ -98,6 +122,7 @@ def 
test_distrib_hvd(gloo_hvd_executor): def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): device = "cpu" _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.multinode_distributed @@ -106,6 +131,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): device = "cuda:{}".format(distributed_context_multi_node_nccl["local_rank"]) _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.tpu @@ -114,11 +140,13 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): def test_distrib_single_device_xla(): device = idist.device() _test_distrib_integration(device, tol=1e-4) + _test_distrib_accumulator_device(device) def _test_distrib_xla_nprocs(index): device = idist.device() _test_distrib_integration(device, tol=1e-4) + _test_distrib_accumulator_device(device) @pytest.mark.tpu From 3228a0abc270af213c1fbd7a60189d2fefa7a54a Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Fri, 7 Aug 2020 17:53:19 -0400 Subject: [PATCH 06/32] update top k accuracy metric to accumulate in a tensor on the right device --- ignite/metrics/top_k_categorical_accuracy.py | 6 +++--- .../metrics/test_top_k_categorical_accuracy.py | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/ignite/metrics/top_k_categorical_accuracy.py b/ignite/metrics/top_k_categorical_accuracy.py index 3fb493ed8441..6d33d51001c5 100644 --- a/ignite/metrics/top_k_categorical_accuracy.py +++ b/ignite/metrics/top_k_categorical_accuracy.py @@ -23,7 +23,7 @@ def __init__( @reinit__is_reduced def reset(self) -> None: - self._num_correct = 0 + self._num_correct = torch.tensor(0, device=self._device) self._num_examples = 0 @reinit__is_reduced @@ -32,7 +32,7 @@ def update(self, output: Sequence) -> None: y_pred, y = output sorted_indices = torch.topk(y_pred, self._k, dim=1)[1] expanded_y = y.view(-1, 1).expand(-1, self._k) correct = torch.sum(torch.eq(sorted_indices, expanded_y), dim=1) - self._num_correct += torch.sum(correct).item() + self._num_correct += torch.sum(correct).to(self._device) self._num_examples += correct.shape[0] @sync_all_reduce("_num_correct", "_num_examples") @@ -41,4 +41,4 @@ def compute(self) -> Union[float, torch.Tensor]: if self._num_examples == 0: raise NotComputableError( "TopKCategoricalAccuracy must have at least one example before it can be computed." 
) - return self._num_correct / self._num_examples + return self._num_correct.item() / self._num_examples diff --git a/tests/ignite/metrics/test_top_k_categorical_accuracy.py b/tests/ignite/metrics/test_top_k_categorical_accuracy.py index 6caf39a08f71..0f22d2f6b697 100644 --- a/tests/ignite/metrics/test_top_k_categorical_accuracy.py +++ b/tests/ignite/metrics/test_top_k_categorical_accuracy.py @@ -99,12 +99,24 @@ def update(engine, i): _test(n_epochs=2) +def _test_distrib_accumulator_device(device): + device = torch.device(device) + acc = TopKCategoricalAccuracy(2, device=device) + assert acc._device == device + + y_pred = torch.tensor([[0.2, 0.4, 0.6, 0.8], [0.8, 0.6, 0.4, 0.2]]) + y = torch.ones(2).long() + acc.update((y_pred, y)) + assert acc._num_correct.device == device + + @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): device = "cuda:{}".format(local_rank) _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.distributed @@ -112,6 +124,7 @@ def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): def test_distrib_cpu(local_rank, distributed_context_single_node_gloo): device = "cpu" _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.distributed @@ -123,6 +136,7 @@ def test_distrib_hvd(gloo_hvd_executor): nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() gloo_hvd_executor(_test_distrib_integration, (device,), np=nproc, do_init=True) + gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True) @pytest.mark.multinode_distributed @@ -131,6 +145,7 @@ def test_distrib_hvd(gloo_hvd_executor): def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): device = "cpu" _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.multinode_distributed @@ -139,6 +154,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): device = "cuda:{}".format(distributed_context_multi_node_nccl["local_rank"]) _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.tpu @@ -147,11 +163,13 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): def test_distrib_single_device_xla(): device = idist.device() _test_distrib_integration(device) + _test_distrib_accumulator_device(device) def _test_distrib_xla_nprocs(index): device = idist.device() _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.tpu From 412551edc5161e64632911225e4ace06164ddc85 Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Fri, 7 Aug 2020 23:08:21 -0400 Subject: [PATCH 07/32] update precision and recall metrics to accumulate in tensors on the right device --- ignite/metrics/precision.py | 28 +++++++++------ ignite/metrics/recall.py | 12 +++---- tests/ignite/metrics/test_precision.py | 50 ++++++++++++++++++++++++++ tests/ignite/metrics/test_recall.py | 50 ++++++++++++++++++++++++++ 4 files changed, 122 insertions(+), 18 deletions(-) diff --git a/ignite/metrics/precision.py b/ignite/metrics/precision.py index 2b8152630ddb..aa84cff2f453 100644 --- a/ignite/metrics/precision.py +++ b/ignite/metrics/precision.py @@ -18,7 +18,7 @@ def __init__( output_transform: Callable = lambda 
x: x, average: bool = False, is_multilabel: bool = False, - device: Optional[Union[str, torch.device]] = None, + device: Optional[Union[str, torch.device]] = torch.device("cpu"), ): if idist.get_world_size() > 1: if (not average) and is_multilabel: @@ -39,13 +39,20 @@ def __init__( @reinit__is_reduced def reset(self) -> None: - dtype = torch.float64 - self._true_positives = torch.tensor([], dtype=dtype) if (self._is_multilabel and not self._average) else 0 - self._positives = torch.tensor([], dtype=dtype) if (self._is_multilabel and not self._average) else 0 + if self._is_multilabel: + init_value = 0.0 if self._average else [] + kws = {'dtype': torch.float64, 'device': self._device} + self._true_positives = torch.tensor(init_value, **kws) + self._positives = torch.tensor(init_value, **kws) + else: + self._true_positives = 0 + self._positives = 0 + super(_BasePrecisionRecall, self).reset() def compute(self) -> Union[torch.Tensor, float]: - if not (isinstance(self._positives, torch.Tensor) or self._positives > 0): + is_scalar = not isinstance(self._positives, torch.Tensor) or self._positives.ndim == 0 + if is_scalar and self._positives == 0: raise NotComputableError( "{} must have at least one example before" " it can be computed.".format(self.__class__.__name__) ) @@ -124,7 +131,7 @@ def __init__( output_transform: Callable = lambda x: x, average: bool = False, is_multilabel: bool = False, - device: Optional[Union[str, torch.device]] = None, + device: Optional[Union[str, torch.device]] = torch.device("cpu"), ): super(Precision, self).__init__( output_transform=output_transform, average=average, is_multilabel=is_multilabel, device=device @@ -155,17 +162,16 @@ def update(self, output: Sequence[torch.Tensor]) -> None: y_pred = torch.transpose(y_pred, 1, 0).reshape(num_classes, -1) y = torch.transpose(y, 1, 0).reshape(num_classes, -1) - y = y.to(y_pred) + # Convert from int cuda/cpu to double on self._device + y_pred = y_pred.to(dtype=torch.float64, device=self._device) + y = y.to(dtype=torch.float64, device=self._device) correct = y * y_pred - all_positives = y_pred.sum(dim=0).type(torch.DoubleTensor) # Convert from int cuda/cpu to double cpu + all_positives = y_pred.sum(dim=0) if correct.sum() == 0: true_positives = torch.zeros_like(all_positives) else: true_positives = correct.sum(dim=0) - # Convert from int cuda/cpu to double cpu - # We need double precision for the division true_positives / all_positives - true_positives = true_positives.type(torch.DoubleTensor) if self._type == "multilabel": if not self._average: diff --git a/ignite/metrics/recall.py b/ignite/metrics/recall.py index 048c11b10c5b..c3378d3f3f6a 100644 --- a/ignite/metrics/recall.py +++ b/ignite/metrics/recall.py @@ -69,7 +69,7 @@ def __init__( output_transform: Callable = lambda x: x, average: bool = False, is_multilabel: bool = False, - device: Optional[Union[str, torch.device]] = None, + device: Optional[Union[str, torch.device]] = torch.device("cpu"), ): super(Recall, self).__init__( output_transform=output_transform, average=average, is_multilabel=is_multilabel, device=device @@ -100,19 +100,17 @@ def update(self, output: Sequence[torch.Tensor]) -> None: y_pred = torch.transpose(y_pred, 1, 0).reshape(num_classes, -1) y = torch.transpose(y, 1, 0).reshape(num_classes, -1) - y = y.type_as(y_pred) + # Convert from int cuda/cpu to double on self._device + y_pred = y_pred.to(dtype=torch.float64, device=self._device) + y = y.to(dtype=torch.float64, device=self._device) correct = y * y_pred - actual_positives = 
y.sum(dim=0).type(torch.DoubleTensor) # Convert from int cuda/cpu to double cpu + actual_positives = y.sum(dim=0) if correct.sum() == 0: true_positives = torch.zeros_like(actual_positives) else: true_positives = correct.sum(dim=0) - # Convert from int cuda/cpu to double cpu - # We need double precision for the division true_positives / actual_positives - true_positives = true_positives.type(torch.DoubleTensor) - if self._type == "multilabel": if not self._average: self._true_positives = torch.cat([self._true_positives, true_positives], dim=0) diff --git a/tests/ignite/metrics/test_precision.py b/tests/ignite/metrics/test_precision.py index 94f1d643585f..9ea0741fc835 100644 --- a/tests/ignite/metrics/test_precision.py +++ b/tests/ignite/metrics/test_precision.py @@ -833,6 +833,42 @@ def update(engine, i): assert (pr_compute1 == pr_compute2).all() +def _test_distrib_accumulator_device(device): + # Binary accuracy on input of shape (N, 1) or (N, ) + device = torch.device(device) + + def _test(average): + pr = Precision(average=average, device=device) + assert pr._device == device + + y_pred = torch.randint(0, 2, size=(10,)) + y = torch.randint(0, 2, size=(10,)).long() + pr.update((y_pred, y)) + + assert pr._true_positives.device == device + assert pr._positives.device == device + + _test(True) + _test(False) + + +def _test_distrib_multilabel_accumulator_device(device): + # Multiclass input data of shape (N, ) and (N, C) + device = torch.device(device) + + def _test(average): + pr = Precision(is_multilabel=True, average=average, device=device) + y_pred = torch.randint(0, 2, size=(10, 4, 20, 23)) + y = torch.randint(0, 2, size=(10, 4, 20, 23)).long() + pr.update((y_pred, y)) + + assert pr._true_positives.device == device + assert pr._positives.device == device + + _test(True) + _test(False) + + @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): device = "cuda:{}".format(local_rank) _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) + _test_distrib_multilabel_accumulator_device(device) @pytest.mark.distributed @@ -848,6 +886,8 @@ def test_distrib_cpu(local_rank, distributed_context_single_node_gloo): device = "cpu" _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) + _test_distrib_multilabel_accumulator_device(device) @pytest.mark.distributed @@ -860,6 +900,8 @@ def test_distrib_hvd(gloo_hvd_executor): gloo_hvd_executor(_test_distrib_integration_multiclass, (device,), np=nproc, do_init=True) gloo_hvd_executor(_test_distrib_integration_multilabel, (device,), np=nproc, do_init=True) + gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True) + gloo_hvd_executor(_test_distrib_multilabel_accumulator_device, (device,), np=nproc, do_init=True) @pytest.mark.multinode_distributed @@ -869,6 +911,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): device = "cpu" _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) + _test_distrib_multilabel_accumulator_device(device) @pytest.mark.multinode_distributed @@ -878,6 +922,8 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): 
device = "cuda:{}".format(distributed_context_multi_node_nccl["local_rank"]) _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) + _test_distrib_multilabel_accumulator_device(device) @pytest.mark.tpu @@ -887,12 +933,16 @@ def test_distrib_single_device_xla(): device = idist.device() _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) + _test_distrib_multilabel_accumulator_device(device) def _test_distrib_xla_nprocs(index): device = idist.device() _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) + _test_distrib_multilabel_accumulator_device(device) @pytest.mark.tpu diff --git a/tests/ignite/metrics/test_recall.py b/tests/ignite/metrics/test_recall.py index 214fc43c4314..b783b2ec83ce 100644 --- a/tests/ignite/metrics/test_recall.py +++ b/tests/ignite/metrics/test_recall.py @@ -833,6 +833,42 @@ def update(engine, i): assert (re_compute1 == re_compute2).all() +def _test_distrib_accumulator_device(device): + # Binary accuracy on input of shape (N, 1) or (N, ) + device = torch.device(device) + + def _test(average): + re = Recall(average=average, device=device) + assert re._device == device + + y_reed = torch.randint(0, 2, size=(10,)) + y = torch.randint(0, 2, size=(10,)).long() + re.update((y_reed, y)) + + assert re._true_positives.device == device + assert re._positives.device == device + + _test(True) + _test(False) + + +def _test_distrib_multilabel_accumulator_device(device): + # Multiclass input data of shape (N, ) and (N, C) + device = torch.device(device) + + def _test(average): + re = Recall(is_multilabel=True, average=average, device=device) + y_reed = torch.randint(0, 2, size=(10, 4, 20, 23)) + y = torch.randint(0, 2, size=(10, 4, 20, 23)).long() + re.update((y_reed, y)) + + assert re._true_positives.device == device + assert re._positives.device == device + + _test(True) + _test(False) + + @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") @@ -840,6 +876,8 @@ def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): device = "cuda:{}".format(local_rank) _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) + _test_distrib_multilabel_accumulator_device(device) @pytest.mark.distributed @@ -848,6 +886,8 @@ def test_distrib_cpu(distributed_context_single_node_gloo): device = "cpu" _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) + _test_distrib_multilabel_accumulator_device(device) @pytest.mark.distributed @@ -860,6 +900,8 @@ def test_distrib_hvd(gloo_hvd_executor): gloo_hvd_executor(_test_distrib_integration_multiclass, (device,), np=nproc, do_init=True) gloo_hvd_executor(_test_distrib_integration_multilabel, (device,), np=nproc, do_init=True) + gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True) + gloo_hvd_executor(_test_distrib_multilabel_accumulator_device, (device,), np=nproc, do_init=True) @pytest.mark.multinode_distributed @@ -869,6 +911,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): device = "cpu" _test_distrib_integration_multiclass(device) 
_test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) + _test_distrib_multilabel_accumulator_device(device) @pytest.mark.multinode_distributed @@ -878,6 +922,8 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): device = "cuda:{}".format(distributed_context_multi_node_nccl["local_rank"]) _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) + _test_distrib_multilabel_accumulator_device(device) @pytest.mark.tpu @@ -887,12 +933,16 @@ def test_distrib_single_device_xla(): device = idist.device() _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) + _test_distrib_multilabel_accumulator_device(device) def _test_distrib_xla_nprocs(index): device = idist.device() _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) + _test_distrib_multilabel_accumulator_device(device) @pytest.mark.tpu From 4c4a76cffb40d795d27d7018f90c175a49c5736e Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Fri, 7 Aug 2020 23:08:30 -0400 Subject: [PATCH 08/32] ..... --- tests/run_cpu_tests.sh | 0 tests/run_gpu_tests.sh | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tests/run_cpu_tests.sh mode change 100644 => 100755 tests/run_gpu_tests.sh diff --git a/tests/run_cpu_tests.sh b/tests/run_cpu_tests.sh old mode 100644 new mode 100755 diff --git a/tests/run_gpu_tests.sh b/tests/run_gpu_tests.sh old mode 100644 new mode 100755 From b1e6956814b8f551231683bf6cb8c72bbfce0586 Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Fri, 7 Aug 2020 23:09:03 -0400 Subject: [PATCH 09/32] black formatting --- ignite/metrics/precision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/metrics/precision.py b/ignite/metrics/precision.py index aa84cff2f453..b8d920d61964 100644 --- a/ignite/metrics/precision.py +++ b/ignite/metrics/precision.py @@ -41,7 +41,7 @@ def __init__( def reset(self) -> None: if self._is_multilabel: init_value = 0.0 if self._average else [] - kws = {'dtype': torch.float64, 'device': self._device} + kws = {"dtype": torch.float64, "device": self._device} self._true_positives = torch.tensor(init_value, **kws) self._positives = torch.tensor(init_value, **kws) else: From b081e92df7a40ebea446adb0c06c23f9bbe9369a Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Mon, 10 Aug 2020 00:06:44 -0400 Subject: [PATCH 10/32] reverted run*.sh --- tests/run_cpu_tests.sh | 0 tests/run_gpu_tests.sh | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 tests/run_cpu_tests.sh mode change 100755 => 100644 tests/run_gpu_tests.sh diff --git a/tests/run_cpu_tests.sh b/tests/run_cpu_tests.sh old mode 100755 new mode 100644 diff --git a/tests/run_gpu_tests.sh b/tests/run_gpu_tests.sh old mode 100755 new mode 100644 From a343c358923acea1448f85967e9ac57ddd31084b Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Sun, 16 Aug 2020 03:42:08 -0400 Subject: [PATCH 11/32] change all metrics default device to cpu except running_average --- ignite/metrics/accumulation.py | 13 ++++++++++--- ignite/metrics/accuracy.py | 4 ++-- ignite/metrics/confusion_matrix.py | 2 +- ignite/metrics/fbeta.py | 2 +- ignite/metrics/frequency.py | 6 +++++- ignite/metrics/loss.py | 2 +- ignite/metrics/mean_pairwise_distance.py | 2 +- ignite/metrics/metric.py | 4 +++- 
ignite/metrics/top_k_categorical_accuracy.py | 5 ++++- 9 files changed, 28 insertions(+), 12 deletions(-) diff --git a/ignite/metrics/accumulation.py b/ignite/metrics/accumulation.py index dff45ee87fcc..e72debfa98c3 100644 --- a/ignite/metrics/accumulation.py +++ b/ignite/metrics/accumulation.py @@ -38,7 +38,10 @@ class VariableAccumulation(Metric): _required_output_keys = None def __init__( - self, op: Callable, output_transform: Callable = lambda x: x, device: Optional[Union[str, torch.device]] = None + self, + op: Callable, + output_transform: Callable = lambda x: x, + device: Optional[Union[str, torch.device]] = torch.device("cpu"), ): if not callable(op): raise TypeError("Argument op should be a callable, but given {}".format(type(op))) @@ -115,7 +118,9 @@ class Average(VariableAccumulation): """ - def __init__(self, output_transform: Callable = lambda x: x, device: Optional[Union[str, torch.device]] = None): + def __init__( + self, output_transform: Callable = lambda x: x, device: Optional[Union[str, torch.device]] = torch.device("cpu") + ): def _mean_op(a, x): if isinstance(x, torch.Tensor) and x.ndim > 1: x = x.sum(dim=0) @@ -159,7 +164,9 @@ class GeometricAverage(VariableAccumulation): """ - def __init__(self, output_transform: Callable = lambda x: x, device: Optional[Union[str, torch.device]] = None): + def __init__( + self, output_transform: Callable = lambda x: x, device: Optional[Union[str, torch.device]] = torch.device("cpu") + ): def _geom_op(a: torch.Tensor, x: Union[Any, numbers.Number, torch.Tensor]) -> torch.Tensor: if not isinstance(x, torch.Tensor): x = torch.tensor(x) diff --git a/ignite/metrics/accuracy.py b/ignite/metrics/accuracy.py index c41053ef5b99..c79d93b79c64 100644 --- a/ignite/metrics/accuracy.py +++ b/ignite/metrics/accuracy.py @@ -13,7 +13,7 @@ def __init__( self, output_transform: Callable = lambda x: x, is_multilabel: bool = False, - device: Optional[Union[str, torch.device]] = None, + device: Optional[Union[str, torch.device]] = torch.device("cpu"), ): self._is_multilabel = is_multilabel self._type = None @@ -130,7 +130,7 @@ def __init__( self, output_transform: Callable = lambda x: x, is_multilabel: bool = False, - device: Optional[Union[str, torch.device]] = None, + device: Optional[Union[str, torch.device]] = torch.device("cpu"), ): self._num_correct = None self._num_examples = None diff --git a/ignite/metrics/confusion_matrix.py b/ignite/metrics/confusion_matrix.py index 2ab1f436bace..ab9b188a6880 100644 --- a/ignite/metrics/confusion_matrix.py +++ b/ignite/metrics/confusion_matrix.py @@ -44,7 +44,7 @@ def __init__( num_classes: int, average: Optional[str] = None, output_transform: Callable = lambda x: x, - device: Optional[Union[str, torch.device]] = None, + device: Optional[Union[str, torch.device]] = torch.device("cpu"), ): if average is not None and average not in ("samples", "recall", "precision"): raise ValueError("Argument average can None or one of 'samples', 'recall', 'precision'") diff --git a/ignite/metrics/fbeta.py b/ignite/metrics/fbeta.py index 05e217846115..1383b520364d 100644 --- a/ignite/metrics/fbeta.py +++ b/ignite/metrics/fbeta.py @@ -15,7 +15,7 @@ def Fbeta( precision: Optional[Precision] = None, recall: Optional[Recall] = None, output_transform: Optional[Callable] = None, - device: Optional[Union[str, torch.device]] = None, + device: Optional[Union[str, torch.device]] = torch.device("cpu"), ) -> MetricsLambda: """Calculates F-beta score diff --git a/ignite/metrics/frequency.py b/ignite/metrics/frequency.py index 
75eba360bb53..fa91e36df0b1 100644 --- a/ignite/metrics/frequency.py +++ b/ignite/metrics/frequency.py @@ -1,3 +1,5 @@ +from typing import Callable, Optional, Union + import torch import ignite.distributed as idist @@ -35,7 +37,9 @@ class Frequency(Metric): # Epoch [2/10]: [50/100] 50%|█████ , wps=400 [00:17<00:35] """ - def __init__(self, output_transform=lambda x: x, device=None): + def __init__( + self, output_transform: Callable = lambda x: x, device: Optional[Union[str, torch.device]] = torch.device("cpu") + ): self._timer = None self._acc = None self._n = None diff --git a/ignite/metrics/loss.py b/ignite/metrics/loss.py index f44ce3f6f193..c6fb85171894 100644 --- a/ignite/metrics/loss.py +++ b/ignite/metrics/loss.py @@ -37,7 +37,7 @@ def __init__( loss_fn: Callable, output_transform: Callable = lambda x: x, batch_size: Callable = lambda x: len(x), - device: Optional[Union[str, torch.device]] = None, + device: Optional[Union[str, torch.device]] = torch.device("cpu"), ): super(Loss, self).__init__(output_transform, device=device) self._loss_fn = loss_fn diff --git a/ignite/metrics/mean_pairwise_distance.py b/ignite/metrics/mean_pairwise_distance.py index b1f1c9a52c26..1edbf6009d3a 100644 --- a/ignite/metrics/mean_pairwise_distance.py +++ b/ignite/metrics/mean_pairwise_distance.py @@ -21,7 +21,7 @@ def __init__( p: int = 2, eps: float = 1e-6, output_transform: Callable = lambda x: x, - device: Optional[Union[str, torch.device]] = None, + device: Optional[Union[str, torch.device]] = torch.device("cpu"), ): super(MeanPairwiseDistance, self).__init__(output_transform, device=device) self._p = p diff --git a/ignite/metrics/metric.py b/ignite/metrics/metric.py index 4d78044f0406..89a62da2b259 100644 --- a/ignite/metrics/metric.py +++ b/ignite/metrics/metric.py @@ -129,7 +129,9 @@ class Metric(metaclass=ABCMeta): _required_output_keys = ("y_pred", "y") def __init__( - self, output_transform: Callable = lambda x: x, device: Optional[Union[str, torch.device]] = None, + self, + output_transform: Callable = lambda x: x, + device: Optional[Union[str, torch.device]] = torch.device("cpu"), ): self._output_transform = output_transform diff --git a/ignite/metrics/top_k_categorical_accuracy.py b/ignite/metrics/top_k_categorical_accuracy.py index 6d33d51001c5..8c41265d81e6 100644 --- a/ignite/metrics/top_k_categorical_accuracy.py +++ b/ignite/metrics/top_k_categorical_accuracy.py @@ -16,7 +16,10 @@ class TopKCategoricalAccuracy(Metric): """ def __init__( - self, k=5, output_transform: Callable = lambda x: x, device: Optional[Union[str, torch.device]] = None + self, + k=5, + output_transform: Callable = lambda x: x, + device: Optional[Union[str, torch.device]] = torch.device("cpu"), ): super(TopKCategoricalAccuracy, self).__init__(output_transform, device=device) self._k = k From 854860113abfa5bf81f61f9437335c58c6977c57 Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Sun, 16 Aug 2020 14:57:54 -0400 Subject: [PATCH 12/32] Update ignite/metrics/precision.py Co-authored-by: vfdev --- ignite/metrics/precision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/metrics/precision.py b/ignite/metrics/precision.py index b8d920d61964..bd79916eb0cd 100644 --- a/ignite/metrics/precision.py +++ b/ignite/metrics/precision.py @@ -54,7 +54,7 @@ def compute(self) -> Union[torch.Tensor, float]: is_scalar = not isinstance(self._positives, torch.Tensor) or self._positives.ndim == 0 if is_scalar and self._positives == 0: raise NotComputableError( - "{} must have at least one example before" " 
it can be computed.".format(self.__class__.__name__) + "{} must have at least one example before it can be computed.".format(self.__class__.__name__) ) if not (self._type == "multilabel" and not self._average): From b84226bacce77d997f81b6263cbc7025c0510ef9 Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Sun, 16 Aug 2020 15:20:10 -0400 Subject: [PATCH 13/32] remove Optional type from metric devices since default is cpu --- ignite/metrics/accumulation.py | 8 ++++---- ignite/metrics/accuracy.py | 6 +++--- ignite/metrics/confusion_matrix.py | 2 +- ignite/metrics/fbeta.py | 2 +- ignite/metrics/frequency.py | 2 +- ignite/metrics/loss.py | 4 ++-- ignite/metrics/mean_pairwise_distance.py | 4 ++-- ignite/metrics/metric.py | 6 ++---- ignite/metrics/precision.py | 6 +++--- ignite/metrics/recall.py | 4 ++-- ignite/metrics/top_k_categorical_accuracy.py | 7 ++----- 11 files changed, 23 insertions(+), 28 deletions(-) diff --git a/ignite/metrics/accumulation.py b/ignite/metrics/accumulation.py index e72debfa98c3..0de2ee469cc7 100644 --- a/ignite/metrics/accumulation.py +++ b/ignite/metrics/accumulation.py @@ -1,5 +1,5 @@ import numbers -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Union import torch @@ -41,7 +41,7 @@ def __init__( self, op: Callable, output_transform: Callable = lambda x: x, - device: Optional[Union[str, torch.device]] = torch.device("cpu"), + device: Union[str, torch.device] = torch.device("cpu"), ): if not callable(op): raise TypeError("Argument op should be a callable, but given {}".format(type(op))) @@ -119,7 +119,7 @@ class Average(VariableAccumulation): """ def __init__( - self, output_transform: Callable = lambda x: x, device: Optional[Union[str, torch.device]] = torch.device("cpu") + self, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu") ): def _mean_op(a, x): if isinstance(x, torch.Tensor) and x.ndim > 1: @@ -165,7 +165,7 @@ class GeometricAverage(VariableAccumulation): """ def __init__( - self, output_transform: Callable = lambda x: x, device: Optional[Union[str, torch.device]] = torch.device("cpu") + self, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu") ): def _geom_op(a: torch.Tensor, x: Union[Any, numbers.Number, torch.Tensor]) -> torch.Tensor: if not isinstance(x, torch.Tensor): diff --git a/ignite/metrics/accuracy.py b/ignite/metrics/accuracy.py index c79d93b79c64..52f897e7aedd 100644 --- a/ignite/metrics/accuracy.py +++ b/ignite/metrics/accuracy.py @@ -1,4 +1,4 @@ -from typing import Callable, Optional, Sequence, Union +from typing import Callable, Sequence, Union import torch @@ -13,7 +13,7 @@ def __init__( self, output_transform: Callable = lambda x: x, is_multilabel: bool = False, - device: Optional[Union[str, torch.device]] = torch.device("cpu"), + device: Union[str, torch.device] = torch.device("cpu"), ): self._is_multilabel = is_multilabel self._type = None @@ -130,7 +130,7 @@ def __init__( self, output_transform: Callable = lambda x: x, is_multilabel: bool = False, - device: Optional[Union[str, torch.device]] = torch.device("cpu"), + device: Union[str, torch.device] = torch.device("cpu"), ): self._num_correct = None self._num_examples = None diff --git a/ignite/metrics/confusion_matrix.py b/ignite/metrics/confusion_matrix.py index ab9b188a6880..574e7e6a1105 100644 --- a/ignite/metrics/confusion_matrix.py +++ b/ignite/metrics/confusion_matrix.py @@ -44,7 +44,7 @@ def __init__( num_classes: int, average: Optional[str] = None, 
output_transform: Callable = lambda x: x, - device: Optional[Union[str, torch.device]] = torch.device("cpu"), + device: Union[str, torch.device] = torch.device("cpu"), ): if average is not None and average not in ("samples", "recall", "precision"): raise ValueError("Argument average can None or one of 'samples', 'recall', 'precision'") diff --git a/ignite/metrics/fbeta.py b/ignite/metrics/fbeta.py index 1383b520364d..6af40b7234d0 100644 --- a/ignite/metrics/fbeta.py +++ b/ignite/metrics/fbeta.py @@ -15,7 +15,7 @@ def Fbeta( precision: Optional[Precision] = None, recall: Optional[Recall] = None, output_transform: Optional[Callable] = None, - device: Optional[Union[str, torch.device]] = torch.device("cpu"), + device: Union[str, torch.device] = torch.device("cpu"), ) -> MetricsLambda: """Calculates F-beta score diff --git a/ignite/metrics/frequency.py b/ignite/metrics/frequency.py index fa91e36df0b1..447cbbf63fd8 100644 --- a/ignite/metrics/frequency.py +++ b/ignite/metrics/frequency.py @@ -38,7 +38,7 @@ class Frequency(Metric): """ def __init__( - self, output_transform: Callable = lambda x: x, device: Optional[Union[str, torch.device]] = torch.device("cpu") + self, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu") ): self._timer = None self._acc = None diff --git a/ignite/metrics/loss.py b/ignite/metrics/loss.py index c6fb85171894..b0e7d1955fd7 100644 --- a/ignite/metrics/loss.py +++ b/ignite/metrics/loss.py @@ -1,4 +1,4 @@ -from typing import Callable, Optional, Sequence, Union +from typing import Callable, Sequence, Union import torch @@ -37,7 +37,7 @@ def __init__( loss_fn: Callable, output_transform: Callable = lambda x: x, batch_size: Callable = lambda x: len(x), - device: Optional[Union[str, torch.device]] = torch.device("cpu"), + device: Union[str, torch.device] = torch.device("cpu"), ): super(Loss, self).__init__(output_transform, device=device) self._loss_fn = loss_fn diff --git a/ignite/metrics/mean_pairwise_distance.py b/ignite/metrics/mean_pairwise_distance.py index 1edbf6009d3a..01a1eb7386e5 100644 --- a/ignite/metrics/mean_pairwise_distance.py +++ b/ignite/metrics/mean_pairwise_distance.py @@ -1,4 +1,4 @@ -from typing import Callable, Optional, Sequence, Union +from typing import Callable, Sequence, Union import torch from torch.nn.functional import pairwise_distance @@ -21,7 +21,7 @@ def __init__( p: int = 2, eps: float = 1e-6, output_transform: Callable = lambda x: x, - device: Optional[Union[str, torch.device]] = torch.device("cpu"), + device: Union[str, torch.device] = torch.device("cpu"), ): super(MeanPairwiseDistance, self).__init__(output_transform, device=device) self._p = p diff --git a/ignite/metrics/metric.py b/ignite/metrics/metric.py index 89a62da2b259..a12a09fbb4dc 100644 --- a/ignite/metrics/metric.py +++ b/ignite/metrics/metric.py @@ -2,7 +2,7 @@ from abc import ABCMeta, abstractmethod from collections.abc import Mapping from functools import wraps -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Union import torch @@ -129,9 +129,7 @@ class Metric(metaclass=ABCMeta): _required_output_keys = ("y_pred", "y") def __init__( - self, - output_transform: Callable = lambda x: x, - device: Optional[Union[str, torch.device]] = torch.device("cpu"), + self, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu"), ): self._output_transform = output_transform diff --git a/ignite/metrics/precision.py b/ignite/metrics/precision.py index 
bd79916eb0cd..7754c47f47cc 100644 --- a/ignite/metrics/precision.py +++ b/ignite/metrics/precision.py @@ -1,5 +1,5 @@ import warnings -from typing import Callable, Optional, Sequence, Union +from typing import Callable, Sequence, Union import torch @@ -18,7 +18,7 @@ def __init__( output_transform: Callable = lambda x: x, average: bool = False, is_multilabel: bool = False, - device: Optional[Union[str, torch.device]] = torch.device("cpu"), + device: Union[str, torch.device] = torch.device("cpu"), ): if idist.get_world_size() > 1: if (not average) and is_multilabel: @@ -131,7 +131,7 @@ def __init__( output_transform: Callable = lambda x: x, average: bool = False, is_multilabel: bool = False, - device: Optional[Union[str, torch.device]] = torch.device("cpu"), + device: Union[str, torch.device] = torch.device("cpu"), ): super(Precision, self).__init__( output_transform=output_transform, average=average, is_multilabel=is_multilabel, device=device diff --git a/ignite/metrics/recall.py b/ignite/metrics/recall.py index c3378d3f3f6a..d446583f6bd6 100644 --- a/ignite/metrics/recall.py +++ b/ignite/metrics/recall.py @@ -1,4 +1,4 @@ -from typing import Callable, Optional, Sequence, Union +from typing import Callable, Sequence, Union import torch @@ -69,7 +69,7 @@ def __init__( output_transform: Callable = lambda x: x, average: bool = False, is_multilabel: bool = False, - device: Optional[Union[str, torch.device]] = torch.device("cpu"), + device: Union[str, torch.device] = torch.device("cpu"), ): super(Recall, self).__init__( output_transform=output_transform, average=average, is_multilabel=is_multilabel, device=device diff --git a/ignite/metrics/top_k_categorical_accuracy.py b/ignite/metrics/top_k_categorical_accuracy.py index 8c41265d81e6..abf7996d3047 100644 --- a/ignite/metrics/top_k_categorical_accuracy.py +++ b/ignite/metrics/top_k_categorical_accuracy.py @@ -1,4 +1,4 @@ -from typing import Callable, Optional, Sequence, Union +from typing import Callable, Sequence, Union import torch @@ -16,10 +16,7 @@ class TopKCategoricalAccuracy(Metric): """ def __init__( - self, - k=5, - output_transform: Callable = lambda x: x, - device: Optional[Union[str, torch.device]] = torch.device("cpu"), + self, k=5, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu"), ): super(TopKCategoricalAccuracy, self).__init__(output_transform, device=device) self._k = k From 685c23bc21ba3b32eb0732c3bdb2f314b75f7ae4 Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Sun, 16 Aug 2020 15:30:39 -0400 Subject: [PATCH 14/32] add comment explaining lack of detach in accuracy metrics --- ignite/metrics/accuracy.py | 1 + ignite/metrics/top_k_categorical_accuracy.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/ignite/metrics/accuracy.py b/ignite/metrics/accuracy.py index 52f897e7aedd..79ba5e4c1d80 100644 --- a/ignite/metrics/accuracy.py +++ b/ignite/metrics/accuracy.py @@ -161,6 +161,7 @@ def update(self, output: Sequence[torch.Tensor]) -> None: y = torch.transpose(y, 1, last_dim - 1).reshape(-1, num_classes) correct = torch.all(y == y_pred.type_as(y), dim=-1) + # Don't need to detach here because torch.eq is not differentiable, so the computation graph is detached anyway. 
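+ # (The == comparison above yields a bool tensor, and bool tensors never carry requires_grad, so the summed count is already outside the autograd graph.)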
self._num_correct += torch.sum(correct).to(self._device) self._num_examples += correct.shape[0] diff --git a/ignite/metrics/top_k_categorical_accuracy.py b/ignite/metrics/top_k_categorical_accuracy.py index abf7996d3047..aa64f5b45319 100644 --- a/ignite/metrics/top_k_categorical_accuracy.py +++ b/ignite/metrics/top_k_categorical_accuracy.py @@ -32,6 +32,8 @@ def update(self, output: Sequence) -> None: sorted_indices = torch.topk(y_pred, self._k, dim=1)[1] expanded_y = y.view(-1, 1).expand(-1, self._k) correct = torch.sum(torch.eq(sorted_indices, expanded_y), dim=1) + + # Don't need to detach here because torch.eq is not differentiable, so the computation graph is detached anyway. self._num_correct += torch.sum(correct).to(self._device) self._num_examples += correct.shape[0] From 0b4337d09deb829c21489f6308b292e772af3dc5 Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Sun, 16 Aug 2020 20:59:56 -0400 Subject: [PATCH 15/32] update docstrings and docs --- docs/source/metrics.rst | 15 ++++++++++----- ignite/metrics/accumulation.py | 13 +++++++++---- ignite/metrics/accuracy.py | 4 +++- ignite/metrics/confusion_matrix.py | 4 +++- ignite/metrics/fbeta.py | 4 +++- ignite/metrics/loss.py | 4 +++- ignite/metrics/metric.py | 4 +++- ignite/metrics/precision.py | 4 +++- ignite/metrics/recall.py | 4 +++- 9 files changed, 40 insertions(+), 16 deletions(-) diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index bc2da2e12cba..ba4585802dd2 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -120,15 +120,15 @@ specific condition (e.g. ignore user-defined classes): class CustomAccuracy(Metric): - def __init__(self, ignored_class, output_transform=lambda x: x): + def __init__(self, ignored_class, output_transform=lambda x: x, device="cpu"): self.ignored_class = ignored_class self._num_correct = None self._num_examples = None - super(CustomAccuracy, self).__init__(output_transform=output_transform) + super(CustomAccuracy, self).__init__(output_transform=output_transform, device=device) @reinit__is_reduced def reset(self): - self._num_correct = 0 + self._num_correct = torch.tensor(0, device=self._device) self._num_examples = 0 super(CustomAccuracy, self).reset() @@ -144,14 +144,14 @@ specific condition (e.g. ignore user-defined classes): indices = indices[mask] correct = torch.eq(indices, y).view(-1) - self._num_correct += torch.sum(correct).item() + self._num_correct += torch.sum(correct).to(self._device) self._num_examples += correct.shape[0] @sync_all_reduce("_num_examples", "_num_correct") def compute(self): if self._num_examples == 0: raise NotComputableError('CustomAccuracy must have at least one example before it can be computed.') - return self._num_correct / self._num_examples + return self._num_correct.item() / self._num_examples We imported necessary classes as :class:`~ignite.metrics.Metric`, :class:`~ignite.exceptions.NotComputableError` and @@ -159,6 +159,11 @@ decorators to adapt the metric for distributed setting. In ``reset`` method, we and ``_num_examples`` which are used to compute the custom metric. In ``updated`` method we define how to update the internal variables. And finally in ``compute`` method, we compute metric value. +Notice that ``_num_correct`` is a tensor, since in ``update`` we accumulate tensor values. ``_num_examples`` is a python +scalar since we accumulate normal integers. For differentiable metrics, you must detach the accumulated values before +adding them to the internal variables. 
Accuracy is not differentiable (specifically the ``torch.eq`` call), so it +is implicitly detached from the computation graph. + We can check this implementation in a simple case: .. code-block:: python diff --git a/ignite/metrics/accumulation.py b/ignite/metrics/accumulation.py index 0de2ee469cc7..1399484459e6 100644 --- a/ignite/metrics/accumulation.py +++ b/ignite/metrics/accumulation.py @@ -31,7 +31,9 @@ class VariableAccumulation(Metric): :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the form expected by the metric. This can be useful if, for example, you have a multi-output model and you want to compute the metric with respect to one of the outputs. - device (str of torch.device, optional): optional device specification for internal storage. + device (str or torch.device): specifies which device updates are accumulated on. Setting the metric's + device to be the same as your `update` arguments ensures the `update` method is non-blocking. By + default, CPU. """ @@ -114,8 +116,9 @@ class Average(VariableAccumulation): :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the form expected by the metric. This can be useful if, for example, you have a multi-output model and you want to compute the metric with respect to one of the outputs. - device (str of torch.device, optional): optional device specification for internal storage. - + device (str or torch.device): specifies which device updates are accumulated on. Setting the metric's + device to be the same as your `update` arguments ensures the `update` method is non-blocking. By + default, CPU. """ def __init__( @@ -160,7 +163,9 @@ class GeometricAverage(VariableAccumulation): :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the form expected by the metric. This can be useful if, for example, you have a multi-output model and you want to compute the metric with respect to one of the outputs. - device (str of torch.device, optional): optional device specification for internal storage. + device (str or torch.device): specifies which device updates are accumulated on. Setting the metric's + device to be the same as your `update` arguments ensures the `update` method is non-blocking. By + default, CPU. """ diff --git a/ignite/metrics/accuracy.py b/ignite/metrics/accuracy.py index 79ba5e4c1d80..71d006dfbcc0 100644 --- a/ignite/metrics/accuracy.py +++ b/ignite/metrics/accuracy.py @@ -122,7 +122,9 @@ def thresholded_output_transform(output): form expected by the metric. This can be useful if, for example, you have a multi-output model and you want to compute the metric with respect to one of the outputs. is_multilabel (bool, optional): flag to use in multilabel case. By default, False. - device (str of torch.device, optional): unused argument. + device (str or torch.device): specifies which device updates are accumulated on. Setting the metric's + device to be the same as your `update` arguments ensures the `update` method is non-blocking. By + default, CPU. """ diff --git a/ignite/metrics/confusion_matrix.py b/ignite/metrics/confusion_matrix.py index 574e7e6a1105..b22f18b8e0e5 100644 --- a/ignite/metrics/confusion_matrix.py +++ b/ignite/metrics/confusion_matrix.py @@ -30,7 +30,9 @@ class ConfusionMatrix(Metric): :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the form expected by the metric. This can be useful if, for example, you have a multi-output model and you want to compute the metric with respect to one of the outputs. 
- device (str of torch.device, optional): optional device specification for internal storage. + device (str or torch.device): specifies which device updates are accumulated on. Setting the metric's + device to be the same as your `update` arguments ensures the `update` method is non-blocking. By + default, CPU. Note: In case of the targets `y` in `(batch_size, ...)` format, target indices between 0 and `num_classes` only diff --git a/ignite/metrics/fbeta.py b/ignite/metrics/fbeta.py index 6af40b7234d0..e9f8fbb3f5e7 100644 --- a/ignite/metrics/fbeta.py +++ b/ignite/metrics/fbeta.py @@ -28,7 +28,9 @@ def Fbeta( output_transform (callable, optional): a callable that is used to transform the :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the form expected by the metric. It is used only if precision or recall are not provided. - device (str of torch.device, optional): optional device specification for internal storage. + device (str or torch.device): specifies which device updates are accumulated on. Setting the metric's + device to be the same as your `update` arguments ensures the `update` method is non-blocking. By + default, CPU. Returns: MetricsLambda, F-beta metric diff --git a/ignite/metrics/loss.py b/ignite/metrics/loss.py index b0e7d1955fd7..4bf54fcba072 100644 --- a/ignite/metrics/loss.py +++ b/ignite/metrics/loss.py @@ -26,7 +26,9 @@ class Loss(Metric): keywords arguments. If extra keywords arguments are provided they are passed to `loss_fn`. batch_size (callable): a callable taking a target tensor that returns the first dimension size (usually the batch size). - device (str of torch.device, optional): unused argument. + device (str or torch.device): specifies which device updates are accumulated on. Setting the + metric's device to be the same as your `update` arguments ensures the `update` method is + non-blocking. By default, CPU. """ diff --git a/ignite/metrics/metric.py b/ignite/metrics/metric.py index a12a09fbb4dc..332b72a243e6 100644 --- a/ignite/metrics/metric.py +++ b/ignite/metrics/metric.py @@ -122,7 +122,9 @@ class Metric(metaclass=ABCMeta): form expected by the metric. This can be useful if, for example, you have a multi-output model and you want to compute the metric with respect to one of the outputs. By default, metrics require the output as ``(y_pred, y)`` or ``{'y_pred': y_pred, 'y': y}``. - device (str of torch.device, optional): optional device specification for internal storage. + device (str or torch.device): specifies which device updates are accumulated on. Setting the + metric's device to be the same as your `update` arguments ensures the `update` method is + non-blocking. By default, CPU. """ diff --git a/ignite/metrics/precision.py b/ignite/metrics/precision.py index 7754c47f47cc..4dbdf99ab8dd 100644 --- a/ignite/metrics/precision.py +++ b/ignite/metrics/precision.py @@ -122,7 +122,9 @@ def thresholded_output_transform(output): in multiclass case), otherwise, returns a tensor with the precision (for each class in multiclass case). is_multilabel (bool, optional) flag to use in multilabel case. By default, value is False. If True, average parameter should be True and the average is computed across samples, instead of classes. - device (str of torch.device, optional): unused argument. + device (str or torch.device): specifies which device updates are accumulated on. Setting the metric's + device to be the same as your `update` arguments ensures the `update` method is non-blocking. By + default, CPU. 
""" diff --git a/ignite/metrics/recall.py b/ignite/metrics/recall.py index d446583f6bd6..dccb29154aec 100644 --- a/ignite/metrics/recall.py +++ b/ignite/metrics/recall.py @@ -60,7 +60,9 @@ def thresholded_output_transform(output): in multiclass case), otherwise, returns a tensor with the precision (for each class in multiclass case). is_multilabel (bool, optional) flag to use in multilabel case. By default, value is False. If True, average parameter should be True and the average is computed across samples, instead of classes. - device (str of torch.device, optional): unused argument. + device (str or torch.device): specifies which device updates are accumulated on. Setting the metric's + device to be the same as your `update` arguments ensures the `update` method is non-blocking. By + default, CPU. """ From b2fa21319cb3495be9f0a2c52506951a86f53a47 Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Mon, 17 Aug 2020 14:44:58 -0400 Subject: [PATCH 16/32] Update ignite/metrics/accumulation.py Co-authored-by: vfdev --- ignite/metrics/accumulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/metrics/accumulation.py b/ignite/metrics/accumulation.py index 1399484459e6..ec244cfeef40 100644 --- a/ignite/metrics/accumulation.py +++ b/ignite/metrics/accumulation.py @@ -32,7 +32,7 @@ class VariableAccumulation(Metric): form expected by the metric. This can be useful if, for example, you have a multi-output model and you want to compute the metric with respect to one of the outputs. device (str or torch.device): specifies which device updates are accumulated on. Setting the metric's - device to be the same as your `update` arguments ensures the `update` method is non-blocking. By + device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. """ From 90e0e9a3faf474c037c832fee903152d4003a388 Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Mon, 17 Aug 2020 14:58:40 -0400 Subject: [PATCH 17/32] Update ignite/metrics/accumulation.py Co-authored-by: vfdev --- ignite/metrics/accumulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/metrics/accumulation.py b/ignite/metrics/accumulation.py index ec244cfeef40..1da5b27735b6 100644 --- a/ignite/metrics/accumulation.py +++ b/ignite/metrics/accumulation.py @@ -117,7 +117,7 @@ class Average(VariableAccumulation): form expected by the metric. This can be useful if, for example, you have a multi-output model and you want to compute the metric with respect to one of the outputs. device (str or torch.device): specifies which device updates are accumulated on. Setting the metric's - device to be the same as your `update` arguments ensures the `update` method is non-blocking. By + device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. """ From 6c1fda47baaa55a8ae35f040965f1f0d6813474b Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Mon, 17 Aug 2020 14:58:47 -0400 Subject: [PATCH 18/32] Update ignite/metrics/accumulation.py Co-authored-by: vfdev --- ignite/metrics/accumulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/metrics/accumulation.py b/ignite/metrics/accumulation.py index 1da5b27735b6..e2916fa8f932 100644 --- a/ignite/metrics/accumulation.py +++ b/ignite/metrics/accumulation.py @@ -164,7 +164,7 @@ class GeometricAverage(VariableAccumulation): form expected by the metric. 
This can be useful if, for example, you have a multi-output model and you want to compute the metric with respect to one of the outputs. device (str or torch.device): specifies which device updates are accumulated on. Setting the metric's - device to be the same as your `update` arguments ensures the `update` method is non-blocking. By + device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. """ From c510e102b2f07391f077a1292de906d16deb2ffc Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Mon, 17 Aug 2020 14:58:55 -0400 Subject: [PATCH 19/32] Update ignite/metrics/accuracy.py Co-authored-by: vfdev --- ignite/metrics/accuracy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/metrics/accuracy.py b/ignite/metrics/accuracy.py index 71d006dfbcc0..fe53db66f828 100644 --- a/ignite/metrics/accuracy.py +++ b/ignite/metrics/accuracy.py @@ -123,7 +123,7 @@ def thresholded_output_transform(output): you want to compute the metric with respect to one of the outputs. is_multilabel (bool, optional): flag to use in multilabel case. By default, False. device (str or torch.device): specifies which device updates are accumulated on. Setting the metric's - device to be the same as your `update` arguments ensures the `update` method is non-blocking. By + device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. """ From d5d4854cc07fd868f094992f34cedd7f3dad3e7f Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Mon, 17 Aug 2020 14:59:02 -0400 Subject: [PATCH 20/32] Update ignite/metrics/fbeta.py Co-authored-by: vfdev --- ignite/metrics/fbeta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/metrics/fbeta.py b/ignite/metrics/fbeta.py index e9f8fbb3f5e7..eb6776a17eb8 100644 --- a/ignite/metrics/fbeta.py +++ b/ignite/metrics/fbeta.py @@ -29,7 +29,7 @@ def Fbeta( :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the form expected by the metric. It is used only if precision or recall are not provided. device (str or torch.device): specifies which device updates are accumulated on. Setting the metric's - device to be the same as your `update` arguments ensures the `update` method is non-blocking. By + device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. Returns: From 39515b7960269793e7fc42672def2a230a053977 Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Mon, 17 Aug 2020 14:59:09 -0400 Subject: [PATCH 21/32] Update ignite/metrics/loss.py Co-authored-by: vfdev --- ignite/metrics/loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/metrics/loss.py b/ignite/metrics/loss.py index 4bf54fcba072..305ad7c2cb1e 100644 --- a/ignite/metrics/loss.py +++ b/ignite/metrics/loss.py @@ -27,7 +27,7 @@ class Loss(Metric): batch_size (callable): a callable taking a target tensor that returns the first dimension size (usually the batch size). device (str or torch.device): specifies which device updates are accumulated on. Setting the - metric's device to be the same as your `update` arguments ensures the `update` method is + metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. 
""" From 3c498719b5f5e44ddcc3134b0da8d403e5d3acec Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Mon, 17 Aug 2020 14:59:19 -0400 Subject: [PATCH 22/32] Update ignite/metrics/metric.py Co-authored-by: vfdev --- ignite/metrics/metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/metrics/metric.py b/ignite/metrics/metric.py index 332b72a243e6..22858fe1fe89 100644 --- a/ignite/metrics/metric.py +++ b/ignite/metrics/metric.py @@ -123,7 +123,7 @@ class Metric(metaclass=ABCMeta): you want to compute the metric with respect to one of the outputs. By default, metrics require the output as ``(y_pred, y)`` or ``{'y_pred': y_pred, 'y': y}``. device (str or torch.device): specifies which device updates are accumulated on. Setting the - metric's device to be the same as your `update` arguments ensures the `update` method is + metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. """ From 6de10ddd40dc9095b8f49d46366e7df13877810f Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Mon, 17 Aug 2020 14:59:24 -0400 Subject: [PATCH 23/32] Update ignite/metrics/precision.py Co-authored-by: vfdev --- ignite/metrics/precision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/metrics/precision.py b/ignite/metrics/precision.py index 4dbdf99ab8dd..055f11c9c391 100644 --- a/ignite/metrics/precision.py +++ b/ignite/metrics/precision.py @@ -123,7 +123,7 @@ def thresholded_output_transform(output): is_multilabel (bool, optional) flag to use in multilabel case. By default, value is False. If True, average parameter should be True and the average is computed across samples, instead of classes. device (str or torch.device): specifies which device updates are accumulated on. Setting the metric's - device to be the same as your `update` arguments ensures the `update` method is non-blocking. By + device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. """ From eca0bc37685d32603730563ad76719e0c39b511c Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Mon, 17 Aug 2020 14:59:31 -0400 Subject: [PATCH 24/32] Update ignite/metrics/recall.py Co-authored-by: vfdev --- ignite/metrics/recall.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ignite/metrics/recall.py b/ignite/metrics/recall.py index dccb29154aec..a094557249c5 100644 --- a/ignite/metrics/recall.py +++ b/ignite/metrics/recall.py @@ -61,7 +61,7 @@ def thresholded_output_transform(output): is_multilabel (bool, optional) flag to use in multilabel case. By default, value is False. If True, average parameter should be True and the average is computed across samples, instead of classes. device (str or torch.device): specifies which device updates are accumulated on. Setting the metric's - device to be the same as your `update` arguments ensures the `update` method is non-blocking. By + device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. 
""" From ad7082ed113e11e9649686a2661060f9eb2c4cf3 Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Mon, 17 Aug 2020 15:00:17 -0400 Subject: [PATCH 25/32] add comment explaining lack of detach in metrics docs --- docs/source/metrics.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index ba4585802dd2..47b7f9f3aab4 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -144,6 +144,8 @@ specific condition (e.g. ignore user-defined classes): indices = indices[mask] correct = torch.eq(indices, y).view(-1) + # We must detach tensors before adding them to the internal variables. In this case, torch.eq is not + # differentiable, so the computation graph is detached implicitly self._num_correct += torch.sum(correct).to(self._device) self._num_examples += correct.shape[0] From 90b5b858a5862086f9783c8b082f90e0f0f0140d Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Mon, 17 Aug 2020 16:16:01 -0400 Subject: [PATCH 26/32] support device argument for running_average --- ignite/metrics/running_average.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ignite/metrics/running_average.py b/ignite/metrics/running_average.py index 2094e3866ada..353c00bbc27f 100644 --- a/ignite/metrics/running_average.py +++ b/ignite/metrics/running_average.py @@ -20,7 +20,11 @@ class RunningAverage(Metric): corresponds the output of process function. Otherwise it should be None. epoch_bound (boolean, optional): whether the running average should be reset after each epoch (defaults to True). - device (str of torch.device, optional): unused argument. + device (str or torch.device, optional): specifies which device updates are accumulated on. Should be + None when `src` is an instance of :class:`~ignite.metrics.Metric`, as the running average will + use the `src`'s device. Otherwise, defaults to CPU. Only applicable when the computed value + from the metric is a tensor. 
+
     Examples:
@@ -63,6 +67,7 @@ def __init__(
             self.src = src
             self._get_src_value = self._get_metric_value
             self.iteration_completed = self._metric_iteration_completed
+            device = src._device
         else:
             if output_transform is None:
                 raise ValueError(
@@ -71,6 +76,8 @@ def __init__(
                 )
             self._get_src_value = self._get_output_value
             self.update = self._output_update
+            if device is None:
+                device = torch.device("cpu")
 
         self.alpha = alpha
         self.epoch_bound = epoch_bound
@@ -118,5 +125,5 @@ def _metric_iteration_completed(self, engine: Engine) -> None:
     @reinit__is_reduced
     def _output_update(self, output: Union[torch.Tensor, float]) -> None:
         if isinstance(output, torch.Tensor):
-            output = output.detach().clone()
+            output = output.detach().to(self._device, copy=True)
         self.src = output

From 3481da162bf83a810a4985d8b90f7f2367d156e8 Mon Sep 17 00:00:00 2001
From: Nicholas Vadivelu
Date: Mon, 17 Aug 2020 22:30:36 -0400
Subject: [PATCH 27/32] update support for device argument for accumulation

---
 ignite/metrics/accumulation.py            |   3 +
 tests/ignite/metrics/test_accumulation.py | 183 +++++++++++++---------
 2 files changed, 114 insertions(+), 72 deletions(-)

diff --git a/ignite/metrics/accumulation.py b/ignite/metrics/accumulation.py
index e2916fa8f932..5a000c38911e 100644
--- a/ignite/metrics/accumulation.py
+++ b/ignite/metrics/accumulation.py
@@ -72,6 +72,9 @@ def update(self, output: Union[Any, torch.Tensor, numbers.Number]) -> None:
             output = output.to(self._device)
 
         self.accumulator = self._op(self.accumulator, output)
+        if isinstance(self.accumulator, torch.Tensor):
+            self.accumulator = self.accumulator.to(self._device)
+
         if hasattr(output, "shape"):
             self.num_examples += output.shape[0] if len(output.shape) > 1 else 1
         else:
diff --git a/tests/ignite/metrics/test_accumulation.py b/tests/ignite/metrics/test_accumulation.py
index abd71f3ad1df..9aaf38bf9844 100644
--- a/tests/ignite/metrics/test_accumulation.py
+++ b/tests/ignite/metrics/test_accumulation.py
@@ -198,105 +198,120 @@ def compute_mean_std(engine, batch):
 
 
 def _test_distrib_variable_accumulation(device):
+    def _test(metric_device):
+        mean_var = VariableAccumulation(lambda a, x: a + x, device=metric_device)
+        y_true = torch.rand(100, device=device, dtype=torch.float64)
-    mean_var = VariableAccumulation(lambda a, x: a + x, device=device)
-    y_true = torch.rand(100, device=device, dtype=torch.float64)
 
+        for y in y_true:
+            mean_var.update(y)
-    for y in y_true:
-        mean_var.update(y)
 
+        y_true = idist.all_reduce(y_true)
+        a, n = mean_var.compute()
+        assert a.item() == pytest.approx(y_true.sum().item())
+        assert n == len(y_true) * idist.get_world_size()
+        # check if call compute twice
+        a, n = mean_var.compute()
+        assert a.item() == pytest.approx(y_true.sum().item())
+        assert n == len(y_true) * idist.get_world_size()
-    y_true = idist.all_reduce(y_true)
-    a, n = mean_var.compute()
-    assert a.item() == pytest.approx(y_true.sum().item())
-    assert n == len(y_true) * idist.get_world_size()
-    # check if call compute twice
-    a, n = mean_var.compute()
-    assert a.item() == pytest.approx(y_true.sum().item())
-    assert n == len(y_true) * idist.get_world_size()
 
+        mean_var = VariableAccumulation(lambda a, x: a + x, device=metric_device)
+        y_true = torch.rand(50, 10, device=device, dtype=torch.float64)
-    mean_var = VariableAccumulation(lambda a, x: a + x, device=device)
-    y_true = torch.rand(50, 10, device=device, dtype=torch.float64)
 
+        for y in y_true:
+            mean_var.update(y)
-    for y in y_true:
-        mean_var.update(y)
 
+        y_true = idist.all_reduce(y_true)
+        a, n = mean_var.compute()
+        assert n == len(y_true) * idist.get_world_size()
+        np.testing.assert_almost_equal(a.cpu().numpy(), y_true.sum(dim=0).cpu().numpy(), decimal=4)
+        a, n = mean_var.compute()
+        assert n == len(y_true) * idist.get_world_size()
+        np.testing.assert_almost_equal(a.cpu().numpy(), y_true.sum(dim=0).cpu().numpy(), decimal=4)
-    y_true = idist.all_reduce(y_true)
-    a, n = mean_var.compute()
-    assert n == len(y_true) * idist.get_world_size()
-    np.testing.assert_almost_equal(a.cpu().numpy(), y_true.sum(dim=0).cpu().numpy(), decimal=4)
-    a, n = mean_var.compute()
-    assert n == len(y_true) * idist.get_world_size()
-    np.testing.assert_almost_equal(a.cpu().numpy(), y_true.sum(dim=0).cpu().numpy(), decimal=4)
 
+    # check multiple random inputs as random exact occurrences are rare
+    for _ in range(3):
+        _test("cpu")
+        _test(idist.device())
 
 
 def _test_distrib_average(device):
+    def _test(metric_device):
+        with pytest.raises(NotComputableError):
+            v = Average(device=metric_device)
+            v.compute()
-    with pytest.raises(NotComputableError):
-        v = Average(device=device)
-        v.compute()
 
+        mean_var = Average(device=metric_device)
+        y_true = torch.rand(100, dtype=torch.float64) + torch.randint(0, 10, size=(100,)).double()
+        y_true = y_true.to(device)
-    mean_var = Average(device=device)
-    y_true = torch.rand(100, dtype=torch.float64) + torch.randint(0, 10, size=(100,)).double()
-    y_true = y_true.to(device)
 
+        for y in y_true:
+            mean_var.update(y)
-    for y in y_true:
-        mean_var.update(y)
 
+        m = mean_var.compute()
-    m = mean_var.compute()
 
+        y_true = idist.all_reduce(y_true)
+        assert m.item() == pytest.approx(y_true.mean().item() / idist.get_world_size())
-    y_true = idist.all_reduce(y_true)
-    assert m.item() == pytest.approx(y_true.mean().item() / idist.get_world_size())
 
+        mean_var = Average(device=metric_device)
+        y_true = torch.rand(100, 10, dtype=torch.float64) + torch.randint(0, 10, size=(100, 10)).double()
+        y_true = y_true.to(device)
-    mean_var = Average(device=device)
-    y_true = torch.rand(100, 10, dtype=torch.float64) + torch.randint(0, 10, size=(100, 10)).double()
-    y_true = y_true.to(device)
 
+        for y in y_true:
+            mean_var.update(y)
-    for y in y_true:
-        mean_var.update(y)
 
+        m = mean_var.compute()
-    m = mean_var.compute()
 
+        y_true = idist.all_reduce(y_true)
+        np.testing.assert_almost_equal(
+            m.cpu().numpy(), y_true.mean(dim=0).cpu().numpy() / idist.get_world_size(), decimal=5
+        )
-    y_true = idist.all_reduce(y_true)
-    np.testing.assert_almost_equal(
-        m.cpu().numpy(), y_true.mean(dim=0).cpu().numpy() / idist.get_world_size(), decimal=5
-    )
 
+    # check multiple random inputs as random exact occurrences are rare
+    for _ in range(3):
+        _test("cpu")
+        _test(idist.device())
 
 
 def _test_distrib_geom_average(device):
+    def _test(metric_device):
+        with pytest.raises(NotComputableError):
+            v = GeometricAverage(device=metric_device)
+            v.compute()
-    with pytest.raises(NotComputableError):
-        v = GeometricAverage(device=device)
-        v.compute()
 
+        mean_var = GeometricAverage(device=metric_device)
+        y_true = torch.rand(100, dtype=torch.float64) + torch.randint(0, 10, size=(100,)).double()
+        y_true = y_true.to(device)
-    mean_var = GeometricAverage(device=device)
-    y_true = torch.rand(100, dtype=torch.float64) + torch.randint(0, 10, size=(100,)).double()
-    y_true = y_true.to(device)
 
+        for y in y_true:
+            mean_var.update(y)
-    for y in y_true:
-        mean_var.update(y)
 
+        m = mean_var.compute()
+        log_y_true = torch.log(y_true)
+        log_y_true = idist.all_reduce(log_y_true)
+        assert m.item() == pytest.approx(torch.exp(log_y_true.mean(dim=0) / idist.get_world_size()).item())
-    m = mean_var.compute()
-    log_y_true = torch.log(y_true)
-    log_y_true = idist.all_reduce(log_y_true)
-    assert m.item() == pytest.approx(torch.exp(log_y_true.mean(dim=0) / idist.get_world_size()).item())
 
+        mean_var = GeometricAverage(device=metric_device)
+        y_true = torch.rand(100, 10, dtype=torch.float64) + torch.randint(0, 10, size=(100, 10)).double()
+        y_true = y_true.to(device)
-    mean_var = GeometricAverage(device=device)
-    y_true = torch.rand(100, 10, dtype=torch.float64) + torch.randint(0, 10, size=(100, 10)).double()
-    y_true = y_true.to(device)
 
+        for y in y_true:
+            mean_var.update(y)
-    for y in y_true:
-        mean_var.update(y)
 
+        m = mean_var.compute()
+        log_y_true = torch.log(y_true)
+        log_y_true = idist.all_reduce(log_y_true)
+        np.testing.assert_almost_equal(
+            m.cpu().numpy(), torch.exp(log_y_true.mean(dim=0) / idist.get_world_size()).cpu().numpy(), decimal=5
+        )
-    m = mean_var.compute()
-    log_y_true = torch.log(y_true)
-    log_y_true = idist.all_reduce(log_y_true)
-    np.testing.assert_almost_equal(
-        m.cpu().numpy(), torch.exp(log_y_true.mean(dim=0) / idist.get_world_size()).cpu().numpy(), decimal=5
-    )
 
+    # check multiple random inputs as random exact occurrences are rare
+    for _ in range(3):
+        _test("cpu")
+        _test(idist.device())
 
 
 def _test_distrib_integration(device):
-    def _test(metric_cls, true_result_fn, tol=1e-5):
+    def _test(metric_cls, true_result_fn, metric_device, tol=1e-5):
 
         size = 100
         custom_variable = 10.0 + 5.0 * torch.rand(size, 12, dtype=torch.float64)
@@ -307,7 +322,7 @@ def update_fn(engine, batch):
 
         engine = Engine(update_fn)
 
-        custom_var_mean = metric_cls(output_transform=lambda output: output[1], device=device)
+        custom_var_mean = metric_cls(output_transform=lambda output: output[1], device=metric_device)
         custom_var_mean.attach(engine, "agg_custom_var")
 
         state = engine.run([0] * size)
@@ -326,7 +341,7 @@ def update_fn(engine, batch):
 
         engine = Engine(update_fn)
 
-        custom_var_mean = metric_cls(output_transform=lambda output: output[1], device=device)
+        custom_var_mean = metric_cls(output_transform=lambda output: output[1], device=metric_device)
         custom_var_mean.attach(engine, "agg_custom_var")
 
         state = engine.run([0] * size)
@@ -342,8 +357,25 @@ def _geom_mean(y_true):
         np_t = log_y_true.cpu().numpy()
         return np.exp(np.mean(np_t, axis=0) / idist.get_world_size())
 
-    _test(Average, _mean)
-    _test(GeometricAverage, _geom_mean, tol=1e-4)
+    for metric_device in ["cpu", idist.device()]:
+        _test(Average, _mean, metric_device)
+        _test(GeometricAverage, _geom_mean, metric_device, tol=1e-4)
+
+
+def _test_distrib_accumulator_device(device):
+
+    for metric_device in [torch.device("cpu"), idist.device()]:
+
+        m = VariableAccumulation(lambda a, x: x, device=metric_device)
+        assert m._device == metric_device
+        assert m.accumulator.device == metric_device, "{}:{} vs {}:{}".format(
+            type(m.accumulator.device), m.accumulator.device, type(metric_device), metric_device
+        )
+
+        m.update(torch.tensor(1, device=device))
+        assert m.accumulator.device == metric_device, "{}:{} vs {}:{}".format(
+            type(m.accumulator.device), m.accumulator.device, type(metric_device), metric_device
+        )
@@ -356,6 +388,7 @@ def test_distrib_gpu(distributed_context_single_node_nccl):
     _test_distrib_average(device)
     _test_distrib_geom_average(device)
     _test_distrib_integration(device)
+    _test_distrib_accumulator_device(device)
@@ -367,6 +400,7 @@ def test_distrib_cpu(distributed_context_single_node_gloo):
     _test_distrib_average(device)
_test_distrib_geom_average(device) _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.multinode_distributed @@ -378,6 +412,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): _test_distrib_average(device) _test_distrib_geom_average(device) _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.distributed @@ -392,6 +427,7 @@ def test_distrib_hvd(gloo_hvd_executor): gloo_hvd_executor(_test_distrib_average, (device,), np=nproc, do_init=True) gloo_hvd_executor(_test_distrib_geom_average, (device,), np=nproc, do_init=True) gloo_hvd_executor(_test_distrib_integration, (device,), np=nproc, do_init=True) + gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True) @pytest.mark.multinode_distributed @@ -403,6 +439,7 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): _test_distrib_average(device) _test_distrib_geom_average(device) _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.tpu @@ -414,6 +451,7 @@ def test_distrib_single_device_xla(): _test_distrib_average(device) _test_distrib_geom_average(device) _test_distrib_integration(device) + _test_distrib_accumulator_device(device) def _test_distrib_xla_nprocs(index): @@ -422,6 +460,7 @@ def _test_distrib_xla_nprocs(index): _test_distrib_average(device) _test_distrib_geom_average(device) _test_distrib_integration(device) + _test_distrib_accumulator_device(device) @pytest.mark.tpu From d340bb7fa07708d4a22662180916f84309af42ef Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Mon, 17 Aug 2020 22:31:05 -0400 Subject: [PATCH 28/32] fix and improve device tests for metrics --- tests/ignite/metrics/test_accuracy.py | 12 -- tests/ignite/metrics/test_confusion_matrix.py | 121 +++++++++++------- tests/ignite/metrics/test_loss.py | 75 ++++++----- .../metrics/test_mean_absolute_error.py | 48 ++++--- .../metrics/test_mean_pairwise_distance.py | 67 ++++++---- .../ignite/metrics/test_mean_squared_error.py | 50 +++++--- tests/ignite/metrics/test_metrics_lambda.py | 9 +- tests/ignite/metrics/test_precision.py | 69 ++++++---- tests/ignite/metrics/test_recall.py | 71 ++++++---- .../metrics/test_root_mean_squared_error.py | 30 +++-- tests/ignite/metrics/test_running_average.py | 118 ++++++++++------- .../test_top_k_categorical_accuracy.py | 31 +++-- 12 files changed, 429 insertions(+), 272 deletions(-) diff --git a/tests/ignite/metrics/test_accuracy.py b/tests/ignite/metrics/test_accuracy.py index 6ca0226613e1..9dd6b659a3b0 100644 --- a/tests/ignite/metrics/test_accuracy.py +++ b/tests/ignite/metrics/test_accuracy.py @@ -823,18 +823,6 @@ def _test_distrib_accumulator_device(device): ) -def _test_distrib_accumulator_device(device): - device = torch.device(device) - acc = Accuracy(device=device) - assert acc._device == device - - y_pred = torch.randint(0, 2, size=(10,)).long() - y = torch.randint(0, 2, size=(10,)).long() - acc.update((y_pred, y)) - - assert acc._num_correct.device == device - - @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") diff --git a/tests/ignite/metrics/test_confusion_matrix.py b/tests/ignite/metrics/test_confusion_matrix.py index 2960a59b9d02..55867cdebe92 100644 --- a/tests/ignite/metrics/test_confusion_matrix.py +++ b/tests/ignite/metrics/test_confusion_matrix.py @@ -547,68 +547,90 @@ def test_dice_coefficient(): 
 def _test_distrib_multiclass_images(device):
+    def _test(metric_device):
+        num_classes = 3
+        cm = ConfusionMatrix(num_classes=num_classes, device=metric_device)
-    num_classes = 3
-    cm = ConfusionMatrix(num_classes=num_classes, device=device)
 
+        y_true, y_pred = get_y_true_y_pred()
-    y_true, y_pred = get_y_true_y_pred()
 
+        # Compute confusion matrix with sklearn
+        true_res = confusion_matrix(y_true.reshape(-1), y_pred.reshape(-1))
-    # Compute confusion matrix with sklearn
-    true_res = confusion_matrix(y_true.reshape(-1), y_pred.reshape(-1))
 
+        th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred)
+        th_y_true = th_y_true.to(device)
+        th_y_logits = th_y_logits.to(device)
-    th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred)
-    th_y_true = th_y_true.to(device)
-    th_y_logits = th_y_logits.to(device)
 
+        # Update metric
+        output = (th_y_logits, th_y_true)
+        cm.update(output)
-    # Update metric
-    output = (th_y_logits, th_y_true)
-    cm.update(output)
 
+        res = cm.compute().cpu().numpy() / idist.get_world_size()
-    res = cm.compute().cpu().numpy() / idist.get_world_size()
 
+        assert np.all(true_res == res)
-    assert np.all(true_res == res)
 
+        # Another test on batch of 2 images
+        num_classes = 3
+        cm = ConfusionMatrix(num_classes=num_classes, device=metric_device)
+
+        # Create a batch of two images:
+        th_y_true1 = torch.from_numpy(y_true).reshape(1, 30, 30)
+        th_y_true2 = torch.from_numpy(y_true.transpose()).reshape(1, 30, 30)
+        th_y_true = torch.cat([th_y_true1, th_y_true2], dim=0)
+        th_y_true = th_y_true.to(device)
+
+        # Create a batch of 2 logits tensors
+        y_probas = np.ones((3, 30, 30)) * -10
+        y_probas[0, (y_pred == 0)] = 720
+        y_probas[1, (y_pred == 1)] = 720
+        y_probas[2, (y_pred == 2)] = 768
+        th_y_logits1 = torch.from_numpy(y_probas).reshape(1, 3, 30, 30)
+
+        y_probas = np.ones((3, 30, 30)) * -10
+        y_probas[0, (y_pred.transpose() == 0)] = 720
+        y_probas[1, (y_pred.transpose() == 2)] = 720
+        y_probas[2, (y_pred.transpose() == 1)] = 768
+        th_y_logits2 = torch.from_numpy(y_probas).reshape(1, 3, 30, 30)
+
+        th_y_logits = torch.cat([th_y_logits1, th_y_logits2], dim=0)
+        # check update if input is on another device
+        th_y_logits = th_y_logits.to(device)
+
+        # Update metric & compute
+        output = (th_y_logits, th_y_true)
+        cm.update(output)
+        res = cm.compute().cpu().numpy()
-    # Another test on batch of 2 images
-    num_classes = 3
-    cm = ConfusionMatrix(num_classes=num_classes, device=device)
 
+        # Compute confusion matrix with sklearn
+        th_y_true = idist.all_gather(th_y_true)
+        th_y_logits = idist.all_gather(th_y_logits)
-    # Create a batch of two images:
-    th_y_true1 = torch.from_numpy(y_true).reshape(1, 30, 30)
-    th_y_true2 = torch.from_numpy(y_true.transpose()).reshape(1, 30, 30)
-    th_y_true = torch.cat([th_y_true1, th_y_true2], dim=0)
-    th_y_true = th_y_true.to(device)
 
+        np_y_true = th_y_true.cpu().numpy().reshape(-1)
+        np_y_pred = np.argmax(th_y_logits.cpu().numpy(), axis=1).reshape(-1)
+        true_res = confusion_matrix(np_y_true, np_y_pred)
-    # Create a batch of 2 logits tensors
-    y_probas = np.ones((3, 30, 30)) * -10
-    y_probas[0, (y_pred == 0)] = 720
-    y_probas[1, (y_pred == 1)] = 720
-    y_probas[2, (y_pred == 2)] = 768
-    th_y_logits1 = torch.from_numpy(y_probas).reshape(1, 3, 30, 30)
 
+        assert np.all(true_res == res)
-    y_probas = np.ones((3, 30, 30)) * -10
-    y_probas[0, (y_pred.transpose() == 0)] = 720
-    y_probas[1, (y_pred.transpose() == 2)] = 720
-    y_probas[2, (y_pred.transpose() == 1)] = 768
-    th_y_logits2 = torch.from_numpy(y_probas).reshape(1, 3, 30, 30)
 
+    _test("cpu")
+    _test(idist.device())
-    th_y_logits = torch.cat([th_y_logits1, th_y_logits2], dim=0)
-    # check update if input is on another device
-    th_y_logits = th_y_logits.to(device)
 
-    # Update metric & compute
-    output = (th_y_logits, th_y_true)
-    cm.update(output)
-    res = cm.compute().cpu().numpy()
 
+def _test_distrib_accumulator_device(device):
-    # Compute confusion matrix with sklearn
-    th_y_true = idist.all_gather(th_y_true)
-    th_y_logits = idist.all_gather(th_y_logits)
 
+    for metric_device in [torch.device("cpu"), idist.device()]:
-    np_y_true = th_y_true.cpu().numpy().reshape(-1)
-    np_y_pred = np.argmax(th_y_logits.cpu().numpy(), axis=1).reshape(-1)
-    true_res = confusion_matrix(np_y_true, np_y_pred)
 
+        cm = ConfusionMatrix(num_classes=3, device=metric_device)
+        assert cm._device == metric_device
+        assert cm.confusion_matrix.device == metric_device, "{}:{} vs {}:{}".format(
+            type(cm.confusion_matrix.device), cm.confusion_matrix.device, type(metric_device), metric_device
+        )
-    assert np.all(true_res == res)
 
+        y_true, y_pred = get_y_true_y_pred()
+        th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred)
+        cm.update((th_y_logits, th_y_true))
+
+        assert cm.confusion_matrix.device == metric_device, "{}:{} vs {}:{}".format(
+            type(cm.confusion_matrix.device), cm.confusion_matrix.device, type(metric_device), metric_device
+        )
@@ -618,6 +640,7 @@ def test_distrib_gpu(local_rank, distributed_context_single_node_nccl):
     device = "cuda:{}".format(local_rank)
     _test_distrib_multiclass_images(device)
+    _test_distrib_accumulator_device(device)
@@ -626,6 +649,7 @@ def test_distrib_cpu(distributed_context_single_node_gloo):
     device = "cpu"
     _test_distrib_multiclass_images(device)
+    _test_distrib_accumulator_device(device)
@@ -637,6 +661,7 @@ def test_distrib_hvd(gloo_hvd_executor):
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
     gloo_hvd_executor(_test_distrib_multiclass_images, (device,), np=nproc, do_init=True)
+    gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True)
@@ -645,6 +670,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
    device = "cpu"
    _test_distrib_multiclass_images(device)
+   _test_distrib_accumulator_device(device)
@@ -653,6 +679,7 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
    device = "cuda:{}".format(distributed_context_multi_node_nccl["local_rank"])
    _test_distrib_multiclass_images(device)
+   _test_distrib_accumulator_device(device)
@@ -661,11 +688,13 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
 def test_distrib_single_device_xla():
     device = idist.device()
     _test_distrib_multiclass_images(device)
+    _test_distrib_accumulator_device(device)
 
 
 def _test_distrib_xla_nprocs(index):
     device = idist.device()
     _test_distrib_multiclass_images(device)
+    _test_distrib_accumulator_device(device)
 
 
 @pytest.mark.tpu
diff --git a/tests/ignite/metrics/test_loss.py b/tests/ignite/metrics/test_loss.py
index 4739bca16d1f..e8a405ba93eb 100644
--- a/tests/ignite/metrics/test_loss.py
+++ b/tests/ignite/metrics/test_loss.py
@@ -76,47 +76,56 @@ def test_reset():
 
 
 def _test_distrib_compute_on_criterion(device):
+    def _test(metric_device):
+        criterion = nn.NLLLoss().to(device)
+        loss = Loss(criterion, device=metric_device)
-
criterion = nn.NLLLoss().to(device) - loss = Loss(criterion, device=device) - - y_pred = torch.tensor([[0.1, 0.4, 0.5], [0.1, 0.7, 0.2]], device=device).log() - y = torch.tensor([2, 2], device=device).long() - loss.update((y_pred, y)) - n = loss._num_examples - assert n == len(y) - res = loss.compute() - assert n * idist.get_world_size() == loss._num_examples - - y_pred = idist.all_gather(y_pred) - y = idist.all_gather(y) - true_loss_value = criterion(y_pred, y) - assert_almost_equal(res, true_loss_value.item()) + y_pred = torch.tensor([[0.1, 0.4, 0.5], [0.1, 0.7, 0.2]], device=device).log() + y = torch.tensor([2, 2], device=device).long() + loss.update((y_pred, y)) + n = loss._num_examples + assert n == len(y) + res = loss.compute() + assert n * idist.get_world_size() == loss._num_examples + + y_pred = idist.all_gather(y_pred) + y = idist.all_gather(y) + true_loss_value = criterion(y_pred, y) + assert_almost_equal(res, true_loss_value.item()) + + loss.reset() + y_pred = torch.tensor([[0.1, 0.3, 0.6], [0.6, 0.2, 0.2], [0.2, 0.7, 0.1]], device=device).log() + y = torch.tensor([2, 0, 2], device=device).long() + loss.update((y_pred, y)) + n = loss._num_examples + res = loss.compute() + assert n * idist.get_world_size() == loss._num_examples - loss.reset() - y_pred = torch.tensor([[0.1, 0.3, 0.6], [0.6, 0.2, 0.2], [0.2, 0.7, 0.1]], device=device).log() - y = torch.tensor([2, 0, 2], device=device).long() - loss.update((y_pred, y)) - n = loss._num_examples - res = loss.compute() - assert n * idist.get_world_size() == loss._num_examples + y_pred = idist.all_gather(y_pred) + y = idist.all_gather(y) + true_loss_value = criterion(y_pred, y) + assert_almost_equal(res, true_loss_value.item()) - y_pred = idist.all_gather(y_pred) - y = idist.all_gather(y) - true_loss_value = criterion(y_pred, y) - assert_almost_equal(res, true_loss_value.item()) + _test("cpu") + _test(idist.device()) def _test_distrib_sum_device(device): - device = torch.device(device) - loss = Loss(nll_loss, device=device) - assert loss._device == device - y_pred = torch.tensor([[0.1, 0.4, 0.5], [0.1, 0.7, 0.2]]).log() - y = torch.tensor([2, 2]).long() - loss.update((y_pred, y)) + for metric_device in [torch.device("cpu"), idist.device()]: + loss = Loss(nll_loss, device=metric_device) + assert loss._device == metric_device + assert loss._sum.device == metric_device, "{}:{} vs {}:{}".format( + type(loss._sum.device), loss._sum.device, type(metric_device), metric_device + ) + + y_pred = torch.tensor([[0.1, 0.4, 0.5], [0.1, 0.7, 0.2]]).log() + y = torch.tensor([2, 2]).long() + loss.update((y_pred, y)) - assert loss._sum.device == device + assert loss._sum.device == metric_device, "{}:{} vs {}:{}".format( + type(loss._sum.device), loss._sum.device, type(metric_device), metric_device + ) def test_sum_detached(): diff --git a/tests/ignite/metrics/test_mean_absolute_error.py b/tests/ignite/metrics/test_mean_absolute_error.py index 557c4182f3b4..5e197e14bb9e 100644 --- a/tests/ignite/metrics/test_mean_absolute_error.py +++ b/tests/ignite/metrics/test_mean_absolute_error.py @@ -49,31 +49,47 @@ def update(engine, i): y_true[i * s + offset * rank : (i + 1) * s + offset * rank], ) - engine = Engine(update) + def _test(metric_device): + engine = Engine(update) - m = MeanAbsoluteError() - m.attach(engine, "mae") + m = MeanAbsoluteError(device=metric_device) + m.attach(engine, "mae") - data = list(range(n_iters)) - engine.run(data=data, max_epochs=1) + data = list(range(n_iters)) + engine.run(data=data, max_epochs=1) - assert "mae" in 
engine.state.metrics - res = engine.state.metrics["mae"] + assert "mae" in engine.state.metrics + res = engine.state.metrics["mae"] - true_res = np.mean(np.abs((y_true - y_preds).cpu().numpy())) + true_res = np.mean(np.abs((y_true - y_preds).cpu().numpy())) - assert pytest.approx(res) == true_res + assert pytest.approx(res) == true_res + + _test("cpu") + _test(idist.device()) def _test_distrib_accumulator_device(device): - device = torch.device(device) - mae = MeanAbsoluteError(device=device) - assert mae._device == device - y_pred = torch.tensor([[2.0], [-2.0]]) - y = torch.zeros(2) - mae.update((y_pred, y)) - assert mae._sum_of_absolute_errors.device == device + for metric_device in [torch.device("cpu"), idist.device()]: + mae = MeanAbsoluteError(device=metric_device) + assert mae._device == metric_device + assert mae._sum_of_absolute_errors.device == metric_device, "{}:{} vs {}:{}".format( + type(mae._sum_of_absolute_errors.device), + mae._sum_of_absolute_errors.device, + type(metric_device), + metric_device, + ) + + y_pred = torch.tensor([[2.0], [-2.0]]) + y = torch.zeros(2) + mae.update((y_pred, y)) + assert mae._sum_of_absolute_errors.device == metric_device, "{}:{} vs {}:{}".format( + type(mae._sum_of_absolute_errors.device), + mae._sum_of_absolute_errors.device, + type(metric_device), + metric_device, + ) def test_accumulator_detached(): diff --git a/tests/ignite/metrics/test_mean_pairwise_distance.py b/tests/ignite/metrics/test_mean_pairwise_distance.py index f5ea4e731ef7..df14e362dc26 100644 --- a/tests/ignite/metrics/test_mean_pairwise_distance.py +++ b/tests/ignite/metrics/test_mean_pairwise_distance.py @@ -52,42 +52,53 @@ def update(engine, i): y_true[i * s + offset * rank : (i + 1) * s + offset * rank, ...], ) - engine = Engine(update) - - m = MeanPairwiseDistance() - m.attach(engine, "mpwd") + def _test(metric_device): + engine = Engine(update) + + m = MeanPairwiseDistance(device=metric_device) + m.attach(engine, "mpwd") + + data = list(range(n_iters)) + engine.run(data=data, max_epochs=1) + + assert "mpwd" in engine.state.metrics + res = engine.state.metrics["mpwd"] + + true_res = [] + for i in range(n_iters * idist.get_world_size()): + true_res.append( + torch.pairwise_distance( + y_true[i * s : (i + 1) * s, ...], y_preds[i * s : (i + 1) * s, ...], p=m._p, eps=m._eps + ) + .cpu() + .numpy() + ) + true_res = np.array(true_res).ravel() + true_res = true_res.mean() - data = list(range(n_iters)) - engine.run(data=data, max_epochs=1) + assert pytest.approx(res) == true_res - assert "mpwd" in engine.state.metrics - res = engine.state.metrics["mpwd"] + _test("cpu") + _test(idist.device()) - true_res = [] - for i in range(n_iters * idist.get_world_size()): - true_res.append( - torch.pairwise_distance( - y_true[i * s : (i + 1) * s, ...], y_preds[i * s : (i + 1) * s, ...], p=m._p, eps=m._eps - ) - .cpu() - .numpy() - ) - true_res = np.array(true_res).ravel() - true_res = true_res.mean() - assert pytest.approx(res) == true_res +def _test_distrib_accumulator_device(device): + for metric_device in [torch.device("cpu"), idist.device()]: -def _test_distrib_accumulator_device(device): - device = torch.device(device) - mpd = MeanPairwiseDistance(device=device) - assert mpd._device == device + mpd = MeanPairwiseDistance(device=metric_device) + assert mpd._device == metric_device + assert mpd._sum_of_distances.device == metric_device, "{}:{} vs {}:{}".format( + type(mpd._sum_of_distances.device), mpd._sum_of_distances.device, type(metric_device), metric_device + ) - y_pred = 
torch.Tensor([[3.0, 4.0], [-3.0, -4.0]])
+        y = torch.zeros(2, 2)
+        mpd.update((y_pred, y))
+
+        assert mpd._sum_of_distances.device == metric_device, "{}:{} vs {}:{}".format(
+            type(mpd._sum_of_distances.device), mpd._sum_of_distances.device, type(metric_device), metric_device
+        )
 
 
 def test_accumulator_detached():
diff --git a/tests/ignite/metrics/test_mean_squared_error.py b/tests/ignite/metrics/test_mean_squared_error.py
index 08552836531f..9d73e0361601 100644
--- a/tests/ignite/metrics/test_mean_squared_error.py
+++ b/tests/ignite/metrics/test_mean_squared_error.py
@@ -49,31 +49,49 @@ def update(engine, i):
             y_true[i * s + offset * rank : (i + 1) * s + offset * rank],
         )
 
-    engine = Engine(update)
+    def _test(metric_device):
+        engine = Engine(update)
 
-    m = MeanSquaredError()
-    m.attach(engine, "mse")
+        m = MeanSquaredError(device=metric_device)
+        m.attach(engine, "mse")
 
-    data = list(range(n_iters))
-    engine.run(data=data, max_epochs=1)
+        data = list(range(n_iters))
+        engine.run(data=data, max_epochs=1)
 
-    assert "mse" in engine.state.metrics
-    res = engine.state.metrics["mse"]
+        assert "mse" in engine.state.metrics
+        res = engine.state.metrics["mse"]
 
-    true_res = np.mean(np.power((y_true - y_preds).cpu().numpy(), 2.0))
+        true_res = np.mean(np.power((y_true - y_preds).cpu().numpy(), 2.0))
 
-    assert pytest.approx(res, rel=tol) == true_res
+        assert pytest.approx(res, rel=tol) == true_res
+
+    _test("cpu")
+    _test(idist.device())
 
 
 def _test_distrib_accumulator_device(device):
 
-    device = torch.device(device)
-    mse = MeanSquaredError(device=device)
-    assert mse._device == device
+    for metric_device in [torch.device("cpu"), idist.device()]:
+
+        mse = MeanSquaredError(device=metric_device)
+        assert mse._device == metric_device
+        assert mse._sum_of_squared_errors.device == metric_device, "{}:{} vs {}:{}".format(
+            type(mse._sum_of_squared_errors.device),
+            mse._sum_of_squared_errors.device,
+            type(metric_device),
+            metric_device,
+        )
 
-    y_pred = torch.tensor([[2.0], [-2.0]])
-    y = torch.zeros(2)
-    mse.update((y_pred, y))
-    assert mse._sum_of_squared_errors.device == device
+        y_pred = torch.tensor([[2.0], [-2.0]])
+        y = torch.zeros(2)
+        mse.update((y_pred, y))
+        assert mse._sum_of_squared_errors.device == metric_device, "{}:{} vs {}:{}".format(
+            type(mse._sum_of_squared_errors.device),
+            mse._sum_of_squared_errors.device,
+            type(metric_device),
+            metric_device,
+        )
 
 
 def test_accumulator_detached():
diff --git a/tests/ignite/metrics/test_metrics_lambda.py b/tests/ignite/metrics/test_metrics_lambda.py
index a6a0902496dd..6c8b65a86ec8 100644
--- a/tests/ignite/metrics/test_metrics_lambda.py
+++ b/tests/ignite/metrics/test_metrics_lambda.py
@@ -325,7 +325,7 @@ def _test_distrib_integration(device):
     batch_size = 10
     n_classes = 10
 
-    def _test():
+    def _test(metric_device):
         y_true = np.arange(0, n_iters * batch_size * idist.get_world_size(), dtype="int64") % n_classes
         y_pred = 0.2 * np.random.rand(n_iters * batch_size * idist.get_world_size(), n_classes)
         for i in range(n_iters * batch_size * idist.get_world_size()):
@@ -345,8 +345,8 @@ def update_fn(engine, i):
 
         evaluator = Engine(update_fn)
 
-        precision = Precision(average=False, device=device)
-        recall = Recall(average=False, device=device)
+        precision = Precision(average=False, device=metric_device)
+        recall = Recall(average=False, device=metric_device)
 
         def Fbeta(r, p, beta):
             return torch.mean((1
+ beta ** 2) * p * r / (beta ** 2 * p + r)).item() @@ -367,7 +367,8 @@ def Fbeta(r, p, beta): assert 1.0 + f1_true == approx(state.metrics["ff1"]) for _ in range(5): - _test() + _test("cpu") + _test(idist.device()) @pytest.mark.distributed diff --git a/tests/ignite/metrics/test_precision.py b/tests/ignite/metrics/test_precision.py index 9ea0741fc835..3abdb420fa81 100644 --- a/tests/ignite/metrics/test_precision.py +++ b/tests/ignite/metrics/test_precision.py @@ -722,7 +722,7 @@ def _test_distrib_integration_multiclass(device): rank = idist.get_rank() torch.manual_seed(12) - def _test(average, n_epochs): + def _test(average, n_epochs, metric_device): n_iters = 60 s = 16 n_classes = 7 @@ -739,7 +739,7 @@ def update(engine, i): engine = Engine(update) - pr = Precision(average=average) + pr = Precision(average=average, device=metric_device) pr.attach(engine, "pr") data = list(range(n_iters)) @@ -758,10 +758,11 @@ def update(engine, i): assert pytest.approx(res) == true_res for _ in range(2): - _test(average=True, n_epochs=1) - _test(average=True, n_epochs=2) - _test(average=False, n_epochs=1) - _test(average=False, n_epochs=2) + for metric_device in ["cpu", idist.device()]: + _test(average=True, n_epochs=1, metric_device=metric_device) + _test(average=True, n_epochs=2, metric_device=metric_device) + _test(average=False, n_epochs=1, metric_device=metric_device) + _test(average=False, n_epochs=2, metric_device=metric_device) def _test_distrib_integration_multilabel(device): @@ -771,7 +772,7 @@ def _test_distrib_integration_multilabel(device): rank = idist.get_rank() torch.manual_seed(12) - def _test(average, n_epochs): + def _test(average, n_epochs, metric_device): n_iters = 60 s = 16 n_classes = 7 @@ -813,8 +814,9 @@ def update(engine, i): assert pytest.approx(res) == true_res for _ in range(2): - _test(average=True, n_epochs=1) - _test(average=True, n_epochs=2) + for metric_device in ["cpu", idist.device()]: + _test(average=True, n_epochs=1, metric_device=metric_device) + _test(average=True, n_epochs=2, metric_device=metric_device) if idist.get_world_size() > 1: with pytest.warns( @@ -835,38 +837,57 @@ def update(engine, i): def _test_distrib_accumulator_device(device): # Binary accuracy on input of shape (N, 1) or (N, ) - device = torch.device(device) - def _test(average): - pr = Precision(average=average, device=device) - assert pr._device == device + def _test(average, metric_device): + pr = Precision(average=average, device=metric_device) + assert pr._device == metric_device + # Since the shape of the accumulated amount isn't known before the first update + # call, the internal variables aren't tensors on the right device yet. 
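A minimal sketch of the lazy materialization described in the comment above, mirroring the assertions in this test (standalone, not part of the patch):

.. code-block:: python

    import torch
    from ignite.metrics import Precision

    pr = Precision(average=False, device="cpu")
    # before the first update, the accumulators are not yet tensors on the target device

    y_pred = torch.randint(0, 2, size=(10,))
    y = torch.randint(0, 2, size=(10,)).long()
    pr.update((y_pred, y))

    # after the first update, both accumulators live on the metric's device
    assert pr._true_positives.device == torch.device("cpu")
    assert pr._positives.device == torch.device("cpu")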
y_pred = torch.randint(0, 2, size=(10,)) y = torch.randint(0, 2, size=(10,)).long() pr.update((y_pred, y)) - assert pr._true_positives.device == device - assert pr._positives.device == device + assert pr._true_positives.device == metric_device, "{}:{} vs {}:{}".format( + type(pr._true_positives.device), pr._true_positives.device, type(metric_device), metric_device + ) + assert pr._positives.device == metric_device, "{}:{} vs {}:{}".format( + type(pr._positives.device), pr._positives.device, type(metric_device), metric_device + ) - _test(True) - _test(False) + for metric_device in [torch.device("cpu"), idist.device()]: + _test(True, metric_device=metric_device) + _test(False, metric_device=metric_device) def _test_distrib_multilabel_accumulator_device(device): # Multiclass input data of shape (N, ) and (N, C) - device = torch.device(device) - def _test(average): - pr = Precision(is_multilabel=True, average=average, device=device) + def _test(average, metric_device): + pr = Precision(is_multilabel=True, average=average, device=metric_device) + + assert pr._device == metric_device + assert pr._true_positives.device == metric_device, "{}:{} vs {}:{}".format( + type(pr._true_positives.device), pr._true_positives.device, type(metric_device), metric_device + ) + assert pr._positives.device == metric_device, "{}:{} vs {}:{}".format( + type(pr._positives.device), pr._positives.device, type(metric_device), metric_device + ) + y_pred = torch.randint(0, 2, size=(10, 4, 20, 23)) y = torch.randint(0, 2, size=(10, 4, 20, 23)).long() pr.update((y_pred, y)) - assert pr._true_positives.device == device - assert pr._positives.device == device + assert pr._true_positives.device == metric_device, "{}:{} vs {}:{}".format( + type(pr._true_positives.device), pr._true_positives.device, type(metric_device), metric_device + ) + assert pr._positives.device == metric_device, "{}:{} vs {}:{}".format( + type(pr._positives.device), pr._positives.device, type(metric_device), metric_device + ) - _test(True) - _test(False) + for metric_device in [torch.device("cpu"), idist.device()]: + _test(True, metric_device=metric_device) + _test(False, metric_device=metric_device) @pytest.mark.distributed diff --git a/tests/ignite/metrics/test_recall.py b/tests/ignite/metrics/test_recall.py index b783b2ec83ce..65564b8071bb 100644 --- a/tests/ignite/metrics/test_recall.py +++ b/tests/ignite/metrics/test_recall.py @@ -722,7 +722,7 @@ def _test_distrib_integration_multiclass(device): rank = idist.get_rank() torch.manual_seed(12) - def _test(average, n_epochs): + def _test(average, n_epochs, metric_device): n_iters = 60 s = 16 n_classes = 7 @@ -739,7 +739,7 @@ def update(engine, i): engine = Engine(update) - re = Recall(average=average) + re = Recall(average=average, device=metric_device) re.attach(engine, "re") data = list(range(n_iters)) @@ -758,10 +758,11 @@ def update(engine, i): assert pytest.approx(res) == true_res for _ in range(2): - _test(average=True, n_epochs=1) - _test(average=True, n_epochs=2) - _test(average=False, n_epochs=1) - _test(average=False, n_epochs=2) + for metric_device in ["cpu", idist.device()]: + _test(average=True, n_epochs=1, metric_device=metric_device) + _test(average=True, n_epochs=2, metric_device=metric_device) + _test(average=False, n_epochs=1, metric_device=metric_device) + _test(average=False, n_epochs=2, metric_device=metric_device) def _test_distrib_integration_multilabel(device): @@ -771,7 +772,7 @@ def _test_distrib_integration_multilabel(device): rank = idist.get_rank() torch.manual_seed(12) - def 
_test(average, n_epochs): + def _test(average, n_epochs, metric_device): n_iters = 60 s = 16 n_classes = 7 @@ -788,7 +789,7 @@ def update(engine, i): engine = Engine(update) - re = Recall(average=average, is_multilabel=True) + re = Recall(average=average, is_multilabel=True, device=metric_device) re.attach(engine, "re") data = list(range(n_iters)) @@ -813,8 +814,9 @@ def update(engine, i): assert pytest.approx(res) == true_res for _ in range(2): - _test(average=True, n_epochs=1) - _test(average=True, n_epochs=2) + for metric_device in ["cpu", idist.device()]: + _test(average=True, n_epochs=1, metric_device=metric_device) + _test(average=True, n_epochs=2, metric_device=metric_device) if idist.get_world_size() > 1: with pytest.warns( @@ -835,38 +837,57 @@ def update(engine, i): def _test_distrib_accumulator_device(device): # Binary accuracy on input of shape (N, 1) or (N, ) - device = torch.device(device) - def _test(average): - re = Recall(average=average, device=device) - assert re._device == device + def _test(average, metric_device): + re = Recall(average=average, device=metric_device) + assert re._device == metric_device + # Since the shape of the accumulated amount isn't known before the first update + # call, the internal variables aren't tensors on the right device yet. y_reed = torch.randint(0, 2, size=(10,)) y = torch.randint(0, 2, size=(10,)).long() re.update((y_reed, y)) - assert re._true_positives.device == device - assert re._positives.device == device + assert re._true_positives.device == metric_device, "{}:{} vs {}:{}".format( + type(re._true_positives.device), re._true_positives.device, type(metric_device), metric_device + ) + assert re._positives.device == metric_device, "{}:{} vs {}:{}".format( + type(re._positives.device), re._positives.device, type(metric_device), metric_device + ) - _test(True) - _test(False) + for metric_device in [torch.device("cpu"), idist.device()]: + _test(True, metric_device=metric_device) + _test(False, metric_device=metric_device) def _test_distrib_multilabel_accumulator_device(device): # Multiclass input data of shape (N, ) and (N, C) - device = torch.device(device) - def _test(average): - re = Recall(is_multilabel=True, average=average, device=device) + def _test(average, metric_device): + re = Recall(is_multilabel=True, average=average, device=metric_device) + + assert re._device == metric_device + assert re._true_positives.device == metric_device, "{}:{} vs {}:{}".format( + type(re._true_positives.device), re._true_positives.device, type(metric_device), metric_device + ) + assert re._positives.device == metric_device, "{}:{} vs {}:{}".format( + type(re._positives.device), re._positives.device, type(metric_device), metric_device + ) + y_reed = torch.randint(0, 2, size=(10, 4, 20, 23)) y = torch.randint(0, 2, size=(10, 4, 20, 23)).long() re.update((y_reed, y)) - assert re._true_positives.device == device - assert re._positives.device == device + assert re._true_positives.device == metric_device, "{}:{} vs {}:{}".format( + type(re._true_positives.device), re._true_positives.device, type(metric_device), metric_device + ) + assert re._positives.device == metric_device, "{}:{} vs {}:{}".format( + type(re._positives.device), re._positives.device, type(metric_device), metric_device + ) - _test(True) - _test(False) + for metric_device in [torch.device("cpu"), idist.device()]: + _test(True, metric_device=metric_device) + _test(False, metric_device=metric_device) @pytest.mark.distributed diff --git a/tests/ignite/metrics/test_root_mean_squared_error.py 
b/tests/ignite/metrics/test_root_mean_squared_error.py index 878ef9df367d..b7068be11867 100644 --- a/tests/ignite/metrics/test_root_mean_squared_error.py +++ b/tests/ignite/metrics/test_root_mean_squared_error.py @@ -46,25 +46,29 @@ def _test_distrib_integration(device, tol=1e-6): def update(engine, i): return y_preds[i * s : (i + 1) * s], y_true[i * s + offset * rank : (i + 1) * s + offset * rank] - engine = Engine(update) + def _test(metric_device): + engine = Engine(update) - m = RootMeanSquaredError() - m.attach(engine, "rmse") + m = RootMeanSquaredError(device=metric_device) + m.attach(engine, "rmse") - data = list(range(n_iters)) - engine.run(data=data, max_epochs=1) + data = list(range(n_iters)) + engine.run(data=data, max_epochs=1) - assert "rmse" in engine.state.metrics - res = engine.state.metrics["rmse"] + assert "rmse" in engine.state.metrics + res = engine.state.metrics["rmse"] - y_preds_full = [] - for i in range(idist.get_world_size()): - y_preds_full.append((i + 1) * torch.ones(offset)) - y_preds_full = torch.stack(y_preds_full).to(device).flatten() + y_preds_full = [] + for i in range(idist.get_world_size()): + y_preds_full.append((i + 1) * torch.ones(offset)) + y_preds_full = torch.stack(y_preds_full).to(device).flatten() - true_res = np.sqrt(np.mean(np.square((y_true - y_preds_full).cpu().numpy()))) + true_res = np.sqrt(np.mean(np.square((y_true - y_preds_full).cpu().numpy()))) - assert pytest.approx(res, rel=tol) == true_res + assert pytest.approx(res, rel=tol) == true_res + + _test("cpu") + _test(idist.device()) @pytest.mark.distributed diff --git a/tests/ignite/metrics/test_running_average.py b/tests/ignite/metrics/test_running_average.py index ab1528e10596..1d729b97c5d0 100644 --- a/tests/ignite/metrics/test_running_average.py +++ b/tests/ignite/metrics/test_running_average.py @@ -305,61 +305,84 @@ def _test_distrib_on_metric(device): batch_size = 10 n_classes = 10 - data = list(range(n_iters)) - np.random.seed(12) - all_y_true_batch_values = np.random.randint( - 0, n_classes, size=(idist.get_world_size(), n_epochs * n_iters, batch_size) - ) - all_y_pred_batch_values = np.random.rand(idist.get_world_size(), n_epochs * n_iters, batch_size, n_classes) + def _test(metric_device): + data = list(range(n_iters)) + np.random.seed(12) + all_y_true_batch_values = np.random.randint( + 0, n_classes, size=(idist.get_world_size(), n_epochs * n_iters, batch_size) + ) + all_y_pred_batch_values = np.random.rand(idist.get_world_size(), n_epochs * n_iters, batch_size, n_classes) - y_true_batch_values = iter(all_y_true_batch_values[rank, ...]) - y_pred_batch_values = iter(all_y_pred_batch_values[rank, ...]) + y_true_batch_values = iter(all_y_true_batch_values[rank, ...]) + y_pred_batch_values = iter(all_y_pred_batch_values[rank, ...]) - def update_fn(engine, batch): - y_true_batch = next(y_true_batch_values) - y_pred_batch = next(y_pred_batch_values) - return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch) + def update_fn(engine, batch): + y_true_batch = next(y_true_batch_values) + y_pred_batch = next(y_pred_batch_values) + return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch) - trainer = Engine(update_fn) - alpha = 0.98 + trainer = Engine(update_fn) + alpha = 0.98 - acc_metric = RunningAverage( - Accuracy(output_transform=lambda x: [x[0], x[1]], device=device), alpha=alpha, epoch_bound=False - ) - acc_metric.attach(trainer, "running_avg_accuracy") + acc_metric = RunningAverage( + Accuracy(output_transform=lambda x: [x[0], x[1]], device=metric_device), 
alpha=alpha, epoch_bound=False + ) + acc_metric.attach(trainer, "running_avg_accuracy") + + running_avg_acc = [ + None, + ] + true_acc_metric = Accuracy(device=metric_device) + + @trainer.on(Events.ITERATION_COMPLETED) + def manual_running_avg_acc(engine): + i = engine.state.iteration - 1 + + true_acc_metric.reset() + for j in range(idist.get_world_size()): + output = ( + torch.from_numpy(all_y_pred_batch_values[j, i, :, :]), + torch.from_numpy(all_y_true_batch_values[j, i, :]), + ) + true_acc_metric.update(output) + + batch_acc = true_acc_metric._num_correct.item() * 1.0 / true_acc_metric._num_examples + + if running_avg_acc[0] is None: + running_avg_acc[0] = batch_acc + else: + running_avg_acc[0] = running_avg_acc[0] * alpha + (1.0 - alpha) * batch_acc + engine.state.running_avg_acc = running_avg_acc[0] + + @trainer.on(Events.ITERATION_COMPLETED) + def assert_equal_running_avg_acc_values(engine): + assert engine.state.running_avg_acc == engine.state.metrics["running_avg_accuracy"], "{} vs {}".format( + engine.state.running_avg_acc, engine.state.metrics["running_avg_accuracy"] + ) - running_avg_acc = [ - None, - ] - true_acc_metric = Accuracy(device=device) + trainer.run(data, max_epochs=3) - @trainer.on(Events.ITERATION_COMPLETED) - def manual_running_avg_acc(engine): - i = engine.state.iteration - 1 + _test("cpu") + _test(idist.device()) - true_acc_metric.reset() - for j in range(idist.get_world_size()): - output = ( - torch.from_numpy(all_y_pred_batch_values[j, i, :, :]), - torch.from_numpy(all_y_true_batch_values[j, i, :]), - ) - true_acc_metric.update(output) - batch_acc = true_acc_metric._num_correct.item() * 1.0 / true_acc_metric._num_examples +def _test_distrib_accumulator_device(device): - if running_avg_acc[0] is None: - running_avg_acc[0] = batch_acc - else: - running_avg_acc[0] = running_avg_acc[0] * alpha + (1.0 - alpha) * batch_acc - engine.state.running_avg_acc = running_avg_acc[0] + for metric_device in [torch.device("cpu"), idist.device()]: - @trainer.on(Events.ITERATION_COMPLETED) - def assert_equal_running_avg_acc_values(engine): - assert engine.state.running_avg_acc == engine.state.metrics["running_avg_accuracy"], "{} vs {}".format( - engine.state.running_avg_acc, engine.state.metrics["running_avg_accuracy"] - ) + # Don't test the src=Metric case because compute() returns a scalar, + # so the metric doesn't accumulate on the device specified + avg = RunningAverage(output_transform=lambda x: x, device=metric_device) + assert avg._device == metric_device + # Value is None until the first update then compute call - trainer.run(data, max_epochs=3) + for _ in range(3): + avg.update(torch.tensor(1.0, device=device)) + avg.compute() + + assert avg._value.device == metric_device, "{}:{} vs {}:{}".format( + type(avg._value.device), avg._value.device, type(metric_device), metric_device + ) @pytest.mark.distributed @@ -370,6 +393,7 @@ def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): device = "cuda:{}".format(local_rank) _test_distrib_on_output(device) _test_distrib_on_metric(device) + _test_distrib_accumulator_device(device) @pytest.mark.distributed @@ -379,6 +403,7 @@ def test_distrib_cpu(distributed_context_single_node_gloo): device = "cpu" _test_distrib_on_output(device) _test_distrib_on_metric(device) + _test_distrib_accumulator_device(device) @pytest.mark.distributed @@ -391,6 +416,7 @@ def test_distrib_hvd(gloo_hvd_executor): gloo_hvd_executor(_test_distrib_on_output, (device,), np=nproc, do_init=True) gloo_hvd_executor(_test_distrib_on_metric, 
(device,), np=nproc, do_init=True) + gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True) @pytest.mark.multinode_distributed @@ -400,6 +426,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): device = "cpu" _test_distrib_on_output(device) _test_distrib_on_metric(device) + _test_distrib_accumulator_device(device) @pytest.mark.multinode_distributed @@ -409,6 +436,7 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): device = "cuda:{}".format(distributed_context_multi_node_nccl["local_rank"]) _test_distrib_on_output(device) _test_distrib_on_metric(device) + _test_distrib_accumulator_device(device) @pytest.mark.tpu @@ -418,12 +446,14 @@ def test_distrib_single_device_xla(): device = idist.device() _test_distrib_on_output(device) _test_distrib_on_metric(device) + _test_distrib_accumulator_device(device) def _test_distrib_xla_nprocs(index): device = idist.device() _test_distrib_on_output(device) _test_distrib_on_metric(device) + _test_distrib_accumulator_device(device) @pytest.mark.tpu diff --git a/tests/ignite/metrics/test_top_k_categorical_accuracy.py b/tests/ignite/metrics/test_top_k_categorical_accuracy.py index 0f22d2f6b697..209a6bf25fca 100644 --- a/tests/ignite/metrics/test_top_k_categorical_accuracy.py +++ b/tests/ignite/metrics/test_top_k_categorical_accuracy.py @@ -59,7 +59,7 @@ def _test_distrib_integration(device): rank = idist.get_rank() torch.manual_seed(12) - def _test(n_epochs): + def _test(n_epochs, metric_device): n_iters = 100 s = 16 n_classes = 10 @@ -79,7 +79,7 @@ def update(engine, i): engine = Engine(update) k = 5 - acc = TopKCategoricalAccuracy(k=k, device=device) + acc = TopKCategoricalAccuracy(k=k, device=metric_device) acc.attach(engine, "acc") data = list(range(n_iters)) @@ -95,19 +95,28 @@ def update(engine, i): assert pytest.approx(res) == true_res for _ in range(5): - _test(n_epochs=1) - _test(n_epochs=2) + for metric_device in ["cpu", idist.device()]: + _test(n_epochs=1, metric_device=metric_device) + _test(n_epochs=2, metric_device=metric_device) def _test_distrib_accumulator_device(device): - device = torch.device(device) - acc = TopKCategoricalAccuracy(2, device=device) - assert acc._device == device - y_pred = torch.tensor([[0.2, 0.4, 0.6, 0.8], [0.8, 0.6, 0.4, 0.2]]) - y = torch.ones(2).long() - acc.update((y_pred, y)) - assert acc._num_correct.device == device + for metric_device in [torch.device("cpu"), idist.device()]: + + acc = TopKCategoricalAccuracy(2, device=device) + assert acc._device == metric_device + assert acc._num_correct.device == metric_device, "{}:{} vs {}:{}".format( + type(acc._num_correct.device), acc._num_correct.device, type(metric_device), metric_device + ) + + y_pred = torch.tensor([[0.2, 0.4, 0.6, 0.8], [0.8, 0.6, 0.4, 0.2]]) + y = torch.ones(2).long() + acc.update((y_pred, y)) + + assert acc._num_correct.device == metric_device, "{}:{} vs {}:{}".format( + type(acc._num_correct.device), acc._num_correct.device, type(metric_device), metric_device + ) @pytest.mark.distributed From 4824e24166be4a2b6b9c275bfe7301a41dc3fb71 Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Mon, 17 Aug 2020 22:31:05 -0400 Subject: [PATCH 29/32] fix and improve device tests for metrics --- tests/ignite/metrics/test_mean_squared_error.py | 2 +- tests/ignite/metrics/test_precision.py | 4 ++-- tests/ignite/metrics/test_recall.py | 4 ++-- tests/ignite/metrics/test_top_k_categorical_accuracy.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git 
a/tests/ignite/metrics/test_mean_squared_error.py b/tests/ignite/metrics/test_mean_squared_error.py index 9d73e0361601..a824ce57d1c5 100644 --- a/tests/ignite/metrics/test_mean_squared_error.py +++ b/tests/ignite/metrics/test_mean_squared_error.py @@ -74,7 +74,7 @@ def _test_distrib_accumulator_device(device): for metric_device in [torch.device("cpu"), idist.device()]: device = torch.device(device) - mse = MeanSquaredError(device=device) + mse = MeanSquaredError(device=metric_device) assert mse._device == device assert mse._sum_of_squared_errors.device == metric_device, "{}:{} vs {}:{}".format( type(mse._sum_of_squared_errors.device), diff --git a/tests/ignite/metrics/test_precision.py b/tests/ignite/metrics/test_precision.py index 3abdb420fa81..1fb5b0fd4a5f 100644 --- a/tests/ignite/metrics/test_precision.py +++ b/tests/ignite/metrics/test_precision.py @@ -748,7 +748,7 @@ def update(engine, i): assert "pr" in engine.state.metrics res = engine.state.metrics["pr"] if isinstance(res, torch.Tensor): - assert res.device.type == "cpu" + assert res.device == metric_device res = res.cpu().numpy() true_res = precision_score( @@ -758,7 +758,7 @@ def update(engine, i): assert pytest.approx(res) == true_res for _ in range(2): - for metric_device in ["cpu", idist.device()]: + for metric_device in [torch.device("cpu"), idist.device()]: _test(average=True, n_epochs=1, metric_device=metric_device) _test(average=True, n_epochs=2, metric_device=metric_device) _test(average=False, n_epochs=1, metric_device=metric_device) diff --git a/tests/ignite/metrics/test_recall.py b/tests/ignite/metrics/test_recall.py index 65564b8071bb..a170f3c0ac6d 100644 --- a/tests/ignite/metrics/test_recall.py +++ b/tests/ignite/metrics/test_recall.py @@ -748,7 +748,7 @@ def update(engine, i): assert "re" in engine.state.metrics res = engine.state.metrics["re"] if isinstance(res, torch.Tensor): - assert res.device.type == "cpu" + assert res.device == metric_device res = res.cpu().numpy() true_res = recall_score( @@ -758,7 +758,7 @@ def update(engine, i): assert pytest.approx(res) == true_res for _ in range(2): - for metric_device in ["cpu", idist.device()]: + for metric_device in [torch.device("cpu"), idist.device()]: _test(average=True, n_epochs=1, metric_device=metric_device) _test(average=True, n_epochs=2, metric_device=metric_device) _test(average=False, n_epochs=1, metric_device=metric_device) diff --git a/tests/ignite/metrics/test_top_k_categorical_accuracy.py b/tests/ignite/metrics/test_top_k_categorical_accuracy.py index 209a6bf25fca..e244e48a7367 100644 --- a/tests/ignite/metrics/test_top_k_categorical_accuracy.py +++ b/tests/ignite/metrics/test_top_k_categorical_accuracy.py @@ -104,7 +104,7 @@ def _test_distrib_accumulator_device(device): for metric_device in [torch.device("cpu"), idist.device()]: - acc = TopKCategoricalAccuracy(2, device=device) + acc = TopKCategoricalAccuracy(2, device=metric_device) assert acc._device == metric_device assert acc._num_correct.device == metric_device, "{}:{} vs {}:{}".format( type(acc._num_correct.device), acc._num_correct.device, type(metric_device), metric_device From 1361866f610a2307d8cf2fe629ee5ea9c2f0109d Mon Sep 17 00:00:00 2001 From: Nicholas Vadivelu Date: Mon, 17 Aug 2020 23:49:27 -0400 Subject: [PATCH 30/32] fix TPU tests --- tests/ignite/metrics/test_mean_squared_error.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ignite/metrics/test_mean_squared_error.py b/tests/ignite/metrics/test_mean_squared_error.py index a824ce57d1c5..e8bb18777e84 100644 --- 
a/tests/ignite/metrics/test_mean_squared_error.py +++ b/tests/ignite/metrics/test_mean_squared_error.py @@ -75,7 +75,7 @@ def _test_distrib_accumulator_device(device): device = torch.device(device) mse = MeanSquaredError(device=metric_device) - assert mse._device == device + assert mse._device == metric_device assert mse._sum_of_squared_errors.device == metric_device, "{}:{} vs {}:{}".format( type(mse._sum_of_squared_errors.device), mse._sum_of_squared_errors.device, From 556262bcd15cec64119eea4b22036acb51bfffd9 Mon Sep 17 00:00:00 2001 From: vfdev Date: Tue, 18 Aug 2020 09:25:31 +0200 Subject: [PATCH 31/32] Apply suggestions from code review --- ignite/metrics/confusion_matrix.py | 2 +- ignite/metrics/running_average.py | 2 +- tests/ignite/metrics/test_top_k_categorical_accuracy.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ignite/metrics/confusion_matrix.py b/ignite/metrics/confusion_matrix.py index b22f18b8e0e5..3d179a458d62 100644 --- a/ignite/metrics/confusion_matrix.py +++ b/ignite/metrics/confusion_matrix.py @@ -31,7 +31,7 @@ class ConfusionMatrix(Metric): form expected by the metric. This can be useful if, for example, you have a multi-output model and you want to compute the metric with respect to one of the outputs. device (str or torch.device): specifies which device updates are accumulated on. Setting the metric's - device to be the same as your `update` arguments ensures the `update` method is non-blocking. By + device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. Note: diff --git a/ignite/metrics/running_average.py b/ignite/metrics/running_average.py index 353c00bbc27f..0c30f3920171 100644 --- a/ignite/metrics/running_average.py +++ b/ignite/metrics/running_average.py @@ -21,7 +21,7 @@ class RunningAverage(Metric): epoch_bound (boolean, optional): whether the running average should be reset after each epoch (defaults to True). device (str or torch.device, optional): specifies which device updates are accumulated on. Should be - None when `src` is an instance of :class:`~ignite.metrics.Metric`, as the running average will + None when ``src`` is an instance of :class:`~ignite.metrics.Metric`, as the running average will use the `src`'s device. Otherwise, defaults to CPU. Only applicable when the computed value from the metric is a tensor. diff --git a/tests/ignite/metrics/test_top_k_categorical_accuracy.py b/tests/ignite/metrics/test_top_k_categorical_accuracy.py index e244e48a7367..b16cd4efce3a 100644 --- a/tests/ignite/metrics/test_top_k_categorical_accuracy.py +++ b/tests/ignite/metrics/test_top_k_categorical_accuracy.py @@ -94,7 +94,7 @@ def update(engine, i): assert pytest.approx(res) == true_res - for _ in range(5): + for _ in range(3): for metric_device in ["cpu", idist.device()]: _test(n_epochs=1, metric_device=metric_device) _test(n_epochs=2, metric_device=metric_device) From 489620b15067bd978193cdf9e2263d6cf99b38c5 Mon Sep 17 00:00:00 2001 From: vfdev Date: Tue, 18 Aug 2020 09:31:15 +0200 Subject: [PATCH 32/32] Apply suggestions from code review --- ignite/metrics/running_average.py | 2 +- tests/ignite/metrics/test_metrics_lambda.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ignite/metrics/running_average.py b/ignite/metrics/running_average.py index 0c30f3920171..0fa1216c7940 100644 --- a/ignite/metrics/running_average.py +++ b/ignite/metrics/running_average.py @@ -22,7 +22,7 @@ class RunningAverage(Metric): to True). 
device (str or torch.device, optional): specifies which device updates are accumulated on. Should be None when ``src`` is an instance of :class:`~ignite.metrics.Metric`, as the running average will - use the `src`'s device. Otherwise, defaults to CPU. Only applicable when the computed value + use the ``src``'s device. Otherwise, defaults to CPU. Only applicable when the computed value from the metric is a tensor. diff --git a/tests/ignite/metrics/test_metrics_lambda.py b/tests/ignite/metrics/test_metrics_lambda.py index 6c8b65a86ec8..3e98b9088967 100644 --- a/tests/ignite/metrics/test_metrics_lambda.py +++ b/tests/ignite/metrics/test_metrics_lambda.py @@ -366,7 +366,7 @@ def Fbeta(r, p, beta): assert f1_true == approx(state.metrics["f1"]) assert 1.0 + f1_true == approx(state.metrics["ff1"]) - for _ in range(5): + for _ in range(3): _test("cpu") _test(idist.device())
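
For reference, the device-handling pattern that these patches converge on can be
sketched in isolation as follows. This is a minimal sketch, not code from the
series: `SquaredErrorSum` is a hypothetical stand-in for an ignite Metric
subclass, and it deliberately omits the real `Metric` base class and the
`@reinit__is_reduced` / `@sync_all_reduce` decorators so that it runs
standalone. The convention it illustrates is the one applied throughout the
series: allocate each accumulator as a tensor on the metric's `_device` in
`reset()`, accumulate on that device in `update()`, and call `.item()` only in
`compute()`.

import torch

class SquaredErrorSum:
    # Hypothetical, minimal stand-in for an ignite metric; it demonstrates
    # device-aware accumulation only, not the real Metric API.
    def __init__(self, device="cpu"):
        self._device = torch.device(device)
        self.reset()

    def reset(self):
        # Allocate the accumulator on the metric's device up front, so every
        # later in-place add stays on that device.
        self._sum_of_squared_errors = torch.tensor(0.0, device=self._device)
        self._num_examples = 0

    def update(self, output):
        y_pred, y = output
        squared_errors = torch.pow(y_pred - y.view_as(y_pred), 2)
        # Move the per-batch partial sum to the accumulator's device once,
        # then accumulate in place.
        self._sum_of_squared_errors += torch.sum(squared_errors).to(self._device)
        self._num_examples += y.shape[0]

    def compute(self):
        if self._num_examples == 0:
            raise RuntimeError("must have at least one example before compute()")
        # Convert to a Python number only at compute time, mirroring the
        # .item() calls the series moves out of update().
        return self._sum_of_squared_errors.item() / self._num_examples

# Test pattern used throughout the patches: run once per candidate device
# (under ignite the list would be [torch.device("cpu"), idist.device()]).
for metric_device in [torch.device("cpu")]:
    m = SquaredErrorSum(device=metric_device)
    m.update((torch.tensor([[2.0], [-2.0]]), torch.zeros(2)))
    assert m._sum_of_squared_errors.device == metric_device, "{} vs {}".format(
        m._sum_of_squared_errors.device, metric_device
    )
    assert m.compute() == 4.0

Accumulating on `self._device` and deferring `.item()` keeps `update()` free of
per-batch GPU-to-CPU synchronization, which is exactly what the
`_test_distrib_accumulator_device` helpers in these patches verify by checking
the accumulator's device after construction and again after the first
`update()` call.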