From 28956d5aca8c7085987d2f96fc88091bf7e6fdc8 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Wed, 26 Feb 2020 00:00:18 -0500 Subject: [PATCH 1/7] Make most metrics work on GPU --- .../tests/training/metrics/entropy_test.py | 4 +- .../training/metrics/attachment_scores.py | 7 ++- allennlp/training/metrics/auc.py | 11 +++-- allennlp/training/metrics/average.py | 2 +- allennlp/training/metrics/bleu.py | 4 +- allennlp/training/metrics/boolean_accuracy.py | 4 +- .../training/metrics/categorical_accuracy.py | 4 +- .../training/metrics/conll_coref_scores.py | 47 +++++++++---------- allennlp/training/metrics/covariance.py | 2 +- allennlp/training/metrics/entropy.py | 4 +- allennlp/training/metrics/fbeta_measure.py | 16 +++---- .../training/metrics/mean_absolute_error.py | 4 +- allennlp/training/metrics/mention_recall.py | 4 +- allennlp/training/metrics/metric.py | 11 +++-- .../training/metrics/pearson_correlation.py | 2 +- .../training/metrics/sequence_accuracy.py | 4 +- .../training/metrics/span_based_f1_measure.py | 8 ++-- .../training/metrics/spearman_correlation.py | 9 ++-- allennlp/training/metrics/srl_eval_scorer.py | 6 +-- allennlp/training/metrics/unigram_recall.py | 8 ++-- 20 files changed, 83 insertions(+), 78 deletions(-) diff --git a/allennlp/tests/training/metrics/entropy_test.py b/allennlp/tests/training/metrics/entropy_test.py index 97040b1c97e..398a4ff51cb 100644 --- a/allennlp/tests/training/metrics/entropy_test.py +++ b/allennlp/tests/training/metrics/entropy_test.py @@ -16,11 +16,11 @@ def test_entropy_for_uniform_distribution(self): metric = Entropy() logits = torch.Tensor([[1, 1, 1, 1], [1, 1, 1, 1]]) metric(logits) - numpy.testing.assert_almost_equal(metric.get_metric(), 1.38629436) + numpy.testing.assert_almost_equal(metric.get_metric().cpu(), 1.38629436) # actual values shouldn't effect uniform distribution: logits = torch.Tensor([[2, 2, 2, 2], [2, 2, 2, 2]]) metric(logits) - numpy.testing.assert_almost_equal(metric.get_metric(), 1.38629436) + numpy.testing.assert_almost_equal(metric.get_metric().cpu(), 1.38629436) metric.reset() assert metric._entropy == 0.0 diff --git a/allennlp/training/metrics/attachment_scores.py b/allennlp/training/metrics/attachment_scores.py index 29a970dbb50..f94ea054d49 100644 --- a/allennlp/training/metrics/attachment_scores.py +++ b/allennlp/training/metrics/attachment_scores.py @@ -53,10 +53,13 @@ def __call__( # type: ignore mask : `torch.Tensor`, optional (default = None). A tensor of the same shape as `predicted_indices`. """ - unwrapped = self.unwrap_to_tensors( + detached = self.detach_tensors( predicted_indices, predicted_labels, gold_indices, gold_labels, mask ) - predicted_indices, predicted_labels, gold_indices, gold_labels, mask = unwrapped + predicted_indices, predicted_labels, gold_indices, gold_labels, mask = detached + + if mask is None: + mask = torch.ones_like(predicted_indices) mask = mask.long() predicted_indices = predicted_indices.long() diff --git a/allennlp/training/metrics/auc.py b/allennlp/training/metrics/auc.py index 4c8694ff26f..f3ddd243ec0 100644 --- a/allennlp/training/metrics/auc.py +++ b/allennlp/training/metrics/auc.py @@ -40,7 +40,7 @@ def __call__( A one-dimensional label tensor of shape (batch_size). """ - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) # Sanity checks. 
if gold_labels.dim() != 1: @@ -70,9 +70,12 @@ def __call__( if mask is None: batch_size = gold_labels.shape[0] - mask = torch.ones(batch_size) + mask = torch.ones(batch_size, device=gold_labels.device) mask = mask.to(dtype=torch.bool) + self._all_predictions = self._all_predictions.to(predictions.device) + self._all_gold_labels = self._all_gold_labels.to(gold_labels.device) + self._all_predictions = torch.cat( [self._all_predictions, torch.masked_select(predictions, mask).float()], dim=0 ) @@ -84,8 +87,8 @@ def get_metric(self, reset: bool = False): if self._all_gold_labels.shape[0] == 0: return 0.5 false_positive_rates, true_positive_rates, _ = metrics.roc_curve( - self._all_gold_labels.numpy(), - self._all_predictions.numpy(), + self._all_gold_labels.cpu().numpy(), + self._all_predictions.cpu().numpy(), pos_label=self._positive_label, ) auc = metrics.auc(false_positive_rates, true_positive_rates) diff --git a/allennlp/training/metrics/average.py b/allennlp/training/metrics/average.py index 8d860cbf87a..6652fbc9a52 100644 --- a/allennlp/training/metrics/average.py +++ b/allennlp/training/metrics/average.py @@ -24,7 +24,7 @@ def __call__(self, value): value : `float` The value to average. """ - self._total_value += list(self.unwrap_to_tensors(value))[0] + self._total_value += list(self.detach_tensors(value))[0] self._count += 1 @overrides diff --git a/allennlp/training/metrics/bleu.py b/allennlp/training/metrics/bleu.py index d2d389091f3..972293a5a9c 100644 --- a/allennlp/training/metrics/bleu.py +++ b/allennlp/training/metrics/bleu.py @@ -108,7 +108,7 @@ def _get_brevity_penalty(self) -> float: return math.exp(1.0 - self._reference_lengths / self._prediction_lengths) def _get_valid_tokens_mask(self, tensor: torch.LongTensor) -> torch.ByteTensor: - valid_tokens_mask = torch.ones(tensor.size(), dtype=torch.bool) + valid_tokens_mask = torch.ones_like(tensor, dtype=torch.bool) for index in self._exclude_indices: valid_tokens_mask = valid_tokens_mask & (tensor != index) return valid_tokens_mask @@ -133,7 +133,7 @@ def __call__( None """ - predictions, gold_targets = self.unwrap_to_tensors(predictions, gold_targets) + predictions, gold_targets = self.detach_tensors(predictions, gold_targets) for ngram_size, _ in enumerate(self._ngram_weights, start=1): precision_matches, precision_totals = self._get_modified_precision_counts( predictions, gold_targets, ngram_size diff --git a/allennlp/training/metrics/boolean_accuracy.py b/allennlp/training/metrics/boolean_accuracy.py index f36874fca44..5e3777fe60f 100644 --- a/allennlp/training/metrics/boolean_accuracy.py +++ b/allennlp/training/metrics/boolean_accuracy.py @@ -43,7 +43,7 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A tensor of the same shape as `predictions`. """ - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) # Some sanity checks. if gold_labels.size() != predictions.size(): @@ -69,7 +69,7 @@ def __call__( # so we'll keep predictions that aren't. 
keep = mask.view(batch_size, -1).max(dim=1)[0].float() else: - keep = torch.ones(batch_size).float() + keep = torch.ones(batch_size, device=predictions.device).float() predictions = predictions.view(batch_size, -1) gold_labels = gold_labels.view(batch_size, -1) diff --git a/allennlp/training/metrics/categorical_accuracy.py b/allennlp/training/metrics/categorical_accuracy.py index a1bc038e456..cc8aac66885 100644 --- a/allennlp/training/metrics/categorical_accuracy.py +++ b/allennlp/training/metrics/categorical_accuracy.py @@ -45,7 +45,7 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A masking tensor the same size as `gold_labels`. """ - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) # Some sanity checks. num_classes = predictions.size(-1) @@ -80,7 +80,7 @@ def __call__( # ith entry in gold_labels points to index (0-num_classes) for ith row in max_predictions # For each row check if index pointed by gold_label is was 1 or not (among max scored classes) correct = max_predictions_mask[ - torch.arange(gold_labels.numel()).long(), gold_labels + torch.arange(gold_labels.numel(), device=gold_labels.device).long(), gold_labels ].float() tie_counts = max_predictions_mask.sum(-1) correct /= tie_counts.float() diff --git a/allennlp/training/metrics/conll_coref_scores.py b/allennlp/training/metrics/conll_coref_scores.py index eb2fc71d7d2..1df68077df9 100644 --- a/allennlp/training/metrics/conll_coref_scores.py +++ b/allennlp/training/metrics/conll_coref_scores.py @@ -40,9 +40,15 @@ def __call__( A metadata dictionary for each instance in the batch. We use the "clusters" key from this dictionary, which has the annotated gold coreference clusters for that instance. """ - top_spans, antecedent_indices, predicted_antecedents = self.unwrap_to_tensors( + top_spans, antecedent_indices, predicted_antecedents = self.detach_tensors( top_spans, antecedent_indices, predicted_antecedents ) + + # They need to be in CPU because Scorer.ceafe uses a SciPy function. + top_spans = top_spans.cpu() + antecedent_indices = antecedent_indices.cpu() + predicted_antecedents = predicted_antecedents.cpu() + for i, metadata in enumerate(metadata_list): gold_clusters, mention_to_gold = self.get_gold_clusters(metadata["clusters"]) predicted_clusters, mention_to_predicted = self.get_predicted_clusters( @@ -78,17 +84,12 @@ def get_gold_clusters(gold_clusters): @staticmethod def get_predicted_clusters( - top_spans: torch.Tensor, - antecedent_indices: torch.Tensor, - predicted_antecedents: torch.Tensor, + top_spans: torch.Tensor, # (num_spans, 2) + antecedent_indices: torch.Tensor, # (num_spans, num_antecedents) + predicted_antecedents: torch.Tensor, # (num_spans,) ) -> Tuple[ List[Tuple[Tuple[int, int], ...]], Dict[Tuple[int, int], Tuple[Tuple[int, int], ...]] ]: - # Pytorch 0.4 introduced scalar tensors, so our calls to tuple() and such below don't - # actually give ints unless we convert to numpy first. So we do that here. - top_spans = top_spans.numpy() # (num_spans, 2) - antecedent_indices = antecedent_indices.numpy() # (num_spans, num_antecedents) - predicted_antecedents = predicted_antecedents.numpy() # (num_spans,) predicted_clusters_to_ids: Dict[Tuple[int, int], int] = {} clusters: List[List[Tuple[int, int]]] = [] @@ -100,7 +101,9 @@ def get_predicted_clusters( predicted_index = antecedent_indices[i, predicted_antecedent] # Must be a previous span. 
assert i > predicted_index - antecedent_span: Tuple[int, int] = tuple(top_spans[predicted_index]) # type: ignore + antecedent_span: Tuple[int, int] = tuple( # type: ignore + top_spans[predicted_index].tolist() + ) # Check if we've seen the span before. if antecedent_span in predicted_clusters_to_ids.keys(): @@ -111,7 +114,7 @@ def get_predicted_clusters( clusters.append([antecedent_span]) predicted_clusters_to_ids[antecedent_span] = predicted_cluster_id - mention: Tuple[int, int] = tuple(top_spans[i]) # type: ignore + mention: Tuple[int, int] = tuple(top_spans[i].tolist()) # type: ignore clusters[predicted_cluster_id].append(mention) predicted_clusters_to_ids[mention] = predicted_cluster_id @@ -150,29 +153,21 @@ def update(self, predicted, gold, mention_to_predicted, mention_to_gold): self.recall_denominator += r_den def get_f1(self): - precision = ( - 0 - if self.precision_denominator == 0 - else self.precision_numerator / float(self.precision_denominator) - ) - recall = ( - 0 - if self.recall_denominator == 0 - else self.recall_numerator / float(self.recall_denominator) - ) + precision = self.get_precision() + recall = self.get_recall() return 0 if precision + recall == 0 else 2 * precision * recall / (precision + recall) def get_recall(self): - if self.recall_numerator == 0: + if self.recall_denominator == 0: return 0 else: - return self.recall_numerator / float(self.recall_denominator) + return self.recall_numerator / self.recall_denominator def get_precision(self): - if self.precision_numerator == 0: + if self.precision_denominator == 0: return 0 else: - return self.precision_numerator / float(self.precision_denominator) + return self.precision_numerator / self.precision_denominator def get_prf(self): return self.get_precision(), self.get_recall(), self.get_f1() @@ -234,7 +229,7 @@ def phi4(gold_clustering, predicted_clustering): @staticmethod def ceafe(clusters, gold_clusters): """ - Computes the Constrained EntityAlignment F-Measure (CEAF) for evaluating coreference. + Computes the Constrained Entity-Alignment F-Measure (CEAF) for evaluating coreference. Gold and predicted mentions are aligned into clusterings which maximise a metric - in this case, the F measure between gold and predicted clusters. diff --git a/allennlp/training/metrics/covariance.py b/allennlp/training/metrics/covariance.py index c5bf26cabc8..ddb316b1458 100644 --- a/allennlp/training/metrics/covariance.py +++ b/allennlp/training/metrics/covariance.py @@ -48,7 +48,7 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A tensor of the same shape as `predictions`. """ - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) # Flatten predictions, gold_labels, and mask. We calculate the covariance between # the vectors, since each element in the predictions and gold_labels tensor is assumed # to be a separate observation. diff --git a/allennlp/training/metrics/entropy.py b/allennlp/training/metrics/entropy.py index 716c66c4819..72c2d6038f9 100644 --- a/allennlp/training/metrics/entropy.py +++ b/allennlp/training/metrics/entropy.py @@ -26,10 +26,10 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A masking tensor of shape (batch_size, ...). 
""" - logits, mask = self.unwrap_to_tensors(logits, mask) + logits, mask = self.detach_tensors(logits, mask) if mask is None: - mask = torch.ones(logits.size()[:-1]) + mask = torch.ones(logits.size()[:-1], device=logits.device) log_probs = torch.nn.functional.log_softmax(logits, dim=-1) probabilities = torch.exp(log_probs) * mask.unsqueeze(-1) diff --git a/allennlp/training/metrics/fbeta_measure.py b/allennlp/training/metrics/fbeta_measure.py index b47fb57ded4..a87f156ff9d 100644 --- a/allennlp/training/metrics/fbeta_measure.py +++ b/allennlp/training/metrics/fbeta_measure.py @@ -107,7 +107,7 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A masking tensor the same size as `gold_labels`. """ - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) # Calculate true_positive_sum, true_negative_sum, pred_sum, true_sum num_classes = predictions.size(-1) @@ -120,10 +120,10 @@ def __call__( # It means we call this metric at the first time # when `self._true_positive_sum` is None. if self._true_positive_sum is None: - self._true_positive_sum = torch.zeros(num_classes) - self._true_sum = torch.zeros(num_classes) - self._pred_sum = torch.zeros(num_classes) - self._total_sum = torch.zeros(num_classes) + self._true_positive_sum = torch.zeros(num_classes, device=predictions.device) + self._true_sum = torch.zeros(num_classes, device=predictions.device) + self._pred_sum = torch.zeros(num_classes, device=predictions.device) + self._total_sum = torch.zeros(num_classes, device=predictions.device) if mask is None: mask = torch.ones_like(gold_labels) @@ -137,7 +137,7 @@ def __call__( # Watch it: # The total numbers of true positives under all _predicted_ classes are zeros. if true_positives_bins.shape[0] == 0: - true_positive_sum = torch.zeros(num_classes) + true_positive_sum = torch.zeros(num_classes, device=predictions.device) else: true_positive_sum = torch.bincount( true_positives_bins.long(), minlength=num_classes @@ -149,13 +149,13 @@ def __call__( if pred_bins.shape[0] != 0: pred_sum = torch.bincount(pred_bins, minlength=num_classes).float() else: - pred_sum = torch.zeros(num_classes) + pred_sum = torch.zeros(num_classes, device=predictions.device) gold_labels_bins = gold_labels[mask].long() if gold_labels.shape[0] != 0: true_sum = torch.bincount(gold_labels_bins, minlength=num_classes).float() else: - true_sum = torch.zeros(num_classes) + true_sum = torch.zeros(num_classes, device=predictions.device) self._true_positive_sum += true_positive_sum self._pred_sum += pred_sum diff --git a/allennlp/training/metrics/mean_absolute_error.py b/allennlp/training/metrics/mean_absolute_error.py index 2166a4a9304..4d12ec7aad8 100644 --- a/allennlp/training/metrics/mean_absolute_error.py +++ b/allennlp/training/metrics/mean_absolute_error.py @@ -32,7 +32,7 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A tensor of the same shape as `predictions`. """ - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) absolute_errors = torch.abs(predictions - gold_labels) if mask is not None: @@ -48,7 +48,7 @@ def get_metric(self, reset: bool = False): The accumulated mean absolute error. 
""" - mean_absolute_error = float(self._absolute_error) / float(self._total_count) + mean_absolute_error = self._absolute_error / self._total_count if reset: self.reset() return mean_absolute_error diff --git a/allennlp/training/metrics/mention_recall.py b/allennlp/training/metrics/mention_recall.py index 5a08f404d12..01b72c8fcf3 100644 --- a/allennlp/training/metrics/mention_recall.py +++ b/allennlp/training/metrics/mention_recall.py @@ -18,7 +18,7 @@ def __call__( batched_top_spans: torch.Tensor, batched_metadata: List[Dict[str, Any]], ): - for top_spans, metadata in zip(batched_top_spans.data.tolist(), batched_metadata): + for top_spans, metadata in zip(batched_top_spans.tolist(), batched_metadata): gold_mentions: Set[Tuple[int, int]] = { mention for cluster in metadata["clusters"] for mention in cluster @@ -32,7 +32,7 @@ def get_metric(self, reset: bool = False) -> float: if self._num_gold_mentions == 0: recall = 0.0 else: - recall = self._num_recalled_mentions / float(self._num_gold_mentions) + recall = self._num_recalled_mentions / self._num_gold_mentions if reset: self.reset() return recall diff --git a/allennlp/training/metrics/metric.py b/allennlp/training/metrics/metric.py index 15b8f6b0dbf..fb6af0ef937 100644 --- a/allennlp/training/metrics/metric.py +++ b/allennlp/training/metrics/metric.py @@ -1,4 +1,5 @@ -from typing import Dict, Optional, Tuple, Union, List +from typing import Dict, Iterable, List, Optional, Tuple, Union + import torch from allennlp.common.registrable import Registrable @@ -41,11 +42,11 @@ def reset(self) -> None: raise NotImplementedError @staticmethod - def unwrap_to_tensors(*tensors: torch.Tensor): + def detach_tensors(*tensors: torch.Tensor) -> Iterable[torch.Tensor]: """ If you actually passed gradient-tracking Tensors to a Metric, there will be a huge memory leak, because it will prevent garbage collection for the computation - graph. This method ensures that you're using tensors directly and that they are on - the CPU. + graph. This method ensures the tensors are detached. """ - return (x.detach().cpu() if isinstance(x, torch.Tensor) else x for x in tensors) + # Check if it's actually a tensor in case something else was passed. + return (x.detach() if isinstance(x, torch.Tensor) else x for x in tensors) diff --git a/allennlp/training/metrics/pearson_correlation.py b/allennlp/training/metrics/pearson_correlation.py index 9791745064b..846900b7185 100644 --- a/allennlp/training/metrics/pearson_correlation.py +++ b/allennlp/training/metrics/pearson_correlation.py @@ -55,7 +55,7 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A tensor of the same shape as `predictions`. """ - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) self._predictions_labels_covariance(predictions, gold_labels, mask) self._predictions_variance(predictions, predictions, mask) self._labels_variance(gold_labels, gold_labels, mask) diff --git a/allennlp/training/metrics/sequence_accuracy.py b/allennlp/training/metrics/sequence_accuracy.py index 6811dbabebc..983ed2401b9 100644 --- a/allennlp/training/metrics/sequence_accuracy.py +++ b/allennlp/training/metrics/sequence_accuracy.py @@ -34,7 +34,7 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A masking tensor the same size as `gold_labels`. 
""" - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) # Some sanity checks. if gold_labels.dim() != predictions.dim() - 1: @@ -76,7 +76,7 @@ def get_metric(self, reset: bool = False): The accumulated accuracy. """ if self.total_count > 0: - accuracy = float(self.correct_count) / float(self.total_count) + accuracy = self.correct_count / self.total_count else: accuracy = 0 diff --git a/allennlp/training/metrics/span_based_f1_measure.py b/allennlp/training/metrics/span_based_f1_measure.py index 0f1c93b7d5d..01f695240ab 100644 --- a/allennlp/training/metrics/span_based_f1_measure.py +++ b/allennlp/training/metrics/span_based_f1_measure.py @@ -125,7 +125,7 @@ def __call__( if mask is None: mask = torch.ones_like(gold_labels) - predictions, gold_labels, mask, prediction_map = self.unwrap_to_tensors( + predictions, gold_labels, mask, prediction_map = self.detach_tensors( predictions, gold_labels, mask, prediction_map ) @@ -277,9 +277,9 @@ def get_metric(self, reset: bool = False): @staticmethod def _compute_metrics(true_positives: int, false_positives: int, false_negatives: int): - precision = float(true_positives) / float(true_positives + false_positives + 1e-13) - recall = float(true_positives) / float(true_positives + false_negatives + 1e-13) - f1_measure = 2.0 * ((precision * recall) / (precision + recall + 1e-13)) + precision = true_positives / (true_positives + false_positives + 1e-13) + recall = true_positives / (true_positives + false_negatives + 1e-13) + f1_measure = 2.0 * (precision * recall) / (precision + recall + 1e-13) return precision, recall, f1_measure def reset(self): diff --git a/allennlp/training/metrics/spearman_correlation.py b/allennlp/training/metrics/spearman_correlation.py index 5f4f10efd4f..0f7314a0738 100644 --- a/allennlp/training/metrics/spearman_correlation.py +++ b/allennlp/training/metrics/spearman_correlation.py @@ -40,13 +40,16 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A tensor of the same shape as `predictions`. """ - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) - # Flatten predictions, gold_labels, and mask. We calculate the spearman correlation between + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) + # Flatten predictions, gold_labels, and mask. We calculate the Spearman correlation between # the vectors, since each element in the predictions and gold_labels tensor is assumed # to be a separate observation. predictions = predictions.view(-1) gold_labels = gold_labels.view(-1) + self.total_predictions = self.total_predictions.to(predictions.device) + self.total_gold_labels = self.total_gold_labels.to(gold_labels.device) + if mask is not None: mask = mask.view(-1) self.total_predictions = torch.cat((self.total_predictions, predictions * mask), 0) @@ -63,7 +66,7 @@ def get_metric(self, reset: bool = False): The accumulated sample Spearman correlation. 
""" spearman_correlation = stats.spearmanr( - self.total_predictions.numpy(), self.total_gold_labels.numpy() + self.total_predictions.cpu().numpy(), self.total_gold_labels.cpu().numpy() ) if reset: diff --git a/allennlp/training/metrics/srl_eval_scorer.py b/allennlp/training/metrics/srl_eval_scorer.py index f5616efb958..97ee34bc742 100644 --- a/allennlp/training/metrics/srl_eval_scorer.py +++ b/allennlp/training/metrics/srl_eval_scorer.py @@ -171,9 +171,9 @@ def get_metric(self, reset: bool = False): @staticmethod def _compute_metrics(true_positives: int, false_positives: int, false_negatives: int): - precision = float(true_positives) / float(true_positives + false_positives + 1e-13) - recall = float(true_positives) / float(true_positives + false_negatives + 1e-13) - f1_measure = 2.0 * ((precision * recall) / (precision + recall + 1e-13)) + precision = true_positives / (true_positives + false_positives + 1e-13) + recall = true_positives / (true_positives + false_negatives + 1e-13) + f1_measure = 2.0 * (precision * recall) / (precision + recall + 1e-13) return precision, recall, f1_measure def reset(self): diff --git a/allennlp/training/metrics/unigram_recall.py b/allennlp/training/metrics/unigram_recall.py index 5a070c1a0cc..e4e1a6043d6 100644 --- a/allennlp/training/metrics/unigram_recall.py +++ b/allennlp/training/metrics/unigram_recall.py @@ -38,7 +38,7 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A masking tensor the same size as `gold_labels`. """ - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) # Some sanity checks. if gold_labels.dim() != predictions.dim() - 1: @@ -71,8 +71,8 @@ def __call__( # word is from cleaned gold which doesn't have 0 or # end_index, so we don't need to explicitly remove those # from beam. - if stillsearch and (word in beam): - retval += 1.0 / float(len(cleaned_gold)) + if stillsearch and word in beam: + retval += 1 / len(cleaned_gold) stillsearch = False correct += retval @@ -85,7 +85,7 @@ def get_metric(self, reset: bool = False): The accumulated recall. 
""" - recall = float(self.correct_count) / float(self.total_count) if self.total_count > 0 else 0 + recall = self.correct_count / self.total_count if self.total_count > 0 else 0 if reset: self.reset() return recall From b7d89c19e699ab526119c7f40cb717241150a254 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Thu, 27 Feb 2020 01:55:11 -0500 Subject: [PATCH 2/7] Make metric tests work on both GPU and CPU --- allennlp/common/testing/__init__.py | 2 +- allennlp/common/testing/test_case.py | 34 +++ .../metrics/attachment_scores_test.py | 29 ++- allennlp/tests/training/metrics/auc_test.py | 57 ++--- allennlp/tests/training/metrics/bleu_test.py | 39 ++-- .../training/metrics/boolean_accuracy_test.py | 47 +++-- .../metrics/categorical_accuracy_test.py | 108 ++++++---- .../metrics/conll_coref_scores_test.py | 11 +- .../tests/training/metrics/covariance_test.py | 62 +++--- .../tests/training/metrics/entropy_test.py | 33 +-- .../tests/training/metrics/f1_measure_test.py | 93 +++++---- .../training/metrics/fbeta_measure_test.py | 196 +++++++++++------- .../metrics/mean_absolute_error_test.py | 21 +- .../metrics/pearson_correlation_test.py | 93 ++++----- .../metrics/sequence_accuracy_test.py | 46 ++-- .../metrics/span_based_f1_measure_test.py | 148 +++++++------ .../metrics/spearman_correlation_test.py | 68 +++--- .../training/metrics/unigram_recall_test.py | 46 ++-- .../training/metrics/spearman_correlation.py | 6 +- 19 files changed, 658 insertions(+), 481 deletions(-) diff --git a/allennlp/common/testing/__init__.py b/allennlp/common/testing/__init__.py index 5a9246987c5..7157fb06218 100644 --- a/allennlp/common/testing/__init__.py +++ b/allennlp/common/testing/__init__.py @@ -1,5 +1,5 @@ """ Utilities and helpers for writing tests. """ -from allennlp.common.testing.test_case import AllenNlpTestCase +from allennlp.common.testing.test_case import AllenNlpTestCase, multi_device from allennlp.common.testing.model_test_case import ModelTestCase diff --git a/allennlp/common/testing/test_case.py b/allennlp/common/testing/test_case.py index 7951641b6f3..56ea99eba7e 100644 --- a/allennlp/common/testing/test_case.py +++ b/allennlp/common/testing/test_case.py @@ -3,8 +3,11 @@ import pathlib import shutil import tempfile +from typing import Any, Iterable from unittest import TestCase +import torch + from allennlp.common.checks import log_pytorch_version_info TEST_DIR = tempfile.mkdtemp(prefix="allennlp_tests") @@ -40,3 +43,34 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.TEST_DIR) + + +def parametrize(arg_names: Iterable[str], arg_values: Iterable[Iterable[Any]]): + """ + Decorator to create parameterized tests. + + # Parameters + + arg_names : `Iterable[str]`, required. + Argument names to pass to the test function. + arg_values : `Iterable[Iterable[Any]]`, required. + Iterable of values to pass to each of the args. + A function call is gonna be made for each inner iterable. + """ + + def decorator(func): + def wrapper(*args, **kwargs): + for arg_value in arg_values: + kwargs_extra = {name: value for name, value in zip(arg_names, arg_value)} + func(*args, **kwargs, **kwargs_extra) + + return wrapper + + return decorator + + +_available_devices = ["cpu"] + (["cuda"] if torch.cuda.is_available() else []) +multi_device = parametrize(("device",), [(device,) for device in _available_devices]) +""" +Decorator that provides an argument `device` of type `str` for each available PyTorch device. 
+""" diff --git a/allennlp/tests/training/metrics/attachment_scores_test.py b/allennlp/tests/training/metrics/attachment_scores_test.py index d80fefa6b44..58d55dca3d0 100644 --- a/allennlp/tests/training/metrics/attachment_scores_test.py +++ b/allennlp/tests/training/metrics/attachment_scores_test.py @@ -1,6 +1,6 @@ import torch -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import AttachmentScores @@ -19,7 +19,17 @@ def setUp(self): self.mask = torch.Tensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 0, 0]]) - def test_perfect_scores(self): + def _send_tensors_to_device(self, device: str): + self.predictions = self.predictions.to(device) + self.gold_indices = self.gold_indices.to(device) + self.label_predictions = self.label_predictions.to(device) + self.gold_labels = self.gold_labels.to(device) + self.mask = self.mask.to(device) + + @multi_device + def test_perfect_scores(self, device: str): + self._send_tensors_to_device(device) + self.scorer( self.predictions, self.label_predictions, self.gold_indices, self.gold_labels, self.mask ) @@ -27,7 +37,10 @@ def test_perfect_scores(self): for value in self.scorer.get_metric().values(): assert value == 1.0 - def test_unlabeled_accuracy_ignores_incorrect_labels(self): + @multi_device + def test_unlabeled_accuracy_ignores_incorrect_labels(self, device: str): + self._send_tensors_to_device(device) + label_predictions = self.label_predictions # Change some stuff so our 4 of our label predictions are wrong. label_predictions[0, 3:] = 3 @@ -47,7 +60,10 @@ def test_unlabeled_accuracy_ignores_incorrect_labels(self): # Neither should have labeled exact match. assert metrics["LEM"] == 0.0 - def test_labeled_accuracy_is_affected_by_incorrect_heads(self): + @multi_device + def test_labeled_accuracy_is_affected_by_incorrect_heads(self, device: str): + self._send_tensors_to_device(device) + predictions = self.predictions # Change some stuff so our 4 of our predictions are wrong. 
predictions[0, 3:] = 3 @@ -71,7 +87,10 @@ def test_labeled_accuracy_is_affected_by_incorrect_heads(self): assert metrics["LEM"] == 0.0 assert metrics["UEM"] == 0.0 - def test_attachment_scores_can_ignore_labels(self): + @multi_device + def test_attachment_scores_can_ignore_labels(self, device: str): + self._send_tensors_to_device(device) + scorer = AttachmentScores(ignore_classes=[1]) label_predictions = self.label_predictions diff --git a/allennlp/tests/training/metrics/auc_test.py b/allennlp/tests/training/metrics/auc_test.py index b129a9e0ee2..dfd33bde6d3 100644 --- a/allennlp/tests/training/metrics/auc_test.py +++ b/allennlp/tests/training/metrics/auc_test.py @@ -1,21 +1,22 @@ +import pytest import torch from sklearn import metrics -from numpy.testing import assert_almost_equal -import pytest +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase -from allennlp.training.metrics import Auc from allennlp.common.checks import ConfigurationError +from allennlp.common.testing import AllenNlpTestCase, multi_device +from allennlp.training.metrics import Auc class AucTest(AllenNlpTestCase): - def test_auc_computation(self): + @multi_device + def test_auc_computation(self, device: str): auc = Auc() all_predictions = [] all_labels = [] for _ in range(5): - predictions = torch.randn(8).float() - labels = torch.randint(0, 2, (8,)).long() + predictions = torch.randn(8, device=device) + labels = torch.randint(0, 2, (8,), dtype=torch.long, device=device) auc(predictions, labels) @@ -25,62 +26,66 @@ def test_auc_computation(self): computed_auc_value = auc.get_metric(reset=True) false_positive_rates, true_positive_rates, _ = metrics.roc_curve( - torch.cat(all_labels, dim=0).numpy(), torch.cat(all_predictions, dim=0).numpy() + torch.cat(all_labels, dim=0).cpu().numpy(), + torch.cat(all_predictions, dim=0).cpu().numpy(), ) real_auc_value = metrics.auc(false_positive_rates, true_positive_rates) - assert_almost_equal(real_auc_value, computed_auc_value) + assert_allclose(real_auc_value, computed_auc_value) # One more computation to assure reset works. - predictions = torch.randn(8).float() - labels = torch.randint(0, 2, (8,)).long() + predictions = torch.randn(8, device=device) + labels = torch.randint(0, 2, (8,), dtype=torch.long, device=device) auc(predictions, labels) computed_auc_value = auc.get_metric(reset=True) false_positive_rates, true_positive_rates, _ = metrics.roc_curve( - labels.numpy(), predictions.numpy() + labels.cpu().numpy(), predictions.cpu().numpy() ) real_auc_value = metrics.auc(false_positive_rates, true_positive_rates) - assert_almost_equal(real_auc_value, computed_auc_value) + assert_allclose(real_auc_value, computed_auc_value) - def test_auc_gold_labels_behaviour(self): + @multi_device + def test_auc_gold_labels_behaviour(self, device: str): # Check that it works with different pos_label auc = Auc(positive_label=4) - predictions = torch.randn(8).float() - labels = torch.randint(3, 5, (8,)).long() + predictions = torch.randn(8, device=device) + labels = torch.randint(3, 5, (8,), dtype=torch.long, device=device) # We make sure that the positive label is always present. 
labels[0] = 4 auc(predictions, labels) computed_auc_value = auc.get_metric(reset=True) false_positive_rates, true_positive_rates, _ = metrics.roc_curve( - labels.numpy(), predictions.numpy(), pos_label=4 + labels.cpu().numpy(), predictions.cpu().numpy(), pos_label=4 ) real_auc_value = metrics.auc(false_positive_rates, true_positive_rates) - assert_almost_equal(real_auc_value, computed_auc_value) + assert_allclose(real_auc_value, computed_auc_value) # Check that it errs on getting more than 2 labels. with pytest.raises(ConfigurationError) as _: - labels = torch.LongTensor([3, 4, 5, 6, 7, 8, 9, 10]) + labels = torch.tensor([3, 4, 5, 6, 7, 8, 9, 10], device=device) auc(predictions, labels) - def test_auc_with_mask(self): + @multi_device + def test_auc_with_mask(self, device: str): auc = Auc() - predictions = torch.randn(8).float() - labels = torch.randint(0, 2, (8,)).long() - mask = torch.ByteTensor([1, 1, 1, 1, 0, 0, 0, 0]) + predictions = torch.randn(8, device=device) + labels = torch.randint(0, 2, (8,), dtype=torch.long, device=device) + mask = torch.tensor([1, 1, 1, 1, 0, 0, 0, 0], dtype=torch.uint8, device=device) auc(predictions, labels, mask) computed_auc_value = auc.get_metric(reset=True) false_positive_rates, true_positive_rates, _ = metrics.roc_curve( - labels[:4].numpy(), predictions[:4].numpy() + labels[:4].cpu().numpy(), predictions[:4].cpu().numpy() ) real_auc_value = metrics.auc(false_positive_rates, true_positive_rates) - assert_almost_equal(real_auc_value, computed_auc_value) + assert_allclose(real_auc_value, computed_auc_value) - def test_auc_works_without_calling_metric_at_all(self): + @multi_device + def test_auc_works_without_calling_metric_at_all(self, device: str): auc = Auc() auc.get_metric() diff --git a/allennlp/tests/training/metrics/bleu_test.py b/allennlp/tests/training/metrics/bleu_test.py index 643c1e9a66a..536db7c3789 100644 --- a/allennlp/tests/training/metrics/bleu_test.py +++ b/allennlp/tests/training/metrics/bleu_test.py @@ -1,10 +1,10 @@ -from collections import Counter import math +from collections import Counter -import numpy as np import torch +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import BLEU @@ -13,15 +13,16 @@ def setUp(self): super().setUp() self.metric = BLEU(ngram_weights=(0.5, 0.5), exclude_indices={0}) - def test_get_valid_tokens_mask(self): - tensor = torch.tensor([[1, 2, 3, 0], [0, 1, 1, 0]]) - result = self.metric._get_valid_tokens_mask(tensor) - result = result.long().numpy() - check = np.array([[1, 1, 1, 0], [0, 1, 1, 0]]) - np.testing.assert_array_equal(result, check) + @multi_device + def test_get_valid_tokens_mask(self, device: str): + tensor = torch.tensor([[1, 2, 3, 0], [0, 1, 1, 0]], device=device) + result = self.metric._get_valid_tokens_mask(tensor).long() + check = torch.tensor([[1, 1, 1, 0], [0, 1, 1, 0]], device=device) + assert_allclose(result, check) - def test_ngrams(self): - tensor = torch.tensor([1, 2, 3, 1, 2, 0]) + @multi_device + def test_ngrams(self, device: str): + tensor = torch.tensor([1, 2, 3, 1, 2, 0], device=device) # Unigrams. 
counts = Counter(self.metric._ngrams(tensor, 1)) @@ -42,14 +43,15 @@ def test_ngrams(self): counts = Counter(self.metric._ngrams(tensor, 7)) assert counts == {} - def test_bleu_computed_correctly(self): + @multi_device + def test_bleu_computed_correctly(self, device: str): self.metric.reset() # shape: (batch_size, max_sequence_length) - predictions = torch.tensor([[1, 0, 0], [1, 1, 0], [1, 1, 1]]) + predictions = torch.tensor([[1, 0, 0], [1, 1, 0], [1, 1, 1]], device=device) # shape: (batch_size, max_gold_sequence_length) - gold_targets = torch.tensor([[2, 0, 0], [1, 0, 0], [1, 1, 2]]) + gold_targets = torch.tensor([[2, 0, 0], [1, 0, 0], [1, 1, 2]], device=device) self.metric(predictions, gold_targets) @@ -57,7 +59,7 @@ def test_bleu_computed_correctly(self): assert self.metric._reference_lengths == 5 # Number of unigrams in predicted sentences that match gold sentences - # (but not more than maximum occurence of gold unigram within batch). + # (but not more than maximum occurrence of gold unigram within batch). assert self.metric._precision_matches[1] == ( 0 + 1 # no matches in first sentence. @@ -68,7 +70,7 @@ def test_bleu_computed_correctly(self): assert self.metric._precision_totals[1] == (1 + 2 + 3) # Number of bigrams in predicted sentences that match gold sentences - # (but not more than maximum occurence of gold bigram within batch). + # (but not more than maximum occurrence of gold bigram within batch). assert self.metric._precision_matches[2] == (0 + 0 + 1) # Total number of predicted bigrams. @@ -79,8 +81,9 @@ def test_bleu_computed_correctly(self): bleu = self.metric.get_metric(reset=True)["BLEU"] check = math.exp(0.5 * (math.log(3) - math.log(6)) + 0.5 * (math.log(1) - math.log(3))) - np.testing.assert_approx_equal(bleu, check) + assert_allclose(bleu, check) - def test_bleu_computed_with_zero_counts(self): + @multi_device + def test_bleu_computed_with_zero_counts(self, device: str): self.metric.reset() assert self.metric.get_metric()["BLEU"] == 0 diff --git a/allennlp/tests/training/metrics/boolean_accuracy_test.py b/allennlp/tests/training/metrics/boolean_accuracy_test.py index bb111edc3fa..bb332b74261 100644 --- a/allennlp/tests/training/metrics/boolean_accuracy_test.py +++ b/allennlp/tests/training/metrics/boolean_accuracy_test.py @@ -1,57 +1,62 @@ import torch import pytest -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import BooleanAccuracy class BooleanAccuracyTest(AllenNlpTestCase): - def test_accuracy_computation(self): + @multi_device + def test_accuracy_computation(self, device: str): accuracy = BooleanAccuracy() - predictions = torch.Tensor([[0, 1], [2, 3], [4, 5], [6, 7]]) - targets = torch.Tensor([[0, 1], [2, 2], [4, 5], [7, 7]]) + predictions = torch.tensor([[0, 1], [2, 3], [4, 5], [6, 7]], device=device) + targets = torch.tensor([[0, 1], [2, 2], [4, 5], [7, 7]], device=device) accuracy(predictions, targets) - assert accuracy.get_metric() == 2.0 / 4 + assert accuracy.get_metric() == 2 / 4 - mask = torch.ones(4, 2) + mask = torch.ones(4, 2, device=device) mask[1, 1] = 0 accuracy(predictions, targets, mask) - assert accuracy.get_metric() == 5.0 / 8 + assert accuracy.get_metric() == 5 / 8 targets[1, 1] = 3 accuracy(predictions, targets) - assert accuracy.get_metric() == 8.0 / 12 + assert accuracy.get_metric() == 8 / 12 accuracy.reset() accuracy(predictions, targets) - assert accuracy.get_metric() == 3.0 / 4 + assert accuracy.get_metric() == 3 / 4 - def 
test_skips_completely_masked_instances(self): + @multi_device + def test_skips_completely_masked_instances(self, device: str): accuracy = BooleanAccuracy() - predictions = torch.Tensor([[0, 1], [2, 3], [4, 5], [6, 7]]) - targets = torch.Tensor([[0, 1], [2, 2], [4, 5], [7, 7]]) + predictions = torch.tensor([[0, 1], [2, 3], [4, 5], [6, 7]], device=device) + targets = torch.tensor([[0, 1], [2, 2], [4, 5], [7, 7]], device=device) - mask = torch.Tensor([[0, 0], [1, 0], [1, 1], [1, 1]]) + mask = torch.tensor([[0, 0], [1, 0], [1, 1], [1, 1]], device=device) accuracy(predictions, targets, mask) # First example should be skipped, second is correct with mask, third is correct, fourth is wrong. assert accuracy.get_metric() == 2 / 3 - def test_incorrect_gold_labels_shape_catches_exceptions(self): + @multi_device + def test_incorrect_gold_labels_shape_catches_exceptions(self, device: str): accuracy = BooleanAccuracy() - predictions = torch.rand([5, 7]) - incorrect_shape_labels = torch.rand([5, 8]) + predictions = torch.rand([5, 7], device=device) + incorrect_shape_labels = torch.rand([5, 8], device=device) with pytest.raises(ValueError): accuracy(predictions, incorrect_shape_labels) - def test_incorrect_mask_shape_catches_exceptions(self): + @multi_device + def test_incorrect_mask_shape_catches_exceptions(self, device: str): accuracy = BooleanAccuracy() - predictions = torch.rand([5, 7]) - labels = torch.rand([5, 7]) - incorrect_shape_mask = torch.randint(0, 2, [5, 8]) + predictions = torch.rand([5, 7], device=device) + labels = torch.rand([5, 7], device=device) + incorrect_shape_mask = torch.randint(0, 2, [5, 8], device=device) with pytest.raises(ValueError): accuracy(predictions, labels, incorrect_shape_mask) - def test_does_not_divide_by_zero_with_no_count(self): + @multi_device + def test_does_not_divide_by_zero_with_no_count(self, device: str): accuracy = BooleanAccuracy() self.assertAlmostEqual(accuracy.get_metric(), 0.0) diff --git a/allennlp/tests/training/metrics/categorical_accuracy_test.py b/allennlp/tests/training/metrics/categorical_accuracy_test.py index 0020fb95d52..87252f390cd 100644 --- a/allennlp/tests/training/metrics/categorical_accuracy_test.py +++ b/allennlp/tests/training/metrics/categorical_accuracy_test.py @@ -1,97 +1,113 @@ -import torch import pytest -import numpy +import torch +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase from allennlp.common.checks import ConfigurationError +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import CategoricalAccuracy class CategoricalAccuracyTest(AllenNlpTestCase): - def test_categorical_accuracy(self): + @multi_device + def test_categorical_accuracy(self, device: str): accuracy = CategoricalAccuracy() - predictions = torch.Tensor([[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0]]) - targets = torch.Tensor([0, 3]) + predictions = torch.tensor( + [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0]], device=device + ) + targets = torch.tensor([0, 3], device=device) accuracy(predictions, targets) actual_accuracy = accuracy.get_metric() assert actual_accuracy == 0.50 - def test_top_k_categorical_accuracy(self): + @multi_device + def test_top_k_categorical_accuracy(self, device: str): accuracy = CategoricalAccuracy(top_k=2) - predictions = torch.Tensor([[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0]]) - targets = torch.Tensor([0, 3]) + predictions = torch.tensor( + [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0]], 
device=device + ) + targets = torch.tensor([0, 3], device=device) accuracy(predictions, targets) actual_accuracy = accuracy.get_metric() assert actual_accuracy == 1.0 - def test_top_k_categorical_accuracy_accumulates_and_resets_correctly(self): + @multi_device + def test_top_k_categorical_accuracy_accumulates_and_resets_correctly(self, device: str): accuracy = CategoricalAccuracy(top_k=2) - predictions = torch.Tensor([[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0]]) - targets = torch.Tensor([0, 3]) + predictions = torch.tensor( + [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0]], device=device + ) + targets = torch.tensor([0, 3], device=device) accuracy(predictions, targets) accuracy(predictions, targets) - accuracy(predictions, torch.Tensor([4, 4])) - accuracy(predictions, torch.Tensor([4, 4])) + accuracy(predictions, torch.tensor([4, 4], device=device)) + accuracy(predictions, torch.tensor([4, 4], device=device)) actual_accuracy = accuracy.get_metric(reset=True) assert actual_accuracy == 0.50 assert accuracy.correct_count == 0.0 assert accuracy.total_count == 0.0 - def test_top_k_categorical_accuracy_respects_mask(self): + @multi_device + def test_top_k_categorical_accuracy_respects_mask(self, device: str): accuracy = CategoricalAccuracy(top_k=2) - predictions = torch.Tensor( - [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], [0.1, 0.2, 0.5, 0.2, 0.0]] + predictions = torch.tensor( + [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], [0.1, 0.2, 0.5, 0.2, 0.0]], + device=device, ) - targets = torch.Tensor([0, 3, 0]) - mask = torch.Tensor([0, 1, 1]) + targets = torch.tensor([0, 3, 0], device=device) + mask = torch.tensor([0, 1, 1], device=device) accuracy(predictions, targets, mask) actual_accuracy = accuracy.get_metric() - assert actual_accuracy == 0.50 + assert_allclose(actual_accuracy, 0.50) - def test_top_k_categorical_accuracy_works_for_sequences(self): + @multi_device + def test_top_k_categorical_accuracy_works_for_sequences(self, device: str): accuracy = CategoricalAccuracy(top_k=2) - predictions = torch.Tensor( + predictions = torch.tensor( [ [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], [0.1, 0.6, 0.1, 0.2, 0.0]], [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], [0.1, 0.6, 0.1, 0.2, 0.0]], - ] + ], + device=device, ) - targets = torch.Tensor([[0, 3, 4], [0, 1, 4]]) + targets = torch.tensor([[0, 3, 4], [0, 1, 4]], device=device) accuracy(predictions, targets) actual_accuracy = accuracy.get_metric(reset=True) - numpy.testing.assert_almost_equal(actual_accuracy, 0.6666666) + assert_allclose(actual_accuracy, 0.6666666) # Test the same thing but with a mask: - mask = torch.Tensor([[0, 1, 1], [1, 0, 1]]) + mask = torch.tensor([[0, 1, 1], [1, 0, 1]], device=device) accuracy(predictions, targets, mask) actual_accuracy = accuracy.get_metric(reset=True) - numpy.testing.assert_almost_equal(actual_accuracy, 0.50) + assert_allclose(actual_accuracy, 0.50) - def test_top_k_categorical_accuracy_catches_exceptions(self): + @multi_device + def test_top_k_categorical_accuracy_catches_exceptions(self, device: str): accuracy = CategoricalAccuracy() - predictions = torch.rand([5, 7]) - out_of_range_labels = torch.Tensor([10, 3, 4, 0, 1]) + predictions = torch.rand([5, 7], device=device) + out_of_range_labels = torch.tensor([10, 3, 4, 0, 1], device=device) with pytest.raises(ConfigurationError): accuracy(predictions, out_of_range_labels) - def test_tie_break_categorical_accuracy(self): + @multi_device + def test_tie_break_categorical_accuracy(self, device: 
str): accuracy = CategoricalAccuracy(tie_break=True) - predictions = torch.Tensor( - [[0.35, 0.25, 0.35, 0.35, 0.35], [0.1, 0.6, 0.1, 0.2, 0.2], [0.1, 0.0, 0.1, 0.2, 0.2]] + predictions = torch.tensor( + [[0.35, 0.25, 0.35, 0.35, 0.35], [0.1, 0.6, 0.1, 0.2, 0.2], [0.1, 0.0, 0.1, 0.2, 0.2]], + device=device, ) # Test without mask: - targets = torch.Tensor([2, 1, 4]) + targets = torch.tensor([2, 1, 4], device=device) accuracy(predictions, targets) assert accuracy.get_metric(reset=True) == (0.25 + 1 + 0.5) / 3.0 # # # Test with mask - mask = torch.Tensor([1, 0, 1]) - targets = torch.Tensor([2, 1, 4]) + mask = torch.tensor([1, 0, 1], device=device) + targets = torch.tensor([2, 1, 4], device=device) accuracy(predictions, targets, mask) assert accuracy.get_metric(reset=True) == (0.25 + 0.5) / 2.0 # # Test tie-break with sequence - predictions = torch.Tensor( + predictions = torch.tensor( [ [ [0.35, 0.25, 0.35, 0.35, 0.35], @@ -103,21 +119,27 @@ def test_tie_break_categorical_accuracy(self): [0.1, 0.6, 0.1, 0.2, 0.2], [0.1, 0.0, 0.1, 0.2, 0.2], ], - ] + ], + device=device, + ) + targets = torch.tensor( + [[0, 1, 3], [0, 3, 4]], device=device # 0.25 + 1 + 0.5 # 0.25 + 0 + 0.5 = 2.5 ) - targets = torch.Tensor([[0, 1, 3], [0, 3, 4]]) # 0.25 + 1 + 0.5 # 0.25 + 0 + 0.5 = 2.5 accuracy(predictions, targets) actual_accuracy = accuracy.get_metric(reset=True) - numpy.testing.assert_almost_equal(actual_accuracy, 2.5 / 6.0) + assert_allclose(actual_accuracy, 2.5 / 6.0) - def test_top_k_and_tie_break_together_catches_exceptions(self): + @multi_device + def test_top_k_and_tie_break_together_catches_exceptions(self, device: str): with pytest.raises(ConfigurationError): CategoricalAccuracy(top_k=2, tie_break=True) - def test_incorrect_top_k_catches_exceptions(self): + @multi_device + def test_incorrect_top_k_catches_exceptions(self, device: str): with pytest.raises(ConfigurationError): CategoricalAccuracy(top_k=0) - def test_does_not_divide_by_zero_with_no_count(self): + @multi_device + def test_does_not_divide_by_zero_with_no_count(self, device: str): accuracy = CategoricalAccuracy() self.assertAlmostEqual(accuracy.get_metric(), 0.0) diff --git a/allennlp/tests/training/metrics/conll_coref_scores_test.py b/allennlp/tests/training/metrics/conll_coref_scores_test.py index c426d541e89..f285611953c 100644 --- a/allennlp/tests/training/metrics/conll_coref_scores_test.py +++ b/allennlp/tests/training/metrics/conll_coref_scores_test.py @@ -1,14 +1,15 @@ import torch -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import ConllCorefScores class ConllCorefScoresTest(AllenNlpTestCase): - def test_get_predicted_clusters(self): - top_spans = torch.Tensor([[0, 1], [4, 6], [8, 9]]).long() - antecedent_indices = torch.Tensor([[-1, -1, -1], [0, -1, -1], [0, 1, -1]]).long() - predicted_antecedents = torch.Tensor([-1, -1, 1]).long() + @multi_device + def test_get_predicted_clusters(self, device: str): + top_spans = torch.tensor([[0, 1], [4, 6], [8, 9]], device=device) + antecedent_indices = torch.tensor([[-1, -1, -1], [0, -1, -1], [0, 1, -1]], device=device) + predicted_antecedents = torch.tensor([-1, -1, 1], device=device) clusters, mention_to_cluster = ConllCorefScores.get_predicted_clusters( top_spans, antecedent_indices, predicted_antecedents ) diff --git a/allennlp/tests/training/metrics/covariance_test.py b/allennlp/tests/training/metrics/covariance_test.py index 7c311caa8a7..c1690fb2d74 100644 --- 
a/allennlp/tests/training/metrics/covariance_test.py +++ b/allennlp/tests/training/metrics/covariance_test.py @@ -1,73 +1,75 @@ -import torch import numpy as np -from numpy.testing import assert_allclose +import torch +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import Covariance class CovarianceTest(AllenNlpTestCase): - def test_covariance_unmasked_computation(self): + @multi_device + def test_covariance_unmasked_computation(self, device: str): covariance = Covariance() batch_size = 100 num_labels = 10 - predictions = np.random.randn(batch_size, num_labels).astype("float32") - labels = 0.5 * predictions + np.random.randn(batch_size, num_labels).astype("float32") + predictions = torch.randn(batch_size, num_labels, device=device) + labels = 0.5 * predictions + torch.randn(batch_size, num_labels, device=device) stride = 10 for i in range(batch_size // stride): - timestep_predictions = torch.FloatTensor(predictions[stride * i : stride * (i + 1), :]) - timestep_labels = torch.FloatTensor(labels[stride * i : stride * (i + 1), :]) + timestep_predictions = predictions[stride * i : stride * (i + 1), :] + timestep_labels = labels[stride * i : stride * (i + 1), :] # Flatten the predictions and labels thus far, so numpy treats them as # independent observations. expected_covariance = np.cov( - predictions[: stride * (i + 1), :].reshape(-1), - labels[: stride * (i + 1), :].reshape(-1), + predictions[: stride * (i + 1), :].view(-1).cpu().numpy(), + labels[: stride * (i + 1), :].view(-1).cpu().numpy(), )[0, 1] covariance(timestep_predictions, timestep_labels) - assert_allclose(expected_covariance, covariance.get_metric(), rtol=1e-5) + assert_allclose(expected_covariance, covariance.get_metric()) # Test reset covariance.reset() - covariance(torch.FloatTensor(predictions), torch.FloatTensor(labels)) + covariance(predictions, labels) assert_allclose( - np.cov(predictions.reshape(-1), labels.reshape(-1))[0, 1], + np.cov(predictions.view(-1).cpu().numpy(), labels.view(-1).cpu().numpy())[0, 1], covariance.get_metric(), - rtol=1e-5, ) - def test_covariance_masked_computation(self): + @multi_device + def test_covariance_masked_computation(self, device: str): covariance = Covariance() batch_size = 100 num_labels = 10 - predictions = np.random.randn(batch_size, num_labels).astype("float32") - labels = 0.5 * predictions + np.random.randn(batch_size, num_labels).astype("float32") + predictions = torch.randn(batch_size, num_labels, device=device) + labels = 0.5 * predictions + torch.randn(batch_size, num_labels, device=device) # Random binary mask - mask = np.random.randint(0, 2, size=(batch_size, num_labels)).astype("float32") + mask = torch.randint(0, 2, size=(batch_size, num_labels), device=device) stride = 10 for i in range(batch_size // stride): - timestep_predictions = torch.FloatTensor(predictions[stride * i : stride * (i + 1), :]) - timestep_labels = torch.FloatTensor(labels[stride * i : stride * (i + 1), :]) - timestep_mask = torch.FloatTensor(mask[stride * i : stride * (i + 1), :]) + timestep_predictions = predictions[stride * i : stride * (i + 1), :] + timestep_labels = labels[stride * i : stride * (i + 1), :] + timestep_mask = mask[stride * i : stride * (i + 1), :] # Flatten the predictions, labels, and mask thus far, so numpy treats them as # independent observations. 
expected_covariance = np.cov( - predictions[: stride * (i + 1), :].reshape(-1), - labels[: stride * (i + 1), :].reshape(-1), - fweights=mask[: stride * (i + 1), :].reshape(-1), + predictions[: stride * (i + 1), :].view(-1).cpu().numpy(), + labels[: stride * (i + 1), :].view(-1).cpu().numpy(), + fweights=mask[: stride * (i + 1), :].view(-1).cpu().numpy(), )[0, 1] covariance(timestep_predictions, timestep_labels, timestep_mask) - assert_allclose(expected_covariance, covariance.get_metric(), rtol=1e-5) + assert_allclose(expected_covariance, covariance.get_metric()) # Test reset covariance.reset() - covariance( - torch.FloatTensor(predictions), torch.FloatTensor(labels), torch.FloatTensor(mask) - ) + covariance(predictions, labels, mask) assert_allclose( - np.cov(predictions.reshape(-1), labels.reshape(-1), fweights=mask.reshape(-1))[0, 1], + np.cov( + predictions.view(-1).cpu().numpy(), + labels.view(-1).cpu().numpy(), + fweights=mask.view(-1).cpu().numpy(), + )[0, 1], covariance.get_metric(), - rtol=1e-5, ) diff --git a/allennlp/tests/training/metrics/entropy_test.py b/allennlp/tests/training/metrics/entropy_test.py index 398a4ff51cb..c93749b45fd 100644 --- a/allennlp/tests/training/metrics/entropy_test.py +++ b/allennlp/tests/training/metrics/entropy_test.py @@ -1,35 +1,44 @@ import torch -import numpy +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import Entropy class EntropyTest(AllenNlpTestCase): - def test_low_entropy_distribution(self): + @multi_device + def test_low_entropy_distribution(self, device: str): metric = Entropy() - logits = torch.Tensor([[10000, -10000, -10000, -1000], [10000, -10000, -10000, -1000]]) + logits = torch.tensor( + [[10000, -10000, -10000, -1000], [10000, -10000, -10000, -1000]], + dtype=torch.float, + device=device, + ) metric(logits) assert metric.get_metric() == 0.0 - def test_entropy_for_uniform_distribution(self): + @multi_device + def test_entropy_for_uniform_distribution(self, device: str): metric = Entropy() - logits = torch.Tensor([[1, 1, 1, 1], [1, 1, 1, 1]]) + logits = torch.tensor([[1, 1, 1, 1], [1, 1, 1, 1]], dtype=torch.float, device=device) metric(logits) - numpy.testing.assert_almost_equal(metric.get_metric().cpu(), 1.38629436) + assert_allclose(metric.get_metric(), torch.tensor(1.38629436, device=device)) # actual values shouldn't effect uniform distribution: - logits = torch.Tensor([[2, 2, 2, 2], [2, 2, 2, 2]]) + logits = torch.tensor([[2, 2, 2, 2], [2, 2, 2, 2]], dtype=torch.float, device=device) metric(logits) - numpy.testing.assert_almost_equal(metric.get_metric().cpu(), 1.38629436) + assert_allclose(metric.get_metric(), torch.tensor(1.38629436, device=device)) metric.reset() assert metric._entropy == 0.0 assert metric._count == 0.0 - def test_masked_case(self): + @multi_device + def test_masked_case(self, device: str): metric = Entropy() # This would have non-zero entropy without the mask. 
- logits = torch.Tensor([[1, 1, 1, 1], [10000, -10000, -10000, -1000]]) - mask = torch.Tensor([0, 1]) + logits = torch.tensor( + [[1, 1, 1, 1], [10000, -10000, -10000, -1000]], dtype=torch.float, device=device + ) + mask = torch.tensor([0, 1], device=device) metric(logits, mask) assert metric.get_metric() == 0.0 diff --git a/allennlp/tests/training/metrics/f1_measure_test.py b/allennlp/tests/training/metrics/f1_measure_test.py index f150083c984..ac8713673bf 100644 --- a/allennlp/tests/training/metrics/f1_measure_test.py +++ b/allennlp/tests/training/metrics/f1_measure_test.py @@ -1,23 +1,25 @@ -import torch import pytest -import numpy +import torch +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase from allennlp.common.checks import ConfigurationError +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import F1Measure class F1MeasureTest(AllenNlpTestCase): - def test_f1_measure_catches_exceptions(self): + @multi_device + def test_f1_measure_catches_exceptions(self, device: str): f1_measure = F1Measure(0) - predictions = torch.rand([5, 7]) - out_of_range_labels = torch.Tensor([10, 3, 4, 0, 1]) + predictions = torch.rand([5, 7], device=device) + out_of_range_labels = torch.tensor([10, 3, 4, 0, 1], device=device) with pytest.raises(ConfigurationError): f1_measure(predictions, out_of_range_labels) - def test_f1_measure(self): + @multi_device + def test_f1_measure(self, device: str): f1_measure = F1Measure(positive_label=0) - predictions = torch.Tensor( + predictions = torch.tensor( [ [0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], @@ -25,11 +27,12 @@ def test_f1_measure(self): [0.1, 0.5, 0.1, 0.2, 0.0], [0.1, 0.2, 0.1, 0.7, 0.0], [0.1, 0.6, 0.1, 0.2, 0.0], - ] + ], + device=device, ) # [True Positive, True Negative, True Negative, # False Negative, True Negative, False Negative] - targets = torch.Tensor([0, 4, 1, 0, 3, 0]) + targets = torch.tensor([0, 4, 1, 0, 3, 0], device=device) f1_measure(predictions, targets) precision, recall, f1 = f1_measure.get_metric() assert f1_measure._true_positives == 1.0 @@ -38,16 +41,16 @@ def test_f1_measure(self): assert f1_measure._false_negatives == 2.0 f1_measure.reset() # check value - numpy.testing.assert_almost_equal(precision, 1.0) - numpy.testing.assert_almost_equal(recall, 0.333333333) - numpy.testing.assert_almost_equal(f1, 0.499999999) + assert_allclose(precision, 1.0) + assert_allclose(recall, 0.333333333) + assert_allclose(f1, 0.499999999) # check type assert isinstance(precision, float) assert isinstance(recall, float) assert isinstance(f1, float) # Test the same thing with a mask: - mask = torch.Tensor([1, 0, 1, 1, 1, 0]) + mask = torch.tensor([1, 0, 1, 1, 1, 0], device=device) f1_measure(predictions, targets, mask) precision, recall, f1 = f1_measure.get_metric() assert f1_measure._true_positives == 1.0 @@ -55,13 +58,14 @@ def test_f1_measure(self): assert f1_measure._false_positives == 0.0 assert f1_measure._false_negatives == 1.0 f1_measure.reset() - numpy.testing.assert_almost_equal(precision, 1.0) - numpy.testing.assert_almost_equal(recall, 0.5) - numpy.testing.assert_almost_equal(f1, 0.6666666666) + assert_allclose(precision, 1.0) + assert_allclose(recall, 0.5) + assert_allclose(f1, 0.6666666666) - def test_f1_measure_other_positive_label(self): + @multi_device + def test_f1_measure_other_positive_label(self, device: str): f1_measure = F1Measure(positive_label=1) - predictions = torch.Tensor( + predictions = torch.tensor( [ [0.35, 0.25, 
0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], @@ -69,11 +73,12 @@ def test_f1_measure_other_positive_label(self): [0.1, 0.5, 0.1, 0.2, 0.0], [0.1, 0.2, 0.1, 0.7, 0.0], [0.1, 0.6, 0.1, 0.2, 0.0], - ] + ], + device=device, ) # [True Negative, False Positive, True Positive, # False Positive, True Negative, False Positive] - targets = torch.Tensor([0, 4, 1, 0, 3, 0]) + targets = torch.tensor([0, 4, 1, 0, 3, 0], device=device) f1_measure(predictions, targets) precision, recall, f1 = f1_measure.get_metric() assert f1_measure._true_positives == 1.0 @@ -82,17 +87,18 @@ def test_f1_measure_other_positive_label(self): assert f1_measure._false_negatives == 0.0 f1_measure.reset() # check value - numpy.testing.assert_almost_equal(precision, 0.25) - numpy.testing.assert_almost_equal(recall, 1.0) - numpy.testing.assert_almost_equal(f1, 0.4) + assert_allclose(precision, 0.25) + assert_allclose(recall, 1.0) + assert_allclose(f1, 0.4) # check type assert isinstance(precision, float) assert isinstance(recall, float) assert isinstance(f1, float) - def test_f1_measure_accumulates_and_resets_correctly(self): + @multi_device + def test_f1_measure_accumulates_and_resets_correctly(self, device: str): f1_measure = F1Measure(positive_label=0) - predictions = torch.Tensor( + predictions = torch.tensor( [ [0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], @@ -100,11 +106,12 @@ def test_f1_measure_accumulates_and_resets_correctly(self): [0.1, 0.5, 0.1, 0.2, 0.0], [0.1, 0.2, 0.1, 0.7, 0.0], [0.1, 0.6, 0.1, 0.2, 0.0], - ] + ], + device=device, ) # [True Positive, True Negative, True Negative, # False Negative, True Negative, False Negative] - targets = torch.Tensor([0, 4, 1, 0, 3, 0]) + targets = torch.tensor([0, 4, 1, 0, 3, 0], device=device) f1_measure(predictions, targets) f1_measure(predictions, targets) precision, recall, f1 = f1_measure.get_metric() @@ -113,25 +120,27 @@ def test_f1_measure_accumulates_and_resets_correctly(self): assert f1_measure._false_positives == 0.0 assert f1_measure._false_negatives == 4.0 f1_measure.reset() - numpy.testing.assert_almost_equal(precision, 1.0) - numpy.testing.assert_almost_equal(recall, 0.333333333) - numpy.testing.assert_almost_equal(f1, 0.499999999) + assert_allclose(precision, 1.0) + assert_allclose(recall, 0.333333333) + assert_allclose(f1, 0.499999999) assert f1_measure._true_positives == 0.0 assert f1_measure._true_negatives == 0.0 assert f1_measure._false_positives == 0.0 assert f1_measure._false_negatives == 0.0 - def test_f1_measure_works_for_sequences(self): + @multi_device + def test_f1_measure_works_for_sequences(self, device: str): f1_measure = F1Measure(positive_label=0) - predictions = torch.Tensor( + predictions = torch.tensor( [ [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], [0.1, 0.6, 0.1, 0.2, 0.0]], [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], [0.1, 0.6, 0.1, 0.2, 0.0]], - ] + ], + device=device, ) # [[True Positive, True Negative, True Negative], # [True Positive, True Negative, False Negative]] - targets = torch.Tensor([[0, 3, 4], [0, 1, 0]]) + targets = torch.tensor([[0, 3, 4], [0, 1, 0]], device=device) f1_measure(predictions, targets) precision, recall, f1 = f1_measure.get_metric() assert f1_measure._true_positives == 2.0 @@ -139,18 +148,18 @@ def test_f1_measure_works_for_sequences(self): assert f1_measure._false_positives == 0.0 assert f1_measure._false_negatives == 1.0 f1_measure.reset() - numpy.testing.assert_almost_equal(precision, 1.0) - numpy.testing.assert_almost_equal(recall, 0.666666666) - 
numpy.testing.assert_almost_equal(f1, 0.8) + assert_allclose(precision, 1.0) + assert_allclose(recall, 0.666666666) + assert_allclose(f1, 0.8) # Test the same thing with a mask: - mask = torch.Tensor([[0, 1, 0], [1, 1, 1]]) + mask = torch.tensor([[0, 1, 0], [1, 1, 1]], device=device) f1_measure(predictions, targets, mask) precision, recall, f1 = f1_measure.get_metric() assert f1_measure._true_positives == 1.0 assert f1_measure._true_negatives == 2.0 assert f1_measure._false_positives == 0.0 assert f1_measure._false_negatives == 1.0 - numpy.testing.assert_almost_equal(precision, 1.0) - numpy.testing.assert_almost_equal(recall, 0.5) - numpy.testing.assert_almost_equal(f1, 0.66666666666) + assert_allclose(precision, 1.0) + assert_allclose(recall, 0.5) + assert_allclose(f1, 0.66666666666) diff --git a/allennlp/tests/training/metrics/fbeta_measure_test.py b/allennlp/tests/training/metrics/fbeta_measure_test.py index ce269f1d845..9b0c1240fce 100644 --- a/allennlp/tests/training/metrics/fbeta_measure_test.py +++ b/allennlp/tests/training/metrics/fbeta_measure_test.py @@ -1,11 +1,11 @@ from typing import List -import numpy import torch from sklearn.metrics import precision_recall_fscore_support +from torch.testing import assert_allclose from allennlp.common.checks import ConfigurationError -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import FBetaMeasure @@ -13,7 +13,7 @@ class FBetaMeasureTest(AllenNlpTestCase): def setUp(self): super().setUp() # [0, 1, 1, 1, 3, 1] - self.predictions = torch.Tensor( + self.predictions = torch.tensor( [ [0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], @@ -23,7 +23,7 @@ def setUp(self): [0.1, 0.6, 0.1, 0.2, 0.0], ] ) - self.targets = torch.Tensor([0, 4, 1, 0, 3, 0]) + self.targets = torch.tensor([0, 4, 1, 0, 3, 0]) # detailed target state self.pred_sum = [1, 4, 0, 1, 0] @@ -33,7 +33,7 @@ def setUp(self): self.total_sum = [6, 6, 6, 6, 6] desired_precisions = [1.00, 0.25, 0.00, 1.00, 0.00] - desired_recalls = [0.33, 1.00, 0.00, 1.00, 0.00] + desired_recalls = [1 / 3, 1.00, 0.00, 1.00, 0.00] desired_fscores = [ (2 * p * r) / (p + r) if p + r != 0.0 else 0.0 for p, r in zip(desired_precisions, desired_recalls) @@ -42,7 +42,8 @@ def setUp(self): self.desired_recalls = desired_recalls self.desired_fscores = desired_fscores - def test_config_errors(self): + @multi_device + def test_config_errors(self, device: str): # Bad beta self.assertRaises(ConfigurationError, FBetaMeasure, beta=0.0) @@ -52,23 +53,32 @@ def test_config_errors(self): # Empty input labels self.assertRaises(ConfigurationError, FBetaMeasure, labels=[]) - def test_runtime_errors(self): + @multi_device + def test_runtime_errors(self, device: str): fbeta = FBetaMeasure() # Metric was never called. 
self.assertRaises(RuntimeError, fbeta.get_metric) - def test_fbeta_multiclass_state(self): + @multi_device + def test_fbeta_multiclass_state(self, device: str): + self.predictions = self.predictions.to(device) + self.targets = self.targets.to(device) + fbeta = FBetaMeasure() fbeta(self.predictions, self.targets) # check state - numpy.testing.assert_almost_equal(fbeta._pred_sum.tolist(), self.pred_sum) - numpy.testing.assert_almost_equal(fbeta._true_sum.tolist(), self.true_sum) - numpy.testing.assert_almost_equal(fbeta._true_positive_sum.tolist(), self.true_positive_sum) - numpy.testing.assert_almost_equal(fbeta._true_negative_sum.tolist(), self.true_negative_sum) - numpy.testing.assert_almost_equal(fbeta._total_sum.tolist(), self.total_sum) + assert_allclose(fbeta._pred_sum.tolist(), self.pred_sum) + assert_allclose(fbeta._true_sum.tolist(), self.true_sum) + assert_allclose(fbeta._true_positive_sum.tolist(), self.true_positive_sum) + assert_allclose(fbeta._true_negative_sum.tolist(), self.true_negative_sum) + assert_allclose(fbeta._total_sum.tolist(), self.total_sum) + + @multi_device + def test_fbeta_multiclass_metric(self, device: str): + self.predictions = self.predictions.to(device) + self.targets = self.targets.to(device) - def test_fbeta_multiclass_metric(self): fbeta = FBetaMeasure() fbeta(self.predictions, self.targets) metric = fbeta.get_metric() @@ -77,17 +87,21 @@ def test_fbeta_multiclass_metric(self): fscores = metric["fscore"] # check value - numpy.testing.assert_almost_equal(precisions, self.desired_precisions, decimal=2) - numpy.testing.assert_almost_equal(recalls, self.desired_recalls, decimal=2) - numpy.testing.assert_almost_equal(fscores, self.desired_fscores, decimal=2) + assert_allclose(precisions, self.desired_precisions) + assert_allclose(recalls, self.desired_recalls) + assert_allclose(fscores, self.desired_fscores) # check type assert isinstance(precisions, List) assert isinstance(recalls, List) assert isinstance(fscores, List) - def test_fbeta_multiclass_with_mask(self): - mask = torch.Tensor([1, 1, 1, 1, 1, 0]) + @multi_device + def test_fbeta_multiclass_with_mask(self, device: str): + self.predictions = self.predictions.to(device) + self.targets = self.targets.to(device) + + mask = torch.tensor([1, 1, 1, 1, 1, 0], device=device) fbeta = FBetaMeasure() fbeta(self.predictions, self.targets, mask) @@ -96,21 +110,25 @@ def test_fbeta_multiclass_with_mask(self): recalls = metric["recall"] fscores = metric["fscore"] - numpy.testing.assert_almost_equal(fbeta._pred_sum.tolist(), [1, 3, 0, 1, 0]) - numpy.testing.assert_almost_equal(fbeta._true_sum.tolist(), [2, 1, 0, 1, 1]) - numpy.testing.assert_almost_equal(fbeta._true_positive_sum.tolist(), [1, 1, 0, 1, 0]) + assert_allclose(fbeta._pred_sum.tolist(), [1, 3, 0, 1, 0]) + assert_allclose(fbeta._true_sum.tolist(), [2, 1, 0, 1, 1]) + assert_allclose(fbeta._true_positive_sum.tolist(), [1, 1, 0, 1, 0]) - desired_precisions = [1.00, 0.33, 0.00, 1.00, 0.00] + desired_precisions = [1.00, 1 / 3, 0.00, 1.00, 0.00] desired_recalls = [0.50, 1.00, 0.00, 1.00, 0.00] desired_fscores = [ (2 * p * r) / (p + r) if p + r != 0.0 else 0.0 for p, r in zip(desired_precisions, desired_recalls) ] - numpy.testing.assert_almost_equal(precisions, desired_precisions, decimal=2) - numpy.testing.assert_almost_equal(recalls, desired_recalls, decimal=2) - numpy.testing.assert_almost_equal(fscores, desired_fscores, decimal=2) + assert_allclose(precisions, desired_precisions) + assert_allclose(recalls, desired_recalls) + assert_allclose(fscores, 
desired_fscores) + + @multi_device + def test_fbeta_multiclass_macro_average_metric(self, device: str): + self.predictions = self.predictions.to(device) + self.targets = self.targets.to(device) - def test_fbeta_multiclass_macro_average_metric(self): fbeta = FBetaMeasure(average="macro") fbeta(self.predictions, self.targets) metric = fbeta.get_metric() @@ -118,20 +136,25 @@ def test_fbeta_multiclass_macro_average_metric(self): recalls = metric["recall"] fscores = metric["fscore"] - macro_precision = numpy.mean(self.desired_precisions) - macro_recall = numpy.mean(self.desired_recalls) - macro_fscore = numpy.mean(self.desired_fscores) + # We keep the expected values in CPU because FBetaMeasure returns them in CPU. + macro_precision = torch.tensor(self.desired_precisions).mean() + macro_recall = torch.tensor(self.desired_recalls).mean() + macro_fscore = torch.tensor(self.desired_fscores).mean() # check value - numpy.testing.assert_almost_equal(precisions, macro_precision, decimal=2) - numpy.testing.assert_almost_equal(recalls, macro_recall, decimal=2) - numpy.testing.assert_almost_equal(fscores, macro_fscore, decimal=2) + assert_allclose(precisions, macro_precision) + assert_allclose(recalls, macro_recall) + assert_allclose(fscores, macro_fscore) # check type assert isinstance(precisions, float) assert isinstance(recalls, float) assert isinstance(fscores, float) - def test_fbeta_multiclass_micro_average_metric(self): + @multi_device + def test_fbeta_multiclass_micro_average_metric(self, device: str): + self.predictions = self.predictions.to(device) + self.targets = self.targets.to(device) + fbeta = FBetaMeasure(average="micro") fbeta(self.predictions, self.targets) metric = fbeta.get_metric() @@ -139,22 +162,27 @@ def test_fbeta_multiclass_micro_average_metric(self): recalls = metric["recall"] fscores = metric["fscore"] - true_positives = [1, 1, 0, 1, 0] - false_positives = [0, 3, 0, 0, 0] - false_negatives = [2, 0, 0, 0, 1] - mean_true_positive = numpy.mean(true_positives) - mean_false_positive = numpy.mean(false_positives) - mean_false_negative = numpy.mean(false_negatives) + # We keep the expected values in CPU because FBetaMeasure returns them in CPU. 
+ true_positives = torch.tensor([1, 1, 0, 1, 0], dtype=torch.float32) + false_positives = torch.tensor([0, 3, 0, 0, 0], dtype=torch.float32) + false_negatives = torch.tensor([2, 0, 0, 0, 1], dtype=torch.float32) + mean_true_positive = true_positives.mean() + mean_false_positive = false_positives.mean() + mean_false_negative = false_negatives.mean() micro_precision = mean_true_positive / (mean_true_positive + mean_false_positive) micro_recall = mean_true_positive / (mean_true_positive + mean_false_negative) micro_fscore = (2 * micro_precision * micro_recall) / (micro_precision + micro_recall) # check value - numpy.testing.assert_almost_equal(precisions, micro_precision, decimal=2) - numpy.testing.assert_almost_equal(recalls, micro_recall, decimal=2) - numpy.testing.assert_almost_equal(fscores, micro_fscore, decimal=2) + assert_allclose(precisions, micro_precision) + assert_allclose(recalls, micro_recall) + assert_allclose(fscores, micro_fscore) + + @multi_device + def test_fbeta_multiclass_with_explicit_labels(self, device: str): + self.predictions = self.predictions.to(device) + self.targets = self.targets.to(device) - def test_fbeta_multiclass_with_explicit_labels(self): # same prediction but with and explicit label ordering fbeta = FBetaMeasure(labels=[4, 3, 2, 1, 0]) fbeta(self.predictions, self.targets) @@ -167,11 +195,15 @@ def test_fbeta_multiclass_with_explicit_labels(self): desired_recalls = self.desired_recalls[::-1] desired_fscores = self.desired_fscores[::-1] # check value - numpy.testing.assert_almost_equal(precisions, desired_precisions, decimal=2) - numpy.testing.assert_almost_equal(recalls, desired_recalls, decimal=2) - numpy.testing.assert_almost_equal(fscores, desired_fscores, decimal=2) + assert_allclose(precisions, desired_precisions) + assert_allclose(recalls, desired_recalls) + assert_allclose(fscores, desired_fscores) + + @multi_device + def test_fbeta_multiclass_with_macro_average(self, device: str): + self.predictions = self.predictions.to(device) + self.targets = self.targets.to(device) - def test_fbeta_multiclass_with_macro_average(self): labels = [0, 1] fbeta = FBetaMeasure(average="macro", labels=labels) fbeta(self.predictions, self.targets) @@ -180,16 +212,21 @@ def test_fbeta_multiclass_with_macro_average(self): recalls = metric["recall"] fscores = metric["fscore"] - macro_precision = numpy.array(self.desired_precisions)[labels].mean() - macro_recall = numpy.array(self.desired_recalls)[labels].mean() - macro_fscore = numpy.array(self.desired_fscores)[labels].mean() + # We keep the expected values in CPU because FBetaMeasure returns them in CPU. 
+ macro_precision = torch.tensor(self.desired_precisions)[labels].mean() + macro_recall = torch.tensor(self.desired_recalls)[labels].mean() + macro_fscore = torch.tensor(self.desired_fscores)[labels].mean() # check value - numpy.testing.assert_almost_equal(precisions, macro_precision, decimal=2) - numpy.testing.assert_almost_equal(recalls, macro_recall, decimal=2) - numpy.testing.assert_almost_equal(fscores, macro_fscore, decimal=2) + assert_allclose(precisions, macro_precision) + assert_allclose(recalls, macro_recall) + assert_allclose(fscores, macro_fscore) + + @multi_device + def test_fbeta_multiclass_with_micro_average(self, device: str): + self.predictions = self.predictions.to(device) + self.targets = self.targets.to(device) - def test_fbeta_multiclass_with_micro_average(self): labels = [1, 3] fbeta = FBetaMeasure(average="micro", labels=labels) fbeta(self.predictions, self.targets) @@ -198,22 +235,27 @@ def test_fbeta_multiclass_with_micro_average(self): recalls = metric["recall"] fscores = metric["fscore"] - true_positives = [1, 1] - false_positives = [3, 0] - false_negatives = [0, 0] - mean_true_positive = numpy.mean(true_positives) - mean_false_positive = numpy.mean(false_positives) - mean_false_negative = numpy.mean(false_negatives) + # We keep the expected values in CPU because FBetaMeasure returns them in CPU. + true_positives = torch.tensor([1, 1], dtype=torch.float32) + false_positives = torch.tensor([3, 0], dtype=torch.float32) + false_negatives = torch.tensor([0, 0], dtype=torch.float32) + mean_true_positive = true_positives.mean() + mean_false_positive = false_positives.mean() + mean_false_negative = false_negatives.mean() micro_precision = mean_true_positive / (mean_true_positive + mean_false_positive) micro_recall = mean_true_positive / (mean_true_positive + mean_false_negative) micro_fscore = (2 * micro_precision * micro_recall) / (micro_precision + micro_recall) # check value - numpy.testing.assert_almost_equal(precisions, micro_precision, decimal=2) - numpy.testing.assert_almost_equal(recalls, micro_recall, decimal=2) - numpy.testing.assert_almost_equal(fscores, micro_fscore, decimal=2) + assert_allclose(precisions, micro_precision) + assert_allclose(recalls, micro_recall) + assert_allclose(fscores, micro_fscore) + + @multi_device + def test_fbeta_multiclass_with_weighted_average(self, device: str): + self.predictions = self.predictions.to(device) + self.targets = self.targets.to(device) - def test_fbeta_multiclass_with_weighted_average(self): labels = [0, 1] fbeta = FBetaMeasure(average="weighted", labels=labels) fbeta(self.predictions, self.targets) @@ -223,18 +265,22 @@ def test_fbeta_multiclass_with_weighted_average(self): fscores = metric["fscore"] weighted_precision, weighted_recall, weighted_fscore, _ = precision_recall_fscore_support( - self.targets, self.predictions.argmax(dim=1), labels=labels, average="weighted" + self.targets.cpu().numpy(), + self.predictions.argmax(dim=1).cpu().numpy(), + labels=labels, + average="weighted", ) # check value - numpy.testing.assert_almost_equal(precisions, weighted_precision, decimal=2) - numpy.testing.assert_almost_equal(recalls, weighted_recall, decimal=2) - numpy.testing.assert_almost_equal(fscores, weighted_fscore, decimal=2) + assert_allclose(precisions, weighted_precision) + assert_allclose(recalls, weighted_recall) + assert_allclose(fscores, weighted_fscore) - def test_fbeta_handles_batch_size_of_one(self): - predictions = torch.Tensor([[0.2862, 0.3479, 0.1627, 0.2033]]) - targets = torch.Tensor([1]) - mask = 
torch.Tensor([1]) + @multi_device + def test_fbeta_handles_batch_size_of_one(self, device: str): + predictions = torch.tensor([[0.2862, 0.3479, 0.1627, 0.2033]], device=device) + targets = torch.tensor([1], device=device) + mask = torch.tensor([1], device=device) fbeta = FBetaMeasure() fbeta(predictions, targets, mask) @@ -242,5 +288,5 @@ def test_fbeta_handles_batch_size_of_one(self): precisions = metric["precision"] recalls = metric["recall"] - numpy.testing.assert_almost_equal(precisions, [0.0, 1.0, 0.0, 0.0]) - numpy.testing.assert_almost_equal(recalls, [0.0, 1.0, 0.0, 0.0]) + assert_allclose(precisions, [0.0, 1.0, 0.0, 0.0]) + assert_allclose(recalls, [0.0, 1.0, 0.0, 0.0]) diff --git a/allennlp/tests/training/metrics/mean_absolute_error_test.py b/allennlp/tests/training/metrics/mean_absolute_error_test.py index 9c25680c652..f3901225866 100644 --- a/allennlp/tests/training/metrics/mean_absolute_error_test.py +++ b/allennlp/tests/training/metrics/mean_absolute_error_test.py @@ -1,25 +1,30 @@ import torch -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import MeanAbsoluteError class MeanAbsoluteErrorTest(AllenNlpTestCase): - def test_mean_absolute_error_computation(self): + @multi_device + def test_mean_absolute_error_computation(self, device: str): mae = MeanAbsoluteError() - predictions = torch.Tensor( - [[1.0, 1.5, 1.0], [2.0, 3.0, 3.5], [4.0, 5.0, 5.5], [6.0, 7.0, 7.5]] + predictions = torch.tensor( + [[1.0, 1.5, 1.0], [2.0, 3.0, 3.5], [4.0, 5.0, 5.5], [6.0, 7.0, 7.5]], device=device + ) + targets = torch.tensor( + [[0.0, 1.0, 0.0], [2.0, 2.0, 0.0], [4.0, 5.0, 0.0], [7.0, 7.0, 0.0]], device=device ) - targets = torch.Tensor([[0.0, 1.0, 0.0], [2.0, 2.0, 0.0], [4.0, 5.0, 0.0], [7.0, 7.0, 0.0]]) mae(predictions, targets) assert mae.get_metric() == 21.0 / 12.0 - mask = torch.Tensor([[1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [1.0, 1.0, 0.0]]) + mask = torch.tensor( + [[1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [1.0, 1.0, 0.0]], device=device + ) mae(predictions, targets, mask) assert mae.get_metric() == (21.0 + 3.5) / (12.0 + 8.0) - new_targets = torch.Tensor( - [[2.0, 2.0, 0.0], [0.0, 1.0, 0.0], [7.0, 7.0, 0.0], [4.0, 5.0, 0.0]] + new_targets = torch.tensor( + [[2.0, 2.0, 0.0], [0.0, 1.0, 0.0], [7.0, 7.0, 0.0], [4.0, 5.0, 0.0]], device=device ) mae(predictions, new_targets) assert mae.get_metric() == (21.0 + 3.5 + 32.0) / (12.0 + 8.0 + 12.0) diff --git a/allennlp/tests/training/metrics/pearson_correlation_test.py b/allennlp/tests/training/metrics/pearson_correlation_test.py index 2ecc7bf50a9..5378aafb9ad 100644 --- a/allennlp/tests/training/metrics/pearson_correlation_test.py +++ b/allennlp/tests/training/metrics/pearson_correlation_test.py @@ -1,12 +1,16 @@ -import torch +from typing import Optional + import numpy as np -from numpy.testing import assert_allclose +import torch +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import PearsonCorrelation -def pearson_corrcoef(predictions, labels, fweights=None): +def pearson_corrcoef( + predictions: np.ndarray, labels: np.ndarray, fweights: Optional[np.ndarray] = None +): covariance_matrices = np.cov(predictions, labels, fweights=fweights) denominator = np.sqrt(covariance_matrices[0, 0] * covariance_matrices[1, 1]) if np.around(denominator, decimals=5) == 0: @@ -17,17 +21,18 @@ 
def pearson_corrcoef(predictions, labels, fweights=None): class PearsonCorrelationTest(AllenNlpTestCase): - def test_pearson_correlation_unmasked_computation(self): + @multi_device + def test_pearson_correlation_unmasked_computation(self, device: str): pearson_correlation = PearsonCorrelation() batch_size = 100 num_labels = 10 - predictions_1 = np.random.randn(batch_size, num_labels).astype("float32") - labels_1 = 0.5 * predictions_1 + np.random.randn(batch_size, num_labels).astype("float32") + predictions_1 = torch.randn(batch_size, num_labels, device=device) + labels_1 = 0.5 * predictions_1 + torch.randn(batch_size, num_labels, device=device) - predictions_2 = np.random.randn(1).repeat(num_labels).astype("float32") - predictions_2 = predictions_2[np.newaxis, :].repeat(batch_size, axis=0) - labels_2 = np.random.randn(1).repeat(num_labels).astype("float32") - labels_2 = 0.5 * predictions_2 + labels_2[np.newaxis, :].repeat(batch_size, axis=0) + predictions_2 = torch.randn(1, device=device).expand(num_labels) + predictions_2 = predictions_2.unsqueeze(0).expand(batch_size, -1) + labels_2 = torch.randn(1, device=device).expand(num_labels) + labels_2 = 0.5 * predictions_2 + labels_2.unsqueeze(0).expand(batch_size, -1) # in most cases, the data is constructed like predictions_1, the data of such a batch different. # but in a few cases, for example, predictions_2, the data of such a batch is exactly the same. @@ -38,72 +43,62 @@ def test_pearson_correlation_unmasked_computation(self): for predictions, labels in predictions_labels: pearson_correlation.reset() for i in range(batch_size // stride): - timestep_predictions = torch.FloatTensor( - predictions[stride * i : stride * (i + 1), :] - ) - timestep_labels = torch.FloatTensor(labels[stride * i : stride * (i + 1), :]) + timestep_predictions = predictions[stride * i : stride * (i + 1), :] + timestep_labels = labels[stride * i : stride * (i + 1), :] expected_pearson_correlation = pearson_corrcoef( - predictions[: stride * (i + 1), :].reshape(-1), - labels[: stride * (i + 1), :].reshape(-1), + predictions[: stride * (i + 1), :].view(-1).cpu().numpy(), + labels[: stride * (i + 1), :].view(-1).cpu().numpy(), ) pearson_correlation(timestep_predictions, timestep_labels) - assert_allclose( - expected_pearson_correlation, pearson_correlation.get_metric(), rtol=1e-5 - ) + assert_allclose(expected_pearson_correlation, pearson_correlation.get_metric()) # Test reset pearson_correlation.reset() - pearson_correlation(torch.FloatTensor(predictions), torch.FloatTensor(labels)) + pearson_correlation(predictions, labels) assert_allclose( - pearson_corrcoef(predictions.reshape(-1), labels.reshape(-1)), + pearson_corrcoef(predictions.view(-1).cpu().numpy(), labels.view(-1).cpu().numpy()), pearson_correlation.get_metric(), - rtol=1e-5, ) - def test_pearson_correlation_masked_computation(self): + @multi_device + def test_pearson_correlation_masked_computation(self, device: str): pearson_correlation = PearsonCorrelation() batch_size = 100 num_labels = 10 - predictions_1 = np.random.randn(batch_size, num_labels).astype("float32") - labels_1 = 0.5 * predictions_1 + np.random.randn(batch_size, num_labels).astype("float32") + predictions_1 = torch.randn(batch_size, num_labels, device=device) + labels_1 = 0.5 * predictions_1 + torch.randn(batch_size, num_labels, device=device) - predictions_2 = np.random.randn(1).repeat(num_labels).astype("float32") - predictions_2 = predictions_2[np.newaxis, :].repeat(batch_size, axis=0) - labels_2 = 
np.random.randn(1).repeat(num_labels).astype("float32") - labels_2 = 0.5 * predictions_2 + labels_2[np.newaxis, :].repeat(batch_size, axis=0) + predictions_2 = torch.randn(1, device=device).expand(num_labels) + predictions_2 = predictions_2.unsqueeze(0).expand(batch_size, -1) + labels_2 = torch.randn(1, device=device).expand(num_labels) + labels_2 = 0.5 * predictions_2 + labels_2.unsqueeze(0).expand(batch_size, -1) predictions_labels = [(predictions_1, labels_1), (predictions_2, labels_2)] # Random binary mask - mask = np.random.randint(0, 2, size=(batch_size, num_labels)).astype("float32") + mask = torch.randint(0, 2, size=(batch_size, num_labels), device=device) stride = 10 for predictions, labels in predictions_labels: pearson_correlation.reset() for i in range(batch_size // stride): - timestep_predictions = torch.FloatTensor( - predictions[stride * i : stride * (i + 1), :] - ) - timestep_labels = torch.FloatTensor(labels[stride * i : stride * (i + 1), :]) - timestep_mask = torch.FloatTensor(mask[stride * i : stride * (i + 1), :]) + timestep_predictions = predictions[stride * i : stride * (i + 1), :] + timestep_labels = labels[stride * i : stride * (i + 1), :] + timestep_mask = mask[stride * i : stride * (i + 1), :] expected_pearson_correlation = pearson_corrcoef( - predictions[: stride * (i + 1), :].reshape(-1), - labels[: stride * (i + 1), :].reshape(-1), - fweights=mask[: stride * (i + 1), :].reshape(-1), + predictions[: stride * (i + 1), :].view(-1).cpu().numpy(), + labels[: stride * (i + 1), :].view(-1).cpu().numpy(), + fweights=mask[: stride * (i + 1), :].view(-1).cpu().numpy(), ) pearson_correlation(timestep_predictions, timestep_labels, timestep_mask) - assert_allclose( - expected_pearson_correlation, pearson_correlation.get_metric(), rtol=1e-5 - ) + assert_allclose(expected_pearson_correlation, pearson_correlation.get_metric()) # Test reset pearson_correlation.reset() - pearson_correlation( - torch.FloatTensor(predictions), torch.FloatTensor(labels), torch.FloatTensor(mask) - ) + pearson_correlation(predictions, labels, mask) expected_pearson_correlation = pearson_corrcoef( - predictions.reshape(-1), labels.reshape(-1), fweights=mask.reshape(-1) + predictions.view(-1).cpu().numpy(), + labels.view(-1).cpu().numpy(), + fweights=mask.view(-1).cpu().numpy(), ) - assert_allclose( - expected_pearson_correlation, pearson_correlation.get_metric(), rtol=1e-5 - ) + assert_allclose(expected_pearson_correlation, pearson_correlation.get_metric()) diff --git a/allennlp/tests/training/metrics/sequence_accuracy_test.py b/allennlp/tests/training/metrics/sequence_accuracy_test.py index eb71aaf354f..c01656b5f2a 100644 --- a/allennlp/tests/training/metrics/sequence_accuracy_test.py +++ b/allennlp/tests/training/metrics/sequence_accuracy_test.py @@ -1,52 +1,58 @@ import torch -import numpy +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import SequenceAccuracy class SequenceAccuracyTest(AllenNlpTestCase): - def test_sequence_accuracy(self): + @multi_device + def test_sequence_accuracy(self, device: str): accuracy = SequenceAccuracy() - gold = torch.Tensor([[1, 2, 3], [2, 4, 8], [0, 1, 1]]) - predictions = torch.Tensor( - [[[1, 2, 3], [1, 2, -1]], [[2, 4, 8], [2, 5, 9]], [[-1, -1, -1], [0, 1, -1]]] + gold = torch.tensor([[1, 2, 3], [2, 4, 8], [0, 1, 1]], device=device) + predictions = torch.tensor( + [[[1, 2, 3], [1, 2, -1]], [[2, 4, 8], [2, 5, 9]], 
[[-1, -1, -1], [0, 1, -1]]], + device=device, ) accuracy(predictions, gold) actual_accuracy = accuracy.get_metric() - numpy.testing.assert_almost_equal(actual_accuracy, 2 / 3) + assert_allclose(actual_accuracy, 2 / 3) - def test_sequence_accuracy_respects_mask(self): + @multi_device + def test_sequence_accuracy_respects_mask(self, device: str): accuracy = SequenceAccuracy() - gold = torch.Tensor([[1, 2, 3], [2, 4, 8], [0, 1, 1], [11, 13, 17]]) - predictions = torch.Tensor( + gold = torch.tensor([[1, 2, 3], [2, 4, 8], [0, 1, 1], [11, 13, 17]], device=device) + predictions = torch.tensor( [ [[1, 2, 3], [1, 2, -1]], [[2, 4, 8], [2, 5, 9]], [[-1, -1, -1], [0, 1, -1]], [[12, 13, 17], [11, 13, 18]], - ] + ], + device=device, ) - mask = torch.Tensor([[0, 1, 1], [1, 1, 1], [1, 1, 0], [1, 0, 1]]) + mask = torch.tensor([[0, 1, 1], [1, 1, 1], [1, 1, 0], [1, 0, 1]], device=device) accuracy(predictions, gold, mask) actual_accuracy = accuracy.get_metric() - numpy.testing.assert_almost_equal(actual_accuracy, 3 / 4) + assert_allclose(actual_accuracy, 3 / 4) - def test_sequence_accuracy_accumulates_and_resets_correctly(self): + @multi_device + def test_sequence_accuracy_accumulates_and_resets_correctly(self, device: str): accuracy = SequenceAccuracy() - gold = torch.Tensor([[1, 2, 3]]) - accuracy(torch.Tensor([[[1, 2, 3]]]), gold) - accuracy(torch.Tensor([[[1, 2, 4]]]), gold) + gold = torch.tensor([[1, 2, 3]], device=device) + accuracy(torch.tensor([[[1, 2, 3]]], device=device), gold) + accuracy(torch.tensor([[[1, 2, 4]]], device=device), gold) actual_accuracy = accuracy.get_metric(reset=True) - numpy.testing.assert_almost_equal(actual_accuracy, 1 / 2) + assert_allclose(actual_accuracy, 1 / 2) assert accuracy.correct_count == 0 assert accuracy.total_count == 0 - def test_get_metric_on_new_object_works(self): + @multi_device + def test_get_metric_on_new_object_works(self, device: str): accuracy = SequenceAccuracy() actual_accuracy = accuracy.get_metric(reset=True) - numpy.testing.assert_almost_equal(actual_accuracy, 0) + assert_allclose(actual_accuracy, 0) diff --git a/allennlp/tests/training/metrics/span_based_f1_measure_test.py b/allennlp/tests/training/metrics/span_based_f1_measure_test.py index ac886ef17c7..9095decfb2d 100644 --- a/allennlp/tests/training/metrics/span_based_f1_measure_test.py +++ b/allennlp/tests/training/metrics/span_based_f1_measure_test.py @@ -2,14 +2,14 @@ import subprocess import torch -import numpy +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.checks import ConfigurationError +from allennlp.common.params import Params +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.data import Vocabulary -from allennlp.training.metrics import SpanBasedF1Measure, Metric from allennlp.models.semantic_role_labeler import write_bio_formatted_tags_to_file -from allennlp.common.params import Params -from allennlp.common.checks import ConfigurationError +from allennlp.training.metrics import SpanBasedF1Measure, Metric class SpanBasedF1Test(AllenNlpTestCase): @@ -38,7 +38,8 @@ def setUp(self): self.vocab = vocab - def test_span_metrics_are_computed_correcly_with_prediction_map(self): + @multi_device + def test_span_metrics_are_computed_correcly_with_prediction_map(self, device: str): # In this example, datapoint1 only has access to ARG1 and V labels, # whereas datapoint2 only has access to ARG2 and V labels. 
@@ -47,10 +48,10 @@ def test_span_metrics_are_computed_correcly_with_prediction_map(self): gold_indices = [[0, 1, 2, 0, 3, 0], [1, 2, 0, 3, 4, 0]] prediction_map_indices = [[0, 1, 2, 5, 6], [0, 3, 4, 5, 6]] - gold_tensor = torch.Tensor(gold_indices) - prediction_map_tensor = torch.Tensor(prediction_map_indices) + gold_tensor = torch.tensor(gold_indices, device=device) + prediction_map_tensor = torch.tensor(prediction_map_indices, device=device) - prediction_tensor = torch.rand([2, 6, 5]) + prediction_tensor = torch.rand([2, 6, 5], device=device) prediction_tensor[0, 0, 0] = 1 prediction_tensor[0, 1, 1] = 1 # (True Positive - ARG1 prediction_tensor[0, 2, 2] = 1 # *) @@ -97,30 +98,33 @@ def test_span_metrics_are_computed_correcly_with_prediction_map(self): metric_dict = metric.get_metric() - numpy.testing.assert_almost_equal(metric_dict["recall-ARG2"], 0.0) - numpy.testing.assert_almost_equal(metric_dict["precision-ARG2"], 0.0) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-ARG2"], 0.0) - numpy.testing.assert_almost_equal(metric_dict["recall-ARG1"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["precision-ARG1"], 0.5) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-ARG1"], 0.666666666) - numpy.testing.assert_almost_equal(metric_dict["recall-V"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["precision-V"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-V"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["recall-overall"], 0.75) - numpy.testing.assert_almost_equal(metric_dict["precision-overall"], 0.6) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-overall"], 0.666666666) - - def test_span_metrics_are_computed_correctly(self): + assert_allclose(metric_dict["recall-ARG2"], 0.0) + assert_allclose(metric_dict["precision-ARG2"], 0.0) + assert_allclose(metric_dict["f1-measure-ARG2"], 0.0) + assert_allclose(metric_dict["recall-ARG1"], 1.0) + assert_allclose(metric_dict["precision-ARG1"], 0.5) + assert_allclose(metric_dict["f1-measure-ARG1"], 0.666666666) + assert_allclose(metric_dict["recall-V"], 1.0) + assert_allclose(metric_dict["precision-V"], 1.0) + assert_allclose(metric_dict["f1-measure-V"], 1.0) + assert_allclose(metric_dict["recall-overall"], 0.75) + assert_allclose(metric_dict["precision-overall"], 0.6) + assert_allclose(metric_dict["f1-measure-overall"], 0.666666666) + + @multi_device + def test_span_metrics_are_computed_correctly(self, device: str): gold_labels = ["O", "B-ARG1", "I-ARG1", "O", "B-ARG2", "I-ARG2", "O", "O", "O"] gold_indices = [self.vocab.get_token_index(x, "tags") for x in gold_labels] - gold_tensor = torch.Tensor([gold_indices]) + gold_tensor = torch.tensor([gold_indices], device=device) - prediction_tensor = torch.rand([2, 9, self.vocab.get_vocab_size("tags")]) + prediction_tensor = torch.rand([2, 9, self.vocab.get_vocab_size("tags")], device=device) # Test that the span measure ignores completely masked sequences by # passing a mask with a fully masked row. 
- mask = torch.LongTensor([[1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0]]) + mask = torch.tensor( + [[1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0]], device=device + ) prediction_tensor[:, 0, 0] = 1 prediction_tensor[:, 1, 1] = 1 # (True positive - ARG1 @@ -159,24 +163,25 @@ def test_span_metrics_are_computed_correctly(self): metric_dict = metric.get_metric() - numpy.testing.assert_almost_equal(metric_dict["recall-ARG2"], 0.0) - numpy.testing.assert_almost_equal(metric_dict["precision-ARG2"], 0.0) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-ARG2"], 0.0) - numpy.testing.assert_almost_equal(metric_dict["recall-ARG1"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["precision-ARG1"], 0.5) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-ARG1"], 0.666666666) - numpy.testing.assert_almost_equal(metric_dict["recall-overall"], 0.5) - numpy.testing.assert_almost_equal(metric_dict["precision-overall"], 0.5) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-overall"], 0.5) - - def test_bmes_span_metrics_are_computed_correctly(self): + assert_allclose(metric_dict["recall-ARG2"], 0.0) + assert_allclose(metric_dict["precision-ARG2"], 0.0) + assert_allclose(metric_dict["f1-measure-ARG2"], 0.0) + assert_allclose(metric_dict["recall-ARG1"], 1.0) + assert_allclose(metric_dict["precision-ARG1"], 0.5) + assert_allclose(metric_dict["f1-measure-ARG1"], 0.666666666) + assert_allclose(metric_dict["recall-overall"], 0.5) + assert_allclose(metric_dict["precision-overall"], 0.5) + assert_allclose(metric_dict["f1-measure-overall"], 0.5) + + @multi_device + def test_bmes_span_metrics_are_computed_correctly(self, device: str): # (bmes_tags) B:0, M:1, E:2, S:3. # [S, B, M, E, S] # [S, S, S, S, S] gold_indices = [[3, 0, 1, 2, 3], [3, 3, 3, 3, 3]] - gold_tensor = torch.Tensor(gold_indices) + gold_tensor = torch.tensor(gold_indices, device=device) - prediction_tensor = torch.rand([2, 5, 4]) + prediction_tensor = torch.rand([2, 5, 4], device=device) # [S, B, E, S, S] # TP: 2, FP: 2, FN: 1. prediction_tensor[0, 0, 3] = 1 # (True positive) @@ -198,24 +203,28 @@ def test_bmes_span_metrics_are_computed_correctly(self): # TP: 3, FP: 4, FN: 5. 
metric_dict = metric.get_metric() - numpy.testing.assert_almost_equal(metric_dict["recall-overall"], 0.375) - numpy.testing.assert_almost_equal(metric_dict["precision-overall"], 0.428, decimal=3) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-overall"], 0.4) + assert_allclose(metric_dict["recall-overall"], 0.375, rtol=0.001, atol=1e-03) + assert_allclose(metric_dict["precision-overall"], 0.428, rtol=0.001, atol=1e-03) + assert_allclose(metric_dict["f1-measure-overall"], 0.4, rtol=0.001, atol=1e-03) - def test_span_f1_can_build_from_params(self): + @multi_device + def test_span_f1_can_build_from_params(self, device: str): params = Params({"type": "span_f1", "tag_namespace": "tags", "ignore_classes": ["V"]}) metric = Metric.from_params(params=params, vocabulary=self.vocab) - assert metric._ignore_classes == ["V"] - assert metric._label_vocabulary == self.vocab.get_index_to_token_vocabulary("tags") + assert metric._ignore_classes == ["V"] # type: ignore + assert metric._label_vocabulary == self.vocab.get_index_to_token_vocabulary( + "tags" + ) # type: ignore - def test_span_f1_matches_perl_script_for_continued_arguments(self): + @multi_device + def test_span_f1_matches_perl_script_for_continued_arguments(self, device: str): bio_tags = ["B-ARG1", "O", "B-C-ARG1", "B-V", "B-ARGM-ADJ", "O"] sentence = ["Mark", "and", "Matt", "were", "running", "fast", "."] gold_indices = [self.vocab.get_token_index(x, "tags") for x in bio_tags] - gold_tensor = torch.Tensor([gold_indices]) - prediction_tensor = torch.rand([1, 6, self.vocab.get_vocab_size("tags")]) - mask = torch.LongTensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]]) + gold_tensor = torch.tensor([gold_indices], device=device) + prediction_tensor = torch.rand([1, 6, self.vocab.get_vocab_size("tags")], device=device) + mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]], device=device) # Make prediction so that it is exactly correct. 
for i, tag_index in enumerate(gold_indices): @@ -235,24 +244,24 @@ def test_span_f1_matches_perl_script_for_continued_arguments(self): assert metric._true_positives["V"] == 1 assert metric._true_positives["ARGM-ADJ"] == 1 - numpy.testing.assert_almost_equal(metric_dict["recall-ARG1"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["precision-ARG1"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-ARG1"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["recall-V"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["precision-V"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-V"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["recall-ARGM-ADJ"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["precision-ARGM-ADJ"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-ARGM-ADJ"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["recall-overall"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["precision-overall"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-overall"], 1.0) + assert_allclose(metric_dict["recall-ARG1"], 1.0) + assert_allclose(metric_dict["precision-ARG1"], 1.0) + assert_allclose(metric_dict["f1-measure-ARG1"], 1.0) + assert_allclose(metric_dict["recall-V"], 1.0) + assert_allclose(metric_dict["precision-V"], 1.0) + assert_allclose(metric_dict["f1-measure-V"], 1.0) + assert_allclose(metric_dict["recall-ARGM-ADJ"], 1.0) + assert_allclose(metric_dict["precision-ARGM-ADJ"], 1.0) + assert_allclose(metric_dict["f1-measure-ARGM-ADJ"], 1.0) + assert_allclose(metric_dict["recall-overall"], 1.0) + assert_allclose(metric_dict["precision-overall"], 1.0) + assert_allclose(metric_dict["f1-measure-overall"], 1.0) # Check that the number of true positive ARG1 labels is the same as the perl script's output: gold_file_path = os.path.join(self.TEST_DIR, "gold_conll_eval.txt") prediction_file_path = os.path.join(self.TEST_DIR, "prediction_conll_eval.txt") - with open(gold_file_path, "a+") as gold_file, open( - prediction_file_path, "a+" + with open(gold_file_path, "w") as gold_file, open( + prediction_file_path, "w" ) as prediction_file: # Use the same bio tags as prediction vs gold to make it obvious by looking # at the perl script output if something is wrong. @@ -274,15 +283,16 @@ def test_span_f1_matches_perl_script_for_continued_arguments(self): ) assert num_correct_arg1_instances_from_perl_evaluation == metric._true_positives["ARG1"] - def test_span_f1_accepts_tags_to_spans_function_argument(self): + @multi_device + def test_span_f1_accepts_tags_to_spans_function_argument(self, device: str): def mock_tags_to_spans_function(tag_sequence, classes_to_ignore=None): return [("mock", (42, 42))] # Should be ignore. 
bio_tags = ["B-ARG1", "O", "B-C-ARG1", "B-V", "B-ARGM-ADJ", "O"] gold_indices = [self.vocab.get_token_index(x, "tags") for x in bio_tags] - gold_tensor = torch.Tensor([gold_indices]) - prediction_tensor = torch.rand([1, 6, self.vocab.get_vocab_size("tags")]) + gold_tensor = torch.tensor([gold_indices], device=device) + prediction_tensor = torch.rand([1, 6, self.vocab.get_vocab_size("tags")], device=device) metric = SpanBasedF1Measure( self.vocab, @@ -294,9 +304,9 @@ def mock_tags_to_spans_function(tag_sequence, classes_to_ignore=None): metric(prediction_tensor, gold_tensor) metric_dict = metric.get_metric() - numpy.testing.assert_almost_equal(metric_dict["recall-overall"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["precision-overall"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-overall"], 1.0) + assert_allclose(metric_dict["recall-overall"], 1.0) + assert_allclose(metric_dict["precision-overall"], 1.0) + assert_allclose(metric_dict["f1-measure-overall"], 1.0) with self.assertRaises(ConfigurationError): SpanBasedF1Measure(self.vocab, label_encoding="INVALID") diff --git a/allennlp/tests/training/metrics/spearman_correlation_test.py b/allennlp/tests/training/metrics/spearman_correlation_test.py index 1111a6834a5..8f2edd9832e 100644 --- a/allennlp/tests/training/metrics/spearman_correlation_test.py +++ b/allennlp/tests/training/metrics/spearman_correlation_test.py @@ -1,9 +1,9 @@ import math + import torch -import numpy as np -from numpy.testing import assert_allclose +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import SpearmanCorrelation @@ -17,8 +17,8 @@ def spearman_formula(predictions, labels, mask=None): labels = labels * mask # if all number of a set is same, return np.nan - if len(np.unique(predictions)) == 1 or len(np.unique(labels)) == 1: - return np.nan + if len(torch.unique(predictions)) == 1 or len(torch.unique(labels)) == 1: + return float("NaN") len_pre = len(predictions) @@ -35,23 +35,24 @@ def spearman_formula(predictions, labels, mask=None): total = 0 for i in range(len_pre): total += (predictions[i][0] - labels[i][0]) ** 2 - expected_spearman_correlation = 1 - float(6 * total) / (len_pre * (len_pre ** 2 - 1)) + expected_spearman_correlation = 1 - 6 * total / (len_pre * (len_pre ** 2 - 1)) return expected_spearman_correlation class SpearmanCorrelationTest(AllenNlpTestCase): - def test_unmasked_computation(self): + @multi_device + def test_unmasked_computation(self, device: str): spearman_correlation = SpearmanCorrelation() batch_size = 10 num_labels = 10 - predictions1 = np.random.randn(batch_size, num_labels).astype("float32") - labels1 = 0.5 * predictions1 + np.random.randn(batch_size, num_labels).astype("float32") + predictions1 = torch.randn(batch_size, num_labels, device=device) + labels1 = 0.5 * predictions1 + torch.randn(batch_size, num_labels, device=device) - predictions2 = np.random.randn(1).repeat(num_labels).astype("float32") - predictions2 = predictions2[np.newaxis, :].repeat(batch_size, axis=0) - labels2 = np.random.randn(1).repeat(num_labels).astype("float32") - labels2 = 0.5 * predictions2 + labels2[np.newaxis, :].repeat(batch_size, axis=0) + predictions2 = torch.randn(1, device=device).repeat(num_labels) + predictions2 = predictions2.unsqueeze(0).expand(batch_size, -1) + labels2 = torch.randn(1, device=device).expand(num_labels) + labels2 = 0.5 * predictions2 + 
labels2.unsqueeze(0).expand(batch_size, -1) # in most cases, the data is constructed like predictions_1, the data of such a batch different. # but in a few cases, for example, predictions_2, the data of such a batch is exactly the same. @@ -59,39 +60,37 @@ def test_unmasked_computation(self): for predictions, labels in predictions_labels_: spearman_correlation.reset() - spearman_correlation(torch.FloatTensor(predictions), torch.FloatTensor(labels)) + spearman_correlation(predictions, labels) assert_allclose( spearman_formula(predictions.reshape(-1), labels.reshape(-1)), spearman_correlation.get_metric(), - rtol=1e-5, ) - def test_masked_computation(self): + @multi_device + def test_masked_computation(self, device: str): spearman_correlation = SpearmanCorrelation() batch_size = 10 num_labels = 10 - predictions1 = np.random.randn(batch_size, num_labels).astype("float32") - labels1 = 0.5 * predictions1 + np.random.randn(batch_size, num_labels).astype("float32") + predictions1 = torch.randn(batch_size, num_labels, device=device) + labels1 = 0.5 * predictions1 + torch.randn(batch_size, num_labels, device=device) - predictions2 = np.random.randn(1).repeat(num_labels).astype("float32") - predictions2 = predictions2[np.newaxis, :].repeat(batch_size, axis=0) - labels2 = np.random.randn(1).repeat(num_labels).astype("float32") - labels2 = 0.5 * predictions2 + labels2[np.newaxis, :].repeat(batch_size, axis=0) + predictions2 = torch.randn(1, device=device).expand(num_labels) + predictions2 = predictions2.unsqueeze(0).expand(batch_size, -1) + labels2 = torch.randn(1, device=device).expand(num_labels) + labels2 = 0.5 * predictions2 + labels2.unsqueeze(0).expand(batch_size, -1) # in most cases, the data is constructed like predictions_1, the data of such a batch different. # but in a few cases, for example, predictions_2, the data of such a batch is exactly the same. predictions_labels_ = [(predictions1, labels1), (predictions2, labels2)] # Random binary mask - mask = np.random.randint(0, 2, size=(batch_size, num_labels)).astype("float32") + mask = torch.randint(0, 2, size=(batch_size, num_labels), device=device) for predictions, labels in predictions_labels_: spearman_correlation.reset() - spearman_correlation( - torch.FloatTensor(predictions), torch.FloatTensor(labels), torch.FloatTensor(mask) - ) + spearman_correlation(predictions, labels, mask) expected_spearman_correlation = spearman_formula( - predictions.reshape(-1), labels.reshape(-1), mask=mask.reshape(-1) + predictions.view(-1), labels.view(-1), mask=mask.view(-1) ) # because add mask, a batch of predictions or labels will have many 0, @@ -101,26 +100,27 @@ def test_masked_computation(self): # so here we only test the positive and negative results of the results. 
assert (expected_spearman_correlation * spearman_correlation.get_metric()) > 0 - def test_reset(self): + @multi_device + def test_reset(self, device: str): spearman_correlation = SpearmanCorrelation() batch_size = 10 num_labels = 10 - predictions = np.random.randn(batch_size, num_labels).astype("float32") - labels = 0.5 * predictions + np.random.randn(batch_size, num_labels).astype("float32") + predictions = torch.randn(batch_size, num_labels, device=device) + labels = 0.5 * predictions + torch.randn(batch_size, num_labels, device=device) # 1.test spearman_correlation.reset() spearman_correlation.reset() - spearman_correlation(torch.FloatTensor(predictions), torch.FloatTensor(labels)) + spearman_correlation(predictions, labels) temp = spearman_correlation.get_metric() spearman_correlation.reset() - spearman_correlation(torch.FloatTensor(predictions), torch.FloatTensor(labels)) + spearman_correlation(predictions, labels) assert spearman_correlation.get_metric() == temp # 2.test spearman_correlation.reset() spearman_correlation.reset() - spearman_correlation(torch.FloatTensor(predictions), torch.FloatTensor(labels)) + spearman_correlation(predictions, labels) spearman_correlation.get_metric(reset=False) - assert spearman_correlation.get_metric() != np.nan + assert spearman_correlation.get_metric() != float("NaN") spearman_correlation.get_metric(reset=True) assert math.isnan(spearman_correlation.get_metric()) diff --git a/allennlp/tests/training/metrics/unigram_recall_test.py b/allennlp/tests/training/metrics/unigram_recall_test.py index 0862ee94a62..38f7523e628 100644 --- a/allennlp/tests/training/metrics/unigram_recall_test.py +++ b/allennlp/tests/training/metrics/unigram_recall_test.py @@ -1,52 +1,58 @@ import torch -import numpy +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import UnigramRecall class UnigramRecallTest(AllenNlpTestCase): - def test_sequence_recall(self): + @multi_device + def test_sequence_recall(self, device: str): recall = UnigramRecall() - gold = torch.Tensor([[1, 2, 3], [2, 4, 8], [7, 1, 1]]) - predictions = torch.Tensor( - [[[1, 2, 3], [1, 2, -1]], [[2, 4, 8], [2, 5, 9]], [[-1, -1, -1], [7, 1, -1]]] + gold = torch.tensor([[1, 2, 3], [2, 4, 8], [7, 1, 1]], device=device) + predictions = torch.tensor( + [[[1, 2, 3], [1, 2, -1]], [[2, 4, 8], [2, 5, 9]], [[-1, -1, -1], [7, 1, -1]]], + device=device, ) recall(predictions, gold) actual_recall = recall.get_metric() - numpy.testing.assert_almost_equal(actual_recall, 1) + assert_allclose(actual_recall, 1) - def test_sequence_recall_respects_mask(self): + @multi_device + def test_sequence_recall_respects_mask(self, device: str): recall = UnigramRecall() - gold = torch.Tensor([[2, 4, 8], [1, 2, 3], [7, 1, 1], [11, 14, 17]]) - predictions = torch.Tensor( + gold = torch.tensor([[2, 4, 8], [1, 2, 3], [7, 1, 1], [11, 14, 17]], device=device) + predictions = torch.tensor( [ [[2, 4, 8], [2, 5, 9]], # 3/3 [[-1, 2, 4], [3, 8, -1]], # 2/2 [[-1, -1, -1], [7, 2, -1]], # 1/2 [[12, 13, 17], [11, 13, 18]], # 2/2 - ] + ], + device=device, ) - mask = torch.Tensor([[1, 1, 1], [0, 1, 1], [1, 1, 0], [1, 0, 1]]) + mask = torch.tensor([[1, 1, 1], [0, 1, 1], [1, 1, 0], [1, 0, 1]], device=device) recall(predictions, gold, mask) actual_recall = recall.get_metric() - numpy.testing.assert_almost_equal(actual_recall, 7 / 8) + assert_allclose(actual_recall, 7 / 8) - def 
test_sequence_recall_accumulates_and_resets_correctly(self): + @multi_device + def test_sequence_recall_accumulates_and_resets_correctly(self, device: str): recall = UnigramRecall() - gold = torch.Tensor([[1, 2, 3]]) - recall(torch.Tensor([[[1, 2, 3]]]), gold) - recall(torch.Tensor([[[7, 8, 4]]]), gold) + gold = torch.tensor([[1, 2, 3]], device=device) + recall(torch.tensor([[[1, 2, 3]]], device=device), gold) + recall(torch.tensor([[[7, 8, 4]]], device=device), gold) actual_recall = recall.get_metric(reset=True) - numpy.testing.assert_almost_equal(actual_recall, 1 / 2) + assert_allclose(actual_recall, 1 / 2) assert recall.correct_count == 0 assert recall.total_count == 0 - def test_get_metric_on_new_object_works(self): + @multi_device + def test_get_metric_on_new_object_works(self, device: str): recall = UnigramRecall() actual_recall = recall.get_metric(reset=True) - numpy.testing.assert_almost_equal(actual_recall, 0) + assert_allclose(actual_recall, 0) diff --git a/allennlp/training/metrics/spearman_correlation.py b/allennlp/training/metrics/spearman_correlation.py index 0f7314a0738..b66ad2f2ce0 100644 --- a/allennlp/training/metrics/spearman_correlation.py +++ b/allennlp/training/metrics/spearman_correlation.py @@ -44,14 +44,14 @@ def __call__( # Flatten predictions, gold_labels, and mask. We calculate the Spearman correlation between # the vectors, since each element in the predictions and gold_labels tensor is assumed # to be a separate observation. - predictions = predictions.view(-1) - gold_labels = gold_labels.view(-1) + predictions = predictions.reshape(-1) + gold_labels = gold_labels.reshape(-1) self.total_predictions = self.total_predictions.to(predictions.device) self.total_gold_labels = self.total_gold_labels.to(gold_labels.device) if mask is not None: - mask = mask.view(-1) + mask = mask.reshape(-1) self.total_predictions = torch.cat((self.total_predictions, predictions * mask), 0) self.total_gold_labels = torch.cat((self.total_gold_labels, gold_labels * mask), 0) else: From 99660ba0771fd724f635b81f1be75fa2b2e23e12 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Thu, 27 Feb 2020 10:54:09 -0500 Subject: [PATCH 3/7] Add a test for the test utility --- allennlp/tests/common/testing.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 allennlp/tests/common/testing.py diff --git a/allennlp/tests/common/testing.py b/allennlp/tests/common/testing.py new file mode 100644 index 00000000000..19d0005cb46 --- /dev/null +++ b/allennlp/tests/common/testing.py @@ -0,0 +1,19 @@ +import torch + +from allennlp.common.testing import AllenNlpTestCase, multi_device + + +class TestFromParams(AllenNlpTestCase): + def test_multi_device(self): + actual_devices = set() + + @multi_device + def dummy_func(_self, device: str): + # Have `self` as in class test functions. 
+ nonlocal actual_devices + actual_devices.add(device) + + dummy_func(self) + + expected_devices = {"cpu", "cuda"} if torch.cuda.is_available() else {"cpu"} + self.assertSetEqual(expected_devices, actual_devices) From 97772f7ce52ebe37f99572f4321bef6bed7240f0 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Thu, 27 Feb 2020 13:05:31 -0500 Subject: [PATCH 4/7] mypy --- allennlp/tests/training/metrics/span_based_f1_measure_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/allennlp/tests/training/metrics/span_based_f1_measure_test.py b/allennlp/tests/training/metrics/span_based_f1_measure_test.py index 9095decfb2d..e4aaca1f674 100644 --- a/allennlp/tests/training/metrics/span_based_f1_measure_test.py +++ b/allennlp/tests/training/metrics/span_based_f1_measure_test.py @@ -212,9 +212,9 @@ def test_span_f1_can_build_from_params(self, device: str): params = Params({"type": "span_f1", "tag_namespace": "tags", "ignore_classes": ["V"]}) metric = Metric.from_params(params=params, vocabulary=self.vocab) assert metric._ignore_classes == ["V"] # type: ignore - assert metric._label_vocabulary == self.vocab.get_index_to_token_vocabulary( + assert metric._label_vocabulary == self.vocab.get_index_to_token_vocabulary( # type: ignore "tags" - ) # type: ignore + ) @multi_device def test_span_f1_matches_perl_script_for_continued_arguments(self, device: str): From 8cb62df9ee9be93d4e5f09ab502977eeb4bb5fc5 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Thu, 27 Feb 2020 13:21:09 -0500 Subject: [PATCH 5/7] Update allennlp/common/testing/test_case.py Co-Authored-By: Mark Neumann --- allennlp/common/testing/test_case.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allennlp/common/testing/test_case.py b/allennlp/common/testing/test_case.py index 56ea99eba7e..2eec156a6f6 100644 --- a/allennlp/common/testing/test_case.py +++ b/allennlp/common/testing/test_case.py @@ -55,7 +55,7 @@ def parametrize(arg_names: Iterable[str], arg_values: Iterable[Iterable[Any]]): Argument names to pass to the test function. arg_values : `Iterable[Iterable[Any]]`, required. Iterable of values to pass to each of the args. - A function call is gonna be made for each inner iterable. + The decorated test will be run for each inner iterable. 
""" def decorator(func): From a7180b0fa6cb527e500c7f9d20e5117898914afd Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Thu, 27 Feb 2020 13:38:58 -0500 Subject: [PATCH 6/7] Fix a PR comment --- allennlp/tests/common/testing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allennlp/tests/common/testing.py b/allennlp/tests/common/testing.py index 19d0005cb46..fff065baf33 100644 --- a/allennlp/tests/common/testing.py +++ b/allennlp/tests/common/testing.py @@ -3,7 +3,7 @@ from allennlp.common.testing import AllenNlpTestCase, multi_device -class TestFromParams(AllenNlpTestCase): +class TestTesting(AllenNlpTestCase): def test_multi_device(self): actual_devices = set() From 4d708bad321fdcb50772b798ad1cdcc968921066 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Thu, 27 Feb 2020 13:40:33 -0500 Subject: [PATCH 7/7] flake8 --- allennlp/common/testing/test_case.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allennlp/common/testing/test_case.py b/allennlp/common/testing/test_case.py index 2eec156a6f6..222a49361a9 100644 --- a/allennlp/common/testing/test_case.py +++ b/allennlp/common/testing/test_case.py @@ -55,7 +55,7 @@ def parametrize(arg_names: Iterable[str], arg_values: Iterable[Iterable[Any]]): Argument names to pass to the test function. arg_values : `Iterable[Iterable[Any]]`, required. Iterable of values to pass to each of the args. - The decorated test will be run for each inner iterable. + The decorated test will be run for each inner iterable. """ def decorator(func):