From 28956d5aca8c7085987d2f96fc88091bf7e6fdc8 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Wed, 26 Feb 2020 00:00:18 -0500 Subject: [PATCH 1/7] Make most metrics work on GPU --- .../tests/training/metrics/entropy_test.py | 4 +- .../training/metrics/attachment_scores.py | 7 ++- allennlp/training/metrics/auc.py | 11 +++-- allennlp/training/metrics/average.py | 2 +- allennlp/training/metrics/bleu.py | 4 +- allennlp/training/metrics/boolean_accuracy.py | 4 +- .../training/metrics/categorical_accuracy.py | 4 +- .../training/metrics/conll_coref_scores.py | 47 +++++++++---------- allennlp/training/metrics/covariance.py | 2 +- allennlp/training/metrics/entropy.py | 4 +- allennlp/training/metrics/fbeta_measure.py | 16 +++---- .../training/metrics/mean_absolute_error.py | 4 +- allennlp/training/metrics/mention_recall.py | 4 +- allennlp/training/metrics/metric.py | 11 +++-- .../training/metrics/pearson_correlation.py | 2 +- .../training/metrics/sequence_accuracy.py | 4 +- .../training/metrics/span_based_f1_measure.py | 8 ++-- .../training/metrics/spearman_correlation.py | 9 ++-- allennlp/training/metrics/srl_eval_scorer.py | 6 +-- allennlp/training/metrics/unigram_recall.py | 8 ++-- 20 files changed, 83 insertions(+), 78 deletions(-) diff --git a/allennlp/tests/training/metrics/entropy_test.py b/allennlp/tests/training/metrics/entropy_test.py index 97040b1c97e..398a4ff51cb 100644 --- a/allennlp/tests/training/metrics/entropy_test.py +++ b/allennlp/tests/training/metrics/entropy_test.py @@ -16,11 +16,11 @@ def test_entropy_for_uniform_distribution(self): metric = Entropy() logits = torch.Tensor([[1, 1, 1, 1], [1, 1, 1, 1]]) metric(logits) - numpy.testing.assert_almost_equal(metric.get_metric(), 1.38629436) + numpy.testing.assert_almost_equal(metric.get_metric().cpu(), 1.38629436) # actual values shouldn't effect uniform distribution: logits = torch.Tensor([[2, 2, 2, 2], [2, 2, 2, 2]]) metric(logits) - numpy.testing.assert_almost_equal(metric.get_metric(), 1.38629436) + numpy.testing.assert_almost_equal(metric.get_metric().cpu(), 1.38629436) metric.reset() assert metric._entropy == 0.0 diff --git a/allennlp/training/metrics/attachment_scores.py b/allennlp/training/metrics/attachment_scores.py index 29a970dbb50..f94ea054d49 100644 --- a/allennlp/training/metrics/attachment_scores.py +++ b/allennlp/training/metrics/attachment_scores.py @@ -53,10 +53,13 @@ def __call__( # type: ignore mask : `torch.Tensor`, optional (default = None). A tensor of the same shape as `predicted_indices`. """ - unwrapped = self.unwrap_to_tensors( + detached = self.detach_tensors( predicted_indices, predicted_labels, gold_indices, gold_labels, mask ) - predicted_indices, predicted_labels, gold_indices, gold_labels, mask = unwrapped + predicted_indices, predicted_labels, gold_indices, gold_labels, mask = detached + + if mask is None: + mask = torch.ones_like(predicted_indices) mask = mask.long() predicted_indices = predicted_indices.long() diff --git a/allennlp/training/metrics/auc.py b/allennlp/training/metrics/auc.py index 4c8694ff26f..f3ddd243ec0 100644 --- a/allennlp/training/metrics/auc.py +++ b/allennlp/training/metrics/auc.py @@ -40,7 +40,7 @@ def __call__( A one-dimensional label tensor of shape (batch_size). """ - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) # Sanity checks. 
if gold_labels.dim() != 1: @@ -70,9 +70,12 @@ def __call__( if mask is None: batch_size = gold_labels.shape[0] - mask = torch.ones(batch_size) + mask = torch.ones(batch_size, device=gold_labels.device) mask = mask.to(dtype=torch.bool) + self._all_predictions = self._all_predictions.to(predictions.device) + self._all_gold_labels = self._all_gold_labels.to(gold_labels.device) + self._all_predictions = torch.cat( [self._all_predictions, torch.masked_select(predictions, mask).float()], dim=0 ) @@ -84,8 +87,8 @@ def get_metric(self, reset: bool = False): if self._all_gold_labels.shape[0] == 0: return 0.5 false_positive_rates, true_positive_rates, _ = metrics.roc_curve( - self._all_gold_labels.numpy(), - self._all_predictions.numpy(), + self._all_gold_labels.cpu().numpy(), + self._all_predictions.cpu().numpy(), pos_label=self._positive_label, ) auc = metrics.auc(false_positive_rates, true_positive_rates) diff --git a/allennlp/training/metrics/average.py b/allennlp/training/metrics/average.py index 8d860cbf87a..6652fbc9a52 100644 --- a/allennlp/training/metrics/average.py +++ b/allennlp/training/metrics/average.py @@ -24,7 +24,7 @@ def __call__(self, value): value : `float` The value to average. """ - self._total_value += list(self.unwrap_to_tensors(value))[0] + self._total_value += list(self.detach_tensors(value))[0] self._count += 1 @overrides diff --git a/allennlp/training/metrics/bleu.py b/allennlp/training/metrics/bleu.py index d2d389091f3..972293a5a9c 100644 --- a/allennlp/training/metrics/bleu.py +++ b/allennlp/training/metrics/bleu.py @@ -108,7 +108,7 @@ def _get_brevity_penalty(self) -> float: return math.exp(1.0 - self._reference_lengths / self._prediction_lengths) def _get_valid_tokens_mask(self, tensor: torch.LongTensor) -> torch.ByteTensor: - valid_tokens_mask = torch.ones(tensor.size(), dtype=torch.bool) + valid_tokens_mask = torch.ones_like(tensor, dtype=torch.bool) for index in self._exclude_indices: valid_tokens_mask = valid_tokens_mask & (tensor != index) return valid_tokens_mask @@ -133,7 +133,7 @@ def __call__( None """ - predictions, gold_targets = self.unwrap_to_tensors(predictions, gold_targets) + predictions, gold_targets = self.detach_tensors(predictions, gold_targets) for ngram_size, _ in enumerate(self._ngram_weights, start=1): precision_matches, precision_totals = self._get_modified_precision_counts( predictions, gold_targets, ngram_size diff --git a/allennlp/training/metrics/boolean_accuracy.py b/allennlp/training/metrics/boolean_accuracy.py index f36874fca44..5e3777fe60f 100644 --- a/allennlp/training/metrics/boolean_accuracy.py +++ b/allennlp/training/metrics/boolean_accuracy.py @@ -43,7 +43,7 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A tensor of the same shape as `predictions`. """ - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) # Some sanity checks. if gold_labels.size() != predictions.size(): @@ -69,7 +69,7 @@ def __call__( # so we'll keep predictions that aren't. 
keep = mask.view(batch_size, -1).max(dim=1)[0].float() else: - keep = torch.ones(batch_size).float() + keep = torch.ones(batch_size, device=predictions.device).float() predictions = predictions.view(batch_size, -1) gold_labels = gold_labels.view(batch_size, -1) diff --git a/allennlp/training/metrics/categorical_accuracy.py b/allennlp/training/metrics/categorical_accuracy.py index a1bc038e456..cc8aac66885 100644 --- a/allennlp/training/metrics/categorical_accuracy.py +++ b/allennlp/training/metrics/categorical_accuracy.py @@ -45,7 +45,7 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A masking tensor the same size as `gold_labels`. """ - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) # Some sanity checks. num_classes = predictions.size(-1) @@ -80,7 +80,7 @@ def __call__( # ith entry in gold_labels points to index (0-num_classes) for ith row in max_predictions # For each row check if index pointed by gold_label is was 1 or not (among max scored classes) correct = max_predictions_mask[ - torch.arange(gold_labels.numel()).long(), gold_labels + torch.arange(gold_labels.numel(), device=gold_labels.device).long(), gold_labels ].float() tie_counts = max_predictions_mask.sum(-1) correct /= tie_counts.float() diff --git a/allennlp/training/metrics/conll_coref_scores.py b/allennlp/training/metrics/conll_coref_scores.py index eb2fc71d7d2..1df68077df9 100644 --- a/allennlp/training/metrics/conll_coref_scores.py +++ b/allennlp/training/metrics/conll_coref_scores.py @@ -40,9 +40,15 @@ def __call__( A metadata dictionary for each instance in the batch. We use the "clusters" key from this dictionary, which has the annotated gold coreference clusters for that instance. """ - top_spans, antecedent_indices, predicted_antecedents = self.unwrap_to_tensors( + top_spans, antecedent_indices, predicted_antecedents = self.detach_tensors( top_spans, antecedent_indices, predicted_antecedents ) + + # They need to be in CPU because Scorer.ceafe uses a SciPy function. + top_spans = top_spans.cpu() + antecedent_indices = antecedent_indices.cpu() + predicted_antecedents = predicted_antecedents.cpu() + for i, metadata in enumerate(metadata_list): gold_clusters, mention_to_gold = self.get_gold_clusters(metadata["clusters"]) predicted_clusters, mention_to_predicted = self.get_predicted_clusters( @@ -78,17 +84,12 @@ def get_gold_clusters(gold_clusters): @staticmethod def get_predicted_clusters( - top_spans: torch.Tensor, - antecedent_indices: torch.Tensor, - predicted_antecedents: torch.Tensor, + top_spans: torch.Tensor, # (num_spans, 2) + antecedent_indices: torch.Tensor, # (num_spans, num_antecedents) + predicted_antecedents: torch.Tensor, # (num_spans,) ) -> Tuple[ List[Tuple[Tuple[int, int], ...]], Dict[Tuple[int, int], Tuple[Tuple[int, int], ...]] ]: - # Pytorch 0.4 introduced scalar tensors, so our calls to tuple() and such below don't - # actually give ints unless we convert to numpy first. So we do that here. - top_spans = top_spans.numpy() # (num_spans, 2) - antecedent_indices = antecedent_indices.numpy() # (num_spans, num_antecedents) - predicted_antecedents = predicted_antecedents.numpy() # (num_spans,) predicted_clusters_to_ids: Dict[Tuple[int, int], int] = {} clusters: List[List[Tuple[int, int]]] = [] @@ -100,7 +101,9 @@ def get_predicted_clusters( predicted_index = antecedent_indices[i, predicted_antecedent] # Must be a previous span. 
assert i > predicted_index - antecedent_span: Tuple[int, int] = tuple(top_spans[predicted_index]) # type: ignore + antecedent_span: Tuple[int, int] = tuple( # type: ignore + top_spans[predicted_index].tolist() + ) # Check if we've seen the span before. if antecedent_span in predicted_clusters_to_ids.keys(): @@ -111,7 +114,7 @@ def get_predicted_clusters( clusters.append([antecedent_span]) predicted_clusters_to_ids[antecedent_span] = predicted_cluster_id - mention: Tuple[int, int] = tuple(top_spans[i]) # type: ignore + mention: Tuple[int, int] = tuple(top_spans[i].tolist()) # type: ignore clusters[predicted_cluster_id].append(mention) predicted_clusters_to_ids[mention] = predicted_cluster_id @@ -150,29 +153,21 @@ def update(self, predicted, gold, mention_to_predicted, mention_to_gold): self.recall_denominator += r_den def get_f1(self): - precision = ( - 0 - if self.precision_denominator == 0 - else self.precision_numerator / float(self.precision_denominator) - ) - recall = ( - 0 - if self.recall_denominator == 0 - else self.recall_numerator / float(self.recall_denominator) - ) + precision = self.get_precision() + recall = self.get_recall() return 0 if precision + recall == 0 else 2 * precision * recall / (precision + recall) def get_recall(self): - if self.recall_numerator == 0: + if self.recall_denominator == 0: return 0 else: - return self.recall_numerator / float(self.recall_denominator) + return self.recall_numerator / self.recall_denominator def get_precision(self): - if self.precision_numerator == 0: + if self.precision_denominator == 0: return 0 else: - return self.precision_numerator / float(self.precision_denominator) + return self.precision_numerator / self.precision_denominator def get_prf(self): return self.get_precision(), self.get_recall(), self.get_f1() @@ -234,7 +229,7 @@ def phi4(gold_clustering, predicted_clustering): @staticmethod def ceafe(clusters, gold_clusters): """ - Computes the Constrained EntityAlignment F-Measure (CEAF) for evaluating coreference. + Computes the Constrained Entity-Alignment F-Measure (CEAF) for evaluating coreference. Gold and predicted mentions are aligned into clusterings which maximise a metric - in this case, the F measure between gold and predicted clusters. diff --git a/allennlp/training/metrics/covariance.py b/allennlp/training/metrics/covariance.py index c5bf26cabc8..ddb316b1458 100644 --- a/allennlp/training/metrics/covariance.py +++ b/allennlp/training/metrics/covariance.py @@ -48,7 +48,7 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A tensor of the same shape as `predictions`. """ - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) # Flatten predictions, gold_labels, and mask. We calculate the covariance between # the vectors, since each element in the predictions and gold_labels tensor is assumed # to be a separate observation. diff --git a/allennlp/training/metrics/entropy.py b/allennlp/training/metrics/entropy.py index 716c66c4819..72c2d6038f9 100644 --- a/allennlp/training/metrics/entropy.py +++ b/allennlp/training/metrics/entropy.py @@ -26,10 +26,10 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A masking tensor of shape (batch_size, ...). 
""" - logits, mask = self.unwrap_to_tensors(logits, mask) + logits, mask = self.detach_tensors(logits, mask) if mask is None: - mask = torch.ones(logits.size()[:-1]) + mask = torch.ones(logits.size()[:-1], device=logits.device) log_probs = torch.nn.functional.log_softmax(logits, dim=-1) probabilities = torch.exp(log_probs) * mask.unsqueeze(-1) diff --git a/allennlp/training/metrics/fbeta_measure.py b/allennlp/training/metrics/fbeta_measure.py index b47fb57ded4..a87f156ff9d 100644 --- a/allennlp/training/metrics/fbeta_measure.py +++ b/allennlp/training/metrics/fbeta_measure.py @@ -107,7 +107,7 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A masking tensor the same size as `gold_labels`. """ - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) # Calculate true_positive_sum, true_negative_sum, pred_sum, true_sum num_classes = predictions.size(-1) @@ -120,10 +120,10 @@ def __call__( # It means we call this metric at the first time # when `self._true_positive_sum` is None. if self._true_positive_sum is None: - self._true_positive_sum = torch.zeros(num_classes) - self._true_sum = torch.zeros(num_classes) - self._pred_sum = torch.zeros(num_classes) - self._total_sum = torch.zeros(num_classes) + self._true_positive_sum = torch.zeros(num_classes, device=predictions.device) + self._true_sum = torch.zeros(num_classes, device=predictions.device) + self._pred_sum = torch.zeros(num_classes, device=predictions.device) + self._total_sum = torch.zeros(num_classes, device=predictions.device) if mask is None: mask = torch.ones_like(gold_labels) @@ -137,7 +137,7 @@ def __call__( # Watch it: # The total numbers of true positives under all _predicted_ classes are zeros. if true_positives_bins.shape[0] == 0: - true_positive_sum = torch.zeros(num_classes) + true_positive_sum = torch.zeros(num_classes, device=predictions.device) else: true_positive_sum = torch.bincount( true_positives_bins.long(), minlength=num_classes @@ -149,13 +149,13 @@ def __call__( if pred_bins.shape[0] != 0: pred_sum = torch.bincount(pred_bins, minlength=num_classes).float() else: - pred_sum = torch.zeros(num_classes) + pred_sum = torch.zeros(num_classes, device=predictions.device) gold_labels_bins = gold_labels[mask].long() if gold_labels.shape[0] != 0: true_sum = torch.bincount(gold_labels_bins, minlength=num_classes).float() else: - true_sum = torch.zeros(num_classes) + true_sum = torch.zeros(num_classes, device=predictions.device) self._true_positive_sum += true_positive_sum self._pred_sum += pred_sum diff --git a/allennlp/training/metrics/mean_absolute_error.py b/allennlp/training/metrics/mean_absolute_error.py index 2166a4a9304..4d12ec7aad8 100644 --- a/allennlp/training/metrics/mean_absolute_error.py +++ b/allennlp/training/metrics/mean_absolute_error.py @@ -32,7 +32,7 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A tensor of the same shape as `predictions`. """ - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) absolute_errors = torch.abs(predictions - gold_labels) if mask is not None: @@ -48,7 +48,7 @@ def get_metric(self, reset: bool = False): The accumulated mean absolute error. 
""" - mean_absolute_error = float(self._absolute_error) / float(self._total_count) + mean_absolute_error = self._absolute_error / self._total_count if reset: self.reset() return mean_absolute_error diff --git a/allennlp/training/metrics/mention_recall.py b/allennlp/training/metrics/mention_recall.py index 5a08f404d12..01b72c8fcf3 100644 --- a/allennlp/training/metrics/mention_recall.py +++ b/allennlp/training/metrics/mention_recall.py @@ -18,7 +18,7 @@ def __call__( batched_top_spans: torch.Tensor, batched_metadata: List[Dict[str, Any]], ): - for top_spans, metadata in zip(batched_top_spans.data.tolist(), batched_metadata): + for top_spans, metadata in zip(batched_top_spans.tolist(), batched_metadata): gold_mentions: Set[Tuple[int, int]] = { mention for cluster in metadata["clusters"] for mention in cluster @@ -32,7 +32,7 @@ def get_metric(self, reset: bool = False) -> float: if self._num_gold_mentions == 0: recall = 0.0 else: - recall = self._num_recalled_mentions / float(self._num_gold_mentions) + recall = self._num_recalled_mentions / self._num_gold_mentions if reset: self.reset() return recall diff --git a/allennlp/training/metrics/metric.py b/allennlp/training/metrics/metric.py index 15b8f6b0dbf..fb6af0ef937 100644 --- a/allennlp/training/metrics/metric.py +++ b/allennlp/training/metrics/metric.py @@ -1,4 +1,5 @@ -from typing import Dict, Optional, Tuple, Union, List +from typing import Dict, Iterable, List, Optional, Tuple, Union + import torch from allennlp.common.registrable import Registrable @@ -41,11 +42,11 @@ def reset(self) -> None: raise NotImplementedError @staticmethod - def unwrap_to_tensors(*tensors: torch.Tensor): + def detach_tensors(*tensors: torch.Tensor) -> Iterable[torch.Tensor]: """ If you actually passed gradient-tracking Tensors to a Metric, there will be a huge memory leak, because it will prevent garbage collection for the computation - graph. This method ensures that you're using tensors directly and that they are on - the CPU. + graph. This method ensures the tensors are detached. """ - return (x.detach().cpu() if isinstance(x, torch.Tensor) else x for x in tensors) + # Check if it's actually a tensor in case something else was passed. + return (x.detach() if isinstance(x, torch.Tensor) else x for x in tensors) diff --git a/allennlp/training/metrics/pearson_correlation.py b/allennlp/training/metrics/pearson_correlation.py index 9791745064b..846900b7185 100644 --- a/allennlp/training/metrics/pearson_correlation.py +++ b/allennlp/training/metrics/pearson_correlation.py @@ -55,7 +55,7 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A tensor of the same shape as `predictions`. """ - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) self._predictions_labels_covariance(predictions, gold_labels, mask) self._predictions_variance(predictions, predictions, mask) self._labels_variance(gold_labels, gold_labels, mask) diff --git a/allennlp/training/metrics/sequence_accuracy.py b/allennlp/training/metrics/sequence_accuracy.py index 6811dbabebc..983ed2401b9 100644 --- a/allennlp/training/metrics/sequence_accuracy.py +++ b/allennlp/training/metrics/sequence_accuracy.py @@ -34,7 +34,7 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A masking tensor the same size as `gold_labels`. 
""" - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) # Some sanity checks. if gold_labels.dim() != predictions.dim() - 1: @@ -76,7 +76,7 @@ def get_metric(self, reset: bool = False): The accumulated accuracy. """ if self.total_count > 0: - accuracy = float(self.correct_count) / float(self.total_count) + accuracy = self.correct_count / self.total_count else: accuracy = 0 diff --git a/allennlp/training/metrics/span_based_f1_measure.py b/allennlp/training/metrics/span_based_f1_measure.py index 0f1c93b7d5d..01f695240ab 100644 --- a/allennlp/training/metrics/span_based_f1_measure.py +++ b/allennlp/training/metrics/span_based_f1_measure.py @@ -125,7 +125,7 @@ def __call__( if mask is None: mask = torch.ones_like(gold_labels) - predictions, gold_labels, mask, prediction_map = self.unwrap_to_tensors( + predictions, gold_labels, mask, prediction_map = self.detach_tensors( predictions, gold_labels, mask, prediction_map ) @@ -277,9 +277,9 @@ def get_metric(self, reset: bool = False): @staticmethod def _compute_metrics(true_positives: int, false_positives: int, false_negatives: int): - precision = float(true_positives) / float(true_positives + false_positives + 1e-13) - recall = float(true_positives) / float(true_positives + false_negatives + 1e-13) - f1_measure = 2.0 * ((precision * recall) / (precision + recall + 1e-13)) + precision = true_positives / (true_positives + false_positives + 1e-13) + recall = true_positives / (true_positives + false_negatives + 1e-13) + f1_measure = 2.0 * (precision * recall) / (precision + recall + 1e-13) return precision, recall, f1_measure def reset(self): diff --git a/allennlp/training/metrics/spearman_correlation.py b/allennlp/training/metrics/spearman_correlation.py index 5f4f10efd4f..0f7314a0738 100644 --- a/allennlp/training/metrics/spearman_correlation.py +++ b/allennlp/training/metrics/spearman_correlation.py @@ -40,13 +40,16 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A tensor of the same shape as `predictions`. """ - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) - # Flatten predictions, gold_labels, and mask. We calculate the spearman correlation between + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) + # Flatten predictions, gold_labels, and mask. We calculate the Spearman correlation between # the vectors, since each element in the predictions and gold_labels tensor is assumed # to be a separate observation. predictions = predictions.view(-1) gold_labels = gold_labels.view(-1) + self.total_predictions = self.total_predictions.to(predictions.device) + self.total_gold_labels = self.total_gold_labels.to(gold_labels.device) + if mask is not None: mask = mask.view(-1) self.total_predictions = torch.cat((self.total_predictions, predictions * mask), 0) @@ -63,7 +66,7 @@ def get_metric(self, reset: bool = False): The accumulated sample Spearman correlation. 
""" spearman_correlation = stats.spearmanr( - self.total_predictions.numpy(), self.total_gold_labels.numpy() + self.total_predictions.cpu().numpy(), self.total_gold_labels.cpu().numpy() ) if reset: diff --git a/allennlp/training/metrics/srl_eval_scorer.py b/allennlp/training/metrics/srl_eval_scorer.py index f5616efb958..97ee34bc742 100644 --- a/allennlp/training/metrics/srl_eval_scorer.py +++ b/allennlp/training/metrics/srl_eval_scorer.py @@ -171,9 +171,9 @@ def get_metric(self, reset: bool = False): @staticmethod def _compute_metrics(true_positives: int, false_positives: int, false_negatives: int): - precision = float(true_positives) / float(true_positives + false_positives + 1e-13) - recall = float(true_positives) / float(true_positives + false_negatives + 1e-13) - f1_measure = 2.0 * ((precision * recall) / (precision + recall + 1e-13)) + precision = true_positives / (true_positives + false_positives + 1e-13) + recall = true_positives / (true_positives + false_negatives + 1e-13) + f1_measure = 2.0 * (precision * recall) / (precision + recall + 1e-13) return precision, recall, f1_measure def reset(self): diff --git a/allennlp/training/metrics/unigram_recall.py b/allennlp/training/metrics/unigram_recall.py index 5a070c1a0cc..e4e1a6043d6 100644 --- a/allennlp/training/metrics/unigram_recall.py +++ b/allennlp/training/metrics/unigram_recall.py @@ -38,7 +38,7 @@ def __call__( mask : `torch.Tensor`, optional (default = None). A masking tensor the same size as `gold_labels`. """ - predictions, gold_labels, mask = self.unwrap_to_tensors(predictions, gold_labels, mask) + predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) # Some sanity checks. if gold_labels.dim() != predictions.dim() - 1: @@ -71,8 +71,8 @@ def __call__( # word is from cleaned gold which doesn't have 0 or # end_index, so we don't need to explicitly remove those # from beam. - if stillsearch and (word in beam): - retval += 1.0 / float(len(cleaned_gold)) + if stillsearch and word in beam: + retval += 1 / len(cleaned_gold) stillsearch = False correct += retval @@ -85,7 +85,7 @@ def get_metric(self, reset: bool = False): The accumulated recall. 
""" - recall = float(self.correct_count) / float(self.total_count) if self.total_count > 0 else 0 + recall = self.correct_count / self.total_count if self.total_count > 0 else 0 if reset: self.reset() return recall From b7d89c19e699ab526119c7f40cb717241150a254 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Thu, 27 Feb 2020 01:55:11 -0500 Subject: [PATCH 2/7] Make metric tests work on both GPU and CPU --- allennlp/common/testing/__init__.py | 2 +- allennlp/common/testing/test_case.py | 34 +++ .../metrics/attachment_scores_test.py | 29 ++- allennlp/tests/training/metrics/auc_test.py | 57 ++--- allennlp/tests/training/metrics/bleu_test.py | 39 ++-- .../training/metrics/boolean_accuracy_test.py | 47 +++-- .../metrics/categorical_accuracy_test.py | 108 ++++++---- .../metrics/conll_coref_scores_test.py | 11 +- .../tests/training/metrics/covariance_test.py | 62 +++--- .../tests/training/metrics/entropy_test.py | 33 +-- .../tests/training/metrics/f1_measure_test.py | 93 +++++---- .../training/metrics/fbeta_measure_test.py | 196 +++++++++++------- .../metrics/mean_absolute_error_test.py | 21 +- .../metrics/pearson_correlation_test.py | 93 ++++----- .../metrics/sequence_accuracy_test.py | 46 ++-- .../metrics/span_based_f1_measure_test.py | 148 +++++++------ .../metrics/spearman_correlation_test.py | 68 +++--- .../training/metrics/unigram_recall_test.py | 46 ++-- .../training/metrics/spearman_correlation.py | 6 +- 19 files changed, 658 insertions(+), 481 deletions(-) diff --git a/allennlp/common/testing/__init__.py b/allennlp/common/testing/__init__.py index 5a9246987c5..7157fb06218 100644 --- a/allennlp/common/testing/__init__.py +++ b/allennlp/common/testing/__init__.py @@ -1,5 +1,5 @@ """ Utilities and helpers for writing tests. """ -from allennlp.common.testing.test_case import AllenNlpTestCase +from allennlp.common.testing.test_case import AllenNlpTestCase, multi_device from allennlp.common.testing.model_test_case import ModelTestCase diff --git a/allennlp/common/testing/test_case.py b/allennlp/common/testing/test_case.py index 7951641b6f3..56ea99eba7e 100644 --- a/allennlp/common/testing/test_case.py +++ b/allennlp/common/testing/test_case.py @@ -3,8 +3,11 @@ import pathlib import shutil import tempfile +from typing import Any, Iterable from unittest import TestCase +import torch + from allennlp.common.checks import log_pytorch_version_info TEST_DIR = tempfile.mkdtemp(prefix="allennlp_tests") @@ -40,3 +43,34 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.TEST_DIR) + + +def parametrize(arg_names: Iterable[str], arg_values: Iterable[Iterable[Any]]): + """ + Decorator to create parameterized tests. + + # Parameters + + arg_names : `Iterable[str]`, required. + Argument names to pass to the test function. + arg_values : `Iterable[Iterable[Any]]`, required. + Iterable of values to pass to each of the args. + A function call is gonna be made for each inner iterable. + """ + + def decorator(func): + def wrapper(*args, **kwargs): + for arg_value in arg_values: + kwargs_extra = {name: value for name, value in zip(arg_names, arg_value)} + func(*args, **kwargs, **kwargs_extra) + + return wrapper + + return decorator + + +_available_devices = ["cpu"] + (["cuda"] if torch.cuda.is_available() else []) +multi_device = parametrize(("device",), [(device,) for device in _available_devices]) +""" +Decorator that provides an argument `device` of type `str` for each available PyTorch device. 
+""" diff --git a/allennlp/tests/training/metrics/attachment_scores_test.py b/allennlp/tests/training/metrics/attachment_scores_test.py index d80fefa6b44..58d55dca3d0 100644 --- a/allennlp/tests/training/metrics/attachment_scores_test.py +++ b/allennlp/tests/training/metrics/attachment_scores_test.py @@ -1,6 +1,6 @@ import torch -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import AttachmentScores @@ -19,7 +19,17 @@ def setUp(self): self.mask = torch.Tensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 0, 0]]) - def test_perfect_scores(self): + def _send_tensors_to_device(self, device: str): + self.predictions = self.predictions.to(device) + self.gold_indices = self.gold_indices.to(device) + self.label_predictions = self.label_predictions.to(device) + self.gold_labels = self.gold_labels.to(device) + self.mask = self.mask.to(device) + + @multi_device + def test_perfect_scores(self, device: str): + self._send_tensors_to_device(device) + self.scorer( self.predictions, self.label_predictions, self.gold_indices, self.gold_labels, self.mask ) @@ -27,7 +37,10 @@ def test_perfect_scores(self): for value in self.scorer.get_metric().values(): assert value == 1.0 - def test_unlabeled_accuracy_ignores_incorrect_labels(self): + @multi_device + def test_unlabeled_accuracy_ignores_incorrect_labels(self, device: str): + self._send_tensors_to_device(device) + label_predictions = self.label_predictions # Change some stuff so our 4 of our label predictions are wrong. label_predictions[0, 3:] = 3 @@ -47,7 +60,10 @@ def test_unlabeled_accuracy_ignores_incorrect_labels(self): # Neither should have labeled exact match. assert metrics["LEM"] == 0.0 - def test_labeled_accuracy_is_affected_by_incorrect_heads(self): + @multi_device + def test_labeled_accuracy_is_affected_by_incorrect_heads(self, device: str): + self._send_tensors_to_device(device) + predictions = self.predictions # Change some stuff so our 4 of our predictions are wrong. 
predictions[0, 3:] = 3 @@ -71,7 +87,10 @@ def test_labeled_accuracy_is_affected_by_incorrect_heads(self): assert metrics["LEM"] == 0.0 assert metrics["UEM"] == 0.0 - def test_attachment_scores_can_ignore_labels(self): + @multi_device + def test_attachment_scores_can_ignore_labels(self, device: str): + self._send_tensors_to_device(device) + scorer = AttachmentScores(ignore_classes=[1]) label_predictions = self.label_predictions diff --git a/allennlp/tests/training/metrics/auc_test.py b/allennlp/tests/training/metrics/auc_test.py index b129a9e0ee2..dfd33bde6d3 100644 --- a/allennlp/tests/training/metrics/auc_test.py +++ b/allennlp/tests/training/metrics/auc_test.py @@ -1,21 +1,22 @@ +import pytest import torch from sklearn import metrics -from numpy.testing import assert_almost_equal -import pytest +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase -from allennlp.training.metrics import Auc from allennlp.common.checks import ConfigurationError +from allennlp.common.testing import AllenNlpTestCase, multi_device +from allennlp.training.metrics import Auc class AucTest(AllenNlpTestCase): - def test_auc_computation(self): + @multi_device + def test_auc_computation(self, device: str): auc = Auc() all_predictions = [] all_labels = [] for _ in range(5): - predictions = torch.randn(8).float() - labels = torch.randint(0, 2, (8,)).long() + predictions = torch.randn(8, device=device) + labels = torch.randint(0, 2, (8,), dtype=torch.long, device=device) auc(predictions, labels) @@ -25,62 +26,66 @@ def test_auc_computation(self): computed_auc_value = auc.get_metric(reset=True) false_positive_rates, true_positive_rates, _ = metrics.roc_curve( - torch.cat(all_labels, dim=0).numpy(), torch.cat(all_predictions, dim=0).numpy() + torch.cat(all_labels, dim=0).cpu().numpy(), + torch.cat(all_predictions, dim=0).cpu().numpy(), ) real_auc_value = metrics.auc(false_positive_rates, true_positive_rates) - assert_almost_equal(real_auc_value, computed_auc_value) + assert_allclose(real_auc_value, computed_auc_value) # One more computation to assure reset works. - predictions = torch.randn(8).float() - labels = torch.randint(0, 2, (8,)).long() + predictions = torch.randn(8, device=device) + labels = torch.randint(0, 2, (8,), dtype=torch.long, device=device) auc(predictions, labels) computed_auc_value = auc.get_metric(reset=True) false_positive_rates, true_positive_rates, _ = metrics.roc_curve( - labels.numpy(), predictions.numpy() + labels.cpu().numpy(), predictions.cpu().numpy() ) real_auc_value = metrics.auc(false_positive_rates, true_positive_rates) - assert_almost_equal(real_auc_value, computed_auc_value) + assert_allclose(real_auc_value, computed_auc_value) - def test_auc_gold_labels_behaviour(self): + @multi_device + def test_auc_gold_labels_behaviour(self, device: str): # Check that it works with different pos_label auc = Auc(positive_label=4) - predictions = torch.randn(8).float() - labels = torch.randint(3, 5, (8,)).long() + predictions = torch.randn(8, device=device) + labels = torch.randint(3, 5, (8,), dtype=torch.long, device=device) # We make sure that the positive label is always present. 
labels[0] = 4 auc(predictions, labels) computed_auc_value = auc.get_metric(reset=True) false_positive_rates, true_positive_rates, _ = metrics.roc_curve( - labels.numpy(), predictions.numpy(), pos_label=4 + labels.cpu().numpy(), predictions.cpu().numpy(), pos_label=4 ) real_auc_value = metrics.auc(false_positive_rates, true_positive_rates) - assert_almost_equal(real_auc_value, computed_auc_value) + assert_allclose(real_auc_value, computed_auc_value) # Check that it errs on getting more than 2 labels. with pytest.raises(ConfigurationError) as _: - labels = torch.LongTensor([3, 4, 5, 6, 7, 8, 9, 10]) + labels = torch.tensor([3, 4, 5, 6, 7, 8, 9, 10], device=device) auc(predictions, labels) - def test_auc_with_mask(self): + @multi_device + def test_auc_with_mask(self, device: str): auc = Auc() - predictions = torch.randn(8).float() - labels = torch.randint(0, 2, (8,)).long() - mask = torch.ByteTensor([1, 1, 1, 1, 0, 0, 0, 0]) + predictions = torch.randn(8, device=device) + labels = torch.randint(0, 2, (8,), dtype=torch.long, device=device) + mask = torch.tensor([1, 1, 1, 1, 0, 0, 0, 0], dtype=torch.uint8, device=device) auc(predictions, labels, mask) computed_auc_value = auc.get_metric(reset=True) false_positive_rates, true_positive_rates, _ = metrics.roc_curve( - labels[:4].numpy(), predictions[:4].numpy() + labels[:4].cpu().numpy(), predictions[:4].cpu().numpy() ) real_auc_value = metrics.auc(false_positive_rates, true_positive_rates) - assert_almost_equal(real_auc_value, computed_auc_value) + assert_allclose(real_auc_value, computed_auc_value) - def test_auc_works_without_calling_metric_at_all(self): + @multi_device + def test_auc_works_without_calling_metric_at_all(self, device: str): auc = Auc() auc.get_metric() diff --git a/allennlp/tests/training/metrics/bleu_test.py b/allennlp/tests/training/metrics/bleu_test.py index 643c1e9a66a..536db7c3789 100644 --- a/allennlp/tests/training/metrics/bleu_test.py +++ b/allennlp/tests/training/metrics/bleu_test.py @@ -1,10 +1,10 @@ -from collections import Counter import math +from collections import Counter -import numpy as np import torch +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import BLEU @@ -13,15 +13,16 @@ def setUp(self): super().setUp() self.metric = BLEU(ngram_weights=(0.5, 0.5), exclude_indices={0}) - def test_get_valid_tokens_mask(self): - tensor = torch.tensor([[1, 2, 3, 0], [0, 1, 1, 0]]) - result = self.metric._get_valid_tokens_mask(tensor) - result = result.long().numpy() - check = np.array([[1, 1, 1, 0], [0, 1, 1, 0]]) - np.testing.assert_array_equal(result, check) + @multi_device + def test_get_valid_tokens_mask(self, device: str): + tensor = torch.tensor([[1, 2, 3, 0], [0, 1, 1, 0]], device=device) + result = self.metric._get_valid_tokens_mask(tensor).long() + check = torch.tensor([[1, 1, 1, 0], [0, 1, 1, 0]], device=device) + assert_allclose(result, check) - def test_ngrams(self): - tensor = torch.tensor([1, 2, 3, 1, 2, 0]) + @multi_device + def test_ngrams(self, device: str): + tensor = torch.tensor([1, 2, 3, 1, 2, 0], device=device) # Unigrams. 
counts = Counter(self.metric._ngrams(tensor, 1)) @@ -42,14 +43,15 @@ def test_ngrams(self): counts = Counter(self.metric._ngrams(tensor, 7)) assert counts == {} - def test_bleu_computed_correctly(self): + @multi_device + def test_bleu_computed_correctly(self, device: str): self.metric.reset() # shape: (batch_size, max_sequence_length) - predictions = torch.tensor([[1, 0, 0], [1, 1, 0], [1, 1, 1]]) + predictions = torch.tensor([[1, 0, 0], [1, 1, 0], [1, 1, 1]], device=device) # shape: (batch_size, max_gold_sequence_length) - gold_targets = torch.tensor([[2, 0, 0], [1, 0, 0], [1, 1, 2]]) + gold_targets = torch.tensor([[2, 0, 0], [1, 0, 0], [1, 1, 2]], device=device) self.metric(predictions, gold_targets) @@ -57,7 +59,7 @@ def test_bleu_computed_correctly(self): assert self.metric._reference_lengths == 5 # Number of unigrams in predicted sentences that match gold sentences - # (but not more than maximum occurence of gold unigram within batch). + # (but not more than maximum occurrence of gold unigram within batch). assert self.metric._precision_matches[1] == ( 0 + 1 # no matches in first sentence. @@ -68,7 +70,7 @@ def test_bleu_computed_correctly(self): assert self.metric._precision_totals[1] == (1 + 2 + 3) # Number of bigrams in predicted sentences that match gold sentences - # (but not more than maximum occurence of gold bigram within batch). + # (but not more than maximum occurrence of gold bigram within batch). assert self.metric._precision_matches[2] == (0 + 0 + 1) # Total number of predicted bigrams. @@ -79,8 +81,9 @@ def test_bleu_computed_correctly(self): bleu = self.metric.get_metric(reset=True)["BLEU"] check = math.exp(0.5 * (math.log(3) - math.log(6)) + 0.5 * (math.log(1) - math.log(3))) - np.testing.assert_approx_equal(bleu, check) + assert_allclose(bleu, check) - def test_bleu_computed_with_zero_counts(self): + @multi_device + def test_bleu_computed_with_zero_counts(self, device: str): self.metric.reset() assert self.metric.get_metric()["BLEU"] == 0 diff --git a/allennlp/tests/training/metrics/boolean_accuracy_test.py b/allennlp/tests/training/metrics/boolean_accuracy_test.py index bb111edc3fa..bb332b74261 100644 --- a/allennlp/tests/training/metrics/boolean_accuracy_test.py +++ b/allennlp/tests/training/metrics/boolean_accuracy_test.py @@ -1,57 +1,62 @@ import torch import pytest -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import BooleanAccuracy class BooleanAccuracyTest(AllenNlpTestCase): - def test_accuracy_computation(self): + @multi_device + def test_accuracy_computation(self, device: str): accuracy = BooleanAccuracy() - predictions = torch.Tensor([[0, 1], [2, 3], [4, 5], [6, 7]]) - targets = torch.Tensor([[0, 1], [2, 2], [4, 5], [7, 7]]) + predictions = torch.tensor([[0, 1], [2, 3], [4, 5], [6, 7]], device=device) + targets = torch.tensor([[0, 1], [2, 2], [4, 5], [7, 7]], device=device) accuracy(predictions, targets) - assert accuracy.get_metric() == 2.0 / 4 + assert accuracy.get_metric() == 2 / 4 - mask = torch.ones(4, 2) + mask = torch.ones(4, 2, device=device) mask[1, 1] = 0 accuracy(predictions, targets, mask) - assert accuracy.get_metric() == 5.0 / 8 + assert accuracy.get_metric() == 5 / 8 targets[1, 1] = 3 accuracy(predictions, targets) - assert accuracy.get_metric() == 8.0 / 12 + assert accuracy.get_metric() == 8 / 12 accuracy.reset() accuracy(predictions, targets) - assert accuracy.get_metric() == 3.0 / 4 + assert accuracy.get_metric() == 3 / 4 - def 
test_skips_completely_masked_instances(self): + @multi_device + def test_skips_completely_masked_instances(self, device: str): accuracy = BooleanAccuracy() - predictions = torch.Tensor([[0, 1], [2, 3], [4, 5], [6, 7]]) - targets = torch.Tensor([[0, 1], [2, 2], [4, 5], [7, 7]]) + predictions = torch.tensor([[0, 1], [2, 3], [4, 5], [6, 7]], device=device) + targets = torch.tensor([[0, 1], [2, 2], [4, 5], [7, 7]], device=device) - mask = torch.Tensor([[0, 0], [1, 0], [1, 1], [1, 1]]) + mask = torch.tensor([[0, 0], [1, 0], [1, 1], [1, 1]], device=device) accuracy(predictions, targets, mask) # First example should be skipped, second is correct with mask, third is correct, fourth is wrong. assert accuracy.get_metric() == 2 / 3 - def test_incorrect_gold_labels_shape_catches_exceptions(self): + @multi_device + def test_incorrect_gold_labels_shape_catches_exceptions(self, device: str): accuracy = BooleanAccuracy() - predictions = torch.rand([5, 7]) - incorrect_shape_labels = torch.rand([5, 8]) + predictions = torch.rand([5, 7], device=device) + incorrect_shape_labels = torch.rand([5, 8], device=device) with pytest.raises(ValueError): accuracy(predictions, incorrect_shape_labels) - def test_incorrect_mask_shape_catches_exceptions(self): + @multi_device + def test_incorrect_mask_shape_catches_exceptions(self, device: str): accuracy = BooleanAccuracy() - predictions = torch.rand([5, 7]) - labels = torch.rand([5, 7]) - incorrect_shape_mask = torch.randint(0, 2, [5, 8]) + predictions = torch.rand([5, 7], device=device) + labels = torch.rand([5, 7], device=device) + incorrect_shape_mask = torch.randint(0, 2, [5, 8], device=device) with pytest.raises(ValueError): accuracy(predictions, labels, incorrect_shape_mask) - def test_does_not_divide_by_zero_with_no_count(self): + @multi_device + def test_does_not_divide_by_zero_with_no_count(self, device: str): accuracy = BooleanAccuracy() self.assertAlmostEqual(accuracy.get_metric(), 0.0) diff --git a/allennlp/tests/training/metrics/categorical_accuracy_test.py b/allennlp/tests/training/metrics/categorical_accuracy_test.py index 0020fb95d52..87252f390cd 100644 --- a/allennlp/tests/training/metrics/categorical_accuracy_test.py +++ b/allennlp/tests/training/metrics/categorical_accuracy_test.py @@ -1,97 +1,113 @@ -import torch import pytest -import numpy +import torch +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase from allennlp.common.checks import ConfigurationError +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import CategoricalAccuracy class CategoricalAccuracyTest(AllenNlpTestCase): - def test_categorical_accuracy(self): + @multi_device + def test_categorical_accuracy(self, device: str): accuracy = CategoricalAccuracy() - predictions = torch.Tensor([[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0]]) - targets = torch.Tensor([0, 3]) + predictions = torch.tensor( + [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0]], device=device + ) + targets = torch.tensor([0, 3], device=device) accuracy(predictions, targets) actual_accuracy = accuracy.get_metric() assert actual_accuracy == 0.50 - def test_top_k_categorical_accuracy(self): + @multi_device + def test_top_k_categorical_accuracy(self, device: str): accuracy = CategoricalAccuracy(top_k=2) - predictions = torch.Tensor([[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0]]) - targets = torch.Tensor([0, 3]) + predictions = torch.tensor( + [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0]], 
device=device + ) + targets = torch.tensor([0, 3], device=device) accuracy(predictions, targets) actual_accuracy = accuracy.get_metric() assert actual_accuracy == 1.0 - def test_top_k_categorical_accuracy_accumulates_and_resets_correctly(self): + @multi_device + def test_top_k_categorical_accuracy_accumulates_and_resets_correctly(self, device: str): accuracy = CategoricalAccuracy(top_k=2) - predictions = torch.Tensor([[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0]]) - targets = torch.Tensor([0, 3]) + predictions = torch.tensor( + [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0]], device=device + ) + targets = torch.tensor([0, 3], device=device) accuracy(predictions, targets) accuracy(predictions, targets) - accuracy(predictions, torch.Tensor([4, 4])) - accuracy(predictions, torch.Tensor([4, 4])) + accuracy(predictions, torch.tensor([4, 4], device=device)) + accuracy(predictions, torch.tensor([4, 4], device=device)) actual_accuracy = accuracy.get_metric(reset=True) assert actual_accuracy == 0.50 assert accuracy.correct_count == 0.0 assert accuracy.total_count == 0.0 - def test_top_k_categorical_accuracy_respects_mask(self): + @multi_device + def test_top_k_categorical_accuracy_respects_mask(self, device: str): accuracy = CategoricalAccuracy(top_k=2) - predictions = torch.Tensor( - [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], [0.1, 0.2, 0.5, 0.2, 0.0]] + predictions = torch.tensor( + [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], [0.1, 0.2, 0.5, 0.2, 0.0]], + device=device, ) - targets = torch.Tensor([0, 3, 0]) - mask = torch.Tensor([0, 1, 1]) + targets = torch.tensor([0, 3, 0], device=device) + mask = torch.tensor([0, 1, 1], device=device) accuracy(predictions, targets, mask) actual_accuracy = accuracy.get_metric() - assert actual_accuracy == 0.50 + assert_allclose(actual_accuracy, 0.50) - def test_top_k_categorical_accuracy_works_for_sequences(self): + @multi_device + def test_top_k_categorical_accuracy_works_for_sequences(self, device: str): accuracy = CategoricalAccuracy(top_k=2) - predictions = torch.Tensor( + predictions = torch.tensor( [ [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], [0.1, 0.6, 0.1, 0.2, 0.0]], [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], [0.1, 0.6, 0.1, 0.2, 0.0]], - ] + ], + device=device, ) - targets = torch.Tensor([[0, 3, 4], [0, 1, 4]]) + targets = torch.tensor([[0, 3, 4], [0, 1, 4]], device=device) accuracy(predictions, targets) actual_accuracy = accuracy.get_metric(reset=True) - numpy.testing.assert_almost_equal(actual_accuracy, 0.6666666) + assert_allclose(actual_accuracy, 0.6666666) # Test the same thing but with a mask: - mask = torch.Tensor([[0, 1, 1], [1, 0, 1]]) + mask = torch.tensor([[0, 1, 1], [1, 0, 1]], device=device) accuracy(predictions, targets, mask) actual_accuracy = accuracy.get_metric(reset=True) - numpy.testing.assert_almost_equal(actual_accuracy, 0.50) + assert_allclose(actual_accuracy, 0.50) - def test_top_k_categorical_accuracy_catches_exceptions(self): + @multi_device + def test_top_k_categorical_accuracy_catches_exceptions(self, device: str): accuracy = CategoricalAccuracy() - predictions = torch.rand([5, 7]) - out_of_range_labels = torch.Tensor([10, 3, 4, 0, 1]) + predictions = torch.rand([5, 7], device=device) + out_of_range_labels = torch.tensor([10, 3, 4, 0, 1], device=device) with pytest.raises(ConfigurationError): accuracy(predictions, out_of_range_labels) - def test_tie_break_categorical_accuracy(self): + @multi_device + def test_tie_break_categorical_accuracy(self, device: 
str): accuracy = CategoricalAccuracy(tie_break=True) - predictions = torch.Tensor( - [[0.35, 0.25, 0.35, 0.35, 0.35], [0.1, 0.6, 0.1, 0.2, 0.2], [0.1, 0.0, 0.1, 0.2, 0.2]] + predictions = torch.tensor( + [[0.35, 0.25, 0.35, 0.35, 0.35], [0.1, 0.6, 0.1, 0.2, 0.2], [0.1, 0.0, 0.1, 0.2, 0.2]], + device=device, ) # Test without mask: - targets = torch.Tensor([2, 1, 4]) + targets = torch.tensor([2, 1, 4], device=device) accuracy(predictions, targets) assert accuracy.get_metric(reset=True) == (0.25 + 1 + 0.5) / 3.0 # # # Test with mask - mask = torch.Tensor([1, 0, 1]) - targets = torch.Tensor([2, 1, 4]) + mask = torch.tensor([1, 0, 1], device=device) + targets = torch.tensor([2, 1, 4], device=device) accuracy(predictions, targets, mask) assert accuracy.get_metric(reset=True) == (0.25 + 0.5) / 2.0 # # Test tie-break with sequence - predictions = torch.Tensor( + predictions = torch.tensor( [ [ [0.35, 0.25, 0.35, 0.35, 0.35], @@ -103,21 +119,27 @@ def test_tie_break_categorical_accuracy(self): [0.1, 0.6, 0.1, 0.2, 0.2], [0.1, 0.0, 0.1, 0.2, 0.2], ], - ] + ], + device=device, + ) + targets = torch.tensor( + [[0, 1, 3], [0, 3, 4]], device=device # 0.25 + 1 + 0.5 # 0.25 + 0 + 0.5 = 2.5 ) - targets = torch.Tensor([[0, 1, 3], [0, 3, 4]]) # 0.25 + 1 + 0.5 # 0.25 + 0 + 0.5 = 2.5 accuracy(predictions, targets) actual_accuracy = accuracy.get_metric(reset=True) - numpy.testing.assert_almost_equal(actual_accuracy, 2.5 / 6.0) + assert_allclose(actual_accuracy, 2.5 / 6.0) - def test_top_k_and_tie_break_together_catches_exceptions(self): + @multi_device + def test_top_k_and_tie_break_together_catches_exceptions(self, device: str): with pytest.raises(ConfigurationError): CategoricalAccuracy(top_k=2, tie_break=True) - def test_incorrect_top_k_catches_exceptions(self): + @multi_device + def test_incorrect_top_k_catches_exceptions(self, device: str): with pytest.raises(ConfigurationError): CategoricalAccuracy(top_k=0) - def test_does_not_divide_by_zero_with_no_count(self): + @multi_device + def test_does_not_divide_by_zero_with_no_count(self, device: str): accuracy = CategoricalAccuracy() self.assertAlmostEqual(accuracy.get_metric(), 0.0) diff --git a/allennlp/tests/training/metrics/conll_coref_scores_test.py b/allennlp/tests/training/metrics/conll_coref_scores_test.py index c426d541e89..f285611953c 100644 --- a/allennlp/tests/training/metrics/conll_coref_scores_test.py +++ b/allennlp/tests/training/metrics/conll_coref_scores_test.py @@ -1,14 +1,15 @@ import torch -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import ConllCorefScores class ConllCorefScoresTest(AllenNlpTestCase): - def test_get_predicted_clusters(self): - top_spans = torch.Tensor([[0, 1], [4, 6], [8, 9]]).long() - antecedent_indices = torch.Tensor([[-1, -1, -1], [0, -1, -1], [0, 1, -1]]).long() - predicted_antecedents = torch.Tensor([-1, -1, 1]).long() + @multi_device + def test_get_predicted_clusters(self, device: str): + top_spans = torch.tensor([[0, 1], [4, 6], [8, 9]], device=device) + antecedent_indices = torch.tensor([[-1, -1, -1], [0, -1, -1], [0, 1, -1]], device=device) + predicted_antecedents = torch.tensor([-1, -1, 1], device=device) clusters, mention_to_cluster = ConllCorefScores.get_predicted_clusters( top_spans, antecedent_indices, predicted_antecedents ) diff --git a/allennlp/tests/training/metrics/covariance_test.py b/allennlp/tests/training/metrics/covariance_test.py index 7c311caa8a7..c1690fb2d74 100644 --- 
a/allennlp/tests/training/metrics/covariance_test.py +++ b/allennlp/tests/training/metrics/covariance_test.py @@ -1,73 +1,75 @@ -import torch import numpy as np -from numpy.testing import assert_allclose +import torch +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import Covariance class CovarianceTest(AllenNlpTestCase): - def test_covariance_unmasked_computation(self): + @multi_device + def test_covariance_unmasked_computation(self, device: str): covariance = Covariance() batch_size = 100 num_labels = 10 - predictions = np.random.randn(batch_size, num_labels).astype("float32") - labels = 0.5 * predictions + np.random.randn(batch_size, num_labels).astype("float32") + predictions = torch.randn(batch_size, num_labels, device=device) + labels = 0.5 * predictions + torch.randn(batch_size, num_labels, device=device) stride = 10 for i in range(batch_size // stride): - timestep_predictions = torch.FloatTensor(predictions[stride * i : stride * (i + 1), :]) - timestep_labels = torch.FloatTensor(labels[stride * i : stride * (i + 1), :]) + timestep_predictions = predictions[stride * i : stride * (i + 1), :] + timestep_labels = labels[stride * i : stride * (i + 1), :] # Flatten the predictions and labels thus far, so numpy treats them as # independent observations. expected_covariance = np.cov( - predictions[: stride * (i + 1), :].reshape(-1), - labels[: stride * (i + 1), :].reshape(-1), + predictions[: stride * (i + 1), :].view(-1).cpu().numpy(), + labels[: stride * (i + 1), :].view(-1).cpu().numpy(), )[0, 1] covariance(timestep_predictions, timestep_labels) - assert_allclose(expected_covariance, covariance.get_metric(), rtol=1e-5) + assert_allclose(expected_covariance, covariance.get_metric()) # Test reset covariance.reset() - covariance(torch.FloatTensor(predictions), torch.FloatTensor(labels)) + covariance(predictions, labels) assert_allclose( - np.cov(predictions.reshape(-1), labels.reshape(-1))[0, 1], + np.cov(predictions.view(-1).cpu().numpy(), labels.view(-1).cpu().numpy())[0, 1], covariance.get_metric(), - rtol=1e-5, ) - def test_covariance_masked_computation(self): + @multi_device + def test_covariance_masked_computation(self, device: str): covariance = Covariance() batch_size = 100 num_labels = 10 - predictions = np.random.randn(batch_size, num_labels).astype("float32") - labels = 0.5 * predictions + np.random.randn(batch_size, num_labels).astype("float32") + predictions = torch.randn(batch_size, num_labels, device=device) + labels = 0.5 * predictions + torch.randn(batch_size, num_labels, device=device) # Random binary mask - mask = np.random.randint(0, 2, size=(batch_size, num_labels)).astype("float32") + mask = torch.randint(0, 2, size=(batch_size, num_labels), device=device) stride = 10 for i in range(batch_size // stride): - timestep_predictions = torch.FloatTensor(predictions[stride * i : stride * (i + 1), :]) - timestep_labels = torch.FloatTensor(labels[stride * i : stride * (i + 1), :]) - timestep_mask = torch.FloatTensor(mask[stride * i : stride * (i + 1), :]) + timestep_predictions = predictions[stride * i : stride * (i + 1), :] + timestep_labels = labels[stride * i : stride * (i + 1), :] + timestep_mask = mask[stride * i : stride * (i + 1), :] # Flatten the predictions, labels, and mask thus far, so numpy treats them as # independent observations. 
expected_covariance = np.cov( - predictions[: stride * (i + 1), :].reshape(-1), - labels[: stride * (i + 1), :].reshape(-1), - fweights=mask[: stride * (i + 1), :].reshape(-1), + predictions[: stride * (i + 1), :].view(-1).cpu().numpy(), + labels[: stride * (i + 1), :].view(-1).cpu().numpy(), + fweights=mask[: stride * (i + 1), :].view(-1).cpu().numpy(), )[0, 1] covariance(timestep_predictions, timestep_labels, timestep_mask) - assert_allclose(expected_covariance, covariance.get_metric(), rtol=1e-5) + assert_allclose(expected_covariance, covariance.get_metric()) # Test reset covariance.reset() - covariance( - torch.FloatTensor(predictions), torch.FloatTensor(labels), torch.FloatTensor(mask) - ) + covariance(predictions, labels, mask) assert_allclose( - np.cov(predictions.reshape(-1), labels.reshape(-1), fweights=mask.reshape(-1))[0, 1], + np.cov( + predictions.view(-1).cpu().numpy(), + labels.view(-1).cpu().numpy(), + fweights=mask.view(-1).cpu().numpy(), + )[0, 1], covariance.get_metric(), - rtol=1e-5, ) diff --git a/allennlp/tests/training/metrics/entropy_test.py b/allennlp/tests/training/metrics/entropy_test.py index 398a4ff51cb..c93749b45fd 100644 --- a/allennlp/tests/training/metrics/entropy_test.py +++ b/allennlp/tests/training/metrics/entropy_test.py @@ -1,35 +1,44 @@ import torch -import numpy +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import Entropy class EntropyTest(AllenNlpTestCase): - def test_low_entropy_distribution(self): + @multi_device + def test_low_entropy_distribution(self, device: str): metric = Entropy() - logits = torch.Tensor([[10000, -10000, -10000, -1000], [10000, -10000, -10000, -1000]]) + logits = torch.tensor( + [[10000, -10000, -10000, -1000], [10000, -10000, -10000, -1000]], + dtype=torch.float, + device=device, + ) metric(logits) assert metric.get_metric() == 0.0 - def test_entropy_for_uniform_distribution(self): + @multi_device + def test_entropy_for_uniform_distribution(self, device: str): metric = Entropy() - logits = torch.Tensor([[1, 1, 1, 1], [1, 1, 1, 1]]) + logits = torch.tensor([[1, 1, 1, 1], [1, 1, 1, 1]], dtype=torch.float, device=device) metric(logits) - numpy.testing.assert_almost_equal(metric.get_metric().cpu(), 1.38629436) + assert_allclose(metric.get_metric(), torch.tensor(1.38629436, device=device)) # actual values shouldn't effect uniform distribution: - logits = torch.Tensor([[2, 2, 2, 2], [2, 2, 2, 2]]) + logits = torch.tensor([[2, 2, 2, 2], [2, 2, 2, 2]], dtype=torch.float, device=device) metric(logits) - numpy.testing.assert_almost_equal(metric.get_metric().cpu(), 1.38629436) + assert_allclose(metric.get_metric(), torch.tensor(1.38629436, device=device)) metric.reset() assert metric._entropy == 0.0 assert metric._count == 0.0 - def test_masked_case(self): + @multi_device + def test_masked_case(self, device: str): metric = Entropy() # This would have non-zero entropy without the mask. 
- logits = torch.Tensor([[1, 1, 1, 1], [10000, -10000, -10000, -1000]]) - mask = torch.Tensor([0, 1]) + logits = torch.tensor( + [[1, 1, 1, 1], [10000, -10000, -10000, -1000]], dtype=torch.float, device=device + ) + mask = torch.tensor([0, 1], device=device) metric(logits, mask) assert metric.get_metric() == 0.0 diff --git a/allennlp/tests/training/metrics/f1_measure_test.py b/allennlp/tests/training/metrics/f1_measure_test.py index f150083c984..ac8713673bf 100644 --- a/allennlp/tests/training/metrics/f1_measure_test.py +++ b/allennlp/tests/training/metrics/f1_measure_test.py @@ -1,23 +1,25 @@ -import torch import pytest -import numpy +import torch +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase from allennlp.common.checks import ConfigurationError +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import F1Measure class F1MeasureTest(AllenNlpTestCase): - def test_f1_measure_catches_exceptions(self): + @multi_device + def test_f1_measure_catches_exceptions(self, device: str): f1_measure = F1Measure(0) - predictions = torch.rand([5, 7]) - out_of_range_labels = torch.Tensor([10, 3, 4, 0, 1]) + predictions = torch.rand([5, 7], device=device) + out_of_range_labels = torch.tensor([10, 3, 4, 0, 1], device=device) with pytest.raises(ConfigurationError): f1_measure(predictions, out_of_range_labels) - def test_f1_measure(self): + @multi_device + def test_f1_measure(self, device: str): f1_measure = F1Measure(positive_label=0) - predictions = torch.Tensor( + predictions = torch.tensor( [ [0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], @@ -25,11 +27,12 @@ def test_f1_measure(self): [0.1, 0.5, 0.1, 0.2, 0.0], [0.1, 0.2, 0.1, 0.7, 0.0], [0.1, 0.6, 0.1, 0.2, 0.0], - ] + ], + device=device, ) # [True Positive, True Negative, True Negative, # False Negative, True Negative, False Negative] - targets = torch.Tensor([0, 4, 1, 0, 3, 0]) + targets = torch.tensor([0, 4, 1, 0, 3, 0], device=device) f1_measure(predictions, targets) precision, recall, f1 = f1_measure.get_metric() assert f1_measure._true_positives == 1.0 @@ -38,16 +41,16 @@ def test_f1_measure(self): assert f1_measure._false_negatives == 2.0 f1_measure.reset() # check value - numpy.testing.assert_almost_equal(precision, 1.0) - numpy.testing.assert_almost_equal(recall, 0.333333333) - numpy.testing.assert_almost_equal(f1, 0.499999999) + assert_allclose(precision, 1.0) + assert_allclose(recall, 0.333333333) + assert_allclose(f1, 0.499999999) # check type assert isinstance(precision, float) assert isinstance(recall, float) assert isinstance(f1, float) # Test the same thing with a mask: - mask = torch.Tensor([1, 0, 1, 1, 1, 0]) + mask = torch.tensor([1, 0, 1, 1, 1, 0], device=device) f1_measure(predictions, targets, mask) precision, recall, f1 = f1_measure.get_metric() assert f1_measure._true_positives == 1.0 @@ -55,13 +58,14 @@ def test_f1_measure(self): assert f1_measure._false_positives == 0.0 assert f1_measure._false_negatives == 1.0 f1_measure.reset() - numpy.testing.assert_almost_equal(precision, 1.0) - numpy.testing.assert_almost_equal(recall, 0.5) - numpy.testing.assert_almost_equal(f1, 0.6666666666) + assert_allclose(precision, 1.0) + assert_allclose(recall, 0.5) + assert_allclose(f1, 0.6666666666) - def test_f1_measure_other_positive_label(self): + @multi_device + def test_f1_measure_other_positive_label(self, device: str): f1_measure = F1Measure(positive_label=1) - predictions = torch.Tensor( + predictions = torch.tensor( [ [0.35, 0.25, 
0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], @@ -69,11 +73,12 @@ def test_f1_measure_other_positive_label(self): [0.1, 0.5, 0.1, 0.2, 0.0], [0.1, 0.2, 0.1, 0.7, 0.0], [0.1, 0.6, 0.1, 0.2, 0.0], - ] + ], + device=device, ) # [True Negative, False Positive, True Positive, # False Positive, True Negative, False Positive] - targets = torch.Tensor([0, 4, 1, 0, 3, 0]) + targets = torch.tensor([0, 4, 1, 0, 3, 0], device=device) f1_measure(predictions, targets) precision, recall, f1 = f1_measure.get_metric() assert f1_measure._true_positives == 1.0 @@ -82,17 +87,18 @@ def test_f1_measure_other_positive_label(self): assert f1_measure._false_negatives == 0.0 f1_measure.reset() # check value - numpy.testing.assert_almost_equal(precision, 0.25) - numpy.testing.assert_almost_equal(recall, 1.0) - numpy.testing.assert_almost_equal(f1, 0.4) + assert_allclose(precision, 0.25) + assert_allclose(recall, 1.0) + assert_allclose(f1, 0.4) # check type assert isinstance(precision, float) assert isinstance(recall, float) assert isinstance(f1, float) - def test_f1_measure_accumulates_and_resets_correctly(self): + @multi_device + def test_f1_measure_accumulates_and_resets_correctly(self, device: str): f1_measure = F1Measure(positive_label=0) - predictions = torch.Tensor( + predictions = torch.tensor( [ [0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], @@ -100,11 +106,12 @@ def test_f1_measure_accumulates_and_resets_correctly(self): [0.1, 0.5, 0.1, 0.2, 0.0], [0.1, 0.2, 0.1, 0.7, 0.0], [0.1, 0.6, 0.1, 0.2, 0.0], - ] + ], + device=device, ) # [True Positive, True Negative, True Negative, # False Negative, True Negative, False Negative] - targets = torch.Tensor([0, 4, 1, 0, 3, 0]) + targets = torch.tensor([0, 4, 1, 0, 3, 0], device=device) f1_measure(predictions, targets) f1_measure(predictions, targets) precision, recall, f1 = f1_measure.get_metric() @@ -113,25 +120,27 @@ def test_f1_measure_accumulates_and_resets_correctly(self): assert f1_measure._false_positives == 0.0 assert f1_measure._false_negatives == 4.0 f1_measure.reset() - numpy.testing.assert_almost_equal(precision, 1.0) - numpy.testing.assert_almost_equal(recall, 0.333333333) - numpy.testing.assert_almost_equal(f1, 0.499999999) + assert_allclose(precision, 1.0) + assert_allclose(recall, 0.333333333) + assert_allclose(f1, 0.499999999) assert f1_measure._true_positives == 0.0 assert f1_measure._true_negatives == 0.0 assert f1_measure._false_positives == 0.0 assert f1_measure._false_negatives == 0.0 - def test_f1_measure_works_for_sequences(self): + @multi_device + def test_f1_measure_works_for_sequences(self, device: str): f1_measure = F1Measure(positive_label=0) - predictions = torch.Tensor( + predictions = torch.tensor( [ [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], [0.1, 0.6, 0.1, 0.2, 0.0]], [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], [0.1, 0.6, 0.1, 0.2, 0.0]], - ] + ], + device=device, ) # [[True Positive, True Negative, True Negative], # [True Positive, True Negative, False Negative]] - targets = torch.Tensor([[0, 3, 4], [0, 1, 0]]) + targets = torch.tensor([[0, 3, 4], [0, 1, 0]], device=device) f1_measure(predictions, targets) precision, recall, f1 = f1_measure.get_metric() assert f1_measure._true_positives == 2.0 @@ -139,18 +148,18 @@ def test_f1_measure_works_for_sequences(self): assert f1_measure._false_positives == 0.0 assert f1_measure._false_negatives == 1.0 f1_measure.reset() - numpy.testing.assert_almost_equal(precision, 1.0) - numpy.testing.assert_almost_equal(recall, 0.666666666) - 
numpy.testing.assert_almost_equal(f1, 0.8) + assert_allclose(precision, 1.0) + assert_allclose(recall, 0.666666666) + assert_allclose(f1, 0.8) # Test the same thing with a mask: - mask = torch.Tensor([[0, 1, 0], [1, 1, 1]]) + mask = torch.tensor([[0, 1, 0], [1, 1, 1]], device=device) f1_measure(predictions, targets, mask) precision, recall, f1 = f1_measure.get_metric() assert f1_measure._true_positives == 1.0 assert f1_measure._true_negatives == 2.0 assert f1_measure._false_positives == 0.0 assert f1_measure._false_negatives == 1.0 - numpy.testing.assert_almost_equal(precision, 1.0) - numpy.testing.assert_almost_equal(recall, 0.5) - numpy.testing.assert_almost_equal(f1, 0.66666666666) + assert_allclose(precision, 1.0) + assert_allclose(recall, 0.5) + assert_allclose(f1, 0.66666666666) diff --git a/allennlp/tests/training/metrics/fbeta_measure_test.py b/allennlp/tests/training/metrics/fbeta_measure_test.py index ce269f1d845..9b0c1240fce 100644 --- a/allennlp/tests/training/metrics/fbeta_measure_test.py +++ b/allennlp/tests/training/metrics/fbeta_measure_test.py @@ -1,11 +1,11 @@ from typing import List -import numpy import torch from sklearn.metrics import precision_recall_fscore_support +from torch.testing import assert_allclose from allennlp.common.checks import ConfigurationError -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import FBetaMeasure @@ -13,7 +13,7 @@ class FBetaMeasureTest(AllenNlpTestCase): def setUp(self): super().setUp() # [0, 1, 1, 1, 3, 1] - self.predictions = torch.Tensor( + self.predictions = torch.tensor( [ [0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], @@ -23,7 +23,7 @@ def setUp(self): [0.1, 0.6, 0.1, 0.2, 0.0], ] ) - self.targets = torch.Tensor([0, 4, 1, 0, 3, 0]) + self.targets = torch.tensor([0, 4, 1, 0, 3, 0]) # detailed target state self.pred_sum = [1, 4, 0, 1, 0] @@ -33,7 +33,7 @@ def setUp(self): self.total_sum = [6, 6, 6, 6, 6] desired_precisions = [1.00, 0.25, 0.00, 1.00, 0.00] - desired_recalls = [0.33, 1.00, 0.00, 1.00, 0.00] + desired_recalls = [1 / 3, 1.00, 0.00, 1.00, 0.00] desired_fscores = [ (2 * p * r) / (p + r) if p + r != 0.0 else 0.0 for p, r in zip(desired_precisions, desired_recalls) @@ -42,7 +42,8 @@ def setUp(self): self.desired_recalls = desired_recalls self.desired_fscores = desired_fscores - def test_config_errors(self): + @multi_device + def test_config_errors(self, device: str): # Bad beta self.assertRaises(ConfigurationError, FBetaMeasure, beta=0.0) @@ -52,23 +53,32 @@ def test_config_errors(self): # Empty input labels self.assertRaises(ConfigurationError, FBetaMeasure, labels=[]) - def test_runtime_errors(self): + @multi_device + def test_runtime_errors(self, device: str): fbeta = FBetaMeasure() # Metric was never called. 
self.assertRaises(RuntimeError, fbeta.get_metric) - def test_fbeta_multiclass_state(self): + @multi_device + def test_fbeta_multiclass_state(self, device: str): + self.predictions = self.predictions.to(device) + self.targets = self.targets.to(device) + fbeta = FBetaMeasure() fbeta(self.predictions, self.targets) # check state - numpy.testing.assert_almost_equal(fbeta._pred_sum.tolist(), self.pred_sum) - numpy.testing.assert_almost_equal(fbeta._true_sum.tolist(), self.true_sum) - numpy.testing.assert_almost_equal(fbeta._true_positive_sum.tolist(), self.true_positive_sum) - numpy.testing.assert_almost_equal(fbeta._true_negative_sum.tolist(), self.true_negative_sum) - numpy.testing.assert_almost_equal(fbeta._total_sum.tolist(), self.total_sum) + assert_allclose(fbeta._pred_sum.tolist(), self.pred_sum) + assert_allclose(fbeta._true_sum.tolist(), self.true_sum) + assert_allclose(fbeta._true_positive_sum.tolist(), self.true_positive_sum) + assert_allclose(fbeta._true_negative_sum.tolist(), self.true_negative_sum) + assert_allclose(fbeta._total_sum.tolist(), self.total_sum) + + @multi_device + def test_fbeta_multiclass_metric(self, device: str): + self.predictions = self.predictions.to(device) + self.targets = self.targets.to(device) - def test_fbeta_multiclass_metric(self): fbeta = FBetaMeasure() fbeta(self.predictions, self.targets) metric = fbeta.get_metric() @@ -77,17 +87,21 @@ def test_fbeta_multiclass_metric(self): fscores = metric["fscore"] # check value - numpy.testing.assert_almost_equal(precisions, self.desired_precisions, decimal=2) - numpy.testing.assert_almost_equal(recalls, self.desired_recalls, decimal=2) - numpy.testing.assert_almost_equal(fscores, self.desired_fscores, decimal=2) + assert_allclose(precisions, self.desired_precisions) + assert_allclose(recalls, self.desired_recalls) + assert_allclose(fscores, self.desired_fscores) # check type assert isinstance(precisions, List) assert isinstance(recalls, List) assert isinstance(fscores, List) - def test_fbeta_multiclass_with_mask(self): - mask = torch.Tensor([1, 1, 1, 1, 1, 0]) + @multi_device + def test_fbeta_multiclass_with_mask(self, device: str): + self.predictions = self.predictions.to(device) + self.targets = self.targets.to(device) + + mask = torch.tensor([1, 1, 1, 1, 1, 0], device=device) fbeta = FBetaMeasure() fbeta(self.predictions, self.targets, mask) @@ -96,21 +110,25 @@ def test_fbeta_multiclass_with_mask(self): recalls = metric["recall"] fscores = metric["fscore"] - numpy.testing.assert_almost_equal(fbeta._pred_sum.tolist(), [1, 3, 0, 1, 0]) - numpy.testing.assert_almost_equal(fbeta._true_sum.tolist(), [2, 1, 0, 1, 1]) - numpy.testing.assert_almost_equal(fbeta._true_positive_sum.tolist(), [1, 1, 0, 1, 0]) + assert_allclose(fbeta._pred_sum.tolist(), [1, 3, 0, 1, 0]) + assert_allclose(fbeta._true_sum.tolist(), [2, 1, 0, 1, 1]) + assert_allclose(fbeta._true_positive_sum.tolist(), [1, 1, 0, 1, 0]) - desired_precisions = [1.00, 0.33, 0.00, 1.00, 0.00] + desired_precisions = [1.00, 1 / 3, 0.00, 1.00, 0.00] desired_recalls = [0.50, 1.00, 0.00, 1.00, 0.00] desired_fscores = [ (2 * p * r) / (p + r) if p + r != 0.0 else 0.0 for p, r in zip(desired_precisions, desired_recalls) ] - numpy.testing.assert_almost_equal(precisions, desired_precisions, decimal=2) - numpy.testing.assert_almost_equal(recalls, desired_recalls, decimal=2) - numpy.testing.assert_almost_equal(fscores, desired_fscores, decimal=2) + assert_allclose(precisions, desired_precisions) + assert_allclose(recalls, desired_recalls) + assert_allclose(fscores, 
desired_fscores) + + @multi_device + def test_fbeta_multiclass_macro_average_metric(self, device: str): + self.predictions = self.predictions.to(device) + self.targets = self.targets.to(device) - def test_fbeta_multiclass_macro_average_metric(self): fbeta = FBetaMeasure(average="macro") fbeta(self.predictions, self.targets) metric = fbeta.get_metric() @@ -118,20 +136,25 @@ def test_fbeta_multiclass_macro_average_metric(self): recalls = metric["recall"] fscores = metric["fscore"] - macro_precision = numpy.mean(self.desired_precisions) - macro_recall = numpy.mean(self.desired_recalls) - macro_fscore = numpy.mean(self.desired_fscores) + # We keep the expected values in CPU because FBetaMeasure returns them in CPU. + macro_precision = torch.tensor(self.desired_precisions).mean() + macro_recall = torch.tensor(self.desired_recalls).mean() + macro_fscore = torch.tensor(self.desired_fscores).mean() # check value - numpy.testing.assert_almost_equal(precisions, macro_precision, decimal=2) - numpy.testing.assert_almost_equal(recalls, macro_recall, decimal=2) - numpy.testing.assert_almost_equal(fscores, macro_fscore, decimal=2) + assert_allclose(precisions, macro_precision) + assert_allclose(recalls, macro_recall) + assert_allclose(fscores, macro_fscore) # check type assert isinstance(precisions, float) assert isinstance(recalls, float) assert isinstance(fscores, float) - def test_fbeta_multiclass_micro_average_metric(self): + @multi_device + def test_fbeta_multiclass_micro_average_metric(self, device: str): + self.predictions = self.predictions.to(device) + self.targets = self.targets.to(device) + fbeta = FBetaMeasure(average="micro") fbeta(self.predictions, self.targets) metric = fbeta.get_metric() @@ -139,22 +162,27 @@ def test_fbeta_multiclass_micro_average_metric(self): recalls = metric["recall"] fscores = metric["fscore"] - true_positives = [1, 1, 0, 1, 0] - false_positives = [0, 3, 0, 0, 0] - false_negatives = [2, 0, 0, 0, 1] - mean_true_positive = numpy.mean(true_positives) - mean_false_positive = numpy.mean(false_positives) - mean_false_negative = numpy.mean(false_negatives) + # We keep the expected values in CPU because FBetaMeasure returns them in CPU. 
+ true_positives = torch.tensor([1, 1, 0, 1, 0], dtype=torch.float32) + false_positives = torch.tensor([0, 3, 0, 0, 0], dtype=torch.float32) + false_negatives = torch.tensor([2, 0, 0, 0, 1], dtype=torch.float32) + mean_true_positive = true_positives.mean() + mean_false_positive = false_positives.mean() + mean_false_negative = false_negatives.mean() micro_precision = mean_true_positive / (mean_true_positive + mean_false_positive) micro_recall = mean_true_positive / (mean_true_positive + mean_false_negative) micro_fscore = (2 * micro_precision * micro_recall) / (micro_precision + micro_recall) # check value - numpy.testing.assert_almost_equal(precisions, micro_precision, decimal=2) - numpy.testing.assert_almost_equal(recalls, micro_recall, decimal=2) - numpy.testing.assert_almost_equal(fscores, micro_fscore, decimal=2) + assert_allclose(precisions, micro_precision) + assert_allclose(recalls, micro_recall) + assert_allclose(fscores, micro_fscore) + + @multi_device + def test_fbeta_multiclass_with_explicit_labels(self, device: str): + self.predictions = self.predictions.to(device) + self.targets = self.targets.to(device) - def test_fbeta_multiclass_with_explicit_labels(self): # same prediction but with and explicit label ordering fbeta = FBetaMeasure(labels=[4, 3, 2, 1, 0]) fbeta(self.predictions, self.targets) @@ -167,11 +195,15 @@ def test_fbeta_multiclass_with_explicit_labels(self): desired_recalls = self.desired_recalls[::-1] desired_fscores = self.desired_fscores[::-1] # check value - numpy.testing.assert_almost_equal(precisions, desired_precisions, decimal=2) - numpy.testing.assert_almost_equal(recalls, desired_recalls, decimal=2) - numpy.testing.assert_almost_equal(fscores, desired_fscores, decimal=2) + assert_allclose(precisions, desired_precisions) + assert_allclose(recalls, desired_recalls) + assert_allclose(fscores, desired_fscores) + + @multi_device + def test_fbeta_multiclass_with_macro_average(self, device: str): + self.predictions = self.predictions.to(device) + self.targets = self.targets.to(device) - def test_fbeta_multiclass_with_macro_average(self): labels = [0, 1] fbeta = FBetaMeasure(average="macro", labels=labels) fbeta(self.predictions, self.targets) @@ -180,16 +212,21 @@ def test_fbeta_multiclass_with_macro_average(self): recalls = metric["recall"] fscores = metric["fscore"] - macro_precision = numpy.array(self.desired_precisions)[labels].mean() - macro_recall = numpy.array(self.desired_recalls)[labels].mean() - macro_fscore = numpy.array(self.desired_fscores)[labels].mean() + # We keep the expected values in CPU because FBetaMeasure returns them in CPU. 
+ macro_precision = torch.tensor(self.desired_precisions)[labels].mean() + macro_recall = torch.tensor(self.desired_recalls)[labels].mean() + macro_fscore = torch.tensor(self.desired_fscores)[labels].mean() # check value - numpy.testing.assert_almost_equal(precisions, macro_precision, decimal=2) - numpy.testing.assert_almost_equal(recalls, macro_recall, decimal=2) - numpy.testing.assert_almost_equal(fscores, macro_fscore, decimal=2) + assert_allclose(precisions, macro_precision) + assert_allclose(recalls, macro_recall) + assert_allclose(fscores, macro_fscore) + + @multi_device + def test_fbeta_multiclass_with_micro_average(self, device: str): + self.predictions = self.predictions.to(device) + self.targets = self.targets.to(device) - def test_fbeta_multiclass_with_micro_average(self): labels = [1, 3] fbeta = FBetaMeasure(average="micro", labels=labels) fbeta(self.predictions, self.targets) @@ -198,22 +235,27 @@ def test_fbeta_multiclass_with_micro_average(self): recalls = metric["recall"] fscores = metric["fscore"] - true_positives = [1, 1] - false_positives = [3, 0] - false_negatives = [0, 0] - mean_true_positive = numpy.mean(true_positives) - mean_false_positive = numpy.mean(false_positives) - mean_false_negative = numpy.mean(false_negatives) + # We keep the expected values in CPU because FBetaMeasure returns them in CPU. + true_positives = torch.tensor([1, 1], dtype=torch.float32) + false_positives = torch.tensor([3, 0], dtype=torch.float32) + false_negatives = torch.tensor([0, 0], dtype=torch.float32) + mean_true_positive = true_positives.mean() + mean_false_positive = false_positives.mean() + mean_false_negative = false_negatives.mean() micro_precision = mean_true_positive / (mean_true_positive + mean_false_positive) micro_recall = mean_true_positive / (mean_true_positive + mean_false_negative) micro_fscore = (2 * micro_precision * micro_recall) / (micro_precision + micro_recall) # check value - numpy.testing.assert_almost_equal(precisions, micro_precision, decimal=2) - numpy.testing.assert_almost_equal(recalls, micro_recall, decimal=2) - numpy.testing.assert_almost_equal(fscores, micro_fscore, decimal=2) + assert_allclose(precisions, micro_precision) + assert_allclose(recalls, micro_recall) + assert_allclose(fscores, micro_fscore) + + @multi_device + def test_fbeta_multiclass_with_weighted_average(self, device: str): + self.predictions = self.predictions.to(device) + self.targets = self.targets.to(device) - def test_fbeta_multiclass_with_weighted_average(self): labels = [0, 1] fbeta = FBetaMeasure(average="weighted", labels=labels) fbeta(self.predictions, self.targets) @@ -223,18 +265,22 @@ def test_fbeta_multiclass_with_weighted_average(self): fscores = metric["fscore"] weighted_precision, weighted_recall, weighted_fscore, _ = precision_recall_fscore_support( - self.targets, self.predictions.argmax(dim=1), labels=labels, average="weighted" + self.targets.cpu().numpy(), + self.predictions.argmax(dim=1).cpu().numpy(), + labels=labels, + average="weighted", ) # check value - numpy.testing.assert_almost_equal(precisions, weighted_precision, decimal=2) - numpy.testing.assert_almost_equal(recalls, weighted_recall, decimal=2) - numpy.testing.assert_almost_equal(fscores, weighted_fscore, decimal=2) + assert_allclose(precisions, weighted_precision) + assert_allclose(recalls, weighted_recall) + assert_allclose(fscores, weighted_fscore) - def test_fbeta_handles_batch_size_of_one(self): - predictions = torch.Tensor([[0.2862, 0.3479, 0.1627, 0.2033]]) - targets = torch.Tensor([1]) - mask = 
torch.Tensor([1]) + @multi_device + def test_fbeta_handles_batch_size_of_one(self, device: str): + predictions = torch.tensor([[0.2862, 0.3479, 0.1627, 0.2033]], device=device) + targets = torch.tensor([1], device=device) + mask = torch.tensor([1], device=device) fbeta = FBetaMeasure() fbeta(predictions, targets, mask) @@ -242,5 +288,5 @@ def test_fbeta_handles_batch_size_of_one(self): precisions = metric["precision"] recalls = metric["recall"] - numpy.testing.assert_almost_equal(precisions, [0.0, 1.0, 0.0, 0.0]) - numpy.testing.assert_almost_equal(recalls, [0.0, 1.0, 0.0, 0.0]) + assert_allclose(precisions, [0.0, 1.0, 0.0, 0.0]) + assert_allclose(recalls, [0.0, 1.0, 0.0, 0.0]) diff --git a/allennlp/tests/training/metrics/mean_absolute_error_test.py b/allennlp/tests/training/metrics/mean_absolute_error_test.py index 9c25680c652..f3901225866 100644 --- a/allennlp/tests/training/metrics/mean_absolute_error_test.py +++ b/allennlp/tests/training/metrics/mean_absolute_error_test.py @@ -1,25 +1,30 @@ import torch -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import MeanAbsoluteError class MeanAbsoluteErrorTest(AllenNlpTestCase): - def test_mean_absolute_error_computation(self): + @multi_device + def test_mean_absolute_error_computation(self, device: str): mae = MeanAbsoluteError() - predictions = torch.Tensor( - [[1.0, 1.5, 1.0], [2.0, 3.0, 3.5], [4.0, 5.0, 5.5], [6.0, 7.0, 7.5]] + predictions = torch.tensor( + [[1.0, 1.5, 1.0], [2.0, 3.0, 3.5], [4.0, 5.0, 5.5], [6.0, 7.0, 7.5]], device=device + ) + targets = torch.tensor( + [[0.0, 1.0, 0.0], [2.0, 2.0, 0.0], [4.0, 5.0, 0.0], [7.0, 7.0, 0.0]], device=device ) - targets = torch.Tensor([[0.0, 1.0, 0.0], [2.0, 2.0, 0.0], [4.0, 5.0, 0.0], [7.0, 7.0, 0.0]]) mae(predictions, targets) assert mae.get_metric() == 21.0 / 12.0 - mask = torch.Tensor([[1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [1.0, 1.0, 0.0]]) + mask = torch.tensor( + [[1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [1.0, 1.0, 0.0]], device=device + ) mae(predictions, targets, mask) assert mae.get_metric() == (21.0 + 3.5) / (12.0 + 8.0) - new_targets = torch.Tensor( - [[2.0, 2.0, 0.0], [0.0, 1.0, 0.0], [7.0, 7.0, 0.0], [4.0, 5.0, 0.0]] + new_targets = torch.tensor( + [[2.0, 2.0, 0.0], [0.0, 1.0, 0.0], [7.0, 7.0, 0.0], [4.0, 5.0, 0.0]], device=device ) mae(predictions, new_targets) assert mae.get_metric() == (21.0 + 3.5 + 32.0) / (12.0 + 8.0 + 12.0) diff --git a/allennlp/tests/training/metrics/pearson_correlation_test.py b/allennlp/tests/training/metrics/pearson_correlation_test.py index 2ecc7bf50a9..5378aafb9ad 100644 --- a/allennlp/tests/training/metrics/pearson_correlation_test.py +++ b/allennlp/tests/training/metrics/pearson_correlation_test.py @@ -1,12 +1,16 @@ -import torch +from typing import Optional + import numpy as np -from numpy.testing import assert_allclose +import torch +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import PearsonCorrelation -def pearson_corrcoef(predictions, labels, fweights=None): +def pearson_corrcoef( + predictions: np.ndarray, labels: np.ndarray, fweights: Optional[np.ndarray] = None +): covariance_matrices = np.cov(predictions, labels, fweights=fweights) denominator = np.sqrt(covariance_matrices[0, 0] * covariance_matrices[1, 1]) if np.around(denominator, decimals=5) == 0: @@ -17,17 +21,18 @@ 
def pearson_corrcoef(predictions, labels, fweights=None): class PearsonCorrelationTest(AllenNlpTestCase): - def test_pearson_correlation_unmasked_computation(self): + @multi_device + def test_pearson_correlation_unmasked_computation(self, device: str): pearson_correlation = PearsonCorrelation() batch_size = 100 num_labels = 10 - predictions_1 = np.random.randn(batch_size, num_labels).astype("float32") - labels_1 = 0.5 * predictions_1 + np.random.randn(batch_size, num_labels).astype("float32") + predictions_1 = torch.randn(batch_size, num_labels, device=device) + labels_1 = 0.5 * predictions_1 + torch.randn(batch_size, num_labels, device=device) - predictions_2 = np.random.randn(1).repeat(num_labels).astype("float32") - predictions_2 = predictions_2[np.newaxis, :].repeat(batch_size, axis=0) - labels_2 = np.random.randn(1).repeat(num_labels).astype("float32") - labels_2 = 0.5 * predictions_2 + labels_2[np.newaxis, :].repeat(batch_size, axis=0) + predictions_2 = torch.randn(1, device=device).expand(num_labels) + predictions_2 = predictions_2.unsqueeze(0).expand(batch_size, -1) + labels_2 = torch.randn(1, device=device).expand(num_labels) + labels_2 = 0.5 * predictions_2 + labels_2.unsqueeze(0).expand(batch_size, -1) # in most cases, the data is constructed like predictions_1, the data of such a batch different. # but in a few cases, for example, predictions_2, the data of such a batch is exactly the same. @@ -38,72 +43,62 @@ def test_pearson_correlation_unmasked_computation(self): for predictions, labels in predictions_labels: pearson_correlation.reset() for i in range(batch_size // stride): - timestep_predictions = torch.FloatTensor( - predictions[stride * i : stride * (i + 1), :] - ) - timestep_labels = torch.FloatTensor(labels[stride * i : stride * (i + 1), :]) + timestep_predictions = predictions[stride * i : stride * (i + 1), :] + timestep_labels = labels[stride * i : stride * (i + 1), :] expected_pearson_correlation = pearson_corrcoef( - predictions[: stride * (i + 1), :].reshape(-1), - labels[: stride * (i + 1), :].reshape(-1), + predictions[: stride * (i + 1), :].view(-1).cpu().numpy(), + labels[: stride * (i + 1), :].view(-1).cpu().numpy(), ) pearson_correlation(timestep_predictions, timestep_labels) - assert_allclose( - expected_pearson_correlation, pearson_correlation.get_metric(), rtol=1e-5 - ) + assert_allclose(expected_pearson_correlation, pearson_correlation.get_metric()) # Test reset pearson_correlation.reset() - pearson_correlation(torch.FloatTensor(predictions), torch.FloatTensor(labels)) + pearson_correlation(predictions, labels) assert_allclose( - pearson_corrcoef(predictions.reshape(-1), labels.reshape(-1)), + pearson_corrcoef(predictions.view(-1).cpu().numpy(), labels.view(-1).cpu().numpy()), pearson_correlation.get_metric(), - rtol=1e-5, ) - def test_pearson_correlation_masked_computation(self): + @multi_device + def test_pearson_correlation_masked_computation(self, device: str): pearson_correlation = PearsonCorrelation() batch_size = 100 num_labels = 10 - predictions_1 = np.random.randn(batch_size, num_labels).astype("float32") - labels_1 = 0.5 * predictions_1 + np.random.randn(batch_size, num_labels).astype("float32") + predictions_1 = torch.randn(batch_size, num_labels, device=device) + labels_1 = 0.5 * predictions_1 + torch.randn(batch_size, num_labels, device=device) - predictions_2 = np.random.randn(1).repeat(num_labels).astype("float32") - predictions_2 = predictions_2[np.newaxis, :].repeat(batch_size, axis=0) - labels_2 = 
np.random.randn(1).repeat(num_labels).astype("float32") - labels_2 = 0.5 * predictions_2 + labels_2[np.newaxis, :].repeat(batch_size, axis=0) + predictions_2 = torch.randn(1, device=device).expand(num_labels) + predictions_2 = predictions_2.unsqueeze(0).expand(batch_size, -1) + labels_2 = torch.randn(1, device=device).expand(num_labels) + labels_2 = 0.5 * predictions_2 + labels_2.unsqueeze(0).expand(batch_size, -1) predictions_labels = [(predictions_1, labels_1), (predictions_2, labels_2)] # Random binary mask - mask = np.random.randint(0, 2, size=(batch_size, num_labels)).astype("float32") + mask = torch.randint(0, 2, size=(batch_size, num_labels), device=device) stride = 10 for predictions, labels in predictions_labels: pearson_correlation.reset() for i in range(batch_size // stride): - timestep_predictions = torch.FloatTensor( - predictions[stride * i : stride * (i + 1), :] - ) - timestep_labels = torch.FloatTensor(labels[stride * i : stride * (i + 1), :]) - timestep_mask = torch.FloatTensor(mask[stride * i : stride * (i + 1), :]) + timestep_predictions = predictions[stride * i : stride * (i + 1), :] + timestep_labels = labels[stride * i : stride * (i + 1), :] + timestep_mask = mask[stride * i : stride * (i + 1), :] expected_pearson_correlation = pearson_corrcoef( - predictions[: stride * (i + 1), :].reshape(-1), - labels[: stride * (i + 1), :].reshape(-1), - fweights=mask[: stride * (i + 1), :].reshape(-1), + predictions[: stride * (i + 1), :].view(-1).cpu().numpy(), + labels[: stride * (i + 1), :].view(-1).cpu().numpy(), + fweights=mask[: stride * (i + 1), :].view(-1).cpu().numpy(), ) pearson_correlation(timestep_predictions, timestep_labels, timestep_mask) - assert_allclose( - expected_pearson_correlation, pearson_correlation.get_metric(), rtol=1e-5 - ) + assert_allclose(expected_pearson_correlation, pearson_correlation.get_metric()) # Test reset pearson_correlation.reset() - pearson_correlation( - torch.FloatTensor(predictions), torch.FloatTensor(labels), torch.FloatTensor(mask) - ) + pearson_correlation(predictions, labels, mask) expected_pearson_correlation = pearson_corrcoef( - predictions.reshape(-1), labels.reshape(-1), fweights=mask.reshape(-1) + predictions.view(-1).cpu().numpy(), + labels.view(-1).cpu().numpy(), + fweights=mask.view(-1).cpu().numpy(), ) - assert_allclose( - expected_pearson_correlation, pearson_correlation.get_metric(), rtol=1e-5 - ) + assert_allclose(expected_pearson_correlation, pearson_correlation.get_metric()) diff --git a/allennlp/tests/training/metrics/sequence_accuracy_test.py b/allennlp/tests/training/metrics/sequence_accuracy_test.py index eb71aaf354f..c01656b5f2a 100644 --- a/allennlp/tests/training/metrics/sequence_accuracy_test.py +++ b/allennlp/tests/training/metrics/sequence_accuracy_test.py @@ -1,52 +1,58 @@ import torch -import numpy +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import SequenceAccuracy class SequenceAccuracyTest(AllenNlpTestCase): - def test_sequence_accuracy(self): + @multi_device + def test_sequence_accuracy(self, device: str): accuracy = SequenceAccuracy() - gold = torch.Tensor([[1, 2, 3], [2, 4, 8], [0, 1, 1]]) - predictions = torch.Tensor( - [[[1, 2, 3], [1, 2, -1]], [[2, 4, 8], [2, 5, 9]], [[-1, -1, -1], [0, 1, -1]]] + gold = torch.tensor([[1, 2, 3], [2, 4, 8], [0, 1, 1]], device=device) + predictions = torch.tensor( + [[[1, 2, 3], [1, 2, -1]], [[2, 4, 8], [2, 5, 9]], 
[[-1, -1, -1], [0, 1, -1]]], + device=device, ) accuracy(predictions, gold) actual_accuracy = accuracy.get_metric() - numpy.testing.assert_almost_equal(actual_accuracy, 2 / 3) + assert_allclose(actual_accuracy, 2 / 3) - def test_sequence_accuracy_respects_mask(self): + @multi_device + def test_sequence_accuracy_respects_mask(self, device: str): accuracy = SequenceAccuracy() - gold = torch.Tensor([[1, 2, 3], [2, 4, 8], [0, 1, 1], [11, 13, 17]]) - predictions = torch.Tensor( + gold = torch.tensor([[1, 2, 3], [2, 4, 8], [0, 1, 1], [11, 13, 17]], device=device) + predictions = torch.tensor( [ [[1, 2, 3], [1, 2, -1]], [[2, 4, 8], [2, 5, 9]], [[-1, -1, -1], [0, 1, -1]], [[12, 13, 17], [11, 13, 18]], - ] + ], + device=device, ) - mask = torch.Tensor([[0, 1, 1], [1, 1, 1], [1, 1, 0], [1, 0, 1]]) + mask = torch.tensor([[0, 1, 1], [1, 1, 1], [1, 1, 0], [1, 0, 1]], device=device) accuracy(predictions, gold, mask) actual_accuracy = accuracy.get_metric() - numpy.testing.assert_almost_equal(actual_accuracy, 3 / 4) + assert_allclose(actual_accuracy, 3 / 4) - def test_sequence_accuracy_accumulates_and_resets_correctly(self): + @multi_device + def test_sequence_accuracy_accumulates_and_resets_correctly(self, device: str): accuracy = SequenceAccuracy() - gold = torch.Tensor([[1, 2, 3]]) - accuracy(torch.Tensor([[[1, 2, 3]]]), gold) - accuracy(torch.Tensor([[[1, 2, 4]]]), gold) + gold = torch.tensor([[1, 2, 3]], device=device) + accuracy(torch.tensor([[[1, 2, 3]]], device=device), gold) + accuracy(torch.tensor([[[1, 2, 4]]], device=device), gold) actual_accuracy = accuracy.get_metric(reset=True) - numpy.testing.assert_almost_equal(actual_accuracy, 1 / 2) + assert_allclose(actual_accuracy, 1 / 2) assert accuracy.correct_count == 0 assert accuracy.total_count == 0 - def test_get_metric_on_new_object_works(self): + @multi_device + def test_get_metric_on_new_object_works(self, device: str): accuracy = SequenceAccuracy() actual_accuracy = accuracy.get_metric(reset=True) - numpy.testing.assert_almost_equal(actual_accuracy, 0) + assert_allclose(actual_accuracy, 0) diff --git a/allennlp/tests/training/metrics/span_based_f1_measure_test.py b/allennlp/tests/training/metrics/span_based_f1_measure_test.py index ac886ef17c7..9095decfb2d 100644 --- a/allennlp/tests/training/metrics/span_based_f1_measure_test.py +++ b/allennlp/tests/training/metrics/span_based_f1_measure_test.py @@ -2,14 +2,14 @@ import subprocess import torch -import numpy +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.checks import ConfigurationError +from allennlp.common.params import Params +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.data import Vocabulary -from allennlp.training.metrics import SpanBasedF1Measure, Metric from allennlp.models.semantic_role_labeler import write_bio_formatted_tags_to_file -from allennlp.common.params import Params -from allennlp.common.checks import ConfigurationError +from allennlp.training.metrics import SpanBasedF1Measure, Metric class SpanBasedF1Test(AllenNlpTestCase): @@ -38,7 +38,8 @@ def setUp(self): self.vocab = vocab - def test_span_metrics_are_computed_correcly_with_prediction_map(self): + @multi_device + def test_span_metrics_are_computed_correcly_with_prediction_map(self, device: str): # In this example, datapoint1 only has access to ARG1 and V labels, # whereas datapoint2 only has access to ARG2 and V labels. 
@@ -47,10 +48,10 @@ def test_span_metrics_are_computed_correcly_with_prediction_map(self): gold_indices = [[0, 1, 2, 0, 3, 0], [1, 2, 0, 3, 4, 0]] prediction_map_indices = [[0, 1, 2, 5, 6], [0, 3, 4, 5, 6]] - gold_tensor = torch.Tensor(gold_indices) - prediction_map_tensor = torch.Tensor(prediction_map_indices) + gold_tensor = torch.tensor(gold_indices, device=device) + prediction_map_tensor = torch.tensor(prediction_map_indices, device=device) - prediction_tensor = torch.rand([2, 6, 5]) + prediction_tensor = torch.rand([2, 6, 5], device=device) prediction_tensor[0, 0, 0] = 1 prediction_tensor[0, 1, 1] = 1 # (True Positive - ARG1 prediction_tensor[0, 2, 2] = 1 # *) @@ -97,30 +98,33 @@ def test_span_metrics_are_computed_correcly_with_prediction_map(self): metric_dict = metric.get_metric() - numpy.testing.assert_almost_equal(metric_dict["recall-ARG2"], 0.0) - numpy.testing.assert_almost_equal(metric_dict["precision-ARG2"], 0.0) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-ARG2"], 0.0) - numpy.testing.assert_almost_equal(metric_dict["recall-ARG1"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["precision-ARG1"], 0.5) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-ARG1"], 0.666666666) - numpy.testing.assert_almost_equal(metric_dict["recall-V"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["precision-V"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-V"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["recall-overall"], 0.75) - numpy.testing.assert_almost_equal(metric_dict["precision-overall"], 0.6) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-overall"], 0.666666666) - - def test_span_metrics_are_computed_correctly(self): + assert_allclose(metric_dict["recall-ARG2"], 0.0) + assert_allclose(metric_dict["precision-ARG2"], 0.0) + assert_allclose(metric_dict["f1-measure-ARG2"], 0.0) + assert_allclose(metric_dict["recall-ARG1"], 1.0) + assert_allclose(metric_dict["precision-ARG1"], 0.5) + assert_allclose(metric_dict["f1-measure-ARG1"], 0.666666666) + assert_allclose(metric_dict["recall-V"], 1.0) + assert_allclose(metric_dict["precision-V"], 1.0) + assert_allclose(metric_dict["f1-measure-V"], 1.0) + assert_allclose(metric_dict["recall-overall"], 0.75) + assert_allclose(metric_dict["precision-overall"], 0.6) + assert_allclose(metric_dict["f1-measure-overall"], 0.666666666) + + @multi_device + def test_span_metrics_are_computed_correctly(self, device: str): gold_labels = ["O", "B-ARG1", "I-ARG1", "O", "B-ARG2", "I-ARG2", "O", "O", "O"] gold_indices = [self.vocab.get_token_index(x, "tags") for x in gold_labels] - gold_tensor = torch.Tensor([gold_indices]) + gold_tensor = torch.tensor([gold_indices], device=device) - prediction_tensor = torch.rand([2, 9, self.vocab.get_vocab_size("tags")]) + prediction_tensor = torch.rand([2, 9, self.vocab.get_vocab_size("tags")], device=device) # Test that the span measure ignores completely masked sequences by # passing a mask with a fully masked row. 
- mask = torch.LongTensor([[1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0]]) + mask = torch.tensor( + [[1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0]], device=device + ) prediction_tensor[:, 0, 0] = 1 prediction_tensor[:, 1, 1] = 1 # (True positive - ARG1 @@ -159,24 +163,25 @@ def test_span_metrics_are_computed_correctly(self): metric_dict = metric.get_metric() - numpy.testing.assert_almost_equal(metric_dict["recall-ARG2"], 0.0) - numpy.testing.assert_almost_equal(metric_dict["precision-ARG2"], 0.0) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-ARG2"], 0.0) - numpy.testing.assert_almost_equal(metric_dict["recall-ARG1"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["precision-ARG1"], 0.5) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-ARG1"], 0.666666666) - numpy.testing.assert_almost_equal(metric_dict["recall-overall"], 0.5) - numpy.testing.assert_almost_equal(metric_dict["precision-overall"], 0.5) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-overall"], 0.5) - - def test_bmes_span_metrics_are_computed_correctly(self): + assert_allclose(metric_dict["recall-ARG2"], 0.0) + assert_allclose(metric_dict["precision-ARG2"], 0.0) + assert_allclose(metric_dict["f1-measure-ARG2"], 0.0) + assert_allclose(metric_dict["recall-ARG1"], 1.0) + assert_allclose(metric_dict["precision-ARG1"], 0.5) + assert_allclose(metric_dict["f1-measure-ARG1"], 0.666666666) + assert_allclose(metric_dict["recall-overall"], 0.5) + assert_allclose(metric_dict["precision-overall"], 0.5) + assert_allclose(metric_dict["f1-measure-overall"], 0.5) + + @multi_device + def test_bmes_span_metrics_are_computed_correctly(self, device: str): # (bmes_tags) B:0, M:1, E:2, S:3. # [S, B, M, E, S] # [S, S, S, S, S] gold_indices = [[3, 0, 1, 2, 3], [3, 3, 3, 3, 3]] - gold_tensor = torch.Tensor(gold_indices) + gold_tensor = torch.tensor(gold_indices, device=device) - prediction_tensor = torch.rand([2, 5, 4]) + prediction_tensor = torch.rand([2, 5, 4], device=device) # [S, B, E, S, S] # TP: 2, FP: 2, FN: 1. prediction_tensor[0, 0, 3] = 1 # (True positive) @@ -198,24 +203,28 @@ def test_bmes_span_metrics_are_computed_correctly(self): # TP: 3, FP: 4, FN: 5. 
metric_dict = metric.get_metric() - numpy.testing.assert_almost_equal(metric_dict["recall-overall"], 0.375) - numpy.testing.assert_almost_equal(metric_dict["precision-overall"], 0.428, decimal=3) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-overall"], 0.4) + assert_allclose(metric_dict["recall-overall"], 0.375, rtol=0.001, atol=1e-03) + assert_allclose(metric_dict["precision-overall"], 0.428, rtol=0.001, atol=1e-03) + assert_allclose(metric_dict["f1-measure-overall"], 0.4, rtol=0.001, atol=1e-03) - def test_span_f1_can_build_from_params(self): + @multi_device + def test_span_f1_can_build_from_params(self, device: str): params = Params({"type": "span_f1", "tag_namespace": "tags", "ignore_classes": ["V"]}) metric = Metric.from_params(params=params, vocabulary=self.vocab) - assert metric._ignore_classes == ["V"] - assert metric._label_vocabulary == self.vocab.get_index_to_token_vocabulary("tags") + assert metric._ignore_classes == ["V"] # type: ignore + assert metric._label_vocabulary == self.vocab.get_index_to_token_vocabulary( + "tags" + ) # type: ignore - def test_span_f1_matches_perl_script_for_continued_arguments(self): + @multi_device + def test_span_f1_matches_perl_script_for_continued_arguments(self, device: str): bio_tags = ["B-ARG1", "O", "B-C-ARG1", "B-V", "B-ARGM-ADJ", "O"] sentence = ["Mark", "and", "Matt", "were", "running", "fast", "."] gold_indices = [self.vocab.get_token_index(x, "tags") for x in bio_tags] - gold_tensor = torch.Tensor([gold_indices]) - prediction_tensor = torch.rand([1, 6, self.vocab.get_vocab_size("tags")]) - mask = torch.LongTensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]]) + gold_tensor = torch.tensor([gold_indices], device=device) + prediction_tensor = torch.rand([1, 6, self.vocab.get_vocab_size("tags")], device=device) + mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]], device=device) # Make prediction so that it is exactly correct. 
for i, tag_index in enumerate(gold_indices): @@ -235,24 +244,24 @@ def test_span_f1_matches_perl_script_for_continued_arguments(self): assert metric._true_positives["V"] == 1 assert metric._true_positives["ARGM-ADJ"] == 1 - numpy.testing.assert_almost_equal(metric_dict["recall-ARG1"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["precision-ARG1"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-ARG1"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["recall-V"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["precision-V"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-V"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["recall-ARGM-ADJ"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["precision-ARGM-ADJ"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-ARGM-ADJ"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["recall-overall"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["precision-overall"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-overall"], 1.0) + assert_allclose(metric_dict["recall-ARG1"], 1.0) + assert_allclose(metric_dict["precision-ARG1"], 1.0) + assert_allclose(metric_dict["f1-measure-ARG1"], 1.0) + assert_allclose(metric_dict["recall-V"], 1.0) + assert_allclose(metric_dict["precision-V"], 1.0) + assert_allclose(metric_dict["f1-measure-V"], 1.0) + assert_allclose(metric_dict["recall-ARGM-ADJ"], 1.0) + assert_allclose(metric_dict["precision-ARGM-ADJ"], 1.0) + assert_allclose(metric_dict["f1-measure-ARGM-ADJ"], 1.0) + assert_allclose(metric_dict["recall-overall"], 1.0) + assert_allclose(metric_dict["precision-overall"], 1.0) + assert_allclose(metric_dict["f1-measure-overall"], 1.0) # Check that the number of true positive ARG1 labels is the same as the perl script's output: gold_file_path = os.path.join(self.TEST_DIR, "gold_conll_eval.txt") prediction_file_path = os.path.join(self.TEST_DIR, "prediction_conll_eval.txt") - with open(gold_file_path, "a+") as gold_file, open( - prediction_file_path, "a+" + with open(gold_file_path, "w") as gold_file, open( + prediction_file_path, "w" ) as prediction_file: # Use the same bio tags as prediction vs gold to make it obvious by looking # at the perl script output if something is wrong. @@ -274,15 +283,16 @@ def test_span_f1_matches_perl_script_for_continued_arguments(self): ) assert num_correct_arg1_instances_from_perl_evaluation == metric._true_positives["ARG1"] - def test_span_f1_accepts_tags_to_spans_function_argument(self): + @multi_device + def test_span_f1_accepts_tags_to_spans_function_argument(self, device: str): def mock_tags_to_spans_function(tag_sequence, classes_to_ignore=None): return [("mock", (42, 42))] # Should be ignore. 
bio_tags = ["B-ARG1", "O", "B-C-ARG1", "B-V", "B-ARGM-ADJ", "O"] gold_indices = [self.vocab.get_token_index(x, "tags") for x in bio_tags] - gold_tensor = torch.Tensor([gold_indices]) - prediction_tensor = torch.rand([1, 6, self.vocab.get_vocab_size("tags")]) + gold_tensor = torch.tensor([gold_indices], device=device) + prediction_tensor = torch.rand([1, 6, self.vocab.get_vocab_size("tags")], device=device) metric = SpanBasedF1Measure( self.vocab, @@ -294,9 +304,9 @@ def mock_tags_to_spans_function(tag_sequence, classes_to_ignore=None): metric(prediction_tensor, gold_tensor) metric_dict = metric.get_metric() - numpy.testing.assert_almost_equal(metric_dict["recall-overall"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["precision-overall"], 1.0) - numpy.testing.assert_almost_equal(metric_dict["f1-measure-overall"], 1.0) + assert_allclose(metric_dict["recall-overall"], 1.0) + assert_allclose(metric_dict["precision-overall"], 1.0) + assert_allclose(metric_dict["f1-measure-overall"], 1.0) with self.assertRaises(ConfigurationError): SpanBasedF1Measure(self.vocab, label_encoding="INVALID") diff --git a/allennlp/tests/training/metrics/spearman_correlation_test.py b/allennlp/tests/training/metrics/spearman_correlation_test.py index 1111a6834a5..8f2edd9832e 100644 --- a/allennlp/tests/training/metrics/spearman_correlation_test.py +++ b/allennlp/tests/training/metrics/spearman_correlation_test.py @@ -1,9 +1,9 @@ import math + import torch -import numpy as np -from numpy.testing import assert_allclose +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import SpearmanCorrelation @@ -17,8 +17,8 @@ def spearman_formula(predictions, labels, mask=None): labels = labels * mask # if all number of a set is same, return np.nan - if len(np.unique(predictions)) == 1 or len(np.unique(labels)) == 1: - return np.nan + if len(torch.unique(predictions)) == 1 or len(torch.unique(labels)) == 1: + return float("NaN") len_pre = len(predictions) @@ -35,23 +35,24 @@ def spearman_formula(predictions, labels, mask=None): total = 0 for i in range(len_pre): total += (predictions[i][0] - labels[i][0]) ** 2 - expected_spearman_correlation = 1 - float(6 * total) / (len_pre * (len_pre ** 2 - 1)) + expected_spearman_correlation = 1 - 6 * total / (len_pre * (len_pre ** 2 - 1)) return expected_spearman_correlation class SpearmanCorrelationTest(AllenNlpTestCase): - def test_unmasked_computation(self): + @multi_device + def test_unmasked_computation(self, device: str): spearman_correlation = SpearmanCorrelation() batch_size = 10 num_labels = 10 - predictions1 = np.random.randn(batch_size, num_labels).astype("float32") - labels1 = 0.5 * predictions1 + np.random.randn(batch_size, num_labels).astype("float32") + predictions1 = torch.randn(batch_size, num_labels, device=device) + labels1 = 0.5 * predictions1 + torch.randn(batch_size, num_labels, device=device) - predictions2 = np.random.randn(1).repeat(num_labels).astype("float32") - predictions2 = predictions2[np.newaxis, :].repeat(batch_size, axis=0) - labels2 = np.random.randn(1).repeat(num_labels).astype("float32") - labels2 = 0.5 * predictions2 + labels2[np.newaxis, :].repeat(batch_size, axis=0) + predictions2 = torch.randn(1, device=device).repeat(num_labels) + predictions2 = predictions2.unsqueeze(0).expand(batch_size, -1) + labels2 = torch.randn(1, device=device).expand(num_labels) + labels2 = 0.5 * predictions2 + 
labels2.unsqueeze(0).expand(batch_size, -1) # in most cases, the data is constructed like predictions_1, the data of such a batch different. # but in a few cases, for example, predictions_2, the data of such a batch is exactly the same. @@ -59,39 +60,37 @@ def test_unmasked_computation(self): for predictions, labels in predictions_labels_: spearman_correlation.reset() - spearman_correlation(torch.FloatTensor(predictions), torch.FloatTensor(labels)) + spearman_correlation(predictions, labels) assert_allclose( spearman_formula(predictions.reshape(-1), labels.reshape(-1)), spearman_correlation.get_metric(), - rtol=1e-5, ) - def test_masked_computation(self): + @multi_device + def test_masked_computation(self, device: str): spearman_correlation = SpearmanCorrelation() batch_size = 10 num_labels = 10 - predictions1 = np.random.randn(batch_size, num_labels).astype("float32") - labels1 = 0.5 * predictions1 + np.random.randn(batch_size, num_labels).astype("float32") + predictions1 = torch.randn(batch_size, num_labels, device=device) + labels1 = 0.5 * predictions1 + torch.randn(batch_size, num_labels, device=device) - predictions2 = np.random.randn(1).repeat(num_labels).astype("float32") - predictions2 = predictions2[np.newaxis, :].repeat(batch_size, axis=0) - labels2 = np.random.randn(1).repeat(num_labels).astype("float32") - labels2 = 0.5 * predictions2 + labels2[np.newaxis, :].repeat(batch_size, axis=0) + predictions2 = torch.randn(1, device=device).expand(num_labels) + predictions2 = predictions2.unsqueeze(0).expand(batch_size, -1) + labels2 = torch.randn(1, device=device).expand(num_labels) + labels2 = 0.5 * predictions2 + labels2.unsqueeze(0).expand(batch_size, -1) # in most cases, the data is constructed like predictions_1, the data of such a batch different. # but in a few cases, for example, predictions_2, the data of such a batch is exactly the same. predictions_labels_ = [(predictions1, labels1), (predictions2, labels2)] # Random binary mask - mask = np.random.randint(0, 2, size=(batch_size, num_labels)).astype("float32") + mask = torch.randint(0, 2, size=(batch_size, num_labels), device=device) for predictions, labels in predictions_labels_: spearman_correlation.reset() - spearman_correlation( - torch.FloatTensor(predictions), torch.FloatTensor(labels), torch.FloatTensor(mask) - ) + spearman_correlation(predictions, labels, mask) expected_spearman_correlation = spearman_formula( - predictions.reshape(-1), labels.reshape(-1), mask=mask.reshape(-1) + predictions.view(-1), labels.view(-1), mask=mask.view(-1) ) # because add mask, a batch of predictions or labels will have many 0, @@ -101,26 +100,27 @@ def test_masked_computation(self): # so here we only test the positive and negative results of the results. 
assert (expected_spearman_correlation * spearman_correlation.get_metric()) > 0 - def test_reset(self): + @multi_device + def test_reset(self, device: str): spearman_correlation = SpearmanCorrelation() batch_size = 10 num_labels = 10 - predictions = np.random.randn(batch_size, num_labels).astype("float32") - labels = 0.5 * predictions + np.random.randn(batch_size, num_labels).astype("float32") + predictions = torch.randn(batch_size, num_labels, device=device) + labels = 0.5 * predictions + torch.randn(batch_size, num_labels, device=device) # 1.test spearman_correlation.reset() spearman_correlation.reset() - spearman_correlation(torch.FloatTensor(predictions), torch.FloatTensor(labels)) + spearman_correlation(predictions, labels) temp = spearman_correlation.get_metric() spearman_correlation.reset() - spearman_correlation(torch.FloatTensor(predictions), torch.FloatTensor(labels)) + spearman_correlation(predictions, labels) assert spearman_correlation.get_metric() == temp # 2.test spearman_correlation.reset() spearman_correlation.reset() - spearman_correlation(torch.FloatTensor(predictions), torch.FloatTensor(labels)) + spearman_correlation(predictions, labels) spearman_correlation.get_metric(reset=False) - assert spearman_correlation.get_metric() != np.nan + assert spearman_correlation.get_metric() != float("NaN") spearman_correlation.get_metric(reset=True) assert math.isnan(spearman_correlation.get_metric()) diff --git a/allennlp/tests/training/metrics/unigram_recall_test.py b/allennlp/tests/training/metrics/unigram_recall_test.py index 0862ee94a62..38f7523e628 100644 --- a/allennlp/tests/training/metrics/unigram_recall_test.py +++ b/allennlp/tests/training/metrics/unigram_recall_test.py @@ -1,52 +1,58 @@ import torch -import numpy +from torch.testing import assert_allclose -from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.testing import AllenNlpTestCase, multi_device from allennlp.training.metrics import UnigramRecall class UnigramRecallTest(AllenNlpTestCase): - def test_sequence_recall(self): + @multi_device + def test_sequence_recall(self, device: str): recall = UnigramRecall() - gold = torch.Tensor([[1, 2, 3], [2, 4, 8], [7, 1, 1]]) - predictions = torch.Tensor( - [[[1, 2, 3], [1, 2, -1]], [[2, 4, 8], [2, 5, 9]], [[-1, -1, -1], [7, 1, -1]]] + gold = torch.tensor([[1, 2, 3], [2, 4, 8], [7, 1, 1]], device=device) + predictions = torch.tensor( + [[[1, 2, 3], [1, 2, -1]], [[2, 4, 8], [2, 5, 9]], [[-1, -1, -1], [7, 1, -1]]], + device=device, ) recall(predictions, gold) actual_recall = recall.get_metric() - numpy.testing.assert_almost_equal(actual_recall, 1) + assert_allclose(actual_recall, 1) - def test_sequence_recall_respects_mask(self): + @multi_device + def test_sequence_recall_respects_mask(self, device: str): recall = UnigramRecall() - gold = torch.Tensor([[2, 4, 8], [1, 2, 3], [7, 1, 1], [11, 14, 17]]) - predictions = torch.Tensor( + gold = torch.tensor([[2, 4, 8], [1, 2, 3], [7, 1, 1], [11, 14, 17]], device=device) + predictions = torch.tensor( [ [[2, 4, 8], [2, 5, 9]], # 3/3 [[-1, 2, 4], [3, 8, -1]], # 2/2 [[-1, -1, -1], [7, 2, -1]], # 1/2 [[12, 13, 17], [11, 13, 18]], # 2/2 - ] + ], + device=device, ) - mask = torch.Tensor([[1, 1, 1], [0, 1, 1], [1, 1, 0], [1, 0, 1]]) + mask = torch.tensor([[1, 1, 1], [0, 1, 1], [1, 1, 0], [1, 0, 1]], device=device) recall(predictions, gold, mask) actual_recall = recall.get_metric() - numpy.testing.assert_almost_equal(actual_recall, 7 / 8) + assert_allclose(actual_recall, 7 / 8) - def 
test_sequence_recall_accumulates_and_resets_correctly(self): + @multi_device + def test_sequence_recall_accumulates_and_resets_correctly(self, device: str): recall = UnigramRecall() - gold = torch.Tensor([[1, 2, 3]]) - recall(torch.Tensor([[[1, 2, 3]]]), gold) - recall(torch.Tensor([[[7, 8, 4]]]), gold) + gold = torch.tensor([[1, 2, 3]], device=device) + recall(torch.tensor([[[1, 2, 3]]], device=device), gold) + recall(torch.tensor([[[7, 8, 4]]], device=device), gold) actual_recall = recall.get_metric(reset=True) - numpy.testing.assert_almost_equal(actual_recall, 1 / 2) + assert_allclose(actual_recall, 1 / 2) assert recall.correct_count == 0 assert recall.total_count == 0 - def test_get_metric_on_new_object_works(self): + @multi_device + def test_get_metric_on_new_object_works(self, device: str): recall = UnigramRecall() actual_recall = recall.get_metric(reset=True) - numpy.testing.assert_almost_equal(actual_recall, 0) + assert_allclose(actual_recall, 0) diff --git a/allennlp/training/metrics/spearman_correlation.py b/allennlp/training/metrics/spearman_correlation.py index 0f7314a0738..b66ad2f2ce0 100644 --- a/allennlp/training/metrics/spearman_correlation.py +++ b/allennlp/training/metrics/spearman_correlation.py @@ -44,14 +44,14 @@ def __call__( # Flatten predictions, gold_labels, and mask. We calculate the Spearman correlation between # the vectors, since each element in the predictions and gold_labels tensor is assumed # to be a separate observation. - predictions = predictions.view(-1) - gold_labels = gold_labels.view(-1) + predictions = predictions.reshape(-1) + gold_labels = gold_labels.reshape(-1) self.total_predictions = self.total_predictions.to(predictions.device) self.total_gold_labels = self.total_gold_labels.to(gold_labels.device) if mask is not None: - mask = mask.view(-1) + mask = mask.reshape(-1) self.total_predictions = torch.cat((self.total_predictions, predictions * mask), 0) self.total_gold_labels = torch.cat((self.total_gold_labels, gold_labels * mask), 0) else: From 99660ba0771fd724f635b81f1be75fa2b2e23e12 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Thu, 27 Feb 2020 10:54:09 -0500 Subject: [PATCH 3/7] Add a test for the test utility --- allennlp/tests/common/testing.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 allennlp/tests/common/testing.py diff --git a/allennlp/tests/common/testing.py b/allennlp/tests/common/testing.py new file mode 100644 index 00000000000..19d0005cb46 --- /dev/null +++ b/allennlp/tests/common/testing.py @@ -0,0 +1,19 @@ +import torch + +from allennlp.common.testing import AllenNlpTestCase, multi_device + + +class TestFromParams(AllenNlpTestCase): + def test_multi_device(self): + actual_devices = set() + + @multi_device + def dummy_func(_self, device: str): + # Have `self` as in class test functions. 
+ nonlocal actual_devices + actual_devices.add(device) + + dummy_func(self) + + expected_devices = {"cpu", "cuda"} if torch.cuda.is_available() else {"cpu"} + self.assertSetEqual(expected_devices, actual_devices) From 97772f7ce52ebe37f99572f4321bef6bed7240f0 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Thu, 27 Feb 2020 13:05:31 -0500 Subject: [PATCH 4/7] mypy --- allennlp/tests/training/metrics/span_based_f1_measure_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/allennlp/tests/training/metrics/span_based_f1_measure_test.py b/allennlp/tests/training/metrics/span_based_f1_measure_test.py index 9095decfb2d..e4aaca1f674 100644 --- a/allennlp/tests/training/metrics/span_based_f1_measure_test.py +++ b/allennlp/tests/training/metrics/span_based_f1_measure_test.py @@ -212,9 +212,9 @@ def test_span_f1_can_build_from_params(self, device: str): params = Params({"type": "span_f1", "tag_namespace": "tags", "ignore_classes": ["V"]}) metric = Metric.from_params(params=params, vocabulary=self.vocab) assert metric._ignore_classes == ["V"] # type: ignore - assert metric._label_vocabulary == self.vocab.get_index_to_token_vocabulary( + assert metric._label_vocabulary == self.vocab.get_index_to_token_vocabulary( # type: ignore "tags" - ) # type: ignore + ) @multi_device def test_span_f1_matches_perl_script_for_continued_arguments(self, device: str): From 8cb62df9ee9be93d4e5f09ab502977eeb4bb5fc5 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Thu, 27 Feb 2020 13:21:09 -0500 Subject: [PATCH 5/7] Update allennlp/common/testing/test_case.py Co-Authored-By: Mark Neumann --- allennlp/common/testing/test_case.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allennlp/common/testing/test_case.py b/allennlp/common/testing/test_case.py index 56ea99eba7e..2eec156a6f6 100644 --- a/allennlp/common/testing/test_case.py +++ b/allennlp/common/testing/test_case.py @@ -55,7 +55,7 @@ def parametrize(arg_names: Iterable[str], arg_values: Iterable[Iterable[Any]]): Argument names to pass to the test function. arg_values : `Iterable[Iterable[Any]]`, required. Iterable of values to pass to each of the args. - A function call is gonna be made for each inner iterable. + The decorated test will be run for each inner iterable. 
""" def decorator(func): From a7180b0fa6cb527e500c7f9d20e5117898914afd Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Thu, 27 Feb 2020 13:38:58 -0500 Subject: [PATCH 6/7] Fix a PR comment --- allennlp/tests/common/testing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allennlp/tests/common/testing.py b/allennlp/tests/common/testing.py index 19d0005cb46..fff065baf33 100644 --- a/allennlp/tests/common/testing.py +++ b/allennlp/tests/common/testing.py @@ -3,7 +3,7 @@ from allennlp.common.testing import AllenNlpTestCase, multi_device -class TestFromParams(AllenNlpTestCase): +class TestTesting(AllenNlpTestCase): def test_multi_device(self): actual_devices = set() From 4d708bad321fdcb50772b798ad1cdcc968921066 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Thu, 27 Feb 2020 13:40:33 -0500 Subject: [PATCH 7/7] flake8 --- allennlp/common/testing/test_case.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allennlp/common/testing/test_case.py b/allennlp/common/testing/test_case.py index 2eec156a6f6..222a49361a9 100644 --- a/allennlp/common/testing/test_case.py +++ b/allennlp/common/testing/test_case.py @@ -55,7 +55,7 @@ def parametrize(arg_names: Iterable[str], arg_values: Iterable[Iterable[Any]]): Argument names to pass to the test function. arg_values : `Iterable[Iterable[Any]]`, required. Iterable of values to pass to each of the args. - The decorated test will be run for each inner iterable. + The decorated test will be run for each inner iterable. """ def decorator(func):