
Unify input order of ROUGEScore and BERTScore with other NLG metrics #687

Closed · wants to merge 10 commits
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -45,6 +45,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `BLEUScore` now expects untokenized input to stay consistent with all the other text metrics ([#640](https://github.com/PyTorchLightning/metrics/pull/640))


- `BERTScore` and `ROUGEScore` now expect references to be the first argument and predictions to be the second, to stay consistent with the other NLG metrics ([#687](https://github.com/PyTorchLightning/metrics/pull/687))


### Deprecated

- Renamed IoU -> Jaccard Index ([#662](https://github.com/PyTorchLightning/metrics/pull/662))
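As a quick illustration of the new call order, here is a minimal sketch based on the updated tests below; the model name and keyword values are illustrative, not prescriptive:

```python
from torchmetrics.functional.text.bert import bert_score

references = ["28-Year-Old Chef Found Dead at San Francisco Mall"]
predictions = ["28-year-old chef found dead in San Francisco mall"]

# After this PR, references come first and predictions second,
# matching the argument order of the other NLG metrics.
score = bert_score(
    references,
    predictions,
    model_name_or_path="albert-base-v2",  # illustrative model choice
    num_layers=8,
    idf=False,
    batch_size=3,
)
print(score["f1"])  # the returned dict also contains "precision" and "recall"
```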
24 changes: 24 additions & 0 deletions tests/text/inputs.py
@@ -60,6 +60,30 @@
],
}

# Examples and expected values taken from:
# https://github.com/Tiiiger/bert_score/blob/master/tests/test_scorer.py
ARTICLES_INPUT = {
    "preds": [
        "28-year-old chef found dead in San Francisco mall",
        "A 28-year-old chef who recently moved to San Francisco was "
        "found dead in the staircase of a local shopping center.",
        "The victim's brother said he cannot imagine anyone who would want to harm him,"
        '"Finally, it went uphill again at him."',
    ],
    "targets": [
        "28-Year-Old Chef Found Dead at San Francisco Mall",
        "A 28-year-old chef who had recently moved to San Francisco was found dead in the stairwell of a local mall "
        "this week.",
        "But the victim's brother says he can't think of anyone who would want to hurt him, saying, "
        '"Things were finally going well for him."',
    ],
}

_inputs_error_rate_batch_size_1 = Input(**ERROR_RATES_BATCHES_1)

_inputs_error_rate_batch_size_2 = Input(**ERROR_RATES_BATCHES_2)

_inputs_multiple_sentences_multiple_reference = Input(**ARTICLES_INPUT)
Contributor:
Actually, there's a single reference for a given hypothesis.

Contributor Author:
Yes. Should I call it _inputs_multiple_sentences_single_reference?

Contributor:
Possibly. Or we can leave the references in the test file for now. We aim to adjust BERTScore to handle multiple references in #647 (similar to the updates you made for ROUGEScore), so we can eventually use the already defined _inputs_multiple_references.
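
For context, a hypothetical sketch of what multiple references per prediction could look like with ROUGEScore; the nesting and call signature are illustrative and assume the references-first order introduced in this PR:

```python
from torchmetrics.text.rouge import ROUGEScore

rouge = ROUGEScore()

# One prediction paired with several acceptable references (nesting is illustrative).
references = [["The cat sat on the mat.", "A cat was sitting on the mat."]]
predictions = ["A cat sat on the mat."]

# References first, predictions second, following the order introduced in this PR.
scores = rouge(references, predictions)
print(scores)  # dict of ROUGE-1/2/L precision, recall and F-measure tensors
```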

Contributor Author:
Right. For now, I'll keep it as is in the current PR. We can rename it when issue #647 is completed.

I also think we should standardize the naming conventions for preds (called hypothesis in some places) and targets (called references in some places) across all NLG metrics.

Member:
IMO we should go with predictions and targets everywhere, since that is more consistent with metrics in other domains.

Member:
agree here :]

120 changes: 53 additions & 67 deletions tests/text/test_bertscore.py
@@ -7,6 +7,7 @@
import torch.distributed as dist
import torch.multiprocessing as mp

from tests.text.inputs import _inputs_multiple_sentences_multiple_reference
from torchmetrics.functional.text.bert import bert_score as metrics_bert_score
from torchmetrics.text.bert import BERTScore
from torchmetrics.utilities.imports import _BERTSCORE_AVAILABLE
@@ -16,22 +17,11 @@

os.environ["TOKENIZERS_PARALLELISM"] = "1"

# Examples and expected values taken from:
# https://github.com/Tiiiger/bert_score/blob/master/tests/test_scorer.py
preds = [
"28-year-old chef found dead in San Francisco mall",
"A 28-year-old chef who recently moved to San Francisco was "
"found dead in the staircase of a local shopping center.",
"The victim's brother said he cannot imagine anyone who would want to harm him,\"Finally, it went uphill again at "
'him."',
]
refs = [
"28-Year-Old Chef Found Dead at San Francisco Mall",
"A 28-year-old chef who had recently moved to San Francisco was found dead in the stairwell of a local mall this "
"week.",
"But the victim's brother says he can't think of anyone who would want to hurt him, saying, \"Things were finally "
'going well for him."',
]
preds = _inputs_multiple_sentences_multiple_reference.preds
refs = _inputs_multiple_sentences_multiple_reference.targets

preds_batched = [preds[:2], preds[2:]]
refs_batched = [refs[:2], refs[2:]]


_METRICS = ["precision", "recall", "f1"]
@@ -50,85 +40,81 @@ def _parse_original_bert_score(score: torch.Tensor) -> Dict[str, List[float]]:
return score_dict


preds_batched = [preds[0:2], preds[2:]]
refs_batched = [refs[0:2], refs[2:]]


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs,preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score_fn(preds, refs):
def test_score_fn(refs, preds):
"""Tests for functional."""
original_score = original_bert_score(preds, refs, model_type=MODEL_NAME, num_layers=8, idf=False, batch_size=3)
original_score = _parse_original_bert_score(original_score)

metrics_score = metrics_bert_score(
preds, refs, model_name_or_path=MODEL_NAME, num_layers=8, idf=False, batch_size=3
refs, preds, model_name_or_path=MODEL_NAME, num_layers=8, idf=False, batch_size=3
)

for metric in _METRICS:
_assert_list(metrics_score[metric], original_score[metric])


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score_fn_with_idf(preds, refs):
def test_score_fn_with_idf(refs, preds):
"""Tests for functional with IDF rescaling."""
original_score = original_bert_score(preds, refs, model_type=MODEL_NAME, num_layers=12, idf=True, batch_size=3)
original_score = _parse_original_bert_score(original_score)

metrics_score = metrics_bert_score(
preds, refs, model_name_or_path=MODEL_NAME, num_layers=12, idf=True, batch_size=3
refs, preds, model_name_or_path=MODEL_NAME, num_layers=12, idf=True, batch_size=3
)

for metric in _METRICS:
_assert_list(metrics_score[metric], original_score[metric])


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score_fn_all_layers(preds, refs):
def test_score_fn_all_layers(refs, preds):
"""Tests for functional and all layers."""
original_score = original_bert_score(preds, refs, model_type=MODEL_NAME, all_layers=True, idf=False, batch_size=3)
original_score = _parse_original_bert_score(original_score)

metrics_score = metrics_bert_score(
preds, refs, model_name_or_path=MODEL_NAME, all_layers=True, idf=False, batch_size=3
refs, preds, model_name_or_path=MODEL_NAME, all_layers=True, idf=False, batch_size=3
)

for metric in _METRICS:
_assert_list(metrics_score[metric], original_score[metric])


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score_fn_all_layers_with_idf(preds, refs):
def test_score_fn_all_layers_with_idf(refs, preds):
"""Tests for functional and all layers with IDF rescaling."""
original_score = original_bert_score(preds, refs, model_type=MODEL_NAME, all_layers=True, idf=True, batch_size=3)
original_score = _parse_original_bert_score(original_score)

metrics_score = metrics_bert_score(
preds, refs, model_name_or_path=MODEL_NAME, all_layers=True, idf=True, batch_size=3
refs, preds, model_name_or_path=MODEL_NAME, all_layers=True, idf=True, batch_size=3
)

for metric in _METRICS:
_assert_list(metrics_score[metric], original_score[metric])


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score_fn_all_layers_rescale_with_baseline(preds, refs):
@@ -146,8 +132,8 @@ def test_score_fn_all_layers_rescale_with_baseline(preds, refs):
original_score = _parse_original_bert_score(original_score)

metrics_score = metrics_bert_score(
preds,
refs,
preds,
model_name_or_path=MODEL_NAME,
lang="en",
num_layers=8,
@@ -161,11 +147,11 @@ def test_score_fn_all_layers_rescale_with_baseline(preds, refs):


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score_fn_rescale_with_baseline(preds, refs):
def test_score_fn_rescale_with_baseline(refs, preds):
"""Tests for functional with baseline rescaling with all layers."""
original_score = original_bert_score(
preds,
@@ -180,8 +166,8 @@ def test_score_fn_rescale_with_baseline(preds, refs):
original_score = _parse_original_bert_score(original_score)

metrics_score = metrics_bert_score(
preds,
refs,
preds,
model_name_or_path=MODEL_NAME,
lang="en",
all_layers=True,
@@ -195,83 +181,83 @@ def test_score_fn_rescale_with_baseline(preds, refs):


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score(preds, refs):
def test_score(refs, preds):
"""Tests for metric."""
original_score = original_bert_score(preds, refs, model_type=MODEL_NAME, num_layers=8, idf=False, batch_size=3)
original_score = _parse_original_bert_score(original_score)

Scorer = BERTScore(model_name_or_path=MODEL_NAME, num_layers=8, idf=False, batch_size=3)
Scorer.update(predictions=preds, references=refs)
Scorer.update(references=refs, predictions=preds)
metrics_score = Scorer.compute()

for metric in _METRICS:
_assert_list(metrics_score[metric], original_score[metric])


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score_with_idf(preds, refs):
def test_score_with_idf(refs, preds):
"""Tests for metric with IDF rescaling."""
original_score = original_bert_score(preds, refs, model_type=MODEL_NAME, num_layers=8, idf=True, batch_size=3)
original_score = _parse_original_bert_score(original_score)

Scorer = BERTScore(model_name_or_path=MODEL_NAME, num_layers=8, idf=True, batch_size=3)
Scorer.update(predictions=preds, references=refs)
Scorer.update(references=refs, predictions=preds)
metrics_score = Scorer.compute()

for metric in _METRICS:
_assert_list(metrics_score[metric], original_score[metric])


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score_all_layers(preds, refs):
def test_score_all_layers(refs, preds):
"""Tests for metric and all layers."""
original_score = original_bert_score(preds, refs, model_type=MODEL_NAME, all_layers=True, idf=False, batch_size=3)
original_score = _parse_original_bert_score(original_score)

Scorer = BERTScore(model_name_or_path=MODEL_NAME, all_layers=True, idf=False, batch_size=3)
Scorer.update(predictions=preds, references=refs)
Scorer.update(references=refs, predictions=preds)
metrics_score = Scorer.compute()

for metric in _METRICS:
_assert_list(metrics_score[metric], original_score[metric])


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score_all_layers_with_idf(preds, refs):
def test_score_all_layers_with_idf(refs, preds):
"""Tests for metric and all layers with IDF rescaling."""
original_score = original_bert_score(preds, refs, model_type=MODEL_NAME, all_layers=True, idf=True, batch_size=3)
original_score = _parse_original_bert_score(original_score)

Scorer = BERTScore(model_name_or_path=MODEL_NAME, all_layers=True, idf=True, batch_size=3)
Scorer.update(predictions=preds, references=refs)
Scorer.update(references=refs, predictions=preds)
metrics_score = Scorer.compute()

for metric in _METRICS:
_assert_list(metrics_score[metric], original_score[metric])


@pytest.mark.parametrize(
"preds,refs",
[(preds_batched, refs_batched)],
"refs, preds",
[(refs_batched, preds_batched)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_accumulation(preds, refs):
def test_accumulation(refs, preds):
"""Tests for metric works with accumulation."""
original_score = original_bert_score(
sum(preds, []), sum(refs, []), model_type=MODEL_NAME, num_layers=8, idf=False, batch_size=3
@@ -280,7 +266,7 @@ def test_accumulation(preds, refs):

Scorer = BERTScore(model_name_or_path=MODEL_NAME, num_layers=8, idf=False, batch_size=3)
for p, r in zip(preds, refs):
Scorer.update(predictions=p, references=r)
Scorer.update(references=r, predictions=p)
metrics_score = Scorer.compute()

for metric in _METRICS:
@@ -293,7 +279,7 @@ def _bert_score_ddp(rank, world_size, preds, refs, original_score):
os.environ["MASTER_PORT"] = "12355"
dist.init_process_group("gloo", rank=rank, world_size=world_size)
Scorer = BERTScore(model_name_or_path=MODEL_NAME, num_layers=8, idf=False, batch_size=3, max_length=128)
Scorer.update(preds, refs)
Scorer.update(refs, preds)
metrics_score = Scorer.compute()
for metric in _METRICS:
_assert_list(metrics_score[metric], original_score[metric])
@@ -308,11 +294,11 @@ def _test_score_ddp_fn(rank, world_size, preds, refs):


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not (_BERTSCORE_AVAILABLE and dist.is_available()), reason="test requires bert_score")
def test_score_ddp(preds, refs):
def test_score_ddp(refs, preds):
"""Tests for metric using DDP."""
world_size = 2
mp.spawn(_test_score_ddp_fn, args=(world_size, preds, refs), nprocs=world_size, join=False)