Add torchmetrics' own implementation of Rouge score metrics #443

Merged Aug 17, 2021: 43 commits merged from the own_rouge branch into master.
The diff below shows changes from 35 of the 43 commits.

Commits (43)
6079376
Apply some changes to function/text/rouge.py
stancld Aug 11, 2021
b699140
Make Rouge-N working
stancld Aug 11, 2021
e6f604b
Add RougeL score calculation
stancld Aug 11, 2021
76e8581
Add some docs + enable using Porter stemmer
stancld Aug 11, 2021
b49f0cf
Enable RougeLSum calculation
stancld Aug 11, 2021
271e403
Add a few references and clean some parts
stancld Aug 11, 2021
41b6530
Fix flake8 issues
stancld Aug 11, 2021
10de32f
[WIP] Update tests (need to fix) + clean some unnecessary code
stancld Aug 11, 2021
6a9a77c
Fix some tests
stancld Aug 11, 2021
f171f1c
Fix a typo
stancld Aug 11, 2021
7c0fc40
Fix some remaining issues
stancld Aug 11, 2021
8ed0699
Return decimal_places argument to ROUGEScore and prepare depreciation…
stancld Aug 11, 2021
aacd0da
Fix some minor issues after the morning review
stancld Aug 12, 2021
542e525
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 12, 2021
393198e
Fix some flake8 and mypy issues
stancld Aug 12, 2021
d1a800d
Fix some issues based on test results
stancld Aug 12, 2021
88d034b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 12, 2021
52da686
Use 0 in (x,y) instead of x == 0 or y == 0
stancld Aug 12, 2021
a78e33c
* Fix test issues and _rouge_score_update method
stancld Aug 12, 2021
68ee7f2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 12, 2021
ba905d6
Fix docstring for _rouge_score_update
stancld Aug 12, 2021
caf60d4
Another fix for doc
stancld Aug 12, 2021
fad5b81
Replace dangerous default dict() values
stancld Aug 12, 2021
e86e3ba
Replace _RougeScore class with dict
stancld Aug 12, 2021
c1b2506
Use import nltk only conditionally when needed
stancld Aug 12, 2021
56e1bb8
Update tests using rouge-score package to be more generic
stancld Aug 12, 2021
22e2fee
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 12, 2021
a67b0c6
Make some style changes to test_rouge.py
stancld Aug 12, 2021
74feff6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 12, 2021
514833e
Fix a styling typo
stancld Aug 12, 2021
9f326d0
Update error messages
stancld Aug 12, 2021
8a40825
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 12, 2021
79148d0
Add condition for importing nltk in ROUGEScore class
stancld Aug 13, 2021
b0c74ec
Apply suggestions from code review
Borda Aug 15, 2021
b2ff757
Merge branch 'master' into own_rouge
Borda Aug 15, 2021
acefe3d
Apply suggestions from code review
Borda Aug 16, 2021
2fc348e
fix warn
Borda Aug 16, 2021
eb30b5e
Merge branch 'master' into own_rouge
Borda Aug 16, 2021
28a142f
warn
SkafteNicki Aug 16, 2021
cae0c95
Update CHANGELOG.md
Borda Aug 16, 2021
b5ec3b8
Merge branch 'master' into own_rouge
mergify[bot] Aug 17, 2021
9b9aacc
Merge branch 'master' into own_rouge
mergify[bot] Aug 17, 2021
dcc2898
Merge branch 'master' into own_rouge
mergify[bot] Aug 17, 2021
3 changes: 3 additions & 0 deletions requirements/test.txt
@@ -25,3 +25,6 @@ mir_eval>=0.6
#pesq @ https://github.com/ludlows/python-pesq/archive/refs/heads/master.zip
#SRMRpy @ https://github.com/jfsantos/SRMRpy/archive/refs/heads/master.zip
speechmetrics @ https://github.com/aliutkus/speechmetrics/archive/refs/heads/master.zip

# text
rouge-score>=0.0.4
1 change: 0 additions & 1 deletion requirements/text.txt
@@ -1,4 +1,3 @@
jiwer>=2.2.0
nltk>=3.6
rouge-score>=0.0.4
bert-score==0.3.10
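Taken together, the two requirement changes above make rouge-score a test-only reference dependency, while nltk stays the runtime text dependency and, per the commit messages, is imported only when actually needed. Below is a minimal, hypothetical sketch of such a lazy-import guard; the flag computation and the _build_stemmer helper are illustrative and not the actual torchmetrics code, though the error message matches the one asserted in the tests further down.

# Hypothetical sketch of a lazy nltk import guard, in the spirit of the
# _NLTK_AVAILABLE flag used by the tests below. Helper names are illustrative,
# not the actual torchmetrics implementation.
import importlib.util

_NLTK_AVAILABLE = importlib.util.find_spec("nltk") is not None


def _build_stemmer(use_stemmer: bool):
    """Return a Porter stemmer only when requested and nltk is installed."""
    if not use_stemmer:
        return None
    if not _NLTK_AVAILABLE:
        raise ValueError(
            "ROUGE metric requires that nltk is installed."
            " Either as `pip install torchmetrics[text]` or `pip install nltk`"
        )
    from nltk.stem.porter import PorterStemmer  # lazy import keeps nltk optional

    return PorterStemmer()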
188 changes: 89 additions & 99 deletions tests/text/test_rouge.py
@@ -16,7 +16,6 @@

import pytest
import torch
from torch import tensor

from torchmetrics.functional.text.rouge import rouge_score
from torchmetrics.text.rouge import ROUGEScore
@@ -30,16 +29,13 @@

ROUGE_KEYS = ("rouge1", "rouge2", "rougeL", "rougeLsum")

PRECISION = 0
RECALL = 1
F_MEASURE = 2

SINGLE_SENTENCE_EXAMPLE_PREDS = "The quick brown fox jumps over the lazy dog"
SINGLE_SENTENCE_EXAMPLE_TARGET = "The quick brown dog jumps on the log."

PREDS = "My name is John".split()
TARGETS = "Is your name John".split()


BATCHES_RS_PREDS = [SINGLE_SENTENCE_EXAMPLE_PREDS]
BATCHES_RS_PREDS.extend(PREDS)
BATCHES_RS_TARGETS = [SINGLE_SENTENCE_EXAMPLE_TARGET]
@@ -55,145 +51,139 @@ def _compute_rouge_score(preds: List[str], targets: List[str], use_stemmer: bool
scorer = RougeScorer(ROUGE_KEYS, use_stemmer=use_stemmer)
aggregator = BootstrapAggregator()
for pred, target in zip(preds, targets):
aggregator.add_scores(scorer.score(pred, target))
aggregator.add_scores(scorer.score(target, pred))
return aggregator.aggregate()


@pytest.mark.skipif(not (_NLTK_AVAILABLE or _ROUGE_SCORE_AVAILABLE), reason="test requires nltk and rouge-score")
@pytest.mark.skipif(not _NLTK_AVAILABLE, reason="test requires nltk")
@pytest.mark.parametrize(
["pl_rouge_metric_key", "rouge_score_key", "metric", "decimal_places", "use_stemmer", "newline_sep"],
["pl_rouge_metric_key", "use_stemmer"],
[
pytest.param("rouge1_precision", "rouge1", PRECISION, 1, True, True),
pytest.param("rouge1_recall", "rouge1", RECALL, 2, True, False),
pytest.param("rouge1_fmeasure", "rouge1", F_MEASURE, 3, False, True),
pytest.param("rouge2_precision", "rouge2", PRECISION, 4, False, False),
pytest.param("rouge2_recall", "rouge2", RECALL, 5, True, True),
pytest.param("rouge2_fmeasure", "rouge2", F_MEASURE, 6, True, False),
pytest.param("rougeL_precision", "rougeL", PRECISION, 6, False, True),
pytest.param("rougeL_recall", "rougeL", RECALL, 5, False, False),
pytest.param("rougeL_fmeasure", "rougeL", F_MEASURE, 3, True, True),
pytest.param("rougeLsum_precision", "rougeLsum", PRECISION, 2, True, False),
pytest.param("rougeLsum_recall", "rougeLsum", RECALL, 1, False, True),
pytest.param("rougeLsum_fmeasure", "rougeLsum", F_MEASURE, 8, False, False),
pytest.param("rouge1_precision", True),
pytest.param("rouge1_recall", True),
pytest.param("rouge1_fmeasure", False),
pytest.param("rouge2_precision", False),
pytest.param("rouge2_recall", True),
pytest.param("rouge2_fmeasure", True),
pytest.param("rougeL_precision", False),
pytest.param("rougeL_recall", False),
pytest.param("rougeL_fmeasure", True),
pytest.param("rougeLsum_precision", True),
pytest.param("rougeLsum_recall", False),
pytest.param("rougeLsum_fmeasure", False),
],
)
def test_rouge_metric_functional_single_sentence(
pl_rouge_metric_key, rouge_score_key, metric, decimal_places, use_stemmer, newline_sep
):
scorer = RougeScorer(ROUGE_KEYS)
rs_scores = scorer.score(SINGLE_SENTENCE_EXAMPLE_PREDS, SINGLE_SENTENCE_EXAMPLE_TARGET)
rs_output = round(rs_scores[rouge_score_key][metric], decimal_places)
def test_rouge_metric_functional_single_sentence(pl_rouge_metric_key, use_stemmer):
rouge_level, metric = pl_rouge_metric_key.split("_")

scorer = RougeScorer(ROUGE_KEYS, use_stemmer=use_stemmer)
rs_scores = scorer.score(SINGLE_SENTENCE_EXAMPLE_TARGET, SINGLE_SENTENCE_EXAMPLE_PREDS)
rs_result = torch.tensor(getattr(rs_scores[rouge_level], metric), dtype=torch.float32)

pl_output = rouge_score(
[SINGLE_SENTENCE_EXAMPLE_PREDS],
[SINGLE_SENTENCE_EXAMPLE_TARGET],
newline_sep=newline_sep,
use_stemmer=use_stemmer,
decimal_places=decimal_places,
)
pl_output = rouge_score([SINGLE_SENTENCE_EXAMPLE_PREDS], [SINGLE_SENTENCE_EXAMPLE_TARGET], use_stemmer=use_stemmer)

assert torch.allclose(pl_output[pl_rouge_metric_key], tensor(rs_output, dtype=torch.float32))
assert torch.allclose(pl_output[pl_rouge_metric_key], rs_result)


@pytest.mark.skipif(not (_NLTK_AVAILABLE or _ROUGE_SCORE_AVAILABLE), reason="test requires nltk and rouge-score")
@pytest.mark.skipif(not _NLTK_AVAILABLE, reason="test requires nltk")
@pytest.mark.parametrize(
["pl_rouge_metric_key", "rouge_score_key", "metric", "decimal_places", "use_stemmer", "newline_sep"],
["pl_rouge_metric_key", "use_stemmer"],
[
pytest.param("rouge1_precision", "rouge1", PRECISION, 1, True, True),
pytest.param("rouge1_recall", "rouge1", RECALL, 2, True, False),
pytest.param("rouge1_fmeasure", "rouge1", F_MEASURE, 3, False, True),
pytest.param("rouge2_precision", "rouge2", PRECISION, 4, False, False),
pytest.param("rouge2_recall", "rouge2", RECALL, 5, True, True),
pytest.param("rouge2_fmeasure", "rouge2", F_MEASURE, 6, True, False),
pytest.param("rougeL_precision", "rougeL", PRECISION, 6, False, True),
pytest.param("rougeL_recall", "rougeL", RECALL, 5, False, False),
pytest.param("rougeL_fmeasure", "rougeL", F_MEASURE, 3, True, True),
pytest.param("rougeLsum_precision", "rougeLsum", PRECISION, 2, True, False),
pytest.param("rougeLsum_recall", "rougeLsum", RECALL, 1, False, True),
pytest.param("rougeLsum_fmeasure", "rougeLsum", F_MEASURE, 8, False, False),
pytest.param("rouge1_precision", True),
pytest.param("rouge1_recall", True),
pytest.param("rouge1_fmeasure", False),
pytest.param("rouge2_precision", False),
pytest.param("rouge2_recall", True),
pytest.param("rouge2_fmeasure", True),
pytest.param("rougeL_precision", False),
pytest.param("rougeL_recall", False),
pytest.param("rougeL_fmeasure", True),
pytest.param("rougeLsum_precision", True),
pytest.param("rougeLsum_recall", False),
pytest.param("rougeLsum_fmeasure", False),
],
)
def test_rouge_metric_functional(
pl_rouge_metric_key, rouge_score_key, metric, decimal_places, use_stemmer, newline_sep
):
def test_rouge_metric_functional(pl_rouge_metric_key, use_stemmer):
rouge_level, metric = pl_rouge_metric_key.split("_")

rs_scores = _compute_rouge_score(PREDS, TARGETS, use_stemmer=use_stemmer)
rs_output = round(rs_scores[rouge_score_key].mid[metric], decimal_places)
rs_result = torch.tensor(getattr(rs_scores[rouge_level].mid, metric), dtype=torch.float32)

pl_output = rouge_score(
PREDS, TARGETS, newline_sep=newline_sep, use_stemmer=use_stemmer, decimal_places=decimal_places
)
pl_output = rouge_score(PREDS, TARGETS, use_stemmer=use_stemmer)

assert torch.allclose(pl_output[pl_rouge_metric_key], tensor(rs_output, dtype=torch.float32))
assert torch.allclose(pl_output[pl_rouge_metric_key], rs_result)


@pytest.mark.skipif(not (_NLTK_AVAILABLE or _ROUGE_SCORE_AVAILABLE), reason="test requires nltk and rouge-score")
@pytest.mark.skipif(not _NLTK_AVAILABLE, reason="test requires nltk")
@pytest.mark.parametrize(
["pl_rouge_metric_key", "rouge_score_key", "metric", "decimal_places", "use_stemmer", "newline_sep"],
["pl_rouge_metric_key", "use_stemmer"],
[
pytest.param("rouge1_precision", "rouge1", PRECISION, 1, True, True),
pytest.param("rouge1_recall", "rouge1", RECALL, 2, True, False),
pytest.param("rouge1_fmeasure", "rouge1", F_MEASURE, 3, False, True),
pytest.param("rouge2_precision", "rouge2", PRECISION, 4, False, False),
pytest.param("rouge2_recall", "rouge2", RECALL, 5, True, True),
pytest.param("rouge2_fmeasure", "rouge2", F_MEASURE, 6, True, False),
pytest.param("rougeL_precision", "rougeL", PRECISION, 6, False, True),
pytest.param("rougeL_recall", "rougeL", RECALL, 5, False, False),
pytest.param("rougeL_fmeasure", "rougeL", F_MEASURE, 3, True, True),
pytest.param("rougeLsum_precision", "rougeLsum", PRECISION, 2, True, False),
pytest.param("rougeLsum_recall", "rougeLsum", RECALL, 1, False, True),
pytest.param("rougeLsum_fmeasure", "rougeLsum", F_MEASURE, 8, False, False),
pytest.param("rouge1_precision", True),
pytest.param("rouge1_recall", True),
pytest.param("rouge1_fmeasure", False),
pytest.param("rouge2_precision", False),
pytest.param("rouge2_recall", True),
pytest.param("rouge2_fmeasure", True),
pytest.param("rougeL_precision", False),
pytest.param("rougeL_recall", False),
pytest.param("rougeL_fmeasure", True),
pytest.param("rougeLsum_precision", True),
pytest.param("rougeLsum_recall", False),
pytest.param("rougeLsum_fmeasure", False),
],
)
def test_rouge_metric_class(pl_rouge_metric_key, rouge_score_key, metric, decimal_places, use_stemmer, newline_sep):
scorer = RougeScorer(ROUGE_KEYS)
rs_scores = scorer.score(SINGLE_SENTENCE_EXAMPLE_PREDS, SINGLE_SENTENCE_EXAMPLE_TARGET)
rs_output = round(rs_scores[rouge_score_key][metric], decimal_places)
def test_rouge_metric_class(pl_rouge_metric_key, use_stemmer):
rouge_level, metric = pl_rouge_metric_key.split("_")

scorer = RougeScorer(ROUGE_KEYS, use_stemmer=use_stemmer)
rs_scores = scorer.score(SINGLE_SENTENCE_EXAMPLE_TARGET, SINGLE_SENTENCE_EXAMPLE_PREDS)
rs_result = torch.tensor(getattr(rs_scores[rouge_level], metric), dtype=torch.float32)

rouge = ROUGEScore(newline_sep=newline_sep, use_stemmer=use_stemmer, decimal_places=decimal_places)
rouge = ROUGEScore(use_stemmer=use_stemmer)
pl_output = rouge([SINGLE_SENTENCE_EXAMPLE_PREDS], [SINGLE_SENTENCE_EXAMPLE_TARGET])

assert torch.allclose(pl_output[pl_rouge_metric_key], tensor(rs_output, dtype=torch.float32))
assert torch.allclose(pl_output[pl_rouge_metric_key], rs_result)


@pytest.mark.skipif(not (_NLTK_AVAILABLE or _ROUGE_SCORE_AVAILABLE), reason="test requires nltk and rouge-score")
@pytest.mark.skipif(not _NLTK_AVAILABLE, reason="test requires nltk")
@pytest.mark.parametrize(
["pl_rouge_metric_key", "rouge_score_key", "metric", "decimal_places", "use_stemmer", "newline_sep"],
["pl_rouge_metric_key", "use_stemmer"],
[
pytest.param("rouge1_precision", "rouge1", PRECISION, 1, True, True),
pytest.param("rouge1_recall", "rouge1", RECALL, 2, True, False),
pytest.param("rouge1_fmeasure", "rouge1", F_MEASURE, 3, False, True),
pytest.param("rouge2_precision", "rouge2", PRECISION, 4, False, False),
pytest.param("rouge2_recall", "rouge2", RECALL, 5, True, True),
pytest.param("rouge2_fmeasure", "rouge2", F_MEASURE, 6, True, False),
pytest.param("rougeL_precision", "rougeL", PRECISION, 6, False, True),
pytest.param("rougeL_recall", "rougeL", RECALL, 5, False, False),
pytest.param("rougeL_fmeasure", "rougeL", F_MEASURE, 3, True, True),
pytest.param("rougeLsum_precision", "rougeLsum", PRECISION, 2, True, False),
pytest.param("rougeLsum_recall", "rougeLsum", RECALL, 1, False, True),
pytest.param("rougeLsum_fmeasure", "rougeLsum", F_MEASURE, 8, False, False),
pytest.param("rouge1_precision", True),
pytest.param("rouge1_recall", True),
pytest.param("rouge1_fmeasure", False),
pytest.param("rouge2_precision", False),
pytest.param("rouge2_recall", True),
pytest.param("rouge2_fmeasure", True),
pytest.param("rougeL_precision", False),
pytest.param("rougeL_recall", False),
pytest.param("rougeL_fmeasure", True),
pytest.param("rougeLsum_precision", True),
pytest.param("rougeLsum_recall", False),
pytest.param("rougeLsum_fmeasure", False),
],
)
def test_rouge_metric_class_batches(
pl_rouge_metric_key, rouge_score_key, metric, decimal_places, use_stemmer, newline_sep
):
def test_rouge_metric_class_batches(pl_rouge_metric_key, use_stemmer):
rouge_level, metric = pl_rouge_metric_key.split("_")

rs_scores = _compute_rouge_score(BATCHES_RS_PREDS, BATCHES_RS_TARGETS, use_stemmer=use_stemmer)
rs_output = round(rs_scores[rouge_score_key].mid[metric], decimal_places)
rs_result = torch.tensor(getattr(rs_scores[rouge_level].mid, metric), dtype=torch.float32)

rouge = ROUGEScore(newline_sep=newline_sep, use_stemmer=use_stemmer, decimal_places=decimal_places)
rouge = ROUGEScore(use_stemmer=use_stemmer)
for batch in BATCHES:
rouge.update(batch["preds"], batch["targets"])
pl_output = rouge.compute()

assert torch.allclose(pl_output[pl_rouge_metric_key], tensor(rs_output, dtype=torch.float32))
assert torch.allclose(pl_output[pl_rouge_metric_key], rs_result)


def test_rouge_metric_raises_errors_and_warnings():
"""Test that expected warnings and errors are raised."""
if not (_NLTK_AVAILABLE and _ROUGE_SCORE_AVAILABLE):
if not _NLTK_AVAILABLE:
with pytest.raises(
ValueError,
match="ROUGE metric requires that both nltk and rouge-score is installed."
"Either as `pip install torchmetrics[text]` or `pip install nltk rouge-score`",
match="ROUGE metric requires that nltk is installed."
"Either as `pip install torchmetrics[text]` or `pip install nltk`",
):
ROUGEScore()

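For orientation, here is a minimal usage sketch mirroring the calls these tests exercise, covering the functional rouge_score and the ROUGEScore module. It assumes a torchmetrics build that includes this PR and an installed nltk; the example sentences are the single-sentence fixtures from the test file, and the result keys shown are among those parametrized above.

# Minimal usage sketch based on the calls exercised in the tests above;
# assumes a torchmetrics build containing this PR and nltk installed.
from torchmetrics.functional.text.rouge import rouge_score
from torchmetrics.text.rouge import ROUGEScore

preds = "The quick brown fox jumps over the lazy dog"
target = "The quick brown dog jumps on the log."

# Functional interface: returns a dict of tensors keyed like "rouge1_fmeasure".
scores = rouge_score([preds], [target], use_stemmer=True)
print(scores["rouge1_fmeasure"], scores["rougeL_precision"])

# Module interface: accumulate over batches with update(), then compute().
rouge = ROUGEScore(use_stemmer=True)
rouge.update([preds], [target])
print(rouge.compute()["rougeLsum_recall"])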