
Unify input order of ROUGEScore and BERTScore with other NLG metrics #687

Closed · wants to merge 10 commits
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -45,6 +45,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `BLEUScore` now expects untokenized input to stay consistent with all the other text metrics ([#640](https://github.com/PyTorchLightning/metrics/pull/640))


- `BERTScore` and `ROUGEScore` now expect references to be the first argument and predictions to be the second, to stay consistent with the other NLG metrics ([#687](https://github.com/PyTorchLightning/metrics/pull/687))


### Deprecated

- Renamed IoU -> Jaccard Index ([#662](https://github.com/PyTorchLightning/metrics/pull/662))
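As a quick illustration of the new call order, here is a minimal sketch based on the updated tests below; the model name and keyword values are illustrative, not prescriptive:

```python
from torchmetrics.functional.text.bert import bert_score

references = ["28-Year-Old Chef Found Dead at San Francisco Mall"]
predictions = ["28-year-old chef found dead in San Francisco mall"]

# After this PR, references come first and predictions second,
# matching the argument order of the other NLG metrics.
score = bert_score(
    references,
    predictions,
    model_name_or_path="albert-base-v2",  # illustrative model choice
    num_layers=8,
    idf=False,
    batch_size=3,
)
print(score["f1"])  # the returned dict also contains "precision" and "recall"
```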
24 changes: 24 additions & 0 deletions tests/text/inputs.py
@@ -60,6 +60,30 @@
],
}

# Examples and expected values taken from:
# https://github.com/Tiiiger/bert_score/blob/master/tests/test_scorer.py
ARTICLES_INPUT = {
    "preds": [
        "28-year-old chef found dead in San Francisco mall",
        "A 28-year-old chef who recently moved to San Francisco was "
        "found dead in the staircase of a local shopping center.",
        "The victim's brother said he cannot imagine anyone who would want to harm him,"
        '"Finally, it went uphill again at him."',
    ],
    "targets": [
        "28-Year-Old Chef Found Dead at San Francisco Mall",
        "A 28-year-old chef who had recently moved to San Francisco was found dead in the stairwell of a local mall "
        "this week.",
        "But the victim's brother says he can't think of anyone who would want to hurt him, saying, "
        '"Things were finally going well for him."',
    ],
}

_inputs_error_rate_batch_size_1 = Input(**ERROR_RATES_BATCHES_1)

_inputs_error_rate_batch_size_2 = Input(**ERROR_RATES_BATCHES_2)

_inputs_multiple_sentences_multiple_reference = Input(**ARTICLES_INPUT)
Contributor:
Actually, there's a single reference for a given hypothesis.

Contributor Author:
Yes. Should I call it _inputs_multiple_sentences_single_reference?

Contributor:
Possibly. Or we can leave the references in the test file for now. We aim to adjust BERTScore to handle multiple references in #647 (similar to the updates you made for ROUGEScore), so we can eventually use the already defined _inputs_multiple_references.
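
For context, a hypothetical sketch of what multiple references per prediction could look like with ROUGEScore; the nesting and call signature are illustrative and assume the references-first order introduced in this PR:

```python
from torchmetrics.text.rouge import ROUGEScore

rouge = ROUGEScore()

# One prediction paired with several acceptable references (nesting is illustrative).
references = [["The cat sat on the mat.", "A cat was sitting on the mat."]]
predictions = ["A cat sat on the mat."]

# References first, predictions second, following the order introduced in this PR.
scores = rouge(references, predictions)
print(scores)  # dict of ROUGE-1/2/L precision, recall and F-measure tensors
```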

Contributor Author:
Right. For now, I'll keep it as is in the current PR. We can rename it when issue #647 is completed.

I also think we should standardize the naming conventions for preds (called hypothesis in some places) and targets (called references in some places) across all NLG metrics.

Member:
IMO we should go with predictions and targets everywhere, since that is more consistent with metrics in other domains.

Member:
agree here :]

120 changes: 53 additions & 67 deletions tests/text/test_bertscore.py
@@ -7,6 +7,7 @@
import torch.distributed as dist
import torch.multiprocessing as mp

from tests.text.inputs import _inputs_multiple_sentences_multiple_reference
from torchmetrics.functional.text.bert import bert_score as metrics_bert_score
from torchmetrics.text.bert import BERTScore
from torchmetrics.utilities.imports import _BERTSCORE_AVAILABLE
@@ -16,22 +17,11 @@

os.environ["TOKENIZERS_PARALLELISM"] = "1"

# Examples and expected values taken from:
# https://github.com/Tiiiger/bert_score/blob/master/tests/test_scorer.py
preds = [
"28-year-old chef found dead in San Francisco mall",
"A 28-year-old chef who recently moved to San Francisco was "
"found dead in the staircase of a local shopping center.",
"The victim's brother said he cannot imagine anyone who would want to harm him,\"Finally, it went uphill again at "
'him."',
]
refs = [
"28-Year-Old Chef Found Dead at San Francisco Mall",
"A 28-year-old chef who had recently moved to San Francisco was found dead in the stairwell of a local mall this "
"week.",
"But the victim's brother says he can't think of anyone who would want to hurt him, saying, \"Things were finally "
'going well for him."',
]
preds = _inputs_multiple_sentences_multiple_reference.preds
refs = _inputs_multiple_sentences_multiple_reference.targets

preds_batched = [preds[:2], preds[2:]]
refs_batched = [refs[:2], refs[2:]]


_METRICS = ["precision", "recall", "f1"]
@@ -50,85 +40,81 @@ def _parse_original_bert_score(score: torch.Tensor) -> Dict[str, List[float]]:
return score_dict


preds_batched = [preds[0:2], preds[2:]]
refs_batched = [refs[0:2], refs[2:]]


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs,preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score_fn(preds, refs):
def test_score_fn(refs, preds):
"""Tests for functional."""
original_score = original_bert_score(preds, refs, model_type=MODEL_NAME, num_layers=8, idf=False, batch_size=3)
original_score = _parse_original_bert_score(original_score)

metrics_score = metrics_bert_score(
preds, refs, model_name_or_path=MODEL_NAME, num_layers=8, idf=False, batch_size=3
refs, preds, model_name_or_path=MODEL_NAME, num_layers=8, idf=False, batch_size=3
)

for metric in _METRICS:
_assert_list(metrics_score[metric], original_score[metric])


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score_fn_with_idf(preds, refs):
def test_score_fn_with_idf(refs, preds):
"""Tests for functional with IDF rescaling."""
original_score = original_bert_score(preds, refs, model_type=MODEL_NAME, num_layers=12, idf=True, batch_size=3)
original_score = _parse_original_bert_score(original_score)

metrics_score = metrics_bert_score(
preds, refs, model_name_or_path=MODEL_NAME, num_layers=12, idf=True, batch_size=3
refs, preds, model_name_or_path=MODEL_NAME, num_layers=12, idf=True, batch_size=3
)

for metric in _METRICS:
_assert_list(metrics_score[metric], original_score[metric])


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score_fn_all_layers(preds, refs):
def test_score_fn_all_layers(refs, preds):
"""Tests for functional and all layers."""
original_score = original_bert_score(preds, refs, model_type=MODEL_NAME, all_layers=True, idf=False, batch_size=3)
original_score = _parse_original_bert_score(original_score)

metrics_score = metrics_bert_score(
preds, refs, model_name_or_path=MODEL_NAME, all_layers=True, idf=False, batch_size=3
refs, preds, model_name_or_path=MODEL_NAME, all_layers=True, idf=False, batch_size=3
)

for metric in _METRICS:
_assert_list(metrics_score[metric], original_score[metric])


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score_fn_all_layers_with_idf(preds, refs):
def test_score_fn_all_layers_with_idf(refs, preds):
"""Tests for functional and all layers with IDF rescaling."""
original_score = original_bert_score(preds, refs, model_type=MODEL_NAME, all_layers=True, idf=True, batch_size=3)
original_score = _parse_original_bert_score(original_score)

metrics_score = metrics_bert_score(
preds, refs, model_name_or_path=MODEL_NAME, all_layers=True, idf=True, batch_size=3
refs, preds, model_name_or_path=MODEL_NAME, all_layers=True, idf=True, batch_size=3
)

for metric in _METRICS:
_assert_list(metrics_score[metric], original_score[metric])


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score_fn_all_layers_rescale_with_baseline(preds, refs):
@@ -146,8 +132,8 @@ def test_score_fn_all_layers_rescale_with_baseline(preds, refs):
original_score = _parse_original_bert_score(original_score)

metrics_score = metrics_bert_score(
preds,
refs,
preds,
model_name_or_path=MODEL_NAME,
lang="en",
num_layers=8,
@@ -161,11 +147,11 @@ def test_score_fn_all_layers_rescale_with_baseline(preds, refs):


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score_fn_rescale_with_baseline(preds, refs):
def test_score_fn_rescale_with_baseline(refs, preds):
"""Tests for functional with baseline rescaling with all layers."""
original_score = original_bert_score(
preds,
@@ -180,8 +166,8 @@ def test_score_fn_rescale_with_baseline(preds, refs):
original_score = _parse_original_bert_score(original_score)

metrics_score = metrics_bert_score(
preds,
refs,
preds,
model_name_or_path=MODEL_NAME,
lang="en",
all_layers=True,
@@ -195,83 +181,83 @@ def test_score_fn_rescale_with_baseline(preds, refs):


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score(preds, refs):
def test_score(refs, preds):
"""Tests for metric."""
original_score = original_bert_score(preds, refs, model_type=MODEL_NAME, num_layers=8, idf=False, batch_size=3)
original_score = _parse_original_bert_score(original_score)

Scorer = BERTScore(model_name_or_path=MODEL_NAME, num_layers=8, idf=False, batch_size=3)
Scorer.update(predictions=preds, references=refs)
Scorer.update(references=refs, predictions=preds)
metrics_score = Scorer.compute()

for metric in _METRICS:
_assert_list(metrics_score[metric], original_score[metric])


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score_with_idf(preds, refs):
def test_score_with_idf(refs, preds):
"""Tests for metric with IDF rescaling."""
original_score = original_bert_score(preds, refs, model_type=MODEL_NAME, num_layers=8, idf=True, batch_size=3)
original_score = _parse_original_bert_score(original_score)

Scorer = BERTScore(model_name_or_path=MODEL_NAME, num_layers=8, idf=True, batch_size=3)
Scorer.update(predictions=preds, references=refs)
Scorer.update(references=refs, predictions=preds)
metrics_score = Scorer.compute()

for metric in _METRICS:
_assert_list(metrics_score[metric], original_score[metric])


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score_all_layers(preds, refs):
def test_score_all_layers(refs, preds):
"""Tests for metric and all layers."""
original_score = original_bert_score(preds, refs, model_type=MODEL_NAME, all_layers=True, idf=False, batch_size=3)
original_score = _parse_original_bert_score(original_score)

Scorer = BERTScore(model_name_or_path=MODEL_NAME, all_layers=True, idf=False, batch_size=3)
Scorer.update(predictions=preds, references=refs)
Scorer.update(references=refs, predictions=preds)
metrics_score = Scorer.compute()

for metric in _METRICS:
_assert_list(metrics_score[metric], original_score[metric])


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_score_all_layers_with_idf(preds, refs):
def test_score_all_layers_with_idf(refs, preds):
"""Tests for metric and all layers with IDF rescaling."""
original_score = original_bert_score(preds, refs, model_type=MODEL_NAME, all_layers=True, idf=True, batch_size=3)
original_score = _parse_original_bert_score(original_score)

Scorer = BERTScore(model_name_or_path=MODEL_NAME, all_layers=True, idf=True, batch_size=3)
Scorer.update(predictions=preds, references=refs)
Scorer.update(references=refs, predictions=preds)
metrics_score = Scorer.compute()

for metric in _METRICS:
_assert_list(metrics_score[metric], original_score[metric])


@pytest.mark.parametrize(
"preds,refs",
[(preds_batched, refs_batched)],
"refs, preds",
[(refs_batched, preds_batched)],
)
@pytest.mark.skipif(not _BERTSCORE_AVAILABLE, reason="test requires bert_score")
def test_accumulation(preds, refs):
def test_accumulation(refs, preds):
"""Tests for metric works with accumulation."""
original_score = original_bert_score(
sum(preds, []), sum(refs, []), model_type=MODEL_NAME, num_layers=8, idf=False, batch_size=3
@@ -280,7 +266,7 @@ def test_accumulation(preds, refs):

Scorer = BERTScore(model_name_or_path=MODEL_NAME, num_layers=8, idf=False, batch_size=3)
for p, r in zip(preds, refs):
Scorer.update(predictions=p, references=r)
Scorer.update(references=r, predictions=p)
metrics_score = Scorer.compute()

for metric in _METRICS:
@@ -293,7 +279,7 @@ def _bert_score_ddp(rank, world_size, preds, refs, original_score):
os.environ["MASTER_PORT"] = "12355"
dist.init_process_group("gloo", rank=rank, world_size=world_size)
Scorer = BERTScore(model_name_or_path=MODEL_NAME, num_layers=8, idf=False, batch_size=3, max_length=128)
Scorer.update(preds, refs)
Scorer.update(refs, preds)
metrics_score = Scorer.compute()
for metric in _METRICS:
_assert_list(metrics_score[metric], original_score[metric])
@@ -308,11 +294,11 @@ def _test_score_ddp_fn(rank, world_size, preds, refs):


@pytest.mark.parametrize(
"preds,refs",
[(preds, refs)],
"refs, preds",
[(refs, preds)],
)
@pytest.mark.skipif(not (_BERTSCORE_AVAILABLE and dist.is_available()), reason="test requires bert_score")
def test_score_ddp(preds, refs):
def test_score_ddp(refs, preds):
"""Tests for metric using DDP."""
world_size = 2
mp.spawn(_test_score_ddp_fn, args=(world_size, preds, refs), nprocs=world_size, join=False)