Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion metrics/bleu/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,12 @@ This metric takes as input a list of predicted sentences and a list of lists of
### Inputs
- **predictions** (`list` of `str`s): Translations to score.
- **references** (`list` of `list`s of `str`s): references for each translation.
- ** tokenizer** : approach used for standardizing `predictions` and `references`.
- ** tokenizer** : approach used for standardizing `predictions` and `references`.
The default tokenizer is `tokenizer_13a`, a relatively minimal tokenization approach that is however equivalent to `mteval-v13a`, used by WMT.
This can be replaced by another tokenizer from a source such as [SacreBLEU](https://github.com/mjpost/sacrebleu/tree/master/sacrebleu/tokenizers).
You can alternatively pass `tokenizer_name` to select built-ins:
- `"whitespace"`: simple whitespace split
- `"coco"`/`"ptb"`: COCO caption PTBTokenizer (requires `pycocoevalcap`)

The default tokenizer is based on whitespace and regexes. It can be replaced by any function that takes a string as input and returns a list of tokens as output. E.g. `word_tokenize()` from [NLTK](https://www.nltk.org/api/nltk.tokenize.html) or pretrained tokenizers from the [Tokenizers library](https://huggingface.co/docs/tokenizers/index).
- **max_order** (`int`): Maximum n-gram order to use when computing BLEU score. Defaults to `4`.
Expand Down Expand Up @@ -122,6 +125,16 @@ Example with the word tokenizer from NLTK:
... [["foo bar foobar"]]
... ]
>>> results = bleu.compute(predictions=predictions, references=references, tokenizer=word_tokenize)

Example matching COCO/PTB tokenization (requires `pycocoevalcap`):
```python
>>> bleu = evaluate.load("bleu")
>>> predictions = ["opacity, consolidation, pleural effusion, and atelectasis are present."]
>>> references = [["opacity, consolidation, pleural effusion, and pneumonia are present."]]
>>> results = bleu.compute(predictions=predictions, references=references, tokenizer_name="coco")
>>> print(round(results["bleu"], 6))
0.594604
```
>>> print(results)
{'bleu': 1.0, 'precisions': [1.0, 1.0, 1.0, 1.0], 'brevity_penalty': 1.0, 'length_ratio': 1.1666666666666667, 'translation_length': 7, 'reference_length': 6}
```
Expand Down
12 changes: 10 additions & 2 deletions metrics/bleu/bleu.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import evaluate

from .nmt_bleu import compute_bleu # From: https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py
from .tokenizer_13a import Tokenizer13a
from .tokenizer_13a import Tokenizer13a, WhitespaceTokenizer, CocoPTBTokenizer


_CITATION = """\
Expand Down Expand Up @@ -112,7 +112,15 @@ def _info(self):
],
)

def _compute(self, predictions, references, tokenizer=Tokenizer13a(), max_order=4, smooth=False):
def _compute(self, predictions, references, tokenizer=Tokenizer13a(), max_order=4, smooth=False, tokenizer_name=None):
# Backward-compatible alias: allow passing tokenizer_name="whitespace" to mimic COCO/PTB tokenization splitting behavior
if tokenizer_name is not None:
if tokenizer_name == "whitespace":
tokenizer = WhitespaceTokenizer()
elif tokenizer_name in {"coco", "ptb", "coco-ptb"}:
tokenizer = CocoPTBTokenizer()
else:
raise ValueError("Unsupported tokenizer_name: {}".format(tokenizer_name))
# if only one reference is provided make sure we still use list of lists
if isinstance(references[0], str):
references = [[ref] for ref in references]
Expand Down
38 changes: 38 additions & 0 deletions metrics/bleu/tokenizer_13a.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,41 @@ def __call__(self, line):
line = line.replace(">", ">")

return self._post_tokenizer(f" {line} ")


class WhitespaceTokenizer(BaseTokenizer):
def signature(self):
return "whitespace"

@lru_cache(maxsize=2**16)
def __call__(self, line):
return line.split()


class CocoPTBTokenizer(BaseTokenizer):
"""Adapter around pycocoevalcap's PTBTokenizer to reproduce COCO caption tokenization.

This requires `pycocoevalcap` to be installed. We call into its tokenizer and
then split the returned detokenized string to obtain tokens, matching how COCO BLEU
builds n-grams (space-split over PTBTokenized text).
"""

def signature(self):
return "coco-ptb"

@lru_cache(maxsize=2**16)
def __call__(self, line):
try:
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer # type: ignore
except Exception as exc: # pragma: no cover - import error path
raise ImportError(
"To use tokenizer_name='coco' (PTBTokenizer), install pycocoevalcap."
) from exc

tokenizer = PTBTokenizer()
# The PTBTokenizer expects a dict of id -> list of dicts with 'caption'
input_dict = {0: [{"caption": line}]}
tokenized = tokenizer.tokenize(input_dict)
# Retrieve tokenized string and split on whitespace to get tokens
tokenized_str = tokenized[0][0]["caption"]
return tokenized_str.split()
39 changes: 39 additions & 0 deletions tests/test_bleu_coco_tokenization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import importlib

import pytest

import evaluate


def require_pycoco():
return importlib.util.find_spec("pycocoevalcap") is not None


@pytest.mark.skipif(not require_pycoco(), reason="pycocoevalcap not installed")
def test_bleu_coco_tokenizer_matches_reported_example():
bleu = evaluate.load("bleu")

preds = ["opacity, consolidation, pleural effusion, and atelectasis are present."]
refs = ["opacity, consolidation, pleural effusion, and pneumonia are present."]

# evaluate with COCO/PTB tokenization
res_coco = bleu.compute(predictions=preds, references=refs, tokenizer_name="coco")
# evaluate default tokenizer to ensure different score
res_default = bleu.compute(predictions=preds, references=refs)

assert res_coco["bleu"] != pytest.approx(res_default["bleu"]) # ensure difference exists
# Expected around 0.5946035573 vs ~0.70 for default/period example
assert res_coco["bleu"] == pytest.approx(0.5946035573, rel=1e-6, abs=1e-6)


@pytest.mark.skipif(not require_pycoco(), reason="pycocoevalcap not installed")
def test_bleu_coco_tokenizer_period_case():
bleu = evaluate.load("bleu")

preds = ["opacity . consolidation . pleural effusion . atelectasis are present ."]
refs = ["opacity . consolidation . pleural effusion . pneumonia are present ."]

res_coco = bleu.compute(predictions=preds, references=refs, tokenizer_name="coco")
assert res_coco["bleu"] == pytest.approx(0.7016879389890388, rel=1e-6, abs=1e-6)