From 8f0fc2d56f8c0f124c6d5ec0d782588d0963d17e Mon Sep 17 00:00:00 2001
From: Bavishya Sankaranarayanan
Date: Wed, 13 Aug 2025 13:59:16 +0530
Subject: [PATCH] bleu: add tokenizer_name option with COCO/PTB tokenizer and whitespace tokenizer; add focused tests and docs

---
 metrics/bleu/README.md               | 15 ++++++++++-
 metrics/bleu/bleu.py                 | 12 +++++++--
 metrics/bleu/tokenizer_13a.py        | 38 +++++++++++++++++++++++++++
 tests/test_bleu_coco_tokenization.py | 39 ++++++++++++++++++++++++++++
 4 files changed, 101 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_bleu_coco_tokenization.py

diff --git a/metrics/bleu/README.md b/metrics/bleu/README.md
index 6371b6cfe..47ea714eb 100644
--- a/metrics/bleu/README.md
+++ b/metrics/bleu/README.md
@@ -50,9 +50,12 @@ This metric takes as input a list of predicted sentences and a list of lists of
 ### Inputs
 - **predictions** (`list` of `str`s): Translations to score.
 - **references** (`list` of `list`s of `str`s): references for each translation.
-- ** tokenizer** : approach used for standardizing `predictions` and `references`.
+- **tokenizer**: approach used for standardizing `predictions` and `references`.
 The default tokenizer is `tokenizer_13a`, a relatively minimal tokenization approach that is however equivalent to `mteval-v13a`, used by WMT.
 This can be replaced by another tokenizer from a source such as [SacreBLEU](https://github.com/mjpost/sacrebleu/tree/master/sacrebleu/tokenizers).
+You can alternatively pass `tokenizer_name` to select a built-in tokenizer:
+  - `"whitespace"`: simple whitespace split
+  - `"coco"`/`"ptb"`: COCO caption PTBTokenizer (requires `pycocoevalcap`)
 The default tokenizer is based on whitespace and regexes. It can be replaced by any function that takes a string as input and returns a list of tokens as output. E.g. `word_tokenize()` from [NLTK](https://www.nltk.org/api/nltk.tokenize.html) or pretrained tokenizers from the [Tokenizers library](https://huggingface.co/docs/tokenizers/index).
 - **max_order** (`int`): Maximum n-gram order to use when computing BLEU score. Defaults to `4`.
@@ -122,6 +125,16 @@ Example with the word tokenizer from NLTK:
 ...     [["foo bar foobar"]]
 ... ]
 >>> results = bleu.compute(predictions=predictions, references=references, tokenizer=word_tokenize)
 >>> print(results)
 {'bleu': 1.0, 'precisions': [1.0, 1.0, 1.0, 1.0], 'brevity_penalty': 1.0, 'length_ratio': 1.1666666666666667, 'translation_length': 7, 'reference_length': 6}
 ```
+
+Example matching COCO/PTB tokenization (requires `pycocoevalcap`):
+```python
+>>> bleu = evaluate.load("bleu")
+>>> predictions = ["opacity, consolidation, pleural effusion, and atelectasis are present."]
+>>> references = [["opacity, consolidation, pleural effusion, and pneumonia are present."]]
+>>> results = bleu.compute(predictions=predictions, references=references, tokenizer_name="coco")
+>>> print(round(results["bleu"], 6))
+0.594604
+```
diff --git a/metrics/bleu/bleu.py b/metrics/bleu/bleu.py
index 38a10c3b3..006108dd7 100644
--- a/metrics/bleu/bleu.py
+++ b/metrics/bleu/bleu.py
@@ -18,7 +18,7 @@
 import evaluate
 
 from .nmt_bleu import compute_bleu  # From: https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py
-from .tokenizer_13a import Tokenizer13a
+from .tokenizer_13a import Tokenizer13a, WhitespaceTokenizer, CocoPTBTokenizer
 
 
 _CITATION = """\
@@ -112,7 +112,15 @@ def _info(self):
             ],
         )
 
-    def _compute(self, predictions, references, tokenizer=Tokenizer13a(), max_order=4, smooth=False):
+    def _compute(self, predictions, references, tokenizer=Tokenizer13a(), max_order=4, smooth=False, tokenizer_name=None):
+        # Optional built-in tokenizers: "whitespace" splits on whitespace only; "coco"/"ptb" use pycocoevalcap's PTBTokenizer. When given, tokenizer_name overrides tokenizer.
+        if tokenizer_name is not None:
+            if tokenizer_name == "whitespace":
+                tokenizer = WhitespaceTokenizer()
+            elif tokenizer_name in {"coco", "ptb", "coco-ptb"}:
+                tokenizer = CocoPTBTokenizer()
+            else:
+                raise ValueError("Unsupported tokenizer_name: {}".format(tokenizer_name))
         # if only one reference is provided make sure we still use list of lists
         if isinstance(references[0], str):
             references = [[ref] for ref in references]
diff --git a/metrics/bleu/tokenizer_13a.py b/metrics/bleu/tokenizer_13a.py
index c7a1b3dbd..6d93143d3 100644
--- a/metrics/bleu/tokenizer_13a.py
+++ b/metrics/bleu/tokenizer_13a.py
@@ -98,3 +98,41 @@ def __call__(self, line):
         line = line.replace("&gt;", ">")
 
         return self._post_tokenizer(f" {line} ")
+
+
+class WhitespaceTokenizer(BaseTokenizer):
+    def signature(self):
+        return "whitespace"
+
+    @lru_cache(maxsize=2**16)
+    def __call__(self, line):
+        return line.split()
+
+
+class CocoPTBTokenizer(BaseTokenizer):
+    """Adapter around pycocoevalcap's PTBTokenizer to reproduce COCO caption tokenization.
+
+    This requires `pycocoevalcap` to be installed (its PTBTokenizer shells out to Java).
+    We call into its tokenizer and then split the returned tokenized string on whitespace,
+    matching how COCO BLEU builds n-grams (space-split over PTB-tokenized text).
+    """
+
+    def signature(self):
+        return "coco-ptb"
+
+    @lru_cache(maxsize=2**16)
+    def __call__(self, line):
+        try:
+            from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer  # type: ignore
+        except Exception as exc:  # pragma: no cover - import error path
+            raise ImportError(
+                "To use tokenizer_name='coco' (PTBTokenizer), install pycocoevalcap."
+            ) from exc
+
+        tokenizer = PTBTokenizer()
+        # PTBTokenizer expects a dict of image id -> list of {"caption": str} dicts ...
+        input_dict = {0: [{"caption": line}]}
+        tokenized = tokenizer.tokenize(input_dict)
+        # ... and returns a dict of image id -> list of tokenized strings; split on whitespace.
+        tokenized_str = tokenized[0][0]
+        return tokenized_str.split()
diff --git a/tests/test_bleu_coco_tokenization.py b/tests/test_bleu_coco_tokenization.py
new file mode 100644
index 000000000..e0585a715
--- /dev/null
+++ b/tests/test_bleu_coco_tokenization.py
@@ -0,0 +1,39 @@
+import importlib.util
+
+import pytest
+
+import evaluate
+
+
+def require_pycoco():
+    return importlib.util.find_spec("pycocoevalcap") is not None
+
+
+@pytest.mark.skipif(not require_pycoco(), reason="pycocoevalcap not installed")
+def test_bleu_coco_tokenizer_matches_reported_example():
+    bleu = evaluate.load("bleu")
+
+    preds = ["opacity, consolidation, pleural effusion, and atelectasis are present."]
+    refs = ["opacity, consolidation, pleural effusion, and pneumonia are present."]
+
+    # evaluate with COCO/PTB tokenization
+    res_coco = bleu.compute(predictions=preds, references=refs, tokenizer_name="coco")
+    # and with the default tokenizer, to check that the scores differ
+    res_default = bleu.compute(predictions=preds, references=refs)
+
+    assert res_coco["bleu"] != pytest.approx(res_default["bleu"])  # tokenization must change the score
+    # expected COCO/PTB BLEU for this pair (the pre-tokenized period case below scores ~0.70)
+    assert res_coco["bleu"] == pytest.approx(0.5946035573, rel=1e-6, abs=1e-6)
+
+
+@pytest.mark.skipif(not require_pycoco(), reason="pycocoevalcap not installed")
+def test_bleu_coco_tokenizer_period_case():
+    bleu = evaluate.load("bleu")
+
+    preds = ["opacity . consolidation . pleural effusion . atelectasis are present ."]
+    refs = ["opacity . consolidation . pleural effusion . pneumonia are present ."]
+
+    res_coco = bleu.compute(predictions=preds, references=refs, tokenizer_name="coco")
+    assert res_coco["bleu"] == pytest.approx(0.7016879389890388, rel=1e-6, abs=1e-6)
+
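
Usage sketch (reviewer note, not part of the patch): a minimal example of how the new `tokenizer_name` option is intended to be called once this change is applied. It assumes `evaluate` and `pycocoevalcap` are installed; pycocoevalcap's PTBTokenizer additionally shells out to a Java runtime, which must be available on the PATH.

```python
# Sketch only: exercises the tokenizer_name option added by this patch.
import evaluate

bleu = evaluate.load("bleu")

predictions = ["opacity, consolidation, pleural effusion, and atelectasis are present."]
references = [["opacity, consolidation, pleural effusion, and pneumonia are present."]]

# Default behaviour (tokenizer_13a) is unchanged when tokenizer_name is omitted.
default_result = bleu.compute(predictions=predictions, references=references)

# "whitespace" splits on whitespace only; "coco"/"ptb" route the text through
# pycocoevalcap's PTBTokenizer before splitting, as in COCO caption evaluation.
coco_result = bleu.compute(
    predictions=predictions, references=references, tokenizer_name="coco"
)

print(round(default_result["bleu"], 6), round(coco_result["bleu"], 6))
```

Passing an explicit `tokenizer` callable keeps working exactly as before; `tokenizer_name` simply swaps in one of the two built-in tokenizers added by this patch.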