huggingface · bavi404 · Aug 13, 2025
diff --git a/metrics/bleu/README.md b/metrics/bleu/README.md
@@ -50,9 +50,12 @@ This metric takes as input a list of predicted sentences and a list of lists of
 ### Inputs
 - **predictions** (`list` of `str`s): Translations to score.
 - **references** (`list` of `list`s of `str`s): references for each translation.
-- ** tokenizer** : approach used for standardizing `predictions` and `references`.
+ - ** tokenizer** : approach used for standardizing `predictions` and `references`.
     The default tokenizer is `tokenizer_13a`, a relatively minimal tokenization approach that is however equivalent to `mteval-v13a`, used by WMT.
     This can be replaced by another tokenizer from a source such as [SacreBLEU](https://github.com/mjpost/sacrebleu/tree/master/sacrebleu/tokenizers).
+     You can alternatively pass `tokenizer_name` to select built-ins:
+     - `"whitespace"`: simple whitespace split
+     - `"coco"`/`"ptb"`: COCO caption PTBTokenizer (requires `pycocoevalcap`)
 
 The default tokenizer is based on whitespace and regexes. It can be replaced by any function that takes a string as input and returns a list of tokens as output. E.g. `word_tokenize()` from [NLTK](https://www.nltk.org/api/nltk.tokenize.html) or pretrained tokenizers from the [Tokenizers library](https://huggingface.co/docs/tokenizers/index).
 - **max_order** (`int`): Maximum n-gram order to use when computing BLEU score. Defaults to `4`.
@@ -122,6 +125,16 @@ Example with the word tokenizer from NLTK:
 ...     [["foo bar foobar"]]
 ... ]
 >>> results = bleu.compute(predictions=predictions, references=references, tokenizer=word_tokenize)
+
+Example matching COCO/PTB tokenization (requires `pycocoevalcap`):
+```python
+>>> bleu = evaluate.load("bleu")
+>>> predictions = ["opacity, consolidation, pleural effusion, and atelectasis are present."]
+>>> references = [["opacity, consolidation, pleural effusion, and pneumonia are present."]]
+>>> results = bleu.compute(predictions=predictions, references=references, tokenizer_name="coco")
+>>> print(round(results["bleu"], 6))
+0.594604
+```
 >>> print(results)
 {'bleu': 1.0, 'precisions': [1.0, 1.0, 1.0, 1.0], 'brevity_penalty': 1.0, 'length_ratio': 1.1666666666666667, 'translation_length': 7, 'reference_length': 6}
 ```

diff --git a/metrics/bleu/bleu.py b/metrics/bleu/bleu.py
@@ -18,7 +18,7 @@
 import evaluate
 
 from .nmt_bleu import compute_bleu  # From: https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py
-from .tokenizer_13a import Tokenizer13a
+from .tokenizer_13a import Tokenizer13a, WhitespaceTokenizer, CocoPTBTokenizer
 
 
 _CITATION = """\
@@ -112,7 +112,15 @@ def _info(self):
             ],
         )
 
-    def _compute(self, predictions, references, tokenizer=Tokenizer13a(), max_order=4, smooth=False):
+    def _compute(self, predictions, references, tokenizer=Tokenizer13a(), max_order=4, smooth=False, tokenizer_name=None):
+        # Backward-compatible alias: allow passing tokenizer_name="whitespace" to mimic COCO/PTB tokenization splitting behavior
+        if tokenizer_name is not None:
+            if tokenizer_name == "whitespace":
+                tokenizer = WhitespaceTokenizer()
+            elif tokenizer_name in {"coco", "ptb", "coco-ptb"}:
+                tokenizer = CocoPTBTokenizer()
+            else:
+                raise ValueError("Unsupported tokenizer_name: {}".format(tokenizer_name))
         # if only one reference is provided make sure we still use list of lists
         if isinstance(references[0], str):
             references = [[ref] for ref in references]

diff --git a/metrics/bleu/tokenizer_13a.py b/metrics/bleu/tokenizer_13a.py
@@ -98,3 +98,41 @@ def __call__(self, line):
             line = line.replace("&gt;", ">")
 
         return self._post_tokenizer(f" {line} ")
+
+
+class WhitespaceTokenizer(BaseTokenizer):
+    def signature(self):
+        return "whitespace"
+
+    @lru_cache(maxsize=2**16)
+    def __call__(self, line):
+        return line.split()
+
+
+class CocoPTBTokenizer(BaseTokenizer):
+    """Adapter around pycocoevalcap's PTBTokenizer to reproduce COCO caption tokenization.
+
+    This requires `pycocoevalcap` to be installed. We call into its tokenizer and
+    then split the returned detokenized string to obtain tokens, matching how COCO BLEU
+    builds n-grams (space-split over PTBTokenized text).
+    """
+
+    def signature(self):
+        return "coco-ptb"
+
+    @lru_cache(maxsize=2**16)
+    def __call__(self, line):
+        try:
+            from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer  # type: ignore
+        except Exception as exc:  # pragma: no cover - import error path
+            raise ImportError(
+                "To use tokenizer_name='coco' (PTBTokenizer), install pycocoevalcap."
+            ) from exc
+
+        tokenizer = PTBTokenizer()
+        # The PTBTokenizer expects a dict of id -> list of dicts with 'caption'
+        input_dict = {0: [{"caption": line}]}
+        tokenized = tokenizer.tokenize(input_dict)
+        # Retrieve tokenized string and split on whitespace to get tokens
+        tokenized_str = tokenized[0][0]["caption"]
+        return tokenized_str.split()
diff --git a/tests/test_bleu_coco_tokenization.py b/tests/test_bleu_coco_tokenization.py
@@ -0,0 +1,39 @@
+import importlib
+
+import pytest
+
+import evaluate
+
+
+def require_pycoco():
+    return importlib.util.find_spec("pycocoevalcap") is not None
+
+
+@pytest.mark.skipif(not require_pycoco(), reason="pycocoevalcap not installed")
+def test_bleu_coco_tokenizer_matches_reported_example():
+    bleu = evaluate.load("bleu")
+
+    preds = ["opacity, consolidation, pleural effusion, and atelectasis are present."]
+    refs = ["opacity, consolidation, pleural effusion, and pneumonia are present."]
+
+    # evaluate with COCO/PTB tokenization
+    res_coco = bleu.compute(predictions=preds, references=refs, tokenizer_name="coco")
+    # evaluate default tokenizer to ensure different score
+    res_default = bleu.compute(predictions=preds, references=refs)
+
+    assert res_coco["bleu"] != pytest.approx(res_default["bleu"])  # ensure difference exists
+    # Expected around 0.5946035573 vs ~0.70 for default/period example
+    assert res_coco["bleu"] == pytest.approx(0.5946035573, rel=1e-6, abs=1e-6)
+
+
+@pytest.mark.skipif(not require_pycoco(), reason="pycocoevalcap not installed")
+def test_bleu_coco_tokenizer_period_case():
+    bleu = evaluate.load("bleu")
+
+    preds = ["opacity . consolidation . pleural effusion . atelectasis are present ."]
+    refs = ["opacity . consolidation . pleural effusion . pneumonia are present ."]
+
+    res_coco = bleu.compute(predictions=preds, references=refs, tokenizer_name="coco")
+    assert res_coco["bleu"] == pytest.approx(0.7016879389890388, rel=1e-6, abs=1e-6)
+
+