[breaking|pipelines|tokenizers] Adding slow-fast tokenizers equivalence tests pipelines - Removing sentencepiece as a required dependency (huggingface#8073)

* Fixing roberta for slow-fast tests

* WIP getting equivalence on pipelines

* slow-to-fast equivalence - working on question-answering pipeline

* optional FAISS tests

* Pipeline Q&A

* Move pipeline tests to their own test job again

* update tokenizer to add sequence id methods

* update to tokenizers 0.9.4

* set sentencepiece as optional

* clean up squad

* clean up pipelines to use sequence_ids

* style/quality

* wording

* Switch to use_fast = True by default

* update tests for use_fast at True by default

* fix rag tokenizer test

* removing protobuf from required dependencies

* fix NER test for use_fast = True by default

* fixing example tests (Q&A examples use slow tokenizers for now)

* protobuf in main deps extras["sentencepiece"] and example deps

* fix protobuf install test

* try to fix seq2seq by switching to slow tokenizers for now

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2 people authored and Zhylkaaa committed Nov 17, 2020
1 parent e21232c commit 0355ad2
Showing 23 changed files with 690 additions and 263 deletions.
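The user-facing headline of this commit is that tokenizers become fast (Rust-backed) by default. A minimal sketch of that behavioral change, assuming a checkout of this branch and using bert-base-uncased purely as an illustrative model name:

from transformers import AutoTokenizer, PreTrainedTokenizerFast

# With this commit, use_fast defaults to True, so AutoTokenizer returns the fast tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
assert isinstance(tokenizer, PreTrainedTokenizerFast)

# Passing use_fast=False restores the previous slow, pure-Python tokenizer.
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
assert not isinstance(slow_tokenizer, PreTrainedTokenizerFast)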
6 changes: 5 additions & 1 deletion examples/question-answering/run_squad.py
@@ -736,6 +736,7 @@ def main():
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
do_lower_case=args.do_lower_case,
cache_dir=args.cache_dir if args.cache_dir else None,
use_fast=False, # SquadDataset is not compatible with Fast tokenizers, which have smarter overflow handling
)
model = AutoModelForQuestionAnswering.from_pretrained(
args.model_name_or_path,
@@ -784,7 +785,10 @@ def main():

# Load a trained model and vocabulary that you have fine-tuned
model = AutoModelForQuestionAnswering.from_pretrained(args.output_dir) # , force_download=True)
tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)

# SquadDataset is not compatible with Fast tokenizers, which have smarter overflow handling,
# so we use use_fast=False here for now, until fast-tokenizer-compatible examples are out
tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case, use_fast=False)
model.to(args.device)

# Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
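For context on the use_fast=False pins above, a rough sketch of the overflow behavior that makes fast tokenizers incompatible with the legacy SquadDataset code, with placeholder question/context strings and bert-base-uncased assumed only as an example checkpoint:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # fast by default after this commit
enc = tok(
    "Who wrote the report?",                   # question
    "A very long context sentence. " * 500,    # context far longer than max_length
    truncation="only_second",
    max_length=384,
    stride=128,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)
# A single question/context pair comes back as several overlapping features,
# whereas the legacy SquadDataset code expects exactly one truncated span.
print(len(enc["input_ids"]))  # > 1 once the context overflows max_length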
1 change: 1 addition & 0 deletions examples/question-answering/run_squad_trainer.py
@@ -114,6 +114,7 @@ def main():
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=False, # SquadDataset is not compatible with Fast tokenizers, which have smarter overflow handling
)
model = AutoModelForQuestionAnswering.from_pretrained(
model_args.model_name_or_path,
1 change: 1 addition & 0 deletions examples/requirements.txt
@@ -18,3 +18,4 @@ fire
pytest
conllu
sentencepiece != 0.1.92
protobuf
2 changes: 1 addition & 1 deletion examples/seq2seq/test_datasets.py
@@ -197,7 +197,7 @@ def test_distributed_sortish_sampler_splits_indices_between_procs(self):
)
@require_torch_non_multi_gpu_but_fix_me
def test_dataset_kwargs(self, tok_name):
tokenizer = AutoTokenizer.from_pretrained(tok_name)
tokenizer = AutoTokenizer.from_pretrained(tok_name, use_fast=False)
if tok_name == MBART_TINY:
train_dataset = Seq2SeqDataset(
tokenizer,
9 changes: 3 additions & 6 deletions setup.py
@@ -96,13 +96,13 @@
extras["retrieval"] = ["faiss-cpu", "datasets"]
extras["flax"] = ["jaxlib==0.1.55", "jax>=0.2.0", "flax==0.2.2"]

extras["tokenizers"] = ["tokenizers==0.9.2"]
extras["tokenizers"] = ["tokenizers==0.9.4"]
extras["onnxruntime"] = ["onnxruntime>=1.4.0", "onnxruntime-tools>=1.4.2"]
extras["modelcreation"] = ["cookiecutter==1.7.2"]

extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"]

extras["sentencepiece"] = ["sentencepiece==0.1.91"]
extras["sentencepiece"] = ["sentencepiece==0.1.91", "protobuf"]
extras["retrieval"] = ["faiss-cpu", "datasets"]
extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil"] + extras["retrieval"] + extras["modelcreation"]
# sphinx-rtd-theme==0.5.0 introduced big changes in the style.
@@ -130,7 +130,7 @@
packages=find_packages("src"),
install_requires=[
"numpy",
"tokenizers == 0.9.3",
"tokenizers == 0.9.4",
# dataclasses for Python versions that don't have it
"dataclasses;python_version<'3.7'",
# utilities from PyPA to e.g. compare versions
@@ -143,9 +143,6 @@
"tqdm >= 4.27",
# for OpenAI GPT
"regex != 2019.12.17",
# for SentencePiece models
"sentencepiece == 0.1.91",
"protobuf",
# for XLM
"sacremoses",
],
20 changes: 9 additions & 11 deletions src/transformers/convert_slow_tokenizer.py
@@ -24,10 +24,7 @@
from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
from tokenizers.models import BPE, Unigram, WordPiece

# from transformers.tokenization_openai import OpenAIGPTTokenizer
from transformers.utils import sentencepiece_model_pb2 as model

from .file_utils import requires_sentencepiece
from .file_utils import requires_protobuf, requires_sentencepiece


class SentencePieceExtractor:
@@ -64,12 +61,6 @@ def check_number_comma(piece: str) -> bool:
return len(piece) < 2 or piece[-1] != "," or not piece[-2].isdigit()


def get_proto(filename: str):
m = model.ModelProto()
m.ParseFromString(open(filename, "rb").read())
return m


class Converter:
def __init__(self, original_tokenizer):
self.original_tokenizer = original_tokenizer
@@ -292,8 +283,15 @@ def converted(self) -> Tokenizer:

class SpmConverter(Converter):
def __init__(self, *args):
requires_protobuf(self)

super().__init__(*args)
self.proto = get_proto(self.original_tokenizer.vocab_file)

from .utils import sentencepiece_model_pb2 as model_pb2

m = model_pb2.ModelProto()
m.ParseFromString(open(self.original_tokenizer.vocab_file, "rb").read())
self.proto = m

def vocab(self, proto):
return [(piece.piece, piece.score) for piece in proto.pieces]
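With get_proto removed and the protobuf import made lazy, SpmConverter only needs protobuf at conversion time. A hedged sketch of exercising the converter (assuming both the sentencepiece and protobuf dependencies are installed; t5-small is just one example of an SPM-based slow tokenizer):

from transformers import T5Tokenizer
from transformers.convert_slow_tokenizer import convert_slow_tokenizer

slow = T5Tokenizer.from_pretrained("t5-small")   # requires sentencepiece
fast_backend = convert_slow_tokenizer(slow)      # SpmConverter parses the .model file via protobuf here
print(fast_backend.get_vocab_size())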
6 changes: 5 additions & 1 deletion src/transformers/data/processors/squad.py
@@ -8,7 +8,7 @@

from ...file_utils import is_tf_available, is_torch_available
from ...tokenization_bert import whitespace_tokenize
from ...tokenization_utils_base import PreTrainedTokenizerBase, TruncationStrategy
from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
from ...utils import logging
from .utils import DataProcessor

@@ -765,6 +765,7 @@ class SquadFeatures:
token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
start_position: start of the answer token index
end_position: end of the answer token index
encoding: optionally stores the BatchEncoding with the fast-tokenizer alignment methods.
"""

def __init__(
@@ -784,6 +785,7 @@ def __init__(
end_position,
is_impossible,
qas_id: str = None,
encoding: BatchEncoding = None,
):
self.input_ids = input_ids
self.attention_mask = attention_mask
@@ -803,6 +805,8 @@ def __init__(
self.is_impossible = is_impossible
self.qas_id = qas_id

self.encoding = encoding


class SquadResult:
"""
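The new encoding field keeps the full BatchEncoding around so QA post-processing can rely on the fast-tokenizer alignment helpers (the sequence_ids mentioned in the commit message) instead of hand-built token_to_orig maps. A small sketch of what those helpers expose, with placeholder question/context text:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
enc = tok("Where is the office?", "The office is in Paris.", return_offsets_mapping=True)

# sequence_ids() marks special tokens as None, question tokens as 0 and context tokens as 1,
# which is how the reworked QA code separates the two segments.
print(enc.sequence_ids())
# offset_mapping maps each token back to character positions in the original strings.
print(enc["offset_mapping"])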
27 changes: 27 additions & 0 deletions src/transformers/file_utils.py
@@ -185,6 +185,15 @@
_sentencepiece_available = False


try:
import google.protobuf # noqa: F401

_protobuf_available = True

except ImportError:
_protobuf_available = False


try:
import tokenizers # noqa: F401

@@ -270,6 +279,10 @@ def is_sentencepiece_available():
return _sentencepiece_available


def is_protobuf_available():
return _protobuf_available


def is_tokenizers_available():
return _tokenizers_available

@@ -330,6 +343,14 @@ def wrapper(*args, **kwargs):
"""


# docstyle-ignore
PROTOBUF_IMPORT_ERROR = """
{0} requires the protobuf library but it was not found in your environment. Check out the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment.
"""


# docstyle-ignore
FAISS_IMPORT_ERROR = """
{0} requires the faiss library but it was not found in your environment. Checkout the instructions on the
@@ -420,6 +441,12 @@ def requires_sentencepiece(obj):
raise ImportError(SENTENCEPIECE_IMPORT_ERROR.format(name))


def requires_protobuf(obj):
name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
if not is_protobuf_available():
raise ImportError(PROTOBUF_IMPORT_ERROR.format(name))


def add_start_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
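The new protobuf helpers mirror the existing sentencepiece guards, so an optional-dependency check stays a one-liner at the point of use. A sketch of the intended pattern, with a hypothetical consumer class:

from transformers.file_utils import is_protobuf_available, requires_protobuf

class SpmProtoReader:
    """Hypothetical consumer that needs to parse a sentencepiece ModelProto."""

    def __init__(self):
        # Raises ImportError with PROTOBUF_IMPORT_ERROR if protobuf is not installed.
        requires_protobuf(self)

if is_protobuf_available():
    reader = SpmProtoReader()
else:
    print("protobuf missing: install it directly or via the sentencepiece extra.")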