[breaking|pipelines|tokenizers] Adding slow-fast tokenizers equivalence tests pipelines - Removing sentencepiece as a required dependency (huggingface#8073)

* Fixing roberta for slow-fast tests

* WIP getting equivalence on pipelines

* slow-to-fast equivalence - working on question-answering pipeline

* optional FAISS tests

* Pipeline Q&A

* Move pipeline tests to their own test job again

* update tokenizer to add sequence id methods

* update to tokenizers 0.9.4

* set sentencepiece as optional

* clean up squad

* clean up pipelines to use sequence_ids

* style/quality

* wording

* Switch to use_fast = True by default

* update tests for use_fast at True by default

* fix rag tokenizer test

* removing protobuf from required dependencies

* fix NER test for use_fast = True by default

* fixing example tests (Q&A examples use slow tokenizers for now)

* protobuf in main deps extras["sentencepiece"] and example deps

* fix protobuf install test

* try to fix seq2seq by switching to slow tokenizers for now

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2 people authored and Zhylkaaa committed Nov 17, 2020
1 parent e21232c commit 0355ad2
Showing 23 changed files with 690 additions and 263 deletions.
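The user-facing headline of this commit is that tokenizers become fast (Rust-backed) by default. A minimal sketch of that behavioral change, assuming a checkout of this branch and using bert-base-uncased purely as an illustrative model name:

from transformers import AutoTokenizer, PreTrainedTokenizerFast

# With this commit, use_fast defaults to True, so AutoTokenizer returns the fast tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
assert isinstance(tokenizer, PreTrainedTokenizerFast)

# Passing use_fast=False restores the previous slow, pure-Python tokenizer.
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
assert not isinstance(slow_tokenizer, PreTrainedTokenizerFast)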
6 changes: 5 additions & 1 deletion examples/question-answering/run_squad.py
@@ -736,6 +736,7 @@ def main():
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
do_lower_case=args.do_lower_case,
cache_dir=args.cache_dir if args.cache_dir else None,
use_fast=False, # SquadDataset is not compatible with Fast tokenizers, which have smarter overflow handling
)
model = AutoModelForQuestionAnswering.from_pretrained(
args.model_name_or_path,
@@ -784,7 +785,10 @@ def main():

# Load a trained model and vocabulary that you have fine-tuned
model = AutoModelForQuestionAnswering.from_pretrained(args.output_dir) # , force_download=True)
tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)

# SquadDataset is not compatible with Fast tokenizers, which have smarter overflow handling,
# so we use use_fast=False here for now, until fast-tokenizer-compatible examples are out
tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case, use_fast=False)
model.to(args.device)

# Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
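For context on the use_fast=False pins above, a rough sketch of the overflow behavior that makes fast tokenizers incompatible with the legacy SquadDataset code, with placeholder question/context strings and bert-base-uncased assumed only as an example checkpoint:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # fast by default after this commit
enc = tok(
    "Who wrote the report?",                   # question
    "A very long context sentence. " * 500,    # context far longer than max_length
    truncation="only_second",
    max_length=384,
    stride=128,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)
# A single question/context pair comes back as several overlapping features,
# whereas the legacy SquadDataset code expects exactly one truncated span.
print(len(enc["input_ids"]))  # > 1 once the context overflows max_length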
1 change: 1 addition & 0 deletions examples/question-answering/run_squad_trainer.py
@@ -114,6 +114,7 @@ def main():
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=False, # SquadDataset is not compatible with Fast tokenizers, which have smarter overflow handling
)
model = AutoModelForQuestionAnswering.from_pretrained(
model_args.model_name_or_path,
1 change: 1 addition & 0 deletions examples/requirements.txt
@@ -18,3 +18,4 @@ fire
pytest
conllu
sentencepiece != 0.1.92
protobuf
2 changes: 1 addition & 1 deletion examples/seq2seq/test_datasets.py
@@ -197,7 +197,7 @@ def test_distributed_sortish_sampler_splits_indices_between_procs(self):
)
@require_torch_non_multi_gpu_but_fix_me
def test_dataset_kwargs(self, tok_name):
tokenizer = AutoTokenizer.from_pretrained(tok_name)
tokenizer = AutoTokenizer.from_pretrained(tok_name, use_fast=False)
if tok_name == MBART_TINY:
train_dataset = Seq2SeqDataset(
tokenizer,
9 changes: 3 additions & 6 deletions setup.py
@@ -96,13 +96,13 @@
extras["retrieval"] = ["faiss-cpu", "datasets"]
extras["flax"] = ["jaxlib==0.1.55", "jax>=0.2.0", "flax==0.2.2"]

extras["tokenizers"] = ["tokenizers==0.9.2"]
extras["tokenizers"] = ["tokenizers==0.9.4"]
extras["onnxruntime"] = ["onnxruntime>=1.4.0", "onnxruntime-tools>=1.4.2"]
extras["modelcreation"] = ["cookiecutter==1.7.2"]

extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"]

extras["sentencepiece"] = ["sentencepiece==0.1.91"]
extras["sentencepiece"] = ["sentencepiece==0.1.91", "protobuf"]
extras["retrieval"] = ["faiss-cpu", "datasets"]
extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil"] + extras["retrieval"] + extras["modelcreation"]
# sphinx-rtd-theme==0.5.0 introduced big changes in the style.
@@ -130,7 +130,7 @@
packages=find_packages("src"),
install_requires=[
"numpy",
"tokenizers == 0.9.3",
"tokenizers == 0.9.4",
# dataclasses for Python versions that don't have it
"dataclasses;python_version<'3.7'",
# utilities from PyPA to e.g. compare versions
@@ -143,9 +143,6 @@
"tqdm >= 4.27",
# for OpenAI GPT
"regex != 2019.12.17",
# for SentencePiece models
"sentencepiece == 0.1.91",
"protobuf",
# for XLM
"sacremoses",
],
20 changes: 9 additions & 11 deletions src/transformers/convert_slow_tokenizer.py
@@ -24,10 +24,7 @@
from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
from tokenizers.models import BPE, Unigram, WordPiece

# from transformers.tokenization_openai import OpenAIGPTTokenizer
from transformers.utils import sentencepiece_model_pb2 as model

from .file_utils import requires_sentencepiece
from .file_utils import requires_protobuf, requires_sentencepiece


class SentencePieceExtractor:
@@ -64,12 +61,6 @@ def check_number_comma(piece: str) -> bool:
return len(piece) < 2 or piece[-1] != "," or not piece[-2].isdigit()


def get_proto(filename: str):
m = model.ModelProto()
m.ParseFromString(open(filename, "rb").read())
return m


class Converter:
def __init__(self, original_tokenizer):
self.original_tokenizer = original_tokenizer
@@ -292,8 +283,15 @@ def converted(self) -> Tokenizer:

class SpmConverter(Converter):
def __init__(self, *args):
requires_protobuf(self)

super().__init__(*args)
self.proto = get_proto(self.original_tokenizer.vocab_file)

from .utils import sentencepiece_model_pb2 as model_pb2

m = model_pb2.ModelProto()
m.ParseFromString(open(self.original_tokenizer.vocab_file, "rb").read())
self.proto = m

def vocab(self, proto):
return [(piece.piece, piece.score) for piece in proto.pieces]
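With get_proto removed and the protobuf import made lazy, SpmConverter only needs protobuf at conversion time. A hedged sketch of exercising the converter (assuming both the sentencepiece and protobuf dependencies are installed; t5-small is just one example of an SPM-based slow tokenizer):

from transformers import T5Tokenizer
from transformers.convert_slow_tokenizer import convert_slow_tokenizer

slow = T5Tokenizer.from_pretrained("t5-small")   # requires sentencepiece
fast_backend = convert_slow_tokenizer(slow)      # SpmConverter parses the .model file via protobuf here
print(fast_backend.get_vocab_size())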
6 changes: 5 additions & 1 deletion src/transformers/data/processors/squad.py
@@ -8,7 +8,7 @@

from ...file_utils import is_tf_available, is_torch_available
from ...tokenization_bert import whitespace_tokenize
from ...tokenization_utils_base import PreTrainedTokenizerBase, TruncationStrategy
from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
from ...utils import logging
from .utils import DataProcessor

@@ -765,6 +765,7 @@ class SquadFeatures:
token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
start_position: start of the answer token index
end_position: end of the answer token index
encoding: optionally stores the BatchEncoding with the fast-tokenizer alignment methods.
"""

def __init__(
@@ -784,6 +785,7 @@ def __init__(
end_position,
is_impossible,
qas_id: str = None,
encoding: BatchEncoding = None,
):
self.input_ids = input_ids
self.attention_mask = attention_mask
@@ -803,6 +805,8 @@ def __init__(
self.is_impossible = is_impossible
self.qas_id = qas_id

self.encoding = encoding


class SquadResult:
"""
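The new encoding field keeps the full BatchEncoding around so QA post-processing can rely on the fast-tokenizer alignment helpers (the sequence_ids mentioned in the commit message) instead of hand-built token_to_orig maps. A small sketch of what those helpers expose, with placeholder question/context text:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
enc = tok("Where is the office?", "The office is in Paris.", return_offsets_mapping=True)

# sequence_ids() marks special tokens as None, question tokens as 0 and context tokens as 1,
# which is how the reworked QA code separates the two segments.
print(enc.sequence_ids())
# offset_mapping maps each token back to character positions in the original strings.
print(enc["offset_mapping"])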
27 changes: 27 additions & 0 deletions src/transformers/file_utils.py
@@ -185,6 +185,15 @@
_sentencepiece_available = False


try:
import google.protobuf # noqa: F401

_protobuf_available = True

except ImportError:
_protobuf_available = False


try:
import tokenizers # noqa: F401

@@ -270,6 +279,10 @@ def is_sentencepiece_available():
return _sentencepiece_available


def is_protobuf_available():
return _protobuf_available


def is_tokenizers_available():
return _tokenizers_available

@@ -330,6 +343,14 @@ def wrapper(*args, **kwargs):
"""


# docstyle-ignore
PROTOBUF_IMPORT_ERROR = """
{0} requires the protobuf library but it was not found in your environment. Check out the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment.
"""


# docstyle-ignore
FAISS_IMPORT_ERROR = """
{0} requires the faiss library but it was not found in your environment. Checkout the instructions on the
@@ -420,6 +441,12 @@ def requires_sentencepiece(obj):
raise ImportError(SENTENCEPIECE_IMPORT_ERROR.format(name))


def requires_protobuf(obj):
name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
if not is_protobuf_available():
raise ImportError(PROTOBUF_IMPORT_ERROR.format(name))


def add_start_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
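The new protobuf helpers mirror the existing sentencepiece guards, so an optional-dependency check stays a one-liner at the point of use. A sketch of the intended pattern, with a hypothetical consumer class:

from transformers.file_utils import is_protobuf_available, requires_protobuf

class SpmProtoReader:
    """Hypothetical consumer that needs to parse a sentencepiece ModelProto."""

    def __init__(self):
        # Raises ImportError with PROTOBUF_IMPORT_ERROR if protobuf is not installed.
        requires_protobuf(self)

if is_protobuf_available():
    reader = SpmProtoReader()
else:
    print("protobuf missing: install it directly or via the sentencepiece extra.")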