Skip to content

Commit

Permalink
Reformat to make code clearer in tokenizer call (#11497)
Browse files Browse the repository at this point in the history
* Reformat to make code clearer

* Reformat to make code clearer
  • Loading branch information
sgugger authored Apr 29, 2021
1 parent f748bd4 commit ad1f7be
Showing 1 changed file with 32 additions and 37 deletions.
69 changes: 32 additions & 37 deletions src/transformers/tokenization_utils_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2236,47 +2236,42 @@ def __call__(
:obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
"""
# Input type checking for clearer error
assert isinstance(text, str) or (
isinstance(text, (list, tuple))
and (
len(text) == 0
or (
isinstance(text[0], str)
or (isinstance(text[0], (list, tuple)) and (len(text[0]) == 0 or isinstance(text[0][0], str)))
)
)
), (
"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
"or `List[List[str]]` (batch of pretokenized examples)."
)
def _is_valid_text_input(t):
if isinstance(t, str):
# Strings are fine
return True
elif isinstance(t, (list, tuple)):
# List are fine as long as they are...
if len(t) == 0:
# ... empty
return True
elif isinstance(t[0], str):
# ... list of strings
return True
elif isinstance(t[0], (list, tuple)):
# ... list with an empty list or with a list of strings
return len(t[0]) == 0 or isinstance(t[0][0], str)
else:
return False
else:
return False

assert (
text_pair is None
or isinstance(text_pair, str)
or (
isinstance(text_pair, (list, tuple))
and (
len(text_pair) == 0
or (
isinstance(text_pair[0], str)
or (
isinstance(text_pair[0], (list, tuple))
and (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str))
)
)
)
if not _is_valid_text_input(text):
raise ValueError(
"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
"or `List[List[str]]` (batch of pretokenized examples)."
)
), (
"text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
"or `List[List[str]]` (batch of pretokenized examples)."
)

is_batched = bool(
(not is_split_into_words and isinstance(text, (list, tuple)))
or (
is_split_into_words and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
if text_pair is not None and not _is_valid_text_input(text_pair):
raise ValueError(
"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
"or `List[List[str]]` (batch of pretokenized examples)."
)
)

if is_split_into_words:
is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
else:
is_batched = isinstance(text, (list, tuple))

if is_batched:
if isinstance(text_pair, str):
Expand Down

0 comments on commit ad1f7be

Please sign in to comment.