From 6715e3b6a14e58fe535a58cdc41d87ca81254656 Mon Sep 17 00:00:00 2001
From: Kostas Stathoulopoulos
Date: Mon, 26 Apr 2021 16:29:36 +0100
Subject: [PATCH] Clarify description of the is_split_into_words argument
 (#11449)

* Improve documentation for is_split_into_words argument

* Change description wording
---
 src/transformers/models/tapas/tokenization_tapas.py | 5 +++--
 src/transformers/tokenization_utils.py              | 4 +++-
 src/transformers/tokenization_utils_base.py         | 5 +++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py
index 076a45ccdf4c3a..23d5d99d5c5b82 100644
--- a/src/transformers/models/tapas/tokenization_tapas.py
+++ b/src/transformers/models/tapas/tokenization_tapas.py
@@ -172,8 +172,9 @@ def whitespace_tokenize(text):
             length is required by one of the truncation/padding parameters. If the model has no specific maximum
             input length (like XLNet) truncation/padding to a maximum length will be deactivated.
         is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not the input is already pre-tokenized (e.g., split into words), in which case the tokenizer
-            will skip the pre-tokenization step. This is useful for NER or token classification.
+            Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`,
+            the tokenizer assumes the input is already split into words (for instance, by splitting it on
+            whitespace) which it will tokenize. This is useful for NER or token classification.
         pad_to_multiple_of (:obj:`int`, `optional`):
             If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
             the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index fafe8a5597b67d..3ae7affdb3a7a6 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -643,7 +643,9 @@ def prepare_for_tokenization(
             text (:obj:`str`):
                 The text to prepare.
             is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the text has been pretokenized.
+                Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`,
+                the tokenizer assumes the input is already split into words (for instance, by splitting it on
+                whitespace) which it will tokenize. This is useful for NER or token classification.
             kwargs:
                 Keyword arguments to use for the tokenization.

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index fb69674081822c..cb0a99cd2fc434 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1286,8 +1286,9 @@ def all_special_ids(self) -> List[int]:
             returned to provide some overlap between truncated and overflowing sequences. The value of this
             argument defines the number of overlapping tokens.
         is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not the input is already pre-tokenized (e.g., split into words), in which case the tokenizer
-            will skip the pre-tokenization step. This is useful for NER or token classification.
+            Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`,
+            the tokenizer assumes the input is already split into words (for instance, by splitting it on
+            whitespace) which it will tokenize. This is useful for NER or token classification.
         pad_to_multiple_of (:obj:`int`, `optional`):
             If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
             the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
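As a quick illustration of the behavior the clarified docstring describes, the following minimal sketch (not part of the patch) calls a tokenizer on pre-tokenized input with is_split_into_words=True; the bert-base-uncased checkpoint and the example sentence are arbitrary assumptions, not something the patch prescribes:

# Sketch of the documented behavior: with is_split_into_words=True the
# tokenizer treats each list element as one word and still runs its own
# subword tokenization on each word, rather than skipping tokenization.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Pre-tokenized input, e.g. from a NER dataset where labels are per word.
words = ["HuggingFace", "is", "based", "in", "NYC"]

encoding = tokenizer(words, is_split_into_words=True)

# A single word may map to several subword tokens; with a fast tokenizer,
# word_ids() recovers the word-to-token alignment, which is what makes this
# mode useful for NER and other token classification tasks.
print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))
print(encoding.word_ids())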