
Commit

[Tokenizer] Fix TokenizerFast missing clean_up_tokenization_spaces (P…
dynamicheart authored and lvdongyi committed Oct 23, 2024
1 parent 35c2c21 commit 6ea6847
Showing 1 changed file with 1 addition and 1 deletion.
paddlenlp/transformers/tokenizer_utils_base.py
@@ -1389,7 +1389,7 @@ def __init__(self, **kwargs):
         self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
 
         # By default, cleaning tokenization spaces for both fast and slow tokenizers
-        self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True)
+        self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False)
 
         # By default, do not split special tokens for both fast and slow tokenizers
         self.split_special_tokens = kwargs.pop("split_special_tokens", False)
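For context, a minimal sketch of what this default controls. When clean_up_tokenization_spaces is enabled, decode() strips the artificial spaces that tokenization inserts before punctuation and contractions; with this commit the flag defaults to False, so decoded text is returned untouched unless the caller opts in. The helper below mirrors the clean-up rules commonly used by Hugging Face-style tokenizers and is an assumption about the shape of PaddleNLP's implementation, not code taken from this diff.

    # Sketch of the clean-up step gated by clean_up_tokenization_spaces.
    # Replacement rules mirror the typical clean_up_tokenization() helper in
    # Hugging Face-style tokenizers; assumed, not taken from this commit.
    def clean_up_tokenization(out_string: str) -> str:
        """Remove spaces that tokenization inserted before punctuation/contractions."""
        return (
            out_string.replace(" .", ".")
            .replace(" ?", "?")
            .replace(" !", "!")
            .replace(" ,", ",")
            .replace(" ' ", "'")
            .replace(" n't", "n't")
            .replace(" 'm", "'m")
            .replace(" 's", "'s")
            .replace(" 've", "'ve")
            .replace(" 're", "'re")
        )

    raw = "hello , world ! it 's a test ."
    print(clean_up_tokenization(raw))  # old default (True):  "hello, world! it's a test."
    print(raw)                         # new default (False): decoded text left as-is

Callers who depended on the old behavior can still request it per call, e.g. tokenizer.decode(ids, clean_up_tokenization_spaces=True), or pass the kwarg at construction time, since __init__ still pops it from kwargs.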
