Commit 540317c
Change encoding to utf-8 to fix Kaggle branch test failure for PyTorch (#1367)

* Change encoding to utf-8

* Change encoding to utf-8

* Change encoding to utf-8
sampathweb authored Dec 13, 2023
1 parent 009003f commit 540317c
Showing 4 changed files with 8 additions and 8 deletions.
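
Background for the change: when open() is called without an encoding argument, Python falls back to locale.getpreferredencoding(False), which is not UTF-8 on every platform (e.g. cp1252 on Windows, ASCII under a C locale on some CI images). Tokenizer assets routinely contain non-ASCII tokens, so writes through the locale default can raise UnicodeEncodeError and reads can mis-decode. A minimal sketch of the failure mode, with a hypothetical file name:

# BPE merge rules routinely contain non-ASCII characters.
merges = ["Ġ t", "h é"]

# Locale-dependent default encoding; may raise UnicodeEncodeError on
# platforms where the default is not UTF-8.
with open("merges.txt", "w") as f:
    for merge in merges:
        f.write(f"{merge}\n")

# Explicit UTF-8 behaves identically everywhere.
with open("merges.txt", "w", encoding="utf-8") as f:
    for merge in merges:
        f.write(f"{merge}\n")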
8 changes: 4 additions & 4 deletions keras_nlp/tokenizers/byte_pair_tokenizer.py
@@ -314,9 +314,9 @@ def __init__(
     def save_assets(self, dir_path):
         vocab_path = os.path.join(dir_path, VOCAB_FILENAME)
         merges_path = os.path.join(dir_path, MERGES_FILENAME)
-        with open(vocab_path, "w") as file:
+        with open(vocab_path, "w", encoding="utf-8") as file:
             file.write(json.dumps(dict(self.vocabulary)))
-        with open(merges_path, "w") as file:
+        with open(merges_path, "w", encoding="utf-8") as file:
             for merge in self.merges:
                 file.write(f"{merge}\n")

@@ -339,7 +339,7 @@ def set_vocabulary_and_merges(self, vocabulary, merges):
             return

         if isinstance(vocabulary, str):
-            with open(vocabulary, "r") as f:
+            with open(vocabulary, "r", encoding="utf-8") as f:
                 self.vocabulary = json.load(f)
         elif isinstance(vocabulary, dict):
             self.vocabulary = vocabulary.copy()
@@ -350,7 +350,7 @@ def set_vocabulary_and_merges(self, vocabulary, merges):
f"`type(vocabulary)={type(vocabulary)}`."
)
if isinstance(merges, str):
self.merges = [bp.rstrip() for bp in open(merges)]
self.merges = [bp.rstrip() for bp in open(merges, encoding="utf-8")]
elif isinstance(merges, Iterable):
self.merges = list(merges)
else:
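
The patched paths above write the vocabulary as JSON and the merges one rule per line, then read both back with the same explicit encoding. A hedged round-trip sketch of that contract in plain Python (file names and sample data are illustrative, not the module's constants):

import json
import os
import tempfile

vocabulary = {"Ġhello": 0, "café": 1}
merges = ["Ġ h", "c a"]

dir_path = tempfile.mkdtemp()
vocab_path = os.path.join(dir_path, "vocab.json")    # illustrative name
merges_path = os.path.join(dir_path, "merges.txt")   # illustrative name

# Write both assets with an explicit encoding, as in the patch.
with open(vocab_path, "w", encoding="utf-8") as file:
    file.write(json.dumps(vocabulary))
with open(merges_path, "w", encoding="utf-8") as file:
    for merge in merges:
        file.write(f"{merge}\n")

# Read them back the same way; non-ASCII entries survive intact.
with open(vocab_path, "r", encoding="utf-8") as f:
    assert json.load(f) == vocabulary
with open(merges_path, encoding="utf-8") as f:
    assert [bp.rstrip() for bp in f] == merges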
4 changes: 2 additions & 2 deletions keras_nlp/tokenizers/word_piece_tokenizer.py
@@ -329,7 +329,7 @@ def __init__(

     def save_assets(self, dir_path):
         path = os.path.join(dir_path, VOCAB_FILENAME)
-        with open(path, "w") as file:
+        with open(path, "w", encoding="utf-8") as file:
             for token in self.vocabulary:
                 file.write(f"{token}\n")

@@ -345,7 +345,7 @@ def set_vocabulary(self, vocabulary):
             return

         if isinstance(vocabulary, str):
-            with open(vocabulary) as file:
+            with open(vocabulary, "r", encoding="utf-8") as file:
                 self.vocabulary = [line.rstrip() for line in file]
         elif isinstance(vocabulary, Iterable):
             # Make a defensive copy.
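
Unlike the BPE assets, a WordPiece vocabulary is a plain token-per-line text file, so the same encoding pin applies symmetrically to the write and the read. A small illustrative sketch (the path is hypothetical):

vocabulary = ["[PAD]", "[UNK]", "the", "süß"]

with open("vocab.txt", "w", encoding="utf-8") as file:
    for token in vocabulary:
        file.write(f"{token}\n")

with open("vocab.txt", "r", encoding="utf-8") as file:
    assert [line.rstrip() for line in file] == vocabulary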
2 changes: 1 addition & 1 deletion keras_nlp/tokenizers/word_piece_tokenizer_trainer.py
@@ -172,7 +172,7 @@ def normalize_and_split(x):
     if vocabulary_output_file is not None:
         vocab_text = "".join([line + "\n" for line in vocab])
         # Write vocab to file.
-        with open(vocabulary_output_file, "w") as vocab_file:
+        with open(vocabulary_output_file, "w", encoding="utf-8") as vocab_file:
             vocab_file.write(vocab_text)
     else:
         return vocab
2 changes: 1 addition & 1 deletion keras_nlp/tokenizers/word_piece_tokenizer_trainer_test.py
@@ -177,7 +177,7 @@ def test_output_file(self):
             reserved_tokens=[],
         )
         vocab_from_file = []
-        with open(vocab_file, "r") as f:
+        with open(vocab_file, "r", encoding="utf-8") as f:
             for line in f:
                 vocab_from_file.append(line.strip())
         self.assertAllEqual(vocab_from_file, test_output)
