Commit 540317c
Change encoding to utf-8 to fix Kaggle branch test failure for PyTorch (#1367)

* Change encoding to utf-8

* Change encoding to utf-8

* Change encoding to utf-8
sampathweb authored Dec 13, 2023
1 parent 009003f commit 540317c
Showing 4 changed files with 8 additions and 8 deletions.
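
Background for the change: when open() is called without an encoding argument, Python falls back to locale.getpreferredencoding(False), which is not UTF-8 on every platform (e.g. cp1252 on Windows, ASCII under a C locale on some CI images). Tokenizer assets routinely contain non-ASCII tokens, so writes through the locale default can raise UnicodeEncodeError and reads can mis-decode. A minimal sketch of the failure mode, with a hypothetical file name:

# BPE merge rules routinely contain non-ASCII characters.
merges = ["Ġ t", "h é"]

# Locale-dependent default encoding; may raise UnicodeEncodeError on
# platforms where the default is not UTF-8.
with open("merges.txt", "w") as f:
    for merge in merges:
        f.write(f"{merge}\n")

# Explicit UTF-8 behaves identically everywhere.
with open("merges.txt", "w", encoding="utf-8") as f:
    for merge in merges:
        f.write(f"{merge}\n")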
8 changes: 4 additions & 4 deletions keras_nlp/tokenizers/byte_pair_tokenizer.py
@@ -314,9 +314,9 @@ def __init__(
     def save_assets(self, dir_path):
         vocab_path = os.path.join(dir_path, VOCAB_FILENAME)
         merges_path = os.path.join(dir_path, MERGES_FILENAME)
-        with open(vocab_path, "w") as file:
+        with open(vocab_path, "w", encoding="utf-8") as file:
             file.write(json.dumps(dict(self.vocabulary)))
-        with open(merges_path, "w") as file:
+        with open(merges_path, "w", encoding="utf-8") as file:
             for merge in self.merges:
                 file.write(f"{merge}\n")

@@ -339,7 +339,7 @@ def set_vocabulary_and_merges(self, vocabulary, merges):
             return

         if isinstance(vocabulary, str):
-            with open(vocabulary, "r") as f:
+            with open(vocabulary, "r", encoding="utf-8") as f:
                 self.vocabulary = json.load(f)
         elif isinstance(vocabulary, dict):
             self.vocabulary = vocabulary.copy()
@@ -350,7 +350,7 @@ def set_vocabulary_and_merges(self, vocabulary, merges):
f"`type(vocabulary)={type(vocabulary)}`."
)
if isinstance(merges, str):
self.merges = [bp.rstrip() for bp in open(merges)]
self.merges = [bp.rstrip() for bp in open(merges, encoding="utf-8")]
elif isinstance(merges, Iterable):
self.merges = list(merges)
else:
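
The patched paths above write the vocabulary as JSON and the merges one rule per line, then read both back with the same explicit encoding. A hedged round-trip sketch of that contract in plain Python (file names and sample data are illustrative, not the module's constants):

import json
import os
import tempfile

vocabulary = {"Ġhello": 0, "café": 1}
merges = ["Ġ h", "c a"]

dir_path = tempfile.mkdtemp()
vocab_path = os.path.join(dir_path, "vocab.json")    # illustrative name
merges_path = os.path.join(dir_path, "merges.txt")   # illustrative name

# Write both assets with an explicit encoding, as in the patch.
with open(vocab_path, "w", encoding="utf-8") as file:
    file.write(json.dumps(vocabulary))
with open(merges_path, "w", encoding="utf-8") as file:
    for merge in merges:
        file.write(f"{merge}\n")

# Read them back the same way; non-ASCII entries survive intact.
with open(vocab_path, "r", encoding="utf-8") as f:
    assert json.load(f) == vocabulary
with open(merges_path, encoding="utf-8") as f:
    assert [bp.rstrip() for bp in f] == merges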
4 changes: 2 additions & 2 deletions keras_nlp/tokenizers/word_piece_tokenizer.py
@@ -329,7 +329,7 @@ def __init__(

     def save_assets(self, dir_path):
         path = os.path.join(dir_path, VOCAB_FILENAME)
-        with open(path, "w") as file:
+        with open(path, "w", encoding="utf-8") as file:
             for token in self.vocabulary:
                 file.write(f"{token}\n")

@@ -345,7 +345,7 @@ def set_vocabulary(self, vocabulary):
             return

         if isinstance(vocabulary, str):
-            with open(vocabulary) as file:
+            with open(vocabulary, "r", encoding="utf-8") as file:
                 self.vocabulary = [line.rstrip() for line in file]
         elif isinstance(vocabulary, Iterable):
             # Make a defensive copy.
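
Unlike the BPE assets, a WordPiece vocabulary is a plain token-per-line text file, so the same encoding pin applies symmetrically to the write and the read. A small illustrative sketch (the path is hypothetical):

vocabulary = ["[PAD]", "[UNK]", "the", "süß"]

with open("vocab.txt", "w", encoding="utf-8") as file:
    for token in vocabulary:
        file.write(f"{token}\n")

with open("vocab.txt", "r", encoding="utf-8") as file:
    assert [line.rstrip() for line in file] == vocabulary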
2 changes: 1 addition & 1 deletion keras_nlp/tokenizers/word_piece_tokenizer_trainer.py
@@ -172,7 +172,7 @@ def normalize_and_split(x):
     if vocabulary_output_file is not None:
         vocab_text = "".join([line + "\n" for line in vocab])
         # Write vocab to file.
-        with open(vocabulary_output_file, "w") as vocab_file:
+        with open(vocabulary_output_file, "w", encoding="utf-8") as vocab_file:
             vocab_file.write(vocab_text)
     else:
         return vocab
2 changes: 1 addition & 1 deletion keras_nlp/tokenizers/word_piece_tokenizer_trainer_test.py
@@ -177,7 +177,7 @@ def test_output_file(self):
             reserved_tokens=[],
         )
         vocab_from_file = []
-        with open(vocab_file, "r") as f:
+        with open(vocab_file, "r", encoding="utf-8") as f:
             for line in f:
                 vocab_from_file.append(line.strip())
         self.assertAllEqual(vocab_from_file, test_output)
