
Commit 233fc1c

Minor improvements in GPT2 tokenizer (#3567)
* Fixing minor bugs in bpe_gpt2_preprocess
* Don't add bos token in test
1 parent c5b4936 commit 233fc1c

5 files changed: +17, -20 lines

llama.cpp (+4, -5)
@@ -6342,7 +6342,6 @@ struct llm_tokenizer_bpe {
     for (int i = 0; i < (int)text_utf.size(); i++) {
         const std::string & utf_char = text_utf[i];
         bool split_condition = false;
-        // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
         int bytes_remain = text_utf.size() - i;
         // forward backward lookups
         const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
@@ -6368,9 +6367,9 @@ struct llm_tokenizer_bpe {
         if (!split_condition && bytes_remain >= 3) {
             // 're|'ve|'ll
             if (utf_char == "\'" && (
-                (utf_char_next == "r" || utf_char_next_next == "e") ||
-                (utf_char_next == "v" || utf_char_next_next == "e") ||
-                (utf_char_next == "l" || utf_char_next_next == "l"))
+                (utf_char_next == "r" && utf_char_next_next == "e") ||
+                (utf_char_next == "v" && utf_char_next_next == "e") ||
+                (utf_char_next == "l" && utf_char_next_next == "l"))
                 ) {
                 split_condition = true;
             }
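
Note: the fix above turns || into && so a contraction suffix only matches as a whole unit. Under the old condition, a lone apostrophe with an unrelated "e" two characters later (as in "' era") would wrongly trigger the 're/'ve/'ll split. A minimal Python sketch of the intended splitting, using a simplified ASCII stand-in for the GPT-2 pre-tokenizer pattern (the real pattern uses \p{L} and \p{N} classes rather than [A-Za-z] and [0-9]):

import re

# Simplified stand-in for the GPT-2 split pattern: the contractions
# 's|'t|'re|'ve|'m|'ll|'d must match as whole units, mirroring the
# fixed && conditions above.
pat = re.compile(r"'s|'t|'re|'ve|'m|'ll|'d| ?[A-Za-z]+| ?[0-9]+| ?[^\sA-Za-z0-9]+|\s+(?!\S)|\s+")

print(pat.findall("we're"))  # ['we', "'re"]  <- 're splits off as one unit
print(pat.findall("' era"))  # ["'", ' era']  <- a lone apostrophe is not a contraction

This is exactly the case the new "' era" test below pins down.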
@@ -6421,7 +6420,7 @@ struct llm_tokenizer_bpe {
             else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
                 split_condition = true;
             }
-            else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
+            else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
                 split_condition = true;
             }
         }
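
The last hunk tightens the hand-rolled whitespace lookahead: a run of whitespace now ends early only when a letter or digit follows, not before any non-whitespace character. For comparison, in the reference GPT-2 pattern the \s+(?!\S) alternative backtracks one character so the final space or newline attaches to the following token; the same simplified sketch as above shows this:

import re

pat = re.compile(r"'s|'t|'re|'ve|'m|'ll|'d| ?[A-Za-z]+| ?[0-9]+| ?[^\sA-Za-z0-9]+|\s+(?!\S)|\s+")

# \s+(?!\S) stops one character before the next non-whitespace, so the
# trailing space/newline joins the token that follows it.
print(pat.findall("\n ="))     # ['\n', ' =']
print(pat.findall("   Hello")) # ['  ', ' Hello']

The new "\n =" test string below exercises exactly this boundary.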

tests/test-tokenizer-0-falcon.cpp (+4, -4)
@@ -36,6 +36,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
         { " Hello" , { 258, 23090, }, },
         { " Hello" , { 466, 23090, }, },
         { " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
+        { "\n =" , { 1212, 40, }, },
+        { "' era" , { 18, 4932, }, },
     };
 
     return _k_tests;
@@ -155,7 +157,7 @@ int main(int argc, char **argv) {
 
     fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
 
-    const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
+    const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
 
     fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
 
@@ -169,10 +171,8 @@ int main(int argc, char **argv) {
         }
 
         for (const auto & tok : res) {
-            ofs << tok << " ";
+            ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
         }
-
-        ofs << "\n";
     }
 
     fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
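
The add_bos argument flips from true to false here (the commit's second bullet): Falcon uses a GPT-2-style BPE tokenizer, whose Hugging Face reference does not prepend a BOS token, so the C++ side should not add one when comparing outputs. A quick way to check the reference behavior (a sketch; assumes transformers is installed and the model files are reachable, with tiiuae/falcon-7b as a representative checkpoint):

from transformers import AutoTokenizer

# Representative Falcon checkpoint; any Falcon tokenizer behaves the same way.
tok = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")
print(tok.encode("Hello world"))  # no BOS id prepended by this tokenizer family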

tests/test-tokenizer-0-falcon.py (+5, -4)
@@ -41,6 +41,8 @@
         " Hello",
         " Hello",
         " Hello\n Hello",
+        "\n =",
+        "' era",
     ]
 
 for text in tests:
@@ -69,15 +71,14 @@
 if fname_tok:
     print('tokenizing file: ', fname_tok)
     fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r') as f:
+    with open(fname_tok, 'r', encoding='utf-8') as f:
         lines = f.readlines()
     s = ''.join(lines)
     res = tokenizer.encode(s)
     # write to file
-    with open(fname_out, 'w') as f:
+    with open(fname_out, 'w', encoding='utf-8') as f:
         for x in res:
-            f.write(str(x) + ' ')
-        f.write('\n')
+            f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
     print('len(res): ', len(res))
     print('len(lines): ', len(lines))
     print('results written to: ', fname_out)
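
Two things change in this script: file I/O now pins encoding='utf-8', and the token dump decodes each id inline. The encoding matters because open() without one falls back to the platform's locale encoding (e.g. cp1252 on Windows), so non-ASCII test text can fail to read. A minimal illustration (filename hypothetical):

# Pinning utf-8 makes the read platform-independent; without it, this
# line can raise UnicodeDecodeError on Windows for non-ASCII bytes.
with open('wiki.test.raw', 'r', encoding='utf-8') as f:  # filename illustrative
    s = f.read()
print(len(s))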

tests/test-tokenizer-0-llama.cpp (+1, -3)
@@ -174,10 +174,8 @@ int main(int argc, char **argv) {
         }
 
         for (const auto & tok : res) {
-            ofs << tok << " ";
+            ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector<int>{tok}) << "'" << std::endl;
         }
-
-        ofs << "\n";
     }
 
     fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());

tests/test-tokenizer-0-llama.py (+3, -4)
@@ -81,15 +81,14 @@
 if fname_tok:
     print('tokenizing file: ', fname_tok)
     fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r') as f:
+    with open(fname_tok, 'r', encoding='utf-8') as f:
         lines = f.readlines()
     s = ''.join(lines)
     res = tokenizer.encode(s, add_bos=True)
     # write to file
-    with open(fname_out, 'w') as f:
+    with open(fname_out, 'w', encoding='utf-8') as f:
         for x in res:
-            f.write(str(x) + ' ')
-        f.write('\n')
+            f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
     print('len(res): ', len(res))
     print('len(lines): ', len(lines))
     print('results written to: ', fname_out)
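
With these changes the C++ tests and the Python reference scripts write their token dumps in the same one-token-per-line "id 'decoded piece'" format, so the .tok and .tokcpp files can be compared with a plain diff. A sketch of the resulting format (ids and pieces hypothetical):

# Hypothetical ids and decoded pieces, just to show the shared dump format.
tokens = [9856, 995]
pieces = ['Hello', ' world']
with open('out.tok', 'w', encoding='utf-8') as f:
    for t, p in zip(tokens, pieces):
        f.write(str(t) + " '" + p + "'\n")
# out.tok now reads:
# 9856 'Hello'
# 995 ' world'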
