llama : more tokenizer fixes #2810

Merged 13 commits on Aug 27, 2023
Changes from 2 commits
3 changes: 0 additions & 3 deletions examples/embedding/embedding.cpp

@@ -56,9 +56,6 @@ int main(int argc, char ** argv) {

     int n_past = 0;

-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
     // tokenize the prompt
     auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
5 changes: 0 additions & 5 deletions examples/main/main.cpp

@@ -195,11 +195,6 @@ int main(int argc, char ** argv) {
     // tokenize the prompt
     std::vector<llama_token> embd_inp;

-    if (llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM) {
-        // Add a space in front of the first character to match OG llama tokenizer behavior
-        params.prompt.insert(0, 1, ' ');
-    }
-
     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
    } else {
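Both example changes follow from the tokenizer now prepending the leading space itself (see the llama.cpp hunk below). A minimal sketch of the resulting caller pattern, assuming the same ctx, params and add_bos variables used in main.cpp (illustration only, not code from the PR):

    // before this PR, callers matched the OG LLaMA tokenizer manually:
    //     params.prompt.insert(0, 1, ' ');
    // now the plain call is enough; the leading space is added inside llama_tokenize_internal:
    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);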
21 changes: 11 additions & 10 deletions llama.cpp

@@ -1635,7 +1635,7 @@ static void llm_load_hparams(
 }

 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos);
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);

 static void llm_load_vocab(
     llama_model_loader & ml,

@@ -3026,10 +3026,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
     return vocab.token_to_id.at(buf);
 }

-static std::string llama_escape_whitespace(const std::string& text) {
-    std::string result = text;
-    replace_all(result, " ", "\xe2\x96\x81");
-    return result;
+static void llama_escape_whitespace(std::string & text) {
+    replace_all(text, " ", "\xe2\x96\x81");
 }

 static void llama_unescape_whitespace(std::string & word) {

@@ -3373,22 +3371,25 @@ struct llm_tokenizer_bpe {
     llm_bigram_bpe::queue work_queue;
 };

-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos) {
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
     std::vector<llama_vocab::id> output;

+    if (bos && vocab.special_bos_id != -1) {
+        output.push_back(vocab.special_bos_id);
+    }
+
     if (raw_text.empty()) {
         return output;
     }

-    if (bos && vocab.special_bos_id != -1) {
-        output.push_back(vocab.special_bos_id);
-    }
+    raw_text = " " + raw_text;

     switch (vocab.type) {
         case LLAMA_VOCAB_TYPE_SPM:
             {
                 llm_tokenizer_spm tokenizer(vocab);
-                tokenizer.tokenize(llama_escape_whitespace(raw_text), output);
+                llama_escape_whitespace(raw_text);
+                tokenizer.tokenize(raw_text, output);
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
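The caller-visible effect of the llama.cpp hunks can be summarized with a short sketch (illustration only, not code from the PR; it assumes a loaded ctx and the ::llama_tokenize helper used by the examples, and the token ids are taken directly from the test table below, LLaMA SPM vocab with BOS id 1):

    std::vector<llama_token> a = ::llama_tokenize(ctx, "Hello world", /*add_bos=*/false); // { 15043, 3186 }
    std::vector<llama_token> b = ::llama_tokenize(ctx, "Hello world", /*add_bos=*/true);  // { 1, 15043, 3186 }: same ids with BOS prepended
    std::vector<llama_token> c = ::llama_tokenize(ctx, "",            /*add_bos=*/true);  // { 1 }: BOS is now pushed before the empty-text early return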
86 changes: 47 additions & 39 deletions tests/test-tokenizer-0.cpp

@@ -6,7 +6,7 @@
 #include <map>
 #include <vector>

-static std::string unescape_whitespace(llama_context* ctx, const std::vector<llama_token>& tokens) {
+static std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens) {
     std::string result;
     for (size_t i = 0; i < tokens.size(); ++i) {
         result += llama_token_to_str(ctx, tokens[i]);

@@ -16,36 +16,40 @@

 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
     static std::map<std::string, std::vector<llama_token>> _k_tests = {
-        { " ",                    { 1, 259, }, },
-        { "  ",                   { 1, 1678, }, },
-        { "   ",                  { 1, 268, }, },
-        { "\t",                   { 1, 29871, 12, }, },
-        { "\n",                   { 1, 29871, 13, }, },
-        { "\t\n",                 { 1, 29871, 12, 13, }, },
-        { "Hello world",          { 1, 15043, 3186, }, },
-        { " Hello world",         { 1, 29871, 15043, 3186, }, },
-        { "Hello World",          { 1, 15043, 2787, }, },
-        { " Hello World",         { 1, 29871, 15043, 2787, }, },
-        { " Hello World!",        { 1, 29871, 15043, 2787, 29991, }, },
-        { " this is 🦙.cpp",      { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
-        { "w048 7tuijk dsdfhu",   { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
-        { "нещо на Български",    { 1, 1538, 4851, 665, 1386, 29713, 1305, }, },
-        { "កាន់តែពិសេសអាចខលចេញ",  { 1, 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161,
-                                    146, 228, 162, 133, 228, 161, 153, 228, 161, 186,
-                                    31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228,
-                                    161, 136, 228, 161, 132, 228, 161, 158, 228, 161,
-                                    136, 228, 162, 132, 228, 161, 140, }, },
+        { "",                     { }, },
+        { " ",                    { 259, }, },
+        { "  ",                   { 1678, }, },
+        { "   ",                  { 268, }, },
+        { "\t",                   { 29871, 12, }, },
+        { "\n",                   { 29871, 13, }, },
+        { "\t\n",                 { 29871, 12, 13, }, },
+        { "Hello world",          { 15043, 3186, }, },
+        { " Hello world",         { 29871, 15043, 3186, }, },
+        { "Hello World",          { 15043, 2787, }, },
+        { " Hello World",         { 29871, 15043, 2787, }, },
+        { " Hello World!",        { 29871, 15043, 2787, 29991, }, },
+        { "Hello, world!",        { 15043, 29892, 3186, 29991, }, },
+        { " Hello, world!",       { 29871, 15043, 29892, 3186, 29991, }, },
+        { " this is 🦙.cpp",      { 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
+        { "w048 7tuijk dsdfhu",   { 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
+        { "нещо на Български",    { 1538, 4851, 665, 1386, 29713, 1305, }, },
+        { "កាន់តែពិសេសអាចខលចេញ",
+          { 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161,
+            146, 228, 162, 133, 228, 161, 153, 228, 161, 186,
+            31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228,
+            161, 136, 228, 161, 132, 228, 161, 158, 228, 161,
+            136, 228, 162, 132, 228, 161, 140, }, },
         { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
-          { 1, 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871,
-            243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598,
-            313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681,
-            313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
-        { "Hello",                { 1, 15043 }, },
-        { " Hello",               { 1, 29871, 15043 }, },
-        { "  Hello",              { 1, 259, 15043 }, },
-        { "   Hello",             { 1, 1678, 15043 }, },
-        { "    Hello",            { 1, 268, 15043 }, },
-        { "    Hello\n    Hello", { 1, 268, 15043, 13, 1678, 15043 }, },
+          { 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871,
+            243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598,
+            313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681,
+            313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
+        { "Hello",                { 15043, }, },
+        { " Hello",               { 29871, 15043, }, },
+        { "  Hello",              { 259, 15043, }, },
+        { "   Hello",             { 1678, 15043, }, },
+        { "    Hello",            { 268, 15043, }, },
+        { "    Hello\n    Hello", { 268, 15043, 13, 1678, 15043, }, },
     };

     return _k_tests;

@@ -100,30 +100,34 @@ int main(int argc, char **argv) {
     bool success = true;

     for (const auto & test_kv : k_tests()) {
-        // Add a space in front of the first character to match OG llama tokenizer behavior
-        std::vector<llama_token> res = llama_tokenize(ctx, " " + test_kv.first, true);
-        fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
-            __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());
+        const std::vector<llama_token> res_bos   = llama_tokenize(ctx, test_kv.first, true);
+        const std::vector<llama_token> res_nobos = llama_tokenize(ctx, test_kv.first, false);

-        bool correct = res.size() == test_kv.second.size();
+        fprintf(stderr, "%s : '%s' tokenized to '%s'\n", __func__, test_kv.first.c_str(), llama_detokenize(ctx, res_bos).c_str());

-        for (int i = 0; i < (int) res.size() && correct; ++i) {
-            if (res[i] != test_kv.second[i]) {
+        bool correct = res_nobos.size() == test_kv.second.size() && res_bos.size() == res_nobos.size() + 1 && res_bos[0] == 1;
+
+        for (int i = 0; i < (int) res_nobos.size() && correct; ++i) {
+            if (test_kv.second[i] != res_bos[i + 1]) {
+                correct = false;
+            }
+            if (test_kv.second[i] != res_nobos[i]) {
                 correct = false;
             }
         }

         if (!correct) {
             fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
             fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                unescape_whitespace(ctx, res).c_str(), unescape_whitespace(ctx, test_kv.second).c_str());
+                llama_detokenize(ctx, res_nobos).c_str(),
+                llama_detokenize(ctx, test_kv.second).c_str());
             fprintf(stderr, "%s : expected tokens: ", __func__);
             for (const auto & t : test_kv.second) {
                 fprintf(stderr, "%6d, ", t);
             }
             fprintf(stderr, "\n");
             fprintf(stderr, "%s : got tokens: ", __func__);
-            for (const auto & t : res) {
+            for (const auto & t : res_nobos) {
                 fprintf(stderr, "%6d, ", t);
             }
             fprintf(stderr, "\n");
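The rewritten loop enforces a single invariant in two parts; restated as a small helper for clarity (a sketch with illustrative names, not code from the PR; it assumes <vector>, <algorithm> and the llama_token type are available):

    // Illustrative restatement of the check the loop above performs inline.
    static bool tokenization_matches(const std::vector<llama_token> & res_bos,
                                     const std::vector<llama_token> & res_nobos,
                                     const std::vector<llama_token> & expected) {
        if (res_nobos != expected) {
            return false; // the no-BOS result must match the expected ids
        }
        if (res_bos.size() != res_nobos.size() + 1 || res_bos[0] != 1) {
            return false; // the with-BOS result adds exactly one leading token: BOS (id 1)
        }
        return std::equal(res_nobos.begin(), res_nobos.end(), res_bos.begin() + 1);
    }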
51 changes: 51 additions & 0 deletions tests/test-tokenizer-0.py

@@ -0,0 +1,51 @@
import os
import sys
import argparse

from sentencepiece import SentencePieceProcessor

parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
args = parser.parse_args()

dir_tokenizer = args.dir_tokenizer

tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')

tests = [
    "",
    " ",
    "  ",
    "   ",
    "\t",
    "\n",
    "\t\n",
    "Hello world",
    " Hello world",
    "Hello World",
    " Hello World",
    " Hello World!",
    "Hello, world!",
    " Hello, world!",
    " this is 🦙.cpp",
    "w048 7tuijk dsdfhu",
    "нещо на Български",
    "កាន់តែពិសេសអាចខលចេញ",
    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
    "Hello",
    " Hello",
    "  Hello",
    "   Hello",
    "    Hello",
    "    Hello\n    Hello",
]


for text in tests:
    print('text: ', text)
    print('\nwith bos:')
    print(tokenizer.encode(text, add_bos=True))
    print(tokenizer.decode(tokenizer.encode(text, add_bos=True)))
    print('\nwithout bos:')
    print(tokenizer.encode(text, add_bos=False))
    print(tokenizer.decode(tokenizer.encode(text, add_bos=False)))