diff --git a/convert.py b/convert.py index e3f1096e149c4..75cfdf86e1528 100755 --- a/convert.py +++ b/convert.py @@ -231,19 +231,10 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None: def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]: tokenizer = self.sentencepiece_tokenizer for i in range(tokenizer.vocab_size()): - text: bytes - if tokenizer.is_unknown(i): - text = " \u2047 ".encode("utf-8") - elif tokenizer.is_control(i): - text = b"" - elif tokenizer.is_byte(i): - piece = tokenizer.id_to_piece(i) - if len(piece) != 6: - raise Exception(f"Invalid token: {piece}") - byte_value = int(piece[3:-1], 16) - text = struct.pack("B", byte_value) - else: - text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8") + # TODO: How do we want to support is_unknown, is_control, is_byte and is_unused(i)? + piece = tokenizer.id_to_piece(i) + text: bytes = piece.encode("utf-8") + score: float = tokenizer.get_score(i) yield text, score diff --git a/llama.cpp b/llama.cpp index 0a381afd5b726..65a279759f430 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1805,7 +1805,8 @@ struct llama_tokenizer { size_t offs = 0; while (offs < text.size()) { llama_sp_symbol sym; - size_t char_len = std::min(text.size() - offs, utf8_len(text[offs])); + assert(utf8_len(text[offs]) <= text.size() - offs); + size_t char_len = utf8_len(text[offs]); sym.text = text.c_str() + offs; sym.n = char_len; offs += char_len; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 11ec6c7252f46..d54f0fdbbe1de 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -11,5 +11,6 @@ llama_add_test(test-quantize-fns.cpp) llama_add_test(test-quantize-perf.cpp) llama_add_test(test-sampling.cpp) llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) +llama_add_test(test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) llama_add_test(test-grad0.c) # SLOW # llama_add_test(test-opt.c) # SLOW diff --git 
a/tests/test-tokenizer-1.cpp b/tests/test-tokenizer-1.cpp new file mode 100644 index 0000000000000..91464b8507de0 --- /dev/null +++ b/tests/test-tokenizer-1.cpp @@ -0,0 +1,103 @@ +#include "llama.h" + +#include <cstdio> +#include <cstring> +#include <string> +#include <codecvt> +#include <locale> +#include <vector> + +std::string detokenize(llama_context * ctx, llama_token * tokens, int count) { + std::string result; + for (int i = 0; i < count; ++i) { + result += llama_token_to_str(ctx, tokens[i]); + if (i < count - 1) { + result += "_"; + } + } + return result; + } + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]); + return 1; + } + + const std::string fname = argv[1]; + + fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + + llama_model * model; + llama_context * ctx; + + llama_backend_init(false); + + // load the vocab + { + auto lparams = llama_context_default_params(); + + lparams.vocab_only = true; + + model = llama_load_model_from_file(fname.c_str(), lparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + + ctx = llama_new_context_with_model(model, lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + llama_free_model(model); + return 1; + } + } + + const int n_vocab = llama_n_vocab(ctx); + + if (n_vocab != 32000) { + fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab); + llama_free_model(model); + llama_free(ctx); + return 2; + } + + for (int i = 0; i < n_vocab; ++i) { + const char * forward = llama_token_to_str(ctx, i); + llama_token tokens[strlen(forward)]; + auto n = llama_tokenize(ctx, forward, tokens, strlen(forward), false); + if (n == 1) { + if (i != tokens[0]) { + const char* backward = llama_token_to_str(ctx, tokens[0]); + fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns token %d %s\n", __func__, i, forward, tokens[0], backward); + } + 
} else { + if (i <= 258) { + fprintf(stderr, "%s : info: token %d is string %s and tokenize() returns tokens %s\n", __func__, i, forward, detokenize(ctx, tokens, n).c_str()); + } else { + fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns tokens %s\n", __func__, i, forward, detokenize(ctx, tokens, n).c_str()); + } + } + } + + std::wstring string_to_convert; + std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter; + for (wchar_t ch = 0x0000; ch < 0xffff; ++ch) { + std::wstring wstr(1, ch); + std::string str = converter.to_bytes(wstr); + llama_token tokens[strlen(str.c_str())]; + auto n = llama_tokenize(ctx, str.c_str(), tokens, str.length(), false); + if (n == 1) { + fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]); + } + } + + llama_free_model(model); + llama_free(ctx); + + llama_backend_free(); + + return 0; +}