Add support for tokenizing and untokenizing UTF-8 in prompt/output #87

Closed · wants to merge 8 commits
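
The changes below are best read together: the prompt path now delegates tokenization to SentencePiece, and the output path buffers byte-fallback pieces so multi-byte UTF-8 characters are printed whole. As a minimal sketch of the prompt side (illustrative only, not part of the diff; it assumes sentencepiece is installed and that models/tokenizer.model, the default path introduced here, is the LLaMA tokenizer):

// sketch.cpp (illustrative): what llama_tokenize delegates to after this change
#include <sentencepiece_processor.h>
#include <cstdio>
#include <string>
#include <vector>

int main() {
    sentencepiece::SentencePieceProcessor sp;
    if (!sp.Load("models/tokenizer.model").ok()) {        // same default path as params.tokenizer
        fprintf(stderr, "failed to load tokenizer\n");
        return 1;
    }

    const std::string prompt = "欢迎";                     // multi-byte UTF-8 input
    const std::vector<int> ids = sp.EncodeAsIds(prompt);  // what llama_tokenize now returns
    for (const int id : ids) {
        // rare characters come back as byte-fallback pieces such as <0xE6>
        printf("%d -> %s\n", id, sp.IdToPiece(id).c_str());
    }
    printf("round trip: %s\n", sp.DecodeIds(ids).c_str()); // DecodeIds restores the original text
    return 0;
}

A standalone build would look something like g++ -std=c++11 sketch.cpp -lsentencepiece, with include and library paths set as in the Makefile change below.
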
1 change: 1 addition & 0 deletions .gitignore
@@ -21,3 +21,4 @@ models/*

arm_neon.h
compile_commands.json
*.dSYM/
6 changes: 3 additions & 3 deletions Makefile
@@ -30,9 +30,9 @@ endif
# Compile flags
#

CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS =
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC -g -I/opt/homebrew/include
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -g -I/opt/homebrew/include
LDFLAGS = -L/opt/homebrew/lib -lsentencepiece

# OS specific
# TODO: support Windows
29 changes: 19 additions & 10 deletions main.cpp
@@ -10,6 +10,7 @@
#include <map>
#include <string>
#include <vector>
#include <sentencepiece_processor.h>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
@@ -84,7 +85,7 @@ struct llama_model {
};

// load the model's weights from a file
bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
bool llama_model_load(const std::string & fname, llama_model & model, sentencepiece::SentencePieceProcessor & sp, gpt_vocab & vocab, int n_ctx) {
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

auto fin = std::ifstream(fname, std::ios::binary);
@@ -146,6 +147,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
return false;
}

printf("total pieces: %d", sp.GetPieceSize());

std::string word;
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
@@ -154,8 +157,9 @@
word.resize(len);
fin.read((char *) word.data(), len);

vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
std::string wordx = sp.IdToPiece(i);
vocab.token_to_id[wordx] = i;
vocab.id_to_token[i] = wordx;

//if (i < 30000) {
// printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
@@ -767,11 +771,15 @@ int main(int argc, char ** argv) {

gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";
params.tokenizer = "models/tokenizer.model";

if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}

sentencepiece::SentencePieceProcessor sp;
if (!sp.Load(params.tokenizer).ok()) { // bail out early if the tokenizer model cannot be loaded
fprintf(stderr, "%s: failed to load tokenizer from '%s'\n", __func__, params.tokenizer.c_str());
return 1;
}

if (params.seed < 0) {
params.seed = time(NULL);
}
@@ -795,7 +803,7 @@ int main(int argc, char ** argv) {
{
const int64_t t_start_us = ggml_time_us();

if (!llama_model_load(params.model, model, vocab, 512)) { // TODO: set context from user input ??
if (!llama_model_load(params.model, model, sp, vocab, 512)) { // TODO: set context from user input ??
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
return 1;
}
@@ -811,12 +819,12 @@ int main(int argc, char ** argv) {
std::vector<float> logits;

// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(sp, vocab, params.prompt, true);

params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());

// tokenize the reverse prompt
std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(sp, vocab, params.antiprompt, false);

printf("\n");
printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
@@ -882,6 +890,8 @@ int main(int argc, char ** argv) {
printf(ANSI_COLOR_YELLOW);
}

// buffer byte-fallback UTF-8 tokens like <0xE6>,<0xAC>,<0xA2> that may span multiple outputs, until the sequence is complete
std::vector<gpt_vocab::id> buffids = {};
while (remaining_tokens > 0) {
// predict
if (embd.size() > 0) {
@@ -943,9 +953,8 @@

// display text
if (!input_noecho) {
for (auto id : embd) {
printf("%s", vocab.id_to_token[id].c_str());
}
untokenize(sp, buffids, embd);

// reset color to default if there is no pending user input
if (params.use_color && embd_inp.size() <= input_consumed) {
printf(ANSI_COLOR_RESET);
@@ -986,7 +995,7 @@ int main(int argc, char ** argv) {
buf[n_read+1] = 0;
}

std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(sp, vocab, buf, false);
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

remaining_tokens -= line_inp.size();
82 changes: 44 additions & 38 deletions utils.cpp
@@ -4,10 +4,7 @@
#include <cstring>
#include <fstream>
#include <regex>
#include <iostream>
#include <iterator>
#include <string>
#include <math.h>
#include <sentencepiece_processor.h>

#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
@@ -49,6 +46,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.n_batch = std::stoi(argv[++i]);
} else if (arg == "-m" || arg == "--model") {
params.model = argv[++i];
} else if (arg == "--tokenizer") {
params.tokenizer = argv[++i];
} else if (arg == "-i" || arg == "--interactive") {
params.interactive = true;
} else if (arg == "--interactive-start") {
@@ -96,6 +95,8 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params) {
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, " --tokenizer FNAME\n");
fprintf(stderr, " tokenizer path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n");
}

@@ -272,42 +273,11 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
return tokens;
}

std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
//auto res = gpt_tokenize(vocab, text);

//if (bos) {
// res.insert(res.begin(), 1); // TODO: replace with vocab.bos
//}

std::vector<gpt_vocab::id> llama_tokenize(sentencepiece::SentencePieceProcessor & sp, const gpt_vocab & vocab, const std::string & text, bool bos) {
std::vector<gpt_vocab::id> res;

if (bos) {
res.push_back(1); // TODO: replace with vocab.bos
}

//find the longest token that matches the text
int pos = 0;
while (true) {
int l = 0;
int t = 0;
for (const auto & kv : vocab.id_to_token) {
if (kv.second.size() < l) continue;
if (kv.second.size() > text.size() - pos) continue;
if (text.substr(pos, kv.second.size()) == kv.second) {
l = kv.second.size();
t = kv.first;
}
}

if (l == 0) {
break;
}

res.push_back(t);
pos += l;
}

return res;
std::vector<std::string> pieces;
return sp.EncodeAsIds(text);
}

bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
@@ -542,3 +512,39 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t

return (n/k)*row_size;
}

void untokenize(sentencepiece::SentencePieceProcessor &sp, std::vector<gpt_vocab::id> &buffids, std::vector<gpt_vocab::id> &embd)
{
for (auto id : embd)
{
std::string s = sp.IdToPiece(id); // vocab.id_to_token[id];

if (s.find("<0x") == 0 && s[s.length() - 1] == '>')
{
buffids.push_back(id);
std::string txt = sp.DecodeIds(buffids);
// printf("bufferring %s, total buffer: %s\n", s.c_str(), txt.c_str());
}
else if (s.find("▁") == 0)
{
if (!buffids.empty())
{
std::string txt = sp.DecodeIds(buffids);
printf("%s", txt.c_str());
buffids.clear();
}
s = std::regex_replace(s, std::regex("▁"), " ");
printf("%s", s.c_str());
}
else
{
if (!buffids.empty())
{
std::string txt = sp.DecodeIds(buffids);
printf("%s", txt.c_str());
buffids.clear();
}
printf("%s", s.c_str());
}
}
}
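
untokenize exists because SentencePiece represents rare characters as byte-fallback pieces; for example <0xE6>, <0xAC>, <0xA2> together encode 欢, and printing each piece as it arrives would emit invalid UTF-8. The sketch below isolates the same buffering idea (illustrative only, not part of the diff; it assumes a loaded tokenizer and reuses EncodeAsIds output as a stand-in for tokens streamed from the model):

// illustrative only: hold <0xNN> byte pieces until a printable piece arrives
#include <sentencepiece_processor.h>
#include <cstdio>
#include <string>
#include <vector>

int main() {
    sentencepiece::SentencePieceProcessor sp;
    if (!sp.Load("models/tokenizer.model").ok()) return 1;

    std::vector<int> buffered;                                // pending <0xNN> byte pieces
    for (const int id : sp.EncodeAsIds("欢迎 hello")) {
        const std::string piece = sp.IdToPiece(id);
        if (piece.rfind("<0x", 0) == 0 && piece.back() == '>') {
            buffered.push_back(id);                           // incomplete UTF-8, hold it back
            continue;
        }
        if (!buffered.empty()) {
            printf("%s", sp.DecodeIds(buffered).c_str());     // flush a now-complete sequence
            buffered.clear();
        }
        std::string text = piece;
        size_t pos;
        while ((pos = text.find("\xE2\x96\x81")) != std::string::npos) {
            text.replace(pos, 3, " ");                        // map the ▁ word-boundary marker to a space
        }
        printf("%s", text.c_str());
    }
    if (!buffered.empty()) printf("%s", sp.DecodeIds(buffered).c_str());
    printf("\n");
    return 0;
}

Flushing on the first non-byte piece is enough here because the byte pieces for one character arrive consecutively; a stricter variant could also validate that the buffered bytes form complete UTF-8 before printing.
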
7 changes: 6 additions & 1 deletion utils.h
@@ -7,6 +7,7 @@
#include <vector>
#include <random>
#include <thread>
#include <sentencepiece_processor.h>

//
// CLI argument parsing
@@ -27,6 +28,7 @@ struct gpt_params {
int32_t n_batch = 8; // batch size for prompt processing

std::string model = "models/lamma-7B/ggml-model.bin"; // model path
std::string tokenizer = "models/tokenizer.model"; // tokenizer path
std::string prompt;

bool use_color = false; // use color to distinguish generations and inputs
@@ -73,7 +75,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri

// TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
// ref: https://github.com/google/sentencepiece
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
std::vector<gpt_vocab::id> llama_tokenize(sentencepiece::SentencePieceProcessor & sp, const gpt_vocab & vocab, const std::string & text, bool bos);

// load the tokens from encoder.json
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
@@ -102,3 +104,6 @@ void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int

size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);

void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<gpt_vocab::id> & buffids, std::vector<gpt_vocab::id> & embd);