llama : more tokenizer fixes #2810

Merged · 13 commits · Aug 27, 2023
Changes from 12 commits
27 changes: 24 additions & 3 deletions common/common.cpp
@@ -733,16 +733,37 @@ std::vector<llama_token> llama_tokenize(
return result;
}

-std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
if (n_tokens < 0) {
result.resize(-n_tokens);
-        int check = llama_token_to_str(ctx, token, result.data(), result.size());
+        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
}

return std::string(result.data(), result.size());
}

+std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens) {
+    const llama_token bos_id = llama_token_bos(ctx);
+
+    std::string piece;
+    std::string result;
+
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        piece = llama_token_to_piece(ctx, tokens[i]);
+
+        // remove the leading space of the first non-BOS token
+        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
+            piece = piece.substr(1);
+        }
+
+        result += piece;
+    }
+
+    return result;
+}
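As an aside on the convention the `llama_token_to_piece` wrapper above relies on: a negative return value from the C API means the caller's buffer was too small, and its magnitude is the required size. A minimal sketch of driving the raw API directly under that assumption:

```cpp
// Sketch only: relies on the negative-return sizing convention shown in
// the wrapper above; ctx and token are assumed to be in scope.
std::vector<char> buf(8, 0);
int n = llama_token_to_piece(ctx, token, buf.data(), buf.size());
if (n < 0) {
    buf.resize(-n);                                 // -n is the required size
    n = llama_token_to_piece(ctx, token, buf.data(), buf.size());
}
std::string piece(buf.data(), n);                   // n is the piece length
```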

13 changes: 12 additions & 1 deletion common/common.h
@@ -116,11 +116,22 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
// Vocab utils
//

+// tokenizes a string into a vector of tokens
+// should work similar to Python's `tokenizer.encode`
std::vector<llama_token> llama_tokenize(
struct llama_context * ctx,
const std::string & text,
bool add_bos);

-std::string llama_token_to_str(
+// tokenizes a token into a piece
+// should work similar to Python's `tokenizer.id_to_piece`
+std::string llama_token_to_piece(
const struct llama_context * ctx,
llama_token token);

+// detokenizes a vector of tokens into a string
+// should work similar to Python's `tokenizer.decode`
+// removes the leading space from the first non-BOS token
+std::string llama_detokenize(
+        llama_context * ctx,
+        const std::vector<llama_token> & tokens);
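Taken together, the three utilities form an encode/decode round trip. A minimal usage sketch (assuming a loaded `llama_context` with an SPM vocab; the leading space the tokenizer attaches to the first piece is stripped by `llama_detokenize`, per the comment above):

```cpp
// Round-trip sketch; ctx is assumed to be an initialized llama_context.
std::vector<llama_token> toks = ::llama_tokenize(ctx, "Hello world", /*add_bos=*/true);
for (llama_token t : toks) {
    fprintf(stderr, "%6d -> '%s'\n", t, llama_token_to_piece(ctx, t).c_str());
}
std::string text = llama_detokenize(ctx, toks);
// expected: text == "Hello world", not " Hello world"
```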
6 changes: 3 additions & 3 deletions examples/beam_search/beam_search.cpp
@@ -35,7 +35,7 @@ struct ostream_beam_view {
std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
-        os << llama_token_to_str(obv.ctx, obv.beam_view.tokens[i]);
+        os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
}
return os << ')';
}
@@ -156,7 +156,7 @@ int main(int argc, char ** argv)

for( auto id : tokens_list )
{
-        std::cout << llama_token_to_str(ctx, id);
+        std::cout << llama_token_to_piece(ctx, id);
}
std::cout << std::flush;

@@ -175,7 +175,7 @@ int main(int argc, char ** argv)

std::cout << "\n\n";
for (llama_token const token_id : callback_data.response) {
-        std::cout << llama_token_to_str(ctx,token_id);
+        std::cout << llama_token_to_piece(ctx,token_id);
}
std::cout << std::endl;

2 changes: 1 addition & 1 deletion examples/embd-input/embd-input-lib.cpp
@@ -214,7 +214,7 @@ const char * sampling(struct MyModel * mymodel) {
if (id == llama_token_eos(ctx)) {
ret = "</s>";
} else {
-        ret = llama_token_to_str(ctx, id);
+        ret = llama_token_to_piece(ctx, id);
}
eval_id(mymodel, id);
return ret.c_str();
5 changes: 1 addition & 4 deletions examples/embedding/embedding.cpp
@@ -56,9 +56,6 @@ int main(int argc, char ** argv) {

int n_past = 0;

-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');

// tokenize the prompt
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);

@@ -67,7 +64,7 @@
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
fprintf(stderr, "\n");
}
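Both here and in main.cpp below, callers previously prepended a space to the prompt to match the original LLaMA tokenizer. With this PR that caller-side compensation is removed, so plain tokenization suffices (a sketch; the assumption being that the fixed tokenizer now applies the SPM leading-space handling internally):

```cpp
// Before (caller-side workaround, now removed):
//   params.prompt.insert(0, 1, ' ');
//   auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
// After this PR (assumed behavior: the tokenizer handles the space itself):
auto embd_inp = ::llama_tokenize(ctx, params.prompt, /*add_bos=*/true);
```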
20 changes: 7 additions & 13 deletions examples/main/main.cpp
@@ -195,11 +195,6 @@ int main(int argc, char ** argv) {
// tokenize the prompt
std::vector<llama_token> embd_inp;

-    if (llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM) {
-        // Add a space in front of the first character to match OG llama tokenizer behavior
-        params.prompt.insert(0, 1, ' ');
-    }

if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
} else {
@@ -216,7 +211,6 @@
int guidance_offset = 0;
int original_prompt_len = 0;
if (ctx_guidance) {
-        params.cfg_negative_prompt.insert(0, 1, ' ');
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);

std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
@@ -285,22 +279,22 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
-        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
+        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
}

if (ctx_guidance) {
fprintf(stderr, "\n");
fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
+            fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
}
}

if (params.n_keep > 0) {
fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
-            fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
+            fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
fprintf(stderr, "'\n");
}
@@ -456,7 +450,7 @@ int main(int argc, char ** argv) {
//printf("\n---\n");
//printf("resetting: '");
//for (int i = 0; i < (int) embd.size(); i++) {
-            //    printf("%s", llama_token_to_str(ctx, embd[i]));
+            //    printf("%s", llama_token_to_piece(ctx, embd[i]));
//}
//printf("'\n");
//printf("\n---\n");
@@ -509,7 +503,7 @@ int main(int argc, char ** argv) {
input_size = embd_guidance.size();
//fprintf(stderr, "\n---------------------\n");
//for (int i = 0; i < (int) embd_guidance.size(); i++) {
-                //fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i]));
+                //fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_guidance[i]));
//}
//fprintf(stderr, "\n---------------------\n");
} else {
@@ -673,7 +667,7 @@ int main(int argc, char ** argv) {
// display text
if (input_echo) {
for (auto id : embd) {
-                printf("%s", llama_token_to_str(ctx, id).c_str());
+                printf("%s", llama_token_to_piece(ctx, id).c_str());
}
fflush(stdout);
}
@@ -689,7 +683,7 @@
if (params.antiprompt.size()) {
std::string last_output;
for (auto id : last_n_tokens) {
-                last_output += llama_token_to_str(ctx, id);
+                last_output += llama_token_to_piece(ctx, id);
}

is_antiprompt = false;
4 changes: 2 additions & 2 deletions examples/perplexity/perplexity.cpp
@@ -392,7 +392,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
hs_data[i].context = prompt_lines[idx*6];
hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
for (size_t j=0; j < 4; j++) {
-            hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j];
+            hs_data[i].ending[j] = prompt_lines[idx*6+2+j];
}

// Delete the selected random example from the prompt
@@ -417,7 +417,7 @@
size_t context_size = context_embd.size();

for (int i = 0; i < 4; ++i) {
-            ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[i], add_bos);
+            ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + " " + hs_data[task_idx].ending[i], add_bos);
for (int k = 0; k < int(context_size); ++k) {
if (ending_tokens[i][k] != context_embd[k]) {
fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k);
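Note that the two hellaswag changes are equivalent string-wise: the endings are now stored without a leading space and the space is reintroduced at tokenization time, so `context + " " + ending` yields the same text as before. A plausible reading (an assumption, not stated in the diff) is that the stored endings are kept free of tokenizer-specific space handling now that the fixed tokenizer treats leading spaces differently.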
4 changes: 2 additions & 2 deletions examples/save-load-state/save-load-state.cpp
@@ -87,7 +87,7 @@ int main(int argc, char ** argv) {
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
auto next_token = llama_sample_token(ctx, &candidates_p);
-    auto next_token_str = llama_token_to_str(ctx, next_token);
+    auto next_token_str = llama_token_to_piece(ctx, next_token);
last_n_tokens_data.push_back(next_token);

printf("%s", next_token_str.c_str());
@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
auto next_token = llama_sample_token(ctx2, &candidates_p);
-    auto next_token_str = llama_token_to_str(ctx2, next_token);
+    auto next_token_str = llama_token_to_piece(ctx2, next_token);
last_n_tokens_data.push_back(next_token);

printf("%s", next_token_str.c_str());
16 changes: 7 additions & 9 deletions examples/server/server.cpp
@@ -94,7 +94,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
std::string ret;
for (; begin != end; ++begin)
{
-        ret += llama_token_to_str(ctx, *begin);
+        ret += llama_token_to_piece(ctx, *begin);
}
return ret;
}
@@ -123,7 +123,7 @@ static void server_log(const char *level, const char *function, int line,
// format incomplete utf-8 multibyte character for output
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
{
-    std::string out = token == -1 ? "" : llama_token_to_str(ctx, token);
+    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
// if the size is 1 and first bit is 1, meaning it's a partial character
// (size > 1 meaning it's already a known token)
if (out.size() == 1 && (out[0] & 0x80) == 0x80)
@@ -286,7 +286,6 @@ struct llama_server_context
std::vector<llama_token> p;
if (first)
{
-                    s.insert(0, 1, ' '); // add a space if it's the first
p = ::llama_tokenize(ctx, s, add_bos);
first = false;
}
@@ -309,7 +308,6 @@
else
{
auto s = json_prompt.template get<std::string>();
-            s.insert(0, 1, ' '); // always add a first space
prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
}

@@ -566,7 +564,7 @@ struct llama_server_context

if (!embd.empty() && embd.back() == llama_token_eos(ctx))
{
-            // stopping_word = llama_token_to_str(ctx, embd.back());
+            // stopping_word = llama_token_to_piece(ctx, embd.back());
has_next_token = false;
stopped_eos = true;
LOG_VERBOSE("eos token found", {});
@@ -613,7 +611,7 @@ struct llama_server_context
{
const completion_token_output token_with_probs = nextToken();

-        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(ctx, token_with_probs.tok);
+        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
generated_text += token_text;

if (params.n_probs > 0)
@@ -1248,7 +1246,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) {

struct token_translator {
llama_context * ctx;
-    std::string operator()(llama_token tok) const { return llama_token_to_str(ctx, tok); }
+    std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); }
};

@@ -1358,7 +1356,7 @@ int main(int argc, char **argv)

while (llama.has_next_token) {
const completion_token_output token_with_probs = llama.doCompletion();
-            const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
+            const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(llama.ctx, token_with_probs.tok);

stop_pos = llama.findStoppingStrings(llama.generated_text,
token_text.size(), STOP_FULL);
@@ -1389,7 +1387,7 @@ int main(int argc, char **argv)
if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
continue;
}
-                const std::string token_text = llama_token_to_str(llama.ctx, token_with_probs.tok);
+                const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok);

size_t pos = std::min(sent_count, llama.generated_text.size());

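The server's streaming path depends on `tokens_to_output_formatted_string` holding back incomplete multibyte output. A sketch of the check it performs, mirroring the comment in the diff above (`is_partial_utf8` is a hypothetical helper name, not part of the PR):

```cpp
// Hypothetical helper mirroring the server's partial-character check:
// a single byte with the high bit set (0b1xxxxxxx) cannot be a complete
// UTF-8 character, so the server buffers it until the rest arrives.
static bool is_partial_utf8(const std::string & out) {
    return out.size() == 1 && (out[0] & 0x80) == 0x80;
}
```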
4 changes: 2 additions & 2 deletions examples/simple/simple.cpp
@@ -63,7 +63,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "\n\n");

for (auto id : tokens_list) {
-        fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
+        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
}

fflush(stderr);
@@ -112,7 +112,7 @@ int main(int argc, char ** argv) {
}

// print the new token :
-        printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
+        printf("%s", llama_token_to_piece(ctx, new_token_id).c_str());
fflush(stdout);

// push this new token for next evaluation
4 changes: 2 additions & 2 deletions examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1964,7 +1964,7 @@ void print_matrix(struct ggml_tensor * probs) {


void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token).c_str());
+    printf("%s", llama_token_to_piece(ctx, token).c_str());
}

void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@@ -2202,7 +2202,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
const char * in = buf.data();
const char * end = buf.data() + buf.size();
for (int i = 0; i < (int) out.size(); ++i) {
-        std::string s = llama_token_to_str(lctx, out[i]);
+        std::string s = llama_token_to_piece(lctx, out[i]);
int len = s.length();
if (in >= end) {
printf("%s: unexpected end of original text.\n", __func__);