From 1a61c1a5e11c75fed39fdd575298369f2f0f430e Mon Sep 17 00:00:00 2001
From: Xiao-Yong Jin
Date: Fri, 21 Jul 2023 00:35:58 -0500
Subject: [PATCH 1/4] server: allow json array in prompt or content

We accept an array of strings and numbers representing tokens,
in addition to the current string valued prompt or content.

This allows direct token input, so that any special tokens
can be processed and used at the frontend during the construction
of the json data, before sending to the server. And the server
does not need to know or parse special tokens from textual input.

With this, we can use EOS and BOS used in llama-2-chat models.
---
 examples/server/README.md  |  2 +-
 examples/server/server.cpp | 60 +++++++++++++++++++++++++++++++++-----
 2 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index e5ca8269b9d56..0b562b3acab7a 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -126,7 +126,7 @@ node .

     `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.

-    `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. A space is inserted in the front like main.cpp does.
+    `prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does.

     `stop`: Specify a JSON array of stopping strings. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f442f2b56d368..567dc4d2d23d1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -188,6 +188,7 @@ struct llama_server_context
     size_t n_past = 0;
     size_t n_remain = 0;

+    json prompt;
     std::vector<llama_token> embd;
     std::vector<llama_token> last_n_tokens;

@@ -257,10 +258,55 @@ struct llama_server_context
         return true;
     }

+    std::vector<llama_token> tokenizePrompt(void)
+    {
+        std::vector<llama_token> prompt_tokens;
+
+        if (prompt.is_array())
+        {
+            bool first = true;
+            for (const auto& p : prompt)
+            {
+                if (p.is_string())
+                {
+                    auto s = p.template get<std::string>();
+                    std::vector<llama_token> p;
+                    if (first)
+                    {
+                        s.insert(0, 1, ' '); // add a space if it's the first
+                        p = ::llama_tokenize(ctx, s, true); // also add BOS
+                        first = false;
+                    }
+                    else
+                    {
+                        p = ::llama_tokenize(ctx, s, false);
+                    }
+                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+                }
+                else
+                {
+                    if (first)
+                    {
+                        first = false;
+                    }
+                    prompt_tokens.push_back(p.template get<llama_token>());
+                }
+            }
+        }
+        else
+        {
+            auto s = prompt.template get<std::string>();
+            s.insert(0, 1, ' '); // always add a first space
+            prompt_tokens = ::llama_tokenize(ctx, s, true);
+        }
+
+        return prompt_tokens;
+    }
+
     void loadPrompt()
     {
-        params.prompt.insert(0, 1, ' '); // always add a first space
-        std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
+        auto prompt_tokens = tokenizePrompt();
+
         num_prompt_tokens = prompt_tokens.size();

         if (params.n_keep < 0)
@@ -954,7 +1000,7 @@ static json format_final_response(llama_server_context &llama, const std::string
         {"tokens_predicted", llama.num_tokens_predicted},
         {"tokens_evaluated", llama.num_prompt_tokens},
         {"generation_settings", format_generation_settings(llama)},
-        {"prompt", llama.params.prompt},
+        {"prompt", llama.prompt},
         {"truncated", llama.truncated},
         {"stopped_eos", llama.stopped_eos},
         {"stopped_word", llama.stopped_word},
@@ -1015,8 +1061,8 @@ static void parse_options_completion(const json &body, llama_server_context &lla
     llama.params.penalize_nl = body.value("penalize_nl", default_params.penalize_nl);
     llama.params.n_keep = body.value("n_keep", default_params.n_keep);
     llama.params.seed = body.value("seed", default_params.seed);
-    llama.params.prompt = body.value("prompt", default_params.prompt);
     llama.params.n_probs = body.value("n_probs", default_params.n_probs);
+    llama.prompt = body["prompt"];

     llama.params.logit_bias.clear();
     if (body.value("ignore_eos", false))
@@ -1258,8 +1304,8 @@ int main(int argc, char **argv)
             auto lock = llama.lock();

             const json body = json::parse(req.body);
-            const std::string content = body.value("content", "");
-            const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
+            llama.prompt = body["content"];
+            const std::vector<llama_token> tokens = llama.tokenizePrompt();
             const json data = format_tokenizer_response(tokens);
             return res.set_content(data.dump(), "application/json");
         });
@@ -1271,7 +1317,7 @@ int main(int argc, char **argv)
             llama.rewind();

             llama_reset_timings(llama.ctx);
-            llama.params.prompt = body.value("content", "");
+            llama.prompt = body["content"];
             llama.params.n_predict = 0;
             llama.loadPrompt();
             llama.beginCompletion();

From 97deb25398100f778fcf192b900288a598b5c0bc Mon Sep 17 00:00:00 2001
From: Xiao-Yong Jin
Date: Mon, 24 Jul 2023 21:39:35 -0500
Subject: [PATCH 2/4] server: use tokenizePrompt(json) and default "" if empty prompt

---
 examples/server/server.cpp | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 567dc4d2d23d1..c52db9c052103 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -258,14 +258,14 @@ struct llama_server_context
         return true;
     }

-    std::vector<llama_token> tokenizePrompt(void)
+    std::vector<llama_token> tokenizePrompt(json json_prompt)
     {
         std::vector<llama_token> prompt_tokens;

-        if (prompt.is_array())
+        if (json_prompt.is_array())
         {
             bool first = true;
-            for (const auto& p : prompt)
+            for (const auto& p : json_prompt)
             {
                 if (p.is_string())
                 {
@@ -295,7 +295,7 @@ struct llama_server_context
         }
         else
         {
-            auto s = prompt.template get<std::string>();
+            auto s = json_prompt.template get<std::string>();
             s.insert(0, 1, ' '); // always add a first space
             prompt_tokens = ::llama_tokenize(ctx, s, true);
         }
@@ -305,7 +305,7 @@ struct llama_server_context

     void loadPrompt()
     {
-        auto prompt_tokens = tokenizePrompt();
+        auto prompt_tokens = tokenizePrompt(prompt);

         num_prompt_tokens = prompt_tokens.size();

@@ -1062,7 +1062,15 @@ static void parse_options_completion(const json &body, llama_server_context &lla
     llama.params.n_keep = body.value("n_keep", default_params.n_keep);
     llama.params.seed = body.value("seed", default_params.seed);
     llama.params.n_probs = body.value("n_probs", default_params.n_probs);
-    llama.prompt = body["prompt"];
+
+    if (body.count("content") != 0)
+    {
+        llama.prompt = body["prompt"];
+    }
+    else
+    {
+        llama.prompt = "";
+    }

     llama.params.logit_bias.clear();
     if (body.value("ignore_eos", false))
@@ -1304,8 +1312,11 @@ int main(int argc, char **argv)
             auto lock = llama.lock();

             const json body = json::parse(req.body);
-            llama.prompt = body["content"];
-            const std::vector<llama_token> tokens = llama.tokenizePrompt();
+            std::vector<llama_token> tokens;
+            if (body.count("content") != 0)
+            {
+                tokens = llama.tokenizePrompt(body["content"]);
+            }
             const json data = format_tokenizer_response(tokens);
             return res.set_content(data.dump(), "application/json");
         });
@@ -1317,7 +1328,14 @@ int main(int argc, char **argv)
             llama.rewind();

             llama_reset_timings(llama.ctx);
-            llama.prompt = body["content"];
+            if (body.count("content") != 0)
+            {
+                llama.prompt = body["content"];
+            }
+            else
+            {
+                llama.prompt = "";
+            }
             llama.params.n_predict = 0;
             llama.loadPrompt();
             llama.beginCompletion();

From 53c2db16858dd2af40725ecfdaf86cc9003a6873 Mon Sep 17 00:00:00 2001
From: Xiao-Yong Jin
Date: Tue, 25 Jul 2023 07:47:17 -0500
Subject: [PATCH 3/4] server: fix prompt check

---
 examples/server/server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 710623a77dc08..b9222067ecc11 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1075,7 +1075,7 @@ static void parse_options_completion(const json &body, llama_server_context &lla
     llama.params.seed = body.value("seed", default_params.seed);
     llama.params.n_probs = body.value("n_probs", default_params.n_probs);

-    if (body.count("content") != 0)
+    if (body.count("prompt") != 0)
     {
         llama.prompt = body["prompt"];
     }

From bb3770b3e60b862c7068d48884fd873621a7d5f8 Mon Sep 17 00:00:00 2001
From: Xiao-Yong Jin
Date: Wed, 26 Jul 2023 17:42:20 -0500
Subject: [PATCH 4/4] server: tokenize endpoint no longer adds BOS

---
 examples/server/server.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index b9222067ecc11..00d77de5d0dbd 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -258,8 +258,10 @@ struct llama_server_context
         return true;
     }

-    std::vector<llama_token> tokenizePrompt(json json_prompt)
+    std::vector<llama_token> tokenize(json json_prompt, bool add_bos)
     {
+        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
+        // or the first element of the json_prompt array is a string.
         std::vector<llama_token> prompt_tokens;

         if (json_prompt.is_array())
@@ -274,7 +276,7 @@ struct llama_server_context
                     if (first)
                     {
                         s.insert(0, 1, ' '); // add a space if it's the first
-                        p = ::llama_tokenize(ctx, s, true); // also add BOS
+                        p = ::llama_tokenize(ctx, s, add_bos);
                         first = false;
                     }
                     else
@@ -297,7 +299,7 @@ struct llama_server_context
        {
            auto s = json_prompt.template get<std::string>();
            s.insert(0, 1, ' '); // always add a first space
-           prompt_tokens = ::llama_tokenize(ctx, s, true);
+           prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
        }

        return prompt_tokens;
@@ -305,7 +307,7 @@ struct llama_server_context

     void loadPrompt()
     {
-        auto prompt_tokens = tokenizePrompt(prompt);
+        auto prompt_tokens = tokenize(prompt, true);  // always add BOS

         num_prompt_tokens = prompt_tokens.size();

@@ -1327,7 +1329,7 @@ int main(int argc, char **argv)
             std::vector<llama_token> tokens;
             if (body.count("content") != 0)
             {
-                tokens = llama.tokenizePrompt(body["content"]);
+                tokens = llama.tokenize(body["content"], false);
             }
             const json data = format_tokenizer_response(tokens);
             return res.set_content(data.dump(), "application/json");
         });
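
With the full series applied, a frontend can mix plain text and raw token ids in the same `prompt` (or `content`) array. The sketch below is not part of the patches; it shows one way a client might build such a request body using nlohmann::json, the same JSON library server.cpp uses. The `prompt` and `n_predict` fields come from the server API documented in the README hunk above; the BOS/EOS ids (1 and 2) are the usual LLaMA-2 values and should be confirmed for the loaded model, and the llama-2-chat template shown is only illustrative.

    // Client-side sketch (not part of the patches): build a /completion request
    // whose prompt mixes raw token ids and text. Assumes nlohmann::json and the
    // LLaMA-2 special-token ids BOS = 1, EOS = 2.
    #include <iostream>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    int main()
    {
        const int bos = 1; // assumed LLaMA-2 BOS id
        const int eos = 2; // assumed LLaMA-2 EOS id

        // Numbers pass through as token ids; strings are tokenized by the server.
        // Because the first array element here is a number, the server prepends no
        // space or BOS of its own; the special tokens are supplied explicitly.
        json body = {
            {"prompt", json::array({
                bos, "[INST] Name a llama fact. [/INST] Llamas hum to each other. ", eos,
                bos, "[INST] Why do they hum? [/INST]"
            })},
            {"n_predict", 64}
        };

        std::cout << body.dump(2) << std::endl; // POST this body to /completion
        return 0;
    }

Keeping special-token handling on the client side is the point of the series: the server never has to parse special-token markers out of text, and after PATCH 4/4 the tokenize endpoint returns the ids for the given content without adding BOS.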