Skip to content

Commit 2061ca7

Browse files
ochafik authored and ggerganov committed
tool-call: fix Qwen 2.5 Coder support, add micro benchmarks, support trigger patterns for lazy grammars (ggml-org#12034)
* sampler: turn lazy grammar trigger words to regexes

* add scripts/tool_bench.sh & .py

* constrain llama json output regardless of function name if matches at beginning

* update relaxed newline space rule in grammar tests

* support add_generation_prompt query parameter (useful for /apply_template)

* Update src/llama-grammar.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent 923262d commit 2061ca7

26 files changed: +1315 additions, −409 deletions

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
2525

2626
- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
2727
- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
28-
- Universal tool call support in `llama-server`: https://github.com/ggml-org/llama.cpp/pull/9639
28+
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
2929
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
3030
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
3131
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669

common/chat.cpp

Lines changed: 300 additions & 139 deletions
Large diffs are not rendered by default.

common/common.cpp

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
// Change JSON_ASSERT from assert() to GGML_ASSERT:
1111
#define JSON_ASSERT GGML_ASSERT
1212
#include "json.hpp"
13-
#include "json-schema-to-grammar.h"
1413
#include "llama.h"
1514

1615
#include <algorithm>
@@ -483,6 +482,11 @@ void string_replace_all(std::string & s, const std::string & search, const std::
483482
s = std::move(builder);
484483
}
485484

485+
std::string regex_escape(const std::string & s) {
486+
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
487+
return std::regex_replace(s, special_chars, "\\$0");
488+
}
489+
486490
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
487491
std::ostringstream result;
488492
for (size_t i = 0; i < values.size(); ++i) {
@@ -2026,3 +2030,25 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
20262030
return result;
20272031
}
20282032

2033+
template <>
2034+
json common_grammar_trigger::to_json() const {
2035+
json out {
2036+
{"type", (int) type},
2037+
{"value", value},
2038+
};
2039+
if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
2040+
out["token"] = (int) token;
2041+
}
2042+
return out;
2043+
}
2044+
2045+
template <>
2046+
common_grammar_trigger common_grammar_trigger::from_json(const json & in) {
2047+
common_grammar_trigger out;
2048+
out.type = (common_grammar_trigger_type) in.at("type").get<int>();
2049+
out.value = in.at("value").get<std::string>();
2050+
if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
2051+
out.token = (llama_token) in.at("token").get<int>();
2052+
}
2053+
return out;
2054+
}

common/common.h

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,21 @@ enum common_conversation_mode {
110110
COMMON_CONVERSATION_MODE_AUTO = 2,
111111
};
112112

113+
enum common_grammar_trigger_type {
114+
COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
115+
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
116+
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
117+
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
118+
};
119+
113120
struct common_grammar_trigger {
114-
std::string word;
115-
bool at_start;
121+
common_grammar_trigger_type type;
122+
std::string value;
123+
llama_token token = LLAMA_TOKEN_NULL;
124+
125+
// T can only be nlohmann::ordered_json
126+
template <class T> T to_json() const;
127+
template <class T> static common_grammar_trigger from_json(const T & in);
116128
};
117129

118130
// sampling parameters
@@ -163,8 +175,7 @@ struct common_params_sampling {
163175

164176
std::string grammar; // optional BNF-like grammar to constrain sampling
165177
bool grammar_lazy = false;
166-
std::vector<common_grammar_trigger> grammar_trigger_words; // optional trigger words to trigger lazy grammar
167-
std::vector<llama_token> grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
178+
std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
168179
std::set<llama_token> preserved_tokens;
169180

170181
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
@@ -458,6 +469,8 @@ std::string string_repeat(const std::string & str, size_t n);
458469

459470
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
460471

472+
std::string regex_escape(const std::string & s);
473+
461474
template<class T>
462475
static std::vector<T> string_split(const std::string & str, char delim) {
463476
static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");

common/json-schema-to-grammar.cpp

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
264264
throw std::runtime_error("At least one of min_value or max_value must be set");
265265
}
266266

267-
const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
267+
const std::string SPACE_RULE = "| \" \" | \"\\n\"{1,2} [ \\t]{0,20}";
268268

269269
struct BuiltinRule {
270270
std::string content;
@@ -764,11 +764,10 @@ class SchemaConverter {
764764
public:
765765
SchemaConverter(
766766
const std::function<json(const std::string &)> & fetch_json,
767-
bool dotall,
768-
bool compact_spaces)
767+
bool dotall)
769768
: _fetch_json(fetch_json), _dotall(dotall)
770769
{
771-
_rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE;
770+
_rules["space"] = SPACE_RULE;
772771
}
773772

774773
void resolve_refs(json & schema, const std::string & url) {
@@ -1007,7 +1006,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
10071006
}
10081007

10091008
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
1010-
SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
1009+
SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
10111010
common_grammar_builder builder {
10121011
/* .add_rule = */ [&](const std::string & name, const std::string & rule) {
10131012
return converter._add_rule(name, rule);

common/json-schema-to-grammar.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ struct common_grammar_builder {
1616

1717
struct common_grammar_options {
1818
bool dotall = false;
19-
bool compact_spaces = false;
2019
};
2120

2221
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});

common/sampling.cpp

Lines changed: 44 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -160,16 +160,53 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
160160
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
161161
#endif // LLAMA_USE_LLGUIDANCE
162162
} else {
163-
std::vector<const char *> trigger_words;
164-
trigger_words.reserve(params.grammar_trigger_words.size());
165-
for (const auto & str : params.grammar_trigger_words) {
166-
trigger_words.push_back(str.word.c_str());
163+
std::vector<std::string> patterns_at_start;
164+
std::vector<std::string> patterns_anywhere;
165+
std::vector<llama_token> trigger_tokens;
166+
for (const auto & trigger : params.grammar_triggers) {
167+
switch (trigger.type) {
168+
case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
169+
{
170+
const auto & word = trigger.value;
171+
patterns_anywhere.push_back(regex_escape(word));
172+
break;
173+
}
174+
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
175+
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
176+
{
177+
const auto & pattern = trigger.value;
178+
(trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
179+
break;
180+
}
181+
case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
182+
{
183+
const auto token = trigger.token;
184+
trigger_tokens.push_back(token);
185+
break;
186+
}
187+
default:
188+
GGML_ASSERT(false && "unknown trigger type");
189+
}
190+
}
191+
192+
std::vector<std::string> trigger_patterns;
193+
if (!patterns_at_start.empty()) {
194+
trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
195+
}
196+
if (!patterns_anywhere.empty()) {
197+
trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
198+
}
199+
200+
std::vector<const char *> trigger_patterns_c;
201+
trigger_patterns_c.reserve(trigger_patterns.size());
202+
for (const auto & regex : trigger_patterns) {
203+
trigger_patterns_c.push_back(regex.c_str());
167204
}
168205

169206
grmr = params.grammar_lazy
170-
? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
171-
trigger_words.data(), trigger_words.size(),
172-
params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
207+
? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
208+
trigger_patterns_c.data(), trigger_patterns_c.size(),
209+
trigger_tokens.data(), trigger_tokens.size())
173210
: llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
174211
}
175212

examples/json_schema_to_grammar.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ def __init__(self, content: str, deps: list | None = None):
195195
self.deps = deps or []
196196

197197
# Constraining spaces to prevent model "running away".
198-
SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
198+
SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}'
199199

200200
PRIMITIVE_RULES = {
201201
'boolean' : BuiltinRule('("true" | "false") space', []),

examples/server/public_legacy/json-schema-to-grammar.mjs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first.
2-
const SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}';
2+
const SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}';
33

44
function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
55
if (minItems === 0 && maxItems === 1) {

examples/server/server.cpp

Lines changed: 35 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -131,9 +131,9 @@ struct slot_params {
131131
lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
132132
}
133133

134-
std::vector<std::string> grammar_trigger_words;
135-
for (const auto & trigger : sampling.grammar_trigger_words) {
136-
grammar_trigger_words.push_back(trigger.word);
134+
auto grammar_triggers = json::array();
135+
for (const auto & trigger : sampling.grammar_triggers) {
136+
grammar_triggers.push_back(trigger.to_json<json>());
137137
}
138138

139139
return json {
@@ -170,8 +170,8 @@ struct slot_params {
170170
{"n_probs", sampling.n_probs},
171171
{"min_keep", sampling.min_keep},
172172
{"grammar", sampling.grammar},
173-
{"grammar_trigger_words", grammar_trigger_words},
174-
{"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
173+
{"grammar_lazy", sampling.grammar_lazy},
174+
{"grammar_triggers", grammar_triggers},
175175
{"preserved_tokens", sampling.preserved_tokens},
176176
{"chat_format", common_chat_format_name(oaicompat_chat_format)},
177177
{"samplers", samplers},
@@ -356,24 +356,6 @@ struct server_task {
356356
}
357357

358358
{
359-
const auto grammar_triggers = data.find("grammar_triggers");
360-
if (grammar_triggers != data.end()) {
361-
for (const auto & t : *grammar_triggers) {
362-
common_grammar_trigger trigger;
363-
trigger.word = t.at("word");
364-
trigger.at_start = t.at("at_start");
365-
366-
auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
367-
if (ids.size() == 1) {
368-
SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
369-
params.sampling.grammar_trigger_tokens.push_back(ids[0]);
370-
params.sampling.preserved_tokens.insert(ids[0]);
371-
continue;
372-
}
373-
SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
374-
params.sampling.grammar_trigger_words.push_back(trigger);
375-
}
376-
}
377359
const auto preserved_tokens = data.find("preserved_tokens");
378360
if (preserved_tokens != data.end()) {
379361
for (const auto & t : *preserved_tokens) {
@@ -383,12 +365,38 @@ struct server_task {
383365
params.sampling.preserved_tokens.insert(ids[0]);
384366
} else {
385367
// This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
386-
SRV_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str());
368+
SRV_DBG("Not preserved because more than 1 token: %s\n", t.get<std::string>().c_str());
369+
}
370+
}
371+
}
372+
const auto grammar_triggers = data.find("grammar_triggers");
373+
if (grammar_triggers != data.end()) {
374+
for (const auto & t : *grammar_triggers) {
375+
auto ct = common_grammar_trigger::from_json(t);
376+
if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
377+
const auto & word = ct.value;
378+
auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
379+
if (ids.size() == 1) {
380+
auto token = ids[0];
381+
if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) {
382+
throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word);
383+
}
384+
SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str());
385+
common_grammar_trigger trigger;
386+
trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
387+
trigger.value = (llama_token) token;
388+
params.sampling.grammar_triggers.push_back(trigger);
389+
} else {
390+
SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
391+
params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
392+
}
393+
} else {
394+
params.sampling.grammar_triggers.push_back(ct);
387395
}
388396
}
389397
}
390-
if (params.sampling.grammar_lazy) {
391-
GGML_ASSERT(params.sampling.grammar_trigger_tokens.size() > 0 || params.sampling.grammar_trigger_words.size() > 0);
398+
if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) {
399+
throw std::runtime_error("Error: no triggers set for lazy grammar!");
392400
}
393401
}
394402

@@ -2045,7 +2053,7 @@ struct server_context {
20452053

20462054
if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
20472055
// Might be better to reject the request with a 400 ?
2048-
SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.params.n_predict, slot.n_predict);
2056+
SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d\n", slot.params.n_predict, slot.n_predict);
20492057
slot.params.n_predict = slot.n_predict;
20502058
}
20512059

0 commit comments

Comments (0)