tool-call: add support for tool-calls using Model Context Protocol #11556

Open

bandoti wants to merge 86 commits into master from llamacli-tools

Commits (86)
183029d
Add tools option to llama-cli
bandoti Jan 31, 2025
4ad8258
tools_json_arr now properly passed to apply-template
bandoti Jan 31, 2025
352f79c
Merge branch 'master' into llamacli-tools
bandoti Feb 3, 2025
becf9b4
add tool-choice parameter
bandoti Feb 4, 2025
cd16957
Add variant include
bandoti Feb 4, 2025
4e8beb0
Reset tools when empty string provided
bandoti Feb 4, 2025
3437080
Pass template group to common_chat_apply_template
bandoti Feb 4, 2025
36c2f38
Merge branch 'master' into llamacli-tools
bandoti Feb 4, 2025
a30111b
Merge branch 'ggerganov:master' into llamacli-tools
bandoti Feb 5, 2025
a726ada
Copy sampler parameters from chat template
bandoti Feb 5, 2025
a024747
Merge branch 'master' into llamacli-tools
bandoti Feb 12, 2025
1dd2e3b
Add handler and MCP message types
bandoti Feb 13, 2025
6458c71
Merge branch 'master' into llamacli-tools
bandoti Feb 13, 2025
b41f57c
Comment out unused parameters
bandoti Feb 13, 2025
e7efd7c
Remove tabs
bandoti Feb 13, 2025
2c07ce7
Only use MCP handler with non-empty string
bandoti Feb 13, 2025
b67a04c
Switch to compile-time polymorphic message types
bandoti Feb 14, 2025
20a19f8
Add tools/list request
bandoti Feb 14, 2025
9dbe42f
Add tools/list response
bandoti Feb 14, 2025
93b54e4
Tokenize output from toolcall response
bandoti Feb 14, 2025
99f2fe3
Add MCP sse/stdio transport types
bandoti Feb 14, 2025
3309b58
Fix indent
bandoti Feb 14, 2025
376fbba
throw exceptions in stdio transport for now
bandoti Feb 14, 2025
80e6790
Only include SSE transport when LLAMA_CURL is set
bandoti Feb 14, 2025
ff44762
Split toolcall params into separate files
bandoti Feb 15, 2025
a9e3404
Separate tool-call from template application
bandoti Feb 15, 2025
608304f
Merge branch 'master' into llamacli-tools
bandoti Feb 15, 2025
a345aa9
Merge branch 'llamacli-tools-sse' into llamacli-tools
bandoti Feb 15, 2025
7b93c31
Add noreturn to stdio transport methods
bandoti Feb 15, 2025
60bca9c
Squashed commit of the following:
bandoti Feb 19, 2025
6ce23b6
Merge branch 'master' into llamacli-tools
bandoti Feb 19, 2025
f2af859
Post-Merge refactoring
bandoti Feb 19, 2025
90efb90
Rearrange the furniture!
bandoti Feb 19, 2025
78a8d90
Fix input processing
bandoti Feb 19, 2025
4d81086
Clean up some header inclusions
bandoti Feb 19, 2025
3b0dd4e
Split toolcall into separate library
bandoti Feb 20, 2025
8668d89
Convert chat_add_and_format to functor
bandoti Feb 21, 2025
a19ed47
Enable LLAMA_TOOLCALL by default (for now)
bandoti Feb 21, 2025
5c0b0cb
Use cxx_std_17
bandoti Feb 21, 2025
3e46978
Impl. initialize and tool_list routines
bandoti Feb 21, 2025
5d6a058
Store callbacks in map
bandoti Feb 21, 2025
b0d3162
No need to explicitly convert int to string
bandoti Feb 21, 2025
ba57885
Initialize tc_handler
bandoti Feb 21, 2025
88bace3
Impl. tools_list_to_oai_json
bandoti Feb 21, 2025
b2c340d
Remove mcp+ URI prefix
bandoti Feb 21, 2025
7d58e9b
WIP: fixing SSE issues
bandoti Feb 22, 2025
ea4cc2f
Add timeout to initialize routine
bandoti Feb 22, 2025
86f83f3
Allow send routine to lock
bandoti Feb 22, 2025
c07a452
Handle relative URI returned from SSE
bandoti Feb 23, 2025
3a11fa2
Fix CRLF case
bandoti Feb 23, 2025
3418d37
Strip trailing NL from endpoint event URI
bandoti Feb 23, 2025
7d6c29b
Explicitly create empty capability object
bandoti Feb 23, 2025
40156ff
Add tools_list_response::fromJson
bandoti Feb 23, 2025
8a3497b
Convert tool list string
bandoti Feb 23, 2025
1209b95
Only invoke toolcall with valid JSON
bandoti Feb 24, 2025
52b98d8
Add tool-call request/response types
bandoti Feb 24, 2025
606993d
Move transport message-dispatch to base type
bandoti Feb 24, 2025
67438a3
Add tools_call_response fromJson
bandoti Feb 24, 2025
7a23d06
Implement call routine
bandoti Feb 24, 2025
c2e531a
Fix whitespace
bandoti Feb 25, 2025
ce5c46c
Preserver argument value
bandoti Feb 25, 2025
850e043
Refactor tool/call response
bandoti Feb 25, 2025
0b52627
Add missing header
bandoti Feb 25, 2025
66eff76
Move tool-call invocation into main loop
bandoti Feb 25, 2025
7b076ee
Merge branch 'master' into llamacli-tools
bandoti Feb 25, 2025
a097b4f
Add tighter check before running toolcalls
bandoti Feb 25, 2025
9db9686
Remove toolcall dependency from common
bandoti Feb 25, 2025
e8dd857
Rename handler -> client to reflect MCP terminology
bandoti Feb 25, 2025
d1b12b8
Merge branch 'master' into llamacli-tools
bandoti Mar 2, 2025
f354ff9
Ensure toolcalls are registered when no -sys provided
bandoti Mar 2, 2025
34697cd
Merge branch 'master' into llamacli-tools
bandoti Mar 4, 2025
8871c8d
Add toolcall output after single-turn run
bandoti Mar 4, 2025
95ed663
Merge branch 'master' into llamacli-tools
bandoti Mar 5, 2025
46766c1
Update grammar_trigger processing
bandoti Mar 5, 2025
ac1fc31
WIP: use common_chat_parse for toolcall
bandoti Mar 5, 2025
ba098af
Extract toolcall format from model
bandoti Mar 5, 2025
787fa89
Oops
bandoti Mar 5, 2025
c36c7e6
Squashed commit of the following:
bandoti Mar 6, 2025
b25fc0d
Merge branch 'master' into llamacli-tools
bandoti Mar 10, 2025
f5c209f
Sync trigger-token fix ggml-org#12291
bandoti Mar 10, 2025
4e378fb
Clear assistant_ss before returning control to loop
bandoti Mar 10, 2025
ff18e24
Revert changes to common_chat_format_single
bandoti Mar 10, 2025
1e67578
Merge branch 'master' into llamacli-tools
bandoti Mar 17, 2025
ee2dad2
Merge branch 'master' into llamacli-tools
bandoti Mar 21, 2025
9a339c9
Merge branch 'master' into llamacli-tools
bandoti Apr 1, 2025
19dc0a2
Merge branch 'master' into llamacli-tools
bandoti Apr 3, 2025
7 changes: 7 additions & 0 deletions CMakeLists.txt
@@ -84,6 +84,9 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)

# Toolcall support - needs LLAMA_CURL support to connect with SSE endpoints
option(LLAMA_TOOLCALL "llama: add toolcall support via Model Context Protocol" ON)

# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
@@ -168,6 +171,10 @@ add_subdirectory(src)
# utils, programs, examples and tests
#

if (LLAMA_TOOLCALL)
add_subdirectory(toolcall)
endif()

if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
endif()
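A minimal configure-time sketch for the new option (a fresh build directory is assumed; the option names come from the diff above, and since LLAMA_TOOLCALL defaults to ON in this PR, only LLAMA_CURL needs to be enabled explicitly to reach SSE endpoints):

    cmake -B build -DLLAMA_CURL=ON -DLLAMA_TOOLCALL=ON
    cmake --build build --target llama-cli
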
18 changes: 18 additions & 0 deletions common/arg.cpp
@@ -2740,6 +2740,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
std::back_inserter(params.chat_template));
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));

add_opt(common_arg(
{"--tools"}, "JINJA_TOOLS",
"set to URI of a Model Context Protocol server, or "
"a JSON array containing tool definitions (requires --jinja)",
[](common_params &params, const std::string & value) {
params.toolcall.tools = value;

}).set_examples({LLAMA_EXAMPLE_MAIN}));

add_opt(common_arg(
{"--tool-choice"}, "JINJA_TOOL_CHOICE",
"set to \"auto\", \"required\", or \"none\" (default: \"auto\")",
[](common_params &params, const std::string & value) {
params.toolcall.choice = value;

}).set_examples({LLAMA_EXAMPLE_MAIN}));

add_opt(common_arg(
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
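A minimal usage sketch of the two new flags (the model path and MCP server URI are illustrative placeholders, not part of this PR; per the help text above, --tools accepts either an MCP server URI or an inline JSON array of tool definitions and requires --jinja):

    ./build/bin/llama-cli -m model.gguf --jinja \
        --tools http://localhost:8000/sse \
        --tool-choice auto
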
62 changes: 62 additions & 0 deletions common/common.cpp
@@ -8,6 +8,7 @@
#include "common.h"
#include "log.h"
#include "llama.h"
#include "chat.h"

#include <algorithm>
#include <cinttypes>
@@ -1291,6 +1292,67 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
return text;
}

void common_chat_grammar_to_sampler(const common_chat_params * src,
const llama_vocab * vocab,
common_params_sampling * sparams)
{
GGML_ASSERT(src && vocab && sparams);

auto & dst = *sparams;

dst.grammar = src->grammar;
dst.grammar_lazy = src->grammar_lazy;

for (const auto & preserved : src->preserved_tokens) {
auto ids = common_tokenize(vocab, preserved, false, true);
if (ids.size() == 1) {
LOG_DBG("Preserved token: %d\n", ids[0]);
dst.preserved_tokens.insert(ids[0]);

} else {
// This may happen when using a tool call style meant for a model
// with special tokens to preserve on a model without said tokens.
LOG_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n",
preserved.c_str());
}
}

for (const auto & trigger : src->grammar_triggers) {
if (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
const auto & word = trigger.value;
auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);

if (ids.size() == 1) {
auto token = ids[0];
auto found = std::find(dst.preserved_tokens.begin(), dst.preserved_tokens.end(),
(llama_token) token);

if (found == dst.preserved_tokens.end()) {
throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word);
}

LOG_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str());
common_grammar_trigger trigger;
trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
trigger.value = word;
trigger.token = token;
dst.grammar_triggers.push_back(std::move(trigger));

} else {
LOG_DBG("Grammar trigger word: `%s`\n", word.c_str());
dst.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
}

} else {
dst.grammar_triggers.push_back(trigger);
}
}
if (dst.grammar_lazy && dst.grammar_triggers.empty()) {
throw std::runtime_error("Error: no triggers set for lazy grammar!");
}
}


//
// KV cache utils
//
14 changes: 14 additions & 0 deletions common/common.h
@@ -216,6 +216,11 @@ enum common_reasoning_format {
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
};

struct common_toolcall_params {
std::string tools = "";
std::string choice = "auto";
};

struct common_params {
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 4096; // context size
@@ -363,6 +368,9 @@ struct common_params {
std::string chat_template = ""; // NOLINT
bool use_jinja = false; // NOLINT
bool enable_chat_template = true;

struct common_toolcall_params toolcall;

common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;

std::vector<std::string> api_keys;
@@ -609,6 +617,12 @@ std::string common_detokenize(
const std::vector<llama_token> & tokens,
bool special = true);

struct common_chat_params;
void common_chat_grammar_to_sampler(const common_chat_params * src,
const llama_vocab * vocab,
common_params_sampling * sparams);


//
// KV cache utils
//
5 changes: 5 additions & 0 deletions examples/main/CMakeLists.txt
@@ -2,4 +2,9 @@ set(TARGET llama-cli)
add_executable(${TARGET} main.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

if (LLAMA_TOOLCALL)
target_link_libraries(${TARGET} PRIVATE toolcall)
endif()

target_compile_features(${TARGET} PRIVATE cxx_std_17)
143 changes: 130 additions & 13 deletions examples/main/main.cpp
@@ -5,6 +5,7 @@
#include "sampling.h"
#include "llama.h"
#include "chat.h"
#include <json.hpp>

#include <cstdio>
#include <cstring>
@@ -15,6 +16,10 @@
#include <string>
#include <vector>

#ifdef LLAMA_USE_TOOLCALL
# include "toolcall-client.h"
#endif

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
@@ -83,6 +88,99 @@ static void sigint_handler(int signo) {
}
#endif

class chat_formatter {
public:

struct result {
std::string formatted;
bool tool_was_called;
};

chat_formatter(common_params & params,
std::vector<common_chat_msg> & chat_msgs,
struct common_chat_templates * chat_templates)

: params_(params), chat_msgs_(chat_msgs), chat_templates_(chat_templates) {}

#ifdef LLAMA_USE_TOOLCALL
chat_formatter(common_params & params,
std::vector<common_chat_msg> & chat_msgs,
struct common_chat_templates * chat_templates,
const llama_vocab * vocab,
toolcall::client::ptr tc_client)

: params_(params), chat_msgs_(chat_msgs), chat_templates_(chat_templates),
vocab_(vocab), tc_client_(tc_client),
chat_format_(COMMON_CHAT_FORMAT_CONTENT_ONLY),
formatted_() {}
#endif

chat_formatter::result operator() (const std::string & role, const std::string & content) {

common_chat_msg new_msg = common_chat_parse(content, chat_format_);
new_msg.role = role;

common_chat_templates_inputs cinputs;
cinputs.use_jinja = params_.use_jinja;
cinputs.add_generation_prompt = (role == "user");
#ifdef LLAMA_USE_TOOLCALL
if (tc_client_ != nullptr) {
cinputs.tool_choice = common_chat_tool_choice_parse_oaicompat(tc_client_->tool_choice());
cinputs.tools = common_chat_tools_parse_oaicompat(tc_client_->tool_list());
}
#endif
cinputs.messages.assign(chat_msgs_.cbegin(), chat_msgs_.cend());
cinputs.messages.push_back(new_msg);
chat_msgs_.push_back(new_msg);

bool tool_was_called = false;
if (! new_msg.tool_calls.empty()) { // Call tool and re-prompt
nlohmann::json result_array = nlohmann::json::array();
for (const auto & tc : new_msg.tool_calls) {
toolcall::result_set res = tc_client_->call(tc.name, tc.arguments, tc.id);
if (! res.empty()) {
for (const auto & r : res) {
result_array.push_back(r.data);
}
}
}
common_chat_msg toolcall_msg;
toolcall_msg.role = "tool";
toolcall_msg.content = result_array.dump(-1);

cinputs.add_generation_prompt = true;
cinputs.messages.push_back(toolcall_msg);
chat_msgs_.push_back(toolcall_msg);

tool_was_called = true;
}

common_chat_params cparams = common_chat_templates_apply(chat_templates_, cinputs);
std::string formatted = cparams.prompt.substr(formatted_.size(), cparams.prompt.size());
formatted_ = cparams.prompt;

LOG_DBG("formatted: '%s'\n", formatted.c_str());

#ifdef LLAMA_USE_TOOLCALL
chat_format_ = cparams.format;
common_chat_grammar_to_sampler(&cparams, vocab_, &params_.sampling);
#endif
return chat_formatter::result{std::move(formatted), tool_was_called};
}

private:
common_params & params_;
std::vector<common_chat_msg> & chat_msgs_;
struct common_chat_templates * chat_templates_;

#ifdef LLAMA_USE_TOOLCALL
const llama_vocab * vocab_;
toolcall::client::ptr tc_client_;
common_chat_format chat_format_;
std::string formatted_;
#endif
};

int main(int argc, char ** argv) {
common_params params;
g_params = &params;
@@ -94,6 +192,11 @@ int main(int argc, char ** argv) {

auto & sparams = params.sampling;

#ifdef LLAMA_USE_TOOLCALL
// Ensure parameters are validated before the model loads
toolcall::params tc_params(params.toolcall.tools, params.toolcall.choice);
#endif

// save choice to use color for later
// (note for later: this is a slightly awkward choice)
console::init(params.simple_io, params.use_color);
@@ -266,15 +369,16 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd_inp;

bool waiting_for_first_input = false;
auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
common_chat_msg new_msg;
new_msg.role = role;
new_msg.content = content;
auto formatted = common_chat_format_single(chat_templates.get(), chat_msgs, new_msg, role == "user", g_params->use_jinja);
chat_msgs.push_back(new_msg);
LOG_DBG("formatted: '%s'\n", formatted.c_str());
return formatted;
};

#ifdef LLAMA_USE_TOOLCALL
auto tc_client = toolcall::create_client(tc_params);
if (tc_client) {
tc_client->initialize();
}
chat_formatter chat_add_and_format(params, chat_msgs, chat_templates.get(), vocab, tc_client);
#else
chat_formatter chat_add_and_format(params, chat_msgs, chat_templates.get());
#endif

std::string prompt;
{
@@ -296,6 +400,12 @@ int main(int argc, char ** argv) {
inputs.messages = chat_msgs;
inputs.add_generation_prompt = !params.prompt.empty();

#ifdef LLAMA_USE_TOOLCALL
if (tc_client != nullptr) {
inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(tc_client->tool_choice());
inputs.tools = common_chat_tools_parse_oaicompat(tc_client->tool_list());
}
#endif
prompt = common_chat_templates_apply(chat_templates.get(), inputs).prompt;
}
} else {
@@ -814,10 +924,17 @@ int main(int argc, char ** argv) {
}

if (params.enable_chat_template) {
chat_add_and_format("assistant", assistant_ss.str());
auto format_res = chat_add_and_format("assistant", assistant_ss.str());
if (format_res.tool_was_called) {
auto format_res_tok = common_tokenize(ctx, format_res.formatted, false, true);
embd_inp.insert(embd_inp.end(), format_res_tok.begin(), format_res_tok.end());
assistant_ss.str("");

} else {
is_interacting = true;
LOG("\n");
}
}
is_interacting = true;
LOG("\n");
}
}

Expand Down Expand Up @@ -884,7 +1001,7 @@ int main(int argc, char ** argv) {

bool format_chat = params.conversation_mode && params.enable_chat_template;
std::string user_inp = format_chat
? chat_add_and_format("user", std::move(buffer))
? chat_add_and_format("user", std::move(buffer)).formatted
: std::move(buffer);
// TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true);