server: fix tool-call of DeepSeek R1 Qwen, return reasoning_content (Command R7B & DeepSeek R1) unless --reasoning-format none #11607

Merged 94 commits into master from r1-toolcall on Feb 13, 2025. The diff below shows changes from the first 75 commits.

Commits
d3b60b8
minja: enhance backfill of templates w/o tools description (use examp…
ochafik Feb 3, 2025
87de852
pass vocab to common_chat_params_init
ochafik Feb 3, 2025
130ca22
DeepSeek R1: parse thoughts / return in separate field in API (non st…
ochafik Feb 3, 2025
04d511b
Avoid double bos w/ jinja
ochafik Feb 3, 2025
2834587
server/oai: ensure content is null when there are tool calls
ochafik Feb 3, 2025
c80cb30
update logs
ochafik Feb 3, 2025
0871628
rename tests
ochafik Feb 3, 2025
73d08d4
tool-call: allow `--jinja --chat-template chatml`
ochafik Feb 3, 2025
04be723
tool-call: fix command-r7b parsing when response is multiline
ochafik Feb 3, 2025
ae9d581
tool-calls: add DeepSeek R1 Qwen 7B to server test_hello_world
ochafik Feb 3, 2025
19bea4e
tell DS R1 not to overthink (weather test)
ochafik Feb 3, 2025
5e6f2a2
add deepseek models to server tool call section in readme
ochafik Feb 3, 2025
1e9acd2
tool-call: allow `--jinja --chat-template chatml`
ochafik Feb 3, 2025
77ae97e
Update test_tool_call.py
ochafik Feb 3, 2025
a76073c
minimize diffs
ochafik Feb 3, 2025
cf83623
fix typo
ochafik Feb 3, 2025
5d18d76
fix double bos issue (drop bos/eos tokens from jinja template)
ochafik Feb 3, 2025
aa98e59
fix bad merge
ochafik Feb 3, 2025
2b3c482
fix build / rm diff
ochafik Feb 3, 2025
4cb0e1d
Merge branch 'jinja-chatml' into r1-toolcall
ochafik Feb 3, 2025
b2dd490
add missing try catch around jinja parsing to default to chatml
ochafik Feb 3, 2025
08271b5
Merge branch 'jinja-chatml' into r1-toolcall
ochafik Feb 3, 2025
df3474e
tool-calls: r1: add missing <|tool▁calls▁end|> to grammar!
ochafik Feb 3, 2025
c397bd1
tweak delta logic
ochafik Feb 3, 2025
569610e
tool-calls: accommodate variety of wrong tool call opening tags both …
ochafik Feb 3, 2025
d73448d
Simplify default chatml logic
ochafik Feb 3, 2025
0be7f65
Merge branch 'jinja-chatml' into r1-toolcall
ochafik Feb 3, 2025
7dc271f
tool-calls: add deepseek r1 template + accommodate broken official te…
ochafik Feb 3, 2025
c6214ee
rm unneeded vocab
ochafik Feb 3, 2025
1c302e1
simpler hacky fixes for original broken template (+ fix minja example…
ochafik Feb 3, 2025
108da90
sync: minja https://github.com/google/minja/pull/46
ochafik Feb 3, 2025
bc6d910
Merge branch 'master' into r1-toolcall
ochafik Feb 3, 2025
11c1f0c
actually we want eos_token in the template to infer tool call example…
ochafik Feb 3, 2025
30ea359
update to minja's new api
ochafik Feb 3, 2025
bbd45bf
sync: minja
ochafik Feb 4, 2025
bff549d
simplify hack to fix original template's backfill from minja
ochafik Feb 4, 2025
ce28224
tool-call: r1: add one more trigger approx "<|tool calls begin|>"
ochafik Feb 4, 2025
e84ee88
r1: fix inadvertent newline in grammar before <|tool▁call▁end|>
ochafik Feb 4, 2025
18a11f4
tool-call: r1: fix grammar
ochafik Feb 4, 2025
9a6847c
move trigger_words init inside non-llguidance branch
ochafik Feb 4, 2025
a682d12
fix / test parsing of r1 parser
ochafik Feb 4, 2025
f0154a6
Fix / test models/templates/llama-cpp-deepseek-r1.jinja
ochafik Feb 4, 2025
326e700
update test_calc_result
ochafik Feb 4, 2025
78b47bb
fix test_calc_result
ochafik Feb 4, 2025
86994db
fix spaces
ochafik Feb 4, 2025
09caa63
`sync`: minja
ochafik Feb 4, 2025
b152729
Update test-chat.cpp
ochafik Feb 4, 2025
56a14dd
fix mistral chat test: need empty tokens
ochafik Feb 4, 2025
f12e350
Update chat.cpp
ochafik Feb 4, 2025
d43e4f6
Merge branch 'sync-minja-4' into r1-toolcall
ochafik Feb 4, 2025
812544a
server: check that content is null when we get tool_calls
ochafik Feb 4, 2025
d44eb95
tool-call: ensure we don't return content when there are tool calls /…
ochafik Feb 4, 2025
b6e14a4
fix mistral expectation
ochafik Feb 4, 2025
1f5ec59
ensure deepseek r1 thoughts parsed even w/o tool calls
ochafik Feb 4, 2025
438ce0b
fix test-chat
ochafik Feb 4, 2025
21f2071
Update chat.cpp
ochafik Feb 4, 2025
b5b117f
Merge branch 'sync-minja-4' into r1-toolcall
ochafik Feb 4, 2025
0db9881
Fix r1 grammar since we made <|tool▁calls▁begin|> optional (triggerin…
ochafik Feb 4, 2025
d1b6691
r1: revert making <|tool▁calls▁begin|> optional as somehow sampling t…
ochafik Feb 4, 2025
39c1d81
return thoughts in reasoning_content field
ochafik Feb 4, 2025
b2d1728
update readme section about common model tool call formats
ochafik Feb 4, 2025
933f7a1
Merge branch 'master' into r1-toolcall
ochafik Feb 4, 2025
5d60ceb
Update test_tool_call.py
ochafik Feb 4, 2025
1f1f06a
Merge branch 'master' into r1-toolcall
ochafik Feb 5, 2025
9d7c3cc
--think to force any model to return reasoning_content (or just parse…
ochafik Feb 5, 2025
d20c2ce
Merge branch 'r1-toolcall' of github.com:ochafik/llama.cpp into r1-to…
ochafik Feb 5, 2025
f3e9f8b
fix test_thoughts
ochafik Feb 5, 2025
3841a16
fix compiler warning about parens
ochafik Feb 5, 2025
e6d9b52
align Command R7B w/ --think / reasoning_content behaviour
ochafik Feb 5, 2025
39b50c3
Update README.md
ochafik Feb 5, 2025
0917e0a
fix --think arg env
ochafik Feb 5, 2025
098629d
disable some failing chatml tests
ochafik Feb 5, 2025
33efcb3
Update README.md
ochafik Feb 5, 2025
994301d
use existing string_strip
ochafik Feb 5, 2025
d1a0640
revert tool example backfill change - command 7rb just needs the righ…
ochafik Feb 5, 2025
cc2c712
Merge remote-tracking branch 'origin/master' into r1-toolcall
ochafik Feb 8, 2025
c0f972b
Use --reasoning-format, remove forced thinking for now
ochafik Feb 8, 2025
af63886
return reasoning_content before content
ochafik Feb 8, 2025
a59fde2
update model template / format mapping
ochafik Feb 8, 2025
b829cab
fix test-chat
ochafik Feb 8, 2025
95cddfd
rm thoughts from generic parser
ochafik Feb 9, 2025
e598e7a
sync: minja (https://github.com/google/minja/pull/52)
ochafik Feb 9, 2025
91542ca
tool-calls: allow r1 output to miss <think> opening tag (since latest…
ochafik Feb 9, 2025
8d82be9
sync: minja (https://github.com/ggerganov/llama.cpp/pull/11774)
ochafik Feb 9, 2025
30dcfaa
rm wrong warning in command-r parser (when normal text)
ochafik Feb 9, 2025
e1bff8f
update deepseek r1 templates (+ put update commands in ./scripts/get_…
ochafik Feb 9, 2025
a29dc92
fix server test_tool_calls.py
ochafik Feb 9, 2025
ea2f41e
add models/templates/README.md
ochafik Feb 9, 2025
8409bf1
fix test_calc_result & test_thoughts
ochafik Feb 9, 2025
01db429
fix test-chat (update delta to latest r1 template change)
ochafik Feb 9, 2025
37a4bb2
Merge remote-tracking branch 'origin/master' into r1-toolcall
ochafik Feb 12, 2025
d52579a
prefer json::at to operator[] in chat.cpp
ochafik Feb 13, 2025
4700245
Merge remote-tracking branch 'origin/master' into r1-toolcall
ochafik Feb 13, 2025
043cb99
Apply suggestions from code review
ochafik Feb 13, 2025
9 changes: 9 additions & 0 deletions common/arg.cpp
@@ -1975,6 +1975,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.use_jinja = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
add_opt(common_arg(
{"--think"},
"*experimental* thinking mode (default: disabled)\n"
"returns reasoning_content in messages, forcing model to think unless it supports native <think> tags (DeepSeek R1, Command R7B)\n"
"only supported for non-streamed responses",
[](common_params & params) {
params.think = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
@ngxson (Collaborator) commented on Feb 6, 2025:
IMO the --think flag is not very intuitive; it should be something like --reasoning-format, though personally I'd still prefer to do this on a per-request basis.

Also, to be future-proof, we should make this flag accept a param. For example, --reasoning-format deepseek would return it as reasoning_content. Again, this is because we are pretty sure that OpenAI will break the whole thing in the near future.

Collaborator:

From a usability standpoint, --think feels a bit more intuitive / memorable to me in terms of what it does.

Other alternatives might be --separate-thinking or --extract-reasoning or --format-reasoning or...?

Collaborator:

I mean, what's non-intuitive about it is that the model will still think even without --think being added. This flag is supposed to force the model to start the reasoning process, not to enable/disable it completely.

@ochafik (Collaborator, Author) replied on Feb 7, 2025:

Tbh I'm not really a fan of a query parameter yet, mostly because:

  • I see thinking mode as a static property of the model and its orchestration (much like the template)
  • It's a new non-standard API to define (more naming headaches)
  • It makes llama-server less compatible w/ DeepSeek's API, not more.
  • We can always decide to add it later if we start w/ a flag

Re: flags, I think there may be room for two: one that controls the reasoning behaviour (extract, leave inline, force), and one for the return format. For the first one, how about:

  • --reasoning=extract → Parses DeepSeek R1 & Command R7B thoughts (default)
  • --reasoning=inline → Leaves thoughts inline in the content (format is model-specific)
  • --reasoning=force → Coerces non-thinking models to think (edit: maybe force-experimental for now)

As for the format, there are already two APIs out there:

  • DeepSeek's reasoning_content
  • Cohere's message.tool_plan, alas only as a prelude to tool calls (but it seems easy to trick R7B into thinking more generally, just needs a tweaked template)

I'd favour just sticking to reasoning_content for now until OpenAI announces their own format (and when they do, default to OpenAI's format and offer --reasoning-format=deepseek for backwards compatibility). OR decide to create our own format now / default --reasoning-format=llama.cpp that returns thoughts in the message.thoughts field, for instance.

WDYT?
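For illustration, the two existing field placements mentioned above would look roughly like this. This is a sketch only: the field contents are invented, and it assumes nlohmann::json is available (llama.cpp vendors it); the actual output keys are exactly what this thread is debating.

```cpp
#include <nlohmann/json.hpp>
#include <iostream>

using json = nlohmann::json;

int main() {
    // DeepSeek-style: thoughts live in a dedicated sibling "reasoning_content"
    // field, whether or not the message carries tool calls.
    json deepseek_style = {
        {"role", "assistant"},
        {"reasoning_content", "The user wants the weather, so I should call get_weather."},
        {"content", nullptr},
        {"tool_calls", json::array()},
    };

    // Cohere-style: thoughts only appear as a "tool_plan" prelude to tool calls.
    json cohere_style = {
        {"role", "assistant"},
        {"tool_plan", "I will look up the weather first."},
        {"tool_calls", json::array()},
    };

    std::cout << deepseek_style.dump(2) << "\n" << cohere_style.dump(2) << "\n";
}
```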

Collaborator:

> Tbh I'm not really a fan of a query parameter yet

Seems OK to me, but eventually I think someone will add this as a per-request param.

Re: having 2 flags for behavior / format, this seems more reasonable to me. From a functional programming perspective, it can be expressed as response = format(behavior(original_generated_content)).

But I think your idea still mixes these 2 layers.

For --reasoning extract|inline|force:

  • I'm not sure what force is supposed to do, but it seems like it needs some prompt engineering, so I think you should consider that; maintaining prompts can be a burden
  • For inline, does that mean reasoning can appear in the middle of generation? For example, content..think..content..think. Please lmk if I understand correctly.

For --reasoning-format, I don't get why we'd want to invent a new --reasoning-format llama.cpp that puts things inside message.thoughts. IMO we should keep things simple until OpenAI drops their format. Probably we can have --reasoning-format deepseek|none and set deepseek as the default for now, then change the default to oai once we have the OpenAI format.

@ngxson (Collaborator) replied on Feb 7, 2025:

> But I think your idea still mixes these 2 layers.

IMO if we want --reasoning to control the behavior, then it should affect the generation phase (for example, control grammar / logits bias). So it should have 3 values:

  • enabled: the model behaves as usual
  • disabled: we never allow the model to use the <think> token ==> control via logits bias
  • force: force the model to think, maybe via grammar or prompt engineering?

Then for --reasoning-format, it should only "transform" the result into the desired format (a.k.a. a pure function); we can have 3 values:

  • deepseek: put content inside reasoning_content
  • none: do not format, simply forward all the generated tokens to the user
  • and then oai can be added in the future
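A minimal sketch of that "pure function" view of --reasoning-format, assuming a single leading <think>…</think> block (the real parsers in common/chat.cpp are template-specific and grammar-aware, so treat this as illustrative only):

```cpp
#include <iostream>
#include <string>
#include <utility>

// Returns {reasoning_content, content}. With "none", or when no think tags
// are found, the raw text is forwarded untouched as content.
static std::pair<std::string, std::string>
apply_reasoning_format(const std::string & raw, const std::string & format) {
    if (format == "deepseek") {
        const std::string open = "<think>", close = "</think>";
        auto start = raw.find(open);
        auto end   = raw.find(close);
        if (start != std::string::npos && end != std::string::npos && start < end) {
            return {
                raw.substr(start + open.size(), end - start - open.size()),
                raw.substr(end + close.size()),
            };
        }
    }
    return {"", raw};
}

int main() {
    auto [reasoning, content] =
        apply_reasoning_format("<think>2+2 is 4</think>The answer is 4.", "deepseek");
    std::cout << "reasoning_content: " << reasoning << "\ncontent: " << content << "\n";
}
```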

Collaborator (Author):

> IMO if we want --reasoning to control the behavior, then it should affect the generation phase (for example, control grammar/logits bias). So it should have 3 values:
>
>   • enabled: the model behaves as usual
>   • disabled: we never allow model to use <think> token ==> control via logits bias
>   • force: force the model to think, maybe use grammar or prompt engineering?

@ngxson Makes sense; I've removed the forced thinking from this PR and will explore it again as a follow-up (also, I see more of a case for that option as a query param, while reasoning-format now has stronger flag vibes).

> I'm not sure what force is supposed to do, but seems like it needs some prompt engineering so I think you should consider that, maintaining prompts can be a burden

Good point. In earlier experiments I tried controlling the entire tool-call process (even grammar generation) from Jinja templates; I might play with this again.

> Then for the --reasoning-format, it should only "transform" the result into the desired format (a.k.a a pure function), we can have 3 values:
> deepseek: put content inside reasoning_content
> none: do not format, simply forward all the generated tokens to user
> and then oai can be added in the future

Updated the code (defaulting to deepseek), thanks!

@ochafik (Collaborator, Author) commented on Feb 11, 2025:

Note that I've updated the code to the latest DeepSeek template changes (they added a trailing <think> 😅; updated minja accordingly: #11774 (comment)).
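To make the implication concrete: once the template itself ends the prompt with a trailing <think>, the completion begins inside the think block, so the parser must tolerate a missing opening tag. A hedged sketch of that tolerance (hypothetical helper; the actual handling lives in the R1 parser in common/chat.cpp):

```cpp
#include <string>
#include <utility>

// Tolerant split: if "</think>" appears, everything before it is treated as
// thoughts, even when the opening "<think>" was already emitted as part of
// the prompt by the template and is absent from the completion.
static std::pair<std::string, std::string>
split_thoughts_tolerant(const std::string & raw) {
    const std::string open = "<think>", close = "</think>";
    auto end = raw.find(close);
    if (end == std::string::npos) {
        return {"", raw}; // no thoughts to extract
    }
    auto start = raw.find(open);
    size_t begin = (start != std::string::npos && start < end) ? start + open.size() : 0;
    return {raw.substr(begin, end - begin), raw.substr(end + close.size())};
}
```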

Collaborator (Author):

Hey @ngxson, any more concerns / suggestions about this PR?

Collaborator:

Sorry, I've been a bit busy recently; I'll do a review later today or tomorrow.

add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE",
string_format(
423 changes: 307 additions & 116 deletions common/chat.cpp

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions common/chat.hpp
@@ -19,6 +19,7 @@ struct common_chat_inputs {
bool stream;
std::string grammar;
bool add_generation_prompt = true;
bool think = false;
};

enum common_chat_format {
@@ -28,11 +29,13 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_LLAMA_3_X,
COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
COMMON_CHAT_FORMAT_DEEPSEEK_R1,
COMMON_CHAT_FORMAT_DEEPSEEK_R1_THINK,
COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
COMMON_CHAT_FORMAT_HERMES_2_PRO,
COMMON_CHAT_FORMAT_COMMAND_R7B,
COMMON_CHAT_FORMAT_COMMAND_R7B_THINK,

COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
3 changes: 2 additions & 1 deletion common/common.h
@@ -346,6 +346,7 @@ struct common_params {
std::string chat_template = ""; // NOLINT
bool use_jinja = false; // NOLINT
bool enable_chat_template = true;
bool think = false; // return reasoning_content, force model to think unless it supports native <think> tags.

std::vector<std::string> api_keys;

@@ -623,7 +624,7 @@ struct common_chat_msg {
std::string role;
std::string content;
std::vector<common_tool_call> tool_calls;
std::string tool_plan = "";
std::string reasoning_content = "";
};

// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
12 changes: 6 additions & 6 deletions common/sampling.cpp
@@ -151,12 +151,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

lparams.no_perf = params.no_perf;

std::vector<const char *> trigger_words;
trigger_words.reserve(params.grammar_trigger_words.size());
for (const auto & str : params.grammar_trigger_words) {
trigger_words.push_back(str.word.c_str());
}

struct llama_sampler * grmr;
if (params.grammar.compare(0, 11, "%llguidance") == 0) {
#ifdef LLAMA_USE_LLGUIDANCE
@@ -165,6 +159,12 @@
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE
} else {
std::vector<const char *> trigger_words;
trigger_words.reserve(params.grammar_trigger_words.size());
for (const auto & str : params.grammar_trigger_words) {
trigger_words.push_back(str.word.c_str());
}

grmr = params.grammar_lazy
? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
trigger_words.data(), trigger_words.size(),
125 changes: 74 additions & 51 deletions examples/server/README.md
@@ -127,6 +127,7 @@ The project is under active development, and we are [looking for feedback and co
| `--grammar-file FNAME` | file to read grammar from |
| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
| `--jinja` | Enable experimental Jinja templating engine (required for tool use) |
| `--think` | Enable experimental thinking mode (extracts DeepSeek R1 & Command R7B's native thinking tags and forces any other model to think before responding, resulting thoughts are in the `reasoning_content` output field) (requires `--jinja`) |

**Example-specific params**

Expand Down Expand Up @@ -1136,61 +1137,74 @@ curl http://localhost:8080/v1/chat/completions \

| Template | Format |
|----------|--------|
| CohereForAI-c4ai-command-r-plus-default.jinja | generic tool calls |
| CohereForAI-c4ai-command-r-plus-rag.jinja | generic tool calls |
| CohereForAI-c4ai-command-r-plus-tool_use.jinja | generic tool calls |
| MiniMaxAI-MiniMax-Text-01.jinja | generic tool calls |
| NexaAIDev-Octopus-v2.jinja | generic tool calls |
| NousResearch-Hermes-2-Pro-Llama-3-8B-default.jinja | generic tool calls |
| NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja | hermes 2 pro tool calls |
| NousResearch-Hermes-2-Pro-Mistral-7B-default.jinja | generic tool calls |
| NousResearch-Hermes-2-Pro-Mistral-7B-tool_use.jinja | hermes 2 pro tool calls |
| NousResearch-Hermes-3-Llama-3.1-70B-default.jinja | generic tool calls |
| NousResearch-Hermes-3-Llama-3.1-70B-tool_use.jinja | hermes 2 pro tool calls |
| OrionStarAI-Orion-14B-Chat.jinja | generic tool calls |
| Qwen-QwQ-32B-Preview.jinja | hermes 2 pro tool calls |
| Qwen-Qwen2-7B-Instruct.jinja | generic tool calls |
| Qwen-Qwen2-VL-7B-Instruct.jinja | generic tool calls |
| Qwen-Qwen2.5-7B-Instruct.jinja | hermes 2 pro tool calls |
| Qwen-Qwen2.5-Math-7B-Instruct.jinja | hermes 2 pro tool calls |
| TheBloke-FusionNet_34Bx2_MoE-AWQ.jinja | generic tool calls |
| abacusai-Fewshot-Metamath-OrcaVicuna-Mistral.jinja | generic tool calls |
| bofenghuang-vigogne-2-70b-chat.jinja | generic tool calls |
| databricks-dbrx-instruct.jinja | generic tool calls |
| deepseek-ai-DeepSeek-Coder-V2-Instruct.jinja | generic tool calls |
| deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja | deepseek r1 tool calls |
| deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja | deepseek r1 tool calls |
| deepseek-ai-DeepSeek-R1-Distill-Qwen-7B.jinja | deepseek r1 tool calls |
| deepseek-ai-DeepSeek-V2.5.jinja | deepseek r1 tool calls |
| deepseek-ai-deepseek-coder-33b-instruct.jinja | generic tool calls |
| google-gemma-2-2b-it.jinja | generic tool calls |
| google-gemma-7b-it.jinja | generic tool calls |
| indischepartij-MiniCPM-3B-OpenHermes-2.5-v2.jinja | generic tool calls |
| mattshumer-Reflection-Llama-3.1-70B.jinja | generic tool calls |
| meetkai-functionary-medium-v3.2.jinja | functionary v3.2 tool calls |
| meta-llama-Llama-3.1-8B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) |
| meta-llama-Llama-3.2-3B-Instruct.jinja | llama 3.x tool calls |
| meta-llama-Llama-3.3-70B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) |
| meta-llama-Meta-Llama-3.1-8B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) |
| microsoft-Phi-3-medium-4k-instruct.jinja | generic tool calls |
| microsoft-Phi-3-mini-4k-instruct.jinja | generic tool calls |
| microsoft-Phi-3-small-8k-instruct.jinja | generic tool calls |
| microsoft-Phi-3.5-mini-instruct.jinja | generic tool calls |
| microsoft-Phi-3.5-vision-instruct.jinja | generic tool calls |
| mistralai-Mistral-7B-Instruct-v0.2.jinja | generic tool calls |
| mistralai-Mistral-Large-Instruct-2407.jinja | mistral nemo tool calls |
| mistralai-Mistral-Large-Instruct-2411.jinja | generic tool calls |
| mistralai-Mistral-Nemo-Instruct-2407.jinja | mistral nemo tool calls |
| mistralai-Mixtral-8x7B-Instruct-v0.1.jinja | generic tool calls |
| mlabonne-AlphaMonarch-7B.jinja | generic tool calls |
| nvidia-Llama-3.1-Nemotron-70B-Instruct-HF.jinja | llama 3.x tool calls (w/ builtin tools) |
| openchat-openchat-3.5-0106.jinja | generic tool calls |
| teknium-OpenHermes-2.5-Mistral-7B.jinja | generic tool calls |
| CohereForAI-c4ai-command-r-plus-default.jinja | Generic |
| CohereForAI-c4ai-command-r-plus-rag.jinja | Generic |
| CohereForAI-c4ai-command-r-plus-tool_use.jinja | Generic |
| CohereForAI-c4ai-command-r7b-12-2024-default.jinja | Command R7B |
| CohereForAI-c4ai-command-r7b-12-2024-rag.jinja | Command R7B |
| CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja | Command R7B |
| Infinigence-Megrez-3B-Instruct.jinja | Generic |
| LGAI-EXAONE-EXAONE-3.5-7.8B-Instruct.jinja | Generic |
| MiniMaxAI-MiniMax-Text-01.jinja | Generic |
| NexaAIDev-Octopus-v2.jinja | Generic |
| NousResearch-Hermes-2-Pro-Llama-3-8B-default.jinja | Generic |
| NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja | Hermes 2 Pro |
| NousResearch-Hermes-2-Pro-Mistral-7B-default.jinja | Generic |
| NousResearch-Hermes-2-Pro-Mistral-7B-tool_use.jinja | Hermes 2 Pro |
| NousResearch-Hermes-3-Llama-3.1-70B-default.jinja | Generic |
| NousResearch-Hermes-3-Llama-3.1-70B-tool_use.jinja | Hermes 2 Pro |
| OrionStarAI-Orion-14B-Chat.jinja | Generic |
| Qwen-QwQ-32B-Preview.jinja | Hermes 2 Pro |
| Qwen-Qwen2-7B-Instruct.jinja | Generic |
| Qwen-Qwen2-VL-7B-Instruct.jinja | Generic |
| Qwen-Qwen2.5-7B-Instruct.jinja | Hermes 2 Pro |
| Qwen-Qwen2.5-Math-7B-Instruct.jinja | Hermes 2 Pro |
| THUDM-glm-4-9b-chat.jinja | Generic |
| THUDM-glm-edge-1.5b-chat.jinja | Generic |
| TheBloke-FusionNet_34Bx2_MoE-AWQ.jinja | Generic |
| abacusai-Fewshot-Metamath-OrcaVicuna-Mistral.jinja | Generic |
| ai21labs-AI21-Jamba-1.5-Large.jinja | Generic |
| bofenghuang-vigogne-2-70b-chat.jinja | Generic |
| databricks-dbrx-instruct.jinja | Generic |
| deepseek-ai-DeepSeek-Coder-V2-Instruct.jinja | Generic |
| deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja | DeepSeek R1 |
| deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja | DeepSeek R1 |
| deepseek-ai-DeepSeek-R1-Distill-Qwen-7B.jinja | DeepSeek R1 |
| deepseek-ai-DeepSeek-V2.5.jinja | DeepSeek R1 |
| deepseek-ai-deepseek-coder-33b-instruct.jinja | Generic |
| fireworks-ai-llama-3-firefunction-v2.jinja | FireFunction v2 |
| google-gemma-2-2b-it.jinja | Generic |
| google-gemma-7b-it.jinja | Generic |
| ibm-granite-granite-3.1-8b-instruct.jinja | Generic |
| indischepartij-MiniCPM-3B-OpenHermes-2.5-v2.jinja | Generic |
| mattshumer-Reflection-Llama-3.1-70B.jinja | Generic |
| meetkai-functionary-medium-v3.1.jinja | Functionary v3.1 Llama 3.1 |
| meetkai-functionary-medium-v3.2.jinja | Functionary v3.2 |
| meta-llama-Llama-2-7b-chat-hf.jinja | Generic |
| meta-llama-Llama-3.1-8B-Instruct.jinja | Llama 3.x |
| meta-llama-Llama-3.2-3B-Instruct.jinja | Llama 3.x |
| meta-llama-Llama-3.3-70B-Instruct.jinja | Llama 3.x |
| meta-llama-Meta-Llama-3.1-8B-Instruct.jinja | Llama 3.x |
| microsoft-Phi-3-medium-4k-instruct.jinja | Generic |
| microsoft-Phi-3-mini-4k-instruct.jinja | Generic |
| microsoft-Phi-3-small-8k-instruct.jinja | Generic |
| microsoft-Phi-3.5-mini-instruct.jinja | Generic |
| microsoft-Phi-3.5-vision-instruct.jinja | Generic |
| mistralai-Mistral-7B-Instruct-v0.2.jinja | Generic |
| mistralai-Mistral-Large-Instruct-2407.jinja | Mistral Nemo |
| mistralai-Mistral-Large-Instruct-2411.jinja | Generic |
| mistralai-Mistral-Nemo-Instruct-2407.jinja | Mistral Nemo |
| mistralai-Mixtral-8x7B-Instruct-v0.1.jinja | Generic |
| mlabonne-AlphaMonarch-7B.jinja | Generic |
| nvidia-Llama-3.1-Nemotron-70B-Instruct-HF.jinja | Llama 3.x |
| openchat-openchat-3.5-0106.jinja | Generic |
| teknium-OpenHermes-2.5-Mistral-7B.jinja | Generic |

This table can be generated with:

```bash
./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null
```

</details>

@@ -1202,11 +1216,20 @@ curl http://localhost:8080/v1/chat/completions \

```shell
# Native support:

llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L
llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M
llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M

# Native support for DeepSeek R1 works best w/ our own template (official template buggy)

llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L --think \
--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja

llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M --think \
--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja

# Native support requires the right template for these GGUFs:

llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \
Expand All @@ -1218,7 +1241,7 @@ curl http://localhost:8080/v1/chat/completions \
llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \
--chat-template-file <( python scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 tool_use )

llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L \
llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L --think \
--chat-template-file <( python scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 tool_use )

# Generic format support
11 changes: 6 additions & 5 deletions examples/server/server.cpp
@@ -173,6 +173,7 @@ struct slot_params {
{"grammar_trigger_words", grammar_trigger_words},
{"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
{"preserved_tokens", sampling.preserved_tokens},
{"chat_format", common_chat_format_name(oaicompat_chat_format)},
{"samplers", samplers},
{"speculative.n_max", speculative.n_max},
{"speculative.n_min", speculative.n_min},
@@ -740,12 +741,12 @@ struct server_task_result_cmpl_final : server_task_result {
}

json message {
{"content", msg.content},
{"content", msg.content == "" && !tool_calls.empty() ? json() : json(msg.content)},
{"tool_calls", tool_calls},
{"role", "assistant"},
};
if (!msg.tool_plan.empty()) {
message["tool_plan"] = msg.tool_plan;
if (!msg.reasoning_content.empty()) {
message["reasoning_content"] = msg.reasoning_content;
}

json choice {
@@ -4054,7 +4055,7 @@ int main(int argc, char ** argv) {
}

auto body = json::parse(req.body);
json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
json data = oaicompat_completion_params_parse(body, params.use_jinja, params.think, ctx_server.chat_templates);

return handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION,
Expand All @@ -4067,7 +4068,7 @@ int main(int argc, char ** argv) {
// same with handle_chat_completions, but without inference part
const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
auto body = json::parse(req.body);
json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
json data = oaicompat_completion_params_parse(body, params.use_jinja, params.think, ctx_server.chat_templates);
res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
};
