Commit

sync from upstream. Still need to fix the streaming.
tybalex committed Oct 23, 2024
1 parent 0a1c750 commit 24a85a3
Showing 9 changed files with 2,390 additions and 38 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -1,5 +1,11 @@
# Extensions

*package-lock.json
*package.json
*node_modules
*.ipynb
.*.json

*.a
*.bat
*.bin
2 changes: 2 additions & 0 deletions Makefile
@@ -1453,6 +1453,8 @@ endif # GGML_RPC
llama-server: \
examples/server/server.cpp \
examples/server/utils.hpp \
examples/server/function-call-parser.hpp \
examples/server/function-call.hpp \
examples/server/httplib.h \
examples/server/colorthemes.css.hpp \
examples/server/style.css.hpp \
124 changes: 117 additions & 7 deletions README.md
@@ -1,14 +1,124 @@
# llama.cpp
# tools.cpp
tools.cpp is Rubra's fork of llama.cpp, offering inference of Rubra's function calling models (and others) in pure C/C++.

![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
## tools.cpp quickstart
1. Build from source:

- Mac users:
```
make
```

- NVIDIA CUDA users:
```
make LLAMA_CUDA=1
```

2. Install a helper package that fixes some rare edge cases:
```
npm install jsonrepair
```

[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)
3. Download a compatible Rubra GGUF model. For example:
```
wget https://huggingface.co/rubra-ai/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/rubra-meta-llama-3-8b-instruct.Q6_K.gguf
```

For large multi-part model files, such as [rubra-meta-llama-3-70b-instruct_Q6_K-0000*-of-00003.gguf](https://huggingface.co/rubra-ai/Meta-Llama-3-70B-Instruct-GGUF/tree/main), use the following command to merge them before proceeding to the next step:
```
./llama-gguf-split --merge rubra-meta-llama-3-70b-instruct_Q6_K-0000*-of-00003.gguf rubra-meta-llama-3-70b-instruct_Q6_K.gguf
```
This merges the multi-part model files into a single GGUF file, `rubra-meta-llama-3-70b-instruct_Q6_K.gguf`.

4. Start the OpenAI-compatible server:
```
./llama-server -ngl 37 -m rubra-meta-llama-3-8b-instruct.Q6_K.gguf --port 1234 --host 0.0.0.0 -c 8000 --chat-template llama3
```

[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
5. Test the server to ensure it is available:
```bash
curl localhost:1234/v1/chat/completions \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer tokenabc-123" \
    -d '{
        "model": "rubra-model",
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant."
            },
            {
                "role": "user",
                "content": "hello"
            }
        ]
    }'
```

6. Try a Python function-calling example:
```python
# if openai is not installed, run `pip install openai`
from openai import OpenAI
client = OpenAI(api_key="123", base_url="http://localhost:1234/v1/")

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location"],
            },
        }
    }
]
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
completion = client.chat.completions.create(
    model="rubra-model",
    messages=messages,
    tools=tools,
    tool_choice="auto"
)

print(completion)
```

The output should look like this:
```
ChatCompletion(id='chatcmpl-EmHd8kai4DVwBUOyim054GmfcyUbjiLf', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='e885974b', function=Function(arguments='{"location":"Boston"}', name='get_current_weather'), type='function')]))], created=1719528056, model='rubra-model', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=29, prompt_tokens=241, total_tokens=270))
```

That's it! Make sure you turn `stream` off when making API calls to the server, as streaming is not supported yet; support for streaming is coming soon.
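
As a minimal sketch (reusing the `client`, `tools`, and `messages` objects from the example above), you can leave streaming off explicitly and read the parsed tool call out of the response like this:

```python
import json

completion = client.chat.completions.create(
    model="rubra-model",
    messages=messages,
    tools=tools,
    tool_choice="auto",
    stream=False,  # streaming is not supported yet, so keep it disabled
)

# Assumes the model decided to call a tool, as in the output shown above.
tool_call = completion.choices[0].message.tool_calls[0]
args = json.loads(tool_call.function.arguments)  # e.g. {"location": "Boston"}
print(tool_call.function.name, args)
```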

For more function-calling examples, check out the `test_llamacpp.ipynb` notebook.
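
Below is a minimal sketch of a complete function-calling round trip, assuming the server accepts OpenAI-style `tool` role messages; `fake_get_current_weather` is a hypothetical stand-in for your own tool implementation:

```python
import json

def fake_get_current_weather(location, unit="celsius"):
    # Hypothetical stand-in for a real weather lookup.
    return json.dumps({"location": location, "temperature": 22, "unit": unit})

# Reuse `client`, `tools`, and `messages` from the example above.
response = client.chat.completions.create(
    model="rubra-model",
    messages=messages,
    tools=tools,
    tool_choice="auto",
)
assistant_message = response.choices[0].message

if assistant_message.tool_calls:
    # Echo the assistant turn (including its tool calls) back into the history.
    messages.append({
        "role": "assistant",
        "content": assistant_message.content or "",
        "tool_calls": [tc.model_dump() for tc in assistant_message.tool_calls],
    })
    # Run each requested tool locally and report its result back to the model.
    for tc in assistant_message.tool_calls:
        args = json.loads(tc.function.arguments)
        messages.append({
            "role": "tool",
            "tool_call_id": tc.id,
            "content": fake_get_current_weather(**args),
        })
    final = client.chat.completions.create(model="rubra-model", messages=messages)
    print(final.choices[0].message.content)
```

The second request hands the tool's output back to the model so it can answer in natural language.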

### Choosing a Chat Template for Different Models

| Model | Chat Template |
|---------|:-------------:|
| Llama3 | llama3 |
| Mistral | llama2 |
| Phi3 | phi3 |
| Gemma | gemma |
| Qwen2 | chatml |

For example, to run [Rubra's enhanced Phi3 model](https://huggingface.co/rubra-ai/Phi-3-mini-128k-instruct-function-calling-alpha-v1-GGUF), use the following command:

```bash
./llama-server -ngl 37 -m phi-3-mini-128k-instruct-function-calling-alpha-v1.Q8_0.gguf --port 1234 --host 0.0.0.0 -c 32000 --chat-template phi3
```

Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
==============================================================================

## Recent API changes

148 changes: 148 additions & 0 deletions examples/server/function-call-parser.hpp
@@ -0,0 +1,148 @@
#include <iostream>
#include <fstream>
#include <array>
#include <cstdio>
#include <memory>
#include <random>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include "json.hpp"

using json = nlohmann::ordered_json;

std::string generate_uuid() {
    static std::random_device rd;
    static std::mt19937 generator(rd());
    static std::uniform_int_distribution<int> distribution(0, 15);

    const char *v = "0123456789abcdef";
    std::stringstream uuid;

    for (int i = 0; i < 8; ++i) {
        uuid << v[distribution(generator)];
    }
    return uuid.str();
}


std::string jsonrepair(const std::string value) {
    std::array<char, 128> buffer;
    std::string result;
    // Write the raw JSON to a temporary file so the Node.js jsonrepair helper can read it.
    std::string tmpfile_name = "." + generate_uuid() + ".json";
    std::ofstream outfile(tmpfile_name);
    outfile << value;
    outfile.close();
    std::string command = "node jsonrepair.ts " + tmpfile_name;
    std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(command.c_str(), "r"), pclose);
    if (!pipe) {
        throw std::runtime_error("popen() failed!");
    }
    while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
        result += buffer.data();
    }
    std::remove(tmpfile_name.c_str()); // clean up the temporary file
    return result;
}


json parse_if_json(const std::string& value) {
    try {
        // json repair here
        return json::parse(jsonrepair(value));
    } catch (const json::parse_error&) {
        return value; // Return the original string if parsing fails
    }
}


std::string clean_command_string(const std::string& command_str) {
    std::string cleaned_command = std::regex_replace(command_str, std::regex(R"(\\(?!["\\/bfnrt]|u[a-fA-F0-9]{4}))"), "");
    cleaned_command = std::regex_replace(cleaned_command, std::regex(R"(\\")"), "\"");

    if (cleaned_command.size() >= 2 && cleaned_command.front() == '"' && cleaned_command.back() == '"') {
        cleaned_command = cleaned_command.substr(1, cleaned_command.size() - 2);
    }
    return cleaned_command;
}


json clean_json_strings(const std::string& input_str) {
    try {
        // json repair here
        std::string fixed_str = jsonrepair(input_str);
        json data = json::parse(fixed_str);
        for (auto& [key, value] : data.items()) {
            if (value.is_string()) {
                std::string val = value.get<std::string>();
                if (!val.empty() && (val.front() == '{' || val.front() == '[')) {
                    data[key] = parse_if_json(val);
                } else {
                    data[key] = clean_command_string(val);
                }
            } else if (value.is_object()) {
                for (auto& [k, v] : value.items()) {
                    if (v.is_string()) {
                        v = clean_command_string(v.get<std::string>());
                    }
                }
            }
        }
        return data;
    } catch (const json::parse_error& e) {
        std::cerr << "Error decoding JSON: " << e.what() << std::endl;
        return nullptr;
    }
}




std::vector<json> rubra_fc_json_tool_extractor(const std::string& output_str) {
    std::vector<json> result;
    std::cout << "Output to Parse : " << output_str << std::endl;
    if (output_str.find("endtoolcall") == std::string::npos) {
        return result;
    }

    std::vector<std::string> listOfStrToParse;
    size_t start = 0, end = 0;

    // Iterate until all instances of "endtoolcall" are processed
    while ((end = output_str.find("endtoolcall", start)) != std::string::npos) {
        std::string segment = output_str.substr(start, end - start);
        size_t pos = segment.find("starttoolcall");
        if (pos != std::string::npos) {
            // Extract the substring between "starttoolcall" and "endtoolcall"
            std::string ss = segment.substr(pos + std::string("starttoolcall").length());
            listOfStrToParse.push_back(ss);
        }
        start = end + std::string("endtoolcall").length(); // Move past the "endtoolcall"
    }

    std::vector<json> function_call_json;

    try {
        for (const auto & line : listOfStrToParse) {
            json fc = clean_json_strings(line);
            if (fc.is_null()) {
                continue; // skip tool calls whose JSON could not be repaired
            }
            if (!fc["arguments"].is_string()) {
                fc["arguments"] = fc["arguments"].dump();
            }
            function_call_json.push_back(fc);
        }
    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
    }

    for (const auto& fc : function_call_json) {
        json func_call;
        func_call["id"] = generate_uuid();
        func_call["name"] = fc["name"];
        func_call["kwargs"] = fc["arguments"];
        func_call["type"] = "function";
        result.push_back(func_call);
    }

    return result;
}