server: error handling #5776

Closed · wants to merge 8 commits

2 changes: 1 addition & 1 deletion Makefile
@@ -638,7 +638,7 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@

COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h common/error.h
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o

common.o: common/common.cpp $(COMMON_H_DEPS)
3 changes: 2 additions & 1 deletion common/CMakeLists.txt
@@ -62,7 +62,8 @@ add_library(${TARGET} STATIC
grammar-parser.cpp
train.h
train.cpp
)
error.h
)

if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
24 changes: 24 additions & 0 deletions common/error.h
@@ -0,0 +1,24 @@
#pragma once

#include <cstdio>
#include <exception>
#include <string>

class llama_error : public std::exception
{
private:
std::string _type;
std::string _message;

public:
llama_error(const std::string & type, const std::string & message)
:
_type(type),
_message(message)
{
fprintf(stderr, "ERROR [%s]: %s\n", type.c_str(), message.c_str());
}

inline const std::string & type() const { return _type; }
inline const std::string & message() const { return _message; }
};
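As a quick illustration (not part of the diff), here is roughly how the class is meant to be used. The helper below is made up for the example, but the error type follows the `category.detail` naming used elsewhere in this PR (`grammar.invalid`, `grammar.empty`, `request.invalid_json`):

```cpp
#include "error.h"   // common/error.h from this PR

#include <cstdio>
#include <string>

// illustrative helper: validate an input and report failure as a typed error
static void check_grammar(const std::string & grammar) {
    if (grammar.empty()) {
        throw llama_error("grammar.empty", "empty grammar");
    }
    // ... real validation would go here ...
}

int main() {
    try {
        check_grammar("");
    } catch (const llama_error & err) {
        // callers branch on type() and show message() to the user
        fprintf(stderr, "caught [%s]: %s\n", err.type().c_str(), err.message().c_str());
    }
    return 0;
}
```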
5 changes: 3 additions & 2 deletions common/grammar-parser.cpp
@@ -1,4 +1,6 @@
#include "grammar-parser.h"
#include "error.h"

#include <cstdint>
#include <cwchar>
#include <string>
@@ -280,8 +282,7 @@ namespace grammar_parser {
}
return state;
} catch (const std::exception & err) {
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
return parse_state();
throw llama_error("grammar.invalid", std::string(__func__) + ": error parsing grammar: " + err.what());
}
}

11 changes: 8 additions & 3 deletions common/sampling.cpp
@@ -1,4 +1,5 @@
#include "sampling.h"
#include "error.h"

struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
struct llama_sampling_context * result = new llama_sampling_context();
@@ -8,13 +9,17 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_

// if there is a grammar, parse it
if (!params.grammar.empty()) {
result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
try {
result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
} catch (const llama_error & err) {
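// release the partially initialized context before propagating the error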
delete result;
throw err;
}

// will be empty (default) if there are parse errors
if (result->parsed_grammar.rules.empty()) {
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
delete result;
return nullptr;
throw llama_error("grammar.empty", std::string(__func__) + ": empty grammar");
}

std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
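Because `llama_sampling_init` now throws on a bad grammar instead of returning `nullptr`, its callers need a `try`/`catch`, which is what `server.cpp` adds below around `launch_slot_with_data`. A minimal sketch of the new calling pattern follows; the wrapper name and include paths are illustrative, not part of the PR:

```cpp
#include "common/sampling.h"   // adjust include paths to your build setup
#include "common/error.h"

#include <cstdio>

// illustrative wrapper: translate the thrown llama_error back into a nullable result
static llama_sampling_context * init_sampling_or_null(const llama_sampling_params & sparams) {
    try {
        // may throw llama_error("grammar.invalid", ...) or llama_error("grammar.empty", ...)
        return llama_sampling_init(sparams);
    } catch (const llama_error & err) {
        fprintf(stderr, "sampling init failed [%s]: %s\n",
                err.type().c_str(), err.message().c_str());
        return nullptr;
    }
}
```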
14 changes: 14 additions & 0 deletions examples/server/README.md
@@ -302,6 +302,20 @@ Notice that each `probs` is an array of length `n_probs`.
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)

In case of an error, the error details are returned as follows:
```json
{
  "error": {
    "type": "grammar.invalid",
    "message": "parse: error parsing grammar: expecting name at ("
  }
}
```
where:
- `type` - unique identifier for this kind of error (e.g. `grammar.invalid`, `grammar.empty`)
- `message` - human-readable description of the error


- **POST** `/tokenize`: Tokenize a given text.

*Options:*
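A native client can pick the documented error object apart with the same nlohmann JSON library the server already uses. This is a sketch only, assuming the response body has already been read into a string and that `json.hpp` is reachable on the include path:

```cpp
#include <iostream>
#include <string>

#include "json.hpp"   // nlohmann::json, vendored by the examples

using json = nlohmann::json;

int main() {
    // body as it might arrive from the server on a failed request (illustrative values)
    const std::string body =
        R"({"error":{"type":"grammar.invalid","message":"parse: error parsing grammar: expecting name at ("}})";

    const json parsed = json::parse(body);
    if (parsed.contains("error")) {
        const auto & err = parsed.at("error");
        std::cerr << "[" << err.value("type", "unknown") << "] "
                  << err.value("message", "") << "\n";
    }
    return 0;
}
```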
470 changes: 236 additions & 234 deletions examples/server/completion.js.hpp

Large diffs are not rendered by default.

20 changes: 10 additions & 10 deletions examples/server/public/completion.js
@@ -96,18 +96,18 @@ export async function* llama(prompt, params = {}, config = {}) {
}
}
if (result.error) {
result.error = JSON.parse(result.error);
if (result.error.content.includes('slot unavailable')) {
// Throw an error to be caught by upstream callers
throw new Error('slot unavailable');
} else {
console.error(`llama.cpp error: ${result.error.content}`);
try {
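// the server wraps the error object as a JSON string inside the result's 'content' field, hence the nested JSON.parse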
result.error = JSON.parse(JSON.parse(result.error).content).error;
if (result.error.message.includes('slot unavailable')) {
// Throw an error to be caught by upstream callers
throw new Error('slot unavailable');
} else {
console.error(`llama.cpp error [${result.error.type}]: ${result.error.message}`);
}
} catch(e) {
if (e instanceof Error && e.message === 'slot unavailable') {
throw e; // propagate the signal to upstream callers instead of swallowing it
}
console.error(`llama.cpp error ${result.error}`);
}
}
if (result.error) {
result.error = JSON.parse(result.error);
console.error(`llama.cpp error: ${result.error.content}`);
}
}
}
}
53 changes: 40 additions & 13 deletions examples/server/server.cpp
@@ -18,11 +18,13 @@
#include "index.js.hpp"
#include "completion.js.hpp"
#include "json-schema-to-grammar.mjs.hpp"
#include "error.h"

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <exception>
#include <set>
#include <mutex>
#include <thread>
@@ -967,6 +969,7 @@ struct server_context {
{
if (slot.ctx_sampling != nullptr) {
llama_sampling_free(slot.ctx_sampling);
slot.ctx_sampling = nullptr;
}
slot.ctx_sampling = llama_sampling_init(slot.sparams);
llama_set_rng_seed(ctx, slot.params.seed);
@@ -1210,16 +1213,25 @@
};
}

void send_error(const server_task & task, const std::string & error) {
LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
static json error_to_json(const llama_error& error)
{
return {
{ "error", {
{ "type", error.type() },
{ "message", error.message() }
} }
};
}

void send_error(const server_task & task, const llama_error& error)
{
LOG_TEE("task %i - error: %s - %s\n", task.id, error.type().c_str(), error.message().c_str());
server_task_result res;
res.id = task.id;
res.id = task.id;
res.id_multi = task.id_multi;
res.stop = false;
res.error = true;
res.data = { { "content", error } };

res.stop = false;
res.error = true;
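// pack the structured error as a JSON string in the generic "content" field; clients (public/completion.js) parse it back out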
res.data = { { "content", error_to_json(error).dump() } };
queue_results.send(res);
}

@@ -1397,7 +1409,7 @@
void split_multiprompt_task(int id_multi, const server_task & multiprompt_task) {
const int prompt_count = multiprompt_task.data.at("prompt").size();
if (prompt_count <= 1) {
send_error(multiprompt_task, "error while handling multiple prompts");
send_error(multiprompt_task, llama_error("prompt.multi.empty", "error while handling multiple prompts"));
return;
}

@@ -1448,10 +1460,17 @@
slot->infill = task.infill;
slot->embedding = task.embedding;

if (!launch_slot_with_data(*slot, task.data)) {
// send error result
send_error(task, "internal_error");
break;
try {
if (!launch_slot_with_data(*slot, task.data))
{
// send error result
send_error(task, llama_error("unknown", "Unknown error"));
break;
}
} catch (const llama_error & err) {
send_error(task, err);
} catch (const std::exception & err) {
send_error(task, llama_error("unhandled", err.what()));
}
} break;
case SERVER_TASK_TYPE_CANCEL:
@@ -3028,7 +3047,15 @@ int main(int argc, char ** argv) {
return;
}

json data = json::parse(req.body);
json data;
try {
data = json::parse(req.body);
} catch (const json::exception & json_err) {
const auto err = llama_error("request.invalid_json", std::string("Invalid JSON: ") + json_err.what());
const auto err_json = server_context::error_to_json(err).dump();
res.set_content(err_json, "text/plain; charset=utf-8");
return;
}

const int id_task = ctx_server.queue_tasks.get_new_id();
