server: error handling #5776

Closed · wants to merge 8 commits

2 changes: 1 addition & 1 deletion Makefile
@@ -638,7 +638,7 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@

COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h common/error.h
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o

common.o: common/common.cpp $(COMMON_H_DEPS)
3 changes: 2 additions & 1 deletion common/CMakeLists.txt
@@ -62,7 +62,8 @@ add_library(${TARGET} STATIC
grammar-parser.cpp
train.h
train.cpp
)
error.h
)

if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
24 changes: 24 additions & 0 deletions common/error.h
@@ -0,0 +1,24 @@
#pragma once

#include <cstdio>
#include <exception>
#include <string>

class llama_error : public std::exception
{
private:
std::string _type;
std::string _message;

public:
llama_error(const std::string & type, const std::string & message)
:
_type(type),
_message(message)
{
fprintf(stderr, "ERROR [%s]: %s\n", type.c_str(), message.c_str());
}

inline const std::string & type() const { return _type; }
inline const std::string & message() const { return _message; }
};
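As a quick illustration (not part of the diff), here is roughly how the class is meant to be used. The helper below is made up for the example, but the error type follows the `category.detail` naming used elsewhere in this PR (`grammar.invalid`, `grammar.empty`, `request.invalid_json`):

```cpp
#include "error.h"   // common/error.h from this PR

#include <cstdio>
#include <string>

// illustrative helper: validate an input and report failure as a typed error
static void check_grammar(const std::string & grammar) {
    if (grammar.empty()) {
        throw llama_error("grammar.empty", "empty grammar");
    }
    // ... real validation would go here ...
}

int main() {
    try {
        check_grammar("");
    } catch (const llama_error & err) {
        // callers branch on type() and show message() to the user
        fprintf(stderr, "caught [%s]: %s\n", err.type().c_str(), err.message().c_str());
    }
    return 0;
}
```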
5 changes: 3 additions & 2 deletions common/grammar-parser.cpp
@@ -1,4 +1,6 @@
#include "grammar-parser.h"
#include "error.h"

#include <cstdint>
#include <cwchar>
#include <string>
@@ -280,8 +282,7 @@ namespace grammar_parser {
}
return state;
} catch (const std::exception & err) {
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
return parse_state();
throw llama_error("grammar.invalid", std::string(__func__) + ": error parsing grammar: " + err.what());
}
}

11 changes: 8 additions & 3 deletions common/sampling.cpp
@@ -1,4 +1,5 @@
#include "sampling.h"
#include "error.h"

struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
struct llama_sampling_context * result = new llama_sampling_context();
@@ -8,13 +9,17 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_

// if there is a grammar, parse it
if (!params.grammar.empty()) {
result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
try {
result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
} catch (const llama_error & err) {
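// release the partially initialized context before propagating the error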
delete result;
throw err;
}

// will be empty (default) if there are parse errors
if (result->parsed_grammar.rules.empty()) {
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
delete result;
return nullptr;
throw llama_error("grammar.empty", std::string(__func__) + ": empty grammar");
}

std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
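Because `llama_sampling_init` now throws on a bad grammar instead of returning `nullptr`, its callers need a `try`/`catch`, which is what `server.cpp` adds below around `launch_slot_with_data`. A minimal sketch of the new calling pattern follows; the wrapper name and include paths are illustrative, not part of the PR:

```cpp
#include "common/sampling.h"   // adjust include paths to your build setup
#include "common/error.h"

#include <cstdio>

// illustrative wrapper: translate the thrown llama_error back into a nullable result
static llama_sampling_context * init_sampling_or_null(const llama_sampling_params & sparams) {
    try {
        // may throw llama_error("grammar.invalid", ...) or llama_error("grammar.empty", ...)
        return llama_sampling_init(sparams);
    } catch (const llama_error & err) {
        fprintf(stderr, "sampling init failed [%s]: %s\n",
                err.type().c_str(), err.message().c_str());
        return nullptr;
    }
}
```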
14 changes: 14 additions & 0 deletions examples/server/README.md
@@ -302,6 +302,20 @@ Notice that each `probs` is an array of length `n_probs`.
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)

In case of an error, the error details are returned as follows:
```json
{
  "error": {
    "type": "grammar.invalid",
    "message": "parse: error parsing grammar: expecting name at ("
  }
}
```
where:
- `type` - unique identifier for this kind of error (e.g. `grammar.invalid`, `grammar.empty`)
- `message` - human-readable description of the error


- **POST** `/tokenize`: Tokenize a given text.

*Options:*
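A native client can pick the documented error object apart with the same nlohmann JSON library the server already uses. This is a sketch only, assuming the response body has already been read into a string and that `json.hpp` is reachable on the include path:

```cpp
#include <iostream>
#include <string>

#include "json.hpp"   // nlohmann::json, vendored by the examples

using json = nlohmann::json;

int main() {
    // body as it might arrive from the server on a failed request (illustrative values)
    const std::string body =
        R"({"error":{"type":"grammar.invalid","message":"parse: error parsing grammar: expecting name at ("}})";

    const json parsed = json::parse(body);
    if (parsed.contains("error")) {
        const auto & err = parsed.at("error");
        std::cerr << "[" << err.value("type", "unknown") << "] "
                  << err.value("message", "") << "\n";
    }
    return 0;
}
```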
470 changes: 236 additions & 234 deletions examples/server/completion.js.hpp

Large diffs are not rendered by default.

20 changes: 10 additions & 10 deletions examples/server/public/completion.js
@@ -96,18 +96,18 @@ export async function* llama(prompt, params = {}, config = {}) {
}
}
if (result.error) {
result.error = JSON.parse(result.error);
if (result.error.content.includes('slot unavailable')) {
// Throw an error to be caught by upstream callers
throw new Error('slot unavailable');
} else {
console.error(`llama.cpp error: ${result.error.content}`);
try {
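// the server wraps the error object as a JSON string inside the result's 'content' field, hence the nested JSON.parse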
result.error = JSON.parse(JSON.parse(result.error).content).error;
if (result.error.message.includes('slot unavailable')) {
// Throw an error to be caught by upstream callers
throw new Error('slot unavailable');
} else {
console.error(`llama.cpp error [${result.error.type}]: ${result.error.message}`);
}
} catch(e) {
if (e instanceof Error && e.message === 'slot unavailable') {
throw e; // propagate the signal to upstream callers instead of swallowing it
}
console.error(`llama.cpp error ${result.error}`);
}
}
if (result.error) {
result.error = JSON.parse(result.error);
console.error(`llama.cpp error: ${result.error.content}`);
}
}
}
}
53 changes: 40 additions & 13 deletions examples/server/server.cpp
@@ -18,11 +18,13 @@
#include "index.js.hpp"
#include "completion.js.hpp"
#include "json-schema-to-grammar.mjs.hpp"
#include "error.h"

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <exception>
#include <set>
#include <mutex>
#include <thread>
@@ -967,6 +969,7 @@ struct server_context {
{
if (slot.ctx_sampling != nullptr) {
llama_sampling_free(slot.ctx_sampling);
slot.ctx_sampling = nullptr;
}
slot.ctx_sampling = llama_sampling_init(slot.sparams);
llama_set_rng_seed(ctx, slot.params.seed);
@@ -1210,16 +1213,25 @@
};
}

void send_error(const server_task & task, const std::string & error) {
LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
static json error_to_json(const llama_error& error)
{
return {
{ "error", {
{ "type", error.type() },
{ "message", error.message() }
} }
};
}

void send_error(const server_task & task, const llama_error& error)
{
LOG_TEE("task %i - error: %s - %s\n", task.id, error.type().c_str(), error.message().c_str());
server_task_result res;
res.id = task.id;
res.id = task.id;
res.id_multi = task.id_multi;
res.stop = false;
res.error = true;
res.data = { { "content", error } };

res.stop = false;
res.error = true;
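// pack the structured error as a JSON string in the generic "content" field; clients (public/completion.js) parse it back out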
res.data = { { "content", error_to_json(error).dump() } };
queue_results.send(res);
}

@@ -1397,7 +1409,7 @@
void split_multiprompt_task(int id_multi, const server_task & multiprompt_task) {
const int prompt_count = multiprompt_task.data.at("prompt").size();
if (prompt_count <= 1) {
send_error(multiprompt_task, "error while handling multiple prompts");
send_error(multiprompt_task, llama_error("prompt.multi.empty", "error while handling multiple prompts"));
return;
}

@@ -1448,10 +1460,17 @@
slot->infill = task.infill;
slot->embedding = task.embedding;

if (!launch_slot_with_data(*slot, task.data)) {
// send error result
send_error(task, "internal_error");
break;
try {
if (!launch_slot_with_data(*slot, task.data))
{
// send error result
send_error(task, llama_error("unknown", "Unknown error"));
break;
}
} catch (const llama_error & err) {
send_error(task, err);
} catch (const std::exception & err) {
send_error(task, llama_error("unhandled", err.what()));
}
} break;
case SERVER_TASK_TYPE_CANCEL:
@@ -3028,7 +3047,15 @@ int main(int argc, char ** argv) {
return;
}

json data = json::parse(req.body);
json data;
try {
data = json::parse(req.body);
} catch (const json::exception & json_err) {
const auto err = llama_error("request.invalid_json", std::string("Invalid JSON: ") + json_err.what());
const auto err_json = server_context::error_to_json(err).dump();
res.set_content(err_json, "text/plain; charset=utf-8");
return;
}

const int id_task = ctx_server.queue_tasks.get_new_id();
