Skip to content
This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit 51e7aef

Browse files
committed
Merge branch 'fix/linux-arm' of https://github.com/janhq/cortex.cpp into fix/linux-arm
2 parents a48a84a + cbce050 commit 51e7aef

26 files changed

+4551
-185
lines changed

docs/static/openapi/cortex.json

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2199,6 +2199,84 @@
21992199
"tags": ["Engines"]
22002200
}
22012201
},
2202+
"/v1/engines/{name}/releases/{version}": {
2203+
"get": {
2204+
"summary": "List variants for a specific engine version",
2205+
"description": "Lists all available variants (builds) for a specific version of an engine. Variants can include different CPU architectures (AVX, AVX2, AVX512), GPU support (CUDA, Vulkan), and operating systems (Windows, Linux, macOS).",
2206+
"parameters": [
2207+
{
2208+
"name": "name",
2209+
"in": "path",
2210+
"required": true,
2211+
"schema": {
2212+
"type": "string",
2213+
"enum": ["llama-cpp", "onnxruntime", "tensorrt-llm"],
2214+
"default": "llama-cpp"
2215+
},
2216+
"description": "The type of engine"
2217+
},
2218+
{
2219+
"name": "version",
2220+
"in": "path",
2221+
"required": true,
2222+
"schema": {
2223+
"type": "string"
2224+
},
2225+
"description": "The version of the engine"
2226+
},
2227+
{
2228+
"name": "show",
2229+
"in": "query",
2230+
"required": false,
2231+
"schema": {
2232+
"type": "string",
2233+
"enum": ["all", "compatible"],
2234+
"default": "all"
2235+
},
2236+
"description": "Filter the variants list. Use 'compatible' to show only variants compatible with the current system, or 'all' to show all available variants."
2237+
}
2238+
],
2239+
"responses": {
2240+
"200": {
2241+
"description": "Successfully retrieved variants list",
2242+
"content": {
2243+
"application/json": {
2244+
"schema": {
2245+
"type": "array",
2246+
"items": {
2247+
"type": "object",
2248+
"properties": {
2249+
"name": {
2250+
"type": "string",
2251+
"description": "The name of the variant, including OS, architecture, and capabilities",
2252+
"example": "linux-amd64-avx-cuda-11-7"
2253+
},
2254+
"created_at": {
2255+
"type": "string",
2256+
"format": "date-time",
2257+
"description": "Creation timestamp of the variant",
2258+
"example": "2024-11-13T04:51:16Z"
2259+
},
2260+
"size": {
2261+
"type": "integer",
2262+
"description": "Size of the variant in bytes",
2263+
"example": 151224604
2264+
},
2265+
"download_count": {
2266+
"type": "integer",
2267+
"description": "Number of times this variant has been downloaded",
2268+
"example": 0
2269+
}
2270+
}
2271+
}
2272+
}
2273+
}
2274+
}
2275+
}
2276+
},
2277+
"tags": ["Engines"]
2278+
}
2279+
},
22022280
"/v1/engines/{name}/releases/latest": {
22032281
"get": {
22042282
"summary": "Get latest release",

engine/cli/commands/chat_completion_cmd.cc

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ size_t WriteCallback(char* ptr, size_t size, size_t nmemb, void* userdata) {
5050

5151
return data_length;
5252
}
53-
5453
} // namespace
5554

5655
void ChatCompletionCmd::Exec(const std::string& host, int port,
@@ -103,7 +102,7 @@ void ChatCompletionCmd::Exec(const std::string& host, int port,
103102
return;
104103
}
105104

106-
std::string url = "http://" + address + "/v1/chat/completions";
105+
auto url = "http://" + address + "/v1/chat/completions";
107106
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
108107
curl_easy_setopt(curl, CURLOPT_POST, 1L);
109108

@@ -151,18 +150,18 @@ void ChatCompletionCmd::Exec(const std::string& host, int port,
151150
json_data["model"] = model_handle;
152151
json_data["stream"] = true;
153152

154-
std::string json_payload = json_data.toStyledString();
155-
156-
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, json_payload.c_str());
153+
auto json_str = json_data.toStyledString();
154+
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, json_str.c_str());
155+
curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, json_str.length());
156+
curl_easy_setopt(curl, CURLOPT_TCP_KEEPALIVE, 1L);
157157

158158
std::string ai_chat;
159159
StreamingCallback callback;
160160
callback.ai_chat = &ai_chat;
161161

162162
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
163163
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &callback);
164-
165-
CURLcode res = curl_easy_perform(curl);
164+
auto res = curl_easy_perform(curl);
166165

167166
if (res != CURLE_OK) {
168167
CLI_LOG("CURL request failed: " << curl_easy_strerror(res));

engine/cli/commands/model_status_cmd.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ bool ModelStatusCmd::IsLoaded(const std::string& host, int port,
2525
auto res = curl_utils::SimpleGetJson(url.ToFullPath());
2626
if (res.has_error()) {
2727
auto root = json_helper::ParseJsonString(res.error());
28-
CLI_LOG(root["message"].asString());
28+
CTL_WRN(root["message"].asString());
2929
return false;
3030
}
3131

engine/common/model_metadata.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#pragma once
2+
3+
#include <sstream>
4+
#include "common/tokenizer.h"
5+
6+
struct ModelMetadata {
7+
uint32_t version;
8+
uint64_t tensor_count;
9+
uint64_t metadata_kv_count;
10+
std::shared_ptr<Tokenizer> tokenizer;
11+
12+
std::string ToString() const {
13+
std::ostringstream ss;
14+
ss << "ModelMetadata {\n"
15+
<< "version: " << version << "\n"
16+
<< "tensor_count: " << tensor_count << "\n"
17+
<< "metadata_kv_count: " << metadata_kv_count << "\n"
18+
<< "tokenizer: ";
19+
20+
if (tokenizer) {
21+
ss << "\n" << tokenizer->ToString();
22+
} else {
23+
ss << "null";
24+
}
25+
26+
ss << "\n}";
27+
return ss.str();
28+
}
29+
};

engine/common/tokenizer.h

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#pragma once
2+
3+
#include <sstream>
4+
#include <string>
5+
6+
/// Abstract base holding the tokenizer settings shared by all tokenizer
/// flavours, plus a helper that formats those shared fields as text.
struct Tokenizer {
  std::string eos_token = "";
  bool add_eos_token = true;

  std::string bos_token = "";
  bool add_bos_token = true;

  std::string unknown_token = "";
  std::string padding_token = "";

  std::string chat_template = "";

  bool add_generation_prompt = true;

  /// Formats the common fields, one "name: value" pair per line.
  ///
  /// String fields are wrapped in double quotes; boolean fields are printed
  /// as bare `true` / `false`. The result has no trailing newline so callers
  /// can decide how to continue the output.
  ///
  /// @return The formatted common fields.
  std::string BaseToString() const {
    std::ostringstream ss;
    ss << "eos_token: \"" << eos_token << "\"\n"
       << "add_eos_token: " << (add_eos_token ? "true" : "false") << "\n"
       << "bos_token: \"" << bos_token << "\"\n"
       << "add_bos_token: " << (add_bos_token ? "true" : "false") << "\n"
       << "unknown_token: \"" << unknown_token << "\"\n"
       << "padding_token: \"" << padding_token << "\"\n"
       << "chat_template: \"" << chat_template << "\"\n"
       // Boolean value: printed unquoted, like the other boolean fields.
       << "add_generation_prompt: "
       << (add_generation_prompt ? "true" : "false");
    return ss.str();
  }

  virtual ~Tokenizer() = default;

  /// Returns a full textual description of the concrete tokenizer.
  /// Kept non-const so existing overrides in derived types keep matching.
  virtual std::string ToString() = 0;
};
39+
40+
struct GgufTokenizer : public Tokenizer {
41+
std::string pre = "";
42+
43+
~GgufTokenizer() override = default;
44+
45+
std::string ToString() override {
46+
std::ostringstream ss;
47+
ss << "GgufTokenizer {\n";
48+
// Add base class members
49+
ss << BaseToString() << "\n";
50+
// Add derived class members
51+
ss << "pre: \"" << pre << "\"\n";
52+
ss << "}";
53+
return ss.str();
54+
}
55+
};
56+
57+
struct SafeTensorTokenizer : public Tokenizer {
58+
bool add_prefix_space = true;
59+
60+
~SafeTensorTokenizer() = default;
61+
62+
std::string ToString() override {
63+
std::ostringstream ss;
64+
ss << "SafeTensorTokenizer {\n";
65+
// Add base class members
66+
ss << BaseToString() << "\n";
67+
// Add derived class members
68+
ss << "add_prefix_space: " << (add_prefix_space ? "true" : "false") << "\n";
69+
ss << "}";
70+
return ss.str();
71+
}
72+
};

engine/controllers/engines.cc

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,8 @@ void Engines::GetEngineReleases(
129129
void Engines::GetEngineVariants(
130130
const HttpRequestPtr& req,
131131
std::function<void(const HttpResponsePtr&)>&& callback,
132-
const std::string& engine, const std::string& version) const {
132+
const std::string& engine, const std::string& version,
133+
std::optional<std::string> show) const {
133134
if (engine.empty()) {
134135
Json::Value res;
135136
res["message"] = "Engine name is required";
@@ -140,7 +141,18 @@ void Engines::GetEngineVariants(
140141
return;
141142
}
142143

143-
auto result = engine_service_->GetEngineVariants(engine, version);
144+
auto show_value = show.value_or("all");
145+
if (show_value != "all" && show_value != "compatible") {
146+
Json::Value res;
147+
res["message"] = "Invalid show value. Can either be `all` or `compatible`";
148+
auto resp = cortex_utils::CreateCortexHttpJsonResponse(res);
149+
resp->setStatusCode(k400BadRequest);
150+
callback(resp);
151+
return;
152+
}
153+
154+
auto result = engine_service_->GetEngineVariants(engine, version,
155+
show_value == "compatible");
144156

145157
auto normalize_version = string_utils::RemoveSubstring(version, "v");
146158
Json::Value releases(Json::arrayValue);

engine/controllers/engines.h

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,11 @@ class Engines : public drogon::HttpController<Engines, false> {
5353
METHOD_ADD(Engines::GetEngineReleases, "/{1}/releases", Get);
5454
ADD_METHOD_TO(Engines::GetEngineReleases, "/v1/engines/{1}/releases", Get);
5555

56-
METHOD_ADD(Engines::GetEngineVariants, "/{1}/releases/{2}", Get);
57-
ADD_METHOD_TO(Engines::GetEngineVariants, "/v1/engines/{1}/releases/{2}",
58-
Get);
56+
ADD_METHOD_TO(Engines::GetEngineVariants,
57+
"/v1/engines/{engine}/releases/{version}?show={show}", Get);
5958

60-
METHOD_ADD(Engines::GetLatestEngineVersion, "/{1}/releases/latest", Get);
6159
ADD_METHOD_TO(Engines::GetLatestEngineVersion,
62-
"/v1/engines/{1}/releases/latest", Get);
60+
"/v1/engines/{engine}/releases/latest", Get);
6361

6462
METHOD_LIST_END
6563

@@ -83,8 +81,8 @@ class Engines : public drogon::HttpController<Engines, false> {
8381

8482
void GetEngineVariants(const HttpRequestPtr& req,
8583
std::function<void(const HttpResponsePtr&)>&& callback,
86-
const std::string& engine,
87-
const std::string& version) const;
84+
const std::string& engine, const std::string& version,
85+
std::optional<std::string> show) const;
8886

8987
void GetInstalledEngineVariants(
9088
const HttpRequestPtr& req,

engine/controllers/files.cc

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -216,10 +216,8 @@ void Files::RetrieveFileContent(
216216
return;
217217
}
218218

219-
auto [buffer, size] = std::move(res.value());
220-
auto resp = HttpResponse::newHttpResponse();
221-
resp->setBody(std::string(buffer.get(), size));
222-
resp->setContentTypeCode(CT_APPLICATION_OCTET_STREAM);
219+
auto resp =
220+
cortex_utils::CreateCortexContentResponse(std::move(res.value()));
223221
callback(resp);
224222
} else {
225223
if (!msg_res->rel_path.has_value()) {
@@ -243,10 +241,8 @@ void Files::RetrieveFileContent(
243241
return;
244242
}
245243

246-
auto [buffer, size] = std::move(content_res.value());
247-
auto resp = HttpResponse::newHttpResponse();
248-
resp->setBody(std::string(buffer.get(), size));
249-
resp->setContentTypeCode(CT_APPLICATION_OCTET_STREAM);
244+
auto resp = cortex_utils::CreateCortexContentResponse(
245+
std::move(content_res.value()));
250246
callback(resp);
251247
}
252248
}
@@ -261,9 +257,6 @@ void Files::RetrieveFileContent(
261257
return;
262258
}
263259

264-
auto [buffer, size] = std::move(res.value());
265-
auto resp = HttpResponse::newHttpResponse();
266-
resp->setBody(std::string(buffer.get(), size));
267-
resp->setContentTypeCode(CT_APPLICATION_OCTET_STREAM);
260+
auto resp = cortex_utils::CreateCortexContentResponse(std::move(res.value()));
268261
callback(resp);
269262
}

engine/controllers/server.cc

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
#include "trantor/utils/Logger.h"
44
#include "utils/cortex_utils.h"
55
#include "utils/function_calling/common.h"
6-
#include "utils/http_util.h"
76

87
using namespace inferences;
98

@@ -27,6 +26,14 @@ void server::ChatCompletion(
2726
std::function<void(const HttpResponsePtr&)>&& callback) {
2827
LOG_DEBUG << "Start chat completion";
2928
auto json_body = req->getJsonObject();
29+
if (json_body == nullptr) {
30+
Json::Value ret;
31+
ret["message"] = "Body can't be empty";
32+
auto resp = cortex_utils::CreateCortexHttpJsonResponse(ret);
33+
resp->setStatusCode(k400BadRequest);
34+
callback(resp);
35+
return;
36+
}
3037
bool is_stream = (*json_body).get("stream", false).asBool();
3138
auto model_id = (*json_body).get("model", "invalid_model").asString();
3239
auto engine_type = [this, &json_body]() -> std::string {

engine/e2e-test/test_api_engine.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,14 @@ def test_engines_get_llamacpp_should_be_successful(self):
2828

2929
# engines install
3030
def test_engines_install_llamacpp_specific_version_and_variant(self):
31-
data = {"version": "v0.1.35-27.10.24", "variant": "linux-amd64-avx-cuda-11-7"}
31+
data = {"version": "v0.1.40-b4354", "variant": "linux-amd64-avx-cuda-11-7"}
3232
response = requests.post(
3333
"http://localhost:3928/v1/engines/llama-cpp/install", json=data
3434
)
3535
assert response.status_code == 200
3636

3737
def test_engines_install_llamacpp_specific_version_and_null_variant(self):
38-
data = {"version": "v0.1.35-27.10.24"}
38+
data = {"version": "v0.1.40-b4354"}
3939
response = requests.post(
4040
"http://localhost:3928/v1/engines/llama-cpp/install", json=data
4141
)

0 commit comments

Comments
 (0)