janhq
diff --git a/‎docs/static/openapi/cortex.json‎
Lines changed: 78 additions & 0 deletions b/‎docs/static/openapi/cortex.json‎
Lines changed: 78 additions & 0 deletions
diff --git a/‎engine/cli/commands/chat_completion_cmd.cc‎
Lines changed: 6 additions & 7 deletions b/‎engine/cli/commands/chat_completion_cmd.cc‎
Lines changed: 6 additions & 7 deletions
diff --git a/‎engine/cli/commands/model_status_cmd.cc‎
Lines changed: 1 addition & 1 deletion b/‎engine/cli/commands/model_status_cmd.cc‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎engine/common/model_metadata.h‎
Lines changed: 29 additions & 0 deletions b/‎engine/common/model_metadata.h‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎engine/common/tokenizer.h‎
Lines changed: 72 additions & 0 deletions b/‎engine/common/tokenizer.h‎
Lines changed: 72 additions & 0 deletions
diff --git a/‎engine/controllers/engines.cc‎
Lines changed: 14 additions & 2 deletions b/‎engine/controllers/engines.cc‎
Lines changed: 14 additions & 2 deletions
diff --git a/‎engine/controllers/engines.h‎
Lines changed: 5 additions & 7 deletions b/‎engine/controllers/engines.h‎
Lines changed: 5 additions & 7 deletions
diff --git a/‎engine/controllers/files.cc‎
Lines changed: 5 additions & 12 deletions b/‎engine/controllers/files.cc‎
Lines changed: 5 additions & 12 deletions
diff --git a/‎engine/controllers/server.cc‎
Lines changed: 8 additions & 1 deletion b/‎engine/controllers/server.cc‎
Lines changed: 8 additions & 1 deletion
diff --git a/‎engine/e2e-test/test_api_engine.py‎
Lines changed: 2 additions & 2 deletions b/‎engine/e2e-test/test_api_engine.py‎
Lines changed: 2 additions & 2 deletions
@@ -2199,6 +2199,84 @@
         "tags": ["Engines"]
       }
     },
+    "/v1/engines/{name}/releases/{version}": {
+      "get": {
+        "summary": "List variants for a specific engine version",
+        "description": "Lists all available variants (builds) for a specific version of an engine. Variants can include different CPU architectures (AVX, AVX2, AVX512), GPU support (CUDA, Vulkan), and operating systems (Windows, Linux, macOS).",
+        "parameters": [
+          {
+            "name": "name",
+            "in": "path",
+            "required": true,
+            "schema": {
+              "type": "string",
+              "enum": ["llama-cpp", "onnxruntime", "tensorrt-llm"],
+              "default": "llama-cpp"
+            },
+            "description": "The type of engine"
+          },
+          {
+            "name": "version",
+            "in": "path",
+            "required": true,
+            "schema": {
+              "type": "string"
+            },
+            "description": "The version of the engine"
+          },
+          {
+            "name": "show",
+            "in": "query",
+            "required": false,
+            "schema": {
+              "type": "string",
+              "enum": ["all", "compatible"],
+              "default": "all"
+            },
+            "description": "Filter the variants list. Use 'compatible' to show only variants compatible with the current system, or 'all' to show all available variants."
+          }
+        ],
+        "responses": {
+          "200": {
+            "description": "Successfully retrieved variants list",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "type": "array",
+                  "items": {
+                    "type": "object",
+                    "properties": {
+                      "name": {
+                        "type": "string",
+                        "description": "The name of the variant, including OS, architecture, and capabilities",
+                        "example": "linux-amd64-avx-cuda-11-7"
+                      },
+                      "created_at": {
+                        "type": "string",
+                        "format": "date-time",
+                        "description": "Creation timestamp of the variant",
+                        "example": "2024-11-13T04:51:16Z"
+                      },
+                      "size": {
+                        "type": "integer",
+                        "description": "Size of the variant in bytes",
+                        "example": 151224604
+                      },
+                      "download_count": {
+                        "type": "integer",
+                        "description": "Number of times this variant has been downloaded",
+                        "example": 0
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        },
+        "tags": ["Engines"]
+      }
+    },
     "/v1/engines/{name}/releases/latest": {
       "get": {
         "summary": "Get latest release",
 
@@ -50,7 +50,6 @@ size_t WriteCallback(char* ptr, size_t size, size_t nmemb, void* userdata) {
 
   return data_length;
 }
-
 }  // namespace
 
 void ChatCompletionCmd::Exec(const std::string& host, int port,
@@ -103,7 +102,7 @@ void ChatCompletionCmd::Exec(const std::string& host, int port,
     return;
   }
 
-  std::string url = "http://" + address + "/v1/chat/completions";
+  auto url = "http://" + address + "/v1/chat/completions";
   curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
   curl_easy_setopt(curl, CURLOPT_POST, 1L);
 
@@ -151,18 +150,18 @@ void ChatCompletionCmd::Exec(const std::string& host, int port,
       json_data["model"] = model_handle;
       json_data["stream"] = true;
 
-      std::string json_payload = json_data.toStyledString();
-
-      curl_easy_setopt(curl, CURLOPT_POSTFIELDS, json_payload.c_str());
+      auto json_str = json_data.toStyledString();
+      curl_easy_setopt(curl, CURLOPT_POSTFIELDS, json_str.c_str());
+      curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, json_str.length());
+      curl_easy_setopt(curl, CURLOPT_TCP_KEEPALIVE, 1L);
 
       std::string ai_chat;
       StreamingCallback callback;
       callback.ai_chat = &ai_chat;
 
       curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
       curl_easy_setopt(curl, CURLOPT_WRITEDATA, &callback);
-
-      CURLcode res = curl_easy_perform(curl);
+      auto res = curl_easy_perform(curl);
 
       if (res != CURLE_OK) {
         CLI_LOG("CURL request failed: " << curl_easy_strerror(res));
 
@@ -25,7 +25,7 @@ bool ModelStatusCmd::IsLoaded(const std::string& host, int port,
   auto res = curl_utils::SimpleGetJson(url.ToFullPath());
   if (res.has_error()) {
     auto root = json_helper::ParseJsonString(res.error());
-    CLI_LOG(root["message"].asString());
+    CTL_WRN(root["message"].asString());
     return false;
   }
 
 
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <string>
+#include "common/tokenizer.h"
+
+/// Header-level information about a model file plus its tokenizer settings.
+///
+/// NOTE(review): the field names suggest these mirror a GGUF file header
+/// (version / tensor count / metadata key-value count) — confirm with the
+/// code that populates this struct.
+struct ModelMetadata {
+  // Zero-initialized so ToString() on a default-constructed instance never
+  // reads indeterminate values.
+  uint32_t version = 0;
+  uint64_t tensor_count = 0;
+  uint64_t metadata_kv_count = 0;
+  // May be null; ToString() prints "null" in that case.
+  std::shared_ptr<Tokenizer> tokenizer;
+
+  /// Renders every field as a `name: value` line wrapped in
+  /// `ModelMetadata { ... }`.
+  ///
+  /// @return Multi-line description; the tokenizer part is delegated to
+  ///         Tokenizer::ToString() when a tokenizer is set.
+  std::string ToString() const {
+    std::ostringstream ss;
+    ss << "ModelMetadata {\n"
+       << "version: " << version << "\n"
+       << "tensor_count: " << tensor_count << "\n"
+       << "metadata_kv_count: " << metadata_kv_count << "\n"
+       << "tokenizer: ";
+
+    if (tokenizer) {
+      // Tokenizer dump starts on its own line, below the "tokenizer: " label.
+      ss << "\n" << tokenizer->ToString();
+    } else {
+      ss << "null";
+    }
+
+    ss << "\n}";
+    return ss.str();
+  }
+};
@@ -0,0 +1,72 @@
+#pragma once
+
+#include <sstream>
+#include <string>
+
+/// Tokenizer settings shared by all model formats.
+///
+/// Abstract base: derived structs add their format-specific fields and
+/// implement ToString().
+struct Tokenizer {
+  // End-of-sequence token text and its associated add flag.
+  std::string eos_token = "";
+  bool add_eos_token = true;
+
+  // Beginning-of-sequence token text and its associated add flag.
+  std::string bos_token = "";
+  bool add_bos_token = true;
+
+  std::string unknown_token = "";
+  std::string padding_token = "";
+
+  std::string chat_template = "";
+
+  bool add_generation_prompt = true;
+
+  /// Renders the fields declared in this base struct, one `name: value` per
+  /// line. String values are wrapped in double quotes, booleans are printed
+  /// as true/false without quotes.
+  ///
+  /// @return The formatted lines without a trailing newline, so callers can
+  ///         append their own separator.
+  std::string BaseToString() const {
+    std::ostringstream ss;
+    ss << "eos_token: \"" << eos_token << "\"\n"
+       << "add_eos_token: " << (add_eos_token ? "true" : "false") << "\n"
+       << "bos_token: \"" << bos_token << "\"\n"
+       << "add_bos_token: " << (add_bos_token ? "true" : "false") << "\n"
+       << "unknown_token: \"" << unknown_token << "\"\n"
+       << "padding_token: \"" << padding_token << "\"\n"
+       << "chat_template: \"" << chat_template << "\"\n"
+       // Boolean value: no closing quote here (a stray one used to be
+       // emitted, leaving an unmatched `"` at the end of the output).
+       << "add_generation_prompt: "
+       << (add_generation_prompt ? "true" : "false");
+    return ss.str();
+  }
+
+  virtual ~Tokenizer() = default;
+
+  /// Full textual dump of the tokenizer, including derived-struct fields.
+  virtual std::string ToString() = 0;
+};
+
+/// Tokenizer variant that carries one extra string field, `pre`, on top of
+/// the common Tokenizer settings.
+struct GgufTokenizer : public Tokenizer {
+  // NOTE(review): presumably the GGUF pre-tokenizer identifier — confirm
+  // with the code that fills it in.
+  std::string pre = "";
+
+  ~GgufTokenizer() override = default;
+
+  /// Dumps the shared fields followed by `pre`, wrapped in
+  /// `GgufTokenizer { ... }`.
+  std::string ToString() override {
+    std::string text = "GgufTokenizer {\n";
+    // Shared fields first; BaseToString() has no trailing newline.
+    text += BaseToString();
+    text += "\n";
+    // Then the field specific to this struct.
+    text += "pre: \"";
+    text += pre;
+    text += "\"\n";
+    text += "}";
+    return text;
+  }
+};
+
+/// Tokenizer variant that carries one extra flag, `add_prefix_space`, on top
+/// of the common Tokenizer settings.
+struct SafeTensorTokenizer : public Tokenizer {
+  bool add_prefix_space = true;
+
+  // Marked `override` to match GgufTokenizer and let the compiler verify the
+  // base destructor is virtual.
+  ~SafeTensorTokenizer() override = default;
+
+  /// Dumps the shared fields followed by `add_prefix_space`, wrapped in
+  /// `SafeTensorTokenizer { ... }`.
+  std::string ToString() override {
+    std::ostringstream ss;
+    ss << "SafeTensorTokenizer {\n";
+    // Add base class members
+    ss << BaseToString() << "\n";
+    // Add derived class members
+    ss << "add_prefix_space: " << (add_prefix_space ? "true" : "false") << "\n";
+    ss << "}";
+    return ss.str();
+  }
+};
@@ -129,7 +129,8 @@ void Engines::GetEngineReleases(
 void Engines::GetEngineVariants(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback,
-    const std::string& engine, const std::string& version) const {
+    const std::string& engine, const std::string& version,
+    std::optional<std::string> show) const {
   if (engine.empty()) {
     Json::Value res;
     res["message"] = "Engine name is required";
@@ -140,7 +141,18 @@ void Engines::GetEngineVariants(
     return;
   }
 
-  auto result = engine_service_->GetEngineVariants(engine, version);
+  auto show_value = show.value_or("all");
+  if (show_value != "all" && show_value != "compatible") {
+    Json::Value res;
+    res["message"] = "Invalid show value. Can either be `all` or `compatible`";
+    auto resp = cortex_utils::CreateCortexHttpJsonResponse(res);
+    resp->setStatusCode(k400BadRequest);
+    callback(resp);
+    return;
+  }
+
+  auto result = engine_service_->GetEngineVariants(engine, version,
+                                                   show_value == "compatible");
 
   auto normalize_version = string_utils::RemoveSubstring(version, "v");
   Json::Value releases(Json::arrayValue);
 
@@ -53,13 +53,11 @@ class Engines : public drogon::HttpController<Engines, false> {
   METHOD_ADD(Engines::GetEngineReleases, "/{1}/releases", Get);
   ADD_METHOD_TO(Engines::GetEngineReleases, "/v1/engines/{1}/releases", Get);
 
-  METHOD_ADD(Engines::GetEngineVariants, "/{1}/releases/{2}", Get);
-  ADD_METHOD_TO(Engines::GetEngineVariants, "/v1/engines/{1}/releases/{2}",
-                Get);
+  ADD_METHOD_TO(Engines::GetEngineVariants,
+                "/v1/engines/{engine}/releases/{version}?show={show}", Get);
 
-  METHOD_ADD(Engines::GetLatestEngineVersion, "/{1}/releases/latest", Get);
   ADD_METHOD_TO(Engines::GetLatestEngineVersion,
-                "/v1/engines/{1}/releases/latest", Get);
+                "/v1/engines/{engine}/releases/latest", Get);
 
   METHOD_LIST_END
 
@@ -83,8 +81,8 @@ class Engines : public drogon::HttpController<Engines, false> {
 
   void GetEngineVariants(const HttpRequestPtr& req,
                          std::function<void(const HttpResponsePtr&)>&& callback,
-                         const std::string& engine,
-                         const std::string& version) const;
+                         const std::string& engine, const std::string& version,
+                         std::optional<std::string> show) const;
 
   void GetInstalledEngineVariants(
       const HttpRequestPtr& req,
 
@@ -216,10 +216,8 @@ void Files::RetrieveFileContent(
         return;
       }
 
-      auto [buffer, size] = std::move(res.value());
-      auto resp = HttpResponse::newHttpResponse();
-      resp->setBody(std::string(buffer.get(), size));
-      resp->setContentTypeCode(CT_APPLICATION_OCTET_STREAM);
+      auto resp =
+          cortex_utils::CreateCortexContentResponse(std::move(res.value()));
       callback(resp);
     } else {
       if (!msg_res->rel_path.has_value()) {
@@ -243,10 +241,8 @@ void Files::RetrieveFileContent(
         return;
       }
 
-      auto [buffer, size] = std::move(content_res.value());
-      auto resp = HttpResponse::newHttpResponse();
-      resp->setBody(std::string(buffer.get(), size));
-      resp->setContentTypeCode(CT_APPLICATION_OCTET_STREAM);
+      auto resp = cortex_utils::CreateCortexContentResponse(
+          std::move(content_res.value()));
       callback(resp);
     }
   }
@@ -261,9 +257,6 @@ void Files::RetrieveFileContent(
     return;
   }
 
-  auto [buffer, size] = std::move(res.value());
-  auto resp = HttpResponse::newHttpResponse();
-  resp->setBody(std::string(buffer.get(), size));
-  resp->setContentTypeCode(CT_APPLICATION_OCTET_STREAM);
+  auto resp = cortex_utils::CreateCortexContentResponse(std::move(res.value()));
   callback(resp);
 }
@@ -3,7 +3,6 @@
 #include "trantor/utils/Logger.h"
 #include "utils/cortex_utils.h"
 #include "utils/function_calling/common.h"
-#include "utils/http_util.h"
 
 using namespace inferences;
 
@@ -27,6 +26,14 @@ void server::ChatCompletion(
     std::function<void(const HttpResponsePtr&)>&& callback) {
   LOG_DEBUG << "Start chat completion";
   auto json_body = req->getJsonObject();
+  if (json_body == nullptr) {
+    Json::Value ret;
+    ret["message"] = "Body can't be empty";
+    auto resp = cortex_utils::CreateCortexHttpJsonResponse(ret);
+    resp->setStatusCode(k400BadRequest);
+    callback(resp);
+    return;
+  }
   bool is_stream = (*json_body).get("stream", false).asBool();
   auto model_id = (*json_body).get("model", "invalid_model").asString();
   auto engine_type = [this, &json_body]() -> std::string {
 
@@ -28,14 +28,14 @@ def test_engines_get_llamacpp_should_be_successful(self):
 
     # engines install
     def test_engines_install_llamacpp_specific_version_and_variant(self):
-        data = {"version": "v0.1.35-27.10.24", "variant": "linux-amd64-avx-cuda-11-7"}
+        data = {"version": "v0.1.40-b4354", "variant": "linux-amd64-avx-cuda-11-7"}
         response = requests.post(
             "http://localhost:3928/v1/engines/llama-cpp/install", json=data
         )
         assert response.status_code == 200
 
     def test_engines_install_llamacpp_specific_version_and_null_variant(self):
-        data = {"version": "v0.1.35-27.10.24"}
+        data = {"version": "v0.1.40-b4354"}
         response = requests.post(
             "http://localhost:3928/v1/engines/llama-cpp/install", json=data
         )
Original file line number	Diff line number	Diff line change
`@@ -25,7 +25,7 @@ bool ModelStatusCmd::IsLoaded(const std::string& host, int port,`
`25`	`25`	`auto res = curl_utils::SimpleGetJson(url.ToFullPath());`
`26`	`26`	`if (res.has_error()) {`
`27`	`27`	`auto root = json_helper::ParseJsonString(res.error());`
`28`		`- CLI_LOG(root["message"].asString());`
	`28`	`+ CTL_WRN(root["message"].asString());`
`29`	`29`	`return false;`
`30`	`30`	`}`
`31`	`31`
Original file line number	Diff line number	Diff line change
`@@ -216,10 +216,8 @@ void Files::RetrieveFileContent(`
`216`	`216`	`return;`
`217`	`217`	`}`
`218`	`218`
`219`		`- auto [buffer, size] = std::move(res.value());`
`220`		`- auto resp = HttpResponse::newHttpResponse();`
`221`		`- resp->setBody(std::string(buffer.get(), size));`
`222`		`- resp->setContentTypeCode(CT_APPLICATION_OCTET_STREAM);`
	`219`	`+ auto resp =`
	`220`	`+ cortex_utils::CreateCortexContentResponse(std::move(res.value()));`
`223`	`221`	`callback(resp);`
`224`	`222`	`} else {`
`225`	`223`	`if (!msg_res->rel_path.has_value()) {`
`@@ -243,10 +241,8 @@ void Files::RetrieveFileContent(`
`243`	`241`	`return;`
`244`	`242`	`}`
`245`	`243`
`246`		`- auto [buffer, size] = std::move(content_res.value());`
`247`		`- auto resp = HttpResponse::newHttpResponse();`
`248`		`- resp->setBody(std::string(buffer.get(), size));`
`249`		`- resp->setContentTypeCode(CT_APPLICATION_OCTET_STREAM);`
	`244`	`+ auto resp = cortex_utils::CreateCortexContentResponse(`
	`245`	`+ std::move(content_res.value()));`
`250`	`246`	`callback(resp);`
`251`	`247`	`}`
`252`	`248`	`}`
`@@ -261,9 +257,6 @@ void Files::RetrieveFileContent(`
`261`	`257`	`return;`
`262`	`258`	`}`
`263`	`259`
`264`		`- auto [buffer, size] = std::move(res.value());`
`265`		`- auto resp = HttpResponse::newHttpResponse();`
`266`		`- resp->setBody(std::string(buffer.get(), size));`
`267`		`- resp->setContentTypeCode(CT_APPLICATION_OCTET_STREAM);`
	`260`	`+ auto resp = cortex_utils::CreateCortexContentResponse(std::move(res.value()));`
`268`	`261`	`callback(resp);`
`269`	`262`	`}`