This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit a51c2c5

Feat/structured output (#308)

* Add openai compatible embedding
* Feat: add structured output in chat completion request
* chore: remove unnecessary cout
1 parent 2368b3f commit a51c2c5

File tree

src/chat_completion_request.h
src/llama_engine.cc

2 files changed: +16 −1 lines changed


src/chat_completion_request.h

Lines changed: 2 additions & 0 deletions
@@ -88,6 +88,7 @@ struct ChatCompletionRequest {
   bool include_usage = false;
   std::string grammar;
   Json::Value logit_bias = Json::Value(Json::arrayValue);
+  Json::Value json_schema;
 
   static Json::Value ConvertLogitBiasToArray(const Json::Value& input) {
     Json::Value result(Json::arrayValue);
@@ -155,6 +156,7 @@ inline ChatCompletionRequest fromJson(std::shared_ptr<Json::Value> jsonBody) {
   completion.min_keep = (*jsonBody).get("min_keep", 0).asInt();
   completion.n = (*jsonBody).get("n", 1).asInt();
   completion.grammar = (*jsonBody).get("grammar", "").asString();
+  completion.json_schema = (*jsonBody).get("response_format", Json::Value::null);
   const Json::Value& input_logit_bias = (*jsonBody)["logit_bias"];
   if (!input_logit_bias.isNull()) {
     completion.logit_bias =
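
For context (this illustration is not part of the commit), the sketch below shows what the new field holds, using JsonCpp as the header already does: a hypothetical request body carrying an OpenAI-style response_format object, read with the same get("response_format", Json::Value::null) lookup that fromJson now performs. The nested field names (type, json_schema, schema, properties) follow the OpenAI convention the diff assumes; the rest of ChatCompletionRequest is omitted.

// Sketch only: demonstrates the new json_schema field with JsonCpp.
#include <json/json.h>
#include <iostream>
#include <memory>

int main() {
  // Hypothetical request body; only the fields relevant to structured output
  // are shown.
  auto body = std::make_shared<Json::Value>();
  (*body)["grammar"] = "";
  (*body)["response_format"]["type"] = "json_schema";
  (*body)["response_format"]["json_schema"]["schema"]["type"] = "object";
  (*body)["response_format"]["json_schema"]["schema"]["properties"]["answer"]
      ["type"] = "string";

  // Mirrors the line added to fromJson(): a missing "response_format" leaves
  // json_schema null, so requests without it behave exactly as before.
  Json::Value json_schema = (*body).get("response_format", Json::Value::null);
  std::cout << (json_schema.isNull() ? "no structured output requested\n"
                                     : json_schema.toStyledString());
  return 0;
}

Because json_schema defaults to a null Json::Value, clients that never send response_format are unaffected by this change.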

src/llama_engine.cc

Lines changed: 14 additions & 1 deletion
@@ -3,10 +3,12 @@
 #include <cmath>
 #include <limits>
 #include <optional>
+#include "json-schema-to-grammar.h"
 #include "json/writer.h"
 #include "llama_utils.h"
 #include "trantor/utils/Logger.h"
 
+
 #if defined(_WIN32)
 #include <windows.h>
 #include <codecvt>
@@ -56,6 +58,7 @@ bool AreAllElementsInt32(const Json::Value& arr) {
   }
   // Check if value is within int32_t range
   auto value = element.asInt();
+
   if (value < std::numeric_limits<int32_t>::min() ||
       value > std::numeric_limits<int32_t>::max()) {
     return false;
@@ -748,6 +751,15 @@ void LlamaEngine::HandleInferenceImpl(
   data["n_probs"] = completion.n_probs;
   data["min_keep"] = completion.min_keep;
   data["grammar"] = completion.grammar;
+  if (!completion.json_schema.isNull() &&
+      (completion.json_schema.isMember("type") &&
+       (completion.json_schema["type"] == "json_object" ||
+        completion.json_schema["type"] == "json_schema"))) {
+
+    data["grammar"] =
+        json_schema_to_grammar(llama::inferences::ConvertJsonCppToNlohmann(
+            completion.json_schema["json_schema"]["schema"]));
+  }
   data["n"] = completion.n;  // number of choices to return
   json arr = json::array();
   for (const auto& elem : completion.logit_bias) {
@@ -1039,7 +1051,6 @@ void LlamaEngine::HandleInferenceImpl(
   status["is_stream"] = false;
   status["status_code"] = k200OK;
   cb(std::move(status), std::move(respData));
-
   LOG_INFO << "Request " << request_id << ": " << "Inference completed";
 }
 });
@@ -1091,6 +1102,7 @@ void LlamaEngine::HandleEmbeddingImpl(
   prompt_tokens +=
       static_cast<int>(result.result_json["tokens_evaluated"]);
   std::vector<float> embedding_result = result.result_json["embedding"];
+
   responseData.append(
       CreateEmbeddingPayload(embedding_result, 0, is_base64));
 } else {
@@ -1128,6 +1140,7 @@ void LlamaEngine::HandleEmbeddingImpl(
   prompt_tokens += cur_pt;
   std::vector<float> embedding_result =
       result.result_json["embedding"];
+
   responseData.append(
       CreateEmbeddingPayload(embedding_result, i, is_base64));
 }
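
To make the new branch in HandleInferenceImpl easier to follow, here is a minimal, self-contained sketch of what it does: when the parsed response_format has type json_object or json_schema, the schema under json_schema.schema is turned into a GBNF grammar that overrides any grammar string the client supplied. The sketch assumes llama.cpp's json_schema_to_grammar(const nlohmann::ordered_json&) and substitutes a simple round-trip helper (ToNlohmann) for llama::inferences::ConvertJsonCppToNlohmann, so treat it as an illustration of the mechanism rather than the exact code in this revision.

// Sketch only: the structured-output branch, reduced to a standalone program.
// json_schema_to_grammar() is taken from llama.cpp's common/json-schema-to-grammar.h
// (signature assumed); ToNlohmann() is a stand-in converter.
#include <iostream>
#include <string>
#include <json/json.h>
#include <nlohmann/json.hpp>
#include "json-schema-to-grammar.h"

// Stand-in converter: round-trips a JsonCpp value through its text form.
static nlohmann::ordered_json ToNlohmann(const Json::Value& v) {
  Json::StreamWriterBuilder builder;
  return nlohmann::ordered_json::parse(Json::writeString(builder, v));
}

int main() {
  // Hypothetical response_format, as parsed into completion.json_schema.
  Json::Value json_schema;
  json_schema["type"] = "json_schema";
  json_schema["json_schema"]["schema"]["type"] = "object";
  json_schema["json_schema"]["schema"]["properties"]["answer"]["type"] = "string";

  std::string grammar = "";  // whatever the request put in "grammar"

  // Same condition as the diff: accept both "json_object" and "json_schema".
  if (!json_schema.isNull() && json_schema.isMember("type") &&
      (json_schema["type"] == "json_object" ||
       json_schema["type"] == "json_schema")) {
    // The schema-derived GBNF grammar replaces any user-supplied grammar.
    grammar =
        json_schema_to_grammar(ToNlohmann(json_schema["json_schema"]["schema"]));
  }
  std::cout << grammar << std::endl;
  return 0;
}

Note the precedence the diff establishes: data["grammar"] is first assigned from completion.grammar and then overwritten inside this branch, so a schema supplied via response_format takes priority over an explicit grammar string in the same request.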
