Update llama.cpp submodule to latest release b3943 (#257)
* Update submodule to latest release b3943

* fix: API changes

* fix: build

* fix: more

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: vansangpfiev <sang@jan.ai>
3 people authored Oct 23, 2024
1 parent f430ead commit 86146f0
Showing 12 changed files with 219 additions and 208 deletions.
56 changes: 28 additions & 28 deletions .github/workflows/build.yml

56 changes: 28 additions & 28 deletions .github/workflows/nightly-build.yml

10 changes: 5 additions & 5 deletions .github/workflows/template-e2e-weekend-test.yml
@@ -33,7 +33,7 @@ jobs:
- os: "linux"
name: "amd64-avx2"
runs-on: "ubuntu-20-04"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
run-e2e: true
vulkan: false
sccache: true
@@ -42,7 +42,7 @@ jobs:
- os: "linux"
name: "amd64-noavx-cuda-12-0"
runs-on: "ubuntu-20-04-cuda-12-0-gpu"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_FMA=OFF -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_FMA=OFF -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
run-e2e: true
vulkan: false
sccache: true
@@ -51,23 +51,23 @@ jobs:
- os: "mac"
name: "amd64"
runs-on: "macos-12"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DGGML_METAL=OFF"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL=OFF"
run-e2e: true
vulkan: false
sccache: false
sccache-conf-path: ""
- os: "mac"
name: "arm64"
runs-on: "macos-silicon"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DGGML_METAL_EMBED_LIBRARY=ON"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL_EMBED_LIBRARY=ON"
run-e2e: true
vulkan: false
sccache: false
sccache-conf-path: ""
- os: "windows"
name: "amd64-avx2"
runs-on: "windows-cuda-11-7"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja"
run-e2e: true
vulkan: false
sccache: false
56 changes: 28 additions & 28 deletions .github/workflows/template-quality-gate-pr.yml

56 changes: 28 additions & 28 deletions .github/workflows/template-quality-gate-submodule.yml

2 changes: 1 addition & 1 deletion llama.cpp
2 changes: 1 addition & 1 deletion src/chat_completion_request.h
@@ -35,7 +35,7 @@ struct ChatCompletionRequest {

inline ChatCompletionRequest fromJson(std::shared_ptr<Json::Value> jsonBody) {
ChatCompletionRequest completion;
-gpt_sampler_params default_params;
+common_sampler_params default_params;
if (jsonBody) {
completion.stream = (*jsonBody).get("stream", false).asBool();
completion.max_tokens = (*jsonBody).get("max_tokens", 500).asInt();
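
Note on the hunk above: this llama.cpp release renames its common-library symbols from the gpt_ prefix to common_, so the request parser now pulls sampler defaults from common_sampler_params. A minimal self-contained sketch of that fallback pattern follows; the field subset and include path are assumptions of the sketch, not the actual cortex.llamacpp source.

#include <json/value.h>   // jsoncpp
#include "sampling.h"     // llama.cpp common; defines common_sampler_params (path assumed)

// Simplified request struct for illustration only.
struct ChatCompletionRequestSketch {
  bool stream = false;
  int max_tokens = 500;
  float top_p = 0.95f;
  float temperature = 0.8f;
};

inline ChatCompletionRequestSketch FromJsonSketch(const Json::Value& body) {
  ChatCompletionRequestSketch completion;
  common_sampler_params default_params;  // renamed from gpt_sampler_params in this release
  completion.stream = body.get("stream", false).asBool();
  completion.max_tokens = body.get("max_tokens", 500).asInt();
  // Fields missing from the request fall back to llama.cpp's own sampler defaults.
  completion.top_p = body.get("top_p", default_params.top_p).asFloat();
  completion.temperature = body.get("temperature", default_params.temp).asFloat();
  return completion;
}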
2 changes: 1 addition & 1 deletion src/llama_client_slot.cc
@@ -26,7 +26,7 @@ void LlamaClientSlot::Reset() {
images.clear();
}

-bool LlamaClientSlot::HasBudget(gpt_params& global_params) {
+bool LlamaClientSlot::HasBudget(common_params& global_params) {
n_remaining = -1;
if (params.n_predict != -1) {
n_remaining = params.n_predict - n_decoded;
6 changes: 3 additions & 3 deletions src/llama_client_slot.h
@@ -133,8 +133,8 @@ struct LlamaClientSlot {
std::string stopping_word;

// sampling
-struct gpt_sampler_params sparams;
-struct gpt_sampler* smpl = nullptr;
+struct common_sampler_params sparams;
+struct common_sampler* smpl = nullptr;

// multimodal
std::vector<SlotImage> images;
@@ -154,7 +154,7 @@ struct LlamaClientSlot {

void Reset();

-bool HasBudget(gpt_params& global_params);
+bool HasBudget(common_params& global_params);

bool Available() const;

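
The gpt_params → common_params rename in this file only changes the type name in HasBudget's signature; the budget arithmetic itself is untouched. For reference, a reduced sketch of that check against the renamed struct (member names follow upstream llama.cpp; the slot state is trimmed to the relevant counters, so this is not the full LlamaClientSlot):

#include "common.h"  // llama.cpp common; defines common_params (path assumed)

// Trimmed-down slot: only the counters the budget check needs.
struct SlotBudgetSketch {
  int n_predict = -1;   // per-request cap; -1 defers to the global setting
  int n_decoded = 0;    // tokens generated so far in this slot
  int n_remaining = -1;

  bool HasBudget(common_params& global_params) {  // formerly gpt_params
    n_remaining = -1;
    if (n_predict != -1) {
      n_remaining = n_predict - n_decoded;
    } else if (global_params.n_predict != -1) {
      n_remaining = global_params.n_predict - n_decoded;
    }
    // n_remaining == -1 means no cap was configured anywhere (unlimited).
    return n_remaining > 0 || n_remaining == -1;
  }
};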
8 changes: 4 additions & 4 deletions src/llama_engine.cc
@@ -182,7 +182,7 @@ LlamaEngine::LlamaEngine(int log_option) {
asynce_file_logger_ = std::make_unique<trantor::FileLogger>();
}

-gpt_log_pause(gpt_log_main());
+common_log_pause(common_log_main());

llama_log_set(
[](ggml_log_level level, const char* text, void* user_data) {
@@ -403,7 +403,7 @@ void LlamaEngine::SetFileLogger(int max_log_lines,
}

bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
-gpt_params params;
+common_params params;
std::string model_type;
auto model_id = llama_utils::GetModelId(*json_body);
// By default will setting based on number of handlers
@@ -515,11 +515,11 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
LOG_DEBUG << "stop: " << server_map_[model_id].stop_words.toStyledString();

if (!json_body->operator[]("llama_log_folder").isNull()) {
-gpt_log_resume(gpt_log_main());
+common_log_resume(common_log_main());
std::string llama_log_folder =
json_body->operator[]("llama_log_folder").asString();
llama_log_folder += "llama.log";
-gpt_log_set_file(gpt_log_main(), llama_log_folder.c_str());
+common_log_set_file(common_log_main(), llama_log_folder.c_str());
} // Set folder for llama log
}
if (params.model_alias == "unknown") {
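
The logging calls follow the same prefix rename (gpt_log_* → common_log_*): the logger is paused at engine construction and only resumed once a llama_log_folder is supplied. A rough sketch of that flow; the include path and the wrapper function are assumptions of this sketch, not cortex.llamacpp code.

#include <string>
#include "log.h"  // llama.cpp common logger; declares common_log_* (path assumed)

// Keep llama.cpp's logger quiet until a log folder is provided, then route
// output to <folder>llama.log and resume logging.
void ConfigureLlamaLogSketch(const std::string& llama_log_folder) {
  common_log_pause(common_log_main());  // renamed from gpt_log_pause/gpt_log_main

  if (!llama_log_folder.empty()) {
    const std::string log_file = llama_log_folder + "llama.log";
    common_log_set_file(common_log_main(), log_file.c_str());
    common_log_resume(common_log_main());  // renamed from gpt_log_resume
  }
}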