
Commit 399d01f

Merge branch 'feat/use-llama-cpp-server' of github.com:janhq/cortex.llamacpp into feat/use-llama-cpp-server
2 parents: ba7e5af + 7bbc7fe

3 files changed: +3 -1 lines changed

README.md

Lines changed: 1 addition & 0 deletions
@@ -148,3 +148,4 @@ Table of parameters
 |`flash_attn` | Boolean| To enable Flash Attention, default is true|
 |`cache_type` | String| KV cache type: f16, q8_0, q4_0, default is f16|
 |`use_mmap` | Boolean| To enable mmap, default is true|
+|`ctx_shift` | Boolean| To enable context shift, default is true|

llama.cpp

src/llama_engine.cc

Lines changed: 1 addition & 0 deletions
@@ -712,6 +712,7 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
     }
   }
 
+  params.ctx_shift = json_body->get("ctx_shift", true).asBool();
   params.n_gpu_layers =
       json_body->get("ngl", 300)
           .asInt();  // change from 100 -> 300 since llama 3.1 has 292 gpu layers
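
For context, here is a minimal sketch of the jsoncpp get-with-default pattern the added line relies on. The request bodies below are made-up examples, not actual cortex.llamacpp requests: `Json::Value::get(key, default)` returns the default when the key is missing, so `ctx_shift` stays `true` unless a request explicitly sets it to `false`, matching the README entry above.

```cpp
// Minimal jsoncpp sketch of the get-with-default pattern used in
// LoadModelImpl above. The request bodies here are made-up examples.
#include <json/json.h>

#include <iostream>
#include <memory>

int main() {
  // Body without "ctx_shift": the default (true) is used, as documented
  // in the README parameter table.
  auto body_default = std::make_shared<Json::Value>();
  (*body_default)["ngl"] = 33;

  // Body that explicitly disables context shift.
  auto body_disabled = std::make_shared<Json::Value>();
  (*body_disabled)["ctx_shift"] = false;

  std::cout << std::boolalpha
            << body_default->get("ctx_shift", true).asBool() << "\n"   // true
            << body_disabled->get("ctx_shift", true).asBool() << "\n"  // false
            << body_default->get("ngl", 300).asInt() << std::endl;     // 33
  return 0;
}
```

Omitting the key therefore keeps context shift enabled by default, while a client that wants a fixed context window can pass `"ctx_shift": false` in the load request.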
