diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile
index 014c60ef3dc0..35fba161b1d5 100644
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
-LLAMA_VERSION?=5c8a717128cc98aa9e5b1c44652f5cf458fd426e
+LLAMA_VERSION?=9d52f17ae33e8df958e20f3f1b13bfec53ab5a1d
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index 0fc208783dc2..e08413d0cf58 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -392,6 +392,34 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions
     // Initialize grpc_servers to empty (can be overridden by options)
     std::string grpc_servers_option = "";

+    // Initialize fit_params options (can be overridden by options)
+    // fit_params: whether to auto-adjust params to fit device memory (default: true as in llama.cpp)
+    params.fit_params = true;
+    // fit_params_target: target margin per device in bytes (default: 1GB)
+    params.fit_params_target = 1024 * 1024 * 1024;
+    // fit_params_min_ctx: minimum context size for fit (default: 4096)
+    params.fit_params_min_ctx = 4096;
+
+    // Initialize additional server options (can be overridden by options)
+    // n_cache_reuse: min chunk size for KV cache reuse via shifting (default: 0 = disabled)
+    params.n_cache_reuse = 0;
+    // slot_prompt_similarity: threshold for slot prompt matching (default: 0.1)
+    params.slot_prompt_similarity = 0.1f;
+    // swa_full: use full-size SWA cache (default: false)
+    params.swa_full = false;
+    // cont_batching: continuous batching (default: true, auto-enabled when n_parallel > 1)
+    params.cont_batching = true;
+    // check_tensors: validate tensor data (default: false)
+    params.check_tensors = false;
+    // warmup: enable warmup run (default: true)
+    params.warmup = true;
+    // no_op_offload: disable host tensor op offload (default: false)
+    params.no_op_offload = false;
+    // kv_unified: enable unified KV cache (default: false)
+    params.kv_unified = false;
+    // n_ctx_checkpoints: max context checkpoints per slot (default: 8)
+    params.n_ctx_checkpoints = 8;
+
     // decode options. Options are in form optname:optvale, or if booleans only optname.
     for (int i = 0; i < request->options_size(); i++) {
         std::string opt = request->options(i);
@@ -436,6 +464,89 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions
             if (optval != NULL) {
                 grpc_servers_option = optval_str;
             }
+        } else if (!strcmp(optname, "fit_params") || !strcmp(optname, "fit")) {
+            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
+                params.fit_params = true;
+            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
+                params.fit_params = false;
+            }
+        } else if (!strcmp(optname, "fit_params_target") || !strcmp(optname, "fit_target")) {
+            if (optval != NULL) {
+                try {
+                    // Value is in MiB, convert to bytes
+                    params.fit_params_target = static_cast<size_t>(std::stoi(optval_str)) * 1024 * 1024;
+                } catch (const std::exception& e) {
+                    // If conversion fails, keep default value (1GB)
+                }
+            }
+        } else if (!strcmp(optname, "fit_params_min_ctx") || !strcmp(optname, "fit_ctx")) {
+            if (optval != NULL) {
+                try {
+                    params.fit_params_min_ctx = std::stoi(optval_str);
+                } catch (const std::exception& e) {
+                    // If conversion fails, keep default value (4096)
+                }
+            }
+        } else if (!strcmp(optname, "n_cache_reuse") || !strcmp(optname, "cache_reuse")) {
+            if (optval != NULL) {
+                try {
+                    params.n_cache_reuse = std::stoi(optval_str);
+                } catch (const std::exception& e) {
+                    // If conversion fails, keep default value (0)
+                }
+            }
+        } else if (!strcmp(optname, "slot_prompt_similarity") || !strcmp(optname, "sps")) {
+            if (optval != NULL) {
+                try {
+                    params.slot_prompt_similarity = std::stof(optval_str);
+                } catch (const std::exception& e) {
+                    // If conversion fails, keep default value (0.1)
+                }
+            }
+        } else if (!strcmp(optname, "swa_full")) {
+            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
+                params.swa_full = true;
+            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
+                params.swa_full = false;
+            }
+        } else if (!strcmp(optname, "cont_batching") || !strcmp(optname, "continuous_batching")) {
+            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
+                params.cont_batching = true;
+            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
+                params.cont_batching = false;
+            }
+        } else if (!strcmp(optname, "check_tensors")) {
+            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
+                params.check_tensors = true;
+            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
+                params.check_tensors = false;
+            }
+        } else if (!strcmp(optname, "warmup")) {
+            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
+                params.warmup = true;
+            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
+                params.warmup = false;
+            }
+        } else if (!strcmp(optname, "no_op_offload")) {
+            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
+                params.no_op_offload = true;
+            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
+                params.no_op_offload = false;
+            }
+        } else if (!strcmp(optname, "kv_unified") || !strcmp(optname, "unified_kv")) {
+            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
+                params.kv_unified = true;
+            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
+                params.kv_unified = false;
+            }
+        } else if (!strcmp(optname, "n_ctx_checkpoints") || !strcmp(optname, "ctx_checkpoints")) {
+            if (optval != NULL) {
+                try {
+                    params.n_ctx_checkpoints = std::stoi(optval_str);
+                } catch (const std::exception& e) {
+                    // If conversion fails, keep default value (8)
+                }
+            }
         }
     }
diff --git a/docs/content/features/text-generation.md b/docs/content/features/text-generation.md
index 3831f8e1fa5f..69797f87fe4b 100644
--- a/docs/content/features/text-generation.md
+++ b/docs/content/features/text-generation.md
@@ -149,6 +149,18 @@ The `llama.cpp` backend supports additional configuration options that can be sp
 | `cache_ram` | integer | Set the maximum RAM cache size in MiB for KV cache. Use `-1` for unlimited (default). | `cache_ram:2048` |
 | `parallel` or `n_parallel` | integer | Enable parallel request processing. When set to a value greater than 1, enables continuous batching for handling multiple requests concurrently. | `parallel:4` |
 | `grpc_servers` or `rpc_servers` | string | Comma-separated list of gRPC server addresses for distributed inference. Allows distributing workload across multiple llama.cpp workers. | `grpc_servers:localhost:50051,localhost:50052` |
+| `fit_params` or `fit` | boolean | Enable auto-adjustment of model/context parameters to fit available device memory. Default: `true`. | `fit_params:true` |
+| `fit_params_target` or `fit_target` | integer | Target margin per device in MiB when using fit_params. Default: `1024` (1GB). | `fit_target:2048` |
+| `fit_params_min_ctx` or `fit_ctx` | integer | Minimum context size that can be set by fit_params. Default: `4096`. | `fit_ctx:2048` |
+| `n_cache_reuse` or `cache_reuse` | integer | Minimum chunk size to attempt reusing from the cache via KV shifting. Default: `0` (disabled). | `cache_reuse:256` |
+| `slot_prompt_similarity` or `sps` | float | How much the prompt of a request must match the prompt of a slot to use that slot. Default: `0.1`. Set to `0` to disable. | `sps:0.5` |
+| `swa_full` | boolean | Use full-size SWA (Sliding Window Attention) cache. Default: `false`. | `swa_full:true` |
+| `cont_batching` or `continuous_batching` | boolean | Enable continuous batching for handling multiple sequences. Default: `true`. | `cont_batching:true` |
+| `check_tensors` | boolean | Validate tensor data for invalid values during model loading. Default: `false`. | `check_tensors:true` |
+| `warmup` | boolean | Enable warmup run after model loading. Default: `true`. | `warmup:false` |
+| `no_op_offload` | boolean | Disable offloading host tensor operations to device. Default: `false`. | `no_op_offload:true` |
+| `kv_unified` or `unified_kv` | boolean | Enable unified KV cache. Default: `false`. | `kv_unified:true` |
+| `n_ctx_checkpoints` or `ctx_checkpoints` | integer | Maximum number of context checkpoints per slot. Default: `8`. | `ctx_checkpoints:4` |

 **Example configuration with options:**

@@ -162,6 +174,9 @@ options:
 - context_shift:true
 - cache_ram:4096
 - parallel:2
+- fit_params:true
+- fit_target:1024
+- slot_prompt_similarity:0.5
 ```

 **Note:** The `parallel` option can also be set via the `LLAMACPP_PARALLEL` environment variable, and `grpc_servers` can be set via the `LLAMACPP_GRPC_SERVERS` environment variable. Options specified in the YAML file take precedence over environment variables.
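
For context, the sketch below illustrates, outside the server, the `optname:optvalue` decoding pattern the new branches follow: `strtok` splits each option string, recognized truthy/falsy spellings toggle booleans, numeric values go through `std::stoi` with the default kept on failure, and `fit_target` is converted from MiB to bytes. The `parse_bool` helper, the hard-coded option list, and the `main` driver are illustrative only and not part of this change; in the server the options come from the gRPC `ModelOptions` request.

```cpp
// Standalone sketch of the option decoding used in params_parse above.
// parse_bool and the hard-coded option list are illustrative only.
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

// Accepts the same truthy/falsy spellings as the server branches
// ("true"/"1"/"yes"/"on"/"enabled" vs "false"/"0"/"no"/"off"/"disabled");
// anything else (including an empty value) keeps the fallback.
static bool parse_bool(const std::string& val, bool fallback) {
    if (val == "true" || val == "1" || val == "yes" || val == "on" || val == "enabled") return true;
    if (val == "false" || val == "0" || val == "no" || val == "off" || val == "disabled") return false;
    return fallback;
}

int main() {
    // Example option strings, in the same "optname:optvalue" form the backend receives.
    std::vector<std::string> options = {"fit_params:true", "fit_target:2048", "warmup"};

    // Defaults mirroring the initialization block in the diff.
    bool   fit_params        = true;
    size_t fit_params_target = 1024ull * 1024 * 1024; // 1 GiB
    bool   warmup            = true;

    for (auto opt : options) { // copy each string: strtok mutates the buffer
        char* optname = strtok(&opt[0], ":");
        char* optval  = strtok(NULL, ":"); // NULL when the option carries no value
        std::string optval_str = optval != NULL ? optval : "";

        if (!strcmp(optname, "fit_params") || !strcmp(optname, "fit")) {
            fit_params = parse_bool(optval_str, fit_params);
        } else if (!strcmp(optname, "fit_params_target") || !strcmp(optname, "fit_target")) {
            try {
                // Value is given in MiB and stored in bytes.
                fit_params_target = static_cast<size_t>(std::stoi(optval_str)) * 1024 * 1024;
            } catch (const std::exception&) {
                // keep the default on a missing or malformed number
            }
        } else if (!strcmp(optname, "warmup")) {
            warmup = parse_bool(optval_str, warmup);
        }
    }

    std::printf("fit_params=%d fit_params_target=%zu warmup=%d\n",
                fit_params, fit_params_target, warmup);
    return 0;
}
```

As in the diff, an unrecognized or malformed value leaves the corresponding default untouched rather than failing the whole model load.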