diff --git a/docs/parameters.md b/docs/parameters.md index 95b4bd5885..494b525ba5 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -130,6 +130,7 @@ Task specific parameters for different tasks (text generation/image generation/e | `--dynamic_split_fuse` | `bool` | Enables dynamic split fuse algorithm. Default: true. | | `--max_prompt_len` | `integer` | Sets NPU specific property for maximum number of tokens in the prompt. | | `--kv_cache_precision` | `string` | Reduced kv cache precision to `u8` lowers the cache size consumption. Accepted values: `u8` or empty (default). | +| `--model_distribution_policy` | `string` | TENSOR_PARALLEL distributes tensor to multiple sockets/devices and processes it in parallel. PIPELINE_PARALLEL distributes different tensors to process by each device. Accepted values: `TENSOR_PARALLEL`, `PIPELINE_PARALLEL` or empty (default). | | `--reasoning_parser` | `string` | Type of parser to use for reasoning content extraction from model output. Currently supported: [qwen3] | | `--tool_parser` | `string` | Type of parser to use for tool calls extraction from model output. Currently supported: [llama3, hermes3, phi4] | | `--enable_tool_guided_generation` | `bool` | Enables enforcing tool schema during generation. Requires setting response parser. Default: false. | diff --git a/src/capi_frontend/server_settings.cpp b/src/capi_frontend/server_settings.cpp index e439f4d382..f4cafcde63 100644 --- a/src/capi_frontend/server_settings.cpp +++ b/src/capi_frontend/server_settings.cpp @@ -20,7 +20,6 @@ #include "../stringutils.hpp" namespace ovms { - std::string enumToString(ConfigExportType type) { auto it = configExportTypeToString.find(type); return (it != configExportTypeToString.end()) ? 
it->second : "UNKNOWN_MODEL"; diff --git a/src/capi_frontend/server_settings.hpp b/src/capi_frontend/server_settings.hpp index 4b0700b1f1..3b8c46b8db 100644 --- a/src/capi_frontend/server_settings.hpp +++ b/src/capi_frontend/server_settings.hpp @@ -88,20 +88,29 @@ enum OvmsServerMode : int { }; struct PluginConfigSettingsImpl { + std::optional manualString; std::optional kvCachePrecision; std::optional maxPromptLength; std::optional modelDistributionPolicy; + std::optional numStreams; + std::optional cacheDir; + std::optional useNpuPrefixCaching; + bool empty() const { + return !kvCachePrecision.has_value() && + !maxPromptLength.has_value() && + !modelDistributionPolicy.has_value() && + !numStreams.has_value() && + !cacheDir.has_value() && + !useNpuPrefixCaching.has_value() && + (!manualString.has_value() || manualString.value().empty()); + } }; struct TextGenGraphSettingsImpl { - std::string modelPath = "./"; - std::string modelName = ""; uint32_t maxNumSeqs = 256; - std::string targetDevice = "CPU"; std::string enablePrefixCaching = "true"; uint32_t cacheSize = 10; std::string dynamicSplitFuse = "true"; - PluginConfigSettingsImpl pluginConfig; std::optional maxNumBatchedTokens; std::optional draftModelDirName; std::optional pipelineType; @@ -111,27 +120,16 @@ struct TextGenGraphSettingsImpl { }; struct EmbeddingsGraphSettingsImpl { - std::string modelPath = "./"; - std::string targetDevice = "CPU"; - std::string modelName = ""; - uint32_t numStreams = 1; std::string normalize = "true"; std::string truncate = "false"; std::string pooling = "CLS"; }; struct RerankGraphSettingsImpl { - std::string modelPath = "./"; - std::string targetDevice = "CPU"; - std::string modelName = ""; - uint32_t numStreams = 1; uint64_t maxAllowedChunks = 10000; }; struct ImageGenerationGraphSettingsImpl { - std::string modelName = ""; - std::string modelPath = "./"; - std::string targetDevice = "CPU"; std::string resolution = ""; std::string maxResolution = ""; std::string 
defaultResolution = ""; @@ -140,13 +138,15 @@ struct ImageGenerationGraphSettingsImpl { std::optional maxNumberImagesPerPrompt; std::optional defaultNumInferenceSteps; std::optional maxNumInferenceSteps; - std::string pluginConfig; }; struct ExportSettings { + std::string modelName = ""; + std::string modelPath = "./"; std::string targetDevice = "CPU"; std::optional extraQuantizationParams; std::string precision = "int8"; + PluginConfigSettingsImpl pluginConfig; }; struct HFSettingsImpl { diff --git a/src/cli_parser.cpp b/src/cli_parser.cpp index a60f6e1764..923d0a76bb 100644 --- a/src/cli_parser.cpp +++ b/src/cli_parser.cpp @@ -574,6 +574,7 @@ void CLIParser::prepareModel(ModelsSettingsImpl& modelsSettings, HFSettingsImpl& if (result->count("plugin_config")) { modelsSettings.pluginConfig = result->operator[]("plugin_config").as(); + hfSettings.exportSettings.pluginConfig.manualString = modelsSettings.pluginConfig; modelsSettings.userSetSingleModelArguments.push_back("plugin_config"); } @@ -684,6 +685,9 @@ void CLIParser::prepareGraph(ServerSettingsImpl& serverSettings, HFSettingsImpl& throw std::logic_error("Tried to prepare graph settings without graph parser initialization"); } } + if (!serverSettings.cacheDir.empty()) { + hfSettings.exportSettings.pluginConfig.cacheDir = serverSettings.cacheDir; + } // No pull nor pull and start mode } else { if (result->count("weight-format")) { diff --git a/src/config.cpp b/src/config.cpp index 59d5498117..0a1744c7f5 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -129,40 +129,41 @@ bool Config::validate() { std::cerr << "Graph options not initialized for text generation."; return false; } - auto settings = std::get(this->serverSettings.hfSettings.graphSettings); + const auto& exportSettings = this->serverSettings.hfSettings.exportSettings; + auto textGenSettings = std::get(this->serverSettings.hfSettings.graphSettings); std::vector allowedPipelineTypes = {"LM", "LM_CB", "VLM", "VLM_CB", "AUTO"}; - if 
(settings.pipelineType.has_value() && std::find(allowedPipelineTypes.begin(), allowedPipelineTypes.end(), settings.pipelineType) == allowedPipelineTypes.end()) { - std::cerr << "pipeline_type: " << settings.pipelineType.value() << " is not allowed. Supported types: LM, LM_CB, VLM, VLM_CB, AUTO" << std::endl; + if (textGenSettings.pipelineType.has_value() && std::find(allowedPipelineTypes.begin(), allowedPipelineTypes.end(), textGenSettings.pipelineType) == allowedPipelineTypes.end()) { + std::cerr << "pipeline_type: " << textGenSettings.pipelineType.value() << " is not allowed. Supported types: LM, LM_CB, VLM, VLM_CB, AUTO" << std::endl; return false; } std::vector allowedTargetDevices = {"CPU", "GPU", "NPU", "AUTO"}; bool validDeviceSelected = false; - if (settings.targetDevice.rfind("GPU.", 0) == 0) { + if (exportSettings.targetDevice.rfind("GPU.", 0) == 0) { // Accept GPU.x where x is a number to select specific GPU card - std::string indexPart = settings.targetDevice.substr(4); + std::string indexPart = exportSettings.targetDevice.substr(4); validDeviceSelected = !indexPart.empty() && std::all_of(indexPart.begin(), indexPart.end(), ::isdigit); - } else if (settings.targetDevice.rfind("HETERO", 0) == 0) { - // Accept HETERO:,,... to select specific devices in the list + } else if ((exportSettings.targetDevice.rfind("HETERO", 0) == 0) || (exportSettings.targetDevice.rfind("AUTO", 0) == 0)) { + // Accept HETERO:,,... AUTO:,,... to select specific devices in the list validDeviceSelected = true; - } else if (std::find(allowedTargetDevices.begin(), allowedTargetDevices.end(), settings.targetDevice) != allowedTargetDevices.end()) { + } else if (std::find(allowedTargetDevices.begin(), allowedTargetDevices.end(), exportSettings.targetDevice) != allowedTargetDevices.end()) { // Accept CPU, GPU, NPU, AUTO as valid devices validDeviceSelected = true; } if (!validDeviceSelected) { - std::cerr << "target_device: " << settings.targetDevice << " is not allowed. 
Supported devices: CPU, GPU, NPU, HETERO, AUTO" << std::endl; + std::cerr << "target_device: " << exportSettings.targetDevice << " is not allowed. Supported devices: CPU, GPU, NPU, HETERO, AUTO" << std::endl; return false; } std::vector allowedBoolValues = {"false", "true"}; - if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), settings.enablePrefixCaching) == allowedBoolValues.end()) { - std::cerr << "enable_prefix_caching: " << settings.enablePrefixCaching << " is not allowed. Supported values: true, false" << std::endl; + if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), textGenSettings.enablePrefixCaching) == allowedBoolValues.end()) { + std::cerr << "enable_prefix_caching: " << textGenSettings.enablePrefixCaching << " is not allowed. Supported values: true, false" << std::endl; return false; } - if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), settings.dynamicSplitFuse) == allowedBoolValues.end()) { - std::cerr << "dynamic_split_fuse: " << settings.dynamicSplitFuse << " is not allowed. Supported values: true, false" << std::endl; + if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), textGenSettings.dynamicSplitFuse) == allowedBoolValues.end()) { + std::cerr << "dynamic_split_fuse: " << textGenSettings.dynamicSplitFuse << " is not allowed. Supported values: true, false" << std::endl; return false; } } @@ -172,16 +173,16 @@ bool Config::validate() { std::cerr << "Graph options not initialized for embeddings."; return false; } - auto settings = std::get(this->serverSettings.hfSettings.graphSettings); + auto embedSettings = std::get(this->serverSettings.hfSettings.graphSettings); std::vector allowedBoolValues = {"false", "true"}; - if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), settings.normalize) == allowedBoolValues.end()) { - std::cerr << "normalize: " << settings.normalize << " is not allowed. 
Supported values: true, false" << std::endl; + if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), embedSettings.normalize) == allowedBoolValues.end()) { + std::cerr << "normalize: " << embedSettings.normalize << " is not allowed. Supported values: true, false" << std::endl; return false; } - if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), settings.truncate) == allowedBoolValues.end()) { - std::cerr << "truncate: " << settings.truncate << " is not allowed. Supported values: true, false" << std::endl; + if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), embedSettings.truncate) == allowedBoolValues.end()) { + std::cerr << "truncate: " << embedSettings.truncate << " is not allowed. Supported values: true, false" << std::endl; return false; } } diff --git a/src/graph_export/BUILD b/src/graph_export/BUILD index 7522ca76fd..66af85677a 100644 --- a/src/graph_export/BUILD +++ b/src/graph_export/BUILD @@ -80,7 +80,6 @@ ovms_cc_library( "@ovms//src:libovms_server_settings", "@ovms//src:ovms_exit_codes", "@com_github_jarro2783_cxxopts//:cxxopts", - "@com_github_tencent_rapidjson//:rapidjson", ], visibility = ["//visibility:public"], ) diff --git a/src/graph_export/embeddings_graph_cli_parser.cpp b/src/graph_export/embeddings_graph_cli_parser.cpp index 7f77d98318..8bdcffe7bf 100644 --- a/src/graph_export/embeddings_graph_cli_parser.cpp +++ b/src/graph_export/embeddings_graph_cli_parser.cpp @@ -81,11 +81,11 @@ std::vector EmbeddingsGraphCLIParser::parse(const std::vectoroperator[]("num_streams").as(); + hfSettings.exportSettings.pluginConfig.numStreams = result->operator[]("num_streams").as(); embeddingsGraphSettings.normalize = result->operator[]("normalize").as(); embeddingsGraphSettings.truncate = result->operator[]("truncate").as(); embeddingsGraphSettings.pooling = result->operator[]("pooling").as(); diff --git a/src/graph_export/graph_cli_parser.cpp b/src/graph_export/graph_cli_parser.cpp index 4e61bf689c..f59962e718 100644 
--- a/src/graph_export/graph_cli_parser.cpp +++ b/src/graph_export/graph_cli_parser.cpp @@ -89,7 +89,11 @@ void GraphCLIParser::createOptions() { ("kv_cache_precision", "u8 or empty (model default). Reduced kv cache precision to u8 lowers the cache size consumption.", cxxopts::value()->default_value(""), - "KV_CACHE_PRECISION"); + "KV_CACHE_PRECISION") + ("model_distribution_policy", + "TENSOR_PARALLEL, PIPELINE_PARALLEL or empty (model default). Sets model distribution policy for inference with multiple sockets/devices.", + cxxopts::value(), + "MODEL_DISTRIBUTION_POLICY"); } void GraphCLIParser::printHelp() { @@ -115,12 +119,12 @@ std::vector GraphCLIParser::parse(const std::vector& u void GraphCLIParser::prepare(OvmsServerMode serverMode, HFSettingsImpl& hfSettings, const std::string& modelName) { TextGenGraphSettingsImpl graphSettings = GraphCLIParser::defaultGraphSettings(); - graphSettings.targetDevice = hfSettings.exportSettings.targetDevice; + // target device is already held in hfSettings.exportSettings.targetDevice; no copy needed after refactor // Deduct model name if (modelName != "") { - graphSettings.modelName = modelName; + hfSettings.exportSettings.modelName = modelName; } else { - graphSettings.modelName = hfSettings.sourceModel; + hfSettings.exportSettings.modelName = hfSettings.sourceModel; } if (nullptr == result) { @@ -131,6 +135,9 @@ void GraphCLIParser::prepare(OvmsServerMode serverMode, HFSettingsImpl& hfSettin } else { graphSettings.maxNumSeqs = result->operator[]("max_num_seqs").as(); graphSettings.enablePrefixCaching = result->operator[]("enable_prefix_caching").as(); + if (graphSettings.enablePrefixCaching == "true" && hfSettings.exportSettings.targetDevice == "NPU") { + hfSettings.exportSettings.pluginConfig.useNpuPrefixCaching = true; + } graphSettings.cacheSize = result->operator[]("cache_size").as(); graphSettings.dynamicSplitFuse = result->operator[]("dynamic_split_fuse").as(); if (result->count("draft_source_model")) { @@ -153,11 +160,13 @@ void
GraphCLIParser::prepare(OvmsServerMode serverMode, HFSettingsImpl& hfSettin // Plugin configuration if (result->count("max_prompt_len")) { - graphSettings.pluginConfig.maxPromptLength = result->operator[]("max_prompt_len").as(); + hfSettings.exportSettings.pluginConfig.maxPromptLength = result->operator[]("max_prompt_len").as(); + } + if (result->count("model_distribution_policy")) { + hfSettings.exportSettings.pluginConfig.modelDistributionPolicy = result->operator[]("model_distribution_policy").as(); } - if (result->count("kv_cache_precision")) { - graphSettings.pluginConfig.kvCachePrecision = result->operator[]("kv_cache_precision").as(); + hfSettings.exportSettings.pluginConfig.kvCachePrecision = result->operator[]("kv_cache_precision").as(); } } diff --git a/src/graph_export/graph_export.cpp b/src/graph_export/graph_export.cpp index 747ac4384a..b2fd7e9167 100644 --- a/src/graph_export/graph_export.cpp +++ b/src/graph_export/graph_export.cpp @@ -82,12 +82,44 @@ std::string GraphExport::getDraftModelDirectoryPath(const std::string& directory std::string fullPath = FileSystem::joinPath({directoryPath, GraphExport::getDraftModelDirectoryName(draftModel)}); return fullPath; } +#define GET_PLUGIN_CONFIG_OPT_OR_FAIL_AND_RETURN(EXPORT_SETTINGS) \ + auto pluginConfigOrStatus = GraphExport::createPluginString(EXPORT_SETTINGS); \ + if (std::holds_alternative(pluginConfigOrStatus)) { \ + auto status = std::get(pluginConfigOrStatus); \ + SPDLOG_ERROR("Failed to create plugin config: {}", status.string()); \ + return status; \ + } \ + auto pluginConfigOpt = std::get>(pluginConfigOrStatus) + +static Status createPbtxtFile(const std::string& directoryPath, const std::string& pbtxtContent) { +#if (MEDIAPIPE_DISABLE == 0) + ::mediapipe::CalculatorGraphConfig config; + SPDLOG_TRACE("Generated pbtxt: {}", pbtxtContent); + bool success = ::google::protobuf::TextFormat::ParseFromString(pbtxtContent, &config); + if (!success) { + SPDLOG_ERROR("Created graph config file couldn't be 
parsed - check used task parameters values."); + return StatusCode::MEDIAPIPE_GRAPH_CONFIG_FILE_INVALID; + } +#endif + // clang-format on + std::string fullPath = FileSystem::joinPath({directoryPath, "graph.pbtxt"}); + return FileSystem::createFileOverwrite(fullPath, pbtxtContent); +} + +static Status createTextGenerationGraphTemplate(const std::string& directoryPath, const HFSettingsImpl& hfSettings) { + if (!std::holds_alternative(hfSettings.graphSettings)) { + SPDLOG_ERROR("Graph options not initialized for text generation."); + return StatusCode::INTERNAL_ERROR; + } + auto& graphSettings = std::get(hfSettings.graphSettings); + auto& ggufFilename = hfSettings.ggufFilename; + auto& exportSettings = hfSettings.exportSettings; -static Status createTextGenerationGraphTemplate(const std::string& directoryPath, const TextGenGraphSettingsImpl& graphSettings, const std::optional ggufFilename) { std::ostringstream oss; oss << OVMS_VERSION_GRAPH_LINE; - std::string modelsPath = constructModelsPath(graphSettings.modelPath, ggufFilename); + std::string modelsPath = constructModelsPath(exportSettings.modelPath, ggufFilename); SPDLOG_TRACE("modelsPath: {}, directoryPath: {}, ggufFilename: {}", modelsPath, directoryPath, ggufFilename.value_or("std::nullopt")); + GET_PLUGIN_CONFIG_OPT_OR_FAIL_AND_RETURN(exportSettings); // clang-format off oss << R"( input_stream: "HTTP_REQUEST_PAYLOAD:input" @@ -109,12 +141,16 @@ static Status createTextGenerationGraphTemplate(const std::string& directoryPath max_num_seqs:)" << graphSettings.maxNumSeqs << R"(, device: ")" - << graphSettings.targetDevice << R"(", + << exportSettings.targetDevice << R"(", models_path: ")" << modelsPath << R"(", - plugin_config: ')" - << GraphExport::createPluginString(graphSettings.pluginConfig) << R"(', - enable_prefix_caching: )" + )"; + if (pluginConfigOpt.has_value()) { + oss << R"(plugin_config: ')" + << pluginConfigOpt.value() << R"(', + )"; + } + oss << R"(enable_prefix_caching: )" << 
graphSettings.enablePrefixCaching << R"(, cache_size: )" << graphSettings.cacheSize << R"(,)"; @@ -162,35 +198,31 @@ static Status createTextGenerationGraphTemplate(const std::string& directoryPath } } })"; -#if (MEDIAPIPE_DISABLE == 0) - ::mediapipe::CalculatorGraphConfig config; - bool success = ::google::protobuf::TextFormat::ParseFromString(oss.str(), &config); - SPDLOG_TRACE("Generated pbtxt: {}", oss.str()); - if (!success) { - SPDLOG_ERROR("Created graph config file couldn't be parsed - check used task parameters values."); - return StatusCode::MEDIAPIPE_GRAPH_CONFIG_FILE_INVALID; - } -#endif - // clang-format on - std::string fullPath = FileSystem::joinPath({directoryPath, "graph.pbtxt"}); - return FileSystem::createFileOverwrite(fullPath, oss.str()); + return createPbtxtFile(directoryPath, oss.str()); } -static Status createRerankGraphTemplate(const std::string& directoryPath, const RerankGraphSettingsImpl& graphSettings) { +static Status createRerankGraphTemplate(const std::string& directoryPath, const HFSettingsImpl& hfSettings) { + if (!std::holds_alternative(hfSettings.graphSettings)) { + SPDLOG_ERROR("Graph options not initialized for reranking."); + return StatusCode::INTERNAL_ERROR; + } + auto& graphSettings = std::get(hfSettings.graphSettings); + auto& ggufFilename = hfSettings.ggufFilename; + auto& exportSettings = hfSettings.exportSettings; + std::ostringstream oss; oss << OVMS_VERSION_GRAPH_LINE; // Windows path creation - graph parser needs forward slashes in paths - std::string graphOkPath = graphSettings.modelPath; - if (FileSystem::getOsSeparator() != "/") { - std::replace(graphOkPath.begin(), graphOkPath.end(), '\\', '/'); - } + std::string modelsPath = constructModelsPath(exportSettings.modelPath, ggufFilename); + SPDLOG_TRACE("modelsPath: {}, directoryPath: {}, ggufFilename: {}", modelsPath, directoryPath, ggufFilename.value_or("std::nullopt")); + GET_PLUGIN_CONFIG_OPT_OR_FAIL_AND_RETURN(exportSettings); // clang-format off oss << R"( 
input_stream: "REQUEST_PAYLOAD:input" output_stream: "RESPONSE_PAYLOAD:output" node { name: ")" - << graphSettings.modelName << R"(", + << exportSettings.modelName << R"(", calculator: "RerankCalculatorOV" input_side_packet: "RERANK_NODE_RESOURCES:rerank_servable" input_stream: "REQUEST_PAYLOAD:input" @@ -198,44 +230,42 @@ node { node_options: { [type.googleapis.com / mediapipe.RerankCalculatorOVOptions]: { models_path: ")" - << graphOkPath << R"(", + << modelsPath << R"(", max_allowed_chunks: )" << graphSettings.maxAllowedChunks << R"(, - target_device: ")" << graphSettings.targetDevice << R"(", - plugin_config: '{ "NUM_STREAMS": ")" << graphSettings.numStreams << R"("}', + target_device: ")" << exportSettings.targetDevice << R"(", + )"; + if (pluginConfigOpt.has_value()) { + oss << R"(plugin_config: ')" << pluginConfigOpt.value() << R"(',)"; + } + oss << R"( } } })"; + return createPbtxtFile(directoryPath, oss.str()); +} -#if (MEDIAPIPE_DISABLE == 0) - ::mediapipe::CalculatorGraphConfig config; - bool success = ::google::protobuf::TextFormat::ParseFromString(oss.str(), &config); - if (!success) { - SPDLOG_ERROR("Created rerank graph config couldn't be parsed."); - return StatusCode::MEDIAPIPE_GRAPH_CONFIG_FILE_INVALID; +static Status createEmbeddingsGraphTemplate(const std::string& directoryPath, const HFSettingsImpl& hfSettings) { + if (!std::holds_alternative(hfSettings.graphSettings)) { + SPDLOG_ERROR("Graph options not initialized for embeddings."); + return StatusCode::INTERNAL_ERROR; } -#endif - // clang-format on - std::string fullPath = FileSystem::joinPath({directoryPath, "graph.pbtxt"}); - return FileSystem::createFileOverwrite(fullPath, oss.str()); -} + auto& graphSettings = std::get(hfSettings.graphSettings); + auto& ggufFilename = hfSettings.ggufFilename; + auto& exportSettings = hfSettings.exportSettings; -static Status createEmbeddingsGraphTemplate(const std::string& directoryPath, const EmbeddingsGraphSettingsImpl& graphSettings) { 
std::ostringstream oss; oss << OVMS_VERSION_GRAPH_LINE; - // Windows path creation - graph parser needs forward slashes in paths - std::string graphOkPath = graphSettings.modelPath; - if (FileSystem::getOsSeparator() != "/") { - std::replace(graphOkPath.begin(), graphOkPath.end(), '\\', '/'); - } - + std::string modelsPath = constructModelsPath(exportSettings.modelPath, ggufFilename); + SPDLOG_TRACE("modelsPath: {}, directoryPath: {}, ggufFilename: {}", modelsPath, directoryPath, ggufFilename.value_or("std::nullopt")); + GET_PLUGIN_CONFIG_OPT_OR_FAIL_AND_RETURN(exportSettings); // clang-format off oss << R"( input_stream: "REQUEST_PAYLOAD:input" output_stream: "RESPONSE_PAYLOAD:output" node { name: ")" - << graphSettings.modelName << R"(", + << exportSettings.modelName << R"(", calculator: "EmbeddingsCalculatorOV" input_side_packet: "EMBEDDINGS_NODE_RESOURCES:embeddings_servable" input_stream: "REQUEST_PAYLOAD:input" @@ -243,33 +273,37 @@ node { node_options: { [type.googleapis.com / mediapipe.EmbeddingsCalculatorOVOptions]: { models_path: ")" - << graphOkPath << R"(", + << modelsPath << R"(", normalize_embeddings: )" << graphSettings.normalize << R"(, truncate: )" << graphSettings.truncate << R"(, pooling: )" << graphSettings.pooling << R"(, - target_device: ")" << graphSettings.targetDevice << R"(", - plugin_config: '{ "NUM_STREAMS": ")" << graphSettings.numStreams << R"("}', - } + target_device: ")" << exportSettings.targetDevice << R"(", + )"; + if (pluginConfigOpt.has_value()) { + oss << R"(plugin_config: ')" << pluginConfigOpt.value() << R"(', + )"; + } + oss << R"(} } })"; + return createPbtxtFile(directoryPath, oss.str()); +} -#if (MEDIAPIPE_DISABLE == 0) - ::mediapipe::CalculatorGraphConfig config; - bool success = ::google::protobuf::TextFormat::ParseFromString(oss.str(), &config); - if (!success) { - SPDLOG_ERROR("Created embeddings graph config couldn't be parsed."); - return StatusCode::MEDIAPIPE_GRAPH_CONFIG_FILE_INVALID; +static Status 
createImageGenerationGraphTemplate(const std::string& directoryPath, const HFSettingsImpl& hfSettings) { + if (!std::holds_alternative(hfSettings.graphSettings)) { + SPDLOG_ERROR("Graph options not initialized for image generation."); + return StatusCode::INTERNAL_ERROR; } -#endif - // clang-format on - std::string fullPath = FileSystem::joinPath({directoryPath, "graph.pbtxt"}); - return FileSystem::createFileOverwrite(fullPath, oss.str()); -} + auto& graphSettings = std::get(hfSettings.graphSettings); + auto& exportSettings = hfSettings.exportSettings; + auto& ggufFilename = hfSettings.ggufFilename; + std::string modelsPath = constructModelsPath(exportSettings.modelPath, ggufFilename); + SPDLOG_TRACE("modelsPath: {}, directoryPath: {}, ggufFilename: {}", modelsPath, directoryPath, ggufFilename.value_or("std::nullopt")); + GET_PLUGIN_CONFIG_OPT_OR_FAIL_AND_RETURN(exportSettings); -static Status createImageGenerationGraphTemplate(const std::string& directoryPath, const ImageGenerationGraphSettingsImpl& graphSettings) { std::ostringstream oss; oss << OVMS_VERSION_GRAPH_LINE; // clang-format off @@ -285,12 +319,11 @@ node: { output_stream: "HTTP_RESPONSE_PAYLOAD:output" node_options: { [type.googleapis.com / mediapipe.ImageGenCalculatorOptions]: { - models_path: ")" << graphSettings.modelPath << R"(" - device: ")" << graphSettings.targetDevice << R"(")"; - - if (graphSettings.pluginConfig.size()) { + models_path: ")" << modelsPath << R"(" + device: ")" << exportSettings.targetDevice << R"(")"; + if (pluginConfigOpt.has_value()) { oss << R"( - plugin_config: ')" << graphSettings.pluginConfig << R"(')"; + plugin_config: ')" << pluginConfigOpt.value() << R"(')"; } if (graphSettings.resolution.size()) { @@ -338,10 +371,8 @@ node: { } } )"; - // clang-format on - std::string fullPath = FileSystem::joinPath({directoryPath, "graph.pbtxt"}); - return FileSystem::createFileOverwrite(fullPath, oss.str()); + return createPbtxtFile(directoryPath, oss.str()); } 
GraphExport::GraphExport() { @@ -360,77 +391,111 @@ Status GraphExport::createServableConfig(const std::string& directoryPath, const if (!hfSettings.ggufFilename.has_value()) { bool is_dir = false; status = LocalFileSystem::isDir(directoryPath, &is_dir); - if (!status.ok()) + if (!status.ok()) { + SPDLOG_ERROR("Failed to check if graph path is directory: {}: {}", directoryPath, status.string()); return status; - + } if (!is_dir) { SPDLOG_ERROR("Graph path is not a directory: {}", directoryPath); return StatusCode::PATH_INVALID; } } if (hfSettings.task == TEXT_GENERATION_GRAPH) { - if (std::holds_alternative(hfSettings.graphSettings)) { - return createTextGenerationGraphTemplate(directoryPath, std::get(hfSettings.graphSettings), hfSettings.ggufFilename); - } else { - SPDLOG_ERROR("Graph options not initialized for text generation."); - return StatusCode::INTERNAL_ERROR; - } + return createTextGenerationGraphTemplate(directoryPath, hfSettings); } else if (hfSettings.task == EMBEDDINGS_GRAPH) { - if (std::holds_alternative(hfSettings.graphSettings)) { - return createEmbeddingsGraphTemplate(directoryPath, std::get(hfSettings.graphSettings)); - } else { - SPDLOG_ERROR("Graph options not initialized for embeddings."); - return StatusCode::INTERNAL_ERROR; - } + return createEmbeddingsGraphTemplate(directoryPath, hfSettings); } else if (hfSettings.task == RERANK_GRAPH) { - if (std::holds_alternative(hfSettings.graphSettings)) { - return createRerankGraphTemplate(directoryPath, std::get(hfSettings.graphSettings)); - } else { - SPDLOG_ERROR("Graph options not initialized for rerank."); - return StatusCode::INTERNAL_ERROR; - } + return createRerankGraphTemplate(directoryPath, hfSettings); } else if (hfSettings.task == IMAGE_GENERATION_GRAPH) { - if (std::holds_alternative(hfSettings.graphSettings)) { - return createImageGenerationGraphTemplate(directoryPath, std::get(hfSettings.graphSettings)); - } else { - SPDLOG_ERROR("Graph options not initialized for image generation."); 
- return StatusCode::INTERNAL_ERROR; - } + return createImageGenerationGraphTemplate(directoryPath, hfSettings); } else if (hfSettings.task == UNKNOWN_GRAPH) { SPDLOG_ERROR("Graph options not initialized."); return StatusCode::INTERNAL_ERROR; } + SPDLOG_ERROR("Graph options not initialized."); return StatusCode::INTERNAL_ERROR; } -std::string GraphExport::createPluginString(const PluginConfigSettingsImpl& pluginConfig) { +std::variant, Status> GraphExport::createPluginString(const ExportSettings& exportSettings) { + bool configNotEmpty = false; + auto& stringPluginConfig = exportSettings.pluginConfig.manualString; + auto& pluginConfig = exportSettings.pluginConfig; + SPDLOG_TRACE("Creating plugin config string from export settings. Manual string: {}, pluginConfig.numStreams: {}, pluginConfig.kvCachePrecision: {}, pluginConfig.maxPromptLength: {}, pluginConfig.modelDistributionPolicy: {}, pluginConfig.cacheDir: {}", pluginConfig.manualString.value_or("std::nullopt"), pluginConfig.numStreams.value_or(0), pluginConfig.kvCachePrecision.value_or("std::nullopt"), pluginConfig.maxPromptLength.value_or(0), pluginConfig.modelDistributionPolicy.value_or("std::nullopt"), exportSettings.pluginConfig.cacheDir.value_or("std::nullopt")); rapidjson::Document d; d.SetObject(); - bool configNotEmpty = false; - + if (stringPluginConfig.has_value() && !stringPluginConfig.value().empty()) { + configNotEmpty = true; + if (d.Parse(stringPluginConfig.value().c_str()).HasParseError()) { + return StatusCode::PLUGIN_CONFIG_WRONG_FORMAT; + } + } if (pluginConfig.kvCachePrecision.has_value()) { rapidjson::Value name; name.SetString(pluginConfig.kvCachePrecision.value().c_str(), d.GetAllocator()); + auto itr = d.FindMember("KV_CACHE_PRECISION"); + if (itr != d.MemberEnd()) { + return Status(StatusCode::PLUGIN_CONFIG_CONFLICTING_PARAMETERS, "Doubled KV_CACHE_PRECISION parameter in plugin config."); + } d.AddMember("KV_CACHE_PRECISION", name, d.GetAllocator()); configNotEmpty = true; } - if 
(pluginConfig.maxPromptLength.has_value()) { - rapidjson::Value name; - name.SetString(std::to_string(pluginConfig.maxPromptLength.value()).c_str(), d.GetAllocator()); - d.AddMember("MAX_PROMPT_LEN", name, d.GetAllocator()); + rapidjson::Value value; + value.SetUint(pluginConfig.maxPromptLength.value()); + auto itr = d.FindMember("MAX_PROMPT_LEN"); + if (itr != d.MemberEnd()) { + return Status(StatusCode::PLUGIN_CONFIG_CONFLICTING_PARAMETERS, "Doubled MAX_PROMPT_LEN parameter in plugin config."); + } + d.AddMember("MAX_PROMPT_LEN", value, d.GetAllocator()); configNotEmpty = true; } - if (pluginConfig.modelDistributionPolicy.has_value()) { - rapidjson::Value name; - name.SetString(pluginConfig.modelDistributionPolicy.value().c_str(), d.GetAllocator()); - d.AddMember("MODEL_DISTRIBUTION_POLICY", name, d.GetAllocator()); + rapidjson::Value value; + value.SetString(pluginConfig.modelDistributionPolicy.value().c_str(), d.GetAllocator()); + auto itr = d.FindMember("MODEL_DISTRIBUTION_POLICY"); + if (itr != d.MemberEnd()) { + return Status(StatusCode::PLUGIN_CONFIG_CONFLICTING_PARAMETERS, "Doubled MODEL_DISTRIBUTION_POLICY parameter in plugin config."); + } + d.AddMember("MODEL_DISTRIBUTION_POLICY", value, d.GetAllocator()); + configNotEmpty = true; + } + if (pluginConfig.numStreams.has_value()) { + rapidjson::Value value; + value.SetUint(pluginConfig.numStreams.value()); + auto itr = d.FindMember("NUM_STREAMS"); + if (itr != d.MemberEnd()) { + if (pluginConfig.numStreams.value() == 1) { + // ignoring double setting NUM_STREAMS is required for embeddings & rerank + // since 1 is default value coming from CLI + SPDLOG_DEBUG("Doubled NUM_STREAMS parameter in plugin config. 
Will ignore `--num_streams` CLI parameter."); + } else { + return Status(StatusCode::PLUGIN_CONFIG_CONFLICTING_PARAMETERS, "Doubled NUM_STREAMS parameter in plugin config."); + } + } else { + d.AddMember("NUM_STREAMS", value, d.GetAllocator()); + configNotEmpty = true; + } + } + if (exportSettings.pluginConfig.cacheDir.has_value()) { + rapidjson::Value value; + value.SetString(exportSettings.pluginConfig.cacheDir.value().c_str(), d.GetAllocator()); + auto itr = d.FindMember("CACHE_DIR"); + if (itr != d.MemberEnd()) { + return Status(StatusCode::PLUGIN_CONFIG_CONFLICTING_PARAMETERS, "Doubled CACHE_DIR parameter in plugin config."); + } + d.AddMember("CACHE_DIR", value, d.GetAllocator()); + configNotEmpty = true; + } + if (pluginConfig.useNpuPrefixCaching.has_value()) { + rapidjson::Value value; + value.SetBool(pluginConfig.useNpuPrefixCaching.value()); + auto itr = d.FindMember("NPUW_LLM_ENABLE_PREFIX_CACHING"); + if (itr != d.MemberEnd()) { + return Status(StatusCode::PLUGIN_CONFIG_CONFLICTING_PARAMETERS, "Doubled NPUW_LLM_ENABLE_PREFIX_CACHING parameter in plugin config."); + } + d.AddMember("NPUW_LLM_ENABLE_PREFIX_CACHING", value, d.GetAllocator()); configNotEmpty = true; } - - std::string pluginString = "{ }"; - if (configNotEmpty) { // Serialize the document to a JSON string rapidjson::StringBuffer buffer; @@ -438,10 +503,10 @@ std::string GraphExport::createPluginString(const PluginConfigSettingsImpl& plug d.Accept(writer); // Output the JSON string - pluginString = buffer.GetString(); + return buffer.GetString(); + } else { + return std::nullopt; } - - return pluginString; } } // namespace ovms diff --git a/src/graph_export/graph_export.hpp b/src/graph_export/graph_export.hpp index 8b8ee46129..e6f9fdcbef 100644 --- a/src/graph_export/graph_export.hpp +++ b/src/graph_export/graph_export.hpp @@ -14,18 +14,21 @@ // limitations under the License. 
//***************************************************************************** #pragma once +#include #include +#include namespace ovms { struct PluginConfigSettingsImpl; struct HFSettingsImpl; +struct ExportSettings; class Status; class GraphExport { public: GraphExport(); Status createServableConfig(const std::string& directoryPath, const HFSettingsImpl& graphSettings); - static std::string createPluginString(const PluginConfigSettingsImpl& pluginConfig); + static std::variant, Status> createPluginString(const ExportSettings& exportSettings); static std::string getDraftModelDirectoryName(std::string draftModel); static std::string getDraftModelDirectoryPath(const std::string& directoryPath, const std::string& draftModel); }; diff --git a/src/graph_export/image_generation_graph_cli_parser.cpp b/src/graph_export/image_generation_graph_cli_parser.cpp index 162e9ac310..ed0d1b91ef 100644 --- a/src/graph_export/image_generation_graph_cli_parser.cpp +++ b/src/graph_export/image_generation_graph_cli_parser.cpp @@ -24,14 +24,6 @@ #include #include -#pragma warning(push) -#pragma warning(disable : 6313) -#include -#include -#include -#include -#pragma warning(pop) - #include "../capi_frontend/server_settings.hpp" #include "../ovms_exit_codes.hpp" #include "../status.hpp" @@ -115,12 +107,11 @@ std::vector ImageGenerationGraphCLIParser::parse(const std::vector< void ImageGenerationGraphCLIParser::prepare(ServerSettingsImpl& serverSettings, HFSettingsImpl& hfSettings, const std::string& modelName) { ImageGenerationGraphSettingsImpl imageGenerationGraphSettings = ImageGenerationGraphCLIParser::defaultGraphSettings(); - imageGenerationGraphSettings.targetDevice = hfSettings.exportSettings.targetDevice; // Deduct model name if (modelName != "") { - imageGenerationGraphSettings.modelName = modelName; + hfSettings.exportSettings.modelName = modelName; } else { - imageGenerationGraphSettings.modelName = hfSettings.sourceModel; + hfSettings.exportSettings.modelName = 
hfSettings.sourceModel; } if (nullptr == result) { // Pull with default arguments - no arguments from user @@ -159,25 +150,17 @@ void ImageGenerationGraphCLIParser::prepare(ServerSettingsImpl& serverSettings, } if (result->count("num_streams") || serverSettings.cacheDir != "") { - rapidjson::Document pluginConfigDoc; - pluginConfigDoc.SetObject(); - rapidjson::Document::AllocatorType& allocator = pluginConfigDoc.GetAllocator(); if (result->count("num_streams")) { uint32_t numStreams = result->operator[]("num_streams").as(); if (numStreams == 0) { throw std::invalid_argument("num_streams must be greater than 0"); } - pluginConfigDoc.AddMember("NUM_STREAMS", numStreams, allocator); + hfSettings.exportSettings.pluginConfig.numStreams = result->operator[]("num_streams").as(); } if (!serverSettings.cacheDir.empty()) { - pluginConfigDoc.AddMember("CACHE_DIR", rapidjson::Value(serverSettings.cacheDir.c_str(), allocator), allocator); + hfSettings.exportSettings.pluginConfig.cacheDir = serverSettings.cacheDir; } - - rapidjson::StringBuffer buffer; - rapidjson::Writer writer(buffer); - pluginConfigDoc.Accept(writer); - imageGenerationGraphSettings.pluginConfig = buffer.GetString(); } } diff --git a/src/graph_export/rerank_graph_cli_parser.cpp b/src/graph_export/rerank_graph_cli_parser.cpp index 1038687107..80f1561a4a 100644 --- a/src/graph_export/rerank_graph_cli_parser.cpp +++ b/src/graph_export/rerank_graph_cli_parser.cpp @@ -73,12 +73,12 @@ std::vector RerankGraphCLIParser::parse(const std::vectoroperator[]("num_streams").as(); + hfSettings.exportSettings.pluginConfig.numStreams = result->operator[]("num_streams").as(); rerankGraphSettings.maxAllowedChunks = result->operator[]("max_allowed_chunks").as(); } diff --git a/src/mediapipe_internal/mediapipegraphdefinition.cpp b/src/mediapipe_internal/mediapipegraphdefinition.cpp index c40a8d1087..ca708451db 100644 --- a/src/mediapipe_internal/mediapipegraphdefinition.cpp +++ b/src/mediapipe_internal/mediapipegraphdefinition.cpp 
@@ -100,7 +100,7 @@ Status MediapipeGraphDefinition::validateForConfigLoadableness() { SPDLOG_LOGGER_ERROR(modelmanager_logger, "Trying to parse empty mediapipe graph definition: {} failed", this->getName(), this->chosenConfig); return StatusCode::MEDIAPIPE_GRAPH_CONFIG_FILE_INVALID; } - + SPDLOG_TRACE("Will try to load pbtxt config: {}", this->chosenConfig); bool success = ::google::protobuf::TextFormat::ParseFromString(chosenConfig, &this->config); if (!success) { SPDLOG_LOGGER_ERROR(modelmanager_logger, "Trying to parse mediapipe graph definition: {} failed", this->getName(), this->chosenConfig); diff --git a/src/status.cpp b/src/status.cpp index 38640f52fc..3e3b9425b9 100644 --- a/src/status.cpp +++ b/src/status.cpp @@ -36,6 +36,7 @@ const std::unordered_map Status::statusMessageMap = { {StatusCode::LAYOUT_WRONG_FORMAT, "The provided layout is in wrong format"}, {StatusCode::DIM_WRONG_FORMAT, "The provided dimension is in wrong format"}, {StatusCode::PLUGIN_CONFIG_WRONG_FORMAT, "Plugin config is in wrong format"}, + {StatusCode::PLUGIN_CONFIG_CONFLICTING_PARAMETERS, "Tried to set the same key twice in plugin config"}, {StatusCode::MODEL_VERSION_POLICY_WRONG_FORMAT, "Model version policy is in wrong format"}, {StatusCode::MODEL_VERSION_POLICY_UNSUPPORTED_KEY, "Model version policy contains unsupported key"}, {StatusCode::GRPC_CHANNEL_ARG_WRONG_FORMAT, "Grpc channel arguments passed in wrong format"}, diff --git a/src/status.hpp b/src/status.hpp index d604e792d2..fee6300d99 100644 --- a/src/status.hpp +++ b/src/status.hpp @@ -40,6 +40,7 @@ enum class StatusCode { LAYOUT_WRONG_FORMAT, /*!< The provided layout param is in wrong format */ DIM_WRONG_FORMAT, /*!< The provided dimension param is in wrong format */ PLUGIN_CONFIG_WRONG_FORMAT, /*!< Plugin config is in wrong format */ + PLUGIN_CONFIG_CONFLICTING_PARAMETERS, /*!< Tried to set the same key twice in plugin config */ MODEL_VERSION_POLICY_WRONG_FORMAT, /*!< Model version policy is in wrong format */ 
MODEL_VERSION_POLICY_UNSUPPORTED_KEY, /*!< Model version policy contains invalid key */ GRPC_CHANNEL_ARG_WRONG_FORMAT, diff --git a/src/test/graph_export_test.cpp b/src/test/graph_export_test.cpp index 3a103f135c..8499a76d77 100644 --- a/src/test/graph_export_test.cpp +++ b/src/test/graph_export_test.cpp @@ -85,7 +85,7 @@ const std::string expectedFullPluginGraphContents = R"( max_num_seqs:256, device: "CPU", models_path: "./", - plugin_config: '{"KV_CACHE_PRECISION":"u8","MAX_PROMPT_LEN":"123","MODEL_DISTRIBUTION_POLICY":"PIPELINE_PARALLEL"}', + plugin_config: '{"KV_CACHE_PRECISION":"u8","MAX_PROMPT_LEN":123,"MODEL_DISTRIBUTION_POLICY":"PIPELINE_PARALLEL"}', enable_prefix_caching: true, cache_size: 10, } @@ -123,7 +123,6 @@ const std::string expectedGraphContentsWithResponseParser = R"( max_num_seqs:256, device: "CPU", models_path: "./", - plugin_config: '{ }', enable_prefix_caching: true, cache_size: 10, reasoning_parser: "REASONING_PARSER", @@ -164,7 +163,6 @@ const std::string expectedDefaultGraphContents = R"( max_num_seqs:256, device: "CPU", models_path: "./", - plugin_config: '{ }', enable_prefix_caching: true, cache_size: 10, } @@ -202,7 +200,6 @@ const std::string expectedDraftAndFuseGraphContents = R"( max_num_seqs:256, device: "CPU", models_path: "./", - plugin_config: '{ }', enable_prefix_caching: true, cache_size: 10, dynamic_split_fuse: false, @@ -243,7 +240,6 @@ const std::string expectedGGUFGraphContents = R"( max_num_seqs:256, device: "CPU", models_path: "./PRETTY_GOOD_GGUF_MODEL.gguf", - plugin_config: '{ }', enable_prefix_caching: true, cache_size: 10, } @@ -281,7 +277,6 @@ const std::string expectedGGUFGraphContents2 = R"( max_num_seqs:256, device: "CPU", models_path: "./PRETTY_GOOD_GGUF_MODEL_Q8-00001-of-20000.gguf", - plugin_config: '{ }', enable_prefix_caching: true, cache_size: 10, } @@ -313,7 +308,7 @@ node { models_path: "/some/path", max_allowed_chunks: 18, target_device: "GPU", - plugin_config: '{ "NUM_STREAMS": "2"}', + plugin_config: 
'{"NUM_STREAMS":2}', } } } @@ -333,7 +328,7 @@ node { models_path: "./", max_allowed_chunks: 10000, target_device: "CPU", - plugin_config: '{ "NUM_STREAMS": "1"}', + plugin_config: '{"NUM_STREAMS":1}', } } } @@ -355,7 +350,7 @@ node { truncate: true, pooling: LAST, target_device: "GPU", - plugin_config: '{ "NUM_STREAMS": "2"}', + plugin_config: '{"NUM_STREAMS":2}', } } } @@ -377,7 +372,7 @@ node { truncate: false, pooling: CLS, target_device: "CPU", - plugin_config: '{ "NUM_STREAMS": "1"}', + plugin_config: '{"NUM_STREAMS":1}', } } } @@ -460,8 +455,9 @@ TEST_F(GraphCreationTest, positiveDefaultWithVersionString) { ASSERT_EQ(expected, graphContents) << graphContents; } -TEST_F(GraphCreationTest, positiveReranktWithVersionString) { +TEST_F(GraphCreationTest, positiveRerankWithVersionString) { ovms::HFSettingsImpl hfSettings; + hfSettings.exportSettings.pluginConfig.numStreams = 1; hfSettings.task = ovms::RERANK_GRAPH; ovms::RerankGraphSettingsImpl rerankGraphSettings; hfSettings.graphSettings = std::move(rerankGraphSettings); @@ -477,6 +473,7 @@ TEST_F(GraphCreationTest, positiveReranktWithVersionString) { TEST_F(GraphCreationTest, positiveEmbeddingsWithVersionString) { ovms::HFSettingsImpl hfSettings; + hfSettings.exportSettings.pluginConfig.numStreams = 1; hfSettings.task = ovms::EMBEDDINGS_GRAPH; ovms::EmbeddingsGraphSettingsImpl embeddingsGraphSettings; hfSettings.graphSettings = std::move(embeddingsGraphSettings); @@ -566,12 +563,13 @@ TEST_F(GraphCreationTest, WillOverwriteExistingGraphPbtxtGGUF) { TEST_F(GraphCreationTest, rerankPositiveNonDefault) { ovms::HFSettingsImpl hfSettings; + auto& exportSettings = hfSettings.exportSettings; hfSettings.task = ovms::RERANK_GRAPH; ovms::RerankGraphSettingsImpl rerankGraphSettings; - rerankGraphSettings.targetDevice = "GPU"; - rerankGraphSettings.modelName = "myModel"; - rerankGraphSettings.modelPath = "/some/path"; - rerankGraphSettings.numStreams = 2; + exportSettings.targetDevice = "GPU"; + exportSettings.modelName = 
"myModel"; + exportSettings.modelPath = "/some/path"; + exportSettings.pluginConfig.numStreams = 2; rerankGraphSettings.maxAllowedChunks = 18; hfSettings.graphSettings = std::move(rerankGraphSettings); @@ -586,6 +584,7 @@ TEST_F(GraphCreationTest, rerankPositiveNonDefault) { TEST_F(GraphCreationTest, rerankPositiveDefault) { ovms::HFSettingsImpl hfSettings; + hfSettings.exportSettings.pluginConfig.numStreams = 1; hfSettings.task = ovms::RERANK_GRAPH; ovms::RerankGraphSettingsImpl rerankGraphSettings; hfSettings.graphSettings = std::move(rerankGraphSettings); @@ -601,11 +600,12 @@ TEST_F(GraphCreationTest, rerankPositiveDefault) { TEST_F(GraphCreationTest, rerankCreatedPbtxtInvalid) { ovms::HFSettingsImpl hfSettings; + auto& exportSettings = hfSettings.exportSettings; hfSettings.task = ovms::RERANK_GRAPH; ovms::RerankGraphSettingsImpl rerankGraphSettings; - rerankGraphSettings.targetDevice = "GPU"; - rerankGraphSettings.modelName = "myModel\""; - rerankGraphSettings.numStreams = 2; + exportSettings.targetDevice = "GPU"; + exportSettings.modelName = "myModel\""; + exportSettings.pluginConfig.numStreams = 2; hfSettings.graphSettings = std::move(rerankGraphSettings); std::string graphPath = ovms::FileSystem::appendSlash(this->directoryPath) + "graph.pbtxt"; std::unique_ptr graphExporter = std::make_unique(); @@ -621,10 +621,10 @@ TEST_F(GraphCreationTest, embeddingsPositiveNonDefault) { ovms::HFSettingsImpl hfSettings; hfSettings.task = ovms::EMBEDDINGS_GRAPH; ovms::EmbeddingsGraphSettingsImpl embeddingsGraphSettings; - embeddingsGraphSettings.targetDevice = "GPU"; - embeddingsGraphSettings.modelName = "myModel"; - embeddingsGraphSettings.modelPath = "/model1/path"; - embeddingsGraphSettings.numStreams = 2; + hfSettings.exportSettings.targetDevice = "GPU"; + hfSettings.exportSettings.modelName = "myModel"; + hfSettings.exportSettings.modelPath = "/model1/path"; + hfSettings.exportSettings.pluginConfig.numStreams = 2; embeddingsGraphSettings.normalize = "false"; 
embeddingsGraphSettings.truncate = "true"; embeddingsGraphSettings.pooling = "LAST"; @@ -643,6 +643,7 @@ TEST_F(GraphCreationTest, embeddingsPositiveDefault) { hfSettings.task = ovms::EMBEDDINGS_GRAPH; ovms::EmbeddingsGraphSettingsImpl embeddingsGraphSettings; hfSettings.graphSettings = std::move(embeddingsGraphSettings); + hfSettings.exportSettings.pluginConfig.numStreams = 1; std::string graphPath = ovms::FileSystem::appendSlash(this->directoryPath) + "graph.pbtxt"; std::unique_ptr graphExporter = std::make_unique(); auto status = graphExporter->createServableConfig(this->directoryPath, hfSettings); @@ -656,9 +657,9 @@ TEST_F(GraphCreationTest, embeddingsCreatedPbtxtInvalid) { ovms::HFSettingsImpl hfSettings; hfSettings.task = ovms::EMBEDDINGS_GRAPH; ovms::EmbeddingsGraphSettingsImpl embeddingsGraphSettings; - embeddingsGraphSettings.targetDevice = "GPU"; - embeddingsGraphSettings.modelName = "myModel\""; - embeddingsGraphSettings.numStreams = 2; + hfSettings.exportSettings.targetDevice = "GPU"; + hfSettings.exportSettings.modelName = "myModel\""; + hfSettings.exportSettings.pluginConfig.numStreams = 2; embeddingsGraphSettings.normalize = "true"; embeddingsGraphSettings.pooling = "CLS"; hfSettings.graphSettings = std::move(embeddingsGraphSettings); @@ -670,13 +671,35 @@ TEST_F(GraphCreationTest, embeddingsCreatedPbtxtInvalid) { ASSERT_EQ(status, ovms::StatusCode::OK); #endif } +TEST_F(GraphCreationTest, embeddingsDoubleSetNumStreams) { + // by default for embeddings we set numStreams=1 in CLI + // we should ignore double setting & check if equals the one from `--plugin_config` + // if both `--num_streams` is used to change from 1 and `--plugin_config` is used with + // num streams we trigger error + ovms::HFSettingsImpl hfSettings; + hfSettings.task = ovms::EMBEDDINGS_GRAPH; + ovms::EmbeddingsGraphSettingsImpl embeddingsGraphSettings; + hfSettings.exportSettings.targetDevice = "GPU"; + hfSettings.exportSettings.modelName = "myModel"; + 
hfSettings.exportSettings.pluginConfig.numStreams = 1; // imitates default from CLI + hfSettings.exportSettings.pluginConfig.manualString = "{\"NUM_STREAMS\":1}"; + embeddingsGraphSettings.normalize = "true"; + embeddingsGraphSettings.pooling = "CLS"; + hfSettings.graphSettings = std::move(embeddingsGraphSettings); + std::unique_ptr graphExporter = std::make_unique(); + auto status = graphExporter->createServableConfig(this->directoryPath, hfSettings); + ASSERT_EQ(status, ovms::StatusCode::OK); + hfSettings.exportSettings.pluginConfig.numStreams = 2; // non-default value - it should fail + status = graphExporter->createServableConfig(this->directoryPath, hfSettings); + ASSERT_EQ(status, ovms::StatusCode::PLUGIN_CONFIG_CONFLICTING_PARAMETERS) << status.string(); +} TEST_F(GraphCreationTest, positivePluginConfigAll) { ovms::HFSettingsImpl hfSettings; ovms::TextGenGraphSettingsImpl graphSettings; - graphSettings.pluginConfig.kvCachePrecision = "u8"; - graphSettings.pluginConfig.maxPromptLength = 123; - graphSettings.pluginConfig.modelDistributionPolicy = "PIPELINE_PARALLEL"; + hfSettings.exportSettings.pluginConfig.kvCachePrecision = "u8"; + hfSettings.exportSettings.pluginConfig.maxPromptLength = 123; + hfSettings.exportSettings.pluginConfig.modelDistributionPolicy = "PIPELINE_PARALLEL"; hfSettings.graphSettings = std::move(graphSettings); @@ -710,7 +733,7 @@ TEST_F(GraphCreationTest, positiveWithParsersAndToolGuidedGeneration) { TEST_F(GraphCreationTest, positivePluginConfigOne) { ovms::HFSettingsImpl hfSettings; ovms::TextGenGraphSettingsImpl graphSettings; - graphSettings.pluginConfig.kvCachePrecision = "u8"; + hfSettings.exportSettings.pluginConfig.kvCachePrecision = "u8"; hfSettings.graphSettings = std::move(graphSettings); std::string graphPath = ovms::FileSystem::appendSlash(this->directoryPath) + "graph.pbtxt"; @@ -763,7 +786,7 @@ TEST_F(GraphCreationTest, negativeCreatedPbtxtInvalid) { ovms::HFSettingsImpl hfSettings; hfSettings.task = 
ovms::TEXT_GENERATION_GRAPH; ovms::TextGenGraphSettingsImpl graphSettings; - graphSettings.modelPath = "invalid\""; + hfSettings.exportSettings.modelPath = "invalid\""; hfSettings.graphSettings = std::move(graphSettings); std::string graphPath = ovms::FileSystem::appendSlash(this->directoryPath) + "graph.pbtxt"; std::string subconfigPath = ovms::FileSystem::appendSlash(this->directoryPath) + "subconfig.json"; @@ -775,6 +798,19 @@ TEST_F(GraphCreationTest, negativeCreatedPbtxtInvalid) { ASSERT_EQ(status, ovms::StatusCode::OK); #endif } +TEST_F(GraphCreationTest, positiveTextGeneration) { + ovms::HFSettingsImpl hfSettings; + hfSettings.task = ovms::TEXT_GENERATION_GRAPH; + ovms::TextGenGraphSettingsImpl graphSettings; + hfSettings.graphSettings = std::move(graphSettings); + hfSettings.exportSettings.targetDevice = "NPU"; + hfSettings.exportSettings.pluginConfig.useNpuPrefixCaching = true; + std::string graphPath = ovms::FileSystem::appendSlash(this->directoryPath) + "graph.pbtxt"; + std::string subconfigPath = ovms::FileSystem::appendSlash(this->directoryPath) + "subconfig.json"; + std::unique_ptr graphExporter = std::make_unique(); + auto status = graphExporter->createServableConfig(this->directoryPath, hfSettings); + ASSERT_EQ(status, ovms::StatusCode::OK); +} TEST_F(GraphCreationTest, imageGenerationPositiveDefault) { ovms::HFSettingsImpl hfSettings; @@ -794,8 +830,9 @@ TEST_F(GraphCreationTest, imageGenerationPositiveFull) { ovms::HFSettingsImpl hfSettings; hfSettings.task = ovms::IMAGE_GENERATION_GRAPH; ovms::ImageGenerationGraphSettingsImpl imageGenerationGraphSettings; - imageGenerationGraphSettings.pluginConfig = "{\"NUM_STREAMS\":14,\"CACHE_DIR\":\"/cache\"}"; - imageGenerationGraphSettings.targetDevice = "GPU"; + hfSettings.exportSettings.pluginConfig.numStreams = 14; + hfSettings.exportSettings.pluginConfig.cacheDir = "/cache"; + hfSettings.exportSettings.targetDevice = "GPU"; imageGenerationGraphSettings.defaultResolution = "300x400"; 
imageGenerationGraphSettings.maxResolution = "3000x4000"; imageGenerationGraphSettings.maxNumberImagesPerPrompt = 7; @@ -810,3 +847,47 @@ TEST_F(GraphCreationTest, imageGenerationPositiveFull) { std::string graphContents = GetFileContents(graphPath); ASSERT_EQ(expectedImageGenerationGraphContents, removeVersionString(graphContents)) << graphContents; } +TEST_F(GraphCreationTest, pluginConfigAsString) { + ovms::ExportSettings exportSettings; + + auto& pluginConfig = exportSettings.pluginConfig; + std::optional stringPluginConfig; + pluginConfig.kvCachePrecision = "u8"; + pluginConfig.maxPromptLength = 256; + pluginConfig.modelDistributionPolicy = "TENSOR_PARALLEL"; + pluginConfig.manualString = "{\"NUM_STREAMS\":4}"; + auto res = ovms::GraphExport::createPluginString(exportSettings); + ASSERT_TRUE(std::holds_alternative>(res)); + ASSERT_EQ(std::get>(res).value(), + "{\"NUM_STREAMS\":4,\"KV_CACHE_PRECISION\":\"u8\",\"MAX_PROMPT_LEN\":256,\"MODEL_DISTRIBUTION_POLICY\":\"TENSOR_PARALLEL\"}"); +} +TEST_F(GraphCreationTest, pluginConfigNegative) { + using ovms::Status; + ovms::PluginConfigSettingsImpl pluginConfig; + ovms::ExportSettings exportSettings; + std::optional stringPluginConfig; + pluginConfig.kvCachePrecision = "u8"; + pluginConfig.maxPromptLength = 256; + pluginConfig.modelDistributionPolicy = "TENSOR_PARALLEL"; + pluginConfig.cacheDir = "/cache"; + + exportSettings.pluginConfig = pluginConfig; + exportSettings.pluginConfig.manualString = "{\"KV_CACHE_PRECISION\":\"fp16\"}"; + auto res = ovms::GraphExport::createPluginString(exportSettings); + ASSERT_TRUE(std::holds_alternative(res)); + ASSERT_EQ(std::get(res), ovms::StatusCode::PLUGIN_CONFIG_CONFLICTING_PARAMETERS); + + exportSettings.pluginConfig.manualString = "{\"MAX_PROMPT_LEN\":512}"; + res = ovms::GraphExport::createPluginString(exportSettings); + ASSERT_TRUE(std::holds_alternative(res)); + ASSERT_EQ(std::get(res), ovms::StatusCode::PLUGIN_CONFIG_CONFLICTING_PARAMETERS); + + 
exportSettings.pluginConfig.manualString = "{\"CACHE_DIR\":\"/cache\"}"; + res = ovms::GraphExport::createPluginString(exportSettings); + ASSERT_TRUE(std::holds_alternative(res)); + ASSERT_EQ(std::get(res), ovms::StatusCode::PLUGIN_CONFIG_CONFLICTING_PARAMETERS); + exportSettings.pluginConfig.manualString = "{\"MODEL_DISTRIBUTION_POLICY\":\"PIPELINE_PARALLEL\"}"; + res = ovms::GraphExport::createPluginString(exportSettings); + ASSERT_TRUE(std::holds_alternative(res)); + ASSERT_EQ(std::get(res), ovms::StatusCode::PLUGIN_CONFIG_CONFLICTING_PARAMETERS); +} diff --git a/src/test/llm/llmnode_test.cpp b/src/test/llm/llmnode_test.cpp index d3d673d880..e96feff657 100644 --- a/src/test/llm/llmnode_test.cpp +++ b/src/test/llm/llmnode_test.cpp @@ -3573,7 +3573,8 @@ TEST_F(LLMConfigHttpTest, LLMNodeNonExistantModelsPath) { ovms::MediapipeGraphConfig mgc{"mediaDummy", "", ""}; DummyMediapipeGraphDefinition mediapipeDummy("mediaDummy", mgc, testPbtxt, nullptr); mediapipeDummy.inputConfig = testPbtxt; - ASSERT_EQ(mediapipeDummy.validate(manager), StatusCode::LLM_NODE_DIRECTORY_DOES_NOT_EXIST); + auto status = mediapipeDummy.validate(manager); + ASSERT_EQ(status, StatusCode::LLM_NODE_DIRECTORY_DOES_NOT_EXIST) << status.string(); } TEST_F(LLMConfigHttpTest, LLMNodeBadWorkspacePathEmpty) { @@ -3615,7 +3616,8 @@ TEST_F(LLMConfigHttpTest, LLMNodeBadWorkspacePathEmpty) { ovms::MediapipeGraphConfig mgc{"mediaDummy", "", ""}; DummyMediapipeGraphDefinition mediapipeDummy("mediaDummy", mgc, testPbtxt, nullptr); mediapipeDummy.inputConfig = testPbtxt; - ASSERT_EQ(mediapipeDummy.validate(manager), StatusCode::LLM_NODE_DIRECTORY_DOES_NOT_EXIST); + auto status = mediapipeDummy.validate(manager); + ASSERT_EQ(status, StatusCode::LLM_NODE_DIRECTORY_DOES_NOT_EXIST) << status.string(); } TEST_F(LLMConfigHttpTest, LLMNodeWorkspacePathToFileNotDir) { @@ -3657,7 +3659,8 @@ TEST_F(LLMConfigHttpTest, LLMNodeWorkspacePathToFileNotDir) { ovms::MediapipeGraphConfig mgc{"mediaDummy", "", ""}; 
DummyMediapipeGraphDefinition mediapipeDummy("mediaDummy", mgc, testPbtxt, nullptr); mediapipeDummy.inputConfig = testPbtxt; - ASSERT_EQ(mediapipeDummy.validate(manager), StatusCode::LLM_NODE_PATH_DOES_NOT_EXIST_AND_NOT_GGUFFILE); + auto status = mediapipeDummy.validate(manager); + ASSERT_EQ(status, StatusCode::LLM_NODE_PATH_DOES_NOT_EXIST_AND_NOT_GGUFFILE) << status.string(); } class LLMConfigHttpTestParameterized : public ::testing::Test, public ::testing::WithParamInterface> { @@ -3710,7 +3713,8 @@ TEST_P(LLMConfigHttpTestParameterized, LLMNodeResourceInitFailed) { ovms::MediapipeGraphConfig mgc{"mediaDummy", "", ""}; DummyMediapipeGraphDefinition mediapipeDummy("mediaDummy", mgc, testPbtxt, nullptr); mediapipeDummy.inputConfig = testPbtxt; - ASSERT_EQ(mediapipeDummy.validate(manager), expectedStatusCode); + auto status = mediapipeDummy.validate(manager); + ASSERT_EQ(status, expectedStatusCode); ASSERT_EQ(mediapipeDummy.getGenAiServable("llmNode"), nullptr); } diff --git a/src/test/ovmsconfig_test.cpp b/src/test/ovmsconfig_test.cpp index de98c1815c..192f64cf43 100644 --- a/src/test/ovmsconfig_test.cpp +++ b/src/test/ovmsconfig_test.cpp @@ -1005,7 +1005,7 @@ TEST_F(OvmsConfigDeathTest, simultaneousPullAndRemove) { EXPECT_EXIT(ovms::Config::instance().parse(arg_count, n_argv), ::testing::ExitedWithCode(OVMS_EX_USAGE), "--remove_from_config cannot be used with --pull or --task") << createCmd(arg_count, n_argv) << buffer.str(); } -TEST(OvmsGraphConfigTest, positiveAllChanged) { +TEST(OvmsGraphConfigTest, positiveAllChangedTextGeneration) { std::string modelName = "OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov"; std::string downloadPath = "test/repository"; char* n_argv[] = { @@ -1022,9 +1022,9 @@ TEST(OvmsGraphConfigTest, positiveAllChanged) { (char*)"--max_num_seqs", (char*)"128", (char*)"--target_device", - (char*)"GPU", + (char*)"NPU", (char*)"--enable_prefix_caching", - (char*)"false", + (char*)"true", (char*)"--cache_size", (char*)"20", 
(char*)"--max_num_batched_tokens", @@ -1038,23 +1038,31 @@ TEST(OvmsGraphConfigTest, positiveAllChanged) { (char*)"--tool_parser", (char*)"toolParserName", (char*)"--enable_tool_guided_generation", - (char*)"true"}; - - int arg_count = 30; + (char*)"true", + (char*)"--model_distribution_policy", + (char*)"TENSOR_PARALLEL", + (char*)"--max_prompt_len", + (char*)"2048", + (char*)"--kv_cache_precision", + (char*)"u8"}; + + int arg_count = 36; ConstructorEnabledConfig config; config.parse(arg_count, n_argv); auto& hfSettings = config.getServerSettings().hfSettings; + auto& exportSettings = hfSettings.exportSettings; ASSERT_EQ(hfSettings.sourceModel, modelName); ASSERT_EQ(hfSettings.downloadPath, downloadPath); ASSERT_EQ(config.getServerSettings().serverMode, ovms::HF_PULL_MODE); ovms::TextGenGraphSettingsImpl graphSettings = std::get(hfSettings.graphSettings); ASSERT_EQ(graphSettings.pipelineType.value(), "VLM"); - ASSERT_EQ(graphSettings.modelPath, "./"); + ASSERT_EQ(exportSettings.modelPath, "./"); ASSERT_EQ(graphSettings.maxNumSeqs, 128); - ASSERT_EQ(graphSettings.targetDevice, "GPU"); - ASSERT_EQ(graphSettings.pluginConfig.kvCachePrecision.has_value(), false); - ASSERT_EQ(graphSettings.enablePrefixCaching, "false"); + ASSERT_EQ(exportSettings.targetDevice, "NPU"); + ASSERT_EQ(exportSettings.pluginConfig.kvCachePrecision.has_value(), true); + ASSERT_EQ(exportSettings.pluginConfig.kvCachePrecision.value(), "u8"); + ASSERT_EQ(graphSettings.enablePrefixCaching, "true"); ASSERT_EQ(graphSettings.cacheSize, 20); ASSERT_EQ(graphSettings.maxNumBatchedTokens.value(), 16); ASSERT_EQ(graphSettings.dynamicSplitFuse, "true"); @@ -1062,9 +1070,15 @@ TEST(OvmsGraphConfigTest, positiveAllChanged) { ASSERT_EQ(graphSettings.reasoningParser.value(), "reasoningParserName"); ASSERT_EQ(graphSettings.toolParser.value(), "toolParserName"); ASSERT_EQ(graphSettings.enableToolGuidedGeneration, "true"); + ASSERT_EQ(exportSettings.pluginConfig.modelDistributionPolicy.has_value(), true); + 
ASSERT_EQ(exportSettings.pluginConfig.modelDistributionPolicy.value(), "TENSOR_PARALLEL"); + ASSERT_EQ(exportSettings.pluginConfig.maxPromptLength.has_value(), true); + ASSERT_EQ(exportSettings.pluginConfig.maxPromptLength.value(), 2048); + ASSERT_EQ(exportSettings.pluginConfig.kvCachePrecision.value(), "u8"); + ASSERT_EQ(exportSettings.pluginConfig.useNpuPrefixCaching.value(), true); } -TEST(OvmsGraphConfigTest, positiveSomeChanged) { +TEST(OvmsGraphConfigTest, positiveSomeChangedTextGeneration) { std::string modelName = "OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov"; std::string downloadPath = "test/repository"; char* n_argv[] = { @@ -1090,17 +1104,18 @@ TEST(OvmsGraphConfigTest, positiveSomeChanged) { config.parse(arg_count, n_argv); auto& hfSettings = config.getServerSettings().hfSettings; + auto& exportSettings = hfSettings.exportSettings; ASSERT_EQ(hfSettings.sourceModel, modelName); ASSERT_EQ(hfSettings.downloadPath, downloadPath); ASSERT_EQ(hfSettings.overwriteModels, true); ASSERT_EQ(config.getServerSettings().serverMode, ovms::HF_PULL_MODE); ovms::TextGenGraphSettingsImpl graphSettings = std::get(hfSettings.graphSettings); - ASSERT_EQ(graphSettings.modelName, modelName); + ASSERT_EQ(exportSettings.modelName, modelName); ASSERT_EQ(graphSettings.pipelineType.value(), "VLM"); - ASSERT_EQ(graphSettings.modelPath, "./"); + ASSERT_EQ(exportSettings.modelPath, "./"); ASSERT_EQ(graphSettings.maxNumSeqs, 128); - ASSERT_EQ(graphSettings.targetDevice, "NPU"); - ASSERT_EQ(graphSettings.pluginConfig.kvCachePrecision.has_value(), false); + ASSERT_EQ(exportSettings.targetDevice, "NPU"); + ASSERT_EQ(exportSettings.pluginConfig.kvCachePrecision.has_value(), false); ASSERT_EQ(graphSettings.enablePrefixCaching, "true"); ASSERT_EQ(graphSettings.cacheSize, 10); ASSERT_EQ(graphSettings.maxNumBatchedTokens.has_value(), false); @@ -1126,16 +1141,17 @@ TEST(OvmsGraphConfigTest, positiveTaskTextGen) { ConstructorEnabledConfig config; config.parse(arg_count, n_argv); auto& hfSettings 
= config.getServerSettings().hfSettings; + auto& exportSettings = hfSettings.exportSettings; ASSERT_EQ(hfSettings.sourceModel, modelName); ASSERT_EQ(hfSettings.downloadPath, downloadPath); ASSERT_EQ(config.getServerSettings().serverMode, ovms::HF_PULL_MODE); ovms::TextGenGraphSettingsImpl graphSettings = std::get(hfSettings.graphSettings); - ASSERT_EQ(graphSettings.modelName, modelName); + ASSERT_EQ(exportSettings.modelName, modelName); ASSERT_EQ(graphSettings.pipelineType.has_value(), false); - ASSERT_EQ(graphSettings.modelPath, "./"); + ASSERT_EQ(exportSettings.modelPath, "./"); ASSERT_EQ(graphSettings.maxNumSeqs, 256); - ASSERT_EQ(graphSettings.targetDevice, "CPU"); - ASSERT_EQ(graphSettings.pluginConfig.kvCachePrecision.has_value(), false); + ASSERT_EQ(exportSettings.targetDevice, "CPU"); + ASSERT_EQ(exportSettings.pluginConfig.kvCachePrecision.has_value(), false); ASSERT_EQ(graphSettings.enablePrefixCaching, "true"); ASSERT_EQ(graphSettings.cacheSize, 10); ASSERT_EQ(graphSettings.maxNumBatchedTokens.has_value(), false); @@ -1218,20 +1234,29 @@ TEST(OvmsExportHfSettingsTest, allChanged) { (char*)"NPU", (char*)"--task", (char*)"text_generation", - }; + (char*)"--plugin_config", + (char*)"{\"NUM_STREAMS\":\"2\"}", + (char*)"--cache_dir", + (char*)"/tmp/cache_dir_with_gold"}; - int arg_count = 15; + int arg_count = 19; ConstructorEnabledConfig config; config.parse(arg_count, n_argv); - auto& hfSettings = config.getServerSettings().hfSettings; + auto& serverSettings = config.getServerSettings(); + auto& hfSettings = serverSettings.hfSettings; + auto& exportSettings = hfSettings.exportSettings; ASSERT_EQ(hfSettings.sourceModel, modelName); ASSERT_EQ(hfSettings.downloadPath, downloadPath); ASSERT_EQ(hfSettings.overwriteModels, true); - ASSERT_EQ(hfSettings.exportSettings.precision, "fp64"); - ASSERT_EQ(hfSettings.exportSettings.targetDevice, "NPU"); + ASSERT_EQ(exportSettings.precision, "fp64"); + ASSERT_EQ(exportSettings.targetDevice, "NPU"); 
ASSERT_EQ(hfSettings.downloadType, ovms::OPTIMUM_CLI_DOWNLOAD); - ASSERT_EQ(hfSettings.exportSettings.extraQuantizationParams.value(), "--sym --ratio 1.0"); + ASSERT_EQ(exportSettings.extraQuantizationParams.value(), "--sym --ratio 1.0"); + ASSERT_EQ(exportSettings.pluginConfig.cacheDir.value(), "/tmp/cache_dir_with_gold"); + // here we expect only what is passed by user not all plugin parameters passed to genai + ASSERT_EQ(hfSettings.exportSettings.pluginConfig.manualString.value(), "{\"NUM_STREAMS\":\"2\"}"); + ASSERT_EQ(serverSettings.cacheDir, "/tmp/cache_dir_with_gold"); ASSERT_EQ(config.getServerSettings().serverMode, ovms::HF_PULL_MODE); } @@ -1255,21 +1280,27 @@ TEST(OvmsExportHfSettingsTest, allChangedPullAndStart) { (char*)"NPU", (char*)"--task", (char*)"text_generation", - }; + (char*)"--plugin_config", + (char*)"{\"NUM_STREAMS\":\"2\"}", + (char*)"--cache_dir", + (char*)"/tmp/cache_dir_with_gold"}; - int arg_count = 16; + int arg_count = 20; ConstructorEnabledConfig config; config.parse(arg_count, n_argv); auto& hfSettings = config.getServerSettings().hfSettings; + auto& exportSettings = hfSettings.exportSettings; ASSERT_EQ(hfSettings.sourceModel, modelName); ASSERT_EQ(hfSettings.downloadPath, downloadPath); ASSERT_EQ(hfSettings.overwriteModels, true); - ASSERT_EQ(hfSettings.exportSettings.precision, "fp64"); - ASSERT_EQ(hfSettings.exportSettings.targetDevice, "NPU"); + ASSERT_EQ(exportSettings.precision, "fp64"); + ASSERT_EQ(exportSettings.targetDevice, "NPU"); ASSERT_EQ(hfSettings.downloadType, ovms::OPTIMUM_CLI_DOWNLOAD); - ASSERT_EQ(hfSettings.exportSettings.extraQuantizationParams.value(), "--sym --ratio 1.0"); + ASSERT_EQ(exportSettings.extraQuantizationParams.value(), "--sym --ratio 1.0"); ASSERT_EQ(config.getServerSettings().serverMode, ovms::HF_PULL_AND_START_MODE); + ASSERT_EQ(exportSettings.pluginConfig.manualString.value(), "{\"NUM_STREAMS\":\"2\"}"); + ASSERT_EQ(exportSettings.pluginConfig.cacheDir.value(), "/tmp/cache_dir_with_gold"); } 
TEST(OvmsGraphConfigTest, positiveDefault) { @@ -1290,16 +1321,17 @@ TEST(OvmsGraphConfigTest, positiveDefault) { ConstructorEnabledConfig config; config.parse(arg_count, n_argv); auto& hfSettings = config.getServerSettings().hfSettings; + auto& exportSettings = hfSettings.exportSettings; ASSERT_EQ(hfSettings.sourceModel, modelName); ASSERT_EQ(hfSettings.downloadPath, downloadPath); ASSERT_EQ(config.getServerSettings().serverMode, ovms::HF_PULL_MODE); ASSERT_EQ(hfSettings.task, ovms::TEXT_GENERATION_GRAPH); ovms::TextGenGraphSettingsImpl graphSettings = std::get(hfSettings.graphSettings); ASSERT_EQ(graphSettings.pipelineType.has_value(), false); - ASSERT_EQ(graphSettings.modelPath, "./"); + ASSERT_EQ(exportSettings.modelPath, "./"); ASSERT_EQ(graphSettings.maxNumSeqs, 256); - ASSERT_EQ(graphSettings.targetDevice, "CPU"); - ASSERT_EQ(graphSettings.pluginConfig.kvCachePrecision.has_value(), false); + ASSERT_EQ(exportSettings.targetDevice, "CPU"); + ASSERT_EQ(exportSettings.pluginConfig.kvCachePrecision.has_value(), false); ASSERT_EQ(graphSettings.enablePrefixCaching, "true"); ASSERT_EQ(graphSettings.cacheSize, 10); ASSERT_EQ(graphSettings.maxNumBatchedTokens.has_value(), false); @@ -1328,6 +1360,7 @@ TEST(OvmsGraphConfigTest, positiveDefaultStart) { ConstructorEnabledConfig config; config.parse(arg_count, n_argv); auto& hfSettings = config.getServerSettings().hfSettings; + auto& exportSettings = hfSettings.exportSettings; ASSERT_EQ(hfSettings.sourceModel, modelName); ASSERT_EQ(hfSettings.downloadPath, downloadPath); ASSERT_EQ(hfSettings.overwriteModels, false); @@ -1335,10 +1368,10 @@ TEST(OvmsGraphConfigTest, positiveDefaultStart) { ASSERT_EQ(hfSettings.task, ovms::TEXT_GENERATION_GRAPH); ovms::TextGenGraphSettingsImpl graphSettings = std::get(hfSettings.graphSettings); ASSERT_EQ(graphSettings.pipelineType.has_value(), false); - ASSERT_EQ(graphSettings.modelPath, "./"); + ASSERT_EQ(exportSettings.modelPath, "./"); ASSERT_EQ(graphSettings.maxNumSeqs, 256); - 
ASSERT_EQ(graphSettings.targetDevice, "CPU"); - ASSERT_EQ(graphSettings.pluginConfig.kvCachePrecision.has_value(), false); + ASSERT_EQ(exportSettings.targetDevice, "CPU"); + ASSERT_EQ(exportSettings.pluginConfig.kvCachePrecision.has_value(), false); ASSERT_EQ(graphSettings.enablePrefixCaching, "true"); ASSERT_EQ(graphSettings.cacheSize, 10); ASSERT_EQ(graphSettings.maxNumBatchedTokens.has_value(), false); @@ -1369,7 +1402,7 @@ TEST(OvmsGraphConfigTest, positiveTargetDeviceHetero) { config.parse(arg_count, n_argv); auto& hfSettings = config.getServerSettings().hfSettings; ovms::TextGenGraphSettingsImpl graphSettings = std::get(hfSettings.graphSettings); - ASSERT_EQ(graphSettings.targetDevice, "HETERO"); + ASSERT_EQ(hfSettings.exportSettings.targetDevice, "HETERO"); } TEST(OvmsGraphConfigTest, positiveTargetDeviceSpecificGPU) { @@ -1393,7 +1426,7 @@ TEST(OvmsGraphConfigTest, positiveTargetDeviceSpecificGPU) { config.parse(arg_count, n_argv); auto& hfSettings = config.getServerSettings().hfSettings; ovms::TextGenGraphSettingsImpl graphSettings = std::get(hfSettings.graphSettings); - ASSERT_EQ(graphSettings.targetDevice, "GPU.1"); + ASSERT_EQ(hfSettings.exportSettings.targetDevice, "GPU.1"); } TEST(OvmsGraphConfigTest, negativePipelineType) { @@ -1497,23 +1530,29 @@ TEST(OvmsGraphConfigTest, positiveAllChangedRerank) { (char*)"2", (char*)"--model_name", (char*)servingName.c_str(), - }; + (char*)"--plugin_config", + (char*)"{\"SOME_KEY\":\"SOME_VALUE\"}", + (char*)"--cache_dir", + (char*)"/tmp/cache_dir_with_emptiness"}; - int arg_count = 16; + int arg_count = 20; ConstructorEnabledConfig config; config.parse(arg_count, n_argv); auto& hfSettings = config.getServerSettings().hfSettings; + auto& exportSettings = hfSettings.exportSettings; ASSERT_EQ(hfSettings.sourceModel, modelName); ASSERT_EQ(hfSettings.downloadPath, downloadPath); ASSERT_EQ(config.getServerSettings().serverMode, ovms::HF_PULL_MODE); ASSERT_EQ(hfSettings.task, ovms::RERANK_GRAPH); 
ovms::RerankGraphSettingsImpl rerankGraphSettings = std::get(hfSettings.graphSettings); ASSERT_EQ(rerankGraphSettings.maxAllowedChunks, 1002); - ASSERT_EQ(rerankGraphSettings.numStreams, 2); - ASSERT_EQ(rerankGraphSettings.targetDevice, "GPU"); - ASSERT_EQ(rerankGraphSettings.modelName, servingName); - ASSERT_EQ(rerankGraphSettings.modelPath, "./"); + ASSERT_EQ(exportSettings.pluginConfig.numStreams, 2); + ASSERT_EQ(exportSettings.targetDevice, "GPU"); + ASSERT_EQ(exportSettings.modelName, servingName); + ASSERT_EQ(exportSettings.modelPath, "./"); + ASSERT_EQ(hfSettings.exportSettings.pluginConfig.cacheDir.value(), "/tmp/cache_dir_with_emptiness"); + ASSERT_EQ(hfSettings.exportSettings.pluginConfig.manualString.value(), "{\"SOME_KEY\":\"SOME_VALUE\"}"); } TEST(OvmsGraphConfigTest, positiveAllChangedRerankStart) { @@ -1545,16 +1584,17 @@ TEST(OvmsGraphConfigTest, positiveAllChangedRerankStart) { config.parse(arg_count, n_argv); auto& hfSettings = config.getServerSettings().hfSettings; + auto& exportSettings = hfSettings.exportSettings; ASSERT_EQ(hfSettings.sourceModel, modelName); ASSERT_EQ(hfSettings.downloadPath, downloadPath); ASSERT_EQ(config.getServerSettings().serverMode, ovms::HF_PULL_AND_START_MODE); ASSERT_EQ(hfSettings.task, ovms::RERANK_GRAPH); ovms::RerankGraphSettingsImpl rerankGraphSettings = std::get(hfSettings.graphSettings); ASSERT_EQ(rerankGraphSettings.maxAllowedChunks, 1002); - ASSERT_EQ(rerankGraphSettings.numStreams, 2); - ASSERT_EQ(rerankGraphSettings.targetDevice, "GPU"); - ASSERT_EQ(rerankGraphSettings.modelName, servingName); - ASSERT_EQ(rerankGraphSettings.modelPath, "./"); + ASSERT_EQ(exportSettings.pluginConfig.numStreams, 2); + ASSERT_EQ(exportSettings.targetDevice, "GPU"); + ASSERT_EQ(exportSettings.modelName, servingName); + ASSERT_EQ(exportSettings.modelPath, "./"); } TEST(OvmsGraphConfigTest, positiveDefaultRerank) { @@ -1577,16 +1617,17 @@ TEST(OvmsGraphConfigTest, positiveDefaultRerank) { config.parse(arg_count, n_argv); auto& 
hfSettings = config.getServerSettings().hfSettings; + auto& exportSettings = hfSettings.exportSettings; ASSERT_EQ(hfSettings.sourceModel, modelName); ASSERT_EQ(hfSettings.downloadPath, downloadPath); ASSERT_EQ(config.getServerSettings().serverMode, ovms::HF_PULL_MODE); ASSERT_EQ(hfSettings.task, ovms::RERANK_GRAPH); ovms::RerankGraphSettingsImpl rerankGraphSettings = std::get(hfSettings.graphSettings); ASSERT_EQ(rerankGraphSettings.maxAllowedChunks, 10000); - ASSERT_EQ(rerankGraphSettings.numStreams, 1); - ASSERT_EQ(rerankGraphSettings.targetDevice, "CPU"); - ASSERT_EQ(rerankGraphSettings.modelName, modelName); - ASSERT_EQ(rerankGraphSettings.modelPath, "./"); + ASSERT_EQ(exportSettings.pluginConfig.numStreams, 1); + ASSERT_EQ(exportSettings.targetDevice, "CPU"); + ASSERT_EQ(exportSettings.modelName, modelName); + ASSERT_EQ(exportSettings.modelPath, "./"); } TEST(OvmsGraphConfigTest, positiveSomeChangedRerank) { @@ -1615,16 +1656,17 @@ TEST(OvmsGraphConfigTest, positiveSomeChangedRerank) { config.parse(arg_count, n_argv); auto& hfSettings = config.getServerSettings().hfSettings; + auto& exportSettings = hfSettings.exportSettings; ASSERT_EQ(hfSettings.sourceModel, modelName); ASSERT_EQ(hfSettings.downloadPath, downloadPath); ASSERT_EQ(config.getServerSettings().serverMode, ovms::HF_PULL_MODE); ASSERT_EQ(hfSettings.task, ovms::RERANK_GRAPH); ovms::RerankGraphSettingsImpl rerankGraphSettings = std::get(hfSettings.graphSettings); ASSERT_EQ(rerankGraphSettings.maxAllowedChunks, 2); - ASSERT_EQ(rerankGraphSettings.numStreams, 1); - ASSERT_EQ(rerankGraphSettings.targetDevice, "GPU"); - ASSERT_EQ(rerankGraphSettings.modelName, servingName); - ASSERT_EQ(rerankGraphSettings.modelPath, "./"); + ASSERT_EQ(exportSettings.pluginConfig.numStreams, 1); + ASSERT_EQ(exportSettings.targetDevice, "GPU"); + ASSERT_EQ(exportSettings.modelName, servingName); + ASSERT_EQ(exportSettings.modelPath, "./"); } TEST(OvmsGraphConfigTest, positiveAllChangedImageGeneration) { @@ -1661,19 +1703,22 
@@ TEST(OvmsGraphConfigTest, positiveAllChangedImageGeneration) { (char*)"2", (char*)"--max_num_inference_steps", (char*)"3", + (char*)"--plugin_config", + (char*)"{\"SOME_KEY\":\"SOME_VALUE\"}", }; - int arg_count = 30; + int arg_count = 32; ConstructorEnabledConfig config; config.parse(arg_count, n_argv); auto& hfSettings = config.getServerSettings().hfSettings; + auto& exportSettings = hfSettings.exportSettings; ASSERT_EQ(hfSettings.sourceModel, modelName); ASSERT_EQ(hfSettings.downloadPath, downloadPath); ASSERT_EQ(config.getServerSettings().serverMode, ovms::HF_PULL_MODE); ASSERT_EQ(hfSettings.task, ovms::IMAGE_GENERATION_GRAPH); ovms::ImageGenerationGraphSettingsImpl imageGenerationGraphSettings = std::get(hfSettings.graphSettings); - ASSERT_EQ(imageGenerationGraphSettings.targetDevice, "GPU GPU NPU"); + ASSERT_EQ(exportSettings.targetDevice, "GPU GPU NPU"); ASSERT_EQ(imageGenerationGraphSettings.resolution, " 3000x4000 200x700 100x200"); ASSERT_TRUE(imageGenerationGraphSettings.guidanceScale.has_value()); ASSERT_NEAR(imageGenerationGraphSettings.guidanceScale.value(), 8.2, 1e-5); @@ -1686,7 +1731,9 @@ TEST(OvmsGraphConfigTest, positiveAllChangedImageGeneration) { ASSERT_EQ(imageGenerationGraphSettings.defaultNumInferenceSteps.value(), 2); ASSERT_TRUE(imageGenerationGraphSettings.maxNumInferenceSteps.has_value()); ASSERT_EQ(imageGenerationGraphSettings.maxNumInferenceSteps.value(), 3); - ASSERT_EQ(imageGenerationGraphSettings.pluginConfig, "{\"NUM_STREAMS\":14,\"CACHE_DIR\":\"/cache\"}"); + ASSERT_EQ(exportSettings.pluginConfig.numStreams, 14); + ASSERT_EQ(exportSettings.pluginConfig.cacheDir.value(), "/cache"); + ASSERT_EQ(exportSettings.pluginConfig.manualString.value(), "{\"SOME_KEY\":\"SOME_VALUE\"}"); } TEST(OvmsGraphConfigTest, positiveDefaultImageGeneration) { @@ -1708,18 +1755,19 @@ TEST(OvmsGraphConfigTest, positiveDefaultImageGeneration) { config.parse(arg_count, n_argv); auto& hfSettings = config.getServerSettings().hfSettings; + auto& 
exportSettings = hfSettings.exportSettings; ASSERT_EQ(hfSettings.sourceModel, modelName); ASSERT_EQ(hfSettings.downloadPath, downloadPath); ASSERT_EQ(config.getServerSettings().serverMode, ovms::HF_PULL_MODE); ASSERT_EQ(hfSettings.task, ovms::IMAGE_GENERATION_GRAPH); ovms::ImageGenerationGraphSettingsImpl imageGenerationGraphSettings = std::get(hfSettings.graphSettings); - ASSERT_EQ(imageGenerationGraphSettings.targetDevice, "CPU"); + ASSERT_EQ(exportSettings.targetDevice, "CPU"); ASSERT_TRUE(imageGenerationGraphSettings.maxResolution.empty()); ASSERT_TRUE(imageGenerationGraphSettings.defaultResolution.empty()); ASSERT_FALSE(imageGenerationGraphSettings.maxNumberImagesPerPrompt.has_value()); ASSERT_FALSE(imageGenerationGraphSettings.defaultNumInferenceSteps.has_value()); ASSERT_FALSE(imageGenerationGraphSettings.maxNumInferenceSteps.has_value()); - ASSERT_TRUE(imageGenerationGraphSettings.pluginConfig.empty()); + ASSERT_TRUE(exportSettings.pluginConfig.empty()); } TEST(OvmsGraphConfigTest, positiveAllChangedEmbeddings) { @@ -1747,13 +1795,17 @@ TEST(OvmsGraphConfigTest, positiveAllChangedEmbeddings) { (char*)"2", (char*)"--model_name", (char*)servingName.c_str(), - }; + (char*)"--plugin_config", + (char*)"{\"SOME_KEY\":\"SOME_VALUE\"}", + (char*)"--cache_dir", + (char*)"/tmp/cache_dir_with_emptiness"}; - int arg_count = 20; + int arg_count = 24; ConstructorEnabledConfig config; config.parse(arg_count, n_argv); auto& hfSettings = config.getServerSettings().hfSettings; + auto& exportSettings = hfSettings.exportSettings; ASSERT_EQ(hfSettings.sourceModel, modelName); ASSERT_EQ(hfSettings.downloadPath, downloadPath); ASSERT_EQ(config.getServerSettings().serverMode, ovms::HF_PULL_MODE); @@ -1762,10 +1814,12 @@ TEST(OvmsGraphConfigTest, positiveAllChangedEmbeddings) { ASSERT_EQ(embeddingsGraphSettings.normalize, "false"); ASSERT_EQ(embeddingsGraphSettings.truncate, "true"); ASSERT_EQ(embeddingsGraphSettings.pooling, "CLS"); - ASSERT_EQ(embeddingsGraphSettings.numStreams, 
2); - ASSERT_EQ(embeddingsGraphSettings.targetDevice, "GPU"); - ASSERT_EQ(embeddingsGraphSettings.modelName, servingName); - ASSERT_EQ(embeddingsGraphSettings.modelPath, "./"); + ASSERT_EQ(exportSettings.pluginConfig.numStreams, 2); + ASSERT_EQ(exportSettings.targetDevice, "GPU"); + ASSERT_EQ(exportSettings.modelName, servingName); + ASSERT_EQ(exportSettings.modelPath, "./"); + ASSERT_EQ(exportSettings.pluginConfig.cacheDir.value(), "/tmp/cache_dir_with_emptiness"); + ASSERT_EQ(exportSettings.pluginConfig.manualString.value(), "{\"SOME_KEY\":\"SOME_VALUE\"}"); } TEST(OvmsGraphConfigTest, positiveAllChangedEmbeddingsStart) { @@ -1801,6 +1855,7 @@ TEST(OvmsGraphConfigTest, positiveAllChangedEmbeddingsStart) { config.parse(arg_count, n_argv); auto& hfSettings = config.getServerSettings().hfSettings; + auto& exportSettings = hfSettings.exportSettings; ASSERT_EQ(hfSettings.sourceModel, modelName); ASSERT_EQ(hfSettings.downloadPath, downloadPath); ASSERT_EQ(config.getServerSettings().serverMode, ovms::HF_PULL_AND_START_MODE); @@ -1809,10 +1864,10 @@ TEST(OvmsGraphConfigTest, positiveAllChangedEmbeddingsStart) { ASSERT_EQ(embeddingsGraphSettings.normalize, "false"); ASSERT_EQ(embeddingsGraphSettings.truncate, "true"); ASSERT_EQ(embeddingsGraphSettings.pooling, "LAST"); - ASSERT_EQ(embeddingsGraphSettings.numStreams, 2); - ASSERT_EQ(embeddingsGraphSettings.targetDevice, "GPU"); - ASSERT_EQ(embeddingsGraphSettings.modelName, servingName); - ASSERT_EQ(embeddingsGraphSettings.modelPath, "./"); + ASSERT_EQ(exportSettings.pluginConfig.numStreams, 2); + ASSERT_EQ(exportSettings.targetDevice, "GPU"); + ASSERT_EQ(exportSettings.modelName, servingName); + ASSERT_EQ(exportSettings.modelPath, "./"); } TEST(OvmsGraphConfigTest, positiveDefaultEmbeddings) { @@ -1834,6 +1889,7 @@ TEST(OvmsGraphConfigTest, positiveDefaultEmbeddings) { config.parse(arg_count, n_argv); auto& hfSettings = config.getServerSettings().hfSettings; + auto& exportSettings = hfSettings.exportSettings; 
ASSERT_EQ(hfSettings.sourceModel, modelName); ASSERT_EQ(hfSettings.downloadPath, downloadPath); ASSERT_EQ(config.getServerSettings().serverMode, ovms::HF_PULL_MODE); @@ -1842,9 +1898,9 @@ TEST(OvmsGraphConfigTest, positiveDefaultEmbeddings) { ASSERT_EQ(embeddingsGraphSettings.normalize, "true"); ASSERT_EQ(embeddingsGraphSettings.truncate, "false"); ASSERT_EQ(embeddingsGraphSettings.pooling, "CLS"); - ASSERT_EQ(embeddingsGraphSettings.numStreams, 1); - ASSERT_EQ(embeddingsGraphSettings.targetDevice, "CPU"); - ASSERT_EQ(embeddingsGraphSettings.modelName, modelName); + ASSERT_EQ(exportSettings.pluginConfig.numStreams, 1); + ASSERT_EQ(exportSettings.targetDevice, "CPU"); + ASSERT_EQ(exportSettings.modelName, modelName); } TEST(OvmsGraphConfigTest, positiveSomeChangedEmbeddings) { @@ -1875,17 +1931,18 @@ TEST(OvmsGraphConfigTest, positiveSomeChangedEmbeddings) { config.parse(arg_count, n_argv); auto& hfSettings = config.getServerSettings().hfSettings; + auto& exportSettings = config.getServerSettings().hfSettings.exportSettings; ASSERT_EQ(hfSettings.sourceModel, modelName); ASSERT_EQ(hfSettings.downloadPath, downloadPath); ASSERT_EQ(config.getServerSettings().serverMode, ovms::HF_PULL_MODE); ASSERT_EQ(hfSettings.task, ovms::EMBEDDINGS_GRAPH); ovms::EmbeddingsGraphSettingsImpl embeddingsGraphSettings = std::get(hfSettings.graphSettings); ASSERT_EQ(embeddingsGraphSettings.pooling, "LAST"); - ASSERT_EQ(embeddingsGraphSettings.numStreams, 1); + ASSERT_EQ(exportSettings.pluginConfig.numStreams, 1); ASSERT_EQ(embeddingsGraphSettings.normalize, "false"); - ASSERT_EQ(embeddingsGraphSettings.targetDevice, "GPU"); - ASSERT_EQ(embeddingsGraphSettings.modelName, servingName); - ASSERT_EQ(embeddingsGraphSettings.modelPath, "./"); + ASSERT_EQ(exportSettings.targetDevice, "GPU"); + ASSERT_EQ(exportSettings.modelName, servingName); + ASSERT_EQ(exportSettings.modelPath, "./"); } TEST(OvmsGraphConfigTest, negativeEmbeddingsInvalidNormalize) { diff --git a/src/test/pull_hf_model_test.cpp 
b/src/test/pull_hf_model_test.cpp index 5993d4cffa..8b43167d9a 100644 --- a/src/test/pull_hf_model_test.cpp +++ b/src/test/pull_hf_model_test.cpp @@ -91,7 +91,6 @@ const std::string expectedGraphContents = R"( max_num_seqs:256, device: "CPU", models_path: "./", - plugin_config: '{ }', enable_prefix_caching: true, cache_size: 10, } @@ -129,7 +128,6 @@ const std::string expectedGraphContentsDraft = R"( max_num_seqs:256, device: "CPU", models_path: "./", - plugin_config: '{ }', enable_prefix_caching: true, cache_size: 10, # Speculative decoding configuration