docs/parameters.md (1 change: 1 addition & 0 deletions)

@@ -130,6 +130,7 @@ Task specific parameters for different tasks (text generation/image generation/e
| `--dynamic_split_fuse` | `bool` | Enables dynamic split fuse algorithm. Default: true. |
| `--max_prompt_len` | `integer` | Sets NPU specific property for maximum number of tokens in the prompt. |
| `--kv_cache_precision` | `string` | Reduced kv cache precision to `u8` lowers the cache size consumption. Accepted values: `u8` or empty (default). |
| `--model_distribution_policy` | `string` | TENSOR_PARALLEL distributes tensor to multiple sockets/devices and processes it in parallel. PIPELINE_PARALLEL distributes different tensors to process by each device. Accepted values: `TENSOR_PARALLEL`, `PIPELINE_PARALLEL` or empty (default). |
Suggested change:
| `--model_distribution_policy` | `string` | TENSOR_PARALLEL distributes tensor to multiple sockets/devices and processes it in parallel. PIPELINE_PARALLEL distributes different tensors to process by each device. Accepted values: `TENSOR_PARALLEL`, `PIPELINE_PARALLEL` or empty (default). |
| `--model_distribution_policy` | `string` | `TENSOR_PARALLEL` distributes tensor to multiple sockets/devices and processes it in parallel. `PIPELINE_PARALLEL` distributes different tensors to process by each device. Accepted values: `TENSOR_PARALLEL`, `PIPELINE_PARALLEL` or empty (default). |

| `--reasoning_parser` | `string` | Type of parser to use for reasoning content extraction from model output. Currently supported: [qwen3] |
| `--tool_parser` | `string` | Type of parser to use for tool calls extraction from model output. Currently supported: [llama3, hermes3, phi4] |
| `--enable_tool_guided_generation` | `bool` | Enables enforcing tool schema during generation. Requires setting response parser. Default: false. |
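Aside (not part of the diff): with this addition, a pull-time invocation could look like `ovms --pull --source_model <model> --model_distribution_policy TENSOR_PARALLEL`; every flag here other than `--model_distribution_policy` is assumed context rather than something this PR introduces.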
src/capi_frontend/server_settings.cpp (1 change: 0 additions & 1 deletion)

@@ -20,7 +20,6 @@
#include "../stringutils.hpp"

namespace ovms {

std::string enumToString(ConfigExportType type) {
auto it = configExportTypeToString.find(type);
return (it != configExportTypeToString.end()) ? it->second : "UNKNOWN_MODEL";
src/capi_frontend/server_settings.hpp (30 changes: 14 additions & 16 deletions)

@@ -88,20 +88,27 @@ enum OvmsServerMode : int {
};

struct PluginConfigSettingsImpl {
std::optional<std::string> manualString;
std::optional<std::string> kvCachePrecision;
std::optional<uint32_t> maxPromptLength;
std::optional<std::string> modelDistributionPolicy;
std::optional<uint32_t> numStreams;
std::optional<std::string> cacheDir;
bool empty() const {
return !kvCachePrecision.has_value() &&
!maxPromptLength.has_value() &&
!modelDistributionPolicy.has_value() &&
!numStreams.has_value() &&
!cacheDir.has_value() &&
(!manualString.has_value() || manualString.value().empty());
}
};
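Aside (not part of the diff): the new optional fields in `PluginConfigSettingsImpl` each map to one CLI flag, while `manualString` carries the raw `--plugin_config` JSON (see the cli_parser.cpp change below). A minimal sketch of how such fields could be merged into one plugin-config JSON string; the helper name and JSON keys are assumptions, not code from this PR:

```cpp
#include <cstdint>
#include <optional>
#include <sstream>
#include <string>

// Hypothetical helper, not from this PR: serializes the optional plugin
// settings into a JSON object. Key names mirror the CLI flags but are
// illustrative only; manualString handling is omitted for brevity.
std::string buildPluginConfigJson(
    const std::optional<std::string>& kvCachePrecision,
    const std::optional<uint32_t>& maxPromptLength,
    const std::optional<std::string>& modelDistributionPolicy,
    const std::optional<uint32_t>& numStreams,
    const std::optional<std::string>& cacheDir) {
    std::ostringstream json;
    json << "{";
    bool first = true;
    auto add = [&](const char* key, const std::string& value, bool quoted) {
        json << (first ? "" : ", ") << "\"" << key << "\": ";
        if (quoted)
            json << "\"" << value << "\"";
        else
            json << value;
        first = false;
    };
    if (kvCachePrecision)
        add("KV_CACHE_PRECISION", *kvCachePrecision, true);
    if (maxPromptLength)
        add("MAX_PROMPT_LEN", std::to_string(*maxPromptLength), false);
    if (modelDistributionPolicy)
        add("MODEL_DISTRIBUTION_POLICY", *modelDistributionPolicy, true);
    if (numStreams)
        add("NUM_STREAMS", std::to_string(*numStreams), false);
    if (cacheDir)
        add("CACHE_DIR", *cacheDir, true);
    json << "}";
    return json.str();
}
```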

struct TextGenGraphSettingsImpl {
std::string modelPath = "./";
std::string modelName = "";
uint32_t maxNumSeqs = 256;
std::string targetDevice = "CPU";
std::string enablePrefixCaching = "true";
uint32_t cacheSize = 10;
std::string dynamicSplitFuse = "true";
PluginConfigSettingsImpl pluginConfig;
std::optional<uint32_t> maxNumBatchedTokens;
std::optional<std::string> draftModelDirName;
std::optional<std::string> pipelineType;
@@ -111,27 +118,16 @@ struct TextGenGraphSettingsImpl {
};

struct EmbeddingsGraphSettingsImpl {
std::string modelPath = "./";
std::string targetDevice = "CPU";
std::string modelName = "";
uint32_t numStreams = 1;
std::string normalize = "true";
std::string truncate = "false";
std::string pooling = "CLS";
};

struct RerankGraphSettingsImpl {
std::string modelPath = "./";
std::string targetDevice = "CPU";
std::string modelName = "";
uint32_t numStreams = 1;
uint64_t maxAllowedChunks = 10000;
};

struct ImageGenerationGraphSettingsImpl {
std::string modelName = "";
std::string modelPath = "./";
std::string targetDevice = "CPU";
std::string resolution = "";
std::string maxResolution = "";
std::string defaultResolution = "";
@@ -140,13 +136,15 @@ struct ImageGenerationGraphSettingsImpl {
std::optional<uint32_t> maxNumberImagesPerPrompt;
std::optional<uint32_t> defaultNumInferenceSteps;
std::optional<uint32_t> maxNumInferenceSteps;
std::string pluginConfig;
};

struct ExportSettings {
std::string modelName = "";
std::string modelPath = "./";
std::string targetDevice = "CPU";
std::optional<std::string> extraQuantizationParams;
std::string precision = "int8";
PluginConfigSettingsImpl pluginConfig;
};

struct HFSettingsImpl {
src/cli_parser.cpp (4 changes: 4 additions & 0 deletions)

@@ -574,6 +574,7 @@ void CLIParser::prepareModel(ModelsSettingsImpl& modelsSettings, HFSettingsImpl&

if (result->count("plugin_config")) {
modelsSettings.pluginConfig = result->operator[]("plugin_config").as<std::string>();
hfSettings.exportSettings.pluginConfig.manualString = modelsSettings.pluginConfig;
modelsSettings.userSetSingleModelArguments.push_back("plugin_config");
}

@@ -684,6 +685,9 @@ void CLIParser::prepareGraph(ServerSettingsImpl& serverSettings, HFSettingsImpl&
throw std::logic_error("Tried to prepare graph settings without graph parser initialization");
}
}
if (!serverSettings.cacheDir.empty()) {
hfSettings.exportSettings.pluginConfig.cacheDir = serverSettings.cacheDir;
}
// No pull nor pull and start mode
} else {
if (result->count("weight-format")) {
src/config.cpp (37 changes: 19 additions & 18 deletions)

@@ -129,40 +129,41 @@ bool Config::validate() {
std::cerr << "Graph options not initialized for text generation.";
return false;
}
auto settings = std::get<TextGenGraphSettingsImpl>(this->serverSettings.hfSettings.graphSettings);
const auto& exportSettings = this->serverSettings.hfSettings.exportSettings;
auto textGenSettings = std::get<TextGenGraphSettingsImpl>(this->serverSettings.hfSettings.graphSettings);
std::vector allowedPipelineTypes = {"LM", "LM_CB", "VLM", "VLM_CB", "AUTO"};
if (settings.pipelineType.has_value() && std::find(allowedPipelineTypes.begin(), allowedPipelineTypes.end(), settings.pipelineType) == allowedPipelineTypes.end()) {
std::cerr << "pipeline_type: " << settings.pipelineType.value() << " is not allowed. Supported types: LM, LM_CB, VLM, VLM_CB, AUTO" << std::endl;
if (textGenSettings.pipelineType.has_value() && std::find(allowedPipelineTypes.begin(), allowedPipelineTypes.end(), textGenSettings.pipelineType) == allowedPipelineTypes.end()) {
std::cerr << "pipeline_type: " << textGenSettings.pipelineType.value() << " is not allowed. Supported types: LM, LM_CB, VLM, VLM_CB, AUTO" << std::endl;
return false;
}

std::vector allowedTargetDevices = {"CPU", "GPU", "NPU", "AUTO"};
bool validDeviceSelected = false;
if (settings.targetDevice.rfind("GPU.", 0) == 0) {
if (exportSettings.targetDevice.rfind("GPU.", 0) == 0) {
// Accept GPU.x where x is a number to select specific GPU card
std::string indexPart = settings.targetDevice.substr(4);
std::string indexPart = exportSettings.targetDevice.substr(4);
validDeviceSelected = !indexPart.empty() && std::all_of(indexPart.begin(), indexPart.end(), ::isdigit);
} else if (settings.targetDevice.rfind("HETERO", 0) == 0) {
// Accept HETERO:<device1>,<device2>,... to select specific devices in the list
} else if ((exportSettings.targetDevice.rfind("HETERO", 0) == 0) || (exportSettings.targetDevice.rfind("AUTO", 0) == 0)) {
// Accept HETERO:<device1>,<device2>,... AUTO:<device1>,<device2>,... to select specific devices in the list
validDeviceSelected = true;
} else if (std::find(allowedTargetDevices.begin(), allowedTargetDevices.end(), settings.targetDevice) != allowedTargetDevices.end()) {
} else if (std::find(allowedTargetDevices.begin(), allowedTargetDevices.end(), exportSettings.targetDevice) != allowedTargetDevices.end()) {
// Accept CPU, GPU, NPU, AUTO as valid devices
validDeviceSelected = true;
}

if (!validDeviceSelected) {
std::cerr << "target_device: " << settings.targetDevice << " is not allowed. Supported devices: CPU, GPU, NPU, HETERO, AUTO" << std::endl;
std::cerr << "target_device: " << exportSettings.targetDevice << " is not allowed. Supported devices: CPU, GPU, NPU, HETERO, AUTO" << std::endl;
return false;
}

std::vector allowedBoolValues = {"false", "true"};
if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), settings.enablePrefixCaching) == allowedBoolValues.end()) {
std::cerr << "enable_prefix_caching: " << settings.enablePrefixCaching << " is not allowed. Supported values: true, false" << std::endl;
if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), textGenSettings.enablePrefixCaching) == allowedBoolValues.end()) {
std::cerr << "enable_prefix_caching: " << textGenSettings.enablePrefixCaching << " is not allowed. Supported values: true, false" << std::endl;
return false;
}

if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), settings.dynamicSplitFuse) == allowedBoolValues.end()) {
std::cerr << "dynamic_split_fuse: " << settings.dynamicSplitFuse << " is not allowed. Supported values: true, false" << std::endl;
if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), textGenSettings.dynamicSplitFuse) == allowedBoolValues.end()) {
std::cerr << "dynamic_split_fuse: " << textGenSettings.dynamicSplitFuse << " is not allowed. Supported values: true, false" << std::endl;
return false;
}
}
@@ -172,16 +173,16 @@ bool Config::validate() {
std::cerr << "Graph options not initialized for embeddings.";
return false;
}
auto settings = std::get<EmbeddingsGraphSettingsImpl>(this->serverSettings.hfSettings.graphSettings);
auto embedSettings = std::get<EmbeddingsGraphSettingsImpl>(this->serverSettings.hfSettings.graphSettings);

std::vector allowedBoolValues = {"false", "true"};
if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), settings.normalize) == allowedBoolValues.end()) {
std::cerr << "normalize: " << settings.normalize << " is not allowed. Supported values: true, false" << std::endl;
if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), embedSettings.normalize) == allowedBoolValues.end()) {
std::cerr << "normalize: " << embedSettings.normalize << " is not allowed. Supported values: true, false" << std::endl;
return false;
}

if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), settings.truncate) == allowedBoolValues.end()) {
std::cerr << "truncate: " << settings.truncate << " is not allowed. Supported values: true, false" << std::endl;
if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), embedSettings.truncate) == allowedBoolValues.end()) {
std::cerr << "truncate: " << embedSettings.truncate << " is not allowed. Supported values: true, false" << std::endl;
return false;
}
}
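Aside (not part of the diff): the `target_device` rules above accept `CPU`, `GPU`, `NPU`, `AUTO`, `GPU.<index>`, and `HETERO:`/`AUTO:` device lists. A standalone sketch that re-implements the same checks, useful for seeing which strings pass:

```cpp
#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>
#include <vector>

// Standalone re-implementation of the target_device checks from
// Config::validate(), for illustration only.
bool isValidTargetDevice(const std::string& device) {
    static const std::vector<std::string> allowed = {"CPU", "GPU", "NPU", "AUTO"};
    if (device.rfind("GPU.", 0) == 0) {
        // GPU.x where x is a number selecting a specific GPU card
        std::string index = device.substr(4);
        return !index.empty() && std::all_of(index.begin(), index.end(), ::isdigit);
    }
    if (device.rfind("HETERO", 0) == 0 || device.rfind("AUTO", 0) == 0)
        return true;  // HETERO:<d1>,<d2>,... or AUTO:<d1>,<d2>,...
    return std::find(allowed.begin(), allowed.end(), device) != allowed.end();
}

int main() {
    for (const std::string d : {"CPU", "GPU.1", "HETERO:GPU,CPU", "AUTO:NPU,CPU", "TPU"})
        std::cout << d << " -> " << (isValidTargetDevice(d) ? "ok" : "rejected") << "\n";
}
```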
src/graph_export/BUILD (1 change: 0 additions & 1 deletion)

@@ -80,7 +80,6 @@ ovms_cc_library(
"@ovms//src:libovms_server_settings",
"@ovms//src:ovms_exit_codes",
"@com_github_jarro2783_cxxopts//:cxxopts",
"@com_github_tencent_rapidjson//:rapidjson",
],
visibility = ["//visibility:public"],
)
src/graph_export/embeddings_graph_cli_parser.cpp (8 changes: 4 additions & 4 deletions)

@@ -81,19 +81,19 @@ std::vector<std::string> EmbeddingsGraphCLIParser::parse(const std::vector<std::

void EmbeddingsGraphCLIParser::prepare(OvmsServerMode serverMode, HFSettingsImpl& hfSettings, const std::string& modelName) {
EmbeddingsGraphSettingsImpl embeddingsGraphSettings = EmbeddingsGraphCLIParser::defaultGraphSettings();
embeddingsGraphSettings.targetDevice = hfSettings.exportSettings.targetDevice;
hfSettings.exportSettings.targetDevice = hfSettings.exportSettings.targetDevice;
if (modelName != "") {
embeddingsGraphSettings.modelName = modelName;
hfSettings.exportSettings.modelName = modelName;
} else {
embeddingsGraphSettings.modelName = hfSettings.sourceModel;
hfSettings.exportSettings.modelName = hfSettings.sourceModel;
}
if (nullptr == result) {
// Pull with default arguments - no arguments from user
if (serverMode != HF_PULL_MODE && serverMode != HF_PULL_AND_START_MODE) {
throw std::logic_error("Tried to prepare server and model settings without graph parse result");
}
} else {
embeddingsGraphSettings.numStreams = result->operator[]("num_streams").as<uint32_t>();
hfSettings.exportSettings.pluginConfig.numStreams = result->operator[]("num_streams").as<uint32_t>();
embeddingsGraphSettings.normalize = result->operator[]("normalize").as<std::string>();
embeddingsGraphSettings.truncate = result->operator[]("truncate").as<std::string>();
embeddingsGraphSettings.pooling = result->operator[]("pooling").as<std::string>();
src/graph_export/graph_cli_parser.cpp (20 changes: 13 additions & 7 deletions)

@@ -89,7 +89,11 @@ void GraphCLIParser::createOptions() {
("kv_cache_precision",
"u8 or empty (model default). Reduced kv cache precision to u8 lowers the cache size consumption.",
cxxopts::value<std::string>()->default_value(""),
"KV_CACHE_PRECISION");
"KV_CACHE_PRECISION")
("model_distribution_policy",
"TENSOR_PARALLEL, PIPELINE_PARALLEL or empty (model default). Sets model distribution policy for inference with multiple sockets/devices.",
cxxopts::value<std::string>(),
"MODEL_DISTRIBUTION_POLICY");
}

void GraphCLIParser::printHelp() {
@@ -115,12 +119,12 @@ std::vector<std::string> GraphCLIParser::parse(const std::vector<std::string>& u

void GraphCLIParser::prepare(OvmsServerMode serverMode, HFSettingsImpl& hfSettings, const std::string& modelName) {
TextGenGraphSettingsImpl graphSettings = GraphCLIParser::defaultGraphSettings();
graphSettings.targetDevice = hfSettings.exportSettings.targetDevice;
hfSettings.exportSettings.targetDevice = hfSettings.exportSettings.targetDevice;
// Deduct model name
if (modelName != "") {
graphSettings.modelName = modelName;
hfSettings.exportSettings.modelName = modelName;
} else {
graphSettings.modelName = hfSettings.sourceModel;
hfSettings.exportSettings.modelName = hfSettings.sourceModel;
}

if (nullptr == result) {
@@ -153,11 +157,13 @@ void GraphCLIParser::prepare(OvmsServerMode serverMode, HFSettingsImpl& hfSettin

// Plugin configuration
if (result->count("max_prompt_len")) {
graphSettings.pluginConfig.maxPromptLength = result->operator[]("max_prompt_len").as<uint32_t>();
hfSettings.exportSettings.pluginConfig.maxPromptLength = result->operator[]("max_prompt_len").as<uint32_t>();
}
if (result->count("model_distribution_policy")) {
hfSettings.exportSettings.pluginConfig.modelDistributionPolicy = result->operator[]("model_distribution_policy").as<std::string>();
}

if (result->count("kv_cache_precision")) {
graphSettings.pluginConfig.kvCachePrecision = result->operator[]("kv_cache_precision").as<std::string>();
hfSettings.exportSettings.pluginConfig.kvCachePrecision = result->operator[]("kv_cache_precision").as<std::string>();
}
}

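Aside (not part of the diff): in the lines shown, the new `model_distribution_policy` value is stored without a value check. A possible follow-up validation, sketched under the assumption that only the two documented values should pass and that it runs inside `GraphCLIParser::prepare` with the surrounding `result` and `hfSettings` in scope:

```cpp
// Hypothetical validation, not in this PR; assumes <stdexcept> is available
// and that this block replaces the unvalidated assignment above.
if (result->count("model_distribution_policy")) {
    const auto policy = result->operator[]("model_distribution_policy").as<std::string>();
    if (policy != "TENSOR_PARALLEL" && policy != "PIPELINE_PARALLEL") {
        throw std::invalid_argument("model_distribution_policy: " + policy +
                                    " is not allowed. Supported values: TENSOR_PARALLEL, PIPELINE_PARALLEL");
    }
    hfSettings.exportSettings.pluginConfig.modelDistributionPolicy = policy;
}
```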