docs/parameters.md (1 change: 1 addition & 0 deletions)

@@ -130,6 +130,7 @@ Task specific parameters for different tasks (text generation/image generation/e
| `--dynamic_split_fuse` | `bool` | Enables dynamic split fuse algorithm. Default: true. |
| `--max_prompt_len` | `integer` | Sets NPU specific property for maximum number of tokens in the prompt. |
| `--kv_cache_precision` | `string` | Reduced kv cache precision to `u8` lowers the cache size consumption. Accepted values: `u8` or empty (default). |
| `--model_distribution_policy` | `string` | TENSOR_PARALLEL distributes tensor to multiple sockets/devices and processes it in parallel. PIPELINE_PARALLEL distributes different tensors to process by each device. Accepted values: `TENSOR_PARALLEL`, `PIPELINE_PARALLEL` or empty (default). |
Suggested change:
| `--model_distribution_policy` | `string` | TENSOR_PARALLEL distributes tensor to multiple sockets/devices and processes it in parallel. PIPELINE_PARALLEL distributes different tensors to process by each device. Accepted values: `TENSOR_PARALLEL`, `PIPELINE_PARALLEL` or empty (default). |
| `--model_distribution_policy` | `string` | `TENSOR_PARALLEL` distributes tensor to multiple sockets/devices and processes it in parallel. `PIPELINE_PARALLEL` distributes different tensors to process by each device. Accepted values: `TENSOR_PARALLEL`, `PIPELINE_PARALLEL` or empty (default). |

| `--reasoning_parser` | `string` | Type of parser to use for reasoning content extraction from model output. Currently supported: [qwen3] |
| `--tool_parser` | `string` | Type of parser to use for tool calls extraction from model output. Currently supported: [llama3, hermes3, phi4] |
| `--enable_tool_guided_generation` | `bool` | Enables enforcing tool schema during generation. Requires setting response parser. Default: false. |
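Aside (not part of the diff): with this addition, a pull-time invocation could look like `ovms --pull --source_model <model> --model_distribution_policy TENSOR_PARALLEL`; every flag here other than `--model_distribution_policy` is assumed context rather than something this PR introduces.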
src/capi_frontend/server_settings.cpp (1 change: 0 additions & 1 deletion)

@@ -20,7 +20,6 @@
#include "../stringutils.hpp"

namespace ovms {

std::string enumToString(ConfigExportType type) {
auto it = configExportTypeToString.find(type);
return (it != configExportTypeToString.end()) ? it->second : "UNKNOWN_MODEL";
src/capi_frontend/server_settings.hpp (30 changes: 14 additions & 16 deletions)

@@ -88,20 +88,27 @@ enum OvmsServerMode : int {
};

struct PluginConfigSettingsImpl {
std::optional<std::string> manualString;
std::optional<std::string> kvCachePrecision;
std::optional<uint32_t> maxPromptLength;
std::optional<std::string> modelDistributionPolicy;
std::optional<uint32_t> numStreams;
std::optional<std::string> cacheDir;
bool empty() const {
return !kvCachePrecision.has_value() &&
!maxPromptLength.has_value() &&
!modelDistributionPolicy.has_value() &&
!numStreams.has_value() &&
!cacheDir.has_value() &&
(!manualString.has_value() || manualString.value().empty());
}
};
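Aside (not part of the diff): the new optional fields in `PluginConfigSettingsImpl` each map to one CLI flag, while `manualString` carries the raw `--plugin_config` JSON (see the cli_parser.cpp change below). A minimal sketch of how such fields could be merged into one plugin-config JSON string; the helper name and JSON keys are assumptions, not code from this PR:

```cpp
#include <cstdint>
#include <optional>
#include <sstream>
#include <string>

// Hypothetical helper, not from this PR: serializes the optional plugin
// settings into a JSON object. Key names mirror the CLI flags but are
// illustrative only; manualString handling is omitted for brevity.
std::string buildPluginConfigJson(
    const std::optional<std::string>& kvCachePrecision,
    const std::optional<uint32_t>& maxPromptLength,
    const std::optional<std::string>& modelDistributionPolicy,
    const std::optional<uint32_t>& numStreams,
    const std::optional<std::string>& cacheDir) {
    std::ostringstream json;
    json << "{";
    bool first = true;
    auto add = [&](const char* key, const std::string& value, bool quoted) {
        json << (first ? "" : ", ") << "\"" << key << "\": ";
        if (quoted)
            json << "\"" << value << "\"";
        else
            json << value;
        first = false;
    };
    if (kvCachePrecision)
        add("KV_CACHE_PRECISION", *kvCachePrecision, true);
    if (maxPromptLength)
        add("MAX_PROMPT_LEN", std::to_string(*maxPromptLength), false);
    if (modelDistributionPolicy)
        add("MODEL_DISTRIBUTION_POLICY", *modelDistributionPolicy, true);
    if (numStreams)
        add("NUM_STREAMS", std::to_string(*numStreams), false);
    if (cacheDir)
        add("CACHE_DIR", *cacheDir, true);
    json << "}";
    return json.str();
}
```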

struct TextGenGraphSettingsImpl {
std::string modelPath = "./";
std::string modelName = "";
uint32_t maxNumSeqs = 256;
std::string targetDevice = "CPU";
std::string enablePrefixCaching = "true";
uint32_t cacheSize = 10;
std::string dynamicSplitFuse = "true";
PluginConfigSettingsImpl pluginConfig;
std::optional<uint32_t> maxNumBatchedTokens;
std::optional<std::string> draftModelDirName;
std::optional<std::string> pipelineType;
@@ -111,27 +118,16 @@ struct TextGenGraphSettingsImpl {
};

struct EmbeddingsGraphSettingsImpl {
std::string modelPath = "./";
std::string targetDevice = "CPU";
std::string modelName = "";
uint32_t numStreams = 1;
std::string normalize = "true";
std::string truncate = "false";
std::string pooling = "CLS";
};

struct RerankGraphSettingsImpl {
std::string modelPath = "./";
std::string targetDevice = "CPU";
std::string modelName = "";
uint32_t numStreams = 1;
uint64_t maxAllowedChunks = 10000;
};

struct ImageGenerationGraphSettingsImpl {
std::string modelName = "";
std::string modelPath = "./";
std::string targetDevice = "CPU";
std::string resolution = "";
std::string maxResolution = "";
std::string defaultResolution = "";
@@ -140,13 +136,15 @@ struct ImageGenerationGraphSettingsImpl {
std::optional<uint32_t> maxNumberImagesPerPrompt;
std::optional<uint32_t> defaultNumInferenceSteps;
std::optional<uint32_t> maxNumInferenceSteps;
std::string pluginConfig;
};

struct ExportSettings {
std::string modelName = "";
std::string modelPath = "./";
std::string targetDevice = "CPU";
std::optional<std::string> extraQuantizationParams;
std::string precision = "int8";
PluginConfigSettingsImpl pluginConfig;
};

struct HFSettingsImpl {
src/cli_parser.cpp (4 changes: 4 additions & 0 deletions)

@@ -574,6 +574,7 @@ void CLIParser::prepareModel(ModelsSettingsImpl& modelsSettings, HFSettingsImpl&

if (result->count("plugin_config")) {
modelsSettings.pluginConfig = result->operator[]("plugin_config").as<std::string>();
hfSettings.exportSettings.pluginConfig.manualString = modelsSettings.pluginConfig;
modelsSettings.userSetSingleModelArguments.push_back("plugin_config");
}

@@ -684,6 +685,9 @@ void CLIParser::prepareGraph(ServerSettingsImpl& serverSettings, HFSettingsImpl&
throw std::logic_error("Tried to prepare graph settings without graph parser initialization");
}
}
if (!serverSettings.cacheDir.empty()) {
hfSettings.exportSettings.pluginConfig.cacheDir = serverSettings.cacheDir;
}
// No pull nor pull and start mode
} else {
if (result->count("weight-format")) {
src/config.cpp (37 changes: 19 additions & 18 deletions)

@@ -129,40 +129,41 @@ bool Config::validate() {
std::cerr << "Graph options not initialized for text generation.";
return false;
}
auto settings = std::get<TextGenGraphSettingsImpl>(this->serverSettings.hfSettings.graphSettings);
const auto& exportSettings = this->serverSettings.hfSettings.exportSettings;
auto textGenSettings = std::get<TextGenGraphSettingsImpl>(this->serverSettings.hfSettings.graphSettings);
std::vector allowedPipelineTypes = {"LM", "LM_CB", "VLM", "VLM_CB", "AUTO"};
if (settings.pipelineType.has_value() && std::find(allowedPipelineTypes.begin(), allowedPipelineTypes.end(), settings.pipelineType) == allowedPipelineTypes.end()) {
std::cerr << "pipeline_type: " << settings.pipelineType.value() << " is not allowed. Supported types: LM, LM_CB, VLM, VLM_CB, AUTO" << std::endl;
if (textGenSettings.pipelineType.has_value() && std::find(allowedPipelineTypes.begin(), allowedPipelineTypes.end(), textGenSettings.pipelineType) == allowedPipelineTypes.end()) {
std::cerr << "pipeline_type: " << textGenSettings.pipelineType.value() << " is not allowed. Supported types: LM, LM_CB, VLM, VLM_CB, AUTO" << std::endl;
return false;
}

std::vector allowedTargetDevices = {"CPU", "GPU", "NPU", "AUTO"};
bool validDeviceSelected = false;
if (settings.targetDevice.rfind("GPU.", 0) == 0) {
if (exportSettings.targetDevice.rfind("GPU.", 0) == 0) {
// Accept GPU.x where x is a number to select specific GPU card
std::string indexPart = settings.targetDevice.substr(4);
std::string indexPart = exportSettings.targetDevice.substr(4);
validDeviceSelected = !indexPart.empty() && std::all_of(indexPart.begin(), indexPart.end(), ::isdigit);
} else if (settings.targetDevice.rfind("HETERO", 0) == 0) {
// Accept HETERO:<device1>,<device2>,... to select specific devices in the list
} else if ((exportSettings.targetDevice.rfind("HETERO", 0) == 0) || (exportSettings.targetDevice.rfind("AUTO", 0) == 0)) {
// Accept HETERO:<device1>,<device2>,... AUTO:<device1>,<device2>,... to select specific devices in the list
validDeviceSelected = true;
} else if (std::find(allowedTargetDevices.begin(), allowedTargetDevices.end(), settings.targetDevice) != allowedTargetDevices.end()) {
} else if (std::find(allowedTargetDevices.begin(), allowedTargetDevices.end(), exportSettings.targetDevice) != allowedTargetDevices.end()) {
// Accept CPU, GPU, NPU, AUTO as valid devices
validDeviceSelected = true;
}

if (!validDeviceSelected) {
std::cerr << "target_device: " << settings.targetDevice << " is not allowed. Supported devices: CPU, GPU, NPU, HETERO, AUTO" << std::endl;
std::cerr << "target_device: " << exportSettings.targetDevice << " is not allowed. Supported devices: CPU, GPU, NPU, HETERO, AUTO" << std::endl;
return false;
}

std::vector allowedBoolValues = {"false", "true"};
if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), settings.enablePrefixCaching) == allowedBoolValues.end()) {
std::cerr << "enable_prefix_caching: " << settings.enablePrefixCaching << " is not allowed. Supported values: true, false" << std::endl;
if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), textGenSettings.enablePrefixCaching) == allowedBoolValues.end()) {
std::cerr << "enable_prefix_caching: " << textGenSettings.enablePrefixCaching << " is not allowed. Supported values: true, false" << std::endl;
return false;
}

if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), settings.dynamicSplitFuse) == allowedBoolValues.end()) {
std::cerr << "dynamic_split_fuse: " << settings.dynamicSplitFuse << " is not allowed. Supported values: true, false" << std::endl;
if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), textGenSettings.dynamicSplitFuse) == allowedBoolValues.end()) {
std::cerr << "dynamic_split_fuse: " << textGenSettings.dynamicSplitFuse << " is not allowed. Supported values: true, false" << std::endl;
return false;
}
}
@@ -172,16 +173,16 @@ bool Config::validate() {
std::cerr << "Graph options not initialized for embeddings.";
return false;
}
auto settings = std::get<EmbeddingsGraphSettingsImpl>(this->serverSettings.hfSettings.graphSettings);
auto embedSettings = std::get<EmbeddingsGraphSettingsImpl>(this->serverSettings.hfSettings.graphSettings);

std::vector allowedBoolValues = {"false", "true"};
if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), settings.normalize) == allowedBoolValues.end()) {
std::cerr << "normalize: " << settings.normalize << " is not allowed. Supported values: true, false" << std::endl;
if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), embedSettings.normalize) == allowedBoolValues.end()) {
std::cerr << "normalize: " << embedSettings.normalize << " is not allowed. Supported values: true, false" << std::endl;
return false;
}

if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), settings.truncate) == allowedBoolValues.end()) {
std::cerr << "truncate: " << settings.truncate << " is not allowed. Supported values: true, false" << std::endl;
if (std::find(allowedBoolValues.begin(), allowedBoolValues.end(), embedSettings.truncate) == allowedBoolValues.end()) {
std::cerr << "truncate: " << embedSettings.truncate << " is not allowed. Supported values: true, false" << std::endl;
return false;
}
}
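Aside (not part of the diff): the `target_device` rules above accept `CPU`, `GPU`, `NPU`, `AUTO`, `GPU.<index>`, and `HETERO:`/`AUTO:` device lists. A standalone sketch that re-implements the same checks, useful for seeing which strings pass:

```cpp
#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>
#include <vector>

// Standalone re-implementation of the target_device checks from
// Config::validate(), for illustration only.
bool isValidTargetDevice(const std::string& device) {
    static const std::vector<std::string> allowed = {"CPU", "GPU", "NPU", "AUTO"};
    if (device.rfind("GPU.", 0) == 0) {
        // GPU.x where x is a number selecting a specific GPU card
        std::string index = device.substr(4);
        return !index.empty() && std::all_of(index.begin(), index.end(), ::isdigit);
    }
    if (device.rfind("HETERO", 0) == 0 || device.rfind("AUTO", 0) == 0)
        return true;  // HETERO:<d1>,<d2>,... or AUTO:<d1>,<d2>,...
    return std::find(allowed.begin(), allowed.end(), device) != allowed.end();
}

int main() {
    for (const std::string d : {"CPU", "GPU.1", "HETERO:GPU,CPU", "AUTO:NPU,CPU", "TPU"})
        std::cout << d << " -> " << (isValidTargetDevice(d) ? "ok" : "rejected") << "\n";
}
```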
src/graph_export/BUILD (1 change: 0 additions & 1 deletion)

@@ -80,7 +80,6 @@ ovms_cc_library(
"@ovms//src:libovms_server_settings",
"@ovms//src:ovms_exit_codes",
"@com_github_jarro2783_cxxopts//:cxxopts",
"@com_github_tencent_rapidjson//:rapidjson",
],
visibility = ["//visibility:public"],
)
src/graph_export/embeddings_graph_cli_parser.cpp (8 changes: 4 additions & 4 deletions)

@@ -81,19 +81,19 @@ std::vector<std::string> EmbeddingsGraphCLIParser::parse(const std::vector<std::

void EmbeddingsGraphCLIParser::prepare(OvmsServerMode serverMode, HFSettingsImpl& hfSettings, const std::string& modelName) {
EmbeddingsGraphSettingsImpl embeddingsGraphSettings = EmbeddingsGraphCLIParser::defaultGraphSettings();
embeddingsGraphSettings.targetDevice = hfSettings.exportSettings.targetDevice;
hfSettings.exportSettings.targetDevice = hfSettings.exportSettings.targetDevice;
if (modelName != "") {
embeddingsGraphSettings.modelName = modelName;
hfSettings.exportSettings.modelName = modelName;
} else {
embeddingsGraphSettings.modelName = hfSettings.sourceModel;
hfSettings.exportSettings.modelName = hfSettings.sourceModel;
}
if (nullptr == result) {
// Pull with default arguments - no arguments from user
if (serverMode != HF_PULL_MODE && serverMode != HF_PULL_AND_START_MODE) {
throw std::logic_error("Tried to prepare server and model settings without graph parse result");
}
} else {
embeddingsGraphSettings.numStreams = result->operator[]("num_streams").as<uint32_t>();
hfSettings.exportSettings.pluginConfig.numStreams = result->operator[]("num_streams").as<uint32_t>();
embeddingsGraphSettings.normalize = result->operator[]("normalize").as<std::string>();
embeddingsGraphSettings.truncate = result->operator[]("truncate").as<std::string>();
embeddingsGraphSettings.pooling = result->operator[]("pooling").as<std::string>();
src/graph_export/graph_cli_parser.cpp (20 changes: 13 additions & 7 deletions)

@@ -89,7 +89,11 @@ void GraphCLIParser::createOptions() {
("kv_cache_precision",
"u8 or empty (model default). Reduced kv cache precision to u8 lowers the cache size consumption.",
cxxopts::value<std::string>()->default_value(""),
"KV_CACHE_PRECISION");
"KV_CACHE_PRECISION")
("model_distribution_policy",
"TENSOR_PARALLEL, PIPELINE_PARALLEL or empty (model default). Sets model distribution policy for inference with multiple sockets/devices.",
cxxopts::value<std::string>(),
"MODEL_DISTRIBUTION_POLICY");
}

void GraphCLIParser::printHelp() {
@@ -115,12 +119,12 @@ std::vector<std::string> GraphCLIParser::parse(const std::vector<std::string>& u

void GraphCLIParser::prepare(OvmsServerMode serverMode, HFSettingsImpl& hfSettings, const std::string& modelName) {
TextGenGraphSettingsImpl graphSettings = GraphCLIParser::defaultGraphSettings();
graphSettings.targetDevice = hfSettings.exportSettings.targetDevice;
hfSettings.exportSettings.targetDevice = hfSettings.exportSettings.targetDevice;
// Deduct model name
if (modelName != "") {
graphSettings.modelName = modelName;
hfSettings.exportSettings.modelName = modelName;
} else {
graphSettings.modelName = hfSettings.sourceModel;
hfSettings.exportSettings.modelName = hfSettings.sourceModel;
}

if (nullptr == result) {
@@ -153,11 +157,13 @@ void GraphCLIParser::prepare(OvmsServerMode serverMode, HFSettingsImpl& hfSettin

// Plugin configuration
if (result->count("max_prompt_len")) {
graphSettings.pluginConfig.maxPromptLength = result->operator[]("max_prompt_len").as<uint32_t>();
hfSettings.exportSettings.pluginConfig.maxPromptLength = result->operator[]("max_prompt_len").as<uint32_t>();
}
if (result->count("model_distribution_policy")) {
hfSettings.exportSettings.pluginConfig.modelDistributionPolicy = result->operator[]("model_distribution_policy").as<std::string>();
}

if (result->count("kv_cache_precision")) {
graphSettings.pluginConfig.kvCachePrecision = result->operator[]("kv_cache_precision").as<std::string>();
hfSettings.exportSettings.pluginConfig.kvCachePrecision = result->operator[]("kv_cache_precision").as<std::string>();
}
}

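Aside (not part of the diff): in the lines shown, the new `model_distribution_policy` value is stored without a value check. A possible follow-up validation, sketched under the assumption that only the two documented values should pass and that it runs inside `GraphCLIParser::prepare` with the surrounding `result` and `hfSettings` in scope:

```cpp
// Hypothetical validation, not in this PR; assumes <stdexcept> is available
// and that this block replaces the unvalidated assignment above.
if (result->count("model_distribution_policy")) {
    const auto policy = result->operator[]("model_distribution_policy").as<std::string>();
    if (policy != "TENSOR_PARALLEL" && policy != "PIPELINE_PARALLEL") {
        throw std::invalid_argument("model_distribution_policy: " + policy +
                                    " is not allowed. Supported values: TENSOR_PARALLEL, PIPELINE_PARALLEL");
    }
    hfSettings.exportSettings.pluginConfig.modelDistributionPolicy = policy;
}
```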