
Commit 8cda57a

ayushag-nv authored and KrishnanPrash committed
feat: [vLLM] implement cli args for tool and reasoning parsers (#2619)
1 parent 36ab6ca commit 8cda57a

13 files changed: 190 additions, 57 deletions

components/backends/vllm/src/dynamo/vllm/args.py

Lines changed: 19 additions & 0 deletions
@@ -59,6 +59,10 @@ class Config:
     # Connector list from CLI
     connector_list: Optional[list] = None

+    # tool and reasoning parser info
+    tool_call_parser: Optional[str] = None
+    reasoning_parser: Optional[str] = None
+

 def parse_args() -> Config:
     parser = FlexibleArgumentParser(
@@ -109,6 +113,19 @@ def parse_args() -> Config:
         help="List of connectors to use in order (e.g., --connector nixl lmcache). "
         "Options: nixl, lmcache, kvbm, null, none. Default: nixl. Order will be preserved in MultiConnector.",
     )
+    # To avoid name conflicts with different backends, adopted prefix "dyn-" for dynamo specific args
+    parser.add_argument(
+        "--dyn-tool-call-parser",
+        type=str,
+        default=None,
+        help="Tool call parser name for the model. Available options: 'hermes', 'nemotron_deci', 'llama3_json', 'mistral', 'phi4'.",
+    )
+    parser.add_argument(
+        "--dyn-reasoning-parser",
+        type=str,
+        default=None,
+        help="Reasoning parser name for the model.",
+    )

     parser = AsyncEngineArgs.add_cli_args(parser)
     args = parser.parse_args()
@@ -160,6 +177,8 @@ def parse_args() -> Config:
     )
     config.custom_jinja_template = args.custom_jinja_template

+    config.tool_call_parser = args.dyn_tool_call_parser
+    config.reasoning_parser = args.dyn_reasoning_parser
     # Check for conflicting flags
     has_kv_transfer_config = (
         hasattr(engine_args, "kv_transfer_config")
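
For readers following the flag plumbing above, here is a minimal, self-contained Python sketch of the same flow. It deliberately uses plain argparse and a stand-in Config dataclass rather than the real dynamo.vllm.args module and vLLM's FlexibleArgumentParser, so the names below are illustrative only:

import argparse
from dataclasses import dataclass
from typing import Optional


@dataclass
class Config:
    tool_call_parser: Optional[str] = None
    reasoning_parser: Optional[str] = None


parser = argparse.ArgumentParser()
# Prefixed with "dyn-" to avoid clashing with flags that vLLM itself defines.
parser.add_argument("--dyn-tool-call-parser", type=str, default=None)
parser.add_argument("--dyn-reasoning-parser", type=str, default=None)

# Simulate a worker launched with a tool-call parser but no reasoning parser.
args = parser.parse_args(["--dyn-tool-call-parser", "hermes"])

config = Config(
    tool_call_parser=args.dyn_tool_call_parser,  # "hermes"
    reasoning_parser=args.dyn_reasoning_parser,  # None
)
print(config)

On a real deployment the same two flags are simply appended to the vLLM worker launch command, e.g. --dyn-tool-call-parser hermes.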

components/backends/vllm/src/dynamo/vllm/main.py

Lines changed: 2 additions & 0 deletions
@@ -234,6 +234,8 @@ async def init(runtime: DistributedRuntime, config: Config):
     runtime_config.total_kv_blocks = runtime_values["num_gpu_blocks"]
     runtime_config.max_num_seqs = runtime_values["max_num_seqs"]
     runtime_config.max_num_batched_tokens = runtime_values["max_num_batched_tokens"]
+    runtime_config.tool_call_parser = config.tool_call_parser
+    runtime_config.reasoning_parser = config.reasoning_parser

     await register_llm(
         ModelType.Backend,

lib/bindings/python/rust/llm/local_model.rs

Lines changed: 20 additions & 0 deletions
@@ -34,6 +34,16 @@ impl ModelRuntimeConfig {
         self.inner.max_num_batched_tokens = Some(max_num_batched_tokens);
     }

+    #[setter]
+    fn set_tool_call_parser(&mut self, tool_call_parser: Option<String>) {
+        self.inner.tool_call_parser = tool_call_parser;
+    }
+
+    #[setter]
+    fn set_reasoning_parser(&mut self, reasoning_parser: Option<String>) {
+        self.inner.reasoning_parser = reasoning_parser;
+    }
+
     fn set_engine_specific(&mut self, key: &str, value: String) -> PyResult<()> {
         let value: serde_json::Value = serde_json::from_str(&value).map_err(to_pyerr)?;
         self.inner
@@ -57,6 +67,16 @@ impl ModelRuntimeConfig {
         self.inner.max_num_batched_tokens
     }

+    #[getter]
+    fn tool_call_parser(&self) -> Option<String> {
+        self.inner.tool_call_parser.clone()
+    }
+
+    #[getter]
+    fn reasoning_parser(&self) -> Option<String> {
+        self.inner.reasoning_parser.clone()
+    }
+
     #[getter]
     fn runtime_data(&self, py: Python<'_>) -> PyResult<PyObject> {
         let dict = PyDict::new(py);
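
Since these are PyO3 setters and getters, the new fields become plain attributes on the Python side. The snippet below is a hypothetical usage sketch: the import path and the no-argument constructor are assumptions about how the bindings are exposed, not something shown in this diff:

from dynamo.llm import ModelRuntimeConfig  # assumed import path for the bindings

runtime_config = ModelRuntimeConfig()       # assumes a default constructor exists
runtime_config.tool_call_parser = "hermes"  # routed through set_tool_call_parser
runtime_config.reasoning_parser = None      # routed through set_reasoning_parser

print(runtime_config.tool_call_parser)      # "hermes", via the matching getter

This mirrors what main.py above does with the values parsed from the CLI before calling register_llm.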

lib/llm/src/discovery/model_manager.rs

Lines changed: 12 additions & 0 deletions
@@ -246,6 +246,18 @@ impl ModelManager {
             .insert(model_name.to_string(), new_kv_chooser.clone());
         Ok(new_kv_chooser)
     }
+
+    pub fn get_model_tool_call_parser(&self, model: &str) -> Option<String> {
+        match self.entries.lock() {
+            Ok(entries) => entries
+                .values()
+                .find(|entry| entry.name == model)
+                .and_then(|entry| entry.runtime_config.as_ref())
+                .and_then(|config| config.tool_call_parser.clone())
+                .map(|parser| parser.to_string()),
+            Err(_) => None,
+        }
+    }
 }

 pub struct ModelEngines<E> {

lib/llm/src/http/service/openai.rs

Lines changed: 43 additions & 27 deletions
@@ -37,6 +37,7 @@ use crate::protocols::openai::{
     completions::{NvCreateCompletionRequest, NvCreateCompletionResponse},
     embeddings::{NvCreateEmbeddingRequest, NvCreateEmbeddingResponse},
     responses::{NvCreateResponse, NvResponse},
+    ParsingOptions,
 };
 use crate::request_template::RequestTemplate;
 use crate::types::Annotated;
@@ -194,6 +195,13 @@ fn get_or_create_request_id(primary: Option<&str>, headers: &HeaderMap) -> String
     uuid.to_string()
 }

+fn get_parsing_options(state: &Arc<service_v2::State>, model: &str) -> ParsingOptions {
+    let tool_call_parser = state.manager().get_model_tool_call_parser(model);
+    let reasoning_parser = None; // TODO: Implement reasoning parser
+
+    ParsingOptions::new(tool_call_parser, reasoning_parser)
+}
+
 /// OpenAI Completions Request Handler
 ///
 /// This method will handle the incoming request for the `/v1/completions endpoint`. The endpoint is a "source"
@@ -267,6 +275,8 @@ async fn completions(
         .get_completions_engine(model)
         .map_err(|_| ErrorMessage::model_not_found())?;

+    let parsing_options = get_parsing_options(&state, model);
+
     let mut inflight_guard =
         state
             .metrics_clone()
@@ -325,7 +335,7 @@ async fn completions(
         process_metrics_only(response, &mut response_collector);
     });

-    let response = NvCreateCompletionResponse::from_annotated_stream(stream)
+    let response = NvCreateCompletionResponse::from_annotated_stream(stream, parsing_options)
         .await
         .map_err(|e| {
             tracing::error!(
@@ -494,6 +504,8 @@ async fn chat_completions(
         .get_chat_completions_engine(model)
         .map_err(|_| ErrorMessage::model_not_found())?;

+    let parsing_options = get_parsing_options(&state, model);
+
     let mut inflight_guard =
         state
             .metrics_clone()
@@ -553,19 +565,20 @@ async fn chat_completions(
         process_metrics_only(response, &mut response_collector);
     });

-    let response = NvCreateChatCompletionResponse::from_annotated_stream(stream)
-        .await
-        .map_err(|e| {
-            tracing::error!(
-                request_id,
-                "Failed to fold chat completions stream for: {:?}",
-                e
-            );
-            ErrorMessage::internal_server_error(&format!(
-                "Failed to fold chat completions stream: {}",
-                e
-            ))
-        })?;
+    let response =
+        NvCreateChatCompletionResponse::from_annotated_stream(stream, parsing_options.clone())
+            .await
+            .map_err(|e| {
+                tracing::error!(
+                    request_id,
+                    "Failed to fold chat completions stream for: {:?}",
+                    e
+                );
+                ErrorMessage::internal_server_error(&format!(
+                    "Failed to fold chat completions stream: {}",
+                    e
+                ))
+            })?;

     inflight_guard.mark_ok();
     Ok(Json(response).into_response())
@@ -726,6 +739,8 @@ async fn responses(
         .get_chat_completions_engine(model)
         .map_err(|_| ErrorMessage::model_not_found())?;

+    let parsing_options = get_parsing_options(&state, model);
+
     let mut inflight_guard =
         state
             .metrics_clone()
@@ -742,19 +757,20 @@ async fn responses(
         .map_err(|e| ErrorMessage::from_anyhow(e, "Failed to generate completions"))?;

     // TODO: handle streaming, currently just unary
-    let response = NvCreateChatCompletionResponse::from_annotated_stream(stream)
-        .await
-        .map_err(|e| {
-            tracing::error!(
-                request_id,
-                "Failed to fold chat completions stream for: {:?}",
-                e
-            );
-            ErrorMessage::internal_server_error(&format!(
-                "Failed to fold chat completions stream: {}",
-                e
-            ))
-        })?;
+    let response =
+        NvCreateChatCompletionResponse::from_annotated_stream(stream, parsing_options.clone())
+            .await
+            .map_err(|e| {
+                tracing::error!(
+                    request_id,
+                    "Failed to fold chat completions stream for: {:?}",
+                    e
+                );
+                ErrorMessage::internal_server_error(&format!(
+                    "Failed to fold chat completions stream: {}",
+                    e
+                ))
+            })?;

     // Convert NvCreateChatCompletionResponse --> NvResponse
     let response: NvResponse = response.try_into().map_err(|e| {
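
The net effect for API clients: once a worker is started with --dyn-tool-call-parser, the frontend looks the parser up per model via get_parsing_options, and the folded /v1/chat/completions response can carry structured tool_calls instead of raw text. A hedged client-side illustration follows; the URL, port, model name, and tool schema are placeholders for an actual deployment:

import json

import requests

payload = {
    "model": "my-model",  # placeholder model name
    "messages": [{"role": "user", "content": "What's the weather in Paris?"}],
    "tools": [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Look up the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ],
}

# Placeholder endpoint; adjust host/port to wherever the HTTP frontend listens.
resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload)
message = resp.json()["choices"][0]["message"]
print(json.dumps(message.get("tool_calls"), indent=2))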

lib/llm/src/local_model.rs

Lines changed: 9 additions & 0 deletions
@@ -209,6 +209,7 @@ impl LocalModelBuilder {
         );
         card.migration_limit = self.migration_limit;
         card.user_data = self.user_data.take();
+
         return Ok(LocalModel {
             card,
             full_path: PathBuf::new(),
@@ -402,6 +403,7 @@ impl LocalModel {
         // Publish the Model Deployment Card to etcd
         let kvstore: Box<dyn KeyValueStore> = Box::new(EtcdStorage::new(etcd_client.clone()));
         let card_store = Arc::new(KeyValueStoreManager::new(kvstore));
+<<<<<<< HEAD
         let slug_key = self.card.slug();
         let key = slug_key.to_string();

@@ -436,6 +438,13 @@
                 .await?;
             }
         }
+=======
+        let key = self.card.slug().to_string();
+
+        card_store
+            .publish(model_card::ROOT_PATH, None, &key, &mut self.card)
+            .await?;
+>>>>>>> cbe854fc (feat: [vLLM] implement cli args for tool and reasoning parsers (#2619))

         // Publish our ModelEntry to etcd. This allows ingress to find the model card.
         // (Why don't we put the model card directly under this key?)

lib/llm/src/local_model/runtime_config.rs

Lines changed: 4 additions & 0 deletions
@@ -13,6 +13,10 @@ pub struct ModelRuntimeConfig {

     pub max_num_batched_tokens: Option<u64>,

+    pub tool_call_parser: Option<String>,
+
+    pub reasoning_parser: Option<String>,
+
     /// Mapping of engine-specific runtime configs
     #[serde(default, skip_serializing_if = "HashMap::is_empty")]
     pub runtime_data: HashMap<String, serde_json::Value>,

lib/llm/src/preprocessor.rs

Lines changed: 0 additions & 1 deletion
@@ -101,7 +101,6 @@ impl OpenAIPreprocessor {
         let mdcsum = mdc.mdcsum();
         let formatter = PromptFormatter::from_mdc(mdc.clone()).await?;
         let PromptFormatter::OAI(formatter) = formatter;
-
         let tokenizer = match &mdc.tokenizer {
             Some(TokenizerKind::HfTokenizerJson(file)) => HuggingFaceTokenizer::from_file(file)?,
             Some(TokenizerKind::GGUF(tokenizer)) => {

lib/llm/src/protocols/openai.rs

Lines changed: 16 additions & 0 deletions
@@ -193,3 +193,19 @@ pub trait DeltaGeneratorExt<ResponseType: Send + 'static + std::fmt::Debug>:
     /// Gets the current prompt token count (Input Sequence Length).
     fn get_isl(&self) -> Option<u32>;
 }
+
+#[derive(Clone, Debug, Serialize, Deserialize, Default)]
+pub struct ParsingOptions {
+    pub tool_call_parser: Option<String>,
+
+    pub reasoning_parser: Option<String>,
+}
+
+impl ParsingOptions {
+    pub fn new(tool_call_parser: Option<String>, reasoning_parser: Option<String>) -> Self {
+        Self {
+            tool_call_parser,
+            reasoning_parser,
+        }
+    }
+}
