
Commit 47d52bc

nachiketb-nvidia and hhzhang16 authored and committed
feat: enable --dyn-reasoning-parser flag to set reasoning parser for vllm deployments (#2700)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
1 parent 62259bb commit 47d52bc

File tree

8 files changed: +65 additions, −19 deletions

- components/backends/vllm/src/dynamo/vllm/args.py
- lib/llm/src/engines.rs
- lib/llm/src/local_model.rs
- lib/llm/src/model_card.rs
- lib/llm/src/preprocessor.rs
- lib/llm/src/protocols/openai/chat_completions/delta.rs
- lib/llm/tests/http-service.rs
- lib/parsers/src/reasoning/mod.rs

components/backends/vllm/src/dynamo/vllm/args.py

Lines changed: 1 addition & 1 deletion
@@ -117,7 +117,7 @@ def parse_args() -> Config:
         "--dyn-reasoning-parser",
         type=str,
         default=None,
-        help="Reasoning parser name for the model.",
+        help="Reasoning parser name for the model. Available options: 'basic', 'deepseek_r1', 'gpt_oss'.",
     )
 
     parser = AsyncEngineArgs.add_cli_args(parser)

lib/llm/src/engines.rs

Lines changed: 2 additions & 1 deletion
@@ -14,6 +14,7 @@ use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
 use dynamo_runtime::protocols::annotated::Annotated;
 
 use crate::backend::ExecutionContext;
+use crate::local_model::runtime_config;
 use crate::preprocessor::PreprocessedRequest;
 use crate::protocols::common::llm_backend::LLMEngineOutput;
 use crate::protocols::openai::{
@@ -183,7 +184,7 @@ impl
         incoming_request: SingleIn<NvCreateChatCompletionRequest>,
     ) -> Result<ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>, Error> {
         let (request, context) = incoming_request.transfer(());
-        let mut deltas = request.response_generator();
+        let mut deltas = request.response_generator(runtime_config::ModelRuntimeConfig::default());
         let ctx = context.context();
         let req = request.inner.messages.into_iter().next_back().unwrap();
 
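Call sites like this engine, which have no model deployment card in scope, satisfy the new signature by passing a default config. A minimal sketch of what that default implies, assuming `ModelRuntimeConfig::default()` leaves `reasoning_parser` unset and exposes it as a public `Option<String>` field (both are assumptions inferred from the later hunks, not confirmed by this diff):

```rust
use dynamo_llm::local_model::runtime_config::ModelRuntimeConfig;

fn main() {
    // With no parser name configured, downstream selection falls back to "basic"
    // (mirrors the `.as_deref().unwrap_or("basic")` chain in delta.rs below).
    let cfg = ModelRuntimeConfig::default();
    let parser_name = cfg.reasoning_parser.as_deref().unwrap_or("basic");
    println!("selected reasoning parser: {parser_name}");
}
```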
lib/llm/src/local_model.rs

Lines changed: 2 additions & 0 deletions
@@ -202,6 +202,7 @@ impl LocalModelBuilder {
         );
         card.migration_limit = self.migration_limit;
         card.user_data = self.user_data.take();
+        card.runtime_config = self.runtime_config.clone();
 
         return Ok(LocalModel {
             card,
@@ -276,6 +277,7 @@ impl LocalModelBuilder {
 
         card.migration_limit = self.migration_limit;
         card.user_data = self.user_data.take();
+        card.runtime_config = self.runtime_config.clone();
 
         Ok(LocalModel {
             card,

lib/llm/src/model_card.rs

Lines changed: 6 additions & 0 deletions
@@ -19,6 +19,7 @@ use std::path::{Path, PathBuf};
 use std::sync::Arc;
 use std::time::Duration;
 
+use crate::local_model::runtime_config::ModelRuntimeConfig;
 use anyhow::{Context, Result};
 use derive_builder::Builder;
 use dynamo_runtime::{slug::Slug, storage::key_value_store::Versioned, transports::nats};
@@ -137,6 +138,9 @@ pub struct ModelDeploymentCard {
     /// User-defined metadata for custom worker behavior
     #[serde(default, skip_serializing_if = "Option::is_none")]
     pub user_data: Option<serde_json::Value>,
+
+    #[serde(default)]
+    pub runtime_config: ModelRuntimeConfig,
 }
 
 impl ModelDeploymentCard {
@@ -441,6 +445,7 @@
             kv_cache_block_size: 0,
             migration_limit: 0,
             user_data: None,
+            runtime_config: ModelRuntimeConfig::default(),
         })
     }
 
@@ -482,6 +487,7 @@
             kv_cache_block_size: 0, // set later
             migration_limit: 0,
             user_data: None,
+            runtime_config: ModelRuntimeConfig::default(),
         })
     }
 }
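The `#[serde(default)]` attribute is what keeps previously published model deployment cards, which have no `runtime_config` key, deserializable after this change. A self-contained sketch of that pattern using hypothetical stand-in types (not the crate's own API):

```rust
use serde::{Deserialize, Serialize};

// Stand-in for ModelRuntimeConfig: any field type that implements Default works.
#[derive(Debug, Default, PartialEq, Serialize, Deserialize)]
struct RuntimeConfigStandIn {
    reasoning_parser: Option<String>,
}

#[derive(Debug, Serialize, Deserialize)]
struct CardStandIn {
    name: String,
    // Same pattern as the new field on ModelDeploymentCard.
    #[serde(default)]
    runtime_config: RuntimeConfigStandIn,
}

fn main() -> Result<(), serde_json::Error> {
    // A card serialized before this commit has no "runtime_config" key...
    let old_json = r#"{ "name": "my-model" }"#;
    let card: CardStandIn = serde_json::from_str(old_json)?;
    // ...but it still deserializes; the missing field is filled from Default.
    assert_eq!(card.runtime_config, RuntimeConfigStandIn::default());
    Ok(())
}
```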

lib/llm/src/preprocessor.rs

Lines changed: 6 additions & 1 deletion
@@ -22,6 +22,7 @@ use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
 use std::{collections::HashMap, sync::Arc};
 use tracing;
 
+use crate::local_model::runtime_config::ModelRuntimeConfig;
 use crate::model_card::{ModelDeploymentCard, ModelInfo, TokenizerKind};
 use crate::preprocessor::prompt::OAIChatLikeRequest;
 use crate::tokenizers::Encoding;
@@ -94,6 +95,7 @@ pub struct OpenAIPreprocessor {
     formatter: Arc<dyn OAIPromptFormatter>,
     tokenizer: Arc<dyn Tokenizer>,
     model_info: Arc<dyn ModelInfo>,
+    runtime_config: ModelRuntimeConfig,
 }
 
 impl OpenAIPreprocessor {
@@ -121,11 +123,14 @@ impl OpenAIPreprocessor {
         };
         let model_info = model_info.get_model_info().await?;
 
+        let runtime_config = mdc.runtime_config.clone();
+
         Ok(Arc::new(Self {
             formatter,
             tokenizer,
             model_info,
             mdcsum,
+            runtime_config,
         }))
     }
 
@@ -494,7 +499,7 @@ impl
         let (request, context) = request.into_parts();
 
         // create a response generator
-        let response_generator = request.response_generator();
+        let response_generator = request.response_generator(self.runtime_config.clone());
         let mut response_generator = Box::new(response_generator);
 
         // convert the chat completion request to a common completion request
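The preprocessor copies the card's runtime config once at construction time and hands each request its own clone, so per-request delta generators never reach back into the card. A stand-in sketch of that pattern (hypothetical types, not the crate's API):

```rust
#[derive(Clone, Default)]
struct RuntimeConfigStandIn {
    reasoning_parser: Option<String>,
}

struct PreprocessorStandIn {
    // Cached once when the preprocessor is built from the model deployment card.
    runtime_config: RuntimeConfigStandIn,
}

impl PreprocessorStandIn {
    fn config_for_request(&self) -> RuntimeConfigStandIn {
        // Each request gets its own clone, as in
        // `request.response_generator(self.runtime_config.clone())`.
        self.runtime_config.clone()
    }
}

fn main() {
    let pre = PreprocessorStandIn { runtime_config: RuntimeConfigStandIn::default() };
    let per_request_cfg = pre.config_for_request();
    assert!(per_request_cfg.reasoning_parser.is_none());
}
```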

lib/llm/src/protocols/openai/chat_completions/delta.rs

Lines changed: 15 additions & 4 deletions
@@ -5,6 +5,7 @@ use dynamo_parsers::{ParserResult, ReasoningParser, ReasoningParserType, ReasoningParserWrapper};
 
 use super::{NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse};
 use crate::{
+    local_model::runtime_config,
     protocols::common::{self},
     types::TokenIdType,
 };
@@ -15,11 +16,15 @@ impl NvCreateChatCompletionRequest {
     ///
     /// # Returns
     /// * [`DeltaGenerator`] configured with model name and response options.
-    pub fn response_generator(&self) -> DeltaGenerator {
+    pub fn response_generator(
+        &self,
+        runtime_config: runtime_config::ModelRuntimeConfig,
+    ) -> DeltaGenerator {
         let options = DeltaGeneratorOptions {
             enable_usage: true,
             enable_logprobs: self.inner.logprobs.unwrap_or(false)
                 || self.inner.top_logprobs.unwrap_or(0) > 0,
+            runtime_config,
         };
 
         DeltaGenerator::new(self.inner.model.clone(), options)
@@ -33,6 +38,8 @@ pub struct DeltaGeneratorOptions {
     pub enable_usage: bool,
     /// Determines whether log probabilities should be included in the response.
     pub enable_logprobs: bool,
+
+    pub runtime_config: runtime_config::ModelRuntimeConfig,
 }
 
 /// Generates incremental chat completion responses in a streaming fashion.
@@ -92,10 +99,14 @@ impl DeltaGenerator {
         // This is hardcoded for now, but can be made configurable later.
         // TODO: Make parser type configurable once front-end integration is determined
         // Change to GptOss to test GptOSS parser
-        let reasoning_parser_type = ReasoningParserType::Basic;
-
         // Reasoning parser wrapper
-        let reasoning_parser = reasoning_parser_type.get_reasoning_parser();
+        let reasoning_parser = ReasoningParserType::get_reasoning_parser_from_name(
+            options
+                .runtime_config
+                .reasoning_parser
+                .as_deref()
+                .unwrap_or("basic"),
+        );
 
         Self {
             id: format!("chatcmpl-{}", uuid::Uuid::new_v4()),
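With the options now carrying a runtime config, a worker started with `--dyn-reasoning-parser deepseek_r1` ends up with delta generators that stream through the DeepSeek-R1 parser. A rough sketch of wiring such a config by hand, assuming `reasoning_parser` is a public `Option<String>` field on `ModelRuntimeConfig` (an assumption based on the access pattern above):

```rust
use dynamo_llm::local_model::runtime_config::ModelRuntimeConfig;

fn main() {
    // Hypothetical manual wiring; in a real deployment the value comes from
    // the --dyn-reasoning-parser flag via the model deployment card.
    let mut cfg = ModelRuntimeConfig::default();
    cfg.reasoning_parser = Some("deepseek_r1".to_string());

    // Passing `cfg` to `request.response_generator(cfg)` makes DeltaGenerator::new
    // resolve the DeepSeek-R1 reasoning parser instead of the basic fallback.
    assert_eq!(cfg.reasoning_parser.as_deref().unwrap_or("basic"), "deepseek_r1");
}
```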

lib/llm/tests/http-service.rs

Lines changed: 17 additions & 12 deletions
@@ -16,17 +16,6 @@
 use anyhow::Error;
 use async_stream::stream;
 use dynamo_async_openai::config::OpenAIConfig;
-use dynamo_llm::http::{
-    client::{
-        GenericBYOTClient, HttpClientConfig, HttpRequestContext, NvCustomClient, PureOpenAIClient,
-    },
-    service::{
-        Metrics,
-        error::HttpError,
-        metrics::{Endpoint, FRONTEND_METRIC_PREFIX, RequestType, Status},
-        service_v2::HttpService,
-    },
-};
 use dynamo_llm::protocols::{
     Annotated,
     codec::SseLineCodec,
@@ -36,6 +25,21 @@ use dynamo_llm::protocols::{
         completions::{NvCreateCompletionRequest, NvCreateCompletionResponse},
     },
 };
+use dynamo_llm::{
+    http::{
+        client::{
+            GenericBYOTClient, HttpClientConfig, HttpRequestContext, NvCustomClient,
+            PureOpenAIClient,
+        },
+        service::{
+            Metrics,
+            error::HttpError,
+            metrics::{Endpoint, FRONTEND_METRIC_PREFIX, RequestType, Status},
+            service_v2::HttpService,
+        },
+    },
+    local_model::runtime_config,
+};
 use dynamo_runtime::{
     CancellationToken,
     engine::AsyncEngineContext,
@@ -95,7 +99,8 @@ impl
         let max_tokens = request.inner.max_tokens.unwrap_or(0) as u64;
 
         // let generator = NvCreateChatCompletionStreamResponse::generator(request.model.clone());
-        let mut generator = request.response_generator();
+        let mut generator =
+            request.response_generator(runtime_config::ModelRuntimeConfig::default());
 
         let stream = stream! {
             tokio::time::sleep(std::time::Duration::from_millis(max_tokens)).await;

lib/parsers/src/reasoning/mod.rs

Lines changed: 16 additions & 0 deletions
@@ -115,4 +115,20 @@ impl ReasoningParserType {
             },
         }
     }
+
+    pub fn get_reasoning_parser_from_name(name: &str) -> ReasoningParserWrapper {
+        tracing::debug!("Selected reasoning parser: {}", name);
+        match name.to_lowercase().as_str() {
+            "deepseek_r1" => Self::DeepseekR1.get_reasoning_parser(),
+            "basic" => Self::Basic.get_reasoning_parser(),
+            "gpt_oss" => Self::GptOss.get_reasoning_parser(),
+            _ => {
+                tracing::warn!(
+                    "Unknown reasoning parser type '{}', falling back to Basic Reasoning Parser",
+                    name
+                );
+                Self::Basic.get_reasoning_parser()
+            }
+        }
+    }
 }
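For reference, a small usage sketch of the new constructor; it assumes a crate that depends on `dynamo_parsers` and only exercises behavior visible in the hunk above (lowercased matching and the warn-and-fall-back path):

```rust
use dynamo_parsers::ReasoningParserType;

fn main() {
    // Names are lowercased before matching, so casing does not matter.
    let _deepseek = ReasoningParserType::get_reasoning_parser_from_name("DeepSeek_R1");
    let _gpt_oss = ReasoningParserType::get_reasoning_parser_from_name("gpt_oss");

    // Unknown names log a warning and fall back to the basic parser.
    let _fallback = ReasoningParserType::get_reasoning_parser_from_name("not-a-parser");
}
```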

0 commit comments
