@@ -11,7 +11,7 @@ use crate::{
1111 kv_router:: { KvPushRouter , KvRouter } ,
1212 migration:: Migration ,
1313 model_card:: ModelDeploymentCard ,
14- preprocessor:: OpenAIPreprocessor ,
14+ preprocessor:: { OpenAIPreprocessor , prompt :: PromptFormatter } ,
1515 protocols:: common:: llm_backend:: { BackendOutput , LLMEngineOutput , PreprocessedRequest } ,
1616 request_template:: RequestTemplate ,
1717 types:: {
@@ -131,10 +131,18 @@ pub async fn prepare_engine(
131131 None
132132 } ;
133133
134+ let hf_tokenizer = card. tokenizer_hf ( ) ?;
134135 let chat_engine = entrypoint:: build_routed_pipeline :: <
135136 NvCreateChatCompletionRequest ,
136137 NvCreateChatCompletionStreamResponse ,
137- > ( card, & client, router_mode, None , kv_chooser. clone ( ) )
138+ > (
139+ card,
140+ & client,
141+ router_mode,
142+ None ,
143+ kv_chooser. clone ( ) ,
144+ hf_tokenizer,
145+ )
138146 . await ?;
139147
140148 let service_name = local_model. service_name ( ) . to_string ( ) ;
@@ -167,7 +175,7 @@ pub async fn prepare_engine(
167175 let pipeline = build_pipeline :: <
168176 NvCreateChatCompletionRequest ,
169177 NvCreateChatCompletionStreamResponse ,
170- > ( model. card ( ) , inner_engine)
178+ > ( model. card ( ) , inner_engine, model . card ( ) . tokenizer_hf ( ) ? )
171179 . await ?;
172180
173181 let service_name = model. service_name ( ) . to_string ( ) ;
@@ -186,6 +194,7 @@ pub async fn prepare_engine(
186194pub async fn build_pipeline < Req , Resp > (
187195 card : & ModelDeploymentCard ,
188196 engine : ExecutionContext ,
197+ hf_tokenizer : tokenizers:: Tokenizer ,
189198) -> anyhow:: Result < Arc < ServiceFrontend < SingleIn < Req > , ManyOut < Annotated < Resp > > > > >
190199where
191200 Req : Data ,
@@ -198,10 +207,11 @@ where
198207 > ,
199208{
200209 let frontend = ServiceFrontend :: < SingleIn < Req > , ManyOut < Annotated < Resp > > > :: new ( ) ;
201- let preprocessor = OpenAIPreprocessor :: new ( ( * card) . clone ( ) )
202- . await ?
203- . into_operator ( ) ;
204- let backend = Backend :: from_mdc ( ( * card) . clone ( ) ) . await ?. into_operator ( ) ;
210+ let PromptFormatter :: OAI ( formatter) = PromptFormatter :: from_mdc ( card) ?;
211+ let preprocessor =
212+ OpenAIPreprocessor :: new_with_parts ( card. clone ( ) , formatter, hf_tokenizer. clone ( ) ) ?
213+ . into_operator ( ) ;
214+ let backend = Backend :: from_tokenizer ( hf_tokenizer) . into_operator ( ) ;
205215 let engine = ServiceBackend :: from_engine ( engine) ;
206216
207217 Ok ( frontend
@@ -219,6 +229,7 @@ pub async fn build_routed_pipeline<Req, Resp>(
219229 router_mode : RouterMode ,
220230 busy_threshold : Option < f64 > ,
221231 chooser : Option < Arc < KvRouter > > ,
232+ hf_tokenizer : tokenizers:: Tokenizer ,
222233) -> anyhow:: Result < ServiceEngine < SingleIn < Req > , ManyOut < Annotated < Resp > > > >
223234where
224235 Req : Data ,
@@ -230,14 +241,17 @@ where
230241 Pin < Box < dyn AsyncEngineStream < Annotated < BackendOutput > > > > ,
231242 > ,
232243{
233- let preprocessor = OpenAIPreprocessor :: new ( card. clone ( ) ) . await ?;
244+ let PromptFormatter :: OAI ( formatter) = PromptFormatter :: from_mdc ( card) ?;
245+ let preprocessor =
246+ OpenAIPreprocessor :: new_with_parts ( card. clone ( ) , formatter, hf_tokenizer. clone ( ) ) ?;
234247 build_routed_pipeline_with_preprocessor (
235248 card,
236249 client,
237250 router_mode,
238251 busy_threshold,
239252 chooser,
240253 preprocessor,
254+ hf_tokenizer,
241255 )
242256 . await
243257}
@@ -249,6 +263,7 @@ pub async fn build_routed_pipeline_with_preprocessor<Req, Resp>(
249263 busy_threshold : Option < f64 > ,
250264 chooser : Option < Arc < KvRouter > > ,
251265 preprocessor : Arc < OpenAIPreprocessor > ,
266+ hf_tokenizer : tokenizers:: Tokenizer ,
252267) -> anyhow:: Result < ServiceEngine < SingleIn < Req > , ManyOut < Annotated < Resp > > > >
253268where
254269 Req : Data ,
@@ -262,8 +277,8 @@ where
262277{
263278 let frontend = SegmentSource :: < SingleIn < Req > , ManyOut < Annotated < Resp > > > :: new ( ) ;
264279 let preprocessor_op = preprocessor. into_operator ( ) ;
265- let backend = Backend :: from_mdc ( card . clone ( ) ) . await ? . into_operator ( ) ;
266- let migration = Migration :: from_mdc ( card. clone ( ) ) . await ? . into_operator ( ) ;
280+ let backend = Backend :: from_tokenizer ( hf_tokenizer ) . into_operator ( ) ;
281+ let migration = Migration :: from_mdc ( card) . into_operator ( ) ;
267282 let router =
268283 PushRouter :: < PreprocessedRequest , Annotated < LLMEngineOutput > > :: from_client_with_threshold (
269284 client. clone ( ) ,
@@ -312,14 +327,14 @@ mod tests {
312327 #[ tokio:: test]
313328 async fn test_build_chat_completions_pipeline_core_engine_succeeds ( ) -> anyhow:: Result < ( ) > {
314329 // Create test model card
315- let card = ModelDeploymentCard :: load ( HF_PATH , None ) . await ?;
330+ let card = ModelDeploymentCard :: load ( HF_PATH , None ) ?;
316331 let engine = crate :: engines:: make_engine_core ( ) ;
317332
318333 // Build pipeline for chat completions
319334 let pipeline = build_pipeline :: <
320335 NvCreateChatCompletionRequest ,
321336 NvCreateChatCompletionStreamResponse ,
322- > ( & card, engine)
337+ > ( & card, engine, card . tokenizer_hf ( ) ? )
323338 . await ?;
324339
325340 // Verify pipeline was created
@@ -331,13 +346,16 @@ mod tests {
331346 #[ tokio:: test]
332347 async fn test_build_completions_pipeline_core_engine_succeeds ( ) -> anyhow:: Result < ( ) > {
333348 // Create test model card
334- let card = ModelDeploymentCard :: load ( HF_PATH , None ) . await ?;
349+ let card = ModelDeploymentCard :: load ( HF_PATH , None ) ?;
335350 let engine = crate :: engines:: make_engine_core ( ) ;
336351
337352 // Build pipeline for completions
338- let pipeline =
339- build_pipeline :: < NvCreateCompletionRequest , NvCreateCompletionResponse > ( & card, engine)
340- . await ?;
353+ let pipeline = build_pipeline :: < NvCreateCompletionRequest , NvCreateCompletionResponse > (
354+ & card,
355+ engine,
356+ card. tokenizer_hf ( ) ?,
357+ )
358+ . await ?;
341359
342360 // Verify pipeline was created
343361 assert ! ( Arc :: strong_count( & pipeline) >= 1 ) ;
0 commit comments