@@ -313,23 +313,22 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
313313 InstanceSource :: Dynamic ( _) => {
314314 // Extract context ID for request tracking
315315 let context_id = request. context ( ) . id ( ) . to_string ( ) ;
316-
317316 let ( instance_id, overlap_amount) = self
318317 . chooser
319318 . find_best_match ( & context_id, & request. token_ids )
320319 . await ?;
321- let has_worker_id_annotation = request. has_annotation ( "query_instance_id" ) ;
320+ let query_instance_id = request. has_annotation ( "query_instance_id" ) ;
322321 // Extract context information before moving the request
323322 let stream_context = request. context ( ) . clone ( ) ;
324323 // Update the request with the estimated prefix hit blocks
325324 let ( mut backend_input, context) = request. into_parts ( ) ;
326325 let isl = backend_input. token_ids . len ( ) ;
327326 backend_input. estimated_prefix_hit_num_blocks = Some ( overlap_amount) ;
328327 let updated_request = context. map ( |_| backend_input) ;
329- // Take this branch when request has the annotation "query_instance_id"
330- // "nvext": { "annotations": ["query_instance_id"]}}
331- // This is used to indicate that the request will be routed to a worker
332- if has_worker_id_annotation {
328+ // If the request has the annotation "query_instance_id", for example
329+ // curl -d '{... , "nvext": { "annotations": ["query_instance_id"]}}',
330+ // the request will not be routed to a worker immediately.
331+ if query_instance_id {
333332 let instance_id_str = instance_id. to_string ( ) ;
334333 let response =
335334 Annotated :: from_annotation ( "worker_instance_id" , & instance_id_str) ?;
0 commit comments