Skip to content

Commit 5aba407

Browse files
committed
feat: add nvext annotations to query worker_id from router and early exit
1 parent dc90da9 commit 5aba407

File tree

1 file changed

+5
-6
lines changed

1 file changed

+5
-6
lines changed

lib/llm/src/kv_router.rs

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -313,23 +313,22 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
313313
InstanceSource::Dynamic(_) => {
314314
// Extract context ID for request tracking
315315
let context_id = request.context().id().to_string();
316-
317316
let (instance_id, overlap_amount) = self
318317
.chooser
319318
.find_best_match(&context_id, &request.token_ids)
320319
.await?;
321-
let has_worker_id_annotation = request.has_annotation("query_instance_id");
320+
let query_instance_id = request.has_annotation("query_instance_id");
322321
// Extract context information before moving the request
323322
let stream_context = request.context().clone();
324323
// Update the request with the estimated prefix hit blocks
325324
let (mut backend_input, context) = request.into_parts();
326325
let isl = backend_input.token_ids.len();
327326
backend_input.estimated_prefix_hit_num_blocks = Some(overlap_amount);
328327
let updated_request = context.map(|_| backend_input);
329-
// Take this branch when request has the annotation "query_instance_id"
330-
// "nvext": { "annotations": ["query_instance_id"]}}
331-
// This is used to indicate that the request will be routed to a worker
332-
if has_worker_id_annotation {
328+
// If the request has the annotation "query_instance_id", for example
329+
// curl -d '{... ,"nvext": { "annotations": ["query_instance_id"]}}'
330+
// the request will not be routed to a worker immediately
331+
if query_instance_id {
333332
let instance_id_str = instance_id.to_string();
334333
let response =
335334
Annotated::from_annotation("worker_instance_id", &instance_id_str)?;

0 commit comments

Comments
 (0)