Skip to content

Commit 60ff1f6

Browse files
committed
feat: add nvext annotations to query worker_id from router and early exit
1 parent 1630f8b commit 60ff1f6

File tree

2 files changed

+17
-3
lines changed

2 files changed

+17
-3
lines changed

lib/llm/src/kv_router.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -245,11 +245,25 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
245245
InstanceSource::Dynamic(_) => {
246246
let (instance_id, overlap_amount) =
247247
self.chooser.find_best_match(&request.token_ids).await?;
248+
249+
let has_worker_id_annotation = request.has_annotation("query_instance_id");
250+
248251
// Update the request with the estimated prefix hit blocks
249252
let (mut backend_input, context) = request.into_parts();
250253
backend_input.estimated_prefix_hit_num_blocks = Some(overlap_amount);
251-
let updated_request = context.map(|_| backend_input);
252-
self.inner.direct(updated_request, instance_id).await
254+
// Take this branch when request has the annotation "query_instance_id"
255+
// "nvext": { "annotations": ["query_instance_id"] }
256+
// In that case, reply with the id of the worker the router selected and exit early, without dispatching the request to that worker
257+
if has_worker_id_annotation {
258+
let instance_id_str = instance_id.to_string();
259+
let response =
260+
Annotated::from_annotation("worker_instance_id", &instance_id_str)?;
261+
let stream = stream::iter(vec![response]);
262+
Ok(ResponseStream::new(Box::pin(stream), context.context()))
263+
} else {
264+
let updated_request = context.map(|_| backend_input);
265+
self.inner.direct(updated_request, instance_id).await
266+
}
253267
}
254268
}
255269
}

lib/llm/src/preprocessor.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -412,8 +412,8 @@ impl OpenAIPreprocessor {
412412
// Only set event if not already set to avoid overriding existing events (like errors)
413413
if response.event.is_none() {
414414
response.event = metrics_annotated.event;
415+
response.comment = metrics_annotated.comment;
415416
}
416-
response.comment = metrics_annotated.comment;
417417
}
418418

419419
tracing::trace!(

0 commit comments

Comments
 (0)