
Commit bb9a01c

align OpenAI response IDs with distributed trace IDs
1 parent fb10ffb · commit bb9a01c

8 files changed: +29 −25 lines

components/backends/sglang/src/dynamo/sglang/common/sgl_utils.py

Lines changed: 1 addition & 1 deletion

@@ -68,7 +68,7 @@ def setup_native_endpoints(server_args, component, handler):
     """Setup sgl native endpoints"""
     # flush cache
     flush_endpoint = component.endpoint("flush_cache")
-    tasks = []
+    tasks = []
     tasks.append(flush_endpoint.serve_endpoint(handler.flush_cache))

     # expert distribution endpoints

lib/engines/mistralrs/src/lib.rs

Lines changed: 3 additions & 3 deletions

@@ -339,7 +339,7 @@ impl
             n_choices: 1,
             dry_params: det.dry_params,
         };
-        let request_id = self.mistralrs.next_request_id();
+        let request_id = ctx.id().to_string();
         let mistralrs_request = Request::Normal(Box::new(NormalRequest {
             id: request_id,
             model_id: Some(self.display_name.clone()),

@@ -486,7 +486,7 @@ impl
         let (request, context) = request.transfer(());
         let ctx = context.context();
         let (tx, mut rx) = channel(10_000);
-        let response_generator = request.response_generator();
+        let response_generator = request.response_generator(ctx.id().to_string());

         let messages = RequestMessage::Completion {
             text: prompt_to_string(&request.inner.prompt),

@@ -540,7 +540,7 @@ impl
             dry_params: det.dry_params,
         };

-        let request_id = self.mistralrs.next_request_id();
+        let request_id = ctx.id().to_string();
         let mistralrs_request = Request::Normal(Box::new(NormalRequest {
             id: request_id,
             model_id: Some(self.display_name.clone()),
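Net effect in this file: the engine no longer asks mistral.rs for a fresh internal request ID but reuses the ID already carried by the request context, so the ID that surfaces in the OpenAI response can be matched against the distributed trace. A minimal sketch of the pattern, using hypothetical stand-in types (Ctx and Engine here are not the real dynamo or mistral.rs types):

// Hypothetical stand-ins that illustrate the ID-propagation change.
struct Ctx {
    id: String,
}

impl Ctx {
    fn id(&self) -> &str {
        &self.id
    }
}

struct Engine {
    counter: u64,
}

impl Engine {
    // Old behavior: every call minted a fresh ID unrelated to the trace.
    fn next_request_id(&mut self) -> String {
        self.counter += 1;
        format!("req-{}", self.counter)
    }
}

fn main() {
    let ctx = Ctx { id: "trace-1234".to_string() };
    let mut engine = Engine { counter: 0 };

    let old_id = engine.next_request_id(); // "req-1", uncorrelated
    let new_id = ctx.id().to_string();     // "trace-1234", trace-aligned

    assert_ne!(old_id, new_id);
    assert_eq!(new_id, "trace-1234");
}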

lib/llm/src/engines.rs

Lines changed: 2 additions & 2 deletions

@@ -183,8 +183,8 @@ impl
         incoming_request: SingleIn<NvCreateChatCompletionRequest>,
     ) -> Result<ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>, Error> {
         let (request, context) = incoming_request.transfer(());
-        let deltas = request.response_generator();
         let ctx = context.context();
+        let deltas = request.response_generator(ctx.id().to_string());
         let req = request.inner.messages.into_iter().next_back().unwrap();

         let prompt = match req {

@@ -236,8 +236,8 @@ impl
         incoming_request: SingleIn<NvCreateCompletionRequest>,
     ) -> Result<ManyOut<Annotated<NvCreateCompletionResponse>>, Error> {
         let (request, context) = incoming_request.transfer(());
-        let deltas = request.response_generator();
         let ctx = context.context();
+        let deltas = request.response_generator(ctx.id().to_string());
         let chars_string = prompt_to_string(&request.inner.prompt);
         let output = stream! {
             let mut id = 1;
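The swapped line order in each hunk is not cosmetic: response_generator now takes the request ID as an argument, so the context handle must be bound before the generator is built. A sketch of that dependency under assumed, simplified types (none of these are the actual dynamo definitions):

// Illustrative types only; the real SingleIn/ManyOut plumbing is omitted.
struct Request;

struct DeltaGenerator {
    id: String,
}

impl Request {
    // New signature: the caller supplies the ID instead of a random UUID.
    fn response_generator(&self, request_id: String) -> DeltaGenerator {
        DeltaGenerator { id: request_id }
    }
}

struct Context {
    id: String,
}

impl Context {
    fn id(&self) -> &str {
        &self.id
    }
}

fn handle(request: Request, context: Context) -> DeltaGenerator {
    // The context must be in scope first, which is why the original
    // `let deltas = ...` line moved below `let ctx = ...` in this diff.
    let ctx = &context;
    request.response_generator(ctx.id().to_string())
}

fn main() {
    let deltas = handle(Request, Context { id: "trace-42".into() });
    assert_eq!(deltas.id, "trace-42");
}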

lib/llm/src/http/service/openai.rs

Lines changed: 2 additions & 8 deletions

@@ -215,8 +215,7 @@ async fn completions(
     // return a 503 if the service is not ready
     check_ready(&state)?;

-    // todo - extract distributed tracing id and context id from headers
-    let request_id = uuid::Uuid::new_v4().to_string();
+    let request_id = request.id().to_string();

     // todo - decide on default
     let streaming = request.inner.stream.unwrap_or(false);

@@ -319,8 +318,7 @@ async fn embeddings(
     // return a 503 if the service is not ready
     check_ready(&state)?;

-    // todo - extract distributed tracing id and context id from headers
-    let request_id = uuid::Uuid::new_v4().to_string();
+    let request_id = request.id().to_string();

     // Embeddings are typically not streamed, so we default to non-streaming
     let streaming = false;

@@ -341,10 +339,6 @@ async fn embeddings(
         .metrics_clone()
         .create_inflight_guard(model, Endpoint::Embeddings, streaming);

-    // setup context
-    // todo - inherit request_id from distributed trace details
-    let request = Context::with_id(request, request_id.clone());
-
     // issue the generate call on the engine
     let stream = engine
         .generate(request)
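With this change the HTTP handlers stop minting a v4 UUID per request and instead read the ID that the incoming request context already carries; the manual Context::with_id re-wrap in the embeddings path becomes redundant and is deleted. A hedged sketch of the idea, using an invented Context wrapper rather than dynamo's real one:

// Invented wrapper type for illustration; not dynamo's actual Context.
struct Context<T> {
    id: String,
    inner: T,
}

impl<T> Context<T> {
    fn with_id(inner: T, id: String) -> Self {
        Context { id, inner }
    }

    fn id(&self) -> &str {
        &self.id
    }
}

struct EmbeddingRequest {
    input: String,
}

fn embeddings(request: Context<EmbeddingRequest>) -> String {
    // Old: let request_id = uuid::Uuid::new_v4().to_string();
    // New: reuse the ID the request already arrived with.
    let request_id = request.id().to_string();
    format!("embedding '{}' under request id {}", request.inner.input, request_id)
}

fn main() {
    let request = Context::with_id(
        EmbeddingRequest { input: "hello".to_string() },
        "trace-7f3a".to_string(),
    );
    println!("{}", embeddings(request));
}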

lib/llm/src/preprocessor.rs

Lines changed: 2 additions & 2 deletions

@@ -493,7 +493,7 @@ impl
         let (request, context) = request.into_parts();

         // create a response generator
-        let response_generator = request.response_generator();
+        let response_generator = request.response_generator(context.id().to_string());
         let mut response_generator = Box::new(response_generator);

         // convert the chat completion request to a common completion request

@@ -551,7 +551,7 @@ impl
         let (request, context) = request.into_parts();

         // create a response generator
-        let response_generator = request.response_generator();
+        let response_generator = request.response_generator(context.id().to_string());
         let mut response_generator = Box::new(response_generator);
         // convert the chat completion request to a common completion request
         let (common_request, annotations) = self.preprocess_request(&request)?;

lib/llm/src/protocols/openai/chat_completions/delta.rs

Lines changed: 11 additions & 4 deletions

@@ -18,21 +18,25 @@ use crate::{
     protocols::common::{self},
     types::TokenIdType,
 };
+use dynamo_runtime::logging;

 /// Provides a method for generating a [`DeltaGenerator`] from a chat completion request.
 impl NvCreateChatCompletionRequest {
     /// Creates a [`DeltaGenerator`] instance based on the chat completion request.
     ///
+    /// # Arguments
+    /// * `request_id` - The request ID to use for the chat completion response ID.
+    ///
     /// # Returns
     /// * [`DeltaGenerator`] configured with model name and response options.
-    pub fn response_generator(&self) -> DeltaGenerator {
+    pub fn response_generator(&self, request_id: String) -> DeltaGenerator {
         let options = DeltaGeneratorOptions {
             enable_usage: true,
             enable_logprobs: self.inner.logprobs.unwrap_or(false)
                 || self.inner.top_logprobs.unwrap_or(0) > 0,
         };

-        DeltaGenerator::new(self.inner.model.clone(), options)
+        DeltaGenerator::new(self.inner.model.clone(), options, request_id)
     }
 }

@@ -74,10 +78,11 @@ impl DeltaGenerator {
     /// # Arguments
     /// * `model` - The model name used for response generation.
     /// * `options` - Configuration options for enabling usage and log probabilities.
+    /// * `request_id` - The request ID to use for the chat completion response.
     ///
     /// # Returns
     /// * A new instance of [`DeltaGenerator`].
-    pub fn new(model: String, options: DeltaGeneratorOptions) -> Self {
+    pub fn new(model: String, options: DeltaGeneratorOptions, request_id: String) -> Self {
         let now = std::time::SystemTime::now()
             .duration_since(std::time::UNIX_EPOCH)
             .unwrap()

@@ -95,8 +100,10 @@ impl DeltaGenerator {
             completion_tokens_details: None,
         };

+        let chatcmpl_id = format!("chatcmpl-{}", request_id);
+
         Self {
-            id: format!("chatcmpl-{}", uuid::Uuid::new_v4()),
+            id: chatcmpl_id,
             object: "chat.completion.chunk".to_string(),
             created: now,
             model,
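The observable change for clients: the streamed chat response id is now chatcmpl-<request id> instead of chatcmpl-<random UUID>, a deterministic function of the trace-aligned request ID. A reduced sketch of the constructor (field set trimmed; these are not the full dynamo structs):

// Trimmed illustration of the new deterministic ID construction.
struct DeltaGeneratorOptions {
    enable_usage: bool,
}

struct DeltaGenerator {
    id: String,
    object: String,
    model: String,
}

impl DeltaGenerator {
    fn new(model: String, _options: DeltaGeneratorOptions, request_id: String) -> Self {
        // Derive the OpenAI-style ID from the caller's request ID so that
        // chatcmpl-<trace id> lines up with the distributed trace.
        let chatcmpl_id = format!("chatcmpl-{}", request_id);
        Self {
            id: chatcmpl_id,
            object: "chat.completion.chunk".to_string(),
            model,
        }
    }
}

fn main() {
    let generator = DeltaGenerator::new(
        "llama-3".to_string(),
        DeltaGeneratorOptions { enable_usage: true },
        "trace-abc123".to_string(),
    );
    assert_eq!(generator.id, "chatcmpl-trace-abc123");
}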

lib/llm/src/protocols/openai/completions/delta.rs

Lines changed: 7 additions & 4 deletions

@@ -15,17 +15,18 @@

 use super::{NvCreateCompletionRequest, NvCreateCompletionResponse};
 use crate::{protocols::common, types::TokenIdType};
+use dynamo_runtime::logging;

 impl NvCreateCompletionRequest {
     // put this method on the request
     // inspect the request to extract options
-    pub fn response_generator(&self) -> DeltaGenerator {
+    pub fn response_generator(&self, request_id: String) -> DeltaGenerator {
         let options = DeltaGeneratorOptions {
             enable_usage: true,
             enable_logprobs: self.inner.logprobs.unwrap_or(0) > 0,
         };

-        DeltaGenerator::new(self.inner.model.clone(), options)
+        DeltaGenerator::new(self.inner.model.clone(), options, request_id)
     }
 }

@@ -47,7 +48,7 @@ pub struct DeltaGenerator {
 }

 impl DeltaGenerator {
-    pub fn new(model: String, options: DeltaGeneratorOptions) -> Self {
+    pub fn new(model: String, options: DeltaGeneratorOptions, request_id: String) -> Self {
         let now = std::time::SystemTime::now()
             .duration_since(std::time::UNIX_EPOCH)
             .unwrap()

@@ -67,8 +68,10 @@ impl DeltaGenerator {
             prompt_tokens_details: None,
         };

+        let completion_id = format!("cmpl-{}", request_id);
+
         Self {
-            id: format!("cmpl-{}", uuid::Uuid::new_v4()),
+            id: completion_id,
             object: "text_completion".to_string(),
             created: now,
             model,
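Same pattern for the legacy completions endpoint, with the cmpl- prefix; note that the options are still derived from the request itself, and that logprobs is a count here rather than a bool. A self-contained sketch with simplified stand-in types (the real request wraps an inner OpenAI type):

// Simplified stand-ins; not the actual dynamo request/response structs.
struct NvCreateCompletionRequestSketch {
    model: String,
    logprobs: Option<u8>,
}

struct DeltaGeneratorOptions {
    enable_usage: bool,
    enable_logprobs: bool,
}

struct DeltaGenerator {
    id: String,
    model: String,
    options: DeltaGeneratorOptions,
}

impl NvCreateCompletionRequestSketch {
    fn response_generator(&self, request_id: String) -> DeltaGenerator {
        let options = DeltaGeneratorOptions {
            enable_usage: true,
            // For completions, logprobs is a count: any value > 0 enables them.
            enable_logprobs: self.logprobs.unwrap_or(0) > 0,
        };
        DeltaGenerator {
            id: format!("cmpl-{}", request_id),
            model: self.model.clone(),
            options,
        }
    }
}

fn main() {
    let request = NvCreateCompletionRequestSketch {
        model: "llama-3".to_string(),
        logprobs: Some(2),
    };
    let generator = request.response_generator("trace-xyz".to_string());
    assert_eq!(generator.id, "cmpl-trace-xyz");
    assert!(generator.options.enable_logprobs);
}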

lib/llm/tests/http-service.rs

Lines changed: 1 addition & 1 deletion

@@ -92,7 +92,7 @@ impl
         let max_tokens = request.inner.max_tokens.unwrap_or(0) as u64;

         // let generator = NvCreateChatCompletionStreamResponse::generator(request.model.clone());
-        let generator = request.response_generator();
+        let generator = request.response_generator(ctx.id().to_string());

         let stream = stream! {
            tokio::time::sleep(std::time::Duration::from_millis(max_tokens)).await;
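Because the response ID is now a pure function of the context ID, tests can recover the originating request from a response ID by stripping the prefix. A small sketch of that round-trip property (helper names are illustrative, not part of the actual test suite):

// Illustrative helpers mirroring the new deterministic ID scheme.
fn response_id_for(request_id: &str) -> String {
    format!("chatcmpl-{}", request_id)
}

fn request_id_from(response_id: &str) -> Option<&str> {
    response_id.strip_prefix("chatcmpl-")
}

fn main() {
    let ctx_id = "b7e23ec2-9054-41c3-8a4e-261f1c2a3b77";
    let response_id = response_id_for(ctx_id);

    // Round-trip: the trace ID is recoverable from the response ID.
    assert_eq!(request_id_from(&response_id), Some(ctx_id));
}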
