Commit 6e47f99

nachiketb-nvidia authored and nv-anants committed
chore: remove flatten for chat response types, add reasoning_content (#2543)
Change the chat completion response objects from wrapper structs to type aliases of the dynamo_async_openai types, implement the aggregator traits for the chat completion structs, and add reasoning_content under the message and delta message types in lib/async-openai.
1 parent 786d6a3 commit 6e47f99
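
The practical effect at call sites is one less level of indirection. A minimal sketch, assuming only the fields visible in the diffs below (the helper name `first_token` is hypothetical):

use dynamo_async_openai::types::CreateChatCompletionStreamResponse;

// After this commit, NvCreateChatCompletionStreamResponse is an alias
// for the type below, so the two names are interchangeable here.
fn first_token(data: &CreateChatCompletionStreamResponse) -> Option<&str> {
    // Before the change this read: data.inner.choices.first()
    data.choices.first()?.delta.content.as_deref()
}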

15 files changed: +127 −140 lines

.github/workflows/docs-link-check.yml

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@ jobs:
           # Set GITHUB_TOKEN to avoid github rate limits on URL checks
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
+          cd docs
           set -euo pipefail
           # Run lychee against all files in repo
           lychee \

lib/async-openai/src/types/chat.rs

Lines changed: 6 additions & 0 deletions
@@ -449,6 +449,9 @@ pub struct ChatCompletionResponseMessage {
     /// If the audio output modality is requested, this object contains data about the audio response from the model. [Learn more](https://platform.openai.com/docs/guides/audio).
     #[serde(skip_serializing_if = "Option::is_none")]
     pub audio: Option<ChatCompletionResponseMessageAudio>,
+
+    /// NVIDIA-specific extensions for the chat completion response.
+    pub reasoning_content: Option<String>,
 }

 #[derive(Clone, Serialize, Default, Debug, Deserialize, Builder, PartialEq)]
@@ -1021,6 +1024,9 @@ pub struct ChatCompletionStreamResponseDelta {
     pub role: Option<Role>,
     /// The refusal message generated by the model.
     pub refusal: Option<String>,
+
+    /// NVIDIA-specific extensions for the chat completion response.
+    pub reasoning_content: Option<String>,
 }

 #[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
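
A minimal sketch (not part of this diff) of how downstream code might read the new field; the helper `visible_text` is hypothetical, and only the `content` and `reasoning_content` fields, both `Option<String>`, are assumed:

use dynamo_async_openai::types::ChatCompletionResponseMessage;

// Hypothetical helper: prefer the final answer, fall back to the
// reasoning trace when no regular content was produced.
fn visible_text(message: &ChatCompletionResponseMessage) -> Option<&str> {
    message
        .content
        .as_deref()
        .or(message.reasoning_content.as_deref())
}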

lib/engines/mistralrs/src/lib.rs

Lines changed: 2 additions & 2 deletions
@@ -396,7 +396,7 @@ impl
         //tracing::trace!("from_assistant: {from_assistant}");

         #[allow(deprecated)]
-        let inner = dynamo_async_openai::types::CreateChatCompletionStreamResponse{
+        let delta = NvCreateChatCompletionStreamResponse {
             id: c.id,
             choices: vec![dynamo_async_openai::types::ChatChoiceStream{
                 index: 0,
@@ -407,6 +407,7 @@ impl
                 tool_calls: None,
                 refusal: None,
                 function_call: None,
+                reasoning_content: None,
             },
             logprobs: None,
             finish_reason,
@@ -418,7 +419,6 @@ impl
             system_fingerprint: Some(c.system_fingerprint),
             service_tier: None,
         };
-        let delta = NvCreateChatCompletionStreamResponse{inner};
         let ann = Annotated{
             id: None,
             data: Some(delta),

lib/llm/src/engines.rs

Lines changed: 2 additions & 8 deletions
@@ -204,18 +204,12 @@ impl
             for c in prompt.chars() {
                 // we are returning characters not tokens, so there will be some postprocessing overhead
                 tokio::time::sleep(*TOKEN_ECHO_DELAY).await;
-                let inner = deltas.create_choice(0, Some(c.to_string()), None, None);
-                let response = NvCreateChatCompletionStreamResponse {
-                    inner,
-                };
+                let response = deltas.create_choice(0, Some(c.to_string()), None, None);
                 yield Annotated{ id: Some(id.to_string()), data: Some(response), event: None, comment: None };
                 id += 1;
             }

-            let inner = deltas.create_choice(0, None, Some(dynamo_async_openai::types::FinishReason::Stop), None);
-            let response = NvCreateChatCompletionStreamResponse {
-                inner,
-            };
+            let response = deltas.create_choice(0, None, Some(dynamo_async_openai::types::FinishReason::Stop), None);
             yield Annotated { id: Some(id.to_string()), data: Some(response), event: None, comment: None };
         };
lib/llm/src/entrypoint/input/batch.rs

Lines changed: 1 addition & 1 deletion
@@ -233,7 +233,7 @@ async fn evaluate(
         match (item.data.as_ref(), item.event.as_deref()) {
             (Some(data), _) => {
                 // Normal case
-                let choice = data.inner.choices.first();
+                let choice = data.choices.first();
                 let chat_comp = choice.as_ref().unwrap();
                 if let Some(c) = &chat_comp.delta.content {
                     output += c;

lib/llm/src/entrypoint/input/text.rs

Lines changed: 1 addition & 1 deletion
@@ -143,7 +143,7 @@ async fn main_loop(
         match (item.data.as_ref(), item.event.as_deref()) {
             (Some(data), _) => {
                 // Normal case
-                let entry = data.inner.choices.first();
+                let entry = data.choices.first();
                 let chat_comp = entry.as_ref().unwrap();
                 if let Some(c) = &chat_comp.delta.content {
                     let _ = stdout.write(c.as_bytes());

lib/llm/src/http/service/openai.rs

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ use super::{
     service_v2, RouteDoc,
 };
 use crate::preprocessor::LLMMetricAnnotation;
+use crate::protocols::openai::chat_completions::aggregator::ChatCompletionAggregator;
 use crate::protocols::openai::{
     chat_completions::{NvCreateChatCompletionRequest, NvCreateChatCompletionResponse},
     completions::{NvCreateCompletionRequest, NvCreateCompletionResponse},

lib/llm/src/perf/logprobs.rs

Lines changed: 12 additions & 18 deletions
@@ -128,7 +128,7 @@ impl LogprobExtractor for NvCreateChatCompletionStreamResponse {
     fn extract_logprobs_by_choice(&self) -> HashMap<u32, Vec<TokenLogProbs>> {
         let mut result = HashMap::new();

-        for choice in &self.inner.choices {
+        for choice in &self.choices {
             let choice_index = choice.index;

             let choice_logprobs = choice
@@ -574,8 +574,7 @@ mod tests {
     use approx::assert_abs_diff_eq;
     use dynamo_async_openai::types::{
         ChatChoiceLogprobs, ChatChoiceStream, ChatCompletionStreamResponseDelta,
-        ChatCompletionTokenLogprob, CreateChatCompletionStreamResponse, FinishReason, Role,
-        TopLogprobs,
+        ChatCompletionTokenLogprob, FinishReason, Role, TopLogprobs,
     };
     use futures::StreamExt;
     use std::sync::Arc;
@@ -949,7 +948,7 @@ mod tests {
         token_logprobs: Vec<ChatCompletionTokenLogprob>,
     ) -> NvCreateChatCompletionStreamResponse {
         #[expect(deprecated)]
-        let inner = CreateChatCompletionStreamResponse {
+        NvCreateChatCompletionStreamResponse {
             id: "test_id".to_string(),
             choices: vec![ChatChoiceStream {
                 index: 0,
@@ -959,6 +958,7 @@ mod tests {
                     tool_calls: None,
                     role: Some(Role::Assistant),
                     refusal: None,
+                    reasoning_content: None,
                 },
                 finish_reason: Some(FinishReason::Stop),
                 logprobs: Some(ChatChoiceLogprobs {
@@ -972,9 +972,7 @@ mod tests {
             system_fingerprint: None,
             object: "chat.completion.chunk".to_string(),
             usage: None,
-        };
-
-        NvCreateChatCompletionStreamResponse { inner }
+        }
     }

     fn create_mock_response_with_multiple_choices(
@@ -992,6 +990,7 @@ mod tests {
                     tool_calls: None,
                     role: Some(Role::Assistant),
                     refusal: None,
+                    reasoning_content: None,
                 },
                 finish_reason: Some(FinishReason::Stop),
                 logprobs: Some(ChatChoiceLogprobs {
@@ -1001,7 +1000,7 @@ mod tests {
             })
             .collect();

-        let inner = CreateChatCompletionStreamResponse {
+        NvCreateChatCompletionStreamResponse {
             id: "test_id".to_string(),
             choices,
             created: 1234567890,
@@ -1010,9 +1009,7 @@ mod tests {
             system_fingerprint: None,
             object: "chat.completion.chunk".to_string(),
             usage: None,
-        };
-
-        NvCreateChatCompletionStreamResponse { inner }
+        }
     }

     #[test]
@@ -1331,7 +1328,7 @@ mod tests {
     fn test_logprob_extractor_with_missing_data() {
         // Test with choice that has no logprobs
         #[expect(deprecated)]
-        let inner = CreateChatCompletionStreamResponse {
+        let response = NvCreateChatCompletionStreamResponse {
             id: "test_id".to_string(),
             choices: vec![ChatChoiceStream {
                 index: 0,
@@ -1341,6 +1338,7 @@ mod tests {
                     tool_calls: None,
                     role: Some(Role::Assistant),
                     refusal: None,
+                    reasoning_content: None,
                 },
                 finish_reason: Some(FinishReason::Stop),
                 logprobs: None, // No logprobs
@@ -1353,7 +1351,6 @@ mod tests {
             usage: None,
         };

-        let response = NvCreateChatCompletionStreamResponse { inner };
         let logprobs = response.extract_logprobs_by_choice();
         assert_eq!(logprobs.len(), 1);
         assert!(logprobs.values().any(|v| v.is_empty()));
@@ -1556,9 +1553,8 @@ mod tests {
     fn create_mock_response() -> NvCreateChatCompletionStreamResponse {
         // Create a mock response for testing
        // In practice, this would have real logprobs data
-        use dynamo_async_openai::types::CreateChatCompletionStreamResponse;

-        let inner = CreateChatCompletionStreamResponse {
+        NvCreateChatCompletionStreamResponse {
             id: "test_id".to_string(),
             choices: vec![],
             created: 1234567890,
@@ -1567,9 +1563,7 @@ mod tests {
             system_fingerprint: None,
             object: "chat.completion.chunk".to_string(),
             usage: None,
-        };
-
-        NvCreateChatCompletionStreamResponse { inner }
+        }
     }

     // Mock context for testing

lib/llm/src/protocols/openai/chat_completions.rs

Lines changed: 4 additions & 11 deletions
@@ -27,7 +27,7 @@ use super::{
     OpenAIStopConditionsProvider,
 };

-mod aggregator;
+pub mod aggregator;
 mod delta;

 pub use aggregator::DeltaAggregator;
@@ -59,23 +59,16 @@ pub struct NvCreateChatCompletionRequest {
 /// # Fields
 /// - `inner`: The base OpenAI unary chat completion response, embedded
 /// using `serde(flatten)`.
-#[derive(Serialize, Deserialize, Validate, Debug, Clone)]
-pub struct NvCreateChatCompletionResponse {
-    #[serde(flatten)]
-    pub inner: dynamo_async_openai::types::CreateChatCompletionResponse,
-}
+pub type NvCreateChatCompletionResponse = dynamo_async_openai::types::CreateChatCompletionResponse;

 /// A response structure for streamed chat completions, embedding OpenAI's
 /// `CreateChatCompletionStreamResponse`.
 ///
 /// # Fields
 /// - `inner`: The base OpenAI streaming chat completion response, embedded
 /// using `serde(flatten)`.
-#[derive(Serialize, Deserialize, Validate, Debug, Clone)]
-pub struct NvCreateChatCompletionStreamResponse {
-    #[serde(flatten)]
-    pub inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse,
-}
+pub type NvCreateChatCompletionStreamResponse =
+    dynamo_async_openai::types::CreateChatCompletionStreamResponse;

 /// Implements `NvExtProvider` for `NvCreateChatCompletionRequest`,
 /// providing access to NVIDIA-specific extensions.
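
For readers unfamiliar with the pattern being removed: because the old wrappers embedded the OpenAI type via serde(flatten), the wrapper and the bare type already produced identical JSON, so replacing the structs with type aliases is wire-compatible. A minimal generic sketch of that equivalence, with illustrative names rather than the real types:

use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
struct Base {
    id: String,
}

// The removed pattern: a wrapper whose only field is flattened into
// the surrounding JSON object.
#[derive(Serialize, Deserialize)]
struct Wrapper {
    #[serde(flatten)]
    inner: Base,
}

fn main() {
    let wrapped = Wrapper { inner: Base { id: "x".into() } };
    let bare = Base { id: "x".into() };
    // Both serialize to {"id":"x"}, so dropping the wrapper does not
    // change the serialized format.
    assert_eq!(
        serde_json::to_string(&wrapped).unwrap(),
        serde_json::to_string(&bare).unwrap()
    );
}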
