Skip to content

Commit 1bace0a

Browse files
authored
Merge branch 'main' into hzhou/standalone-profile
2 parents c635a4b + 4d24d03 commit 1bace0a

File tree

16 files changed

+297
-15
lines changed

16 files changed

+297
-15
lines changed

.github/workflows/build-and-test.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ jobs:
3434
uses: actions/checkout@v4
3535
- name: Set up Docker Buildx
3636
uses: docker/setup-buildx-action@v3
37+
- name: Login to NGC
38+
run: |
39+
echo "${{ secrets.NGC_CI_ACCESS_TOKEN }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
3740
- name: Define Image Tag
3841
id: define_image_tag
3942
run: |

components/backends/sglang/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,9 @@ Below we provide a selected list of advanced examples. Please open up an issue i
179179
### Supporting SGLang's native endpoints via Dynamo
180180
- **[HTTP Server for native SGLang endpoints](docs/sgl-http-server.md)**
181181

182+
### Hierarchical Cache (HiCache)
183+
- **[Enable SGLang Hierarchical Cache (HiCache)](docs/sgl-hicache-example.md)**
184+
182185
## Deployment
183186

184187
We currently provide deployment examples for Kubernetes and SLURM.
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
<!--
2+
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
SPDX-License-Identifier: Apache-2.0
4+
-->
5+
6+
# Enable SGLang Hierarchical Cache (HiCache)
7+
8+
This guide shows how to enable SGLang's Hierarchical Cache (HiCache) inside Dynamo.
9+
10+
## 1) Start the SGLang worker with HiCache enabled
11+
12+
```bash
13+
python -m dynamo.sglang.worker \
14+
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
15+
    --host 0.0.0.0 --port 8001 \
16+
--page-size 64 \
17+
--enable-hierarchical-cache \
18+
--hicache-size 30 \
19+
--hicache-write-policy write_through \
20+
--hicache-storage-backend nixl \
21+
--log-level debug \
22+
--skip-tokenizer-init
23+
```
24+
25+
- **--enable-hierarchical-cache**: Enables hierarchical KV cache/offload
26+
- **--hicache-size**: HiCache capacity in GB of pinned host memory (upper bound of offloaded KV to CPU)
27+
- **--hicache-write-policy**: Write policy (e.g., `write_through` for synchronous host writes)
28+
- **--hicache-storage-backend**: Host storage backend for HiCache (e.g., `nixl`). NIXL selects the concrete store automatically; see [PR #8488](https://github.com/sgl-project/sglang/pull/8488)
29+
30+
31+
Then, start the frontend:
32+
```bash
33+
python -m dynamo.frontend --http-port 8000
34+
```
35+
36+
## 2) Send a single request
37+
38+
```bash
39+
curl localhost:8000/v1/chat/completions \
40+
-H "Content-Type: application/json" \
41+
-d '{
42+
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
43+
"messages": [
44+
{
45+
"role": "user",
46+
"content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"
47+
}
48+
],
49+
"stream": false,
50+
"max_tokens": 30
51+
}'
52+
```
53+
54+
## 3) (Optional) Benchmarking
55+
56+
Run the perf script:
57+
```bash
58+
bash -x /workspace/benchmarks/llm/perf.sh \
59+
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
60+
--tensor-parallelism 1 \
61+
--data-parallelism 1 \
62+
--concurrency "2,4,8" \
63+
--input-sequence-length 2048 \
64+
--output-sequence-length 256
65+
```

components/backends/trtllm/engine_configs/gpt_oss/decode.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ disable_overlap_scheduler: false
1717
moe_config:
1818
backend: CUTLASS
1919
cuda_graph_config:
20-
max_batch_size: 128
2120
enable_padding: true
2221
cache_transceiver_config:
2322
backend: ucx

components/backends/trtllm/gpt-oss.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,6 @@ CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \
203203
--disaggregation-mode decode \
204204
--disaggregation-strategy prefill_first \
205205
--max-num-tokens 16384 \
206-
--max-batch-size 128 \
207206
--free-gpu-memory-fraction 0.9 \
208207
--tensor-parallel-size 4 \
209208
--expert-parallel-size 4

components/backends/trtllm/launch/gpt_oss_disagg.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \
4040
--disaggregation-mode decode \
4141
--disaggregation-strategy "$DISAGGREGATION_STRATEGY" \
4242
--max-num-tokens 16384 \
43-
--max-batch-size 128 \
4443
--free-gpu-memory-fraction 0.9 \
4544
--tensor-parallel-size 4 \
4645
--expert-parallel-size 4

launch/dynamo-run/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ pub async fn run(
3535
.or(flags.model_path_flag.clone()),
3636
)
3737
.model_name(flags.model_name.clone())
38+
.model_config(flags.model_config.clone())
3839
.kv_cache_block_size(flags.kv_cache_block_size)
3940
// Only set if user provides. Usually loaded from tokenizer_config.json
4041
.context_length(flags.context_length)

lib/llm/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ testing-cuda = ["dep:cudarc"]
3535
testing-nixl = ["dep:nixl-sys"]
3636
block-manager = ["dep:nixl-sys", "dep:cudarc", "dep:ndarray", "dep:nix"]
3737
sentencepiece = ["dep:sentencepiece"]
38+
cuda = ["dep:cudarc"]
3839
integration = []
3940

4041
[[bench]]

lib/llm/src/cuda.rs

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
//! Integration with CUDA.
//!
//! This module will become a standalone crate, likely called `dynamo-cuda`; however, for the time
//! being, it will live as a submodule of `dynamo-llm`.
//!
//! This implementation will include a set of traits for extracting raw `cudarc::driver::sys` objects.
//!
//! Dynamo will generally not be the primary compute driver within an application, but a secondary source
//! of logic that may be used in conjunction with the primary compute driver; e.g., vLLM's use of PyTorch is
//! the primary CUDA context.
//!
//! In order for Dynamo to avoid creating its own CUDA context, the following traits are provided so
//! that we may tap the lower-level CUDA contexts, streams, events, etc. from external sources and leverage
//! them within Dynamo.
18+
19+
use cudarc::driver::{
20+
sys::{cuCtxPopCurrent_v2, cuCtxPushCurrent_v2, cudaError_enum, CUcontext, CUstream},
21+
CudaContext, CudaStream,
22+
};
23+
use std::pin::Pin;
24+
use std::{marker::PhantomData, sync::Arc};
25+
26+
/// Source of a raw CUDA context handle, typically borrowed from an external
/// owner (e.g. a framework that already created the primary CUDA context).
pub trait DynamoCudaContextProvider {
    /// Return the raw `CUcontext` handle.
    ///
    /// # Safety
    ///
    /// This method is unsafe because it directly accesses the underlying CUDA context.
    /// The caller must ensure that the context is valid and that the CUDA context is active.
    unsafe fn cu_context(&self) -> cudarc::driver::sys::CUcontext;

    /// Push this context onto the calling thread's CUDA context stack and
    /// return a guard that pops it again when dropped. The guard is `!Send`
    /// and `!Sync`, so the binding cannot leak across threads or awaits.
    fn bind_to_thread(&self) -> Pin<Box<DynamoCudaContextGuard>> {
        // SAFETY: the provider contract guarantees the handle is valid.
        unsafe { DynamoCudaContextGuard::new(self.cu_context()) }
    }
}
37+
38+
/// Source of a raw CUDA stream handle plus the context the stream belongs to.
pub trait DynamoCudaStreamProvider {
    /// Return the raw `CUstream` handle.
    ///
    /// # Safety
    ///
    /// This method is unsafe because it directly accesses the underlying CUDA stream.
    /// The caller must ensure that the stream is valid and that the CUDA context is active.
    ///
    /// Similarly, any pointers/references to data for which the stream will be accessed must
    /// have proper lifetimes and scoping, which is not guaranteed by this trait.
    unsafe fn cu_stream(&self) -> cudarc::driver::sys::CUstream;

    /// The context provider for the context this stream was created in.
    fn context(&self) -> Arc<dyn DynamoCudaContextProvider>;
}
50+
51+
/// A CUDA context guard that ensures safe access to CUDA contexts.
///
/// This guard:
/// - Cannot be moved (uses `PhantomPinned`; constructed as `Pin<Box<Self>>`)
/// - Cannot be cloned
/// - Cannot pass across async boundaries (`!Send` + `!Sync` via the raw-pointer `PhantomData`)
/// - Provides safe access to the underlying CUDA context
/// - Automatically manages context lifecycle (push on creation, pop on drop)
pub struct DynamoCudaContextGuard {
    // Raw handle pushed onto this thread's CUDA context stack.
    context: cudarc::driver::sys::CUcontext,
    // Prevent the guard from being moved
    _pin: std::marker::PhantomPinned,
    // Prevent Send + Sync to avoid crossing async boundaries
    _not_send_sync: PhantomData<*const ()>,
}
66+
67+
impl DynamoCudaContextGuard {
    /// Create a new context guard from a raw CUDA context handle.
    ///
    /// Pushes the context onto the calling thread's CUDA context stack and
    /// ensures it will be properly popped when the guard is dropped.
    ///
    /// # Arguments
    /// * `context` - the raw `CUcontext` to bind to this thread
    ///
    /// # Returns
    /// A pinned context guard that manages the CUDA context stack entry
    ///
    /// # Panics
    /// Panics if the CUDA context push operation fails
    ///
    /// # Safety
    ///
    /// This function interacts with the CUDA driver API using a raw handle.
    /// The caller must ensure the context is valid.
    pub unsafe fn new(context: CUcontext) -> Pin<Box<Self>> {
        // Push the context onto the CUDA context stack
        let result = cuCtxPushCurrent_v2(context);
        if result != cudaError_enum::CUDA_SUCCESS {
            panic!("Failed to push CUDA context: {:?}", result);
        }

        let guard = Self {
            context,
            _pin: std::marker::PhantomPinned,
            _not_send_sync: PhantomData,
        };

        // Pin the guard so it cannot be moved for its entire lifetime.
        Box::pin(guard)
    }

    /// Get the raw CUDA context.
    ///
    /// This method is safe because the guard ensures the context remains valid
    /// for its lifetime and cannot be moved or passed across async boundaries.
    ///
    /// # Returns
    /// The raw CUDA context handle
    pub fn context(&self) -> cudarc::driver::sys::CUcontext {
        self.context
    }
}
112+
113+
impl Drop for DynamoCudaContextGuard {
    /// Pop the context that `new` pushed. Errors are reported to stderr rather
    /// than panicking, since panicking in `drop` can abort the process.
    fn drop(&mut self) {
        // Pop the context from the CUDA context stack when the guard is dropped
        let mut popped_context: CUcontext = std::ptr::null_mut();
        // SAFETY: cuCtxPopCurrent_v2 writes the popped handle into the out-pointer,
        // which points at a valid local.
        let result = unsafe { cuCtxPopCurrent_v2(&mut popped_context) };

        // Log errors but don't panic in Drop
        if result != cudaError_enum::CUDA_SUCCESS {
            eprintln!("Warning: Failed to pop CUDA context in drop: {:?}", result);
        }

        // Verify we popped the expected context; a mismatch indicates unbalanced
        // push/pop usage elsewhere on this thread.
        if popped_context != self.context {
            eprintln!(
                "Warning: Popped context {:?} does not match expected context {:?}",
                popped_context, self.context
            );
        }
    }
}
133+
134+
/// A CUDA context provider that wraps an external CUDA context.
///
/// The wrapped handle is owned elsewhere (e.g. by the host framework); this
/// type only stores it. NOTE(review): the lifetime of the external context is
/// the caller's responsibility — it must outlive this wrapper.
pub struct ExternalCudaContext {
    // SAFETY: CUcontext is thread-safe to pass between threads and can be used concurrently.
    context: CUcontext,
}

// SAFETY: See notes on CUcontext above.
unsafe impl Send for ExternalCudaContext {}
unsafe impl Sync for ExternalCudaContext {}
143+
144+
impl ExternalCudaContext {
145+
pub fn new(context: CUcontext) -> Arc<Self> {
146+
Arc::new(Self { context })
147+
}
148+
149+
pub fn cu_context(&self) -> CUcontext {
150+
self.context
151+
}
152+
}
153+
154+
impl DynamoCudaContextProvider for ExternalCudaContext {
155+
unsafe fn cu_context(&self) -> cudarc::driver::sys::CUcontext {
156+
self.cu_context()
157+
}
158+
}
159+
160+
/// A CUDA stream provider that wraps an external CUDA stream.
///
/// Holds the raw stream handle together with a provider for the context the
/// stream belongs to, so consumers can bind the right context before use.
pub struct ExternalCudaStream {
    // Raw handle owned externally; must outlive this wrapper.
    stream: CUstream,
    // Context the stream was created in.
    context: Arc<dyn DynamoCudaContextProvider>,
}
165+
166+
impl ExternalCudaStream {
167+
pub fn new(stream: CUstream, context: Arc<dyn DynamoCudaContextProvider>) -> Self {
168+
Self { stream, context }
169+
}
170+
}
171+
172+
impl DynamoCudaStreamProvider for ExternalCudaStream {
173+
unsafe fn cu_stream(&self) -> cudarc::driver::sys::CUstream {
174+
self.stream
175+
}
176+
177+
fn context(&self) -> Arc<dyn DynamoCudaContextProvider> {
178+
self.context.clone()
179+
}
180+
}
181+
182+
// The PhantomData<*const ()> field automatically makes this !Send and !Sync
183+
// which prevents the guard from crossing async boundaries
184+
185+
// Implementations of this trait for the [`cudarc`] crate.
186+
187+
impl DynamoCudaContextProvider for CudaContext {
    /// Forward to `cudarc`'s inherent accessor for the raw context handle.
    unsafe fn cu_context(&self) -> cudarc::driver::sys::CUcontext {
        self.cu_ctx()
    }
}
192+
193+
impl DynamoCudaContextProvider for CudaStream {
    /// A stream can stand in for its owning context: delegate to the context's
    /// provider impl. (`self.context()` here is cudarc's inherent accessor.)
    unsafe fn cu_context(&self) -> cudarc::driver::sys::CUcontext {
        self.context().cu_context()
    }
}
198+
199+
impl DynamoCudaStreamProvider for CudaStream {
    /// Forward to cudarc's inherent `cu_stream` accessor (inherent methods win
    /// over this trait method in resolution, so this is not self-recursive).
    unsafe fn cu_stream(&self) -> cudarc::driver::sys::CUstream {
        self.cu_stream()
    }

    /// Clone the owning `Arc<CudaContext>`, coerced to the trait object.
    fn context(&self) -> Arc<dyn DynamoCudaContextProvider> {
        self.context().clone()
    }
}

lib/llm/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ pub mod types;
3838
#[cfg(feature = "block-manager")]
3939
pub mod block_manager;
4040

41+
#[cfg(feature = "cuda")]
42+
pub mod cuda;
43+
4144
/// Reads a JSON file, extracts a specific field, and deserializes it into type T.
4245
///
4346
/// # Arguments

0 commit comments

Comments
 (0)