Skip to content

Commit 1bace0a

Browse files
authored
Merge branch 'main' into hzhou/standalone-profile
2 parents c635a4b + 4d24d03 commit 1bace0a

File tree

16 files changed

+297
-15
lines changed

16 files changed

+297
-15
lines changed

.github/workflows/build-and-test.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ jobs:
3434
uses: actions/checkout@v4
3535
- name: Set up Docker Buildx
3636
uses: docker/setup-buildx-action@v3
37+
- name: Login to NGC
38+
run: |
39+
echo "${{ secrets.NGC_CI_ACCESS_TOKEN }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
3740
- name: Define Image Tag
3841
id: define_image_tag
3942
run: |

components/backends/sglang/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,9 @@ Below we provide a selected list of advanced examples. Please open up an issue i
179179
### Supporting SGLang's native endpoints via Dynamo
180180
- **[HTTP Server for native SGLang endpoints](docs/sgl-http-server.md)**
181181

182+
### Hierarchical Cache (HiCache)
183+
- **[Enable SGLang Hierarchical Cache (HiCache)](docs/sgl-hicache-example.md)**
184+
182185
## Deployment
183186

184187
We currently provide deployment examples for Kubernetes and SLURM.
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
<!--
2+
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
SPDX-License-Identifier: Apache-2.0
4+
-->
5+
6+
# Enable SGLang Hierarchical Cache (HiCache)
7+
8+
This guide shows how to enable SGLang's Hierarchical Cache (HiCache) inside Dynamo.
9+
10+
## 1) Start the SGLang worker with HiCache enabled
11+
12+
```bash
13+
python -m dynamo.sglang.worker \
14+
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
15+
    --host 0.0.0.0 --port 8001 \
16+
--page-size 64 \
17+
--enable-hierarchical-cache \
18+
--hicache-size 30 \
19+
--hicache-write-policy write_through \
20+
--hicache-storage-backend nixl \
21+
--log-level debug \
22+
--skip-tokenizer-init
23+
```
24+
25+
- **--enable-hierarchical-cache**: Enables hierarchical KV cache/offload
26+
- **--hicache-size**: HiCache capacity in GB of pinned host memory (upper bound of offloaded KV to CPU)
27+
- **--hicache-write-policy**: Write policy (e.g., `write_through` for synchronous host writes)
28+
- **--hicache-storage-backend**: Host storage backend for HiCache (e.g., `nixl`). NIXL selects the concrete store automatically; see [PR #8488](https://github.com/sgl-project/sglang/pull/8488)
29+
30+
31+
Then, start the frontend:
32+
```bash
33+
python -m dynamo.frontend --http-port 8000
34+
```
35+
36+
## 2) Send a single request
37+
38+
```bash
39+
curl localhost:8000/v1/chat/completions \
40+
-H "Content-Type: application/json" \
41+
-d '{
42+
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
43+
"messages": [
44+
{
45+
"role": "user",
46+
"content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"
47+
}
48+
],
49+
"stream": false,
50+
"max_tokens": 30
51+
}'
52+
```
53+
54+
## 3) (Optional) Benchmarking
55+
56+
Run the perf script:
57+
```bash
58+
bash -x /workspace/benchmarks/llm/perf.sh \
59+
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
60+
--tensor-parallelism 1 \
61+
--data-parallelism 1 \
62+
--concurrency "2,4,8" \
63+
--input-sequence-length 2048 \
64+
--output-sequence-length 256
65+
```

components/backends/trtllm/engine_configs/gpt_oss/decode.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ disable_overlap_scheduler: false
1717
moe_config:
1818
backend: CUTLASS
1919
cuda_graph_config:
20-
max_batch_size: 128
2120
enable_padding: true
2221
cache_transceiver_config:
2322
backend: ucx

components/backends/trtllm/gpt-oss.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,6 @@ CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \
203203
--disaggregation-mode decode \
204204
--disaggregation-strategy prefill_first \
205205
--max-num-tokens 16384 \
206-
--max-batch-size 128 \
207206
--free-gpu-memory-fraction 0.9 \
208207
--tensor-parallel-size 4 \
209208
--expert-parallel-size 4

components/backends/trtllm/launch/gpt_oss_disagg.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \
4040
--disaggregation-mode decode \
4141
--disaggregation-strategy "$DISAGGREGATION_STRATEGY" \
4242
--max-num-tokens 16384 \
43-
--max-batch-size 128 \
4443
--free-gpu-memory-fraction 0.9 \
4544
--tensor-parallel-size 4 \
4645
--expert-parallel-size 4

launch/dynamo-run/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ pub async fn run(
3535
.or(flags.model_path_flag.clone()),
3636
)
3737
.model_name(flags.model_name.clone())
38+
.model_config(flags.model_config.clone())
3839
.kv_cache_block_size(flags.kv_cache_block_size)
3940
// Only set if user provides. Usually loaded from tokenizer_config.json
4041
.context_length(flags.context_length)

lib/llm/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ testing-cuda = ["dep:cudarc"]
3535
testing-nixl = ["dep:nixl-sys"]
3636
block-manager = ["dep:nixl-sys", "dep:cudarc", "dep:ndarray", "dep:nix"]
3737
sentencepiece = ["dep:sentencepiece"]
38+
cuda = ["dep:cudarc"]
3839
integration = []
3940

4041
[[bench]]

lib/llm/src/cuda.rs

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
//! Integration with CUDA.
//!
//! This module will become a standalone crate, likely called `dynamo-cuda`; however, for the time
//! being, it will live as a submodule of `dynamo-llm`.
//!
//! This implementation will include a set of traits for extracting raw `cudarc::driver::sys` objects.
//!
//! Dynamo will generally not be the primary compute driver within an application, but a secondary source
//! of logic that may be used in conjunction with the primary compute driver; e.g., vLLM's use of PyTorch is
//! the primary CUDA context.
//!
//! In order for Dynamo to avoid creating its own CUDA context, the following traits are provided so
//! that we may tap the lower-level CUDA contexts, streams, events, etc. from external sources and leverage
//! them within Dynamo.
18+
19+
use cudarc::driver::{
20+
sys::{cuCtxPopCurrent_v2, cuCtxPushCurrent_v2, cudaError_enum, CUcontext, CUstream},
21+
CudaContext, CudaStream,
22+
};
23+
use std::pin::Pin;
24+
use std::{marker::PhantomData, sync::Arc};
25+
26+
/// Source of a raw CUDA context handle, typically borrowed from an external
/// owner (e.g. a framework that already created the primary CUDA context).
pub trait DynamoCudaContextProvider {
    /// Return the raw `CUcontext` handle.
    ///
    /// # Safety
    ///
    /// This method is unsafe because it directly accesses the underlying CUDA context.
    /// The caller must ensure that the context is valid and that the CUDA context is active.
    unsafe fn cu_context(&self) -> cudarc::driver::sys::CUcontext;

    /// Push this context onto the calling thread's CUDA context stack and
    /// return a guard that pops it again when dropped. The guard is `!Send`
    /// and `!Sync`, so the binding cannot leak across threads or awaits.
    fn bind_to_thread(&self) -> Pin<Box<DynamoCudaContextGuard>> {
        // SAFETY: the provider contract guarantees the handle is valid.
        unsafe { DynamoCudaContextGuard::new(self.cu_context()) }
    }
}
37+
38+
/// Source of a raw CUDA stream handle plus the context the stream belongs to.
pub trait DynamoCudaStreamProvider {
    /// Return the raw `CUstream` handle.
    ///
    /// # Safety
    ///
    /// This method is unsafe because it directly accesses the underlying CUDA stream.
    /// The caller must ensure that the stream is valid and that the CUDA context is active.
    ///
    /// Similarly, any pointers/references to data for which the stream will be accessed must
    /// have proper lifetimes and scoping, which is not guaranteed by this trait.
    unsafe fn cu_stream(&self) -> cudarc::driver::sys::CUstream;

    /// The context provider for the context this stream was created in.
    fn context(&self) -> Arc<dyn DynamoCudaContextProvider>;
}
50+
51+
/// A CUDA context guard that ensures safe access to CUDA contexts.
///
/// This guard:
/// - Cannot be moved (uses `PhantomPinned`; constructed as `Pin<Box<Self>>`)
/// - Cannot be cloned
/// - Cannot pass across async boundaries (`!Send` + `!Sync` via the raw-pointer `PhantomData`)
/// - Provides safe access to the underlying CUDA context
/// - Automatically manages context lifecycle (push on creation, pop on drop)
pub struct DynamoCudaContextGuard {
    // Raw handle pushed onto this thread's CUDA context stack.
    context: cudarc::driver::sys::CUcontext,
    // Prevent the guard from being moved
    _pin: std::marker::PhantomPinned,
    // Prevent Send + Sync to avoid crossing async boundaries
    _not_send_sync: PhantomData<*const ()>,
}
66+
67+
impl DynamoCudaContextGuard {
    /// Create a new context guard from a raw CUDA context handle.
    ///
    /// Pushes the context onto the calling thread's CUDA context stack and
    /// ensures it will be properly popped when the guard is dropped.
    ///
    /// # Arguments
    /// * `context` - the raw `CUcontext` to bind to this thread
    ///
    /// # Returns
    /// A pinned context guard that manages the CUDA context stack entry
    ///
    /// # Panics
    /// Panics if the CUDA context push operation fails
    ///
    /// # Safety
    ///
    /// This function interacts with the CUDA driver API using a raw handle.
    /// The caller must ensure the context is valid.
    pub unsafe fn new(context: CUcontext) -> Pin<Box<Self>> {
        // Push the context onto the CUDA context stack
        let result = cuCtxPushCurrent_v2(context);
        if result != cudaError_enum::CUDA_SUCCESS {
            panic!("Failed to push CUDA context: {:?}", result);
        }

        let guard = Self {
            context,
            _pin: std::marker::PhantomPinned,
            _not_send_sync: PhantomData,
        };

        // Pin the guard so it cannot be moved for its entire lifetime.
        Box::pin(guard)
    }

    /// Get the raw CUDA context.
    ///
    /// This method is safe because the guard ensures the context remains valid
    /// for its lifetime and cannot be moved or passed across async boundaries.
    ///
    /// # Returns
    /// The raw CUDA context handle
    pub fn context(&self) -> cudarc::driver::sys::CUcontext {
        self.context
    }
}
112+
113+
impl Drop for DynamoCudaContextGuard {
    /// Pop the context that `new` pushed. Errors are reported to stderr rather
    /// than panicking, since panicking in `drop` can abort the process.
    fn drop(&mut self) {
        // Pop the context from the CUDA context stack when the guard is dropped
        let mut popped_context: CUcontext = std::ptr::null_mut();
        // SAFETY: cuCtxPopCurrent_v2 writes the popped handle into the out-pointer,
        // which points at a valid local.
        let result = unsafe { cuCtxPopCurrent_v2(&mut popped_context) };

        // Log errors but don't panic in Drop
        if result != cudaError_enum::CUDA_SUCCESS {
            eprintln!("Warning: Failed to pop CUDA context in drop: {:?}", result);
        }

        // Verify we popped the expected context; a mismatch indicates unbalanced
        // push/pop usage elsewhere on this thread.
        if popped_context != self.context {
            eprintln!(
                "Warning: Popped context {:?} does not match expected context {:?}",
                popped_context, self.context
            );
        }
    }
}
133+
134+
/// A CUDA context provider that wraps an external CUDA context.
///
/// The wrapped handle is owned elsewhere (e.g. by the host framework); this
/// type only stores it. NOTE(review): the lifetime of the external context is
/// the caller's responsibility — it must outlive this wrapper.
pub struct ExternalCudaContext {
    // SAFETY: CUcontext is thread-safe to pass between threads and can be used concurrently.
    context: CUcontext,
}

// SAFETY: See notes on CUcontext above.
unsafe impl Send for ExternalCudaContext {}
unsafe impl Sync for ExternalCudaContext {}
143+
144+
impl ExternalCudaContext {
145+
pub fn new(context: CUcontext) -> Arc<Self> {
146+
Arc::new(Self { context })
147+
}
148+
149+
pub fn cu_context(&self) -> CUcontext {
150+
self.context
151+
}
152+
}
153+
154+
impl DynamoCudaContextProvider for ExternalCudaContext {
155+
unsafe fn cu_context(&self) -> cudarc::driver::sys::CUcontext {
156+
self.cu_context()
157+
}
158+
}
159+
160+
/// A CUDA stream provider that wraps an external CUDA stream.
///
/// Holds the raw stream handle together with a provider for the context the
/// stream belongs to, so consumers can bind the right context before use.
pub struct ExternalCudaStream {
    // Raw handle owned externally; must outlive this wrapper.
    stream: CUstream,
    // Context the stream was created in.
    context: Arc<dyn DynamoCudaContextProvider>,
}
165+
166+
impl ExternalCudaStream {
167+
pub fn new(stream: CUstream, context: Arc<dyn DynamoCudaContextProvider>) -> Self {
168+
Self { stream, context }
169+
}
170+
}
171+
172+
impl DynamoCudaStreamProvider for ExternalCudaStream {
173+
unsafe fn cu_stream(&self) -> cudarc::driver::sys::CUstream {
174+
self.stream
175+
}
176+
177+
fn context(&self) -> Arc<dyn DynamoCudaContextProvider> {
178+
self.context.clone()
179+
}
180+
}
181+
182+
// The PhantomData<*const ()> field automatically makes this !Send and !Sync
183+
// which prevents the guard from crossing async boundaries
184+
185+
// Implementations of this trait for the [`cudarc`] crate.
186+
187+
impl DynamoCudaContextProvider for CudaContext {
    /// Forward to `cudarc`'s inherent accessor for the raw context handle.
    unsafe fn cu_context(&self) -> cudarc::driver::sys::CUcontext {
        self.cu_ctx()
    }
}
192+
193+
impl DynamoCudaContextProvider for CudaStream {
    /// A stream can stand in for its owning context: delegate to the context's
    /// provider impl. (`self.context()` here is cudarc's inherent accessor.)
    unsafe fn cu_context(&self) -> cudarc::driver::sys::CUcontext {
        self.context().cu_context()
    }
}
198+
199+
impl DynamoCudaStreamProvider for CudaStream {
    /// Forward to cudarc's inherent `cu_stream` accessor (inherent methods win
    /// over this trait method in resolution, so this is not self-recursive).
    unsafe fn cu_stream(&self) -> cudarc::driver::sys::CUstream {
        self.cu_stream()
    }

    /// Clone the owning `Arc<CudaContext>`, coerced to the trait object.
    fn context(&self) -> Arc<dyn DynamoCudaContextProvider> {
        self.context().clone()
    }
}

lib/llm/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ pub mod types;
3838
#[cfg(feature = "block-manager")]
3939
pub mod block_manager;
4040

41+
#[cfg(feature = "cuda")]
42+
pub mod cuda;
43+
4144
/// Reads a JSON file, extracts a specific field, and deserializes it into type T.
4245
///
4346
/// # Arguments

0 commit comments

Comments
 (0)