feat: kvbm transfer context v2 (dis-598) (#2873)

ryanolson · nnshah1 · commit bc6cd7e708d4 · 2025-09-08T16:06:34.000-07:00
Signed-off-by: Ryan Olson <ryanolson@users.noreply.github.com>
Signed-off-by: nnshah1 <neelays@nvidia.com>
diff --git a/lib/llm/Cargo.toml b/lib/llm/Cargo.toml
@@ -43,6 +43,10 @@ integration   = ["dynamo-runtime/integration"]
 name = "tokenizer"
 harness = false
 
+[[bench]]
+name = "transfer_context_v2"
+harness = false
+required-features = ["block-manager", "testing-cuda"]
 [dependencies]
 # repo
 dynamo-runtime = { workspace = true }
@@ -175,4 +179,4 @@ aligned-vec = "0.6.4"
 lazy_static = "1.4"
 
 [build-dependencies]
-tonic-build = { version = "0.13.1"}
+tonic-build = { version = "0.13.1"}
diff --git a/lib/llm/benches/transfer_context_v2.rs b/lib/llm/benches/transfer_context_v2.rs
@@ -0,0 +1,204 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#[cfg(feature = "testing-cuda")]
+mod benchmarks {
+    use std::sync::Arc;
+
+    use criterion::{BenchmarkId, Criterion, criterion_group};
+    use cudarc::driver::{CudaContext, CudaStream};
+    use nixl_sys;
+    use tokio::runtime::Runtime;
+    use tokio_util::task::TaskTracker;
+
+    use dynamo_llm::block_manager::block::transfer::context;
+
+    struct BenchmarkRuntime {
+        _runtime: Runtime, // owned tokio runtime; the benchmarks call `block_on` on it
+        handle: tokio::runtime::Handle, // clone of `_runtime`'s handle, passed to each context
+        stream: Arc<CudaStream>, // default stream of the CUDA context created for device 0
+        nixl_agent: Arc<Option<nixl_sys::Agent>>, // always `None` in these benchmarks
+    }
+
+    impl BenchmarkRuntime {
+        /// Sets up the tokio runtime and the default stream of CUDA device 0.
+        /// Panics if either the runtime or the CUDA context cannot be created.
+        fn new() -> Self {
+            let rt = Runtime::new().expect("Failed to create benchmark runtime");
+            let rt_handle = rt.handle().clone();
+            let device = Arc::new(CudaContext::new(0).expect("Failed to create CUDA context"));
+
+            Self {
+                _runtime: rt,
+                handle: rt_handle,
+                stream: device.default_stream(),
+                // No NIXL agent is configured for these benchmarks.
+                nixl_agent: Arc::new(None),
+            }
+        }
+
+        /// Builds a v2 `TransferContext` from clones of the agent, stream and runtime handle.
+        fn create_transfer_context(&self) -> context::v2::TransferContext {
+            let agent = self.nixl_agent.clone();
+            let cuda_stream = self.stream.clone();
+            let rt_handle = self.handle.clone();
+            context::v2::TransferContext::new(agent, cuda_stream, rt_handle)
+        }
+    }
+
+    /// Baseline benchmark: each iteration records one event on the transfer context and
+    /// waits for it with the blocking `synchronize_blocking` call.
+    fn bench_blocking(c: &mut Criterion) {
+        use std::time::Duration;
+
+        let bench_rt = BenchmarkRuntime::new();
+        let transfer_ctx = bench_rt.create_transfer_context();
+        let mut group = c.benchmark_group("blocking_sync");
+        group.warm_up_time(Duration::from_millis(500));
+        group.measurement_time(Duration::from_secs(3));
+
+        group.bench_function("sync", |bencher| {
+            bencher.iter(|| {
+                let recorded = transfer_ctx.record_event().unwrap();
+                recorded.synchronize_blocking().unwrap();
+            })
+        });
+        group.finish();
+    }
+
+    /// One event at a time through the async path: every iteration enters the tokio runtime
+    /// with `block_on`, records an event and awaits its `synchronize()` future.
+    fn bench_async_single(c: &mut Criterion) {
+        use std::time::Duration;
+
+        let bench_rt = BenchmarkRuntime::new();
+        let transfer_ctx = bench_rt.create_transfer_context();
+        let mut group = c.benchmark_group("async_sync");
+        group.warm_up_time(Duration::from_millis(500));
+        group.measurement_time(Duration::from_secs(3));
+        group.bench_function("sync", |bencher| {
+            bencher.iter(|| {
+                let one_sync = async {
+                    let recorded = transfer_ctx.record_event().unwrap();
+                    recorded.synchronize().await.unwrap();
+                };
+                bench_rt._runtime.block_on(one_sync)
+            })
+        });
+        group.finish();
+    }
+
+    /// Fan-out benchmark: each iteration spawns `task_count` tokio tasks on a `TaskTracker`.
+    /// Every task records one event and awaits its synchronization; the iteration ends once
+    /// the tracker reports that all of them have finished.
+    fn bench_concurrent_async(c: &mut Criterion) {
+        use std::time::Duration;
+
+        let bench_rt = BenchmarkRuntime::new();
+        let mut group = c.benchmark_group("concurrent_async");
+        group.warm_up_time(Duration::from_millis(500));
+        group.measurement_time(Duration::from_secs(3));
+
+        // One benchmark per fan-out width.
+        for task_count in &[1, 5, 10, 25, 50, 100] {
+            let bench_id = BenchmarkId::new("concurrent", task_count);
+            group.bench_with_input(bench_id, task_count, |bencher, &task_count| {
+                // The context is built once per benchmark, outside the timed loop.
+                let transfer_ctx = bench_rt.create_transfer_context();
+                bencher.iter(|| {
+                    bench_rt._runtime.block_on(async {
+                        let tasks = TaskTracker::new();
+                        (0..task_count).for_each(|_| {
+                            // Each task owns its own clone of the context.
+                            let task_ctx = transfer_ctx.clone();
+                            tasks.spawn(async move {
+                                let recorded = task_ctx.record_event().unwrap();
+                                recorded.synchronize().await.unwrap();
+                            });
+                        });
+
+                        // Close the tracker, then wait for every spawned task.
+                        tasks.close();
+                        tasks.wait().await;
+                    });
+                });
+            });
+        }
+
+        group.finish();
+    }
+
+    /// Benchmark throughput: events per second at different concurrency levels.
+    /// Reports `concurrency * EVENTS_PER_TASK` elements per iteration so Criterion prints a rate.
+    fn bench_throughput(c: &mut Criterion) {
+        const EVENTS_PER_TASK: u64 = 10; // Process multiple events per task
+        let runtime = BenchmarkRuntime::new();
+        let mut group = c.benchmark_group("throughput");
+        group.sample_size(50); // Fewer samples for throughput tests
+        group.warm_up_time(std::time::Duration::from_millis(500));
+        group.measurement_time(std::time::Duration::from_secs(3));
+
+        for concurrency in [1u64, 10, 50].iter() {
+            // Without this Criterion only reports time per iteration, not an events/second rate.
+            group.throughput(criterion::Throughput::Elements(concurrency * EVENTS_PER_TASK));
+            group.bench_with_input(
+                BenchmarkId::new("events_per_sec", concurrency),
+                concurrency,
+                |b, &concurrency| {
+                    let ctx = runtime.create_transfer_context();
+                    b.iter(|| {
+                        runtime._runtime.block_on(async {
+                            let tracker = TaskTracker::new();
+                            for _ in 0..concurrency {
+                                let ctx_clone = ctx.clone();
+                                tracker.spawn(async move {
+                                    for _ in 0..EVENTS_PER_TASK {
+                                        let event = ctx_clone.record_event().unwrap();
+                                        event.synchronize().await.unwrap();
+                                    }
+                                });
+                            }
+
+                            tracker.close();
+                            tracker.wait().await;
+                        });
+                    });
+                },
+            );
+        }
+
+        group.finish();
+    }
+
+    criterion_group!(
+        benches,
+        // Baseline pair: blocking vs. awaited synchronization of a single event
+        bench_blocking,
+        bench_async_single,
+        // Multi-task benchmarks that spawn their work through a TaskTracker
+        bench_concurrent_async,
+        bench_throughput
+    );
+}
+
+#[cfg(feature = "testing-cuda")]
+criterion::criterion_main!(benchmarks::benches);
+
+/// Fallback entry point; Cargo.toml's `required-features` lists both features named below.
+#[cfg(not(feature = "testing-cuda"))]
+fn main() {
+    println!("Benchmarks require the 'block-manager' and 'testing-cuda' features.");
+    println!("Run with: cargo bench --features block-manager,testing-cuda");
+}
diff --git a/lib/llm/src/block_manager/block/transfer.rs b/lib/llm/src/block_manager/block/transfer.rs
@@ -13,7 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-mod context;
+pub mod context;
 mod cuda;
 mod memcpy;
 mod nixl;
diff --git a/lib/llm/src/block_manager/block/transfer/context.rs b/lib/llm/src/block_manager/block/transfer/context.rs