Skip to content

Commit bea5949

Browse files
fix vllm graph register and add test (#1894)
<!-- .github/pull_request_template.md --> ## 📌 Description <!-- What does this PR do? Briefly describe the changes and why they’re needed. --> The all-reduce buffers are incorrectly registered during the CUDA graph capture phase when using vLLM custom all reduce. The base_ptr is not initialized and leads to illegal memory access error. vLLM's implementation with proper memory initialization - https://github.com/vllm-project/vllm/blob/0d21b9b51eccabfa1f8114eab2df61d75459bee7/csrc/custom_all_reduce.cuh#L452 This PR ensures that the memory pointer is initialized correctly and adds tests for vLLM custom all reduce during CUDA graph capture ## 🔍 Related Issues <!-- Link any related issues here --> ## 🚀 Pull Request Checklist Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete. ### ✅ Pre-commit Checks - [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method). - [x] I have installed the hooks with `pre-commit install`. - [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues. > If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/). ## 🧪 Tests - [x] Tests have been added or updated as needed. - [x] All tests are passing (`unittest`, etc.). ## Reviewer Notes <!-- Optional: anything you'd like reviewers to focus on, concerns, etc. --> --------- Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent a88349f commit bea5949

File tree

2 files changed

+124
-1
lines changed

2 files changed

+124
-1
lines changed

include/flashinfer/comm/vllm_custom_all_reduce.cuh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ struct cuda_error : public std::runtime_error {
4444
namespace vllm {
4545

4646
constexpr int kMaxBlocks = 36;
47+
constexpr CUpointer_attribute rangeStartAddrAttr = CU_POINTER_ATTRIBUTE_RANGE_START_ADDR;
48+
4749
// Counter may overflow, but it's fine since unsigned int overflow is
4850
// well-defined behavior.
4951
using FlagType = uint32_t;
@@ -366,6 +368,9 @@ class CustomAllreduce {
366368
void* base_ptr;
367369
// note: must share the base address of each allocation, or we get wrong
368370
// address
371+
if (cuPointerGetAttribute(&base_ptr, rangeStartAddrAttr, (CUdeviceptr)ptr) != CUDA_SUCCESS)
372+
throw std::runtime_error("failed to get pointer attr");
373+
369374
CHECK_CUDA_SUCCESS(
370375
cudaIpcGetMemHandle((cudaIpcMemHandle_t*)&handles[i * handle_sz], base_ptr));
371376
offsets[i] = ((char*)ptr) - ((char*)base_ptr);

tests/comm/test_vllm_custom_allreduce.py

Lines changed: 119 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
logger = logging.getLogger(__name__)
1919

2020

21-
def _run_correctness_worker(world_size, rank, distributed_init_port):
21+
def _initialize_process_group(world_size, rank, distributed_init_port):
2222
device = torch.device(f"cuda:{rank}")
2323
torch.cuda.set_device(device)
2424
distributed_init_method = f"tcp://localhost:{distributed_init_port}"
@@ -29,8 +29,12 @@ def _run_correctness_worker(world_size, rank, distributed_init_port):
2929
world_size=world_size,
3030
)
3131
group = dist.group.WORLD
32+
return group
33+
3234

35+
def _run_correctness_worker(world_size, rank, distributed_init_port):
3336
try:
37+
group = _initialize_process_group(world_size, rank, distributed_init_port)
3438
device = torch.device(f"cuda:{rank}")
3539
max_size = 8192 * 1024
3640
meta_ptrs = comm.create_shared_buffer(
@@ -104,6 +108,103 @@ def get_open_port() -> int:
104108
return s.getsockname()[1]
105109

106110

111+
def _run_graph_buffer_ipc_meta_worker(
112+
world_size: int, rank: int, distributed_init_port: int
113+
):
114+
"""Test get_graph_buffer_ipc_meta function with CUDA graph capture."""
115+
116+
custom_ptr = None
117+
meta_ptrs = None
118+
119+
try:
120+
# Setup
121+
group = _initialize_process_group(world_size, rank, distributed_init_port)
122+
device = torch.device(f"cuda:{rank}")
123+
max_size = 8192 * 1024
124+
meta_ptrs = comm.create_shared_buffer(
125+
comm.vllm_meta_size() + max_size, group=group
126+
)
127+
rank_data = torch.empty(8 * 1024 * 1024, dtype=torch.uint8, device=device)
128+
custom_ptr = comm.vllm_init_custom_ar(meta_ptrs, rank_data, rank, True)
129+
130+
# Test 1: Empty state before graph capture
131+
handle_bytes, offsets = comm.vllm_get_graph_buffer_ipc_meta(custom_ptr)
132+
assert len(handle_bytes) == 0 and len(offsets) == 0, (
133+
"Expected empty buffers before graph capture"
134+
)
135+
136+
# Test 2: Capture graph and validate IPC metadata structure
137+
test_size = 4096
138+
num_cta = 16
139+
dtype = torch.float16
140+
141+
inp1 = torch.randn(test_size, dtype=dtype, device=device)
142+
inp2 = torch.randn(test_size, dtype=dtype, device=device)
143+
out1 = torch.empty_like(inp1)
144+
out2 = torch.empty_like(inp2)
145+
146+
g = torch.cuda.CUDAGraph()
147+
with torch.cuda.graph(g, pool=None):
148+
comm.vllm_all_reduce(custom_ptr, inp1, out1, 0, 0, num_cta)
149+
comm.vllm_all_reduce(custom_ptr, inp2, out2, 0, 0, num_cta)
150+
151+
handle_bytes, offsets = comm.vllm_get_graph_buffer_ipc_meta(custom_ptr)
152+
153+
# Validate structure: 2 buffers, correct handle size (64 bytes each)
154+
ipc_handle_size = 64
155+
expected_num_buffers = 2
156+
assert len(offsets) == expected_num_buffers, (
157+
f"Expected {expected_num_buffers} offsets, got {len(offsets)}"
158+
)
159+
assert len(handle_bytes) == ipc_handle_size * expected_num_buffers, (
160+
f"Expected {ipc_handle_size * expected_num_buffers} handle bytes"
161+
)
162+
assert all(isinstance(o, int) and o >= 0 for o in offsets), (
163+
"All offsets should be non-negative integers"
164+
)
165+
166+
# Test 3: Distributed gather and register graph buffers
167+
all_handle_bytes = [None] * world_size
168+
all_offsets = [None] * world_size
169+
170+
dist.all_gather_object(all_handle_bytes, handle_bytes, group=group)
171+
dist.all_gather_object(all_offsets, offsets, group=group)
172+
173+
# All ranks should have same number of buffers
174+
assert all(len(off) == expected_num_buffers for off in all_offsets), (
175+
"All ranks should have same number of buffers"
176+
)
177+
178+
comm.vllm_register_graph_buffers(custom_ptr, all_handle_bytes, all_offsets)
179+
180+
# Test 4: Graph replay produces correct results
181+
inp1_test = torch.randn(test_size, dtype=dtype, device=device)
182+
inp2_test = torch.randn(test_size, dtype=dtype, device=device)
183+
184+
inp1.copy_(inp1_test)
185+
inp2.copy_(inp2_test)
186+
187+
g.replay()
188+
torch.cuda.synchronize()
189+
190+
# Verify with NCCL reference
191+
inp1_ref = inp1_test.clone()
192+
inp2_ref = inp2_test.clone()
193+
dist.all_reduce(inp1_ref, group=group)
194+
dist.all_reduce(inp2_ref, group=group)
195+
196+
torch.testing.assert_close(out1, inp1_ref, rtol=1e-3, atol=1e-3)
197+
torch.testing.assert_close(out2, inp2_ref, rtol=1e-3, atol=1e-3)
198+
199+
finally:
200+
dist.barrier(group=group)
201+
if custom_ptr is not None:
202+
comm.vllm_dispose(custom_ptr)
203+
if meta_ptrs:
204+
comm.free_shared_buffer(meta_ptrs, group)
205+
dist.destroy_process_group(group=group)
206+
207+
107208
def multi_process_parallel(
108209
world_size: int, test_target: Any, target_args: tuple = ()
109210
) -> None:
@@ -138,3 +239,20 @@ def test_vllm_custom_allreduce(world_size):
138239
target_args=(),
139240
)
140241
print(f"custom allreduce tp = {world_size}: OK")
242+
243+
244+
@pytest.mark.parametrize("world_size", [2, 4])
245+
def test_get_graph_buffer_ipc_meta(world_size: int):
246+
"""Test get_graph_buffer_ipc_meta function with CUDA graph capture."""
247+
available_gpus = torch.cuda.device_count()
248+
if world_size > available_gpus:
249+
pytest.skip(
250+
f"world_size {world_size} is greater than available_gpus {available_gpus}"
251+
)
252+
print(f"Running get_graph_buffer_ipc_meta test for world_size={world_size}")
253+
multi_process_parallel(
254+
world_size,
255+
_run_graph_buffer_ipc_meta_worker,
256+
target_args=(),
257+
)
258+
print(f"get_graph_buffer_ipc_meta test for world_size={world_size}: OK")

0 commit comments

Comments (0)