v1/spec_decode/eagle: make dummy_run CUDA graph gating robust with cudagraphs_enabled

xiaohajiayou · xiaohajiayou · commit 6eacbb5baabb · 2025-10-24T10:50:15.000+08:00
Signed-off-by: xiaohajiayou <923390377@qq.com>
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
@@ -1050,21 +1050,19 @@ def dummy_run(
         num_tokens: int,
         use_cudagraphs=True,
     ) -> None:
-        if (
-            use_cudagraphs
-            and self.use_cuda_graph
-            and num_tokens <= self.cudagraph_batch_sizes[-1]
-        ):
+        # Determine if CUDA graphs should be used for this run.
+        cudagraphs_enabled = (
+            use_cudagraphs and self.use_cuda_graph and bool(self.cudagraph_batch_sizes)
+        )
+        if cudagraphs_enabled and num_tokens <= self.cudagraph_batch_sizes[-1]:
             num_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
 
         with set_forward_context(
             None,
             self.vllm_config,
             num_tokens=num_tokens,
             cudagraph_runtime_mode=(
-                CUDAGraphMode.PIECEWISE
-                if (use_cudagraphs and self.use_cuda_graph)
-                else CUDAGraphMode.NONE
+                CUDAGraphMode.PIECEWISE if cudagraphs_enabled else CUDAGraphMode.NONE
             ),
         ):
             if self.supports_mm_inputs: