
Commit 9c6c81d

qqma committed:

get rid of _DEFAULT_MAX_NUM_SPLITS_FOR_CUDA_GRAPH and extend to non-MLA flash attention

Signed-off-by: qqma <qqma@amazon.com>

1 parent: 982937a
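
With the module-level default gone, both call sites rely on envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH always resolving to an integer, so the fallback has to live with the env-var definition itself rather than in each backend. The snippet below is only a sketch of the pattern vllm's envs module uses for such definitions; the actual envs.py hunk is not part of this commit, and the default of 16 is simply carried over from the constant being deleted here.

# Sketch only (not part of this diff): vllm/envs.py maps variable names to
# parser lambdas, roughly like this. The real definition and its default live
# in envs.py; 16 below merely mirrors the removed constant.
import os
from typing import Any, Callable

environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH":
    lambda: int(
        os.getenv("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH", "16")),
}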

File tree

2 files changed: +5 -17 lines changed

vllm/v1/attention/backends/flash_attn.py

Lines changed: 3 additions & 4 deletions
@@ -7,6 +7,7 @@
 import numpy as np
 import torch
 
+from vllm import envs
 from vllm import _custom_ops as ops
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata, AttentionType,
@@ -33,9 +34,6 @@
 
 logger = init_logger(__name__)
 
-# NOTE(woosuk): This is an arbitrary number. Tune it if needed.
-_DEFAULT_MAX_NUM_SPLITS_FOR_CUDA_GRAPH = 16
-
 
 class FlashAttentionBackend(AttentionBackend):
 
@@ -215,7 +213,8 @@ def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
             # When using cuda graph, we need to set the upper bound of the
             # number of splits so that large enough intermediate buffers are
             # pre-allocated during capture.
-            self.max_num_splits = _DEFAULT_MAX_NUM_SPLITS_FOR_CUDA_GRAPH
+            self.max_num_splits = (
+                envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH)
 
         # Sliding window size to be used with the AOT scheduler will be
         # populated on first build() call.
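
The upper bound matters because, under full CUDA graph capture, the split-KV kernel's intermediate buffers cannot grow at replay time; as the comment in the hunk above says, they must be pre-allocated large enough during capture. A rough sketch of the idea, with assumed buffer names and shapes rather than vllm's actual allocations:

# Illustration only; names, shapes, and dtypes are assumptions, not vllm's
# real buffers. Split-KV flash attention writes one partial output and one
# log-sum-exp per split and reduces them afterwards, so a CUDA-graph-safe
# implementation allocates for the worst-case split count up front.
import torch


def preallocate_split_buffers(max_num_splits: int, max_num_tokens: int,
                              num_heads: int, head_dim: int,
                              device: torch.device):
    # One slice per potential split; captured graphs replay into these
    # fixed-size buffers no matter how many splits a given batch ends up using.
    out_partial = torch.empty(max_num_splits, max_num_tokens, num_heads,
                              head_dim, dtype=torch.float32, device=device)
    lse_partial = torch.empty(max_num_splits, max_num_tokens, num_heads,
                              dtype=torch.float32, device=device)
    return out_partial, lse_partial

Raising VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH therefore trades extra captured memory for more split parallelism (useful for long-context, small-batch decode); lowering it does the opposite.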

vllm/v1/attention/backends/mla/flashattn_mla.py

Lines changed: 2 additions & 13 deletions
@@ -25,10 +25,6 @@
 
 logger = init_logger(__name__)
 
-# NOTE(matt): This is an arbitrary number, copied from
-# woosuk's implementation in standard FlashAttention backend
-_DEFAULT_MAX_NUM_SPLITS_FOR_CUDA_GRAPH = 16
-
 
 class FlashAttnMLABackend(MLACommonBackend):
 
@@ -98,15 +94,8 @@ def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
             # When using cuda graph, we need to set the upper bound of the
             # number of splits so that large enough intermediate buffers are
             # pre-allocated during capture.
-            if envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH is not None:
-                logger.info_once(
-                    "Getting flash attention max num splits for "
-                    "cuda graph from environment variable, value=%s",
-                    envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH)
-                self.max_num_splits = (
-                    envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH)
-            else:
-                self.max_num_splits = _DEFAULT_MAX_NUM_SPLITS_FOR_CUDA_GRAPH
+            self.max_num_splits = (
+                envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH)
 
         # TODO(lucas): Until we add support for the DCP custom masking we need
         # to restrict decodes to q_len == 1 when DCP is enabled.
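
With the None check removed, the MLA and non-MLA builders now read the same knob unconditionally, and the info_once log line disappears with it. A usage sketch, under the assumption that vllm's envs module resolves attributes lazily from the process environment on access:

# Usage sketch; assumes the value is read from os.environ when the attribute
# is accessed, so the override must be set before vllm builds its attention
# metadata builders.
import os

os.environ["VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH"] = "32"

from vllm import envs

# Both builders touched by this commit now pick this value up directly
# whenever full CUDA graphs are enabled.
print(envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH)  # expected: 32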
