Skip to content

Commit 6f0f570

Browse files
authored
[deepseek] kernel block size for UniformTypeKVCacheSpecs (#26559)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
1 parent b545a0b commit 6f0f570

File tree

2 files changed

+19
-6
lines changed

2 files changed

+19
-6
lines changed

vllm/v1/attention/backends/mla/indexer.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
# SPDX-License-Identifier: Apache-2.0
22
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
33
from dataclasses import dataclass
4-
from typing import ClassVar, Optional
4+
from typing import ClassVar, Optional, Union
55

66
import torch
77

8-
from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
8+
from vllm.attention.backends.abstract import (
9+
AttentionBackend,
10+
AttentionMetadata,
11+
MultipleOf,
12+
)
913
from vllm.config import VllmConfig
1014
from vllm.logger import init_logger
1115
from vllm.utils.deep_gemm import get_paged_mqa_logits_metadata
@@ -47,6 +51,10 @@ def get_kv_cache_shape(
4751
def get_kv_cache_stride_order() -> tuple[int, ...]:
4852
return (0, 1, 2)
4953

54+
@classmethod
55+
def get_supported_kernel_block_size(cls) -> list[Union[int, MultipleOf]]:
56+
return [64]
57+
5058

5159
@dataclass
5260
class DeepseekV32IndexerPrefillChunkMetadata:

vllm/v1/worker/gpu_model_runner.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4242,9 +4242,14 @@ def _prepare_kernel_block_sizes(self, kv_cache_config: KVCacheConfig) -> list[in
42424242
for kv_cache_group_id, kv_cache_group in enumerate(
42434243
kv_cache_config.kv_cache_groups
42444244
):
4245-
if isinstance(kv_cache_group.kv_cache_spec, EncoderOnlyAttentionSpec):
4245+
kv_cache_spec = kv_cache_group.kv_cache_spec
4246+
if isinstance(kv_cache_spec, UniformTypeKVCacheSpecs):
4247+
# All layers in the UniformTypeKVCacheSpecs have the same type.
4248+
# Pick an arbitrary one to dispatch.
4249+
kv_cache_spec = next(iter(kv_cache_spec.kv_cache_specs.values()))
4250+
if isinstance(kv_cache_spec, EncoderOnlyAttentionSpec):
42464251
continue
4247-
elif isinstance(kv_cache_group.kv_cache_spec, AttentionSpec):
4252+
elif isinstance(kv_cache_spec, AttentionSpec):
42484253
# This is an attention backend that supports virtual
42494254
# block splitting. Get the supported block sizes from
42504255
# all backends in the group.
@@ -4254,10 +4259,10 @@ def _prepare_kernel_block_sizes(self, kv_cache_config: KVCacheConfig) -> list[in
42544259
kv_manager_block_size, attn_groups
42554260
)
42564261
kernel_block_sizes.append(selected_kernel_size)
4257-
elif isinstance(kv_cache_group.kv_cache_spec, MambaSpec):
4262+
elif isinstance(kv_cache_spec, MambaSpec):
42584263
# This is likely Mamba or other non-attention cache,
42594264
# no splitting.
4260-
kernel_block_sizes.append(kv_cache_group.kv_cache_spec.block_size)
4265+
kernel_block_sizes.append(kv_cache_spec.block_size)
42614266
else:
42624267
raise NotImplementedError(
42634268
f"unknown kv cache spec {kv_cache_group.kv_cache_spec}"

0 commit comments

Comments
 (0)