@@ -172,7 +172,6 @@ def __init__(
172172 kv_cache_dtype = "auto"
173173 block_size = 16
174174 calculate_kv_scales = False
175- self .block_size = block_size
176175 self .kv_cache_torch_dtype = kv_cache_dtype_str_to_dtype (
177176 kv_cache_dtype , vllm_config .model_config
178177 )
@@ -409,22 +408,24 @@ def get_attn_backend(self) -> type[AttentionBackend]:
409408 return self .attn_backend
410409
411410 def get_kv_cache_spec (self , vllm_config : VllmConfig ) -> KVCacheSpec :
411+ # Block size may get updated after model loading, refresh it
412+ block_size = vllm_config .cache_config .block_size
412413 # Should not be called for enc-dec or encoder-only attention.
413414 assert self .attn_type == AttentionType .DECODER
414415 if self .sliding_window is not None :
415416 assert not vllm_config .model_config .use_mla , (
416417 "MLA is not supported for slidingwindow"
417418 )
418419 return SlidingWindowSpec (
419- block_size = self . block_size ,
420+ block_size = block_size ,
420421 num_kv_heads = self .num_kv_heads ,
421422 head_size = self .head_size ,
422423 dtype = self .kv_cache_torch_dtype ,
423424 sliding_window = self .sliding_window ,
424425 )
425426 else :
426427 return FullAttentionSpec (
427- block_size = self . block_size ,
428+ block_size = block_size ,
428429 num_kv_heads = self .num_kv_heads ,
429430 head_size = self .head_size ,
430431 dtype = self .kv_cache_torch_dtype ,
@@ -624,7 +625,6 @@ def __init__(
624625 block_size = 16
625626 calculate_kv_scales = False
626627 self .kv_cache_dtype = kv_cache_dtype
627- self .block_size = block_size
628628
629629 dtype = torch .get_default_dtype ()
630630 self .attn_backend = get_attn_backend (
@@ -791,7 +791,7 @@ def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
791791 self .kv_cache_dtype , vllm_config .model_config .dtype
792792 )
793793 return MLAAttentionSpec (
794- block_size = self .block_size ,
794+ block_size = vllm_config . cache_config .block_size ,
795795 num_kv_heads = 1 ,
796796 head_size = self .head_size ,
797797 dtype = kv_cache_dtype ,