
Commit b615111

refresh block_size
Signed-off-by: NickLucche <nlucches@redhat.com>
1 parent b409658 commit b615111

File tree

1 file changed (+5, -5 lines)

vllm/attention/layer.py

Lines changed: 5 additions & 5 deletions
@@ -172,7 +172,6 @@ def __init__(
         kv_cache_dtype = "auto"
         block_size = 16
         calculate_kv_scales = False
-        self.block_size = block_size
         self.kv_cache_torch_dtype = kv_cache_dtype_str_to_dtype(
             kv_cache_dtype, vllm_config.model_config
         )
@@ -409,22 +408,24 @@ def get_attn_backend(self) -> type[AttentionBackend]:
         return self.attn_backend
 
     def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
+        # Block size may get updated after model loading, refresh it
+        block_size = vllm_config.cache_config.block_size
         # Should not be called for enc-dec or encoder-only attention.
         assert self.attn_type == AttentionType.DECODER
         if self.sliding_window is not None:
             assert not vllm_config.model_config.use_mla, (
                 "MLA is not supported for slidingwindow"
             )
             return SlidingWindowSpec(
-                block_size=self.block_size,
+                block_size=block_size,
                 num_kv_heads=self.num_kv_heads,
                 head_size=self.head_size,
                 dtype=self.kv_cache_torch_dtype,
                 sliding_window=self.sliding_window,
             )
         else:
             return FullAttentionSpec(
-                block_size=self.block_size,
+                block_size=block_size,
                 num_kv_heads=self.num_kv_heads,
                 head_size=self.head_size,
                 dtype=self.kv_cache_torch_dtype,
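
Why this hunk reads the config at call time: cache_config.block_size can be rewritten after the Attention layer is constructed (for example once the attention backend is chosen during model loading), so a value snapshotted in __init__ could be stale by the time get_kv_cache_spec() runs. Below is a minimal, self-contained sketch of the two patterns; CacheConfig, StaleLayer, and FreshLayer are hypothetical stand-ins for illustration, not vLLM's real classes.

from dataclasses import dataclass


@dataclass
class CacheConfig:  # hypothetical stand-in for vllm_config.cache_config
    block_size: int = 16


class StaleLayer:
    def __init__(self, cache_config: CacheConfig) -> None:
        # Snapshot taken at construction time (the pattern this commit removes).
        self.block_size = cache_config.block_size

    def spec_block_size(self, cache_config: CacheConfig) -> int:
        return self.block_size  # ignores any later config update


class FreshLayer:
    def spec_block_size(self, cache_config: CacheConfig) -> int:
        # Re-read the live config, as get_kv_cache_spec() now does.
        return cache_config.block_size


cfg = CacheConfig()
stale, fresh = StaleLayer(cfg), FreshLayer()
cfg.block_size = 128  # e.g. adjusted after model loading
assert stale.spec_block_size(cfg) == 16   # stale snapshot
assert fresh.spec_block_size(cfg) == 128  # refreshed value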
@@ -624,7 +625,6 @@ def __init__(
         block_size = 16
         calculate_kv_scales = False
         self.kv_cache_dtype = kv_cache_dtype
-        self.block_size = block_size
 
         dtype = torch.get_default_dtype()
         self.attn_backend = get_attn_backend(
@@ -791,7 +791,7 @@ def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
             self.kv_cache_dtype, vllm_config.model_config.dtype
         )
         return MLAAttentionSpec(
-            block_size=self.block_size,
+            block_size=vllm_config.cache_config.block_size,
             num_kv_heads=1,
             head_size=self.head_size,
             dtype=kv_cache_dtype,
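
The MLA hunk inlines the same fix, passing vllm_config.cache_config.block_size straight into the spec. Getting this value right matters because the KV-cache manager sizes its pages from the spec, roughly 2 (keys and values) * block_size * num_kv_heads * head_size * bytes-per-element. A back-of-the-envelope sketch under that assumption; page_size_bytes is a hypothetical helper, not vLLM's actual KVCacheSpec API.

import torch


def page_size_bytes(
    block_size: int, num_kv_heads: int, head_size: int, dtype: torch.dtype
) -> int:
    # One KV-cache page holds keys and values for `block_size` tokens.
    elem_bytes = torch.tensor([], dtype=dtype).element_size()
    kv_factor = 2  # K and V planes
    return kv_factor * block_size * num_kv_heads * head_size * elem_bytes


# A stale block_size of 16 vs a refreshed 32 would make the layer's spec
# report half the page size the allocator actually uses.
assert page_size_bytes(32, 8, 128, torch.float16) == 2 * page_size_bytes(
    16, 8, 128, torch.float16
)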
