
Commit 10fabbb (1 parent: 86e414c)

revert some changes

Signed-off-by: lizhiyuan <uniartisan2017@gmail.com>

2 files changed: +10, -11


tests/v1/worker/test_gpu_model_runner.py (10 additions, 10 deletions)
@@ -822,21 +822,21 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
     assert attn_shape[0] % num_blocks == 0
     block_split_ratio = attn_shape[0] // num_blocks
 
-    # Use small blocks for testing to avoid memory issues
+    # use small blocks for testing to avoid memory issues
     test_block_size = min(2, len(blocks0), len(blocks1))
 
-    # Use non-overlapping blocks to avoid data contamination
-    # Split physical blocks: first half for attention, second half for mamba
+    # use non-overlapping blocks to avoid data contamination
+    # Split kernel blocks: first half for attention, second half for mamba
     mid_point = num_blocks // 2
 
-    # Attention uses physical blocks from first half (mapped to logical blocks)
+    # attention uses kernel blocks from first half (mapped to logical blocks)
     kv_blocks_for_attention = np.array([0, 1])[:test_block_size]
 
-    # Mamba uses physical blocks from second half
+    # mamba uses kernel blocks from second half
     kv_blocks_for_mamba = np.array([mid_point, mid_point + 1])[:test_block_size]
 
-    # Create small constant tensors for testing with corrected shapes
-    # Attention: [block_size, ...] starting from dimension 2
+    # create small constant tensors for testing with corrected shapes
+    # attention: [block_size, ...] starting from dimension 2
     attn_constant_shape = attn_shape[2:]
     conv_constant_shape = conv_shape[1:]
     ssm_constant_shape = ssm_shape[1:]
@@ -859,14 +859,14 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
         for i, kernel_block in enumerate(kernel_blocks_for_attention):
             vllm_ctx[layer].kv_cache[0][kernel_block, :] = attn_blocks_constant[i]
 
-    # Fill mamba blocks with constants using physical block indices
+    # fill mamba blocks with constants using kernel block indices
     for layer in [layer_2, layer_3, layer_4, layer_5]:
-        # mamba: kv_cache[0][component][physical_block_idx, ...]
+        # mamba: kv_cache[0][component][kernel_block_idx, ...]
         for i, kv_block in enumerate(kv_blocks_for_mamba):
             vllm_ctx[layer].kv_cache[0][0][kv_block, :] = conv_blocks_constant[i]
             vllm_ctx[layer].kv_cache[0][1][kv_block, :] = ssm_blocks_constant[i]
 
-    # Verify attention and mamba contents are correct
+    # verify attention and mamba contents are correct
     for layer in [layer_0, layer_1]:
         for i, kernel_block in enumerate(kernel_blocks_for_attention):
             actual_kv = vllm_ctx[layer].kv_cache[0][kernel_block, :]
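For context, the non-overlapping block split these comments describe can be sketched in isolation. This is a minimal, hypothetical illustration: the array names and shapes below are invented stand-ins, not vLLM's actual KV-cache layout; only the index arithmetic mirrors the test.

import numpy as np

# Hypothetical stand-in caches; shapes are illustrative only.
num_blocks = 8
attn_cache = np.zeros((num_blocks, 16, 4))  # [block, block_size, head_dim]
conv_cache = np.zeros((num_blocks, 3))      # [block, conv_state]
ssm_cache = np.zeros((num_blocks, 5))       # [block, ssm_state]

test_block_size = 2
mid_point = num_blocks // 2  # first half -> attention, second half -> mamba

kv_blocks_for_attention = np.array([0, 1])[:test_block_size]
kv_blocks_for_mamba = np.array([mid_point, mid_point + 1])[:test_block_size]

# Fill each region with a distinct constant, as the test does.
for i, blk in enumerate(kv_blocks_for_attention):
    attn_cache[blk, :] = float(i + 1)
for i, blk in enumerate(kv_blocks_for_mamba):
    conv_cache[blk, :] = float(10 + i)
    ssm_cache[blk, :] = float(20 + i)

# The halves are disjoint, so neither fill can contaminate the other.
assert not set(kv_blocks_for_attention.tolist()) & set(kv_blocks_for_mamba.tolist())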

vllm/v1/worker/gpu_model_runner.py (0 additions, 1 deletion)
@@ -4459,7 +4459,6 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         self.may_add_encoder_only_layers_to_kv_cache_config()
         self.maybe_add_kv_sharing_layers_to_kv_cache_groups(kv_cache_config)
         self.initialize_attn_backend(kv_cache_config)
-
         # Reinitialize need to after initialize_attn_backend
         self.may_reinitialize_input_batch(kv_cache_config)
         kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
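The unchanged context above carries an ordering constraint: may_reinitialize_input_batch must run only after initialize_attn_backend. A minimal sketch of that dependency, using hypothetical stand-in methods rather than the real GPUModelRunner:

# Hypothetical stand-ins; the real methods live on vLLM's GPUModelRunner.
class RunnerSketch:
    def __init__(self) -> None:
        self.attn_backend = None

    def initialize_attn_backend(self, kv_cache_config: dict) -> None:
        self.attn_backend = object()  # placeholder for real backend setup

    def may_reinitialize_input_batch(self, kv_cache_config: dict) -> None:
        # Rebuilding the input batch assumes the backend already exists,
        # hence the "after initialize_attn_backend" comment in the diff.
        assert self.attn_backend is not None, "initialize_attn_backend first"


runner = RunnerSketch()
runner.initialize_attn_backend({})
runner.may_reinitialize_input_batch({})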
