From 410c452db835f22b5a545b8fb2066ab7ca1677f9 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin Date: Wed, 4 Jun 2025 11:28:27 -0700 Subject: [PATCH 1/7] Fix TPU kv sharing tests Signed-off-by: Yong Hoon Shin --- tests/v1/tpu/worker/test_tpu_model_runner.py | 27 ++++++++++++-------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index e351f0e92525..26c8618158a3 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -371,11 +371,14 @@ def test_get_req_paddings(): @pytest.mark.skip(reason="Test is broken on TPU when it's added.") -def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(): +def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order( + model_runner): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" error_msg = f"{layer_1} must come before the current layer" - with pytest.raises(ValueError, match=error_msg): + vllm_config = model_runner.vllm_config + with pytest.raises(ValueError, match=error_msg), \ + set_current_vllm_config(vllm_config): fwd_context = { # initialization below will fail because target layer is invalid; # the target layer needs to come before layer 1 @@ -400,12 +403,14 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(): @pytest.mark.skip(reason="Test is broken on TPU when it's added.") -def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(): +def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(model_runner): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" invalid_layer = "model.layers.0.cross_attn.attn" error_msg = f"{invalid_layer} is not a valid Attention layer in the model" - with pytest.raises(ValueError, match=error_msg): + vllm_config = model_runner.vllm_config + with pytest.raises(ValueError, match=error_msg), \ + set_current_vllm_config(vllm_config): fwd_context = { layer_0: Attention( @@ -429,11 +434,13 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(): @pytest.mark.skip(reason="Test is broken on TPU when it's added.") -def test_init_kv_cache_with_kv_sharing_target_same_as_current(): +def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" error_msg = f"{layer_1} cannot be the same as the current layer" - with pytest.raises(ValueError, match=error_msg): + vllm_config = model_runner.vllm_config + with pytest.raises(ValueError, match=error_msg), \ + set_current_vllm_config(vllm_config): fwd_context = { # initialization below will fail because target layer is invalid; # the target layer needs to come before layer 1 @@ -488,7 +495,7 @@ def test_init_kv_cache_without_kv_sharing(model_runner): assert len(kv_cache_spec) == 2 assert len(model_runner.shared_kv_cache_layers) == 0 - available_memory = 20 * GiB_bytes + available_memory = 40 * GiB_bytes # page size for layer 0's kv_cache_spec is 32KB num_expected_blocks = 327680 # 20GB / 32KB / 2 (num layers) kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, @@ -499,7 +506,7 @@ def test_init_kv_cache_without_kv_sharing(model_runner): assert kv_cache_config.tensors[layer_1].size == available_memory // 2 max_context_len =\ - estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) + estimate_max_model_len(vllm_config, kv_cache_spec, 10 * GiB_bytes) # max context len with KV sharing should 
be 2x as large as without assert max_context_len == 1310720 @@ -557,7 +564,7 @@ def test_init_kv_cache_with_kv_sharing_valid(model_runner): assert layer_0 in kv_cache_spec assert model_runner.shared_kv_cache_layers[layer_1] == layer_0 - available_memory = 20 * GiB_bytes + available_memory = 40 * GiB_bytes # page size for layer 0's kv_cache_spec is 32KB # with KV sharing, we can allocate (available_mem//page_size//1) blocks # which is twice as many as without KV sharing @@ -571,7 +578,7 @@ def test_init_kv_cache_with_kv_sharing_valid(model_runner): assert kv_cache_config.tensors[layer_0].size == available_memory max_context_len =\ - estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) + estimate_max_model_len(vllm_config, kv_cache_spec, 10 * GiB_bytes) # max context len with KV sharing should be 2x as large as without assert max_context_len == 2 * 1310720 From dfd8276a0282e97848da8b1adc539e00238cbed7 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin Date: Wed, 4 Jun 2025 17:06:38 -0700 Subject: [PATCH 2/7] Fix more tests Signed-off-by: Yong Hoon Shin --- tests/v1/tpu/worker/test_tpu_model_runner.py | 43 ++++++++++++-------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 26c8618158a3..8b4e53e3d2e0 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -31,10 +31,7 @@ "vllm.v1.worker.tpu_model_runner.PallasAttentionBackend", ) pallas_attention_backend_patcher.start() - -@pytest.fixture -def model_runner(): - # Patchers have already been started at module level. +def get_vllm_config(): scheduler_config = SchedulerConfig( max_num_seqs=10, max_num_batched_tokens=512, @@ -60,12 +57,20 @@ def model_runner(): cache_config=cache_config, scheduler_config=scheduler_config, ) + +def get_model_runner(vllm_config): device = "xla:0" # Mocking TPU device with mock.patch("vllm.v1.worker.tpu_model_runner.torch"), \ mock.patch("vllm.v1.worker.tpu_model_runner.xm"), \ mock.patch("vllm.v1.worker.tpu_model_runner.xr"): return TPUModelRunner(vllm_config, device) +@pytest.fixture +def model_runner(): + # Patchers have already been started at module level. 
+ vllm_config = get_vllm_config() + return get_model_runner(vllm_config) + @pytest.fixture(autouse=True, scope="session") def cleanup_patches(): @@ -465,10 +470,10 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner): @pytest.mark.skip(reason="Test is broken on TPU when it's added.") -def test_init_kv_cache_without_kv_sharing(model_runner): +def test_init_kv_cache_without_kv_sharing(): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" - vllm_config = model_runner.vllm_config + vllm_config = get_vllm_config() with set_current_vllm_config(vllm_config): fwd_context = { layer_0: @@ -491,13 +496,14 @@ def test_init_kv_cache_without_kv_sharing(model_runner): # Set high context length to test max context length estimation vllm_config.model_config.max_model_len = 3_000_000 vllm_ctx = vllm_config.compilation_config.static_forward_context + model_runner = get_model_runner(vllm_config) kv_cache_spec = model_runner.get_kv_cache_spec() assert len(kv_cache_spec) == 2 assert len(model_runner.shared_kv_cache_layers) == 0 - available_memory = 40 * GiB_bytes - # page size for layer 0's kv_cache_spec is 32KB - num_expected_blocks = 327680 # 20GB / 32KB / 2 (num layers) + available_memory = 20 * GiB_bytes + # page size for layer 0's kv_cache_spec is 64KB + num_expected_blocks = 163840 # 20GB / 64KB / 2 (num layers) kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, available_memory) assert kv_cache_config.num_blocks == num_expected_blocks @@ -506,9 +512,9 @@ def test_init_kv_cache_without_kv_sharing(model_runner): assert kv_cache_config.tensors[layer_1].size == available_memory // 2 max_context_len =\ - estimate_max_model_len(vllm_config, kv_cache_spec, 10 * GiB_bytes) + estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) # max context len with KV sharing should be 2x as large as without - assert max_context_len == 1310720 + assert max_context_len == 655360 # important: override tensor size to prevent large mem alloc during test # this will only allocate 2 block worth of memory (2 * 32kb) @@ -532,10 +538,10 @@ def test_init_kv_cache_without_kv_sharing(model_runner): @pytest.mark.skip(reason="Test is broken on TPU when it's added.") -def test_init_kv_cache_with_kv_sharing_valid(model_runner): +def test_init_kv_cache_with_kv_sharing_valid(): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" - vllm_config = model_runner.vllm_config + vllm_config = get_vllm_config() with set_current_vllm_config(vllm_config): fwd_context = { layer_0: @@ -559,16 +565,17 @@ def test_init_kv_cache_with_kv_sharing_valid(model_runner): # Set high context length to test max context length estimation vllm_config.model_config.max_model_len = 3_000_000 vllm_ctx = vllm_config.compilation_config.static_forward_context + model_runner = get_model_runner(vllm_config) kv_cache_spec = model_runner.get_kv_cache_spec() assert len(kv_cache_spec) == 1 assert layer_0 in kv_cache_spec assert model_runner.shared_kv_cache_layers[layer_1] == layer_0 - available_memory = 40 * GiB_bytes - # page size for layer 0's kv_cache_spec is 32KB + available_memory = 20 * GiB_bytes + # page size for layer 0's kv_cache_spec is 64KB # with KV sharing, we can allocate (available_mem//page_size//1) blocks # which is twice as many as without KV sharing - num_expected_blocks = 655360 # 20GB / 32KB + num_expected_blocks = 327680 # 20GB / 64KB kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, available_memory) assert 
kv_cache_config.num_blocks == num_expected_blocks @@ -578,9 +585,9 @@ def test_init_kv_cache_with_kv_sharing_valid(model_runner): assert kv_cache_config.tensors[layer_0].size == available_memory max_context_len =\ - estimate_max_model_len(vllm_config, kv_cache_spec, 10 * GiB_bytes) + estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) # max context len with KV sharing should be 2x as large as without - assert max_context_len == 2 * 1310720 + assert max_context_len == 2 * 655360 # important: override tensor size to prevent large mem alloc during test # this will only allocate 1 block worth of memory (32kb) From 50deb499a03a3c60b2ab5a4c1f721b885a875a63 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin Date: Wed, 4 Jun 2025 18:03:00 -0700 Subject: [PATCH 3/7] fix lint Signed-off-by: Yong Hoon Shin --- tests/v1/tpu/worker/test_tpu_model_runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 8b4e53e3d2e0..94540429c390 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -31,6 +31,7 @@ "vllm.v1.worker.tpu_model_runner.PallasAttentionBackend", ) pallas_attention_backend_patcher.start() + def get_vllm_config(): scheduler_config = SchedulerConfig( max_num_seqs=10, @@ -57,6 +58,8 @@ def get_vllm_config(): cache_config=cache_config, scheduler_config=scheduler_config, ) + return vllm_config + def get_model_runner(vllm_config): device = "xla:0" # Mocking TPU device @@ -65,6 +68,7 @@ def get_model_runner(vllm_config): mock.patch("vllm.v1.worker.tpu_model_runner.xr"): return TPUModelRunner(vllm_config, device) + @pytest.fixture def model_runner(): # Patchers have already been started at module level. 
From a7b2c321eb6a5a57cd938571504b888eaf098ed6 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin Date: Wed, 4 Jun 2025 19:41:04 -0700 Subject: [PATCH 4/7] Do not skip kv sharing tpu tests Signed-off-by: Yong Hoon Shin --- tests/v1/tpu/worker/test_tpu_model_runner.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 94540429c390..3588f35872b9 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -379,7 +379,6 @@ def test_get_req_paddings(): assert _get_req_paddings(8, 36) == [8, 16, 32, 36] -@pytest.mark.skip(reason="Test is broken on TPU when it's added.") def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order( model_runner): layer_0 = "model.layers.0.self_attn.attn" @@ -411,7 +410,6 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order( assert fwd_context is not None -@pytest.mark.skip(reason="Test is broken on TPU when it's added.") def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(model_runner): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" @@ -442,7 +440,6 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(model_runner): assert fwd_context is not None -@pytest.mark.skip(reason="Test is broken on TPU when it's added.") def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" @@ -473,7 +470,6 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner): assert fwd_context is not None -@pytest.mark.skip(reason="Test is broken on TPU when it's added.") def test_init_kv_cache_without_kv_sharing(): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" @@ -541,7 +537,6 @@ def test_init_kv_cache_without_kv_sharing(): assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1 -@pytest.mark.skip(reason="Test is broken on TPU when it's added.") def test_init_kv_cache_with_kv_sharing_valid(): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" From 65fc127c84dc0ebbd5a1f7ff9d1ce3af4d2f01cb Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin Date: Thu, 5 Jun 2025 17:53:32 -0700 Subject: [PATCH 5/7] fix test (tpu worker uses attn dtype instead of kv dtype) Signed-off-by: Yong Hoon Shin --- tests/v1/tpu/worker/test_tpu_model_runner.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 3588f35872b9..9d3e3e6330ee 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -494,7 +494,7 @@ def test_init_kv_cache_without_kv_sharing(): # suppress var not used error assert fwd_context is not None # Set high context length to test max context length estimation - vllm_config.model_config.max_model_len = 3_000_000 + vllm_config.model_config.max_model_len = 1_000_000 vllm_ctx = vllm_config.compilation_config.static_forward_context model_runner = get_model_runner(vllm_config) kv_cache_spec = model_runner.get_kv_cache_spec() @@ -502,8 +502,8 @@ def test_init_kv_cache_without_kv_sharing(): assert len(model_runner.shared_kv_cache_layers) == 0 available_memory = 20 * GiB_bytes - # page size for layer 0's kv_cache_spec is 64KB - num_expected_blocks = 163840 # 20GB / 64KB / 2 (num layers) + # page size for layer 0's 
kv_cache_spec is 128KB + num_expected_blocks = 81920 # 20GB / 128KB / 2 (num layers) kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, available_memory) assert kv_cache_config.num_blocks == num_expected_blocks @@ -514,7 +514,7 @@ def test_init_kv_cache_without_kv_sharing(): max_context_len =\ estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) # max context len with KV sharing should be 2x as large as without - assert max_context_len == 655360 + assert max_context_len == 327680 # important: override tensor size to prevent large mem alloc during test # this will only allocate 2 block worth of memory (2 * 32kb) @@ -562,7 +562,7 @@ def test_init_kv_cache_with_kv_sharing_valid(): # suppress var not used error assert fwd_context is not None # Set high context length to test max context length estimation - vllm_config.model_config.max_model_len = 3_000_000 + vllm_config.model_config.max_model_len = 1_000_000 vllm_ctx = vllm_config.compilation_config.static_forward_context model_runner = get_model_runner(vllm_config) kv_cache_spec = model_runner.get_kv_cache_spec() @@ -571,10 +571,10 @@ def test_init_kv_cache_with_kv_sharing_valid(): assert model_runner.shared_kv_cache_layers[layer_1] == layer_0 available_memory = 20 * GiB_bytes - # page size for layer 0's kv_cache_spec is 64KB + # page size for layer 0's kv_cache_spec is 128KB # with KV sharing, we can allocate (available_mem//page_size//1) blocks # which is twice as many as without KV sharing - num_expected_blocks = 327680 # 20GB / 64KB + num_expected_blocks = 163840 # 20GB / 128KB kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, available_memory) assert kv_cache_config.num_blocks == num_expected_blocks @@ -586,7 +586,7 @@ def test_init_kv_cache_with_kv_sharing_valid(): max_context_len =\ estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) # max context len with KV sharing should be 2x as large as without - assert max_context_len == 2 * 655360 + assert max_context_len == 2 * 327680 # important: override tensor size to prevent large mem alloc during test # this will only allocate 1 block worth of memory (32kb) From 9107b51739f320baa65edd222210a534184590fd Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin Date: Thu, 5 Jun 2025 22:25:18 -0700 Subject: [PATCH 6/7] fix tpu test with bfloat16 and page_size=128 Signed-off-by: Yong Hoon Shin --- tests/v1/tpu/worker/test_tpu_model_runner.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 9d3e3e6330ee..859803adabcd 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -502,8 +502,10 @@ def test_init_kv_cache_without_kv_sharing(): assert len(model_runner.shared_kv_cache_layers) == 0 available_memory = 20 * GiB_bytes - # page size for layer 0's kv_cache_spec is 128KB - num_expected_blocks = 81920 # 20GB / 128KB / 2 (num layers) + # page size for each layer KV can be calculated as + # 2 (non-MLA) * 8 (num_heads) * 128 (head_dim) + # * 2 (bfloat16, kv_cache dtype) * 128 (block_size) = 512KB + num_expected_blocks = 20480 # 20GB / 512KB / 2 (num layers) kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, available_memory) assert kv_cache_config.num_blocks == num_expected_blocks @@ -514,7 +516,7 @@ def test_init_kv_cache_without_kv_sharing(): max_context_len =\ estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) # max context len with KV 
sharing should be 2x as large as without - assert max_context_len == 327680 + assert max_context_len == 81920 # important: override tensor size to prevent large mem alloc during test # this will only allocate 2 block worth of memory (2 * 32kb) @@ -571,10 +573,10 @@ def test_init_kv_cache_with_kv_sharing_valid(): assert model_runner.shared_kv_cache_layers[layer_1] == layer_0 available_memory = 20 * GiB_bytes - # page size for layer 0's kv_cache_spec is 128KB + # page size for layer 0's kv_cache_spec is 512KB # with KV sharing, we can allocate (available_mem//page_size//1) blocks # which is twice as many as without KV sharing - num_expected_blocks = 163840 # 20GB / 128KB + num_expected_blocks = 2 * 20480 # 20GB / 512KB kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, available_memory) assert kv_cache_config.num_blocks == num_expected_blocks @@ -586,7 +588,7 @@ def test_init_kv_cache_with_kv_sharing_valid(): max_context_len =\ estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) # max context len with KV sharing should be 2x as large as without - assert max_context_len == 2 * 327680 + assert max_context_len == 2 * 81920 # important: override tensor size to prevent large mem alloc during test # this will only allocate 1 block worth of memory (32kb) From ef53a4954a8df370ee396563ed3cf572d3d4b7fb Mon Sep 17 00:00:00 2001 From: Siyuan Liu Date: Fri, 6 Jun 2025 23:05:54 +0000 Subject: [PATCH 7/7] fix tests Signed-off-by: Siyuan Liu clean up Signed-off-by: Siyuan Liu clean up Signed-off-by: Siyuan Liu --- tests/v1/tpu/worker/test_tpu_model_runner.py | 57 ++++++-------------- 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 859803adabcd..917c16c3c4cd 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import unittest.mock as mock import pytest @@ -17,20 +16,6 @@ TPUModelRunner, _get_padded_num_reqs_with_upper_limit, _get_padded_token_len, _get_req_paddings, _get_token_paddings) -# Mock torch_xla module since it may not be available in the test environments -torch_xla_patcher = mock.patch.dict( - "sys.modules", { - "torch_xla": mock.MagicMock(), - "torch_xla.core.xla_model": mock.MagicMock(), - "torch_xla.runtime": mock.MagicMock(), - }) -torch_xla_patcher.start() - -# Mock the PallasAttentionBackend -pallas_attention_backend_patcher = mock.patch( - "vllm.v1.worker.tpu_model_runner.PallasAttentionBackend", ) -pallas_attention_backend_patcher.start() - def get_vllm_config(): scheduler_config = SchedulerConfig( @@ -63,10 +48,7 @@ def get_vllm_config(): def get_model_runner(vllm_config): device = "xla:0" # Mocking TPU device - with mock.patch("vllm.v1.worker.tpu_model_runner.torch"), \ - mock.patch("vllm.v1.worker.tpu_model_runner.xm"), \ - mock.patch("vllm.v1.worker.tpu_model_runner.xr"): - return TPUModelRunner(vllm_config, device) + return TPUModelRunner(vllm_config, device) @pytest.fixture @@ -76,13 +58,6 @@ def model_runner(): return get_model_runner(vllm_config) -@pytest.fixture(autouse=True, scope="session") -def cleanup_patches(): - yield - torch_xla_patcher.stop() - pallas_attention_backend_patcher.stop() - - def _schedule_new_request(*req_ids: str) -> SchedulerOutput: new_reqs = [] num_scheduled_tokens = {} @@ -509,21 +484,23 @@ def test_init_kv_cache_without_kv_sharing(): 
kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, available_memory) assert kv_cache_config.num_blocks == num_expected_blocks - assert len(kv_cache_config.tensors) == 2 - assert kv_cache_config.tensors[layer_0].size == available_memory // 2 - assert kv_cache_config.tensors[layer_1].size == available_memory // 2 + assert len(kv_cache_config.kv_cache_tensors) == 2 + assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2 + assert kv_cache_config.kv_cache_tensors[1].size == available_memory // 2 max_context_len =\ estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) # max context len with KV sharing should be 2x as large as without - assert max_context_len == 81920 + # max_context_len = available_memory / (page_size / block_size) / num_caches + # max_context_len = 5GB / (512KB / 128) / 2 = 655360 + assert max_context_len == 655360 # important: override tensor size to prevent large mem alloc during test - # this will only allocate 2 block worth of memory (2 * 32kb) + # this will only allocate 2 block worth of memory (2 * 512kb) kv_cache_config.num_blocks = 1 - for layer in kv_cache_config.tensors: - kv_cache_config.tensors[layer].size =\ - kv_cache_spec[layer].page_size_bytes + for kv_cache_tensor in kv_cache_config.kv_cache_tensors: + kv_cache_tensor.size = ( + kv_cache_spec[kv_cache_tensor.shared_by[0]].page_size_bytes) model_runner.initialize_kv_cache(kv_cache_config) @@ -564,7 +541,7 @@ def test_init_kv_cache_with_kv_sharing_valid(): # suppress var not used error assert fwd_context is not None # Set high context length to test max context length estimation - vllm_config.model_config.max_model_len = 1_000_000 + vllm_config.model_config.max_model_len = 3_000_000 vllm_ctx = vllm_config.compilation_config.static_forward_context model_runner = get_model_runner(vllm_config) kv_cache_spec = model_runner.get_kv_cache_spec() @@ -580,20 +557,20 @@ def test_init_kv_cache_with_kv_sharing_valid(): kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, available_memory) assert kv_cache_config.num_blocks == num_expected_blocks - assert len(kv_cache_config.tensors) == 1 + assert len(kv_cache_config.kv_cache_tensors) == 1 # Each layer now has twice the available memory for KV cache # compared to no KV sharing - assert kv_cache_config.tensors[layer_0].size == available_memory + assert kv_cache_config.kv_cache_tensors[0].size == available_memory max_context_len =\ estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) # max context len with KV sharing should be 2x as large as without - assert max_context_len == 2 * 81920 + assert max_context_len == (2 * 655360) # important: override tensor size to prevent large mem alloc during test - # this will only allocate 1 block worth of memory (32kb) + # this will only allocate 1 block worth of memory (512kb) kv_cache_config.num_blocks = 1 - kv_cache_config.tensors[layer_0].size =\ + kv_cache_config.kv_cache_tensors[0].size =\ kv_cache_spec[layer_0].page_size_bytes model_runner.initialize_kv_cache(kv_cache_config)
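
Note on the asserted numbers: the block counts and max context lengths checked in the final patch follow from the page-size arithmetic quoted in the comments of patches 6 and 7. The snippet below is a standalone sketch of that arithmetic, not part of the patches themselves, using only the values those comments state (non-MLA K+V, 8 KV heads, head_dim 128, bfloat16 cache dtype, block_size 128, 2 attention layers); GiB_bytes is defined locally here for illustration rather than imported from vLLM.

# Per-layer page (block) size, as spelled out in the patch 6 comment:
# 2 (non-MLA: K and V) * 8 (num_heads) * 128 (head_dim) * 2 (bfloat16) * 128 (block_size)
GiB_bytes = 1 << 30
page_size_bytes = 2 * 8 * 128 * 2 * 128
assert page_size_bytes == 512 * 1024  # 512 KB

num_layers = 2
available_memory = 20 * GiB_bytes

# Without KV sharing, each of the two layers needs its own cache.
blocks_no_sharing = available_memory // page_size_bytes // num_layers
assert blocks_no_sharing == 20480  # num_expected_blocks in the no-sharing test

# With KV sharing, layer 1 reuses layer 0's cache, so only one cache is allocated.
blocks_with_sharing = available_memory // page_size_bytes
assert blocks_with_sharing == 2 * 20480

# Max context length estimate, per the patch 7 comment:
# available_memory / (page_size / block_size) / num_caches
block_size = 128
bytes_per_token_per_cache = page_size_bytes // block_size  # 4 KB per token per cache
max_len_no_sharing = 5 * GiB_bytes // bytes_per_token_per_cache // num_layers
assert max_len_no_sharing == 655360

max_len_with_sharing = 5 * GiB_bytes // bytes_per_token_per_cache // 1
assert max_len_with_sharing == 2 * 655360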