From 410c452db835f22b5a545b8fb2066ab7ca1677f9 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin Date: Wed, 4 Jun 2025 11:28:27 -0700 Subject: [PATCH 1/7] Fix TPU kv sharing tests Signed-off-by: Yong Hoon Shin --- tests/v1/tpu/worker/test_tpu_model_runner.py | 27 ++++++++++++-------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index e351f0e92525..26c8618158a3 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -371,11 +371,14 @@ def test_get_req_paddings(): @pytest.mark.skip(reason="Test is broken on TPU when it's added.") -def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(): +def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order( + model_runner): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" error_msg = f"{layer_1} must come before the current layer" - with pytest.raises(ValueError, match=error_msg): + vllm_config = model_runner.vllm_config + with pytest.raises(ValueError, match=error_msg), \ + set_current_vllm_config(vllm_config): fwd_context = { # initialization below will fail because target layer is invalid; # the target layer needs to come before layer 1 @@ -400,12 +403,14 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(): @pytest.mark.skip(reason="Test is broken on TPU when it's added.") -def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(): +def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(model_runner): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" invalid_layer = "model.layers.0.cross_attn.attn" error_msg = f"{invalid_layer} is not a valid Attention layer in the model" - with pytest.raises(ValueError, match=error_msg): + vllm_config = model_runner.vllm_config + with pytest.raises(ValueError, match=error_msg), \ + set_current_vllm_config(vllm_config): fwd_context = { layer_0: Attention( @@ -429,11 +434,13 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(): @pytest.mark.skip(reason="Test is broken on TPU when it's added.") -def test_init_kv_cache_with_kv_sharing_target_same_as_current(): +def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" error_msg = f"{layer_1} cannot be the same as the current layer" - with pytest.raises(ValueError, match=error_msg): + vllm_config = model_runner.vllm_config + with pytest.raises(ValueError, match=error_msg), \ + set_current_vllm_config(vllm_config): fwd_context = { # initialization below will fail because target layer is invalid; # the target layer needs to come before layer 1 @@ -488,7 +495,7 @@ def test_init_kv_cache_without_kv_sharing(model_runner): assert len(kv_cache_spec) == 2 assert len(model_runner.shared_kv_cache_layers) == 0 - available_memory = 20 * GiB_bytes + available_memory = 40 * GiB_bytes # page size for layer 0's kv_cache_spec is 32KB num_expected_blocks = 327680 # 20GB / 32KB / 2 (num layers) kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, @@ -499,7 +506,7 @@ def test_init_kv_cache_without_kv_sharing(model_runner): assert kv_cache_config.tensors[layer_1].size == available_memory // 2 max_context_len =\ - estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) + estimate_max_model_len(vllm_config, kv_cache_spec, 10 * GiB_bytes) # max context len with KV sharing should 
be 2x as large as without assert max_context_len == 1310720 @@ -557,7 +564,7 @@ def test_init_kv_cache_with_kv_sharing_valid(model_runner): assert layer_0 in kv_cache_spec assert model_runner.shared_kv_cache_layers[layer_1] == layer_0 - available_memory = 20 * GiB_bytes + available_memory = 40 * GiB_bytes # page size for layer 0's kv_cache_spec is 32KB # with KV sharing, we can allocate (available_mem//page_size//1) blocks # which is twice as many as without KV sharing @@ -571,7 +578,7 @@ def test_init_kv_cache_with_kv_sharing_valid(model_runner): assert kv_cache_config.tensors[layer_0].size == available_memory max_context_len =\ - estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) + estimate_max_model_len(vllm_config, kv_cache_spec, 10 * GiB_bytes) # max context len with KV sharing should be 2x as large as without assert max_context_len == 2 * 1310720 From dfd8276a0282e97848da8b1adc539e00238cbed7 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin Date: Wed, 4 Jun 2025 17:06:38 -0700 Subject: [PATCH 2/7] Fix more tests Signed-off-by: Yong Hoon Shin --- tests/v1/tpu/worker/test_tpu_model_runner.py | 43 ++++++++++++-------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 26c8618158a3..8b4e53e3d2e0 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -31,10 +31,7 @@ "vllm.v1.worker.tpu_model_runner.PallasAttentionBackend", ) pallas_attention_backend_patcher.start() - -@pytest.fixture -def model_runner(): - # Patchers have already been started at module level. +def get_vllm_config(): scheduler_config = SchedulerConfig( max_num_seqs=10, max_num_batched_tokens=512, @@ -60,12 +57,20 @@ def model_runner(): cache_config=cache_config, scheduler_config=scheduler_config, ) + +def get_model_runner(vllm_config): device = "xla:0" # Mocking TPU device with mock.patch("vllm.v1.worker.tpu_model_runner.torch"), \ mock.patch("vllm.v1.worker.tpu_model_runner.xm"), \ mock.patch("vllm.v1.worker.tpu_model_runner.xr"): return TPUModelRunner(vllm_config, device) +@pytest.fixture +def model_runner(): + # Patchers have already been started at module level. 
+ vllm_config = get_vllm_config() + return get_model_runner(vllm_config) + @pytest.fixture(autouse=True, scope="session") def cleanup_patches(): @@ -465,10 +470,10 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner): @pytest.mark.skip(reason="Test is broken on TPU when it's added.") -def test_init_kv_cache_without_kv_sharing(model_runner): +def test_init_kv_cache_without_kv_sharing(): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" - vllm_config = model_runner.vllm_config + vllm_config = get_vllm_config() with set_current_vllm_config(vllm_config): fwd_context = { layer_0: @@ -491,13 +496,14 @@ def test_init_kv_cache_without_kv_sharing(model_runner): # Set high context length to test max context length estimation vllm_config.model_config.max_model_len = 3_000_000 vllm_ctx = vllm_config.compilation_config.static_forward_context + model_runner = get_model_runner(vllm_config) kv_cache_spec = model_runner.get_kv_cache_spec() assert len(kv_cache_spec) == 2 assert len(model_runner.shared_kv_cache_layers) == 0 - available_memory = 40 * GiB_bytes - # page size for layer 0's kv_cache_spec is 32KB - num_expected_blocks = 327680 # 20GB / 32KB / 2 (num layers) + available_memory = 20 * GiB_bytes + # page size for layer 0's kv_cache_spec is 64KB + num_expected_blocks = 163840 # 20GB / 64KB / 2 (num layers) kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, available_memory) assert kv_cache_config.num_blocks == num_expected_blocks @@ -506,9 +512,9 @@ def test_init_kv_cache_without_kv_sharing(model_runner): assert kv_cache_config.tensors[layer_1].size == available_memory // 2 max_context_len =\ - estimate_max_model_len(vllm_config, kv_cache_spec, 10 * GiB_bytes) + estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) # max context len with KV sharing should be 2x as large as without - assert max_context_len == 1310720 + assert max_context_len == 655360 # important: override tensor size to prevent large mem alloc during test # this will only allocate 2 block worth of memory (2 * 32kb) @@ -532,10 +538,10 @@ def test_init_kv_cache_without_kv_sharing(model_runner): @pytest.mark.skip(reason="Test is broken on TPU when it's added.") -def test_init_kv_cache_with_kv_sharing_valid(model_runner): +def test_init_kv_cache_with_kv_sharing_valid(): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" - vllm_config = model_runner.vllm_config + vllm_config = get_vllm_config() with set_current_vllm_config(vllm_config): fwd_context = { layer_0: @@ -559,16 +565,17 @@ def test_init_kv_cache_with_kv_sharing_valid(model_runner): # Set high context length to test max context length estimation vllm_config.model_config.max_model_len = 3_000_000 vllm_ctx = vllm_config.compilation_config.static_forward_context + model_runner = get_model_runner(vllm_config) kv_cache_spec = model_runner.get_kv_cache_spec() assert len(kv_cache_spec) == 1 assert layer_0 in kv_cache_spec assert model_runner.shared_kv_cache_layers[layer_1] == layer_0 - available_memory = 40 * GiB_bytes - # page size for layer 0's kv_cache_spec is 32KB + available_memory = 20 * GiB_bytes + # page size for layer 0's kv_cache_spec is 64KB # with KV sharing, we can allocate (available_mem//page_size//1) blocks # which is twice as many as without KV sharing - num_expected_blocks = 655360 # 20GB / 32KB + num_expected_blocks = 327680 # 20GB / 64KB kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, available_memory) assert 
kv_cache_config.num_blocks == num_expected_blocks @@ -578,9 +585,9 @@ def test_init_kv_cache_with_kv_sharing_valid(model_runner): assert kv_cache_config.tensors[layer_0].size == available_memory max_context_len =\ - estimate_max_model_len(vllm_config, kv_cache_spec, 10 * GiB_bytes) + estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) # max context len with KV sharing should be 2x as large as without - assert max_context_len == 2 * 1310720 + assert max_context_len == 2 * 655360 # important: override tensor size to prevent large mem alloc during test # this will only allocate 1 block worth of memory (32kb) From 50deb499a03a3c60b2ab5a4c1f721b885a875a63 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin Date: Wed, 4 Jun 2025 18:03:00 -0700 Subject: [PATCH 3/7] fix lint Signed-off-by: Yong Hoon Shin --- tests/v1/tpu/worker/test_tpu_model_runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 8b4e53e3d2e0..94540429c390 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -31,6 +31,7 @@ "vllm.v1.worker.tpu_model_runner.PallasAttentionBackend", ) pallas_attention_backend_patcher.start() + def get_vllm_config(): scheduler_config = SchedulerConfig( max_num_seqs=10, @@ -57,6 +58,8 @@ def get_vllm_config(): cache_config=cache_config, scheduler_config=scheduler_config, ) + return vllm_config + def get_model_runner(vllm_config): device = "xla:0" # Mocking TPU device @@ -65,6 +68,7 @@ def get_model_runner(vllm_config): mock.patch("vllm.v1.worker.tpu_model_runner.xr"): return TPUModelRunner(vllm_config, device) + @pytest.fixture def model_runner(): # Patchers have already been started at module level. 
From a7b2c321eb6a5a57cd938571504b888eaf098ed6 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin Date: Wed, 4 Jun 2025 19:41:04 -0700 Subject: [PATCH 4/7] Do not skip kv sharing tpu tests Signed-off-by: Yong Hoon Shin --- tests/v1/tpu/worker/test_tpu_model_runner.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 94540429c390..3588f35872b9 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -379,7 +379,6 @@ def test_get_req_paddings(): assert _get_req_paddings(8, 36) == [8, 16, 32, 36] -@pytest.mark.skip(reason="Test is broken on TPU when it's added.") def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order( model_runner): layer_0 = "model.layers.0.self_attn.attn" @@ -411,7 +410,6 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order( assert fwd_context is not None -@pytest.mark.skip(reason="Test is broken on TPU when it's added.") def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(model_runner): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" @@ -442,7 +440,6 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(model_runner): assert fwd_context is not None -@pytest.mark.skip(reason="Test is broken on TPU when it's added.") def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" @@ -473,7 +470,6 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner): assert fwd_context is not None -@pytest.mark.skip(reason="Test is broken on TPU when it's added.") def test_init_kv_cache_without_kv_sharing(): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" @@ -541,7 +537,6 @@ def test_init_kv_cache_without_kv_sharing(): assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1 -@pytest.mark.skip(reason="Test is broken on TPU when it's added.") def test_init_kv_cache_with_kv_sharing_valid(): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" From 65fc127c84dc0ebbd5a1f7ff9d1ce3af4d2f01cb Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin Date: Thu, 5 Jun 2025 17:53:32 -0700 Subject: [PATCH 5/7] fix test (tpu worker uses attn dtype instead of kv dtype) Signed-off-by: Yong Hoon Shin --- tests/v1/tpu/worker/test_tpu_model_runner.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 3588f35872b9..9d3e3e6330ee 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -494,7 +494,7 @@ def test_init_kv_cache_without_kv_sharing(): # suppress var not used error assert fwd_context is not None # Set high context length to test max context length estimation - vllm_config.model_config.max_model_len = 3_000_000 + vllm_config.model_config.max_model_len = 1_000_000 vllm_ctx = vllm_config.compilation_config.static_forward_context model_runner = get_model_runner(vllm_config) kv_cache_spec = model_runner.get_kv_cache_spec() @@ -502,8 +502,8 @@ def test_init_kv_cache_without_kv_sharing(): assert len(model_runner.shared_kv_cache_layers) == 0 available_memory = 20 * GiB_bytes - # page size for layer 0's kv_cache_spec is 64KB - num_expected_blocks = 163840 # 20GB / 64KB / 2 (num layers) + # page size for layer 0's 
kv_cache_spec is 128KB + num_expected_blocks = 81920 # 20GB / 128KB / 2 (num layers) kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, available_memory) assert kv_cache_config.num_blocks == num_expected_blocks @@ -514,7 +514,7 @@ def test_init_kv_cache_without_kv_sharing(): max_context_len =\ estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) # max context len with KV sharing should be 2x as large as without - assert max_context_len == 655360 + assert max_context_len == 327680 # important: override tensor size to prevent large mem alloc during test # this will only allocate 2 block worth of memory (2 * 32kb) @@ -562,7 +562,7 @@ def test_init_kv_cache_with_kv_sharing_valid(): # suppress var not used error assert fwd_context is not None # Set high context length to test max context length estimation - vllm_config.model_config.max_model_len = 3_000_000 + vllm_config.model_config.max_model_len = 1_000_000 vllm_ctx = vllm_config.compilation_config.static_forward_context model_runner = get_model_runner(vllm_config) kv_cache_spec = model_runner.get_kv_cache_spec() @@ -571,10 +571,10 @@ def test_init_kv_cache_with_kv_sharing_valid(): assert model_runner.shared_kv_cache_layers[layer_1] == layer_0 available_memory = 20 * GiB_bytes - # page size for layer 0's kv_cache_spec is 64KB + # page size for layer 0's kv_cache_spec is 128KB # with KV sharing, we can allocate (available_mem//page_size//1) blocks # which is twice as many as without KV sharing - num_expected_blocks = 327680 # 20GB / 64KB + num_expected_blocks = 163840 # 20GB / 128KB kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, available_memory) assert kv_cache_config.num_blocks == num_expected_blocks @@ -586,7 +586,7 @@ def test_init_kv_cache_with_kv_sharing_valid(): max_context_len =\ estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) # max context len with KV sharing should be 2x as large as without - assert max_context_len == 2 * 655360 + assert max_context_len == 2 * 327680 # important: override tensor size to prevent large mem alloc during test # this will only allocate 1 block worth of memory (32kb) From 9107b51739f320baa65edd222210a534184590fd Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin Date: Thu, 5 Jun 2025 22:25:18 -0700 Subject: [PATCH 6/7] fix tpu test with bfloat16 and page_size=128 Signed-off-by: Yong Hoon Shin --- tests/v1/tpu/worker/test_tpu_model_runner.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 9d3e3e6330ee..859803adabcd 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -502,8 +502,10 @@ def test_init_kv_cache_without_kv_sharing(): assert len(model_runner.shared_kv_cache_layers) == 0 available_memory = 20 * GiB_bytes - # page size for layer 0's kv_cache_spec is 128KB - num_expected_blocks = 81920 # 20GB / 128KB / 2 (num layers) + # page size for each layer KV can be calculated as + # 2 (non-MLA) * 8 (num_heads) * 128 (head_dim) + # * 2 (bfloat16, kv_cache dtype) * 128 (block_size) = 512KB + num_expected_blocks = 20480 # 20GB / 512KB / 2 (num layers) kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, available_memory) assert kv_cache_config.num_blocks == num_expected_blocks @@ -514,7 +516,7 @@ def test_init_kv_cache_without_kv_sharing(): max_context_len =\ estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) # max context len with KV 
sharing should be 2x as large as without - assert max_context_len == 327680 + assert max_context_len == 81920 # important: override tensor size to prevent large mem alloc during test # this will only allocate 2 block worth of memory (2 * 32kb) @@ -571,10 +573,10 @@ def test_init_kv_cache_with_kv_sharing_valid(): assert model_runner.shared_kv_cache_layers[layer_1] == layer_0 available_memory = 20 * GiB_bytes - # page size for layer 0's kv_cache_spec is 128KB + # page size for layer 0's kv_cache_spec is 512KB # with KV sharing, we can allocate (available_mem//page_size//1) blocks # which is twice as many as without KV sharing - num_expected_blocks = 163840 # 20GB / 128KB + num_expected_blocks = 2 * 20480 # 20GB / 512KB kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, available_memory) assert kv_cache_config.num_blocks == num_expected_blocks @@ -586,7 +588,7 @@ def test_init_kv_cache_with_kv_sharing_valid(): max_context_len =\ estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) # max context len with KV sharing should be 2x as large as without - assert max_context_len == 2 * 327680 + assert max_context_len == 2 * 81920 # important: override tensor size to prevent large mem alloc during test # this will only allocate 1 block worth of memory (32kb) From ef53a4954a8df370ee396563ed3cf572d3d4b7fb Mon Sep 17 00:00:00 2001 From: Siyuan Liu Date: Fri, 6 Jun 2025 23:05:54 +0000 Subject: [PATCH 7/7] fix tests Signed-off-by: Siyuan Liu clean up Signed-off-by: Siyuan Liu clean up Signed-off-by: Siyuan Liu --- tests/v1/tpu/worker/test_tpu_model_runner.py | 57 ++++++-------------- 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 859803adabcd..917c16c3c4cd 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import unittest.mock as mock import pytest @@ -17,20 +16,6 @@ TPUModelRunner, _get_padded_num_reqs_with_upper_limit, _get_padded_token_len, _get_req_paddings, _get_token_paddings) -# Mock torch_xla module since it may not be available in the test environments -torch_xla_patcher = mock.patch.dict( - "sys.modules", { - "torch_xla": mock.MagicMock(), - "torch_xla.core.xla_model": mock.MagicMock(), - "torch_xla.runtime": mock.MagicMock(), - }) -torch_xla_patcher.start() - -# Mock the PallasAttentionBackend -pallas_attention_backend_patcher = mock.patch( - "vllm.v1.worker.tpu_model_runner.PallasAttentionBackend", ) -pallas_attention_backend_patcher.start() - def get_vllm_config(): scheduler_config = SchedulerConfig( @@ -63,10 +48,7 @@ def get_vllm_config(): def get_model_runner(vllm_config): device = "xla:0" # Mocking TPU device - with mock.patch("vllm.v1.worker.tpu_model_runner.torch"), \ - mock.patch("vllm.v1.worker.tpu_model_runner.xm"), \ - mock.patch("vllm.v1.worker.tpu_model_runner.xr"): - return TPUModelRunner(vllm_config, device) + return TPUModelRunner(vllm_config, device) @pytest.fixture @@ -76,13 +58,6 @@ def model_runner(): return get_model_runner(vllm_config) -@pytest.fixture(autouse=True, scope="session") -def cleanup_patches(): - yield - torch_xla_patcher.stop() - pallas_attention_backend_patcher.stop() - - def _schedule_new_request(*req_ids: str) -> SchedulerOutput: new_reqs = [] num_scheduled_tokens = {} @@ -509,21 +484,23 @@ def test_init_kv_cache_without_kv_sharing(): 
kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, available_memory) assert kv_cache_config.num_blocks == num_expected_blocks - assert len(kv_cache_config.tensors) == 2 - assert kv_cache_config.tensors[layer_0].size == available_memory // 2 - assert kv_cache_config.tensors[layer_1].size == available_memory // 2 + assert len(kv_cache_config.kv_cache_tensors) == 2 + assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2 + assert kv_cache_config.kv_cache_tensors[1].size == available_memory // 2 max_context_len =\ estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) # max context len with KV sharing should be 2x as large as without - assert max_context_len == 81920 + # max_context_len = available_memory / (page_size / block_size) / num_caches + # max_context_len = 5GB / (512KB / 128) / 2 = 655360 + assert max_context_len == 655360 # important: override tensor size to prevent large mem alloc during test - # this will only allocate 2 block worth of memory (2 * 32kb) + # this will only allocate 2 block worth of memory (2 * 512kb) kv_cache_config.num_blocks = 1 - for layer in kv_cache_config.tensors: - kv_cache_config.tensors[layer].size =\ - kv_cache_spec[layer].page_size_bytes + for kv_cache_tensor in kv_cache_config.kv_cache_tensors: + kv_cache_tensor.size = ( + kv_cache_spec[kv_cache_tensor.shared_by[0]].page_size_bytes) model_runner.initialize_kv_cache(kv_cache_config) @@ -564,7 +541,7 @@ def test_init_kv_cache_with_kv_sharing_valid(): # suppress var not used error assert fwd_context is not None # Set high context length to test max context length estimation - vllm_config.model_config.max_model_len = 1_000_000 + vllm_config.model_config.max_model_len = 3_000_000 vllm_ctx = vllm_config.compilation_config.static_forward_context model_runner = get_model_runner(vllm_config) kv_cache_spec = model_runner.get_kv_cache_spec() @@ -580,20 +557,20 @@ def test_init_kv_cache_with_kv_sharing_valid(): kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, available_memory) assert kv_cache_config.num_blocks == num_expected_blocks - assert len(kv_cache_config.tensors) == 1 + assert len(kv_cache_config.kv_cache_tensors) == 1 # Each layer now has twice the available memory for KV cache # compared to no KV sharing - assert kv_cache_config.tensors[layer_0].size == available_memory + assert kv_cache_config.kv_cache_tensors[0].size == available_memory max_context_len =\ estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) # max context len with KV sharing should be 2x as large as without - assert max_context_len == 2 * 81920 + assert max_context_len == (2 * 655360) # important: override tensor size to prevent large mem alloc during test - # this will only allocate 1 block worth of memory (32kb) + # this will only allocate 1 block worth of memory (512kb) kv_cache_config.num_blocks = 1 - kv_cache_config.tensors[layer_0].size =\ + kv_cache_config.kv_cache_tensors[0].size =\ kv_cache_spec[layer_0].page_size_bytes model_runner.initialize_kv_cache(kv_cache_config)
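
Note on the asserted numbers: the block counts and max context lengths checked in the final patch follow from the page-size arithmetic quoted in the comments of patches 6 and 7. The snippet below is a standalone sketch of that arithmetic, not part of the patches themselves, using only the values those comments state (non-MLA K+V, 8 KV heads, head_dim 128, bfloat16 cache dtype, block_size 128, 2 attention layers); GiB_bytes is defined locally here for illustration rather than imported from vLLM.

# Per-layer page (block) size, as spelled out in the patch 6 comment:
# 2 (non-MLA: K and V) * 8 (num_heads) * 128 (head_dim) * 2 (bfloat16) * 128 (block_size)
GiB_bytes = 1 << 30
page_size_bytes = 2 * 8 * 128 * 2 * 128
assert page_size_bytes == 512 * 1024  # 512 KB

num_layers = 2
available_memory = 20 * GiB_bytes

# Without KV sharing, each of the two layers needs its own cache.
blocks_no_sharing = available_memory // page_size_bytes // num_layers
assert blocks_no_sharing == 20480  # num_expected_blocks in the no-sharing test

# With KV sharing, layer 1 reuses layer 0's cache, so only one cache is allocated.
blocks_with_sharing = available_memory // page_size_bytes
assert blocks_with_sharing == 2 * 20480

# Max context length estimate, per the patch 7 comment:
# available_memory / (page_size / block_size) / num_caches
block_size = 128
bytes_per_token_per_cache = page_size_bytes // block_size  # 4 KB per token per cache
max_len_no_sharing = 5 * GiB_bytes // bytes_per_token_per_cache // num_layers
assert max_len_no_sharing == 655360

max_len_with_sharing = 5 * GiB_bytes // bytes_per_token_per_cache // 1
assert max_len_with_sharing == 2 * 655360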