
Commit 068e169

fix comments

Signed-off-by: Chengji Yao <chengjiyao@google.com>
1 parent bb7fc2d commit 068e169

File tree

5 files changed: +55 −45 lines changed

tests/v1/tpu/test_kv_cache_update_kernel.py

Lines changed: 6 additions & 5 deletions
@@ -15,9 +15,9 @@
 @pytest.mark.parametrize("page_size", [32, 33])
 @pytest.mark.parametrize("combined_kv_head_num", [2, 16])
 @pytest.mark.parametrize("head_dim", [128, 256])
-@pytest.mark.parametrize("kernel_block_size", [4, 8])
+@pytest.mark.parametrize("num_slices_per_block", [4, 8])
 def test_kv_cache_update_kernel(page_size: int, combined_kv_head_num: int,
-                                head_dim: int, kernel_block_size: int):
+                                head_dim: int, num_slices_per_block: int):
     page_num = 1000
     padded_num_tokens = 128
     kv_cache_cpu = torch.zeros(
@@ -42,11 +42,12 @@ def test_kv_cache_update_kernel(page_size: int, combined_kv_head_num: int,
                                  np.cumsum(slice_lens[:-1])])
     slot_mapping = np.stack(
         [kv_cache_start_indices, new_kv_cache_indices, slice_lens], axis=1)
-    padded_size = (slot_mapping.shape[0] + kernel_block_size -
-                   1) // kernel_block_size * kernel_block_size
+    padded_size = (slot_mapping.shape[0] + num_slices_per_block -
+                   1) // num_slices_per_block * num_slices_per_block
     slot_mapping = np.pad(slot_mapping,
                           [[0, padded_size - slot_mapping.shape[0]], [0, 0]],
                           constant_values=0)
+    slot_mapping = np.transpose(slot_mapping)
     slot_mapping_cpu = torch.tensor(slot_mapping,
                                     device="cpu",
                                     dtype=torch.int32)
@@ -56,7 +57,7 @@ def test_kv_cache_update_kernel(page_size: int, combined_kv_head_num: int,
     torch.ops.xla.dynamo_set_buffer_donor_(kv_cache_xla, True)
     new_kv_cache_xla = torch.ops.xla.kv_cache_update_op(
         new_kv_xla, slot_mapping_xla, kv_cache_xla, page_size,
-        kernel_block_size)
+        num_slices_per_block)
     kv_cache_xla.copy_(new_kv_cache_xla)
     torch_xla.sync()
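For readers skimming the test change: the kernel now expects `slot_mapping` padded to a multiple of `num_slices_per_block` and transposed to a `[3, num_slices]` layout. A minimal standalone sketch of that preprocessing (plain NumPy; the sizes below are made up, not taken from the test):

```python
import numpy as np

num_slices_per_block = 8  # must match the kernel's static argument
# One row per slice: (kv_cache_start, new_kv_start, slice_len); toy values.
slot_mapping = np.array([[0, 0, 3], [32, 3, 5], [64, 8, 2]], dtype=np.int32)

# Round the slice count up to a multiple of num_slices_per_block,
padded_size = (slot_mapping.shape[0] + num_slices_per_block -
               1) // num_slices_per_block * num_slices_per_block
# pad with all-zero rows (zero-length slices),
slot_mapping = np.pad(slot_mapping,
                      [[0, padded_size - slot_mapping.shape[0]], [0, 0]],
                      constant_values=0)
# and transpose to the [3, num_slices] layout the kernel now reads.
slot_mapping = np.transpose(slot_mapping)
assert slot_mapping.shape == (3, padded_size)
```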

tests/v1/tpu/test_pallas.py

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ class FakeAttentionLayer:
             context_lens=context_lens,
             query_start_loc=query_start_loc,
             num_seqs=num_seqs,
-            kv_cache_update_block_size=8,
+            num_slices_per_kv_cache_update_block=8,
         )

         with patch("torch.ops.xla.ragged_paged_attention"

vllm/attention/ops/pallas_kv_cache_update.py

Lines changed: 24 additions & 19 deletions
@@ -10,25 +10,28 @@

 def _kv_cache_update_kernel(
     # Prefetch
-    slices_ref,  # [num_slices, 3]
+    slices_ref,  # [3, num_slices], list of (kv_cache_start, new_kv_start,
+    # slice_len)
     # Input
-    new_kv_hbm_ref,  # [tokens, num_combined_kv_heads, head_dim]
-    kv_cache_hbm_ref,
+    new_kv_hbm_ref,  # [num_tokens, num_combined_kv_heads, head_dim]
+    kv_cache_hbm_ref,  # [total_num_pages * page_size, num_combined_kv_heads,
+    # head_dim]
     # Output
     _,  # [total_num_pages * page_size, num_combined_kv_heads, head_dim]
     # Scratch
-    scratch,  # [block_size, page_size, num_combined_kv_heads, head_dim]
+    scratch,  # [num_slices_per_block, page_size, num_combined_kv_heads,
+    # head_dim]
     sem,
 ):
     async_copies = []
     block_idx = pl.program_id(0)
-    block_size = scratch.shape[0]
+    num_slices_per_block = scratch.shape[0]

     # Copy from new_kv_hbm_ref to scratch
-    for i in range(block_size):
-        offset_i = i + block_idx * block_size
-        new_kv_start = slices_ref[offset_i, 1]
-        length = slices_ref[offset_i, 2]
+    for i in range(num_slices_per_block):
+        offset_i = i + block_idx * num_slices_per_block
+        new_kv_start = slices_ref[1, offset_i]
+        length = slices_ref[2, offset_i]
         async_copy = pltpu.make_async_copy(
             new_kv_hbm_ref.at[pl.ds(new_kv_start, length), ...],
             scratch.at[i, pl.ds(0, length), ...],
@@ -42,10 +45,10 @@ def _kv_cache_update_kernel(

     # Copy from scratch to kv_cache_hbm_ref
     async_copies.clear()
-    for i in range(block_size):
-        offset_i = i + block_idx * block_size
-        kv_cache_start = slices_ref[offset_i, 0]
-        length = slices_ref[offset_i, 2]
+    for i in range(num_slices_per_block):
+        offset_i = i + block_idx * num_slices_per_block
+        kv_cache_start = slices_ref[0, offset_i]
+        length = slices_ref[2, offset_i]
         async_copy = pltpu.make_async_copy(
             scratch.at[i, pl.ds(0, length), ...],
             kv_cache_hbm_ref.at[pl.ds(kv_cache_start, length), ...],
@@ -59,23 +62,25 @@ def _kv_cache_update_kernel(

 @functools.partial(
     jax.jit,
-    static_argnames=["page_size", "block_size"],
+    static_argnames=["page_size", "num_slices_per_block"],
 )
 def kv_cache_update(
     new_kv: jax.Array,  # [total_num_token, num_combined_kv_heads, head_dim]
     slices: jax.
-    Array,  # [num_slices, 3], list of (kv_cache_start, new_kv_start, slice_len)
+    Array,  # [3, num_slices], list of (kv_cache_start, new_kv_start, slice_len)
     kv_cache: jax.
     Array,  # [total_num_pages * page_size, num_combined_kv_heads, head_dim]
     *,
     page_size: int = 32,
-    block_size: int = 8,
+    num_slices_per_block: int = 8,
 ):
-    assert slices.shape[0] % block_size == 0
+    assert slices.shape[1] % num_slices_per_block == 0
     _, num_combined_kv_heads, head_dim = new_kv.shape
     assert kv_cache.shape[1] == num_combined_kv_heads
     assert kv_cache.shape[2] == head_dim
     assert head_dim % 128 == 0
+    # TODO: Add a dynamic check to make sure that all the slice lengths are
+    # smaller than or equal to page_size

     in_specs = [
         pl.BlockSpec(memory_space=pltpu.TPUMemorySpace.ANY),
@@ -87,7 +92,7 @@ def kv_cache_update(

     scalar_prefetches = [slices]
     scratch = pltpu.VMEM(
-        (block_size, page_size, num_combined_kv_heads, head_dim),
+        (num_slices_per_block, page_size, num_combined_kv_heads, head_dim),
         new_kv.dtype,
     )

@@ -102,7 +107,7 @@ def kv_cache_update(
             num_scalar_prefetch=len(scalar_prefetches),
             in_specs=in_specs,
             out_specs=out_specs,
-            grid=(slices.shape[0] // block_size, ),
+            grid=(slices.shape[1] // num_slices_per_block, ),
             scratch_shapes=scratch_shapes,
         ),
         out_shape=out_shape,
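To summarize the kernel's effect independent of Pallas: each grid step stages `num_slices_per_block` slices through the VMEM scratch buffer, then writes them into the cache. A pure-NumPy reference of the end-to-end semantics, assuming the transposed `[3, num_slices]` layout (the helper name below is ours, not part of the diff):

```python
import numpy as np

def kv_cache_update_reference(new_kv, slices, kv_cache):
    """Reference semantics only (hypothetical helper, not in the diff).

    new_kv:   [total_num_tokens, num_combined_kv_heads, head_dim]
    slices:   [3, num_slices], rows = (kv_cache_start, new_kv_start, slice_len)
    kv_cache: [total_num_pages * page_size, num_combined_kv_heads, head_dim]
    """
    kv_cache = kv_cache.copy()
    # Each column of `slices` describes one contiguous token-range copy.
    for kv_cache_start, new_kv_start, length in slices.T:
        kv_cache[kv_cache_start:kv_cache_start + length] = \
            new_kv[new_kv_start:new_kv_start + length]
    return kv_cache
```

Note that the kernel reads each triple from scalar-prefetch memory as `slices_ref[row, offset_i]`, which is why the layout change touches every indexing site above.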

vllm/v1/attention/backends/pallas.py

Lines changed: 14 additions & 14 deletions
@@ -111,7 +111,7 @@ class PallasMetadata:
     context_lens: torch.Tensor
     query_start_loc: torch.Tensor
     num_seqs: torch.Tensor
-    kv_cache_update_block_size: int
+    num_slices_per_kv_cache_update_block: int


 class PallasAttentionBackendImpl(AttentionImpl):
@@ -217,10 +217,9 @@ def forward(
             # Write input keys and values to the KV cache.
             # Skip this if sharing KV cache with an earlier attention layer.
             slot_mapping = attn_metadata.slot_mapping
-            kv_cache_update_block_size = \
-                attn_metadata.kv_cache_update_block_size
-            write_to_kv_cache(key, value, kv_cache, slot_mapping,
-                              kv_cache_update_block_size)
+            write_to_kv_cache(
+                key, value, kv_cache, slot_mapping,
+                attn_metadata.num_slices_per_kv_cache_update_block)

         output = torch.ops.xla.ragged_paged_attention(
             query,
@@ -252,15 +251,15 @@ def write_to_kv_cache(
     value: torch.Tensor,
     kv_cache: torch.Tensor,
     slot_mapping: torch.Tensor,
-    kv_cache_update_block_size: int,
+    num_slices_per_kv_cache_update_block: int,
 ) -> None:
     """ Write the key and values to the KV cache.

     Args:
         key: shape = [num_tokens, num_kv_heads * head_size]
         value: shape = [num_tokens, num_kv_heads * head_size]
         kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size]
-        kv_cache_update_block_size: int
+        num_slices_per_kv_cache_update_block: int
     """
     _, page_size, num_combined_kv_heads, head_size = kv_cache.shape
     head_size = cdiv(head_size,
@@ -272,39 +271,40 @@ def write_to_kv_cache(

     kv_cache = kv_cache.flatten(0, 1)
     new_kv_cache = torch.ops.xla.kv_cache_update_op(
-        kv, slot_mapping, kv_cache, page_size, kv_cache_update_block_size)
+        kv, slot_mapping, kv_cache, page_size,
+        num_slices_per_kv_cache_update_block)
     # NOTE: the in-place copy will be optimized away by XLA compiler.
     kv_cache.copy_(new_kv_cache)


 @requires_jax
 def kv_cache_update_op_impl(kv: torch.Tensor, slot_mapping: torch.Tensor,
                             kv_cache: torch.Tensor, page_size: int,
-                            block_size: int):
+                            num_slices_per_block: int):
     from vllm.attention.ops.pallas_kv_cache_update import kv_cache_update
     new_kv_cache = xb.call_jax(kv_cache_update, (kv, slot_mapping, kv_cache), {
         "page_size": page_size,
-        "block_size": block_size
+        "num_slices_per_block": num_slices_per_block
     })
     return new_kv_cache


 XLA_LIB.define(
     "kv_cache_update_op(Tensor kv, Tensor slot_mapping, Tensor kv_cache, "
-    "int page_size, int block_size) -> Tensor", )
+    "int page_size, int num_slices_per_block) -> Tensor", )


 @impl(XLA_LIB, "kv_cache_update_op", "XLA")
 def kv_cache_update_op_xla(kv: torch.Tensor, slot_mapping: torch.Tensor,
                            kv_cache: torch.Tensor, page_size: int,
-                           block_size: int) -> torch.Tensor:
+                           num_slices_per_block: int) -> torch.Tensor:
     new_kv_cache = kv_cache_update_op_impl(kv, slot_mapping, kv_cache,
-                                           page_size, block_size)
+                                           page_size, num_slices_per_block)
     return new_kv_cache


 @impl(XLA_LIB, "kv_cache_update_op", "CompositeExplicitAutograd")
 def kv_cache_update_op_non_xla(kv: torch.Tensor, slot_mapping: torch.Tensor,
                                kv_cache: torch.Tensor, page_size: int,
-                               block_size: int) -> torch.Tensor:
+                               num_slices_per_block: int) -> torch.Tensor:
     return kv_cache
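A call-site sketch for the renamed op, with illustrative shapes only (it assumes the `XLA_LIB` registrations above have been imported; on a non-XLA backend the `CompositeExplicitAutograd` registration simply returns `kv_cache` unchanged, which keeps tracing and shape propagation working):

```python
import torch

page_size, num_slices_per_block = 32, 8
num_combined_kv_heads, head_size = 8, 128

kv = torch.zeros(128, num_combined_kv_heads, head_size)
slot_mapping = torch.zeros(3, 24, dtype=torch.int32)  # [3, padded_num_slices]
kv_cache = torch.zeros(1000 * page_size, num_combined_kv_heads, head_size)

# On a TPU/XLA device this dispatches to kv_cache_update_op_xla.
new_kv_cache = torch.ops.xla.kv_cache_update_op(
    kv, slot_mapping, kv_cache, page_size, num_slices_per_block)
kv_cache.copy_(new_kv_cache)  # the in-place copy is optimized away by XLA
```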

vllm/v1/worker/tpu_model_runner.py

Lines changed: 10 additions & 6 deletions
@@ -57,7 +57,7 @@
 # Smallest output size
 MIN_NUM_SEQS = 8
 # Block size used for kv cache updating kernel
-KV_CACHE_UPDATE_KERNEL_BLOCK_SIZE = 8
+NUM_SLICES_PER_KV_CACHE_UPDATE_BLOCK = 8


 #########################################################
@@ -720,6 +720,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput",
             slot_mapping_metadata,
             [[0, padded_num_slices - len(slot_mapping_metadata)], [0, 0]],
             constant_values=0)
+        slot_mapping_metadata = np.transpose(slot_mapping_metadata)
         slot_mapping_metadata = torch.tensor(slot_mapping_metadata,
                                              device=self.device)

@@ -742,7 +743,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput",
             num_seqs=torch.tensor([num_reqs],
                                   dtype=torch.int32,
                                   device=self.device),
-            kv_cache_update_block_size=KV_CACHE_UPDATE_KERNEL_BLOCK_SIZE,
+            num_slices_per_kv_cache_update_block=
+            NUM_SLICES_PER_KV_CACHE_UPDATE_BLOCK,
         )
         # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial
         # request in the batch. While we should not sample any token from this
@@ -1170,7 +1172,7 @@ def _dummy_run(self, num_tokens: int, num_reqs: int,
                                        dtype=torch.int32).to(self.device)
         padded_num_slices = _get_padded_num_kv_cache_update_slices(
             num_tokens, self.max_num_reqs, self.block_size)
-        slot_mapping = torch.zeros((padded_num_slices, 3),
+        slot_mapping = torch.zeros((3, padded_num_slices),
                                    dtype=torch.int32).to(self.device)
         block_tables = torch.zeros(
             (num_reqs, num_blocks),
@@ -1190,7 +1192,8 @@ def _dummy_run(self, num_tokens: int, num_reqs: int,
             context_lens=context_lens,
             query_start_loc=query_start_loc,
             num_seqs=num_seqs,
-            kv_cache_update_block_size=KV_CACHE_UPDATE_KERNEL_BLOCK_SIZE,
+            num_slices_per_kv_cache_update_block=
+            NUM_SLICES_PER_KV_CACHE_UPDATE_BLOCK,
         )

         if self.is_multimodal_model:
@@ -1802,8 +1805,9 @@ def _get_padded_num_kv_cache_update_slices(num_tokens: int, max_num_reqs: int,
     padded_num_slices = 2 * max_num_reqs + num_tokens // page_size
     padded_num_slices = min(padded_num_slices, num_tokens)
     padded_num_slices = (
-        padded_num_slices + KV_CACHE_UPDATE_KERNEL_BLOCK_SIZE - 1
-    ) // KV_CACHE_UPDATE_KERNEL_BLOCK_SIZE * KV_CACHE_UPDATE_KERNEL_BLOCK_SIZE
+        padded_num_slices + NUM_SLICES_PER_KV_CACHE_UPDATE_BLOCK - 1
+    ) // NUM_SLICES_PER_KV_CACHE_UPDATE_BLOCK * \
+        NUM_SLICES_PER_KV_CACHE_UPDATE_BLOCK
     return padded_num_slices
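A quick worked example of that padding formula (toy numbers, as a standalone sketch; the real helper reads the module-level constant rather than taking a parameter):

```python
def padded_num_kv_cache_update_slices(num_tokens: int, max_num_reqs: int,
                                      page_size: int,
                                      num_slices_per_block: int = 8) -> int:
    # Mirrors _get_padded_num_kv_cache_update_slices from the diff.
    padded_num_slices = 2 * max_num_reqs + num_tokens // page_size
    padded_num_slices = min(padded_num_slices, num_tokens)
    # Round up to a multiple of num_slices_per_block so the kernel grid
    # divides evenly.
    return (padded_num_slices + num_slices_per_block -
            1) // num_slices_per_block * num_slices_per_block

# e.g. 128 tokens, 8 requests, page size 32:
# 2 * 8 + 128 // 32 = 20 slices, min(20, 128) = 20, rounded up to 24.
assert padded_num_kv_cache_update_slices(128, 8, 32) == 24
```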