From 1137a05ba864a846b350e4e194b997c726f8d444 Mon Sep 17 00:00:00 2001
From: Xiongfei Wei
Date: Thu, 6 Mar 2025 05:56:03 +0000
Subject: [PATCH] Use the optimized block sizes after tuning the kernel

Signed-off-by: Xiongfei Wei
---
 vllm/v1/attention/backends/pallas.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py
index bbbdf50ac0cc..bf3992281a73 100644
--- a/vllm/v1/attention/backends/pallas.py
+++ b/vllm/v1/attention/backends/pallas.py
@@ -12,8 +12,8 @@
 from vllm.attention.backends.utils import CommonAttentionState
 
 # These are the 2 tunable parameters of the paged attention Pallas kernel.
-NUM_QUERIES_PER_BLOCK = 32
-NUM_KV_PAGES_PER_BLOCK = 128
+NUM_QUERIES_PER_BLOCK = 16
+NUM_KV_PAGES_PER_BLOCK = 256
 
 
 class PallasAttentionBackend(AttentionBackend):
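
For context: the two module-level constants changed here bound how the paged
attention Pallas kernel tiles its work over query tokens and KV-cache pages.
The sketch below is illustrative only, assuming a ceil-division grid over both
axes; grid_shape is a hypothetical helper for illustration, not code from
pallas.py or the kernel's actual API.

# Illustrative sketch only: how the two tuned constants partition work
# for a blocked attention kernel. grid_shape is a hypothetical helper;
# the real Pallas kernel consumes these constants internally.

NUM_QUERIES_PER_BLOCK = 16    # tuned value from this patch (was 32)
NUM_KV_PAGES_PER_BLOCK = 256  # tuned value from this patch (was 128)


def grid_shape(num_query_tokens: int, num_kv_pages: int) -> tuple[int, int]:
    """Grid steps along the query axis and the KV-page axis (ceil division)."""
    q_blocks = -(-num_query_tokens // NUM_QUERIES_PER_BLOCK)
    kv_blocks = -(-num_kv_pages // NUM_KV_PAGES_PER_BLOCK)
    return q_blocks, kv_blocks


if __name__ == "__main__":
    # e.g. 512 query tokens attending over 1024 KV pages:
    # old values (32, 128) give a (16, 8) grid; new values give (32, 4)
    print(grid_shape(512, 1024))

In these terms, the tuning trades a smaller query tile (16 vs. 32) for a
larger KV-page tile (256 vs. 128); the commit title attributes the chosen
values to tuning the kernel.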