From 1137a05ba864a846b350e4e194b997c726f8d444 Mon Sep 17 00:00:00 2001
From: Xiongfei Wei
Date: Thu, 6 Mar 2025 05:56:03 +0000
Subject: [PATCH] Use the optimized block sizes after tuning the kernel

Signed-off-by: Xiongfei Wei
---
 vllm/v1/attention/backends/pallas.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py
index bbbdf50ac0cc..bf3992281a73 100644
--- a/vllm/v1/attention/backends/pallas.py
+++ b/vllm/v1/attention/backends/pallas.py
@@ -12,8 +12,8 @@
 from vllm.attention.backends.utils import CommonAttentionState
 
 # These are the 2 tunable parameters of the paged attention Pallas kernel.
-NUM_QUERIES_PER_BLOCK = 32
-NUM_KV_PAGES_PER_BLOCK = 128
+NUM_QUERIES_PER_BLOCK = 16
+NUM_KV_PAGES_PER_BLOCK = 256
 
 
 class PallasAttentionBackend(AttentionBackend):
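
For context: the two module-level constants changed here bound how the paged
attention Pallas kernel tiles its work over query tokens and KV-cache pages.
The sketch below is illustrative only, assuming a ceil-division grid over both
axes; grid_shape is a hypothetical helper for illustration, not code from
pallas.py or the kernel's actual API.

# Illustrative sketch only: how the two tuned constants partition work
# for a blocked attention kernel. grid_shape is a hypothetical helper;
# the real Pallas kernel consumes these constants internally.

NUM_QUERIES_PER_BLOCK = 16    # tuned value from this patch (was 32)
NUM_KV_PAGES_PER_BLOCK = 256  # tuned value from this patch (was 128)


def grid_shape(num_query_tokens: int, num_kv_pages: int) -> tuple[int, int]:
    """Grid steps along the query axis and the KV-page axis (ceil division)."""
    q_blocks = -(-num_query_tokens // NUM_QUERIES_PER_BLOCK)
    kv_blocks = -(-num_kv_pages // NUM_KV_PAGES_PER_BLOCK)
    return q_blocks, kv_blocks


if __name__ == "__main__":
    # e.g. 512 query tokens attending over 1024 KV pages:
    # old values (32, 128) give a (16, 8) grid; new values give (32, 4)
    print(grid_shape(512, 1024))

In these terms, the tuning trades a smaller query tile (16 vs. 32) for a
larger KV-page tile (256 vs. 128); the commit title attributes the chosen
values to tuning the kernel.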