Commit fa75cba

Merge pull request vllm-project#26 from ROCm/cl/updates-pag-shomy

Update max_context_len for custom paged attention.

2 parents: 0cd6239 + 2be22a5

1 file changed: vllm/attention/ops/paged_attn.py (2 additions, 1 deletion)
@@ -114,7 +114,8 @@ def forward_decode(
         use_custom = (custom_attn_available and query.dtype == torch.half
                       and head_size == 128 and block_size == 16
                       and kv_cache_dtype == "auto"
-                      and (gqa_ratio >= 1 and gqa_ratio <= 16))
+                      and (gqa_ratio >= 1 and gqa_ratio <= 16)
+                      and max_context_len <= 32768)
         if not use_custom:
             _PARTITION_SIZE = _PARTITION_SIZE_V1V2
         else:
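The gating logic above can be sketched as a standalone predicate. This is a minimal illustration, not code from the repository: the function name and the flat parameter list are hypothetical (in the actual file the check is written inline inside forward_decode against local variables), and it assumes torch is installed. It shows the effect of the commit: the ROCm custom paged-attention kernel is now additionally skipped whenever the context exceeds 32768 tokens.

```python
import torch


def should_use_custom_paged_attention(
    custom_attn_available: bool,
    query_dtype: torch.dtype,
    head_size: int,
    block_size: int,
    kv_cache_dtype: str,
    gqa_ratio: int,
    max_context_len: int,
) -> bool:
    """Hypothetical helper mirroring the inline check in forward_decode:
    the custom kernel is used only for fp16 queries, head_size 128,
    block_size 16, the default ("auto") KV-cache dtype, GQA ratios 1..16,
    and, after this commit, contexts of at most 32768 tokens."""
    return (custom_attn_available
            and query_dtype == torch.half
            and head_size == 128
            and block_size == 16
            and kv_cache_dtype == "auto"
            and 1 <= gqa_ratio <= 16
            and max_context_len <= 32768)
```

When the predicate is False, the code falls back to the generic V1/V2 paged-attention path (hence `_PARTITION_SIZE = _PARTITION_SIZE_V1V2` in the unchanged branch).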
