Skip to content

Commit 2be22a5

Browse files
committed
update conditions of max-context-len
1 parent 0cd6239 commit 2be22a5

File tree

1 file changed

+2
-1
lines changed

1 file changed

+2
-1
lines changed

vllm/attention/ops/paged_attn.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,8 @@ def forward_decode(
114 114
use_custom = (custom_attn_available and query.dtype == torch.half
115 115
and head_size == 128 and block_size == 16
116 116
and kv_cache_dtype == "auto"
117-
and (gqa_ratio >= 1 and gqa_ratio <= 16))
117+
and (gqa_ratio >= 1 and gqa_ratio <= 16)
118+
and max_context_len <= 32768)
118 119
if not use_custom:
119 120
_PARTITION_SIZE = _PARTITION_SIZE_V1V2
120 121
else:

0 commit comments

Comments (0)