1 parent cf4cd6c commit b18c17f
vllm/utils/flashinfer.py
@@ -269,11 +269,6 @@ def use_trtllm_attention(
 
     # Must use TRTLLM attention if query is FP8 quantized
     if q_dtype == current_platform.fp8_dtype():
-        if has_sinks:
-            raise RuntimeError(
-                "TRTLLM FP8-qkv kernel is not supported for attention sinks. "
-                "Use kv_cache_dtype=auto for now."
-            )
         logger.info_once("Using TRTLLM attention (query is quantized).")
         return True
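For context, a minimal sketch of how the FP8-query branch reads after this change. The wrapper function name and its parameter list below are illustrative placeholders, not vLLM's actual use_trtllm_attention signature; the sketch only reuses names visible in the diff above (q_dtype, has_sinks, current_platform, logger).

# Illustrative sketch only: the function name and parameters are placeholders,
# not the real use_trtllm_attention signature from vllm/utils/flashinfer.py.
def fp8_query_branch_sketch(q_dtype, has_sinks, current_platform, logger) -> bool:
    # Must use TRTLLM attention if query is FP8 quantized.
    if q_dtype == current_platform.fp8_dtype():
        # After this commit the has_sinks guard is removed: has_sinks is still
        # accepted by the caller but no longer raises a RuntimeError on the
        # TRTLLM FP8-qkv path.
        logger.info_once("Using TRTLLM attention (query is quantized).")
        return True
    return False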