From 5e4bdecd8bf6db1afe51d736f0a1473c385db38b Mon Sep 17 00:00:00 2001
From: sa-buc
Date: Tue, 3 Jun 2025 20:20:04 +0800
Subject: [PATCH 1/3] Fix DualChunkFlashAttention for short sequences

Signed-off-by: sa-buc
---
 examples/offline_inference/qwen_1m.py            | 1 +
 vllm/attention/backends/dual_chunk_flash_attn.py | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/examples/offline_inference/qwen_1m.py b/examples/offline_inference/qwen_1m.py
index 856a35b0e59b..3a5f59f560af 100644
--- a/examples/offline_inference/qwen_1m.py
+++ b/examples/offline_inference/qwen_1m.py
@@ -62,6 +62,7 @@ def initialize_engine() -> LLM:
 
 def main():
     llm = initialize_engine()
+    process_requests(llm, ["Hello, world!"])
     prompt = load_prompt()
     process_requests(llm, [prompt])
 
diff --git a/vllm/attention/backends/dual_chunk_flash_attn.py b/vllm/attention/backends/dual_chunk_flash_attn.py
index eceab1f1ac9a..b437ccc9b6fb 100644
--- a/vllm/attention/backends/dual_chunk_flash_attn.py
+++ b/vllm/attention/backends/dual_chunk_flash_attn.py
@@ -1218,7 +1218,10 @@ def _do_flash_attn(
                 device=query_states.device),
             max_seqlen_k=max_seqlen_k,
             causal=causal,
-            block_table=block_table.unsqueeze(0),
+            # Since key_states and value_states are directly retrieved from
+            # the KV cache through the block_table, passing `block_table` here
+            # is both wrong and unnecessary.
+            block_table=None,
             return_softmax_lse=True,
         )
         softmax_lse = softmax_lse.view(q_len, q_heads, 1).transpose(0,

From 27d63d1d105f59c9a35001d887a5609d2cb0db66 Mon Sep 17 00:00:00 2001
From: sa-buc
Date: Tue, 3 Jun 2025 20:49:14 +0800
Subject: [PATCH 2/3] Remove the block_table argument entirely

Signed-off-by: sa-buc
---
 vllm/attention/backends/dual_chunk_flash_attn.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/attention/backends/dual_chunk_flash_attn.py b/vllm/attention/backends/dual_chunk_flash_attn.py
index b437ccc9b6fb..718ee7bb1bec 100644
--- a/vllm/attention/backends/dual_chunk_flash_attn.py
+++ b/vllm/attention/backends/dual_chunk_flash_attn.py
@@ -1221,7 +1221,6 @@ def _do_flash_attn(
             # Since key_states and value_states are directly retrieved from
             # the KV cache through the block_table, passing `block_table` here
             # is both wrong and unnecessary.
-            block_table=None,
             return_softmax_lse=True,
         )
         softmax_lse = softmax_lse.view(q_len, q_heads, 1).transpose(0,

From 5bbd250eeb59da3f4e242a5825a8b27e5e43aca9 Mon Sep 17 00:00:00 2001
From: sa-buc
Date: Tue, 3 Jun 2025 21:02:56 +0800
Subject: [PATCH 3/3] Remove the explanatory comment

Signed-off-by: sa-buc
---
 vllm/attention/backends/dual_chunk_flash_attn.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/vllm/attention/backends/dual_chunk_flash_attn.py b/vllm/attention/backends/dual_chunk_flash_attn.py
index 718ee7bb1bec..88f2940c42ac 100644
--- a/vllm/attention/backends/dual_chunk_flash_attn.py
+++ b/vllm/attention/backends/dual_chunk_flash_attn.py
@@ -1218,9 +1218,6 @@ def _do_flash_attn(
                 device=query_states.device),
             max_seqlen_k=max_seqlen_k,
             causal=causal,
-            # Since key_states and value_states are directly retrieved from
-            # the KV cache through the block_table, passing `block_table` here
-            # is both wrong and unnecessary.
             return_softmax_lse=True,
         )
         softmax_lse = softmax_lse.view(q_len, q_heads, 1).transpose(0,
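
Note (illustrative, not part of the series): the fix hinges on the fact that
_do_flash_attn already receives key/value tensors gathered out of the paged KV
cache, so the attention kernel must treat them as dense memory. Below is a
minimal PyTorch sketch of that distinction; all shapes and the names kv_cache,
block_table, and gathered are made up for illustration and are not vLLM's
actual identifiers.

    import torch

    num_blocks, block_size, num_heads, head_dim = 8, 16, 4, 32

    # Paged KV cache: a shared pool of fixed-size blocks.
    kv_cache = torch.randn(num_blocks, block_size, num_heads, head_dim)

    # One 40-token sequence scattered over three non-contiguous blocks.
    block_table = torch.tensor([5, 2, 7])
    seq_len = 40

    # Gathering resolves the block indirection into a dense tensor.
    gathered = kv_cache[block_table].reshape(-1, num_heads, head_dim)[:seq_len]
    assert gathered.shape == (seq_len, num_heads, head_dim)

    # A kernel handed `gathered` must index it directly; giving it a
    # block_table as well would re-apply the indirection to memory that is
    # already dense, reading the wrong keys/values -- the bug fixed above.

Passing block_table=None (patch 1) and dropping the argument (patch 2) should
behave identically here, assuming the parameter defaults to None.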