1 parent c9461e0 commit 084a9da
vllm/v1/attention/backends/flex_attention.py
@@ -658,7 +658,10 @@ def build(
             total_cache_tokens=total_cache_tokens,
             decode_offset=offset_tensor,
             num_blocks_per_seq=num_blocks_per_seq,
-            direct_build=self.direct_build,
+            # FIXME(Isotr0py): direct build has issue to build bidirectional
+            # attention block mask for encoder-only models, disable it temporarily.
+            # see: https://github.com/vllm-project/vllm/pull/27329#issuecomment-3431484053
+            direct_build=(self.direct_build and common_attn_metadata.causal),
             q_block_size=self.q_block_size,
             kv_block_size=self.kv_block_size,
         )
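For context, the change gates the fast "direct" block-mask construction on the attention pattern being causal, so encoder-only (bidirectional) attention falls back to FlexAttention's generic builder. The sketch below is not vLLM's implementation; it only illustrates the idea, using PyTorch's public create_block_mask API. The helper build_block_mask and the two mask_mod functions are hypothetical names introduced here for illustration.

# Minimal sketch (assumed, not vLLM code): choose between a causal and a
# bidirectional mask_mod, and apply the same and-gating that the commit adds,
# i.e. only allow the direct path when the attention is causal.
import torch
from torch.nn.attention.flex_attention import create_block_mask


def causal_mask_mod(b, h, q_idx, kv_idx):
    # Causal attention: a query may only attend to keys at or before its position.
    return q_idx >= kv_idx


def bidirectional_mask_mod(b, h, q_idx, kv_idx):
    # Encoder-only (bidirectional) attention: every query attends to every key.
    return q_idx >= 0  # always True, expressed as a tensor so it broadcasts


def build_block_mask(seq_len: int, causal: bool, direct_build: bool, device="cpu"):
    """Hypothetical helper mirroring direct_build=(self.direct_build and causal)."""
    # Skip the direct path for bidirectional masks, as the FIXME in the diff does.
    use_direct = direct_build and causal
    mask_mod = causal_mask_mod if causal else bidirectional_mask_mod
    # vLLM's direct path assembles the block-mask tensors by hand; here we simply
    # record the decision and always call the generic builder for illustration.
    block_mask = create_block_mask(
        mask_mod, B=None, H=None, Q_LEN=seq_len, KV_LEN=seq_len, device=device
    )
    return block_mask, use_direct


if __name__ == "__main__":
    mask, used_direct = build_block_mask(seq_len=128, causal=False, direct_build=True)
    print(mask)         # sparsity summary of the fully dense bidirectional mask
    print(used_direct)  # False: the direct path is disabled for encoder-only models

Note that only the choice of construction path is gated; the mask semantics themselves still come from the causal flag (here, from which mask_mod is selected).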