
Commit 531bb75

vasqu authored and ArthurZucker committed

[Flex Attn] Fix lse x attention sinks logic (#41249)

fix

1 parent aca2380

File tree

1 file changed: +3 -3 lines changed

src/transformers/integrations/flex_attention.py

Lines changed: 3 additions & 3 deletions
@@ -290,10 +290,10 @@ def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
     # On CPU we must skip returning LSE due to a runtime issue; elsewhere, follow PyTorch API and return it
     return_lse = query.device.type != "cpu"

-    # Validate that s_aux is not silently ignored
     if not return_lse and s_aux is not None:
-        logger.warning_once("s_aux provided with return_lse=False - forcing return_lse=True to avoid silent failure")
-        return_lse = True
+        raise ValueError(
+            "Attention sinks cannot be run on CPU with flex attention. Please switch to a different device, e.g. CUDA"
+        )

     flex_attention_output = compile_friendly_flex_attention(
         query,
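Why this change matters: attention sink logits (`s_aux`) are folded into the attention output using the log-sum-exp (LSE) that flex attention can return, and on CPU that LSE cannot be returned at all. The old path only warned and forced `return_lse=True`, which then still crashed later at runtime on CPU; the new path fails fast with a clear error. Below is a minimal standalone sketch of the new guard; the helper name `validate_lse_for_sinks` and the tensor shapes are hypothetical, only the check itself mirrors the diff.

# Minimal sketch (for illustration, not the library's full code) of the
# guard this commit introduces: sinks need the LSE, and the LSE is not
# available on CPU, so the combination is rejected up front.
from typing import Optional

import torch


def validate_lse_for_sinks(query: torch.Tensor, s_aux: Optional[torch.Tensor]) -> bool:
    # Mirrors the diff: the LSE can be returned everywhere except CPU.
    return_lse = query.device.type != "cpu"
    if not return_lse and s_aux is not None:
        # Old behavior: warn and force return_lse=True, which then failed
        # later at runtime on CPU. New behavior: fail fast and loudly.
        raise ValueError(
            "Attention sinks cannot be run on CPU with flex attention. "
            "Please switch to a different device, e.g. CUDA"
        )
    return return_lse


# Hypothetical usage: a CPU query combined with sink logits now errors early.
query = torch.randn(1, 8, 16, 64)  # (batch, heads, seq_len, head_dim), on CPU
sinks = torch.zeros(8)             # one sink logit per head; shape is illustrative
try:
    validate_lse_for_sinks(query, sinks)
except ValueError as err:
    print(err)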

0 commit comments
