Commit ab92534

FightingZhen and vasqu authored

enable sdpa enable gqa logic for Ascend NPU (#41601)

* enable gqa logic for Ascend NPU
* remove redundant comments
* fix comments about Ascend NPU

Co-authored-by: Anton Vlasjuk <73884904+vasqu@users.noreply.github.com>

1 parent 56a727d commit ab92534

File tree

1 file changed: +1 −5 lines changed

src/transformers/integrations/sdpa_attention.py

Lines changed: 1 addition & 5 deletions
@@ -29,19 +29,15 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 
 def use_gqa_in_sdpa(attention_mask: Optional[torch.Tensor], key: torch.Tensor) -> bool:
     # GQA can only be used under the following conditions
-    # 1.cuda
+    # 1.cuda or Ascend NPU
     #  - torch version >= 2.5
     #  - attention_mask is None (otherwise it will fall back to the math kernel)
     #  - key is not a torch.fx.Proxy (otherwise it will fail with a tracing error)
     # 2.xpu
     #  - torch version >= 2.8
     #  - key is not a torch.fx.Proxy (otherwise it will fail with a tracing error)
-    # 3.npu
-    #  - npu is not supported gqa currently
     if _is_torch_xpu_available:
         return _is_torch_greater_or_equal_than_2_8 and not isinstance(key, torch.fx.Proxy)
-    if _is_torch_npu_available:
-        return False
     return _is_torch_greater_or_equal_than_2_5 and attention_mask is None and not isinstance(key, torch.fx.Proxy)
