[Typing][A-64] Add type annotations for paddle/nn/functional/sparse_attention.py #65064

Merged (3 commits) on Jun 12, 2024
29 changes: 16 additions & 13 deletions python/paddle/nn/functional/sparse_attention.py
Contributor

Could you also update the `Returns` section in the docstring? Change

    Returns:
        4-D tensor with shape:

to

    Returns:
        Tensor: 4-D tensor with shape:

That is, use the `return type: description` form ~

Member
@SigureMo SigureMo Jun 12, 2024

We can't use the `<return type>: <description>` form: Sphinx would parse out the type and the rendering would look odd. Use `<return type>, <description>` instead.

cc @sunzhongkai588
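The distinction the reviewers are discussing can be sketched with a short, hypothetical stub (not from the PR): with Google-style docstrings, Sphinx's napoleon extension treats text before the first colon on a `Returns:` entry as the return type, so the comma form keeps the whole line as one description.

```python
def sparse_attention_stub():
    """Hypothetical stub showing the recommended Returns style.

    Returns:
        Tensor, 4-D tensor with shape
        [batch_size, num_heads, seq_len, head_dim].
    """


# Written as "Tensor, ..." (comma, not colon), the line stays a plain
# description; "Tensor: ..." would make napoleon split out "Tensor" as a
# separately rendered return type.
assert "Tensor, 4-D tensor" in sparse_attention_stub.__doc__
```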

@@ -12,20 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import paddle
 from paddle import _legacy_C_ops, in_dynamic_mode
 from paddle.base.layer_helper import LayerHelper
 
 
 def sparse_attention(
-    query,
-    key,
-    value,
-    sparse_csr_offset,
-    sparse_csr_columns,
-    key_padding_mask=None,
-    attn_mask=None,
-    name=None,
-):
+    query: paddle.Tensor,
+    key: paddle.Tensor,
+    value: paddle.Tensor,
+    sparse_csr_offset: paddle.Tensor,
+    sparse_csr_columns: paddle.Tensor,
+    key_padding_mask: paddle.Tensor | None = None,
+    attn_mask: paddle.Tensor | None = None,
+    name: str | None = None,
+) -> paddle.Tensor:
     r"""
     This operator sparsify the Attention matrix in Transformer module
     to achieve the effect of reducing memory consumption and computation.
@@ -68,20 +71,20 @@ def sparse_attention(
             3-D tensor with shape:
             [batch_size, num_heads, sparse_nnz].
             The dtype should be int32.
-        key_padding_mask(Tensor, optional):The key padding mask tensor in the Attention module.
+        key_padding_mask(Tensor|None, optional):The key padding mask tensor in the Attention module.
             2-D tensor with shape: [batch_size, seq_len].
             The dtype can be float32 and float64.
             A value of 0 means that the position is masked.
-        attn_mask(Tensor, optional):The attention mask tensor in the Attention module.
+        attn_mask(Tensor|None, optional):The attention mask tensor in the Attention module.
             2-D tensor with shape: [seq_len, seq_len].
             The dtype can be float32 and float64.
             A value of 0 means that the position is masked.
-        name(str, optional): The default value is None. Normally there is no need for user
+        name(str|None, optional): The default value is None. Normally there is no need for user
             to set this property. For more information, please refer to
             :ref:`api_guide_Name`.
 
     Returns:
-        4-D tensor with shape:
+        Tensor, 4-D tensor with shape:
         [batch_size, num_heads, seq_len, head_dim].
         The dtype can be float32 or float64.