From d419bc701cc842282802ac643051796bd8b57cab Mon Sep 17 00:00:00 2001
From: liuzhenhai93
Date: Sun, 9 Jul 2023 09:38:44 +0000
Subject: [PATCH] polish

---
 .../paddle/nn/functional/flash_attention.py | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/python/paddle/nn/functional/flash_attention.py b/python/paddle/nn/functional/flash_attention.py
index 277c633c8f3d3..6d6f85655268c 100644
--- a/python/paddle/nn/functional/flash_attention.py
+++ b/python/paddle/nn/functional/flash_attention.py
@@ -410,6 +410,53 @@ def flash_attn_unpadded(
 def scaled_dot_product_attention(
     query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False
 ):
+    r"""
+    The equation is:
+
+    .. math::
+
+        result = softmax(\frac{QK^T}{\sqrt{d}})V
+
+    where ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
+    The three parameters have the same dimensions.
+    ``d`` represents the size of the last dimension of the three parameters.
+
+    Warning:
+        This API only supports inputs with dtype float16 and bfloat16.
+
+    Args:
+        query(Tensor): The query tensor in the Attention module.
+                        4-D tensor with shape:
+                        [batch_size, seq_len, num_heads, head_dim].
+                        The dtype can be float16 or bfloat16.
+        key(Tensor): The key tensor in the Attention module.
+                        4-D tensor with shape:
+                        [batch_size, seq_len, num_heads, head_dim].
+                        The dtype can be float16 or bfloat16.
+        value(Tensor): The value tensor in the Attention module.
+                        4-D tensor with shape:
+                        [batch_size, seq_len, num_heads, head_dim].
+                        The dtype can be float16 or bfloat16.
+        attn_mask(Tensor, optional): A float mask of the same dtype as query,
+                        key, and value that is added to the attention score.
+                        Not supported yet.
+        dropout_p(float): The dropout ratio.
+        is_causal(bool): Whether to enable causal mode.
+
+    Returns:
+        out(Tensor): The attention tensor.
+                    4-D tensor with shape: [batch_size, seq_len, num_heads, head_dim].
+                    The dtype can be float16 or bfloat16.
+
+    Examples:
+        .. code-block:: python
+
+            # required: skiptest
+            import paddle
+            q = paddle.rand((1, 128, 2, 16), dtype=paddle.float16)
+            output = paddle.nn.functional.scaled_dot_product_attention(q, q, q, None, 0.9, False)
+            print(output)
+    """
     assert attn_mask is None, "attn_mask is not supported yet"
     out, _ = flash_attention(query, key, value, dropout_p, is_causal)
     return out
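
For reviewers, a rough sketch of the computation the new docstring's formula describes, spelled out with plain paddle ops rather than the fused flash-attention kernel that scaled_dot_product_attention actually calls. The helper name naive_scaled_dot_product_attention is made up for illustration only, and dropout and causal masking are left out.

    import paddle
    import paddle.nn.functional as F

    def naive_scaled_dot_product_attention(query, key, value):
        # inputs follow the documented [batch_size, seq_len, num_heads, head_dim] layout;
        # move num_heads next to batch so matmul contracts over head_dim per head
        q = paddle.transpose(query, [0, 2, 1, 3])
        k = paddle.transpose(key, [0, 2, 1, 3])
        v = paddle.transpose(value, [0, 2, 1, 3])
        d = q.shape[-1]
        # softmax(Q * K^T / sqrt(d)) * V
        scores = paddle.matmul(q, k, transpose_y=True) / (d**0.5)
        weights = F.softmax(scores, axis=-1)
        out = paddle.matmul(weights, v)
        # restore the [batch_size, seq_len, num_heads, head_dim] layout
        return paddle.transpose(out, [0, 2, 1, 3])

With the q from the docstring example and dropout disabled, its output should roughly match scaled_dot_product_attention(q, q, q, None, 0.0, False), up to the numerical differences expected from float16.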