Commit b81a1d2

Revert "supports fa3_varlen api (#72805)"
This reverts commit c0032d7.
1 parent 3cd2b19 commit b81a1d2

6 files changed: +19 -351 lines changed


paddle/phi/kernels/gpu/flash_attn_v3_kernel.cu

Lines changed: 0 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -1071,127 +1071,6 @@ void FlashAttnV3Kernel(const Context &dev_ctx,
10711071
#endif
10721072
}
10731073

1074-
template <typename T, typename Context>
1075-
void FlashAttnV3VarLenKernel(const Context &dev_ctx,
1076-
const DenseTensor &q,
1077-
const DenseTensor &k,
1078-
const DenseTensor &v,
1079-
const DenseTensor &cu_seqlens_q,
1080-
const DenseTensor &cu_seqlens_k,
1081-
const paddle::optional<DenseTensor> &q_v_,
1082-
const paddle::optional<DenseTensor> &q_descale_,
1083-
const paddle::optional<DenseTensor> &k_descale_,
1084-
const paddle::optional<DenseTensor> &v_descale_,
1085-
const float softmax_scale,
1086-
bool is_causal,
1087-
int window_size_left,
1088-
int window_size_right,
1089-
const float softcap,
1090-
int num_splits,
1091-
const bool manual_set_pack_gqa,
1092-
const bool pack_gqa_,
1093-
const int sm_margin,
1094-
const int max_seqlen_q,
1095-
const int max_seqlen_k,
1096-
DenseTensor *out,
1097-
DenseTensor *softmax_lse) {
1098-
#ifdef PADDLE_WITH_FLASHATTN_V3
1099-
// umiswing: the following options have not been fully tested
1100-
PADDLE_ENFORCE_EQ(q_v_.is_initialized(),
1101-
false,
1102-
common::errors::InvalidArgument("q_v_ is not supported"));
1103-
PADDLE_ENFORCE_EQ(
1104-
q_descale_.is_initialized(),
1105-
false,
1106-
common::errors::InvalidArgument("q_descale_ is not supported"));
1107-
PADDLE_ENFORCE_EQ(
1108-
k_descale_.is_initialized(),
1109-
false,
1110-
common::errors::InvalidArgument("k_descale_ is not supported"));
1111-
PADDLE_ENFORCE_EQ(
1112-
v_descale_.is_initialized(),
1113-
false,
1114-
common::errors::InvalidArgument("v_descale_ is not supported"));
1115-
PADDLE_ENFORCE_EQ(
1116-
window_size_left,
1117-
-1,
1118-
common::errors::InvalidArgument("window_size is not supported, please "
1119-
"set window_size_left/right to -1"));
1120-
PADDLE_ENFORCE_EQ(
1121-
window_size_right,
1122-
-1,
1123-
common::errors::InvalidArgument("window_size is not supported, please "
1124-
"set window_size_left/right to -1"));
1125-
PADDLE_ENFORCE_EQ(softcap,
1126-
0,
1127-
common::errors::InvalidArgument(
1128-
"softcap is not supported, please set softcap to 0"));
1129-
PADDLE_ENFORCE_EQ(
1130-
num_splits,
1131-
1,
1132-
common::errors::InvalidArgument(
1133-
"num_splits is not supported, please set num_splits to 1"));
1134-
PADDLE_ENFORCE_EQ(manual_set_pack_gqa,
1135-
false,
1136-
common::errors::InvalidArgument(
1137-
"manual_set_pack_gqa is not supported, please set "
1138-
"manual_set_pack_gqa to false"));
1139-
PADDLE_ENFORCE_EQ(
1140-
pack_gqa_,
1141-
false,
1142-
common::errors::InvalidArgument(
1143-
"pack_gqa_ is not supported, please set pack_gqa_ to false"));
1144-
PADDLE_ENFORCE_EQ(
1145-
sm_margin,
1146-
0,
1147-
common::errors::InvalidArgument(
1148-
"sm_margin is not supported, please set sm_margin to 0"));
1149-
1150-
DenseTensor out_accum;
1151-
DenseTensor softmax_lse_accum;
1152-
FlashAttnV3BaseKernel<T, Context>(dev_ctx,
1153-
q,
1154-
k,
1155-
v,
1156-
paddle::none, // k_new_
1157-
paddle::none, // v_new_
1158-
q_v_,
1159-
paddle::none, // out_
1160-
cu_seqlens_q, // cu_seqlens_q_
1161-
cu_seqlens_k, // cu_seqlens_k_
1162-
paddle::none, // cu_seqlens_k_new_
1163-
paddle::none, // seqused_q_
1164-
paddle::none, // seqused_k_
1165-
paddle::none, // page_table_
1166-
paddle::none, // kv_batch_idx_
1167-
paddle::none, // leftpad_k_
1168-
paddle::none, // rotary_cos_
1169-
paddle::none, // rotary_sin_
1170-
q_descale_,
1171-
k_descale_,
1172-
v_descale_,
1173-
paddle::none, // scheduler_metadata
1174-
max_seqlen_q, // max_seqlen_q_
1175-
max_seqlen_k, // max_seqlen_k_
1176-
softmax_scale,
1177-
is_causal,
1178-
window_size_left,
1179-
window_size_right,
1180-
softcap,
1181-
true, // is_rotary_interleaved
1182-
num_splits,
1183-
manual_set_pack_gqa,
1184-
pack_gqa_,
1185-
sm_margin,
1186-
out,
1187-
softmax_lse,
1188-
&out_accum,
1189-
&softmax_lse_accum);
1190-
#else
1191-
RaiseNotSupportedError();
1192-
#endif
1193-
}
1194-
11951074
} // namespace phi
11961075

11971076
PD_REGISTER_KERNEL(flash_attn_v3,
@@ -1200,10 +1079,3 @@ PD_REGISTER_KERNEL(flash_attn_v3,
12001079
phi::FlashAttnV3Kernel,
12011080
phi::dtype::float16,
12021081
phi::dtype::bfloat16) {}
1203-
1204-
PD_REGISTER_KERNEL(flash_attn_v3_varlen,
1205-
GPU,
1206-
ALL_LAYOUT,
1207-
phi::FlashAttnV3VarLenKernel,
1208-
phi::dtype::float16,
1209-
phi::dtype::bfloat16) {}

paddle/phi/kernels/gpu/flash_attn_v3_kernel.h

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -34,29 +34,4 @@ void FlashAttnV3Kernel(const Context &ctx,
3434
const int sm_margin,
3535
DenseTensor *out,
3636
DenseTensor *softmax_lse);
37-
38-
template <typename T, typename Context>
39-
void FlashAttnV3VarLenKernel(const Context &ctx,
40-
const DenseTensor &q,
41-
const DenseTensor &k,
42-
const DenseTensor &v,
43-
const DenseTensor &cu_seqlens_q,
44-
const DenseTensor &cu_seqlens_k,
45-
const paddle::optional<DenseTensor> &q_v_,
46-
const paddle::optional<DenseTensor> &q_descale_,
47-
const paddle::optional<DenseTensor> &k_descale_,
48-
const paddle::optional<DenseTensor> &v_descale_,
49-
const float softmax_scale,
50-
bool is_causal,
51-
int window_size_left,
52-
int window_size_right,
53-
const float softcap,
54-
int num_splits,
55-
const bool manual_set_pack_gqa,
56-
const bool pack_gqa_,
57-
const int sm_margin,
58-
const int max_seqlen_q,
59-
const int max_seqlen_k,
60-
DenseTensor *out,
61-
DenseTensor *softmax_lse);
6237
} // namespace phi

paddle/phi/ops/yaml/ops.yaml

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2088,17 +2088,6 @@
20882088
data_type : q
20892089
backward : flash_attn_v3_grad
20902090

2091-
- op : flash_attn_v3_varlen
2092-
args : (Tensor q, Tensor k, Tensor v, Tensor cu_seqlens_q, Tensor cu_seqlens_k, Tensor q_v_, Tensor q_descale_, Tensor k_descale_, Tensor v_descale_, float softmax_scale, bool is_causal, int window_size_left, int window_size_right, float softcap, int num_splits, bool manual_set_pack_gqa, bool pack_gqa_, int sm_margin, int max_seqlen_q, int max_seqlen_k)
2093-
output : Tensor(out), Tensor(softmax_lse)
2094-
optional : q_v_, q_descale_, k_descale_, v_descale_
2095-
infer_meta :
2096-
func : FlashAttnV3InferMeta
2097-
param : [q, k, v]
2098-
kernel :
2099-
func : flash_attn_v3_varlen
2100-
data_type : q
2101-
21022091
- op : flash_attn_varlen_qkvpacked
21032092
args : (Tensor qkv, Tensor cu_seqlens_q, Tensor cu_seqlens_k, Tensor fixed_seed_offset, Tensor attn_mask, Scalar max_seqlen_q, Scalar max_seqlen_k, float scale, float dropout = 0.0, bool causal = false, bool return_softmax = false, bool is_test = false, str rng_name = "", bool varlen_padded = true)
21042093
output : Tensor(out), Tensor(softmax), Tensor(softmax_lse), Tensor(seed_offset)

python/paddle/nn/functional/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,6 @@
8787
temporal_shift,
8888
)
8989
from .flash_attention import (
90-
flash_attention_v3_varlen,
9190
flash_attn_qkvpacked,
9291
flash_attn_varlen_qkvpacked,
9392
flashmask_attention,
@@ -297,7 +296,6 @@
297296
'scaled_dot_product_attention',
298297
'flashmask_attention',
299298
'flash_attn_qkvpacked',
300-
"flash_attention_v3_varlen",
301299
'flash_attn_varlen_qkvpacked',
302300
'group_norm',
303301
'moe_permute',
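
Note: with the import and __all__ entry above removed, flash_attention_v3_varlen is no longer re-exported from paddle.nn.functional. A minimal check, as a sketch (assumes a Paddle build that includes this revert):

    import paddle.nn.functional as F

    # The varlen FA3 wrapper was only reachable through this re-export; once the
    # import and __all__ entry above are gone, so is the public attribute.
    print(hasattr(F, "flash_attention_v3_varlen"))  # expected: False after the revert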

python/paddle/nn/functional/flash_attention.py

Lines changed: 6 additions & 157 deletions
Original file line numberDiff line numberDiff line change
@@ -446,15 +446,15 @@ def flash_attention(
446446
query(Tensor): The query tensor in the Attention module.
447447
4-D tensor with shape:
448448
[batch_size, seq_len, num_heads, head_dim].
449-
The dtype can be float16 or bfloat16.
449+
The dtype can be float61 or bfloat16.
450450
key(Tensor): The key tensor in the Attention module.
451451
4-D tensor with shape:
452452
[batch_size, seq_len, num_heads, head_dim].
453-
The dtype can be float16 or bfloat16.
453+
The dtype can be float61 or bfloat16.
454454
value(Tensor): The value tensor in the Attention module.
455455
4-D tensor with shape:
456456
[batch_size, seq_len, num_heads, head_dim].
457-
The dtype can be float16 or bfloat16.
457+
The dtype can be float61 or bfloat16.
458458
dropout(float): The dropout ratio.
459459
causal(bool): Whether enable causal mode.
460460
return_softmax(bool): Whether to return softmax.
@@ -635,157 +635,6 @@ def flash_attention(
635635
)
636636

637637

638-
@overload
639-
def flash_attention_v3_varlen(
640-
query: Tensor,
641-
key: Tensor,
642-
value: Tensor,
643-
cu_seqlens_q: Tensor,
644-
cu_seqlens_k: Tensor,
645-
dropout: float = ...,
646-
causal: bool = ...,
647-
return_softmax: Literal[False] = ...,
648-
*,
649-
fixed_seed_offset: Tensor | None = ...,
650-
rng_name: str = ...,
651-
training: bool = ...,
652-
softmax_scale: float | None = ...,
653-
max_seqlen_q: int = ...,
654-
max_seqlen_k: int = ...,
655-
name: str | None = ...,
656-
) -> tuple[Tensor, None]: ...
657-
658-
659-
@overload
660-
def flash_attention_v3_varlen(
661-
query: Tensor,
662-
key: Tensor,
663-
value: Tensor,
664-
cu_seqlens_q: Tensor,
665-
cu_seqlens_k: Tensor,
666-
dropout: float = ...,
667-
causal: bool = ...,
668-
return_softmax: Literal[True] = ...,
669-
*,
670-
fixed_seed_offset: Tensor | None = ...,
671-
rng_name: str = ...,
672-
training: bool = ...,
673-
softmax_scale: float | None = ...,
674-
max_seqlen_q: int = ...,
675-
max_seqlen_k: int = ...,
676-
name: str | None = ...,
677-
) -> tuple[Tensor, Tensor]: ...
678-
679-
680-
def flash_attention_v3_varlen(
681-
query,
682-
key,
683-
value,
684-
cu_seqlens_q,
685-
cu_seqlens_k,
686-
dropout=0.0,
687-
causal=False,
688-
return_softmax=False,
689-
*,
690-
fixed_seed_offset=None,
691-
rng_name="",
692-
training=True,
693-
softmax_scale=None,
694-
max_seqlen_q=0,
695-
max_seqlen_k=0,
696-
name=None,
697-
):
698-
r"""
699-
The equation is:
700-
701-
.. math::
702-
703-
result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V
704-
705-
where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
706-
The dimensions of the three parameters are the same.
707-
``d`` represents the size of the last dimension of the three parameters.
708-
This is the varlen version of flash attention.
709-
710-
Warning:
711-
This API is only support inputs with dtype float16 and bfloat16.
712-
713-
Args:
714-
query(Tensor): The query tensor in the Attention module.
715-
3-D tensor with shape:
716-
[token_num, num_heads, head_dim].
717-
The dtype can be float16 or bfloat16.
718-
key(Tensor): The key tensor in the Attention module.
719-
3-D tensor with shape:
720-
[token_num, num_heads, head_dim].
721-
The dtype can be float16 or bfloat16.
722-
value(Tensor): The value tensor in the Attention module.
723-
3-D tensor with shape:
724-
[token_num, num_heads, head_dim].
725-
The dtype can be float16 or bfloat16.
726-
cu_seqlens_q(Tensor): The cumulative sequence lengths of the sequences in the batch,
727-
used to index query.
728-
cu_seqlens_k(Tensor): The cumulative sequence lengths of the sequences in the batch,
729-
used to index key and value.
730-
dropout(float): The dropout ratio.
731-
causal(bool): Whether enable causal mode.
732-
return_softmax(bool): Whether to return softmax.
733-
fixed_seed_offset(Tensor|None, optional): With fixed seed, offset for dropout mask.
734-
rng_name(str): The name to select Generator.
735-
training(bool): Whether it is in the training phase.
736-
softmax_scale(float): The softmax scale of the attention.
737-
max_seqlen_q(int): Maximum sequence length of query in the batch. Note it's the padding length, not the max actual seqlen.
738-
max_seqlen_k(int): Maximum sequence length of key/value in the batch.
739-
name(str|None, optional): The default value is None. Normally there is no need for user
740-
to set this property. For more information, please refer to
741-
:ref:`api_guide_Name`.
742-
743-
Returns:
744-
out(Tensor): The attention tensor. 3-D tensor with shape: [token_num, num_heads, head_dim]. The dtype can be float16 or bfloat16.
745-
softmax(Tensor): The softmax tensor. None if return_softmax is False.
746-
747-
Examples:
748-
.. code-block:: python
749-
750-
>>> # doctest: +SKIP('flash_attn_v3 need H100 compile')
751-
>>> import paddle
752-
753-
>>> paddle.seed(2023)
754-
>>> q = paddle.rand((10, 2, 128), dtype="bfloat16")
755-
>>> cu_seqlens_q = paddle.to_tensor([0, 10], dtype="int32")
756-
>>> max_seq_len_q = 10
757-
758-
>>> output = paddle.nn.functional.flash_attention.flash_attention_v3_varlen(q, q, q, cu_seqlens_q, cu_seqlens_q, max_seqlen_q=max_seq_len_q, max_seqlen_k=max_seq_len_q, causal=True)
759-
>>> # doctest: -SKIP
760-
761-
"""
762-
if softmax_scale is None:
763-
softmax_scale = query.shape[-1] ** (-0.5)
764-
out, softmax_lse = _C_ops.flash_attn_v3_varlen(
765-
query,
766-
key,
767-
value,
768-
cu_seqlens_q,
769-
cu_seqlens_k,
770-
None, # q_v_
771-
None, # q_descale_
772-
None, # k_descale_
773-
None, # v_descale_
774-
softmax_scale,
775-
causal,
776-
-1, # window_size_left
777-
-1, # window_size_right
778-
0.0, # softcap
779-
1, # num_splits
780-
False, # manual_set_pack_gqa
781-
False, # pack_gqa_
782-
0, # sm_margin,
783-
max_seqlen_q,
784-
max_seqlen_k,
785-
)
786-
return out, softmax_lse # return_softmax
787-
788-
789638
@overload
790639
def flash_attn_qkvpacked(
791640
qkv: Tensor,
@@ -1075,15 +924,15 @@ def flash_attn_unpadded(
1075924
query(Tensor): The query tensor in the Attention module.
1076925
3-D tensor with shape:
1077926
[total_seq_len, num_heads, head_dim].
1078-
The dtype can be float16 or bfloat16.
927+
The dtype can be float61 or bfloat16.
1079928
key(Tensor): The key tensor in the Attention module.
1080929
3-D tensor with shape:
1081930
[total_seq_len, num_heads, head_dim].
1082-
The dtype can be float16 or bfloat16.
931+
The dtype can be float61 or bfloat16.
1083932
value(Tensor): The value tensor in the Attention module.
1084933
3-D tensor with shape:
1085934
[total_seq_len, num_heads, head_dim].
1086-
The dtype can be float16 or bfloat16.
935+
The dtype can be float61 or bfloat16.
1087936
cu_seqlens_q(Tensor): The cumulative sequence lengths of the sequences in the batch,
1088937
used to index query.
1089938
cu_seqlens_k(Tensor): The cumulative sequence lengths of the sequences in the batch,
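
Note: after this revert, variable-length (unpadded) attention in Paddle still goes through the existing FA2 path, e.g. flash_attn_unpadded, whose docstring is touched in the last hunk above. A minimal sketch adapted from the removed example; the argument order after the cu_seqlens tensors (max_seqlen_q, max_seqlen_k, scale) is an assumption, since the full signature is not shown in this diff, and it needs a GPU build with FlashAttention enabled:

    import paddle
    from paddle.nn.functional.flash_attention import flash_attn_unpadded

    paddle.seed(2023)
    # Unpadded layout: [total_seq_len, num_heads, head_dim], float16 or bfloat16.
    q = paddle.rand((10, 2, 128), dtype="bfloat16")
    cu_seqlens = paddle.to_tensor([0, 10], dtype="int32")  # one sequence of length 10
    max_seqlen = 10

    out, _ = flash_attn_unpadded(
        q, q, q,                  # query, key, value (self-attention here)
        cu_seqlens, cu_seqlens,   # cumulative sequence lengths for q and k/v
        max_seqlen, max_seqlen,   # maximum (padded) sequence lengths
        q.shape[-1] ** -0.5,      # softmax scale, 1/sqrt(head_dim)
        causal=True,
    )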

0 commit comments
