
Commit efc45c0

add cp_tot_seqused_k to calc mask and block boundary
Signed-off-by: Ming Yang <minos.future@gmail.com>
1 parent e3f796f commit efc45c0

12 files changed, +74 -21 lines

hopper/block.h

Lines changed: 11 additions & 5 deletions
@@ -38,11 +38,17 @@ struct BlockMN {
         // TODO: check off-by-1 error
         if (PackGQA) { m_idx_max = qhead_per_khead_divmod.divide(m_idx_max - 1) + 1 ; }
         // If local, blocking (m_idx_max - m_idx_min + window_size_right + window_size_left)
-        n_block_max = std::min(n_block_max,
-                               cute::ceil_div(m_idx_max +
-                                              seqlen_info.cp_world_size * seqlen_k -
-                                              seqlen_q + window_size_right,
-                                              seqlen_info.cp_world_size * kBlockN));
+        if (seqlen_info.cp_world_size > 1) {
+            n_block_max = std::min(n_block_max,
+                                   cute::ceil_div(
+                                       cute::ceil_div(m_idx_max + seqlen_info.cp_tot_seqlen_k - seqlen_q + window_size_right - seqlen_info.cp_rank,
+                                                      seqlen_info.cp_world_size),
+                                       kBlockN));
+        } else {
+            n_block_max = std::min(n_block_max,
+                                   cute::ceil_div(m_idx_max + seqlen_k - seqlen_q + window_size_right,
+                                                  kBlockN));
+        }
     }
     // Now, only adjust n_block_min if split
     int n_block_min = 0;
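Note: the new CP branch bounds how many K blocks this rank must visit. Reading the mask change in hopper/mask.h below, K/V appear to be sharded round-robin across CP ranks (global K index = local index * cp_world_size + cp_rank), so the largest reachable global index is shifted by -cp_rank and divided by cp_world_size to get a local index before rounding up to kBlockN tiles. A minimal Python sketch of the same arithmetic (illustrative only, not part of the commit):

def ceil_div(a, b):
    # Integer ceiling division, matching cute::ceil_div for positive b.
    return (a + b - 1) // b

def cp_n_block_max(n_block_max, m_idx_max, seqlen_q, seqlen_k, window_size_right,
                   k_block_n, cp_world_size, cp_rank, cp_tot_seqlen_k):
    if cp_world_size > 1:
        # Largest global K index any row of this M tile may attend to, shifted by
        # -cp_rank so dividing by cp_world_size maps it to a local index on this
        # rank under the round-robin layout: global = local * cp_world_size + cp_rank.
        global_bound = m_idx_max + cp_tot_seqlen_k - seqlen_q + window_size_right - cp_rank
        local_bound = ceil_div(global_bound, cp_world_size)
        return min(n_block_max, ceil_div(local_bound, k_block_n))
    # No context parallelism: the usual causal/local bound on this rank's K.
    return min(n_block_max,
               ceil_div(m_idx_max + seqlen_k - seqlen_q + window_size_right, k_block_n))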

hopper/flash.h

Lines changed: 1 addition & 0 deletions
@@ -165,6 +165,7 @@ struct Flash_fwd_params : public Qkv_params {
     // CP (Context Parallelism) parameters
     int cp_world_size;
     int cp_rank;
+    int *__restrict__ cp_tot_seqused_k;
 };

 ////////////////////////////////////////////////////////////////////////////////////////////////////

hopper/flash_api.cpp

Lines changed: 10 additions & 1 deletion
@@ -703,7 +703,8 @@ mha_fwd(at::Tensor &q, // (b, s_q, h, d) or (total_q, h, d) if there is cu_seq
         int const sm_margin,
         std::optional<const at::Tensor> &s_aux_, // (h)
         int const cp_world_size, // context parallelism (cp) world size
-        int const cp_rank // cp rank
+        int const cp_rank, // cp rank
+        std::optional<const at::Tensor> &cp_tot_seqused_k_ // b. total seqused_k in cp world
         ) {

    auto dprops = at::cuda::getCurrentDeviceProperties();
@@ -841,6 +842,12 @@ mha_fwd(at::Tensor &q, // (b, s_q, h, d) or (total_q, h, d) if there is cu_seq
        CHECK_DEVICE(seqused_k); CHECK_CONTIGUOUS(seqused_k);
        CHECK_SHAPE(seqused_k, batch_size);
    }
+   if (cp_tot_seqused_k_.has_value()) {
+       auto cp_tot_seqused_k = cp_tot_seqused_k_.value();
+       TORCH_CHECK(cp_tot_seqused_k.dtype() == torch::kInt32, "cp_tot_seqused_k must have dtype int32");
+       CHECK_DEVICE(cp_tot_seqused_k); CHECK_CONTIGUOUS(cp_tot_seqused_k);
+       CHECK_SHAPE(cp_tot_seqused_k, batch_size);
+   }

    if (leftpad_k_.has_value()) {
        auto leftpad_k = leftpad_k_.value();
@@ -1152,6 +1159,8 @@ mha_fwd(at::Tensor &q, // (b, s_q, h, d) or (total_q, h, d) if there is cu_seq

    params.cp_world_size = cp_world_size;
    params.cp_rank = cp_rank;
+   params.cp_tot_seqused_k = cp_tot_seqused_k_.has_value() ?
+       static_cast<int *>(cp_tot_seqused_k_.value().data_ptr()) : nullptr;

 #ifdef FLASHATTENTION_DISABLE_LOCAL
    TORCH_CHECK(!params.is_local, "This flash attention build does not support local attention.");
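The new argument is validated like seqused_k: int32, on the same device, contiguous, shape (batch_size,). If, as the "total seqused_k in cp world" comment suggests, it holds the total used K length across all CP ranks per batch element, a caller might assemble it by summing the per-rank seqused_k over the CP process group. A hypothetical host-side sketch (build_cp_tot_seqused_k and cp_group are not part of this commit, and the sum-over-ranks semantics is an assumption):

import torch
import torch.distributed as dist

def build_cp_tot_seqused_k(seqused_k: torch.Tensor, cp_group) -> torch.Tensor:
    # Hypothetical helper: sum this rank's used K lengths over the CP group so the
    # result satisfies the checks above (int32, CUDA, contiguous, shape (batch_size,)).
    tot = seqused_k.to(torch.int32).contiguous().clone()
    dist.all_reduce(tot, op=dist.ReduceOp.SUM, group=cp_group)
    return tot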

hopper/flash_api_torch_lib.cpp

Lines changed: 4 additions & 2 deletions
@@ -54,7 +54,8 @@ mha_fwd(at::Tensor &q, // (b, s_q, h, d) or (total_q, h, d) if there is cu_seq
         int const sm_margin,
         std::optional<const at::Tensor> &s_aux_,
         int const cp_world_size,
-        int const cp_rank
+        int const cp_rank,
+        std::optional<const at::Tensor> &cp_tot_seqused_k
         );

 // Only applicable to the case where seqused_k (i.e. cache_seqlens) is available
@@ -124,7 +125,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
            "   int sm_margin,"
            "   Tensor? s_aux,"
            "   int cp_world_size,"
-           "   int cp_rank) -> Tensor[]");
+           "   int cp_rank,"
+           "   Tensor? cp_tot_seqused_k) -> Tensor[]");
    ops.impl("fwd", torch::kCUDA, make_pytorch_shim(&mha_fwd));

    ops.def("get_scheduler_metadata("

hopper/flash_attn_interface.py

Lines changed: 14 additions & 2 deletions
@@ -51,7 +51,8 @@ def _flash_attn_forward(
     sm_margin=0,
     s_aux=None,
     cp_world_size=1,
-    cp_rank=0):
+    cp_rank=0,
+    cp_tot_seqused_k=None):
     q, k, k_new, v_new = [maybe_contiguous(x) for x in (q, k, k_new, v_new)]
     v = v.contiguous() if v.stride(-1) != 1 and v.stride(-3) != 1 else v
     cu_seqlens_q, cu_seqlens_k, cu_seqlens_k_new = [
@@ -99,7 +100,8 @@ def _flash_attn_forward(
         sm_margin,
         s_aux,
         cp_world_size,
-        cp_rank
+        cp_rank,
+        cp_tot_seqused_k,
     )
     return out, softmax_lse, *rest

@@ -266,6 +268,7 @@ def forward(
         s_aux=None,
         cp_world_size=1,
         cp_rank=0,
+        cp_tot_seqused_k=None,
     ):
         if softmax_scale is None:
             softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
@@ -293,6 +296,7 @@ def forward(
             s_aux=s_aux,
             cp_world_size=cp_world_size,
             cp_rank=cp_rank,
+            cp_tot_seqused_k=cp_tot_seqused_k,
         )
         # ctx.save_for_backward(q, k, v, out_padded, softmax_lse)
         ctx.save_for_backward(q, k, v, out, softmax_lse)
@@ -361,6 +365,7 @@ def forward(
         s_aux=None,
         cp_world_size=1,
         cp_rank=0,
+        cp_tot_seqused_k=None,
     ):
         if softmax_scale is None:
             softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
@@ -392,6 +397,7 @@ def forward(
             s_aux=s_aux,
             cp_world_size=cp_world_size,
             cp_rank=cp_rank,
+            cp_tot_seqused_k=cp_tot_seqused_k,
         )
         # ctx.save_for_backward(q, k, v, out_padded, softmax_lse, cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k)
         ctx.save_for_backward(q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k)
@@ -511,6 +517,7 @@ def flash_attn_func(
     s_aux=None,
     cp_world_size=1,
     cp_rank=0,
+    cp_tot_seqused_k=None,
 ):
     """dropout_p should be set to 0.0 during evaluation
     Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
@@ -574,6 +581,7 @@ def flash_attn_func(
         s_aux,
         cp_world_size,
         cp_rank,
+        cp_tot_seqused_k,
     )


@@ -600,6 +608,7 @@ def flash_attn_varlen_func(
     s_aux=None,
     cp_world_size=1,
     cp_rank=0,
+    cp_tot_seqused_k=None,
 ):
     return FlashAttnVarlenFunc.apply(
         q,
@@ -624,6 +633,7 @@ def flash_attn_varlen_func(
         s_aux,
         cp_world_size,
         cp_rank,
+        cp_tot_seqused_k,
     )


@@ -664,6 +674,7 @@ def flash_attn_with_kvcache(
     s_aux=None,
     cp_world_size=1,
     cp_rank=0,
+    cp_tot_seqused_k=None,
 ):
     """
     If k and v are not None, k_cache and v_cache will be updated *inplace* with the new values from
@@ -793,6 +804,7 @@ def flash_attn_with_kvcache(
         s_aux=s_aux,
         cp_world_size=cp_world_size,
         cp_rank=cp_rank,
+        cp_tot_seqused_k=cp_tot_seqused_k,
     )
     # return (out, softmax_lse) if return_softmax_lse else out
     return (out, softmax_lse, *rest) if return_softmax_lse else out
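For context, a hypothetical call through the updated Python interface. The shapes, dtypes, import path, and the assumption that each rank holds an interleaved 1/cp_world_size shard of K/V are illustrative, not taken from the commit:

import torch
from flash_attn_interface import flash_attn_func  # import path depends on how the hopper extension is packaged

b, s_q, h, d = 2, 1024, 16, 128
cp_world_size, cp_rank = 4, 1
total_k = 4096
s_k_local = total_k // cp_world_size  # this rank's K/V shard length (assumed evenly divisible)

q = torch.randn(b, s_q, h, d, device="cuda", dtype=torch.bfloat16)
k = torch.randn(b, s_k_local, h, d, device="cuda", dtype=torch.bfloat16)
v = torch.randn(b, s_k_local, h, d, device="cuda", dtype=torch.bfloat16)
# Total K length across the whole CP group, one entry per batch element.
cp_tot_seqused_k = torch.full((b,), total_k, device="cuda", dtype=torch.int32)

outputs = flash_attn_func(
    q, k, v,
    causal=True,
    cp_world_size=cp_world_size,
    cp_rank=cp_rank,
    cp_tot_seqused_k=cp_tot_seqused_k,
)  # returns the attention output (plus softmax LSE, depending on the interface version)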

hopper/flash_fwd_kernel_sm90.h

Lines changed: 6 additions & 2 deletions
@@ -348,7 +348,9 @@ class FlashAttnFwdSm90 {
             params.mainloop.cu_seqlens_q, params.mainloop.cu_seqlens_k, params.mainloop.cu_seqlens_k_new,
             params.mainloop.seqused_q, params.mainloop.seqused_k, params.mainloop.leftpad_k,
             params.mainloop.seqlens_rotary,
-            params.mainloop.cp_world_size
+            params.mainloop.cp_world_size,
+            params.mainloop.cp_rank,
+            params.mainloop.cp_tot_seqused_k
         };
         if constexpr (AppendKV) {
             bool tile_new_valid = mainloop.load_kv_new(
@@ -397,7 +399,9 @@ class FlashAttnFwdSm90 {
             get<0>(params.mainloop.shape_K_new),
             params.mainloop.cu_seqlens_q, params.mainloop.cu_seqlens_k, params.mainloop.cu_seqlens_k_new,
             params.mainloop.seqused_q, params.mainloop.seqused_k, params.mainloop.leftpad_k,
-            params.mainloop.seqlens_rotary, params.mainloop.cp_world_size
+            params.mainloop.seqlens_rotary, params.mainloop.cp_world_size,
+            params.mainloop.cp_rank,
+            params.mainloop.cp_tot_seqused_k
         };
         if constexpr (AppendKV) {
             bool tile_new_valid = mainloop.store_kv_new(

hopper/flash_fwd_launch_template.h

Lines changed: 1 addition & 1 deletion
@@ -131,7 +131,7 @@ void run_flash_fwd(Flash_fwd_params &params, cudaStream_t stream) {
         params.seqused_q, params.seqused_k,
         params.leftpad_k, params.seqlens_rotary,
         static_cast<ElementS const*>(params.s_aux_ptr),
-        params.cp_world_size, params.cp_rank,
+        params.cp_world_size, params.cp_rank, params.cp_tot_seqused_k
     };
     typename CollectiveEpilogue::Arguments epilogue_args {
         static_cast<ElementOut*>(params.o_ptr),

hopper/mainloop_fwd_sm80.hpp

Lines changed: 1 addition & 0 deletions
@@ -217,6 +217,7 @@ struct CollectiveMainloopFwdSm80 {
         ElementSAux const* const ptr_S_aux = nullptr;
         int cp_world_size;
         int cp_rank;
+        int const* const cp_tot_seqused_k = nullptr;
     };

     // Device side kernel params

hopper/mainloop_fwd_sm90_tma_gmma_ws.hpp

Lines changed: 4 additions & 2 deletions
@@ -415,6 +415,7 @@ struct CollectiveMainloopFwdSm90 {
         // Context parallelism (CP) parameters
         int const cp_world_size = 1;
         int const cp_rank = 0;
+        int const* const cp_tot_seqused_k = nullptr;
     };

     // Device side kernel params
@@ -474,6 +475,7 @@ struct CollectiveMainloopFwdSm90 {
         ElementSAux const* const ptr_S_aux = nullptr;
         int cp_world_size = 1;
         int cp_rank = 0;
+        int const* const cp_tot_seqused_k = nullptr;
     };

     static Params
@@ -590,7 +592,7 @@ struct CollectiveMainloopFwdSm90 {
                 args.cu_seqlens_q, args.cu_seqlens_k, args.cu_seqlens_k_new,
                 args.seqused_q, args.seqused_k, args.leftpad_k, args.seqlens_rotary,
                 args.ptr_S_aux,
-                args.cp_world_size, args.cp_rank};
+                args.cp_world_size, args.cp_rank, args.cp_tot_seqused_k};
     }

     /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
@@ -1101,7 +1103,7 @@ struct CollectiveMainloopFwdSm90 {
         flash::Mask<kBlockM, kBlockN, PackGQA, TiledMmaQK> mask(
             thread_idx, seqlen_q, seqlen_k, params.window_size_left, params.window_size_right, 0 - n_offset /*sink_token_length*/,
             params.qhead_per_khead_divmod,
-            params.cp_world_size, params.cp_rank
+            params.cp_world_size, params.cp_rank, seqlen_info.cp_tot_seqlen_k
         );

         float softcap_val = params.softcap_val;

hopper/mask.h

Lines changed: 5 additions & 4 deletions
@@ -23,13 +23,13 @@ struct Mask {
     int const seqlen_q, seqlen_k;
     int const window_size_left, window_size_right, sink_token_length;
     cutlass::FastDivmod const qhead_per_khead_divmod;
-    int const cp_world_size, cp_rank;
+    int const cp_world_size, cp_rank, cp_tot_seqlen_k;

     CUTLASS_DEVICE
     Mask(const int thread_idx, const int seqlen_q, const int seqlen_k,
          const int window_size_left, const int window_size_right, const int sink_token_length,
          cutlass::FastDivmod const &qhead_per_khead_divmod,
-         const int cp_world_size = 1, const int cp_rank = 0)
+         const int cp_world_size = 1, const int cp_rank = 0, const int cp_tot_seqlen_k = 0)
         : thread_idx(thread_idx)
         , seqlen_q(seqlen_q)
         , seqlen_k(seqlen_k)
@@ -39,6 +39,7 @@ struct Mask {
         , qhead_per_khead_divmod(qhead_per_khead_divmod)
         , cp_world_size(cp_world_size)
         , cp_rank(cp_rank)
+        , cp_tot_seqlen_k(cp_tot_seqlen_k)
     {
     };

@@ -102,8 +103,8 @@ struct Mask {
                 if (cp_world_size > 1) {
                     int local_k_idx = int(get<Col>(t0ScS_rowcol(_0{}, n))) + get<Col>(tScS_rowcol(_0{}, _0{})) + n_block * kBlockN;
                     int abs_k_idx = local_k_idx * cp_world_size + cp_rank;
-                    int k_limit = row_idx + cp_world_size * seqlen_k - seqlen_q;
-                    if (abs_k_idx > k_limit || (Seqlenk_mask && abs_k_idx > cp_world_size * seqlen_k)) {
+                    int k_limit = row_idx + cp_tot_seqlen_k - seqlen_q;
+                    if (abs_k_idx > k_limit || (Seqlenk_mask && abs_k_idx >= cp_tot_seqlen_k)) {
                         tSrS_rowcol(m, n) = -INFINITY;
                     }
                 } else {
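The masking change can be read as: both the causal limit and the out-of-range check are now expressed against the explicit total cp_tot_seqlen_k instead of cp_world_size * seqlen_k (which is only exact when every rank holds exactly seqlen_k tokens), and the boundary test tightens from > to >= since valid global K indices run from 0 to cp_tot_seqlen_k - 1. A minimal Python sketch of the per-element predicate (illustrative only, not part of the commit):

def cp_masked_out(local_k_idx, row_idx, seqlen_q, cp_world_size, cp_rank,
                  cp_tot_seqlen_k, seqlenk_mask=True):
    # Global K position of this rank's local column under round-robin sharding.
    abs_k_idx = local_k_idx * cp_world_size + cp_rank
    # Causal limit in global coordinates: the query row may attend up to
    # row_idx + cp_tot_seqlen_k - seqlen_q.
    k_limit = row_idx + cp_tot_seqlen_k - seqlen_q
    return abs_k_idx > k_limit or (seqlenk_mask and abs_k_idx >= cp_tot_seqlen_k)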
