@@ -547,7 +547,7 @@ struct CollectiveMainloopFwdSm90 {
547547 return nullptr ;
548548 }
549549 }();
550-
550+
551551 auto const shape_Qv_packed = cute::conditional_return<!PackGQA>(
552552 shape_Qv,
553553 make_shape (make_shape (qhead_per_khead, get<0 >(shape_Qv)), get<1 >(shape_Qv), get<2 >(args.shape_K ), get<3 >(shape_Qv))
@@ -1007,7 +1007,6 @@ struct CollectiveMainloopFwdSm90 {
10071007 static constexpr int kBlockN = get<1 >(TileShape_MNK{});
10081008
10091009 // can't use auto [m_block, ...] = block_coord since structured binding cannot be captured in lambda
1010- // block index
10111010 int const m_block = get<0 >(block_coord);
10121011 int const bidh = get<1 >(block_coord);
10131012 int const bidb = get<2 >(block_coord);
@@ -1103,7 +1102,7 @@ struct CollectiveMainloopFwdSm90 {
11031102 flash::Mask<kBlockM , kBlockN , PackGQA, TiledMmaQK> mask (
11041103 thread_idx, seqlen_q, seqlen_k, params.window_size_left , params.window_size_right , 0 - n_offset /* sink_token_length*/ ,
11051104 params.qhead_per_khead_divmod ,
1106- params.cp_world_size , params.cp_rank , seqlen_info.cp_tot_seqlen_k
1105+ params.cp_world_size , params.cp_rank , seqlen_info.tot_seqlen_k
11071106 );
11081107
11091108 float softcap_val = params.softcap_val ;
@@ -1211,7 +1210,6 @@ struct CollectiveMainloopFwdSm90 {
12111210 }
12121211
12131212 if constexpr (IntraWGOverlap) {
1214-
12151213 Tensor tSrS = partition_fragment_C (tiled_mma_qk, select<0 , 1 >(TileShape_MNK{}));
12161214 consumer_wait (pipeline_k, smem_pipe_read);
12171215 flash::gemm</* zero_init=*/ true , /* wg_wait=*/ -1 >(tiled_mma_qk, tSrQ, tSrK (_, _, _, smem_pipe_read.index ()), tSrS);
@@ -1283,8 +1281,7 @@ struct CollectiveMainloopFwdSm90 {
12831281 };
12841282
12851283 if constexpr (Is_causal || Is_local) { // Separate iterations with causal or local masking
1286- auto mask_fn = [&](auto & tSrS, int n_block) {
1287- mask.template apply <false /* Seqlenk_mask*/ , Is_causal, Is_local>(tSrS, m_block, n_block); };
1284+ auto mask_fn = [&](auto & tSrS, int n_block) { mask.template apply <false /* Seqlenk_mask*/ , Is_causal, Is_local>(tSrS, m_block, n_block); };
12881285 int const m_idx_min = !PackGQA ? m_block * kBlockM : params.qhead_per_khead_divmod .divide (m_block * kBlockM );
12891286 // If local, blocking (window_size_right + window_size_left)
12901287 int const n_block_min_causal_local_mask =
@@ -1297,13 +1294,15 @@ struct CollectiveMainloopFwdSm90 {
12971294
12981295 int const m_idx_max = !PackGQA ? (m_block + 1 ) * kBlockM : params.qhead_per_khead_divmod .divide ((m_block + 1 ) * kBlockM - 1 ) + 1 ;
12991296 // If local, blocking (m_idx_max - m_idx_min)
1297+ // When context parallelism (CP) is disabled, tot_seqlen_k equals seqlen_k and cp_world_size is 1.
1298+ // cp_world_size is guaranteed to be greater than 0.
13001299 int const n_block_min_before_local_mask = !Is_local
13011300 ? n_block_min
13021301 : std::max (n_block_min,
1303- cute::ceil_div (m_idx_max +
1304- params.cp_world_size * seqlen_k -
1305- seqlen_q - params. window_size_left ,
1306- params. cp_world_size * kBlockN ));
1302+ cute::ceil_div (
1303+ cute::ceil_div (m_idx_max + seqlen_info. tot_seqlen_k - seqlen_q - params.window_size_left - seqlen_info. cp_rank ,
1304+ seqlen_info. cp_world_size ) ,
1305+ kBlockN ));
13071306 auto no_mask_fn = [](auto & tSrS, int n_block) { };
13081307 #pragma unroll 1
13091308 for (; n_block >= n_block_min_before_local_mask; --n_block) {
@@ -1429,7 +1428,7 @@ struct CollectiveMainloopFwdSm90 {
14291428 // Tensor scores_scale = softmax.finalize(v_descale);
14301429 Tensor scores_scale = make_tensor_like (softmax.row_max );
14311430 finalize_dispatch (scores_scale, v_descale);
1432-
1431+
14331432 if constexpr (LargeHeadDimV) {
14341433 cutlass::arch::NamedBarrier::sync (NumMmaThreads, static_cast <uint32_t >(FwdNamedBarriers::PEmpty) /* id*/ );
14351434 store_scales (scores_scale, smem_pipe_read.index ());
0 commit comments