vllm-project
diff --git a/‎hopper/block.h‎
Lines changed: 1 addition & 1 deletion b/‎hopper/block.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎hopper/flash.h‎
Lines changed: 4 additions & 0 deletions b/‎hopper/flash.h‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎hopper/flash_api.cpp‎
Lines changed: 10 additions & 5 deletions b/‎hopper/flash_api.cpp‎
Lines changed: 10 additions & 5 deletions
diff --git a/‎hopper/flash_api_torch_lib.cpp‎
Lines changed: 7 additions & 3 deletions b/‎hopper/flash_api_torch_lib.cpp‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎hopper/flash_attn_interface.py‎
Lines changed: 27 additions & 3 deletions b/‎hopper/flash_attn_interface.py‎
Lines changed: 27 additions & 3 deletions
diff --git a/‎hopper/flash_fwd_kernel_sm90.h‎
Lines changed: 3 additions & 2 deletions b/‎hopper/flash_fwd_kernel_sm90.h‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎hopper/flash_fwd_launch_template.h‎
Lines changed: 5 additions & 1 deletion b/‎hopper/flash_fwd_launch_template.h‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎hopper/mainloop_fwd_sm80.hpp‎
Lines changed: 2 additions & 0 deletions b/‎hopper/mainloop_fwd_sm80.hpp‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎hopper/mainloop_fwd_sm90_tma_gmma_ws.hpp‎
Lines changed: 16 additions & 6 deletions b/‎hopper/mainloop_fwd_sm90_tma_gmma_ws.hpp‎
Lines changed: 16 additions & 6 deletions
@@ -39,7 +39,7 @@ struct BlockMN {
             if (PackGQA) { m_idx_max = qhead_per_khead_divmod.divide(m_idx_max - 1) + 1 ; }
             // If local, blocking (m_idx_max - m_idx_min + window_size_right + window_size_left)  
             n_block_max = std::min(n_block_max,
-                                   cute::ceil_div(m_idx_max + seqlen_k - seqlen_q + window_size_right, kBlockN));
+                                   cute::ceil_div(m_idx_max + seqlen_info.cp_world_size * seqlen_k - seqlen_q + window_size_right, kBlockN));
         }
         // Now, only adjust n_block_min if split
         int n_block_min = 0;
 
@@ -161,6 +161,10 @@ struct Flash_fwd_params : public Qkv_params {
 
     // The S extra matrix, (num_heads)
     void *__restrict__ s_aux_ptr;
+
+    // CP (Context Parallelism) parameters
+    int cp_world_size;
+    int cp_rank;
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -434,8 +434,8 @@ inline bool get_pack_gqa(Flash_fwd_params const& params) {
     if (params.arch < 90 || (params.page_table && !params.pagedkv_tma) || params.num_splits > 1) { return true; }
     // Always enable PackGQA for special case of hdim = 64, qheads/kvheads = 8, local attention
     // TODO: investigate more cases where PackGQA improves perf due to better tile quantization
-    bool const packgqa_override = params.arch >= 90 && (params.h / params.h_k) == 8 && 
-                                  params.is_local && 
+    bool const packgqa_override = params.arch >= 90 && (params.h / params.h_k) == 8 &&
+                                  params.is_local &&
                                   params.d == 64 && (params.dv == params.d);
     if (packgqa_override) { return true; }
     #ifdef FLASHATTENTION_DISABLE_PACKGQA
@@ -701,7 +701,9 @@ mha_fwd(at::Tensor &q,   // (b, s_q, h, d) or (total_q, h, d) if there is cu_seq
         int num_splits,
         std::optional<bool> pack_gqa_,
         int const sm_margin,
-        std::optional<const at::Tensor> &s_aux_ // (h)
+        std::optional<const at::Tensor> &s_aux_, // (h)
+        int const cp_world_size,  // context parallelism (cp) world size
+        int const cp_rank         // cp rank
         ) {
 
     auto dprops = at::cuda::getCurrentDeviceProperties();
@@ -784,7 +786,7 @@ mha_fwd(at::Tensor &q,   // (b, s_q, h, d) or (total_q, h, d) if there is cu_seq
         }
         #ifdef FLASHATTENTION_DISABLE_HDIMDIFF64
         TORCH_CHECK(head_size > 64, "This flash attention build does not support hdim != hdim_v when hdim <= 64");
-        #endif 
+        #endif
         #ifdef FLASHATTENTION_DISABLE_HDIMDIFF192
         TORCH_CHECK(head_size <= 64, "This flash attention build does not support hdim != hdim_v when hdim in (128, 192]");
         #endif
@@ -1148,6 +1150,9 @@ mha_fwd(at::Tensor &q,   // (b, s_q, h, d) or (total_q, h, d) if there is cu_seq
         params.s_aux_ptr = nullptr;
     }
 
+    params.cp_world_size = cp_world_size;
+    params.cp_rank = cp_rank;
+
     #ifdef FLASHATTENTION_DISABLE_LOCAL
     TORCH_CHECK(!params.is_local, "This flash attention build does not support local attention.");
     #endif
@@ -1664,4 +1669,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     m.def("get_scheduler_metadata", &mha_fwd_get_scheduler_metadata, "Get scheduler metadata for varlen forward pass");
 }
 
-#endif
+#endif
@@ -52,7 +52,9 @@ mha_fwd(at::Tensor &q,   // (b, s_q, h, d) or (total_q, h, d) if there is cu_seq
         int num_splits,
         std::optional<bool> pack_gqa_,
         int const sm_margin,
-        std::optional<const at::Tensor> &s_aux_ 
+        std::optional<const at::Tensor> &s_aux_,
+        int const cp_world_size,  // context parallelism (cp) world size
+        int const cp_rank         // cp rank
 );
 
 // Only applicable to the case where seqused_k (i.e. cache_seqlens) is available
@@ -120,7 +122,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
             "    int      num_splits,"
             "    bool?    pack_gqa,"
             "    int      sm_margin,"
-            "    Tensor?  s_aux) -> Tensor[]");
+            "    Tensor?  s_aux,"
+            "    int      cp_world_size,"
+            "    int      cp_rank) -> Tensor[]");
     ops.impl("fwd", torch::kCUDA, make_pytorch_shim(&mha_fwd));
 
     ops.def("get_scheduler_metadata("
@@ -151,4 +155,4 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
         make_pytorch_shim(&mha_fwd_get_scheduler_metadata));
 }
 
-REGISTER_EXTENSION(TORCH_EXTENSION_NAME);
+REGISTER_EXTENSION(TORCH_EXTENSION_NAME);
@@ -49,7 +49,9 @@ def _flash_attn_forward(
         num_splits=1,
         pack_gqa=None,
         sm_margin=0,
-        s_aux=None):
+        s_aux=None,
+        cp_world_size=1,
+        cp_rank=0):
     q, k, k_new, v_new = [maybe_contiguous(x) for x in (q, k, k_new, v_new)]
     v = v.contiguous() if v.stride(-1) != 1 and v.stride(-3) != 1 else v
     cu_seqlens_q, cu_seqlens_k, cu_seqlens_k_new = [
@@ -95,7 +97,9 @@ def _flash_attn_forward(
         num_splits,
         pack_gqa,
         sm_margin,
-        s_aux
+        s_aux,
+        cp_world_size,
+        cp_rank
     )
     return out, softmax_lse, *rest
 
@@ -235,7 +239,7 @@ def backward(ctx, dout, *args):
             ctx.causal,
             ctx.window_size,
             ctx.softcap,
-            ctx.deterministic, 
+            ctx.deterministic,
         )
         dqkv = dqkv[..., : dout.shape[-1]]  # We could have padded the head dimension
         return dqkv, None, None, None, None, None, None, None, None, None, None
@@ -260,6 +264,8 @@ def forward(
         deterministic=False,
         sm_margin=0,
         s_aux=None,
+        cp_world_size=1,
+        cp_rank=0,
     ):
         if softmax_scale is None:
             softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
@@ -285,6 +291,8 @@ def forward(
             pack_gqa=pack_gqa,
             sm_margin=sm_margin,
             s_aux=s_aux,
+            cp_world_size=cp_world_size,
+            cp_rank=cp_rank,
         )
         # ctx.save_for_backward(q, k, v, out_padded, softmax_lse)
         ctx.save_for_backward(q, k, v, out, softmax_lse)
@@ -351,6 +359,8 @@ def forward(
         deterministic=False,
         sm_margin=0,
         s_aux=None,
+        cp_world_size=1,
+        cp_rank=0,
     ):
         if softmax_scale is None:
             softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
@@ -380,6 +390,8 @@ def forward(
             pack_gqa=pack_gqa,
             sm_margin=sm_margin,
             s_aux=s_aux,
+            cp_world_size=cp_world_size,
+            cp_rank=cp_rank,
         )
         # ctx.save_for_backward(q, k, v, out_padded, softmax_lse, cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k)
         ctx.save_for_backward(q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k)
@@ -497,6 +509,8 @@ def flash_attn_func(
     deterministic=False,
     sm_margin=0,
     s_aux=None,
+    cp_world_size=1,
+    cp_rank=0,
 ):
     """dropout_p should be set to 0.0 during evaluation
     Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
@@ -558,6 +572,8 @@ def flash_attn_func(
         deterministic,
         sm_margin,
         s_aux,
+        cp_world_size,
+        cp_rank,
     )
 
 
@@ -582,6 +598,8 @@ def flash_attn_varlen_func(
     deterministic=False,
     sm_margin=0,
     s_aux=None,
+    cp_world_size=1,
+    cp_rank=0,
 ):
     return FlashAttnVarlenFunc.apply(
         q,
@@ -604,6 +622,8 @@ def flash_attn_varlen_func(
         deterministic,
         sm_margin,
         s_aux,
+        cp_world_size,
+        cp_rank,
     )
 
 
@@ -642,6 +662,8 @@ def flash_attn_with_kvcache(
     sm_margin=0,     # Can be tuned if some SMs are used for communication
     return_softmax_lse=False,
     s_aux=None,
+    cp_world_size=1,
+    cp_rank=0,
 ):
     """
     If k and v are not None, k_cache and v_cache will be updated *inplace* with the new values from
@@ -769,6 +791,8 @@ def flash_attn_with_kvcache(
         pack_gqa=pack_gqa,
         sm_margin=sm_margin,
         s_aux=s_aux,
+        cp_world_size=cp_world_size,
+        cp_rank=cp_rank,
     )
     # return (out, softmax_lse) if return_softmax_lse else out
     return (out, softmax_lse, *rest) if return_softmax_lse else out
 
@@ -347,7 +347,8 @@ class FlashAttnFwdSm90 {
                     get<0>(params.mainloop.shape_K_new),
                     params.mainloop.cu_seqlens_q, params.mainloop.cu_seqlens_k, params.mainloop.cu_seqlens_k_new,
                     params.mainloop.seqused_q, params.mainloop.seqused_k, params.mainloop.leftpad_k,
-                    params.mainloop.seqlens_rotary
+                    params.mainloop.seqlens_rotary,
+                    params.mainloop.cp_world_size
                 };
                 if constexpr (AppendKV) {
                     bool tile_new_valid = mainloop.load_kv_new(
@@ -396,7 +397,7 @@ class FlashAttnFwdSm90 {
                     get<0>(params.mainloop.shape_K_new),
                     params.mainloop.cu_seqlens_q, params.mainloop.cu_seqlens_k, params.mainloop.cu_seqlens_k_new,
                     params.mainloop.seqused_q, params.mainloop.seqused_k, params.mainloop.leftpad_k,
-                    params.mainloop.seqlens_rotary
+                    params.mainloop.seqlens_rotary, params.mainloop.cp_world_size
                 };
                 if constexpr (AppendKV) {
                     bool tile_new_valid = mainloop.store_kv_new(
 
@@ -89,6 +89,7 @@ void run_flash_fwd(Flash_fwd_params &params, cudaStream_t stream) {
         cute::conditional_return<!V_colmajor>(
             make_stride(params.v_row_stride, _1{}, params.v_head_stride, !is_varlen_k ? params.v_batch_stride : 0),
             make_stride(_1{}, params.v_dim_stride, params.v_head_stride, !is_varlen_k ? params.v_batch_stride : 0));
+
     typename CollectiveMainloop::Arguments mainloop_args {
         static_cast<Element const*>(params.q_ptr),
         {seqlen_q, params.d, params.h, batch_q},  // shape_Q
@@ -129,7 +130,8 @@ void run_flash_fwd(Flash_fwd_params &params, cudaStream_t stream) {
         params.cu_seqlens_q, params.cu_seqlens_k, params.cu_seqlens_knew,
         params.seqused_q, params.seqused_k,
         params.leftpad_k, params.seqlens_rotary,
-        static_cast<ElementS const*>(params.s_aux_ptr)
+        static_cast<ElementS const*>(params.s_aux_ptr),
+        params.cp_world_size, params.cp_rank,
     };
     typename CollectiveEpilogue::Arguments epilogue_args {
         static_cast<ElementOut*>(params.o_ptr),
@@ -156,6 +158,8 @@ void run_flash_fwd(Flash_fwd_params &params, cudaStream_t stream) {
         params.tile_count_semaphore, params.cu_seqlens_q, params.seqused_q,
         // params.num_m_blocks_ptr,
         params.num_splits_dynamic_ptr,
+        params.cp_world_size,
+        params.cp_rank,
     };
 
     if (Varlen && params.num_splits_dynamic_ptr && !params.skip_scheduler_metadata_computation) {
 
@@ -215,6 +215,8 @@ struct CollectiveMainloopFwdSm80 {
         int const* const leftpad_k = nullptr;
         int const* const seqlens_rotary = nullptr;
         ElementSAux const* const ptr_S_aux = nullptr;
+        int cp_world_size = 1;  // context parallelism (CP) world size; same default as the SM90 mainloop
+        int cp_rank = 0;        // CP rank; same default as the SM90 mainloop
     };
 
     // Device side kernel params
 
@@ -412,6 +412,9 @@ struct CollectiveMainloopFwdSm90 {
         int const* const leftpad_k = nullptr;
         int const* const seqlens_rotary = nullptr;
         ElementSAux const* const ptr_S_aux = nullptr;
+        // Context parallelism (CP) parameters
+        int const cp_world_size = 1;
+        int const cp_rank = 0;
     };
 
     // Device side kernel params
@@ -469,6 +472,8 @@ struct CollectiveMainloopFwdSm90 {
         int const* const leftpad_k = nullptr;
         int const* const seqlens_rotary = nullptr;
         ElementSAux const* const ptr_S_aux = nullptr;
+        int cp_world_size = 1;
+        int cp_rank = 0;
     };
 
     static Params
@@ -540,7 +545,7 @@ struct CollectiveMainloopFwdSm90 {
                 return nullptr;
             }
         }();
-        
+
         auto const shape_Qv_packed = cute::conditional_return<!PackGQA>(
             shape_Qv,
             make_shape(make_shape(qhead_per_khead, get<0>(shape_Qv)), get<1>(shape_Qv), get<2>(args.shape_K), get<3>(shape_Qv))
@@ -584,7 +589,8 @@ struct CollectiveMainloopFwdSm90 {
                 args.kv_batch_idx,
                 args.cu_seqlens_q, args.cu_seqlens_k, args.cu_seqlens_k_new,
                 args.seqused_q, args.seqused_k, args.leftpad_k, args.seqlens_rotary,
-                args.ptr_S_aux};
+                args.ptr_S_aux,
+                args.cp_world_size, args.cp_rank};
     }
 
     /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
@@ -999,6 +1005,7 @@ struct CollectiveMainloopFwdSm90 {
         static constexpr int kBlockN = get<1>(TileShape_MNK{});
 
         // can't use auto [m_block, ...] = block_coord since structured binding cannot be captured in lambda
+        // block index
         int const m_block = get<0>(block_coord);
         int const bidh = get<1>(block_coord);
         int const bidb = get<2>(block_coord);
@@ -1093,7 +1100,8 @@ struct CollectiveMainloopFwdSm90 {
         // But we subtract n_offset for consistency in mask calculations
         flash::Mask<kBlockM, kBlockN, PackGQA, TiledMmaQK> mask(
             thread_idx, seqlen_q, seqlen_k, params.window_size_left, params.window_size_right, 0 - n_offset /*sink_token_length*/,
-            params.qhead_per_khead_divmod
+            params.qhead_per_khead_divmod,
+            params.cp_world_size, params.cp_rank
         );
 
         float softcap_val = params.softcap_val;
@@ -1201,6 +1209,7 @@ struct CollectiveMainloopFwdSm90 {
         }
 
         if constexpr (IntraWGOverlap) {
+
             Tensor tSrS = partition_fragment_C(tiled_mma_qk, select<0, 1>(TileShape_MNK{}));
             consumer_wait(pipeline_k, smem_pipe_read);
             flash::gemm</*zero_init=*/true, /*wg_wait=*/-1>(tiled_mma_qk, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS);
@@ -1272,7 +1281,8 @@ struct CollectiveMainloopFwdSm90 {
             };
 
             if constexpr (Is_causal || Is_local) { // Separate iterations with causal or local masking
-                auto mask_fn = [&](auto& tSrS, int n_block) { mask.template apply<false /*Seqlenk_mask*/, Is_causal, Is_local>(tSrS, m_block, n_block); };
+                auto mask_fn = [&](auto& tSrS, int n_block) {
+                  mask.template apply<false /*Seqlenk_mask*/, Is_causal, Is_local>(tSrS, m_block, n_block); };
                 int const m_idx_min = !PackGQA ? m_block * kBlockM : params.qhead_per_khead_divmod.divide(m_block * kBlockM);
                 // If local, blocking (window_size_right + window_size_left)
                 int const n_block_min_causal_local_mask =
@@ -1288,7 +1298,7 @@ struct CollectiveMainloopFwdSm90 {
             int const n_block_min_before_local_mask = !Is_local
                 ? n_block_min
                 : std::max(n_block_min,
-                           cute::ceil_div(m_idx_max + seqlen_k - seqlen_q - params.window_size_left, kBlockN));
+                           cute::ceil_div(m_idx_max + params.cp_world_size * seqlen_k - seqlen_q - params.window_size_left, kBlockN));
             auto no_mask_fn = [](auto& tSrS, int n_block) { };
             #pragma unroll 1
             for (; n_block >= n_block_min_before_local_mask; --n_block) {
@@ -1414,7 +1424,7 @@ struct CollectiveMainloopFwdSm90 {
             // Tensor scores_scale = softmax.finalize(v_descale);
             Tensor scores_scale = make_tensor_like(softmax.row_max);
             finalize_dispatch(scores_scale, v_descale);
-            
+
             if constexpr (LargeHeadDimV) {
                 cutlass::arch::NamedBarrier::sync(NumMmaThreads, static_cast<uint32_t>(FwdNamedBarriers::PEmpty) /*id*/);
                 store_scales(scores_scale, smem_pipe_read.index());
Original file line number	Diff line number	Diff line change
`@@ -39,7 +39,7 @@ struct BlockMN {`
`39`	`39`	`if (PackGQA) { m_idx_max = qhead_per_khead_divmod.divide(m_idx_max - 1) + 1 ; }`
`40`	`40`	`// If local, blocking (m_idx_max - m_idx_min + window_size_right + window_size_left)`
`41`	`41`	`n_block_max = std::min(n_block_max,`
`42`		`- cute::ceil_div(m_idx_max + seqlen_k - seqlen_q + window_size_right, kBlockN));`
	`42`	`+ cute::ceil_div(m_idx_max + seqlen_info.cp_world_size * seqlen_k - seqlen_q + window_size_right, kBlockN));`
`43`	`43`	`}`
`44`	`44`	`// Now, only adjust n_block_min if split`
`45`	`45`	`int n_block_min = 0;`