
Commit 2733401

varlen combine scheduler (#70)

* varlen combine scheduler
* cleanup
* move check
* standard scheduling algo
* better heuristic
* better comments
* cleanup
* cleanup
* put in a more readable heuristic
* Apply suggestions from code review
* FA2 8.0 PTX (#69)

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>

1 parent 92949c3 commit 2733401

File tree

3 files changed: +259 −15 lines changed

hopper/flash_fwd_combine_kernel.h
hopper/flash_fwd_combine_launch_template.h
hopper/utils.h


hopper/flash_fwd_combine_kernel.h
231 additions, 11 deletions
@@ -122,16 +122,24 @@ class FlashAttnFwdCombine {
     using ShapeLSE = cute::Shape<int32_t, int32_t, int32_t>;  // (seqlen, head, batch)
     using StrideLSE = cute::Stride<_1, int64_t, int64_t>;  // (seqlen, head, batch)
 
+    struct BlockCoord {
+        int block_m;
+        int block_k;
+        int bidb;
+    };
+
     struct SharedStorage : cute::aligned_struct<128> {
         cute::array_aligned<float, cute::cosize_v<SmemLayoutLSE>> smem_lse_partial;
         cute::array_aligned<int, kBlockM> smem_max_valid_split;
         cute::array_aligned<ElementPartial, cute::cosize_v<SmemLayoutO>> smem_o_partial;
+        BlockCoord block_coord;
     };
 
     static constexpr int SharedStorageSize = sizeof(SharedStorage);
 
     // Device side arguments
     struct Arguments {
+        int b;
         ElementPartial const* const ptr_O_partial;
         ShapeOPartial const shape_O_partial;
         StrideOPartial const stride_O_partial;
@@ -149,7 +157,8 @@ class FlashAttnFwdCombine {
     };
 
     // Kernel entry point API
-    struct Params {
+    struct CollectiveParams {
+        int b;
         ElementPartial const* const ptr_O_partial;
         ShapeOPartial const shape_O_partial;
         StrideOPartial const stride_O_partial;
@@ -169,10 +178,11 @@ class FlashAttnFwdCombine {
 
     // Convert to underlying arguments. In this case, a simple copy for the aliased type.
     static
-    Params
+    CollectiveParams
    to_underlying_arguments(Arguments const& args) {
         assert(get<1>(args.shape_LSE_partial) <= kMaxSplits);
         return {
+            args.b,
             args.ptr_O_partial,
             args.shape_O_partial,
             args.stride_O_partial,
@@ -191,33 +201,243 @@ class FlashAttnFwdCombine {
         };
     }
 
+    struct SchedulerArguments {
+        int b;
+        int seqlen_q;
+        int total_q;
+        int num_heads;
+        int dv;
+        int const* cu_seqlens_q;
+        int const* seqused_q;
+    };
+
+    struct StaticTileScheduler {
+        struct Params {};
+        static Params to_underlying_arguments(SchedulerArguments const& args) { return {}; }
+
+        SharedStorage& shared_storage;
+        CUTE_DEVICE StaticTileScheduler(SharedStorage& shared_storage): shared_storage(shared_storage) {}
+
+        static dim3 get_grid_shape(SchedulerArguments const& args) {
+            unsigned int num_blocks_k = cute::ceil_div(args.dv, kBlockK);
+            unsigned int num_blocks_m = cute::ceil_div(args.seqlen_q * args.num_heads, kBlockM);
+            return {num_blocks_m, num_blocks_k, static_cast<unsigned int>(args.b)};
+        }
+
+        CUTE_DEVICE BlockCoord get_block_coord(Params const& params) {
+            int block_m = blockIdx.x;
+            int block_k = blockIdx.y;
+            int bidb = blockIdx.z;
+            return {block_m, block_k, bidb};
+        }
+    };
+
+    struct StaticVarlenTileScheduler {
+        //
+        // For varlen we have two scheduling algos:
+        //  1) STANDARD: same as StaticTileScheduler.
+        //  2) LINEARIZE_M_AND_BATCH: flattens the tiled M dimension and the
+        //     batch dimension into a single linear tile index, so the grid is
+        //     a 2D grid of (tile_id, k_block). get_block_coord then maps the
+        //     linear tile id back to (m_block, bidb). This mapping is
+        //     non-trivial since each batch element can have a different
+        //     number of m_blocks, so there is overhead when computing the
+        //     block coordinates, but it is more efficient when prefills and
+        //     decodes are mixed, since in that case the STANDARD algo leaves
+        //     a lot of empty (no work) blocks in the grid.
+        //
+
+        enum SchedulingAlgo {
+            STANDARD,              // Same as StaticTileScheduler
+            LINEARIZE_M_AND_BATCH, // Linearize the M and batch dimensions into a single tile index
+        };
+
+        struct Params {
+            int b;
+            int num_heads;
+            int const* const cu_seqlens_q;
+            int const* const seqused_q;
+            SchedulingAlgo algo;
+        };
+
+        SharedStorage& shared_storage;
+        CUTE_DEVICE StaticVarlenTileScheduler(SharedStorage& shared_storage): shared_storage(shared_storage) {}
+
+        static SchedulingAlgo choose_scheduling_algo(SchedulerArguments const& args) {
+            // Choose the scheduling algorithm based on how dense the grid of
+            // tiles that do actual work is. If the grid is more than 50%
+            // sparse, we linearize the M and batch dimensions; if it is more
+            // than 50% dense, we use the standard scheduling algorithm, since
+            // it is more efficient at calculating the block coordinates.
+            // NOTE: in the varlen case args.seqlen_q is the max seqlen_q
+            // across all batches; use a lower bound to estimate when the
+            // density is above 50%.
+            int lower_bound_on_non_empty_tiles = cute::ceil_div(args.total_q, kBlockM);
+            int grid_size = args.b * cute::ceil_div(args.seqlen_q, kBlockM);
+            return 2 * lower_bound_on_non_empty_tiles >= grid_size ?
+                SchedulingAlgo::STANDARD :
+                SchedulingAlgo::LINEARIZE_M_AND_BATCH;
+        }
+
+        static Params to_underlying_arguments(SchedulerArguments const& args) {
+            return {
+                args.b,
+                args.num_heads,
+                args.cu_seqlens_q,
+                args.seqused_q,
+                choose_scheduling_algo(args)
+            };
+        }
+
+        static dim3 get_grid_shape(SchedulerArguments const& args) {
+            unsigned int num_blocks_k = cute::ceil_div(args.dv, kBlockK);
+
+            switch (choose_scheduling_algo(args)) {
+            case SchedulingAlgo::STANDARD: {
+                unsigned int num_blocks_m = cute::ceil_div(args.seqlen_q * args.num_heads, kBlockM);
+                return {num_blocks_m, num_blocks_k, static_cast<unsigned int>(args.b)};
+            }
+            case SchedulingAlgo::LINEARIZE_M_AND_BATCH: {
+                // rough worst-case upper bound on the number of blocks required
+                // (assuming each batch has an additional partial block)
+                unsigned int num_blocks_m = cute::ceil_div(args.total_q * args.num_heads, kBlockM) + args.b;
+                return {num_blocks_m, num_blocks_k, 1};
+            }}
+
+            // not reachable for a valid algo; kept to satisfy compilers that
+            // warn about a missing return (same bound as LINEARIZE_M_AND_BATCH)
+            unsigned int num_blocks_m = cute::ceil_div(args.total_q * args.num_heads, kBlockM) + args.b;
+            return {num_blocks_m, num_blocks_k, 1};
+        }
+
+        CUTE_DEVICE BlockCoord get_block_coord_linearized_m_and_batch(Params const& params) {
+            int num_heads = params.num_heads;
+            int curr_tile_id = blockIdx.x;
+
+            // Scan through the batches to find the batch that contains the
+            // current tile_id. Compute using only the first warp of the block.
+            if (threadIdx.x < 32) {
+                // We compute linearized tile index starts and ends for each
+                // batch in groups of 32 in parallel
+                int group_start_bidb = -(cutlass::NumThreadsPerWarp);
+                int group_end_bidb = 0;
+                int group_end_tile_id = 0;
+                int group_start_tile_id = 0;
+                int group_total_num_tiles = 0;
+
+                int local_num_m_blocks = 0;
+                int local_num_m_blocks_cumulative = 0;
+
+                do {
+                    group_start_bidb += cutlass::NumThreadsPerWarp;
+                    group_end_bidb += cutlass::NumThreadsPerWarp;
+
+                    auto get_num_m_blocks = [&](int bidb) {
+                        if (bidb >= params.b) return 0;
+                        flash::SeqlenInfo<Varlen, kBlockM> seqlen_info{bidb, 0, params.cu_seqlens_q, params.seqused_q};
+                        return cute::ceil_div(seqlen_info.seqlen * num_heads, Int<kBlockM>{}());
+                    };
+
+                    // Cumulative number of blocks for the next 31 batches
+                    local_num_m_blocks = get_num_m_blocks(group_start_bidb + threadIdx.x);
+                    local_num_m_blocks_cumulative = warp_prefix_sum(local_num_m_blocks);
+                    // Total number of blocks for the next 32 batches
+                    group_total_num_tiles = warp_shfl_get_last(local_num_m_blocks_cumulative);
+
+                    group_start_tile_id = group_end_tile_id;
+                    group_end_tile_id += group_total_num_tiles;
+                } while (curr_tile_id >= group_end_tile_id && group_end_bidb < params.b);
+
+                int local_batch_end_tile_id = group_start_tile_id + local_num_m_blocks_cumulative;
+                // Find the last batch idx in the group where
+                // `local_batch_end_tile_id <= curr_tile_id`; the values below
+                // are now common to all threads in the warp
+                int batch_idx_in_group = warp_last_true_laneid(local_batch_end_tile_id <= curr_tile_id);
+                int batch_num_m_blocks = warp_shfl_get(local_num_m_blocks, batch_idx_in_group);
+                int batch_m_start_tile_id = group_start_tile_id + (batch_idx_in_group > 0 ?
+                    warp_shfl_get(local_num_m_blocks_cumulative, batch_idx_in_group - 1) : 0);
+
+                int bidb = group_start_bidb + batch_idx_in_group;
+                int block_m = curr_tile_id - batch_m_start_tile_id;
+                // NOTE(lucas): not sure why this causes a block_k unused
+                // warning; just inlined `blockIdx.y` to suppress the warning
+                //   int block_k = blockIdx.y;
+                //   shared_storage.block_coord = {block_m, block_k, bidb};
+                BlockCoord block_coord{block_m, static_cast<int>(blockIdx.y), bidb};
+                if (threadIdx.x == 0) { shared_storage.block_coord = block_coord; }
+            }
+
+            __syncthreads();
+            return shared_storage.block_coord;
+        }
+
+        CUTE_DEVICE BlockCoord get_block_coord_standard(Params const& params) {
+            int block_m = blockIdx.x;
+            int block_k = blockIdx.y;
+            int bidb = blockIdx.z;
+            return {block_m, block_k, bidb};
+        }
+
+        CUTE_DEVICE BlockCoord get_block_coord(Params const& params) {
+            switch (params.algo) {
+            case SchedulingAlgo::STANDARD:
+                return get_block_coord_standard(params);
+            case SchedulingAlgo::LINEARIZE_M_AND_BATCH:
+                return get_block_coord_linearized_m_and_batch(params);
+            }
+            return {0, 0, 0}; // should never reach here
+        }
+    };
+
+    using TileScheduler = std::conditional_t<
+        Varlen,
+        StaticVarlenTileScheduler,
+        StaticTileScheduler
+    >;
+
+    using SchedulerParams = typename TileScheduler::Params;
+
+    struct Params {
+        CollectiveParams params;
+        SchedulerParams scheduler_params;
+    };
+
     CUTLASS_DEVICE
     void
-    operator()(Params const& params, char* smem_buf) {
+    operator()(Params const& kernel_params, char* smem_buf) {
+        CollectiveParams const& params = kernel_params.params;
 
         SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+        TileScheduler tile_scheduler{shared_storage};
+
         Tensor sLSE = make_tensor(make_smem_ptr(shared_storage.smem_lse_partial.data()), SmemLayoutLSE{});
         Tensor sMaxValidSplit = make_tensor(make_smem_ptr(shared_storage.smem_max_valid_split.data()), Shape<Int<kBlockM>>{});
         Tensor sO = make_tensor(make_smem_ptr(shared_storage.smem_o_partial.data()), SmemLayoutO{});
 
         int const thread_idx = threadIdx.x;
-        int const m_block = blockIdx.x;
-        int const k_block = blockIdx.y;
-        int const batch = blockIdx.z;
-        int const num_splits = params.num_splits_dynamic_ptr ? params.num_splits_dynamic_ptr[batch] : get<1>(params.shape_LSE_partial);
+
+        BlockCoord block_coord = tile_scheduler.get_block_coord(kernel_params.scheduler_params);
+
+        int const m_block = block_coord.block_m;
+        int const k_block = block_coord.block_k;
+        int const batch = block_coord.bidb;
 
         if (params.semaphore_to_reset && threadIdx.x == 0 && blockIdx.x == gridDim.x - 1 && blockIdx.y == gridDim.y - 1 && blockIdx.z == gridDim.z - 1) {
             cutlass::arch::wait_on_dependent_grids();
             *params.semaphore_to_reset = 0;
         }
-        if (num_splits <= 1) { return; }
+
         flash::SeqlenInfo<Varlen, kBlockM> seqlen_info{batch, size<0>(params.shape_LSE_partial), params.cu_seqlens, params.seqused};
         int const offset = seqlen_info.offset;
         int const seqlen = seqlen_info.seqlen;
         int max_idx = seqlen * get<2>(params.shape_LSE_partial);
-        if constexpr (Varlen) {
-            if (m_block * kBlockM >= max_idx) { return; }
-        }
+
+        bool block_coord_valid =
+            block_coord.block_m < cute::ceil_div(max_idx, Int<kBlockM>{}) &&
+            block_coord.bidb < params.b;
+        if (!block_coord_valid) { return; }
+
+        int const num_splits = params.num_splits_dynamic_ptr ? params.num_splits_dynamic_ptr[batch] : get<1>(params.shape_LSE_partial);
+        if (num_splits <= 1) { return; }
 
         cutlass::FastDivmod seqlen_divmod_dynamic(seqlen);

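The core of this change is the mapping from a linear tile id back to (m_block, bidb) in get_block_coord_linearized_m_and_batch. As a sanity check, here is a minimal host-side sketch of the same mapping (hypothetical code, not part of the commit): it scans batches sequentially where the kernel uses a warp-wide prefix-sum search over 32 batches at a time, but it lands on the same coordinates for a given tile id. The kBlockM value, file name, and helper names here are illustrative assumptions.

// tile_mapping_sketch.cpp -- hypothetical sequential reference for the
// LINEARIZE_M_AND_BATCH mapping from linear tile id to (m_block, bidb).
#include <cstdio>
#include <utility>
#include <vector>

constexpr int kBlockM = 8;  // assumed; the kernel's kBlockM is a template parameter

int ceil_div(int a, int b) { return (a + b - 1) / b; }

// Walk batches in order, accumulating each batch's m_block count, until the
// batch containing tile_id is found.
std::pair<int, int> tile_to_block_coord(int tile_id, std::vector<int> const& seqlens, int num_heads) {
    int batch_m_start_tile_id = 0;
    for (int bidb = 0; bidb < (int)seqlens.size(); ++bidb) {
        int num_m_blocks = ceil_div(seqlens[bidb] * num_heads, kBlockM);
        if (tile_id < batch_m_start_tile_id + num_m_blocks) {
            return {tile_id - batch_m_start_tile_id, bidb};  // (m_block, bidb)
        }
        batch_m_start_tile_id += num_m_blocks;
    }
    return {-1, -1};  // tile id beyond the last real tile (padding block, no work)
}

int main() {
    // Mixed prefill + decode: one 100-token sequence, three 1-token sequences.
    std::vector<int> seqlens = {100, 1, 1, 1};
    int num_heads = 2;  // 100*2/8 -> 25 m_blocks for batch 0, 1 each for batches 1-3
    for (int tile_id : {0, 24, 25, 26, 27, 28}) {
        auto [m_block, bidb] = tile_to_block_coord(tile_id, seqlens, num_heads);
        std::printf("tile %2d -> m_block %2d, bidb %d\n", tile_id, m_block, bidb);
    }
    return 0;
}

Tiles 0 through 24 land in batch 0, tiles 25 through 27 are the three decode batches, and tile 28 maps to (-1, -1), which corresponds to the kernel's early-out for padding blocks from the worst-case grid bound.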
hopper/flash_fwd_combine_launch_template.h
12 additions, 4 deletions
@@ -25,6 +25,7 @@ void run_flash_fwd_combine(Flash_fwd_params &params, cudaStream_t stream, bool e
         IsEvenK, Varlen, Element, ElementPartial, ArchTag>;
 
     typename CombineKernel::Arguments args {
+        params.b,
         static_cast<ElementPartial const*>(params.oaccum_ptr),
         {!Varlen ? params.seqlen_q : params.total_q, params.dv, params.num_splits, params.h, !Varlen ? params.b : 1},  // shape_O_partial
         {params.oaccum_row_stride, _1{}, params.oaccum_split_stride, params.oaccum_head_stride, !Varlen ? params.oaccum_batch_stride : 0},  // stride_O_partial
@@ -38,10 +39,17 @@ void run_flash_fwd_combine(Flash_fwd_params &params, cudaStream_t stream, bool e
         params.cu_seqlens_q, params.seqused_q, params.num_splits_dynamic_ptr, params.tile_count_semaphore
     };
 
-    typename CombineKernel::Params kernel_params = CombineKernel::to_underlying_arguments(args);
-    int num_blocks_k = cute::ceil_div(params.dv, kBlockK);
-    int num_blocks_m = cute::ceil_div(params.seqlen_q * params.h, kBlockM);
-    dim3 grid_m(num_blocks_m, num_blocks_k, params.b);
+    typename CombineKernel::SchedulerArguments scheduler_args {
+        params.b, params.seqlen_q, params.total_q, params.h, params.dv,
+        params.cu_seqlens_q, params.seqused_q
+    };
+
+    typename CombineKernel::Params kernel_params = {
+        CombineKernel::to_underlying_arguments(args),
+        CombineKernel::TileScheduler::to_underlying_arguments(scheduler_args)
+    };
+
+    dim3 grid_m = CombineKernel::TileScheduler::get_grid_shape(scheduler_args);
     auto kernel = cutlass::device_kernel<CombineKernel>;
     int smem_size = CombineKernel::SharedStorageSize;
     if (smem_size >= 48 * 1024) {

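The launch path now defers grid sizing to the scheduler, so whether the combine kernel runs with a dense 3D grid or a linearized 2D grid is decided by choose_scheduling_algo at launch time. The snippet below is a self-contained restatement of that heuristic with two worked inputs (hypothetical code; kBlockM = 8 is an assumed value, in the real kernel it comes from the tile configuration):

// scheduling_heuristic_sketch.cpp -- hypothetical restatement of
// StaticVarlenTileScheduler::choose_scheduling_algo.
#include <cstdio>

constexpr int kBlockM = 8;  // assumed; set by the kernel's tile configuration in practice

int ceil_div(int a, int b) { return (a + b - 1) / b; }

// STANDARD when at least half of the b x ceil(max_seqlen_q / kBlockM) grid is
// guaranteed non-empty, LINEARIZE_M_AND_BATCH otherwise.
bool use_standard(int b, int max_seqlen_q, int total_q) {
    int lower_bound_on_non_empty_tiles = ceil_div(total_q, kBlockM);
    int grid_size = b * ceil_div(max_seqlen_q, kBlockM);
    return 2 * lower_bound_on_non_empty_tiles >= grid_size;
}

int main() {
    // Uniform batch: 4 sequences of 64 tokens. The grid is fully dense.
    std::printf("uniform: %s\n", use_standard(4, 64, 256) ? "STANDARD" : "LINEARIZE");
    // Mixed: one 4096-token prefill plus 63 single-token decodes. The
    // 64 x 512-tile grid is ~98% empty, so linearizing wins.
    std::printf("mixed:   %s\n", use_standard(64, 4096, 4096 + 63) ? "STANDARD" : "LINEARIZE");
    return 0;
}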
hopper/utils.h
16 additions, 0 deletions
@@ -646,6 +646,22 @@ CUTE_DEVICE T warp_prefix_sum(T val) {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+template<typename T>
+CUTE_DEVICE T warp_shfl_get(T val, int src_lane) {
+    return __shfl_sync(0xffffffff, val, src_lane);
+}
+
+template<typename T>
+CUTE_DEVICE T warp_shfl_get_last(T val) {
+    return __shfl_sync(0xffffffff, val, cutlass::NumThreadsPerWarp - 1);
+}
+
+// Returns the number of lanes for which `cond` is true; for a predicate that
+// is true on a prefix of lanes, this is (last true laneid + 1).
+CUTE_DEVICE int warp_last_true_laneid(bool cond) {
+    return __popc(__ballot_sync(0xffffffff, cond));
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 template<class T>
 CUTE_DEVICE T warp_uniform(T a) {
     return __shfl_sync(0xffffffff, a, 0);

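A note on the new warp helpers: warp_last_true_laneid returns __popc of the ballot, i.e. the number of lanes whose predicate is true. The scheduler's predicate (batch end tile id <= current tile id) is true on a prefix of lanes because the per-lane end tile ids are a nondecreasing prefix sum, so the count equals the index of the first lane whose batch extends past the current tile, which is exactly the batch being searched for. Below is a host-side emulation for intuition only (hypothetical code; on device this is a single __ballot_sync plus __popc):

// warp_helpers_sketch.cpp -- hypothetical host-side emulation of the helper
// added to hopper/utils.h.
#include <cstdint>
#include <cstdio>

constexpr int kWarpSize = 32;

// Emulates __popc(__ballot_sync(0xffffffff, cond)): the number of lanes whose
// predicate is true. For a predicate that is true on lanes 0..k-1 and false
// from lane k onward, this returns k, the index of the first false lane.
int emulated_warp_last_true_laneid(bool const cond[kWarpSize]) {
    uint32_t ballot = 0;
    for (int lane = 0; lane < kWarpSize; ++lane) {
        ballot |= uint32_t(cond[lane]) << lane;
    }
    return __builtin_popcount(ballot);  // GCC/Clang host builtin standing in for __popc
}

int main() {
    // Lane i holds the cumulative tile-id end of batch i; suppose batches
    // 0..4 end at or before the current tile and batch 5 extends past it.
    bool cond[kWarpSize] = {};
    for (int lane = 0; lane < 5; ++lane) cond[lane] = true;
    std::printf("batch_idx_in_group = %d\n", emulated_warp_last_true_laneid(cond));  // prints 5
    return 0;
}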