[CK TILE] GEMM and Batched GEMM SplitK support #1724

Open · wants to merge 10 commits into base: develop
6 changes: 3 additions & 3 deletions example/ck_tile/03_gemm/gemm_basic.hpp
@@ -54,8 +54,7 @@ using CDataType = Types::CDataType;
auto create_args(int argc, char* argv[])
{
ck_tile::ArgParser arg_parser;
arg_parser.insert("b", "1", "batch size")
.insert("m", "3840", "m dimension")
arg_parser.insert("m", "3840", "m dimension")
Contributor:
Do we not support batch (b) in this example?

Contributor Author:
This is the simple GEMM example, not the batched one.

.insert("n", "4096", "n dimension")
.insert("k", "2048", "k dimension")
.insert("a_layout", "R", "A tensor data layout - Row by default")
@@ -68,7 +67,8 @@ auto create_args(int argc, char* argv[])
.insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
.insert("warmup", "50", "number of iterations before benchmark the kernel")
.insert("repeat", "100", "number of iterations to benchmark the kernel")
.insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer");
.insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
.insert("split_k", "1", "splitK value");

bool result = arg_parser.parse(argc, argv);
return std::make_tuple(result, arg_parser);
8 changes: 4 additions & 4 deletions example/ck_tile/03_gemm/run_gemm_example.inc
@@ -64,9 +64,9 @@ int run_gemm_example_with_layouts(int argc,
ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
ck_tile::index_t stride_C = arg_parser.get_int("stride_c");

ck_tile::index_t batch_size = arg_parser.get_int("b");
int n_warmup = arg_parser.get_int("warmup");
int n_repeat = arg_parser.get_int("repeat");
ck_tile::index_t kbatch = arg_parser.get_int("split_k");
int n_warmup = arg_parser.get_int("warmup");
int n_repeat = arg_parser.get_int("repeat");

using namespace ck_tile::literals;

@@ -133,7 +133,7 @@ int run_gemm_example_with_layouts(int argc,
stride_A,
stride_B,
stride_C,
batch_size,
kbatch,
n_warmup,
n_repeat);

20 changes: 7 additions & 13 deletions example/ck_tile/03_gemm/universal_gemm.cpp
@@ -22,7 +22,7 @@
#endif

template <typename ALayout, typename BLayout, typename CLayout>
float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
{
#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
// Memory friendly for Interwave scheduler
@@ -78,7 +78,9 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
#endif
ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>>;

const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(args.K);
const ck_tile::index_t k_grain = args.k_batch * K_Tile;
Contributor:
Is it true that if we set split_k=1 from the cmd arg, the kernel will run only K_Tile for each kernel's unroll? And if we want to disable split-k from the cmd args, is that done through split_k=0, or is it not considered?

Contributor Author:
I think this is analogous to just rounding up the K dimension in the case of split_k=1.
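
To make that concrete, here is a small standalone sketch (plain C++, not CK code; the K and K_Tile values are illustrative) of what the k_grain / K_split lines below compute:

```cpp
#include <iostream>

// Standalone sketch of the K_split computation below (not CK code).
// K_Tile = 32 is just an illustrative tile size.
int main()
{
    const int K      = 2048;
    const int K_Tile = 32;

    for(int k_batch : {1, 4})
    {
        const int k_grain = k_batch * K_Tile;
        // Per-split K length, rounded up to a multiple of K_Tile.
        const int K_split = (K + k_grain - 1) / k_grain * K_Tile;
        std::cout << "k_batch=" << k_batch << " -> K_split=" << K_split << '\n';
        // k_batch=1 -> K_split=2048 (just K rounded up to K_Tile)
        // k_batch=4 -> K_split=512  (each split covers a quarter of K)
    }
}
```

With the default split_k=1, K_split is simply K rounded up to a multiple of the K tile (no splitting); larger values shrink the K range each split covers.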

const ck_tile::index_t K_split = (args.K + k_grain - 1) / k_grain * K_Tile;
const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split);
const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop);
const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);

@@ -106,17 +108,9 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
has_hot_loop_v,
tail_number_v>>;
using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
auto kargs = Kernel::MakeKargs(args.p_a,
args.p_b,
args.p_c,
args.M,
args.N,
args.K,
args.stride_A,
args.stride_B,
args.stride_C);

const dim3 grids = Kernel::GridSize(args.M, args.N, args.kbatch);
auto kargs = Kernel::MakeKernelArgs(args);

const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch);
constexpr dim3 blocks = Kernel::BlockSize();

if(!Kernel::IsSupportedArgument(kargs))
13 changes: 9 additions & 4 deletions example/ck_tile/16_batched_gemm/batched_gemm.cpp
@@ -70,20 +70,25 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre

using CodegenGemmTraits =
ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;

using CodegenPipelineProblem = ck_tile::
GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;

using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
using CodegenGemmPolicy = ck_tile::UniversalGemmPipelineAgBgCrPolicy;
using CodegenGemmPipeline =
ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem, CodegenGemmPolicy>;
// ToDo: Will add the codegen part to test different pipeline policies in GEMM.
// Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
using Kernel = ck_tile::BatchedGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;

auto kargs = Kernel::MakeKernelArgs(args);

const dim3 grids = Kernel::GridSize(args.M, args.N, args.batch_count);
const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count);
constexpr dim3 blocks = Kernel::BlockSize();

if(!Kernel::IsSupportedArgument(kargs))
{
throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
}

if(s.log_level_ > 0)
{
std::cout << "Launching kernel with args:"
3 changes: 2 additions & 1 deletion example/ck_tile/16_batched_gemm/batched_gemm.hpp
@@ -49,7 +49,8 @@ auto create_args(int argc, char* argv[])
.insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
.insert("warmup", "50", "number of iterations before benchmark the kernel")
.insert("repeat", "100", "number of iterations to benchmark the kernel")
.insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer");
.insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
.insert("split_k", "1", "splitK value");

bool result = arg_parser.parse(argc, argv);
return std::make_tuple(result, arg_parser);
@@ -17,13 +17,15 @@ float invoke_batched_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
ck_tile::index_t batch_stride_B,
ck_tile::index_t batch_stride_C,
ck_tile::index_t batch_count,
ck_tile::index_t kbatch,
int n_warmup,
int n_repeat)
{
ck_tile::BatchedGemmHostArgs args;
args.a_ptr = a_m_k_dev_buf.GetDeviceBuffer();
args.b_ptr = b_k_n_dev_buf.GetDeviceBuffer();
args.c_ptr = c_m_n_dev_buf.GetDeviceBuffer();
args.k_batch = kbatch;
args.M = M;
args.N = N;
args.K = K;
@@ -79,6 +81,7 @@ int run_batched_gemm_example_with_layouts(int argc,
ck_tile::index_t batch_stride_B = arg_parser.get_int("batch_stride_b");
ck_tile::index_t batch_stride_C = arg_parser.get_int("batch_stride_c");
ck_tile::index_t batch_count = arg_parser.get_int("batch_count");
ck_tile::index_t kbatch = arg_parser.get_int("split_k");

int n_warmup = arg_parser.get_int("warmup");
int n_repeat = arg_parser.get_int("repeat");
@@ -159,6 +162,7 @@ int run_batched_gemm_example_with_layouts(int argc,
batch_stride_B,
batch_stride_C,
batch_count,
kbatch,
n_warmup,
n_repeat);

31 changes: 27 additions & 4 deletions include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

@@ -56,6 +56,13 @@ struct CShuffleEpilogue
// No additional shared memory needed
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return 0; }

CK_TILE_HOST_DEVICE static constexpr bool IsOutputTransposed()
{
// TODO: For now CShuffle doesn't allow vector store after permute.
// It should be fixed, and this function should then return true.
return false;
}

template <typename OAccTile>
CK_TILE_DEVICE void permute_tile_data(OAccTile& o_acc_tile)
{
@@ -111,7 +118,9 @@ struct CShuffleEpilogue
}
}

template <typename ODramWindowTmp, typename OAccTile>
template <typename ODramWindowTmp,
typename OAccTile,
memory_operation_enum out_memory_data_op = memory_operation_enum::set>
CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp, OAccTile& o_acc_tile)
Comment on lines +123 to 124
Collaborator:
What if you added a third function parameter, memory_operation_enum o_mem_data_op = out_memory_data_op? Then you wouldn't have to pass all the template parameters, but could just pass the memory op when you need a different one than the default.

Contributor Author:
I think it is not possible to pass it that way: a runtime function argument is not a compile-time constant, so you cannot compare it in an if constexpr.

Collaborator (@aosewski, Dec 11, 2024):
@bartekxk You're right. What about having a store_tile API which takes the memory operation enum as its last parameter? For set it would do a store, while for atomic_add and others it would do an update.

Contributor Author:
We can do that; for now there are separate store_tile and update_tile, so it is just an API concept on top of them.
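
For readers following this thread, a minimal standalone illustration (hypothetical names, not the actual CK API) of why the memory operation is a template parameter: if constexpr needs a compile-time constant, which a runtime function argument cannot provide.

```cpp
#include <cstdio>

enum class memory_operation_enum { set, atomic_add };

// Stand-ins for store_tile / update_tile (illustrative only).
void store_tile(float* dst, float v)  { *dst  = v; }  // overwrite
void update_tile(float* dst, float v) { *dst += v; }  // accumulate

// The memory op is a template parameter, so `if constexpr` can select the
// branch at compile time and the unused path is discarded.
template <memory_operation_enum mem_op = memory_operation_enum::set>
void epilogue_store(float* dst, float v)
{
    if constexpr(mem_op == memory_operation_enum::set)
        store_tile(dst, v);
    else
        update_tile(dst, v);
}

int main()
{
    float c = 1.0f;
    epilogue_store(&c, 2.0f);                                     // set: c = 2
    epilogue_store<memory_operation_enum::atomic_add>(&c, 3.0f);  // add: c = 5
    std::printf("%g\n", c);  // prints 5
}
```

With the enum as a template parameter the unused branch is pruned at compile time, which is what the epilogue change below relies on.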

{
const auto& current_window_origin = o_dram_window_tmp.get_window_origin();
@@ -158,12 +167,26 @@ struct CShuffleEpilogue
// Store the tile data to the permuted location
if constexpr(kPadM || kPadN)
Collaborator (@aosewski, Dec 11, 2024):
@bartekxk By the way, do we really need this check here? The *_raw version of the tile API just does things using assembly... I'm not sure we really need it here. The plain tile API should work as well, regardless of padding.

Contributor Author:
Are you sure we need it here? It looks like it could improve performance, as for example here: #1752

{
store_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
if constexpr(out_memory_data_op == memory_operation_enum::set)
{
store_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
}
else
{
update_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
}
buffer_store_fence();
}
else
{
store_tile(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
if constexpr(out_memory_data_op == memory_operation_enum::set)
{
store_tile(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
}
else
{
update_tile(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
}
}
}
};
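
Why the epilogue needs the update path at all: with split-K, each of the k_batch work-groups computes only a partial sum over its slice of K, and those partials must be accumulated into C rather than overwritten. A serial reference sketch of that equivalence (plain C++, illustrative sizes, not CK code):

```cpp
#include <cassert>
#include <vector>

// Reference: C[m][n] = sum_k A[m][k] * B[k][n], computed either in one pass
// or as k_batch partial GEMMs that are accumulated ("atomic_add" semantics).
int main()
{
    const int M = 4, N = 3, K = 8, k_batch = 2;
    std::vector<float> A(M * K), B(K * N);
    for(int i = 0; i < M * K; ++i) A[i] = float(i % 5);
    for(int i = 0; i < K * N; ++i) B[i] = float(i % 7);

    // Single-pass reference.
    std::vector<float> C_ref(M * N, 0.f);
    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
            for(int k = 0; k < K; ++k)
                C_ref[m * N + n] += A[m * K + k] * B[k * N + n];

    // Split-K: each slice accumulates into C (update, not overwrite).
    std::vector<float> C(M * N, 0.f);
    const int k_per_split = K / k_batch; // assume K divides evenly here
    for(int split = 0; split < k_batch; ++split)
        for(int m = 0; m < M; ++m)
            for(int n = 0; n < N; ++n)
                for(int k = split * k_per_split; k < (split + 1) * k_per_split; ++k)
                    C[m * N + n] += A[m * K + k] * B[k * N + n];

    assert(C == C_ref); // identical results
}
```

This is why, when k_batch > 1, the kernel routes the epilogue through the update_tile / atomic-add path instead of plain stores.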
26 changes: 22 additions & 4 deletions include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

@@ -35,21 +35,39 @@ struct Default2DEpilogue

CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return 0; }

CK_TILE_HOST_DEVICE static constexpr bool IsOutputTransposed() { return false; }

// TODO: this function assume store out vector size is the same as OAccTile last dimension size
// how do we fix this ?
template <typename ODramWindowTmp, typename OAccTile>
template <typename ODramWindowTmp,
typename OAccTile,
memory_operation_enum out_memory_data_op = memory_operation_enum::set>
CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp, const OAccTile& o_acc_tile)
{

// TODO: this is ugly
if constexpr(UseRawStore && (kPadM || kPadN))
{
store_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
if constexpr(out_memory_data_op == memory_operation_enum::set)
{
store_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
}
else
{
update_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
}
buffer_store_fence();
}
else
{
store_tile(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
if constexpr(out_memory_data_op == memory_operation_enum::set)
{
store_tile(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
}
else
{
update_tile(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tile));
}
}
}
};
32 changes: 25 additions & 7 deletions include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
@@ -67,9 +67,10 @@ struct BatchedGemmKernel : public GemmKernel<TilePartitioner_, GemmPipeline_, Ep

using KernelArgs = BatchedGemmKernelArgs;

__host__ static constexpr auto GridSize(index_t M, index_t N, index_t batch_count)
__host__ static constexpr auto
GridSize(index_t M, index_t N, index_t KBatch, index_t batch_count)
{
return TilePartitioner::GridSize(M, N, batch_count);
return TilePartitioner::GridSize(M, N, KBatch * batch_count);
}

__host__ static constexpr auto BlockSize() { return dim3(Base::KernelBlockSize); }
@@ -85,7 +86,8 @@ struct BatchedGemmKernel : public GemmKernel<TilePartitioner_, GemmPipeline_, Ep
hostArgs.K,
hostArgs.stride_A,
hostArgs.stride_B,
hostArgs.stride_C},
hostArgs.stride_C,
hostArgs.k_batch},
hostArgs.batch_stride_A,
hostArgs.batch_stride_B,
hostArgs.batch_stride_C,
@@ -100,22 +102,38 @@ struct BatchedGemmKernel : public GemmKernel<TilePartitioner_, GemmPipeline_, Ep
CK_TILE_DEVICE void operator()(BatchedGemmKernelArgs kargs) const
{
const auto [i_m, i_n] = TilePartitioner{}();
const auto i_batch = __builtin_amdgcn_readfirstlane(blockIdx.z);
const auto i_batch = __builtin_amdgcn_readfirstlane(blockIdx.z / kargs.KBatch);
const auto i_k = __builtin_amdgcn_readfirstlane(blockIdx.z - i_batch * kargs.KBatch);

const typename Base::SplitKBatchOffset splitk_batch_offset(kargs, i_k);

// options
const auto batch_stride_A = __builtin_amdgcn_readfirstlane(kargs.batch_stride_A);
const auto batch_offset_A = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_A);
const ADataType* a_ptr = static_cast<const ADataType*>(kargs.a_ptr) + batch_offset_A;
const ADataType* a_ptr = static_cast<const ADataType*>(kargs.a_ptr) + batch_offset_A +
splitk_batch_offset.a_k_split_offset;

const auto batch_stride_B = __builtin_amdgcn_readfirstlane(kargs.batch_stride_B);
const auto batch_offset_B = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_B);
const BDataType* b_ptr = static_cast<const BDataType*>(kargs.b_ptr) + batch_offset_B;
const BDataType* b_ptr = static_cast<const BDataType*>(kargs.b_ptr) + batch_offset_B +
splitk_batch_offset.b_k_split_offset;

const auto batch_stride_C = __builtin_amdgcn_readfirstlane(kargs.batch_stride_C);
const auto batch_offset_C = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_C);
CDataType* c_ptr = static_cast<CDataType*>(kargs.c_ptr) + batch_offset_C;

this->RunGemm(a_ptr, b_ptr, c_ptr, kargs, i_m, i_n);
// allocate LDS
__shared__ char smem_ptr[GetSmemSize()];

if(kargs.KBatch == 1)
{
this->RunGemm(a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n);
}
else
{
this->template RunGemm<memory_operation_enum::atomic_add>(
a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n);
}
}
};
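
For reference, a small host-side sketch (plain C++, illustrative values, not CK code) of the blockIdx.z packing used above: GridSize folds the split-K factor into the z dimension as KBatch * batch_count, and each work-group recovers its batch index and split-K index with a divide and remainder.

```cpp
#include <cstdio>

int main()
{
    const int KBatch = 2, batch_count = 3;
    const int grid_z = KBatch * batch_count; // what GridSize packs into dim z

    for(int z = 0; z < grid_z; ++z)
    {
        // Same decoding as in operator(): batch-major, split-K minor.
        const int i_batch = z / KBatch;
        const int i_k     = z - i_batch * KBatch; // == z % KBatch
        std::printf("blockIdx.z=%d -> batch=%d, k-split=%d\n", z, i_batch, i_k);
    }
    // blockIdx.z=0 -> batch=0, k-split=0
    // blockIdx.z=1 -> batch=0, k-split=1
    // blockIdx.z=2 -> batch=1, k-split=0
    // ...
}
```

Each (batch, k-split) pair then offsets its A/B pointers by the batch stride plus the split-K offset, and when KBatch > 1 the partial results are combined through the atomic_add epilogue path shown earlier.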
