
Commit 1ab46ef

Refactor GEMM layout and testing for improved clarity and functionality
- Updated `gemm_layouts.cc` to enhance the layout generation logic for transposed and non-transposed GEMM operations.
- Renamed and modified functions in `test_tilelang_tilelibrary_gemm.py` to reflect changes in GEMM function signatures and improve test coverage.
- Introduced new GEMM operation combinations in `gemm/__init__.py` to support additional layouts and configurations.
- Enhanced layout inference in `mma_layout.py` and `mma_macro_generator.py` for better handling of shared memory layouts.

These changes improve the clarity, functionality, and testing coverage of GEMM operations in the TileLang framework.
1 parent e299b41 commit 1ab46ef

File tree

9 files changed (+326, -66 lines)


3rdparty/tvm

Submodule tvm updated from 1fc7578 to eddefbd

src/layout/gemm_layouts.cc

Lines changed: 2 additions & 4 deletions
@@ -205,16 +205,14 @@ Fragment makeGemmFragmentB(const int block_m, const int block_n,
   ICHECK(block_k % 16 == 0);
   if (transposed) {
     auto base_layout = makeGemmFragment8x8()->Repeat({1, 2}, false, false);
-    auto warp_layout = base_layout->Replicate(block_m / warp_m)
-                           ->Repeat({block_n / warp_n, 1}, true, false);
+    auto warp_layout = base_layout->Repeat({block_n / warp_n, 1}, true, false)->Replicate(block_m / warp_m);
     auto block_layout =
         warp_layout->Repeat({warp_n / 8, block_k / 16}, false, false);
     return block_layout;
   } else {
     auto base_layout =
         makeGemmFragment8x8Transposed()->Repeat({2, 1}, false, false);
-    auto warp_layout = base_layout->Replicate(block_m / warp_m)
-                           ->Repeat({1, block_n / warp_n}, true);
+    auto warp_layout = base_layout->Repeat({1, block_n / warp_n}, true)->Replicate(block_m / warp_m);
     auto block_layout =
         warp_layout->Repeat({block_k / 16, warp_n / 8}, false, true);
     return block_layout;

src/op/gemm_py.cc

Lines changed: 12 additions & 2 deletions
@@ -12,6 +12,7 @@
 #include <tvm/tir/transform.h>
 
 #include "../target/utils.h"
+#include "tvm/ffi/string.h"
 
 namespace tvm {
 namespace tl {
@@ -224,9 +225,18 @@ Stmt GemmPyNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const {
       M, N, block_size, T.target, gemm_inst == GemmInst::kWGMMA);
 
   if (const auto f = ffi::Function::GetGlobal("tl.gemm_py.lower")) {
-    auto stmt = Downcast<Stmt>(
+    auto prim_func = Downcast<PrimFunc>(
         (*f)(GetRef<GemmPy>(this), T.target, T.thread_bounds, T.thread_var));
-    return stmt;
+    BlockRealize block_realize = Downcast<BlockRealize>(prim_func->body);
+    ICHECK(prim_func->attrs.defined());
+    auto global_symbol = prim_func->attrs.GetAttr<String>("global_symbol");
+    ICHECK(global_symbol.defined());
+    auto block = block_realize->block;
+    {
+      BlockNode* n = block.CopyOnWrite();
+      n->name_hint = global_symbol.value();
+    }
+    return BlockRealize(block_realize->iter_values, block_realize->predicate, block);
   } else {
     LOG(FATAL) << "No lower function found for gemm_py";
   }
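Not part of the commit, but for context: the new branch above downcasts the result of `tl.gemm_py.lower` to a `PrimFunc`, reads its `global_symbol` attribute, and copies that name onto the enclosing block. A minimal Python sketch of the same attribute lookup (the `lowered_gemm` function and its body are hypothetical stand-ins, not the actual lowering output):

```python
import tvm
from tvm.script import tir as T


@T.prim_func
def lowered_gemm(a: T.handle) -> None:
    # hypothetical stand-in for the PrimFunc returned by the Python lowering hook
    T.func_attr({"global_symbol": "gemm_py_kernel"})
    A = T.match_buffer(a, (8,), "float32")
    for i in range(8):
        with T.block("compute"):
            vi = T.axis.spatial(8, i)
            A[vi] = T.float32(0)


# The attribute the C++ code copies into the block's name_hint:
print(lowered_gemm.attrs["global_symbol"])  # -> "gemm_py_kernel"
```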

src/transform/inject_pipeline.cc

Lines changed: 6 additions & 0 deletions
@@ -675,6 +675,12 @@ class PipelineRewriter : public StmtExprMutator {
     }
     new_block = Downcast<Block>(Substitute(
         new_block, {{pipeline_loop_->loop_var, normalized_access_index}}));
+
+    Array<Array<BufferRegion>> access = GetBlockReadWriteRegion(block, buffer_data_to_buffer_);
+    BlockNode* n = new_block.CopyOnWrite();
+    n->reads = access[0];
+    n->writes = access[1];
+
     if (pipeline_info_[block].async) {
       auto &local_state = async_states_local[stage];
       local_state.producer_head = normalized_access_index;
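Not part of the commit: the added lines recompute the block's `reads`/`writes` regions after the loop-variable substitution. TVM exposes a similar analysis on the Python side as `tvm.tir.analysis.get_block_read_write_region`; a small illustrative sketch (the `copy_func` example is hypothetical):

```python
import tvm
from tvm.script import tir as T


@T.prim_func
def copy_func(a: T.handle, b: T.handle) -> None:
    A = T.match_buffer(a, (16,), "float32")
    B = T.match_buffer(b, (16,), "float32")
    for i in range(16):
        with T.block("copy"):
            vi = T.axis.spatial(16, i)
            B[vi] = A[vi]


sch = tvm.tir.Schedule(copy_func)
block = sch.get_sref(sch.get_block("copy")).stmt

# Map each buffer's data var to the buffer itself, mirroring buffer_data_to_buffer_.
buffer_var_map = {
    buf.data: buf for buf in (copy_func.buffer_map[p] for p in copy_func.params)
}
reads, writes = tvm.tir.analysis.get_block_read_write_region(block, buffer_var_map)
print(reads)   # region read by the block (A[vi])
print(writes)  # region written by the block (B[vi])
```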

testing/python/tilelibrary/test_tilelang_tilelibrary_gemm.py

Lines changed: 163 additions & 21 deletions
@@ -46,12 +46,13 @@ def main(
                 else:
                     T.copy(B[k * block_K, bx * block_N], B_shared)
                 T.gemm_v2(A_shared, B_shared, C_local, trans_A, trans_B)
+                # T.gemm(A_shared, B_shared, C_local, trans_A, trans_B)
             T.copy(C_local, C[by * block_M, bx * block_N])
 
     return main
 
 
-def run_gemm(
+def run_gemm_ss(
     M,
     N,
     K,
@@ -106,13 +107,13 @@ def ref_program(A, B):
     profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2)
 
 
-def test_gemm():
+def test_gemm_ss():
     # More test case can be found in kernel/test_tilelang_kernel_gemm.py
     # GEMM tests for float16
-    run_gemm(512, 1024, 768, False, True, "float16", "float16", "float16", 128, 128, 32, 0)
-    run_gemm(512, 1024, 768, False, False, "float16", "float16", "float16", 128, 128, 32, 0)
-    run_gemm(512, 1024, 768, True, False, "float16", "float16", "float16", 128, 128, 32, 0)
-    run_gemm(512, 1024, 768, True, True, "float16", "float16", "float16", 128, 128, 32, 0)
+    run_gemm_ss(512, 1024, 768, False, True, "float16", "float16", "float16", 128, 128, 32, 0)
+    run_gemm_ss(512, 1024, 768, False, False, "float16", "float16", "float16", 128, 128, 32, 0)
+    run_gemm_ss(512, 1024, 768, True, False, "float16", "float16", "float16", 128, 128, 32, 0)
+    run_gemm_ss(512, 1024, 768, True, True, "float16", "float16", "float16", 128, 128, 32, 0)
 
 
 
@@ -165,7 +166,6 @@ def main(
                     T.copy(B[k * block_K, bx * block_N], B_shared)
                 T.copy(A_shared, A_frag)
                 T.gemm_v2(A_frag, B_shared, C_local, trans_A, trans_B)
-                # T.gemm(A_frag, B_shared, C_local, trans_A, trans_B)
             T.copy(C_local, C[by * block_M, bx * block_N])
 
     return main
@@ -228,8 +228,8 @@ def ref_program(A, B):
 
 def test_gemm_rs():
     # GEMM tests for float16
-    run_gemm_rs(512, 1024, 768, False, False, "float16", "float16", "float16", 128, 256, 32, 2)
-    run_gemm_rs(512, 1024, 768, False, True, "float16", "float16", "float16", 128, 256, 32, 2)
+    run_gemm_rs(512, 1024, 768, False, False, "float16", "float16", "float16", 128, 256, 32, 0)
+    run_gemm_rs(512, 1024, 768, False, True, "float16", "float16", "float16", 128, 256, 32, 0)
 
 
 def matmul_sr(
@@ -280,7 +280,10 @@ def main(
                 else:
                     T.copy(B[k * block_K, bx * block_N], B_shared)
                 T.copy(B_shared, B_frag)
-                T.gemm_v2(A_shared, B_frag, C_local, trans_A, trans_B)
+                # for i, j in T.Parallel(block_N, block_K):
+                #     B_frag[i, j] = B_shared[j, i]
+                # T.gemm_v2(A_shared, B_frag, C_local, trans_A, trans_B)
+                T.gemm(A_shared, B_frag, C_local, trans_A, trans_B)
             T.copy(C_local, C[by * block_M, bx * block_N])
 
     return main
@@ -345,21 +348,160 @@ def test_gemm_sr():
     # GEMM tests for float16
     run_gemm_sr(512, 1024, 768, False, False, "float16", "float16", "float16", 128, 256, 32, 2)
     run_gemm_sr(512, 1024, 768, False, True, "float16", "float16", "float16", 128, 256, 32, 2)
+    run_gemm_sr(512, 1024, 768, True, False, "float16", "float16", "float16", 128, 256, 32, 2)
+    run_gemm_sr(512, 1024, 768, True, True, "float16", "float16", "float16", 128, 256, 32, 2)
+
+
+def matmul_rr(
+    M,
+    N,
+    K,
+    block_M,
+    block_N,
+    block_K,
+    trans_A,
+    trans_B,
+    in_dtype,
+    out_dtype,
+    accum_dtype,
+    num_stages,
+    threads,
+):
+    A_shape = (K, M) if trans_A else (M, K)
+    B_shape = (N, K) if trans_B else (K, N)
+    A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K)
+    B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N)
+    A_frag_shape = A_shared_shape
+    B_frag_shape = B_shared_shape
+
+    import tilelang.language as T
+
+    @T.prim_func
+    def main(
+            A: T.Tensor(A_shape, in_dtype),
+            B: T.Tensor(B_shape, in_dtype),
+            C: T.Tensor((M, N), out_dtype),
+    ):
+        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by):
+            A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope="shared")
+            B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope="shared")
+            A_frag = T.alloc_fragment(A_frag_shape, in_dtype)
+            B_frag = T.alloc_fragment(B_frag_shape, in_dtype)
+            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
+            T.clear(C_local)
+            T.annotate_layout({
+                A_shared: tilelang.layout.make_swizzled_layout(A_shared),
+                B_shared: tilelang.layout.make_swizzled_layout(B_shared),
+            })
+            for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
+                if trans_A:
+                    T.copy(A[k * block_K, by * block_M], A_shared)
+                else:
+                    T.copy(A[by * block_M, k * block_K], A_shared)
+                if trans_B:
+                    T.copy(B[bx * block_N, k * block_K], B_shared)
+                else:
+                    T.copy(B[k * block_K, bx * block_N], B_shared)
+                T.copy(A_shared, A_frag)
+                T.copy(B_shared, B_frag)
+                T.gemm_v2(A_frag, B_frag, C_local, trans_A, trans_B)
+            T.copy(C_local, C[by * block_M, bx * block_N])
+
+    return main
+
+
+def run_gemm_rr(
+    M,
+    N,
+    K,
+    trans_A,
+    trans_B,
+    in_dtype,
+    out_dtype,
+    dtypeAccum,
+    block_M,
+    block_N,
+    block_K,
+    num_stages=3,
+    num_threads=128,
+):
+    program = matmul_rr(
+        M,
+        N,
+        K,
+        block_M,
+        block_N,
+        block_K,
+        trans_A,
+        trans_B,
+        in_dtype,
+        out_dtype,
+        dtypeAccum,
+        num_stages,
+        num_threads,
+    )
+
+    kernel = tilelang.compile(
+        program,
+        out_idx=[2],
+        pass_configs={
+            tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
+            tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+        })
+    print(kernel.get_kernel_source())
+    profiler = kernel.get_profiler()
+
+    def ref_program(A, B):
+        import torch
+
+        if trans_A:
+            A = A.T
+        if trans_B:
+            B = B.T
+        C = torch.matmul(A.to(torch.float), B.to(torch.float))
+        C = C.to(torch.__getattribute__(out_dtype))
+        return C
+
+    profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2)
+
+
+def test_gemm_rr():
+    # GEMM tests for float16
+    run_gemm_rr(512, 1024, 768, False, False, "float16", "float16", "float16", 128, 256, 32, 2)
+    run_gemm_rr(512, 1024, 768, False, True, "float16", "float16", "float16", 128, 256, 32, 2)
+    run_gemm_rr(512, 1024, 768, True, False, "float16", "float16", "float16", 128, 256, 32, 2)
+    run_gemm_rr(512, 1024, 768, True, True, "float16", "float16", "float16", 128, 256, 32, 2)
+    run_gemm_rr(512, 1024, 768, False, True, "bfloat16", "bfloat16", "float", 128, 256, 32, 2)
 
 
 if __name__ == "__main__":
     # tilelang.testing.main()
     tilelang.disable_cache()
-    tilelang.testing.set_random_seed(42)
-    # run_gemm(512, 1024, 768, False, True, "float16", "float16", "float16", 128, 128, 32, 0)
+    # test_gemm_ss()
+    run_gemm_sr(128, 128, 128, False, False, "float16", "float16", "float16", 128, 128, 32, 2)
+    # tilelang.testing.set_random_seed(42)
+    # run_gemm_ss(128, 128, 128, False, True, "float16", "float16", "float16", 128, 128, 32, 1)
     # print("gemm fp16 nt ss done")
-    # run_gemm(512, 1024, 768, False, False, "float16", "float16", "float16", 128, 128, 32, 0)
-    # print("gemm fp16 nn ss done")
-    # run_gemm(512, 1024, 768, True, False, "float16", "float16", "float16", 128, 128, 32, 0)
-    # print("gemm fp16 tn ss done")
-    # run_gemm(512, 1024, 768, True, True, "float16", "float16", "float16", 128, 128, 32, 0)
-    # print("gemm fp16 tt ss done")
-    # run_gemm_rs(64, 64, 32, False, True, "float16", "float16", "float16", 64, 64, 32, 0, 128)
+    # exit()
+
+    # run_gemm_rs(128, 128, 32, False, True, "float16", "float16", "float16", 128, 128, 32, 0)
     # print("gemm fp16 nt rs done")
-    run_gemm_rs(64, 64, 32, False, True, "float16", "float16", "float16", 64, 64, 32, 0, 128)
-    # run_gemm(64, 64, 32, False, True, "float16", "float16", "float16", 64, 64, 32, 0, 128)
+    # run_gemm_rs(128, 128, 32, False, False, "float16", "float16", "float16", 128, 128, 32, 0)
+    # print("gemm fp16 nn rs done")
+    # run_gemm_rs(128, 128, 32, True, False, "float16", "float16", "float16", 128, 128, 32, 0)
+    # print("gemm fp16 tn rs done")
+    # run_gemm_rs(128, 128, 32, True, True, "float16", "float16", "float16", 128, 128, 32, 0)
+    # print("gemm fp16 tt rs done")
+
+    # run_gemm_rs(16, 16, 16, True, False, "float16", "float16", "float16", 16, 16, 16, 0, 32)
+
+    # run_gemm_rr(128, 128, 32, False, False, "bfloat16", "bfloat16", "float", 128, 128, 32, 0)
+    # print("gemm bf16 nn rr done")
+    # run_gemm_rr(128, 128, 32, False, True, "bfloat16", "bfloat16", "float", 128, 128, 32, 0)
+    # print("gemm bf16 nt rr done")
+    # run_gemm_rr(128, 128, 32, True, False, "bfloat16", "bfloat16", "float", 128, 128, 32, 0)
+    # print("gemm bf16 tn rr done")
+    # run_gemm_rr(128, 128, 32, True, True, "bfloat16", "bfloat16", "float", 128, 128, 32, 0)
+    # print("gemm bf16 tt rr done")

tilelang/engine/phase.py

Lines changed: 4 additions & 0 deletions
@@ -140,7 +140,11 @@ def OptimizeForTarget(mod: IRModule, target: Target) -> IRModule:
     mod = tilelang.transform.IfStmtBinding()(mod)
     mod = tir.transform.PlanAndUpdateBufferAllocationLocation()(mod)
    mod = tilelang.transform.PipelinePlanning()(mod)
+    print("after pipeline planning")
+    print(mod)
     mod = tilelang.transform.InjectSoftwarePipeline()(mod)
+    print("after inject software pipeline")
+    print(mod)
     mod = tilelang.transform.MergeIfStmt()(mod)
     if allow_fence_proxy(target=target):
         # in hopper device, wgmma is an async proxy

tilelang/intrinsics/mma_layout.py

Lines changed: 14 additions & 6 deletions
@@ -47,18 +47,26 @@ def mma_store_32x8_to_shared_16x16_layout(thread_id, local_id):
 
 # sr represents spatial + reduction layout
 # the first axis is spatial while the second axis is reduction
-def shared_16x16_to_mma_32x8_layout_sr(i, j):
+# mma.sync matrix A layout, if wanna trans, please apply map_indices
+def shared_16x16_to_mma_a_32x8_layout(i, j):
     thread_id = 4 * (i % 8) + (j % 8) // 2
     return thread_id, 4 * (j // 8) + (i // 8) * 2 + (j % 2)
 
+def shared_16x16_to_mma_a_32x8_layout_trans(i, j):
+    return shared_16x16_to_mma_a_32x8_layout(j, i)
 
-def shared_16x16_to_mma_32x8_layout_rs(i, j):
-    thread_id = 4 * (j % 8) + (i % 8) // 2
-    return thread_id, 4 * (i // 8) + (j // 8) * 2 + (i % 2)
+# mma.sync matrix B layout, if wanna trans, please apply map_indices
+def shared_16x16_to_mma_b_32x8_layout(i, j):
+    thread_id = 4 * (i % 8) + (j % 8) // 2
+    return thread_id, 4 * (i // 8) + (j // 8) * 2 + (j % 2)
 
+def shared_16x16_to_mma_b_32x8_layout_trans(i, j):
+    return shared_16x16_to_mma_b_32x8_layout(j, i)
 
-shared_16x16_to_mma_32x8_layout = shared_16x16_to_mma_32x8_layout_sr
-shared_16x16_to_mma_32x8_layout_trans = shared_16x16_to_mma_32x8_layout_rs
+shared_16x16_to_mma_32x8_layout_sr_a = shared_16x16_to_mma_a_32x8_layout
+shared_16x16_to_mma_32x8_layout_sr_b = shared_16x16_to_mma_b_32x8_layout
+shared_16x16_to_mma_32x8_layout_rs_a = shared_16x16_to_mma_a_32x8_layout_trans
+shared_16x16_to_mma_32x8_layout_rs_b = shared_16x16_to_mma_b_32x8_layout_trans
 
 
 def shared_16x32_to_mma_32x16_layout(i, j):
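Not part of the commit: a quick self-contained check of the renamed index maps above. For a 16x16 tile, each of the A and B mappings should hit every (thread_id, local_id) pair of the 32-thread x 8-register MMA fragment exactly once; the function bodies below are copied verbatim from the diff, only the assertion loop is added for illustration.

```python
def shared_16x16_to_mma_a_32x8_layout(i, j):
    thread_id = 4 * (i % 8) + (j % 8) // 2
    return thread_id, 4 * (j // 8) + (i // 8) * 2 + (j % 2)


def shared_16x16_to_mma_b_32x8_layout(i, j):
    thread_id = 4 * (i % 8) + (j % 8) // 2
    return thread_id, 4 * (i // 8) + (j // 8) * 2 + (j % 2)


for name, fn in [("A", shared_16x16_to_mma_a_32x8_layout),
                 ("B", shared_16x16_to_mma_b_32x8_layout)]:
    seen = {fn(i, j) for i in range(16) for j in range(16)}
    assert seen == {(t, r) for t in range(32) for r in range(8)}
    print(f"matrix {name}: 16x16 tile maps onto 32 threads x 8 registers exactly once")
```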
