Commit e299b41

Enhance CUDA code generation and testing for GEMM operations
- Added indentation printing in `codegen_cuda.cc` for improved assembly code formatting.
- Updated `test_tilelang_tilelibrary_gemm.py` to include additional GEMM test cases and shared memory allocation with an explicit scope.
- Introduced new `matmul_sr` and `run_gemm_sr` functions for GEMM operations with shared and fragment memory layouts.
- Refactored layout inference in `mma_macro_generator.py` to improve clarity and correctness in shared memory handling.
- Enhanced `gemm/__init__.py` to support new GEMM operation combinations and layout inference logic.

These changes improve the clarity, functionality, and test coverage of GEMM operations in the TileLang framework.
1 parent: 52800a5

4 files changed: +258 additions, -64 deletions

src/target/codegen_cuda.cc

Lines changed: 12 additions & 5 deletions
@@ -1259,7 +1259,7 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
     std::string asm_code = PrintMMAAssembly(
         shape, A_layout, B_layout, A_dtype, B_dtype, C_dtype, a_ref, a_bias,
         b_ref, b_bias, c_ref, c_bias, "", "", "", bit_op, false, saturate);
-
+    this->PrintIndent();
     this->stream << asm_code;
   } else if (op->op.same_as(builtin::ptx_mma_sp())) {
     // arg 0: shape: mXnXkX
@@ -1295,6 +1295,7 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
     std::string metadata_offset = this->PrintExpr(op->args[13]);
     std::string sparse_selector = this->PrintExpr(op->args[14]);
     bool saturate = Downcast<Bool>(op->args[15])->value;
+    this->PrintIndent();
     std::string asm_code = PrintMMAAssembly(
         shape, A_layout, B_layout, A_dtype, B_dtype, C_dtype, a_ref, a_offset,
         b_ref, b_offset, c_ref, c_offset, metadata, metadata_offset,
@@ -1330,10 +1331,16 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
       os << "}\n";
     } else {
       std::string smem_elem_offset = this->PrintExpr(op->args[6]);
-      need_cast_smem_ptr_to_int_ = true;
-      this->stream << PrintLoadMatrixAssembly(trans, num, type, local_ptr,
-                                              local_elem_offset, smem_ptr,
-                                              smem_elem_offset);
+      // need_cast_smem_ptr_to_int_ = true;
+      // this->stream << PrintLoadMatrixAssembly(trans, num, type, local_ptr,
+      //                                         local_elem_offset, smem_ptr,
+      //                                         smem_elem_offset);
+      std::string func_name = "tl::ptx_ldmatrix_x" + std::to_string(num);
+      if (trans == 1)
+        func_name += "_trans";
+      // this->stream << func_name << "(" << local_ptr << ", " << smem_ptr << ");\n";
+      this->PrintIndent();
+      this->stream << func_name << "(" << smem_ptr << " + " << smem_elem_offset << ", " << local_ptr << " + " << local_elem_offset << ");\n";
     }
   } else if (op->op.same_as(builtin::mma_store())) {
     int m = Downcast<Integer>(op->args[0])->value;
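In the `ptx_ldmatrix` branch, the inline assembly previously produced by `PrintLoadMatrixAssembly` is replaced with a call to a `tl::ptx_ldmatrix_x{num}` helper (with a `_trans` suffix for transposed loads), taking the shared and local pointers plus their element offsets. A minimal Python sketch of that call-string construction, for illustration only — the helper names and argument order come from the diff above, while the `emit_ldmatrix_call` function itself is hypothetical:

    # Illustrative sketch: mirrors the C++ branch above that builds the
    # "tl::ptx_ldmatrix_x{num}[_trans](smem + off, local + off);" call string.
    def emit_ldmatrix_call(num: int, trans: bool, smem_ptr: str, smem_elem_offset: str,
                           local_ptr: str, local_elem_offset: str) -> str:
        func_name = f"tl::ptx_ldmatrix_x{num}"
        if trans:
            func_name += "_trans"
        return (f"{func_name}({smem_ptr} + {smem_elem_offset}, "
                f"{local_ptr} + {local_elem_offset});\n")

    # Example: a transposed x4 load from shared memory into registers.
    print(emit_ldmatrix_call(4, True, "A_shared", "offset", "A_local", "0"))
    # -> tl::ptx_ldmatrix_x4_trans(A_shared + offset, A_local + 0);

Running the example prints `tl::ptx_ldmatrix_x4_trans(A_shared + offset, A_local + 0);`, which matches the shape of the line the codegen now streams out.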

testing/python/tilelibrary/test_tilelang_tilelibrary_gemm.py

Lines changed: 142 additions & 10 deletions
@@ -1,3 +1,4 @@
+from asyncio import threads
 from tilelang import tvm as tvm
 import tilelang.testing
 
@@ -31,8 +32,8 @@ def main(
             C: T.Tensor((M, N), out_dtype),
     ):
         with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by):
-            A_shared = T.alloc_shared(A_shared_shape, in_dtype)
-            B_shared = T.alloc_shared(B_shared_shape, in_dtype)
+            A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope="shared")
+            B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope="shared")
             C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
             T.clear(C_local)
             for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
@@ -108,8 +109,11 @@ def ref_program(A, B):
 def test_gemm():
     # More test case can be found in kernel/test_tilelang_kernel_gemm.py
     # GEMM tests for float16
-    run_gemm(512, 1024, 768, False, False, "float16", "float16", "float16", 128, 256, 32,
-             2)  # f16f16f16_nn
+    run_gemm(512, 1024, 768, False, True, "float16", "float16", "float16", 128, 128, 32, 0)
+    run_gemm(512, 1024, 768, False, False, "float16", "float16", "float16", 128, 128, 32, 0)
+    run_gemm(512, 1024, 768, True, False, "float16", "float16", "float16", 128, 128, 32, 0)
+    run_gemm(512, 1024, 768, True, True, "float16", "float16", "float16", 128, 128, 32, 0)
+
 
 
 def matmul_rs(
@@ -142,23 +146,26 @@ def main(
             C: T.Tensor((M, N), out_dtype),
     ):
         with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by):
-            A_shared = T.alloc_shared(A_shared_shape, in_dtype)
-            B_shared = T.alloc_shared(B_shared_shape, in_dtype)
+            A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope="shared")
+            B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope="shared")
             A_frag = T.alloc_fragment(A_frag_shape, in_dtype)
             C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
             T.clear(C_local)
+            T.annotate_layout({
+                A_shared: tilelang.layout.make_swizzled_layout(A_shared),
+            })
             for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
                 if trans_A:
                     T.copy(A[k * block_K, by * block_M], A_shared)
-                    T.copy(A_shared, A_frag)
                 else:
                     T.copy(A[by * block_M, k * block_K], A_shared)
-                    T.copy(A_shared, A_frag)
                 if trans_B:
                     T.copy(B[bx * block_N, k * block_K], B_shared)
                 else:
                     T.copy(B[k * block_K, bx * block_N], B_shared)
+                T.copy(A_shared, A_frag)
                 T.gemm_v2(A_frag, B_shared, C_local, trans_A, trans_B)
+                # T.gemm(A_frag, B_shared, C_local, trans_A, trans_B)
             T.copy(C_local, C[by * block_M, bx * block_N])
 
     return main
@@ -202,6 +209,7 @@ def run_gemm_rs(
            tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
            tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
        })
+    print(kernel.get_kernel_source())
    profiler = kernel.get_profiler()
 
    def ref_program(A, B):
@@ -224,10 +232,134 @@ def test_gemm_rs():
     run_gemm_rs(512, 1024, 768, False, True, "float16", "float16", "float16", 128, 256, 32, 2)
 
 
+def matmul_sr(
+    M,
+    N,
+    K,
+    block_M,
+    block_N,
+    block_K,
+    trans_A,
+    trans_B,
+    in_dtype,
+    out_dtype,
+    accum_dtype,
+    num_stages,
+    threads,
+):
+    A_shape = (K, M) if trans_A else (M, K)
+    B_shape = (N, K) if trans_B else (K, N)
+    A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K)
+    B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N)
+    B_frag_shape = B_shared_shape
+
+    import tilelang.language as T
+
+    @T.prim_func
+    def main(
+            A: T.Tensor(A_shape, in_dtype),
+            B: T.Tensor(B_shape, in_dtype),
+            C: T.Tensor((M, N), out_dtype),
+    ):
+        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by):
+            A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope="shared")
+            B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope="shared")
+            B_frag = T.alloc_fragment(B_frag_shape, in_dtype)
+            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
+            T.clear(C_local)
+            T.annotate_layout({
+                B_shared: tilelang.layout.make_swizzled_layout(B_shared),
+            })
+            for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
+                if trans_A:
+                    T.copy(A[k * block_K, by * block_M], A_shared)
+                else:
+                    T.copy(A[by * block_M, k * block_K], A_shared)
+                if trans_B:
+                    T.copy(B[bx * block_N, k * block_K], B_shared)
+                else:
+                    T.copy(B[k * block_K, bx * block_N], B_shared)
+                T.copy(B_shared, B_frag)
+                T.gemm_v2(A_shared, B_frag, C_local, trans_A, trans_B)
+            T.copy(C_local, C[by * block_M, bx * block_N])
+
+    return main
+
+
+def run_gemm_sr(
+    M,
+    N,
+    K,
+    trans_A,
+    trans_B,
+    in_dtype,
+    out_dtype,
+    dtypeAccum,
+    block_M,
+    block_N,
+    block_K,
+    num_stages=3,
+    num_threads=128,
+):
+    program = matmul_sr(
+        M,
+        N,
+        K,
+        block_M,
+        block_N,
+        block_K,
+        trans_A,
+        trans_B,
+        in_dtype,
+        out_dtype,
+        dtypeAccum,
+        num_stages,
+        num_threads,
+    )
+
+    kernel = tilelang.compile(
+        program,
+        out_idx=[2],
+        pass_configs={
+            tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
+            tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+        })
+    print(kernel.get_kernel_source())
+    profiler = kernel.get_profiler()
+
+    def ref_program(A, B):
+        import torch
+
+        if trans_A:
+            A = A.T
+        if trans_B:
+            B = B.T
+        C = torch.matmul(A.to(torch.float), B.to(torch.float))
+        C = C.to(torch.__getattribute__(out_dtype))
+        return C
+
+    profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2)
+
+
+def test_gemm_sr():
+    # GEMM tests for float16
+    run_gemm_sr(512, 1024, 768, False, False, "float16", "float16", "float16", 128, 256, 32, 2)
+    run_gemm_sr(512, 1024, 768, False, True, "float16", "float16", "float16", 128, 256, 32, 2)
+
+
 if __name__ == "__main__":
     # tilelang.testing.main()
     tilelang.disable_cache()
+    tilelang.testing.set_random_seed(42)
     # run_gemm(512, 1024, 768, False, True, "float16", "float16", "float16", 128, 128, 32, 0)
     # print("gemm fp16 nt ss done")
-    run_gemm(512, 1024, 768, False, False, "float16", "float16", "float16", 128, 128, 32, 0)
-    print("gemm fp16 nn ss done")
+    # run_gemm(512, 1024, 768, False, False, "float16", "float16", "float16", 128, 128, 32, 0)
+    # print("gemm fp16 nn ss done")
+    # run_gemm(512, 1024, 768, True, False, "float16", "float16", "float16", 128, 128, 32, 0)
+    # print("gemm fp16 tn ss done")
+    # run_gemm(512, 1024, 768, True, True, "float16", "float16", "float16", 128, 128, 32, 0)
+    # print("gemm fp16 tt ss done")
+    # run_gemm_rs(64, 64, 32, False, True, "float16", "float16", "float16", 64, 64, 32, 0, 128)
+    # print("gemm fp16 nt rs done")
+    run_gemm_rs(64, 64, 32, False, True, "float16", "float16", "float16", 64, 64, 32, 0, 128)
+    # run_gemm(64, 64, 32, False, True, "float16", "float16", "float16", 64, 64, 32, 0, 128)
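Taken together, the test file now covers three operand placements for the tile-level GEMM: the original `run_gemm` path keeps both A and B in shared memory, `run_gemm_rs` stages A into a register fragment, and the new `run_gemm_sr` stages B into a register fragment. A purely illustrative summary of that naming convention, inferred from the kernels above — the `OPERAND_PLACEMENT` table and `describe` helper are hypothetical names, not TileLang API:

    # Hypothetical helper, for illustration only: maps the test-name suffix to
    # the operand placement used by the corresponding kernel above.
    OPERAND_PLACEMENT = {
        "ss": ("A_shared", "B_shared"),   # matmul:    both operands read from shared memory
        "rs": ("A_frag", "B_shared"),     # matmul_rs: A staged into a register fragment
        "sr": ("A_shared", "B_frag"),     # matmul_sr: B staged into a register fragment
    }

    def describe(suffix: str) -> str:
        a, b = OPERAND_PLACEMENT[suffix]
        return f"gemm_{suffix}: A from {a}, B from {b}"

    if __name__ == "__main__":
        for s in ("ss", "rs", "sr"):
            print(describe(s))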

tilelang/intrinsics/mma_macro_generator.py

Lines changed: 8 additions & 11 deletions
@@ -189,19 +189,20 @@ def _warp_ldmatrix_a(
            stride = A_shared_buf.shape[-1]
            tx, _, warp_m = self.extract_thread_binding(thread_binding)
            trans = self.a_transposed
-
+
            for i in T.serial(warp_rows):
+                # Assign A_shared_buf_elem
+                wi, wk = warp_m * warp_row_tiles + i * micro_size_x, rk * chunk + ki * micro_size_k
+                A_shared_buf_elem = A_shared_buf[wk, wi] if a_transposed else A_shared_buf[wi, wk]
+
                T.ptx_ldmatrix(
                    a_dtype,
                    T.bool(trans),
                    4,
                    ".b16",
                    A_local_buf.data,
                    i * local_size_a,
-                    T.address_of(A_shared_buf[
-                        warp_m * warp_row_tiles + i * micro_size_x,
-                        rk * chunk + ki * micro_size_k,
-                    ]),
+                    T.address_of(A_shared_buf_elem),
                    get_ldmatrix_offset("A", tx, 0, stride, a_dtype, a_transposed),
                )
 
@@ -232,16 +233,15 @@
        ):
            stride = B_shared_buf.shape[-1]
            tx, warp_n, _ = self.extract_thread_binding(thread_binding)
-            trans = not self.b_transposed
+            trans = not b_transposed
 
            for j in T.serial(warp_cols):
                # Assign B_shared_elem
                wi, wk = (
                    warp_n * warp_col_tiles + j * micro_size_y,
                    rk * chunk + ki * micro_size_k,
                )
-                B_shared_buf_elem = B_shared_buf[wi, wk] if self.b_transposed else B_shared_buf[wk,
-                                                                                                wi]
+                B_shared_buf_elem = B_shared_buf[wi, wk] if b_transposed else B_shared_buf[wk, wi]
 
                T.ptx_ldmatrix(
                    b_dtype,
@@ -470,9 +470,6 @@ def forward_index(i: int, j: int) -> int:
        block_fragment = warp_fragment.repeat([warp_s, chunk // micro_size_r],
                                              repeat_on_thread=False,
                                              lower_dim_first=False)
-        print(f"base_fragment: {base_fragment}")
-        print(f"warp_fragment: {warp_fragment}")
-        print(f"block_fragment: {block_fragment}")
        return block_fragment
 
    def make_mma_store_layout(self, local_buf: Buffer) -> T.Fragment:
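The ldmatrix refactor above folds the transpose handling into a single element selection: when an operand is stored transposed (K-major), the row and k indices swap before `T.address_of` is taken, as in `A_shared_buf[wk, wi]` versus `A_shared_buf[wi, wk]`. A standalone NumPy sketch of that indexing rule, using made-up buffer shapes and indices purely for illustration:

    import numpy as np

    def shared_elem(shared_buf: np.ndarray, wi: int, wk: int, transposed: bool):
        """Pick the tile's anchor element: a transposed buffer is K-major,
        so the (wi, wk) roles swap, matching A_shared_buf[wk, wi] in the diff."""
        return shared_buf[wk, wi] if transposed else shared_buf[wi, wk]

    # Tiny check: an (M, K) buffer and its transposed (K, M) copy address the same value.
    M, K = 8, 4
    A = np.arange(M * K).reshape(M, K)
    A_t = A.T.copy()                      # K-major storage of the same matrix
    wi, wk = 5, 2                         # row tile index, k tile index (arbitrary)
    assert shared_elem(A, wi, wk, False) == shared_elem(A_t, wi, wk, True)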
