11from tilelang import tvm as tvm
22from tvm import tir
33from tilelang .utils .target import (
4- target_is_cuda ,
5- target_is_hip ,
6- )
7- from tilelang import _ffi_api
4+ target_is_cuda ,)
85from tilelang .intrinsics .mma_macro_generator import (
96 TensorCoreIntrinEmitter ,)
107from tilelang .layout import make_swizzled_layout
1411from tvm .runtime import Scriptable
1512import tvm .ffi
1613from tilelang .ir import GemmWarpPolicy
14+ from tilelang .transform .simplify import _Simplify
1715
1816
@tvm.ffi.register_func("tl.gemm_py.infer_layout")
def gemm_py_infer_layout(gemm_py, target, thread_bounds):
    """FFI entry point: delegate layout inference to the GemmPy node.

    `thread_bounds.extent` carries the number of threads available to the
    kernel; it is forwarded to `GemmPy.infer_layout` together with `target`.
    """
    return gemm_py.infer_layout(target, thread_bounds.extent)
2321
22+
@tvm.ffi.register_func("tl.gemm_py.lower")
def gemm_py_lower(gemm_py, target, thread_bounds, thread_var):
    """FFI entry point: delegate lowering to the GemmPy node.

    `thread_bounds.extent` carries the number of threads available to the
    kernel; the resulting TIR statement from `GemmPy.lower` is returned
    directly to the caller on the C++ side.
    """
    return gemm_py.lower(target, thread_bounds.extent, thread_var)
2928
29+
3030@tvm .ffi .register_object ("tl.GemmPy" )
3131class GemmPy (Node , Scriptable ):
3232 A : tir .Buffer
3333 B : tir .Buffer
3434 C : tir .Buffer
35-
35+
3636 APtr : tir .PrimExpr
3737 BPtr : tir .PrimExpr
3838 CPtr : tir .PrimExpr
@@ -52,23 +52,23 @@ class GemmPy(Node, Scriptable):
5252 k_pack : int
5353 wg_wait : int
5454 policy : GemmWarpPolicy
55-
5655
5756 def infer_layout (self , target : Target , thread_nums : int ):
5857 if target_is_cuda (target ):
5958 # TODO(lei): Support more cuda architectures, now mma only
6059 # Now only implement ssr layout
61- m_warp , n_warp = self .policy .compute_warp_partition (self .M , self .N , thread_nums , target , False )
62- warp_row_tiles = m_warp * 16
63- warp_col_tiles = n_warp * 16
60+ m_warp , n_warp = self .policy .compute_warp_partition (self .M , self .N , thread_nums , target ,
61+ False )
62+ warp_row_tiles = int (self .M // m_warp )
63+ warp_col_tiles = int (self .N // n_warp )
6464 mma_emitter = TensorCoreIntrinEmitter (
6565 a_dtype = self .in_dtype ,
6666 b_dtype = self .in_dtype ,
6767 accum_dtype = self .accum_dtype ,
6868 a_transposed = self .trans_A ,
6969 b_transposed = self .trans_B ,
70- block_row_warps = self . M ,
71- block_col_warps = self . N ,
70+ block_row_warps = m_warp ,
71+ block_col_warps = n_warp ,
7272 warp_row_tiles = warp_row_tiles ,
7373 warp_col_tiles = warp_col_tiles ,
7474 chunk = self .chunk ,
@@ -81,23 +81,23 @@ def infer_layout(self, target: Target, thread_nums: int):
8181 return layout_map
8282 else :
8383 raise ValueError (f"Unsupported target: { target } " )
84-
8584
8685 def lower (self , target : Target , thread_nums : int , thread_var : tir .Var ):
8786 if target_is_cuda (target ):
8887 # TODO(lei): Support more cuda architectures, now mma only
8988 # Now only implement ssr layout
90- m_warp , n_warp = self .policy .compute_warp_partition (self .M , self .N , thread_nums , target , False )
91- warp_row_tiles = m_warp * 16
92- warp_col_tiles = n_warp * 16
89+ m_warp , n_warp = self .policy .compute_warp_partition (self .M , self .N , thread_nums , target ,
90+ False )
91+ warp_row_tiles = int (self .M // m_warp )
92+ warp_col_tiles = int (self .N // n_warp )
9393 mma_emitter = TensorCoreIntrinEmitter (
9494 a_dtype = self .in_dtype ,
9595 b_dtype = self .in_dtype ,
9696 accum_dtype = self .accum_dtype ,
9797 a_transposed = self .trans_A ,
9898 b_transposed = self .trans_B ,
99- block_row_warps = self . M ,
100- block_col_warps = self . N ,
99+ block_row_warps = m_warp ,
100+ block_col_warps = n_warp ,
101101 warp_row_tiles = warp_row_tiles ,
102102 warp_col_tiles = warp_col_tiles ,
103103 chunk = self .chunk ,
@@ -125,7 +125,6 @@ def _gemm_ssr() -> None:
125125 A_local = T .alloc_local ((warp_rows * local_size_a ), in_dtype )
126126 B_local = T .alloc_local ((warp_cols * local_size_b ), in_dtype )
127127
128-
129128 for ki in T .serial (0 , (block_K // micro_size_k )):
130129 # Load A into fragment
131130 mma_emitter .ldmatrix_a (
@@ -143,10 +142,12 @@ def _gemm_ssr() -> None:
143142
144143 # Perform Matrix Multiplication
145144 mma_emitter .mma (A_local , B_local , C_local )
146- return _gemm_ssr .body
145+
146+ # Simplify to optimize the index computing
147+ # Must inline let statements to simplify the analysis
148+ return _Simplify (_gemm_ssr , inline_let = True ).body
147149 else :
148150 raise ValueError (f"Unsupported target: { target } " )
149-
150151
151152 @property
152153 def in_dtype (self ) -> str :
@@ -156,7 +157,7 @@ def in_dtype(self) -> str:
@property
def accum_dtype(self) -> str:
    """Accumulation data type, taken from the output buffer C."""
    c_buffer = self.C
    return c_buffer.dtype
159-
160+
@property
def chunk(self) -> int:
    """Extent of the reduction (K) axis of buffer A.

    When A is stored transposed the K axis is the second-to-last
    dimension of its shape; otherwise it is the last dimension.
    """
    k_axis = -2 if self.trans_A else -1
    return self.A.shape[k_axis]
0 commit comments