From 46794c4217339a07c4e3825f97f6e19119117535 Mon Sep 17 00:00:00 2001 From: xinxyxiao Date: Tue, 29 Jul 2025 03:26:10 +0000 Subject: [PATCH 01/11] [Enhancement] Refactor buffer index handling for improved precision and clarity (#668) - Enhanced buffer index handling to address precision issues by removing redundant operations. - Streamlined the logic for determining buffer overlaps, ensuring more accurate conflict detection. - Updated related documentation to reflect changes in buffer management practices. --- examples/amd/example_amd_flash_attn_fwd.py | 271 +++++++++++++++++++++ examples/amd/test.sh | 16 ++ 2 files changed, 287 insertions(+) create mode 100644 examples/amd/example_amd_flash_attn_fwd.py create mode 100644 examples/amd/test.sh diff --git a/examples/amd/example_amd_flash_attn_fwd.py b/examples/amd/example_amd_flash_attn_fwd.py new file mode 100644 index 000000000..6ed2c221d --- /dev/null +++ b/examples/amd/example_amd_flash_attn_fwd.py @@ -0,0 +1,271 @@ +# Copyright (c) Tile-AI Corporation. +# Licensed under the MIT License. +# +# Modified to implement FlashAttention-2 forward pass principles. +# Corrected loop implementation using T.while_loop. + +import torch +import torch.nn.functional as F +import tilelang +import tilelang.language as T +import itertools +import argparse +from functools import partial + + +# PyTorch 参考实现保持不变 +def ref_program(Q, K, V, is_causal, groups=1): + assert Q.size( + 2) == K.size(2) * groups, f"Q heads {Q.size(2)} K heads {K.size(2)} groups {groups}" + assert Q.size( + 2) == V.size(2) * groups, f"Q heads {Q.size(2)} V heads {V.size(2)} groups {groups}" + dim = Q.size(-1) + K = K.repeat_interleave(groups, dim=2) + V = V.repeat_interleave(groups, dim=2) + scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) + if is_causal: + seq_len = Q.size(1) + mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) + mask = mask.unsqueeze(0).unsqueeze(0) + scores = scores.masked_fill(mask == 0, float('-inf')) + attention_weights = F.softmax(scores, dim=-1) + output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + return output + + +def get_v2_configs(): + """Generates configurations for the autotuner, tailored for FA-2 style parallelism.""" + block_M = [64, 128, 256] + block_N = [32, 64, 128] + threads = [128, 256, 512] + num_split_q = [32, 64, 128] + num_stages = [1, 2, 3] + enable_rasterization = [True] + k_pack = [2] + + valid_configs = [] + + for m, n, s, t, stages, r, k in itertools.product(block_M, block_N, num_split_q, threads, + num_stages, enable_rasterization, k_pack): + valid_configs.append({ + "block_M": m, + "block_N": n, + "num_split_q": s, + "threads": t, + "num_stages": stages, + "enable_rasterization": r, + "k_pack": k + }) + if not valid_configs: + valid_configs.append({ + 'block_M': 64, + 'block_N': 64, + 'num_split_q': 64, + 'threads': 256, + 'num_stages': 1, + 'enable_rasterization': True, + 'k_pack': 2 + }) + return valid_configs + + +@tilelang.autotune(configs=get_v2_configs(), cache_input_tensors=True) +@tilelang.jit(out_idx=[3]) +def fast_flashattn_v2( + batch, + heads, + seq_len, + dim, + is_causal, + groups, + block_M: int, + block_N: int, + num_split_q: int, + threads: int, + num_stages: int, + enable_rasterization: bool, + k_pack: int, +): + scale = (1.0 / dim)**0.5 * 1.44269504 + head_kv = heads // groups + q_shape = [batch, seq_len, heads, dim] + kv_shape = [batch, seq_len, head_kv, dim] + dtype = "float16" + accum_dtype = "float" + + dtype_size 
= 2 + v_vec_size = 4 + + vec_size = 4 * k_pack + + @T.macro + def compute_block( + bz, + by, + bx, + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + m_i: T.FragmentBuffer([block_M], accum_dtype), + l_i: T.FragmentBuffer([block_M], accum_dtype), + ): + Q_shared = T.alloc_shared([block_M, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim], dtype) + P_shared = T.alloc_shared([block_M, block_N], dtype) + + acc_s = T.alloc_fragment([block_M, block_N], accum_dtype) + m_prev = T.alloc_fragment([block_M], accum_dtype) + scale_factor = T.alloc_fragment([block_M], accum_dtype) + + q_block_offset = bx * block_M + T.copy( + Q[bz, q_block_offset:q_block_offset + block_M, by, :], + Q_shared, + coalesced_width=vec_size) + + loop_end_k = T.ceildiv(q_block_offset + + block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) + for k in T.Pipelined(loop_end_k, num_stages=num_stages): + kv_idx = k * block_N + T.copy( + K[bz, kv_idx:kv_idx + block_N, by // groups, :], K_shared, coalesced_width=vec_size) + T.copy( + V[bz, kv_idx:kv_idx + block_N, by // groups, :], + V_shared, + coalesced_width=v_vec_size) + + T.clear(acc_s) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, k_pack=k_pack) + + if is_causal: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, acc_s[i, j], + -T.infinity(acc_s.dtype)) + + T.copy(m_i, m_prev) + T.reduce_max(acc_s, m_i, dim=1, clear=False) + + for i in T.Parallel(block_M): + sf = T.exp2(m_prev[i] * scale - m_i[i] * scale) + l_i[i] *= sf + scale_factor[i] = sf + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scale_factor[i] + + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - m_i[i] * scale) + + row_sum = T.alloc_fragment([block_M], accum_dtype) + T.reduce_sum(acc_s, row_sum, dim=1) + for i in T.Parallel(block_M): + l_i[i] += row_sum[i] + + T.copy(acc_s, P_shared) + T.sync_threads() + + T.gemm(P_shared, V_shared, acc_o) + + # 修复:将宏移至内核外部,以实现清晰的代码结构。 + @T.macro + def scale_and_write_back(src_buffer, scale_vector, dest_tensor, bz, by, q_block_offset): + # 此宏执行融合的缩放和写回操作,这对性能至关重要。 + for i, j in T.Parallel(block_M, dim): + dest_tensor[bz, q_block_offset + i, by, j] = src_buffer[i, j] * scale_vector[i] + + @T.macro + def flash_attn_forward_kernel(Q: T.Tensor(q_shape, dtype), K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), Output: T.Tensor(q_shape, dtype)): + with T.Kernel(num_split_q, batch * heads, threads=threads) as (b_split, byz_combined): + T.use_swizzle(10, enable=enable_rasterization) + + bz = byz_combined // heads + by = byz_combined % heads + + num_q_blocks = T.ceildiv(seq_len, block_M) + + bx = T.alloc_var("int32") + bx[0] = b_split + + with T.While(bx[0] < num_q_blocks): + acc_o = T.alloc_fragment([block_M, dim], accum_dtype) + m_i = T.alloc_fragment([block_M], accum_dtype) + l_i = T.alloc_fragment([block_M], accum_dtype) + T.fill(acc_o, 0) + T.fill(m_i, -T.infinity(accum_dtype)) + T.fill(l_i, 0) + + current_bx = bx[0] + + compute_block(bz, by, current_bx, Q, K, V, acc_o, m_i, l_i) + + l_inv = T.alloc_fragment([block_M], accum_dtype) + for i in T.Parallel(block_M): + safe_l = T.if_then_else(l_i[i] > 1e-6, l_i[i], 1.0) + l_inv[i] = 1.0 / safe_l + + # 修复:现在对宏的调用对编译器来说更清晰。 + q_block_offset = current_bx * block_M + scale_and_write_back(acc_o, l_inv, Output, bz, by, q_block_offset) + + bx[0] = 
current_bx + num_split_q + + @T.prim_func + def main( + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), + ): + flash_attn_forward_kernel(Q, K, V, Output) + + return main + + +# main 函数保持不变 +def main_v2(batch: int = 1, + heads: int = 8, + seq_len: int = 4096, + dim: int = 128, + is_causal: bool = False, + groups: int = 1): + + flops_per_matmul = 2.0 * batch * heads * seq_len * seq_len * dim + total_flops = 2 * flops_per_matmul + if is_causal: + total_flops *= 0.5 + + print("Starting autotuning for FlashAttention-V2...") + kernel = fast_flashattn_v2(batch, heads, seq_len, dim, is_causal, groups=groups) + print(f"Autotuning finished. Best Configuration: {kernel.config}") + + ref_program_processed = partial(ref_program, is_causal=is_causal, groups=groups) + + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + + print("Verifying correctness...") + profiler.assert_allclose(ref_program_processed, rtol=0.01, atol=0.01) + print("All checks pass.") + + latency = profiler.do_bench(ref_program_processed, warmup=100) + print(f"Reference (PyTorch): {latency:.2f} ms | {total_flops / latency * 1e-9:.2f} TFlops") + + latency = profiler.do_bench(warmup=100) + print( + f"Fast Flash Attention V2 (Tile-lang): {latency:.2f} ms | {total_flops / latency * 1e-9:.2f} TFlops" + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--batch', type=int, default=1, help='batch size') + parser.add_argument('--heads', type=int, default=8, help='heads') + parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') + parser.add_argument('--dim', type=int, default=128, help='dim') + parser.add_argument('--is_causal', action='store_true', help='causal') + parser.add_argument('--groups', type=int, default=1, help='groups') + args = parser.parse_args() + main_v2(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.groups) diff --git a/examples/amd/test.sh b/examples/amd/test.sh new file mode 100644 index 000000000..5df31ed17 --- /dev/null +++ b/examples/amd/test.sh @@ -0,0 +1,16 @@ +/bin/python /workspace/tilelang/examples/amd/example_amd_flash_attn_fwd.py --batch 2 --seq_len 4096 --dim 128 --heads 16 --groups 8 --is_causal +# /workspace/aiter/3rdparty/composable_kernel/build/bin/tile_example_fmha_fwd \ +# -b=2 \ +# -s=4096 \ +# -d=128 \ +# -h=16 \ +# -h_k=2 \ +# -prec=fp16 \ +# -mask=t \ +# -v=0 + +# hipcc -std=c++17 -fPIC --offload-arch=gfx942 -S \ +# ~/.tilelang/cache/0dad5010ab3d8e01b2a86e56208af334ad62865b92cb03038f9df84fda8d8a99/kernel.cu \ +# -o ./kernel.s \ +# -I/workspace/tilelang/3rdparty/composable_kernel/include \ +# -I/workspace/tilelang/src \ No newline at end of file From 499daa3a0b932ee9db1e3f9911d6db4706fde5bf Mon Sep 17 00:00:00 2001 From: xinxyxiao Date: Tue, 29 Jul 2025 03:30:32 +0000 Subject: [PATCH 02/11] Remove obsolete test script for AMD example, streamlining the examples directory. 
--- examples/amd/test.sh | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 examples/amd/test.sh diff --git a/examples/amd/test.sh b/examples/amd/test.sh deleted file mode 100644 index 5df31ed17..000000000 --- a/examples/amd/test.sh +++ /dev/null @@ -1,16 +0,0 @@ -/bin/python /workspace/tilelang/examples/amd/example_amd_flash_attn_fwd.py --batch 2 --seq_len 4096 --dim 128 --heads 16 --groups 8 --is_causal -# /workspace/aiter/3rdparty/composable_kernel/build/bin/tile_example_fmha_fwd \ -# -b=2 \ -# -s=4096 \ -# -d=128 \ -# -h=16 \ -# -h_k=2 \ -# -prec=fp16 \ -# -mask=t \ -# -v=0 - -# hipcc -std=c++17 -fPIC --offload-arch=gfx942 -S \ -# ~/.tilelang/cache/0dad5010ab3d8e01b2a86e56208af334ad62865b92cb03038f9df84fda8d8a99/kernel.cu \ -# -o ./kernel.s \ -# -I/workspace/tilelang/3rdparty/composable_kernel/include \ -# -I/workspace/tilelang/src \ No newline at end of file From 555537a711110a7f490c3da462db98483be85372 Mon Sep 17 00:00:00 2001 From: xinxyxiao Date: Tue, 29 Jul 2025 03:36:28 +0000 Subject: [PATCH 03/11] Remove unused dtype_size variable in AMD example script to streamline code. --- examples/amd/example_amd_flash_attn_fwd.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/amd/example_amd_flash_attn_fwd.py b/examples/amd/example_amd_flash_attn_fwd.py index 6ed2c221d..874494ef1 100644 --- a/examples/amd/example_amd_flash_attn_fwd.py +++ b/examples/amd/example_amd_flash_attn_fwd.py @@ -94,7 +94,6 @@ def fast_flashattn_v2( dtype = "float16" accum_dtype = "float" - dtype_size = 2 v_vec_size = 4 vec_size = 4 * k_pack From f84bc978dd26096de4507c370f6576f67a691305 Mon Sep 17 00:00:00 2001 From: xinyxiao Date: Wed, 30 Jul 2025 12:28:35 +0000 Subject: [PATCH 04/11] Add input configuration file and update AMD example script for enhanced flexibility - Introduced a new input.txt file for configurable parameters. - Modified the example_amd_flash_attn_fwd.py script to allow for a wider range of configurations, including additional options for num_stages, enable_rasterization, and k_pack. - Streamlined the main function for better clarity and organization. - Added a new test script to facilitate running the example with specified parameters. 
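For context on the wider configuration range mentioned above: the autotuner consumes a plain list of keyword dictionaries, and each dictionary must match the tunable keyword parameters of the jitted kernel factory. Below is a minimal sketch of that pattern with a deliberately tiny search space; the decorator names, `cache_input_tensors`, `out_idx`, `kernel.config`, and the config-list mechanism follow the example file in this series, while the toy elementwise kernel and its parameter choices are illustrative assumptions only.

import itertools

import tilelang
import tilelang.language as T


def get_small_configs():
    # Cartesian product over a reduced search space; the dictionary keys must
    # line up with the tunable keyword arguments of `scaled_copy` below.
    block_M = [64, 128]
    block_N = [64, 128]
    threads = [128, 256]
    return [{
        "block_M": m,
        "block_N": n,
        "threads": t
    } for m, n, t in itertools.product(block_M, block_N, threads)]


@tilelang.autotune(configs=get_small_configs(), cache_input_tensors=True)
@tilelang.jit(out_idx=[1])  # position of the output tensor among the prim_func arguments
def scaled_copy(M, N, block_M: int, block_N: int, threads: int, dtype="float16"):

    @T.prim_func
    def main(A: T.Tensor([M, N], dtype), B: T.Tensor([M, N], dtype)):
        with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=threads) as (bx, by):
            for i, j in T.Parallel(block_M, block_N):
                B[bx * block_M + i, by * block_N + j] = A[bx * block_M + i, by * block_N + j] * 2

    return main


# Usage (mirroring the example's driver): only the shapes are passed explicitly;
# the tuner fills in the keyword parameters and exposes the winner as `kernel.config`.
# kernel = scaled_copy(4096, 4096)
# print(kernel.config)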
--- examples/amd/example_amd_flash_attn_fwd.py | 181 +++++++++------------ examples/amd/test.sh | 16 ++ input.txt | 2 + src/tl_templates/hip/reduce.h | 4 +- 4 files changed, 98 insertions(+), 105 deletions(-) create mode 100644 examples/amd/test.sh create mode 100644 input.txt diff --git a/examples/amd/example_amd_flash_attn_fwd.py b/examples/amd/example_amd_flash_attn_fwd.py index 874494ef1..8507df702 100644 --- a/examples/amd/example_amd_flash_attn_fwd.py +++ b/examples/amd/example_amd_flash_attn_fwd.py @@ -40,9 +40,9 @@ def get_v2_configs(): block_N = [32, 64, 128] threads = [128, 256, 512] num_split_q = [32, 64, 128] - num_stages = [1, 2, 3] - enable_rasterization = [True] - k_pack = [2] + num_stages = [0, 1, 2] + enable_rasterization = [True, False] + k_pack = [1, 2] valid_configs = [] @@ -57,16 +57,15 @@ def get_v2_configs(): "enable_rasterization": r, "k_pack": k }) - if not valid_configs: - valid_configs.append({ - 'block_M': 64, - 'block_N': 64, - 'num_split_q': 64, - 'threads': 256, - 'num_stages': 1, - 'enable_rasterization': True, - 'k_pack': 2 - }) + valid_configs.append({ + 'block_M': 64, + 'block_N': 64, + 'num_split_q': 64, + 'threads': 256, + 'num_stages': 1, + 'enable_rasterization': True, + 'k_pack': 2 + }) return valid_configs @@ -95,89 +94,15 @@ def fast_flashattn_v2( accum_dtype = "float" v_vec_size = 4 - vec_size = 4 * k_pack - @T.macro - def compute_block( - bz, - by, - bx, + @T.prim_func + def main( Q: T.Tensor(q_shape, dtype), K: T.Tensor(kv_shape, dtype), V: T.Tensor(kv_shape, dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - m_i: T.FragmentBuffer([block_M], accum_dtype), - l_i: T.FragmentBuffer([block_M], accum_dtype), + Output: T.Tensor(q_shape, dtype), ): - Q_shared = T.alloc_shared([block_M, dim], dtype) - K_shared = T.alloc_shared([block_N, dim], dtype) - V_shared = T.alloc_shared([block_N, dim], dtype) - P_shared = T.alloc_shared([block_M, block_N], dtype) - - acc_s = T.alloc_fragment([block_M, block_N], accum_dtype) - m_prev = T.alloc_fragment([block_M], accum_dtype) - scale_factor = T.alloc_fragment([block_M], accum_dtype) - - q_block_offset = bx * block_M - T.copy( - Q[bz, q_block_offset:q_block_offset + block_M, by, :], - Q_shared, - coalesced_width=vec_size) - - loop_end_k = T.ceildiv(q_block_offset + - block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) - for k in T.Pipelined(loop_end_k, num_stages=num_stages): - kv_idx = k * block_N - T.copy( - K[bz, kv_idx:kv_idx + block_N, by // groups, :], K_shared, coalesced_width=vec_size) - T.copy( - V[bz, kv_idx:kv_idx + block_N, by // groups, :], - V_shared, - coalesced_width=v_vec_size) - - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, k_pack=k_pack) - - if is_causal: - for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, acc_s[i, j], - -T.infinity(acc_s.dtype)) - - T.copy(m_i, m_prev) - T.reduce_max(acc_s, m_i, dim=1, clear=False) - - for i in T.Parallel(block_M): - sf = T.exp2(m_prev[i] * scale - m_i[i] * scale) - l_i[i] *= sf - scale_factor[i] = sf - - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scale_factor[i] - - for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - m_i[i] * scale) - - row_sum = T.alloc_fragment([block_M], accum_dtype) - T.reduce_sum(acc_s, row_sum, dim=1) - for i in T.Parallel(block_M): - l_i[i] += row_sum[i] - - T.copy(acc_s, P_shared) - T.sync_threads() - - T.gemm(P_shared, V_shared, acc_o) - - # 修复:将宏移至内核外部,以实现清晰的代码结构。 - @T.macro - def 
scale_and_write_back(src_buffer, scale_vector, dest_tensor, bz, by, q_block_offset): - # 此宏执行融合的缩放和写回操作,这对性能至关重要。 - for i, j in T.Parallel(block_M, dim): - dest_tensor[bz, q_block_offset + i, by, j] = src_buffer[i, j] * scale_vector[i] - - @T.macro - def flash_attn_forward_kernel(Q: T.Tensor(q_shape, dtype), K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), Output: T.Tensor(q_shape, dtype)): with T.Kernel(num_split_q, batch * heads, threads=threads) as (b_split, byz_combined): T.use_swizzle(10, enable=enable_rasterization) @@ -198,29 +123,78 @@ def flash_attn_forward_kernel(Q: T.Tensor(q_shape, dtype), K: T.Tensor(kv_shape, T.fill(l_i, 0) current_bx = bx[0] + q_block_offset = current_bx * block_M - compute_block(bz, by, current_bx, Q, K, V, acc_o, m_i, l_i) + Q_shared = T.alloc_shared([block_M, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim], dtype) + P_shared = T.alloc_shared([block_M, block_N], dtype) + + acc_s = T.alloc_fragment([block_M, block_N], accum_dtype) + m_prev = T.alloc_fragment([block_M], accum_dtype) + scale_factor = T.alloc_fragment([block_M], accum_dtype) + + T.copy( + Q[bz, q_block_offset:q_block_offset + block_M, by, :], + Q_shared, + coalesced_width=vec_size) + + loop_end_k = T.ceildiv(q_block_offset + block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) + + for k in T.Pipelined(loop_end_k, num_stages=num_stages): + kv_idx = k * block_N + + T.copy( + K[bz, kv_idx:kv_idx + block_N, by // groups, :], + K_shared, + coalesced_width=vec_size) + T.copy( + V[bz, kv_idx:kv_idx + block_N, by // groups, :], + V_shared, + coalesced_width=v_vec_size) + + T.clear(acc_s) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, k_pack=k_pack) + + if is_causal: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, acc_s[i, j], + -T.infinity(acc_s.dtype)) + + T.copy(m_i, m_prev) + T.reduce_max(acc_s, m_i, dim=1, clear=False) + + for i in T.Parallel(block_M): + sf = T.exp2(m_prev[i] * scale - m_i[i] * scale) + l_i[i] *= sf + scale_factor[i] = sf + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scale_factor[i] + + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - m_i[i] * scale) + + row_sum = T.alloc_fragment([block_M], accum_dtype) + T.reduce_sum(acc_s, row_sum, dim=1) + for i in T.Parallel(block_M): + l_i[i] += row_sum[i] + + T.copy(acc_s, P_shared) + T.sync_threads() + + T.gemm(P_shared, V_shared, acc_o) l_inv = T.alloc_fragment([block_M], accum_dtype) for i in T.Parallel(block_M): safe_l = T.if_then_else(l_i[i] > 1e-6, l_i[i], 1.0) l_inv[i] = 1.0 / safe_l - # 修复:现在对宏的调用对编译器来说更清晰。 - q_block_offset = current_bx * block_M - scale_and_write_back(acc_o, l_inv, Output, bz, by, q_block_offset) + for i, j in T.Parallel(block_M, dim): + Output[bz, q_block_offset + i, by, j] = acc_o[i, j] * l_inv[i] bx[0] = current_bx + num_split_q - @T.prim_func - def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), - ): - flash_attn_forward_kernel(Q, K, V, Output) - return main @@ -268,3 +242,4 @@ def main_v2(batch: int = 1, parser.add_argument('--groups', type=int, default=1, help='groups') args = parser.parse_args() main_v2(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.groups) + diff --git a/examples/amd/test.sh b/examples/amd/test.sh new file mode 100644 index 000000000..5df31ed17 --- /dev/null +++ 
b/examples/amd/test.sh @@ -0,0 +1,16 @@ +/bin/python /workspace/tilelang/examples/amd/example_amd_flash_attn_fwd.py --batch 2 --seq_len 4096 --dim 128 --heads 16 --groups 8 --is_causal +# /workspace/aiter/3rdparty/composable_kernel/build/bin/tile_example_fmha_fwd \ +# -b=2 \ +# -s=4096 \ +# -d=128 \ +# -h=16 \ +# -h_k=2 \ +# -prec=fp16 \ +# -mask=t \ +# -v=0 + +# hipcc -std=c++17 -fPIC --offload-arch=gfx942 -S \ +# ~/.tilelang/cache/0dad5010ab3d8e01b2a86e56208af334ad62865b92cb03038f9df84fda8d8a99/kernel.cu \ +# -o ./kernel.s \ +# -I/workspace/tilelang/3rdparty/composable_kernel/include \ +# -I/workspace/tilelang/src \ No newline at end of file diff --git a/input.txt b/input.txt new file mode 100644 index 000000000..7a01fa0d3 --- /dev/null +++ b/input.txt @@ -0,0 +1,2 @@ +att: TARGET_CU=1 +KERNEL=main \ No newline at end of file diff --git a/src/tl_templates/hip/reduce.h b/src/tl_templates/hip/reduce.h index fb7231aae..02464a181 100644 --- a/src/tl_templates/hip/reduce.h +++ b/src/tl_templates/hip/reduce.h @@ -22,7 +22,7 @@ struct MinOp { } }; -template struct AllReduce { +template struct AllReduce { static_assert(threads == 1024 || threads == 512 || threads == 256 || threads == 128 || threads == 64 || threads == 32 || threads == 16 || threads == 8 || threads == 4 || threads == 2); @@ -43,7 +43,7 @@ template struct AllReduce { if constexpr (offset == scale) { return x; } else { - return AllReduce::run(x, red_buf); + return AllReduce::run(x, red_buf); } } }; From 21cf0c3c7bf61fa96b9c66ce3c66f8aa2b7e4b53 Mon Sep 17 00:00:00 2001 From: xinyxiao Date: Wed, 30 Jul 2025 13:42:28 +0000 Subject: [PATCH 05/11] Remove input configuration file and obsolete test script; enhance AMD example with swizzle layout annotations - Deleted input.txt and test.sh files as they are no longer needed. - Updated example_amd_flash_attn_fwd.py to include swizzle layout annotations for shared memory, improving bank conflict avoidance. - Reintroduced swizzle usage in the kernel for better performance. --- examples/amd/test.sh | 16 ---------------- input.txt | 2 -- 2 files changed, 18 deletions(-) delete mode 100644 examples/amd/test.sh delete mode 100644 input.txt diff --git a/examples/amd/test.sh b/examples/amd/test.sh deleted file mode 100644 index 5df31ed17..000000000 --- a/examples/amd/test.sh +++ /dev/null @@ -1,16 +0,0 @@ -/bin/python /workspace/tilelang/examples/amd/example_amd_flash_attn_fwd.py --batch 2 --seq_len 4096 --dim 128 --heads 16 --groups 8 --is_causal -# /workspace/aiter/3rdparty/composable_kernel/build/bin/tile_example_fmha_fwd \ -# -b=2 \ -# -s=4096 \ -# -d=128 \ -# -h=16 \ -# -h_k=2 \ -# -prec=fp16 \ -# -mask=t \ -# -v=0 - -# hipcc -std=c++17 -fPIC --offload-arch=gfx942 -S \ -# ~/.tilelang/cache/0dad5010ab3d8e01b2a86e56208af334ad62865b92cb03038f9df84fda8d8a99/kernel.cu \ -# -o ./kernel.s \ -# -I/workspace/tilelang/3rdparty/composable_kernel/include \ -# -I/workspace/tilelang/src \ No newline at end of file diff --git a/input.txt b/input.txt deleted file mode 100644 index 7a01fa0d3..000000000 --- a/input.txt +++ /dev/null @@ -1,2 +0,0 @@ -att: TARGET_CU=1 -KERNEL=main \ No newline at end of file From 9b2fab3189f29b2fd6b9d1278325758b4ca6f40b Mon Sep 17 00:00:00 2001 From: xinyxiao Date: Wed, 30 Jul 2025 13:45:31 +0000 Subject: [PATCH 06/11] Refactor AMD example script for FlashAttention-2 - Updated function names for clarity, changing `get_v2_configs` to `get_configs` and `fast_flashattn_v2` to `fast_flashattn`. 
- Streamlined the main function by renaming `main_v2` to `main` and adjusting the corresponding calls. - Removed outdated comments and improved code organization for better readability. --- examples/amd/example_amd_flash_attn_fwd.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/examples/amd/example_amd_flash_attn_fwd.py b/examples/amd/example_amd_flash_attn_fwd.py index 8507df702..cecbf6337 100644 --- a/examples/amd/example_amd_flash_attn_fwd.py +++ b/examples/amd/example_amd_flash_attn_fwd.py @@ -1,8 +1,6 @@ # Copyright (c) Tile-AI Corporation. # Licensed under the MIT License. -# -# Modified to implement FlashAttention-2 forward pass principles. -# Corrected loop implementation using T.while_loop. + import torch import torch.nn.functional as F @@ -13,7 +11,6 @@ from functools import partial -# PyTorch 参考实现保持不变 def ref_program(Q, K, V, is_causal, groups=1): assert Q.size( 2) == K.size(2) * groups, f"Q heads {Q.size(2)} K heads {K.size(2)} groups {groups}" @@ -34,7 +31,7 @@ def ref_program(Q, K, V, is_causal, groups=1): return output -def get_v2_configs(): +def get_configs(): """Generates configurations for the autotuner, tailored for FA-2 style parallelism.""" block_M = [64, 128, 256] block_N = [32, 64, 128] @@ -69,9 +66,9 @@ def get_v2_configs(): return valid_configs -@tilelang.autotune(configs=get_v2_configs(), cache_input_tensors=True) +@tilelang.autotune(configs=get_configs(), cache_input_tensors=True) @tilelang.jit(out_idx=[3]) -def fast_flashattn_v2( +def fast_flashattn( batch, heads, seq_len, @@ -198,8 +195,7 @@ def main( return main -# main 函数保持不变 -def main_v2(batch: int = 1, +def main(batch: int = 1, heads: int = 8, seq_len: int = 4096, dim: int = 128, @@ -212,7 +208,7 @@ def main_v2(batch: int = 1, total_flops *= 0.5 print("Starting autotuning for FlashAttention-V2...") - kernel = fast_flashattn_v2(batch, heads, seq_len, dim, is_causal, groups=groups) + kernel = fast_flashattn(batch, heads, seq_len, dim, is_causal, groups=groups) print(f"Autotuning finished. Best Configuration: {kernel.config}") ref_program_processed = partial(ref_program, is_causal=is_causal, groups=groups) @@ -241,5 +237,5 @@ def main_v2(batch: int = 1, parser.add_argument('--is_causal', action='store_true', help='causal') parser.add_argument('--groups', type=int, default=1, help='groups') args = parser.parse_args() - main_v2(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.groups) + main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.groups) From 24e08ae417acde87ceaa450fb3b914af2c38038c Mon Sep 17 00:00:00 2001 From: xinyxiao Date: Wed, 30 Jul 2025 13:54:09 +0000 Subject: [PATCH 07/11] Refactor formatting in AMD FlashAttention example script - Improved code readability by adjusting line breaks and indentation in the `fast_flashattn` function. - Streamlined the `main` function parameter formatting for consistency. - Removed unnecessary blank lines to enhance overall code organization. --- examples/amd/example_amd_flash_attn_fwd.py | 27 +++++++++++----------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/examples/amd/example_amd_flash_attn_fwd.py b/examples/amd/example_amd_flash_attn_fwd.py index cecbf6337..97091b74b 100644 --- a/examples/amd/example_amd_flash_attn_fwd.py +++ b/examples/amd/example_amd_flash_attn_fwd.py @@ -1,7 +1,6 @@ # Copyright (c) Tile-AI Corporation. # Licensed under the MIT License. 
- import torch import torch.nn.functional as F import tilelang @@ -136,14 +135,15 @@ def main( Q_shared, coalesced_width=vec_size) - loop_end_k = T.ceildiv(q_block_offset + block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) - + loop_end_k = T.ceildiv(q_block_offset + block_M, + block_N) if is_causal else T.ceildiv(seq_len, block_N) + for k in T.Pipelined(loop_end_k, num_stages=num_stages): kv_idx = k * block_N - + T.copy( - K[bz, kv_idx:kv_idx + block_N, by // groups, :], - K_shared, + K[bz, kv_idx:kv_idx + block_N, by // groups, :], + K_shared, coalesced_width=vec_size) T.copy( V[bz, kv_idx:kv_idx + block_N, by // groups, :], @@ -155,8 +155,8 @@ def main( if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, acc_s[i, j], - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, + acc_s[i, j], -T.infinity(acc_s.dtype)) T.copy(m_i, m_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) @@ -196,11 +196,11 @@ def main( def main(batch: int = 1, - heads: int = 8, - seq_len: int = 4096, - dim: int = 128, - is_causal: bool = False, - groups: int = 1): + heads: int = 8, + seq_len: int = 4096, + dim: int = 128, + is_causal: bool = False, + groups: int = 1): flops_per_matmul = 2.0 * batch * heads * seq_len * seq_len * dim total_flops = 2 * flops_per_matmul @@ -238,4 +238,3 @@ def main(batch: int = 1, parser.add_argument('--groups', type=int, default=1, help='groups') args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.groups) - From bc2663a1af6b0112fcc5ccb8bf6faa163e68ab98 Mon Sep 17 00:00:00 2001 From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> Date: Thu, 31 Jul 2025 17:24:02 +0800 Subject: [PATCH 08/11] Update example_amd_flash_attn_fwd.py --- examples/amd/example_amd_flash_attn_fwd.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/amd/example_amd_flash_attn_fwd.py b/examples/amd/example_amd_flash_attn_fwd.py index 97091b74b..aaf7f8ee1 100644 --- a/examples/amd/example_amd_flash_attn_fwd.py +++ b/examples/amd/example_amd_flash_attn_fwd.py @@ -1,6 +1,3 @@ -# Copyright (c) Tile-AI Corporation. -# Licensed under the MIT License. - import torch import torch.nn.functional as F import tilelang From 4d427d9e9a7e62cde4cc75a98c5f34dfb1226308 Mon Sep 17 00:00:00 2001 From: xinyxiao Date: Mon, 18 Aug 2025 13:25:55 +0000 Subject: [PATCH 09/11] Enhance AMD example script and update CI workflows - Improved the `example_amd_flash_attn_fwd.py` script for better clarity and organization. - Added new CI workflows for AMD and documentation publishing. - Updated various requirements files to include necessary dependencies. - Introduced new test cases and examples for better coverage and functionality. - Refactored existing code for improved readability and maintainability. 
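For reference while reading the kernel changes below: the reworked example keeps the running-softmax state (m_i, l_i, acc_o) in registers, folds the log2(e) factor into the score scale so that exp2 can replace exp, applies the causal mask by initializing masked scores to -inf before the GEMM, and rescales the old state whenever a new key/value block raises the running row maximum. A per-row restatement of that update in plain PyTorch follows; the function name and signature are mine, for illustration only, and the tile-level version with shared memory and fragments is what the diff itself implements.

import torch


def online_softmax_update(m_i, l_i, acc_o, scores_blk, V_blk, scale):
    # scale = (1 / sqrt(dim)) * 1.44269504, matching the kernel, so exp2 reproduces softmax's exp.
    m_new = torch.maximum(m_i, scores_blk.max(dim=-1).values)    # running row maximum
    sf = torch.exp2(m_i * scale - m_new * scale)                 # rescale factor for the old state
    p = torch.exp2(scores_blk * scale - m_new[:, None] * scale)  # unnormalized probabilities
    l_new = l_i * sf + p.sum(dim=-1)                             # running softmax denominator
    acc_new = acc_o * sf[:, None] + p @ V_blk.to(p.dtype)        # running weighted sum of V (cast keeps dtypes consistent)
    return m_new, l_new, acc_new                                 # final output is acc_o / l_i per row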
--- .clang-tidy | 3 +- .github/workflows/amd_ci.yml | 121 + .github/workflows/ci.yml | 47 +- .github/workflows/publish_docs.yml | 5 +- 3rdparty/tvm | 2 +- CMakeLists.txt | 17 +- benchmark/matmul/benchmark_matmul.py | 5 +- .../matmul/benchmark_matmul_intrinsic.py | 5 +- benchmark/matmul_fp8/benchmark_matmul.py | 8 +- docs/deeplearning_operators/gemv.md | 2 +- examples/amd/example_amd_flash_attn_fwd.py | 81 +- examples/analyze/example_conv_analyze.py | 6 +- examples/analyze/example_gemm_analyze.py | 5 +- examples/bitnet-1.58b/modeling_bitnet.py | 2 +- examples/bitnet-1.58b/requirements.txt | 2 +- ...xample_tilelang_sparse_gqa_decode_paged.py | 3 +- ...ilelang_sparse_gqa_decode_varlen_indice.py | 2 + ..._tilelang_sparse_gqa_decode_varlen_mask.py | 2 + ..._triton_sparse_gqa_decode_varlen_indice.py | 2 + ...le_triton_sparse_gqa_decode_varlen_mask.py | 4 +- ...ample_group_per_split_token_cast_to_fp8.py | 4 +- .../cast/example_per_token_cast_to_fp8.py | 4 +- examples/convolution/example_convolution.py | 4 +- .../example_convolution_autotune.py | 9 +- .../example_deepgemm_fp8_2xAcc.py | 8 +- examples/deepseek_mla/example_mla_decode.py | 11 +- .../deepseek_mla/example_mla_decode_paged.py | 9 +- .../experimental/example_mla_decode_kv_fp8.py | 2 +- .../example_dequant_gemm_fp4_hopper.py | 24 +- .../example_dequant_gemm_mxfp4_hopper.py | 424 ++++ .../test_example_dequantize_gemm.py | 7 + examples/flash_attention/bert_padding.py | 213 ++ .../flash_attention/example_mha_fwd_varlen.py | 7 +- .../flash_decoding/example_mha_inference.py | 2 +- .../fusedmoe/example_fusedmoe_tilelang.py | 2 - examples/gdn/README.md | 11 + examples/gdn/example_chunk_delta_bwd.py | 577 +++++ examples/gdn/example_chunk_delta_h.py | 368 +++ examples/gdn/example_chunk_o.py | 239 ++ examples/gdn/example_chunk_o_bwd.py | 539 +++++ examples/gdn/example_chunk_scaled_dot_kkt.py | 201 ++ examples/gdn/example_cumsum.py | 171 ++ examples/gdn/example_wy_fast.py | 233 ++ examples/gdn/example_wy_fast_bwd_split.py | 536 +++++ examples/gdn/test_example_gdn_compilation.py | 206 ++ examples/gdn/utils.py | 40 + examples/gemm/example_gemm_autotune.py | 5 +- .../gemm_fp8/example_tilelang_gemm_fp8.py | 4 +- .../example_tilelang_gemm_fp8_2xAcc.py | 4 +- .../example_tilelang_gemm_fp8_intrinsic.py | 10 +- .../example_tilelang_gemm_splitk.py | 8 - ...ilelang_gemm_splitk_vectorize_atomicadd.py | 70 + .../gemm_splitk/test_example_gemm_splitk.py | 9 +- .../grouped_gemm/example_grouped_gemm_fwd.py | 14 +- .../example_warp_specialize_flashmla.py | 114 +- format.sh | 16 +- requirements-build.txt | 1 + requirements-dev.txt | 1 - requirements-rocm.txt | 29 + requirements-test.txt | 5 +- setup.py | 112 +- src/ir.cc | 47 +- src/layout/layout.cc | 133 +- src/layout/layout.h | 4 +- src/layout/swizzle.cc | 5 +- src/layout/swizzle.h | 2 +- src/layout/utils.cc | 8 +- src/layout/utils.h | 9 + src/op/atomic_add.cc | 247 +++ src/op/atomic_add.h | 62 + src/op/builtin.cc | 36 +- src/op/builtin.h | 26 +- src/op/bulk_copy.cc | 23 +- src/op/bulk_copy.h | 6 +- src/op/elem.cc | 27 +- src/op/elem.h | 19 + src/op/gemm.cc | 234 +- src/op/gemm.h | 15 +- src/op/gemm_sp.cc | 14 +- src/op/gemm_sp.h | 4 + src/op/logical.cc | 2 +- src/op/math.cc | 2 +- src/op/op.cc | 5 - src/op/op.h | 13 +- src/op/parallel.cc | 128 +- src/op/parallel.h | 22 + src/op/reduce.cc | 50 +- src/op/reduce.h | 8 + src/runtime/runtime.cc | 140 +- src/target/codegen_cpp.cc | 19 +- src/target/codegen_cpp.h | 2 +- src/target/codegen_cuda.cc | 429 +++- src/target/codegen_cuda.h | 15 +- src/target/codegen_hip.cc | 15 
+- src/target/codegen_webgpu.cc | 24 +- src/target/rt_mod_cpp.cc | 7 +- src/target/rt_mod_cuda.cc | 32 +- src/target/rt_mod_hip.cc | 44 +- src/target/utils.cc | 16 +- src/target/utils.h | 2 + src/tl_templates/cpp/half.hpp | 26 +- src/tl_templates/cuda/common.h | 11 +- src/tl_templates/cuda/copy_sm90.h | 171 +- src/tl_templates/cuda/cuda_bf16_fallbacks.cuh | 257 +++ src/tl_templates/cuda/cuda_bf16_wrapper.h | 23 + src/tl_templates/cuda/debug.h | 2 +- src/tl_templates/cuda/gemm.h | 4 +- src/tl_templates/cuda/gemm_mma.h | 458 ++++ src/tl_templates/cuda/gemm_sm120.h | 3 + src/tl_templates/cuda/gemm_sm80.h | 388 +--- src/tl_templates/cuda/gemm_sm89.h | 404 +--- src/tl_templates/cuda/gemm_sm90.h | 360 ++- src/tl_templates/hip/reduce.h | 3 +- ...align_dynamic_shared_memory_allocations.cc | 8 +- src/transform/annotate_device_regions.cc | 10 +- src/transform/atomicadd_vectorize.cc | 283 +++ src/transform/atomicadd_vectorize.h | 23 + src/transform/cluster_planning.cc | 27 +- .../common/loop_vectorization_utils.h | 6 +- src/transform/common/union_find.h | 52 + src/transform/config_index_bitwidth.cc | 100 +- .../eliminate_storage_sync_for_mbarrier.cc | 10 +- src/transform/flatten_buffer.cc | 74 +- src/transform/frontend_legalize.cc | 7 +- src/transform/if_stmt_binding.cc | 6 +- src/transform/inject_fence_proxy.cc | 7 +- src/transform/inject_pipeline.cc | 96 +- src/transform/inject_ptx_async_copy.cc | 7 +- src/transform/inject_tma_barrier.cc | 206 +- src/transform/layout_inference.cc | 398 ++-- src/transform/legalize_safe_memory_access.cc | 10 +- src/transform/legalize_vectorized_loop.cc | 8 +- src/transform/loop_vectorize.cc | 34 +- src/transform/loop_vectorize_dynamic.cc | 12 +- src/transform/lower_device_kernel_launch.cc | 418 ++++ .../lower_device_storage_access_info.cc | 10 +- src/transform/lower_hopper_intrin.cc | 41 +- .../lower_l2_persistent_annotation.cc | 7 +- src/transform/lower_opaque_block.cc | 238 ++ src/transform/lower_shared_barrier.cc | 8 +- src/transform/lower_thread_allreduce.cc | 953 ++++++++ src/transform/lower_tile_op.cc | 22 +- src/transform/make_packed_api.cc | 65 +- src/transform/merge_if_stmt.cc | 42 +- .../merge_shared_memory_allocations.cc | 101 +- .../multi_version_buffer_rewriter.cc | 32 +- src/transform/persist_threadblock.cc | 7 +- src/transform/pipeline_planning.cc | 326 +-- src/transform/simplify.cc | 82 +- src/transform/storage_rewrite.cc | 1968 +++++++++++++++++ src/transform/thread_partial_sync.cc | 45 +- src/transform/thread_storage_sync.cc | 12 +- src/transform/vectorize_loop.cc | 26 +- src/transform/warp_specialized_rewriter.cc | 367 ++- src/transform/wgmma_sync_rewriter.cc | 28 +- .../amd/test_tilelang_gemm_mfma_intrinsic.py | 1 - testing/python/cpu/test_tilelang_cpu_gemm.py | 2 + .../test_tilelang_kernel_bf16_gemm_mma.py | 9 +- .../test_tilelang_kernel_deepseek_nsa.py | 324 --- .../test_tilelang_kernel_dequantize_gemm.py | 5 +- .../kernel/test_tilelang_kernel_fp8_gemm.py | 4 +- .../test_tilelang_kernel_fp8_gemm_mma.py | 10 +- .../test_tilelang_kernel_fp8_gemv_simt.py | 4 +- ...test_tilelang_kernel_gemm_mma_intrinsic.py | 10 +- .../test_tilelang_kernel_gemm_with_stride.py | 86 + .../kernel/test_tilelang_kernel_gemv_simt.py | 4 +- .../test_tilelang_kernel_int4_gemm_mma.py | 14 +- .../language/test_tilelang_language_alias.py | 4 +- .../test_tilelang_language_annotate_pad.py | 1 - .../language/test_tilelang_language_copy.py | 49 +- .../test_tilelang_language_pipeline.py | 224 ++ .../test_tilelang_language_reshape.py | 53 +- .../test_tilelang_primitives_mma.py | 2 - 
.../test_tilelang_tilelibrary_gemm_sp.py | 237 ++ ...lang_transform_Inject_software_pipeline.py | 37 +- ...est_tilelang_transform_cluster_planning.py | 2 +- ...test_tilelang_transform_make_packed_api.py | 190 +- ...tilelang_transform_multi_version_buffer.py | 4 +- ...st_tilelang_transform_pipeline_planning.py | 8 +- .../test_tilelang_transform_thread_sync.py | 105 +- .../test_tilelang_transform_vectorize_loop.py | 538 ----- ...est_tilelang_transform_warp_specialized.py | 4 +- testing/python/utils/test_compress_utils.py | 62 + tilelang/__init__.py | 4 +- tilelang/_ffi_api.py | 4 +- tilelang/autotuner/param.py | 35 +- tilelang/autotuner/tuner.py | 4 +- tilelang/cache/kernel_cache.py | 31 +- tilelang/carver/analysis.py | 2 +- tilelang/carver/arch/__init__.py | 1 + tilelang/carver/arch/cuda.py | 6 +- tilelang/carver/matmul_analysis.py | 16 +- tilelang/carver/roller/policy/tensorcore.py | 5 +- tilelang/contrib/cc.py | 2 +- tilelang/contrib/dlpack.py | 6 +- tilelang/contrib/hipcc.py | 6 +- tilelang/contrib/nvcc.py | 32 +- tilelang/contrib/nvrtc.py | 8 +- tilelang/contrib/rocm.py | 10 +- tilelang/engine/lower.py | 35 +- tilelang/engine/phase.py | 27 +- tilelang/env.py | 10 +- tilelang/intrinsics/mfma_macro_generator.py | 4 +- tilelang/intrinsics/mma_macro_generator.py | 4 +- tilelang/intrinsics/utils.py | 2 +- tilelang/jit/__init__.py | 2 +- tilelang/jit/adapter/ctypes/adapter.py | 35 +- tilelang/jit/adapter/cython/adapter.py | 108 +- .../jit/adapter/cython/cython_wrapper.pyx | 94 +- tilelang/jit/adapter/libgen.py | 29 +- tilelang/jit/adapter/nvrtc/adapter.py | 8 +- tilelang/jit/adapter/wrapper.py | 46 +- tilelang/jit/env.py | 6 +- tilelang/jit/kernel.py | 7 + tilelang/language/__init__.py | 1 + tilelang/language/ast/_ffi_api.py | 4 +- tilelang/language/ast/ir.py | 50 +- tilelang/language/copy.py | 73 +- tilelang/language/customize.py | 122 +- tilelang/language/fill.py | 9 +- tilelang/language/frame.py | 2 +- tilelang/language/gemm.py | 52 +- tilelang/language/kernel.py | 2 +- tilelang/language/logical.py | 21 +- tilelang/language/memscope.py | 4 +- tilelang/language/parser/operation.py | 6 +- tilelang/language/proxy.py | 71 +- tilelang/language/tir/entry.py | 6 +- tilelang/language/tir/op.py | 2 +- tilelang/language/warpgroup.py | 2 +- tilelang/layout/fragment.py | 6 +- tilelang/layout/layout.py | 2 +- tilelang/quantize/quantization.py | 2 +- tilelang/transform/__init__.py | 33 +- tilelang/transform/_ffi_api.py | 4 +- tilelang/transform/pass_config.py | 7 + tilelang/utils/language.py | 20 +- tilelang/utils/tensor.py | 12 +- tilelang/version.py | 10 +- 239 files changed, 13993 insertions(+), 3877 deletions(-) create mode 100644 .github/workflows/amd_ci.yml create mode 100644 examples/dequantize_gemm/example_dequant_gemm_mxfp4_hopper.py create mode 100644 examples/flash_attention/bert_padding.py create mode 100644 examples/gdn/README.md create mode 100644 examples/gdn/example_chunk_delta_bwd.py create mode 100644 examples/gdn/example_chunk_delta_h.py create mode 100644 examples/gdn/example_chunk_o.py create mode 100644 examples/gdn/example_chunk_o_bwd.py create mode 100644 examples/gdn/example_chunk_scaled_dot_kkt.py create mode 100644 examples/gdn/example_cumsum.py create mode 100644 examples/gdn/example_wy_fast.py create mode 100644 examples/gdn/example_wy_fast_bwd_split.py create mode 100644 examples/gdn/test_example_gdn_compilation.py create mode 100644 examples/gdn/utils.py create mode 100644 examples/gemm_splitk/example_tilelang_gemm_splitk_vectorize_atomicadd.py create mode 100644 
requirements-rocm.txt create mode 100644 src/op/atomic_add.cc create mode 100644 src/op/atomic_add.h create mode 100644 src/tl_templates/cuda/cuda_bf16_fallbacks.cuh create mode 100644 src/tl_templates/cuda/cuda_bf16_wrapper.h create mode 100644 src/tl_templates/cuda/gemm_mma.h create mode 100644 src/tl_templates/cuda/gemm_sm120.h create mode 100644 src/transform/atomicadd_vectorize.cc create mode 100644 src/transform/atomicadd_vectorize.h create mode 100644 src/transform/common/union_find.h create mode 100644 src/transform/lower_device_kernel_launch.cc create mode 100644 src/transform/lower_opaque_block.cc create mode 100644 src/transform/lower_thread_allreduce.cc create mode 100644 src/transform/storage_rewrite.cc delete mode 100644 testing/python/kernel/test_tilelang_kernel_deepseek_nsa.py create mode 100644 testing/python/kernel/test_tilelang_kernel_gemm_with_stride.py create mode 100644 testing/python/language/test_tilelang_language_pipeline.py create mode 100644 testing/python/tilelibrary/test_tilelang_tilelibrary_gemm_sp.py delete mode 100644 testing/python/transform/test_tilelang_transform_vectorize_loop.py create mode 100644 testing/python/utils/test_compress_utils.py diff --git a/.clang-tidy b/.clang-tidy index eb18181b7..742c99986 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -3,7 +3,8 @@ Checks: > cppcoreguidelines-*, modernize-*, performance-*, - readability-* + readability-*, + -readability-identifier-length WarningsAsErrors: '*' HeaderFilterRegex: '^(?!.*(3rdparty|build)).*$' diff --git a/.github/workflows/amd_ci.yml b/.github/workflows/amd_ci.yml new file mode 100644 index 000000000..b45cb7c74 --- /dev/null +++ b/.github/workflows/amd_ci.yml @@ -0,0 +1,121 @@ +name: CI Test on AMD +on: [pull_request] + +env: + PYTHON_VERSION: '3.12' + VENV_DIR: tilelang_ci + PYTORCH_INDEX_URL: https://download.pytorch.org/whl/nightly/rocm6.3/ + +jobs: + format-check: + runs-on: [self-hosted, amd, gpu] + + permissions: + contents: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Ensure venv (local & persistent) + run: | + set -e + REQS_HASH=$(sha256sum requirements-test.txt 2>/dev/null | awk '{print $1}' || echo "no_requirements") + MARKER="${{ runner.tool_cache }}/.venv_marker_${{ env.PYTHON_VERSION }}_${REQS_HASH:0:8}" + + if [[ -f "$MARKER" ]] && [[ -f "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" ]]; then + echo "venv exists and hash matches – reuse it" + else + echo "venv stale or missing – recreating" + rm -rf "${{ runner.tool_cache }}/${{ env.VENV_DIR }}" "$MARKER" + python -m venv "${{ runner.tool_cache }}/${{ env.VENV_DIR }}" + # shellcheck source=/dev/null + source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" + python -m pip install --upgrade pip --no-user + [[ -f requirements-test.txt ]] && \ + PIP_NO_BUILD_ISOLATION=1 pip install -r requirements-test.txt --no-user + pip install flash_attn==2.5.8 --no-user --no-build-isolation + touch "$MARKER" + fi + + - name: Run format check + run: | + source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" + if ! output=$(./format.sh 2>&1); then + echo "------------------------------------" + echo "message:" + echo "$output" + printf '%s\n' "$output" | grep "Please review and stage the changes." 
+ echo "------------------------------------" + exit 1 + fi + + - name: Commit and Push Changes + uses: stefanzweifel/git-auto-commit-action@v5 + with: + commit_message: "lint" + + build-test-amd: + runs-on: [self-hosted, amd, gpu] + needs: format-check + permissions: + contents: read + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + repository: ${{ github.event.pull_request.head.repo.full_name }} + ref: ${{ github.event.pull_request.head.ref }} + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Ensure venv (local & persistent) + run: | + echo "Running on AMD GPU" + set -e + rm -rf "${{ runner.tool_cache }}" + REQS_HASH=$(sha256sum requirements-rocm.txt | cut -d ' ' -f 1) + MARKER="${{ runner.tool_cache }}/.venv_marker_${{ env.PYTHON_VERSION }}_${REQS_HASH:0:8}" + + echo "Installing requirements" + if [[ -f "$MARKER" ]] && [[ -f "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" ]]; then + echo "venv exists and hash matches – reuse it" + else + echo "venv stale or missing – recreating" + rm -rf "${{ runner.tool_cache }}/${{ env.VENV_DIR }}" "$MARKER" + python -m venv "${{ runner.tool_cache }}/${{ env.VENV_DIR }}" + source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" + python -m pip install --upgrade pip --no-user + if [[ -f requirements-rocm.txt ]]; then + pip install --pre torch torchvision torchaudio --index-url ${{ env.PYTORCH_INDEX_URL }} + PIP_NO_BUILD_ISOLATION=1 pip install -r requirements-rocm.txt + fi + + USE_ROCM=True pip install . --no-user + touch "$MARKER" + fi + + - name: Install project (wheel form) + run: | + echo "Installing project (wheel form)" + source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" + USE_ROCM=True pip install . --no-user + + - name: Run tests + run: | + echo "Running tests" + source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" + cd testing/python/amd + unset PYTHONPATH + python -m pytest -v test_tilelang_test_amd.py \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 618f9059d..57bb76ff0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,16 +2,19 @@ name: CI on: [pull_request] env: - PYTHON_VERSION: '3.9' + PYTHON_VERSION: '3.12' VENV_DIR: tilelang_ci jobs: format-check: - runs-on: self-hosted + runs-on: [self-hosted, nvidia] + + permissions: + contents: write steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -23,7 +26,7 @@ jobs: - name: Ensure venv (local & persistent) run: | set -e - REQS_HASH=$(cat requirements-test.txt 2>/dev/null || true) + REQS_HASH=$(sha256sum requirements-test.txt 2>/dev/null | awk '{print $1}' || echo "no_requirements") MARKER="${{ runner.tool_cache }}/.venv_marker_${{ env.PYTHON_VERSION }}_${REQS_HASH:0:8}" if [[ -f "$MARKER" ]] && [[ -f "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" ]]; then @@ -37,27 +40,39 @@ jobs: python -m pip install --upgrade pip --no-user [[ -f requirements-test.txt ]] && \ PIP_NO_BUILD_ISOLATION=1 pip install -r requirements-test.txt --no-user - pip install . --no-user + pip install flash_attn==2.5.8 --no-user --no-build-isolation touch "$MARKER" fi - - name: Update submodules - run: git submodule update --init --recursive - - name: Run format check run: | source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" - ./format.sh + if ! 
output=$(./format.sh 2>&1); then + echo "------------------------------------" + echo "message:" + echo "$output" + printf '%s\n' "$output" | grep "Please review and stage the changes." + echo "------------------------------------" + exit 1 + fi + + - name: Commit and Push Changes + uses: stefanzweifel/git-auto-commit-action@v5 + with: + commit_message: "lint" - build-test: - runs-on: self-hosted + build-test-nvidia: + runs-on: [self-hosted, nvidia] needs: format-check - + permissions: + contents: read steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: fetch-depth: 0 + repository: ${{ github.event.pull_request.head.repo.full_name }} + ref: ${{ github.event.pull_request.head.ref }} - name: Set up Python uses: actions/setup-python@v2 @@ -80,6 +95,8 @@ jobs: python -m pip install --upgrade pip --no-user [[ -f requirements-test.txt ]] && \ PIP_NO_BUILD_ISOLATION=1 pip install -r requirements-test.txt --no-user + # flash attention usually requires no isolation build + pip install flash_attn==2.5.8 --no-user --no-build-isolation pip install . --no-user touch "$MARKER" fi @@ -94,11 +111,11 @@ jobs: source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" cd examples unset PYTHONPATH - python -m pytest -n 8 **/test*.py + python -m pytest -n 4 **/test*.py - name: Run tests run: | source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" cd testing/python unset PYTHONPATH - python -m pytest -n 8 + python -m pytest -n 4 diff --git a/.github/workflows/publish_docs.yml b/.github/workflows/publish_docs.yml index 6553e8414..3ca576eed 100644 --- a/.github/workflows/publish_docs.yml +++ b/.github/workflows/publish_docs.yml @@ -27,11 +27,10 @@ jobs: TARGET_REPO: ${{ secrets.TARGET_REPO }} TARGET_TOKEN: ${{ secrets.TARGET_TOKEN }} run: | + git clone https://github.com/${TARGET_REPO}.git -b main target_repo + cd target_repo git config --local user.name "github-actions[bot]" git config --local user.email "github-actions[bot]@users.noreply.github.com" - git clone https://github.com/${TARGET_REPO}.git target_repo - cd target_repo - git checkout main find . -mindepth 1 -maxdepth 1 ! -name ".github" ! -name "." ! -name ".git" -exec rm -rf {} + cp -r ../docs/_build/html/* ./ git add . 
diff --git a/3rdparty/tvm b/3rdparty/tvm index 979c8e7f9..a64a5926a 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ -Subproject commit 979c8e7f94473db7d71a41b26ccf51db7e17a734 +Subproject commit a64a5926a6e59f5417ef2501f9d88b467337cf6a diff --git a/CMakeLists.txt b/CMakeLists.txt index 5d1d1d4ad..712957dcf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,14 @@ endif() # Enable compile command export set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if(NOT Python_EXECUTABLE) + execute_process( + COMMAND which python + OUTPUT_VARIABLE Python_EXECUTABLE + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + set(Python_EXECUTABLE "${Python_EXECUTABLE}" CACHE FILEPATH "Path to the Python executable") +endif() # Define a custom macro for globbing files with conditional CONFIGURE_DEPENDS if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") @@ -39,7 +47,8 @@ else() # Set default build type to RelWithDebInfo if not provided if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "Build type" FORCE) + # Set default build type to Release if not provided + set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) message(STATUS "Setting default build type to ${CMAKE_BUILD_TYPE}") endif() endif() @@ -145,6 +154,7 @@ message(STATUS "TVM_SOURCE_DIR: ${TVM_SOURCE_DIR}") # Include directories for TileLang set(TILE_LANG_INCLUDES ${TVM_SOURCE_DIR}/include + ${TVM_SOURCE_DIR}/ffi/include ${TVM_SOURCE_DIR}/src ${TVM_SOURCE_DIR}/3rdparty/dlpack/include ${TVM_SOURCE_DIR}/3rdparty/dmlc-core/include @@ -212,6 +222,11 @@ if(CMAKE_BUILD_TYPE STREQUAL "Debug") target_compile_definitions(tilelang_static PRIVATE "TVM_LOG_DEBUG") endif() +# Building tvm_cython modules +if(NOT DEFINED TVM_PREBUILD_PATH) + add_dependencies(tilelang tvm_cython) +endif() + # Module shared library add_library(tilelang_module SHARED $) target_link_libraries(tilelang_module PUBLIC tvm) diff --git a/benchmark/matmul/benchmark_matmul.py b/benchmark/matmul/benchmark_matmul.py index d81f1af30..39063b6f2 100644 --- a/benchmark/matmul/benchmark_matmul.py +++ b/benchmark/matmul/benchmark_matmul.py @@ -53,10 +53,7 @@ def get_configs(args, kwargs): from tilelang.carver.roller.rasterization import NoRasterization import torch - if torch.version.hip is not None: - arch=CDNA("hip") - else: - arch = CUDA("cuda") + arch = CUDA("cuda") if torch.version.hip is None else CDNA("hip") topk = 10 carve_template = MatmulTemplate( diff --git a/benchmark/matmul/benchmark_matmul_intrinsic.py b/benchmark/matmul/benchmark_matmul_intrinsic.py index cd159ed25..3be28419a 100644 --- a/benchmark/matmul/benchmark_matmul_intrinsic.py +++ b/benchmark/matmul/benchmark_matmul_intrinsic.py @@ -187,10 +187,7 @@ def get_configs(args, kwargs): from tilelang.carver.roller.rasterization import NoRasterization import torch - if torch.version.hip is not None: - arch=CDNA("hip") - else: - arch = CUDA("cuda") + arch = CUDA("cuda") if torch.version.hip is None else CDNA("hip") topk = 10 carve_template = MatmulTemplate( diff --git a/benchmark/matmul_fp8/benchmark_matmul.py b/benchmark/matmul_fp8/benchmark_matmul.py index 5830e9537..3420f4ecc 100644 --- a/benchmark/matmul_fp8/benchmark_matmul.py +++ b/benchmark/matmul_fp8/benchmark_matmul.py @@ -54,10 +54,8 @@ def get_configs(args, kwargs): from tilelang.carver.roller.rasterization import NoRasterization import torch - if torch.version.hip is not None: - arch=CDNA("hip") - else: - arch = CUDA("cuda") + arch = CDNA("hip") if torch.version.hip is not None else CUDA("cuda") + topk = 10 carve_template = MatmulTemplate( @@ 
-158,7 +156,7 @@ def matmul( # Use half-precision for input data to reduce memory bandwidth, # accumulate in float for better numerical accuracy - dtype = "e4m3_float8" + dtype = "float8_e4m3" accum_dtype = "float" @T.prim_func diff --git a/docs/deeplearning_operators/gemv.md b/docs/deeplearning_operators/gemv.md index 0ceafe7ed..c75a961b8 100644 --- a/docs/deeplearning_operators/gemv.md +++ b/docs/deeplearning_operators/gemv.md @@ -252,7 +252,7 @@ def splitk_gemv_vectorized( return main ``` -With vectorized read, now the kernel finishs in **~0.0084 ms**, which is getting close to cuBLAS performance. +With vectorized read, now the kernel finishes in **~0.0084 ms**, which is getting close to cuBLAS performance. ## `tvm_thread_allreduce` Instead of `atomicAdd` diff --git a/examples/amd/example_amd_flash_attn_fwd.py b/examples/amd/example_amd_flash_attn_fwd.py index aaf7f8ee1..2bbbb3132 100644 --- a/examples/amd/example_amd_flash_attn_fwd.py +++ b/examples/amd/example_amd_flash_attn_fwd.py @@ -2,6 +2,7 @@ import torch.nn.functional as F import tilelang import tilelang.language as T +from tilelang.primitives.gemm.base import GemmWarpPolicy import itertools import argparse from functools import partial @@ -29,18 +30,24 @@ def ref_program(Q, K, V, is_causal, groups=1): def get_configs(): """Generates configurations for the autotuner, tailored for FA-2 style parallelism.""" - block_M = [64, 128, 256] - block_N = [32, 64, 128] - threads = [128, 256, 512] - num_split_q = [32, 64, 128] - num_stages = [0, 1, 2] - enable_rasterization = [True, False] - k_pack = [1, 2] + block_M = [32, 64, 128, 256] + block_N = [32, 64, 128, 256] + threads = [64, 128, 192, 256, 512, 1024] + num_split_q = [32, 64, 128, 256, 256] + num_stages = [0] + enable_rasterization = [True] + k_pack = [2] + panel_size = [7, 8, 9, 10] + qk_coalesced_width = [8] + v_coalesced_width = [4] valid_configs = [] - for m, n, s, t, stages, r, k in itertools.product(block_M, block_N, num_split_q, threads, - num_stages, enable_rasterization, k_pack): + for m, n, s, t, stages, r, k, p, qkw, vw in itertools.product(block_M, block_N, num_split_q, + threads, num_stages, + enable_rasterization, k_pack, + panel_size, qk_coalesced_width, + v_coalesced_width): valid_configs.append({ "block_M": m, "block_N": n, @@ -48,7 +55,10 @@ def get_configs(): "threads": t, "num_stages": stages, "enable_rasterization": r, - "k_pack": k + "k_pack": k, + "panel_size": p, + "qk_coalesced_width": qkw, + "v_coalesced_width": vw, }) valid_configs.append({ 'block_M': 64, @@ -57,7 +67,10 @@ def get_configs(): 'threads': 256, 'num_stages': 1, 'enable_rasterization': True, - 'k_pack': 2 + 'k_pack': 2, + 'panel_size': 64, + 'qk_coalesced_width': 8, + 'v_coalesced_width': 8, }) return valid_configs @@ -78,6 +91,9 @@ def fast_flashattn( num_stages: int, enable_rasterization: bool, k_pack: int, + panel_size: int, + qk_coalesced_width: int, + v_coalesced_width: int, ): scale = (1.0 / dim)**0.5 * 1.44269504 head_kv = heads // groups @@ -86,8 +102,8 @@ def fast_flashattn( dtype = "float16" accum_dtype = "float" - v_vec_size = 4 - vec_size = 4 * k_pack + vec_size = qk_coalesced_width + v_vec_size = v_coalesced_width @T.prim_func def main( @@ -97,7 +113,7 @@ def main( Output: T.Tensor(q_shape, dtype), ): with T.Kernel(num_split_q, batch * heads, threads=threads) as (b_split, byz_combined): - T.use_swizzle(10, enable=enable_rasterization) + T.use_swizzle(panel_size, enable=enable_rasterization) bz = byz_combined // heads by = byz_combined % heads @@ -105,9 +121,9 @@ def main( 
num_q_blocks = T.ceildiv(seq_len, block_M) bx = T.alloc_var("int32") - bx[0] = b_split + bx = b_split - with T.While(bx[0] < num_q_blocks): + with T.While(bx < num_q_blocks): acc_o = T.alloc_fragment([block_M, dim], accum_dtype) m_i = T.alloc_fragment([block_M], accum_dtype) l_i = T.alloc_fragment([block_M], accum_dtype) @@ -115,13 +131,14 @@ def main( T.fill(m_i, -T.infinity(accum_dtype)) T.fill(l_i, 0) - current_bx = bx[0] + current_bx = bx q_block_offset = current_bx * block_M Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) - P_shared = T.alloc_shared([block_M, block_N], dtype) + # Use register fragment for P instead of shared memory to reduce LDS usage + acc_s_cast = T.alloc_fragment([block_M, block_N], dtype) acc_s = T.alloc_fragment([block_M, block_N], accum_dtype) m_prev = T.alloc_fragment([block_M], accum_dtype) @@ -135,6 +152,8 @@ def main( loop_end_k = T.ceildiv(q_block_offset + block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) + row_sum = T.alloc_fragment([block_M], accum_dtype) + for k in T.Pipelined(loop_end_k, num_stages=num_stages): kv_idx = k * block_N @@ -147,13 +166,20 @@ def main( V_shared, coalesced_width=v_vec_size) - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, k_pack=k_pack) - if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, - acc_s[i, j], -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, 0, + -T.infinity(acc_s.dtype)) + else: + T.clear(acc_s) + T.gemm( + Q_shared, + K_shared, + acc_s, + transpose_B=True, + k_pack=k_pack, + policy=GemmWarpPolicy.FullRow, + ) T.copy(m_i, m_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) @@ -169,15 +195,14 @@ def main( for i, j in T.Parallel(block_M, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - m_i[i] * scale) - row_sum = T.alloc_fragment([block_M], accum_dtype) T.reduce_sum(acc_s, row_sum, dim=1) for i in T.Parallel(block_M): l_i[i] += row_sum[i] - T.copy(acc_s, P_shared) - T.sync_threads() + # Cast acc_s (accum_dtype) to dtype in registers and directly GEMM with V + T.copy(acc_s, acc_s_cast) - T.gemm(P_shared, V_shared, acc_o) + T.gemm(acc_s_cast, V_shared, acc_o, policy=GemmWarpPolicy.FullRow) l_inv = T.alloc_fragment([block_M], accum_dtype) for i in T.Parallel(block_M): @@ -187,7 +212,7 @@ def main( for i, j in T.Parallel(block_M, dim): Output[bz, q_block_offset + i, by, j] = acc_o[i, j] * l_inv[i] - bx[0] = current_bx + num_split_q + bx = current_bx + num_split_q return main diff --git a/examples/analyze/example_conv_analyze.py b/examples/analyze/example_conv_analyze.py index 1a19502a3..540fcf4b7 100644 --- a/examples/analyze/example_conv_analyze.py +++ b/examples/analyze/example_conv_analyze.py @@ -4,6 +4,7 @@ from tilelang.carver.arch import CDNA from tilelang.layout import make_swizzled_layout import torch + N = 64 C = 256 H = 512 @@ -95,10 +96,7 @@ def conv( def main(): my_func = kernel(N, C, H, W, F, K, S, D, P, 64, 128, 32, 3, 256) - if torch.version.hip is not None: - cuda_device=CDNA("hip") - else: - cuda_device = CUDA("cuda") + cuda_device = CUDA("cuda") if torch.version.hip is None else CDNA("hip") result = Analyzer.analysis(my_func, cuda_device) print(result) print(f"Analyzed FLOPs: {result.total_flops}") diff --git a/examples/analyze/example_gemm_analyze.py b/examples/analyze/example_gemm_analyze.py index d35936a2a..bfd934f6a 100644 --- 
a/examples/analyze/example_gemm_analyze.py +++ b/examples/analyze/example_gemm_analyze.py @@ -49,10 +49,7 @@ def matmul( def main(): my_func = kernel(128, 128, 32, 3, 128, True) - if torch.version.hip is not None: - cuda_device=CDNA("hip") - else: - cuda_device = CUDA("cuda") + cuda_device = CUDA("cuda") if torch.version.hip is None else CDNA("hip") result = Analyzer.analysis(my_func, cuda_device) print(f"Analyzed FLOPs: {result.total_flops}") diff --git a/examples/bitnet-1.58b/modeling_bitnet.py b/examples/bitnet-1.58b/modeling_bitnet.py index 22a985ce0..c78896c33 100644 --- a/examples/bitnet-1.58b/modeling_bitnet.py +++ b/examples/bitnet-1.58b/modeling_bitnet.py @@ -1373,7 +1373,7 @@ def prepare_inputs_for_generation(self, cache_length + input_ids.shape[1] > max_cache_length): attention_mask = attention_mask[:, -max_cache_length:] - position_ids = kwargs.get("position_ids", None) + position_ids = kwargs.get("position_ids") if attention_mask is not None and position_ids is None: # create position_ids on the fly for batch generation position_ids = attention_mask.long().cumsum(-1) - 1 diff --git a/examples/bitnet-1.58b/requirements.txt b/examples/bitnet-1.58b/requirements.txt index 6781384d4..67357781e 100644 --- a/examples/bitnet-1.58b/requirements.txt +++ b/examples/bitnet-1.58b/requirements.txt @@ -1,3 +1,3 @@ lm_eval==0.3.0 flash_attn -transformers==4.52.1 \ No newline at end of file +transformers==4.53.0 diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py index 5132fd187..02f9be8a0 100644 --- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py +++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py @@ -517,11 +517,12 @@ def main(args): output_sparse = sparse_attn.forward(Q, K_cache, V_cache, block_indices, cache_seqlens, block_table) - output_ref_fa = ref_program_fa(Q, K_cache, V_cache, cache_seqlens, block_table) + import flash_attn # noqa: F401 output_ref_torch = ref_program_torch_paged(Q, K_cache, V_cache, block_indices, cache_seqlens, block_table, page_block_size, block_N) + output_ref_fa = ref_program_fa(Q, K_cache, V_cache, cache_seqlens, block_table) # Check correctness if sparse_ratio == 0.0: max_diff = torch.max(torch.abs(output_sparse - output_ref_fa)).item() diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py index b9c996bf2..aeeb03cfa 100644 --- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py +++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py @@ -439,6 +439,8 @@ def main(batch=8, out = sparse_kernel(Q, K, V, block_indices, cache_seqlens) debug("output", ref, out, atol=1e-3, rtol=1e-3) + import flash_attn # noqa: F401 + ## latency reference for _ in range(10): ref = ref_program_fa(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py index 7d1c2f41b..b0607d79e 100644 --- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py +++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py @@ -419,6 +419,8 @@ def main(batch=8, out = model(Q, K, V, block_mask, cache_seqlens) debug("output", 
ref, out, atol=1e-3, rtol=1e-3) + import flash_attn # noqa: F401 + ## latency reference for _ in range(10): ref = ref_program_fa(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, diff --git a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py index a9de66c3b..85b72b775 100644 --- a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py +++ b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py @@ -449,6 +449,8 @@ def main(batch=64, print(f"Average time: {avg_time:.6f} seconds") # Measure performance of reference implementation + import flash_attn # noqa: F401 + start = time.time() for _ in range(1000): ref_program_fa(Q, K, V, cache_seqlens) diff --git a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_mask.py b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_mask.py index 95c40b735..348572526 100644 --- a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_mask.py +++ b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_mask.py @@ -429,10 +429,12 @@ def main(batch=64, print(f"Average time: {avg_time:.6f} seconds") print(f"Average flops: {avg_flops:.2f} GFLOPS") - # Measure performance of reference implementation + import flash_attn # noqa: F401 + start = time.time() for _ in range(1000): ref_program_fa(Q, K, V, cache_seqlens) + torch.cuda.synchronize() end = time.time() elapsed_time_ref = end - start diff --git a/examples/cast/example_group_per_split_token_cast_to_fp8.py b/examples/cast/example_group_per_split_token_cast_to_fp8.py index 0af10572e..52e78f807 100644 --- a/examples/cast/example_group_per_split_token_cast_to_fp8.py +++ b/examples/cast/example_group_per_split_token_cast_to_fp8.py @@ -17,7 +17,7 @@ def group_per_split_token_cast_to_fp8(M, M_max, N, BG, blk_m): @T.prim_func def group_per_split_token_cast(X: T.Tensor((M, N), dtype), batch_sizes: T.Tensor( - (BG,), "int32"), X_fp8: T.Tensor((BG, M_max, N), "e4m3_float8"), X_amax: T.Tensor( + (BG,), "int32"), X_fp8: T.Tensor((BG, M_max, N), "float8_e4m3"), X_amax: T.Tensor( (BG, M_max, T.ceildiv(N, group_size)), accum_dtype)): with T.Kernel( T.ceildiv(M_max, blk_m), T.ceildiv(N, group_size), BG, threads=128) as (bx, by, bz): @@ -28,7 +28,7 @@ def group_per_split_token_cast(X: T.Tensor((M, N), dtype), batch_sizes: T.Tensor y_amax_local = T.alloc_fragment((blk_m,), accum_dtype) y_s_local = T.alloc_fragment((blk_m,), accum_dtype) y_q_local = T.alloc_fragment((blk_m, group_size), accum_dtype) - y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), "e4m3_float8") + y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), "float8_e4m3") row_offset = T.alloc_local((1,), "int32") T.annotate_layout({ diff --git a/examples/cast/example_per_token_cast_to_fp8.py b/examples/cast/example_per_token_cast_to_fp8.py index c368b7606..dc4cdd6bc 100644 --- a/examples/cast/example_per_token_cast_to_fp8.py +++ b/examples/cast/example_per_token_cast_to_fp8.py @@ -15,7 +15,7 @@ def per_token_cast_to_fp8(M, N, blk_m): fp8_max = 448.0 @T.prim_func - def per_token_cast(X: T.Tensor((M, N), dtype), X_fp8: T.Tensor((M, N), "e4m3_float8"), + def per_token_cast(X: T.Tensor((M, N), dtype), X_fp8: T.Tensor((M, N), "float8_e4m3"), X_amax: T.Tensor((M, T.ceildiv(N, group_size)), dtype)): with T.Kernel(T.ceildiv(M, blk_m), T.ceildiv(N, group_size), threads=128) as (bx, by): row = bx @@ -24,7 +24,7 @@ def 
per_token_cast(X: T.Tensor((M, N), dtype), X_fp8: T.Tensor((M, N), "e4m3_flo y_amax_local = T.alloc_fragment((blk_m,), dtype) y_s_local = T.alloc_fragment((blk_m,), dtype) y_q_local = T.alloc_fragment((blk_m, group_size), dtype) - y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), "e4m3_float8") + y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), "float8_e4m3") T.annotate_layout({ y_local: diff --git a/examples/convolution/example_convolution.py b/examples/convolution/example_convolution.py index 07af24fb7..5ca0c3ccc 100644 --- a/examples/convolution/example_convolution.py +++ b/examples/convolution/example_convolution.py @@ -25,6 +25,7 @@ def main(A, B): return main +@tilelang.jit(out_idx=[2]) def convolution(N, C, H, @@ -116,8 +117,7 @@ def main(argv=None): block_k = 32 num_stages = 3 threads = 256 - program = convolution(N, C, H, W, F, K, S, D, P, block_m, block_n, block_k, num_stages, threads) - kernel = tilelang.compile(program, out_idx=[2]) + kernel = convolution(N, C, H, W, F, K, S, D, P, block_m, block_n, block_k, num_stages, threads) out_c = kernel(a, b) ref_c = ref_program(S, P, D)(a, b) diff --git a/examples/convolution/example_convolution_autotune.py b/examples/convolution/example_convolution_autotune.py index eba906513..1b7494016 100644 --- a/examples/convolution/example_convolution_autotune.py +++ b/examples/convolution/example_convolution_autotune.py @@ -32,10 +32,7 @@ def main(A, B): def get_configs(N, C, H, W, F, K, S, D, P, with_roller=False, topk=15): if with_roller: - if torch.version.hip is not None: - arch=CDNA("hip") - else: - arch = CUDA("cuda") + arch = CDNA("hip") if torch.version.hip is not None else CUDA("cuda") carve_template = ConvTemplate( N=N, C=C, @@ -102,6 +99,7 @@ def get_configs(N, C, H, W, F, K, S, D, P, with_roller=False, topk=15): def get_best_config(N, C, H, W, F, K, S, D, P, ref_prog, with_roller=False): + @tilelang.jit(out_idx=[2]) def kernel( block_M=None, block_N=None, @@ -212,6 +210,7 @@ def get_heuristic_config() -> dict: } +@tilelang.jit(out_idx=[2]) def convolution(N, C, H, @@ -302,7 +301,7 @@ def main(n: int = 128, kernel = result.kernel else: config = get_heuristic_config() - kernel = tilelang.compile(convolution(N, C, H, W, F, K, S, D, P, **config), out_idx=[2]) + kernel = convolution(N, C, H, W, F, K, S, D, P, **config) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Auto) tilelang_latency = profiler.do_bench() diff --git a/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py b/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py index 1f00bd36a..715f09a9b 100644 --- a/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py +++ b/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py @@ -20,8 +20,8 @@ def tl_gemm( accum_dtype, ): assert in_dtype in [ - "e4m3_float8", - ], "Currently only e4m3_float8 is supported" + "float8_e4m3", + ], "Currently only float8_e4m3 is supported" assert out_dtype in [ "bfloat16", "float32", @@ -179,11 +179,11 @@ def assert_tl_gemm_correctness(M, N, K, block_N, in_dtype, out_dtype, accum_dtyp def main(): - assert_tl_gemm_correctness(1024, 1024, 8192, 128, "e4m3_float8", "bfloat16", "float32") + assert_tl_gemm_correctness(1024, 1024, 8192, 128, "float8_e4m3", "bfloat16", "float32") if __name__ == "__main__": - for dtype in ["e4m3_float8"]: + for dtype in ["float8_e4m3"]: for out_dtype in ["bfloat16", "float32"]: for block_N in [16, 32, 64, 128]: assert_tl_gemm_correctness(1024, 1024, 8192, block_N, dtype, out_dtype, "float32") diff --git 
a/examples/deepseek_mla/example_mla_decode.py b/examples/deepseek_mla/example_mla_decode.py index d08f990ff..d3a07fa7c 100644 --- a/examples/deepseek_mla/example_mla_decode.py +++ b/examples/deepseek_mla/example_mla_decode.py @@ -8,8 +8,9 @@ @tilelang.jit(out_idx=[6]) -def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split): - scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e) +def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split, + softmax_scale): + scale = float(softmax_scale * 1.44269504) # log2(e) dtype = "float16" accum_dtype = "float" kv_group_num = heads // kv_head_num @@ -288,10 +289,12 @@ def main( pv_flops = 2 * batch * heads * kv_ctx * dim total_flops = qk_flops + pv_flops BLOCK_N = 64 - BLOCK_H = 64 + BLOCK_H = min(64, heads // kv_heads) num_split = 1 + softmax_scale = (dim + pe_dim)**-0.5 - kernel = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split) + kernel = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split, + softmax_scale) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) profiler.assert_allclose(ref_program, rtol=1e-4, atol=1e-4) latency = profiler.do_bench(warmup=500) diff --git a/examples/deepseek_mla/example_mla_decode_paged.py b/examples/deepseek_mla/example_mla_decode_paged.py index 6ad3d47b0..a4624a8b6 100644 --- a/examples/deepseek_mla/example_mla_decode_paged.py +++ b/examples/deepseek_mla/example_mla_decode_paged.py @@ -9,8 +9,8 @@ @tilelang.jit(out_idx=[8]) def mla_decode_tilelang(batch, h_q, h_kv, max_seqlen_pad, dv, dpe, block_N, block_H, num_split, - block_size): - scale = (1.0 / (dv + dpe))**0.5 * 1.44269504 # log2(e) + block_size, softmax_scale): + scale = float(softmax_scale * 1.44269504) # log2(e) dtype = "float16" accum_dtype = "float" kv_group_num = h_q // h_kv @@ -318,12 +318,13 @@ def run_tilelang_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s dpe = d - dv num_kv_splits = 1 BLOCK_N = 64 - BLOCK_H = 64 + BLOCK_H = min(64, h_q // h_kv) + softmax_scale = (d + dv)**-0.5 out_partial = torch.empty(b, h_q, num_kv_splits, dv, dtype=dtype, device=q.device) glse = torch.empty(b, h_q, num_kv_splits, dtype=dtype, device=q.device) kernel = mla_decode_tilelang(b, h_q, h_kv, max_seqlen_pad, dv, dpe, BLOCK_N, BLOCK_H, - num_kv_splits, block_size) + num_kv_splits, block_size, softmax_scale) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) def flash_mla_tilelang(): diff --git a/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py b/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py index 0d8368169..c5fdebd72 100644 --- a/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py +++ b/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py @@ -11,7 +11,7 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H): scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e) dtype = "float16" - q_dtype = "e4m3_float8" + q_dtype = "float8_e4m3" accum_dtype = "float" kv_group_num = heads // kv_head_num VALID_BLOCK_H = min(block_H, kv_group_num) diff --git a/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py b/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py index 668f58a96..f36f02908 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py +++ b/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py @@ -12,16 +12,18 @@ def 
_tir_u8_to_f4_to_f16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, dtype: assert dtype == "float16" assert val.dtype == "uint8" # e_f4 == 0 -> e_f16 = 0 - # e_f4 != 0 -> e_f16 = e_f4 + 8 = e_f4 | (1000)_2 - # s1e2n1 + # e_f4 != 0 -> e_f16 = e_f4 + ExponentialBias(f16, f4) = e_f4 + (2^4 - 2^1) = e_f4 + 14 + # s1e2m1 mask = tir.const((1 << nbit) - 1, "uint16") f4 = (val >> (pos.astype("uint16") * tir.const(nbit, "uint16"))) & mask s = f4 >> tir.const(3, "uint16") - e_f4 = f4 & tir.const(7, "uint16") - e_f16 = e_f4 | tir.const(8, "uint16") - val_f16 = tir.reinterpret( - "float16", - ((e_f16 | (s << tir.const(5, "uint16"))) << tir.const(10, "uint16")).astype("uint16")) + e_f4 = (f4 & tir.const(6, "uint16")) >> tir.const(1, "uint16") + e_f16 = e_f4 + tir.const(14, "uint16") + m_f4 = f4 & tir.const(1, "uint16") + m_f16 = m_f4 + val_f16 = tir.reinterpret("float16", + ((e_f16 | (s << tir.const(5, "uint16"))) << tir.const(10, "uint16") + | m_f16 << tir.const(9, "uint16")).astype("uint16")) # return tir.Select(e_f4 == tir.const(0, "uint32"), tir.const(0, "float16"), val_f16) return val_f16 @@ -39,9 +41,11 @@ def _convert(val, pos): mask = (1 << 4) - 1 f4 = ((val >> (pos * 4)) & mask).to(torch.int16) s = f4 >> 3 - e_f4 = f4 & 7 - e_f16 = e_f4 | 8 - val_f16 = ((e_f16 | (s << 5)) << 10) & 0xFFFF + e_f4 = (f4 & 6) >> 1 + e_f16 = e_f4 + 14 + m_f4 = f4 & 1 + m_f16 = m_f4 + val_f16 = (((e_f16 | (s << 5)) << 10) | (m_f16 << 9)) & 0xFFFF lower_16_bits = (val_f16 & 0xFFFF).to(torch.uint16) return lower_16_bits.view(torch.float16) diff --git a/examples/dequantize_gemm/example_dequant_gemm_mxfp4_hopper.py b/examples/dequantize_gemm/example_dequant_gemm_mxfp4_hopper.py new file mode 100644 index 000000000..bc318a860 --- /dev/null +++ b/examples/dequantize_gemm/example_dequant_gemm_mxfp4_hopper.py @@ -0,0 +1,424 @@ +import tilelang +import tilelang.language as T +from tilelang.autotuner import * +from tvm import tir +import argparse +import itertools +import torch + +tilelang.disable_cache() + +torch.manual_seed(0) + + +def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, scale: tir.PrimExpr, + dtype: str): + assert nbit == 4 + assert dtype == "bfloat16" + assert val.dtype == "uint8" + mask = tir.const((1 << nbit) - 1, "uint16") + f4 = (val >> (pos.astype("uint16") * tir.const(nbit, "uint16"))) & mask + s = f4 >> tir.const(3, "uint16") + e_f4 = (f4 & tir.const(6, "uint16")) >> tir.const(1, "uint16") + # Exponential bias between f4 and bf16 is 2^(8-1) - 2^(2-1) = 126 + e_bf16 = e_f4 + tir.const(126, "uint16") + # Scale is the exponential part, within the representation of uint8 + # To handle the overflow, we use the max function to limit the exponential part to 8 bits + e_bf16 = T.min(e_bf16 + scale, tir.const((1 << 8) - 1, "uint16")) + m_f4 = f4 & tir.const(1, "uint16") + val_bf16 = tir.reinterpret("bfloat16", + ((((s << tir.const(8, "uint16")) | e_bf16) << tir.const(7, "uint16")) + | (m_f4 << tir.const(6, "uint16"))).astype("uint16")) + return val_bf16 + + +def torch_convert(tensor, scale_size=None, Scale=None): + + def print_bit(name, val): + val_cpu = val.cpu().item() + binary_repr = f'{val_cpu:032b}' + print(name, binary_repr) + + def _convert(val, pos, scale=None): + assert val.dtype == torch.uint8 + # val = val.view(torch.int8) + mask = (1 << 4) - 1 + f4 = ((val >> (pos * 4)) & mask).to(torch.int16) + s = f4 >> 3 + e_f4 = (f4 & 6) >> 1 + e_f16 = e_f4 + 126 + if scale is not None: + e_f16 = min(e_f16 + scale, (1 << 8) - 1) + m_f4 = f4 & 1 + m_f16 = m_f4 + val_f16 = (((e_f16 | (s << 
8)) << 7) | (m_f16 << 6)) & 0xFFFF + lower_16_bits = (val_f16 & 0xFFFF).to(torch.uint16) + return lower_16_bits.view(torch.bfloat16) + + N = tensor.shape[0] + K = tensor.shape[1] + new_tensor = torch.empty(N, K * 2, dtype=torch.bfloat16, device=tensor.device) + for i in range(new_tensor.shape[0]): + for j in range(new_tensor.shape[1]): + if scale_size is not None: + new_tensor[i][j] = _convert(tensor[i][j // 2], j % 2, Scale[i][j // scale_size]) + else: + new_tensor[i][j] = _convert(tensor[i][j // 2], j % 2) + return new_tensor + + +@tilelang.jit(out_idx=[-1]) +def convert(N, K, block_N, block_K, in_dtype, num_bits=4, threads=128): + num_elems_per_byte = 8 // num_bits + storage_dtype = "uint8" + B_shape = (N, K // num_elems_per_byte) + B_shared_shape = (block_N, block_K // num_elems_per_byte) + B_dequantize_shared_shape = (block_N, block_K) + + @T.prim_func + def main( + B: T.Tensor(B_shape, storage_dtype), + C: T.Tensor((N, K), in_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), threads=threads) as (bx): + B_shared = T.alloc_shared(B_shared_shape, storage_dtype) + B_local = T.alloc_fragment(B_shared_shape, storage_dtype) + B_dequantize_local = T.alloc_fragment(B_dequantize_shared_shape, in_dtype) + + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=1): + T.copy(B[bx * block_N, k * block_K // num_elems_per_byte], B_shared) + T.copy(B_shared, B_local) + for i, j in T.Parallel(block_N, block_K): + B_dequantize_local[i, j] = _tir_u8_to_f4_to_bf16( + num_bits, + B_local[i, j // num_elems_per_byte], + j % num_elems_per_byte, + 0, # No scale for test + dtype=in_dtype, + ) + T.copy(B_dequantize_local, C[bx * block_N, k * block_K]) + + return main + + +@tilelang.jit(out_idx=[-1]) +def convert_scale(N, K, block_N, block_K, in_dtype, num_bits=4, scale_size=32, threads=128): + num_elems_per_byte = 8 // num_bits + storage_dtype = "uint8" + B_shape = (N, K // num_elems_per_byte) + B_shared_shape = (block_N, block_K // num_elems_per_byte) + B_dequantize_shared_shape = (block_N, block_K) + Scale_shape = (N, K // scale_size) + Scale_shared_shape = (block_N, block_K // scale_size) + + @T.prim_func + def main( + B: T.Tensor(B_shape, storage_dtype), + Scale: T.Tensor(Scale_shape, storage_dtype), + C: T.Tensor((N, K), in_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), threads=threads) as (bx): + B_shared = T.alloc_shared(B_shared_shape, storage_dtype) + B_local = T.alloc_fragment(B_shared_shape, storage_dtype) + B_dequantize_local = T.alloc_fragment(B_dequantize_shared_shape, in_dtype) + Scale_shared = T.alloc_shared(Scale_shared_shape, storage_dtype) + Scale_local = T.alloc_fragment(Scale_shared_shape, storage_dtype) + + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=1): + T.copy(B[bx * block_N, k * block_K // num_elems_per_byte], B_shared) + T.copy(B_shared, B_local) + T.copy(Scale[bx * block_N, k * block_K // scale_size], Scale_shared) + T.copy(Scale_shared, Scale_local) + for i, j in T.Parallel(block_N, block_K): + B_dequantize_local[i, j] = _tir_u8_to_f4_to_bf16( + num_bits, + B_local[i, j // num_elems_per_byte], + j % num_elems_per_byte, + Scale_local[ + i, j // + scale_size], # Scale is the exponential part, within the representation of uint8 + dtype=in_dtype, + ) + T.copy(B_dequantize_local, C[bx * block_N, k * block_K]) + + return main + + +def test_fp4_bf16_convert_close(): + N, K = 256, 256 + block_N, block_K = 64, 64 + kernel = convert( + N, + K, + block_N, + block_K, + "bfloat16", + ) + + B = torch.randint(0, 16, (N, K // 2), dtype=torch.uint8, device="cuda").to(torch.uint8) 
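+    # Each uint8 in B packs two e2m1 nibbles, low nibble first. Under the mapping in
+    # _tir_u8_to_f4_to_bf16 with scale fixed to 0, nibble 0b0001 (sign 0, exponent 00,
+    # mantissa 1) becomes a bf16 with biased exponent 126 and mantissa 0.5, i.e.
+    # 2^-1 * 1.5 = 0.75, while nibble 0b0000 maps to 0.5 because the zero case is not
+    # special-cased here.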
+ tl_out = kernel(B) + ref_out = torch_convert(B) + assert torch.allclose(tl_out, ref_out, rtol=0.01, atol=0.01), (tl_out, ref_out) + print("Convert Pass") + + +def test_fp4_bf16_convert_scale_close(): + N, K = 256, 256 + block_N, block_K = 64, 64 + kernel = convert_scale(N, K, block_N, block_K, "bfloat16", scale_size=32) + + B = torch.randint(0, 16, (N, K // 2), dtype=torch.uint8, device="cuda").to(torch.uint8) + Scale = torch.randint(0, 1, (N, K // 32), dtype=torch.uint8, device="cuda").to(torch.uint8) + tl_out = kernel(B, Scale) + ref_out = torch_convert(B, scale_size=32, Scale=Scale) + assert torch.allclose(tl_out, ref_out, rtol=0.01, atol=0.01), (tl_out, ref_out) + print("Convert Scale Pass") + + +def get_configs(): + block_M = [128] + block_N = [128, 256] + block_K = [128] + num_stages = [2] + threads = [256] + splits = [1] + _configs = list(itertools.product(block_M, block_N, block_K, num_stages, threads, splits)) + + configs = [{ + 'block_M': c[0], + 'block_N': c[1], + 'block_K': c[2], + 'num_stages': c[3], + 'threads': c[4], + 'split': c[5] + } for c in _configs] + return configs + + +def matmul(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, scale_size=32, tune=False): + + @tilelang.jit(out_idx=[-1]) + def kernel_func(block_M, block_N, block_K, num_stages, threads, split=1): + num_elems_per_byte = 8 // num_bits + storage_dtype = "uint8" + A_shape = (M, K) + B_shape = (N, K // num_elems_per_byte) + Scale_shape = (N, K // scale_size) + A_shared_shape = (block_M, block_K) + B_shared_shape = (block_N, block_K // num_elems_per_byte) + B_dequantize_shared_shape = (block_N, block_K) + Scale_shared_shape = (block_N, block_K // scale_size) + assert K % (block_K * split) == 0 + KK = K // split + + @T.prim_func + def main_split( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + Scale: T.Tensor(Scale_shape, storage_dtype), + Ct: T.Tensor((N, M), out_dtype), + ): + SplitC = T.alloc_buffer([ + split, (N + block_N - 1) // block_N * block_N, + (M + block_M - 1) // block_M * block_M + ], out_dtype) + with T.Kernel( + T.ceildiv(N, block_N), T.ceildiv(M, block_M), split, + threads=threads) as (bx, by, bz): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, storage_dtype) + B_local = T.alloc_fragment(B_shared_shape, storage_dtype) + B_dequantize_local = T.alloc_fragment(B_dequantize_shared_shape, in_dtype) + B_dequantize_prev_local = T.alloc_fragment(B_dequantize_shared_shape, in_dtype) + Ct_local = T.alloc_fragment((block_N, block_M), accum_dtype) + Ct_shared = T.alloc_shared((block_N, block_M), out_dtype) + Scale_shared = T.alloc_shared(Scale_shared_shape, storage_dtype) + Scale_local = T.alloc_fragment(Scale_shared_shape, storage_dtype) + + T.annotate_layout({ + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + Ct_shared: tilelang.layout.make_swizzled_layout(Ct_shared), + Scale_shared: tilelang.layout.make_swizzled_layout(Scale_shared), + }) + + T.clear(Ct_local) + for k in T.Pipelined(K // (block_K * split), num_stages=num_stages): + T.copy(A[by * block_M, KK * bz + k * block_K], A_shared) + T.copy(B[bx * block_N, (KK * bz + k * block_K) // num_elems_per_byte], B_shared) + T.copy(B_shared, B_local) + T.copy(Scale[bx * block_N, (KK * bz + k * block_K) // scale_size], Scale_shared) + T.copy(Scale_shared, Scale_local) + for i, j in T.Parallel(block_N, block_K): + B_dequantize_local[i, j] = _tir_u8_to_f4_to_bf16( + num_bits, + B_local[i, j // num_elems_per_byte], + j % num_elems_per_byte, + Scale_local[i, j // 
scale_size], + dtype=in_dtype, + ) + T.copy(B_dequantize_local, B_dequantize_prev_local) + T.gemm(B_dequantize_prev_local, A_shared, Ct_local, transpose_B=True) + T.copy(Ct_local, SplitC[bz, bx * block_N:(bx + 1) * block_N, + by * block_M:(by + 1) * block_M]) + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M)) as (bx, by): + acc = T.alloc_fragment((block_N, block_M), out_dtype) + T.clear(acc) + for k in range(split): + for i, j in T.Parallel(block_N, block_M): + acc[i, j] += SplitC[k, bx * block_N + i, by * block_M + j] + T.copy(acc, Ct[bx * block_N, by * block_M]) + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + Scale: T.Tensor(Scale_shape, storage_dtype), + Ct: T.Tensor((N, M), out_dtype), + ): + with T.Kernel( + T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, storage_dtype) + B_local = T.alloc_fragment(B_shared_shape, storage_dtype) + B_dequantize_local = T.alloc_fragment(B_dequantize_shared_shape, in_dtype) + B_dequantize_prev_local = T.alloc_fragment(B_dequantize_shared_shape, in_dtype) + Ct_local = T.alloc_fragment((block_N, block_M), accum_dtype) + Ct_shared = T.alloc_shared((block_N, block_M), out_dtype) + Scale_shared = T.alloc_shared((block_N, block_K // scale_size), storage_dtype) + Scale_local = T.alloc_fragment((block_N, block_K // scale_size), storage_dtype) + + T.annotate_layout({ + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + Ct_shared: tilelang.layout.make_swizzled_layout(Ct_shared), + Scale_shared: tilelang.layout.make_swizzled_layout(Scale_shared), + }) + + T.clear(Ct_local) + for k in T.Pipelined(K // block_K, num_stages=num_stages): + T.copy(A[by * block_M, k * block_K], A_shared) + T.copy(B[bx * block_N, k * block_K // num_elems_per_byte], B_shared) + T.copy(B_shared, B_local) + T.copy(Scale[bx * block_N, k * block_K // scale_size], Scale_shared) + T.copy(Scale_shared, Scale_local) + for i, j in T.Parallel(block_N, block_K): + B_dequantize_local[i, j] = _tir_u8_to_f4_to_bf16( + num_bits, + B_local[i, j // num_elems_per_byte], + j % num_elems_per_byte, + Scale_local[i, j // scale_size], + dtype=in_dtype, + ) + T.copy(B_dequantize_local, B_dequantize_prev_local) + T.gemm(B_dequantize_prev_local, A_shared, Ct_local, transpose_B=True) + T.copy(Ct_local, Ct_shared) + T.copy(Ct_shared, Ct[bx * block_N:(bx + 1) * block_N, + by * block_M:(by + 1) * block_M]) + + if split == 1: + return main + else: + return main_split + + if tune: + + @autotune( + configs=get_configs(), + keys=["block_M", "block_N", "block_K", "num_stages", "threads", "split"], + warmup=10, + rep=10) + @tilelang.jit(out_idx=[-1]) + def kernel(block_M=None, + block_N=None, + block_K=None, + num_stages=None, + threads=None, + split=None): + return kernel_func(block_M, block_N, block_K, num_stages, threads, split) + + return kernel() + else: + + def kernel(block_M, block_N, block_K, num_stages, threads, split=1): + return kernel_func(block_M, block_N, block_K, num_stages, threads, split) + + return kernel + + +def ref_program(A, qB): + dtypeC = "bfloat16" + B = torch_convert(qB) + C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) + C = C.to(torch.__getattribute__(dtypeC)) + return C.transpose(0, 1) + + +def ref_program_scale(A, qB, Scale): + dtypeC = "bfloat16" + B = torch_convert(qB, scale_size=32, Scale=Scale) + C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) + C = 
C.to(torch.__getattribute__(dtypeC)) + return C.transpose(0, 1) + + +def main(m=256, n=256, k=256, scale_size=32, tune=False): + total_flops = 2 * m * n * k + + if (not tune): + kernel = matmul( + m, + n, + k, + "bfloat16", + "bfloat16", + "float32", + num_bits=4, + scale_size=scale_size, + tune=tune)( + block_M=128, block_N=128, block_K=128, num_stages=2, threads=256, split=1) + profiler = kernel.get_profiler(tilelang.TensorSupplyType.Integer) + profiler.assert_allclose(ref_program_scale, rtol=0.01, atol=0.01) + print("All checks pass.") + latency = profiler.do_bench(ref_program_scale, warmup=500) + print("Ref: {:.2f} ms".format(latency)) + print("Ref: {:.2f} TFlops".format(total_flops / latency * 1e-9)) + latency = profiler.do_bench(warmup=500) + print("Tile-lang: {:.2f} ms".format(latency)) + print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) + else: + best_result = matmul( + m, + n, + k, + "bfloat16", + "bfloat16", + "float32", + num_bits=4, + scale_size=scale_size, + tune=tune) + best_latency = best_result.latency + best_config = best_result.config + print(f"Best latency: {best_latency}") + print(f"Best TFlops: {total_flops / best_latency * 1e-9}") + print(f"Best config: {best_config}") + + +def test_convert(): + test_fp4_bf16_convert_close() + test_fp4_bf16_convert_scale_close() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--m', type=int, default=256, help='M') + parser.add_argument('--n', type=int, default=256, help='N') + parser.add_argument('--k', type=int, default=256, help='K') + parser.add_argument( + '--scale_size', + type=int, + default=32, + help='scale size, the exponential part, within the representation of uint8') + parser.add_argument('--tune', action='store_true', help='tune configs') + args = parser.parse_args() + M, N, K = args.m, args.n, args.k + # test_convert() + main(M, N, K, args.scale_size, args.tune) diff --git a/examples/dequantize_gemm/test_example_dequantize_gemm.py b/examples/dequantize_gemm/test_example_dequantize_gemm.py index e662cbd66..6f66c799e 100644 --- a/examples/dequantize_gemm/test_example_dequantize_gemm.py +++ b/examples/dequantize_gemm/test_example_dequantize_gemm.py @@ -2,6 +2,7 @@ import example_dequant_gemv_fp16xint4 import example_dequant_gemm_fp4_hopper +import example_dequant_gemm_mxfp4_hopper @tilelang.testing.requires_cuda @@ -15,5 +16,11 @@ def test_example_dequant_gemm_fp4_hopper(): example_dequant_gemm_fp4_hopper.main() +@tilelang.testing.requires_cuda +@tilelang.testing.requires_cuda_compute_version_ge(9, 0) +def test_example_dequant_gemm_mxfp4_hopper(): + example_dequant_gemm_mxfp4_hopper.main() + + if __name__ == "__main__": tilelang.testing.main() diff --git a/examples/flash_attention/bert_padding.py b/examples/flash_attention/bert_padding.py new file mode 100644 index 000000000..7058fd773 --- /dev/null +++ b/examples/flash_attention/bert_padding.py @@ -0,0 +1,213 @@ +# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/padding.py +# ruff: noqa +import torch +import torch.nn.functional as F +from einops import rearrange, repeat + + +class IndexFirstAxis(torch.autograd.Function): + + @staticmethod + def forward(ctx, input, indices): + ctx.save_for_backward(indices) + assert input.ndim >= 2 + ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:] + second_dim = other_shape.numel() + # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing. 
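+        # Shape walkthrough: input is flattened to (first_axis_dim, second_dim), indices of
+        # shape (nnz,) is repeated to (nnz, second_dim), gather along dim 0 selects those rows,
+        # and the result is reshaped back to (nnz, *other_shape), matching the plain indexing
+        # in the commented-out line below.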
+ # return input[indices] + return torch.gather( + rearrange(input, "b ... -> b (...)"), 0, + repeat(indices, "z -> z d", d=second_dim)).reshape(-1, *other_shape) + + @staticmethod + def backward(ctx, grad_output): + (indices,) = ctx.saved_tensors + assert grad_output.ndim >= 2 + other_shape = grad_output.shape[1:] + grad_output = rearrange(grad_output, "b ... -> b (...)") + grad_input = torch.zeros( + [ctx.first_axis_dim, grad_output.shape[1]], + device=grad_output.device, + dtype=grad_output.dtype, + ) + # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing. + # grad_input[indices] = grad_output + grad_input.scatter_(0, repeat(indices, "z -> z d", d=grad_output.shape[1]), grad_output) + return grad_input.reshape(ctx.first_axis_dim, *other_shape), None + + +index_first_axis = IndexFirstAxis.apply + + +class IndexPutFirstAxis(torch.autograd.Function): + + @staticmethod + def forward(ctx, values, indices, first_axis_dim): + ctx.save_for_backward(indices) + assert indices.ndim == 1 + assert values.ndim >= 2 + output = torch.zeros( + first_axis_dim, *values.shape[1:], device=values.device, dtype=values.dtype) + # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing. + output[indices] = values + # output.scatter_(0, repeat(indices, 'z -> z d', d=values.shape[1]), values) + return output + + @staticmethod + def backward(ctx, grad_output): + (indices,) = ctx.saved_tensors + # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing. + grad_values = grad_output[indices] + # grad_values = torch.gather(grad_output, 0, repeat(indices, 'z -> z d', d=grad_output.shape[1])) + return grad_values, None, None + + +index_put_first_axis = IndexPutFirstAxis.apply + + +class IndexFirstAxisResidual(torch.autograd.Function): + + @staticmethod + def forward(ctx, input, indices): + ctx.save_for_backward(indices) + assert input.ndim >= 2 + ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:] + second_dim = other_shape.numel() + # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing. + output = input[indices] + # We don't want to reshape input (b ... -> b (...)) since it could change the channel_last + # memory format to channel_first. In other words, input might not be contiguous. + # If we don't detach, Pytorch complains about output being a view and is being modified inplace + return output, input.detach() + + @staticmethod + def backward(ctx, grad_output, grad_residual): + (indices,) = ctx.saved_tensors + assert grad_output.ndim >= 2 + other_shape = grad_output.shape[1:] + assert grad_residual.shape[1:] == other_shape + grad_input = grad_residual + # grad_input[indices] += grad_output + indices = indices.reshape(indices.shape[0], *((1,) * (grad_output.ndim - 1))) + indices = indices.expand_as(grad_output) + grad_input.scatter_add_(0, indices, grad_output) + return grad_input.reshape(ctx.first_axis_dim, *other_shape), None + + +index_first_axis_residual = IndexFirstAxisResidual.apply + + +def unpad_input(hidden_states, attention_mask): + """ + Arguments: + hidden_states: (batch, seqlen, ...) + attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid. + Return: + hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask. + cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states. 
+ max_seqlen_in_batch: int + """ + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the + # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim + # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to + # index with integer indices. Moreover, torch's index is a bit slower than it needs to be, + # so we write custom forward and backward to make it a bit faster. + return ( + index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices), + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_length): + """ + Supports concatenating short samples in one sequence. The attention_mask_in_length is utilized to mask other short samples. It helps efficient training of variant lengths-based samples (e.g., the supervised fine-tuning task in large language model). + The motivation for this function is explained [here](https://github.com/Dao-AILab/flash-attention/issues/432#issuecomment-1668822286). + + For example, if batch = 3 and seqlen = 6, the attention_mask_in_length is: + ``` + [ + [2, 3, 0, 0, 0, 0], + [3, 2, 0, 0, 0, 0], + [6, 0, 0, 0, 0, 0] + ] + ``` + , which refers to the 3D-attention mask: + ``` + [ + [ + [1, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0], + [0, 0, 1, 1, 0, 0], + [0, 0, 1, 1, 1, 0], + [0, 0, 0, 0, 0, 1] + ], + [ + [1, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0], + [0, 0, 0, 1, 0, 0], + [0, 0, 0, 1, 1, 0], + [0, 0, 0, 0, 0, 1] + ], + [ + [1, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0], + [1, 1, 1, 1, 0, 0], + [1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1] + ] + ] + ```. + + Arguments: + hidden_states: (batch, seqlen, ...) + attention_mask_in_length: (batch, seqlen), int, a nonzero number (e.g., 1, 2, 3, etc.) means length of concatenated sequence in b-th batch, and 0 means none. + Return: + hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask. + cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states. + max_seqlen_in_batch: int + """ + length = attention_mask_in_length.sum(dim=-1) + seqlen = attention_mask_in_length.size(-1) + attention_mask_2d = torch.arange( + seqlen, device=length.device, dtype=length.dtype).expand(len(length), + seqlen) < length.unsqueeze(1) + real_indices_idx = torch.nonzero(attention_mask_in_length.flatten(), as_tuple=False).flatten() + seqlens_in_batch = attention_mask_in_length.flatten()[real_indices_idx] + indices = torch.nonzero(attention_mask_2d.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the + # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim + # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to + # index with integer indices. 
Moreover, torch's index is a bit slower than it needs to be, + # so we write custom forward and backward to make it a bit faster. + return ( + index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices), + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def pad_input(hidden_states, indices, batch, seqlen): + """ + Arguments: + hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask. + indices: (total_nnz) + Return: + hidden_states: (batch, seqlen, ...) + """ + dim = hidden_states.shape[-1] + # output = torch.zeros((batch * seqlen), dim, device=hidden_states.device, dtype=hidden_states.dtype) + # output[indices] = hidden_states + output = index_put_first_axis(hidden_states, indices, batch * seqlen) + return rearrange(output, "(b s) ... -> b s ...", b=batch) diff --git a/examples/flash_attention/example_mha_fwd_varlen.py b/examples/flash_attention/example_mha_fwd_varlen.py index 197520ad7..83c8e29d5 100644 --- a/examples/flash_attention/example_mha_fwd_varlen.py +++ b/examples/flash_attention/example_mha_fwd_varlen.py @@ -7,7 +7,7 @@ import torch from einops import rearrange, repeat -from flash_attn.bert_padding import pad_input, unpad_input +from bert_padding import pad_input, unpad_input def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"): @@ -410,7 +410,10 @@ def main(batch: int = 2, heads: int = 16, seq_len: int = 256, dim: int = 32): key_padding_mask, causal=causal, ) + torch.testing.assert_close(out, out_ref, rtol=1e-2, atol=1e-2) + import flash_attn + fla_out_unpad = flash_attn.flash_attn_varlen_func( q_unpad, k_unpad, @@ -423,8 +426,8 @@ def main(batch: int = 2, heads: int = 16, seq_len: int = 256, dim: int = 32): causal=causal, ) fla_out = output_pad_fn(fla_out_unpad) - torch.testing.assert_close(out, out_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(out, fla_out, rtol=1e-2, atol=1e-2) + print("Assert Equal Passed") diff --git a/examples/flash_decoding/example_mha_inference.py b/examples/flash_decoding/example_mha_inference.py index 503d71218..9089c08c3 100644 --- a/examples/flash_decoding/example_mha_inference.py +++ b/examples/flash_decoding/example_mha_inference.py @@ -44,7 +44,7 @@ def MMA0( @T.macro def MMA1( V: T.Tensor(shape_kv, dtype), - V_shared: T.SharedBuffer([block_M, dim], dtype), + V_shared: T.SharedBuffer([block_N, dim], dtype), acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), k: T.int32, diff --git a/examples/fusedmoe/example_fusedmoe_tilelang.py b/examples/fusedmoe/example_fusedmoe_tilelang.py index 6ee1c130b..b8baf8eb1 100644 --- a/examples/fusedmoe/example_fusedmoe_tilelang.py +++ b/examples/fusedmoe/example_fusedmoe_tilelang.py @@ -7,8 +7,6 @@ from tilelang.autotuner import * from example_fusedmoe_torch import * -# tilelang.disable_cache() - @tilelang.jit(pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) def moe_forward_tilelang_shared(d_hidden, diff --git a/examples/gdn/README.md b/examples/gdn/README.md new file mode 100644 index 000000000..086cdea61 --- /dev/null +++ b/examples/gdn/README.md @@ -0,0 +1,11 @@ +# Gated Delta Net(GDN) kernel implementation in TileLang + +## Requirement + +### The Tilelang version for test is 0.1.5+17fafc1b3026d910a83eb8052fdf811ba56be0b1 + +### We currently use triton=3.3.0 and FLA commit id=f03cb3ae for comparison + +## Get started + +### The common/chunk_delta_h.py implements the most critical forward kernel of GDN. 
It's a good start to understand the GDN logic and the tilelang optimization \ No newline at end of file diff --git a/examples/gdn/example_chunk_delta_bwd.py b/examples/gdn/example_chunk_delta_bwd.py new file mode 100644 index 000000000..9c77abb4e --- /dev/null +++ b/examples/gdn/example_chunk_delta_bwd.py @@ -0,0 +1,577 @@ +# Reference: fla/ops/common/chunk_delta_h.py + +import sys # noqa: F401 + +import tilelang +import tilelang.language as T + +print(tilelang.__file__, flush=True) + +# Add your fla repository path to sys.path +# Currently we use the fla repository from the flash-linear-attention project at commit id f03cb3ae +# sys.path.insert(0, "/home/tzj/flash-linear-attention") +try: + import fla + print(fla.__file__, flush=True) + from fla.ops.common.chunk_delta_h import chunk_gated_delta_rule_bwd_dhu +except ImportError: + print("fla not found, using tilelang implementation") + fla = None + +import torch +import torch.nn.functional as F + +torch.random.manual_seed(0) +# torch.set_printoptions(profile="full") + +tilelang.disable_cache() + +from utils import * + + +def prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, +): + Q = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + K = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + K = F.normalize(K, dim=-1, p=2) + W = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + # Note: G should be in logspace and do chunkwise cumsum + G = torch.randn(B, S, H, dtype=gate_dtype).cuda() + G = F.logsigmoid(G) + try: + from fla.ops.utils.cumsum import chunk_local_cumsum + G = chunk_local_cumsum(G, chunk_size) + except ImportError: + print("fla not found, skip cumsum") + + h0 = torch.randn(B, H, DK, DV, dtype=input_dtype).cuda() + dht = torch.randn(B, H, DK, DV, dtype=input_dtype).cuda() + dO = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + dv = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + return Q, K, W, G, h0, dht, dO, dv + + +def prepare_input_fake( + B, + S, + H, + DK, + DV, + chunk_size, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, +): + Q = torch.ones(B, S, H, DK, dtype=input_dtype).cuda() + K = torch.ones(B, S, H, DK, dtype=input_dtype).cuda() + W = torch.ones(B, S, H, DK, dtype=input_dtype).cuda() + G = torch.ones(B, S, H, dtype=gate_dtype).cuda() + h0 = torch.ones(B, H, DK, DV, dtype=input_dtype).cuda() + dht = torch.ones(B, H, DK, DV, dtype=input_dtype).cuda() + dO = torch.ones(B, S, H, DV, dtype=input_dtype).cuda() + dv = torch.ones(B, S, H, DV, dtype=input_dtype).cuda() + return Q, K, W, G, h0, dht, dO, dv + + +def prepare_output( + B, + S, + H, + DK, + DV, + chunk_size, + output_dtype, + gate_dtype, + state_dtype, +): + BS = S // chunk_size + dh = torch.empty(B, BS, H, DK, DV, dtype=output_dtype).cuda() + dh0 = torch.empty(B, H, DK, DV, dtype=state_dtype).cuda() + dv2 = torch.empty(B, S, H, DV, dtype=output_dtype).cuda() + return dh, dh0, dv2 + + +def torch_chunk_gated_delta_rule_bwd_dhu( + Q: torch.Tensor, + K: torch.Tensor, + W: torch.Tensor, + G: torch.Tensor, + h0: torch.Tensor, + dht: torch.Tensor, + dO: torch.Tensor, + dv: torch.Tensor, + scale: float, + use_g: bool, + use_initial_state: bool, + use_final_state_gradient: bool, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, +): + B, S, H, DK = Q.shape + DV = dv.shape[-1] + block_S = 64 + BS = S // block_S + dh, dh0, dv2 = torch.empty((B, BS, H, DK, DV), dtype=output_dtype), torch.empty( + (B, H, DK, DV), dtype=state_dtype), 
torch.empty((B, S, H, DV), dtype=output_dtype) + dh_tmp = torch.empty((B, H, DK, DV), dtype=accum_dtype) + dv_tmp = torch.empty((B, S, H, DV), dtype=accum_dtype) + Q_tmp = torch.empty((B, S, H, DK), dtype=accum_dtype) + + if use_final_state_gradient: + dh_tmp = dht.clone().to(accum_dtype) + else: + dh_tmp = torch.zeros_like(dht).to(accum_dtype) + + for i_s in range(BS - 1, -1, -1): + dh[:, i_s, :, :, :] = dh_tmp + dv_tmp = torch.matmul(K[:, i_s * block_S:(i_s + 1) * block_S, :, :].permute(0, 2, 1, 3), + dh_tmp.to(K.dtype)).permute(0, 2, 1, 3) + if use_g: + for i_bh in range(B * H): + i_b, i_h = i_bh // H, i_bh % H + for i_s2 in range(block_S): + if G[i_b, i_s * block_S + block_S - 1, i_h] - G[i_b, i_s * block_S + i_s2, + i_h] <= 0: + dv_tmp[i_b, i_s2, + i_h, :] *= torch.exp(G[i_b, i_s * block_S + block_S - 1, i_h] - + G[i_b, i_s * block_S + i_s2, i_h]) + else: + dv_tmp[i_b, i_s2, i_h, :] = 0 + dv_tmp += dv[:, i_s * block_S:(i_s + 1) * block_S, :, :] + dv2[:, i_s * block_S:(i_s + 1) * block_S, :, :] = dv_tmp + + if use_g: + G_last = G[:, i_s * block_S + block_S - 1, :] + for i_bh in range(B * H): + i_b, i_h = i_bh // H, i_bh % H + dh_tmp[i_b, i_h, :, :] *= torch.exp(G_last[i_b, i_h]) + Q_tmp = Q[:, i_s * block_S:(i_s + 1) * block_S, :, :] + for i_s2 in range(block_S): + for i_k in range(DK): + Q_tmp[:, i_s2, :, i_k] *= torch.exp(G[:, i_s * block_S + i_s2, :]) + Q_tmp *= scale + W_tmp = W[:, i_s * block_S:(i_s + 1) * block_S, :, :] + dO_tmp = dO[:, i_s * block_S:(i_s + 1) * block_S, :, :] + + torch.backends.cuda.matmul.allow_tf32 = True + dh_tmp += torch.matmul(Q_tmp.permute(0, 2, 3, 1), dO_tmp.permute(0, 2, 1, 3)) + dh_tmp -= torch.matmul(W_tmp.permute(0, 2, 3, 1), dv_tmp.permute(0, 2, 1, 3)) + torch.backends.cuda.matmul.allow_tf32 = False + + if use_initial_state: + dh0 = dh_tmp[:, :, :, :] + else: + dh0 = torch.zeros_like(dh_tmp[:, :, :, :]) + print(dh0.dtype) + + return dh, dh0, dv2 + + +@tilelang.jit(out_idx=[-3, -2, -1]) +def tilelang_chunk_gated_delta_rule_bwd_dhu( + # task config + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + scale, + use_g=True, + use_initial_state=True, + use_final_state_gradient=True, + # kernel config + block_DV=64, + threads=256, + num_stages=0, +): + block_S = chunk_size + # Should support cu_seqlen + BS = S // block_S + + Q_shape = (B, S, H, DK) + K_shape = (B, S, H, DK) + W_shape = (B, S, H, DK) + G_shape = (B, S, H) + h0_shape = (B, H, DK, DV) + dht_shape = (B, H, DK, DV) + dO_shape = (B, S, H, DV) + dv_shape = (B, S, H, DV) + + dh_shape = (B, BS, H, DK, DV) + dh0_shape = (B, H, DK, DV) + dv2_shape = (B, S, H, DV) + + @T.prim_func + def kernel( + # Input + Q: T.Tensor(Q_shape, dtype=input_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + W: T.Tensor(W_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + h0: T.Tensor(h0_shape, dtype=input_dtype), + dht: T.Tensor(dht_shape, dtype=input_dtype), + dO: T.Tensor(dO_shape, dtype=input_dtype), + dv: T.Tensor(dv_shape, dtype=input_dtype), + # Output + dh: T.Tensor(dh_shape, dtype=output_dtype), + dh0: T.Tensor(dh0_shape, dtype=state_dtype), + dv2: T.Tensor(dv2_shape, dtype=output_dtype), + ): + with T.Kernel(T.ceildiv(DV, block_DV), B * H, threads=threads) as (bv, bbh): + bb, bh = bbh // H, bbh % H + + b_dh_shared = T.alloc_shared((DK, block_DV), dtype=output_dtype) + b_dh_shared_fp32 = T.alloc_shared((DK, block_DV), dtype=state_dtype) + b_dh_fragment = T.alloc_fragment((DK, block_DV), dtype=accum_dtype) + b_dh_fragment_1 = 
T.alloc_fragment((DK, block_DV), dtype=accum_dtype) + b_dh_fragment_2 = T.alloc_fragment((DK, block_DV), dtype=accum_dtype) + dv_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + dv_fragment = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) + dv_fragment_2 = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) + dO_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + dO_shared_t = T.alloc_shared((block_DV, block_S), dtype="float32") + dO_fragment = T.alloc_fragment((block_S, block_DV), dtype="float32") + dO_fragment_t = T.alloc_fragment((block_DV, block_S), dtype="float32") + K_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) + + Q_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) + Q_shared_fp32 = T.alloc_shared((block_S, DK), dtype="float32") + W_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) + + G_last_local = T.alloc_local((1), dtype=gate_dtype) + G_last_local_exp = T.alloc_local((1), dtype=gate_dtype) + G_shared = T.alloc_shared((block_S), dtype=gate_dtype, scope="shared") + G_fragment = T.alloc_fragment((block_S), dtype=gate_dtype) + G_fragment_post = T.alloc_fragment((block_S), dtype=gate_dtype) + G_fragment_exp = T.alloc_fragment((block_S), dtype=gate_dtype) + Q_fragment = T.alloc_fragment((block_S, DK), dtype=accum_dtype) + Q_fragment_t = T.alloc_fragment((DK, block_S), dtype=accum_dtype) + + T.use_swizzle(10) + + T.annotate_layout({ + b_dh_shared: tilelang.layout.make_swizzled_layout(b_dh_shared), + b_dh_shared_fp32: tilelang.layout.make_swizzled_layout(b_dh_shared_fp32), + dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), + dO_shared: tilelang.layout.make_swizzled_layout(dO_shared), + dO_shared_t: tilelang.layout.make_swizzled_layout(dO_shared_t), + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), + Q_shared_fp32: tilelang.layout.make_swizzled_layout(Q_shared_fp32), + W_shared: tilelang.layout.make_swizzled_layout(W_shared), + }) + + if use_final_state_gradient: + T.copy(dht[bb, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV], b_dh_shared) + T.copy(b_dh_shared, b_dh_fragment) + else: + T.clear(b_dh_fragment) + + for i_s in T.Pipelined(T.ceildiv(S, block_S), num_stages=num_stages): + # The gradient should be stored in the reverse order + i_s_inv = T.ceildiv(S, block_S) - i_s - 1 + + # Store the updated dh + T.copy(b_dh_fragment, b_dh_shared) + T.copy(b_dh_shared, dh[bb, i_s_inv, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV]) + + # Update dv + T.copy(K[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, 0:DK], K_shared) + T.gemm(K_shared, b_dh_shared, dv_fragment, clear_accum=True) + + if use_g: + T.copy( + G[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh], + G_shared, + disable_tma=True) + T.copy(G_shared, G_fragment) + G_last_local[0] = G_shared[block_S - 1] + G_last_local_exp[0] = T.exp(G_last_local[0]) + for i_s2 in T.Parallel(block_S): + G_fragment_post[i_s2] = T.exp(G_last_local[0] - G_fragment[i_s2]) + for i_s2, i_v in T.Parallel(block_S, block_DV): + # with T.If(G_last_local[0] - G_shared[i_s2] <= 0): + with T.If(G_last_local[0] - G_fragment[i_s2] <= 0): + with T.Then(): + dv_fragment[i_s2, + i_v] = dv_fragment[i_s2, i_v] * G_fragment_post[i_s2] + with T.Else(): + dv_fragment[i_s2, i_v] = 0 + + T.copy( + dv[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, + bv * block_DV:(bv + 1) * block_DV], dv_shared) + T.copy(dv_shared, dv_fragment_2) + for i_s2, i_v in T.Parallel(block_S, block_DV): + dv_fragment[i_s2, i_v] = 
dv_fragment[i_s2, i_v] + dv_fragment_2[i_s2, i_v] + + # Store the updated dv + T.copy(dv_fragment, dv_shared) + T.copy( + dv_shared, dv2[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, + bv * block_DV:(bv + 1) * block_DV]) + + # Update dh + T.copy(Q[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, 0:DK], Q_shared) + T.copy(W[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, 0:DK], W_shared) + + T.clear(Q_fragment) + if use_g: + for i_k, i_v in T.Parallel(DK, block_DV): + b_dh_fragment[i_k, i_v] *= G_last_local_exp[0] + T.copy(Q_shared, Q_fragment) + for i_s2 in T.Parallel(block_S): + G_fragment_exp[i_s2] = T.exp(G_shared[i_s2]) + for i_s2, i_k in T.Parallel(block_S, DK): + # Q_fragment[i_s2, i_k] = Q_fragment[i_s2, i_k] * T.exp(G_shared[i_s2]) * scale + Q_fragment[i_s2, i_k] = Q_fragment[i_s2, i_k] * G_fragment_exp[i_s2] * scale + else: + T.copy(Q_shared, Q_fragment) + for i_s2, i_k in T.Parallel(block_S, DK): + Q_fragment[i_s2, i_k] = Q_fragment[i_s2, i_k] * scale + # Get transpose of Q_fragment to meet tf32 gemm requirement + for i_s2, i_k in T.Parallel(block_S, DK): + Q_fragment_t[i_k, i_s2] = Q_fragment[i_s2, i_k] + + T.copy( + dO[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, + bv * block_DV:(bv + 1) * block_DV], dO_shared) + T.copy(dO_shared, dO_fragment) + for i_s2, i_v in T.Parallel(block_S, block_DV): + dO_fragment_t[i_v, i_s2] = dO_fragment[i_s2, i_v] + T.copy(dO_fragment_t, dO_shared_t) + + T.clear(b_dh_fragment_1) + T.gemm(Q_fragment_t, dO_shared_t, b_dh_fragment_1, transpose_B=True) + T.clear(b_dh_fragment_2) + T.gemm(W_shared, dv_shared, b_dh_fragment_2, transpose_A=True) + for i_k, i_v in T.Parallel(DK, block_DV): + b_dh_fragment[i_k, i_v] += b_dh_fragment_1[i_k, i_v] - b_dh_fragment_2[i_k, i_v] + + if use_initial_state: + T.copy(b_dh_fragment, dh0[bb, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV]) + + return kernel + + +def test_result(dh_0, dh0_0, dv2_0, dh_1, dh0_1, dv2_1, name): + try: + torch.testing.assert_close(dh_0, dh_1, rtol=1e-2, atol=1e-2, equal_nan=True) + print(f"{name} dh_0 and dh_1 passed for {name}") + except Exception as e: + print(f"{name} dh_0 and dh_1 are not close for {name}") + print(e, end="\n\n") + try: + torch.testing.assert_close(dh0_0, dh0_1, rtol=1e-2, atol=1e-2, equal_nan=True) + print(f"{name} dh0_0 and dh0_1 passed for {name}") + except Exception as e: + print(f"{name} dh0_0 and dh0_1 are not close for {name}") + print(e, end="\n\n") + try: + torch.testing.assert_close(dv2_0, dv2_1, rtol=1e-2, atol=1e-2, equal_nan=True) + print(f"{name} dv2_0 and dv2_1 passed for {name}") + except Exception as e: + print(f"{name} dv2_0 and dv2_1 are not close for {name}") + print(e, end="\n\n") + + close = torch.isclose(dh_0, dh_1, rtol=1e-2, atol=1e-2) + mismatch_indices = torch.nonzero(~close, as_tuple=True) + error_num = 0 + for indices in zip(*mismatch_indices): + if error_num < 100: + print( + f"{name} dh_0[{[idx.item() for idx in indices]}] = {dh_0[indices[0].item(), indices[1].item(), indices[2].item(), indices[3].item(), indices[4].item()]}, dh_1[{[idx.item() for idx in indices]}] = {dh_1[indices[0].item(), indices[1].item(), indices[2].item(), indices[3].item(), indices[4].item()]}" + ) + error_num += 1 + close = torch.isclose(dh0_0, dh0_1, rtol=1e-2, atol=1e-2) + mismatch_indices = torch.nonzero(~close, as_tuple=True) + error_num = 0 + for indices in zip(*mismatch_indices): + if error_num < 100: + print( + f"{name} dh0_0[{[idx.item() for idx in indices]}] = {dh0_0[indices[0].item(), indices[1].item(), indices[2].item(), indices[3].item()]}, 
dh0_1[{[idx.item() for idx in indices]}] = {dh0_1[indices[0].item(), indices[1].item(), indices[2].item(), indices[3].item()]}" + ) + error_num += 1 + close = torch.isclose(dv2_0, dv2_1, rtol=1e-2, atol=1e-2) + mismatch_indices = torch.nonzero(~close, as_tuple=True) + error_num = 0 + for indices in zip(*mismatch_indices): + if error_num < 100: + print( + f"{name} dv2_0[{[idx.item() for idx in indices]}] = {dv2_0[indices[0].item(), indices[1].item(), indices[2].item(), indices[3].item()]}, dv2_1[{[idx.item() for idx in indices]}] = {dv2_1[indices[0].item(), indices[1].item(), indices[2].item(), indices[3].item()]}" + ) + error_num += 1 + + +def run_test( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + scale, + use_g=True, + use_initial_state=True, + use_final_state_gradient=True, + block_DV=64, + threads=256, + num_stages=0, + use_torch=False, +): + Q, K, W, G, h0, dht, dO, dv = prepare_input(B, S, H, DK, DV, chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype)) + dh_ref, dh0_ref, dv2_ref = prepare_output(B, S, H, DK, DV, chunk_size, + getattr(torch, output_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype)) + dh_tilelang, dh0_tilelang, dv2_tilelang = prepare_output(B, S, H, DK, DV, chunk_size, + getattr(torch, output_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype)) + + # fla ref + print("fla running...", flush=True) + if use_g: + dh_ref, dh0_ref, dv2_ref = chunk_gated_delta_rule_bwd_dhu(Q, K, W, G, h0, dht, dO, dv, + scale) + else: + G = G.fill_(0) + dh_ref, dh0_ref, dv2_ref = chunk_gated_delta_rule_bwd_dhu(Q, K, W, G, h0, dht, dO, dv, + scale) + + # tilelang + print("tilelang running...", flush=True) + kernel = tilelang_chunk_gated_delta_rule_bwd_dhu(B, S, H, DK, DV, input_dtype, output_dtype, + accum_dtype, gate_dtype, state_dtype, + chunk_size, scale, use_g, use_initial_state, + use_final_state_gradient, block_DV, threads, + num_stages) + # kernel = tilelang.compile(program) + print(kernel.get_kernel_source()) + dh_tilelang, dh0_tilelang, dv2_tilelang = kernel(Q, K, W, G, h0, dht, dO, dv) + + fla_time = do_bench( + chunk_gated_delta_rule_bwd_dhu, Q, K, W, G, h0, dht, dO, dv, scale, chunk_size=chunk_size) + tilelang_time = do_bench(kernel, Q, K, W, G, h0, dht, dO, dv) + + print(f"fla time: {fla_time} ms") + print(f"tilelang time: {tilelang_time} ms") + + assert_similar(dh_tilelang, dh_ref, 1e-5, "fla-tilelang", data="dh") + assert_similar(dh0_tilelang, dh0_ref, 1e-5, "fla-tilelang", data="dh0") + assert_similar(dv2_tilelang, dv2_ref, 1e-5, "fla-tilelang", data="dv2") + + # torch ref + if use_torch: + print("torch running...", flush=True) + if use_g: + dh_ref_torch, dh0_ref_torch, dv2_ref_torch = torch_chunk_gated_delta_rule_bwd_dhu( + Q, K, W, G, h0, dht, dO, dv, scale, use_g, use_initial_state, + use_final_state_gradient, getattr(torch, input_dtype), getattr(torch, output_dtype), + getattr(torch, accum_dtype), getattr(torch, + gate_dtype), getattr(torch, state_dtype)) + dh_ref_torch = dh_ref_torch.cuda() + dh0_ref_torch = dh0_ref_torch.cuda() + dv2_ref_torch = dv2_ref_torch.cuda() + else: + dh_ref_torch, dh0_ref_torch, dv2_ref_torch = torch_chunk_gated_delta_rule_bwd_dhu( + Q, K, W, None, h0, dht, dO, dv, scale, use_g, use_initial_state, + use_final_state_gradient, getattr(torch, input_dtype), getattr(torch, output_dtype), + getattr(torch, accum_dtype), getattr(torch, + 
gate_dtype), getattr(torch, state_dtype)) + dh_ref_torch = dh_ref_torch.cuda() + dh0_ref_torch = dh0_ref_torch.cuda() + dv2_ref_torch = dv2_ref_torch.cuda() + + assert_similar(dh_ref_torch, dh_ref, 1e-5, "torch-fla", data="dh") + assert_similar(dh0_ref_torch, dh0_ref, 1e-5, "torch-fla", data="dh0") + assert_similar(dv2_ref_torch, dv2_ref, 1e-5, "torch-fla", data="dv2") + assert_similar(dh_ref_torch, dh_tilelang, 1e-5, "torch-tilelang", data="dh") + assert_similar(dh0_ref_torch, dh0_tilelang, 1e-5, "torch-tilelang", data="dh0") + assert_similar(dv2_ref_torch, dv2_tilelang, 1e-5, "torch-tilelang", data="dv2") + + +def do_bench(fn, *args, warmup=10, rep=10, **kwargs): + """ + Do benchmark for a function. + """ + start_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] + end_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] + for _ in range(warmup): + fn(*args, **kwargs) + + torch.cuda.synchronize() + for i in range(rep): + start_event[i].record() + fn(*args, **kwargs) + end_event[i].record() + torch.cuda.synchronize() + + # Record clocks + times = torch.tensor( + [s.elapsed_time(e) for s, e in zip(start_event, end_event)], + dtype=torch.float, + ) + + return times.mean().item() + + +def main(): + DK = 128 + run_test( + B=1, + S=32768, + H=8, + DK=DK, + DV=128, + input_dtype="bfloat16", + output_dtype="bfloat16", + accum_dtype="float32", + gate_dtype="float32", + state_dtype="float32", + chunk_size=64, + scale=DK**-0.5, + use_g=True, + use_initial_state=True, + use_final_state_gradient=True, + block_DV=32, + threads=128, + num_stages=1, + use_torch=False, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/gdn/example_chunk_delta_h.py b/examples/gdn/example_chunk_delta_h.py new file mode 100644 index 000000000..dd37e3935 --- /dev/null +++ b/examples/gdn/example_chunk_delta_h.py @@ -0,0 +1,368 @@ +# Reference: fla/ops/common/chunk_delta_h.py + +import sys # noqa: F401 +import tilelang +import tilelang.language as T + +# Add your fla repository path to sys.path +# Currently we use the fla repository from the flash-linear-attention project at commit id f03cb3ae +# sys.path.insert(0, "/home/tzj/flash-linear-attention") +try: + import fla + print(fla.__file__) + from fla.ops.common.chunk_delta_h import chunk_gated_delta_rule_fwd_h +except ImportError: + print("fla not found, using tilelang implementation") + fla = None + +import torch +import torch.nn.functional as F +from tilelang.engine.callback import register_cuda_postproc_callback # noqa: F401 + +from utils import * + +# (zhengju) We can slightly modify the generated cuda code from tilelang lowering +# in the debug folder to make the performance better. To enable this callback, +# you can comment out the following function. 
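+# (For reference: the registered callback receives the CUDA source generated for
+# this kernel as a string and must return the string that will actually be
+# compiled, which is how a hand-edited .cu file like the one read below can be
+# swapped in.)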
+# @register_cuda_postproc_callback +# def tilelang_callback_cuda_postproc(code, _): +# cuda_code = open("../debug/chunk_delta_h_fuse.cu", "r").read() +# code = cuda_code +# return code + +torch.random.manual_seed(0) + +tilelang.disable_cache() + + +def prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, +): + K = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + K = F.normalize(K, dim=-1, p=2) + W = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + W = F.normalize(W, dim=-1, p=2) + U = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + U = F.normalize(U, dim=-1, p=2) + G = torch.randn(B, S, H, dtype=gate_dtype).cuda() + G = F.logsigmoid(G) + try: + from fla.ops.utils.cumsum import chunk_local_cumsum + G = chunk_local_cumsum(G, chunk_size) + except ImportError: + print("fla not found, skip cumsum") + + initial_state = torch.randn(B, H, DK, DV, dtype=input_dtype).cuda() + return K, W, U, G, initial_state + + +def prepare_output( + B, + S, + H, + DK, + DV, + chunk_size, + output_dtype, + state_dtype, +): + BS = S // chunk_size + h = torch.empty(B, BS, H, DK, DV, dtype=output_dtype).cuda() + final_state = torch.empty(B, H, DK, DV, dtype=state_dtype).cuda() + V_new = torch.empty(B, S, H, DV, dtype=output_dtype).cuda() + return h, final_state, V_new + + +@tilelang.jit(out_idx=[-3, -2, -1]) +def tilelang_chunk_gated_delta_rule_fwd_h( + # task config + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + use_g=True, + use_initial_state=True, + store_final_state=True, + save_new_value=True, + # kernel config + block_DK=64, + block_DV=64, + threads=256, + num_stages=0, +): + block_S = chunk_size + BS = S // block_S + + K_shape = (B, S, H, DK) + V_shape = (B, S, H, DV) + W_shape = (B, S, H, DK) + U_shape = (B, S, H, DV) + G_shape = (B, S, H) + h_shape = (B, BS, H, DK, DV) + initial_state_shape = (B, H, DK, DV) + final_state_shape = (B, H, DK, DV) + + @T.prim_func + def kernel( + K: T.Tensor(K_shape, dtype=input_dtype), + W: T.Tensor(W_shape, dtype=input_dtype), + U: T.Tensor(U_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + initial_state: T.Tensor(initial_state_shape, dtype=input_dtype), + h: T.Tensor(h_shape, dtype=output_dtype), + final_state: T.Tensor(final_state_shape, dtype=state_dtype), + V_new: T.Tensor(V_shape, dtype=output_dtype), + ): + with T.Kernel(T.ceildiv(DV, block_DV), B * H, threads=threads) as (bv, bbh): + bb, bh = bbh // H, bbh % H + + b_h_shared = T.alloc_shared((DK, block_DV), dtype=input_dtype) + b_h_fragment = T.alloc_fragment((DK, block_DV), dtype=accum_dtype) + + U_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + U_fragment = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) + W_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) + V_new_fragment = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) + V_new_shared = T.alloc_shared((block_S, block_DV), dtype=output_dtype) + K_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) + G_last_local = T.alloc_local((1), dtype=gate_dtype) + G_shared = T.alloc_shared((block_S, block_DV), dtype=gate_dtype) + G_fragment = T.alloc_fragment((block_S, block_DV), dtype=gate_dtype) + + T.annotate_layout({ + b_h_shared: tilelang.layout.make_swizzled_layout(b_h_shared), + U_shared: tilelang.layout.make_swizzled_layout(U_shared), + W_shared: tilelang.layout.make_swizzled_layout(W_shared), + V_new_shared: 
tilelang.layout.make_swizzled_layout(V_new_shared), + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + G_shared: tilelang.layout.make_swizzled_layout(G_shared), + }) + + T.use_swizzle(10) + + if use_initial_state: + T.copy(initial_state[bb, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV], b_h_shared) + T.copy(b_h_shared, b_h_fragment) + else: + T.clear(b_h_fragment) + + for i_s in T.Pipelined(T.ceildiv(S, block_S), num_stages=num_stages): + # Store previous result to the hidden tensor, like the epilogue + T.copy(b_h_shared, h[bb, i_s, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV]) + + # Recurrence + T.copy(W[bb, i_s * block_S:(i_s + 1) * block_S, bh, 0:DK], W_shared) + T.gemm(W_shared, b_h_shared, V_new_fragment, clear_accum=True) + + # U - W * S + T.copy( + U[bb, i_s * block_S:(i_s + 1) * block_S, bh, bv * block_DV:(bv + 1) * block_DV], + U_shared) + T.copy(U_shared, U_fragment) + for i_s2, i_v in T.Parallel(block_S, block_DV): + V_new_fragment[i_s2, i_v] = -V_new_fragment[i_s2, i_v] + U_fragment[i_s2, i_v] + + # Save V_new + if save_new_value: + T.copy(V_new_fragment, dst=V_new_shared) + T.copy( + V_new_shared, V_new[bb, i_s * block_S:(i_s + 1) * block_S, bh, + bv * block_DV:(bv + 1) * block_DV]) + + T.copy(K[bb, i_s * block_S:(i_s + 1) * block_S, bh, 0:DK], K_shared) + # use_g + if use_g: + G_last_local[0] = G[bb, (i_s + 1) * block_S - 1, bh] + for i_s2, i_v in T.Parallel(block_S, block_DV): + G_shared[i_s2, i_v] = G[bb, i_s * block_S + i_s2, bh] + T.copy(G_shared, G_fragment) + for i_s2, i_v in T.Parallel(block_S, block_DV): + with T.If(G_last_local[0] - G_fragment[i_s2, i_v] <= 0): + with T.Then(): + V_new_fragment[i_s2, i_v] = V_new_fragment[i_s2, i_v] * T.exp( + G_last_local[0] - G_fragment[i_s2, i_v]) + with T.Else(): + V_new_fragment[i_s2, i_v] = 0 + G_last_local[0] = T.exp(G_last_local[0]) + for i_k, i_v in T.Parallel(DK, block_DV): + b_h_fragment[i_k, i_v] *= G_last_local[0] + + # Update intermediate results + T.copy(V_new_fragment, V_new_shared) + T.gemm(K_shared, V_new_shared, b_h_fragment, transpose_A=True) + + T.copy(b_h_fragment, b_h_shared) + + # Save final state + if store_final_state: + T.copy(b_h_fragment, final_state[bb, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV]) + + return kernel + + +def do_bench(fn, *args, warmup=10, rep=10, **kwargs): + """ + Do benchmark for a function. 
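+    Runs ``warmup`` untimed calls first, then times ``rep`` calls with CUDA
+    events and returns the mean elapsed time in milliseconds.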
+ """ + start_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] + end_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] + for _ in range(warmup): + fn(*args, **kwargs) + + torch.cuda.synchronize() + for i in range(rep): + start_event[i].record() + fn(*args, **kwargs) + end_event[i].record() + torch.cuda.synchronize() + + # Record clocks + times = torch.tensor( + [s.elapsed_time(e) for s, e in zip(start_event, end_event)], + dtype=torch.float, + ) + + return times.mean().item() + + +def run_test( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + use_g=True, + use_initial_state=True, + store_final_state=True, + save_new_value=True, + block_DK=64, + block_DV=32, + threads=128, + num_stages=0, +): + K, W, U, G, initial_state = prepare_input(B, S, H, DK, DV, chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype)) + h_ref, final_state_ref, V_new_ref = prepare_output(B, S, H, DK, DV, chunk_size, + getattr(torch, output_dtype), + getattr(torch, state_dtype)) + h_tilelang, final_state_tilelang, V_new_tilelang = prepare_output(B, S, H, DK, DV, chunk_size, + getattr(torch, output_dtype), + getattr(torch, state_dtype)) + + # fla ref + h_ref, V_new_ref, final_state_ref = chunk_gated_delta_rule_fwd_h(K, W, U, G, initial_state, + store_final_state, chunk_size, + save_new_value) + + # tilelang + kernel = tilelang_chunk_gated_delta_rule_fwd_h(B, S, H, DK, DV, input_dtype, output_dtype, + accum_dtype, gate_dtype, state_dtype, chunk_size, + use_g, use_initial_state, store_final_state, + save_new_value, block_DK, block_DV, threads, + num_stages) + h_tilelang, final_state_tilelang, V_new_tilelang = kernel(K, W, U, G, initial_state) + # (zhengju) If you want to print the generated cuda code, you can uncomment the following line + # print("CUDA Code:\n", kernel.get_kernel_source()) + + fla_time = do_bench(chunk_gated_delta_rule_fwd_h, K, W, U, G, initial_state, store_final_state, + chunk_size, save_new_value) + tilelang_time = do_bench(kernel, K, W, U, G, initial_state) + + # check correctness + try: + h_ref_fp32 = h_ref.to(torch.float32) + h_tilelang_fp32 = h_tilelang.to(torch.float32) + assert_similar( + h_ref_fp32, + h_tilelang_fp32, + eps=1e-5, + name="tilelang chunk gated delta rule fwd h", + raise_assert=False) + print("tilelang chunk gated delta rule fwd h passed √") + except Exception as e: + print("tilelang chunk gated delta rule fwd h failed ✗") + print(e) + + try: + final_state_ref_fp32 = final_state_ref.to(torch.float32) + final_state_tilelang_fp32 = final_state_tilelang.to(torch.float32) + assert_similar( + final_state_ref_fp32, + final_state_tilelang_fp32, + eps=1e-5, + name="tilelang chunk gated delta rule fwd final_state", + raise_assert=False) + print("tilelang chunk gated delta rule fwd final_state passed √") + except Exception as e: + print("tilelang chunk gated delta rule fwd final_state failed ✗") + print(e) + + try: + V_new_ref_fp32 = V_new_ref.to(torch.float32) + V_new_tilelang_fp32 = V_new_tilelang.to(torch.float32) + assert_similar( + V_new_ref_fp32, + V_new_tilelang_fp32, + eps=1e-5, + name="tilelang chunk gated delta rule fwd V_new", + raise_assert=False) + print("tilelang chunk gated delta rule fwd V_new passed √") + except Exception as e: + print("tilelang chunk gated delta rule fwd V_new failed ✗") + print(e) + + print(f"tilelang time: {tilelang_time} ms") + print(f"fla time: {fla_time} ms") + + +def 
main(): + run_test( + B=1, + S=32768, + H=32, + DK=128, + DV=128, + input_dtype="bfloat16", + output_dtype="bfloat16", + accum_dtype="float32", + gate_dtype="float32", + state_dtype="float32", + chunk_size=64, + use_g=True, + use_initial_state=True, + store_final_state=True, + save_new_value=True, + block_DK=64, + block_DV=32, + threads=128, + num_stages=1, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/gdn/example_chunk_o.py b/examples/gdn/example_chunk_o.py new file mode 100644 index 000000000..4ba2b2dbd --- /dev/null +++ b/examples/gdn/example_chunk_o.py @@ -0,0 +1,239 @@ +# Reference: fla/ops/common/chunk_o.py + +import tilelang +import tilelang.language as T +import sys # noqa: F401 + +# Add your fla repository path to sys.path +# Currently we use the fla repository from the flash-linear-attention project at commit id f03cb3ae +# sys.path.insert(0, "/home/tzj/flash-linear-attention") +try: + import fla + print(fla.__file__) + from fla.ops.common.chunk_o import chunk_fwd_o +except ImportError: + print("fla not found, using tilelang implementation") + fla = None + +import torch + +torch.random.manual_seed(1) + +tilelang.disable_cache() + + +def prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, +): + BS = chunk_size + Q = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + K = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + V = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + HIDDEN = torch.randn(B, S // BS, H, DK, DV, dtype=input_dtype).cuda() + G = torch.randn(B, S, H, dtype=gate_dtype).cuda() + return Q, K, V, HIDDEN, G + + +def prepare_output( + B, + S, + H, + DK, + DV, + chunk_size, + output_dtype, +): + O = torch.empty(B, S, H, DV, dtype=output_dtype).cuda() + return O + + +@tilelang.jit(out_idx=[-1]) +def tilelang_chunk_fwd_o( + # task config + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + chunk_size, + scale, + use_g, + # kernel config + block_S=64, + block_DK=64, + block_DV=64, + threads=256, + num_stages=0, +): + assert chunk_size == block_S, "chunk_size must be equal to block_S" + BS = chunk_size + Q_shape = (B, S, H, DK) + K_shape = (B, S, H, DK) + V_shape = (B, S, H, DV) + H_shape = (B, S // BS, H, DK, DV) + G_shape = (B, S, H) + O_shape = (B, S, H, DV) + + @T.prim_func + def kernel( + Q: T.Tensor(Q_shape, dtype=input_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + HIDDEN: T.Tensor(H_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + O: T.Tensor(O_shape, dtype=output_dtype), + ): + with T.Kernel( + T.ceildiv(DV, block_DV), T.ceildiv(S, block_S), B * H, + threads=threads) as (bv, bs, bbh): + bb, bh = bbh // H, bbh % H + Q_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) + K_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) + V_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + H_shared = T.alloc_shared((block_DK, block_DV), dtype=input_dtype) + A_shared = T.alloc_shared((block_S, block_S), dtype=input_dtype) + O_shared = T.alloc_shared((block_S, block_DV), dtype=output_dtype) + A_fragment = T.alloc_fragment((block_S, block_S), dtype=accum_dtype) + O_fragment = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) + G_shared = T.alloc_shared((block_S,), dtype=gate_dtype, scope="shared") + G_diff_local = T.alloc_fragment((block_S, block_S), dtype=gate_dtype) + + T.annotate_layout({ + Q_shared: 
tilelang.layout.make_swizzled_layout(Q_shared), + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + V_shared: tilelang.layout.make_swizzled_layout(V_shared), + H_shared: tilelang.layout.make_swizzled_layout(H_shared), + A_shared: tilelang.layout.make_swizzled_layout(A_shared), + O_shared: tilelang.layout.make_swizzled_layout(O_shared), + }) + + T.clear(A_fragment) + T.clear(O_fragment) + T.no_set_max_nreg() + for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): + T.copy( + Q[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], + Q_shared) + T.copy( + K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], + K_shared) + T.copy( + HIDDEN[bb, bs, bh, i_k * block_DK:(i_k + 1) * block_DK, + bv * block_DV:(bv + 1) * block_DV], H_shared) + T.gemm(Q_shared, H_shared, O_fragment) + T.gemm(Q_shared, K_shared, A_fragment, transpose_B=True) + + if use_g: + for i_s in T.Parallel(block_S): + G_shared[i_s] = G[bb, bs * block_S + i_s, bh] + # T.copy(G[bb, bs * block_S:(bs + 1) * block_S, bh], G_shared) + for i_s, i_v in T.Parallel(block_S, block_DV): + O_fragment[i_s, i_v] = O_fragment[i_s, i_v] * T.exp(G_shared[i_s]) + for i_s1, i_s2 in T.Parallel(block_S, block_S): + G_diff_local[i_s1, i_s2] = G_shared[i_s1] - G_shared[i_s2] + for i_s1, i_s2 in T.Parallel(block_S, block_S): + with T.If(G_diff_local[i_s1, i_s2] <= 0): + with T.Then(): + A_fragment[i_s1, i_s2] = A_fragment[i_s1, i_s2] * T.exp( + G_diff_local[i_s1, i_s2]) + with T.Else(): + A_fragment[i_s1, i_s2] = 0 + + for i_s1, i_s2 in T.Parallel(block_S, block_S): + with T.If(i_s1 < i_s2): # noqa: SIM117 + with T.Then(): + A_fragment[i_s1, i_s2] = 0 + + T.copy(V[bb, bs * block_S:(bs + 1) * block_S, bh, bv * block_DV:(bv + 1) * block_DV], + V_shared) + T.copy(A_fragment, A_shared) + T.gemm(A_shared, V_shared, O_fragment) + + for i_s, i_v in T.Parallel(block_S, block_DV): + O_fragment[i_s, i_v] = O_fragment[i_s, i_v] * scale + + T.copy(O_fragment, O_shared) + T.copy(O_shared, O[bb, bs * block_S:(bs + 1) * block_S, bh, + bv * block_DV:(bv + 1) * block_DV]) + + return kernel + + +def run_test( + B, + S, + H, + DK, + DV, + chunk_size, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + use_g, + block_DK, + block_DV, + threads, + num_stages, +): + input_dtype_torch = getattr(torch, input_dtype) + output_dtype_torch = getattr(torch, output_dtype) + accum_dtype_torch = getattr(torch, accum_dtype) + gate_dtype_torch = getattr(torch, gate_dtype) + Q, K, V, HIDDEN, G = prepare_input(B, S, H, DK, DV, chunk_size, input_dtype_torch, + output_dtype_torch, accum_dtype_torch, gate_dtype_torch) + scale = 1.0 / DK**0.5 + + O_ref = prepare_output(B, S, H, DK, DV, chunk_size, output_dtype_torch) + O_ref = chunk_fwd_o(Q, K, V, HIDDEN, G, scale, chunk_size=chunk_size) + + block_S = chunk_size + O_tilelang = prepare_output(B, S, H, DK, DV, chunk_size, output_dtype_torch) + kernel = tilelang_chunk_fwd_o(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, + gate_dtype, chunk_size, scale, use_g, block_S, block_DK, block_DV, + threads, num_stages) + O_tilelang = kernel(Q, K, V, HIDDEN, G) + + try: + torch.testing.assert_close(O_tilelang, O_ref, rtol=1e-2, atol=1e-2) + print("tilelang chunk fwd o passed √") + except Exception as e: + print("tilelang chunk fwd o failed ✗") + print(e) + + +def main(): + run_test( + B=1, + S=32768, + H=32, + DK=128, + DV=128, + chunk_size=64, + input_dtype="bfloat16", + output_dtype="bfloat16", + accum_dtype="float32", + gate_dtype="float32", + use_g=True, 
+ block_DK=128, + block_DV=128, + threads=128, + num_stages=1, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/gdn/example_chunk_o_bwd.py b/examples/gdn/example_chunk_o_bwd.py new file mode 100644 index 000000000..cff882325 --- /dev/null +++ b/examples/gdn/example_chunk_o_bwd.py @@ -0,0 +1,539 @@ +# Reference: fla/ops/common/chunk_o.py + +import math +import sys # noqa: F401 + +import tilelang +import tilelang.language as T +from tilelang.engine.callback import register_cuda_postproc_callback # noqa: F401 + +print(tilelang.__file__) + +# Add your fla repository path to sys.path +# Currently we use the fla repository from the flash-linear-attention project at commit id f03cb3ae +# sys.path.insert(0, "/home/tzj/flash-linear-attention") +try: + import fla + print(fla.__file__) + from fla.ops.common.chunk_o import chunk_bwd_dqkwg +except ImportError: + print("fla not found, using tilelang implementation") + fla = None + +import torch +from utils import * + +torch.random.manual_seed(0) +# torch.set_printoptions(profile="full") + +tilelang.disable_cache() + + +def prepare_input_fake( + B, + S, + H, + DK, + DV, + chunk_size, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, +): + BS = S // chunk_size + Q = torch.ones(B, S, H, DK, dtype=input_dtype).cuda() + K = torch.ones(B, S, H, DK, dtype=input_dtype).cuda() + V = torch.ones(B, S, H, DV, dtype=input_dtype).cuda() + h = torch.ones(B, BS, H, DK, DV, dtype=input_dtype).cuda() + G = torch.ones(B, S, H, dtype=gate_dtype).cuda() + dO = torch.ones(B, S, H, DV, dtype=input_dtype).cuda() + dh = torch.ones(B, BS, H, DK, DV, dtype=input_dtype).cuda() + dv = torch.ones(B, S, H, DV, dtype=output_dtype).cuda() + W = torch.ones(B, S, H, DK, dtype=input_dtype).cuda() + return Q, K, V, h, G, dO, dh, dv, W + + +def prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, +): + BS = S // chunk_size + + Q = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + K = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + V = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + h = torch.randn(B, BS, H, DK, DV, dtype=input_dtype).cuda() + G = torch.randn(B, S, H, dtype=gate_dtype).cuda() + dO = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + dh = torch.randn(B, BS, H, DK, DV, dtype=input_dtype).cuda() + dv = torch.randn(B, S, H, DV, dtype=output_dtype).cuda() + W = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + return Q, K, V, h, G, dO, dh, dv, W + + +def prepare_output( + B, + S, + H, + DK, + DV, + chunk_size, + output_dtype, + gate_dtype, + state_dtype, + block_DK, +): + assert DK == 32 and block_DK == 32 or DK > 32 and block_DK >= 64, "When DK > 32, block_DK must be >= 64" + NK = math.ceil(DK / block_DK) + dq = torch.empty(B, S, H, DK, dtype=output_dtype).cuda() + dk = torch.empty(B, S, H, DK, dtype=output_dtype).cuda() + dw = torch.empty(B, S, H, DK, dtype=output_dtype).cuda() + dg = torch.empty(NK, B, S, H, dtype=gate_dtype).cuda() + return dq, dk, dw, dg + + +# @register_cuda_postproc_callback +# def tilelang_callback_cuda_postproc(code, _): +# cuda_code = open("../debug/chunk_o_bwd3.log", "r").read() +# code = cuda_code +# return code + + +@tilelang.jit( + out_idx=[-4, -3, -2, -1], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True + }) +def tilelang_chunk_o_bwd_dqkwg( + # task config + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + 
gate_dtype, + state_dtype, + chunk_size, + scale, + use_g=True, + use_dw=True, + # kernel config + block_DK=64, + block_DV=64, + threads=256, + num_stages=0, +): + block_S = chunk_size + BS = S // block_S + NK = math.ceil(DK / block_DK) + + Q_shape = (B, S, H, DK) + K_shape = (B, S, H, DK) + V_shape = (B, S, H, DV) + h_shape = (B, BS, H, DK, DV) + G_shape = (B, S, H) + dO_shape = (B, S, H, DV) + dh_shape = (B, BS, H, DK, DV) + dv_shape = (B, S, H, DV) + W_shape = (B, S, H, DK) + + dq_shape = (B, S, H, DK) + dk_shape = (B, S, H, DK) + dw_shape = (B, S, H, DK) + dg_shape = (NK, B, S, H) + + @T.prim_func + def kernel( + # input + Q: T.Tensor(Q_shape, dtype=input_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + h: T.Tensor(h_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + dO: T.Tensor(dO_shape, dtype=input_dtype), + dh: T.Tensor(dh_shape, dtype=input_dtype), + dv: T.Tensor(dv_shape, dtype=input_dtype), + W: T.Tensor(W_shape, dtype=input_dtype), + # output + dq: T.Tensor(dq_shape, dtype=output_dtype), + dk: T.Tensor(dk_shape, dtype=output_dtype), + dw: T.Tensor(dw_shape, dtype=output_dtype), + dg: T.Tensor(dg_shape, dtype=gate_dtype), + ): + with T.Kernel( + T.ceildiv(DK, block_DK), T.ceildiv(S, block_S), B * H, + threads=threads) as (bk, bs, bbh): + bb, bh = bbh // H, bbh % H + + V_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + dO_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + h_shared = T.alloc_shared((block_DK, block_DV), dtype=input_dtype) + dh_shared = T.alloc_shared((block_DK, block_DV), dtype=input_dtype) + dv_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + q_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) + k_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) + ds_shared = T.alloc_shared((block_S, block_S), dtype=output_dtype) + dg_shared_1 = T.alloc_shared((block_S,), dtype=gate_dtype) + dg_shared_2 = T.alloc_shared((block_S,), dtype=gate_dtype) + dk_shared = T.alloc_shared((block_S, block_DK), dtype=accum_dtype) + + ds_fragment = T.alloc_fragment((block_S, block_S), dtype=accum_dtype) + ds_fragment_positive = T.alloc_fragment((block_S, block_S), dtype=accum_dtype) + ds_fragment_positive_transpose = T.alloc_fragment((block_S, block_S), dtype=accum_dtype) + dq_fragment = T.alloc_fragment((block_S, block_DK), dtype=accum_dtype) + dk_fragment = T.alloc_fragment((block_S, block_DK), dtype=accum_dtype) + dk_fragment_2 = T.alloc_fragment((block_S, block_DK), dtype=accum_dtype) + dw_fragment = T.alloc_fragment((block_S, block_DK), dtype=accum_dtype) + q_fragment = T.alloc_fragment((block_S, block_DK), dtype=input_dtype) + k_fragment = T.alloc_fragment((block_S, block_DK), dtype=input_dtype) + + dg_fragment_reduce_tmp = T.alloc_fragment((block_S, block_DK), dtype=gate_dtype) + dg_fragment = T.alloc_fragment((block_S,), dtype=gate_dtype) + dg_fragment_2 = T.alloc_fragment((block_S,), dtype=gate_dtype) + dg_fragment_final = T.alloc_fragment((block_S,), dtype=gate_dtype) + dg_last_local = T.alloc_local((2,), dtype=gate_dtype) + dg_last_fragment = T.alloc_fragment((block_DV * block_DK), dtype=gate_dtype) + dg_last_fragment_scalar = T.alloc_fragment((1,), dtype=gate_dtype) + dg_last_fragment_2 = T.alloc_fragment((block_S * block_DK), dtype=gate_dtype) + dg_last_fragment_scalar_2 = T.alloc_fragment((1,), dtype=gate_dtype) + G_shared = T.alloc_shared((block_S, block_DK), dtype=gate_dtype, scope="shared") + G_last_local = T.alloc_local((1,), 
dtype=gate_dtype) + + T.use_swizzle(10) + + T.annotate_layout({ + V_shared: tilelang.layout.make_swizzled_layout(V_shared), + dO_shared: tilelang.layout.make_swizzled_layout(dO_shared), + h_shared: tilelang.layout.make_swizzled_layout(h_shared), + dh_shared: tilelang.layout.make_swizzled_layout(dh_shared), + dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), + q_shared: tilelang.layout.make_swizzled_layout(q_shared), + k_shared: tilelang.layout.make_swizzled_layout(k_shared), + }) + + T.clear(dg_last_local) + T.clear(G_last_local) + T.clear(G_shared) + T.clear(q_fragment) + T.clear(k_fragment) + T.clear(dg_last_fragment) + + T.clear(ds_fragment) + T.clear(dq_fragment) + T.clear(dk_fragment) + T.clear(dw_fragment) + + for i_v in T.Pipelined(T.ceildiv(DV, block_DV), num_stages=num_stages): + T.copy( + V[bb, bs * block_S:(bs + 1) * block_S, bh, i_v * block_DV:(i_v + 1) * block_DV], + V_shared) + T.copy( + dO[bb, bs * block_S:(bs + 1) * block_S, bh, + i_v * block_DV:(i_v + 1) * block_DV], dO_shared) + T.copy( + h[bb, bs, bh, bk * block_DK:(bk + 1) * block_DK, + i_v * block_DV:(i_v + 1) * block_DV], h_shared) + T.copy( + dh[bb, bs, bh, bk * block_DK:(bk + 1) * block_DK, + i_v * block_DV:(i_v + 1) * block_DV], dh_shared) + + if use_g: + T.clear(dg_last_fragment_scalar) + # FIXME: The reduce operation of a whole buffer to a scalar is not supported and will cause incorrect result + # for i_kv in T.Parallel(block_DK * block_DV): + # dg_last_fragment[i_kv] = h_shared[i_kv // block_DV, i_kv % block_DV] * dh_shared[i_kv // block_DV, i_kv % block_DV] + for i_kv in T.Parallel(block_DK * block_DV): + i_k, i_v = i_kv // block_DV, i_kv % block_DV + dg_last_fragment[i_kv] = h_shared[i_k, i_v] * dh_shared[i_k, i_v] + T.reduce_sum(dg_last_fragment, dg_last_fragment_scalar, dim=-1, clear=False) + dg_last_local[0] += dg_last_fragment_scalar[0] + + T.gemm(dO_shared, V_shared, ds_fragment, transpose_B=True) + T.gemm(dO_shared, h_shared, dq_fragment, transpose_B=True) + T.gemm(V_shared, dh_shared, dk_fragment, transpose_B=True) + + if use_dw: + T.copy( + dv[bb, bs * block_S:(bs + 1) * block_S, bh, + i_v * block_DV:(i_v + 1) * block_DV], dv_shared) + T.gemm(dv_shared, h_shared, dw_fragment, transpose_B=True) + + if use_dw: + for i_s, i_k in T.Parallel(block_S, block_DK): + dw_fragment[i_s, i_k] = -dw_fragment[i_s, i_k] + T.copy( + dw_fragment, dw[bb, bs * block_S:(bs + 1) * block_S, bh, + bk * block_DK:(bk + 1) * block_DK]) + + T.copy(Q[bb, bs * block_S:(bs + 1) * block_S, bh, bk * block_DK:(bk + 1) * block_DK], + q_shared) + T.copy(K[bb, bs * block_S:(bs + 1) * block_S, bh, bk * block_DK:(bk + 1) * block_DK], + k_shared) + T.copy(q_shared, q_fragment) + T.copy(k_shared, k_fragment) + + if use_g: + T.clear(dg_fragment) + T.clear(dg_fragment_2) + for i_s, i_k in T.Parallel(block_S, block_DK): + G_shared[i_s, i_k] = G[bb, bs * block_S + i_s, bh] + G_last_local[0] = G[bb, bs * block_S + block_S - 1, bh] + # Use gmem directly instead of local register + dg_last_local[0] = dg_last_local[0] * T.exp(G[bb, bs * block_S + block_S - 1, bh]) + + for i_s, i_k in T.Parallel(block_S, block_DK): + dq_fragment[i_s, i_k] = dq_fragment[i_s, i_k] * T.exp(G[bb, bs * block_S + i_s, + bh]) * scale + T.clear(dg_fragment_reduce_tmp) + for i_s, i_k in T.Parallel(block_S, block_DK): + dg_fragment_reduce_tmp[i_s, i_k] = dq_fragment[i_s, i_k] * q_shared[i_s, i_k] + # FIXME: The reduce_sum statement with clear=True will cause an error of warp specialized pass + T.reduce_sum(dg_fragment_reduce_tmp, dg_fragment, dim=-1, clear=False) + + 
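+                # Apply the per-chunk gate decay exp(g_last - g_i) to dk; rows where the
+                # exponent would be positive (g_i > g_last) are zeroed as an overflow guard.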
for i_s, i_k in T.Parallel(block_S, block_DK): + with T.If(G_last_local[0] - G[bb, bs * block_S + i_s, bh] <= 0): + with T.Then(): + dk_fragment[i_s, i_k] = dk_fragment[i_s, i_k] * T.exp( + G_last_local[0] - G[bb, bs * block_S + i_s, bh]) + with T.Else(): + dk_fragment[i_s, i_k] = 0 + T.clear(dg_fragment_reduce_tmp) + for i_s, i_k in T.Parallel(block_S, block_DK): + dg_fragment_reduce_tmp[i_s, i_k] = dk_fragment[i_s, i_k] * (-k_shared[i_s, i_k]) + # FIXME: The reduce_sum statement with clear=True will cause an error of warp specialized pass + T.reduce_sum(dg_fragment_reduce_tmp, dg_fragment, dim=-1, clear=False) + + # FIXME: The reduce operation of a whole buffer to a scalar is not supported and will cause incorrect result + T.copy(dk_fragment, dk_shared) + T.clear(dg_last_fragment_scalar_2) + for i_sk in T.Parallel(block_S * block_DK): + i_s, i_k = i_sk // block_DK, i_sk % block_DK + dg_last_fragment_2[i_sk] = dk_shared[i_s, i_k] * k_shared[i_s, i_k] + T.reduce_sum(dg_last_fragment_2, dg_last_fragment_scalar_2, dim=-1, clear=False) + dg_last_local[1] = dg_last_fragment_scalar_2[0] + + for i_s1, i_s2 in T.Parallel(block_S, block_S): + with T.If(i_s1 >= i_s2 and + G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh] <= 0): + with T.Then(): + ds_fragment[i_s1, i_s2] = ds_fragment[ + i_s1, i_s2] * T.exp(G[bb, bs * block_S + i_s1, bh] - + G[bb, bs * block_S + i_s2, bh]) * scale + with T.Else(): + ds_fragment[i_s1, i_s2] = 0 + + T.clear(ds_fragment_positive) + T.clear(ds_fragment_positive_transpose) + T.gemm(q_shared, k_shared, ds_fragment_positive, transpose_B=True) + for i_s1, i_s2 in T.Parallel(block_S, block_S): + ds_fragment_positive[ + i_s1, i_s2] = ds_fragment[i_s1, i_s2] * ds_fragment_positive[i_s1, i_s2] + + # FIXME: The reduce_sum statement with clear=True will cause an error of warp specialized pass + T.reduce_sum(ds_fragment_positive, dg_fragment, dim=1, clear=False) + T.copy(dg_fragment, dg_shared_1) + + # We should transpose the matrix because the reduce_sum statement can only reduce along the last dimension + for i_s1, i_s2 in T.Parallel(block_S, block_S): + ds_fragment_positive_transpose[i_s2, i_s1] = ds_fragment_positive[i_s1, i_s2] + + # FIXME: The reduce_sum statement with clear=True will cause an error of warp specialized pass + T.reduce_sum(ds_fragment_positive_transpose, dg_fragment_2, dim=1, clear=False) + T.copy(dg_fragment_2, dg_shared_2) + + for i_s in T.Parallel(block_S): + dg_fragment_final[i_s] = dg_shared_1[i_s] - dg_shared_2[i_s] + + T.copy(ds_fragment, ds_shared) + T.gemm(ds_shared, k_shared, dq_fragment) + T.gemm(ds_shared, q_shared, dk_fragment, transpose_A=True) + + for i_s in T.Parallel(block_S): + with T.If(i_s >= block_S - 1): # noqa: SIM117 + with T.Then(): + dg_fragment_final[ + i_s] = dg_fragment_final[i_s] + dg_last_local[0] + dg_last_local[1] + + T.copy( + dq_fragment, dq[bb, bs * block_S:(bs + 1) * block_S, bh, + bk * block_DK:(bk + 1) * block_DK]) + T.copy( + dk_fragment, dk[bb, bs * block_S:(bs + 1) * block_S, bh, + bk * block_DK:(bk + 1) * block_DK]) + for i_s in T.Parallel(block_S): + dg[bk, bb, bs * block_S + i_s, bh] = dg_fragment_final[i_s] + + else: + for i_s1, i_s2 in T.Parallel(block_S, block_S): + with T.If(i_s1 < i_s2): # noqa: SIM117 + with T.Then(): + ds_fragment[i_s1, i_s2] = 0 + T.clear(dk_fragment_2) + T.copy(ds_fragment, ds_shared) + T.gemm(ds_shared, k_shared, dq_fragment) + T.gemm(ds_shared, q_shared, dk_fragment_2, transpose_A=True) + for i_s, i_k in T.Parallel(block_S, block_DK): + dq_fragment[i_s, i_k] = 
dq_fragment[i_s, i_k] * scale + dk_fragment[i_s, i_k] = dk_fragment[i_s, i_k] + dk_fragment_2[i_s, i_k] * scale + T.copy( + dq_fragment, dq[bb, bs * block_S:(bs + 1) * block_S, bh, + bk * block_DK:(bk + 1) * block_DK]) + T.copy( + dk_fragment, dk[bb, bs * block_S:(bs + 1) * block_S, bh, + bk * block_DK:(bk + 1) * block_DK]) + + return kernel + + +def do_bench(fn, *args, warmup=10, rep=10, **kwargs): + """ + Do benchmark for a function. + """ + start_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] + end_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] + for _ in range(warmup): + fn(*args, **kwargs) + + torch.cuda.synchronize() + for i in range(rep): + start_event[i].record() + fn(*args, **kwargs) + end_event[i].record() + torch.cuda.synchronize() + + # Record clocks + times = torch.tensor( + [s.elapsed_time(e) for s, e in zip(start_event, end_event)], + dtype=torch.float, + ) + + return times.mean().item() + + +def run_test( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + scale, + use_g=True, + use_dw=True, + block_DK=64, + block_DV=64, + threads=256, + num_stages=0, +): + Q, K, V, h, G, dO, dh, dv, W = prepare_input(B, S, H, DK, DV, chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype)) + dq_ref, dk_ref, dw_ref, dg_ref = prepare_output(B, S, H, DK, DV, chunk_size, + getattr(torch, output_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), block_DK) + dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), + getattr(torch, state_dtype), block_DK) + + # ref + if use_g: + dq_ref, dk_ref, dw_ref, dg_ref = chunk_bwd_dqkwg( + Q, K, V, G, dO, h, dh, dv, W, chunk_size=chunk_size, scale=scale) + else: + dq_ref, dk_ref, dw_ref, dg_ref = chunk_bwd_dqkwg( + Q, K, V, None, dO, h, dh, dv, W, chunk_size=chunk_size, scale=scale) + + # tilelang + kernel = tilelang_chunk_o_bwd_dqkwg(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, + gate_dtype, state_dtype, chunk_size, scale, use_g, use_dw, + block_DK, block_DV, threads, num_stages) + print(kernel.get_kernel_source()) + dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = kernel(Q, K, V, h, G, dO, dh, dv, W) + + if use_g: + dg_tilelang = dg_tilelang.sum(dim=0) + + # check + try: + assert_similar(dq_ref, dq_tilelang, 1e-5, "tilelang chunk o bwd dq") + print("tilelang chunk o bwd dq passed √") + except Exception as e: + print("tilelang chunk o bwd dq failed ✗") + print(e) + + try: + assert_similar(dk_ref, dk_tilelang, 1e-5, "tilelang chunk o bwd dk") + print("tilelang chunk o bwd dk passed √") + except Exception as e: + print("tilelang chunk o bwd dk failed ✗") + print(e) + + if use_g: + try: + assert_similar(dg_ref, dg_tilelang, 1e-5, "tilelang chunk o bwd dg") + print("tilelang chunk o bwd dg passed √") + except Exception as e: + print("tilelang chunk o bwd dg failed ✗") + print(e) + + if use_dw: + try: + assert_similar(dw_ref, dw_tilelang, 1e-5, "tilelang chunk o bwd dw") + print("tilelang chunk o bwd dw passed √") + except Exception as e: + print("tilelang chunk o bwd dw failed ✗") + print(e) + + +def main(): + DK = 128 + DV = 128 + run_test( + B=1, + S=32768, + H=8, + DK=DK, + DV=DV, + input_dtype="bfloat16", + output_dtype="bfloat16", + accum_dtype="float32", + gate_dtype="float32", + state_dtype="float32", + 
chunk_size=64, + scale=DK**-0.5, + # scale=1, + use_g=True, + use_dw=True, + block_DK=64, + block_DV=64, + threads=128, + num_stages=0, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/gdn/example_chunk_scaled_dot_kkt.py b/examples/gdn/example_chunk_scaled_dot_kkt.py new file mode 100644 index 000000000..841f793f7 --- /dev/null +++ b/examples/gdn/example_chunk_scaled_dot_kkt.py @@ -0,0 +1,201 @@ +# Reference: fla/ops/common/chunk_scaled_dot_kkt.py + +import tilelang +import tilelang.language as T +import sys # noqa: F401 + +# Add your fla repository path to sys.path +# Currently we use the fla repository from the flash-linear-attention project at commit id f03cb3ae +# sys.path.insert(0, "/home/tzj/flash-linear-attention") +try: + import fla + print(fla.__file__) + from fla.ops.common.chunk_scaled_dot_kkt import chunk_scaled_dot_kkt_fwd +except ImportError: + print("fla not found, using tilelang implementation") + fla = None + +import torch + +torch.set_printoptions(profile="full") +torch.random.manual_seed(0) + +tilelang.disable_cache() + + +def prepare_input( + B, + S, + H, + DK, + input_dtype, + output_dtype, + accum_dtype, +): + K = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + Beta = torch.randn(B, S, H, dtype=input_dtype).cuda() + G = torch.randn(B, S, H, dtype=accum_dtype).cuda() + return K, Beta, G + + +def prepare_output( + B, + S, + H, + chunk_size, + dtype, +): + BS = chunk_size + A = torch.empty(B, S, H, BS, dtype=dtype).cuda() + return A + + +@tilelang.jit(out_idx=[-1]) +def tilelang_chunk_scaled_dot_kkt_fwd( + # task config + B, + S, + H, + DK, + chunk_size=64, + input_dtype="bfloat16", + output_dtype="bfloat16", + accum_dtype="float32", + use_g=True, + # kernel config + block_S=64, + block_DK=64, + threads=256, + num_stages=0, +): + K_shape = (B, S, H, DK) + Beta_shape = (B, S, H) + G_shape = (B, S, H) + assert chunk_size == block_S, "chunk_size must be equal to block_S" + BS = chunk_size + output_shape = (B, S, H, BS) + + @T.prim_func + def kernel( + K: T.Tensor(K_shape, dtype=input_dtype), + Beta: T.Tensor(Beta_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=accum_dtype), + A: T.Tensor(output_shape, dtype=output_dtype), + ): + with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): + bb, bh = bbh // H, bbh % H + # !! 
Pay attention to the scope of the shared memory: may cause misaligned address when shape is one dimension or the buffer is too small + Beta_shared = T.alloc_shared((block_S,), dtype=input_dtype, scope="shared") + K_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) + A_shared = T.alloc_shared((block_S, block_S), dtype=output_dtype) + Beta_K_fragment = T.alloc_fragment((block_S, block_DK), dtype=input_dtype) + A_fragment = T.alloc_fragment((block_S, block_S), dtype=accum_dtype) + + # Tensor used for gated: + G_shared = T.alloc_shared((block_S,), dtype=accum_dtype, scope="shared") + G_diff_local = T.alloc_fragment((block_S, block_S), dtype=accum_dtype) + + T.annotate_layout({ + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + A_shared: tilelang.layout.make_swizzled_layout(A_shared), + }) + + T.fill(A_fragment, 0) + T.no_set_max_nreg() + for i_s in T.Parallel(block_S): + Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh] + + for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): + T.copy( + K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], + K_shared) + for i_s, i_k2 in T.Parallel(block_S, block_DK): + Beta_K_fragment[i_s, i_k2] = K_shared[i_s, i_k2] * Beta_shared[i_s] + T.gemm(Beta_K_fragment, K_shared, A_fragment, transpose_B=True) + + if use_g: + for i_s in T.Parallel(block_S): + G_shared[i_s] = G[bb, bs * block_S + i_s, bh] + for i_s1, i_s2 in T.Parallel(block_S, block_S): + G_diff_local[i_s1, i_s2] = G_shared[i_s1] - G_shared[i_s2] + for i_s1, i_s2 in T.Parallel(block_S, block_S): + with T.If(G_diff_local[i_s1, i_s2] <= 0 and i_s1 > i_s2): + with T.Then(): + A_fragment[i_s1, i_s2] = A_fragment[i_s1, i_s2] * T.exp( + G_diff_local[i_s1, i_s2]) + with T.Else(): + A_fragment[i_s1, i_s2] = 0 + else: + for i_s1, i_s2 in T.Parallel(block_S, block_S): + with T.If(i_s1 <= i_s2): # noqa: SIM117 + with T.Then(): + A_fragment[i_s1, i_s2] = 0 + + T.copy(A_fragment, A_shared) + T.copy(A_shared, A[bb, bs * block_S:(bs + 1) * block_S, bh, :]) + + return kernel + + +def run_test( + B, + S, + H, + DK, + chunk_size, + input_dtype, + output_dtype, + accum_dtype, + use_g, + block_DK, + threads, + num_stages, +): + K, Beta, G = prepare_input(B, S, H, DK, getattr(torch, input_dtype), + getattr(torch, output_dtype), getattr(torch, accum_dtype)) + A_ref = prepare_output(B, S, H, chunk_size, getattr(torch, output_dtype)) + A_tilelang = prepare_output(B, S, H, chunk_size, getattr(torch, output_dtype)) + + # reference + if use_g: + A_ref = chunk_scaled_dot_kkt_fwd( + K, Beta, G, chunk_size=chunk_size, output_dtype=getattr(torch, output_dtype)) + else: + A_ref = chunk_scaled_dot_kkt_fwd( + K, Beta, None, chunk_size=chunk_size, output_dtype=getattr(torch, output_dtype)) + + # tilelang + block_S = chunk_size + kernel = tilelang_chunk_scaled_dot_kkt_fwd(B, S, H, DK, chunk_size, input_dtype, output_dtype, + accum_dtype, use_g, block_S, block_DK, threads, + num_stages) + A_tilelang = kernel(K, Beta, G) + + try: + torch.testing.assert_close(A_tilelang, A_ref, rtol=1e-2, atol=1e-2) + print("tilelang chunk scaled dot kkt fwd passed √") + except Exception as e: + print("tilelang chunk scaled dot kkt fwd failed ✗") + print(e) + print("reference cuda kernel:") + print(kernel.get_kernel_source()) + + +def main(): + run_test( + B=1, + S=32768, + H=32, + DK=128, + chunk_size=64, + input_dtype="bfloat16", + output_dtype="bfloat16", + accum_dtype="float32", + use_g=True, + block_DK=64, + threads=128, + num_stages=2) + + +if __name__ == "__main__": + main() 
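+
+# A rough per-chunk PyTorch sketch of what the gated path of the kernel above
+# computes, for illustration only (k: (BS, DK), beta, g: (BS,) slices for one
+# batch/head/chunk; the names below are placeholders, not part of the fla API):
+#
+#   d = g[:, None] - g[None, :]                                 # g_i - g_j
+#   A = (beta[:, None] * k) @ k.T * d.exp()                     # (beta_i k_i . k_j) * exp(g_i - g_j)
+#   lower = torch.arange(BS)[:, None] > torch.arange(BS)[None, :]
+#   A = torch.where(lower & (d <= 0), A, torch.zeros_like(A))   # keep strictly lower triangle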
diff --git a/examples/gdn/example_cumsum.py b/examples/gdn/example_cumsum.py new file mode 100644 index 000000000..67d631d61 --- /dev/null +++ b/examples/gdn/example_cumsum.py @@ -0,0 +1,171 @@ +# Util functions for flash linear attention cumsum +# Reference: fla/ops/utils/cumsum.py + +import tilelang +import tilelang.language as T +import sys # noqa: F401 + +# Add your fla repository path to sys.path +# Currently we use the fla repository from the flash-linear-attention project at commit id f03cb3ae +# sys.path.insert(0, "/home/tzj/flash-linear-attention") +try: + import fla + print(fla.__file__) + from fla.ops.utils.cumsum import chunk_local_cumsum_scalar +except ImportError: + print("fla not found, using tilelang implementation") + fla = None + +import torch + +tilelang.disable_cache() + + +@tilelang.jit( + out_idx=[-1], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True + }) +def tilelang_chunk_local_cumsum_scalar( + # task config + B, + S, + H, + chunk_size=64, + is_varlen=False, + head_first=False, + reverse=False, + input_dtype="float16", + output_dtype="float32", + # kernel config + block_S=64, + threads=256, + use_fragment=False, +): + G_shape = (B, H, S) if head_first else (B, S, H) + assert chunk_size == 2**(chunk_size.bit_length() - 1), "chunk_size must be a power of 2" + assert chunk_size == block_S, "chunk_size must be equal to block_S" + + @T.prim_func + def kernel( + G: T.Tensor(G_shape, dtype=input_dtype), + G_new: T.Tensor(G_shape, dtype=output_dtype), + ): + with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): + bb, bh = bbh // H, bbh % H + G_shared = T.alloc_shared((1, block_S), dtype=output_dtype, scope="shared") + if head_first: + T.copy(G[bb, bh, bs * block_S:(bs + 1) * block_S], G_shared) + else: + T.copy(G[bb, bs * block_S:(bs + 1) * block_S, bh], G_shared) + if use_fragment: + G_fragment = T.alloc_fragment((1, block_S), dtype=output_dtype, scope="shared") + T.copy(G_shared, G_fragment) + T.cumsum(G_fragment, dim=1, reverse=reverse) + if head_first: + T.copy(G_fragment, G_new[bb, bh, bs * block_S:(bs + 1) * block_S]) + else: + T.copy(G_fragment, G_new[bb, bs * block_S:(bs + 1) * block_S, bh]) + else: + T.cumsum(G_shared, dim=1, reverse=reverse) + if head_first: + T.copy(G_shared, G_new[bb, bh, bs * block_S:(bs + 1) * block_S]) + else: + T.copy(G_shared, G_new[bb, bs * block_S:(bs + 1) * block_S, bh]) + + return kernel + + +def prepare_cumsum_input( + B, + S, + H, + dtype, +): + G = torch.randn(B, S, H, dtype=dtype).cuda() + return G + + +def prepare_cumsum_output( + B, + S, + H, + dtype, +): + G_new = torch.empty(B, S, H, dtype=dtype).cuda() + return G_new + + +def run_test( + B, + S, + H, + chunk_size, + reverse, + head_first, + input_dtype, + output_dtype, + threads, + use_fragment, +): + G = prepare_cumsum_input(B, S, H, getattr(torch, input_dtype)) + G_new_ref = prepare_cumsum_output(B, S, H, getattr(torch, output_dtype)) + G_new_tilelang = prepare_cumsum_output(B, S, H, getattr(torch, output_dtype)) + + # reference cumsum + G_new_ref = chunk_local_cumsum_scalar( + g=G, + chunk_size=chunk_size, + reverse=reverse, + head_first=head_first, + output_dtype=getattr(torch, output_dtype)) + + # tilelang cumsum + block_S = chunk_size + kernel = tilelang_chunk_local_cumsum_scalar( + B=B, + S=S, + H=H, + chunk_size=chunk_size, + reverse=reverse, + head_first=head_first, + input_dtype=input_dtype, + output_dtype=output_dtype, + block_S=block_S, + threads=threads, + 
use_fragment=use_fragment, + ) + torch.cuda.profiler.start() + G_new_tilelang = kernel(G) + torch.cuda.profiler.stop() + try: + torch.testing.assert_close(G_new_tilelang, G_new_ref, rtol=1e-2, atol=1e-2) + print("tilelang cumsum passed √") + except Exception as e: + print("tilelang cumsum failed ✗") + print(e) + print("G:") + print(G.view(-1)) + print("G_new_tilelang:") + print(G_new_tilelang.view(-1)) + print("G_new_ref:") + print(G_new_ref.view(-1)) + + +def main(): + run_test( + B=1, + S=32768, + H=32, + chunk_size=64, + reverse=True, + head_first=False, + input_dtype="float32", + output_dtype="float32", + threads=256, + use_fragment=False) + + +if __name__ == "__main__": + main() diff --git a/examples/gdn/example_wy_fast.py b/examples/gdn/example_wy_fast.py new file mode 100644 index 000000000..583cf2123 --- /dev/null +++ b/examples/gdn/example_wy_fast.py @@ -0,0 +1,233 @@ +# Reference: fla/ops/gated_delta_rule/wy_fast.py + +import tilelang +import tilelang.language as T +import sys # noqa: F401 + +# Add your fla repository path to sys.path +# Currently we use the fla repository from the flash-linear-attention project at commit id f03cb3ae +# sys.path.insert(0, "/home/tzj/flash-linear-attention") +try: + import fla + print(fla.__file__) + from fla.ops.gated_delta_rule.wy_fast import recompute_w_u_fwd +except ImportError: + print("fla not found, using tilelang implementation") + fla = None + +import torch + +torch.random.manual_seed(1) + +tilelang.disable_cache() + + +def prepare_input(B, S, H, DK, DV, chunk_size, input_dtype, output_dtype, gate_dtype=torch.float32): + BS = chunk_size + K = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + V = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + Beta = torch.randn(B, S, H, dtype=input_dtype).cuda() + G = torch.randn(B, S, H, dtype=gate_dtype).cuda() + A = torch.randn(B, S, H, BS, dtype=output_dtype).cuda() + return K, V, Beta, G, A + + +def prepare_output( + B, + S, + H, + DK, + DV, + output_dtype, +): + W = torch.empty(B, S, H, DK, dtype=output_dtype).cuda() + U = torch.empty(B, S, H, DV, dtype=output_dtype).cuda() + return W, U + + +@tilelang.jit(out_idx=[-2, -1]) +def tilelang_recompute_w_u_fwd( + # task config + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + gate_dtype, + accum_dtype, + chunk_size, + # kernel config + block_S=64, + block_DK=64, + block_DV=64, + threads=256, + num_stages=0, +): + K_shape = (B, S, H, DK) + V_shape = (B, S, H, DV) + Beta_shape = (B, S, H) + assert chunk_size == block_S, "chunk_size must be equal to block_S" + BS = chunk_size + G_shape = (B, S, H) + A_shape = (B, S, H, BS) + + @T.prim_func + def kernel( + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + Beta: T.Tensor(Beta_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + A: T.Tensor(A_shape, dtype=output_dtype), + W: T.Tensor(K_shape, dtype=output_dtype), + U: T.Tensor(V_shape, dtype=output_dtype), + ): + with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): + bb, bh = bbh // H, bbh % H + Beta_shared = T.alloc_shared((block_S,), dtype=input_dtype, scope="shared") + K_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) + V_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + G_shared = T.alloc_shared((block_S,), dtype=gate_dtype, scope="shared") + A_shared = T.alloc_shared((block_S, block_S), dtype=output_dtype) + W_fragment = T.alloc_fragment((block_S, block_DK), dtype=accum_dtype) + U_fragment = T.alloc_fragment((block_S, block_DV), 
dtype=accum_dtype) + W_shared = T.alloc_shared((block_S, block_DK), dtype=output_dtype) + U_shared = T.alloc_shared((block_S, block_DV), dtype=output_dtype) + W_Beta_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) + U_Beta_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + + T.annotate_layout({ + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + V_shared: tilelang.layout.make_swizzled_layout(V_shared), + A_shared: tilelang.layout.make_swizzled_layout(A_shared), + W_shared: tilelang.layout.make_swizzled_layout(W_shared), + U_shared: tilelang.layout.make_swizzled_layout(U_shared), + W_Beta_shared: tilelang.layout.make_swizzled_layout(W_Beta_shared), + U_Beta_shared: tilelang.layout.make_swizzled_layout(U_Beta_shared), + }) + + T.no_set_max_nreg() + for i_s in T.Parallel(block_S): + Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh] + G_shared[i_s] = T.exp(G[bb, bs * block_S + i_s, bh]) + + T.copy(A[bb, bs * block_S:(bs + 1) * block_S, bh, :], A_shared) + + for i_v in T.Pipelined(T.ceildiv(DV, block_DV), num_stages=num_stages): + T.copy( + V[bb, bs * block_S:(bs + 1) * block_S, bh, i_v * block_DV:(i_v + 1) * block_DV], + V_shared) + for i_s, i_v2 in T.Parallel(block_S, block_DV): + U_Beta_shared[i_s, i_v2] = V_shared[i_s, i_v2] * Beta_shared[i_s] + T.gemm(A_shared, U_Beta_shared, U_fragment, clear_accum=True) + # First copy to smem, then copy to gmem to reduce U2RU instructions + T.copy(U_fragment, U_shared) + T.copy( + U_shared, U[bb, bs * block_S:(bs + 1) * block_S, bh, + i_v * block_DV:(i_v + 1) * block_DV]) + + for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): + T.copy( + K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], + K_shared) + for i_s, i_k2 in T.Parallel(block_S, block_DK): + W_Beta_shared[i_s, + i_k2] = K_shared[i_s, i_k2] * Beta_shared[i_s] * G_shared[i_s] + T.gemm(A_shared, W_Beta_shared, W_fragment, clear_accum=True) + # First copy to smem, then copy to gmem to reduce U2RU instructions + T.copy(W_fragment, W_shared) + T.copy( + W_shared, W[bb, bs * block_S:(bs + 1) * block_S, bh, + i_k * block_DK:(i_k + 1) * block_DK]) + + return kernel + + +def run_test( + B, + S, + H, + DK, + DV, + chunk_size, + input_dtype, + output_dtype, + gate_dtype, + accum_dtype, + block_DK, + block_DV, + threads, + num_stages, +): + K, V, Beta, G, A = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + gate_dtype=getattr(torch, gate_dtype)) + W_ref, U_ref = prepare_output(B, S, H, DK, DV, getattr(torch, output_dtype)) + W_tilelang, U_tilelang = prepare_output(B, S, H, DK, DV, getattr(torch, output_dtype)) + + # reference + W_ref, U_ref = recompute_w_u_fwd(K, V, Beta, G, A, None) + + # tilelang + block_S = chunk_size + kernel = tilelang_recompute_w_u_fwd( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + gate_dtype, + accum_dtype, + chunk_size, + block_S=block_S, + block_DK=block_DK, + block_DV=block_DV, + threads=threads, + num_stages=num_stages) + print(kernel.get_kernel_source()) + W_tilelang, U_tilelang = kernel(K, V, Beta, G, A) + + try: + torch.testing.assert_close(W_tilelang, W_ref, rtol=1e-2, atol=1e-2) + print("tilelang recompute w passed √") + except Exception as e: + print("tilelang recompute w failed ✗") + print(e) + try: + torch.testing.assert_close(U_tilelang, U_ref, rtol=1e-2, atol=1e-2) + print("tilelang recompute u passed √") + except Exception as e: + print("tilelang recompute u failed ✗") + print(e) + + +def 
main(): + run_test( + B=1, + S=32768, + H=32, + DK=128, + DV=128, + chunk_size=64, + input_dtype="bfloat16", + output_dtype="bfloat16", + gate_dtype="float32", + accum_dtype="float32", + block_DK=64, + block_DV=32, + threads=128, + num_stages=3) + + +if __name__ == "__main__": + main() diff --git a/examples/gdn/example_wy_fast_bwd_split.py b/examples/gdn/example_wy_fast_bwd_split.py new file mode 100644 index 000000000..6ce61b17d --- /dev/null +++ b/examples/gdn/example_wy_fast_bwd_split.py @@ -0,0 +1,536 @@ +# Reference: fla/ops/gated_delta_rule/wy_fast.py + +import sys # noqa: F401 + +import tilelang +import tilelang.language as T + +# Add your fla repository path to sys.path +# Currently we use the fla repository from the flash-linear-attention project at commit id 00000000 +# sys.path.insert(0, "/home/tzj/flash-linear-attention") +try: + import fla + print(fla.__file__) + from fla.ops.gated_delta_rule.wy_fast import bwd_prepare_wy_repr +except ImportError: + print("fla not found, using tilelang implementation") + fla = None + +import torch +import torch.nn.functional as F +from utils import assert_similar + +torch.random.manual_seed(0) +torch.set_printoptions(profile="full") + +tilelang.disable_cache() + + +def prepare_input_fake( + B, + S, + H, + DK, + DV, + chunk_size, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, +): + BS = chunk_size + K = torch.ones(B, S, H, DK, dtype=input_dtype).cuda() + V = torch.ones(B, S, H, DV, dtype=input_dtype).cuda() + Beta = torch.ones(B, S, H, dtype=input_dtype).cuda() + G = torch.ones(B, S, H, dtype=gate_dtype).cuda() + A = torch.ones(B, S, H, BS, dtype=input_dtype).cuda() + dw = torch.ones(B, S, H, DK, dtype=input_dtype).cuda() + du = torch.ones(B, S, H, DV, dtype=input_dtype).cuda() + return K, V, Beta, G, A, dw, du + + +def prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, +): + BS = chunk_size + K = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + K = F.normalize(K, dim=-1, p=2) + V = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + V = F.normalize(V, dim=-1, p=2) + Beta = torch.randn(B, S, H, dtype=input_dtype).cuda() + G = torch.randn(B, S, H, dtype=gate_dtype).cuda() + A = torch.randn(B, S, H, BS, dtype=input_dtype).cuda() + dw = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + du = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + return K, V, Beta, G, A, dw, du + + +def prepare_output( + B, + S, + H, + DK, + DV, + chunk_size, + output_dtype, + gate_dtype, + state_dtype, +): + dk = torch.empty(B, S, H, DK, dtype=output_dtype).cuda() + dv = torch.empty(B, S, H, DV, dtype=output_dtype).cuda() + dbeta = torch.empty(B, S, H, dtype=output_dtype).cuda() + dg = torch.empty(B, S, H, dtype=gate_dtype).cuda() + return dk, dv, dbeta, dg + + +@tilelang.jit( + out_idx=[-5, -4, -3, -2, -1], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True + }) +def tilelang_wy_fast_bwd( + # task config + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + # kernel config + block_DK=64, + block_DV=64, + threads=128, + num_stages=0, +): + block_S = chunk_size + BS = block_S + + K_shape = (B, S, H, DK) + V_shape = (B, S, H, DV) + Beta_shape = (B, S, H) + G_shape = (B, S, H) + A_shape = (B, S, H, BS) + dw_shape = (B, S, H, DK) + du_shape = (B, S, H, DV) + + dk_shape = (B, S, H, DK) + dv_shape = (B, S, H, DV) + 
dbeta_shape = (B, S, H) + dg_shape = (B, S, H) + dA_shape = (B, S, H, BS) + + @T.prim_func + def kernel( + # input + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + Beta: T.Tensor(Beta_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + A: T.Tensor(A_shape, dtype=input_dtype), + dw: T.Tensor(dw_shape, dtype=input_dtype), + du: T.Tensor(du_shape, dtype=input_dtype), + # output + dA: T.Tensor(dA_shape, dtype=input_dtype), + dk: T.Tensor(dk_shape, dtype=output_dtype), + dv: T.Tensor(dv_shape, dtype=output_dtype), + dbeta: T.Tensor(dbeta_shape, dtype=output_dtype), + dg: T.Tensor(dg_shape, dtype=gate_dtype), + ): + with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): + bb, bh = bbh // H, bbh % H + + A_shared = T.alloc_shared((block_S, block_S), dtype=input_dtype) + K_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) + K_shared_beta_g = T.alloc_shared((block_S, block_DK), dtype=input_dtype) + V_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + V_shared_beta = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + Beta_shared = T.alloc_shared((block_S,), dtype=input_dtype) + G_shared = T.alloc_shared((block_S,), dtype=gate_dtype) + G_shared_exp = T.alloc_shared((block_S,), dtype=gate_dtype) + dw_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) + du_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + + dA_fragment = T.alloc_fragment((block_S, block_S), dtype=accum_dtype) + dk_fragment = T.alloc_fragment((block_S, block_DK), dtype=accum_dtype) + dk_fragment_beta_g = T.alloc_fragment((block_S, block_DK), dtype=accum_dtype) + dv_fragment = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) + dv_fragment_beta = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) + dbeta_fragment_k = T.alloc_fragment((block_S,), dtype=accum_dtype) + dbeta_fragment_v = T.alloc_fragment((block_S,), dtype=accum_dtype) + dbeta_fragment_reduce_tmpk = T.alloc_fragment((block_S, block_DK), dtype=accum_dtype) + dbeta_fragment_reduce_tmpv = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) + dg_fragment = T.alloc_fragment((block_S,), dtype=gate_dtype) + dg_fragment_reduce_tmp = T.alloc_fragment((block_S, block_DK), dtype=gate_dtype) + + T.use_swizzle(10) + + T.clear(dA_fragment) + T.clear(dk_fragment) + T.clear(dk_fragment_beta_g) + T.clear(dv_fragment) + T.clear(dv_fragment_beta) + T.clear(dbeta_fragment_k) + T.clear(dbeta_fragment_v) + T.clear(dg_fragment) + + T.copy(A[bb, bs * block_S:(bs + 1) * block_S, bh, :], A_shared) + for i_s in T.Parallel(block_S): + Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh] + G_shared[i_s] = G[bb, bs * block_S + i_s, bh] + G_shared_exp[i_s] = T.exp(G[bb, bs * block_S + i_s, bh]) + + # Update dk + for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): + T.copy( + K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], + K_shared) + for i_s, i_k2 in T.Parallel(block_S, block_DK): + K_shared_beta_g[i_s, + i_k2] = K_shared[i_s, + i_k2] * Beta_shared[i_s] * G_shared_exp[i_s] + T.copy( + dw[bb, bs * block_S:(bs + 1) * block_S, bh, + i_k * block_DK:(i_k + 1) * block_DK], dw_shared) + T.gemm(dw_shared, K_shared_beta_g, dA_fragment, transpose_B=True) + T.gemm(A_shared, dw_shared, dk_fragment_beta_g, clear_accum=True, transpose_A=True) + for i_s, i_k2 in T.Parallel(block_S, block_DK): + dk_fragment[ + i_s, + i_k2] = dk_fragment_beta_g[i_s, i_k2] * Beta_shared[i_s] * G_shared_exp[i_s] + # for i_s, i_k2 
in T.Parallel(block_S, block_DK): + # dbeta_fragment[i_s] = dbeta_fragment[i_s] + dk_fragment_beta_g[i_s, i_k2] * K_shared[i_s, i_k2] * G_shared_exp[i_s] + for i_s, i_k2 in T.Parallel(block_S, block_DK): + dbeta_fragment_reduce_tmpk[i_s, i_k2] = dk_fragment_beta_g[ + i_s, i_k2] * K_shared[i_s, i_k2] * G_shared_exp[i_s] + T.reduce_sum(dbeta_fragment_reduce_tmpk, dbeta_fragment_k, dim=1, clear=False) + + # for i_s, i_k2 in T.Parallel(block_S, block_DK): + # dg_fragment[i_s] = dg_fragment[i_s] + dk_fragment_beta_g[i_s, i_k2] * K_shared[i_s, i_k2] * G_shared_exp[i_s] * Beta_shared[i_s] + for i_s, i_k2 in T.Parallel(block_S, block_DK): + dg_fragment_reduce_tmp[i_s, i_k2] = dk_fragment_beta_g[i_s, i_k2] * K_shared[ + i_s, i_k2] * G_shared_exp[i_s] * Beta_shared[i_s] + T.reduce_sum(dg_fragment_reduce_tmp, dg_fragment, dim=1, clear=False) + + # correct dk + T.copy( + dk_fragment, dk[bb, bs * block_S:(bs + 1) * block_S, bh, + i_k * block_DK:(i_k + 1) * block_DK]) + + # Update dv + for i_v in T.Pipelined(T.ceildiv(DV, block_DV), num_stages=num_stages): + T.copy( + V[bb, bs * block_S:(bs + 1) * block_S, bh, i_v * block_DV:(i_v + 1) * block_DV], + V_shared) + for i_s, i_v2 in T.Parallel(block_S, block_DV): + V_shared_beta[i_s, i_v2] = V_shared[i_s, i_v2] * Beta_shared[i_s] + T.copy( + du[bb, bs * block_S:(bs + 1) * block_S, bh, + i_v * block_DV:(i_v + 1) * block_DV], du_shared) + T.gemm(du_shared, V_shared_beta, dA_fragment, transpose_B=True) + T.gemm(A_shared, du_shared, dv_fragment_beta, clear_accum=True, transpose_A=True) + for i_s, i_v2 in T.Parallel(block_S, block_DV): + dv_fragment[i_s, i_v2] = dv_fragment_beta[i_s, i_v2] * Beta_shared[i_s] + # for i_s, i_v2 in T.Parallel(block_S, block_DV): + # dbeta_fragment[i_s] = dbeta_fragment[i_s] + dv_fragment_beta[i_s, i_v2] * V_shared[i_s, i_v2] + for i_s, i_v2 in T.Parallel(block_S, block_DV): + dbeta_fragment_reduce_tmpv[i_s, + i_v2] = dv_fragment_beta[i_s, i_v2] * V_shared[i_s, + i_v2] + T.reduce_sum(dbeta_fragment_reduce_tmpv, dbeta_fragment_v, dim=1, clear=False) + + T.copy( + dv_fragment, dv[bb, bs * block_S:(bs + 1) * block_S, bh, + i_v * block_DV:(i_v + 1) * block_DV]) + + # Temporary store dbeta, dg and dA + for i_s in T.Parallel(block_S): + dbeta[bb, bs * block_S + i_s, bh] = dbeta_fragment_k[i_s] + dbeta_fragment_v[i_s] + dg[bb, bs * block_S + i_s, bh] = dg_fragment[i_s] + # correct dA + T.copy(dA_fragment, dA[bb, bs * block_S:(bs + 1) * block_S, bh, :]) + + return kernel + + +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True + }) +def tilelang_wy_fast_bwd_split( + # task config + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + # kernel config + block_DK=64, + block_DV=64, + threads=128, + num_stages=0, +): + block_S = chunk_size + BS = block_S + + K_shape = (B, S, H, DK) + V_shape = (B, S, H, DV) + Beta_shape = (B, S, H) + G_shape = (B, S, H) + A_shape = (B, S, H, BS) + dw_shape = (B, S, H, DK) + du_shape = (B, S, H, DV) + + dk_shape = (B, S, H, DK) + dv_shape = (B, S, H, DV) + dbeta_shape = (B, S, H) + dA_shape = (B, S, H, BS) + + @T.prim_func + def kernel( + # input + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + Beta: T.Tensor(Beta_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + A: T.Tensor(A_shape, dtype=input_dtype), + dw: T.Tensor(dw_shape, dtype=input_dtype), + du: T.Tensor(du_shape, dtype=input_dtype), + dA: 
T.Tensor(dA_shape, dtype=input_dtype), + dk: T.Tensor(dk_shape, dtype=output_dtype), + dv: T.Tensor(dv_shape, dtype=output_dtype), + dbeta_k: T.Tensor(dbeta_shape, dtype=output_dtype), + dg_A_positive: T.Tensor(dA_shape, dtype=gate_dtype), + dg_A_negative: T.Tensor(dA_shape, dtype=gate_dtype), + ): + with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): + bb, bh = bbh // H, bbh % H + + A_shared = T.alloc_shared((block_S, block_S), dtype=input_dtype) + A_fragment = T.alloc_fragment((block_S, block_S), dtype=accum_dtype) + K_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) + K_shared_beta = T.alloc_shared((block_S, block_DK), dtype=input_dtype) + dA_shared = T.alloc_shared((block_S, block_S), dtype=input_dtype) + dA_fragment = T.alloc_fragment((block_S, block_S), dtype=accum_dtype) + dA_A_fragment = T.alloc_fragment((block_S, block_S), dtype=accum_dtype) + dA_A_fragment_1 = T.alloc_fragment((block_S,), dtype=accum_dtype) + dA_A_fragment_2 = T.alloc_fragment((block_S,), dtype=accum_dtype) + dk_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) + dk_shared_beta = T.alloc_shared((block_S, block_DK), dtype=input_dtype) + dk_fragment = T.alloc_fragment((block_S, block_DK), dtype=accum_dtype) + dk_fragment_beta = T.alloc_fragment((block_S, block_DK), dtype=accum_dtype) + Beta_shared = T.alloc_shared((block_S,), dtype=input_dtype) + dbeta_fragment_reduce_tmpk = T.alloc_fragment((block_S, block_DK), dtype=accum_dtype) + dbeta_fragment_k = T.alloc_fragment((block_S,), dtype=accum_dtype) + G_shared = T.alloc_shared((block_S,), dtype=gate_dtype) + G_shared_exp = T.alloc_shared((block_S,), dtype=gate_dtype) + + T.clear(dbeta_fragment_reduce_tmpk) + T.clear(dbeta_fragment_k) + T.clear(dA_A_fragment_1) + T.clear(dA_A_fragment_2) + + T.copy(A[bb, bs * block_S:(bs + 1) * block_S, bh, :], A_shared) + for i_s in T.Parallel(block_S): + Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh] + G_shared[i_s] = G[bb, bs * block_S + i_s, bh] + for i_s in T.Parallel(block_S): + G_shared_exp[i_s] = T.exp(G_shared[i_s]) + + # Load intermediate results + # for i_s in T.Parallel(block_S): + # dbeta_fragment[i_s] = dbeta[bb, bs * block_S + i_s, bh] + # dg_fragment[i_s] = dg[bb, bs * block_S + i_s, bh] + T.copy(dA[bb, bs * block_S:(bs + 1) * block_S, bh, :], dA_shared) + # T.copy(dA_shared, dA[bb, bs * block_S:(bs + 1) * block_S, bh, :]) + + # Update dA + T.copy(dA_shared, dA_fragment) + + for i_s1, i_s2 in T.Parallel(block_S, block_S): + with T.If(i_s1 <= i_s2): # noqa: SIM117 + with T.Then(): + dA_fragment[i_s1, i_s2] = 0 + T.copy(dA_fragment, dA_shared) + T.gemm(dA_shared, A_shared, dA_fragment, clear_accum=True, transpose_B=True) + T.copy(dA_fragment, dA_shared) + T.gemm(A_shared, dA_shared, dA_fragment, clear_accum=True, transpose_A=True) + for i_s1, i_s2 in T.Parallel(block_S, block_S): + with T.If(i_s1 <= i_s2): + with T.Then(): + dA_fragment[i_s1, i_s2] = 0 + with T.Else(): + dA_fragment[i_s1, i_s2] = -dA_fragment[i_s1, i_s2] + + for i_s1, i_s2 in T.Parallel(block_S, block_S): + with T.If(G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh] <= 0): + with T.Then(): + dA_fragment[i_s1, i_s2] *= T.exp(G[bb, bs * block_S + i_s1, bh] - + G[bb, bs * block_S + i_s2, bh]) + with T.Else(): + dA_fragment[i_s1, i_s2] = 0 + T.copy(dA_fragment, dA_shared) + + # acceptable dA diff + # T.copy(dA_fragment, dA[bb, bs * block_S:(bs + 1) * block_S, bh, :]) + + # Update dk using previous dk + T.clear(A_fragment) + for i_k in T.Pipelined(T.ceildiv(DK, block_DK), 
num_stages=num_stages): + T.copy( + K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], + K_shared) + T.copy( + dk[bb, bs * block_S:(bs + 1) * block_S, bh, + i_k * block_DK:(i_k + 1) * block_DK], dk_shared) + T.copy(dk_shared, dk_fragment) + for i_s, i_k2 in T.Parallel(block_S, block_DK): + K_shared_beta[i_s, i_k2] = K_shared[i_s, i_k2] * Beta_shared[i_s] + T.gemm(K_shared_beta, K_shared, A_fragment, transpose_B=True) + T.gemm(dA_shared, K_shared, dk_fragment_beta, clear_accum=True) + # for i_s, i_k2 in T.Parallel(block_S, block_DK): + # dbeta_fragment[i_s] = dbeta_fragment[i_s] + dk_fragment_beta[i_s, i_k2] * K_shared[i_s, i_k2] + for i_s, i_k2 in T.Parallel(block_S, block_DK): + dbeta_fragment_reduce_tmpk[i_s, + i_k2] = dk_fragment_beta[i_s, i_k2] * K_shared[i_s, + i_k2] + T.reduce_sum(dbeta_fragment_reduce_tmpk, dbeta_fragment_k, dim=1, clear=False) + T.gemm(dA_shared, K_shared_beta, dk_fragment, transpose_A=True) + for i_s, i_k2 in T.Parallel(block_S, block_DK): + dk_shared_beta[i_s, i_k2] = dk_fragment_beta[i_s, i_k2] * Beta_shared[i_s] + for i_s, i_k2 in T.Parallel(block_S, block_DK): + dk_fragment[i_s, i_k2] = dk_fragment[i_s, i_k2] + dk_shared_beta[i_s, i_k2] + T.copy( + dk_fragment, dk[bb, bs * block_S:(bs + 1) * block_S, bh, + i_k * block_DK:(i_k + 1) * block_DK]) + + # Update dg and dbeta + T.copy(A_fragment, A_shared) + for i_s1, i_s2 in T.Parallel(block_S, block_S): + dA_A_fragment[i_s1, i_s2] = dA_fragment[i_s1, i_s2] * A_fragment[i_s1, i_s2] + # Note: Reduce operation now not supported in shared memory + # FIXME: reduce will cause incorrect result when dim != -1 + T.reduce_sum(dA_A_fragment, dA_A_fragment_1, dim=1) + T.reduce_sum(dA_A_fragment, dA_A_fragment_2, dim=0) + + for i_s1, i_s2 in T.Parallel(block_S, block_S): + dg_A_positive[bb, bs * block_S + i_s1, bh, i_s2] = dA_A_fragment[i_s1, i_s2] + dg_A_negative[bb, bs * block_S + i_s2, bh, i_s1] = dA_A_fragment[i_s1, i_s2] + + for i_s in T.Parallel(block_S): + dbeta_k[bb, bs * block_S + i_s, bh] = dbeta_fragment_k[i_s] + + return kernel + + +def run_test( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + block_DK=64, + block_DV=64, + threads=128, + num_stages=0, +): + K, V, Beta, G, A, dw, du = prepare_input(B, S, H, DK, DV, chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, + accum_dtype), getattr(torch, gate_dtype), + getattr(torch, state_dtype)) + dk_ref, dv_ref, dbeta_ref, dg_ref = prepare_output(B, S, H, DK, DV, chunk_size, + getattr(torch, output_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype)) + dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), + getattr(torch, state_dtype)) + BS = chunk_size + dA_tilelang = torch.empty(B, S, H, BS, dtype=getattr(torch, input_dtype)).cuda() + dbeta_tilelang_k = torch.empty(B, S, H, dtype=getattr(torch, output_dtype)).cuda() + dg_tilelang_A_positive = torch.empty(B, S, H, BS, dtype=getattr(torch, gate_dtype)).cuda() + dg_tilelang_A_negative = torch.empty(B, S, H, BS, dtype=getattr(torch, gate_dtype)).cuda() + + # ref + dk_ref, dv_ref, dbeta_ref, dg_ref = bwd_prepare_wy_repr( + K, V, G, Beta, A, dw, du, cu_seqlens=None) + + # tilelang + kernel = tilelang_wy_fast_bwd(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, + gate_dtype, state_dtype, chunk_size, block_DK, block_DV, threads, + num_stages) + dA_tilelang, 
dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = kernel( + K, V, Beta, G, A, dw, du) + torch.cuda.synchronize() + kernel_split = tilelang_wy_fast_bwd_split(B, S, H, DK, DV, input_dtype, output_dtype, + accum_dtype, gate_dtype, state_dtype, chunk_size, + block_DK, block_DV, threads, num_stages) + kernel_split(K, V, Beta, G, A, dw, du, dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang_k, + dg_tilelang_A_positive, dg_tilelang_A_negative) + torch.cuda.synchronize() + + dbeta_tilelang = dbeta_tilelang_k + dbeta_tilelang + dg_tilelang = dg_tilelang + dg_tilelang_A_positive.sum(dim=-1) - dg_tilelang_A_negative.sum( + dim=-1) + + assert_similar(dk_ref, dk_tilelang, eps=1e-5, name="dk", raise_assert=False) + assert_similar(dv_ref, dv_tilelang, eps=1e-5, name="dv", raise_assert=False) + assert_similar(dbeta_ref, dbeta_tilelang, eps=1e-5, name="dbeta", raise_assert=False) + assert_similar(dg_ref, dg_tilelang, eps=1e-5, name="dg", raise_assert=False) + + +def main(): + DK = 128 + DV = 128 + run_test( + B=1, + S=32768, + H=8, + DK=DK, + DV=DV, + input_dtype="bfloat16", + output_dtype="bfloat16", + accum_dtype="float32", + gate_dtype="float32", + state_dtype="float32", + chunk_size=64, + block_DK=32, + block_DV=32, + threads=128, + num_stages=0, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/gdn/test_example_gdn_compilation.py b/examples/gdn/test_example_gdn_compilation.py new file mode 100644 index 000000000..f05fa49cd --- /dev/null +++ b/examples/gdn/test_example_gdn_compilation.py @@ -0,0 +1,206 @@ +import tilelang.testing +import torch + +tilelang.disable_cache() + +B = 1 +S = 32768 +H = 32 +DK = 128 +DV = 128 +input_dtype = "bfloat16" +output_dtype = "bfloat16" +accum_dtype = "float32" +gate_dtype = "float32" +state_dtype = "float32" +chunk_size = 64 +use_g = True +use_initial_state = True +store_final_state = True +use_final_state_gradient = True +save_new_value = True +block_DK = 64 +block_DV = 32 +threads = 128 +num_stages = 1 + + +def test_example_wy_fast_compilation(): + from example_wy_fast import tilelang_recompute_w_u_fwd, prepare_input, prepare_output + K, V, Beta, G, A = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + gate_dtype=getattr(torch, gate_dtype)) + W_tilelang, U_tilelang = prepare_output(B, S, H, DK, DV, getattr(torch, output_dtype)) + # tilelang + block_S = chunk_size + kernel = tilelang_recompute_w_u_fwd( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + gate_dtype, + accum_dtype, + chunk_size, + block_S=block_S, + block_DK=block_DK, + block_DV=block_DV, + threads=threads, + num_stages=num_stages) + print(kernel.get_kernel_source()) + W_tilelang, U_tilelang = kernel(K, V, Beta, G, A) + + +def test_example_wy_fast_bwd_split_compilation(): + from example_wy_fast_bwd_split import tilelang_wy_fast_bwd, tilelang_wy_fast_bwd_split, prepare_input, prepare_output + K, V, Beta, G, A, dw, du = prepare_input(B, S, H, DK, DV, chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, + accum_dtype), getattr(torch, gate_dtype), + getattr(torch, state_dtype)) + dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), + getattr(torch, state_dtype)) + BS = chunk_size + dA_tilelang = torch.empty(B, S, H, BS, dtype=getattr(torch, input_dtype)).cuda() + dbeta_tilelang_k = torch.empty(B, S, H, dtype=getattr(torch, output_dtype)).cuda() + 
dg_tilelang_A_positive = torch.empty(B, S, H, BS, dtype=getattr(torch, gate_dtype)).cuda() + dg_tilelang_A_negative = torch.empty(B, S, H, BS, dtype=getattr(torch, gate_dtype)).cuda() + + # tilelang + kernel = tilelang_wy_fast_bwd(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, + gate_dtype, state_dtype, chunk_size, block_DK, block_DV, threads, + num_stages) + dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = kernel( + K, V, Beta, G, A, dw, du) + torch.cuda.synchronize() + kernel_split = tilelang_wy_fast_bwd_split(B, S, H, DK, DV, input_dtype, output_dtype, + accum_dtype, gate_dtype, state_dtype, chunk_size, + block_DK, block_DV, threads, num_stages) + kernel_split(K, V, Beta, G, A, dw, du, dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang_k, + dg_tilelang_A_positive, dg_tilelang_A_negative) + torch.cuda.synchronize() + + dbeta_tilelang = dbeta_tilelang_k + dbeta_tilelang + dg_tilelang = dg_tilelang + dg_tilelang_A_positive.sum(dim=-1) - dg_tilelang_A_negative.sum( + dim=-1) + + +def test_example_chunk_o_compilation(): + from example_chunk_o import tilelang_chunk_fwd_o, prepare_input, prepare_output + Q, K, V, HIDDEN, G = prepare_input(B, S, H, DK, DV, chunk_size, getattr(torch, input_dtype), + getattr(torch, output_dtype), getattr(torch, accum_dtype), + getattr(torch, gate_dtype)) + scale = 1.0 / DK**0.5 + block_S = chunk_size + O_tilelang = prepare_output(B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype)) + kernel = tilelang_chunk_fwd_o(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, + gate_dtype, chunk_size, scale, use_g, block_S, block_DK, block_DV, + threads, num_stages) + O_tilelang = kernel(Q, K, V, HIDDEN, G) # noqa: F841 + + +def test_example_chunk_o_bwd_compilation(): + from example_chunk_o_bwd import tilelang_chunk_o_bwd_dqkwg, prepare_input, prepare_output + Q, K, V, h, G, dO, dh, dv, W = prepare_input(B, S, H, DK, DV, chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype)) + dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), + getattr(torch, state_dtype), block_DK) + kernel = tilelang_chunk_o_bwd_dqkwg(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, + gate_dtype, state_dtype, chunk_size, 1.0, use_g, True, + block_DK, block_DV, threads, num_stages) + dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = kernel(Q, K, V, h, G, dO, dh, dv, + W) # noqa: F841 + if use_g: + dg_tilelang = dg_tilelang.sum(dim=0) + + +def test_example_chunk_scaled_dot_kkt_compilation(): + from example_chunk_scaled_dot_kkt import tilelang_chunk_scaled_dot_kkt_fwd, prepare_input, prepare_output + K, Beta, G = prepare_input(B, S, H, DK, getattr(torch, input_dtype), + getattr(torch, output_dtype), getattr(torch, accum_dtype)) + A_tilelang = prepare_output(B, S, H, chunk_size, getattr(torch, output_dtype)) + block_S = chunk_size + kernel = tilelang_chunk_scaled_dot_kkt_fwd(B, S, H, DK, chunk_size, input_dtype, output_dtype, + accum_dtype, use_g, block_S, block_DK, threads, + num_stages) + A_tilelang = kernel(K, Beta, G) # noqa: F841 + + +def test_example_cumsum_compilation(): + from example_cumsum import tilelang_chunk_local_cumsum_scalar, prepare_cumsum_input, prepare_cumsum_output + G = prepare_cumsum_input(B, S, H, getattr(torch, gate_dtype)) + G_new_tilelang = prepare_cumsum_output(B, S, H, getattr(torch, gate_dtype)) + block_S = 
chunk_size + kernel = tilelang_chunk_local_cumsum_scalar( + B=B, + S=S, + H=H, + chunk_size=chunk_size, + reverse=False, + head_first=False, + input_dtype=gate_dtype, + output_dtype=gate_dtype, + block_S=block_S, + threads=threads, + use_fragment=False, + ) + G_new_tilelang = kernel(G) # noqa: F841 + + +def test_example_chunk_delta_h_compilation(): + from example_chunk_delta_h import tilelang_chunk_gated_delta_rule_fwd_h, prepare_input, prepare_output + K, W, U, G, initial_state = prepare_input(B, S, H, DK, DV, chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype)) + h_tilelang, final_state_tilelang, V_new_tilelang = prepare_output(B, S, H, DK, DV, chunk_size, + getattr(torch, output_dtype), + getattr(torch, state_dtype)) + kernel = tilelang_chunk_gated_delta_rule_fwd_h(B, S, H, DK, DV, input_dtype, output_dtype, + accum_dtype, gate_dtype, state_dtype, chunk_size, + use_g, use_initial_state, store_final_state, + save_new_value, block_DK, block_DV, threads, + num_stages) + h_tilelang, final_state_tilelang, V_new_tilelang = kernel(K, W, U, G, + initial_state) # noqa: F841 + + +def test_example_chunk_delta_bwd_compilation(): + from example_chunk_delta_bwd import tilelang_chunk_gated_delta_rule_bwd_dhu, prepare_input, prepare_output + Q, K, W, G, h0, dht, dO, dv = prepare_input(B, S, H, DK, DV, chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype)) + dh_tilelang, dh0_tilelang, dv2_tilelang = prepare_output(B, S, H, DK, DV, chunk_size, + getattr(torch, output_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype)) + kernel = tilelang_chunk_gated_delta_rule_bwd_dhu(B, S, H, DK, DV, input_dtype, output_dtype, + accum_dtype, gate_dtype, state_dtype, + chunk_size, 1.0, use_g, use_initial_state, + use_final_state_gradient, block_DV, threads, + num_stages) + dh_tilelang, dh0_tilelang, dv2_tilelang = kernel(Q, K, W, G, h0, dht, dO, dv) # noqa: F841 + + +if __name__ == "__main__": + tilelang.testing.main() diff --git a/examples/gdn/utils.py b/examples/gdn/utils.py new file mode 100644 index 000000000..d1048b392 --- /dev/null +++ b/examples/gdn/utils.py @@ -0,0 +1,40 @@ +import torch + + +def print_red_warning(message): + print(f"\033[31mWARNING: {message}\033[0m") + + +def calc_sim(x, y, name="tensor"): + x, y = x.data.double(), y.data.double() + denominator = (x * x + y * y).sum() + if denominator == 0: + print_red_warning(f'{name} all zero') + return 1 + sim = 2 * (x * y).sum() / denominator + return sim + + +def assert_similar(x, y, eps=1e-8, name="tensor", data="", raise_assert=True): + x_mask = torch.isfinite(x) + y_mask = torch.isfinite(y) + if not torch.all(x_mask == y_mask): + print_red_warning(f'{name} Error: isfinite mask mismatch') + if raise_assert: + raise AssertionError + if not torch.isclose( + x.masked_fill(x_mask, 0), y.masked_fill(y_mask, 0), rtol=0, atol=0, + equal_nan=True).all(): + print_red_warning(f'{name} Error: nonfinite value mismatch') + if raise_assert: + raise AssertionError + x = x.masked_fill(~x_mask, 0) + y = y.masked_fill(~y_mask, 0) + sim = calc_sim(x, y, name) + diff = 1. 
- sim + if not (0 <= diff <= eps): + print_red_warning(f'{name} Error: {diff}') + if raise_assert: + raise AssertionError + else: + print(f"{name} {data} passed") \ No newline at end of file diff --git a/examples/gemm/example_gemm_autotune.py b/examples/gemm/example_gemm_autotune.py index 733879b01..ce6eb6827 100644 --- a/examples/gemm/example_gemm_autotune.py +++ b/examples/gemm/example_gemm_autotune.py @@ -16,10 +16,7 @@ def ref_program(A, B): def get_configs(M, N, K, with_roller=False, topk=20): if with_roller: - if torch.version.hip is not None: - arch=CDNA("hip") - else: - arch = CUDA("cuda") + arch = CUDA("cuda") if torch.version.hip is None else CDNA("hip") carve_template = MatmulTemplate( M=M, N=N, diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8.py b/examples/gemm_fp8/example_tilelang_gemm_fp8.py index 365b10915..a403ed068 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_fp8.py +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8.py @@ -57,8 +57,8 @@ def test_gemm_fp8(M, N, K, dtype): def main(): - test_gemm_fp8(1024, 1024, 1024, 'e4m3_float8') - test_gemm_fp8(1024, 1024, 1024, 'e5m2_float8') + test_gemm_fp8(1024, 1024, 1024, 'float8_e4m3') + test_gemm_fp8(1024, 1024, 1024, 'float8_e5m2') if __name__ == "__main__": diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py b/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py index aa9e02ff9..1d9207aff 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py @@ -74,8 +74,8 @@ def test_gemm_fp8(M, N, K, dtype): def main(): - test_gemm_fp8(1024, 1024, 8192, 'e4m3_float8') - test_gemm_fp8(1024, 1024, 8192, 'e5m2_float8') + test_gemm_fp8(1024, 1024, 8192, 'float8_e4m3') + test_gemm_fp8(1024, 1024, 8192, 'float8_e5m2') if __name__ == "__main__": diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py b/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py index bec6775b0..1bfde7de4 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py @@ -40,8 +40,8 @@ def tl_matmul( ): assert in_dtype in [ "float16", - "e4m3_float8", - "e5m2_float8", + "float8_e4m3", + "float8_e5m2", "int8", ], "Currently only float16 and int8 are supported" assert out_dtype in [ @@ -52,7 +52,7 @@ def tl_matmul( micro_size_x = micro_size_y = micro_size_k = 16 - is_float8 = in_dtype in ["e4m3_float8", "e5m2_float8"] + is_float8 = in_dtype in ["float8_e4m3", "float8_e5m2"] if out_dtype == "int32" or is_float8: micro_size_k = 32 @@ -216,8 +216,8 @@ def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype): def main(): - assert_tl_matmul_correctness(128, 128, 128, "e4m3_float8", "float32", "float32") - assert_tl_matmul_correctness(128, 128, 128, "e5m2_float8", "float32", "float32") + assert_tl_matmul_correctness(128, 128, 128, "float8_e4m3", "float32", "float32") + assert_tl_matmul_correctness(128, 128, 128, "float8_e5m2", "float32", "float32") if __name__ == "__main__": diff --git a/examples/gemm_splitk/example_tilelang_gemm_splitk.py b/examples/gemm_splitk/example_tilelang_gemm_splitk.py index 8c0b6b0a9..c96669711 100644 --- a/examples/gemm_splitk/example_tilelang_gemm_splitk.py +++ b/examples/gemm_splitk/example_tilelang_gemm_splitk.py @@ -37,14 +37,6 @@ def main( T.copy(C_local, C_shared) - # TODO: Automatically add vectorized atomic with enhancement - # https://github.com/tile-ai/tilelang/issues/523 - # if DataType(dtype).bits == 16: - # for i, j in T.Parallel(block_M, 
block_N // 2): - # m, n = by * block_M + i, bx * block_N + j * 2 - # # vectorized atomic - # T.atomic_addx2(C[m, n], C_shared[i, j * 2]) - for i, j in T.Parallel(block_M, block_N): T.atomic_add(C[by * block_M + i, bx * block_N + j], C_shared[i, j]) diff --git a/examples/gemm_splitk/example_tilelang_gemm_splitk_vectorize_atomicadd.py b/examples/gemm_splitk/example_tilelang_gemm_splitk_vectorize_atomicadd.py new file mode 100644 index 000000000..145d622ed --- /dev/null +++ b/examples/gemm_splitk/example_tilelang_gemm_splitk_vectorize_atomicadd.py @@ -0,0 +1,70 @@ +import tilelang +import tilelang.language as T + + +@tilelang.jit +def matmul(M, + N, + K, + block_M, + block_N, + block_K, + split_k, + dtype="float16", + accum_dtype="float", + out_dtype="float32"): + + splitK = K // split_k + + @T.prim_func + def main( + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel( + T.ceildiv(N, block_N), T.ceildiv(M, block_M), split_k, threads=128) as (bx, by, bz): + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_K, block_N), dtype) + C_shared = T.alloc_shared((block_M, block_N), out_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + + T.clear(C_local) + for ko in T.Pipelined(T.ceildiv(splitK, block_K), num_stages=0): + T.copy(A[by * block_M, bz * splitK + ko * block_K], A_shared) + T.copy(B[bz * splitK + ko * block_K, bx * block_N], B_shared) + T.gemm(A_shared, B_shared, C_local) + + T.copy(C_local, C_shared) + + T.atomic_add(C[by * block_M, bx * block_N], C_shared) + + return main + + +def main(): + M = 1024 + N = 1024 + K = 1024 + block_M = 128 + block_N = 128 + block_K = 32 + split_k = 4 + + kernel = matmul(M, N, K, block_M, block_N, block_K, split_k) + + import torch + + torch.random.manual_seed(42) + a = torch.randn(M, K).cuda().half() + b = torch.randn(K, N).cuda().half() + c = torch.zeros(M, N).cuda().float() + kernel(a, b, c) + + ref_c = a @ b + + torch.testing.assert_close(c, ref_c.to(c.dtype), rtol=1e-2, atol=1e-2) + + +if __name__ == "__main__": + main() diff --git a/examples/gemm_splitk/test_example_gemm_splitk.py b/examples/gemm_splitk/test_example_gemm_splitk.py index 0fa1217bc..055b09162 100644 --- a/examples/gemm_splitk/test_example_gemm_splitk.py +++ b/examples/gemm_splitk/test_example_gemm_splitk.py @@ -1,10 +1,15 @@ import tilelang.testing -from example_tilelang_gemm_splitk import main +import example_tilelang_gemm_splitk +import example_tilelang_gemm_splitk_vectorize_atomicadd def test_example_tilelang_gemm_splitk(): - main() + example_tilelang_gemm_splitk.main() + + +def test_example_tilelang_gemm_splitk_vectorize_atomicadd(): + example_tilelang_gemm_splitk_vectorize_atomicadd.main() if __name__ == "__main__": diff --git a/examples/grouped_gemm/example_grouped_gemm_fwd.py b/examples/grouped_gemm/example_grouped_gemm_fwd.py index 14227bca6..f0dbd88c4 100644 --- a/examples/grouped_gemm/example_grouped_gemm_fwd.py +++ b/examples/grouped_gemm/example_grouped_gemm_fwd.py @@ -7,11 +7,6 @@ tilelang.disable_cache() -@tilelang.jit( - out_idx=[2], pass_configs={ - "tl.disable_tma_lower": True, - "tl.disable_warp_specialized": True - }) def torch_gmm(a, b, batch_sizes, batch_offsets_tensor, trans_b=False): """ Perform grouped matrix multiplication using PyTorch. 
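A grouped GEMM multiplies one flat, row-stacked input against a different weight matrix per group, with batch_sizes giving how many rows belong to each group. A minimal PyTorch sketch of that reference semantics (illustrative only; the argument layout and names below are assumptions, not the actual body of torch_gmm):

import torch

def grouped_matmul_ref(a, b, batch_sizes):
    # a: [sum(batch_sizes), K] row-stacked inputs; b: [num_groups, K, N] per-group weights
    outputs, start = [], 0
    for g, rows in enumerate(batch_sizes):
        outputs.append(a[start:start + rows] @ b[g])  # plain GEMM for group g
        start += rows
    return torch.cat(outputs, dim=0)

Passing tuple(batch_sizes_list) at the call site later in this file keeps the group sizes hashable, which a JIT kernel factory such as @tilelang.jit typically requires in order to cache compiled kernels per configuration.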
@@ -44,11 +39,7 @@ def torch_gmm(a, b, batch_sizes, batch_offsets_tensor, trans_b=False): return output -@tilelang.jit( - out_idx=[2], pass_configs={ - "tl.disable_tma_lower": True, - "tl.disable_warp_specialized": True - }) +@tilelang.jit(out_idx=[2]) def grouped_gemm(batch_sizes_list, K, N, @@ -150,7 +141,8 @@ def run_tilelang_grouped_gemm(batch_sizes_list, profile=False): padding_M = block_M batch_sum = sum(batch_sizes_list) - kernel = grouped_gemm(batch_sizes_list, K, M, block_M, block_N, block_K, num_stages, threads) + kernel = grouped_gemm( + tuple(batch_sizes_list), K, M, block_M, block_N, block_K, num_stages, threads) # print(kernel.get_kernel_source()) device = torch.device("cuda") diff --git a/examples/warp_specialize/example_warp_specialize_flashmla.py b/examples/warp_specialize/example_warp_specialize_flashmla.py index 0ccf2594e..844d655b2 100644 --- a/examples/warp_specialize/example_warp_specialize_flashmla.py +++ b/examples/warp_specialize/example_warp_specialize_flashmla.py @@ -9,6 +9,7 @@ tilelang.disable_cache() +@tilelang.jit(out_idx=[6]) def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split): scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e) dtype = "float16" @@ -27,39 +28,58 @@ def flash_attn( Output: T.Tensor([batch, heads, dim], dtype), ): with T.Kernel(heads // min(block_H, kv_group_num), batch, threads=256) as (hid, bid): + # smem_sQ Q_shared_l = T.alloc_shared([block_H, h_dim], dtype) Q_shared_r = T.alloc_shared([block_H, h_dim], dtype) - Q_pe_shared = T.alloc_shared([block_H, pe_dim], dtype) + Q_pe_local_0 = T.alloc_fragment([block_H, pe_dim], dtype) + Q_pe_local_1 = T.alloc_fragment([block_H, pe_dim], dtype) + + # smem_sK0 KV_shared_0_l = T.alloc_shared([block_N, h_dim], dtype) KV_shared_0_r = T.alloc_shared([block_N, h_dim], dtype) + K_pe_shared_0 = T.alloc_shared([block_N, pe_dim], dtype) + + # smem_sK1 KV_shared_1_l = T.alloc_shared([block_N, h_dim], dtype) KV_shared_1_r = T.alloc_shared([block_N, h_dim], dtype) - K_pe_shared_0 = T.alloc_shared([block_N, pe_dim], dtype) K_pe_shared_1 = T.alloc_shared([block_N, pe_dim], dtype) + + # smem_sP0 + SP0_shared = T.alloc_shared([block_H, block_N], dtype) + + # smem_sP1 reuse Q_pe_shared + SP1_shared = Q_pe_shared + + # smem_sM + scores_max = T.alloc_shared([block_H], accum_dtype) + + # smem_sScale0 + scores_scale_0 = T.alloc_shared([block_H], accum_dtype) + # smem_sScale1 + scores_scale_1 = T.alloc_shared([block_H], accum_dtype) + + logsum = T.alloc_shared([block_H], accum_dtype) + O_shared_l = Q_shared_l O_shared_r = Q_shared_r - S_shared = K_pe_shared_0 - S_shared_ = K_pe_shared_1 acc_s_0 = T.alloc_fragment([block_H, block_N], accum_dtype) + acc_s_0_cast = T.alloc_fragment([block_H, block_N], dtype) acc_s_1 = T.alloc_fragment([block_H, block_N], accum_dtype) + acc_s_1_cast = T.alloc_fragment([block_H, block_N], dtype) acc_o_l = T.alloc_fragment([block_H, h_dim], accum_dtype) acc_o_r = T.alloc_fragment([block_H, h_dim], accum_dtype) scores_max_0 = T.alloc_fragment([block_H], accum_dtype) scores_max_1 = T.alloc_fragment([block_H], accum_dtype) - scores_max = T.alloc_shared([block_H], accum_dtype) scores_max_prev_0 = T.alloc_fragment([block_H], accum_dtype) scores_max_prev_1 = T.alloc_fragment([block_H], accum_dtype) - scores_scale_0 = T.alloc_shared([block_H], accum_dtype) - scores_scale_1 = T.alloc_shared([block_H], accum_dtype) scores_sum_0 = T.alloc_fragment([block_H], accum_dtype) scores_sum_1 = T.alloc_fragment([block_H], accum_dtype) logsum_0 = 
T.alloc_fragment([block_H], accum_dtype) logsum_1 = T.alloc_fragment([block_H], accum_dtype) - logsum = T.alloc_shared([block_H], accum_dtype) cur_kv_head = hid // (kv_group_num // block_H) @@ -68,23 +88,25 @@ def flash_attn( O_shared_r: tilelang.layout.make_swizzled_layout(O_shared_r), }) + # barriers_Q + q_shared_ready_barrier = T.alloc_barrier(arrive_count=256) + + # barriers_K0 kv_shared_0_l_is_ready = T.alloc_barrier(arrive_count=128) kv_shared_0_r_is_ready = T.alloc_barrier(arrive_count=128) kv_shared_0_pe_is_ready = T.alloc_barrier(arrive_count=128) + # barriers_K1 kv_shared_1_l_is_ready = T.alloc_barrier(arrive_count=128) kv_shared_1_r_is_ready = T.alloc_barrier(arrive_count=128) kv_shared_1_pe_is_ready = T.alloc_barrier(arrive_count=128) + + # redundant barriers score_max_0_ready_barrier = T.alloc_barrier(arrive_count=128) scale_1_ready_barrier = T.alloc_barrier(arrive_count=128) p0_1_1_ready_barrier = T.alloc_barrier(arrive_count=128) lse_0_ready_barrier = T.alloc_barrier(arrive_count=128) lse_1_ready_barrier = T.alloc_barrier(arrive_count=128) s_shared_ready_barrier = T.alloc_barrier(arrive_count=128) - q_shared_ready_barrier = T.alloc_barrier(arrive_count=256) - k_pe_shared_1_free_barrier = T.alloc_barrier(arrive_count=128) - k_pe_shared_0_free_barrier = T.alloc_barrier(arrive_count=128) - s_shared_ready_barrier = T.alloc_barrier(arrive_count=128) - k_shared_1_l_free_barrier = T.alloc_barrier(arrive_count=128) tx = T.get_thread_binding() @@ -93,11 +115,13 @@ def flash_attn( T.copy(Q_pe[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.barrier_arrive(q_shared_ready_barrier) T.barrier_wait(q_shared_ready_barrier, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = T.ceildiv(seqlen_kv, (block_N * 2)) if tx < 128: + T.copy(Q_pe_shared, Q_pe_local_0) T.fill(acc_o_l, 0) T.fill(logsum_0, 0) @@ -118,26 +142,13 @@ def flash_attn( KV_shared_0_l, acc_s_0, transpose_B=True, - policy=T.GemmWarpPolicy.FullCol, clear_accum=True, wg_wait=-1) T.barrier_wait(kv_shared_0_r_is_ready, k % 2) - T.gemm( - Q_shared_r, - KV_shared_0_r, - acc_s_0, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol, - wg_wait=-1) + T.gemm(Q_shared_r, KV_shared_0_r, acc_s_0, transpose_B=True, wg_wait=-1) T.barrier_wait(kv_shared_0_pe_is_ready, k % 2) - T.gemm( - Q_pe_shared, - K_pe_shared_0, - acc_s_0, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol, - wg_wait=-1) + T.gemm(Q_pe_local_0, K_pe_shared_0, acc_s_0, transpose_B=True, wg_wait=-1) T.wait_wgmma(0) @@ -158,7 +169,7 @@ def flash_attn( T.reduce_sum(acc_s_0, scores_sum_0, dim=1) # Step 5. - T.copy(acc_s_0, S_shared) + T.copy(acc_s_0, acc_s_0_cast) for i, j in T.Parallel(block_H, h_dim): acc_o_l[i, j] *= scores_scale_0[i] @@ -167,7 +178,7 @@ def flash_attn( logsum_0[i] = logsum_0[i] * scores_scale_0[i] + scores_sum_0[i] # Step 6. - T.gemm(S_shared, KV_shared_0_l, acc_o_l, policy=T.GemmWarpPolicy.FullCol) + T.gemm(acc_s_0_cast, KV_shared_0_l, acc_o_l) T.barrier_arrive(score_max_0_ready_barrier) T.barrier_wait(scale_1_ready_barrier, k % 2) @@ -180,7 +191,7 @@ def flash_attn( # Step 11. for i, j in T.Parallel(block_H, block_N): - S_shared_[i, j] = acc_s_0[i, j] * scores_scale_1[i] + SP0_shared[i, j] = acc_s_0[i, j] * scores_scale_1[i] T.barrier_arrive(p0_1_1_ready_barrier) @@ -192,19 +203,15 @@ def flash_attn( T.barrier_wait(s_shared_ready_barrier, k % 2) # Step 14. 
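The scores_scale_* / logsum_* bookkeeping surrounding these steps is the standard FlashAttention-2 online-softmax update: each new KV tile raises the running row max, rescales the accumulated output and row sums by exp(m_old - m_new), and only then folds the tile's PV product in. A minimal single-tile sketch in PyTorch (base-e for clarity, whereas the kernel works in log2 space via the 1.44269504 factor; the names below are illustrative assumptions, not the kernel's buffers):

import torch

def online_softmax_tile(acc_o, m, l, scores, v):
    # acc_o: [H, D] running output; m: [H] running row max; l: [H] running row sum
    m_new = torch.maximum(m, scores.max(dim=-1).values)
    scale = torch.exp(m - m_new)                # rescale factor for the old state
    p = torch.exp(scores - m_new[:, None])      # unnormalized probabilities for this tile
    acc_o = acc_o * scale[:, None] + p @ v      # fold this tile's PV product into the output
    l = l * scale + p.sum(dim=-1)               # update the running row sum
    return acc_o, m_new, l

The division by the running row sum happens once after the main loop, which is why only the logsum values need to be carried across tiles.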
- T.gemm(S_shared, KV_shared_1_l, acc_o_l, policy=T.GemmWarpPolicy.FullCol) - T.barrier_arrive(k_pe_shared_0_free_barrier) - T.barrier_arrive(k_shared_1_l_free_barrier) + T.gemm(SP1_shared, KV_shared_1_l, acc_o_l) if k < loop_range - 1: - T.barrier_wait(k_shared_1_l_free_barrier, k % 2) T.copy( KV[bid, (2 * k + 3) * block_N:(2 * k + 4) * block_N, cur_kv_head, :h_dim], KV_shared_1_l) T.barrier_arrive(kv_shared_1_l_is_ready) - T.barrier_wait(k_pe_shared_1_free_barrier, k % 2) T.copy( K_pe[bid, (2 * k + 3) * block_N:(2 * k + 4) * block_N, cur_kv_head, :], K_pe_shared_1) @@ -220,6 +227,7 @@ def flash_attn( hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :h_dim]) else: + T.copy(Q_pe_shared, Q_pe_local_1) T.fill(acc_o_r, 0) T.fill(logsum_1, 0) @@ -239,27 +247,14 @@ def flash_attn( KV_shared_1_l, acc_s_1, transpose_B=True, - policy=T.GemmWarpPolicy.FullCol, clear_accum=True, wg_wait=-1) T.barrier_wait(kv_shared_1_r_is_ready, k % 2) - T.gemm( - Q_shared_r, - KV_shared_1_r, - acc_s_1, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol, - wg_wait=-1) + T.gemm(Q_shared_r, KV_shared_1_r, acc_s_1, transpose_B=True, wg_wait=-1) T.barrier_wait(kv_shared_1_pe_is_ready, k % 2) - T.gemm( - Q_pe_shared, - K_pe_shared_1, - acc_s_1, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol, - wg_wait=-1) + T.gemm(Q_pe_local_1, K_pe_shared_1, acc_s_1, transpose_B=True, wg_wait=-1) T.wait_wgmma(0) @@ -292,14 +287,10 @@ def flash_attn( T.barrier_arrive(scale_1_ready_barrier) # Step 10. compute O1 with KV_shared_1_rd - T.copy(acc_s_1, S_shared) + T.copy(acc_s_1, acc_s_1_cast) + T.gemm(acc_s_1_cast, KV_shared_1_r, acc_o_r, wg_wait=-1) + T.copy(acc_s_1_cast, SP1_shared) T.barrier_arrive(s_shared_ready_barrier) - T.gemm( - S_shared, - KV_shared_1_r, - acc_o_r, - policy=T.GemmWarpPolicy.FullCol, - wg_wait=-1) if k < loop_range - 1: T.copy( @@ -309,8 +300,7 @@ def flash_attn( T.barrier_wait(p0_1_1_ready_barrier, k % 2) # Step 12. - T.gemm(S_shared_, KV_shared_0_r, acc_o_r, policy=T.GemmWarpPolicy.FullCol) - T.barrier_arrive(k_pe_shared_1_free_barrier) + T.gemm(SP0_shared, KV_shared_0_r, acc_o_r) if k < loop_range - 1: @@ -319,7 +309,6 @@ def flash_attn( h_dim:], KV_shared_0_r) T.barrier_arrive(kv_shared_0_r_is_ready) - T.barrier_wait(k_pe_shared_0_free_barrier, k % 2) T.copy( K_pe[bid, (2 * k + 2) * block_N:(2 * k + 3) * block_N, cur_kv_head, :], K_pe_shared_0) @@ -401,8 +390,7 @@ def main(batch=1, heads=128, kv_heads=1, kv_ctx=8192, dim=512, pe_dim=64): BLOCK_H = 64 num_split = 1 - program = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split) - kernel = tilelang.compile(program, out_idx=[6]) + kernel = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01) latency = profiler.do_bench(warmup=500) diff --git a/format.sh b/format.sh index beec09b1d..223753ce4 100755 --- a/format.sh +++ b/format.sh @@ -18,6 +18,11 @@ builtin cd "$(dirname "${BASH_SOURCE:-$0}")" ROOT="$(git rev-parse --show-toplevel)" builtin cd "$ROOT" || exit 1 +# If yapf/ruff/codespell is not installed, install according to the requirements +if ! 
(yapf --version &>/dev/null && ruff --version &>/dev/null && codespell --version &>/dev/null); then + pip install -r requirements-lint.txt +fi + YAPF_VERSION=$(yapf --version | awk '{print $2}') RUFF_VERSION=$(ruff --version | awk '{print $2}') CODESPELL_VERSION=$(codespell --version) @@ -26,7 +31,7 @@ CODESPELL_VERSION=$(codespell --version) tool_version_check() { if [[ $2 != $3 ]]; then echo "Wrong $1 version installed: $3 is required, not $2." - exit 1 + pip install -r requirements-lint.txt fi } @@ -255,13 +260,4 @@ if ! git diff --quiet &>/dev/null; then exit 1 fi -if ! git diff --quiet &>/dev/null; then - echo 'Reformatted files. Please review and stage the changes.' - echo 'Changes not staged for commit:' - echo - git --no-pager diff --name-only - - exit 1 -fi - echo 'tile-lang: All checks passed' diff --git a/requirements-build.txt b/requirements-build.txt index 784cb6091..0c18991fd 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,4 +1,5 @@ # Should be mirrored in pyproject.toml +Cython build cmake>=3.26 packaging diff --git a/requirements-dev.txt b/requirements-dev.txt index 81884c279..293023104 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -21,7 +21,6 @@ ml_dtypes psutil scipy torch -thefuzz tabulate wheel setuptools \ No newline at end of file diff --git a/requirements-rocm.txt b/requirements-rocm.txt new file mode 100644 index 000000000..4c8df9c67 --- /dev/null +++ b/requirements-rocm.txt @@ -0,0 +1,29 @@ +# lint requirements +-r requirements-lint.txt +# build requirements +Cython +cmake>=3.26 +# runtime requirements +cffi +cpplint +Cython +docutils +dtlib +numpy>=1.23.5 +pytest>=6.2.4 +pytest_xdist>=2.2.1 +packaging>=21.0 +PyYAML +tqdm>=4.62.3 +typing_extensions>=4.10.0 +requests +cloudpickle +ml_dtypes +psutil +torch +tabulate +wheel +setuptools +einops +scipy +tornado diff --git a/requirements-test.txt b/requirements-test.txt index 6ff7cab5c..4c8df9c67 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,6 +1,7 @@ # lint requirements -r requirements-lint.txt # build requirements +Cython cmake>=3.26 # runtime requirements cffi @@ -20,13 +21,9 @@ cloudpickle ml_dtypes psutil torch -thefuzz tabulate wheel setuptools einops -attrs -decorator -flash-attn<=2.2.0 scipy tornado diff --git a/setup.py b/setup.py index c9cffd8bd..bc545eae9 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,12 @@ +import fcntl +import functools +import hashlib import io import subprocess import shutil from setuptools import setup, find_packages, Extension from setuptools.command.build_py import build_py from setuptools.command.sdist import sdist -from setuptools.command.develop import develop -import distutils.dir_util from typing import List, Optional import re import tarfile @@ -14,17 +15,14 @@ import os import sys import site -import hashlib import sysconfig -import functools import urllib.request -from distutils.version import LooseVersion +from packaging.version import Version import platform import multiprocessing from setuptools.command.build_ext import build_ext import importlib import logging -import fcntl # Configure logging with basic settings logging.basicConfig( @@ -117,7 +115,7 @@ def get_nvcc_cuda_version(): nvcc_output = subprocess.check_output(["nvcc", "-V"], universal_newlines=True) output = nvcc_output.split() release_idx = output.index("release") + 1 - nvcc_cuda_version = LooseVersion(output[release_idx].split(",")[0]) + nvcc_cuda_version = Version(output[release_idx].split(",")[0]) return nvcc_cuda_version @@ -128,7 +126,7 @@ def 
get_rocm_version(): # Example output: ROCM version: x.y.z-... match = re.search(r'ROCm Version: (\d+\.\d+\.\d+)', rocm_output) if match: - return LooseVersion(match.group(1)) + return Version(match.group(1)) else: rocm_path = os.environ.get("ROCM_PATH", "/opt/rocm") rocm_version_file = os.path.join(rocm_path, "lib", "cmake", "rocm", @@ -138,9 +136,9 @@ def get_rocm_version(): content = f.read() match = re.search(r'set\(PACKAGE_VERSION "(\d+\.\d+\.\d+)"', content) if match: - return LooseVersion(match.group(1)) + return Version(match.group(1)) # return a default - return LooseVersion("5.0.0") + return Version("5.0.0") def get_tilelang_version(with_cuda=True, with_system_info=True, with_commit_id=False) -> str: @@ -418,7 +416,7 @@ def run(self): target_dir = os.path.join(self.build_lib, item) if os.path.isdir(source_dir): self.mkpath(target_dir) - distutils.dir_util.copy_tree(source_dir, target_dir) + self.copy_tree(source_dir, target_dir) else: target_dir = os.path.dirname(target_dir) if not os.path.exists(target_dir): @@ -434,7 +432,7 @@ def run(self): target_dir = os.path.join(self.build_lib, PACKAGE_NAME, item) if os.path.isdir(source_dir): self.mkpath(target_dir) - distutils.dir_util.copy_tree(source_dir, target_dir) + self.copy_tree(source_dir, target_dir) else: target_dir = os.path.dirname(target_dir) if not os.path.exists(target_dir): @@ -511,7 +509,7 @@ def run(self): target_dir = os.path.join(self.build_lib, PACKAGE_NAME, item) if os.path.isdir(source_dir): self.mkpath(target_dir) - distutils.dir_util.copy_tree(source_dir, target_dir) + self.copy_tree(source_dir, target_dir) else: target_dir = os.path.dirname(target_dir) if not os.path.exists(target_dir): @@ -528,7 +526,7 @@ def run(self): target_dir = os.path.join(self.build_lib, PACKAGE_NAME, item) if os.path.isdir(source_dir): self.mkpath(target_dir) - distutils.dir_util.copy_tree(source_dir, target_dir) + self.copy_tree(source_dir, target_dir) else: target_dir = os.path.dirname(target_dir) if not os.path.exists(target_dir): @@ -544,7 +542,7 @@ def run(self): target_dir = os.path.join(self.build_lib, PACKAGE_NAME, item) if os.path.isdir(source_dir): self.mkpath(target_dir) - distutils.dir_util.copy_tree(source_dir, target_dir) + self.copy_tree(source_dir, target_dir) else: target_dir = os.path.dirname(target_dir) if not os.path.exists(target_dir): @@ -570,7 +568,7 @@ def run(self): if os.path.isdir(source_dir): self.mkpath(target_dir) - distutils.dir_util.copy_tree(source_dir, target_dir) + self.copy_tree(source_dir, target_dir) else: target_dir = os.path.dirname(target_dir) if not os.path.exists(target_dir): @@ -588,54 +586,6 @@ def make_distribution(self): super().make_distribution() -# ------------------------------------------------------------------------ -# NEW: Add a custom 'develop' command so that `pip install -e .` works. -# ------------------------------------------------------------------------ -class TileLangDevelopCommand(develop): - """ - Customized setuptools 'develop' command for an editable install. - Ensures the extension is built and all necessary assets are copied. - """ - - def run(self): - logger.info("Running TileLangDevelopCommand") - # 1. 
Build the C/C++ extension modules - self.run_command("build_ext") - - build_ext_cmd = self.get_finalized_command("build_ext") - ext_modules = build_ext_cmd.extensions - for ext in ext_modules: - extdir = build_ext_cmd.get_ext_fullpath(ext.name) - logger.info(f"Extension {ext.name} output directory: {extdir}") - - ext_output_dir = os.path.dirname(extdir) - logger.info(f"Extension output directory (parent): {ext_output_dir}") - - # Copy the built TVM to the package directory - TVM_PREBUILD_ITEMS = [ - f"{ext_output_dir}/libtvm_runtime.so", - f"{ext_output_dir}/libtvm.so", - f"{ext_output_dir}/libtilelang.so", - f"{ext_output_dir}/libtilelang_module.so", - ] - for item in TVM_PREBUILD_ITEMS: - source_lib_file = os.path.join(ROOT_DIR, item) - # only copy the file - file_name = os.path.basename(item) - target_dir = os.path.join(PACKAGE_NAME, file_name) - target_dir = os.path.dirname(target_dir) - target_dir = os.path.join(target_dir, "lib") - if not os.path.exists(target_dir): - os.makedirs(target_dir) - if os.path.exists(source_lib_file): - patch_libs(source_lib_file) - shutil.copy2(source_lib_file, target_dir) - # remove the original file - os.remove(source_lib_file) - else: - logger.info(f"INFO: {source_lib_file} does not exist.") - - class CMakeExtension(Extension): """ A specialized setuptools Extension class for building a CMake project. @@ -742,15 +692,15 @@ def build_cython(self, ext): with open(md5_path, "r") as f: cached_hash = f.read().strip() if cached_hash == code_hash: - logger.info("Cython jit adapter is up to date, no need to compile...") + logger.info("Cython JIT adapter is up to date, no need to compile...") need_compile = False else: - logger.info("Cython jit adapter is out of date, need to recompile...") + logger.info("Cython JIT adapter is out of date, need to recompile...") else: - logger.info("No cached version found for cython jit adapter, need to compile...") + logger.info("No cached version found for Cython JIT adapter, need to compile...") if need_compile: - logger.info("Waiting for lock to compile cython jit adapter...") + logger.info("Waiting for lock to compile Cython JIT adapter...") with open(lock_file, 'w') as lock: fcntl.flock(lock.fileno(), fcntl.LOCK_EX) try: @@ -765,7 +715,7 @@ def build_cython(self, ext): need_compile = False if need_compile: - logger.info("Compiling cython jit adapter...") + logger.info("Compiling Cython JIT adapter...") temp_path = cache_dir / f"temp_{code_hash}.so" with open(md5_path, "w") as f: @@ -786,7 +736,7 @@ def build_cython(self, ext): except Exception as e: if 'temp_path' in locals() and temp_path.exists(): temp_path.unlink() - raise Exception(f"Failed to compile cython jit adapter: {e}") from e + raise Exception(f"Failed to compile Cython JIT adapter: {e}") from e finally: if lock_file.exists(): lock_file.unlink() @@ -811,16 +761,31 @@ def build_cmake(self, ext): # Determine the directory where the final .so or .pyd library should go. extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) + # To make it compatible with in-place build and avoid redundant link during incremental build, + # we need to change the build destination to tilelang/lib, where it's actually loaded + if self.inplace: + extdir = os.path.abspath('./tilelang/lib/') + + logger.info(f"{extdir=}") + # Prepare arguments for the CMake configuration step. 
# -DCMAKE_LIBRARY_OUTPUT_DIRECTORY sets where built libraries go # -DPYTHON_EXECUTABLE ensures that the correct Python is used cmake_args = [ - f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}", f"-DPYTHON_EXECUTABLE={sys.executable}", - f"-DCMAKE_BUILD_TYPE={'Debug' if DEBUG_MODE else 'Release'}" + f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}", + f"-DPython_EXECUTABLE={sys.executable}", + f"-DCMAKE_BUILD_TYPE={'Debug' if DEBUG_MODE else 'Release'}", + "-G", + "Ninja", ] + if not USE_ROCM: + cmake_args.append(f"-DCMAKE_CUDA_COMPILER={os.path.join(CUDA_HOME, 'bin', 'nvcc')}") # Create the temporary build directory (if it doesn't exist). - build_temp = os.path.abspath(self.build_temp) + if self.inplace: + build_temp = os.path.abspath('./build') + else: + build_temp = os.path.abspath(self.build_temp) os.makedirs(build_temp, exist_ok=True) # Copy the default 'config.cmake' from the source tree into our build directory. @@ -882,6 +847,5 @@ def build_cmake(self, ext): "build_py": TileLangBuilPydCommand, "sdist": TileLangSdistCommand, "build_ext": TilelangExtensionBuild, - "develop": TileLangDevelopCommand, }, ) diff --git a/src/ir.cc b/src/ir.cc index 977df0695..a8589ba9d 100644 --- a/src/ir.cc +++ b/src/ir.cc @@ -6,7 +6,9 @@ #include "./transform/common/attr.h" #include "op/builtin.h" +#include "tvm/ffi/any.h" #include +#include #include namespace tvm { @@ -65,7 +67,7 @@ ForFrame ParallelFor(Array extents, Var var = vars[i]; body = For(var, dom->min, dom->extent, ForKind::kParallel, std::move(body), - /*thread_binding=*/NullOpt, /*annotations=*/annotations); + /*thread_binding=*/std::nullopt, /*annotations=*/annotations); } return body; }; @@ -99,7 +101,7 @@ ForFrame PipelinedFor(PrimExpr start, PrimExpr stop, int num_stages, anno.Set("tl_pipeline_group", groups); body = For(vars[0], doms[0]->min, doms[0]->extent, ForKind::kSerial, std::move(body), - /*thread_binding=*/NullOpt, /*annotations=*/anno); + /*thread_binding=*/std::nullopt, /*annotations=*/anno); return body; }; return ForFrame(n); @@ -157,7 +159,7 @@ ForFrame PersistentFor(Array domain, PrimExpr wave_size, Stmt()); Stmt outer = For(loop_var, 0, waves, ForKind::kSerial, - SeqStmt({out_if, body}), NullOpt, anno); + SeqStmt({out_if, body}), std::nullopt, anno); for (int i = 0; i < vars.size() - 1; ++i) { outer = tvm::tir::LetStmt(vars[i], idxs[i + 1], outer); } @@ -178,9 +180,10 @@ class KernelLaunchFrameNode : public TIRFrameNode { public: Array frames; - void VisitAttrs(tvm::AttrVisitor *v) { - TIRFrameNode::VisitAttrs(v); - v->Visit("frames", &frames); + static void RegisterReflection() { + namespace refl = tvm::ffi::reflection; + refl::ObjectDef().def_ro( + "frames", &KernelLaunchFrameNode::frames); } static constexpr const char *_type_key = "tl.KernelLaunchFrame"; @@ -213,14 +216,16 @@ class KernelLaunchFrame : public TIRFrame { }; KernelLaunchFrame KernelLaunch(Array grid_size, - Array block_size, - Map attrs) { + Optional> block_size_opt, + Map attrs) { ObjectPtr n = make_object(); // If the kernel is a CPU kernel, we don't need to launch any threads. 
bool is_cpu_kernel_frame = attrs.defined() && attrs.count(tilelang_is_cpu_kernel_frame); + auto block_size = block_size_opt.value_or(Array()); + if (is_cpu_kernel_frame) { // Launch CPU Kernel ICHECK(grid_size.size() >= 0); @@ -279,18 +284,23 @@ KernelLaunchFrame KernelLaunch(Array grid_size, TVM_REGISTER_NODE_TYPE(KernelLaunchFrameNode); -TVM_REGISTER_GLOBAL("tl.Parallel").set_body_typed(ParallelFor); -TVM_REGISTER_GLOBAL("tl.Pipelined").set_body_typed(PipelinedFor); -TVM_REGISTER_GLOBAL("tl.Persistent").set_body_typed(PersistentFor); -TVM_REGISTER_GLOBAL("tl.KernelLaunch").set_body_typed(KernelLaunch); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef() + .def("tl.Parallel", ParallelFor) + .def("tl.Pipelined", PipelinedFor) + .def("tl.Persistent", PersistentFor) + .def("tl.KernelLaunch", KernelLaunch); +}); class WarpSpecializeFrameNode : public TIRFrameNode { public: Array frames; - void VisitAttrs(tvm::AttrVisitor *v) { - TIRFrameNode::VisitAttrs(v); - v->Visit("frames", &frames); + static void RegisterReflection() { + namespace refl = tvm::ffi::reflection; + refl::ObjectDef().def_ro( + "frames", &WarpSpecializeFrameNode::frames); } static constexpr const char *_type_key = "tl.WarpSpecializeFrame"; @@ -359,7 +369,12 @@ WarpSpecializeFrame WarpSpecialize(Array warp_group_ids, } TVM_REGISTER_NODE_TYPE(WarpSpecializeFrameNode); -TVM_REGISTER_GLOBAL("tl.WarpSpecialize").set_body_typed(WarpSpecialize); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.WarpSpecialize", WarpSpecialize); + KernelLaunchFrameNode::RegisterReflection(); + WarpSpecializeFrameNode::RegisterReflection(); +}); } // namespace tl } // namespace tvm diff --git a/src/layout/layout.cc b/src/layout/layout.cc index 9a1a1e872..f682fd3ee 100644 --- a/src/layout/layout.cc +++ b/src/layout/layout.cc @@ -4,6 +4,7 @@ */ #include "layout.h" +#include #include #include @@ -73,9 +74,11 @@ Layout::Layout(Array input_size, Array forward_index) { data_ = std::move(n); } -void LayoutNode::VisitAttrs(AttrVisitor *v) { - v->Visit("input_size", &input_size_); - v->Visit("forward_index", &forward_index_); +void LayoutNode::RegisterReflection() { + namespace refl = tvm::ffi::reflection; + refl::ObjectDef() + .def_ro("input_size", &LayoutNode::input_size_) + .def_ro("forward_index", &LayoutNode::forward_index_); } void LayoutNode::UpdateAnalyzer(arith::Analyzer *analyzer) const { @@ -155,7 +158,7 @@ Fragment FragmentNode::Repeat(const Array &repeats, auto new_forward_thread = Substitute(forward_thread_, vmap) + thread_size * repeats_index; return Fragment(new_input_size, new_forward_index, new_forward_thread, - replicate_size_, NullOpt); + replicate_size_, std::nullopt); } else { ICHECK(OutputDim() == 1); PrimExpr frag_len = OutputShape()[0]; @@ -163,7 +166,7 @@ Fragment FragmentNode::Repeat(const Array &repeats, frag_len * repeats_index}; PrimExpr new_forward_thread = Substitute(forward_thread_, vmap); return Fragment(new_input_size, new_forward_index, new_forward_thread, - replicate_size_, NullOpt); + replicate_size_, std::nullopt); } } @@ -176,7 +179,7 @@ Fragment FragmentNode::Replicate(int repeats) const { Substitute(forward_thread_, vmap) + ThreadExtent() * FloorDiv(ReplicationPlaceholder(), ReplicateExtent()); return Fragment(input_size_, forward_index_, new_forward_thread, - ReplicateExtent() * repeats, NullOpt); + ReplicateExtent() * repeats, std::nullopt); } Fragment FragmentNode::DeReplicate() const { @@ -198,7 +201,7 @@ Fragment 
FragmentNode::DeReplicate() const { PrimExpr new_forward_thread = Substitute(forward_thread_, vmap); Array new_forward_index = {FloorDiv(forward_index_[0], factor)}; return Fragment(input_size_, new_forward_index, new_forward_thread, - int(*rep_size) / factor, NullOpt); + int(*rep_size) / factor, std::nullopt); } Fragment FragmentNode::BindThreadRange(Range thread_range) const { @@ -304,18 +307,11 @@ Fragment::Fragment(Array input_size, Array forward_index, data_ = std::move(n); } -void FragmentNode::VisitAttrs(tvm::AttrVisitor *v) { - LayoutNode::VisitAttrs(v); - v->Visit("forward_thread", &forward_thread_); - v->Visit("replicate_size", &replicate_size_); -} - PrimExpr FragmentNode::ThreadExtent() const { Array ret(OutputDim(), 1); arith::Analyzer analyzer; UpdateAnalyzer(&analyzer); auto ist = analyzer.int_set(forward_thread_ + 1); - // CHECK(is_one(ist.min())); return ist.max(); } @@ -435,64 +431,69 @@ bool FragmentNode::IsEqual(const FragmentNode *other, bool skip_index) const { return ret; } +void FragmentNode::RegisterReflection() { + namespace refl = tvm::ffi::reflection; + refl::ObjectDef() + .def_ro("forward_thread", &FragmentNode::forward_thread_) + .def_ro("replicate_size", &FragmentNode::replicate_size_); +} + TVM_REGISTER_NODE_TYPE(LayoutNode); TVM_REGISTER_NODE_TYPE(FragmentNode); -TVM_REGISTER_GLOBAL("tl.Layout").set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = Layout(Array(args[0]), Array(args[1])); -}); - -TVM_REGISTER_GLOBAL("tl.Layout_input_shape").set_body_typed([](Layout layout) { - return layout->InputShape(); -}); - -TVM_REGISTER_GLOBAL("tl.Layout_output_shape").set_body_typed([](Layout layout) { - return layout->OutputShape(); -}); - -TVM_REGISTER_GLOBAL("tl.Layout_inverse").set_body_typed([](Layout layout) { - return layout->Inverse(); -}); - -TVM_REGISTER_GLOBAL("tl.Layout_index").set_body_typed([](Layout layout) { - return layout->GetForwardIndex(); -}); - -TVM_REGISTER_GLOBAL("tl.Layout_forward_vars").set_body_typed([](Layout layout) { - return layout->GetForwardVars(); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef() + .def_packed("tl.Layout", + [](PackedArgs args, Any *rv) { + *rv = Layout(args[0].cast>(), + args[1].cast>()); + }) + .def("tl.Layout_input_shape", + [](Layout layout) { return layout->InputShape(); }) + .def("tl.Layout_output_shape", + [](Layout layout) { return layout->OutputShape(); }) + .def("tl.Layout_inverse", [](Layout layout) { return layout->Inverse(); }) + .def("tl.Layout_index", + [](Layout layout) { return layout->GetForwardIndex(); }) + .def("tl.Layout_forward_vars", + [](Layout layout) { return layout->GetForwardVars(); }) + .def_packed("tl.Fragment", + [](PackedArgs args, Any *rv) { + *rv = Fragment( + /*forward_var=*/args[0].cast>(), + /*forward_index=*/args[1].cast>(), + /*forward_thread=*/args[2].cast(), + /*thread_replicate=*/args[3].cast()); + }) + .def("tl.Fragment_thread_size", + [](Fragment fragment) { return fragment->ThreadExtent(); }) + .def("tl.Fragment_thread", + [](Fragment fragment) { return fragment->GetForwardThread(); }) + .def("tl.Fragment_repeat", + [](Fragment fragment, Array repeats, bool repeat_on_thread, + bool lower_dim_first) { + return fragment->Repeat(repeats, repeat_on_thread, + lower_dim_first); + }) + .def("tl.Fragment_replicate", + [](Fragment fragment, int repeats) { + return fragment->Replicate(repeats); + }) + .def("tl.Fragment_condense_rep_var", + [](Fragment fragment) { return fragment->CondenseReplicateVar(); }) + .def("tl.make_swizzled_layout", + 
[](int stride, int continuous, int element_size) { + return makeGemmABLayout(stride, continuous, continuous, + element_size, 0); + }); }); -TVM_REGISTER_GLOBAL("tl.Fragment").set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = Fragment(args[0], args[1], args[2], args[3]); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + LayoutNode::RegisterReflection(); + FragmentNode::RegisterReflection(); }); -TVM_REGISTER_GLOBAL("tl.Fragment_thread_size") - .set_body_typed([](Fragment fragment) { return fragment->ThreadExtent(); }); - -TVM_REGISTER_GLOBAL("tl.Fragment_thread").set_body_typed([](Fragment fragment) { - return fragment->GetForwardThread(); -}); - -TVM_REGISTER_GLOBAL("tl.Fragment_repeat") - .set_body_typed([](Fragment fragment, Array repeats, - bool repeat_on_thread, bool lower_dim_first) { - return fragment->Repeat(repeats, repeat_on_thread, lower_dim_first); - }); - -TVM_REGISTER_GLOBAL("tl.Fragment_replicate") - .set_body_typed([](Fragment fragment, int repeats) { - return fragment->Replicate(repeats); - }); - -TVM_REGISTER_GLOBAL("tl.Fragment_condense_rep_var") - .set_body_typed([](Fragment fragment) { - return fragment->CondenseReplicateVar(); - }); - -TVM_REGISTER_GLOBAL("tl.make_swizzled_layout") - .set_body_typed([](int stride, int continuous, int element_size) { - return makeGemmABLayout(stride, continuous, continuous, element_size, 0); - }); - } // namespace tl } // namespace tvm diff --git a/src/layout/layout.h b/src/layout/layout.h index 59647a007..fe2e809a7 100644 --- a/src/layout/layout.h +++ b/src/layout/layout.h @@ -44,7 +44,7 @@ class LayoutNode : public Object { static constexpr bool _type_has_method_sequal_reduce = true; static constexpr const char *_type_key = "tl.Layout"; bool SEqualReduce(const LayoutNode *other, SEqualReducer equal) const; - void VisitAttrs(tvm::AttrVisitor *v); + static void RegisterReflection(); TVM_DECLARE_BASE_OBJECT_INFO(LayoutNode, Object); protected: @@ -101,7 +101,7 @@ class FragmentNode : public LayoutNode { bool IsEqual(const FragmentNode *other, bool skip_index = false) const; - void VisitAttrs(tvm::AttrVisitor *v); + static void RegisterReflection(); bool SEqualReduce(const FragmentNode *other, SEqualReducer equal) const; static constexpr const char *_type_key = "tl.Fragment"; diff --git a/src/layout/swizzle.cc b/src/layout/swizzle.cc index 5c3096498..2da308038 100644 --- a/src/layout/swizzle.cc +++ b/src/layout/swizzle.cc @@ -97,8 +97,9 @@ SwizzledLayout::SwizzledLayout(Array input_size, data_ = std::move(n); } -void SwizzledLayoutNode::VisitAttrs(tvm::AttrVisitor *v) { - LayoutNode::VisitAttrs(v); +void SwizzledLayoutNode::RegisterReflection() { + namespace refl = tvm::ffi::reflection; + refl::ObjectDef(); } bool SwizzledLayoutNode::SEqualReduce(const SwizzledLayoutNode *other, diff --git a/src/layout/swizzle.h b/src/layout/swizzle.h index fd7185402..5f7f4f3dd 100644 --- a/src/layout/swizzle.h +++ b/src/layout/swizzle.h @@ -46,7 +46,7 @@ class SwizzledLayoutNode : public LayoutNode { bool IsEqual(const SwizzledLayoutNode *other, bool skip_index = false) const; static constexpr const char *_type_key = "tl.SwizzledLayout"; bool SEqualReduce(const SwizzledLayoutNode *other, SEqualReducer equal) const; - void VisitAttrs(tvm::AttrVisitor *v); + static void RegisterReflection(); TVM_DECLARE_FINAL_OBJECT_INFO(SwizzledLayoutNode, LayoutNode); private: diff --git a/src/layout/utils.cc b/src/layout/utils.cc index 3ceb52c72..83103fd1e 100644 --- a/src/layout/utils.cc +++ b/src/layout/utils.cc @@ -124,13 +124,17 @@ Array 
DivideUnusedIterators(const Array &exprs, Array results; for (const IterMark &mark : collector.visited_) { - ICHECK(mark->source.as()) << "Not a normalized iterator: " << mark; + if (!mark->source.as()) { + std::ostringstream oss; + oss << "Not a normalized iterator: " << mark; + throw NormalizeIterException(oss.str()); + } } for (const IterVar &iter : input_iters) { IterMark iv_mark; for (const IterMark &mark : collector.visited_) { - if (mark->source.as().same_as(iter->var)) { + if (mark->source.as()->same_as(iter->var)) { iv_mark = mark; break; } diff --git a/src/layout/utils.h b/src/layout/utils.h index b9175b277..87732bf97 100644 --- a/src/layout/utils.h +++ b/src/layout/utils.h @@ -14,6 +14,15 @@ namespace tl { using namespace tir; +class NormalizeIterException : public std::exception { +public: + const char *what() const noexcept override { return msg_.c_str(); } + NormalizeIterException(const std::string &msg) : msg_(msg) {} + +private: + std::string msg_; +}; + /*! * \brief Collect the IterSplit that is not used in expr. * diff --git a/src/op/atomic_add.cc b/src/op/atomic_add.cc new file mode 100644 index 000000000..4f8cfe3de --- /dev/null +++ b/src/op/atomic_add.cc @@ -0,0 +1,247 @@ +/*! + * \file tl/op/atomic_add.cc + * + * Define elment-wise operators. + */ + +#include "atomic_add.h" + +#include +#include +#include + +#include "../target/utils.h" +#include "../transform/atomicadd_vectorize.h" +#include "../transform/common/loop_fusion_utils.h" +#include "../transform/loop_partition.h" +#include "builtin.h" + +namespace tvm { +namespace tl { + +using namespace tir; + +static int GetArchInt(Target target) { + int arch_int = 0; + auto s = target->GetAttr("arch"); + ICHECK(s.defined()); + const char *arch_str = s.value().c_str(); + if (arch_str[0] == 's' && arch_str[1] == 'm' && arch_str[2] == '_') { + arch_int = atoi(&arch_str[3]); + } else { + arch_int = 0; + } + return arch_int; +} + +AtomicAdd::AtomicAdd(Array args, BufferMap vmap) : args_(args) { + Array rgs[2]; + Buffer bf[2]; + for (int i = 0; i < 2; i++) { + auto expr = args[i]; + auto call = expr.as(); + ICHECK(call); + auto region = RegionOp(call->args, vmap); + rgs[i] = region.GetRanges(); + bf[i] = region.GetBuffer(); + } + std::tie(this->src, this->dst) = std::tie(bf[0], bf[1]); + std::tie(this->src_range, this->dst_range) = std::tie(rgs[0], rgs[1]); + if (args.size() >= 3) { + coalesced_width = Downcast(args[2]); + } +} + +Array AtomicAdd::MakeIterVars() const { + Array loop_vars; + size_t idx = 0; + for (size_t i = 0; i < src_range.size(); i++) { + if (is_one(src_range[i]->extent)) + continue; + Var var = Var(std::string{char('i' + idx)}, src_range[i]->extent->dtype); + idx++; + loop_vars.push_back( + {Range(0, src_range[i]->extent), var, IterVarType::kDataPar}); + } + return loop_vars; +} + +// ivs: itervars returned by MakeIterVars() +// src_dst: 0 for src_indices, 1 for dst_indices +Array AtomicAdd::MakeIndices(const Array &ivs, + int src_dst) const { + Array indices; + Array ranges = src_dst == 0 ? 
src_range : dst_range; + size_t idx = 0; + for (size_t i = 0; i < ranges.size(); i++) { + if (is_one(ranges[i]->extent)) + indices.push_back(ranges[i]->min); + else { + indices.push_back(ranges[i]->min + ivs[idx]->var); + idx++; + } + } + ICHECK(idx == ivs.size()) + << "idx = " << idx << ", ivs.size() = " << ivs.size() + << "src name = " << src->name << ", dst name = " << dst->name; + return indices; +} + +PrimExpr AtomicAdd::MakePredicate(arith::Analyzer *analyzer, + const Array &ivs, + Array extents, int src_dst) const { + Array ranges = src_dst == 0 ? src_range : dst_range; + Array cond_list; + ICHECK(extents.size() == ranges.size()) << extents << " " << ranges; + size_t idx = 0; + for (size_t i = 0; i < ranges.size(); i++) { + if (is_one(ranges[i]->extent)) + continue; + PrimExpr cond = ranges[i]->min + ivs[idx]->var < extents[i]; + if (!analyzer->CanProve(cond, arith::ProofStrength::kSymbolicBound)) { + cond_list.push_back(cond); + } + cond = ranges[i]->min + ivs[idx]->var >= 0; + if (!analyzer->CanProve(cond, arith::ProofStrength::kSymbolicBound)) { + cond_list.push_back(cond); + } + idx++; + } + if (cond_list.empty()) + return {}; + else { + PrimExpr cond = cond_list[0]; + for (size_t i = 1; i < cond_list.size(); i++) + cond = And(cond, cond_list[i]); + return cond; + } +} + +For AtomicAdd::MakeSIMTLoop(arith::Analyzer *analyzer) const { + Array loop_vars = MakeIterVars(); + bool is_scalar = loop_vars.size() == 0; + if (is_scalar) { + return For(Var("i"), 0, 1, ForKind::kSerial, + BufferStore(dst, BufferLoad(src, {0}), {0})); + } + + for (const auto &iv : loop_vars) + analyzer->Bind(iv->var, iv->dom); + + ICHECK(loop_vars.size() <= src_range.size()) + << "loop_vars.size() = " << loop_vars.size() + << ", src_range.size() = " << src_range.size() << ", src = " << src->name + << ", dst = " << dst->name; + + ICHECK(loop_vars.size() <= dst_range.size()) + << "loop_vars.size() = " << loop_vars.size() + << ", dst_range.size() = " << dst_range.size() << ", src = " << src->name + << ", dst = " << dst->name; + + Array src_indices = MakeIndices(loop_vars, 0); + Array dst_indices = MakeIndices(loop_vars, 1); + + PrimExpr src_predicate = MakePredicate(analyzer, loop_vars, src->shape, 0); + PrimExpr dst_predicate = MakePredicate(analyzer, loop_vars, dst->shape, 1); + + Array new_args; + new_args.push_back(StringImm("AtomicAdd")); + + PrimExpr src_value = BufferLoad(src, src_indices); + if (src->dtype != dst->dtype) + src_value = Cast(dst->dtype, src_value); + if (src_predicate.defined()) + src_value = if_then_else(src_predicate, src_value, make_zero(dst->dtype)); + + PrimExpr dst_value = BufferLoad(dst, dst_indices); + if (dst_predicate.defined()) + dst_value = if_then_else(dst_predicate, dst_value, make_zero(dst->dtype)); + + Call address_of_value = + tvm::tir::Call(DataType::Handle(), builtin::address_of(), {dst_value}); + + new_args.push_back(address_of_value); + new_args.push_back(src_value); + + Call atomicadd_call = + tvm::tir::Call(dst->dtype, builtin::call_extern(), new_args); + + Stmt body = tvm::tir::Evaluate(atomicadd_call); + + for (int i = loop_vars.size() - 1; i >= 0; i--) { + Map annotations = {}; + if (coalesced_width.defined()) { + annotations.Set("coalesced_width", coalesced_width); + } + + body = For(loop_vars[i]->var, 0, loop_vars[i]->dom->extent, + ForKind::kParallel, body, std::nullopt, annotations); + } + return Downcast(body); +} + +Stmt AtomicAdd::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const { + Target target = T.target; + bool is_cpu_target = 
target->GetTargetDeviceType() == kDLCPU; + auto simt_loop = MakeSIMTLoop(analyzer); + auto fused_loop = Downcast(ParallelLoopFuser::Fuse(simt_loop)); + For vectorized_thread_loop; + auto par_op = std::make_unique(fused_loop); + + if (!is_cpu_target) { + std::vector levels = {InferLevel::kCommon, InferLevel::kStrict, + InferLevel::kFree}; + for (auto level : levels) { + par_op->InferLayout( + {T.target, T.thread_bounds, T.layout_map, T.buffer_remap}, level); + } + auto loop_layout = par_op->GetLoopLayout(); + Var thread_var = T.thread_var; + Range thread_bounds = T.thread_bounds; + auto thread_loop = + PartitionLoop(par_op->GetRoot(), T.thread_var, analyzer, loop_layout); + vectorized_thread_loop = VectorizeAtomicAdd( + thread_loop, thread_var, thread_bounds, GetArchInt(target)); + } + + if (par_op->GetPredicate(T.thread_var).defined()) { + return IfThenElse(par_op->GetPredicate(T.thread_var).value(), + vectorized_thread_loop); + } + + return vectorized_thread_loop; +} + +LayoutMap AtomicAdd::InferLayout(const LayoutInferArgs &T, InferLevel level) { + if (par_op_ == nullptr) { + arith::Analyzer analyzer; + par_op_ = std::make_unique(MakeSIMTLoop(&analyzer)); + } + if (T.layout_map.count(src) && T.layout_map.count(dst)) { + if (src.scope() == "local.fragment" && dst.scope() == "local.fragment") { + const FragmentNode *src_layout = T.layout_map[src].as(); + const FragmentNode *dst_layout = T.layout_map[dst].as(); + if (src_layout && dst_layout) { + ICHECK(src_layout->IsEqual(dst_layout, true)) + << "Get different layout for " << src << " and " << dst + << "\nLHS = " << src_layout->DebugOutput() + << "\nRHS = " << dst_layout->DebugOutput() + << "\nYou may need to use a shared memory to transform the layout"; + } + } + } + return par_op_->InferLayout(T, level); +} + +TIR_REGISTER_TL_OP(AtomicAdd, atomicadd) + .set_num_inputs(2) + .set_attr("TCallEffectKind", + Integer(CallEffectKind::kOpaque)); + +// TVM_REGISTER_OP("tl.atomicadd") +// .set_num_inputs(2) +// .add_argument("ref", "Buffer", "The destination buffer") +// .add_argument("val", "Expr", "The value to be added atomically"); + +} // namespace tl +} // namespace tvm \ No newline at end of file diff --git a/src/op/atomic_add.h b/src/op/atomic_add.h new file mode 100644 index 000000000..b8bb0dd97 --- /dev/null +++ b/src/op/atomic_add.h @@ -0,0 +1,62 @@ +/*! + * \file tl/op/atomic_add.h + * \brief Define atomic add operator. 
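// The lowering above reduces each element of the parallel loop to one extern
// call of the form AtomicAdd(&dst[idx], src_value). As a minimal host-side
// analogy of that per-element semantics (plain C++ with std::atomic and a CAS
// retry loop; the generated kernel uses device atomic intrinsics, not this
// code):

#include <atomic>
#include <cstdio>

// Atomically accumulate `val` into `*dst`, like the emitted AtomicAdd call.
void AtomicAddF32(std::atomic<float> *dst, float val) {
  float cur = dst->load(std::memory_order_relaxed);
  // compare_exchange_weak reloads `cur` on failure, so the loop retries with
  // the freshest value until the update wins.
  while (!dst->compare_exchange_weak(cur, cur + val,
                                     std::memory_order_relaxed)) {
  }
}

int main() {
  std::atomic<float> acc{0.0f};
  for (int i = 0; i < 4; ++i) AtomicAddF32(&acc, 0.5f);
  std::printf("acc = %.1f\n", acc.load());  // 2.0
  return 0;
}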
+ * + */ + +#ifndef TVM_TL_OP_ATOMIC_ADD_H_ +#define TVM_TL_OP_ATOMIC_ADD_H_ + +#include "op.h" +#include "parallel.h" + +namespace tvm { +namespace tl { + +using namespace tir; + +class AtomicAdd : public Operator { +public: + AtomicAdd(Array args, BufferMap vmap); + Stmt Lower(const LowerArgs &T, arith::Analyzer *analyzer) const final; + LayoutMap InferLayout(const LayoutInferArgs &T, InferLevel level) final; + + static const Op &Get(); + + AtomicAdd(const AtomicAdd &other) + : args_(other.args_), src(other.src), dst(other.dst), + src_range(other.src_range), dst_range(other.dst_range), + coalesced_width(other.coalesced_width) { + // No clone nullptr + if (other.par_op_) + par_op_ = std::unique_ptr( + static_cast(other.par_op_->Clone().release())); + } + std::unique_ptr Clone() const final { + return std::make_unique(*this); + } + +protected: + For MakeSIMTLoop(arith::Analyzer *analyzer) const; + Array MakeIterVars() const; + + // ivs: itervars returned by MakeIterVars() + // src_dst: 0 for src_indices, 1 for dst_indices + Array MakeIndices(const Array &ivs, int src_dst) const; + + PrimExpr MakePredicate(arith::Analyzer *analyzer, const Array &ivs, + Array extents, int src_dst) const; + + Array args_; + + Buffer src, dst; + Array src_range, dst_range; + IntImm coalesced_width; + + std::unique_ptr par_op_; +}; + +} // namespace tl +} // namespace tvm + +#endif // TVM_TL_OP_ATOMIC_ADD_H_ \ No newline at end of file diff --git a/src/op/builtin.cc b/src/op/builtin.cc index c4aa81d81..2b63fc850 100644 --- a/src/op/builtin.cc +++ b/src/op/builtin.cc @@ -25,7 +25,9 @@ TVM_REGISTER_PASS_CONFIG_OPTION(kDisableDynamicTailSplit, Bool); TVM_REGISTER_PASS_CONFIG_OPTION(kDynamicAlignment, Integer); TVM_REGISTER_PASS_CONFIG_OPTION(kEnableAggressiveSharedMemoryMerge, Bool); TVM_REGISTER_PASS_CONFIG_OPTION(kDisableFastMath, Bool); +TVM_REGISTER_PASS_CONFIG_OPTION(kPtxasRegisterUsageLevel, Integer); TVM_REGISTER_PASS_CONFIG_OPTION(kEnablePTXASVerboseOutput, Bool); +TVM_REGISTER_PASS_CONFIG_OPTION(kDisableShuffleElect, Bool); #define TIR_DEFINE_TL_BUILTIN(OpName) \ const Op &OpName() { \ @@ -87,7 +89,7 @@ TIR_DEFINE_TL_BUILTIN(ptx_stmatirx) Integer(CallEffectKind::kOpaque)); TIR_DEFINE_TL_BUILTIN(sync_thread_partial) - .set_num_inputs(1) + .set_num_inputs(2) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); @@ -130,5 +132,37 @@ TIR_DEFINE_TL_BUILTIN(loop_break) .set_num_inputs(0) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + +TIR_DEFINE_TL_BUILTIN(tl_gemm).set_num_inputs(4).set_attr( + "TCallEffectKind", Integer(CallEffectKind::kOpaque)); + +TIR_DEFINE_TL_BUILTIN(tl_gemm_sp) + .set_num_inputs(5) + .set_attr("TCallEffectKind", + Integer(CallEffectKind::kOpaque)); + +TIR_DEFINE_TL_BUILTIN(tvm_mfma).set_num_inputs(12).set_attr( + "TCallEffectKind", Integer(CallEffectKind::kOpaque)); + +TIR_DEFINE_TL_BUILTIN(tvm_mfma_store) + .set_num_inputs(6) + .set_attr("TCallEffectKind", + Integer(CallEffectKind::kOpaque)); + +TIR_DEFINE_TL_BUILTIN(tvm_rdna_wmma) + .set_num_inputs(12) + .set_attr("TCallEffectKind", + Integer(CallEffectKind::kOpaque)); + +TIR_DEFINE_TL_BUILTIN(tvm_rdna_wmma_store) + .set_num_inputs(6) + .set_attr("TCallEffectKind", + Integer(CallEffectKind::kOpaque)); + +TIR_DEFINE_TL_BUILTIN(tl_shuffle_elect) + .set_num_inputs(1) + .set_attr("TCallEffectKind", + Integer(CallEffectKind::kPure)); + } // namespace tl } // namespace tvm diff --git a/src/op/builtin.h b/src/op/builtin.h index e368d847c..309d2bac1 100644 --- a/src/op/builtin.h +++ b/src/op/builtin.h @@ -28,9 
+28,11 @@ static constexpr const char *kConfigIndexBitwidth = "tl.config_index_bitwidth"; static constexpr const char *kEnableAggressiveSharedMemoryMerge = "tl.enable_aggressive_shared_memory_merge"; static constexpr const char *kDisableFastMath = "tl.disable_fast_math"; +static constexpr const char *kPtxasRegisterUsageLevel = + "tl.ptxas_register_usage_level"; static constexpr const char *kEnablePTXASVerboseOutput = "tl.enable_ptxas_verbose_output"; - +static constexpr const char *kDisableShuffleElect = "tl.disable_shuffle_elect"; /*! * \brief Whether to disable dynamic tail split * @@ -277,6 +279,28 @@ TVM_DLL const Op &tvm_rdna_wmma(); */ TVM_DLL const Op &tvm_rdna_wmma_store(); +/*! + * \brief tilelang intrinsic for general matrix multiplication (GEMM). + * + * This op is used to represent a generic GEMM operation in tilelang. + */ +TVM_DLL const Op &tl_gemm(); + +/*! + * \brief tilelang intrinsic for sparse matrix multiplication (GEMM with + * sparsity). + * + * This op is used to represent a sparse GEMM operation in tilelang. + */ +TVM_DLL const Op &tl_gemm_sp(); + +/*! + * \brief tilelang intrinsic for shuffle elect. + * + * This op is used to represent a shuffle elect operation in tilelang. + */ +TVM_DLL const Op &tl_shuffle_elect(); + } // namespace tl } // namespace tvm diff --git a/src/op/bulk_copy.cc b/src/op/bulk_copy.cc index 007a3ff01..b0d90d7d1 100644 --- a/src/op/bulk_copy.cc +++ b/src/op/bulk_copy.cc @@ -40,7 +40,7 @@ static int to_CUtensorMapDataType(DataType dtype) { } } else if (dtype.is_bfloat16()) { tp = CU_TENSOR_MAP_DATA_TYPE_BFLOAT16; - } else if (dtype.is_e4m3_float8() or dtype.is_e5m2_float8()) { + } else if (dtype.is_float8_e4m3() || dtype.is_float8_e5m2()) { tp = CU_TENSOR_MAP_DATA_TYPE_UINT8; } else if (dtype.is_int()) { switch (dtype.bits()) { @@ -111,6 +111,12 @@ Stmt Copy::LowerBulkCopy(const LowerArgs &T, arith::Analyzer *analyzer) const { return Stmt(); } + if (T.layout_map.count(global_tensor)) { + LOG(WARNING) << "TMA bulk copy cannot support a non-swizzled global " + "layout, fallback to normal copy."; + return Stmt(); + } + Array indices; for (auto r : shared_range) indices.push_back(r->min); @@ -267,7 +273,9 @@ Stmt Copy::LowerBulkCopy(const LowerArgs &T, arith::Analyzer *analyzer) const { << "inner_box_dim: " << *inner_box_dim << " is not divisible by 256"; instruction_dim = 256; } - ICHECK((*inner_box_dim) % instruction_dim == 0); + ICHECK((*inner_box_dim) % instruction_dim == 0) + << "inner_box_dim: " << *inner_box_dim + << " is not divisible by instruction_dim: " << instruction_dim; desc.smem_box.Set(0, PrimExpr(instruction_dim)); int inner_box_dim_ = instruction_dim * shared_tensor->dtype.bytes(); @@ -289,7 +297,7 @@ Stmt Copy::LowerBulkCopy(const LowerArgs &T, arith::Analyzer *analyzer) const { Call(DataType::Handle(), create_tma_descriptor(), desc.EncodeCallArgs()); Array args; - args.reserve(desc.rank + 3); + args.reserve(desc.rank + 4); args.push_back(create_descriptor); if (is_load) args.push_back(0); // mbarrier id placeholder @@ -311,6 +319,7 @@ Stmt Copy::LowerBulkCopy(const LowerArgs &T, arith::Analyzer *analyzer) const { global_coords.Set(0, global_coords[0] + instruction_dim * loop_var); for (auto coord : global_coords) args.push_back(coord); + args.push_back(this->eviction_policy); tma_copy = For(loop_var, 0, loop_extent, ForKind::kUnrolled, Evaluate(Call(DataType::Handle(), op, args))); } else { @@ -319,6 +328,7 @@ Stmt Copy::LowerBulkCopy(const LowerArgs &T, arith::Analyzer *analyzer) const { args.push_back(shared_addr); for (auto 
coord : global_coords) args.push_back(coord); + args.push_back(this->eviction_policy); tma_copy = Evaluate(Call(DataType::Handle(), op, args)); } tma_copy = IfThenElse(EQ(T.thread_var, T.thread_bounds->min), tma_copy); @@ -360,6 +370,7 @@ Conv2DIm2ColOp::Conv2DIm2ColOp(Array args, BufferMap vmap) { stride = args[5].as().value()->value; dilation = args[6].as().value()->value; padding = args[7].as().value()->value; + eviction_policy = args[8].as().value()->value; } Stmt Conv2DIm2ColOp::Lower(const LowerArgs &T, @@ -469,7 +480,7 @@ Stmt Conv2DIm2ColOp::Lower(const LowerArgs &T, FloorDiv(nhw_step * desc.smem_box_pixel, w_dim * h_dim)); Array args; - args.reserve(desc.rank * 2 + 1); + args.reserve(desc.rank * 2 + 2); args.push_back(create_desc); args.push_back(0); // mbar placeholder auto dst_buffer = T.buffer_remap.count(dst) ? T.buffer_remap[dst] : dst; @@ -479,7 +490,7 @@ Stmt Conv2DIm2ColOp::Lower(const LowerArgs &T, args.push_back(coord); for (auto offset : image_offset) args.push_back(offset); - + args.push_back(this->eviction_policy); Stmt tma_copy = IfThenElse(EQ(T.thread_var, T.thread_bounds->min), Evaluate(Call(DataType::Handle(), tma_load_im2col(), args))); @@ -514,7 +525,7 @@ Array TMAIm2ColDesc::EncodeCallArgs() const { } TIR_REGISTER_TL_OP(Conv2DIm2ColOp, c2d_im2col) - .set_num_inputs(8) + .set_num_inputs(9) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); diff --git a/src/op/bulk_copy.h b/src/op/bulk_copy.h index 279f17925..bd7be30dd 100644 --- a/src/op/bulk_copy.h +++ b/src/op/bulk_copy.h @@ -51,9 +51,13 @@ class Conv2DIm2ColOp : public Operator { Stmt Lower(const LowerArgs &T, arith::Analyzer *analyzer) const final; static const Op &Get(); + std::unique_ptr Clone() const final { + return std::make_unique(*this); + } + private: Buffer src, dst; - int stride, padding, dilation, kernel; + int stride, padding, dilation, kernel, eviction_policy; PrimExpr nhw_step, c_step; }; diff --git a/src/op/elem.cc b/src/op/elem.cc index e31cb5f5a..f2a1366a7 100644 --- a/src/op/elem.cc +++ b/src/op/elem.cc @@ -45,6 +45,9 @@ Copy::Copy(Array args, BufferMap vmap) : args_(args) { auto disable_tma = Downcast(args[3]); this->disable_tma = disable_tma; } + if (args.size() >= 5) { + this->eviction_policy = args[4].as()->value; + } } Array Copy::MakeIterVars() const { @@ -154,7 +157,7 @@ For Copy::MakeSIMTLoop(arith::Analyzer *analyzer) const { annotations.Set("coalesced_width", coalesced_width); } body = For(loop_vars[i]->var, 0, loop_vars[i]->dom->extent, - ForKind::kParallel, body, NullOpt, annotations); + ForKind::kParallel, body, std::nullopt, annotations); } return Downcast(body); } @@ -254,12 +257,12 @@ Stmt Copy::LowerLDSMCopy(const LowerArgs &T, arith::Analyzer *analyzer) const { IterVar col_var = loop_vars[loop_vars.size() - 1]; IterVar row_var = loop_vars[loop_vars.size() - 2]; PrimExpr local_layout_thread_map = - FloorMod(local_layout->ForwardThread(local_indices, NullOpt), 32); + FloorMod(local_layout->ForwardThread(local_indices, std::nullopt), 32); PrimExpr matrix_8x8_thread_map = makeGemmFragment8x8()->ForwardThread( - {FloorMod(row_var, 8), FloorMod(col_var, 8)}, NullOpt); + {FloorMod(row_var, 8), FloorMod(col_var, 8)}, std::nullopt); PrimExpr matrix_8x8_thread_map_trans = makeGemmFragment8x8Transposed()->ForwardThread( - {FloorMod(row_var, 8), FloorMod(col_var, 8)}, NullOpt); + {FloorMod(row_var, 8), FloorMod(col_var, 8)}, std::nullopt); PrimExpr local_indices_flattened = local_tensor.OffsetOf(local_indices_transformed).back(); if 
(analyzer->CanProveEqual(matrix_8x8_thread_map, local_layout_thread_map) && @@ -373,20 +376,6 @@ LayoutMap Copy::InferLayout(const LayoutInferArgs &T, InferLevel level) { arith::Analyzer analyzer; par_op_ = std::make_unique(MakeSIMTLoop(&analyzer)); } - if (T.layout_map.count(src) && T.layout_map.count(dst)) { - // Only compare fragment layout - if (src.scope() == "local.fragment" && dst.scope() == "local.fragment") { - const FragmentNode *src_layout = T.layout_map[src].as().get(); - const FragmentNode *dst_layout = T.layout_map[dst].as().get(); - if (src_layout && dst_layout) { - ICHECK(src_layout->IsEqual(dst_layout, true)) - << "Get different layout for " << src << " and " << dst - << "\nLHS = " << src_layout->DebugOutput() - << "\nRHS = " << dst_layout->DebugOutput() - << "\nYou may need to use a shared memory to transform the layout"; - } - } - } return par_op_->InferLayout(T, level); } @@ -491,7 +480,7 @@ Stmt Fill::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const { } TIR_REGISTER_TL_OP(Copy, copy) - .set_num_inputs(3) + .set_num_inputs(4) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); diff --git a/src/op/elem.h b/src/op/elem.h index a3d422917..6616236d4 100644 --- a/src/op/elem.h +++ b/src/op/elem.h @@ -23,6 +23,19 @@ class Copy : public Operator { static const Op &Get(); + Copy(const Copy &other) + : args_(other.args_), src(other.src), dst(other.dst), + src_range(other.src_range), dst_range(other.dst_range), + coalesced_width(other.coalesced_width), disable_tma(other.disable_tma) { + // No clone nullptr + if (other.par_op_) + par_op_ = std::unique_ptr( + static_cast(other.par_op_->Clone().release())); + } + std::unique_ptr Clone() const final { + return std::make_unique(*this); + } + protected: Stmt LowerBulkCopy(const LowerArgs &T, arith::Analyzer *analyzer) const; Stmt LowerLDSMCopy(const LowerArgs &T, arith::Analyzer *analyzer) const; @@ -45,6 +58,8 @@ class Copy : public Operator { Bool disable_tma = Bool(false); std::unique_ptr par_op_; + + int eviction_policy; }; class Fill : public Operator { @@ -53,6 +68,10 @@ class Fill : public Operator { Stmt Lower(const LowerArgs &T, arith::Analyzer *analyzer) const final; static const Op &Get(); + std::unique_ptr Clone() const final { + return std::make_unique(*this); + } + private: For MakeSIMTLoop(arith::Analyzer *analyzer) const; tir::Buffer dst; diff --git a/src/op/gemm.cc b/src/op/gemm.cc index edca2bf66..065e664e5 100644 --- a/src/op/gemm.cc +++ b/src/op/gemm.cc @@ -47,29 +47,90 @@ Gemm::Gemm(Array args, BufferMap vmap) { K = args[7].as().value()->value; policy = static_cast(args[8].as().value()->value); clear_accum = args[9].as().value(); - if (args.size() > 10) { - kPack = args[10].as().value()->value; + stride_A = args[10].as().value()->value; + stride_B = args[11].as().value()->value; + offset_A = args[12].as().value()->value; + offset_B = args[13].as().value()->value; + if (args.size() > 14) { + kPack = args[14].as().value()->value; if (kPack != 1 && kPack != 2) { ICHECK(false) << "kPack must be 1 or 2"; } } - if (args.size() > 11) { - wg_wait = args[11].as().value()->value; + if (args.size() > 15) { + wg_wait = args[15].as().value()->value; } } -std::pair Gemm::ComputeWarpPartition(int num_warps, Target target, - bool maybe_hopper_wgmma) const { +Gemm::GemmInst Gemm::GetGemmInst(int block_size, Target target) const { + int warp_size = TargetGetWarpSize(target); + int num_warps = block_size / warp_size; + bool allow_wgmma = TargetIsHopper(target) && (this->M >= 64) && + (num_warps % 4 == 0) && 
CheckWGMMA(); + if (allow_wgmma) { + return GemmInst::kWGMMA; + } else if (TargetIsCDNA(target)) { + return GemmInst::kMFMA; + } else if (TargetIsCuda(target)) { + return GemmInst::kMMA; + } else { + ICHECK(0) << "Unsupported target for gemm: " << target->str(); + } +} + +/** + * @brief Compute how warps are partitioned between the M and N GEMM dimensions. + * + * Determines the number of warps assigned to the M (rows) and N (columns) + * dimensions for a block given the selected GEMM implementation and target. + * The function enforces constraints required by the implementations (e.g., + * per-warp tile sizes) and adapts the partition according to the configured + * GemmWarpPolicy (FullRow, FullCol, Square). + * + * @param block_size Total number of threads in the block (used to derive num_warps). + * @param gemm_inst The chosen GEMM implementation (e.g., kWGMMA, kMFMA, kMMA). + * @param target Target device information (used for warp size and target-specific rules). + * @return std::pair {m_warp, n_warp} where m_warp * n_warp == num_warps. + * + * Constraints and behavior: + * - Each warp is assumed to cover 16 rows (M) and 8 columns (N). The function + * checks that M % 16 == 0 and N % 8 == 0. + * - num_warps is computed as block_size / warp_size(target). + * - For WGMMA (kWGMMA): + * - num_warps must be a multiple of 4 (warp-groups of 4). + * - m_warp is always a multiple of 4. + * - The warp partition respects the GemmWarpPolicy: + * - FullRow: maximize warps on M (in multiples of 4) while keeping divisibility. + * - FullCol: maximize warps on N, but if N is not evenly divisible, move + * whole warp-groups to M to achieve feasibility. + * - Square: choose a multiple-of-4 m_warp that best balances per-warp work + * between M and N. + * - For non-WGMMA implementations: + * - FullRow: favor allocating warps to M first; if M cannot use all warps, + * remaining warps are placed on N. + * - FullCol: favor allocating warps to N first; if N cannot use all warps, + * remaining warps are placed on M. + * - Square: search for the m/n split that best balances per-warp work given + * integer warp counts and the per-warp tile sizes. + * + * Error handling: + * - The function performs internal checks (ICHECK) and will fail if required + * divisibility or policy conditions are not met (e.g., M/N tile divisibility, + * invalid policy, or WGMMA-specific warp-group requirements). + */ +std::pair Gemm::ComputeWarpPartition(int block_size, + GemmInst gemm_inst, + Target target) const { + int num_warps = block_size / TargetGetWarpSize(target); int m_warp = 1, n_warp = 1; constexpr int kMPerWarp = 16; // Rows processed by a single warp constexpr int kNPerWarp = 8; // Columns processed by a single warp - bool allow_wgmma = TargetIsHopper(target) && maybe_hopper_wgmma && - (this->M >= 64) && (num_warps % 4 == 0); + ICHECK(this->M % kMPerWarp == 0) << "M must be divisible by " << kMPerWarp << ", but got " << this->M; ICHECK(this->N % kNPerWarp == 0) << "N must be divisible by " << kNPerWarp << ", but got " << this->N; - if (allow_wgmma) { + if (gemm_inst == GemmInst::kWGMMA) { ICHECK(num_warps % 4 == 0) << "Warp-Group MMA requires 128×k threads."; constexpr int kGroup = 4; // Number of warps in a warp-group @@ -219,21 +280,49 @@ std::pair Gemm::ComputeWarpPartition(int num_warps, Target target, return {m_warp, n_warp}; } +/** + * @brief Checks whether WGMMA (warp-group MMA) can be used for this GEMM. 
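// To make the warp-partition policy description above concrete: a standalone
// sketch of a Square-style partition search. The balancing criterion used here
// (minimize the difference between per-warp rows and columns) is an assumption
// for illustration, not the exact ComputeWarpPartition implementation; it only
// preserves the documented invariants m_warp * n_warp == num_warps and the
// 16-row / 8-column per-warp granularity.

#include <cstdio>
#include <utility>

static std::pair<int, int> SquareWarpPartition(int M, int N, int num_warps) {
  std::pair<int, int> best{1, num_warps};
  long best_cost = -1;
  for (int m = 1; m <= num_warps; ++m) {
    if (num_warps % m != 0) continue;
    int n = num_warps / m;
    // Respect the 16x8 per-warp tile divisibility.
    if (M % (m * 16) != 0 || N % (n * 8) != 0) continue;
    long rows = M / m, cols = N / n;
    long cost = rows > cols ? rows - cols : cols - rows;
    if (best_cost < 0 || cost < best_cost) {
      best_cost = cost;
      best = {m, n};
    }
  }
  return best;
}

int main() {
  // 256 threads on a 32-wide warp target -> 8 warps for a 128x128 tile.
  auto [m_warp, n_warp] = SquareWarpPartition(128, 128, 256 / 32);
  std::printf("m_warp=%d n_warp=%d\n", m_warp, n_warp);  // 2 x 4 in this sketch
  return 0;
}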
+ * + * Evaluates device-memory placement, data-type combinations, transpose flags, + * and K divisibility constraints required for the Hopper WGMMA code path. + * + * The check returns true only when: + * - B resides in shared memory ("shared" or "shared.dyn"); and + * - (C, A, B) dtypes match one of the supported combinations below and K + * satisfies the required alignment; and + * - for combinations that require specific orientations, A is not transposed + * and B is transposed. + * + * Supported combinations and constraints: + * - C=float16: + * - A=float16, B=float16: K % 16 == 0 + * - Various float8 mixes (e4m3/e5m2): require (!trans_A && trans_B) and K % 32 == 0 + * - C=float32: + * - A=float16, B=float16: K % 16 == 0 + * - A=bfloat16, B=bfloat16: K % 16 == 0 + * - A=float32, B=float32: require (!trans_A && trans_B) and K % 8 == 0 + * - Various float8 mixes: require (!trans_A && trans_B) and K % 32 == 0 + * - C=int32: + * - 8-bit integer combinations (Int8/UInt8): require (!trans_A && trans_B) and K % 32 == 0 + * + * @return true if WGMMA is supported for the current buffers, dtypes, and + * transpose/shape constraints; false otherwise. + */ bool Gemm::CheckWGMMA() const { + if (B.scope() != "shared.dyn" && B.scope() != "shared") { + return false; + } + if (C->dtype == DataType::Float(16)) { if (A->dtype == DataType::Float(16) && B->dtype == DataType::Float(16)) return K % 16 == 0; - else if (A->dtype == DataType::NVFloat8E4M3() && - B->dtype == DataType::NVFloat8E4M3()) + else if (A->dtype.is_float8_e4m3() && B->dtype.is_float8_e4m3()) return (!trans_A) && trans_B && K % 32 == 0; - else if (A->dtype == DataType::NVFloat8E4M3() && - B->dtype == DataType::NVFloat8E5M2()) + else if (A->dtype.is_float8_e4m3() && B->dtype.is_float8_e5m2()) return (!trans_A) && trans_B && K % 32 == 0; - else if (A->dtype == DataType::NVFloat8E5M2() && - B->dtype == DataType::NVFloat8E4M3()) + else if (A->dtype.is_float8_e5m2() && B->dtype.is_float8_e4m3()) return (!trans_A) && trans_B && K % 32 == 0; - else if (A->dtype == DataType::NVFloat8E5M2() && - B->dtype == DataType::NVFloat8E5M2()) + else if (A->dtype.is_float8_e5m2() && B->dtype.is_float8_e5m2()) return (!trans_A) && trans_B && K % 32 == 0; else return false; @@ -245,17 +334,13 @@ bool Gemm::CheckWGMMA() const { return K % 16 == 0; else if (A->dtype == DataType::Float(32) && B->dtype == DataType::Float(32)) return (!trans_A) && trans_B && K % 8 == 0; - else if (A->dtype == DataType::NVFloat8E4M3() && - B->dtype == DataType::NVFloat8E4M3()) + else if (A->dtype.is_float8_e4m3() && B->dtype.is_float8_e4m3()) return (!trans_A) && trans_B && K % 32 == 0; - else if (A->dtype == DataType::NVFloat8E4M3() && - B->dtype == DataType::NVFloat8E5M2()) + else if (A->dtype.is_float8_e4m3() && B->dtype.is_float8_e5m2()) return (!trans_A) && trans_B && K % 32 == 0; - else if (A->dtype == DataType::NVFloat8E5M2() && - B->dtype == DataType::NVFloat8E4M3()) + else if (A->dtype.is_float8_e5m2() && B->dtype.is_float8_e4m3()) return (!trans_A) && trans_B && K % 32 == 0; - else if (A->dtype == DataType::NVFloat8E5M2() && - B->dtype == DataType::NVFloat8E5M2()) + else if (A->dtype.is_float8_e5m2() && B->dtype.is_float8_e5m2()) return (!trans_A) && trans_B && K % 32 == 0; else return false; @@ -275,17 +360,23 @@ bool Gemm::CheckWGMMA() const { } } -Stmt Gemm::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const { - int warp_size = 32; - if (TargetIsCDNA(T.target)) { - warp_size = 64; +static int GetArchInt(Target target) { + int arch_int = 0; + auto s = 
target->GetAttr("arch"); + ICHECK(s.defined()); + const char *arch_str = s.value().c_str(); + if (arch_str[0] == 's' && arch_str[1] == 'm' && arch_str[2] == '_') { + arch_int = atoi(&arch_str[3]); + } else { + arch_int = 0; } - auto block_size = *as_const_int(T.thread_bounds->extent); - bool maybe_wgmma = TargetIsHopper(T.target) && (this->M >= 64) && - (block_size / warp_size % 4 == 0) && CheckWGMMA(); + return arch_int; +} - auto [warp_m, warp_n] = - ComputeWarpPartition(block_size / warp_size, T.target, maybe_wgmma); +Stmt Gemm::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const { + auto block_size = *as_const_int(T.thread_bounds->extent); + GemmInst gemm_inst = GetGemmInst(block_size, T.target); + auto [warp_m, warp_n] = ComputeWarpPartition(block_size, gemm_inst, T.target); std::stringstream ss; std::string op_name = "tl::gemm_ss"; @@ -299,29 +390,49 @@ Stmt Gemm::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const { ss << warp_m << ", " << warp_n << ", "; ss << trans_A << ", " << trans_B; ss << ", " << clear_accum; + if (TargetIsCuda(T.target) && (GetArchInt(T.target) >= 75)) { + ss << ", " << stride_A << ", " << stride_B; + ss << ", " << offset_A << ", " << offset_B; + } if (TargetIsCDNA(T.target)) { // for cdna gemm, we need to specify kPack ss << ", " << kPack; } else if (TargetIsHopper(T.target)) { - ss << ", " << (maybe_wgmma ? "true" : "false"); + ss << ", " << (gemm_inst == GemmInst::kWGMMA ? "true" : "false"); } if (wg_wait != 0) { ss << ", " << wg_wait; } ss << ">"; - auto A_buffer = T.buffer_remap.count(A) ? T.buffer_remap[A] : A; - auto B_buffer = T.buffer_remap.count(B) ? T.buffer_remap[B] : B; - auto C_buffer = T.buffer_remap[C]; - - Array new_args; - new_args.push_back(StringImm(ss.str())); - new_args.push_back(Aptr); - new_args.push_back(Bptr); - new_args.push_back(Cptr); - auto new_call = Call(DataType::Handle(), builtin::call_extern(), new_args); + + auto new_call = Call(DataType::Handle(), tl::tl_gemm(), + Array{StringImm(ss.str()), Aptr, Bptr, Cptr}); return Evaluate(new_call); } +/** + * @brief Infer memory/layout mappings for A, B, and C buffers for this GEMM op. + * + * Generates and returns a LayoutMap that binds buffer A, B, and C to + * target- and architecture-specific fragment or shared-memory layouts based + * on the current target, thread bounds, warp partitioning, data types, and + * transpose flags. This performs target dispatch (Volta, Ampere/Turing/SM120, + * Hopper, CDNA), selects the appropriate fragment or shared layout creators, + * and binds fragment layouts to the thread range when buffers are local + * fragments. + * + * Preconditions: + * - C.scope() must be "local.fragment". + * + * Postconditions / side effects: + * - Marks the operator's layout inference as completed (sets completed_ = true). + * - May abort via ICHECK on unsupported targets, invalid buffer scopes, or + * incompatible shape constraints. + * + * @param T Layout inference inputs (thread bounds and target). + * @param level Inference level (unused for side effects but retained for API). + * @return LayoutMap mapping each of A, B, and C to their inferred layouts. 
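// For illustration, the extern symbol composed by Gemm::Lower above is just a
// textual template instantiation. With hypothetical values (M=128, N=128,
// K=32, a 2x2 warp split, an sm_90 target so the new stride/offset fields and
// the trailing WGMMA flag are both appended), the assembled name could look
// like the string printed below. This only mirrors the stringstream logic;
// real values come from the op and target.

#include <iostream>
#include <sstream>

int main() {
  const int M = 128, N = 128, K = 32, warp_m = 2, warp_n = 2;
  const bool trans_A = false, trans_B = true, clear_accum = false;
  const int stride_A = 32, stride_B = 32, offset_A = 0, offset_B = 0;  // hypothetical
  std::stringstream ss;
  ss << "tl::gemm_ss<" << M << ", " << N << ", " << K << ", "
     << warp_m << ", " << warp_n << ", "
     << trans_A << ", " << trans_B << ", " << clear_accum << ", "
     << stride_A << ", " << stride_B << ", " << offset_A << ", " << offset_B
     << ", true>";  // trailing "true" = WGMMA path on Hopper
  std::cout << ss.str() << std::endl;
  return 0;
}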
+ */ LayoutMap Gemm::InferLayout(const LayoutInferArgs &T, InferLevel level) { if (completed_) return {}; @@ -329,10 +440,10 @@ LayoutMap Gemm::InferLayout(const LayoutInferArgs &T, InferLevel level) { ICHECK(C.scope() == "local.fragment"); auto thread_range = T.thread_bounds; auto block_size = *as_const_int(thread_range->extent); + GemmInst gemm_inst = GetGemmInst(block_size, T.target); + auto [warp_m, warp_n] = ComputeWarpPartition(block_size, gemm_inst, T.target); + if (TargetIsVolta(T.target)) { - const int warp_size = 32; - auto [warp_m, warp_n] = - ComputeWarpPartition(block_size / warp_size, T.target); auto fragment = makeGemmVoltaFragmentC(M, N, M / warp_m, N / warp_n, C->dtype.bits()); results.Set(C, fragment->BindThreadRange(thread_range)); @@ -354,10 +465,8 @@ LayoutMap Gemm::InferLayout(const LayoutInferArgs &T, InferLevel level) { results.Set(B, makeGemmVoltaABLayout(*as_const_int(B->shape[dim_B - 2]), *as_const_int(B->shape[dim_B - 1]), false, trans_B ? 2 : 1)); - } else if (TargetIsAmpere(T.target) || TargetIsTuring(T.target)) { - const int warp_size = 32; - auto [warp_m, warp_n] = - ComputeWarpPartition(block_size / warp_size, T.target); + } else if (TargetIsAmpere(T.target) || TargetIsTuring(T.target) || + TargetIsSM120(T.target)) { auto fragment = makeGemmFragmentC(M, N, M / warp_m, N / warp_n, C->dtype.bits()); results.Set(C, fragment->BindThreadRange(thread_range)); @@ -391,13 +500,8 @@ LayoutMap Gemm::InferLayout(const LayoutInferArgs &T, InferLevel level) { ICHECK(0); } } else if (TargetIsHopper(T.target)) { - const int warp_size = 32; - bool maybe_wgmma = - (this->M >= 64) && (block_size / warp_size % 4 == 0) && CheckWGMMA(); - auto [warp_m, warp_n] = - ComputeWarpPartition(block_size / warp_size, T.target, maybe_wgmma); auto fragment = - maybe_wgmma + gemm_inst == GemmInst::kWGMMA ? makeGemmFragmentCHopper(M, N, M / warp_m, N / warp_n, C->dtype.bits()) : makeGemmFragmentC(M, N, M / warp_m, N / warp_n, C->dtype.bits()); @@ -409,7 +513,7 @@ LayoutMap Gemm::InferLayout(const LayoutInferArgs &T, InferLevel level) { const int64_t continuity = trans_A ? 4 * mat_continuous / warp_m : mat_continuous; auto ABLayout = - maybe_wgmma + gemm_inst == GemmInst::kWGMMA ? makeGemmABLayoutHopper(mat_stride, mat_continuous, continuity, A->dtype.bits(), trans_A ? 1 : 2) : makeGemmABLayout(mat_stride, mat_continuous, mat_continuous, @@ -427,20 +531,18 @@ LayoutMap Gemm::InferLayout(const LayoutInferArgs &T, InferLevel level) { const int64_t continuity = trans_B ? mat_continuous : mat_continuous / warp_n; auto ABLayout = - maybe_wgmma + gemm_inst == GemmInst::kWGMMA ? makeGemmABLayoutHopper(mat_stride, mat_continuous, continuity, B->dtype.bits(), trans_B ? 2 : 1) : makeGemmABLayout(mat_stride, mat_continuous, mat_continuous, B->dtype.bits(), trans_B ? 
2 : 1); results.Set(B, ABLayout); } else { - ICHECK(0) << "WGMMA only support B in shared."; + auto fragment = + makeGemmFragmentB(M, N, K, M / warp_m, N / warp_n, trans_B); + results.Set(B, fragment->BindThreadRange(thread_range)); } } else if (TargetIsCDNA(T.target)) { - const int warp_size = 64; - auto [warp_m, warp_n] = - ComputeWarpPartition(block_size / warp_size, T.target); - auto fragment = makeGemmFragmentCCDNA(M, N, M / warp_m, N / warp_n, C->dtype.bits()); results.Set(C, fragment->BindThreadRange(thread_range)); @@ -485,4 +587,4 @@ TIR_REGISTER_TL_OP(Gemm, gemm) Integer(CallEffectKind::kOpaque)); } // namespace tl -} // namespace tvm \ No newline at end of file +} // namespace tvm diff --git a/src/op/gemm.h b/src/op/gemm.h index 26a35af24..55e42b771 100644 --- a/src/op/gemm.h +++ b/src/op/gemm.h @@ -26,10 +26,17 @@ class Gemm : public Operator { kFullCol = 2, } policy; + std::unique_ptr Clone() const final { + return std::make_unique(*this); + } + private: - std::pair - ComputeWarpPartition(int num_warps, Target target, - bool maybe_hopper_wgmma = true) const; + // Target GEMM instruction + enum class GemmInst { kMMA, kWGMMA, kUTCMMA, kMFMA }; + GemmInst GetGemmInst(int block_size, Target target) const; + + std::pair ComputeWarpPartition(int num_warps, GemmInst gemm_inst, + Target target) const; bool CheckWGMMA() const; Array call_args; @@ -38,6 +45,8 @@ class Gemm : public Operator { PrimExpr Aptr, Bptr, Cptr; bool trans_A, trans_B; int M, N, K; + int stride_A, stride_B; + int offset_A, offset_B; bool clear_accum = false; // k_pack please ref to bitblas/tl/mfma_macro_generator.py::k_pack // only will be enabled under cdna mfma instructions diff --git a/src/op/gemm_sp.cc b/src/op/gemm_sp.cc index 7a8b58318..9405c8631 100644 --- a/src/op/gemm_sp.cc +++ b/src/op/gemm_sp.cc @@ -230,7 +230,7 @@ Stmt GemmSP::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const { << " and " << B.scope(); ICHECK((E.scope() == "shared" || E.scope() == "shared.dyn")) << "Only support shared.dyn scope for E as copy from smem to rmem are " - "delegated to cute implemntation, found " + "delegated to cute implementation, found " << E.scope(); ss << op_name << "<" << M << ", " << N << ", " << K << ", "; ss << warp_m << ", " << warp_n << ", "; @@ -248,13 +248,11 @@ Stmt GemmSP::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const { auto C_buffer = T.buffer_remap[C]; auto E_buffer = T.buffer_remap.count(E) ? 
T.buffer_remap[E] : E; - Array new_args; - new_args.push_back(StringImm(ss.str())); - new_args.push_back(A_buffer.access_ptr(1)); - new_args.push_back(B_buffer.access_ptr(1)); - new_args.push_back(C_buffer.access_ptr(3)); - new_args.push_back(E_buffer.access_ptr(1)); - auto new_call = Call(DataType::Handle(), builtin::call_extern(), new_args); + auto new_call = + Call(DataType::Handle(), tl::tl_gemm_sp(), + Array{StringImm(ss.str()), A_buffer.access_ptr(1), + B_buffer.access_ptr(1), C_buffer.access_ptr(3), + E_buffer.access_ptr(1)}); return Evaluate(new_call); } diff --git a/src/op/gemm_sp.h b/src/op/gemm_sp.h index dbb62b692..4488e4612 100644 --- a/src/op/gemm_sp.h +++ b/src/op/gemm_sp.h @@ -26,6 +26,10 @@ class GemmSP : public Operator { kFullCol = 2, } policy; + std::unique_ptr Clone() const final { + return std::make_unique(*this); + } + private: std::pair ComputeWarpPartition(int num_warps, Target target, diff --git a/src/op/logical.cc b/src/op/logical.cc index 49afd8a80..0398c38c1 100644 --- a/src/op/logical.cc +++ b/src/op/logical.cc @@ -4,7 +4,7 @@ * */ -#include +#include #include #include #include diff --git a/src/op/math.cc b/src/op/math.cc index 1a10f8c23..572399877 100644 --- a/src/op/math.cc +++ b/src/op/math.cc @@ -4,7 +4,7 @@ * */ -#include +#include #include #include #include diff --git a/src/op/op.cc b/src/op/op.cc index 145949620..69cd59227 100644 --- a/src/op/op.cc +++ b/src/op/op.cc @@ -79,11 +79,6 @@ Stmt Operator::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const { return Evaluate(0); } -Stmt Operator::Canonialize(const CanonializeArgs &T, - arith::Analyzer *analyzer) const { - return {}; -} - LayoutMap Operator::InferLayout(const LayoutInferArgs &T, InferLevel level) { return {}; } diff --git a/src/op/op.h b/src/op/op.h index 5b230ccfb..beb35dd68 100644 --- a/src/op/op.h +++ b/src/op/op.h @@ -22,7 +22,7 @@ using namespace tir; using AddWorkspaceCallback = std::function; using LayoutMap = Map; using BufferMap = Map; -using OpBuilderFunc = TypedPackedFunc, BufferMap)>; +using OpBuilderFunc = ffi::TypedFunction, BufferMap)>; #define TIR_REGISTER_TL_OP(Entry, OpName) \ const Op &Entry::Get() { \ @@ -59,17 +59,12 @@ struct LayoutInferArgs { Map buffer_remap; }; -struct CanonializeArgs { - Target target; -}; - class Operator { public: virtual Stmt Lower(const LowerArgs &T, arith::Analyzer *analyzer) const; - virtual Stmt Canonialize(const CanonializeArgs &T, - arith::Analyzer *analyzer) const; virtual LayoutMap InferLayout(const LayoutInferArgs &T, InferLevel level); virtual ~Operator() = default; + virtual std::unique_ptr Clone() const = 0; }; class RegionOp : public Operator { @@ -77,6 +72,10 @@ class RegionOp : public Operator { RegionOp(Array args, BufferMap vmap); static const Op &Get(); + std::unique_ptr Clone() const final { + return std::make_unique(*this); + } + const Buffer &GetBuffer() const { return buffer_; } const Array &GetRanges() const { return ranges_; } int GetAccessMask() const { return access_mask_; } diff --git a/src/op/parallel.cc b/src/op/parallel.cc index 502dd45d2..33ceb7de8 100644 --- a/src/op/parallel.cc +++ b/src/op/parallel.cc @@ -22,6 +22,64 @@ namespace attr { constexpr const char *coalesced_width = "coalesced_width"; } // namespace attr +// ProveFragmentContains checks whether the threads that access elements of a +// smaller fragment (small_frag) are a subset of the threads that access +// elements of a larger fragment (large_frag) for any given loop index. 
This +// function ensures that if the small fragment's layout corresponds to the loop +// itself, accessing the large fragment's elements is valid. Additionally, if +// small is updated to large, the originally valid access remains valid. The +// proof is performed by: +// +// 1. Defining a variable `rep_small` to represent the replicate index of the +// small fragment that is being checked. +// 2. Using the `small_frag_indices` and `rep_small` to derive the thread +// accessing +// the element in the small fragment. +// 3. Using `large_frag_indices` to derive the physical index of the large +// fragment +// along with the thread information, and then feeding these into the inverse +// of the large fragment to obtain the logical index and replicate index. +// 4. Verifying the mapping by checking whether the computed thread using the +// inverse +// layout corresponds to the original thread calculated for the small +// fragment. If they don't match, this indicates that the inverse layout's +// domain does not include the thread and thus the access is invalid. +bool ProveFragmentContains(Fragment small_frag, Fragment large_frag, + Array small_frag_indices, + Array large_frag_indices, + arith::Analyzer &analyzer_) { + Var rep_small("__checking_frag_contains_rep"); + analyzer_.Bind(rep_small, + Range(IntImm(small_frag->ReplicateExtent()->dtype, 0), + small_frag->ReplicateExtent()), + true); // Bind the replicate extent of small_frag. + // Derive thread for small_frag. + auto thread = small_frag->ForwardThread(small_frag_indices, rep_small); + + // Get physical index and thread for large_frag. + auto large_frag_physical_and_thread = large_frag->Forward(large_frag_indices); + // Add small_frag's thread to the large fragment's thread info. + large_frag_physical_and_thread.push_back(thread); + // Get the inverse of the large fragment. + auto inv_large_frag = large_frag->Inverse(); + // Compute logical index and replicate index using inverse layout. + auto inv_large_frag_logical_and_rep = + inv_large_frag->Forward(large_frag_physical_and_thread); + + // Extract replicate index from the result. + auto inv_large_frag_rep = + inv_large_frag_logical_and_rep[inv_large_frag_logical_and_rep.size() - 1]; + + // Calculate thread based on the logical index and replicate index. + auto check_thread = + large_frag->ForwardThread(large_frag_indices, inv_large_frag_rep); + + // Simplify the difference between the threads. + auto diff = analyzer_.Simplify(thread - check_thread); + // If the difference is zero, the threads match and the access is valid. 
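// As a toy model of the containment property being proven here: for two tiny
// hand-written layouts, enumerate the set of threads that touch each logical
// index and check that the small fragment's threads are a subset of the large
// fragment's. The layouts and the brute-force check below are hypothetical and
// only illustrate the property; the code above establishes it symbolically via
// the inverse of the large layout instead.

#include <cstdio>
#include <functional>
#include <set>

// thread(index, replica) for a fragment, written out by hand.
using ThreadMap = std::function<int(int, int)>;

static bool ThreadsContained(ThreadMap small_frag, int small_rep,
                             ThreadMap large_frag, int large_rep, int extent) {
  for (int i = 0; i < extent; ++i) {
    std::set<int> small_threads, large_threads;
    for (int r = 0; r < small_rep; ++r) small_threads.insert(small_frag(i, r));
    for (int r = 0; r < large_rep; ++r) large_threads.insert(large_frag(i, r));
    for (int t : small_threads)
      if (!large_threads.count(t)) return false;  // thread never used by large layout
  }
  return true;
}

int main() {
  // small: index i handled by thread i % 4 (no replication);
  // large: index i replicated over threads i % 4 and i % 4 + 4.
  ThreadMap small_frag = [](int i, int) { return i % 4; };
  ThreadMap large_frag = [](int i, int r) { return i % 4 + 4 * r; };
  std::printf("contained = %d\n",
              ThreadsContained(small_frag, 1, large_frag, 2, 16));
  return 0;
}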
+ return is_zero(diff); +} + class IfBufferRemapLoopGenerator : public StmtExprMutator { public: static For run(Stmt stmt, Map buffer_remap, @@ -230,7 +288,7 @@ LayoutMap ParallelOp::InferLayout(const LayoutInferArgs &T, InferLevel level) { // Check if coalesced_width is defined if (auto coalesced_width = root_->annotations.Get(tl::attr::coalesced_width)) { - if (const auto *imm = coalesced_width.as()) { + if (const auto *imm = coalesced_width->as()) { int expected = imm->value; // Verify that vector_size is divisible by expected if (vector_size % expected != 0) { @@ -267,7 +325,8 @@ LayoutMap ParallelOp::InferLayout(const LayoutInferArgs &T, InferLevel level) { } // Step 2: Check that the loop's partition can correctly align with all source - // fragment + // fragment, and infer layout only when it's not yet layout-ed + LayoutMap results; for (const auto &[buffer, _] : indice_map_) { if (T.layout_map.count(buffer)) { auto fragment = T.layout_map[buffer].as().value(); @@ -278,55 +337,19 @@ LayoutMap ParallelOp::InferLayout(const LayoutInferArgs &T, InferLevel level) { continue; auto vars = loop_vars_.Map([](const IterVar &iv) { return PrimExpr(iv->var); }); - auto lhs = loop_layout_->ForwardThread(vars, NullOpt); - auto rhs = fragment->ForwardThread(indice_map_[buffer], NullOpt); - auto diff = analyzer_.Simplify(lhs - rhs); - ICHECK(is_zero(diff)) - << "Layout infer conflict for " << buffer << " " << source_buffer - << "\nLHS = " << lhs << "\nRHS = " << rhs; - } - } - // Step 3: Infer other fragment's layout from the loop's partition - LayoutMap results; - for (const auto &[buffer, _] : indice_map_) { - if (!T.layout_map.count(buffer)) { - results.Set(buffer, CompleteBufferFragment(buffer)->BindThreadRange( - T.thread_bounds)); - } - - // Layout infer conflict for local.fragment can not be handled here - // because the source_buffer is not always available - // (zhengju) do not modify strict layout even if it is conflict with the - // dst layout. This will not influence the result because the strict - // layout is usually with rep = 1 Since the real layout map is - // controlled by layout_inference.cc, we should add this check there - if (buffer.scope() == "local.fragment" && source_buffer.defined() && - source_buffer.scope() == "local.fragment") { - if (T.layout_map.count(buffer)) { - const FragmentNode *src_layout = - T.layout_map[buffer].as().get(); - Fragment dst_layout_fragment = - CompleteBufferFragment(buffer)->BindThreadRange(T.thread_bounds); - const FragmentNode *dst_layout = - dst_layout_fragment.as().get(); - if (as_const_int(dst_layout->ReplicateExtent()) && - as_const_int(src_layout->ReplicateExtent()) && - (*as_const_int(dst_layout->ReplicateExtent()) > - *as_const_int(src_layout->ReplicateExtent()))) { - results.Set(buffer, dst_layout_fragment); - continue; - } - if (src_layout && dst_layout) { - ICHECK(src_layout->IsEqual(dst_layout, true)) - << "Layout may conflict with ParallelOp for buffer " << buffer - << " vs. 
" << source_buffer << "\nError body begin:\n" - << GetRoot()->body << "\nError body end" - << "\nLHS = " << src_layout->DebugOutput() - << "\nRHS = " << dst_layout->DebugOutput() - << "\nYou may need to use a shared memory to transform the " - "layout"; - } + if (!ProveFragmentContains(loop_layout_, fragment, vars, + indice_map_[buffer], analyzer_)) { + std::ostringstream oss; + oss << "Layout infer conflict between " << buffer << " and " + << source_buffer << " in T.Parallel loop:" << std::endl + << " loop " << loop_layout_->DebugOutput() << std::endl + << " fragment " << fragment->DebugOutput() << std::endl; + throw LayoutConflictException(oss.str()); } + } else { + auto dst_layout = + CompleteBufferFragment(buffer)->BindThreadRange(T.thread_bounds); + results.Set(buffer, dst_layout); } } return results; @@ -336,7 +359,7 @@ Optional ParallelOp::GetPredicate(Var thread_var) const { if (predicate_.defined()) { return Substitute(predicate_.value(), {{InputPlaceholder(0), thread_var}}); } else { - return NullOpt; + return std::nullopt; } } @@ -362,7 +385,8 @@ Fragment ParallelOp::CompleteBufferFragment(const Buffer &buffer) { PrimExpr thd_b = loop_layout_->ForwardThread( ind_inv->Forward(fwd), FloorDiv(ReplicationPlaceholder(), indice_rep_extent)); - return Fragment(buffer->shape, {}, thd_b, dest_buffer_rep_extent, NullOpt) + return Fragment(buffer->shape, {}, thd_b, dest_buffer_rep_extent, + std::nullopt) ->CondenseReplicateVar(); } diff --git a/src/op/parallel.h b/src/op/parallel.h index e84ca98a7..fd49acfe9 100644 --- a/src/op/parallel.h +++ b/src/op/parallel.h @@ -17,6 +17,20 @@ namespace tl { using namespace tir; +class LayoutConflictException : public std::exception { +public: + const char *what() const noexcept override { return msg_.c_str(); } + LayoutConflictException(const std::string &msg) : msg_(msg) {} + +private: + std::string msg_; +}; + +bool ProveFragmentContains(Fragment small_frag, Fragment large_frag, + Array small_frag_indices, + Array large_frag_indices, + arith::Analyzer &analyzer_); + class ParallelOp; class ParallelLoopNestVisitor : public StmtExprVisitor { @@ -36,6 +50,14 @@ class ParallelOp : public Operator { ParallelOp(For root); LayoutMap InferLayout(const LayoutInferArgs &T, InferLevel level) final; + ParallelOp(const ParallelOp &other) : ParallelOp(other.root_) { + loop_layout_ = other.loop_layout_; + predicate_ = other.predicate_; + } + std::unique_ptr Clone() const final { + return std::make_unique(*this); + } + Fragment GetLoopLayout() const { return loop_layout_; } For GetRoot() const { return root_; } Map> GetIndiceMap() const { return indice_map_; } diff --git a/src/op/reduce.cc b/src/op/reduce.cc index 6d594da1a..79ce193ba 100644 --- a/src/op/reduce.cc +++ b/src/op/reduce.cc @@ -12,6 +12,7 @@ #include #include "../layout/utils.h" +#include "../op/parallel.h" #include "../transform/loop_partition.h" #include "tir/transforms/ir_utils.h" @@ -201,7 +202,7 @@ Stmt ReduceOp::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const { for (int i = src_layout->OutputDim() - 1; i >= 0; i--) { reduce_local = For(src_var_compressed[i]->var, 0, src_var_compressed[i]->dom->extent, - ForKind::kUnrolled, reduce_local, NullOpt, + ForKind::kUnrolled, reduce_local, std::nullopt, {{tir::attr::pragma_unroll_explicit, Bool(false)}}); } stmts.push_back(reduce_local); @@ -213,7 +214,7 @@ Stmt ReduceOp::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const { arith::NormalizeToIterSum(src_thread, ToVMap(src_vars), analyzer); for (const auto &iter_split : iter_sum->args) { 
auto mark = iter_split->source->source.as(); - ICHECK(mark.defined()); + ICHECK(mark) << "Not a normalized iterator: " << iter_split->source; if (mark.value().same_as(src_vars[this->dim]->var)) { auto scale = as_const_int(iter_split->scale); auto extent = as_const_int(iter_split->extent); @@ -287,7 +288,7 @@ LayoutMap ReduceOp::InferLayout(const LayoutInferArgs &T, InferLevel level) { if (level >= InferLevel::kStrict) return {}; if (src.scope() == "local.fragment" && dst.scope() == "local.fragment" && - T.layout_map.count(src) && !T.layout_map.count(dst)) { + T.layout_map.count(src)) { auto src_layout = T.layout_map[src].as().value(); PrimExpr indice_rep_extent = src->shape[dim]; @@ -307,10 +308,49 @@ LayoutMap ReduceOp::InferLayout(const LayoutInferArgs &T, InferLevel level) { auto thd = src_layout->ForwardThread( fwd, FloorDiv(ReplicationPlaceholder(), indice_rep_extent)); Fragment dst_layout = - Fragment(dst->shape, {}, thd, dest_buffer_rep_extent, NullOpt) + Fragment(dst->shape, {}, thd, dest_buffer_rep_extent, std::nullopt) ->CondenseReplicateVar() ->BindThreadRange(T.thread_bounds); - return {{dst, dst_layout}}; + if (!T.layout_map.count(dst)) + return {{dst, dst_layout}}; + else { + // Check if computed layout is compatible with existing: the existing one + // must strictly contains the computed layout + auto orig_dst_layout = + T.layout_map.Get(dst).value().as().value(); + ICHECK(dst_layout->InputDim() == orig_dst_layout->InputDim()); + Array indices; + indices.reserve(dst_layout->InputDim()); + arith::Analyzer inner_analyzer; + for (int i = 0; i < dst_layout->InputDim(); ++i) { + auto x = InputPlaceholder(i); + indices.push_back(x); + // should be literal - literal = 0, any analyzer will work + ICHECK(is_zero(inner_analyzer.Simplify( + dst_layout->InputShape()[i] - orig_dst_layout->InputShape()[i]))); + inner_analyzer.Bind(x, Range(0, dst_layout->InputShape()[i])); + } + + ICHECK(as_const_int(dst_layout->ReplicateExtent())); + ICHECK(as_const_int(src_layout->ReplicateExtent())); + auto dst_rep = *as_const_int(dst_layout->ReplicateExtent()); + auto src_rep = *as_const_int(src_layout->ReplicateExtent()); + if (dst_rep < src_rep || + !ProveFragmentContains(orig_dst_layout, dst_layout, indices, indices, + inner_analyzer)) { + std::ostringstream oss; + oss << "Layout may conflict with ReduceOp for buffer " << dst << " vs. 
" + << src << "\nLHS = " << src_layout->DebugOutput() + << "\nRHS = " << orig_dst_layout->DebugOutput() + << "\nYou may need to use a shared memory to transform the " + "layout"; + throw LayoutConflictException(oss.str()); + } + + if (dst_rep > src_rep) { + return {{dst, dst_layout}}; + } + } } return {}; } diff --git a/src/op/reduce.h b/src/op/reduce.h index 381f64e6f..64954ea43 100644 --- a/src/op/reduce.h +++ b/src/op/reduce.h @@ -21,6 +21,10 @@ class ReduceOp : public Operator { LayoutMap InferLayout(const LayoutInferArgs &T, InferLevel level) final; static const Op &Get(); + std::unique_ptr Clone() const final { + return std::make_unique(*this); + } + private: tir::Buffer src, dst; int dim; @@ -45,6 +49,10 @@ class CumSumOp : public Operator { LayoutMap InferLayout(const LayoutInferArgs &T, InferLevel level) final; static const Op &Get(); + std::unique_ptr Clone() const final { + return std::make_unique(*this); + } + private: tir::Buffer src, dst; int dim; diff --git a/src/runtime/runtime.cc b/src/runtime/runtime.cc index 615bdc834..d9f1d74cd 100644 --- a/src/runtime/runtime.cc +++ b/src/runtime/runtime.cc @@ -7,13 +7,12 @@ #include "runtime.h" #include "../target/cuda.h" -#include +#include +#include namespace tvm { namespace tl { -using namespace runtime; - #if (CUDA_MAJOR_VERSION >= 12) template static std::string ArrayToStr(const T *ptr, size_t n) { std::stringstream ss; @@ -39,37 +38,35 @@ struct TensorMapArgs { CUtensorMapL2promotion l2Promotion; CUtensorMapFloatOOBfill oobFill; - static TensorMapArgs Extract(TVMArgs args) { + static TensorMapArgs Extract(PackedArgs args) { TensorMapArgs T; int idx = 0; - ICHECK(args.num_args >= 8); - T.map = reinterpret_cast(static_cast(args[idx++])); - T.type = - static_cast(static_cast(args[idx++])); - T.tensorRank = static_cast(static_cast(args[idx++])); - T.globalAddress = args[idx++]; + ICHECK(args.size() >= 8); + T.map = reinterpret_cast(args[idx++].cast()); + T.type = static_cast(args[idx++].cast()); + T.tensorRank = static_cast(args[idx++].cast()); + T.globalAddress = args[idx++].cast(); ICHECK(T.tensorRank >= 1 && T.tensorRank <= 5); - ICHECK(args.num_args == static_cast(8 + T.tensorRank * 4)); + ICHECK(args.size() == static_cast(8 + T.tensorRank * 4)); for (size_t i = 0; i < T.tensorRank; i++) { - T.globalDim[i] = static_cast(args[idx++]); + T.globalDim[i] = args[idx++].cast(); } for (size_t i = 0; i < T.tensorRank; i++) { - T.globalStride[i] = static_cast(args[idx++]); + T.globalStride[i] = args[idx++].cast(); } for (size_t i = 0; i < T.tensorRank; i++) { - T.boxDim[i] = static_cast(args[idx++]); + T.boxDim[i] = args[idx++].cast(); } for (size_t i = 0; i < T.tensorRank; i++) { - T.elementStrides[i] = static_cast(args[idx++]); + T.elementStrides[i] = args[idx++].cast(); } T.interleave = - static_cast(static_cast(args[idx++])); - T.swizzle = - static_cast(static_cast(args[idx++])); + static_cast(args[idx++].cast()); + T.swizzle = static_cast(args[idx++].cast()); T.l2Promotion = - static_cast(static_cast(args[idx++])); + static_cast(args[idx++].cast()); T.oobFill = - static_cast(static_cast(args[idx++])); + static_cast(args[idx++].cast()); return T; } @@ -93,20 +90,23 @@ struct TensorMapArgs { }; // set device api -TVM_REGISTER_GLOBAL(tvm_tensormap_create_tiled) - .set_body([](TVMArgs args, TVMRetValue *ret) { - TensorMapArgs T = TensorMapArgs::Extract(args); - CUresult result = cuTensorMapEncodeTiled( - T.map, T.type, T.tensorRank, T.globalAddress, T.globalDim, - T.globalStride + 1, T.boxDim, T.elementStrides, T.interleave, - 
T.swizzle, T.l2Promotion, T.oobFill); - if (result != CUDA_SUCCESS) { - LOG_FATAL << "Failed to initialize the TMA descriptor " << result - << std::endl - << T.ToDebugString(); - } - *ret = static_cast(result); - }); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def_packed( + "tvm_tensormap_create_tiled", [](PackedArgs args, Any *ret) { + TensorMapArgs T = TensorMapArgs::Extract(args); + CUresult result = cuTensorMapEncodeTiled( + T.map, T.type, T.tensorRank, T.globalAddress, T.globalDim, + T.globalStride + 1, T.boxDim, T.elementStrides, T.interleave, + T.swizzle, T.l2Promotion, T.oobFill); + if (result != CUDA_SUCCESS) { + LOG_FATAL << "Failed to initialize the TMA descriptor " << result + << std::endl + << T.ToDebugString(); + } + *ret = static_cast(result); + }); +}); struct TensorMapIm2ColArgs { CUtensorMap *map; @@ -122,42 +122,40 @@ struct TensorMapIm2ColArgs { CUtensorMapL2promotion l2Promotion; CUtensorMapFloatOOBfill oobFill; - static TensorMapIm2ColArgs Extract(TVMArgs args) { + static TensorMapIm2ColArgs Extract(PackedArgs args) { TensorMapIm2ColArgs T; int idx = 0; - ICHECK(args.num_args >= 8); - T.map = reinterpret_cast(static_cast(args[idx++])); - T.type = - static_cast(static_cast(args[idx++])); - T.tensorRank = static_cast(static_cast(args[idx++])); - T.globalAddress = args[idx++]; + ICHECK(args.size() >= 8); + T.map = reinterpret_cast(args[idx++].cast()); + T.type = static_cast(args[idx++].cast()); + T.tensorRank = static_cast(args[idx++].cast()); + T.globalAddress = args[idx++].cast(); ICHECK(T.tensorRank >= 3 && T.tensorRank <= 5); - ICHECK(args.num_args == static_cast(6 + T.tensorRank * 5)); + ICHECK(args.size() == static_cast(6 + T.tensorRank * 5)); for (size_t i = 0; i < T.tensorRank; i++) { - T.globalDim[i] = static_cast(args[idx++]); + T.globalDim[i] = args[idx++].cast(); } for (size_t i = 0; i < T.tensorRank; i++) { - T.globalStride[i] = static_cast(args[idx++]); + T.globalStride[i] = args[idx++].cast(); } for (size_t i = 0; i < T.tensorRank; i++) { - T.elementStrides[i] = static_cast(args[idx++]); + T.elementStrides[i] = args[idx++].cast(); } for (size_t i = 0; i < T.tensorRank - 2; i++) { - T.pixelBoxLowerCorner[i] = static_cast(args[idx++]); + T.pixelBoxLowerCorner[i] = args[idx++].cast(); } for (size_t i = 0; i < T.tensorRank - 2; i++) { - T.pixelBoxUpperCorner[i] = static_cast(args[idx++]); + T.pixelBoxUpperCorner[i] = args[idx++].cast(); } - T.smem_box_pixel = static_cast(args[idx++]); - T.smem_box_channel = static_cast(args[idx++]); + T.smem_box_pixel = args[idx++].cast(); + T.smem_box_channel = args[idx++].cast(); T.interleave = - static_cast(static_cast(args[idx++])); - T.swizzle = - static_cast(static_cast(args[idx++])); + static_cast(args[idx++].cast()); + T.swizzle = static_cast(args[idx++].cast()); T.l2Promotion = - static_cast(static_cast(args[idx++])); + static_cast(args[idx++].cast()); T.oobFill = - static_cast(static_cast(args[idx++])); + static_cast(args[idx++].cast()); return T; } @@ -185,21 +183,25 @@ struct TensorMapIm2ColArgs { } }; -TVM_REGISTER_GLOBAL(tvm_tensormap_create_im2col) - .set_body([](TVMArgs args, TVMRetValue *ret) { - TensorMapIm2ColArgs T = TensorMapIm2ColArgs::Extract(args); - CUresult result = cuTensorMapEncodeIm2col( - T.map, T.type, T.tensorRank, T.globalAddress, T.globalDim, - T.globalStride + 1, T.pixelBoxLowerCorner, T.pixelBoxUpperCorner, - T.smem_box_channel, T.smem_box_pixel, T.elementStrides, T.interleave, - T.swizzle, T.l2Promotion, T.oobFill); - if (result != CUDA_SUCCESS) 
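The Extract() helpers above now pull typed values out of the new PackedArgs with per-argument cast calls instead of the old TVMArgs static_casts. A rough standalone analogy of that unpacking style, using std::any in place of the TVM FFI argument type; the struct and field names echo TensorMapArgs but everything here is an illustrative stand-in, not the real API:

#include <any>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

struct TiledMapArgs {
  void *globalAddress = nullptr;
  uint32_t tensorRank = 0;
  std::vector<uint64_t> globalDim;

  static TiledMapArgs Extract(const std::vector<std::any> &args) {
    TiledMapArgs t;
    size_t idx = 0;
    assert(args.size() >= 2);
    // Analogous to args[idx++].cast<...>() in the patched Extract().
    t.globalAddress = std::any_cast<void *>(args[idx++]);
    t.tensorRank = std::any_cast<uint32_t>(args[idx++]);
    for (uint32_t i = 0; i < t.tensorRank; ++i) {
      t.globalDim.push_back(std::any_cast<uint64_t>(args[idx++]));
    }
    return t;
  }
};

int main() {
  int dummy = 0;
  std::vector<std::any> packed = {static_cast<void *>(&dummy), uint32_t{2},
                                  uint64_t{128}, uint64_t{64}};
  TiledMapArgs t = TiledMapArgs::Extract(packed);
  std::cout << "rank=" << t.tensorRank << " dim0=" << t.globalDim[0] << "\n";
  return 0;
}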
{ - LOG_FATAL << "Failed to initialize the TMA descriptor " << result - << std::endl - << T.ToDebugString(); - } - *ret = static_cast(result); - }); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def_packed( + "tvm_tensormap_create_im2col", [](PackedArgs args, Any *ret) { + TensorMapIm2ColArgs T = TensorMapIm2ColArgs::Extract(args); + CUresult result = cuTensorMapEncodeIm2col( + T.map, T.type, T.tensorRank, T.globalAddress, T.globalDim, + T.globalStride + 1, T.pixelBoxLowerCorner, T.pixelBoxUpperCorner, + T.smem_box_channel, T.smem_box_pixel, T.elementStrides, + T.interleave, T.swizzle, T.l2Promotion, T.oobFill); + if (result != CUDA_SUCCESS) { + LOG_FATAL << "Failed to initialize the TMA descriptor " << result + << std::endl + << T.ToDebugString(); + } + *ret = static_cast(result); + }); +}); + #endif // (CUDA_MAJOR_VERSION >= 12) } // namespace tl diff --git a/src/target/codegen_cpp.cc b/src/target/codegen_cpp.cc index c1ce7d033..09a987be7 100644 --- a/src/target/codegen_cpp.cc +++ b/src/target/codegen_cpp.cc @@ -22,27 +22,22 @@ */ #include "codegen_cpp.h" -#include -#include #include #include -#include #include #include #include -#include #include "support/str_escape.h" #include "target/build_common.h" -#include "target/func_registry_generator.h" #include "target/source/codegen_params.h" namespace tvm { namespace codegen { CodeGenTileLangCPP::CodeGenTileLangCPP() { - module_name_ = name_supply_->FreshName("__tvm_module_ctx"); + module_name_ = name_supply_->FreshName("__tvm_ffi_library_ctx"); } void CodeGenTileLangCPP::Init(bool output_ssa, bool emit_asserts, @@ -59,7 +54,7 @@ void CodeGenTileLangCPP::Init(bool output_ssa, bool emit_asserts, } void CodeGenTileLangCPP::InitGlobalContext() { - decl_stream << "void* " << tvm::runtime::symbol::tvm_module_ctx + decl_stream << "void* " << tvm::runtime::symbol::tvm_ffi_library_ctx << " = NULL;\n"; } @@ -384,13 +379,13 @@ void CodeGenTileLangCPP::VisitExpr_(const CallNode *op, const std::string &type = op->args[0].as()->value; const IntImmNode *num = op->args[1].as(); ICHECK(num != nullptr); - static_assert(alignof(TVMValue) % alignof(DLTensor) == 0, "invariant"); - size_t unit = sizeof(TVMValue); + static_assert(alignof(TVMFFIAny) % alignof(DLTensor) == 0, "invariant"); + size_t unit = sizeof(TVMFFIAny); size_t size = 0; if (type == "shape") { - size = (num->value * sizeof(tvm_index_t) + unit - 1) / unit; + size = (num->value * sizeof(runtime::tvm_index_t) + unit - 1) / unit; } else if (type == "arg_value") { - size = (num->value * sizeof(TVMValue) + unit - 1) / unit; + size = (num->value * sizeof(TVMFFIAny) + unit - 1) / unit; } else if (type == "arg_tcode") { size = (num->value * sizeof(int) + unit - 1) / unit; } else if (type == "array") { @@ -399,7 +394,7 @@ void CodeGenTileLangCPP::VisitExpr_(const CallNode *op, LOG(FATAL) << "Unknown stack alloca type " << type; } this->PrintIndent(); - this->stream << "TVMValue " << stack_name << "[" << size << "];\n"; + this->stream << "TVMFFIAny " << stack_name << "[" << size << "];\n"; os << stack_name; } else if (op->op.same_as(builtin::tvm_call_packed_lowered())) { auto function_info = GetFunctionInfo(op, false /* has_resource_handle */); diff --git a/src/target/codegen_cpp.h b/src/target/codegen_cpp.h index 3676c1bbb..c3ce25a0a 100644 --- a/src/target/codegen_cpp.h +++ b/src/target/codegen_cpp.h @@ -95,7 +95,7 @@ class CodeGenTileLangCPP : public CodeGenC { Array function_names_; /*! 
\brief whether to emit asserts in the resulting C code */ bool emit_asserts_; - /*! \brief whether to emit forwared function declarations in the resulting C + /*! \brief whether to emit forward function declarations in the resulting C * code */ bool emit_fwd_func_decl_; diff --git a/src/target/codegen_cuda.cc b/src/target/codegen_cuda.cc index 9a200650c..04906d61b 100644 --- a/src/target/codegen_cuda.cc +++ b/src/target/codegen_cuda.cc @@ -4,7 +4,7 @@ #include "codegen_cuda.h" #include -#include +#include #include #include @@ -39,15 +39,75 @@ static std::string GetFP8Type(DataType type) { LOG(FATAL) << "Only support scalar and vector types of width (2, 4, 8, 16) " "for FP8"; } - if (type.code() == DataType::kFloat8_e4m3fn) { + if (type.is_float8_e4m3fn() || type.is_float8_e4m3fnuz() || + type.is_float8_e4m3()) { stream << "fp8_e4" << vec << "_t"; - } else if (type.code() == DataType::kFloat8_e4m3fnuz) { - stream << "fp8_e4" << vec << "_t"; - } else if (type.code() == DataType::kFloat8_e5m2) { + } else if (type.is_float8_e5m2() || type.is_float8_e5m2fnuz() || + type.is_float8_e5m2()) { stream << "fp8_e5" << vec << "_t"; } else { - LOG(FATAL) << "Unsupported FP8 type in CUDA codegen"; + LOG(FATAL) << "Unsupported FP8 type in CUDA codegen but got " << type; + } + return stream.str(); +} + +std::string GetFP6Type(DataType type) { + std::stringstream stream; + int32_t lanes = type.lanes(); + std::string vec; + if (type.is_scalar()) { + vec = ""; + } else if (lanes == 2) { + vec = "x2"; + } else if (lanes == 4) { + vec = "x4"; + } else if (lanes == 8) { + vec = "x8"; + } else if (lanes == 16) { + vec = "x16"; + } else { + LOG(FATAL) + << "Only support scalar and vector types of width (2, 4) for FP6"; + } + stream << "__nv_fp6"; + std::string suffix; + if (type.code() == DataType::kFloat6_e2m3fn) { + suffix = "_e2m3"; + } else if (type.code() == DataType::kFloat6_e3m2fn) { + suffix = "_e3m2"; + } else { + LOG(FATAL) << "Unsupported FP6 type in CUDA codegen"; + } + stream << vec << suffix; + return stream.str(); +} + +std::string GetFP4Type(DataType type) { + std::stringstream stream; + int32_t lanes = type.lanes(); + std::string vec; + if (type.is_scalar()) { + vec = ""; + } else if (lanes == 2) { + vec = "x2"; + } else if (lanes == 4) { + vec = "x4"; + } else if (lanes == 8) { + vec = "x8"; + } else if (lanes == 16) { + vec = "x16"; + } else { + LOG(FATAL) + << "Only support scalar and vector types of width (2, 4) for FP4"; } + stream << "__nv_fp4"; + std::string suffix; + if (type.code() == DataType::kFloat4_e2m1fn) { + suffix = "_e2m1"; + } else { + LOG(FATAL) << "Unsupported FP4 type in CUDA codegen"; + } + stream << vec << suffix; return stream.str(); } @@ -132,6 +192,9 @@ std::string CodeGenTileLangCUDA::Finish() { decl_stream << "#include \n"; decl_stream << "#include \n"; decl_stream << "#include \n"; + decl_stream << "#ifdef ENABLE_BF16\n"; + decl_stream << "#include \n"; + decl_stream << "#endif\n"; if (need_global_barrier_) { decl_stream << "__device__ unsigned " << vid_global_barrier_state_ @@ -259,6 +322,22 @@ void CodeGenTileLangCUDA::PrintType(DataType t, std::ostream &os) { // NOLINT(*) enable_fp8_ = true; os << GetFP8Type(t); return; + } else if (t.is_float6()) { + enable_fp6_ = true; + if (t.lanes() <= 4) { + os << GetFP6Type(t); + } else { + fail = true; + } + return; + } else if (t.is_float4()) { + enable_fp4_ = true; + if (t.lanes() <= 4) { + os << GetFP4Type(t); + } else { + fail = true; + } + return; } else if (t == DataType::Bool()) { os << "bool"; return; @@ -658,18 
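GetFP6Type/GetFP4Type above assemble CUDA type names from a base prefix, a lane suffix, and a format suffix. A small sketch of that naming scheme for FP4, with TVM's DataType reduced to a plain lane count; the emitted strings follow the pattern in the patch, and the lane handling is simplified:

#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>

std::string Fp4TypeName(int lanes) {
  std::ostringstream name;
  name << "__nv_fp4";                 // base prefix
  if (lanes == 2) name << "x2";       // vector suffix
  else if (lanes == 4) name << "x4";
  else if (lanes != 1)
    throw std::runtime_error("unsupported lane count for FP4");
  name << "_e2m1";                    // the only FP4 layout handled by the codegen
  return name.str();
}

int main() {
  std::cout << Fp4TypeName(1) << "\n";  // __nv_fp4_e2m1
  std::cout << Fp4TypeName(4) << "\n";  // __nv_fp4x4_e2m1
  return 0;
}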
+737,67 @@ void CodeGenTileLangCUDA::VisitExpr_(const CastNode *op, std::ostream &os) { this->PrintIndent(); this->PrintType(target_ty, stream); stream << ' ' << sret << ";\n"; - { - std::string src = SSAGetID(PrintExpr(op->value), from_ty); - for (int i = 0, lanes = from_ty.lanes(); i < lanes; ++i) { - std::ostringstream val; - val << "("; - PrintType(target_ty.element_of(), val); - val << ")("; - PrintVecElemLoad(src, from_ty, i, val); - val << ")"; - PrintVecElemStore(sret, target_ty, i, val.str()); + std::string src = SSAGetID(PrintExpr(op->value), from_ty); + + // Handle bfloat16 special cases with supported ops + bool used_bf16_op = false; + if (from_ty.is_bfloat16() || target_ty.is_bfloat16()) { + std::ostringstream func_name; + if (from_ty.is_bfloat16()) + func_name << "bf16"; + else if (from_ty.is_float()) + func_name << "float"; + if (from_ty.lanes() > 1) + func_name << from_ty.lanes(); + func_name << "2"; + if (target_ty.is_bfloat16()) + func_name << "bf16"; + else if (target_ty.is_float()) + func_name << "float"; + else if (target_ty == DataType::Int(16)) + func_name << "int16"; + if (target_ty.lanes() > 1) + func_name << target_ty.lanes(); + + auto fname = func_name.str(); + if (bf16_supported_ops_.count(fname)) { + used_bf16_op = true; + stream << "#ifdef ENABLE_BF16\n"; + PrintIndent(); + stream << "reinterpret_cast<"; + if (target_ty.is_bfloat16()) + stream << "__nv_bfloat16"; + else + PrintType(target_ty.element_of(), stream); + if (target_ty.lanes() > 1) + stream << target_ty.lanes(); + stream << " &>(" << sret << ") = fastertransformer::" << fname + << "(reinterpret_cast<"; + if (from_ty.is_bfloat16()) + stream << "__nv_bfloat16"; + else + PrintType(from_ty.element_of(), stream); + if (from_ty.lanes() > 1) + stream << from_ty.lanes(); + stream << " const &>(" << src << "));\n"; + stream << "#else\n"; } } + + // Fallback: elementwise cast + for (int i = 0, lanes = from_ty.lanes(); i < lanes; ++i) { + std::ostringstream val; + val << "("; + PrintType(target_ty.element_of(), val); + val << ")("; + PrintVecElemLoad(src, from_ty, i, val); + val << ")"; + PrintVecElemStore(sret, target_ty, i, val.str()); + } + + if (used_bf16_op) { + stream << "#endif\n"; + } os << sret; } @@ -678,7 +806,7 @@ void CodeGenTileLangCUDA::PrintCallExtern(Type ret_type, String global_symbol, bool skip_first_arg, std::ostream &os) { // NOLINT(*) DataType ret_dtype = GetRuntimeDataType(ret_type); - if (ret_dtype.is_vector()) { + if (ret_dtype.is_fixed_length_vector()) { // // Emit an unsupported vector call // @@ -798,14 +926,21 @@ std::string CodeGenTileLangCUDA::GetBufferRef(DataType t, } void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) { - auto print_extern_call_stmt = [&](std::string name, size_t offset = 0) { + auto print_extern_call_stmt = [&](std::string name, size_t start = 0, + size_t end = 0) { + // Cache context into a private ss, otherwise the let node may generate + // within the function call arguments. 
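The bfloat16 cast fast path above keys into bf16_supported_ops_ by concatenating the source type, its lane count, a literal '2', and the target type. A standalone sketch of that name assembly, with the data types reduced to plain strings:

#include <iostream>
#include <set>
#include <sstream>
#include <string>

std::string Bf16CastName(const std::string &from, int from_lanes,
                         const std::string &to, int to_lanes) {
  std::ostringstream name;
  name << from;
  if (from_lanes > 1) name << from_lanes;
  name << "2";  // literal '2' meaning "to", as in bf1622float2
  name << to;
  if (to_lanes > 1) name << to_lanes;
  return name.str();
}

int main() {
  const std::set<std::string> supported = {"bf1622float2", "bf1622int16",
                                           "float22bf162", "bf162bf162"};
  std::string n = Bf16CastName("bf16", 2, "float", 2);  // "bf1622float2"
  std::cout << n << (supported.count(n) ? " -> fast path" : " -> elementwise cast")
            << "\n";
  return 0;
}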
+ std::ostringstream ss; + + for (size_t i = start; i < op->args.size() - end; i++) { + if (i > start) + ss << ", "; + ss << this->PrintExpr(op->args[i]); + } + this->PrintIndent(); this->stream << name << "("; - for (size_t i = offset; i < op->args.size(); i++) { - if (i > offset) - this->stream << ", "; - this->stream << this->PrintExpr(op->args[i]); - } + this->stream << ss.str(); this->stream << ");\n"; }; if (op->op.same_as(builtin::ptx_cp_async())) { @@ -856,28 +991,58 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) { } else if (op->op.same_as(tl::mbarrier_wait_parity())) { print_extern_call_stmt("tl::mbarrier_wait"); } else if (op->op.same_as(tl::sync_thread_partial())) { - print_extern_call_stmt("tl::syncthreads_partial"); + print_extern_call_stmt("cutlass::arch::NamedBarrier::sync"); + } else if (op->op.same_as(tl::no_set_max_nreg())) { + return; } else if (op->op.same_as(tl::tma_load())) { - this->PrintIndent(); + std::ostringstream ss; ICHECK_GE(op->args.size(), 2); - this->stream << "tl::tma_load("; + auto eviction_policy = + this->eviction_policy_names_ + [op->args[op->args.size() - 1].as()->value]; + // Simplify the code by using the default eviction policy + if (eviction_policy != "EVICT_NORMAL") { + ss << "tl::tma_load("; + } else { + ss << "tl::tma_load("; + } auto desc = op->args[0]; - this->stream << this->PrintExpr(desc) << ", "; + ss << this->PrintExpr(desc) << ", "; if (const IntImmNode *imm = op->args[1].as()) { - this->stream << "_mbarrier[" << imm->value << "], "; + ss << "_mbarrier[" << imm->value << "], "; } else { - this->stream << this->PrintExpr(op->args[1]) << ", "; + ss << this->PrintExpr(op->args[1]) << ", "; } - for (size_t i = 2; i < op->args.size(); i++) { + for (size_t i = 2; i < op->args.size() - 1; i++) { if (i > 2) - this->stream << ", "; - this->stream << this->PrintExpr(op->args[i]); + ss << ", "; + ss << this->PrintExpr(op->args[i]); } - this->stream << ");\n"; + ss << ");\n"; + this->PrintIndent(); + this->stream << ss.str(); } else if (op->op.same_as(tl::tma_load_im2col())) { - print_extern_call_stmt("tl::tma_load_im2col"); + std::stringstream ss; + auto eviction_policy = + this->eviction_policy_names_ + [op->args[op->args.size() - 1].as()->value]; + if (eviction_policy != "EVICT_NORMAL") { + ss << "tl::tma_load_im2col"; + } else { + ss << "tl::tma_load_im2col"; + } + print_extern_call_stmt(ss.str(), 0, 1); } else if (op->op.same_as(tl::tma_store())) { - print_extern_call_stmt("tl::tma_store"); + std::stringstream ss; + auto eviction_policy = + this->eviction_policy_names_ + [op->args[op->args.size() - 1].as()->value]; + if (eviction_policy != "EVICT_NORMAL") { + ss << "tl::tma_store"; + } else { + ss << "tl::tma_store"; + } + print_extern_call_stmt(ss.str(), 0, 1); } else if (op->op.same_as(tl::ptx_ldmatirx())) { int trans = Downcast(op->args[0])->value; int num = Downcast(op->args[1])->value; @@ -1111,8 +1276,8 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) { // To store the 32x8 output back to a 16x16 tile in shared or global memory, // we invert this map to determine the output location for each 8 element. 
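The TMA call emission above looks up the trailing eviction-policy operand in eviction_policy_names_ and, for non-default policies, parameterizes the emitted tl::tma_load call. The exact template-argument spelling in the sketch below is an assumption (the CacheHintSm90 enum name is taken from copy_sm90.h later in this patch); treat the emitted string as illustrative only:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

std::string EmitTmaLoad(int policy_index, const std::string &desc,
                        const std::string &mbar, const std::string &coord) {
  static const std::vector<std::string> policies = {"EVICT_NORMAL", "EVICT_FIRST",
                                                    "EVICT_LAST"};
  const std::string &policy = policies.at(policy_index);
  std::ostringstream ss;
  if (policy != "EVICT_NORMAL") {
    // Assumed spelling of the template argument; only the enum name is from the patch.
    ss << "tl::tma_load<static_cast<uint64_t>(tl::CacheHintSm90::" << policy << ")>(";
  } else {
    ss << "tl::tma_load(";  // default hint keeps the generated code short
  }
  ss << desc << ", " << mbar << ", " << coord << ");";
  return ss.str();
}

int main() {
  std::cout << EmitTmaLoad(1, "A_desc", "_mbarrier[0]", "k") << "\n";
  return 0;
}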
- const auto *index_map_func = - runtime::Registry::Get("tir.index_map.shared_16x16_to_mma_32x8_layout"); + const auto index_map_func = ffi::Function::GetGlobal( + "tir.index_map.shared_16x16_to_mma_32x8_layout"); IndexMap index_map; if (!index_map_func) { @@ -1289,6 +1454,118 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) { stream << ": \"l\"((void*)(" << global_buffer << "+" << global_addr << ")), \"r\"((int)" << guard << ")\n"; stream << ");\n"; + } else if (op->op.same_as(builtin::reinterpret())) { + DataType tgt_dtype = op->dtype; + DataType src_dtype = op->args[0]->dtype; + PrimExpr value = op->args[0]; + + // Handle float4_e2m1fn reinterpret + if (!src_dtype.is_float4_e2m1fn() && !tgt_dtype.is_float4_e2m1fn()) { + return CodeGenC::VisitExpr_(op, os); + } + if (src_dtype == tgt_dtype || tgt_dtype.lanes() * tgt_dtype.bits() == + src_dtype.lanes() * src_dtype.bits()) { + return CodeGenC::VisitExpr_(op, os); + } + CHECK_EQ(tgt_dtype.lanes(), src_dtype.lanes()) + << "E2M1 float4 reinterpret expects source and target to have the same " + "number of lanes. " + << "Source dtype: " << src_dtype << ", Target dtype: " << tgt_dtype; + CHECK_EQ(tgt_dtype.bytes(), src_dtype.bytes()) + << "E2M1 float4 reinterpret expects source and target to have the same " + "number of bytes. " + << "Source dtype: " << src_dtype << ", Target dtype: " << tgt_dtype; + + int lanes = tgt_dtype.lanes(); + + int ssa_scope = BeginScope(); + if (lanes == 1) { + // The case of lane=1 is same as the normal reinterpret, + // except that we allow the src and dst dtype to have different number of + // bits. + std::string rhs = SSAGetID(PrintExpr(value), src_dtype); + os << "(*("; + this->PrintType(tgt_dtype, os); + os << " *)(&(" << rhs << ")))"; + } else if (lanes == 2) { + if (tgt_dtype.is_float4_e2m1fn()) { + // We view the source as an uint16, and then extract bits of two fp4 + // numbers, and finally reinterpret the result as fp4x2. + value = + tir::Call(DataType::UInt(16), tir::builtin::reinterpret(), {value}); + tir::Var temp_var("temp_var", DataType::UInt(16)); + value = + tir::Let(temp_var, value, + tir::Cast(DataType::UInt(8), + (temp_var & IntImm(DataType::UInt(16), 0xF)) | + ((temp_var >> 4) & + IntImm(DataType::UInt(16), 0xF0)))); + } else { + value = tir::Cast( + DataType::UInt(16), + tir::Call(DataType::UInt(8), tir::builtin::reinterpret(), {value})); + tir::Var temp_var("temp_var", DataType::UInt(16)); + value = + tir::Let(temp_var, value, + (temp_var & IntImm(DataType::UInt(16), 0xF)) | + ((temp_var & IntImm(DataType::UInt(16), 0xF0)) << 4)); + } + os << PrintExpr( + tir::Call(tgt_dtype, tir::builtin::reinterpret(), {value})); + } else if (lanes == 4) { + if (tgt_dtype.is_float4_e2m1fn()) { + // We view the source as an uint32, and then extract bits of four fp4 + // numbers, and finally reinterpret the result as fp4x4. 
+ value = + tir::Call(DataType::UInt(32), tir::builtin::reinterpret(), {value}); + tir::Var temp_var("temp_var", DataType::UInt(32)); + value = tir::Let( + temp_var, value, + tir::Cast( + DataType::UInt(16), + (temp_var & IntImm(DataType::UInt(32), 0xF)) | + ((temp_var >> 4) & IntImm(DataType::UInt(32), 0xF0)) | + ((temp_var >> 8) & IntImm(DataType::UInt(32), 0xF00)) | + ((temp_var >> 12) & IntImm(DataType::UInt(32), 0xF000)))); + } else { + value = tir::Cast(DataType::UInt(32), + tir::Call(DataType::UInt(16), + tir::builtin::reinterpret(), {value})); + tir::Var temp_var("temp_var", DataType::UInt(32)); + value = tir::Let( + temp_var, value, + (temp_var & IntImm(DataType::UInt(32), 0xF)) | + ((temp_var & IntImm(DataType::UInt(32), 0xF0)) << 4) | + ((temp_var & IntImm(DataType::UInt(32), 0xF00)) << 8) | + ((temp_var & IntImm(DataType::UInt(32), 0xF000)) << 12)); + } + os << PrintExpr( + tir::Call(tgt_dtype, tir::builtin::reinterpret(), {value})); + } else { + LOG(FATAL) << "Invalid number of lanes for float4_e2m1fn reinterpret: " + << lanes; + } + EndScope(ssa_scope); + } else if (op->op.same_as(builtin::thread_return())) { + os << "return"; + } else if (op->op.same_as(tl::tl_gemm())) { + ICHECK(op->args.size() == 4) << "tl_gemm expects 4 arguments , but got " + << op->args.size(); + auto op_instance = Downcast(op->args[0]); + this->PrintCallExtern(GetType(GetRef(op)), op_instance->value, + op->args, true, os); + } else if (op->op.same_as(tl::tl_gemm_sp())) { + ICHECK(op->args.size() == 5) + << "tl_gemm_sp expects 5 arguments , but got " + << op->args.size(); + auto op_instance = Downcast(op->args[0]); + enable_sparse_gemm_ = true; + this->PrintCallExtern(GetType(GetRef(op)), op_instance->value, + op->args, true, os); + } else if (op->op.same_as(tl::tl_shuffle_elect())) { + os << "tl::tl_shuffle_elect<" << PrintExpr(op->args[0]) << ">()"; } else { CodeGenC::VisitExpr_(op, os); } @@ -1404,14 +1681,6 @@ void CodeGenTileLangCUDA::VisitStmt_(const EvaluateNode *op) { stream << " " << vid_global_barrier_expect_ << " = 0;\n"; PrintIndent(); stream << "}\n"; - } else if (call && call->op.same_as(builtin::call_extern())) { - ICHECK(call->args.size() >= 1) - << "call_extern must have at least 1 argument"; - std::string func_name = call->args[0].as()->value; - if (func_name.find("tl::gemm_sp") == 0) { - enable_sparse_gemm_ = true; - } - CodeGenC::VisitStmt_(op); } else { CodeGenC::VisitStmt_(op); } @@ -1433,6 +1702,76 @@ void CodeGenTileLangCUDA::VisitExpr_(const RampNode *op, std::ostream &os) { os << "))"; } +void CodeGenTileLangCUDA::VisitExpr_(const BufferLoadNode *op, + std::ostream &os) { // NOLINT(*) + ICHECK_EQ(op->indices.size(), 1) + << "Load from non-flat memory not supported."; + ICHECK(!op->predicate.defined()) + << "Predicated buffer load is not supported."; + + DataType value_dtype = op->dtype; + PrimExpr index = op->indices[0]; + Var buffer_var = op->buffer->data; + DataType element_dtype = op->buffer->dtype; + + int lanes = op->dtype.lanes(); + // delcare type. 
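The 2-lane float4_e2m1fn reinterpret above keeps lane 0's low nibble in bits 0-3 and moves lane 1's low nibble into bits 4-7 of a single byte. The same masking on raw integers, assuming each source lane carries its 4-bit e2m1 code in the low nibble of a byte:

#include <cstdint>
#include <cstdio>

uint8_t PackFp4x2(uint8_t lane0, uint8_t lane1) {
  // View the two lanes as one little-endian uint16, as the patch does via reinterpret.
  uint16_t temp = static_cast<uint16_t>(lane0) | (static_cast<uint16_t>(lane1) << 8);
  // (temp & 0xF) keeps lane0's nibble; (temp >> 4) & 0xF0 drops lane1's nibble into bits 4-7.
  return static_cast<uint8_t>((temp & 0xF) | ((temp >> 4) & 0xF0));
}

int main() {
  // lane0 = 0x3, lane1 = 0x5  ->  packed byte 0x53
  std::printf("0x%02X\n", PackFp4x2(0x03, 0x05));
  return 0;
}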
+ if (value_dtype.lanes() == element_dtype.lanes()) { + std::string ref = GetBufferRef(op->dtype, op->buffer.get(), index); + HandleVolatileLoads(ref, op, os); + } else { + bool can_vector_load = false; + arith::PVar base; + if (arith::ramp(base, 1, op->dtype.lanes()).Match(index)) { + const RampNode *ramp = index.as(); + ICHECK(ramp); + can_vector_load = true; + // arith::ModularSet me = arith::Analyzer().modular_set(ramp->base); + // The condition: {k * coeff + base} divisible by the alignment for any k + // if (me->coeff % op->dtype.lanes() == 0 && me->base % op->dtype.lanes() + // == 0) { + // can_vector_load = true; + // } + } + + if (value_dtype.is_float4_e2m1fn() && lanes != 1) { + // A float4_e2m1fn element has 4 bits, which is an incomplete byte. + // So we cannot vector load it. + can_vector_load = false; + } + if (can_vector_load) { + std::string ref = GetVecLoad(op->dtype, op->buffer.get(), base.Eval()); + HandleVolatileLoads(ref, op, os); + } else { + std::ostringstream svalue_expr; + std::string sindex = SSAGetID(PrintExpr(index), index.dtype()); + std::string vid = GetVarID(buffer_var.get()); + DataType elem_type = op->dtype.element_of(); + for (int i = 0; i < lanes; ++i) { + std::ostringstream value_temp; + if (!HandleTypeMatch(buffer_var.get(), elem_type)) { + value_temp << "(("; + if (buffer_var.get()->dtype.is_handle()) { + auto it = alloc_storage_scope_.find(buffer_var.get()); + if (it != alloc_storage_scope_.end()) { + PrintStorageScope(it->second, value_temp); + } + } + PrintType(elem_type, value_temp); + value_temp << "*)" << vid << ')'; + } else { + value_temp << vid; + } + value_temp << '['; + PrintVecElemLoad(sindex, index.dtype(), i, value_temp); + value_temp << ']'; + PrintVecElemLoadExpr(op->dtype, i, value_temp.str(), svalue_expr); + } + os << svalue_expr.str(); + } + } +} + void CodeGenTileLangCUDA::VisitExpr_(const BroadcastNode *op, std::ostream &os) { // NOLINT(*) int lanes = static_cast(Downcast(op->lanes)->value); diff --git a/src/target/codegen_cuda.h b/src/target/codegen_cuda.h index 2661c9b9d..7c87c7b21 100644 --- a/src/target/codegen_cuda.h +++ b/src/target/codegen_cuda.h @@ -50,6 +50,7 @@ class CodeGenTileLangCUDA final : public CodeGenC { void VisitStmt_(const EvaluateNode *op) final; void VisitStmt_(const AllocateNode *op) final; void VisitStmt_(const AttrStmtNode *op) final; + void VisitExpr_(const BufferLoadNode *op, std::ostream &os) final; // Override this as a work around for __grid_constant__ parameter void AddFunction(const GlobalVar &gvar, const PrimFunc &f); @@ -80,16 +81,21 @@ class CodeGenTileLangCUDA final : public CodeGenC { std::string vid_global_barrier_state_; // Global barrier expected node. 
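The BufferLoad visitor above only emits a vector load when the index is a unit-stride ramp and the element type is at least a full byte; sub-byte fp4 lanes fall back to per-element loads. A simplified standalone version of that gate, with the ramp reduced to (stride, lanes) and the dtype to bits per lane:

#include <iostream>

bool CanVectorLoad(int stride, int lanes, int bits_per_lane) {
  if (lanes == 1) return false;         // scalar loads take the plain path
  if (stride != 1) return false;        // only unit-stride ramps vectorize
  if (bits_per_lane < 8) return false;  // float4_e2m1fn lanes are sub-byte: element loads
  return true;
}

int main() {
  std::cout << CanVectorLoad(1, 4, 16) << "\n";  // 1: half4-style vector load
  std::cout << CanVectorLoad(1, 4, 4) << "\n";   // 0: fp4 falls back to element loads
  return 0;
}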
std::string vid_global_barrier_expect_; + // whether enable fp16 bool enable_fp16_{false}; // whether enable bf16 bool enable_bf16_{false}; // whether enable fp8 bool enable_fp8_{false}; - // whether enable sparse gemm - bool enable_sparse_gemm_{false}; + // whether enable fp6 + bool enable_fp6_{false}; + // whether enable fp4 + bool enable_fp4_{false}; // whether enable int8 bool enable_int8_{false}; + // whether enable sparse gemm + bool enable_sparse_gemm_{false}; // whether enable warp shuffle intrinsics bool enable_warp_shuffle_{false}; // whether need math_constants.h @@ -120,6 +126,11 @@ class CodeGenTileLangCUDA final : public CodeGenC { const VarNode *variable, std::ostream &os); int32_t GetWmmaFragmentSize(const std::string &scope, const VarNode *variable, int32_t size); + + std::vector eviction_policy_names_ = { + "EVICT_NORMAL", "EVICT_FIRST", "EVICT_LAST"}; + std::unordered_set bf16_supported_ops_ = { + "bf1622float2", "bf1622int16", "float22bf162", "bf162bf162"}; }; } // namespace codegen diff --git a/src/target/codegen_hip.cc b/src/target/codegen_hip.cc index b62ae3385..a45284452 100644 --- a/src/target/codegen_hip.cc +++ b/src/target/codegen_hip.cc @@ -4,7 +4,7 @@ #include "codegen_hip.h" #include -#include +#include #include #include @@ -882,7 +882,7 @@ void CodeGenTileLangHIP::VisitExpr_(const CallNode *op, std::ostream &os) { this->PrintExpr(op->args[i * 2 + 1], os); os << "]" << ((i < 3) ? ", " : ")"); } - } else if (op->op.same_as(builtin::tvm_mfma())) { + } else if (op->op.same_as(tl::tvm_mfma())) { // arg 0: prefix: {otype}_16x16x16{itype} // arg 1: A layout: row/col // arg 2: B layout: row/col @@ -946,6 +946,17 @@ void CodeGenTileLangHIP::VisitExpr_(const CallNode *op, std::ostream &os) { replacer.register_rule("{c_ref}", c_ref); replacer.register_rule("{c_bias}", c_bias); os << replacer.rewrite(call_mfma_code); + } else if (op->op.same_as(builtin::thread_return())) { + os << "return"; + } else if (op->op.same_as(tl::tl_gemm())) { + ICHECK(op->args.size() == 4) << "tl_gemm expects 4 arguments , but got " + << op->args.size(); + auto op_instance = Downcast(op->args[0]); + this->PrintCallExtern(GetType(GetRef(op)), op_instance->value, + op->args, true, os); + } else if (op->op.same_as(tl::tl_gemm_sp())) { + LOG(FATAL) << "tl_gemm_sp is not supported on HIP"; } else { CodeGenC::VisitExpr_(op, os); } diff --git a/src/target/codegen_webgpu.cc b/src/target/codegen_webgpu.cc index d976e6054..b8d2f9d0b 100644 --- a/src/target/codegen_webgpu.cc +++ b/src/target/codegen_webgpu.cc @@ -21,6 +21,7 @@ * \file codegen_webgpu.cc */ #include "codegen_webgpu.h" +#include #include #include @@ -251,9 +252,9 @@ CodeGenTileLangWebGPU::AddFunction(const PrimFunc &f, bool skip_readonly_decl) { os_param_access << "]"; func_info.launch_param_tags.push_back(os_param_access.str()); - ICHECK(!info.has_block_index_z) - << "blockIdx.z is not supported in WebGPU to accomodate large blockIdx.x"; - // anotate workgroup + ICHECK(!info.has_block_index_z) << "blockIdx.z is not supported in WebGPU to " + "accommodate large blockIdx.x"; + // annotate workgroup this->stream << "@compute @workgroup_size(" << info.workgroup_size[0] << ", " << info.workgroup_size[1] << ", " << info.workgroup_size[2] << ")\n"; @@ -704,11 +705,11 @@ class WebGPUSourceModuleNode final : public runtime::ModuleNode { return runtime::ModulePropertyMask::kBinarySerializable; } - PackedFunc GetFunction(const String &name, - const ObjectPtr &sptr_to_self) final { + ffi::Function GetFunction(const String &name, + const ObjectPtr 
&sptr_to_self) final { LOG(FATAL) << "WebGPUSourceModule is not directly runnable, export and run " "through tvmjs"; - return PackedFunc(nullptr); + return ffi::Function(nullptr); } void SaveToBinary(dmlc::Stream *stream) final { @@ -773,10 +774,13 @@ runtime::Module BuildTileLangWebGPU(IRModule mod, Target target) { return runtime::Module(n); } -TVM_REGISTER_GLOBAL("target.build.tilelang_webgpu") - .set_body_typed([](IRModule mod, Target target) { - return BuildTileLangWebGPU(mod, target); - }); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("target.build.tilelang_webgpu", + [](IRModule mod, Target target) { + return BuildTileLangWebGPU(mod, target); + }); +}); } // namespace codegen } // namespace tvm diff --git a/src/target/rt_mod_cpp.cc b/src/target/rt_mod_cpp.cc index ff07eecae..a7f2e62b9 100644 --- a/src/target/rt_mod_cpp.cc +++ b/src/target/rt_mod_cpp.cc @@ -1,10 +1,10 @@ #include "codegen_cpp.h" +#include namespace tvm { namespace codegen { runtime::Module BuildCPPHost(IRModule mod, Target target) { - using tvm::runtime::Registry; bool output_ssa = false; bool emit_asserts = false; bool emit_fwd_func_decl = true; @@ -67,7 +67,10 @@ runtime::Module BuildCPPHost(IRModule mod, Target target) { return CSourceModuleCreate(code, "c", cg.GetFunctionNames()); } -TVM_REGISTER_GLOBAL("target.build.tilelang_cpp").set_body_typed(BuildCPPHost); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("target.build.tilelang_cpp", BuildCPPHost); +}); } // namespace codegen } // namespace tvm diff --git a/src/target/rt_mod_cuda.cc b/src/target/rt_mod_cuda.cc index c477eca7c..63a9f020b 100644 --- a/src/target/rt_mod_cuda.cc +++ b/src/target/rt_mod_cuda.cc @@ -1,5 +1,7 @@ #include "codegen_cuda.h" #include "runtime/cuda/cuda_module.h" +#include "runtime/pack_args.h" +#include namespace tvm { namespace codegen { @@ -18,7 +20,7 @@ ExtractFuncInfo(const IRModule &mod) { if (f->params[i]->dtype.is_handle()) { auto ptr = f->params[i]->type_annotation.as(); if (ptr && ptr->storage_scope == "grid_constant") { - info.arg_types.push_back(DataType(kTVMGridConstant, 64, 1)); + info.arg_types.push_back(DataType(runtime::kDLGridConstant, 64, 1)); continue; } } @@ -36,7 +38,6 @@ ExtractFuncInfo(const IRModule &mod) { } runtime::Module BuildTileLangCUDA(IRModule mod, Target target) { - using tvm::runtime::Registry; bool output_ssa = false; CodeGenTileLangCUDA cg; cg.Init(output_ssa); @@ -52,13 +53,15 @@ runtime::Module BuildTileLangCUDA(IRModule mod, Target target) { } std::string code = cg.Finish(); - if (const auto *f = Registry::Get("tilelang_callback_cuda_postproc")) { - code = (*f)(code, target).operator std::string(); + if (const auto f = + ffi::Function::GetGlobal("tilelang_callback_cuda_postproc")) { + code = (*f)(code, target).cast(); } std::string fmt = "ptx"; std::string ptx; - if (const auto *f = Registry::Get("tilelang_callback_cuda_compile")) { - ptx = (*f)(code, target).operator std::string(); + if (const auto f = + ffi::Function::GetGlobal("tilelang_callback_cuda_compile")) { + ptx = (*f)(code, target).cast(); if (ptx[0] != '/') fmt = "cubin"; } else { @@ -68,7 +71,6 @@ runtime::Module BuildTileLangCUDA(IRModule mod, Target target) { } runtime::Module BuildTileLangCUDAWithoutCompile(IRModule mod, Target target) { - using tvm::runtime::Registry; bool output_ssa = false; CodeGenTileLangCUDA cg; cg.Init(output_ssa); @@ -84,16 +86,20 @@ runtime::Module BuildTileLangCUDAWithoutCompile(IRModule mod, Target target) { } 
std::string code = cg.Finish(); - if (const auto *f = Registry::Get("tilelang_callback_cuda_postproc")) { - code = (*f)(code, target).operator std::string(); + if (const auto f = + ffi::Function::GetGlobal("tilelang_callback_cuda_postproc")) { + code = (*f)(code, target).cast(); } return runtime::CUDAModuleCreate("ptx", "ptx", ExtractFuncInfo(mod), code); } -TVM_REGISTER_GLOBAL("target.build.tilelang_cuda") - .set_body_typed(BuildTileLangCUDA); -TVM_REGISTER_GLOBAL("target.build.tilelang_cuda_without_compile") - .set_body_typed(BuildTileLangCUDAWithoutCompile); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef() + .def("target.build.tilelang_cuda", BuildTileLangCUDA) + .def("target.build.tilelang_cuda_without_compile", + BuildTileLangCUDAWithoutCompile); +}); } // namespace codegen } // namespace tvm diff --git a/src/target/rt_mod_hip.cc b/src/target/rt_mod_hip.cc index 53d09472d..d0041f570 100644 --- a/src/target/rt_mod_hip.cc +++ b/src/target/rt_mod_hip.cc @@ -1,5 +1,6 @@ #if defined(__linux__) #include +#include #endif #include @@ -7,6 +8,11 @@ #include "codegen_hip.h" #include "runtime/rocm/rocm_module.h" +#include + +#ifndef kTVMGridConstant +#define kTVMGridConstant 130 +#endif namespace tvm { namespace codegen { @@ -43,7 +49,6 @@ ExtractFuncInfo(const IRModule &mod) { } runtime::Module BuildTileLangHIP(IRModule mod, Target target) { - using tvm::runtime::Registry; bool output_ssa = false; CodeGenTileLangHIP cg; cg.Init(output_ssa); @@ -58,23 +63,28 @@ runtime::Module BuildTileLangHIP(IRModule mod, Target target) { } std::string code = cg.Finish(); - if (const auto *f = Registry::Get("tilelang_callback_hip_postproc")) { - code = (*f)(code, target).operator std::string(); + + // Use the new FFI API to get registered functions + using ffi::Function; + if (auto f = Function::GetGlobal("tilelang_callback_hip_postproc")) { + code = (*f)(code, target).cast(); } + std::string fmt = "ptx"; std::string ptx; - if (const auto *f = Registry::Get("tilelang_callback_hip_compile")) { - ptx = (*f)(code, target).operator std::string(); + + if (auto f = Function::GetGlobal("tilelang_callback_hip_compile")) { + ptx = (*f)(code, target).cast(); if (ptx[0] != '/') fmt = "hsaco"; } else { ICHECK(false) << "tilelang_callback_hip_compile is not set"; } + return ROCMModuleCreate(ptx, fmt, ExtractFuncInfo(mod), code, std::string()); } runtime::Module BuildTileLangHIPWithoutCompile(IRModule mod, Target target) { - using tvm::runtime::Registry; bool output_ssa = false; CodeGenTileLangHIP cg; cg.Init(output_ssa); @@ -89,16 +99,24 @@ runtime::Module BuildTileLangHIPWithoutCompile(IRModule mod, Target target) { } std::string code = cg.Finish(); - if (const auto *f = Registry::Get("tilelang_callback_hip_postproc")) { - code = (*f)(code, target).operator std::string(); + + // Use the new FFI API to get registered functions + using ffi::Function; + if (auto f = Function::GetGlobal("tilelang_callback_hip_postproc")) { + code = (*f)(code, target).cast(); } + return ROCMModuleCreate("ptx", "fmt", ExtractFuncInfo(mod), code, std::string()); } -TVM_REGISTER_GLOBAL("target.build.tilelang_hip") - .set_body_typed(BuildTileLangHIP); -TVM_REGISTER_GLOBAL("target.build.tilelang_hip_without_compile") - .set_body_typed(BuildTileLangHIPWithoutCompile); + +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef() + .def("target.build.tilelang_hip", BuildTileLangHIP) + .def("target.build.tilelang_hip_without_compile", + BuildTileLangHIPWithoutCompile); +}); } // 
namespace codegen -} // namespace tvm +} // namespace tvm \ No newline at end of file diff --git a/src/target/utils.cc b/src/target/utils.cc index 0e77032eb..d3c49a26f 100644 --- a/src/target/utils.cc +++ b/src/target/utils.cc @@ -50,7 +50,14 @@ bool TargetIsHopper(Target target) { if (!TargetIsCuda(target)) return false; int arch = GetArchInt(target); - return arch >= 90; + return arch >= 90 && arch < 100; +} + +bool TargetIsSM120(Target target) { + if (!TargetIsCuda(target)) + return false; + int arch = GetArchInt(target); + return arch >= 120 && arch < 130; } bool TargetIsCDNA(Target target) { @@ -97,5 +104,12 @@ bool TargetHasStmatrix(Target target) { return arch >= 90; } +int TargetGetWarpSize(Target target) { + int res = 32; + if (TargetIsCDNA(target)) + res = 64; + return res; +} + } // namespace tl } // namespace tvm diff --git a/src/target/utils.h b/src/target/utils.h index 96b0cd219..2526acd60 100644 --- a/src/target/utils.h +++ b/src/target/utils.h @@ -19,11 +19,13 @@ bool TargetIsVolta(Target target); bool TargetIsTuring(Target target); bool TargetIsAmpere(Target target); bool TargetIsHopper(Target target); +bool TargetIsSM120(Target target); bool TargetIsCDNA(Target target); bool TargetHasAsyncCopy(Target target); bool TargetHasLdmatrix(Target target); bool TargetHasStmatrix(Target target); +int TargetGetWarpSize(Target target); } // namespace tl } // namespace tvm diff --git a/src/tl_templates/cpp/half.hpp b/src/tl_templates/cpp/half.hpp index 395cff938..0107b3d44 100644 --- a/src/tl_templates/cpp/half.hpp +++ b/src/tl_templates/cpp/half.hpp @@ -284,7 +284,7 @@ #endif #ifndef HALF_ENABLE_F16C_INTRINSICS -/// Enable F16C intruction set intrinsics. +/// Enable F16C instruction set intrinsics. /// Defining this to 1 enables the use of [F16C compiler /// intrinsics](https://en.wikipedia.org/wiki/F16C) for converting between /// half-precision and single-precision values which may result in improved @@ -1674,7 +1674,7 @@ template T half2float(unsigned int value) { /// \tparam R rounding mode to use /// \tparam E `true` for round to even, `false` for round away from zero /// \tparam I `true` to raise INEXACT exception (if inexact), `false` to never -/// raise it \tparam T type to convert to (buitlin integer type with at least 16 +/// raise it \tparam T type to convert to (builtin integer type with at least 16 /// bits precision, excluding any implicit sign bits) \param value /// half-precision value to convert \return rounded integer value \exception /// FE_INVALID if value is not representable in type \a T \exception FE_INEXACT @@ -1778,7 +1778,7 @@ inline uint32 divide64(uint32 x, uint32 y, int &s) { /// \tparam R `true` to compute signed remainder, `false` for positive remainder /// \param x first operand as positive finite half-precision value /// \param y second operand as positive finite half-precision value -/// \param quo adress to store quotient at, `nullptr` if \a Q `false` +/// \param quo address to store quotient at, `nullptr` if \a Q `false` /// \return modulus of \a x / \a y template unsigned int mod(unsigned int x, unsigned int y, int *quo = NULL) { @@ -2435,7 +2435,7 @@ template struct half_caster; /// Half-precision floating-point type. /// This class implements an IEEE-conformant half-precision floating-point type /// with the usual arithmetic operators and conversions. 
It is implicitly -/// convertible to single-precision floating-point, which makes artihmetic +/// convertible to single-precision floating-point, which makes arithmetic /// expressions and functions with mixed-type operands to be of the most precise /// operand type. /// @@ -2445,9 +2445,9 @@ template struct half_caster; /// which means it can be standard-conformantly copied using raw binary copies. /// But in this context some more words about the actual size of the type. /// Although the half is representing an IEEE 16-bit type, it does not -/// neccessarily have to be of exactly 16-bits size. But on any reasonable +/// necessarily have to be of exactly 16-bits size. But on any reasonable /// implementation the actual binary representation of this type will most -/// probably not ivolve any additional "magic" or padding beyond the simple +/// probably not involve any additional "magic" or padding beyond the simple /// binary representation of the underlying 16-bit IEEE number, even if not /// strictly guaranteed by the standard. But even then it only has an actual /// size of 16 bits if your C++ implementation supports an unsigned integer type @@ -2801,7 +2801,7 @@ template <> class numeric_limits { static HALF_CONSTEXPR_CONST bool traps = true; #else /// Traps only if [HALF_ERRHANDLING_THROW_...](\ref - /// HALF_ERRHANDLING_THROW_INVALID) is acitvated. + /// HALF_ERRHANDLING_THROW_INVALID) is activated. static HALF_CONSTEXPR_CONST bool traps = false; #endif @@ -5067,7 +5067,7 @@ inline half frexp(half arg, int *exp) { /// [std::scalbln](https://en.cppreference.com/w/cpp/numeric/math/scalbn). /// \param arg number to modify /// \param exp power of two to multiply with -/// \return \a arg multplied by 2 raised to \a exp +/// \return \a arg multiplied by 2 raised to \a exp /// \exception FE_INVALID for signaling NaN /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding inline half scalbln(half arg, long exp) { @@ -5096,7 +5096,7 @@ inline half scalbln(half arg, long exp) { /// **See also:** Documentation for /// [std::scalbn](https://en.cppreference.com/w/cpp/numeric/math/scalbn). \param /// arg number to modify \param exp power of two to multiply with \return \a arg -/// multplied by 2 raised to \a exp \exception FE_INVALID for signaling NaN +/// multiplied by 2 raised to \a exp \exception FE_INVALID for signaling NaN /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding inline half scalbn(half arg, int exp) { return scalbln(arg, exp); } @@ -5106,7 +5106,7 @@ inline half scalbn(half arg, int exp) { return scalbln(arg, exp); } /// **See also:** Documentation for /// [std::ldexp](https://en.cppreference.com/w/cpp/numeric/math/ldexp). \param /// arg number to modify \param exp power of two to multiply with \return \a arg -/// multplied by 2 raised to \a exp \exception FE_INVALID for signaling NaN +/// multiplied by 2 raised to \a exp \exception FE_INVALID for signaling NaN /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding inline half ldexp(half arg, int exp) { return scalbln(arg, exp); } @@ -5379,7 +5379,7 @@ inline HALF_CONSTEXPR bool islessequal(half x, half y) { !isnan(x) && !isnan(y); } -/// Quiet comarison for less or greater. +/// Quiet comparison for less or greater. /// **See also:** Documentation for /// [std::islessgreater](https://en.cppreference.com/w/cpp/numeric/math/islessgreater). 
/// \param x first operand @@ -5503,7 +5503,7 @@ inline int feraiseexcept(int excepts) { /// /// **See also:** Documentation for /// [std::fegetexceptflag](https://en.cppreference.com/w/cpp/numeric/fenv/feexceptflag). -/// \param flagp adress to store flag state at +/// \param flagp address to store flag state at /// \param excepts OR of flags to save /// \retval 0 for success inline int fegetexceptflag(int *flagp, int excepts) { @@ -5520,7 +5520,7 @@ inline int fegetexceptflag(int *flagp, int excepts) { /// /// **See also:** Documentation for /// [std::fesetexceptflag](https://en.cppreference.com/w/cpp/numeric/fenv/feexceptflag). -/// \param flagp adress to take flag state from +/// \param flagp address to take flag state from /// \param excepts OR of flags to restore /// \retval 0 for success inline int fesetexceptflag(const int *flagp, int excepts) { diff --git a/src/tl_templates/cuda/common.h b/src/tl_templates/cuda/common.h index d92b58b3f..409ec84de 100644 --- a/src/tl_templates/cuda/common.h +++ b/src/tl_templates/cuda/common.h @@ -48,7 +48,7 @@ using int4_t = int4; } \ } while (0) -// abs function for bfloat_t and half_t since there is no implicit convertion +// abs function for bfloat_t and half_t since there is no implicit conversion // method TL_PATCH TL_DEVICE half_t __habs(const half_t x) { return half_t(__habs(x.to_half())); @@ -241,4 +241,13 @@ TL_DEVICE void __sync_thread_partial() { asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(thread_count)); } +template TL_DEVICE bool tl_shuffle_elect() { + if constexpr (thread_extent == 0) { + return cutlass::canonical_warp_idx_sync() == 0 && cute::elect_one_sync(); + } + return __shfl_sync(0xffffffff, (threadIdx.x / 32) % (thread_extent / 32), + 0) == 0 && + cute::elect_one_sync(); +} + } // namespace tl diff --git a/src/tl_templates/cuda/copy_sm90.h b/src/tl_templates/cuda/copy_sm90.h index 10f9bc1e0..4a17543bf 100644 --- a/src/tl_templates/cuda/copy_sm90.h +++ b/src/tl_templates/cuda/copy_sm90.h @@ -7,6 +7,11 @@ #include "common.h" namespace tl { +enum class CacheHintSm90 : uint64_t { + EVICT_NORMAL = 0x1000000000000000, + EVICT_FIRST = 0x12F0000000000000, + EVICT_LAST = 0x14F0000000000000, +}; TL_DEVICE void tma_load(void *smem_ptr, void *gmem_ptr, uint64_t &smem_mbar, uint32_t size) { @@ -30,20 +35,22 @@ TL_DEVICE void tma_load_multicast(void *smem_ptr, void *gmem_ptr, :); } +template TL_DEVICE void tma_load(const CUtensorMap &descriptor, uint64_t &smem_mbar, void const *const smem_ptr, int32_t const &crd0) { uint64_t gmem_int_desc = reinterpret_cast(&descriptor); uint32_t smem_int_mbar = smem_ptr_to_uint(&smem_mbar); uint32_t smem_int_ptr = smem_ptr_to_uint(smem_ptr); asm volatile("cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::" - "complete_tx::bytes" - " [%0], [%1, {%3}], [%2];" + "complete_tx::bytes.L2::cache_hint" + " [%0], [%1, {%3}], [%2], %4;" : : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), - "r"(crd0) + "r"(crd0), "l"(cache_hint) : "memory"); } +template TL_DEVICE void tma_load(const CUtensorMap &descriptor, uint64_t &smem_mbar, void const *const smem_ptr, int32_t const &crd0, int32_t const &crd1) { @@ -51,14 +58,15 @@ TL_DEVICE void tma_load(const CUtensorMap &descriptor, uint64_t &smem_mbar, uint32_t smem_int_mbar = smem_ptr_to_uint(&smem_mbar); uint32_t smem_int_ptr = smem_ptr_to_uint(smem_ptr); asm volatile("cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::" - "complete_tx::bytes" - " [%0], [%1, {%3, %4}], [%2];" + "complete_tx::bytes.L2::cache_hint" + " [%0], [%1, {%3, %4}], 
[%2], %5;" : : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), - "r"(crd0), "r"(crd1) + "r"(crd0), "r"(crd1), "l"(cache_hint) : "memory"); } +template TL_DEVICE void tma_load(const CUtensorMap &descriptor, uint64_t &smem_mbar, void const *const smem_ptr, int32_t const &crd0, int32_t const &crd1, int32_t const &crd2) { @@ -66,14 +74,14 @@ TL_DEVICE void tma_load(const CUtensorMap &descriptor, uint64_t &smem_mbar, uint32_t smem_int_mbar = smem_ptr_to_uint(&smem_mbar); uint32_t smem_int_ptr = smem_ptr_to_uint(smem_ptr); asm volatile("cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::" - "complete_tx::bytes" - " [%0], [%1, {%3, %4, %5}], [%2];" + "complete_tx::bytes.L2::cache_hint" + " [%0], [%1, {%3, %4, %5}], [%2], %6;" : : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), - "r"(crd0), "r"(crd1), "r"(crd2) + "r"(crd0), "r"(crd1), "r"(crd2), "l"(cache_hint) : "memory"); } - +template TL_DEVICE void tma_load(const CUtensorMap &descriptor, uint64_t &smem_mbar, void const *const smem_ptr, int32_t const &crd0, int32_t const &crd1, int32_t const &crd2, @@ -82,14 +90,15 @@ TL_DEVICE void tma_load(const CUtensorMap &descriptor, uint64_t &smem_mbar, uint32_t smem_int_mbar = smem_ptr_to_uint(&smem_mbar); uint32_t smem_int_ptr = smem_ptr_to_uint(smem_ptr); asm volatile("cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::" - "complete_tx::bytes" - " [%0], [%1, {%3, %4, %5, %6}], [%2];" + "complete_tx::bytes.L2::cache_hint" + " [%0], [%1, {%3, %4, %5, %6}], [%2], %7;" : : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), - "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3) + "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "l"(cache_hint) : "memory"); } +template TL_DEVICE void tma_load(const CUtensorMap &descriptor, uint64_t &smem_mbar, void const *const smem_ptr, int32_t const &crd0, int32_t const &crd1, int32_t const &crd2, @@ -98,14 +107,16 @@ TL_DEVICE void tma_load(const CUtensorMap &descriptor, uint64_t &smem_mbar, uint32_t smem_int_mbar = smem_ptr_to_uint(&smem_mbar); uint32_t smem_int_ptr = smem_ptr_to_uint(smem_ptr); asm volatile("cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::" - "complete_tx::bytes" - " [%0], [%1, {%3, %4, %5, %6, %7}], [%2];" + "complete_tx::bytes.L2::cache_hint" + " [%0], [%1, {%3, %4, %5, %6, %7}], [%2], %8;" : : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), - "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4) + "r"(crd0), "r"(crd1), "r"(crd2), "r"(crd3), "r"(crd4), + "l"(cache_hint) : "memory"); } +template TL_DEVICE void tma_load_im2col(const CUtensorMap &descriptor, uint64_t &smem_mbar, void const *const smem_ptr, int32_t const &coord_c, int32_t const &coord_w, @@ -116,90 +127,83 @@ TL_DEVICE void tma_load_im2col(const CUtensorMap &descriptor, uint32_t smem_int_mbar = smem_ptr_to_uint(&smem_mbar); uint32_t smem_int_ptr = smem_ptr_to_uint(smem_ptr); asm volatile("cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier:" - ":complete_tx::bytes" - " [%0], [%1, {%3, %4, %5, %6}], [%2], {%7, %8};" + ":complete_tx::bytes.L2::cache_hint" + " [%0], [%1, {%3, %4, %5, %6}], [%2], {%7, %8}, %9;" : : "r"(smem_int_ptr), "l"(gmem_int_desc), "r"(smem_int_mbar), "r"(coord_c), "r"(coord_w), "r"(coord_h), "r"(coord_n), - "h"(offset_w), "h"(offset_h) + "h"(offset_w), "h"(offset_h), "l"(cache_hint) : "memory"); } -TL_DEVICE void tma_store(void *dst_gmem_ptr, void *smem_ptr, uint32_t size) { - uint32_t smem_int_ptr = smem_ptr_to_uint(smem_ptr); - asm volatile( - "cp.async.bulk.global.shared::cta.bulk_group [%1], [%0], %2; \n" 
::"r"( - smem_int_ptr), - "l"(dst_gmem_ptr), "r"(size) - :); -} - +template TL_DEVICE void tma_store(const CUtensorMap &descriptor, void const *const smem_ptr, int32_t const &crd0) { uint64_t gmem_int_desc = reinterpret_cast(&descriptor); uint32_t smem_int_ptr = smem_ptr_to_uint(smem_ptr); - - asm volatile( - "cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [%0, {%2}], [%1];" - : - : "l"(gmem_int_desc), "r"(smem_int_ptr), "r"(crd0) - : "memory"); + asm volatile("cp.async.bulk.tensor.1d.global.shared::cta.bulk_group " + ".L2::cache_hint [%0, {%2}], [%1], %3;" + : + : "l"(gmem_int_desc), "r"(smem_int_ptr), "r"(crd0), + "l"(cache_hint) + : "memory"); } +template TL_DEVICE void tma_store(const CUtensorMap &descriptor, void const *const smem_ptr, int32_t const &crd0, int32_t const &crd1) { uint64_t gmem_int_desc = reinterpret_cast(&descriptor); uint32_t smem_int_ptr = smem_ptr_to_uint(smem_ptr); - - asm volatile("cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [%0, " - "{%2, %3}], [%1];" + asm volatile("cp.async.bulk.tensor.2d.global.shared::cta.bulk_group " + ".L2::cache_hint [%0, {%2, %3}], [%1], %4;" : - : "l"(gmem_int_desc), "r"(smem_int_ptr), "r"(crd0), "r"(crd1) + : "l"(gmem_int_desc), "r"(smem_int_ptr), "r"(crd0), "r"(crd1), + "l"(cache_hint) : "memory"); } +template TL_DEVICE void tma_store(const CUtensorMap &descriptor, void const *const smem_ptr, int32_t const &crd0, int32_t const &crd1, int32_t const &crd2) { uint64_t gmem_int_desc = reinterpret_cast(&descriptor); uint32_t smem_int_ptr = smem_ptr_to_uint(smem_ptr); - - asm volatile("cp.async.bulk.tensor.3d.global.shared::cta.bulk_group [%0, " - "{%2, %3, %4}], [%1];" + asm volatile("cp.async.bulk.tensor.3d.global.shared::cta.bulk_group " + ".L2::cache_hint [%0, {%2, %3, %4}], [%1], %5;" : : "l"(gmem_int_desc), "r"(smem_int_ptr), "r"(crd0), "r"(crd1), - "r"(crd2) + "r"(crd2), "l"(cache_hint) : "memory"); } +template TL_DEVICE void tma_store(const CUtensorMap &descriptor, void const *const smem_ptr, int32_t const &crd0, int32_t const &crd1, int32_t const &crd2, int32_t const &crd3) { uint64_t gmem_int_desc = reinterpret_cast(&descriptor); uint32_t smem_int_ptr = smem_ptr_to_uint(smem_ptr); - - asm volatile("cp.async.bulk.tensor.4d.global.shared::cta.bulk_group [%0, " - "{%2, %3, %4, %5}], [%1];" + asm volatile("cp.async.bulk.tensor.4d.global.shared::cta.bulk_group " + ".L2::cache_hint [%0, {%2, %3, %4, %5}], [%1], %6;" : : "l"(gmem_int_desc), "r"(smem_int_ptr), "r"(crd0), "r"(crd1), - "r"(crd2), "r"(crd3) + "r"(crd2), "r"(crd3), "l"(cache_hint) : "memory"); } +template TL_DEVICE void tma_store(const CUtensorMap &descriptor, void const *const smem_ptr, int32_t const &crd0, int32_t const &crd1, int32_t const &crd2, int32_t const &crd3, int32_t const &crd4) { uint64_t gmem_int_desc = reinterpret_cast(&descriptor); uint32_t smem_int_ptr = smem_ptr_to_uint(smem_ptr); - - asm volatile("cp.async.bulk.tensor.5d.global.shared::cta.bulk_group [%0, " - "{%2, %3, %4, %5, %6}], [%1];" + asm volatile("cp.async.bulk.tensor.5d.global.shared::cta.bulk_group " + ".L2::cache_hint [%0, {%2, %3, %4, %5, %6}], [%1], %7;" : : "l"(gmem_int_desc), "r"(smem_int_ptr), "r"(crd0), "r"(crd1), - "r"(crd2), "r"(crd3), "r"(crd4) + "r"(crd2), "r"(crd3), "r"(crd4), "l"(cache_hint) : "memory"); } @@ -215,15 +219,54 @@ TL_DEVICE void mbarrier_init(uint64_t &smem_barrier, uint32_t arrive_count) { : "r"(arrive_count), "r"(smem_int_ptr)); } +TL_DEVICE uint32_t mbarrier_try_wait(uint64_t &smem_barrier, int phase_bit) { + + uint32_t smem_int_ptr = 
smem_ptr_to_uint(&smem_barrier); + uint32_t waitComplete; + + asm volatile("{\n\t" + ".reg .pred P1; \n\t" + "mbarrier.try_wait.parity.shared.b64 P1, [%1], %2; \n\t" + "selp.b32 %0, 1, 0, P1; \n\t" + "}" + : "=r"(waitComplete) + : "r"(smem_int_ptr), "r"(phase_bit)); + + return waitComplete; +} + TL_DEVICE void mbarrier_wait(uint64_t &smem_barrier, int phase_bit) { + if (mbarrier_try_wait(smem_barrier, phase_bit) == 0) { + uint32_t smem_int_ptr = smem_ptr_to_uint(&smem_barrier); + // Arbitrarily large timer value after which try-wait expires and re-tries. + uint32_t ticks = 0x989680; + asm volatile("{\n\t" + ".reg .pred P1; \n\t" + "LAB_WAIT: \n\t" + "mbarrier.try_wait.parity.shared.b64 P1, [%0], %1, %2; \n\t" + "@P1 bra DONE; \n\t" + "bra LAB_WAIT; \n\t" + "DONE: \n\t" + "}" + : + : "r"(smem_int_ptr), "r"(phase_bit), "r"(ticks)); + } +} + +TL_DEVICE void mbarrier_test_wait(uint64_t &smem_barrier, int phase_bit) { uint32_t smem_int_ptr = smem_ptr_to_uint(&smem_barrier); - asm volatile("{\n" - ".reg .pred P1;\n" - "LAB_WAIT:\n" - "mbarrier.try_wait.parity.shared.b64 P1, [%0], %1;\n" - "@!P1 bra.uni LAB_WAIT;\n" - "}\n" ::"r"(smem_int_ptr), - "r"(phase_bit)); + asm volatile( + "{\n" + ".reg .pred P1;\n" + "LAB_WAIT:\n" + "mbarrier.test_wait.parity.shared::cta.b64 P1, [%0], %1;\n" + "@P1 bra.uni DONE;\n" + "nanosleep.u32 5;\n" // wait a few nanoseconds on pre-Hopper architectures + // to save instruction issue slots + "bra.uni LAB_WAIT;\n" + "DONE:\n" + "}\n" ::"r"(smem_int_ptr), + "r"(phase_bit)); } TL_DEVICE void mbarrier_arrive(uint64_t &smem_barrier) { @@ -231,6 +274,20 @@ TL_DEVICE void mbarrier_arrive(uint64_t &smem_barrier) { asm volatile("mbarrier.arrive.shared.b64 _, [%0];" : : "r"(smem_int_ptr)); } +TL_DEVICE void mbarrier_arrive(uint64_t &smem_barrier, int cta_id, + uint32_t pred) { + uint32_t smem_int_ptr = smem_ptr_to_uint(&smem_barrier); + if (pred) { + asm volatile("{\n\t" + ".reg .b32 remAddr32;\n\t" + "mapa.shared::cluster.u32 remAddr32, %0, %1;\n\t" + "mbarrier.arrive.shared::cluster.b64 _, [remAddr32];\n\t" + "}" + : + : "r"(smem_int_ptr), "r"(cta_id)); + } +} + TL_DEVICE void mbarrier_expect_tx(uint64_t &smem_barrier, uint32_t transaction_bytes) { uint32_t smem_int_ptr = smem_ptr_to_uint(&smem_barrier); diff --git a/src/tl_templates/cuda/cuda_bf16_fallbacks.cuh b/src/tl_templates/cuda/cuda_bf16_fallbacks.cuh new file mode 100644 index 000000000..f5641f616 --- /dev/null +++ b/src/tl_templates/cuda/cuda_bf16_fallbacks.cuh @@ -0,0 +1,257 @@ +// Downloaded from from FasterTransformer v5.2.1 +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/utils/cuda_bf16_fallbacks.cuh +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
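// A minimal sketch of the phase-bit protocol the mbarrier helpers above implement,
// assuming they are in scope as defined here; the byte count and the elided TMA issue
// are hypothetical placeholders.
TL_DEVICE void mbarrier_phase_demo(uint64_t &bar, int &phase) {
  constexpr uint32_t copy_bytes = 128 * 64 * 2; // e.g. one 128x64 fp16 tile
  if (threadIdx.x == 0) {
    mbarrier_expect_tx(bar, copy_bytes); // announce the expected transaction size
    // ... issue the TMA load that completes `bar` here ...
  }
  mbarrier_wait(bar, phase); // bounded try_wait loop with retry, defined above
  phase ^= 1;                // the next iteration waits on the flipped parity
}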
+ */ + +#pragma once + +#include "cuda_bf16_wrapper.h" +#include + +namespace fastertransformer { + +#ifdef ENABLE_BF16 +inline __device__ float2 bf1622float2(const __nv_bfloat162 val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float2 f_val; + f_val.x = __low2float(val); + f_val.y = __high2float(val); + return f_val; +#else + return __bfloat1622float2(val); +#endif +} + +inline __device__ int16_t bf1622int16(__nv_bfloat162 val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float2 f_val; + f_val.x = max(min(__low2float(val), 127.f), -128.f); + f_val.y = max(min(__high2float(val), 127.f), -128.f); + union { int8_t int8[2]; int16_t int16; }; + int8[0] = static_cast(static_cast(f_val.x)); + int8[1] = static_cast(static_cast(f_val.y)); + return int16; +#else + val = __hmin2(val, make_bfloat162(127., 127.)); + val = __hmax2(val, make_bfloat162(-128., -128.)); + union { int8_t int8[2]; int16_t int16; }; + int8[0] = static_cast(static_cast(val.x)); + int8[1] = static_cast(static_cast(val.y)); + return int16; +#endif +} + +inline __device__ __nv_bfloat162 float22bf162(const float2 val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __floats2bfloat162_rn(val.x, val.y); +#else + return __float22bfloat162_rn(val); +#endif +} + +inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + __nv_bfloat162 val2; + val2.x = val; + val2.y = val; + return val2; +#else + return __bfloat162bfloat162(val); +#endif +} + +inline __device__ __nv_bfloat162 bf16hadd2(const __nv_bfloat162 x, const __nv_bfloat162 y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fxl, fxh, fyl, fyh; + fxl = __low2float(x); + fxh = __high2float(x); + fyl = __low2float(y); + fyh = __high2float(y); + return __floats2bfloat162_rn(fxl + fyl, fxh + fyh); +#else + return __hadd2(x, y); +#endif +} + +inline __device__ __nv_bfloat16 bf16hadd(const __nv_bfloat16 x, const __nv_bfloat16 y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16( __bfloat162float(x) + __bfloat162float(y) ); +#else + return __hadd(x, y); +#endif +} + +inline __device__ __nv_bfloat162 bf16hsub2(const __nv_bfloat162 x, const __nv_bfloat162 y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fxl, fxh, fyl, fyh; + fxl = __low2float(x); + fxh = __high2float(x); + fyl = __low2float(y); + fyh = __high2float(y); + return __floats2bfloat162_rn(fxl - fyl, fxh - fyh); +#else + return __hsub2(x, y); +#endif +} + +inline __device__ __nv_bfloat16 bf16hsub(const __nv_bfloat16 x, const __nv_bfloat16 y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16( __bfloat162float(x) - __bfloat162float(y) ); +#else + return __hsub(x, y); +#endif +} + +inline __device__ __nv_bfloat162 bf16hmul2(const __nv_bfloat162 x, const __nv_bfloat162 y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fxl, fxh, fyl, fyh; + fxl = __low2float(x); + fxh = __high2float(x); + fyl = __low2float(y); + fyh = __high2float(y); + return __floats2bfloat162_rn(fxl * fyl, fxh * fyh); +#else + return __hmul2(x, y); +#endif +} + +inline __device__ __nv_bfloat16 bf16hmul(const __nv_bfloat16 x, const __nv_bfloat16 y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16( __bfloat162float(x) * __bfloat162float(y) ); +#else + return __hmul(x, y); +#endif +} + +inline __device__ __nv_bfloat162 bf16hfma2(const __nv_bfloat162 x, const __nv_bfloat162 y, const __nv_bfloat162 z) { +#if defined(__CUDA_ARCH__) && 
__CUDA_ARCH__ < 800 + float fxl, fxh, fyl, fyh, fzl, fzh; + fxl = __low2float(x); + fxh = __high2float(x); + fyl = __low2float(y); + fyh = __high2float(y); + fzl = __low2float(z); + fzh = __high2float(z); + return __floats2bfloat162_rn(fxl * fyl + fzl, fxh * fyh + fzh); +#else + return __hfma2(x, y, z); +#endif +} + +inline __device__ __nv_bfloat16 bf16hfma(const __nv_bfloat16 x, const __nv_bfloat16 y, const __nv_bfloat16 z) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16( __bfloat162float(x) * __bfloat162float(y) + __bfloat162float(z)); +#else + return __hfma(x, y, z); +#endif +} + +inline __device__ __nv_bfloat162 bf16exp2(const __nv_bfloat162 x) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fxl, fxh; + fxl = __low2float(x); + fxh = __high2float(x);; + return __floats2bfloat162_rn(expf(fxl), expf(fxh)); +#else + return h2exp(x); +#endif +} + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) +inline __device__ __nv_bfloat162 operator*(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hmul2(x, y); }; +inline __device__ __nv_bfloat162 operator+(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hadd2(x, y); }; + +inline __device__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y) +{ + __nv_bfloat162 t; t.x = x; t.y = y; return t; +} + +#endif + +inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c)); +#else + return a + b + c; +#endif +} + +inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c, __nv_bfloat16 d) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c) + __bfloat162float(d)); +#else + return (__nv_bfloat16)((float)a + (float)b + (float)c + (float)d); +#endif +} + +inline __device__ __nv_bfloat162 bf16hadd2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fal, fah, fbl, fbh, fcl, fch; + fal = __low2float(a); + fah = __high2float(a); + fbl = __low2float(b); + fbh = __high2float(b); + fcl = __low2float(c); + fch = __high2float(c); + return __floats2bfloat162_rn(fal + fbl + fcl, fah + fbh + fch); +#else + return a + b + c; +#endif +} + +inline __device__ __nv_bfloat16 bf16hmul(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b) * __bfloat162float(c)); +#else + return a * b * c; +#endif +} + +inline __device__ __nv_bfloat162 bf16hmul2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fal, fah, fbl, fbh, fcl, fch; + fal = __low2float(a); + fah = __high2float(a); + fbl = __low2float(b); + fbh = __high2float(b); + fcl = __low2float(c); + fch = __high2float(c); + return __floats2bfloat162_rn(fal * fbl * fcl, fah * fbh * fch); +#else + return a * b * c; +#endif +} + +inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c, __nv_bfloat162 d) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fal, fah, fbl, fbh, fcl, fch, fdl, fdh; + fal = __low2float(a); + fah = __high2float(a); + fbl = __low2float(b); + fbh = __high2float(b); + fcl = __low2float(c); + fch = __high2float(c); + fdl = __low2float(d); 
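// A minimal sketch of using the fallbacks in this header so one __nv_bfloat162
// expression compiles below sm_80, where the native packed intrinsics are unavailable;
// the helper name is a hypothetical placeholder.
#ifdef ENABLE_BF16
__device__ inline __nv_bfloat162 fused_axpy_bf162(const __nv_bfloat162 a,
                                                  const __nv_bfloat162 x,
                                                  const __nv_bfloat162 y) {
  // a * x + y: lowers to float math on pre-sm_80 targets, to __hfma2 otherwise.
  return fastertransformer::bf16hfma2(a, x, y);
}
#endif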
+ fdh = __high2float(d); + return __floats2bfloat162_rn(fal * fbl * fcl + fdl, fah * fbh * fch + fdh); +#else + return a * b * c + d; +#endif +} + +#endif // ENABLE_BF16 + +} // namespace fastertransformer diff --git a/src/tl_templates/cuda/cuda_bf16_wrapper.h b/src/tl_templates/cuda/cuda_bf16_wrapper.h new file mode 100644 index 000000000..efb6e7987 --- /dev/null +++ b/src/tl_templates/cuda/cuda_bf16_wrapper.h @@ -0,0 +1,23 @@ +// Downloaded from from FasterTransformer v5.2.1 +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/utils/cuda_bf16_wrapper.h +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef ENABLE_BF16 +#include +#endif diff --git a/src/tl_templates/cuda/debug.h b/src/tl_templates/cuda/debug.h index 07eabe691..cdba7aa0d 100644 --- a/src/tl_templates/cuda/debug.h +++ b/src/tl_templates/cuda/debug.h @@ -118,7 +118,7 @@ debug_print_buffer_value(const char *msg, const char *buf_name, threadIdx.z, buf_name, index, var); } -// Specialization for unsiged char type +// Specialization for unsigned char type template <> __device__ void debug_print_buffer_value(const char *msg, const char *buf_name, diff --git a/src/tl_templates/cuda/gemm.h b/src/tl_templates/cuda/gemm.h index 500b9717b..41a026290 100644 --- a/src/tl_templates/cuda/gemm.h +++ b/src/tl_templates/cuda/gemm.h @@ -1,5 +1,7 @@ #pragma once -#if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ >= 900)) +#if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ >= 1200)) +#include "gemm_sm120.h" +#elif (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ >= 900)) #include "gemm_sm90.h" #elif (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ >= 890)) #include "gemm_sm89.h" diff --git a/src/tl_templates/cuda/gemm_mma.h b/src/tl_templates/cuda/gemm_mma.h new file mode 100644 index 000000000..00f4bf09c --- /dev/null +++ b/src/tl_templates/cuda/gemm_mma.h @@ -0,0 +1,458 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "cuda_fp8.h" + +namespace cute { + +template +struct DispatchInstruction; + +using _X = Underscore; + +#if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ >= 800)) +#if __CUDA_ARCH_LIST__ >= 1200 +template +struct DispatchInstruction { + using MMA = MMA_Atom>; + using MMA_Group = Tile<_X, Int, _X>; +}; +template +struct DispatchInstruction { + using MMA = MMA_Atom>; + using MMA_Group = Tile<_X, Int, _X>; +}; +#elif __CUDA_ARCH_LIST__ >= 890 +template +struct DispatchInstruction { + using MMA = MMA_Atom; + using MMA_Group = Tile<_X, Int, _X>; +}; +template +struct DispatchInstruction { + using MMA = MMA_Atom; + using MMA_Group = Tile<_X, Int, _X>; +}; +#endif +template +struct DispatchInstruction { + using MMA = MMA_Atom; + using MMA_Group = Tile<_X, Int, _X>; +}; +template +struct DispatchInstruction { + using MMA = MMA_Atom; + using MMA_Group = Tile<_X, Int, _X>; +}; +template +struct 
DispatchInstruction { + using MMA = MMA_Atom; + using MMA_Group = Tile<_X, Int, _X>; +}; +template +struct DispatchInstruction { + using MMA = MMA_Atom; + using MMA_Group = Tile<_X, Int, _X>; +}; +template +struct DispatchInstruction { + using MMA = MMA_Atom; + using MMA_Group = Tile<_X, Int, _X>; +}; +template +struct DispatchInstruction { + using MMA = MMA_Atom; + using MMA_Group = Tile, Int, _X>; +}; +#elif (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ >= 750)) +template +struct DispatchInstruction { + using MMA = MMA_Atom; + using MMA_Group = Tile<_X, Int, _16>; +}; +#endif + +template struct SelectCopy { + static constexpr int remainder = (N / num_warp_n) % 16; + using type = std::conditional_t< + remainder == 4 || remainder == 8 || remainder == 0, + std::conditional_t< + transpose, + std::conditional_t< + remainder == 4, SM75_U32x1_LDSM_N, + std::conditional_t>, + std::conditional_t< + remainder == 4, SM75_U16x2_LDSM_T, + std::conditional_t>>, + DefaultCopy>; +}; + +template +struct OperandTraits { + // Primary template, use padded layout and default copy + static constexpr int stride = leading_dim; + static constexpr int padded = + stride % (256 / Bits) == 0 ? stride + 128 / Bits : stride; + using Layout = typename std::conditional< + K_inner, Layout, Int>, Shape, _1>>, + Layout, Int>, Shape<_1, Int>>>::type; + using Copy = DefaultCopy; +}; + +template +struct OperandTraits<16, N, K, true, num_warp_n, leading_dim, + typename std::enable_if::type> { + using LayoutAtom = decltype(composition( + Swizzle<2, 3, 3>{}, Layout, Stride<_32, _1>>{})); + using Layout = + decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); + using Copy = typename SelectCopy::type; +}; + +template +struct OperandTraits<16, N, K, true, num_warp_n, leading_dim, + typename std::enable_if::type> { + using LayoutAtom = decltype(composition( + Swizzle<3, 3, 3>{}, Layout, Stride<_64, _1>>{})); + using Layout = + decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); + using Copy = typename SelectCopy::type; +}; + +template +struct OperandTraits<16, N, K, false, num_warp_n, leading_dim, + typename std::enable_if::type> { + using LayoutAtom = decltype(composition( + Swizzle<2, 3, 3>{}, Layout, Stride<_1, _32>>{})); + using Layout = decltype(tile_to_shape( + LayoutAtom{}, Shape, Int>{}, Step<_2, _1>{})); + using Copy = typename SelectCopy::type; +}; + +template +struct OperandTraits<16, N, K, false, num_warp_n, leading_dim, + typename std::enable_if::type> { + using LayoutAtom = decltype(composition( + Swizzle<3, 3, 3>{}, Layout, Stride<_1, _64>>{})); + using Layout = decltype(tile_to_shape( + LayoutAtom{}, Shape, Int>{}, Step<_2, _1>{})); + using Copy = typename SelectCopy::type; +}; + +template +struct OperandTraits<32, N, K, true, num_warp_n, leading_dim, + typename std::enable_if::type> { + using LayoutAtom = decltype(composition( + Swizzle<3, 2, 3>{}, Layout, Stride<_32, _1>>{})); + using Layout = + decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); + using Copy = typename SelectCopy::type; +}; + +template +struct OperandTraits<32, N, K, true, num_warp_n, leading_dim, + typename std::enable_if::type> { + using LayoutAtom = decltype(composition( + Swizzle<2, 2, 3>{}, Layout, Stride<_16, _1>>{})); + using Layout = + decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); + using Copy = typename SelectCopy::type; +}; + +template +struct OperandTraits<32, N, K, false, num_warp_n, leading_dim, + typename std::enable_if::type> { + using LayoutAtom = decltype(composition( + Swizzle<3, 2, 3>{}, Layout, Stride<_1, 
_32>>{})); + using Layout = decltype(tile_to_shape( + LayoutAtom{}, Shape, Int>{}, Step<_2, _1>{})); + using Copy = UniversalCopy; +}; + +template +struct OperandTraits<32, N, K, false, num_warp_n, leading_dim, + typename std::enable_if::type> { + using LayoutAtom = decltype(composition( + Swizzle<2, 2, 3>{}, Layout, Stride<_1, _16>>{})); + using Layout = decltype(tile_to_shape( + LayoutAtom{}, Shape, Int>{}, Step<_2, _1>{})); + using Copy = UniversalCopy; +}; + +template +struct OperandTraits<8, N, K, true, num_warp_n, leading_dim, + typename std::enable_if::type> { + using LayoutAtom = decltype(composition( + Swizzle<2, 4, 3>{}, Layout, Stride<_64, _1>>{})); + using Layout = + decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); + using Copy = typename SelectCopy::type; +}; + +template +struct OperandTraits<8, N, K, true, num_warp_n, leading_dim, + typename std::enable_if::type> { + using LayoutAtom = decltype(composition( + Swizzle<3, 4, 3>{}, Layout, Stride<_128, _1>>{})); + using Layout = + decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); + using Copy = typename SelectCopy::type; +}; + +template +struct OperandTraits<64, N, K, true, num_warp_n, leading_dim, + typename std::enable_if::type> { + using LayoutAtom = decltype(composition( + Swizzle<2, 0, 4>{}, Layout, Stride<_16, _1>>{})); + using Layout = + decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); + using Copy = DefaultCopy; +}; + +template +struct OperandTraits<64, N, K, false, num_warp_n, leading_dim, + typename std::enable_if::type> { + using LayoutAtom = decltype(composition( + Swizzle<2, 2, 2>{}, Layout, Stride<_1, _16>>{})); + using Layout = decltype(tile_to_shape( + LayoutAtom{}, Shape, Int>{}, Step<_2, _1>{})); + using Copy = DefaultCopy; +}; + +template +class GemmTensorOp { +public: + using A_type = + typename std::conditional::value, + tfloat32_t, A_type_raw>::type; + using B_type = + typename std::conditional::value, + tfloat32_t, A_type_raw>::type; + using C_type = C_type_raw; + + using Instruction = + DispatchInstruction; + + using OperandATraits = OperandTraits::value, M, K, + !trans_A, num_warp_m, lda>; + using OperandBTraits = + OperandTraits::value, N, K, trans_B, num_warp_n, ldb>; + + using SmemLayoutA = typename OperandATraits::Layout; + using SmemLayoutB = typename OperandBTraits::Layout; + using SmemCopyA = Copy_Atom; + using SmemCopyB = Copy_Atom; + + using TileMma = TiledMMA, Int, _1>>, + typename Instruction::MMA_Group>; + + template + static CUTE_DEVICE auto remove_swizzle(Layout const &layout) { + return layout; + } + // In fp16, when layout is KxN and n_warp is 1 and N % 64 == 0 + // the original layout fail to compile, currently using this as a workaround + template + static CUTE_DEVICE auto + remove_swizzle(ComposedLayout const &layout) { + if constexpr (sizeof(A_type) == 2) + return layout.layout_b(); + else + return layout; + } + + template + static CUTE_DEVICE auto get_region_tensor(Tensor &sa) { + if constexpr (offset == 0) { + return composition( + sa, + Layout, Int>, + Stride<_1, typename std::conditional, + Int>::type>>{}); + } else { + if constexpr (trans) { + static_assert(offset % KK == 0, "Offset must be a multiple of K"); + constexpr int offset_n = offset / KK; + return flat_divide(sa, Shape, Int>{})(_, _, _0{}, + Int{}); + } else { + static_assert(offset % NN == 0, "Offset must be a multiple of N"); + constexpr int offset_n = offset / NN; + return flat_divide(sa, Shape, Int>{})(_, _, Int{}, + _0{}); + } + } + } + + static CUTE_DEVICE void body(A_type_raw *pA, B_type_raw *pB, 
C_type_raw *pC) { + const int tid = threadIdx.x; + Tensor sA_all = make_tensor(make_smem_ptr(reinterpret_cast(pA)), + SmemLayoutA{}); + Tensor sB_all = make_tensor(make_smem_ptr(reinterpret_cast(pB)), + SmemLayoutB{}); + Tensor sA = get_region_tensor(sA_all); + Tensor sB = get_region_tensor(sB_all); + TileMma tiled_mma; + auto thr_mma = tiled_mma.get_thread_slice(tid); + auto tiled_copy_A = make_tiled_copy_A(SmemCopyA{}, tiled_mma); + auto tiled_copy_B = make_tiled_copy_B(SmemCopyB{}, tiled_mma); + auto thr_copy_A = tiled_copy_A.get_thread_slice(tid); + auto thr_copy_B = tiled_copy_B.get_thread_slice(tid); + + Tensor tCrA = thr_mma.partition_fragment_A(sA); + Tensor tCrB = thr_mma.partition_fragment_B(sB); + Tensor tCsA = thr_copy_A.partition_S(sA); + Tensor tCsB = thr_copy_B.partition_S(sB); + + Tensor tCrA_copy_view = thr_copy_A.retile_D(tCrA); + Tensor tCrB_copy_view = thr_copy_B.retile_D(tCrB); + + Tensor acc = + make_tensor(make_rmem_ptr(reinterpret_cast(pC)), + partition_shape_C(tiled_mma, Shape, Int>{})); + + if constexpr (clear_accum) { + clear(acc); + } + // when layout is KxN and n_warp is 1, there seem to be a bug, use this as a + // workaround + auto tCrA_view = make_tensor(tCrA.data(), remove_swizzle(tCrA.layout())); + auto tCrB_view = make_tensor(tCrB.data(), remove_swizzle(tCrB.layout())); + CUTE_UNROLL + for (int k = 0; k < size<2>(tCrA); ++k) { + copy(tiled_copy_A, tCsA(_, _, k), tCrA_copy_view(_, _, k)); + copy(tiled_copy_B, tCsB(_, _, k), tCrB_copy_view(_, _, k)); + gemm(tiled_mma, tCrA_view(_, _, k), tCrB_view(_, _, k), acc); + } + } + + static CUTE_DEVICE void body_rs(A_type_raw *pA, B_type_raw *pB, + C_type_raw *pC) { + const int tid = threadIdx.x; + Tensor sB_all = make_tensor(make_smem_ptr(reinterpret_cast(pB)), + SmemLayoutB{}); + Tensor sB = get_region_tensor(sB_all); + TileMma tiled_mma; + auto thr_mma = tiled_mma.get_thread_slice(tid); + auto tiled_copy_B = make_tiled_copy_B(SmemCopyB{}, tiled_mma); + auto thr_copy_B = tiled_copy_B.get_thread_slice(tid); + + Tensor tCrB = thr_mma.partition_fragment_B(sB); + Tensor tCsB = thr_copy_B.partition_S(sB); + + Tensor tCrB_copy_view = thr_copy_B.retile_D(tCrB); + + Tensor acc = + make_tensor(make_rmem_ptr(reinterpret_cast(pC)), + partition_shape_C(tiled_mma, Shape, Int>{})); + Tensor tCrA = + make_tensor(make_rmem_ptr(reinterpret_cast(pA)), + partition_shape_A(tiled_mma, Shape, Int>{})); + if constexpr (clear_accum) { + clear(acc); + } + auto tCrB_view = make_tensor(tCrB.data(), remove_swizzle(tCrB.layout())); + copy(tiled_copy_B, tCsB(_, _, 0), tCrB_copy_view(_, _, 0)); + CUTE_UNROLL + for (int k = 0; k < size<2>(tCrA); ++k) { + if (k < size<2>(tCrA) - 1) { + copy(tiled_copy_B, tCsB(_, _, k + 1), tCrB_copy_view(_, _, k + 1)); + } + gemm(tiled_mma, tCrA(_, _, k), tCrB_view(_, _, k), acc); + } + } + + static CUTE_DEVICE void body_sr(A_type_raw *pA, B_type_raw *pB, + C_type_raw *pC) { + const int tid = threadIdx.x; + Tensor sA_all = make_tensor(make_smem_ptr(reinterpret_cast(pA)), + SmemLayoutA{}); + Tensor sA = get_region_tensor(sA_all); + TileMma tiled_mma; + auto thr_mma = tiled_mma.get_thread_slice(tid); + auto tiled_copy_A = make_tiled_copy_A(SmemCopyA{}, tiled_mma); + auto thr_copy_A = tiled_copy_A.get_thread_slice(tid); + + Tensor tCrA = thr_mma.partition_fragment_A(sA); + Tensor tCsA = thr_copy_A.partition_S(sA); + + Tensor tCrA_copy_view = thr_copy_A.retile_D(tCrA); + + Tensor acc = + make_tensor(make_rmem_ptr(reinterpret_cast(pC)), + partition_shape_C(tiled_mma, Shape, Int>{})); + Tensor tCrB = + 
make_tensor(make_rmem_ptr(reinterpret_cast(pB)), + partition_shape_B(tiled_mma, Shape, Int>{})); + if constexpr (clear_accum) { + clear(acc); + } + auto tCrA_view = make_tensor(tCrA.data(), remove_swizzle(tCrA.layout())); + copy(tiled_copy_A, tCsA(_, _, 0), tCrA_copy_view(_, _, 0)); + CUTE_UNROLL + for (int k = 0; k < size<2>(tCrA); ++k) { + if (k < size<2>(tCrA) - 1) { + copy(tiled_copy_A, tCsA(_, _, k + 1), tCrA_copy_view(_, _, k + 1)); + } + gemm(tiled_mma, tCrA_view(_, _, k), tCrB(_, _, k), acc); + } + } +}; + +} // namespace cute + +namespace tl { + +template +CUTLASS_DEVICE void gemm_ss(A_type *pA, B_type *pB, C_type *accum) { + using MMA = cute::GemmTensorOp; + MMA::body(pA, pB, accum); +} + +template +CUTLASS_DEVICE void gemm_rs(A_type *pA, B_type *pB, C_type *accum) { + using MMA = cute::GemmTensorOp; + MMA::body_rs(pA, pB, accum); +} + +template +CUTLASS_DEVICE void gemm_sr(A_type *pA, B_type *pB, C_type *accum) { + using MMA = cute::GemmTensorOp; + MMA::body_sr(pA, pB, accum); +} + +} // namespace tl diff --git a/src/tl_templates/cuda/gemm_sm120.h b/src/tl_templates/cuda/gemm_sm120.h new file mode 100644 index 000000000..1e7be8fc1 --- /dev/null +++ b/src/tl_templates/cuda/gemm_sm120.h @@ -0,0 +1,3 @@ +#pragma once + +#include "gemm_mma.h" diff --git a/src/tl_templates/cuda/gemm_sm80.h b/src/tl_templates/cuda/gemm_sm80.h index 826cb5ec8..1e7be8fc1 100644 --- a/src/tl_templates/cuda/gemm_sm80.h +++ b/src/tl_templates/cuda/gemm_sm80.h @@ -1,389 +1,3 @@ #pragma once -#include -#include -#include -#include - -#include "common.h" - -namespace cute { - -template -struct DispatchInstruction; - -using _X = Underscore; - -#if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ >= 800)) -template -struct DispatchInstruction { - using MMA = MMA_Atom; - using MMA_Group = Tile<_X, Int, _X>; -}; -template -struct DispatchInstruction { - using MMA = MMA_Atom; - using MMA_Group = Tile<_X, Int, _X>; -}; -template -struct DispatchInstruction { - using MMA = MMA_Atom; - using MMA_Group = Tile<_X, Int, _X>; -}; -template -struct DispatchInstruction { - using MMA = MMA_Atom; - using MMA_Group = Tile<_X, Int, _X>; -}; -template -struct DispatchInstruction { - using MMA = MMA_Atom; - using MMA_Group = Tile<_X, Int, _X>; -}; -template -struct DispatchInstruction { - using MMA = MMA_Atom; - using MMA_Group = Tile, Int, _X>; -}; -#elif (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ >= 750)) -template -struct DispatchInstruction { - using MMA = MMA_Atom; - using MMA_Group = Tile<_X, Int, _16>; -}; -#endif - -template struct SelectCopy { - static constexpr int remainder = (N / num_warp_n) % 16; - using type = std::conditional_t< - remainder == 4 || remainder == 8 || remainder == 0, - std::conditional_t< - transpose, - std::conditional_t< - remainder == 4, SM75_U32x1_LDSM_N, - std::conditional_t>, - std::conditional_t< - remainder == 4, SM75_U16x2_LDSM_T, - std::conditional_t>>, - DefaultCopy>; -}; - -template -struct OperandTraits { - // Primary template, use padded layout and default copy - static constexpr int stride = K_inner ? K : N; - static constexpr int padded = - stride % (256 / Bits) == 0 ? 
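// A worked example of the padding rule in OperandTraits' primary template, under the
// assumption of Bits = 16 (fp16) and a 64-element leading dimension: 256 / 16 == 16 and
// 64 % 16 == 0, so the row is padded to 64 + 128 / 16 = 72 elements (16 extra bytes),
// staggering consecutive rows across shared-memory banks when no swizzled
// specialization applies.
static_assert((64 % (256 / 16) == 0 ? 64 + 128 / 16 : 64) == 72,
              "a 64-element fp16 row is padded to 72 elements");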
stride + 128 / Bits : stride; - using Layout = typename std::conditional< - K_inner, Layout, Int>, Shape, _1>>, - Layout, Int>, Shape<_1, Int>>>::type; - using Copy = DefaultCopy; -}; - -template -struct OperandTraits<16, N, K, true, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<2, 3, 3>{}, Layout, Stride<_32, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); - using Copy = typename SelectCopy::type; -}; - -template -struct OperandTraits<16, N, K, true, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<3, 3, 3>{}, Layout, Stride<_64, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); - using Copy = typename SelectCopy::type; -}; - -template -struct OperandTraits<16, N, K, false, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<2, 3, 3>{}, Layout, Stride<_1, _32>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{}, - Step<_2, _1>{})); - using Copy = typename SelectCopy::type; -}; - -template -struct OperandTraits<16, N, K, false, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<3, 3, 3>{}, Layout, Stride<_1, _64>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{}, - Step<_2, _1>{})); - using Copy = typename SelectCopy::type; -}; - -template -struct OperandTraits<32, N, K, true, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<3, 2, 3>{}, Layout, Stride<_32, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); - using Copy = typename SelectCopy::type; -}; - -template -struct OperandTraits<32, N, K, true, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<2, 2, 3>{}, Layout, Stride<_16, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); - using Copy = typename SelectCopy::type; -}; - -template -struct OperandTraits<32, N, K, false, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<3, 2, 3>{}, Layout, Stride<_1, _32>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{}, - Step<_2, _1>{})); - using Copy = UniversalCopy; -}; - -template -struct OperandTraits<32, N, K, false, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<2, 2, 3>{}, Layout, Stride<_1, _16>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{}, - Step<_2, _1>{})); - using Copy = UniversalCopy; -}; - -template -struct OperandTraits<8, N, K, true, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<2, 4, 3>{}, Layout, Stride<_64, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); - using Copy = typename SelectCopy::type; -}; - -template -struct OperandTraits<8, N, K, true, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<3, 4, 3>{}, Layout, Stride<_128, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); - using Copy = typename SelectCopy::type; -}; - -template -struct OperandTraits<64, N, K, true, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<2, 0, 4>{}, Layout, Stride<_16, _1>>{})); - using Layout = 
decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); - using Copy = DefaultCopy; -}; - -template -struct OperandTraits<64, N, K, false, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<2, 2, 2>{}, Layout, Stride<_1, _16>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{}, - Step<_2, _1>{})); - using Copy = DefaultCopy; -}; - -template -class GemmTensorOp { -public: - using A_type = - typename std::conditional::value, - tfloat32_t, A_type_raw>::type; - using B_type = - typename std::conditional::value, - tfloat32_t, A_type_raw>::type; - using C_type = C_type_raw; - - using Instruction = - DispatchInstruction; - - using OperandATraits = - OperandTraits::value, M, K, !trans_A, num_warp_m>; - using OperandBTraits = - OperandTraits::value, N, K, trans_B, num_warp_n>; - - using SmemLayoutA = typename OperandATraits::Layout; - using SmemLayoutB = typename OperandBTraits::Layout; - using SmemCopyA = Copy_Atom; - using SmemCopyB = Copy_Atom; - - using TileMma = TiledMMA, Int, _1>>, - typename Instruction::MMA_Group>; - - template - static CUTE_DEVICE auto remove_swizzle(Layout const &layout) { - return layout; - } - // In fp16, when layout is KxN and n_warp is 1 and N % 64 == 0 - // the original layout fail to compile, currently using this as a workaround - template - static CUTE_DEVICE auto - remove_swizzle(ComposedLayout const &layout) { - if constexpr (sizeof(A_type) == 2) - return layout.layout_b(); - else - return layout; - } - - static CUTE_DEVICE void body(A_type_raw *pA, B_type_raw *pB, C_type_raw *pC) { - const int tid = threadIdx.x; - Tensor sA = make_tensor(make_smem_ptr(reinterpret_cast(pA)), - SmemLayoutA{}); - Tensor sB = make_tensor(make_smem_ptr(reinterpret_cast(pB)), - SmemLayoutB{}); - TileMma tiled_mma; - auto thr_mma = tiled_mma.get_thread_slice(tid); - auto tiled_copy_A = make_tiled_copy_A(SmemCopyA{}, tiled_mma); - auto tiled_copy_B = make_tiled_copy_B(SmemCopyB{}, tiled_mma); - auto thr_copy_A = tiled_copy_A.get_thread_slice(tid); - auto thr_copy_B = tiled_copy_B.get_thread_slice(tid); - - Tensor tCrA = thr_mma.partition_fragment_A(sA); - Tensor tCrB = thr_mma.partition_fragment_B(sB); - Tensor tCsA = thr_copy_A.partition_S(sA); - Tensor tCsB = thr_copy_B.partition_S(sB); - - Tensor tCrA_copy_view = thr_copy_A.retile_D(tCrA); - Tensor tCrB_copy_view = thr_copy_B.retile_D(tCrB); - - Tensor acc = - make_tensor(make_rmem_ptr(reinterpret_cast(pC)), - partition_shape_C(tiled_mma, Shape, Int>{})); - - if constexpr (clear_accum) { - clear(acc); - } - // when layout is KxN and n_warp is 1, there seem to be a bug, use this as a - // workaround - auto tCrA_view = make_tensor(tCrA.data(), remove_swizzle(tCrA.layout())); - auto tCrB_view = make_tensor(tCrB.data(), remove_swizzle(tCrB.layout())); - CUTE_UNROLL - for (int k = 0; k < size<2>(tCrA); ++k) { - copy(tiled_copy_A, tCsA(_, _, k), tCrA_copy_view(_, _, k)); - copy(tiled_copy_B, tCsB(_, _, k), tCrB_copy_view(_, _, k)); - gemm(tiled_mma, tCrA_view(_, _, k), tCrB_view(_, _, k), acc); - } - } - - static CUTE_DEVICE void body_rs(A_type_raw *pA, B_type_raw *pB, - C_type_raw *pC) { - const int tid = threadIdx.x; - Tensor sB = make_tensor(make_smem_ptr(reinterpret_cast(pB)), - SmemLayoutB{}); - TileMma tiled_mma; - auto thr_mma = tiled_mma.get_thread_slice(tid); - auto tiled_copy_B = make_tiled_copy_B(SmemCopyB{}, tiled_mma); - auto thr_copy_B = tiled_copy_B.get_thread_slice(tid); - - Tensor tCrB = thr_mma.partition_fragment_B(sB); - Tensor tCsB = 
thr_copy_B.partition_S(sB); - - Tensor tCrB_copy_view = thr_copy_B.retile_D(tCrB); - - Tensor acc = - make_tensor(make_rmem_ptr(reinterpret_cast(pC)), - partition_shape_C(tiled_mma, Shape, Int>{})); - Tensor tCrA = - make_tensor(make_rmem_ptr(reinterpret_cast(pA)), - partition_shape_A(tiled_mma, Shape, Int>{})); - if constexpr (clear_accum) { - clear(acc); - } - auto tCrB_view = make_tensor(tCrB.data(), remove_swizzle(tCrB.layout())); - copy(tiled_copy_B, tCsB(_, _, 0), tCrB_copy_view(_, _, 0)); - CUTE_UNROLL - for (int k = 0; k < size<2>(tCrA); ++k) { - if (k < size<2>(tCrA) - 1) { - copy(tiled_copy_B, tCsB(_, _, k + 1), tCrB_copy_view(_, _, k + 1)); - } - gemm(tiled_mma, tCrA(_, _, k), tCrB_view(_, _, k), acc); - } - } - - static CUTE_DEVICE void body_sr(A_type_raw *pA, B_type_raw *pB, - C_type_raw *pC) { - const int tid = threadIdx.x; - Tensor sA = make_tensor(make_smem_ptr(reinterpret_cast(pA)), - SmemLayoutA{}); - TileMma tiled_mma; - auto thr_mma = tiled_mma.get_thread_slice(tid); - auto tiled_copy_A = make_tiled_copy_A(SmemCopyA{}, tiled_mma); - auto thr_copy_A = tiled_copy_A.get_thread_slice(tid); - - Tensor tCrA = thr_mma.partition_fragment_A(sA); - Tensor tCsA = thr_copy_A.partition_S(sA); - - Tensor tCrA_copy_view = thr_copy_A.retile_D(tCrA); - - Tensor acc = - make_tensor(make_rmem_ptr(reinterpret_cast(pC)), - partition_shape_C(tiled_mma, Shape, Int>{})); - Tensor tCrB = - make_tensor(make_rmem_ptr(reinterpret_cast(pB)), - partition_shape_B(tiled_mma, Shape, Int>{})); - if constexpr (clear_accum) { - clear(acc); - } - auto tCrA_view = make_tensor(tCrA.data(), remove_swizzle(tCrA.layout())); - copy(tiled_copy_A, tCsA(_, _, 0), tCrA_copy_view(_, _, 0)); - CUTE_UNROLL - for (int k = 0; k < size<2>(tCrA); ++k) { - if (k < size<2>(tCrA) - 1) { - copy(tiled_copy_A, tCsA(_, _, k + 1), tCrA_copy_view(_, _, k + 1)); - } - gemm(tiled_mma, tCrA_view(_, _, k), tCrB(_, _, k), acc); - } - } -}; - -} // namespace cute - -namespace tl { - -template -CUTLASS_DEVICE void gemm_ss(A_type *pA, B_type *pB, C_type *accum) { - using MMA = cute::GemmTensorOp; - MMA::body(pA, pB, accum); -} - -template -CUTLASS_DEVICE void gemm_rs(A_type *pA, B_type *pB, C_type *accum) { - using MMA = cute::GemmTensorOp; - MMA::body_rs(pA, pB, accum); -} - -template -CUTLASS_DEVICE void gemm_sr(A_type *pA, B_type *pB, C_type *accum) { - using MMA = cute::GemmTensorOp; - MMA::body_sr(pA, pB, accum); -} - -} // namespace tl +#include "gemm_mma.h" diff --git a/src/tl_templates/cuda/gemm_sm89.h b/src/tl_templates/cuda/gemm_sm89.h index 37504e59e..f02ef3e60 100644 --- a/src/tl_templates/cuda/gemm_sm89.h +++ b/src/tl_templates/cuda/gemm_sm89.h @@ -1,409 +1,7 @@ #pragma once -#include -#include #include -#include -#include -#include - -#include "common.h" #include "cuda_fp8.h" -namespace cute { - -template -struct DispatchInstruction; - -using _X = Underscore; - -#if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ >= 890)) - -template -struct DispatchInstruction { - using MMA = MMA_Atom; - using MMA_Group = Tile<_X, Int, _X>; -}; -template -struct DispatchInstruction { - using MMA = MMA_Atom; - using MMA_Group = Tile<_X, Int, _X>; -}; - -template -struct DispatchInstruction { - using MMA = MMA_Atom; - using MMA_Group = Tile<_X, Int, _X>; -}; -template -struct DispatchInstruction { - using MMA = MMA_Atom; - using MMA_Group = Tile<_X, Int, _X>; -}; -template -struct DispatchInstruction { - using MMA = MMA_Atom; - using MMA_Group = Tile<_X, Int, _X>; -}; -template -struct DispatchInstruction { - using MMA = MMA_Atom; - using 
MMA_Group = Tile<_X, Int, _X>; -}; -template -struct DispatchInstruction { - using MMA = MMA_Atom; - using MMA_Group = Tile<_X, Int, _X>; -}; -template -struct DispatchInstruction { - using MMA = MMA_Atom; - using MMA_Group = Tile, Int, _X>; -}; -#elif (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ >= 750)) -template -struct DispatchInstruction { - using MMA = MMA_Atom; - using MMA_Group = Tile<_X, Int, _16>; -}; -#endif - -template struct SelectCopy { - static constexpr int remainder = (N / num_warp_n) % 16; - using type = std::conditional_t< - remainder == 4 || remainder == 8 || remainder == 0, - std::conditional_t< - transpose, - std::conditional_t< - remainder == 4, SM75_U32x1_LDSM_N, - std::conditional_t>, - std::conditional_t< - remainder == 4, SM75_U16x2_LDSM_T, - std::conditional_t>>, - DefaultCopy>; -}; - -template -struct OperandTraits { - // Primary template, use padded layout and default copy - static constexpr int stride = K_inner ? K : N; - static constexpr int padded = - stride % (256 / Bits) == 0 ? stride + 128 / Bits : stride; - using Layout = typename std::conditional< - K_inner, Layout, Int>, Shape, _1>>, - Layout, Int>, Shape<_1, Int>>>::type; - using Copy = DefaultCopy; -}; - -template -struct OperandTraits<16, N, K, true, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<2, 3, 3>{}, Layout, Stride<_32, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); - using Copy = typename SelectCopy::type; -}; - -template -struct OperandTraits<16, N, K, true, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<3, 3, 3>{}, Layout, Stride<_64, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); - using Copy = typename SelectCopy::type; -}; - -template -struct OperandTraits<16, N, K, false, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<2, 3, 3>{}, Layout, Stride<_1, _32>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{}, - Step<_2, _1>{})); - using Copy = typename SelectCopy::type; -}; - -template -struct OperandTraits<16, N, K, false, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<3, 3, 3>{}, Layout, Stride<_1, _64>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{}, - Step<_2, _1>{})); - using Copy = typename SelectCopy::type; -}; - -template -struct OperandTraits<32, N, K, true, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<3, 2, 3>{}, Layout, Stride<_32, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); - using Copy = typename SelectCopy::type; -}; - -template -struct OperandTraits<32, N, K, true, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<2, 2, 3>{}, Layout, Stride<_16, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); - using Copy = typename SelectCopy::type; -}; - -template -struct OperandTraits<32, N, K, false, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<3, 2, 3>{}, Layout, Stride<_1, _32>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{}, - Step<_2, _1>{})); - using Copy = UniversalCopy; -}; - -template -struct OperandTraits<32, N, K, false, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = 
decltype(composition( - Swizzle<2, 2, 3>{}, Layout, Stride<_1, _16>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{}, - Step<_2, _1>{})); - using Copy = UniversalCopy; -}; - -template -struct OperandTraits<8, N, K, true, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<2, 4, 3>{}, Layout, Stride<_64, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); - using Copy = typename SelectCopy::type; -}; - -template -struct OperandTraits<8, N, K, true, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<3, 4, 3>{}, Layout, Stride<_128, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); - using Copy = typename SelectCopy::type; -}; - -template -struct OperandTraits<64, N, K, true, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<2, 0, 4>{}, Layout, Stride<_16, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); - using Copy = DefaultCopy; -}; - -template -struct OperandTraits<64, N, K, false, num_warp_n, - typename std::enable_if::type> { - using LayoutAtom = decltype(composition( - Swizzle<2, 2, 2>{}, Layout, Stride<_1, _16>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{}, - Step<_2, _1>{})); - using Copy = DefaultCopy; -}; - -template -class GemmTensorOp { -public: - using A_type = - typename std::conditional::value, - tfloat32_t, A_type_raw>::type; - using B_type = - typename std::conditional::value, - tfloat32_t, A_type_raw>::type; - using C_type = C_type_raw; - - using Instruction = - DispatchInstruction; - - using OperandATraits = - OperandTraits::value, M, K, !trans_A, num_warp_m>; - using OperandBTraits = - OperandTraits::value, N, K, trans_B, num_warp_n>; - - using SmemLayoutA = typename OperandATraits::Layout; - using SmemLayoutB = typename OperandBTraits::Layout; - using SmemCopyA = Copy_Atom; - using SmemCopyB = Copy_Atom; - - using TileMma = TiledMMA, Int, _1>>, - typename Instruction::MMA_Group>; - - template - static CUTE_DEVICE auto remove_swizzle(Layout const &layout) { - return layout; - } - // In fp16, when layout is KxN and n_warp is 1 and N % 64 == 0 - // the original layout fail to compile, currently using this as a workaround - template - static CUTE_DEVICE auto - remove_swizzle(ComposedLayout const &layout) { - if constexpr (sizeof(A_type) == 2) - return layout.layout_b(); - else - return layout; - } - - static CUTE_DEVICE void body(A_type_raw *pA, B_type_raw *pB, C_type_raw *pC) { - const int tid = threadIdx.x; - Tensor sA = make_tensor(make_smem_ptr(reinterpret_cast(pA)), - SmemLayoutA{}); - Tensor sB = make_tensor(make_smem_ptr(reinterpret_cast(pB)), - SmemLayoutB{}); - TileMma tiled_mma; - auto thr_mma = tiled_mma.get_thread_slice(tid); - auto tiled_copy_A = make_tiled_copy_A(SmemCopyA{}, tiled_mma); - auto tiled_copy_B = make_tiled_copy_B(SmemCopyB{}, tiled_mma); - auto thr_copy_A = tiled_copy_A.get_thread_slice(tid); - auto thr_copy_B = tiled_copy_B.get_thread_slice(tid); - - Tensor tCrA = thr_mma.partition_fragment_A(sA); - Tensor tCrB = thr_mma.partition_fragment_B(sB); - Tensor tCsA = thr_copy_A.partition_S(sA); - Tensor tCsB = thr_copy_B.partition_S(sB); - - Tensor tCrA_copy_view = thr_copy_A.retile_D(tCrA); - Tensor tCrB_copy_view = thr_copy_B.retile_D(tCrB); - - Tensor acc = - make_tensor(make_rmem_ptr(reinterpret_cast(pC)), - partition_shape_C(tiled_mma, Shape, Int>{})); - - if 
constexpr (clear_accum) { - clear(acc); - } - // when layout is KxN and n_warp is 1, there seem to be a bug, use this as a - // workaround - auto tCrA_view = make_tensor(tCrA.data(), remove_swizzle(tCrA.layout())); - auto tCrB_view = make_tensor(tCrB.data(), remove_swizzle(tCrB.layout())); - - CUTE_UNROLL - for (int k = 0; k < size<2>(tCrA); ++k) { - copy(tiled_copy_A, tCsA(_, _, k), tCrA_copy_view(_, _, k)); - copy(tiled_copy_B, tCsB(_, _, k), tCrB_copy_view(_, _, k)); - gemm(tiled_mma, tCrA_view(_, _, k), tCrB_view(_, _, k), acc); - } - } - - static CUTE_DEVICE void body_rs(A_type_raw *pA, B_type_raw *pB, - C_type_raw *pC) { - const int tid = threadIdx.x; - Tensor sB = make_tensor(make_smem_ptr(reinterpret_cast(pB)), - SmemLayoutB{}); - TileMma tiled_mma; - auto thr_mma = tiled_mma.get_thread_slice(tid); - auto tiled_copy_B = make_tiled_copy_B(SmemCopyB{}, tiled_mma); - auto thr_copy_B = tiled_copy_B.get_thread_slice(tid); - - Tensor tCrB = thr_mma.partition_fragment_B(sB); - Tensor tCsB = thr_copy_B.partition_S(sB); - - Tensor tCrB_copy_view = thr_copy_B.retile_D(tCrB); - - Tensor acc = - make_tensor(make_rmem_ptr(reinterpret_cast(pC)), - partition_shape_C(tiled_mma, Shape, Int>{})); - Tensor tCrA = - make_tensor(make_rmem_ptr(reinterpret_cast(pA)), - partition_shape_A(tiled_mma, Shape, Int>{})); - - if constexpr (clear_accum) { - clear(acc); - } - auto tCrB_view = make_tensor(tCrB.data(), remove_swizzle(tCrB.layout())); - copy(tiled_copy_B, tCsB(_, _, 0), tCrB_copy_view(_, _, 0)); - CUTE_UNROLL - for (int k = 0; k < size<2>(tCrA); ++k) { - if (k < size<2>(tCrA) - 1) { - copy(tiled_copy_B, tCsB(_, _, k + 1), tCrB_copy_view(_, _, k + 1)); - } - gemm(tiled_mma, tCrA(_, _, k), tCrB_view(_, _, k), acc); - } - } - - static CUTE_DEVICE void body_sr(A_type_raw *pA, B_type_raw *pB, - C_type_raw *pC) { - const int tid = threadIdx.x; - Tensor sA = make_tensor(make_smem_ptr(reinterpret_cast(pA)), - SmemLayoutA{}); - TileMma tiled_mma; - auto thr_mma = tiled_mma.get_thread_slice(tid); - auto tiled_copy_A = make_tiled_copy_A(SmemCopyA{}, tiled_mma); - auto thr_copy_A = tiled_copy_A.get_thread_slice(tid); - - Tensor tCrA = thr_mma.partition_fragment_A(sA); - Tensor tCsA = thr_copy_A.partition_S(sA); - - Tensor tCrA_copy_view = thr_copy_A.retile_D(tCrA); - - Tensor acc = - make_tensor(make_rmem_ptr(reinterpret_cast(pC)), - partition_shape_C(tiled_mma, Shape, Int>{})); - Tensor tCrB = - make_tensor(make_rmem_ptr(reinterpret_cast(pB)), - partition_shape_B(tiled_mma, Shape, Int>{})); - if constexpr (clear_accum) { - clear(acc); - } - auto tCrA_view = make_tensor(tCrA.data(), remove_swizzle(tCrA.layout())); - copy(tiled_copy_A, tCsA(_, _, 0), tCrA_copy_view(_, _, 0)); - CUTE_UNROLL - for (int k = 0; k < size<2>(tCrA); ++k) { - if (k < size<2>(tCrA) - 1) { - copy(tiled_copy_A, tCsA(_, _, k + 1), tCrA_copy_view(_, _, k + 1)); - } - gemm(tiled_mma, tCrA_view(_, _, k), tCrB(_, _, k), acc); - } - } -}; - -} // namespace cute - -namespace tl { - -template -CUTLASS_DEVICE void gemm_ss(A_type *pA, B_type *pB, C_type *accum) { - using MMA = cute::GemmTensorOp; - MMA::body(pA, pB, accum); -} - -template -CUTLASS_DEVICE void gemm_rs(A_type *pA, B_type *pB, C_type *accum) { - using MMA = cute::GemmTensorOp; - MMA::body_rs(pA, pB, accum); -} - -template -CUTLASS_DEVICE void gemm_sr(A_type *pA, B_type *pB, C_type *accum) { - using MMA = cute::GemmTensorOp; - MMA::body_sr(pA, pB, accum); -} - -} // namespace tl +#include "gemm_mma.h" diff --git a/src/tl_templates/cuda/gemm_sm90.h b/src/tl_templates/cuda/gemm_sm90.h 
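// A worked example of the SelectCopy remainder rule shared by these headers, using
// assumed tile shapes: N = 64 across num_warp_n = 2 warps gives (64 / 2) % 16 == 0 and
// the widest (x4) LDSM variant, a per-warp width of 20 gives remainder 4 and the x1
// variant, and any remainder outside {0, 4, 8} falls back to DefaultCopy.
static_assert((64 / 2) % 16 == 0 && (20 / 1) % 16 == 4,
              "remainders assumed in the SelectCopy example above");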
index 0555ab916..22613d8fe 100644 --- a/src/tl_templates/cuda/gemm_sm90.h +++ b/src/tl_templates/cuda/gemm_sm90.h @@ -194,16 +194,16 @@ struct DispatchInstruction { }; #endif -template struct OperandTraits { // Primary template, use padded layout and default copy - static constexpr int stride = K_inner ? K : N; + static constexpr int stride = leading_dim; static constexpr int padded = stride % (256 / Bits) == 0 ? stride + 128 / Bits : stride; using Layout = typename std::conditional< - K_inner, Layout, Int>, Shape, _1>>, - Layout, Int>, Shape<_1, Int>>>::type; + K_inner, Layout, Int>, Shape, _1>>, + Layout, Int>, Shape<_1, Int>>>::type; using Copy = DefaultCopy; }; @@ -224,124 +224,132 @@ template struct SelectCopy { DefaultCopy>; }; -template -struct OperandTraits<16, N, K, true, num_warp_n, - typename std::enable_if::type> { +template +struct OperandTraits<16, N, K, true, num_warp_n, leading_dim, + typename std::enable_if::type> { using LayoutAtom = decltype(composition( Swizzle<2, 3, 3>{}, Layout, Stride<_32, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); + using Layout = + decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); using Copy = typename SelectCopy::type; }; -template -struct OperandTraits<16, N, K, true, num_warp_n, - typename std::enable_if::type> { +template +struct OperandTraits<16, N, K, true, num_warp_n, leading_dim, + typename std::enable_if::type> { using LayoutAtom = decltype(composition( Swizzle<3, 3, 3>{}, Layout, Stride<_64, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); + using Layout = + decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); using Copy = typename SelectCopy::type; }; -template -struct OperandTraits<16, N, K, false, num_warp_n, - typename std::enable_if::type> { +template +struct OperandTraits<16, N, K, false, num_warp_n, leading_dim, + typename std::enable_if::type> { using LayoutAtom = decltype(composition( Swizzle<2, 3, 3>{}, Layout, Stride<_1, _32>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{}, - Step<_2, _1>{})); + using Layout = decltype(tile_to_shape( + LayoutAtom{}, Shape, Int>{}, Step<_2, _1>{})); using Copy = typename SelectCopy::type; }; -template -struct OperandTraits<16, N, K, false, num_warp_n, - typename std::enable_if::type> { +template +struct OperandTraits<16, N, K, false, num_warp_n, leading_dim, + typename std::enable_if::type> { using LayoutAtom = decltype(composition( Swizzle<3, 3, 3>{}, Layout, Stride<_1, _64>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{}, - Step<_2, _1>{})); + using Layout = decltype(tile_to_shape( + LayoutAtom{}, Shape, Int>{}, Step<_2, _1>{})); using Copy = typename SelectCopy::type; }; -template -struct OperandTraits<32, N, K, true, num_warp_n, - typename std::enable_if::type> { +template +struct OperandTraits<32, N, K, true, num_warp_n, leading_dim, + typename std::enable_if::type> { using LayoutAtom = decltype(composition( Swizzle<3, 2, 3>{}, Layout, Stride<_32, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); + using Layout = + decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); using Copy = typename SelectCopy::type; }; -template -struct OperandTraits<32, N, K, true, num_warp_n, - typename std::enable_if::type> { +template +struct OperandTraits<32, N, K, true, num_warp_n, leading_dim, + typename std::enable_if::type> { using LayoutAtom = decltype(composition( Swizzle<2, 2, 3>{}, Layout, Stride<_16, _1>>{})); - using Layout = 
decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); + using Layout = + decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); using Copy = typename SelectCopy::type; }; -template -struct OperandTraits<32, N, K, false, num_warp_n, - typename std::enable_if::type> { +template +struct OperandTraits<32, N, K, false, num_warp_n, leading_dim, + typename std::enable_if::type> { using LayoutAtom = decltype(composition( Swizzle<3, 2, 3>{}, Layout, Stride<_1, _32>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{}, - Step<_2, _1>{})); + using Layout = decltype(tile_to_shape( + LayoutAtom{}, Shape, Int>{}, Step<_2, _1>{})); using Copy = UniversalCopy; }; -template -struct OperandTraits<32, N, K, false, num_warp_n, - typename std::enable_if::type> { +template +struct OperandTraits<32, N, K, false, num_warp_n, leading_dim, + typename std::enable_if::type> { using LayoutAtom = decltype(composition( Swizzle<2, 2, 3>{}, Layout, Stride<_1, _16>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{}, - Step<_2, _1>{})); + using Layout = decltype(tile_to_shape( + LayoutAtom{}, Shape, Int>{}, Step<_2, _1>{})); using Copy = UniversalCopy; }; -template -struct OperandTraits<8, N, K, true, num_warp_n, - typename std::enable_if::type> { +template +struct OperandTraits<8, N, K, true, num_warp_n, leading_dim, + typename std::enable_if::type> { using LayoutAtom = decltype(composition( Swizzle<2, 4, 3>{}, Layout, Stride<_64, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); + using Layout = + decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); using Copy = typename std::conditional::type; }; -template -struct OperandTraits<8, N, K, true, num_warp_n, - typename std::enable_if::type> { +template +struct OperandTraits<8, N, K, true, num_warp_n, leading_dim, + typename std::enable_if::type> { using LayoutAtom = decltype(composition( Swizzle<3, 4, 3>{}, Layout, Stride<_128, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); + using Layout = + decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); using Copy = typename std::conditional::type; }; -template -struct OperandTraits<64, N, K, true, num_warp_n, - typename std::enable_if::type> { +template +struct OperandTraits<64, N, K, true, num_warp_n, leading_dim, + typename std::enable_if::type> { using LayoutAtom = decltype(composition( Swizzle<2, 0, 4>{}, Layout, Stride<_16, _1>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); + using Layout = + decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{})); using Copy = DefaultCopy; }; -template -struct OperandTraits<64, N, K, false, num_warp_n, - typename std::enable_if::type> { +template +struct OperandTraits<64, N, K, false, num_warp_n, leading_dim, + typename std::enable_if::type> { using LayoutAtom = decltype(composition( Swizzle<2, 2, 2>{}, Layout, Stride<_1, _16>>{})); - using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape, Int>{}, - Step<_2, _1>{})); + using Layout = decltype(tile_to_shape( + LayoutAtom{}, Shape, Int>{}, Step<_2, _1>{})); using Copy = DefaultCopy; }; template + bool trans_B, bool clear_accum, int lda, int ldb, int offset_a, + int offset_b, typename A_type_raw, typename B_type_raw, + typename C_type_raw> class GemmTensorOp { public: using A_type = @@ -355,10 +363,11 @@ class GemmTensorOp { using Instruction = DispatchInstruction; - using OperandATraits = - OperandTraits::value, M, K, !trans_A, num_warp_m>; + using OperandATraits = OperandTraits::value, M, K, + !trans_A, 
num_warp_m, lda>; using OperandBTraits = - OperandTraits::value, N, K, trans_B, num_warp_n>; + OperandTraits::value, N, K, trans_B, num_warp_n, ldb>; + using SmemLayoutA = typename OperandATraits::Layout; using SmemLayoutB = typename OperandBTraits::Layout; using SmemCopyA = Copy_Atom; @@ -383,12 +392,38 @@ class GemmTensorOp { return layout; } + template + static CUTE_DEVICE auto get_region_tensor(Tensor &sa) { + if constexpr (offset == 0) { + return composition( + sa, + Layout, Int>, + Stride<_1, typename std::conditional, + Int>::type>>{}); + } else { + if constexpr (trans) { + static_assert(offset % KK == 0, "Offset must be a multiple of K"); + constexpr int offset_n = offset / KK; + return flat_divide(sa, Shape, Int>{})(_, _, _0{}, + Int{}); + } else { + static_assert(offset % NN == 0, "Offset must be a multiple of N"); + constexpr int offset_n = offset / NN; + return flat_divide(sa, Shape, Int>{})(_, _, Int{}, + _0{}); + } + } + } + static CUTE_DEVICE void body(A_type_raw *pA, B_type_raw *pB, C_type_raw *pC) { const int tid = threadIdx.x; - Tensor sA = make_tensor(make_smem_ptr(reinterpret_cast(pA)), - SmemLayoutA{}); - Tensor sB = make_tensor(make_smem_ptr(reinterpret_cast(pB)), - SmemLayoutB{}); + Tensor sA_all = make_tensor(make_smem_ptr(reinterpret_cast(pA)), + SmemLayoutA{}); + Tensor sB_all = make_tensor(make_smem_ptr(reinterpret_cast(pB)), + SmemLayoutB{}); + Tensor sA = get_region_tensor(sA_all); + Tensor sB = get_region_tensor(sB_all); TileMma tiled_mma; auto thr_mma = tiled_mma.get_thread_slice(tid); auto tiled_copy_A = make_tiled_copy_A(SmemCopyA{}, tiled_mma); @@ -426,8 +461,9 @@ class GemmTensorOp { static CUTE_DEVICE void body_rs(A_type_raw *pA, B_type_raw *pB, C_type_raw *pC) { const int tid = threadIdx.x; - Tensor sB = make_tensor(make_smem_ptr(reinterpret_cast(pB)), - SmemLayoutB{}); + Tensor sB_all = make_tensor(make_smem_ptr(reinterpret_cast(pB)), + SmemLayoutB{}); + Tensor sB = get_region_tensor(sB_all); TileMma tiled_mma; auto thr_mma = tiled_mma.get_thread_slice(tid); auto tiled_copy_B = make_tiled_copy_B(SmemCopyB{}, tiled_mma); @@ -461,8 +497,9 @@ class GemmTensorOp { static CUTE_DEVICE void body_sr(A_type_raw *pA, B_type_raw *pB, C_type_raw *pC) { const int tid = threadIdx.x; - Tensor sA = make_tensor(make_smem_ptr(reinterpret_cast(pA)), - SmemLayoutA{}); + Tensor sA_all = make_tensor(make_smem_ptr(reinterpret_cast(pA)), + SmemLayoutA{}); + Tensor sA = get_region_tensor(sA_all); TileMma tiled_mma; auto thr_mma = tiled_mma.get_thread_slice(tid); auto tiled_copy_A = make_tiled_copy_A(SmemCopyA{}, tiled_mma); @@ -496,79 +533,222 @@ class GemmTensorOp { } // namespace tl_mma -} // namespace cute +} /** + * Execute a tiled GEMM where both A and B tiles are sourced from shared memory. + * + * Dispatches to tl_mma::GemmTensorOp::body to perform the computation. + * + * @param pA Pointer to the A tile region (device memory). + * @param pB Pointer to the B tile region (device memory). + * @param accum Pointer to the accumulator/output tile region (device memory). + */ +/** + * Execute a tiled GEMM where A is read from global memory and B is staged in shared memory. + * + * Dispatches to tl_mma::GemmTensorOp::body_rs to perform the computation. + * + * @param pA Pointer to the A tile region (device memory). + * @param pB Pointer to the B tile region (device memory). + * @param accum Pointer to the accumulator/output tile region (device memory). + */ +/** + * Execute a tiled GEMM where A is staged in shared memory and B is read from global memory. 
+ * + * Dispatches to tl_mma::GemmTensorOp::body_sr to perform the computation. + * + * @param pA Pointer to the A tile region (device memory). + * @param pB Pointer to the B tile region (device memory). + * @param accum Pointer to the accumulator/output tile region (device memory). + */ +/** + * Perform a tiled GEMM (both operands in shared memory or selected backend) and write to accum. + * + * If use_wgmma is true, validates wgmma constraints (strides and offsets) and dispatches to + * the Hopper wgmma implementation; otherwise dispatches to the tl_mma implementation. + * + * @param pA Pointer to the A tile region (device memory). + * @param pB Pointer to the B tile region (device memory). + * @param accum Pointer to the accumulator/output tile region (device memory). + */ +/** + * Perform a tiled GEMM with A in global memory and B in shared memory (or selected backend). + * + * If use_wgmma is true, validates wgmma constraints (strides and offsets) and dispatches to + * the Hopper wgmma read-share implementation; otherwise dispatches to the tl_mma read-share. + * + * @param pA Pointer to the A tile region (device memory). + * @param pB Pointer to the B tile region (device memory). + * @param accum Pointer to the accumulator/output tile region (device memory). + */ +/** + * Perform a tiled GEMM with A staged in shared memory and B in global memory (tl_mma only). + * + * wgmma does not support this variant; caller must set use_wgmma == false. + * Dispatches to tl_mma::GemmTensorOp::body_sr. + * + * @param pA Pointer to the A tile region (device memory). + * @param pB Pointer to the B tile region (device memory). + * @param accum Pointer to the accumulator/output tile region (device memory). + */ +/** + * Wait for a warp-group of WMMA/MMA warps to complete. + * + * Wrapper around cute::warpgroup_wait for the specified number of MMA warps. + */ +/** + * Synchronize a named barrier across NumMmaThreads MMA threads. + * + * Calls cutlass::arch::NamedBarrier::sync with the canonical warp-group id. + */ +/** + * Arrive at a named barrier for NumMmaThreads MMA threads using architecture-aware mapping. + * + * Supported NumMmaThreads values: 256 or 384. The function issues one or two barrier arrives + * depending on the thread-group topology to ensure proper rendezvous ordering. + */ +/** + * Initialize named-barrier state for multi-warp MMA execution. + * + * For NumMmaThreads == 256 or 384, performs the required initial barrier arrivals for + * non-zero canonical warp-group indices to set up subsequent barrier synchronization. 
+ */ namespace tl { namespace tl_mma { template + bool trans_B, bool clear_accum, int lda, int ldb, int offset_a, + int offset_b, typename A_type, typename B_type, typename C_type> CUTLASS_DEVICE void gemm_ss(A_type *pA, B_type *pB, C_type *accum) { using MMA = cute::tl_mma::GemmTensorOp; + trans_B, clear_accum, lda, ldb, offset_a, + offset_b, A_type, B_type, C_type>; MMA::body(pA, pB, accum); } template + bool trans_B, bool clear_accum, int lda, int ldb, int offset_a, + int offset_b, typename A_type, typename B_type, typename C_type> CUTLASS_DEVICE void gemm_rs(A_type *pA, B_type *pB, C_type *accum) { using MMA = cute::tl_mma::GemmTensorOp; + trans_B, clear_accum, lda, ldb, offset_a, + offset_b, A_type, B_type, C_type>; MMA::body_rs(pA, pB, accum); } template + bool trans_B, bool clear_accum, int lda, int ldb, int offset_a, + int offset_b, typename A_type, typename B_type, typename C_type> CUTLASS_DEVICE void gemm_sr(A_type *pA, B_type *pB, C_type *accum) { using MMA = cute::tl_mma::GemmTensorOp; + trans_B, clear_accum, lda, ldb, offset_a, + offset_b, A_type, B_type, C_type>; MMA::body_sr(pA, pB, accum); } } // namespace tl_mma template TL_DEVICE void gemm_ss(A_type *pA, B_type *pB, C_type *accum) { if constexpr (use_wgmma) { + static_assert((trans_A && lda == M) || (!trans_A && lda == K), + "Hopper wgmma doesn't support custom stride for A"); + static_assert((trans_B && ldb == K) || (!trans_B && ldb == N), + "Hopper wgmma doesn't support custom stride for B"); + static_assert(offset_a == 0 && offset_b == 0, + "offset_a and offset_b must be zero for wgmma"); using MMA = cute::tl_wgmma::GemmTensorOp; MMA::body(pA, pB, accum); } else { - using MMA = cute::tl_mma::GemmTensorOp; + using MMA = + cute::tl_mma::GemmTensorOp; MMA::body(pA, pB, accum); } } template -TL_DEVICE void gemm_rs(A_type *pA, B_type *pB, C_type *accum) { +TL_DEVICE /** + * Perform a read-share (B in shared memory, A in global) tiled GEMM and accumulate into `accum`. + * + * Dispatches at compile time to either the Hopper wgmma implementation or the fallback MMA implementation + * depending on `use_wgmma`. The selected GemmTensorOp::body_rs performs the region-tiled GEMM loop and + * updates the accumulator in-place. + * + * When `use_wgmma == true`, this function enforces wgmma constraints at compile time: + * - A's leading dimension must equal (trans_A ? M : K) + * - B's leading dimension must equal (trans_B ? K : N) + * - offset_a and offset_b must be zero + * + * @param pA Pointer to operand A (global memory). Layout/stride expectations depend on template parameters. + * @param pB Pointer to operand B (base for shared-memory staging). Layout/stride expectations depend on template parameters. + * @param accum Pointer to the accumulator/output C buffer updated in-place. 
+ */ +void gemm_rs(A_type *pA, B_type *pB, C_type *accum) { if constexpr (use_wgmma) { + static_assert((trans_A && lda == M) || (!trans_A && lda == K), + "Hopper wgmma doesn't support custom stride for A"); + static_assert((trans_B && ldb == K) || (!trans_B && ldb == N), + "Hopper wgmma doesn't support custom stride for B"); + static_assert(offset_a == 0 && offset_b == 0, + "offset_a and offset_b must be zero for wgmma"); using MMA = cute::tl_wgmma::GemmTensorOp; MMA::body_rs(pA, pB, accum); } else { - using MMA = cute::tl_mma::GemmTensorOp; + using MMA = + cute::tl_mma::GemmTensorOp; MMA::body_rs(pA, pB, accum); } } -template TL_DEVICE void wait_wgmma() { +template +TL_DEVICE /** + * Perform a non-wgmma tiled GEMM where A regions are staged into shared memory + * and B is read directly from global memory, accumulating into `accum`. + * + * This overload dispatches to the tl_mma::GemmTensorOp::body_sr implementation. + * Must be instantiated with `use_wgmma = false` (enforced via static_assert). + * + * @param pA Pointer to the A operand in global memory (source that will be staged to shared memory). + * @param pB Pointer to the B operand in global memory (read directly). + * @param accum Pointer to the output accumulator matrix in global memory. + */ +void gemm_sr(A_type *pA, B_type *pB, C_type *accum) { + static_assert(!use_wgmma, "wgmma doesn't support gemm_sr"); + using MMA = + cute::tl_mma::GemmTensorOp; + MMA::body_sr(pA, pB, accum); +} + +template TL_DEVICE /** + * Wait for all WMMA/MMA warps in the current warp-group to synchronize. + * + * Blocks until the warp-group-wide rendezvous for `num_mma` MMA lanes completes, + * ensuring all participating warps have arrived before proceeding. + */ +void wait_wgmma() { cute::warpgroup_wait(); } diff --git a/src/tl_templates/hip/reduce.h b/src/tl_templates/hip/reduce.h index 02464a181..9307a4fdf 100644 --- a/src/tl_templates/hip/reduce.h +++ b/src/tl_templates/hip/reduce.h @@ -22,7 +22,8 @@ struct MinOp { } }; -template struct AllReduce { +template +struct AllReduce { static_assert(threads == 1024 || threads == 512 || threads == 256 || threads == 128 || threads == 64 || threads == 32 || threads == 16 || threads == 8 || threads == 4 || threads == 2); diff --git a/src/transform/align_dynamic_shared_memory_allocations.cc b/src/transform/align_dynamic_shared_memory_allocations.cc index c27d6759c..184d6b329 100644 --- a/src/transform/align_dynamic_shared_memory_allocations.cc +++ b/src/transform/align_dynamic_shared_memory_allocations.cc @@ -3,6 +3,7 @@ * \brief align dynamic shared memory allocations */ +#include #include #include #include @@ -147,8 +148,11 @@ tvm::transform::Pass AlignDynamicSharedMemoryAllocations(int align_bytes) { "tl.AlignDynamicSharedMemoryAllocations", {}); } -TVM_REGISTER_GLOBAL("tl.transform.AlignDynamicSharedMemoryAllocations") - .set_body_typed(AlignDynamicSharedMemoryAllocations); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.AlignDynamicSharedMemoryAllocations", + AlignDynamicSharedMemoryAllocations); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/annotate_device_regions.cc b/src/transform/annotate_device_regions.cc index 394ad70b0..fb16bbdb3 100644 --- a/src/transform/annotate_device_regions.cc +++ b/src/transform/annotate_device_regions.cc @@ -22,8 +22,9 @@ * \brief Split device function from host. 
*/ #include "tir/transforms/ir_utils.h" +#include +#include #include -#include #include #include #include @@ -87,8 +88,11 @@ tvm::transform::Pass AnnotateDeviceRegions() { return CreatePrimFuncPass(pass_func, 0, "tl.AnnotateDeviceRegions", {}); } -TVM_REGISTER_GLOBAL("tl.transform.AnnotateDeviceRegions") - .set_body_typed(AnnotateDeviceRegions); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.AnnotateDeviceRegions", + AnnotateDeviceRegions); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/atomicadd_vectorize.cc b/src/transform/atomicadd_vectorize.cc new file mode 100644 index 000000000..3ded2ce7c --- /dev/null +++ b/src/transform/atomicadd_vectorize.cc @@ -0,0 +1,283 @@ +/*! + * \file atomicadd_vectorize.cc + * \brief A tool to automatically vectorize atomic add + */ + +#include "../layout/layout.h" +#include "../layout/utils.h" +#include "arith/int_operator.h" +#include "arith/ir_visitor_with_analyzer.h" +#include "common/loop_vectorization_utils.h" +#include +#include +#include +#include +#include +#include + +namespace tvm { +namespace tl { + +using namespace tir; +using arith::IRMutatorWithAnalyzer; +using arith::IRVisitorWithAnalyzer; + +struct AtomicAddVectorizePlanResult { + int vector_size; + bool dynamic; + PrimExpr condition; +}; + +class AtomicAddVectorizePlanner : public arith::IRVisitorWithAnalyzer { +public: + AtomicAddVectorizePlanner() = default; + int max_vector_size = 1; + AtomicAddVectorizePlanResult Plan(const For &node, Var thread_var, + Range thread_bounds, int vectorize_hint) { + this->max_vector_size = vectorize_hint; + this->thread_var = thread_var; + this->thread_bounds = thread_bounds; + this->operator()(node); + return {vector_size_, dynamic_, condition_}; + } + +private: + void VisitStmt_(const ForNode *node) final { + inner_for_ = node; + iter_map_.Set(node->loop_var, Range(node->min, node->extent)); + + arith::IRVisitorWithAnalyzer::VisitStmt_(node); + } + + void VisitExpr_(const CallNode *node) final { + if (node->op == builtin::call_extern() && node->args.size() >= 2) { + if (const auto *func_name = node->args[0].as()) { + if (func_name->value == "AtomicAdd") { + + const CallNode *addr_call = node->args[1].as(); + if (addr_call && addr_call->op == builtin::address_of() && + addr_call->args.size() == 1) { + + const BufferLoadNode *buffer_load_dst = + addr_call->args[0].as(); + const BufferLoadNode *buffer_load_src = + node->args[2].as(); + if (buffer_load_src && buffer_load_src->buffer.defined() && + buffer_load_dst && buffer_load_dst->buffer.defined()) { + + Buffer dst_buffer = buffer_load_dst->buffer; + Array indices_dst = buffer_load_dst->indices; + UpdateVectorSize(indices_dst, dst_buffer); + Buffer src_buffer = buffer_load_src->buffer; + Array indices_src = buffer_load_src->indices; + UpdateVectorSize(indices_src, src_buffer); + } + } + } + } + } + return arith::IRVisitorWithAnalyzer::VisitExpr_(node); + } + + void UpdateVectorSize(const Array indices, const Buffer &buffer) { + if (!inner_for_) + return; + auto extent_ptr = inner_for_->extent.as(); + if (!extent_ptr) + return; + + const DataType &access_type = buffer->dtype; + // i // 2, i % 8 can also be vectorized as factor 16 + // so we should disable this GCD optimization + + max_vector_size = arith::ZeroAwareGCD(max_vector_size, extent_ptr->value); + + auto last_dim = buffer->shape.back(); + auto mod_set = analyzer_.modular_set(last_dim); + // when dynamic shape like [m, k]: coeff=1, base=0, GCD will block + // 
conditionally tail vectorize + if (buffer->shape.back().as()) { + + max_vector_size = arith::ZeroAwareGCD(max_vector_size, mod_set->coeff); + + auto gcd_base = arith::ZeroAwareGCD(max_vector_size, mod_set->base); + // If gcd_base is equal to the last dimension, + // we should analyze the second-to-last dimension + // in relation to the last dimension. + if (gcd_base < Downcast(last_dim)->value) { + max_vector_size = gcd_base; + } + + vector_size_ = arith::ZeroAwareGCD(max_vector_size, vector_size_); + + PrimExpr elem_offset = 0; + PrimExpr stride = 1; + for (int i = indices.size() - 1; i >= 0; --i) { + elem_offset = elem_offset + indices[i] * stride; + stride = stride * buffer->shape[i]; + } + PrimExpr thread_extent = thread_bounds->extent; + while (!IndiceCanVectorize(elem_offset, thread_var, thread_extent, + vector_size_, &analyzer_)) { + vector_size_ /= 2; + } + } else if (vector_size_ <= 4) { + // dynamic shape load: get the vectorization condition + dynamic_ = true; + PrimExpr offset = buffer.OffsetOf(indices).back(); + condition_ = (FloorMod(offset, vector_size_) == 0); + } + } + + const ForNode *inner_for_; + Map iter_map_; + bool has_nonlocal_memory_access_ = false; + int vector_size_ = 4; + Var thread_var; + Range thread_bounds; + bool dynamic_ = false; + PrimExpr condition_; +}; + +class AtomicAddVectorizeRewriter : public StmtExprMutator { +public: + AtomicAddVectorizeRewriter(AtomicAddVectorizePlanResult plan) + : vector_size_(plan.vector_size), condition_(plan.condition), + dynamic_(plan.dynamic) {} + +private: + Stmt VisitStmt_(const ForNode *node) final { + inner_for_ = node; + auto ret = StmtExprMutator::VisitStmt_(node); + if (inner_for_ == node) { // rewrite the innermost loop + For fnode = ret.as().value(); + auto old_var = fnode->loop_var; + auto extent_ptr = as_const_int(fnode->extent); + ICHECK(extent_ptr) << fnode->extent; + int extent = *extent_ptr; + ICHECK(extent % vector_size_ == 0) + << "extent: " << extent << " vector_size_: " << vector_size_; + ICHECK(is_zero(fnode->min)); + if (!dynamic_) { + Var tx_var; + PostOrderVisit(fnode->body, [&tx_var](const ObjectRef &node) { + if (const VarNode *var = node.as()) { + if (var->name_hint == "tx") { + tx_var = GetRef(var); + } + } + }); + ICHECK(tx_var.defined()) << "Failed to find tx var"; + Var outer_var = Var(old_var->name_hint + "_outer"); + Map vmap; + vmap.Set(tx_var, + truncmod(tx_var, extent / vector_size_) * vector_size_); + vmap.Set(fnode->loop_var, outer_var * vector_size_ + + truncdiv(tx_var, extent / vector_size_)); + Stmt body = Substitute(fnode->body, vmap); + return For(outer_var, 0, extent / vector_size_, fnode->kind, body, + fnode->thread_binding, fnode->annotations, fnode->span); + } else { + return fnode; + } + } else { + return ret; + } + } + + PrimExpr VisitExpr_(const CallNode *node) final { + + if (vector_size_ == 2 || vector_size_ == 4) { + if (node->op == builtin::call_extern() && node->args.size() >= 2) { + if (const auto *func_name = node->args[0].as()) { + if (func_name->value == "AtomicAdd") { + PrimExpr value_node = node->args[2]; + + Call address_of_value = tvm::tir::Call( + DataType::Handle(), builtin::address_of(), {value_node}); + + Array new_args; + if (vector_size_ == 2) { + new_args.push_back(StringImm("AtomicAddx2")); + } else { + new_args.push_back(StringImm("AtomicAddx4")); + } + + new_args.push_back(node->args[1]); + new_args.push_back(address_of_value); + + Call new_call = + tvm::tir::Call(node->dtype, builtin::call_extern(), new_args); + + return new_call; + } + } + } + } + 
return StmtExprMutator::VisitExpr_(node); + } + + const ForNode *inner_for_; + const int vector_size_; + const PrimExpr condition_; + const bool dynamic_; +}; + +static int GetVectorizeSizeMax(int compute_capability, DataType dtype) { + + if (dtype == DataType::Float(16)) { + return 2; + } + if (dtype == DataType::BFloat(16)) { + if (compute_capability > 75) { + return 2; + } else { + return 1; + } + } + if (dtype == DataType::Float(32)) { + if (compute_capability >= 90) { + return 4; + } else { + return 1; + } + } + return 1; +} + +For VectorizeAtomicAdd(const For &for_node, Var thread_var, Range thread_bounds, + int compute_capability) { + + int vectorize_size_max = 1; + + PostOrderVisit(for_node->body, [&](const ObjectRef &obj) { + if (const auto *call = obj.as()) { + if (call->op == builtin::call_extern() && call->args.size() >= 2) { + const auto *func_name = call->args[0].as(); + if (func_name->value == "AtomicAdd") { + DataType dtype = + call->args[1].as()->args[0].as()->dtype; + vectorize_size_max = GetVectorizeSizeMax(compute_capability, dtype); + } + } + } + }); + + if (vectorize_size_max != 1) { + int vectorize_hint = vectorize_size_max; + AtomicAddVectorizePlanResult res = {1, false, 0}; + AtomicAddVectorizePlanner planner; + res = planner.Plan(for_node, thread_var, thread_bounds, vectorize_hint); + vectorize_hint = res.vector_size; + + if (vectorize_hint == 1) + return for_node; + auto rewriter = AtomicAddVectorizeRewriter(res); + return Downcast(rewriter(for_node)); + } else { + return for_node; + } +} + +} // namespace tl +} // namespace tvm diff --git a/src/transform/atomicadd_vectorize.h b/src/transform/atomicadd_vectorize.h new file mode 100644 index 000000000..cd1eae08b --- /dev/null +++ b/src/transform/atomicadd_vectorize.h @@ -0,0 +1,23 @@ +/*! + * \file atomicadd_vectorize.h + * \brief A tool to automatically vectorize a for atomicadd + */ + +#ifndef TVM_TL_ATOMICADD_VECTORIZE_H_ +#define TVM_TL_ATOMICADD_VECTORIZE_H_ + +#include +#include + +namespace tvm { +namespace tl { + +using namespace tir; + +For VectorizeAtomicAdd(const For &for_node, Var thread_var, Range thread_bounds, + int compute_capability); + +} // namespace tl +} // namespace tvm + +#endif // TVM_TL_ATOMICADD_VECTORIZE_H_ \ No newline at end of file diff --git a/src/transform/cluster_planning.cc b/src/transform/cluster_planning.cc index 5fcbf5c4d..014b4c7b2 100644 --- a/src/transform/cluster_planning.cc +++ b/src/transform/cluster_planning.cc @@ -1,28 +1,11 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - /*! 
* \file clasuter_planning.cc * \brief Plan the cluster for GPU(sm90+) blocks */ #include +#include +#include #include #include #include @@ -132,8 +115,10 @@ tvm::transform::Pass ClusterPlanning() { return CreatePrimFuncPass(pass_func, 0, "tl.ClusterPlanning", {}); } -TVM_REGISTER_GLOBAL("tl.transform.ClusterPlanning") - .set_body_typed(ClusterPlanning); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.ClusterPlanning", ClusterPlanning); +}); } // namespace transform } // namespace tir diff --git a/src/transform/common/loop_vectorization_utils.h b/src/transform/common/loop_vectorization_utils.h index 012ce3e74..1ede15098 100644 --- a/src/transform/common/loop_vectorization_utils.h +++ b/src/transform/common/loop_vectorization_utils.h @@ -599,7 +599,7 @@ class Vectorizer : public StmtMutator, return Scalarize(GetRef(op)); } Stmt then_case = this->VisitStmt(op->then_case); - Optional else_case = NullOpt; + Optional else_case = std::nullopt; if (op->else_case) { else_case = this->VisitStmt(op->else_case.value()); } @@ -681,10 +681,6 @@ class Vectorizer : public StmtMutator, stmt = Substitute(stmt, {{var_, idx}}); return For(idx, IntImm(var_->dtype, 0), var_lanes_, ForKind::kSerial, stmt); } - // ProducerStore - Stmt VisitStmt_(const ProducerStoreNode *op) final { - LOG(FATAL) << "ProducerProvide cannot appear in a TIR PrimFunc"; - } private: // analyzer diff --git a/src/transform/common/union_find.h b/src/transform/common/union_find.h new file mode 100644 index 000000000..75192ad37 --- /dev/null +++ b/src/transform/common/union_find.h @@ -0,0 +1,52 @@ +#ifndef TVM_TL_TRANSFORM_COMMON_UNION_FIND_H_ +#define TVM_TL_TRANSFORM_COMMON_UNION_FIND_H_ + +#include +#include + +namespace tvm { +namespace tl { + +template class UnionFind { +public: + void MakeSet(const T &x) { + if (parent_.find(x) == parent_.end()) { + parent_[x] = x; + rank_[x] = 0; + } + } + + T Find(const T &x) { + if (parent_[x] != x) { + parent_[x] = Find(parent_[x]); // Path compression + } + return parent_[x]; + } + + void Union(const T &x, const T &y) { + T x_root = Find(x); + T y_root = Find(y); + + if (x_root == y_root) + return; + + // Union by rank + if (rank_[x_root] < rank_[y_root]) { + parent_[x_root] = y_root; + } else if (rank_[x_root] > rank_[y_root]) { + parent_[y_root] = x_root; + } else { + parent_[y_root] = x_root; + rank_[x_root]++; + } + } + +private: + std::unordered_map parent_; + std::unordered_map rank_; +}; + +} // namespace tl +} // namespace tvm + +#endif // TVM_TL_TRANSFORM_COMMON_UNION_FIND_H_ diff --git a/src/transform/config_index_bitwidth.cc b/src/transform/config_index_bitwidth.cc index 53a3c9b49..10d242dfe 100644 --- a/src/transform/config_index_bitwidth.cc +++ b/src/transform/config_index_bitwidth.cc @@ -1,5 +1,7 @@ #include "../op/builtin.h" -#include +#include "arith/ir_mutator_with_analyzer.h" +#include +#include #include #include #include @@ -9,6 +11,7 @@ namespace tvm { namespace tl { using namespace tir; +using namespace arith; class ConfigIndexBitwidthRewriter : public IndexDataTypeRewriter { public: using Parent = IndexDataTypeRewriter; @@ -67,6 +70,92 @@ class ConfigIndexBitwidthRewriter : public IndexDataTypeRewriter { int _index_bitwidth_; }; +class IndexLegalizer : public IRMutatorWithAnalyzer { + +public: + static Stmt Rewrite(Stmt stmt) { + Analyzer ana; + auto pass = IndexLegalizer(&ana); + return pass.VisitStmt(stmt); + } + +private: + explicit IndexLegalizer(arith::Analyzer *ana) : IRMutatorWithAnalyzer(ana) {} + + class 
Int64Promoter : public IndexDataTypeRewriter { + public: + using Parent = IndexDataTypeRewriter; + + PrimExpr VisitExpr_(const VarNode *op) final { + if (op->dtype.is_int() && op->dtype.bits() < 64) { + return cast(DataType::Int(64), GetRef(op)); + } + return GetRef(op); + } + + PrimExpr VisitExpr_(const IntImmNode *op) final { + if (op->dtype.is_int() && op->dtype.bits() < 64) { + return IntImm(DataType::Int(64), op->value); + } + return GetRef(op); + } + + PrimExpr VisitExpr_(const CastNode *op) final { + if (op->dtype.is_int() && op->dtype.bits() < 64) { + return cast(DataType::Int(64), op->value); + } + return GetRef(op); + } + + Stmt VisitStmt_(const BufferStoreNode *op) final { + // Force indices to be int64 + auto node = Downcast(Parent::VisitStmt_(op)); + return std::move(node); + } + + PrimExpr VisitExpr_(const BufferLoadNode *op) final { + auto node = Downcast(Parent::VisitExpr_(op)); + return std::move(node); + } + }; + + Stmt VisitStmt_(const BufferStoreNode *op) final { + auto buffer_store = + Downcast(IRMutatorWithAnalyzer::VisitStmt_(op)); + auto indices = buffer_store->indices; + for (auto index : indices) { + if (index->dtype.is_int() && index->dtype.bits() < 64) { + auto int_bound = analyzer_->const_int_bound(index); + if (int_bound->max_value >= (1LL << (index->dtype.bits() - 1)) - 1 || + int_bound->min_value < -(1LL << (index->dtype.bits() - 1))) { + Int64Promoter promoter; + index = promoter(index); + } + } + } + buffer_store.CopyOnWrite()->indices = indices; + return std::move(buffer_store); + } + + PrimExpr VisitExpr_(const BufferLoadNode *op) final { + auto buffer_load = + Downcast(IRMutatorWithAnalyzer::VisitExpr_(op)); + auto indices = buffer_load->indices; + for (auto index : indices) { + if (index->dtype.is_int() && index->dtype.bits() < 64) { + auto int_bound = analyzer_->const_int_bound(index); + if (int_bound->max_value >= (1LL << (index->dtype.bits() - 1)) - 1 || + int_bound->min_value < -(1LL << (index->dtype.bits() - 1))) { + Int64Promoter promoter; + index = promoter(index); + } + } + } + buffer_load.CopyOnWrite()->indices = indices; + return std::move(buffer_load); + } +}; + tvm::transform::Pass ConfigIndexBitwidth() { using namespace tir::transform; auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) { @@ -80,13 +169,18 @@ tvm::transform::Pass ConfigIndexBitwidth() { n->body = ConfigIndexBitwidthRewriter(config_index_bitwidth)( std::move(n->body)); } + // Legalize out-of-bound indices to be int64 + n->body = IndexLegalizer::Rewrite(std::move(n->body)); return f; }; return CreatePrimFuncPass(pass_func, 0, "tl.ConfigIndexBitwidth", {}); } -TVM_REGISTER_GLOBAL("tl.transform.ConfigIndexBitwidth") - .set_body_typed(ConfigIndexBitwidth); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.ConfigIndexBitwidth", + ConfigIndexBitwidth); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/eliminate_storage_sync_for_mbarrier.cc b/src/transform/eliminate_storage_sync_for_mbarrier.cc index ea18f3596..7d48dcd08 100644 --- a/src/transform/eliminate_storage_sync_for_mbarrier.cc +++ b/src/transform/eliminate_storage_sync_for_mbarrier.cc @@ -5,7 +5,8 @@ #include "./storage_access.h" #include "arith/ir_mutator_with_analyzer.h" #include "arith/ir_visitor_with_analyzer.h" -#include +#include +#include #include #include #include @@ -115,8 +116,11 @@ tvm::transform::Pass EliminateStorageSyncForMBarrier() { {}); } -TVM_REGISTER_GLOBAL("tl.transform.EliminateStorageSyncForMBarrier") - 
.set_body_typed(EliminateStorageSyncForMBarrier); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.EliminateStorageSyncForMBarrier", + EliminateStorageSyncForMBarrier); +}); } // namespace transform } // namespace tl diff --git a/src/transform/flatten_buffer.cc b/src/transform/flatten_buffer.cc index 190b98db8..11ea423f0 100644 --- a/src/transform/flatten_buffer.cc +++ b/src/transform/flatten_buffer.cc @@ -24,6 +24,7 @@ #include "arith/ir_mutator_with_analyzer.h" #include "tir/transforms/ir_utils.h" #include +#include #include #include #include @@ -59,43 +60,6 @@ class BufferFlattener : public arith::IRMutatorWithAnalyzer { using IRMutatorWithAnalyzer::VisitStmt; using IRMutatorWithAnalyzer::VisitStmt_; - class Int64Promoter : public tir::IndexDataTypeRewriter { - public: - using Parent = IndexDataTypeRewriter; - - PrimExpr VisitExpr_(const VarNode *op) final { - if (op->dtype.is_int() && op->dtype.bits() < 64) { - return cast(DataType::Int(64), GetRef(op)); - } - return GetRef(op); - } - - PrimExpr VisitExpr_(const IntImmNode *op) final { - if (op->dtype.is_int() && op->dtype.bits() < 64) { - return IntImm(DataType::Int(64), op->value); - } - return GetRef(op); - } - - PrimExpr VisitExpr_(const CastNode *op) final { - if (op->dtype.is_int() && op->dtype.bits() < 64) { - return cast(DataType::Int(64), op->value); - } - return GetRef(op); - } - - Stmt VisitStmt_(const BufferStoreNode *op) final { - // Force indices to be int64 - auto node = Downcast(Parent::VisitStmt_(op)); - return std::move(node); - } - - PrimExpr VisitExpr_(const BufferLoadNode *op) final { - auto node = Downcast(Parent::VisitExpr_(op)); - return std::move(node); - } - }; - explicit BufferFlattener(arith::Analyzer *ana) : IRMutatorWithAnalyzer(ana) {} Stmt VisitStmt_(const BlockNode *op) final { @@ -276,29 +240,7 @@ class BufferFlattener : public arith::IRMutatorWithAnalyzer { Array GetSimplifiedElemOffset(const Buffer &buffer, const Array &indices) { auto flattened_indices = buffer->ElemOffset(indices); - Array safe_indices; - for (auto index : flattened_indices) { - auto int_bound = analyzer_->const_int_bound(index); - DataType dtype = index->dtype; - if (dtype.is_int() && dtype.bits() < 64) { - int64_t max_value = int_bound->max_value; - int64_t min_value = int_bound->min_value; - const int64_t type_max = (1LL << (dtype.bits() - 1)); - const int64_t type_min = -(1LL << (dtype.bits() - 1)); - - if (max_value >= (type_max - 1) || min_value < type_min) { - Int64Promoter promoter; - for (auto &index : flattened_indices) { - safe_indices.push_back(promoter(index)); - } - } else { - safe_indices.push_back(index); - } - } else { - safe_indices.push_back(index); - } - } - return this->IterMapSimplifyWithContext(safe_indices, false); + return this->IterMapSimplifyWithContext(flattened_indices, false); } template Node VisitBufferAccess(Node node) { @@ -352,12 +294,7 @@ class BufferFlattener : public arith::IRMutatorWithAnalyzer { }; PrimFunc FlattenBufferRewriter(PrimFunc f) { - // Only apply this pass to TIR that is not from TE schedules - if (!IsFromLegacyTESchedule(f)) { - return BufferFlattener::Flatten(f); - } else { - return f; - } + return BufferFlattener::Flatten(f); } using namespace tir::transform; @@ -368,7 +305,10 @@ tvm::transform::Pass FlattenBuffer() { return CreatePrimFuncPass(pass_func, 0, "tl.FlattenBuffer", {}); } -TVM_REGISTER_GLOBAL("tl.transform.FlattenBuffer").set_body_typed(FlattenBuffer); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = 
tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.FlattenBuffer", FlattenBuffer); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/frontend_legalize.cc b/src/transform/frontend_legalize.cc index 8b3d0300d..2d8129b59 100644 --- a/src/transform/frontend_legalize.cc +++ b/src/transform/frontend_legalize.cc @@ -22,6 +22,7 @@ * \brief Legalize the program from frontend */ +#include #include #include #include @@ -88,8 +89,10 @@ Pass FrontendLegalize() { return CreatePrimFuncPass(pass_func, 0, "tl.FrontendLegalize", {}); } -TVM_REGISTER_GLOBAL("tl.transform.FrontendLegalize") - .set_body_typed(FrontendLegalize); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.FrontendLegalize", FrontendLegalize); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/if_stmt_binding.cc b/src/transform/if_stmt_binding.cc index d27571e1e..0247676d1 100644 --- a/src/transform/if_stmt_binding.cc +++ b/src/transform/if_stmt_binding.cc @@ -3,6 +3,7 @@ * \brief Bind the If Stmt to each Stmt in SeqStmt */ +#include #include #include #include @@ -80,7 +81,10 @@ tvm::transform::Pass IfStmtBinding() { return CreatePrimFuncPass(pass_func, 0, "tl.IfStmtBinding", {}); } -TVM_REGISTER_GLOBAL("tl.transform.IfStmtBinding").set_body_typed(IfStmtBinding); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.IfStmtBinding", IfStmtBinding); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/inject_fence_proxy.cc b/src/transform/inject_fence_proxy.cc index 896e9ab85..e9950ad1d 100644 --- a/src/transform/inject_fence_proxy.cc +++ b/src/transform/inject_fence_proxy.cc @@ -22,6 +22,7 @@ * \brief Inject fence between generic and async proxies (sm90+) */ +#include #include #include #include @@ -193,8 +194,10 @@ tvm::transform::Pass InjectFenceProxy() { return CreatePrimFuncPass(pass_func, 0, "tl.InjectFenceProxy", {}); } -TVM_REGISTER_GLOBAL("tl.transform.InjectFenceProxy") - .set_body_typed(InjectFenceProxy); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.InjectFenceProxy", InjectFenceProxy); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/inject_pipeline.cc b/src/transform/inject_pipeline.cc index 0766be9a9..0432c7333 100644 --- a/src/transform/inject_pipeline.cc +++ b/src/transform/inject_pipeline.cc @@ -36,6 +36,8 @@ namespace tvm { namespace tl { using namespace tir; +namespace software_pipeline { + /*! * \brief Create a block and infer the access region with the given body. * @@ -80,35 +82,6 @@ struct BufferAccessInfo { int use = -1; // the last using stage of the buffer }; -/*! 
- * \brief Replace IfThenElse nodes with their then_case, preserving attribute - * nodes \param body The statement to process \param condition The condition to - * match in IfThenElse nodes \return The transformed statement - */ -Stmt replace_if_then_else(Stmt body, PrimExpr condition) { - if (const auto *if_node = body.as()) { - // If this is an IfThenElse with the matching condition, replace it with its - // then_case - if (if_node->condition.same_as(condition)) { - return if_node->then_case; - } - } else if (const auto *attr_node = body.as()) { - // For attribute nodes, preserve the attribute but process its body - AttrStmt attr_stmt = GetRef(attr_node); - attr_stmt.CopyOnWrite()->body = - replace_if_then_else(attr_node->body, condition); - return attr_stmt; - } else if (const auto *block_node = body.as()) { - // For block nodes, process the body - Block block = GetRef(block_node); - block.CopyOnWrite()->body = - replace_if_then_else(block_node->body, condition); - return block; - } - // For any other node type, return it unchanged - return body; -} - /*! * \brief Rewriter for the body of the software pipeline. This pass inserts * `floormod` to indices of the remapped buffer to select the version @@ -201,14 +174,14 @@ class PipelineBodyRewriter : public StmtExprMutator { for (const Buffer &alloc_buffer : op->alloc_buffers) { buffer_data_to_buffer_.erase(alloc_buffer->data); } - return std::move(block); + return block; } Stmt VisitStmt_(const BufferStoreNode *op) final { BufferStore store = Downcast(StmtExprMutator::VisitStmt_(op)); auto it = buffer_remap_.find(store->buffer); if (it == buffer_remap_.end()) { - return std::move(store); + return store; } const Buffer &new_buffer = (*it).second; auto *n = store.CopyOnWrite(); @@ -216,14 +189,14 @@ class PipelineBodyRewriter : public StmtExprMutator { PrimExpr version = floormod( (pipeline_loop_->loop_var - pipeline_loop_->min), new_buffer->shape[0]); n->indices.insert(n->indices.begin(), version); - return std::move(store); + return store; } PrimExpr VisitExpr_(const BufferLoadNode *op) final { BufferLoad load = Downcast(StmtExprMutator::VisitExpr_(op)); auto it = buffer_remap_.find(load->buffer); if (it == buffer_remap_.end()) { - return std::move(load); + return load; } const Buffer &new_buffer = (*it).second; auto *n = load.CopyOnWrite(); @@ -231,7 +204,7 @@ class PipelineBodyRewriter : public StmtExprMutator { PrimExpr version = floormod( (pipeline_loop_->loop_var - pipeline_loop_->min), new_buffer->shape[0]); n->indices.insert(n->indices.begin(), version); - return std::move(load); + return load; } PrimExpr VisitExpr_(const CallNode *op) final { @@ -256,12 +229,10 @@ class PipelineRewriter : public StmtExprMutator { public: PipelineRewriter(Map buffer_data_to_buffer, const Array &pipeline_allocs, - const For &pipeline_loop, const PipelineInfo &pipeline_info, - PrimExpr predicate_condition = PrimExpr()) + const For &pipeline_loop, const PipelineInfo &pipeline_info) : buffer_data_to_buffer_(std::move(buffer_data_to_buffer)), pipeline_allocs_(pipeline_allocs), pipeline_loop_(pipeline_loop), - pipeline_info_(pipeline_info), - predicate_condition_(predicate_condition) {} + pipeline_info_(pipeline_info) {} Stmt BuildPipeline() { // Step 1: Analyze accesses to the buffers in the pipeline and compute the @@ -665,7 +636,6 @@ class PipelineRewriter : public StmtExprMutator { // Async related std::map async_states_local; - PrimExpr normalized_access_index; for (const Block &block : ordered_stmts_) { int stage = pipeline_info_.at(block).stage; @@ 
-688,7 +658,7 @@ class PipelineRewriter : public StmtExprMutator { // - "producer_head" if this stage is an async producer // - "consumer_head" if this stage reads from asynchronously written // buffers. - normalized_access_index = + PrimExpr normalized_access_index = is_unit_loop ? skewed_loop_var : skewed_loop_var + delta; // Adjust the block predicate and the body according to the final loop @@ -700,13 +670,6 @@ class PipelineRewriter : public StmtExprMutator { } new_block = Downcast(Substitute( new_block, {{pipeline_loop_->loop_var, normalized_access_index}})); - if (predicate_condition_.defined()) { - BlockNode *n = new_block.CopyOnWrite(); - n->body = IfThenElse( - Substitute(predicate_condition_, - {{pipeline_loop_->loop_var, normalized_access_index}}), - n->body); - } if (pipeline_info_[block].async) { auto &local_state = async_states_local[stage]; local_state.producer_head = normalized_access_index; @@ -737,7 +700,7 @@ class PipelineRewriter : public StmtExprMutator { } if (!is_unit_loop) { - Map preserved_annotations; + Map preserved_annotations; for (const auto &kv : pipeline_loop_->annotations) { const String &key = kv.first; if (kv.first != tir::attr::software_pipeline_stage && @@ -748,7 +711,7 @@ class PipelineRewriter : public StmtExprMutator { } new_loop = For(Downcast(new_loop_var), pipeline_loop_->min, extent, unroll_loop ? ForKind::kUnrolled : pipeline_loop_->kind, - std::move(new_loop), NullOpt, preserved_annotations); + std::move(new_loop), std::nullopt, preserved_annotations); } // Update producer heads in the global async states. for (const auto &[stage_id, state] : async_states_local) { @@ -764,7 +727,6 @@ class PipelineRewriter : public StmtExprMutator { Array pipeline_allocs_; For pipeline_loop_; PipelineInfo pipeline_info_; - PrimExpr predicate_condition_; int max_stage_ = -1; Map buffer_remap_; Array ordered_stmts_; @@ -873,13 +835,12 @@ class PipelineInjector : private StmtExprMutator { // Step 1: Recursively rewrite the children first. For for_node = Downcast(StmtExprMutator::VisitStmt_(op)); if (!HasPipelineAnnotation(op)) { - return std::move(for_node); + return for_node; } // Step 2: Find the body and buffer allocations of the pipeline. The body // can be direct child of the for-loop. If the for-loop has BlockRealize as // its child, the pipeline body will be the child of the block. 
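As an aside on the version remapping shown in the PipelineBodyRewriter hunk above: each access to a multi-versioned buffer gains a leading index floormod(loop_var - min, num_versions), so successive pipeline iterations rotate through the buffer copies. A minimal, self-contained sketch of just that indexing scheme (the names "versions" and "start" are hypothetical stand-ins for new_buffer->shape[0] and pipeline_loop_->min; this is not code from the pass):

#include <cstdio>

int main() {
  constexpr int versions = 2;  // number of buffer copies (stands in for new_buffer->shape[0])
  constexpr int start = 0;     // stands in for pipeline_loop_->min
  for (int i = start; i < start + 8; ++i) {
    // floormod(loop_var - min, versions): which copy this iteration touches
    int version = (i - start) % versions;
    std::printf("iteration %d -> buffer version %d\n", i, version);
  }
  return 0;
}

For non-negative loop variables the C++ % operator agrees with TIR's floormod, which is why the modulo above is a faithful stand-in for the rewritten index.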
Stmt pipeline_body{nullptr}; - PrimExpr predicate_condition{nullptr}; Array pipeline_allocs; if (const auto *realize = for_node->body.as()) { const auto &block = realize->block; @@ -887,15 +848,7 @@ class PipelineInjector : private StmtExprMutator { ICHECK(buffer->IsInstance()); buffer_data_to_buffer_.Set(buffer->data, buffer); } - if (const auto *if_then_else = block->body.as()) { - ICHECK(!if_then_else->else_case.defined()) - << "Pipeline_Planning: Can't handle the body of the loop because " - "it is not a SeqStmt"; - pipeline_body = if_then_else->then_case; - predicate_condition = if_then_else->condition; - } else { - pipeline_body = block->body; - } + pipeline_body = block->body; pipeline_allocs = block->alloc_buffers; } else { pipeline_body = for_node->body; @@ -955,7 +908,7 @@ class PipelineInjector : private StmtExprMutator { std::unordered_set pipeline_async_stages; if (auto annot = op->annotations.Get(tir::attr::software_pipeline_async_stages)) { - for (auto s : Downcast>(annot)) { + for (auto s : Downcast>(annot.value())) { pipeline_async_stages.insert(s->value); } } @@ -973,10 +926,9 @@ class PipelineInjector : private StmtExprMutator { ValidatePipelineBody(pipeline_info, original_order); // Step 4: Rewrite the pipeline body. - Stmt pipeline = - PipelineRewriter(buffer_data_to_buffer_, pipeline_allocs, - GetRef(op), pipeline_info, predicate_condition) - .BuildPipeline(); + Stmt pipeline = PipelineRewriter(buffer_data_to_buffer_, pipeline_allocs, + GetRef(op), pipeline_info) + .BuildPipeline(); if (const auto *realize = op->body.as()) { const auto &block = realize->block; @@ -997,7 +949,7 @@ class PipelineInjector : private StmtExprMutator { for (const auto &buffer : op->alloc_buffers) { buffer_data_to_buffer_.erase(buffer->data); } - return std::move(block); + return block; } bool HasPipelineAnnotation(const ForNode *op) const { @@ -1022,6 +974,7 @@ class PipelineInjector : private StmtExprMutator { Map buffer_data_to_buffer_; Optional global_symbol_; }; +} // namespace software_pipeline /*! 
* \brief Transform annotated loops into pipelined one that parallelize @@ -1031,15 +984,18 @@ tir::transform::Pass InjectSoftwarePipeline() { using namespace tir::transform; auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { auto *fptr = f.CopyOnWrite(); - fptr->body = PipelineInjector::Inject(f); + fptr->body = software_pipeline::PipelineInjector::Inject(f); fptr->body = ConvertSSA(std::move(fptr->body)); return f; }; return CreatePrimFuncPass(pass_func, 0, "tl.InjectSoftwarePipeline", {}); } -TVM_REGISTER_GLOBAL("tl.transform.InjectSoftwarePipeline") - .set_body_typed(InjectSoftwarePipeline); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.InjectSoftwarePipeline", + InjectSoftwarePipeline); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/inject_ptx_async_copy.cc b/src/transform/inject_ptx_async_copy.cc index f4259d21e..af9ae8e63 100644 --- a/src/transform/inject_ptx_async_copy.cc +++ b/src/transform/inject_ptx_async_copy.cc @@ -21,6 +21,7 @@ * \brief Replace copy from global to shared with async copy * \file inject_ptx_async_copy.cc */ +#include #include #include #include @@ -231,8 +232,10 @@ tvm::transform::Pass InjectPTXAsyncCopy() { return CreatePrimFuncPass(pass_func, 0, "tl.InjectPTXAsyncCopy", {}); } -TVM_REGISTER_GLOBAL("tl.transform.InjectPTXAsyncCopy") - .set_body_typed(InjectPTXAsyncCopy); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.InjectPTXAsyncCopy", InjectPTXAsyncCopy); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/inject_tma_barrier.cc b/src/transform/inject_tma_barrier.cc index 7d5ede9dd..5df349bb7 100644 --- a/src/transform/inject_tma_barrier.cc +++ b/src/transform/inject_tma_barrier.cc @@ -23,6 +23,7 @@ */ #include +#include #include #include #include @@ -121,8 +122,11 @@ class TmaExpectTxRewriter : public IRMutatorWithAnalyzer { Stmt VisitStmt_(const IfThenElseNode *op) { // Check if this is the TMA block - const EQNode *eq = op->condition.as(); - if (eq != nullptr) { + bool flag = false; + if (op->condition.as()) { + flag = op->condition.as()->op.same_as(tl_shuffle_elect()); + } + if (op->condition.as() || flag) { Stmt ret = IRMutatorWithAnalyzer::VisitStmt_(op); if (visited_tma_load_) { @@ -163,6 +167,9 @@ class TmaExpectTxRewriter : public IRMutatorWithAnalyzer { class TmaBarrierCollector : public IRVisitorWithAnalyzer { public: + TmaBarrierCollector(Map buffer_data_to_buffer) + : buffer_data_to_buffer_(std::move(buffer_data_to_buffer)) {} + Map tma_op_to_barrier_id() { return tma_op_to_barrier_id_; } @@ -221,7 +228,128 @@ class TmaBarrierCollector : public IRVisitorWithAnalyzer { std::vector pending_tma_ops_; Map tma_op_to_barrier_id_; Map barrier_id_to_range_; + Map buffer_data_to_buffer_; +}; + +class TmaSequenceCollector : public IRVisitorWithAnalyzer { +public: + TmaSequenceCollector(Map tma_op_to_barrier_id) + : tma_op_to_barrier_id_(std::move(tma_op_to_barrier_id)) {} + + std::vector GetSequence() { + std::vector clear_zero_list(expect_tx_count_, false); + int zero_idx = -1; + int zero_count = 0; + + for (auto v : sequence) { + if (v == 0) { + zero_count += 1; + zero_idx += 1; + } else { + if (zero_count == 1) { + clear_zero_list[zero_idx] = expect_[zero_idx] && !has_simt_copy_; + if (clear_zero_list[zero_idx] == false) { + int begin = int_sets_[zero_idx].min().as()->value; + int end = int_sets_[zero_idx].max().as()->value; + for (int i = begin; i <= end; ++i) { + 
restore_barrier_ids_.push_back(i); + } + } + } else { + for (int i{zero_idx}; i > zero_idx - zero_count; --i) { + int begin = int_sets_[i].min().as()->value; + int end = int_sets_[i].max().as()->value; + for (int i = begin; i <= end; ++i) { + restore_barrier_ids_.push_back(i); + } + } + } + zero_count = 0; + } + } + + return clear_zero_list; + } + + std::vector GetRestoreBarrierIds() { return restore_barrier_ids_; } + + void VisitStmt_(const ForNode *op) final { + var_int_set_.Set(op->loop_var, + arith::IntSet::FromMinExtent(op->min, op->extent)); + IRVisitorWithAnalyzer::VisitStmt_(op); + } + + void VisitExpr_(const CallNode *op) final { + if (op->op.same_as(mbarrier_expect_tx())) { + PrimExpr e = + tma_op_to_barrier_id_[GetRef(op)].as()->args[0]; + auto int_set = arith::EvalSet(e, var_int_set_); + expect_.push_back(if_depth_ == 1); + sequence.push_back(0); + int_sets_.push_back(int_set); + expect_tx_count_ += 1; + } else if (op->op.same_as(builtin::ptx_arrive_barrier())) { + sequence.push_back(1); + } else if (op->op.same_as(builtin::ptx_cp_async_barrier())) { + has_simt_copy_ = true; + } + IRVisitorWithAnalyzer::VisitExpr_(op); + } + + void VisitStmt_(const IfThenElseNode *op) final { + if_depth_ += 1; + + IRVisitorWithAnalyzer::VisitStmt(op->then_case); + + if (op->else_case) { + IRVisitorWithAnalyzer::VisitStmt(op->else_case.value()); + } + if_depth_ -= 1; + } + + std::vector sequence; + int expect_tx_count_{0}; + std::vector expect_; + bool has_simt_copy_{false}; + std::vector restore_barrier_ids_; + int if_depth_{0}; + Map tma_op_to_barrier_id_; + arith::Analyzer *analyzer_; + Map var_int_set_; + std::vector int_sets_; }; + +class BarrierCreationRewriter : public StmtExprMutator { +public: + BarrierCreationRewriter(std::vector restore_barrier_ids, + PrimExpr producer_thread_extent) + : restore_barrier_ids_(std::move(restore_barrier_ids)), + producer_thread_extent_(producer_thread_extent) {} + + PrimExpr VisitExpr_(const CallNode *op) { + if (op->op.same_as(create_list_of_mbarrier())) { + std::vector tmp_(op->args.size(), false); + Array new_args; + for (auto &id : restore_barrier_ids_) { + tmp_[id] = true; + } + + for (size_t i{0}; i < op->args.size(); ++i) { + if (tmp_[i]) { + new_args.push_back(producer_thread_extent_); + } else { + new_args.push_back(op->args[i]); + } + } + return Call(op->dtype, op->op, new_args); + } else { + return StmtExprMutator::VisitExpr_(op); + } + } + std::vector restore_barrier_ids_; + PrimExpr producer_thread_extent_; +}; + // we trust mbarrier_wait_parity to be correct class TmaBarrierRewriter : public IRMutatorWithAnalyzer { public: @@ -235,8 +363,12 @@ class TmaBarrierRewriter : public IRMutatorWithAnalyzer { has_create_list_of_mbarrier_(has_create_list_of_mbarrier) {} static PrimFunc Rewrite(PrimFunc f, arith::Analyzer *analyzer) { + auto buffer_lca = DetectBufferAccessLCA(f); + Map buffer_data_to_buffer_; + for (auto [buffer, _] : buffer_lca) + buffer_data_to_buffer_.Set(buffer->data, buffer); f = TmaExpectTxRewriter::Rewrite(f, analyzer); - TmaBarrierCollector collector; + TmaBarrierCollector collector(buffer_data_to_buffer_); collector(f->body); bool has_create_list_of_mbarrier = false; PostOrderVisit(f->body, [&](const ObjectRef &node) { @@ -252,6 +384,9 @@ class TmaBarrierRewriter : public IRMutatorWithAnalyzer { collector.barrier_id_to_range(), has_create_list_of_mbarrier); f.CopyOnWrite()->body = rewriter(f->body); + auto barrier_creation_rewriter = BarrierCreationRewriter( + rewriter.restore_barrier_ids_, rewriter.producer_thread_extent_); 
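To make the barrier restoration above concrete: BarrierCreationRewriter walks the arguments of create_list_of_mbarrier and, for every barrier id collected in restore_barrier_ids_, replaces the original arrive count with the producer thread extent. A small standalone sketch of that substitution over a plain vector (all values hypothetical; the real pass rewrites the arguments of a TIR Call):

#include <cstdio>
#include <vector>

int main() {
  // Per-barrier arrive counts, as passed to create_list_of_mbarrier.
  std::vector<int> arrive_counts = {128, 128, 128, 128};
  // Barriers whose expect_tx could not be folded away and must be restored.
  std::vector<int> restore_barrier_ids = {1, 3};
  int producer_thread_extent = 32;  // hypothetical producer thread count

  for (int id : restore_barrier_ids)
    arrive_counts[id] = producer_thread_extent;  // same substitution as the rewriter

  for (size_t i = 0; i < arrive_counts.size(); ++i)
    std::printf("mbarrier %zu arrive count = %d\n", i, arrive_counts[i]);
  return 0;
}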
+ f.CopyOnWrite()->body = barrier_creation_rewriter(f->body); return f; } @@ -265,6 +400,42 @@ class TmaBarrierRewriter : public IRMutatorWithAnalyzer { return IRMutatorWithAnalyzer::VisitStmt_(op); } + Stmt VisitStmt_(const IfThenElseNode *op) { + if (first_if) { + if (op->condition.as()) { + producer_thread_extent_ = + thread_var_->dom->extent - op->condition.as()->b; + } + TmaSequenceCollector collector(tma_op_to_barrier_id_); + collector(op->then_case); + clear_expect_list_ = collector.GetSequence(); + restore_barrier_ids_ = collector.GetRestoreBarrierIds(); + first_if = false; + + is_producer_ = true; + + auto then_case = StmtExprMutator::VisitStmt(op->then_case); + + is_producer_ = false; + Stmt else_case; + if (op->else_case.defined()) + else_case = StmtExprMutator::VisitStmt(op->else_case.value()); + return IfThenElse(op->condition, then_case, else_case); + } + return StmtExprMutator::VisitStmt_(op); + } + + Stmt VisitStmt_(const AttrStmtNode *op) final { + if (op->attr_key == "kWarpSpecializationScope") { + has_warp_specialization_ = true; + first_if = true; + } else if (op->attr_key == tir::attr::thread_extent && + Downcast(op->node)->thread_tag == "threadIdx.x") { + thread_var_ = Downcast(op->node); + } + return IRMutatorWithAnalyzer::VisitStmt_(op); + } + PrimExpr VisitExpr_(const CallNode *op) { if (op->op.same_as(tma_load())) { // check this must be in the tma_op_to_barrier_id_ @@ -280,6 +451,22 @@ class TmaBarrierRewriter : public IRMutatorWithAnalyzer { auto barrier_id = tma_op_to_barrier_id_[GetRef(op)]; auto new_args = op->args; new_args.Set(0, barrier_id); + if (!has_warp_specialization_) + clear_arrive_ = false; + else + clear_arrive_ = clear_expect_list_[cur_expect_idx_++]; + if (clear_arrive_) { + return Call(op->dtype, builtin::ptx_arrive_barrier_expect_tx(), + new_args); + } + return Call(op->dtype, op->op, new_args); + } else if (op->op.same_as(builtin::ptx_arrive_barrier())) { + if (clear_arrive_) { + clear_arrive_ = false; + return 0; + } + // by default, all threads must wait. 
+ auto new_args = op->args; return Call(op->dtype, op->op, new_args); } return IRMutatorWithAnalyzer::VisitExpr_(op); @@ -287,6 +474,13 @@ class TmaBarrierRewriter : public IRMutatorWithAnalyzer { Map tma_op_to_barrier_id_; Map barrier_id_to_range_; bool has_create_list_of_mbarrier_; + bool clear_arrive_{false}; + bool first_if{false}, has_warp_specialization_{false}, is_producer_{false}; + IterVar thread_var_; + int tma_expect_tx_{0}, cur_expect_idx_{0}; + std::vector clear_expect_list_; + std::vector restore_barrier_ids_; + PrimExpr producer_thread_extent_; }; tvm::transform::Pass InjectTmaBarrier() { @@ -306,8 +500,10 @@ tvm::transform::Pass InjectTmaBarrier() { return CreatePrimFuncPass(pass_func, 0, "tl.InjectTmaBarrier", {}); } -TVM_REGISTER_GLOBAL("tl.transform.InjectTmaBarrier") - .set_body_typed(InjectTmaBarrier); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.InjectTmaBarrier", InjectTmaBarrier); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/layout_inference.cc b/src/transform/layout_inference.cc index 8c08eb888..fdbe6b861 100644 --- a/src/transform/layout_inference.cc +++ b/src/transform/layout_inference.cc @@ -3,6 +3,7 @@ * \brief infer the fragment/shared memory layout */ +#include #include #include #include @@ -12,11 +13,13 @@ #include +#include "../layout/utils.h" #include "../op/parallel.h" #include "arith/ir_mutator_with_analyzer.h" #include "arith/ir_visitor_with_analyzer.h" #include "common/loop_fusion_utils.h" #include "common/loop_parallel_transform_utils.h" +#include "common/union_find.h" #include "loop_partition.h" #include "loop_vectorize.h" #include "runtime/thread_storage_scope.h" @@ -59,6 +62,131 @@ class BufferUseDefCollector : public IRVisitorWithAnalyzer { BufferUseDefCollector(bool skip_thread_partition) : skip_thread_partition_(skip_thread_partition) {} + void RunInferStep(int cur_infer_id, InferLevel level, bool update_queue, + LayoutMap &layout_map, const LayoutMap &strict_layout_map, + std::queue &q, std::vector &in_queue) { + auto num_infer = infer_list_.size(); + + // Range check for cur_infer_id + ICHECK_GE(cur_infer_id, 0) << "cur_infer_id is negative, which is invalid."; + ICHECK_LT(cur_infer_id, num_infer) + << "cur_infer_id " << cur_infer_id << " is out of range, must be < " + << num_infer << "."; + + // Make sure we can safely access infer_list_[cur_infer_id] and + // thread_var_vec_[cur_infer_id] + auto &next = infer_list_[cur_infer_id]; + auto iter_var = thread_var_vec_[cur_infer_id]; + auto thread_bounds = thread_bounds_vec_[cur_infer_id]; + // Double-check that 'next' is valid + ICHECK(next != nullptr) + << "infer_list_[" << cur_infer_id << "] is null inside run_infer_step."; + + // Check iter_var->dom and dom->extent + ICHECK(iter_var.defined()) + << "thread_var_vec_[" << cur_infer_id << "] is not defined."; + ICHECK(iter_var->dom.defined()) + << "iter_var->dom is not defined for infer_list_[" << cur_infer_id + << "]."; + ICHECK(iter_var->dom->extent.defined()) + << "iter_var->dom->extent is not defined for infer_list_[" + << cur_infer_id << "]."; + + const int64_t *extent_ptr = as_const_int(iter_var->dom->extent); + ICHECK(extent_ptr != nullptr) + << "iter_var->dom->extent is not a constant integer, which is " + "required for layout inference."; + + // Run InferLayout + auto updates = next->InferLayout( + LayoutInferArgs{target_, thread_bounds, layout_map}, level); + // Process the returned updates + for (const auto &[buffer, layout] : updates) { + // Basic 
validity checks + ICHECK(buffer.defined()) << "InferLayout returned an undefined buffer."; + ICHECK(layout.defined()) << "InferLayout returned an undefined layout."; + + if (layout_map.count(buffer)) { + // If new layout contains the old one, update map + if (buffer.scope() == "local.fragment" && + level != InferLevel::kStrict && !strict_layout_map.count(buffer)) { + // Actually this test has been done in ParallelOp::InferLayout + // already. Just do it again to avoid missing implementations in other + // `Operator`s. + auto dst_layout = layout.as().value(); + auto src_layout = layout_map[buffer].as().value(); + ICHECK(dst_layout->InputDim() == src_layout->InputDim()); + Array indices; + indices.reserve(dst_layout->InputDim()); + arith::Analyzer inner_analyzer; + for (int i = 0; i < dst_layout->InputDim(); ++i) { + auto x = InputPlaceholder(i); + indices.push_back(x); + // should be literal - literal = 0, any analyzer will work + ICHECK(is_zero(inner_analyzer.Simplify( + dst_layout->InputShape()[i] - src_layout->InputShape()[i]))); + inner_analyzer.Bind(x, Range(0, dst_layout->InputShape()[i])); + } + if (ProveFragmentContains(src_layout, dst_layout, indices, indices, + inner_analyzer)) { + layout_map.Set(buffer, layout); + continue; + } + } + // If already in map, ensure they are structurally equal + ICHECK(StructuralEqual()(layout, layout_map[buffer])) + << "Get different layout for " << buffer + << "\n current layout: " << layout->DebugOutput() + << "\n previous layout: " << layout_map[buffer]->DebugOutput(); + } else { + // Otherwise, update map + layout_map.Set(buffer, layout); + if (!update_queue) + continue; + + // Check if buffer exists in use_list_ + if (!use_list_.count(buffer)) { + LOG(WARNING) << "Layout inference failed for buffer " << buffer + << ". 
" + << "The buffer cannot be inferred with current layout " + "inference rules."; + continue; + } + + // Push back into BFS queue + for (int idx : use_list_[buffer]) { + ICHECK_GE(idx, 0) + << "Index in use_list_ for buffer " << buffer << " is negative."; + ICHECK_LT(idx, num_infer) + << "Index in use_list_ for buffer " << buffer + << " out of range: " << idx << " >= " << num_infer << "."; + + if (!in_queue[idx] && idx != cur_infer_id) { + in_queue[idx] = true; + q.push(idx); + } + } + } + } + }; + + void FinishInferQueue(InferLevel level, LayoutMap &layout_map, + const LayoutMap &strict_layout_map, std::queue &q, + std::vector &in_queue) { + auto num_infer = infer_list_.size(); + while (!q.empty()) { + int cur_infer_id = q.front(); + q.pop(); + // Range check again, just to be safe + ICHECK_GE(cur_infer_id, 0); + ICHECK_LT(cur_infer_id, num_infer); + + in_queue[cur_infer_id] = false; + RunInferStep(cur_infer_id, level, true, layout_map, strict_layout_map, q, + in_queue); + } + }; + LayoutInferenceResult Run() { // Basic consistency check: infer_list_ and thread_var_vec_ should have the // same size @@ -93,125 +221,10 @@ class BufferUseDefCollector : public IRVisitorWithAnalyzer { q.push(i); } - auto run_infer_step = [&](int cur_infer_id, InferLevel level, - bool update_queue) { - // Range check for cur_infer_id - ICHECK_GE(cur_infer_id, 0) - << "cur_infer_id is negative, which is invalid."; - ICHECK_LT(cur_infer_id, num_infer) - << "cur_infer_id " << cur_infer_id << " is out of range, must be < " - << num_infer << "."; - - // Make sure we can safely access infer_list_[cur_infer_id] and - // thread_var_vec_[cur_infer_id] - auto &next = infer_list_[cur_infer_id]; - auto iter_var = thread_var_vec_[cur_infer_id]; - auto thread_bounds = thread_bounds_vec_[cur_infer_id]; - // Double-check that 'next' is valid - ICHECK(next != nullptr) << "infer_list_[" << cur_infer_id - << "] is null inside run_infer_step."; - - // Check iter_var->dom and dom->extent - ICHECK(iter_var.defined()) - << "thread_var_vec_[" << cur_infer_id << "] is not defined."; - ICHECK(iter_var->dom.defined()) - << "iter_var->dom is not defined for infer_list_[" << cur_infer_id - << "]."; - ICHECK(iter_var->dom->extent.defined()) - << "iter_var->dom->extent is not defined for infer_list_[" - << cur_infer_id << "]."; - - const int64_t *extent_ptr = as_const_int(iter_var->dom->extent); - ICHECK(extent_ptr != nullptr) - << "iter_var->dom->extent is not a constant integer, which is " - "required for layout inference."; - - // Run InferLayout - auto updates = next->InferLayout( - LayoutInferArgs{target_, thread_bounds, layout_map}, level); - // Process the returned updates - for (const auto &[buffer, layout] : updates) { - // Basic validity checks - ICHECK(buffer.defined()) << "InferLayout returned an undefined buffer."; - ICHECK(layout.defined()) << "InferLayout returned an undefined layout."; - - if (layout_map.count(buffer)) { - // If replicate size of this buffer is greater than the old one - if (buffer.scope() == "local.fragment" && - level != InferLevel::kStrict && - !strict_layout_map.count(buffer)) { - const FragmentNode *dst_layout = layout.as().get(); - const FragmentNode *src_layout = - layout_map[buffer].as().get(); - if (as_const_int(dst_layout->ReplicateExtent()) && - as_const_int(src_layout->ReplicateExtent()) && - (*as_const_int(dst_layout->ReplicateExtent()) > - *as_const_int(src_layout->ReplicateExtent()))) { - // update map - layout_map.Set(buffer, layout); - continue; - } - } - // If already in map, ensure they are 
structurally equal - // (zhengju) We can not modify the strict layout map when current - // level is not strict. This check should be done in certain - // conditions, since the strict layout map is not updated in the - // above code when current level is not strict - if (level == InferLevel::kStrict || - !strict_layout_map.count(buffer)) { - ICHECK(StructuralEqual()(layout, layout_map[buffer])) - << "Get different layout for " << buffer - << "\n current layout: " << layout->DebugOutput() - << "\n previous layout: " << layout_map[buffer]->DebugOutput(); - } - } else { - // Otherwise, update map - layout_map.Set(buffer, layout); - if (!update_queue) - continue; - - // Check if buffer exists in use_list_ - if (!use_list_.count(buffer)) { - LOG(WARNING) << "Layout inference failed for buffer " << buffer - << ". " - << "The buffer cannot be inferred with current layout " - "inference rules."; - continue; - } - - // Push back into BFS queue - for (int idx : use_list_[buffer]) { - ICHECK_GE(idx, 0) << "Index in use_list_ for buffer " << buffer - << " is negative."; - ICHECK_LT(idx, num_infer) - << "Index in use_list_ for buffer " << buffer - << " out of range: " << idx << " >= " << num_infer << "."; - - if (!in_queue[idx] && idx != cur_infer_id) { - in_queue[idx] = true; - q.push(idx); - } - } - } - } - }; - - auto finish_infer_queue = [&]() { - while (!q.empty()) { - int cur_infer_id = q.front(); - q.pop(); - // Range check again, just to be safe - ICHECK_GE(cur_infer_id, 0); - ICHECK_LT(cur_infer_id, num_infer); - - in_queue[cur_infer_id] = false; - run_infer_step(cur_infer_id, InferLevel::kCommon, true); - } - }; - // step 1: infer strict layout for (int i = 0; i < num_infer; i++) { - run_infer_step(i, InferLevel::kStrict, false); + RunInferStep(i, InferLevel::kStrict, false, layout_map, strict_layout_map, + q, in_queue); } for (const auto &[buffer, layout] : layout_map) { @@ -219,13 +232,12 @@ class BufferUseDefCollector : public IRVisitorWithAnalyzer { } // step 2: infer common layout with BFS - finish_infer_queue(); + FinishInferQueue(InferLevel::kCommon, layout_map, strict_layout_map, q, + in_queue); // step 3: relax constraints to free and re-run - for (int i = 0; i < num_infer; i++) { - run_infer_step(i, InferLevel::kFree, true); - finish_infer_queue(); - } + InferInFreeMode(layout_map, strict_layout_map); + // Check that all local.fragment buffers have inferred layouts for (const auto &[buffer, _] : use_list_) { if (buffer.scope() == "local.fragment") { @@ -291,6 +303,7 @@ class BufferUseDefCollector : public IRVisitorWithAnalyzer { addToUseList(buffer.value()); } } + infer_list_stmt_.push_back(GetRef(op)); infer_list_.push_back(std::move(p)); thread_var_vec_.push_back(thread_var_); if (analyzer_.const_int_bound.IsBound(thread_var_->var)) { @@ -309,11 +322,16 @@ class BufferUseDefCollector : public IRVisitorWithAnalyzer { Optional getBufferFromAccessPtr(const PrimExpr &expr) { auto call = expr.as(); - if (call && call->op.same_as(builtin::tvm_access_ptr())) { + if (!call) { + return std::nullopt; + } + if (call->op.same_as(builtin::tvm_access_ptr())) { auto var = call->args[1].as().value(); return buffer_data_to_buffer_[var]; + } else if (call->op.same_as(RegionOp::Get())) { + return call->args[0].as()->buffer; } - return NullOpt; + return std::nullopt; } void addToUseList(const Buffer &buffer) { @@ -330,6 +348,7 @@ class BufferUseDefCollector : public IRVisitorWithAnalyzer { for (const auto &[buffer, _] : infer->GetIndiceMap()) { addToUseList(buffer); } + 
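// Keep the originating statement registered alongside its inference
// operator: use_list_ refers to entries by index, so infer_list_stmt_,
// infer_list_, thread_var_vec_ and thread_bounds_vec_ must stay
// index-aligned.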
infer_list_stmt_.push_back(GetRef(op)); infer_list_.push_back(std::move(infer)); thread_var_vec_.push_back(thread_var_); if (thread_var_.defined() && @@ -354,11 +373,9 @@ class BufferUseDefCollector : public IRVisitorWithAnalyzer { } if (op->annotations.count(attr::kLayoutMap)) { // Check if the layout map is Map - auto map = op->annotations.Get(attr::kLayoutMap).as>(); - ICHECK(map.defined()) << "layout map is not defined"; - ICHECK(map.value().defined()) << "layout map is not defined"; - - for (const auto &[var, layout] : map.value()) { + auto map = + op->annotations.Get(attr::kLayoutMap)->as>().value(); + for (const auto &[var, layout] : map) { ICHECK(buffer_data_to_buffer_.count(var)) << "buffer " << var << " is not found in the block"; auto buffer = buffer_data_to_buffer_[var]; @@ -381,6 +398,7 @@ class BufferUseDefCollector : public IRVisitorWithAnalyzer { } Map buffer_data_to_buffer_; + std::vector infer_list_stmt_; std::vector> infer_list_; std::unordered_map, ObjectPtrHash, ObjectPtrEqual> use_list_; @@ -393,6 +411,122 @@ class BufferUseDefCollector : public IRVisitorWithAnalyzer { Target target_; LayoutMap annotated_layout_map_; bool skip_thread_partition_{false}; + + std::vector> BackupInferList() { + std::vector> back_infer_list; + back_infer_list.reserve(infer_list_.size()); + for (auto &&p : infer_list_) { + back_infer_list.push_back(p->Clone()); + } + return back_infer_list; + } + + void InferInFreeMode(LayoutMap &layout_map, + const LayoutMap &strict_layout_map) { + // Group operators into connected components + UnionFind uf; + for (int i = 0; i < infer_list_.size(); i++) { + uf.MakeSet(i); + } + for (const auto &[buffer, infer_indices] : use_list_) { + if (infer_indices.empty()) + continue; + + // Union all infer_list_ indices that share the same buffer + int first_idx = infer_indices[0]; + for (size_t i = 1; i < infer_indices.size(); i++) { + uf.Union(first_idx, infer_indices[i]); + } + } + std::unordered_map> components; + for (int i = 0; i < infer_list_.size(); i++) { + int root = uf.Find(i); + components[root].push_back(i); + } + std::unordered_map> components_buffers; + for (const auto &[buffer, infer_indices] : use_list_) { + int root = uf.Find(infer_indices[0]); + components_buffers[root].push_back(buffer); + } + + // For each component, try each op as root, and determine the least + // replicated one + std::queue q; + std::vector in_queue(infer_list_.size(), false); + for (auto &&[root, members] : components) { + decltype(infer_list_) best_infer_list; + LayoutMap best_layout_map; + int64_t min_reg_num = INT64_MAX; + for (int attempt_infer_root : members) { + // backup infer_list_ in class member + auto back_infer_list = BackupInferList(); + // create temporarily used layout_map, new handle so that it copies on + // write + LayoutMap tmp_layout_map = layout_map; + // infer from attempt_infer_root in free mode + bool do_update = true; + try { + RunInferStep(attempt_infer_root, InferLevel::kFree, true, + tmp_layout_map, strict_layout_map, q, in_queue); + FinishInferQueue(InferLevel::kFree, tmp_layout_map, strict_layout_map, + q, in_queue); + + // Silly workaround: we have no clue if single root will iterate over + // the entire component, since the InferLayout implementations have + // complicated conditioning inside and we know nothing about it. + // This would constantly result in incomplete layouts for buffers in + // this component. 
Instead of trying all combinations of root + // selection order, we simply go through all other loops in order + // after the first search from attempt_infer_root. + for (int other_infer_root : members) { + if (other_infer_root != attempt_infer_root) { + RunInferStep(other_infer_root, InferLevel::kFree, true, + tmp_layout_map, strict_layout_map, q, in_queue); + // must also be kFree here to avoid conflicts. + FinishInferQueue(InferLevel::kFree, tmp_layout_map, + strict_layout_map, q, in_queue); + } + } + } catch (LayoutConflictException e) { + // such an order fails, try others + do_update = false; + } catch (NormalizeIterException e) { + // such an order encounters iterators that is not normalizable, try + // others e.g. i * 576 % 2048 + do_update = false; + } + + if (do_update) { + // compute total register number + int64_t reg_num = 0; + for (auto &&[buffer, layout] : tmp_layout_map) { + if (auto frag = layout.as()) { + int64_t frag_reg_num = 1; + for (auto i : frag.value()->OutputShape()) { + auto pci = as_const_int(i); + ICHECK(pci != nullptr); + frag_reg_num *= *pci; + } + reg_num += frag_reg_num; + } + } + // if it's any better, update the best_* storage + if (reg_num < min_reg_num) { + best_infer_list = std::move(infer_list_); + best_layout_map = tmp_layout_map; + min_reg_num = reg_num; + } + } + // recover stateful infer_list_, head on next + infer_list_ = std::move(back_infer_list); + } + if (min_reg_num < INT64_MAX) { + // now apply the best plan for this component + infer_list_ = std::move(best_infer_list); + layout_map = best_layout_map; + } + } + } }; class LayoutInferencer : public IRMutatorWithAnalyzer { @@ -519,8 +653,10 @@ tvm::transform::Pass LayoutInference() { return CreatePrimFuncPass(pass_func, 0, "tl.LayoutInference", {}); } -TVM_REGISTER_GLOBAL("tl.transform.LayoutInference") - .set_body_typed(LayoutInference); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.LayoutInference", LayoutInference); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/legalize_safe_memory_access.cc b/src/transform/legalize_safe_memory_access.cc index ee82f8812..a61fb2674 100644 --- a/src/transform/legalize_safe_memory_access.cc +++ b/src/transform/legalize_safe_memory_access.cc @@ -3,6 +3,7 @@ * \brief legalize safe memory access */ +#include #include #include #include @@ -313,7 +314,7 @@ class SafeMemoryLegalizer : IRMutatorWithAnalyzer { } if (op->annotations.count(attr::kPaddingMap)) { auto map = op->annotations.Get(attr::kPaddingMap) - .as>() + ->as>() .value(); for (const auto &[var, padding] : map) { ICHECK(buffer_data_to_buffer_.count(var)) @@ -353,8 +354,11 @@ tvm::transform::Pass LegalizeSafeMemoryAccess() { } // Register the pass globally so it can be used in the compilation pipeline -TVM_REGISTER_GLOBAL("tl.transform.LegalizeSafeMemoryAccess") - .set_body_typed(LegalizeSafeMemoryAccess); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.LegalizeSafeMemoryAccess", + LegalizeSafeMemoryAccess); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/legalize_vectorized_loop.cc b/src/transform/legalize_vectorized_loop.cc index 941b12a1d..f65ad400c 100644 --- a/src/transform/legalize_vectorized_loop.cc +++ b/src/transform/legalize_vectorized_loop.cc @@ -22,6 +22,7 @@ * \brief infer the fragment/shared memory layout */ +#include #include #include #include @@ -88,8 +89,11 @@ tvm::transform::Pass LegalizeVectorizedLoop() { } // Register the 
pass globally so it can be used in the compilation pipeline -TVM_REGISTER_GLOBAL("tl.transform.LegalizeVectorizedLoop") - .set_body_typed(LegalizeVectorizedLoop); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.LegalizeVectorizedLoop", + LegalizeVectorizedLoop); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/loop_vectorize.cc b/src/transform/loop_vectorize.cc index 85563ba40..bf61498f4 100644 --- a/src/transform/loop_vectorize.cc +++ b/src/transform/loop_vectorize.cc @@ -136,11 +136,23 @@ class VectorizePlanner : public arith::IRVisitorWithAnalyzer { max_vector_size = gcd_base; } vector_size_ = arith::ZeroAwareGCD(max_vector_size, vector_size_); + + // Generate strides if not existed + auto strides = buffer->strides; + if (buffer->strides.size() == 0) { + PrimExpr stride = 1; + for (int i = indices.size() - 1; i >= 0; --i) { + strides.push_back(stride); + stride = stride * buffer->shape[i]; + } + strides = Array{strides.rbegin(), strides.rend()}; + } + + // Generate and check element offset expression + ICHECK(indices.size() == strides.size()) << "Invalid indices and strides"; PrimExpr elem_offset = 0; - PrimExpr stride = 1; - for (int i = indices.size() - 1; i >= 0; --i) { - elem_offset = elem_offset + indices[i] * stride; - stride = stride * buffer->shape[i]; + for (int i = 0; i < indices.size(); ++i) { + elem_offset += indices[i] * strides[i]; } while (!IndiceCanVectorize(elem_offset, inner_for_->loop_var, inner_for_->extent, vector_size_, @@ -229,10 +241,19 @@ bool IndiceCanVectorize(PrimExpr expr, Var var, PrimExpr iter_var_size, ICHECK(target_vectorized_size >= 1); if (target_vectorized_size == 1) return true; - // bind thread range + + // Extent must be divisible if (!analyzer->CanProveEqual(FloorMod(iter_var_size, target_vectorized_size), 0)) return false; + + // The base offset must be divisible + if (!analyzer->CanProveEqual( + FloorMod(Substitute(expr, {{var, 0}}), target_vectorized_size), 0)) { + return false; + } + + // Bind thread range Var v0("v0"), v1("v1"); analyzer->Bind(v0, Range(0, target_vectorized_size)); analyzer->Bind(v1, Range(0, analyzer->Simplify(FloorDiv( @@ -241,7 +262,8 @@ bool IndiceCanVectorize(PrimExpr expr, Var var, PrimExpr iter_var_size, Substitute(expr, {{var, v0 + v1 * target_vectorized_size}})); Vectorizer vectorizer(v0, IntImm(v0->dtype, target_vectorized_size)); PrimExpr expr_vectorized = vectorizer.VisitExpr(expr_transformed); - // This simplify is necessary for thread region specifiled + + // This simplify is necessary for thread region specified // optimizations. 
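// Illustrative check, not part of the pass: with expr = base + i and
// target_vectorized_size = 4, substituting i -> v0 + 4*v1 gives
// base + v0 + 4*v1, which vectorizes over v0 into Ramp(base + 4*v1, 1, 4)
// and satisfies the contiguity (Ramp) check that follows. With expr = 2*i
// the result is Ramp(8*v1, 2, 4), which is not a unit-stride access, and
// expr = i + 2 is already rejected by the base-offset check above because
// FloorMod(2, 4) != 0 (the 4-wide access would not be aligned).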
expr_vectorized = analyzer->Simplify(expr_vectorized); auto ramp_node = expr_vectorized.as(); diff --git a/src/transform/loop_vectorize_dynamic.cc b/src/transform/loop_vectorize_dynamic.cc index 9e8bcb5a9..b413e0db1 100644 --- a/src/transform/loop_vectorize_dynamic.cc +++ b/src/transform/loop_vectorize_dynamic.cc @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -145,9 +146,7 @@ class VectorizePlannerDynamic : public arith::IRVisitorWithAnalyzer { const DataType &access_type = buffer->dtype; // i // 2, i % 8 can also be vectorized as factor 16 int max_vector_size = vector_load_bits_max_ / access_type.bits(); - if (access_type.is_e4m3_float8() or access_type.is_e5m2_float8()) { - max_vector_size = 1; // [temporarily] do not vectorize float8 - } + // so we should disable this GCD optimization max_vector_size = arith::ZeroAwareGCD(max_vector_size, extent_ptr->value); @@ -532,8 +531,11 @@ tvm::transform::Pass LoopVectorizeDynamic() { } // Register the pass globally so it can be used in the compilation pipeline -TVM_REGISTER_GLOBAL("tl.transform.LoopVectorizeDynamic") - .set_body_typed(LoopVectorizeDynamic); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.LoopVectorizeDynamic", + LoopVectorizeDynamic); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/lower_device_kernel_launch.cc b/src/transform/lower_device_kernel_launch.cc new file mode 100644 index 000000000..7eb777cfe --- /dev/null +++ b/src/transform/lower_device_kernel_launch.cc @@ -0,0 +1,418 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file lower_device_kernel_launch.cc + * \brief Split device function from host. + */ +#include +#include +#include +#include +#include +#include +#include + +#include "runtime/thread_storage_scope.h" +#include "tir/transforms/ir_utils.h" + +namespace tvm { +namespace tl { + +using namespace tir; + +namespace { +struct KernelInfo { + // The device on which the PrimFunc runs + Target target; + + // The externally visible symbol which may refer to the PrimFunc + // when launching a device kernel. + String global_symbol; + + // The parameters accepted by the PrimFunc. Used to rewrite + // `launch_args` to be in terms of the calling scope. + Array params; + + // The launch parameters that should annotate the PrimFunc, if the + // kernel is ever called from the host. + Array launch_params; + + // Additional arguments which must be provided to the host-side + // PackedFunc. These may be in terms of the function's parameters + // (e.g. a function that computes the average of `N` elements, and + // which must be launched with `N` CUDA threads). 
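// Illustrative example (assumed values, not taken from this patch): if
// launch_params ends up as ["blockIdx.x", "threadIdx.x",
// kUseDynamicSharedMemoryTag], then launch_args holds the matching
// PrimExprs, i.e. the two collected thread extents followed by the dynamic
// shared memory size in bytes; the call-site rewrite later substitutes the
// callee's params with the caller's arguments so these expressions are
// valid where the packed call is emitted.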
+ Array launch_args; + + // The extent of each thread + Map thread_extent; + // The amount of dynamic shared memory used + Optional dyn_shmem_size{std::nullopt}; +}; + +/*! + * \brief Visitor class to collect device-side program information. + */ +class DeviceInfoCollector : public StmtVisitor { +public: + static KernelInfo Collect(const GlobalVar &gvar, const PrimFunc &func) { + DeviceInfoCollector collector; + collector.info_.target = + func->GetAttr(tvm::attr::kTarget).value().WithoutHost(); + collector.info_.params = func->params; + + collector(func->body); + + // The dynamic shared memory is required to be the last of the + // kernel launch parameters + if (collector.dyn_shmem_size) { + collector.info_.launch_params.push_back( + tvm::runtime::launch_param::kUseDynamicSharedMemoryTag); + } + + collector.info_.global_symbol = + func->GetAttr(tvm::attr::kGlobalSymbol) + .value_or(gvar->name_hint); + + collector.info_.launch_args = collector.info_.launch_params.Map( + [&](const auto ¶m) { return collector.GetArgument(param); }); + collector.info_.dyn_shmem_size = collector.dyn_shmem_size; + collector.info_.thread_extent = collector.thread_extent; + return collector.info_; + } + +private: + PrimExpr GetArgument(const String &launch_param) const { + if (launch_param == + tvm::runtime::launch_param::kUseDynamicSharedMemoryTag) { + CHECK(dyn_shmem_size.defined()) + << "Compute kernel requires launch parameter \"" << launch_param + << "\", but PrimFunc did not contain Allocate node with shared " + "dynamic scope."; + return dyn_shmem_size.value(); + } + + auto extent = thread_extent.Get(launch_param); + CHECK(extent) << "Compute kernel requires launch parameter \"" + << launch_param + << "\", but PrimFunc does not contain AttrStmt \"" + << tir::attr::thread_extent + << "\" defining this thread extent"; + return extent.value(); + } + + void VisitStmt_(const AttrStmtNode *op) final { + if (op->attr_key == tir::attr::thread_extent) { + IterVar iv = Downcast(op->node); + ICHECK_NE(iv->thread_tag.length(), 0U); + // thread_extent can appear multiple times + // use the first appearance as def. + if (!defined_thread.count(iv.get())) { + defined_thread.insert(iv.get()); + info_.launch_params.push_back(iv->thread_tag); + thread_extent.Set(iv->thread_tag, op->value); + } + } + + StmtVisitor::VisitStmt_(op); + } + + void VisitStmt_(const AllocateNode *op) final { + auto storage_scope = + runtime::StorageScope::Create(GetPtrStorageScope(op->buffer_var)); + if (storage_scope.rank == runtime::StorageRank::kShared && + storage_scope.tag == ".dyn") { + ICHECK(!dyn_shmem_size.defined()) + << "Only one dynamic shared memory allocation is allowed."; + ICHECK_GT(op->extents.size(), 0); + + PrimExpr dyn_size = Integer(1); + for (const auto &extent : op->extents) { + dyn_size *= extent; + } + dyn_size *= op->dtype.bytes() * op->dtype.lanes(); + + dyn_shmem_size = dyn_size; + } + StmtVisitor::VisitStmt_(op); + } + + // The collected results + KernelInfo info_; + // recording what thread axis have been visited. 
+ std::unordered_set defined_thread; + // The extent of each thread + Map thread_extent; + // The amount of dynamic shared memory used + Optional dyn_shmem_size{std::nullopt}; +}; + +class ReturnRemover : public StmtExprMutator { +public: + static Stmt Apply(const Stmt &stmt) { + ReturnRemover mutator; + return mutator(stmt); + } + +private: + using Parent = StmtExprMutator; + Stmt VisitStmt_(const EvaluateNode *op) override { + if (auto *call = op->value.as()) { + if (call->op.same_as(builtin::ret())) { + ICHECK_EQ(call->args.size(), 1); + auto as_int = call->args[0].as(); + ICHECK(as_int && as_int->value == 0) + << "Device kernel may only contain successful return, T.ret(0)"; + return Evaluate(0); + } + } + return Parent::VisitStmt_(op); + } + + PrimExpr VisitExpr_(const CallNode *op) override { + if (op->op.same_as(builtin::ret())) { + LOG(FATAL) << "Call to builtin::ret() should only appear within an " + "Evaluate node"; + } + return Parent::VisitExpr_(op); + } +}; +} // namespace + +class DeviceKernelMutator : public StmtExprMutator { +public: + using Parent = StmtExprMutator; + + explicit DeviceKernelMutator( + std::unordered_map device_info_map) + : device_info_map_(std::move(device_info_map)) {} + + PrimFunc RewriteKernelLaunchSite(const GlobalVar &gvar, PrimFunc func) { + ICHECK(!current_target_.defined()); + auto it = device_info_map_.find(gvar.get()); + ICHECK(it != device_info_map_.end()); + current_target_ = it->second.target; + + auto body = VisitStmt(func->body); + if (!body.same_as(func->body)) { + func.CopyOnWrite()->body = body; + } + + current_target_ = std::nullopt; + return func; + } + + PrimFunc UpdateKernelAttributes(const GlobalVar &gvar, PrimFunc func) const { + bool is_kernel_launch = device_kernel_launch_.count(gvar.get()); + bool is_call_extern = extern_function_call_.count(gvar.get()); + CHECK(!is_kernel_launch || !is_call_extern) + << "Function " << gvar << " has multiple callees, " + << "and would need to be lowered into a call_extern at some call " + "sites, " + << "and a device kernel launch at others. " + << "This case is not yet supported."; + + if (is_kernel_launch || is_call_extern) { + func = + WithAttr(std::move(func), tvm::tir::attr::kIsGlobalFunc, Bool(true)); + } + + if (is_kernel_launch) { + const auto &info = device_info_map_.at(gvar.get()); + + // Kernel launches provide an int32 error code to the caller, + // but do not accept any return type from the callee. + { + auto write_ptr = func.CopyOnWrite(); + write_ptr->ret_type = VoidType(); + write_ptr->body = ReturnRemover::Apply(write_ptr->body); + } + + func = + WithAttrs(std::move(func), + {{tvm::attr::kCallingConv, + Integer(tvm::CallingConv::kDeviceKernelLaunch)}, + {tvm::tir::attr::kKernelLaunchParams, info.launch_params}, + {tvm::attr::kGlobalSymbol, info.global_symbol}}); + } + // @lei: workaround as we may require c host codegen, so we need to set the + // global symbol for cpu backend. 
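// Descriptive note: a launched kernel ends up carrying
// kCallingConv = kDeviceKernelLaunch, kKernelLaunchParams and
// kGlobalSymbol from the block above, plus the "thread_extent" and, when
// present, "dyn_shared_memory_buf" attributes attached below.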
+ func = WithAttr(func, tvm::attr::kGlobalSymbol, gvar->name_hint); + + const auto &info = device_info_map_.at(gvar.get()); + const auto &thread_extent = info.thread_extent; + func = WithAttr(std::move(func), "thread_extent", thread_extent); + if (info.dyn_shmem_size.defined()) { + func = WithAttr(std::move(func), "dyn_shared_memory_buf", + info.dyn_shmem_size.value()); + } + return func; + } + +private: + PrimExpr VisitExpr_(const CallNode *op) override { + auto node = Downcast(Parent::VisitExpr_(op)); + + auto *gvar = op->op.as(); + if (!gvar) + return std::move(node); + + auto it = device_info_map_.find(gvar); + ICHECK(it != device_info_map_.end()) + << "CallNode attempted subroutine call to " << gvar->name_hint + << ", but " << gvar->name_hint << " did not appear within the IRModule"; + const KernelInfo &dev_info = it->second; + + auto caller_target = current_target_.value(); + auto callee_target = dev_info.target; + + bool same_target = caller_target->str() == callee_target->str(); + + if (same_target) { + // Calls within the same target may be handled at codegen time + // as internal subroutine calls. + return std::move(node); + } + + bool same_device_type = caller_target->GetTargetDeviceType() == + callee_target->GetTargetDeviceType(); + if (same_device_type) { + // Calls to another target using the same device (e.g. LLVM + // calling a custom TIRToRuntime target) do not require a kernel + // launch, but need to be replaced with call_extern. + extern_function_call_.insert(gvar); + Array args; + args.push_back(StringImm(gvar->name_hint)); + for (const auto &arg : node->args) { + args.push_back(arg); + } + return Call(node->dtype, builtin::call_extern(), args); + } + + ICHECK(dev_info.launch_params.defined()) + << "CallNode attempted kernel launch to " << gvar->name_hint + << " on target " << dev_info.target << ", but subroutine " + << gvar->name_hint + << " did not have the tir::attr::kKernelLaunchParams attribute " + << "required for cross-target kernel launch"; + + // Collected kernel information may be in terms of the callee's + // arguments, but we need expressions for them in terms of the + // caller's parameters. The param_map allows substitution of + // parameter values into the thread extents, to generate + // expressions that are valid within the caller. + Map param_map = [&]() { + Map param_map; + CHECK_EQ(node->args.size(), dev_info.params.size()) + << "Function " << gvar->name_hint << " accepts " + << dev_info.params.size() + << " arguments as input, but is called using " << node->args.size() + << " arguments"; + for (size_t i = 0; i < node->args.size(); i++) { + param_map.Set(dev_info.params[i], node->args[i]); + } + return param_map; + }(); + + device_kernel_launch_.insert(gvar); + + Array call_args; + call_args.push_back(StringImm(dev_info.global_symbol)); + for (PrimExpr arg : node->args) { + call_args.push_back(arg); + } + for (const auto &launch_arg : dev_info.launch_args) { + call_args.push_back(Substitute(launch_arg, param_map)); + } + + auto dtype = node->dtype.is_void() ? 
DataType::Int(32) : node->dtype; + + return Call(dtype, builtin::tvm_call_packed(), call_args); + } + + Optional current_target_; + std::unordered_map device_info_map_; + std::unordered_set device_kernel_launch_; + std::unordered_set extern_function_call_; +}; + +namespace transform { + +tvm::transform::Pass LowerDeviceKernelLaunch() { + auto pass_func = [](IRModule mod, + tir::transform::PassContext ctx) -> IRModule { + auto mutator = [&mod]() { + std::unordered_map device_info_map; + for (const auto &[gvar, base_func] : mod->functions) { + if (auto prim_func = base_func.as()) { + device_info_map[gvar.get()] = + DeviceInfoCollector::Collect(gvar, prim_func.value()); + } + } + return DeviceKernelMutator(std::move(device_info_map)); + }(); + + { + IRModule updates; + for (const auto &[gvar, base_func] : mod->functions) { + if (auto *ptr = base_func.as()) { + auto prim_func = + mutator.RewriteKernelLaunchSite(gvar, GetRef(ptr)); + if (!prim_func.same_as(base_func)) { + updates->Add(gvar, prim_func); + } + } + } + + if (updates->functions.size()) { + mod.CopyOnWrite()->Update(updates); + } + } + { + IRModule updates; + for (const auto &[gvar, base_func] : mod->functions) { + if (auto *ptr = base_func.as()) { + auto prim_func = + mutator.UpdateKernelAttributes(gvar, GetRef(ptr)); + if (!prim_func.same_as(base_func)) { + updates->Add(gvar, prim_func); + } + } + } + + if (updates->functions.size()) { + mod.CopyOnWrite()->Update(updates); + } + } + return mod; + }; + + return tvm::transform::CreateModulePass(pass_func, 0, + "tl.LowerDeviceKernelLaunch", {}); +} + +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.LowerDeviceKernelLaunch", + LowerDeviceKernelLaunch); +}); + +} // namespace transform +} // namespace tl +} // namespace tvm diff --git a/src/transform/lower_device_storage_access_info.cc b/src/transform/lower_device_storage_access_info.cc index c9f042d9e..9bd026b55 100644 --- a/src/transform/lower_device_storage_access_info.cc +++ b/src/transform/lower_device_storage_access_info.cc @@ -22,7 +22,8 @@ * \brief Lower the special device storage access. 
*/ #include -#include +#include +#include #include #include #include @@ -141,8 +142,11 @@ Pass LowerDeviceStorageAccessInfo() { {}); } -TVM_REGISTER_GLOBAL("tl.transform.LowerDeviceStorageAccessInfo") - .set_body_typed(LowerDeviceStorageAccessInfo); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.LowerDeviceStorageAccessInfo", + LowerDeviceStorageAccessInfo); +}); } // namespace transform } // namespace tl diff --git a/src/transform/lower_hopper_intrin.cc b/src/transform/lower_hopper_intrin.cc index 44dd3fae7..3a459e17c 100644 --- a/src/transform/lower_hopper_intrin.cc +++ b/src/transform/lower_hopper_intrin.cc @@ -3,8 +3,10 @@ * \brief Lower Hopper intrinsics cuda GPU(sm90+) */ +#include #include #include +#include #include #include @@ -20,9 +22,9 @@ using namespace tir; #if (CUDA_MAJOR_VERSION >= 12) class LowerHopperIntrin : public StmtExprMutator { public: - static PrimFunc Substitute(PrimFunc &f) { + static PrimFunc Substitute(PrimFunc &f, bool disable_shuffle_elect) { PrimFuncNode *fptr = f.CopyOnWrite(); - LowerHopperIntrin substituter; + LowerHopperIntrin substituter(disable_shuffle_elect); fptr->body = substituter.VisitStmt(f->body); Map> init_desc_arg_map; for (auto [call, var] : substituter.desc_map_) { @@ -72,10 +74,15 @@ class LowerHopperIntrin : public StmtExprMutator { auto stmts = prefetch_calls_; stmts.insert(stmts.end(), init_mbarrier_calls_.begin(), init_mbarrier_calls_.end()); - auto init_stmt = - IfThenElse(EQ(iv->var, IntImm(iv->var->dtype, 0)), - stmts.size() > 1 ? SeqStmt(stmts) : stmts[0]); - stmt_seq.push_back(init_stmt); + PrimExpr condition; + if (!disable_shuffle_elect_) { + condition = Call(DataType::Bool(), tl_shuffle_elect(), {0}); + } else { + condition = EQ(iv->var, 0); + } + auto stmt_ = IfThenElse(condition, + stmts.size() > 1 ? 
SeqStmt(stmts) : stmts[0]); + stmt_seq.push_back(stmt_); if (!init_mbarrier_calls_.empty()) { Stmt mem_sync = Evaluate(Call(DataType::Handle(), builtin::tvm_storage_sync(), @@ -120,14 +127,6 @@ class LowerHopperIntrin : public StmtExprMutator { {mbarrier, call->args[i]}))); } return 0; - } else if (call->op.same_as(sync_thread_partial())) { - int barrier_id = init_mbarrier_calls_.size(); - PrimExpr mbarrier = - Call(DataType::Handle(), get_mbarrier(), {barrier_id}); - init_mbarrier_calls_.push_back(Evaluate( - Call(DataType::Handle(), builtin::ptx_init_barrier_thread_count(), - {mbarrier, call->args[0]}))); - return Call(DataType::Handle(), sync_thread_partial(), {mbarrier}); } else { return StmtExprMutator::VisitExpr_(call); } @@ -137,20 +136,26 @@ class LowerHopperIntrin : public StmtExprMutator { Array prefetch_calls_; Array init_mbarrier_calls_; std::unordered_map desc_map_; - LowerHopperIntrin() = default; + LowerHopperIntrin(bool disable_shuffle_elect) + : disable_shuffle_elect_(disable_shuffle_elect) {} + bool disable_shuffle_elect_; }; using namespace tir::transform; tvm::transform::Pass LowerHopperIntrin() { auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { - return LowerHopperIntrin::Substitute(f); + bool disable_shuffle_elect = + ctx->GetConfig(kDisableShuffleElect, Bool(false)).value(); + return LowerHopperIntrin::Substitute(f, disable_shuffle_elect); }; return CreatePrimFuncPass(pass_func, 0, "tl.LowerHopperIntrin", {}); } -TVM_REGISTER_GLOBAL("tl.transform.LowerHopperIntrin") - .set_body_typed(LowerHopperIntrin); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.LowerHopperIntrin", LowerHopperIntrin); +}); #endif // (CUDA_MAJOR_VERSION >= 12) } // namespace tl diff --git a/src/transform/lower_l2_persistent_annotation.cc b/src/transform/lower_l2_persistent_annotation.cc index 82d945c6a..8d80dce5c 100644 --- a/src/transform/lower_l2_persistent_annotation.cc +++ b/src/transform/lower_l2_persistent_annotation.cc @@ -3,6 +3,7 @@ * \brief Lower L2 persistent annotation */ +#include #include #include #include @@ -98,8 +99,10 @@ tvm::transform::Pass LowerL2Persistent() { return CreatePrimFuncPass(pass_func, 0, "tl.LowerL2Persistent", {}); } -TVM_REGISTER_GLOBAL("tl.transform.LowerL2Persistent") - .set_body_typed(LowerL2Persistent); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.LowerL2Persistent", LowerL2Persistent); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/lower_opaque_block.cc b/src/transform/lower_opaque_block.cc new file mode 100644 index 000000000..0a048393a --- /dev/null +++ b/src/transform/lower_opaque_block.cc @@ -0,0 +1,238 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file lower_opaque_block.cc + */ + +#include +#include +#include + +#include "tir/transforms/ir_utils.h" + +namespace tvm { +namespace tl { + +using namespace tir; +using namespace tir::attr; +/*! + * \brief Remove Block to ensure that the TIR can not be scheduled again. + */ +class OpaqueBlockLower : public StmtExprMutator { +public: + static Stmt Rewrite(Stmt body) { + OpaqueBlockLower lower; + lower.storage_align_ = CollectStorageAlignAnnotation(body); + return lower(std::move(body)); + } + +private: + Stmt VisitStmt_(const BlockRealizeNode *op) final { + // We have convert blocks into opaque blocks in previous passes. + ICHECK(op->iter_values.empty()) + << "Non-opaque blocks are not allowed in FlattenBuffer. Please " + "call pass ConvertBlocksToOpaque before."; + // Step 1. Visit the body + Block new_block = Downcast(this->VisitStmt(op->block)); + PrimExpr predicate = this->VisitExpr(op->predicate); + // Step 2. Transform the `predicate` to if-then-else + Stmt body = new_block->body; + if (!is_one(predicate)) { + body = IfThenElse(predicate, std::move(body)); + } + // Step 3. Handle allocations in reverse order + for (size_t i = new_block->alloc_buffers.size(); i > 0; --i) { + const Buffer &buffer = new_block->alloc_buffers[i - 1]; + Array allocation_shape = GetBufferAllocationShape(buffer); + body = DeclBuffer(buffer, std::move(body)); + Map allocate_annotations; + auto it = storage_align_.find(buffer->data); + if (it != storage_align_.end()) { + StorageAlignAnnotation allocate_aligns; + for (auto tuple : it->second) { + tuple.Set<0>(-1); + allocate_aligns.push_back(tuple); + } + allocate_annotations.Set(tir::attr::buffer_dim_align, allocate_aligns); + } + + body = Allocate(buffer->data, buffer->dtype, allocation_shape, + const_true(), std::move(body), allocate_annotations); + } + // Step 4. Handle annotations, block annotations are not preserved by + // default. + std::vector> pragma_attrs; + HandleAnnotations(new_block->annotations, &pragma_attrs, /*is_block=*/true); + for (auto it = pragma_attrs.rbegin(); it != pragma_attrs.rend(); ++it) { + body = AttrStmt(Integer(0), it->first, it->second, std::move(body)); + } + return body; + } + Stmt VisitStmt_(const BlockNode *op) final { + Block block = Downcast(StmtExprMutator::VisitStmt_(op)); + if (block->annotations.count("stmt_group")) { + return block->body; + } + return block; + } + + Stmt VisitStmt_(const ForNode *op) final { + // Step 1. Update unit loop info. + PrimExpr min = this->VisitExpr(op->min); + PrimExpr extent = this->VisitExpr(op->extent); + if (is_one(extent) && op->annotations.empty()) { + // handling unit loop + unit_loop_vars_[op->loop_var] = min; + } + // Step 2. Visit recursively + Stmt body = this->VisitStmt(op->body); + // Step 3. Handle annotations + std::vector> pragma_attrs; + Map new_annotations = + HandleAnnotations(op->annotations, &pragma_attrs, /*is_block=*/false); + // Step 4. Create new For loop accordingly + if (op->kind == ForKind::kThreadBinding) { + // Case 1. Thread binding + ICHECK(op->thread_binding.defined()); + String thread_tag = op->thread_binding.value()->thread_tag; + body = MakeLaunchThread(min, extent, op->loop_var, thread_tag, body); + } else if (is_one(extent) && op->annotations.empty()) { + // Case 2. Unit loop + return body; + } else { + // Case 3. 
An ordinary loop + body = For(op->loop_var, std::move(min), std::move(extent), op->kind, + std::move(body), std::nullopt, new_annotations); + } + // Step 5. Insert nested attrs + for (auto it = pragma_attrs.rbegin(); it != pragma_attrs.rend(); ++it) { + body = AttrStmt(op->loop_var, it->first, it->second, std::move(body)); + } + return body; + } + + PrimExpr VisitExpr_(const VarNode *op) final { + Var var = GetRef(op); + auto it = unit_loop_vars_.find(var); + if (it == unit_loop_vars_.end()) { + return var; + + } else { + PrimExpr expr = it->second; + if (expr.dtype() != var.dtype()) { + expr = tvm::cast(var.dtype(), std::move(expr)); + } + return expr; + } + } + + static Stmt MakeLaunchThread(PrimExpr min, PrimExpr extent, Var var, + String thread_tag, Stmt body) { + IterVar iter_var(/*dom=*/Range::FromMinExtent(min, extent), + /*var=*/std::move(var), + /*iter_type=*/IterVarType::kThreadIndex, + /*thread_tag=*/thread_tag); + String attr_key = (thread_tag == "vthread" || thread_tag == "vthread.x" || + thread_tag == "vthread.y" || thread_tag == "vthread.z") + ? tir::attr::virtual_thread + : tir::attr::thread_extent; + return AttrStmt(/*node=*/std::move(iter_var), + /*attr_key=*/std::move(attr_key), + /*value=*/std::move(extent), + /*body=*/std::move(body)); + } + + /*! \brief Convert attr value from annotation map into PrimExpr. */ + PrimExpr ConvertAttrValue(const String &key, const Any &obj) { + if (obj == nullptr) { + return PrimExpr(); + } else if (auto expr = obj.try_cast()) { + return expr.value(); + } else if (auto str = obj.try_cast()) { + return std::move(StringImm(str.value())); + } else { + LOG(FATAL) << "Illegal attribute of key " << key << ", value type " + << obj.GetTypeKey() << " not supported"; + return PrimExpr(); + } + } + + /*! + * \brief Helper to handle annotation dict. + * (1) if the attr key is prefixed by `pragma_`, move to ordered kv list. They + * are lowered to `AttrStmt` by legacy TE schedule convention. + * (2) the non-pragma loop annotations are preserved + * (3) the non-pragma block annotations are dropped + * \return New annotation dict with preserved keys. Also update pragma attr + * pairs ordered by key. + */ + Map + HandleAnnotations(const Map &annotations, + std::vector> *pragma_attrs, + bool is_block) { + Map preserved_annotations; + pragma_attrs->clear(); + for (const auto &kv : annotations) { + const String &key = kv.first; + if (tir::attr::IsPragmaKey(key)) { + pragma_attrs->emplace_back(key, ConvertAttrValue(key, kv.second)); + } else if (!is_block) { + // the loop annotation is preserved + preserved_annotations.Set(key, kv.second); + } + } + std::sort( + pragma_attrs->begin(), pragma_attrs->end(), + [](const auto &p1, const auto &p2) { return p1.first < p2.first; }); + return preserved_annotations; + } + + /*! \brief Record the loop_var and loop start value of unit loops, whose + * extent is one. */ + std::unordered_map unit_loop_vars_; + + /*! \brief Attr keys to preserve into loop annotations. */ + std::unordered_set preserved_annotations_; + + /*! \brief The map from buffer var to its storage alignment information. 
*/ + std::unordered_map storage_align_; +}; + +PrimFunc TLLowerOpaqueBlock(PrimFunc f) { + auto fptr = f.CopyOnWrite(); + fptr->body = OpaqueBlockLower::Rewrite(std::move(fptr->body)); + return f; +} + +tir::transform::Pass LowerOpaqueBlock() { + using namespace tir::transform; + auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { + return TLLowerOpaqueBlock(std::move(f)); + }; + return CreatePrimFuncPass(pass_func, 0, "tl.LowerOpaqueBlock", {}); +} + +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.LowerOpaqueBlock", LowerOpaqueBlock); +}); + +} // namespace tl +} // namespace tvm diff --git a/src/transform/lower_shared_barrier.cc b/src/transform/lower_shared_barrier.cc index a40e3041d..6f8cb0665 100644 --- a/src/transform/lower_shared_barrier.cc +++ b/src/transform/lower_shared_barrier.cc @@ -6,7 +6,7 @@ #include "tvm/tir/expr.h" #include "tvm/tir/stmt.h" #include -#include +#include #include #include #include @@ -209,8 +209,10 @@ tvm::transform::Pass LowerSharedBarrier() { return CreatePrimFuncPass(pass_func, 0, "tl.LowerSharedBarrier", {}); } -TVM_REGISTER_GLOBAL("tl.transform.LowerSharedBarrier") - .set_body_typed(LowerSharedBarrier); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.LowerSharedBarrier", LowerSharedBarrier); +}); } // namespace transform } // namespace tl diff --git a/src/transform/lower_thread_allreduce.cc b/src/transform/lower_thread_allreduce.cc new file mode 100644 index 000000000..f36d6fdc0 --- /dev/null +++ b/src/transform/lower_thread_allreduce.cc @@ -0,0 +1,953 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Lower allreduce to device implementable ir. + * \file lower_thread_allreduce.cc + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "runtime/thread_storage_scope.h" +#include "tir/transforms/ir_utils.h" +#include "tir/transforms/update_pointer_storage_scope.h" + +namespace tvm { +namespace tl { +using namespace tir; + +using runtime::StorageRank; +using runtime::StorageScope; + +/*! 
+ * \brief collect the mapping from the buffer var to its allocate + */ +class AllocateCollector : public StmtExprVisitor { + +private: + bool IsDynamicSharedMemory(Var buffer_var) { + StorageScope storage_scope = + runtime::StorageScope::Create(GetPtrStorageScope(buffer_var)); + return storage_scope.rank == runtime::StorageRank::kShared && + storage_scope.tag == ".dyn"; + } + + bool IsStaticSharedMemory(Var buffer_var) { + StorageScope storage_scope = + runtime::StorageScope::Create(GetPtrStorageScope(buffer_var)); + return storage_scope.rank == runtime::StorageRank::kShared && + storage_scope.tag == ""; + } + +public: + void VisitStmt_(const AllocateNode *op) final { + if (IsDynamicSharedMemory(op->buffer_var)) { + dyn_shmem_allocs_[op->buffer_var.get()] = op; + } else if (IsStaticSharedMemory(op->buffer_var)) { + static_shmem_allocs_[op->buffer_var.get()] = op; + } + StmtExprVisitor::VisitStmt_(op); + } + // The dynamic mapping from the original buffer var to its allocate + std::unordered_map dyn_shmem_allocs_; + // The static mapping from the original buffer var to its allocate + std::unordered_map + static_shmem_allocs_; +}; + +class ThreadAllreduceBuilder final : public StmtExprMutator { +public: + explicit ThreadAllreduceBuilder(const TargetNode *target, + bool is_dynamic = false) + : target_(target), + warp_size_( + target->GetAttr("thread_warp_size", 1).value().IntValue()), + max_num_threads_(target->GetAttr("max_num_threads", -1) + .value() + .IntValue()) { + if (is_dynamic) { + shared_scope = "shared.dyn"; + } + } + + Stmt VisitStmt_(const AttrStmtNode *op) final { + if (op->attr_key == tir::attr::thread_extent) { + thread_extents_.push_back(op); + Stmt ret = StmtExprMutator::VisitStmt_(op); + thread_extents_.pop_back(); + return ret; + } else if (op->attr_key == tir::attr::reduce_scope) { + const CommReducerNode *combiner = op->node.as(); + ICHECK(combiner); + reduce_combiner_.push_back(combiner); + Stmt ret = StmtExprMutator::VisitStmt_(op); + reduce_combiner_.pop_back(); + return ret; + } else { + return StmtExprMutator::VisitStmt_(op); + } + } + Stmt VisitStmt_(const EvaluateNode *op) final { + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + const CallNode *call = op->value.as(); + if (call && call->op.same_as(builtin::tvm_thread_allreduce())) { + return MakeAllreduce(call); + } else { + return stmt; + } + } + Stmt VisitStmt_(const AllocateNode *op) final { + auto node = Downcast(StmtExprMutator::VisitStmt_(op)); + + if (auto it = alloc_remap_.find(node->buffer_var.get()); + it != alloc_remap_.end()) { + Buffer buf = Downcast(it->second); + auto write_ptr = node.CopyOnWrite(); + write_ptr->buffer_var = buf->data; + write_ptr->dtype = buf->dtype; + write_ptr->extents = buf->shape; + write_ptr->condition = const_true(buf->dtype.lanes()); + + if (buf.scope() == shared_scope) { + // Use volatile access to shared buffer. 
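// (Rationale: the volatile marking keeps the backend from caching the
// remapped shared reduction buffer in registers across the explicit
// synchronization emitted by the allreduce lowering.)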
+ write_ptr->body = + AttrStmt(buf->data, tir::attr::volatile_scope, 1, write_ptr->body); + } + } + return std::move(node); + } + + Optional GetRemappedBuffer(const Buffer &buf) { + if (auto it = buf_remap_.find(buf.get()); it != buf_remap_.end()) { + return it->second; + } + + if (auto it = var_remap_.find(buf->data.get()); it != var_remap_.end()) { + Buffer new_buf = buf; + new_buf.CopyOnWrite()->data = it->second; + buf_remap_[buf.get()] = new_buf; + return new_buf; + } + + return std::nullopt; + } + + Stmt VisitStmt_(const DeclBufferNode *op) final { + auto node = Downcast(StmtExprMutator::VisitStmt_(op)); + if (auto buf = GetRemappedBuffer(node->buffer)) { + node.CopyOnWrite()->buffer = buf.value(); + } + return std::move(node); + } + + PrimExpr VisitExpr_(const BufferLoadNode *op) final { + if (auto it = load_remap_.find(op->buffer->data.get()); + it != load_remap_.end()) { + for (const auto &index : op->indices) { + ICHECK(is_zero(index)) + << "The index of buffer " << op->buffer << " is " << index; + } + return it->second; + } + + BufferLoad load = Downcast(StmtExprMutator::VisitExpr_(op)); + op = load.get(); + + if (auto opt = GetRemappedBuffer(load->buffer)) { + load.CopyOnWrite()->buffer = opt.value(); + } + return std::move(load); + } + + Stmt VisitStmt_(const BufferStoreNode *op) final { + BufferStore store = Downcast(StmtExprMutator::VisitStmt_(op)); + + if (auto opt = GetRemappedBuffer(store->buffer)) { + store.CopyOnWrite()->buffer = opt.value(); + } + return std::move(store); + } + +private: + // Thread entry + struct ThreadEntry { + runtime::ThreadScope scope; + IterVar iv; + int extent; + // comparator + bool operator<(const ThreadEntry &other) const { + return scope.dim_index < other.scope.dim_index; + } + }; + + // make allreduce. + Stmt MakeAllreduce(const CallNode *call) { + ICHECK(!reduce_combiner_.empty()); + const CommReducerNode *combiner = reduce_combiner_.back(); + size_t size = combiner->result.size(); + + const IntImmNode *size_of_args = call->args[0].as(); + ICHECK(size_of_args) << call->args[0]->GetTypeKey(); + ICHECK_EQ(size, size_of_args->value); + Array inits = combiner->identity_element; + std::vector values(size); + std::vector types(size); + PrimExpr cond = call->args[size + 1]; + for (size_t idx = 0; idx < size; ++idx) { + values[idx] = call->args[1 + idx]; + if (!is_one(cond)) { + values[idx] = Select(cond, values[idx], inits[idx]); + } + types[idx] = values[idx].dtype(); + } + std::vector buffers(size); + for (size_t idx = 0; idx < size; ++idx) { + PrimExpr arg = call->args[2 + size + idx]; + // Loads from boolean buffers may have cast nodes inserted by + // earlier passes. + if (auto cast = arg.as()) { + arg = cast->value; + } + buffers[idx] = Downcast(arg)->buffer; + } + + std::unordered_set reduce_set; + for (size_t i = 2 + 2 * size; i < call->args.size(); ++i) { + const VarNode *v = call->args[i].as(); + // The simply optimization replace a iteration variable with a constant + // when extent of the iteration is 1. As threaded IterVar always started + // from 0, we can just ignore this variable in this case. 
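// (Argument layout recap for tvm_thread_allreduce, as consumed above and
// in this loop: args[0] = n, args[1..n] = reduction values,
// args[n+1] = predicate, args[n+2..2n+1] = destination buffer loads,
// args[2n+2..] = reduce-axis thread vars, with IntImm 0 standing in for
// axes whose extent-1 iteration variable was already substituted away.)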
+ if (v) { + reduce_set.insert(v); + } else { + ICHECK(call->args[i].as() && + call->args[i].as()->value == 0) + << "arg" << i << "should be a VarNode or IntImmNode " + << "while it is " << call->args[i]; + } + } + + size_t nmatch = 0; + std::vector vred, vpar; + int reduce_dim_index = -1; + for (const AttrStmtNode *attr : thread_extents_) { + ThreadEntry e; + IterVar iv = Downcast(attr->node); + e.scope = runtime::ThreadScope::Create(iv->thread_tag); + e.iv = iv; + ICHECK_LE(e.scope.rank, 1); + ICHECK_GE(e.scope.dim_index, 0) + << "vthread do not work with cross thread reduction"; + if (e.scope.rank == 1) { + const auto *ptr = attr->value.as(); + ICHECK(ptr) << "Need constant extent for reduce set " << iv; + e.extent = static_cast(ptr->value); + // ignore variables equal to 0 + if (e.extent == 1) { + continue; + } + + if (reduce_set.count(iv->var.get())) { + bool already_exists = false; + for (const auto &entry : vred) { + if (entry.scope.dim_index == e.scope.dim_index) { + already_exists = true; + break; + } + } + if (!already_exists) { + vred.push_back(e); + ++nmatch; + reduce_dim_index = e.scope.dim_index; + } + } else { + bool already_exists = false; + for (const auto &entry : vpar) { + if (entry.scope.dim_index == e.scope.dim_index) { + already_exists = true; + break; + } + } + if (!already_exists) { + vpar.push_back(e); + } + } + } + } + + // remove reduce thread from parallel thread + if (reduce_dim_index != -1) { + for (size_t i = 0; i < vpar.size(); ++i) { + if (vpar[i].scope.dim_index == reduce_dim_index) { + vpar.erase(vpar.begin() + i); + break; + } + } + } + + ICHECK_EQ(nmatch, reduce_set.size()) + << "Not all reduce index are presented in the context"; + std::sort(vred.begin(), vred.end()); + std::sort(vpar.begin(), vpar.end()); + // the size of each index. + int reduce_extent, group_extent; + PrimExpr reduce_index = FlattenThread(vred, &reduce_extent); + PrimExpr group_index = FlattenThread(vpar, &group_extent); + + // the longest contiguous reduce extent after flattening + int contiguous_reduce_extent = 1; + std::vector> + block_threads; // tuple(dim_index, extent, is_reduce) + for (const ThreadEntry &thr : vred) { + if (thr.scope.rank == 1) { // threadIdx + block_threads.emplace_back(thr.scope.dim_index, thr.extent, true); + } + } + for (const ThreadEntry &thr : vpar) { + if (thr.scope.rank == 1) { // threadIdx + block_threads.emplace_back(thr.scope.dim_index, thr.extent, false); + } + } + // sort according to dim_index + std::sort(block_threads.begin(), block_threads.end()); + for (auto &&thr_attr : block_threads) { + auto [dim_index, extent, is_reduce] = thr_attr; + (void)dim_index; // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767 + if (is_reduce) { + contiguous_reduce_extent *= extent; + } else { + break; + } + } + + std::vector seq; + std::vector new_alloc_bufs; + // + // This is an optimization. For small reduction sizes, it may be beneficial + // for a single warp to performance the entire reduction. No trips to shared + // memory and no cross warp synchronizations are required. + // The following code emits the reduction as follows: + // + // Allocate reduction vars v[i], i = 0..size-1 + // + // for offset from WARP_SIZE to 1 by 2 + // + // a <- load(v[i]) + // b <- shuffle_down(load(v[i], offset)) + // v[i] <- reduction(a, b) + // + // broadcast results from lane 0 to all other lanes and store + // the final reduction result to the proper location. 
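// As a concrete, CUDA-flavored illustration of the single-warp path for a
// 32-lane sum (this pass emits the equivalent TIR through the
// tvm_warp_shuffle_down / tvm_warp_shuffle builtins rather than this code):
//
//   float v = val;
//   unsigned mask = __activemask();
//   for (int offset = 16; offset > 0; offset >>= 1)
//     v += __shfl_down_sync(mask, v, offset);
//   v = __shfl_sync(mask, v, 0);   // broadcast lane 0's total to all lanes
//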
+ // + // When the thread extent is multiple of warp size, we can use a two-stage + // warp-level reduction to optimize. This is implemented by applying the + // algorithm above twice. + // + // For example, suppose we want to use 512 threads to reduce 512 elements + // and the warp size is 32. In this case there are (512 / 32) = 16 warps. + // In the first stage, each of the 16 warps reduces 32 elements. So after + // the stage, we have 16 remaining elements to be reduced, one for each + // warp. We store the 16 elements in shared memory, and start the second + // stage. In the second stage we use the first 16 lanes of the first warp to + // reduce the remaining elements, and this reduction can also be optimized + // by shuffle_down warp-level primitives. + PrimExpr zero_index = make_const(reduce_index->dtype, 0); + + if (IsWarpReduction(types, group_extent, reduce_extent, + contiguous_reduce_extent)) { + std::vector reduce_results; + DataType mask_dtype = DataType::UInt(32); + PrimExpr mask = Call(mask_dtype, builtin::tvm_warp_activemask(), {}); + + if (reduce_extent <= warp_size_) { + std::tie(reduce_results, new_alloc_bufs) = MakeWarpAllreduce( + values, types, combiner, reduce_index, reduce_extent, group_index, + mask, std::nullopt, &seq); + + // Broadcast the reduction result from lane 0 to all other lanes. + // This avoids to emit predicated stores, as all threads are + // uniformly writing the same result. + for (size_t i = 0; i < size; ++i) { + Buffer buf = Downcast(reduce_results[i])->buffer; + PrimExpr val = BufferLoad(buf, {zero_index}); + ICHECK_EQ(val->dtype, types[i]); + PrimExpr splat = + WarpShuffle(builtin::tvm_warp_shuffle(), new_alloc_bufs.back(), + val, reduce_extent * group_index); + seq.push_back(BufferStore(buf, splat, {zero_index})); + } + } else { + int n_warps = reduce_extent / warp_size_; + std::vector local_bufs; + + // 1. Create the staging buffer in shared memory. + std::vector staging_shared_bufs; + staging_shared_bufs.reserve(size); + for (size_t i = 0; i < size; ++i) { + Buffer staging_shared_buf = decl_buffer( + /*shape=*/{make_const(reduce_index->dtype, + n_warps * group_extent)}, + /*dtype=*/buffers[i]->dtype, /*name=*/"red_buf_staging", + /*storage_scope=*/shared_scope); + staging_shared_bufs.push_back(staging_shared_buf); + new_alloc_bufs.push_back(staging_shared_buf); + } + + // 2. First round of allreduce. + std::tie(reduce_results, local_bufs) = + MakeWarpAllreduce(values, types, combiner, reduce_index, warp_size_, + group_index, mask, std::nullopt, &seq); + new_alloc_bufs.insert(new_alloc_bufs.end(), local_bufs.begin(), + local_bufs.end()); + + // 3. Write allreduce results to staging buffer. + std::vector write_staging_buf; + write_staging_buf.reserve(size); + for (size_t i = 0; i < size; ++i) { + new_alloc_bufs.push_back( + Downcast(reduce_results[i])->buffer); + write_staging_buf.push_back(BufferStore( + /*buffer=*/staging_shared_bufs[i], + /*value=*/reduce_results[i], + /*indices=*/ + {group_index * n_warps + floordiv(reduce_index, warp_size_)})); + } + PrimExpr cond = floormod(reduce_index, warp_size_) == zero_index; + seq.push_back(IfThenElse(cond, SeqStmt::Flatten(write_staging_buf))); + seq.push_back(SyncThread(shared_scope)); + + // 4. Load staging buffer. + // Second round of allreduce. 
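// (Indexing note: every thread group owns a contiguous slab of n_warps
// staging entries, so lane `reduce_index` of this second round picks up
// the partial result produced by warp `reduce_index` of its own group.)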
+ for (size_t i = 0; i < size; ++i) { + values[i] = + BufferLoad(/*buffer=*/staging_shared_bufs[i], + /*indices=*/{group_index * n_warps + reduce_index}); + } + std::tie(reduce_results, local_bufs) = MakeWarpAllreduce( + values, types, combiner, reduce_index, n_warps, group_index, mask, + /*predicate=*/reduce_index < + make_const(reduce_index->dtype, n_warps), + &seq); + new_alloc_bufs.insert(new_alloc_bufs.end(), local_bufs.begin(), + local_bufs.end()); + + // 5. Create shared memory buffer(s) of `group_extent` elements, storing + // the allreduce results so each thread can access. + std::vector write_result; + write_result.reserve(size); + for (size_t i = 0; i < size; ++i) { + new_alloc_bufs.push_back( + Downcast(reduce_results[i])->buffer); + Buffer broadcast_shared_buf = decl_buffer( + /*shape=*/{make_const(reduce_index->dtype, group_extent)}, + /*dtype=*/buffers[i]->dtype, /*name=*/"red_result", + /*storage_scope=*/shared_scope); + write_result.push_back(BufferStore(broadcast_shared_buf, + reduce_results[i], {group_index})); + // Update `reduce_results`, pointing to the value loaded from the + // shared memory buffer. + reduce_results[i] = BufferLoad(broadcast_shared_buf, {group_index}); + } + seq.push_back(IfThenElse(reduce_index == zero_index, + SeqStmt::Flatten(write_result))); + seq.push_back(SyncThread(shared_scope)); + } + + // Write back allreduce results and update existing allocations. + for (size_t i = 0; i < size; ++i) { + ICHECK(!load_remap_.count(buffers[i]->data.get())); + PrimExpr pred = const_true(types[i].lanes()); + Buffer buf = Downcast(reduce_results[i])->buffer; + ICHECK_EQ(reduce_results[i]->dtype, types[i]); + load_remap_[buffers[i]->data.get()] = reduce_results[i]; + + auto node = + Allocate(buf->data, types[i], buf->shape, pred, Evaluate(0)); + alloc_remap_[buffers[i]->data.get()] = buf; + var_remap_[buffers[i]->data.get()] = buf->data; + buf_remap_[buffers[i].get()] = buf; + } + } else { + std::vector shared_bufs(size); + if (reduce_extent == 1) { + // special case, no reduction is needed. + std::vector stores; + for (size_t i = 0; i < size; ++i) { + stores.push_back(BufferStore(buffers[i], values[i], {0})); + } + return SeqStmt::Flatten(stores); + } + // This sync is necessary because there might be incomplete read of + // previous iteration on the same buffer. + seq.emplace_back(SyncThread(shared_scope)); + for (size_t idx = 0; idx < size; ++idx) { + shared_bufs[idx] = decl_buffer( + {IntImm(group_index->dtype, group_extent * reduce_extent)}, + types[idx], "red_buf" + std::to_string(idx), shared_scope); + seq.emplace_back( + BufferStore(shared_bufs[idx], values[idx], + {BufIndex(reduce_index, group_index, reduce_extent)})); + } + seq.emplace_back(SyncThread(shared_scope)); + seq.emplace_back(MakeBufAllreduce( + combiner, types, shared_bufs, reduce_index, group_index, + reduce_extent, group_extent, contiguous_reduce_extent)); + for (size_t idx = 0; idx < size; ++idx) { + ICHECK(!load_remap_.count(buffers[idx]->data.get())); + PrimExpr pred = const_true(types[idx].lanes()); + BufferLoad load(shared_bufs[idx], + {BufIndex(make_zero(reduce_index.dtype()), group_index, + reduce_extent)}); + ICHECK_EQ(load->dtype, types[idx]); + load_remap_[buffers[idx]->data.get()] = load; + alloc_remap_[buffers[idx]->data.get()] = shared_bufs[idx]; + var_remap_[buffers[idx]->data.get()] = shared_bufs[idx]->data; + buf_remap_[buffers[idx].get()] = shared_bufs[idx]; + } + } + + // Fix all local allocations as all statements are built. 
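// (All temporaries recorded in new_alloc_bufs above, i.e. the warp-local
// red_buf / t / mask buffers and any shared staging buffers, are
// materialized here by nesting DeclBuffer + Allocate around the flattened
// statement sequence.)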
+ Stmt body = SeqStmt::Flatten(seq); + for (Buffer buf : new_alloc_bufs) { + body = DeclBuffer(buf, body); + body = Allocate(buf->data, buf->dtype, buf->shape, + const_true(buf->dtype.lanes()), body); + } + + return body; + } + + std::pair, std::vector> + MakeWarpAllreduce(std::vector src_values, // + std::vector dtypes, // + const CommReducerNode *combiner, // + PrimExpr reduce_index, int reduce_extent, // + PrimExpr group_index, // + PrimExpr mask, Optional predicate, // + std::vector *seq) { + int n_buffers = src_values.size(); + + std::vector shared_bufs; + std::vector local_bufs; + shared_bufs.reserve(n_buffers); + + // This is the index to the reduction variable, one reduction + // variable per warp. Local scope seems easier to reason without + // relying on a pattern match pass to fix it later. + Array zero_indices = {0}; + Array shape = {1}; + + std::vector load_values; + load_values.reserve(n_buffers); + for (int idx = 0; idx < n_buffers; ++idx) { + shared_bufs.push_back(decl_buffer( + shape, dtypes[idx], "red_buf" + std::to_string(idx), "local")); + load_values.push_back( + BufferStore(shared_bufs[idx], src_values[idx], zero_indices)); + + // Uses a local variable to store the shuffled data. Later + // on, an allocation will be built for this local variable. + local_bufs.push_back( + decl_buffer(shape, dtypes[idx], "t" + std::to_string(idx), "local")); + } + + if (predicate.defined()) { + seq->push_back( + IfThenElse(predicate.value(), SeqStmt::Flatten(load_values))); + } else { + seq->insert(seq->end(), load_values.begin(), load_values.end()); + } + + // The mask for this reducer, as this reducer may sit inside + // a divergent control flow. Here it uses a variable to cache the current + // active channels. + Optional mask_buffer; + if (need_warp_shuffle_mask_) { + mask_buffer = decl_buffer(shape, mask->dtype, "mask", "local"); + seq->emplace_back(BufferStore(mask_buffer.value(), mask, zero_indices)); + // Push the buffer description. Later this will have an + // allocation built for it. + local_bufs.push_back(mask_buffer.value()); + } + + // Emit reductions within a warp. + int start_offset = 1; + while (start_offset * 2 < reduce_extent) { + start_offset *= 2; + } + for (int offset = start_offset; offset > 0; offset /= 2) { + // Load reduction values, no synchronization needed. + Array a, b; + for (int i = 0; i < n_buffers; ++i) { + Buffer shared_buf = shared_bufs[i]; + BufferLoad val(shared_buf, zero_indices); + ICHECK_EQ(val->dtype, dtypes[i]); + a.push_back(val); + + // __shfl_*sync calls shall not appear in if_then_else expressions + // as this is causing extra divergency. E.g. + // + // v1 = (v2 < v3) ? v3 : __shfl_sync(mask, v1, 0); + // + // behaves differently from + // + // int t = __shfl_sync(mask, v1, 0); + // v1 = (v2 < v3) ? v3 : t; + // + // The former may cause dead lock as there is a divergent + // branch with a warp sync call inside. + PrimExpr other = WarpShuffle(builtin::tvm_warp_shuffle_down(), + mask_buffer, val, offset); + Buffer local_buf = local_bufs[i]; + Stmt s = BufferStore(local_buf, other, zero_indices); + seq->push_back(s); + + BufferLoad load = BufferLoad(local_buf, zero_indices); + ICHECK_EQ(load->dtype, dtypes[i]); + b.push_back(load); + } + + // Do reductions. + Array ret = (*combiner)(a, b); + + // Store the reduction result to itself. 
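+      // The combined value overwrites this lane's own red_buf slot, so the
+      // next (halved) offset reads an already partially reduced value.
+      // E.g. with reduce_extent == 32 the offsets are 16, 8, 4, 2, 1 and
+      // lane 0 ends up holding the reduction over all 32 lanes.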
+ std::vector stores; + stores.reserve(n_buffers); + for (int i = 0; i < n_buffers; ++i) { + Buffer buf = shared_bufs[i]; + stores.push_back(BufferStore(buf, ret[i], zero_indices)); + } + + // During the sub-warp reduction, values from inactive threads could be + // read, which is an undefined behavior according to the cuda document. + // + // In practice, the return value are usually 0, which does no harm to sum + // reduction. However, the result can be incorrect in max or prod + // reduction. Therefore an additional range check has to be performed to + // ensure the correctness. + if (offset * 2 > reduce_extent) { + PrimExpr cond = reduce_index + offset < reduce_extent; + seq->push_back(IfThenElse(cond, SeqStmt::Flatten(stores))); + } else { + seq->push_back(SeqStmt::Flatten(stores)); + } + } + + std::vector reduce_results; + reduce_results.reserve(n_buffers); + for (int i = 0; i < n_buffers; ++i) { + reduce_results.push_back(BufferLoad(shared_bufs[i], zero_indices)); + } + + return {reduce_results, local_bufs}; + } + + // make allreduce. + Stmt MakeBufAllreduce(const CommReducerNode *combiner, + const std::vector &types, + const Array &shared_bufs, PrimExpr reduce_index, + PrimExpr group_index, int reduce_extent, + int group_extent, int contiguous_reduce_extent) { + // Get next power of two + int reduce_align = 1; + while (reduce_extent > reduce_align) { + reduce_align = reduce_align << 1; + } + ICHECK_GT(reduce_align, 1); + std::vector seq; + + size_t size = shared_bufs.size(); + PrimExpr buf_index = BufIndex(reduce_index, group_index, reduce_extent); + // make reduction + auto fload = [&](int offset) { + Array a, b; + for (size_t i = 0; i < size; ++i) { + BufferLoad b_load( + shared_bufs[i], + {BufIndex(reduce_index + offset, group_index, reduce_extent)}); + ICHECK_EQ(b_load->dtype, types[i]); + b.push_back(b_load); + + BufferLoad a_load(shared_bufs[i], {buf_index}); + ICHECK_EQ(a_load->dtype, types[i]); + a.push_back(a_load); + } + Array ret = (*combiner)(a, b); + return ret; + }; + auto fstore = [&](const Array &ret) { + std::vector stores(size); + for (size_t i = 0; i < size; ++i) { + stores[i] = BufferStore(shared_bufs[i], ret[i], {buf_index}); + } + return SeqStmt::Flatten(stores); + }; + auto freduce = [&](int offset) { + auto ret = fload(offset); + return fstore(ret); + }; + // Step one, check for + if (reduce_align > reduce_extent) { + // reduction with the boundary condition + reduce_align = reduce_align >> 1; + PrimExpr cond = reduce_index < (reduce_extent - reduce_align); + seq.emplace_back(IfThenElse(cond, freduce(reduce_align))); + seq.emplace_back(SyncThread(shared_scope)); + } + + // normal synchronization + bool warp_align = + group_extent == 1 || contiguous_reduce_extent % warp_size_ == 0; + while (reduce_align > contiguous_reduce_extent || + reduce_align > warp_size_ || !warp_align) { + if (reduce_align == 1) { + break; + } + reduce_align = reduce_align >> 1; + PrimExpr cond = reduce_index < reduce_align; + seq.emplace_back(IfThenElse(cond, freduce(reduce_align))); + seq.emplace_back(SyncThread(shared_scope)); + } + // in warp synchronization. + if (reduce_align > 1) { + PrimExpr in_warp_cond = reduce_index < (reduce_align >> 1); + + std::vector in_warp_seq; + + while (reduce_align > 1) { + reduce_align = reduce_align >> 1; + + // freduce can read/write to the same memory location. For + // example, with reduce_align of 4, threadIdx 3 reads from + // memory location 7 as threadIdx 7 is writing to it. 
+ // Therefore, we need to separate out the load from the store + // with a memory barrier in-between. This isn't necessary for + // the earlier normal synchronization, because those are each + // protected by an if-statement. The if-statement is avoided + // here to reduce thread divergence. + auto loads = fload(reduce_align); + + Array in_warp_local_vars; + for (auto expr : loads) { + Var var("w_" + std::to_string(reduce_align) + "_" + + std::to_string(in_warp_local_vars.size()), + expr->dtype); + in_warp_local_vars.push_back(var); + } + + std::vector in_let_statement; + in_let_statement.emplace_back(SyncThread("warp")); + in_let_statement.emplace_back( + fstore({in_warp_local_vars.begin(), in_warp_local_vars.end()})); + in_let_statement.emplace_back(SyncThread("warp")); + + Stmt body = SeqStmt::Flatten(in_let_statement); + for (size_t i = 0; i < size; i++) { + body = LetStmt(in_warp_local_vars[i], loads[i], body); + } + in_warp_seq.push_back(body); + } + + Stmt warp_body = SeqStmt::Flatten(in_warp_seq); + + seq.emplace_back(IfThenElse(in_warp_cond, warp_body)); + seq.emplace_back(SyncThread(shared_scope)); + } + return SeqStmt::Flatten(seq); + } + // Flatten the thread index. + // Also return a warp number, + PrimExpr FlattenThread(const std::vector &tvec, + int *out_total_extent) { + int &total_extent = *out_total_extent; + total_extent = 1; + if (tvec.size() == 0) { + return make_zero(DataType::Int(32)); + } + + PrimExpr ret; + for (const ThreadEntry &e : tvec) { + if (ret.defined()) { + ret = ret + e.iv->var * total_extent; + } else { + ICHECK_EQ(total_extent, 1); + ret = e.iv->var; + } + total_extent *= e.extent; + } + return ret; + } + // The local buffer index. + PrimExpr BufIndex(PrimExpr reduce_index, PrimExpr group_index, + int reduce_extent) { + if (!is_zero(group_index)) { + return analyzer_.Simplify(group_index * reduce_extent + reduce_index); + } else { + return reduce_index; + } + } + // sync thread op. + static Stmt SyncThread(const std::string &sync) { + return Evaluate(Call(DataType::Int(32), builtin::tvm_storage_sync(), + {StringImm(sync)})); + } + + // Emit warp shuffle calls. + PrimExpr WarpShuffle(const Op &op, Optional mask_buffer, PrimExpr val, + PrimExpr delta_or_lane) { + Array indices = {0}; + PrimExpr mask; + if (mask_buffer.defined()) { + mask = BufferLoad(mask_buffer.value(), indices); + } else { + mask = IntImm(DataType::Int(32), 0); + } + PrimExpr width = IntImm(DataType::Int(32), warp_size_); + Array args{mask, val, delta_or_lane, width, width}; + return Call(val.dtype(), op, args); + } + + // Check if we can use warp level reduction. + // + // Note: The ROCm backend will only have warp reductions for now. + // Also, the warp/wavefront size differs (64 on rocm, 32 on cuda and metal). 
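+  // On CUDA (warp_size_ == 32) this accepts, for example, a sub-warp
+  // reduction with reduce_extent == 8 (32 % 8 == 0) and a multi-warp
+  // reduction with reduce_extent == 256 (assuming the usual
+  // max_num_threads_ of 1024 <= 32 * 32), while reduce_extent == 48 is
+  // rejected and handled by the shared-memory allreduce path instead.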
+ bool IsWarpReduction(const std::vector &types, int group_extent, + int reduce_extent, int contiguous_reduce_extent) { + if ((target_->kind->name != "cuda") && (target_->kind->name != "rocm") && + (target_->kind->name != "metal")) { + return false; + } + + need_warp_shuffle_mask_ = target_->kind->name != "metal"; + + // rocm only supports 32 bit operands for shuffling at the moment + if ((target_->kind->name == "rocm") && + (std::any_of(types.begin(), types.end(), [](DataType ty) { + if (ty.is_fixed_length_vector()) + return ty.bits() * ty.lanes() != 32; + return ty.bits() != 32; + }))) { + return false; + } + + // Supported types: + // {u}int, {u}long, {u}long long, float, double, half/half2 + if (std::any_of(types.begin(), types.end(), [](DataType ty) { + if (ty.is_float16()) + return ty.lanes() > 2; + if (ty.is_fixed_length_vector()) + return true; + return ty.bytes() < 4 || ty.bytes() > 8; + })) { + return false; + } + if (thread_extents_.empty()) { + return false; + } + + // reduce region must be contiguous. + if (contiguous_reduce_extent != reduce_extent) { + return false; + } + + // whether reduce_extent and group_extent are valid for warp reduction. + if (target_->kind->name == "rocm") { + return reduce_extent == warp_size_; + } else { + if (reduce_extent == 1) { + return false; // no need to warp reduce + } else { + bool is_subwarp_reduction = warp_size_ % reduce_extent == 0; + bool is_multiwarp_reduction = + max_num_threads_ != -1 && + max_num_threads_ <= warp_size_ * warp_size_ && + reduce_extent % warp_size_ == 0; + if (is_subwarp_reduction || is_multiwarp_reduction) { + return true; + } else { + return group_extent == 1 && reduce_extent <= warp_size_; + } + } + } + } + + // The target. + const TargetNode *target_ = nullptr; + // The shared scope. + String shared_scope = "shared"; + // The warp size of the device. + int warp_size_{1}; + // The maximum number of threads of the device. "-1" denotes unknown. + int max_num_threads_{-1}; + // A boolean indicating if the target supports warp-level masking. + bool need_warp_shuffle_mask_; + + // surrounding scope of thread extent. 
+ std::vector thread_extents_; + std::vector reduce_combiner_; + // The load remap + std::unordered_map load_remap_; + // Allocate remap + std::unordered_map alloc_remap_; + // BufferVar remap + std::unordered_map var_remap_; + // Buffer remap + std::unordered_map buf_remap_; + // Internal analyzer + arith::Analyzer analyzer_; +}; + +namespace transform { +using namespace tir::transform; + +tvm::transform::Pass LowerThreadAllreduce() { + auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) { + AllocateCollector collector; + collector(f->body); + bool is_dynamic = collector.dyn_shmem_allocs_.size() > 1; + + auto *n = f.CopyOnWrite(); + auto target = f->GetAttr(tvm::attr::kTarget); + ICHECK(target.defined()) + << "LowerThreadAllreduce: Require the target attribute"; + const TargetNode *target_node = target.as(); + ThreadAllreduceBuilder thread_all_reduce(target_node, is_dynamic); + n->body = thread_all_reduce(n->body); + return f; + }; + return CreatePrimFuncPass(pass_func, 0, "tl.LowerThreadAllreduce", {}); +} + +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.LowerThreadAllreduce", + LowerThreadAllreduce); +}); + +} // namespace transform +} // namespace tl +} // namespace tvm diff --git a/src/transform/lower_tile_op.cc b/src/transform/lower_tile_op.cc index 28201b1c7..81e58f831 100644 --- a/src/transform/lower_tile_op.cc +++ b/src/transform/lower_tile_op.cc @@ -3,6 +3,7 @@ * \brief Lower the tile op for further codegen. */ +#include #include #include #include @@ -108,12 +109,14 @@ class RemapBufferRewriter : public arith::IRMutatorWithAnalyzer { * \return The rewritten block. */ Stmt RewritePaddingMap(const BlockNode *op) { - auto padding_map = - op->annotations.Get(attr::kPaddingMap).as>().value(); + auto padding_map = op->annotations.Get(attr::kPaddingMap); + if (!padding_map) { + LOG(FATAL) << "Padding map annotation is missing"; + } Map var_remap = CreateVarRemap(); - Map new_padding_map = - RemapPaddingMap(padding_map, var_remap); + Map new_padding_map = RemapPaddingMap( + Downcast>(padding_map.value()), var_remap); auto block = Downcast(IRMutatorWithAnalyzer::VisitStmt_(op)); auto block_ptr = block.CopyOnWrite(); @@ -235,7 +238,7 @@ class LowerTileOpPass : arith::IRMutatorWithAnalyzer { } PrimExpr HandleAccessPtrAndOffset(PrimExpr access_ptr, - Optional offset = NullOpt, + Optional offset = std::nullopt, DataType dtype = DataType::Int(32)) { // The 2th arg of T.tvm_access_ptr call is offset, we set it to 0 and // accumulate it to smem_offset @@ -318,7 +321,7 @@ class LowerTileOpPass : arith::IRMutatorWithAnalyzer { op->op.same_as(tl::tma_store()))) { has_tma_ = true; } - Array ptx_instructions = {builtin::ptx_ldmatrix(), + Array ptx_instructions = {builtin::ptx_ldmatrix(), builtin::mma_store()}; if (std::find(ptx_instructions.begin(), ptx_instructions.end(), op->op) == @@ -354,7 +357,7 @@ class LowerTileOpPass : arith::IRMutatorWithAnalyzer { // mma_store now auto access_ptr = call->args[2]; auto new_access_ptr = - HandleAccessPtrAndOffset(access_ptr, NullOpt, call->dtype); + HandleAccessPtrAndOffset(access_ptr, std::nullopt, call->dtype); auto new_call = call.CopyOnWrite(); new_call->args.Set(2, new_access_ptr); } else { @@ -496,7 +499,10 @@ tvm::transform::Pass LowerTileOp() { return CreatePrimFuncPass(pass_func, 0, "tl.LowerTileOp", {}); } -TVM_REGISTER_GLOBAL("tl.transform.LowerTileOp").set_body_typed(LowerTileOp); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + 
refl::GlobalDef().def("tl.transform.LowerTileOp", LowerTileOp); +}); } // namespace transform } // namespace tl diff --git a/src/transform/make_packed_api.cc b/src/transform/make_packed_api.cc index af2a8447d..57c7c0155 100644 --- a/src/transform/make_packed_api.cc +++ b/src/transform/make_packed_api.cc @@ -20,8 +20,10 @@ /*! * \file make_packed_api.cc Lower PrimFunc to use the packed function API. */ +#include +#include #include -#include +#include #include #include #include @@ -30,7 +32,6 @@ #include #include -#include #include #include @@ -75,7 +76,7 @@ class ReturnRewriter : public StmtMutator { private: struct ConvertedInfo { - int tcode{-1}; + int type_index{-1}; PrimExpr expr; Buffer dummy_val_buffer; Buffer dummy_tcode_buffer; @@ -87,13 +88,13 @@ class ReturnRewriter : public StmtMutator { // convert val's data type to FFI data type, return type code DataType dtype = val.dtype(); if (dtype.is_int() || dtype.is_uint()) { - info.tcode = kTVMArgInt; + info.type_index = ffi::TypeIndex::kTVMFFIInt; info.expr = Cast(DataType::Int(64), val); } else if (dtype.is_float()) { - info.tcode = kTVMArgFloat; + info.type_index = ffi::TypeIndex::kTVMFFIFloat; info.expr = Cast(DataType::Float(64), val); } else if (dtype.is_void()) { - info.tcode = kTVMNullptr; + info.type_index = ffi::TypeIndex::kTVMFFINone; info.expr = val; } else { LOG(FATAL) << "data type " << dtype << " not supported yet"; @@ -101,18 +102,18 @@ class ReturnRewriter : public StmtMutator { // If multiple return locations have the same data type, use the // same dummy buffer declaration. - auto it = dummy_val_buffer_map_.find(info.tcode); + auto it = dummy_val_buffer_map_.find(info.type_index); if (it != dummy_val_buffer_map_.end()) { info.dummy_val_buffer = it->second; } else { info.dummy_val_buffer = Buffer(ret_var_, info.expr.dtype(), {1}, {1}, ConstInt32(0), ret_var_->name_hint, 0, 0, kDefault); - dummy_val_buffer_map_[info.tcode] = info.dummy_val_buffer; + dummy_val_buffer_map_[info.type_index] = info.dummy_val_buffer; } - // The tcode is always a 32-bit int, so we don't need to have a separate - // map. + // The type_index is always a 32-bit int, so we don't need to have a + // separate map. if (!dummy_tcode_buffer_.defined()) { dummy_tcode_buffer_ = Buffer(ret_tcode_, DataType::Int(32), {1}, {1}, ConstInt32(0), @@ -126,7 +127,8 @@ class ReturnRewriter : public StmtMutator { Stmt WriteToOut(PrimExpr val) { auto info = ConvertForFFI(val); Stmt store_val = BufferStore(info.dummy_val_buffer, info.expr, {0}); - Stmt store_tcode = BufferStore(info.dummy_tcode_buffer, info.tcode, {0}); + Stmt store_tcode = + BufferStore(info.dummy_tcode_buffer, info.type_index, {0}); Stmt ret_zero = Evaluate(tvm::ret(0)); return SeqStmt({store_val, store_tcode, ret_zero}); } @@ -153,7 +155,7 @@ class SubroutineCallRewriter : public StmtExprMutator { if (rewriter.made_change_) { return stmt; } else { - return NullOpt; + return std::nullopt; } } @@ -204,21 +206,21 @@ inline Stmt MakeAssertNotNull(PrimExpr ptr, std::string msg) { * \param func The function to be inspected * * \returns The global_symbol to be used for the function at call - * sites, or NullOpt if the function is to remain unchanged. + * sites, or std::nullopt if the function is to remain unchanged. */ Optional RequiresPackedAPI(const PrimFunc &func) { // A function with an explicit calling convention has already been // lowered, and should not be modified. 
if (auto opt = func->GetAttr(tvm::attr::kCallingConv)) { if (CallingConv(opt.value()->value) != CallingConv::kDefault) { - return NullOpt; + return std::nullopt; } } // Internal function calls do not need the PackedFunc API auto global_symbol = func->GetAttr(tvm::attr::kGlobalSymbol); if (!global_symbol.defined()) { - return NullOpt; + return std::nullopt; } return global_symbol; @@ -344,9 +346,9 @@ PrimFunc MakePackedAPI(PrimFunc func) { } // type code checks - Var tcode(param->name_hint + ".code", DataType::Int(32)); + Var type_index(param->name_hint + ".code", DataType::Int(32)); seq_init.emplace_back(LetStmt( - tcode, + type_index, BufferLoad(buf_packed_arg_type_ids, {IntImm(DataType::Int(32), i)}), nop)); DataType t = param.dtype(); @@ -354,20 +356,22 @@ PrimFunc MakePackedAPI(PrimFunc func) { std::ostringstream msg; msg << name_hint << ": Expect arg[" << i << "] to be pointer"; seq_init.emplace_back( - AssertStmt(tcode == kTVMOpaqueHandle || tcode == kTVMNDArrayHandle || - tcode == kTVMDLTensorHandle || tcode == kTVMNullptr, + AssertStmt(type_index == ffi::TypeIndex::kTVMFFINone || + type_index == ffi::TypeIndex::kTVMFFIOpaquePtr || + type_index == ffi::TypeIndex::kTVMFFIDLTensorPtr || + type_index >= ffi::TypeIndex::kTVMFFIStaticObjectBegin, tvm::tir::StringImm(msg.str()), nop)); } else if (t.is_int() || t.is_uint()) { std::ostringstream msg; msg << name_hint << ": Expect arg[" << i << "] to be int"; - seq_init.emplace_back( - AssertStmt(tcode == kDLInt, tvm::tir::StringImm(msg.str()), nop)); + seq_init.emplace_back(AssertStmt(type_index == kDLInt, + tvm::tir::StringImm(msg.str()), nop)); } else { ICHECK(t.is_float()); std::ostringstream msg; msg << name_hint << ": Expect arg[" << i << "] to be float"; - seq_init.emplace_back( - AssertStmt(tcode == kDLFloat, tvm::tir::StringImm(msg.str()), nop)); + seq_init.emplace_back(AssertStmt(type_index == kDLFloat, + tvm::tir::StringImm(msg.str()), nop)); } } @@ -406,13 +410,7 @@ PrimFunc MakePackedAPI(PrimFunc func) { seq_check.push_back( AttrStmt(node, tir::attr::device_type, device_type, nop)); - bool need_set_device = - (target_device_type != kDLMicroDev && - ( - // or is c source target - target_device_type != kDLCPU || target->kind->name != "llvm")); - - if (need_set_device) { + if (runtime::DeviceAPI::NeedSetDevice(target_device_type)) { Stmt set_device = Evaluate(Call(DataType::Int(32), builtin::tvm_call_packed(), {StringImm(runtime::symbol::tvm_set_device), @@ -468,7 +466,6 @@ PrimFunc MakePackedAPI(PrimFunc func) { << " are used, but are not passed in as API arguments"; func_ptr->buffer_map = Map(); - func_ptr->checked_type_ = func_ptr->func_type_annotation(); func_ptr->ret_type = PrimType(DataType::Int(32)); // return the function. 
return func; } @@ -516,8 +513,10 @@ tvm::transform::Pass MakePackedAPI() { return tvm::transform::CreateModulePass(pass_func, 0, "tl.MakePackedAPI", {}); } -TVM_REGISTER_GLOBAL("tl.transform.MakePackedAPI").set_body_typed([]() { - return MakePackedAPI(); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.MakePackedAPI", + []() { return MakePackedAPI(); }); }); } // namespace tl diff --git a/src/transform/merge_if_stmt.cc b/src/transform/merge_if_stmt.cc index 539001917..5a11d2a8c 100644 --- a/src/transform/merge_if_stmt.cc +++ b/src/transform/merge_if_stmt.cc @@ -3,6 +3,7 @@ * \brief Merge the If Stmt in SeqStmt */ +#include #include #include #include @@ -43,11 +44,13 @@ class MergeIfStmtRewriter : public StmtExprMutator { continue; } else { if (!current_if_bodies.empty()) { - new_seq.push_back(IfThenElse(current_condition, - current_if_bodies.size() == 1 - ? current_if_bodies[0] - : SeqStmt(current_if_bodies), - Stmt())); + auto if_stmt = + IfThenElse(current_condition, + current_if_bodies.size() == 1 + ? current_if_bodies[0] + : this->VisitStmt(SeqStmt(current_if_bodies)), + Stmt()); + new_seq.push_back(if_stmt); current_if_bodies.clear(); } @@ -59,11 +62,13 @@ class MergeIfStmtRewriter : public StmtExprMutator { } if (!current_if_bodies.empty()) { - new_seq.push_back(IfThenElse(current_condition, - current_if_bodies.size() == 1 - ? current_if_bodies[0] - : SeqStmt(current_if_bodies), - Stmt())); + auto if_stmt = + IfThenElse(current_condition, + current_if_bodies.size() == 1 + ? current_if_bodies[0] + : this->VisitStmt(SeqStmt(current_if_bodies)), + Stmt()); + new_seq.push_back(if_stmt); current_condition = PrimExpr(); current_if_bodies.clear(); } @@ -72,11 +77,13 @@ class MergeIfStmtRewriter : public StmtExprMutator { } if (!current_if_bodies.empty()) { - new_seq.push_back(IfThenElse(current_condition, - current_if_bodies.size() == 1 - ? current_if_bodies[0] - : SeqStmt(current_if_bodies), - Stmt())); + auto if_stmt = + IfThenElse(current_condition, + current_if_bodies.size() == 1 + ? current_if_bodies[0] + : this->VisitStmt(SeqStmt(current_if_bodies)), + Stmt()); + new_seq.push_back(if_stmt); } return new_seq.size() == 1 ? new_seq[0] : SeqStmt(new_seq); @@ -91,7 +98,10 @@ tvm::transform::Pass MergeIfStmt() { return CreatePrimFuncPass(pass_func, 0, "tl.MergeIfStmt", {}); } -TVM_REGISTER_GLOBAL("tl.transform.MergeIfStmt").set_body_typed(MergeIfStmt); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.MergeIfStmt", MergeIfStmt); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/merge_shared_memory_allocations.cc b/src/transform/merge_shared_memory_allocations.cc index 60720d226..f6a4ce882 100644 --- a/src/transform/merge_shared_memory_allocations.cc +++ b/src/transform/merge_shared_memory_allocations.cc @@ -23,8 +23,9 @@ * memory allocation. This pass merges multiple TIR-level dynamic or static * shared memory allocations into one allocation. */ +#include +#include #include -#include #include #include #include @@ -34,9 +35,11 @@ #include #include "../op/builtin.h" +#include "../target/utils.h" #include "runtime/thread_storage_scope.h" #include "support/arena.h" #include "tir/transforms/ir_utils.h" +#include "tvm/tir/function.h" namespace tvm { namespace tl { @@ -300,7 +303,7 @@ class SharedMemLinearAccessPatternFinder final : public StmtExprVisitor { bool IsAppropriateSharedMemory(const Var &var) { return is_dynamic_ ? 
IsDynamicSharedMemory(var) : IsStaticSharedMemory(var); } - // Whether do dyanmic analysis. + // Whether do dynamic analysis. bool is_dynamic_{true}; // Whether do aggressive merge. bool enable_aggressive_merge_{false}; @@ -314,6 +317,46 @@ class SharedMemLinearAccessPatternFinder final : public StmtExprVisitor { size_t scope_level_{0}; }; +class SharedMemoryAlignmentPlanner : public StmtExprVisitor { + +public: + static std::unordered_map Plan(const Stmt &stmt) { + SharedMemoryAlignmentPlanner planner; + planner(stmt); + return planner.shmem_alignment_map_; + } + +private: + void VisitExpr_(const CallNode *op) { + if (op->op.same_as(tl::tl_gemm()) || op->op.same_as(tl::tl_gemm_sp()) || + op->op.same_as(tl::tma_load()) || op->op.same_as(tl::tma_store())) { + under_alignment_scope_ = true; + StmtExprVisitor::VisitExpr_(op); + under_alignment_scope_ = false; + } else { + StmtExprVisitor::VisitExpr_(op); + } + } + + void VisitExpr_(const VarNode *op) { + auto ptr_type = op->type_annotation.as(); + if (ptr_type && under_alignment_scope_) { + auto scope = GetPtrStorageScope(GetRef(op)); + if (scope == "shared" || scope == "shared.dyn") { + auto target = Target::Current(); + ICHECK(target.defined()) << "Target is not defined"; + const int alignment = TargetIsHopper(target) ? 1024 : 16; + shmem_alignment_map_[op] = alignment; + } + } + StmtExprVisitor::VisitExpr_(op); + } + + bool under_alignment_scope_{false}; + + std::unordered_map shmem_alignment_map_; +}; + /*! * \brief merge the buffers whose live range has no intersection and rewrite the * body @@ -341,6 +384,7 @@ class SharedMemoryRewriter : public StmtExprMutator { SharedMemLinearAccessPatternFinder finder(is_dynamic, enable_aggressive_merge, verbose); finder(stmt); + shmem_alignment_map_ = SharedMemoryAlignmentPlanner::Plan(stmt); this->LivenessAnalysis(finder.linear_seq_, finder.stmt_attrs_); this->PlanMemory(finder.linear_seq_, finder.stmt_attrs_); } @@ -358,6 +402,14 @@ class SharedMemoryRewriter : public StmtExprMutator { for (const StorageEntry *e : sym_free_list_) { all_entry.push_back(e); } + // Sort the storage entries in descending order of their total allocation + // size (in bits). This ensures that larger allocations are placed first, + // which can help minimize fragmentation and improve memory packing + // efficiency when merging shared memory buffers. 
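+    // (First-fit-decreasing style heuristic: e.g. placing one 4 KiB buffer
+    // before several 256 B buffers tends to waste less space to alignment
+    // padding than the reverse order.)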
+ std::sort(all_entry.begin(), all_entry.end(), + [](const StorageEntry *a, const StorageEntry *b) { + return a->const_nbits > b->const_nbits; + }); for (const StorageEntry *e : all_entry) { max_layer_num = std::max(max_layer_num, static_cast(e->allocs.size())); @@ -374,18 +426,28 @@ class SharedMemoryRewriter : public StmtExprMutator { } } } - // calculate offset for each buffer based on the align of each layer + for (const StorageEntry *e : all_entry) { PrimExpr max_inner_offset = 0; for (int i = 0; i < static_cast(e->allocs.size()); i++) { PrimExpr inner_offset = 0; for (const VarNode *buffer : e->allocs[i]) { const AllocateNode *alloc = shmem_allocs_[buffer]; - buffer_byte_offsets_[buffer] = merged_alloc_size_ + inner_offset; - inner_offset += + auto alignment = align[i]; + // Modern nvidia architecture performs hardware swizzling (hopper + // wgmma/tma for example) requires dynamic shared memory address to + // be aligned to 1024 bytes For other devices, we align to 16 bytes + if (shmem_alignment_map_.find(buffer) != + shmem_alignment_map_.end()) { + alignment = std::max(align[i], shmem_alignment_map_[buffer]); + } + PrimExpr start_offset = merged_alloc_size_ + inner_offset; + PrimExpr aligned_offset = + indexdiv(start_offset + alignment - 1, alignment) * alignment; + buffer_byte_offsets_[buffer] = aligned_offset; + inner_offset = + aligned_offset - merged_alloc_size_ + alloc->extents[0] * alloc->dtype.bytes() * alloc->dtype.lanes(); - inner_offset += - indexmod(align[i] - indexmod(inner_offset, align[i]), align[i]); } max_inner_offset = max(max_inner_offset, inner_offset); } @@ -575,6 +637,18 @@ class SharedMemoryRewriter : public StmtExprMutator { std::vector kill; }; + void PlanAlignment(const Stmt &stmt) { + LOG(INFO) << "PlanAlignment"; + PostOrderVisit(stmt, [&](const ObjectRef &node) { + if (const auto *call = node.as()) { + if (call->op.same_as(tl::tl_gemm()) || + call->op.same_as(tl::tl_gemm_sp())) { + LOG(INFO) << "PostOrderVisit CallNode tl_gemm and tl_gemm_sp: " + << call->op; + } + } + }); + } /*! * \brief Liveness analysis to find gen and kill point of each variable. * \param seq the linear pattern of storage access @@ -869,7 +943,7 @@ class SharedMemoryRewriter : public StmtExprMutator { */ StorageEntry *NewAlloc(const AllocateNode *op, size_t const_nbits) { ICHECK(op != nullptr); - // Re-use not successful, allocate a new buffer. + // Reuse not successful, allocate a new buffer. StorageEntry *entry = arena_.make(); entry->allocs.push_back({op->buffer_var.get()}); entry->const_nbits = const_nbits; @@ -972,7 +1046,7 @@ class SharedMemoryRewriter : public StmtExprMutator { sym_free_list_.push_back(e); } } - // Wheather enable dyanmic analysis. + // Whether enable dynamic analysis. bool is_dynamic_{true}; // Whether enable verbose logging. @@ -1003,6 +1077,8 @@ class SharedMemoryRewriter : public StmtExprMutator { std::unordered_map alloc_map_; /*! 
\brief allocator of all the StorageEntry*/ support::Arena arena_; + // The mapping of buffer bytes alignment + std::unordered_map shmem_alignment_map_; }; Stmt MergeSharedMemoryAllocations(Stmt stmt, bool merge_static_smem, @@ -1048,8 +1124,11 @@ Pass MergeSharedMemoryAllocations(bool enable_aggressive_merge = false, {}); } -TVM_REGISTER_GLOBAL("tl.transform.MergeSharedMemoryAllocations") - .set_body_typed(MergeSharedMemoryAllocations); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.MergeSharedMemoryAllocations", + MergeSharedMemoryAllocations); +}); } // namespace transform } // namespace tl diff --git a/src/transform/multi_version_buffer_rewriter.cc b/src/transform/multi_version_buffer_rewriter.cc index 337deff04..38154aed9 100644 --- a/src/transform/multi_version_buffer_rewriter.cc +++ b/src/transform/multi_version_buffer_rewriter.cc @@ -1,27 +1,9 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - /*! * \file warp_specialized_pipeline.cc * \brief Warp specialized Pipeline for cuda GPU (sm90+) */ +#include #include #include #include @@ -220,14 +202,14 @@ class MultiVersionBufferRewriter : public StmtExprMutator { Stmt VisitStmt_(const ForNode *op) final { loop_stack_.emplace_back(op->loop_var, op->extent); auto num_stages_anno = op->annotations.Get("num_stages"); - if (!num_stages_anno.defined()) { + if (!num_stages_anno) { auto for_node = StmtExprMutator::VisitStmt_(op); loop_stack_.pop_back(); return for_node; } - ICHECK(num_stages_anno.as()); - int num_stages = static_cast(num_stages_anno.as()->value); + ICHECK(num_stages_anno->as()); + int num_stages = static_cast(num_stages_anno->as()->value); const SeqStmtNode *pipeline_body_seq = op->body.as(); CHECK(pipeline_body_seq) << "ValueError: The body of the software pipeline " @@ -340,8 +322,10 @@ tvm::transform::Pass MultiVersionBuffer() { return CreatePrimFuncPass(pass_func, 0, "tl.MultiVersionBuffer", {}); } -TVM_REGISTER_GLOBAL("tl.transform.MultiVersionBuffer") - .set_body_typed(MultiVersionBuffer); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.MultiVersionBuffer", MultiVersionBuffer); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/persist_threadblock.cc b/src/transform/persist_threadblock.cc index b7784d201..c43bf32a0 100644 --- a/src/transform/persist_threadblock.cc +++ b/src/transform/persist_threadblock.cc @@ -3,6 +3,7 @@ * \brief Lower L2 persistent annotation */ +#include #include #include #include @@ -59,8 +60,10 @@ tvm::transform::Pass PersistThreadblock() { return CreatePrimFuncPass(pass_func, 0, "tl.PersistThreadblock", {}); } -TVM_REGISTER_GLOBAL("tl.transform.PersistThreadblock") - .set_body_typed(PersistThreadblock); 
+TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.PersistThreadblock", PersistThreadblock); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/pipeline_planning.cc b/src/transform/pipeline_planning.cc index f3dc0d78d..13630b620 100644 --- a/src/transform/pipeline_planning.cc +++ b/src/transform/pipeline_planning.cc @@ -1,34 +1,12 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file pipeline_planning.cc - * \brief Plan the software pipeline - */ - #include +#include #include #include #include #include #include "../target/utils.h" +#include "tvm/ir/expr.h" namespace tvm { namespace tl { @@ -72,8 +50,6 @@ class BufferRegionCollector : public StmtExprVisitor { bool GetGlobalCopyPattern() const { return is_global_copy_pattern_; } - PrimExpr GetConditonalExpr() const { return conditonal_expr; } - private: void VisitStmt_(const BufferStoreNode *op) final { Buffer store_buffer = op->buffer; @@ -106,7 +82,11 @@ class BufferRegionCollector : public StmtExprVisitor { auto load_region = BufferRegion(load_buffer, region); reads_.push_back(load_region); - if (op->buffer.scope() == "global") { + if (op->buffer.scope() == "global" && !within_condition_expr_) { + // skip condition expr of if_then_else node + // shared[i] = T.if_then_else(global[i] < n, register_a[i], register_b[i]) + // is not a global read shared[i] = T.if_then_else(global[i] < n, + // global_a[i], global_b[i]) is a global read is_global_read_ = true; } } @@ -128,28 +108,27 @@ class BufferRegionCollector : public StmtExprVisitor { // because we only care about the buffer itself instead of indices reads_.push_back(buffer_region); } - } else if (op->op.same_as(tir::builtin::if_then_else())) { - // Simplify nested if_then_else - // if (cond) { if (inner_cond) { inner_then_expr } else { inner_else_expr - // } } else { else_expr } - // => if (cond && inner_cond) { inner_then_expr } else { else_expr } - const PrimExpr &cond = op->args[0]; - const PrimExpr &then_expr = op->args[1]; - const PrimExpr &else_expr = op->args[2]; - conditonal_expr = cond; - this->VisitExpr(then_expr); - this->VisitExpr(else_expr); + } else if (op->op.same_as(builtin::if_then_else())) { + within_condition_expr_ = true; + this->VisitExpr(op->args[0]); + within_condition_expr_ = false; + for (auto i = 1; i < op->args.size(); i++) { + this->VisitExpr(op->args[i]); + } } else { StmtExprVisitor::VisitExpr_(op); } } void VisitStmt_(const IfThenElseNode *op) final { - // Skip condition + within_condition_expr_ = true; + this->VisitExpr(op->condition); + within_condition_expr_ = false; this->VisitStmt(op->then_case); - conditonal_expr = op->condition; if (op->else_case.defined()) { + within_condition_expr_ = true; 
this->VisitStmt(op->else_case.value()); + within_condition_expr_ = false; } } @@ -160,7 +139,7 @@ class BufferRegionCollector : public StmtExprVisitor { bool is_global_read_ = false; bool under_buffer_store_ = false; bool is_global_copy_pattern_ = false; - PrimExpr conditonal_expr; + bool within_condition_expr_ = false; }; class PipelinePlanner : public StmtExprMutator { @@ -185,23 +164,38 @@ class PipelinePlanner : public StmtExprMutator { * * \param reads Array of buffer regions read by this stage * \param writes Array of buffer regions written by this stage - * \param original_order Original position of this stage in the pipeline + * \param original_stmt_index Original position of this stage in the pipeline * before reordering \param order Current position of this stage in the * pipeline after reordering (-1 if not yet assigned) \param stage Pipeline * stage number this operation belongs to (-1 if not yet assigned) \param * copy_stage Whether this stage is a memory copy operation \param - * last_use_stage Last pipeline stage that uses the results of this stage (-1 - * if not yet determined) + * last_use_stmt_index Index of the last statement (in original order) that + * uses the results of this stage (-1 if not yet determined). This field is + * crucial for pipeline optimization: + * - For copy stages: indicates the index of the last statement that reads + * from the copied data, helping determine optimal placement of copy + * operations + * - Used to ensure copy operations are scheduled before their consumers + * - A value of -1 means no subsequent statement uses this stage's output + * - This information enables better pipeline scheduling by minimizing data + * dependencies and maximizing parallelism */ struct PipelineStageInfo { Array reads, writes; - int original_order; + int original_stmt_index; int order = -1, stage = -1; bool copy_stage = false; - bool prepare_for_condition = false; - int last_use_stage = -1; - // represent the stage is used in a conditional statement - PrimExpr conditonal_expr; + bool producer_for_copy = false; + int last_use_stmt_index = + -1; // Initialized to -1, indicating no consumers found yet + + public: + bool is_first_stage() const { return copy_stage || producer_for_copy; } + bool is_copy_stage() const { return copy_stage; } + bool is_producer_for_copy() const { return producer_for_copy; } + bool is_last_use_stmt_index_valid() const { + return last_use_stmt_index != -1; + } }; PipelineStageInfo MakePipelineStageInfo(Stmt stmt, int idx) { @@ -214,9 +208,8 @@ class PipelinePlanner : public StmtExprMutator { PipelineStageInfo pinfo; pinfo.reads = std::move(collector.GetReads()); pinfo.writes = std::move(collector.GetWrites()); - pinfo.original_order = idx; + pinfo.original_stmt_index = idx; pinfo.copy_stage = collector.GetGlobalCopyPattern(); - pinfo.conditonal_expr = collector.GetConditonalExpr(); return std::move(pinfo); } @@ -224,12 +217,12 @@ class PipelinePlanner : public StmtExprMutator { auto order_anno = loop->annotations.Get("tl_pipeline_order"); auto stage_anno = loop->annotations.Get("tl_pipeline_stage"); auto num_stages_anno = loop->annotations.Get("num_stages"); - if (order_anno.defined() && stage_anno.defined()) { + if (order_anno && stage_anno) { // Check if order_anno or stage_anno contains -1, which means TMA+WS is // enabled bool ws_tma_enabled = false; - auto order_array = Downcast>(order_anno); - auto stage_array = Downcast>(stage_anno); + auto order_array = Downcast>(order_anno.value()); + auto stage_array = 
Downcast>(stage_anno.value()); for (const auto &val : order_array) { if (val->value == -1) { ws_tma_enabled = true; @@ -249,20 +242,20 @@ class PipelinePlanner : public StmtExprMutator { return StmtExprMutator::VisitStmt_(loop); } - Map annotations; + Map annotations; for (const auto &[key, value] : loop->annotations) { if (key != "tl_pipeline_order") { annotations.Set(key, value); } } - annotations.Set(tir::attr::software_pipeline_order, order_anno); + annotations.Set(tir::attr::software_pipeline_order, order_anno.value()); for (const auto &[key, value] : loop->annotations) { if (key != "tl_pipeline_stage") { annotations.Set(key, value); } } - annotations.Set(tir::attr::software_pipeline_stage, stage_anno); + annotations.Set(tir::attr::software_pipeline_stage, stage_anno.value()); if (TargetHasAsyncCopy(target_) && use_async_copy_) annotations.Set(tir::attr::software_pipeline_async_stages, Array{0}); @@ -271,9 +264,9 @@ class PipelinePlanner : public StmtExprMutator { return for_node; } - if (!num_stages_anno.defined()) + if (!num_stages_anno) return StmtExprMutator::VisitStmt_(loop); - int num_stages = num_stages_anno.as()->value; + int num_stages = num_stages_anno->as()->value; Stmt pipeline_body{nullptr}; if (const auto *realize = loop->body.as()) { const auto &block = realize->block; @@ -310,52 +303,150 @@ class PipelinePlanner : public StmtExprMutator { pipeline_stage_infos.push_back(std::move(pinfo)); } - // process the conditional stage - // assign conditional stage (analysis the copy stage) + // For every copy stage, mark all its dependency stages as producer_for_copy + // Helper struct to manage copy stage dependency reads + struct CopyStageDependencyReadsManager { + std::vector regions; + + // Add a region if not already present (by structural equality) + void AddUnique(const BufferRegion ®ion) { + for (const BufferRegion ©_read : regions) { + if (region->buffer.same_as(copy_read->buffer)) { + return; + } + } + regions.push_back(region); + } + + // Check if a region is present (by structural equality) + bool Contains(const BufferRegion ®ion) const { + for (const BufferRegion ©_read : regions) { + if (region->buffer.same_as(copy_read->buffer)) { + return true; + } + } + return false; + } + + size_t Size() const { return regions.size(); } + }; + + CopyStageDependencyReadsManager copy_stage_dependency_reads_mgr; + + // Step 1. Collect Copy reads + for (const auto &pinfo : pipeline_stage_infos) { + if (pinfo.is_copy_stage()) { + for (const BufferRegion &read : pinfo.reads) { + copy_stage_dependency_reads_mgr.AddUnique(read); + } + } + } + + // Step 2. find if pinfo write the copy reads, then update the + // copy_stage_dependency_reads To prevent infinite loops, we set a maximum + // number of iterations. In theory, the number of possible updates is + // bounded by the number of pipeline stages, since each stage can only be + // marked as producer_for_copy once, and each read can only be added once. + // But for safety, we add a hard limit. 
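+    // Illustration: if a copy stage reads `idx_local`, the earlier stage
+    // writing `idx_local` is marked producer_for_copy and its own reads
+    // (say `offset_local`) join the dependency set, so the stage writing
+    // `offset_local` is marked on a later sweep; the marking is transitive
+    // over the producer chain feeding each copy.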
+ const size_t max_iterations = (pipeline_stage_infos.size() * 4) + 16; + size_t iter_count = 0; + for (auto &pinfo : pipeline_stage_infos) { - for (const auto &write : pinfo.writes) { - for (const auto &other : pipeline_stage_infos) { - if (other.conditonal_expr.defined()) { - auto check_var = [&](const ObjectRef &n) { - if (const auto *buffer_load = n.as()) { - if (buffer_load->buffer == write->buffer) { - pinfo.prepare_for_condition = true; - } + if (!pinfo.is_copy_stage()) { + continue; + } + auto original_copy_stmt_index = pinfo.original_stmt_index; + bool updated = true; + while (updated) { + updated = false; + for (auto &pinfo_inner : pipeline_stage_infos) { + if (pinfo_inner.is_copy_stage()) { + continue; + } + if (pinfo_inner.original_stmt_index >= original_copy_stmt_index) { + break; + } + + bool should_prepare = false; + for (const BufferRegion &write : pinfo_inner.writes) { + if (copy_stage_dependency_reads_mgr.Contains(write)) { + should_prepare = true; + break; + } + } + if (should_prepare && !pinfo_inner.is_producer_for_copy()) { + pinfo_inner.producer_for_copy = true; + updated = true; + } + if (should_prepare) { + for (const BufferRegion &read : pinfo_inner.reads) { + size_t before = copy_stage_dependency_reads_mgr.Size(); + copy_stage_dependency_reads_mgr.AddUnique(read); + if (copy_stage_dependency_reads_mgr.Size() > before) { + updated = true; } - }; - PostOrderVisit(other.conditonal_expr, check_var); + } } } + iter_count++; + if (iter_count > max_iterations) { + LOG(FATAL) + << "Pipeline planning: Exceeded maximum iterations (" + << max_iterations << ") in copy stage dependency propagation. " + << "This may indicate a cyclic or pathological dependency graph."; + } } } - // analysis use-def chain + // Analysis use-def chain to determine last_use_stmt_index for copy + // operations This step is critical for pipeline optimization as it + // identifies the index of the last statement that consumes data produced by + // copy stages, enabling optimal placement of copy operations in the + // pipeline schedule. 
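+    // Example: if statement 2 is a global->shared copy whose result is read
+    // by statements 4 and 6, its last_use_stmt_index becomes 6; the
+    // ordering step below then places the copy right after statement 6 and
+    // assigns it stage 0, so the copy prefetches data for a later pipeline
+    // iteration.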
for (auto &pinfo : pipeline_stage_infos) { - for (int i = pinfo.original_order + 1; + // Only analyze copy stages (memory copy operations) + if (!pinfo.is_first_stage()) + continue; + + // Check all subsequent statements to find the latest consumer + for (int i = pinfo.original_stmt_index + 1; i < static_cast(pipeline_body_seq->size()); i++) { - if (!pinfo.copy_stage) - continue; + + // Check if any read operation in statement 'i' uses data written by + // this copy stage for (const BufferRegion &read : pipeline_stage_infos[i].reads) { + // Look for overlapping buffer regions between this stage's writes and + // stage 'i's reads if (std::find_if(pinfo.writes.begin(), pinfo.writes.end(), [&](const BufferRegion &r) { return r->buffer == read->buffer && MayConflict(r->region, read->region); }) != pinfo.writes.end()) { - pinfo.last_use_stage = std::max(pinfo.last_use_stage, i); + // Update last_use_stmt_index to the maximum (latest) statement + // index that uses this data This ensures we capture the final + // consumer of the copied data + pinfo.last_use_stmt_index = std::max(pinfo.last_use_stmt_index, i); } } - for (const BufferRegion &write : pipeline_stage_infos[i].writes) { - if (std::find_if(pinfo.writes.begin(), pinfo.writes.end(), - [&](const BufferRegion &r) { - return r->buffer == write->buffer && - MayConflict(r->region, write->region); - }) != pinfo.writes.end()) { - LOG(FATAL) << "Pipeline planning error: Multiple writes to " - "overlapping buffer regions detected. " - << "Stage " << pinfo.original_order << " and stage " << i - << " are both writing to buffer '" << write->buffer->name - << "' with overlapping regions. This is not supported " - "in pipeline planning."; + // Check for write-after-write conflicts (multiple stages writing to + // same buffer region) This is important for pipeline correctness and + // affects last_use_stmt_index analysis + if (pinfo.is_copy_stage()) { + for (const BufferRegion &write : pipeline_stage_infos[i].writes) { + if (std::find_if(pinfo.writes.begin(), pinfo.writes.end(), + [&](const BufferRegion &r) { + return r->buffer == write->buffer && + MayConflict(r->region, write->region); + }) != pinfo.writes.end()) { + LOG(FATAL) << "Pipeline planning error: Multiple writes to " + "overlapping buffer regions detected. " + << "Stage " << pinfo.original_stmt_index + << " and stage " << i + << " are both writing to buffer '" + << write->buffer->name + << "' with overlapping regions. This is not supported " + "in pipeline planning."; + } } } } @@ -363,14 +454,16 @@ class PipelinePlanner : public StmtExprMutator { // Making stages and orders int order_idx = 0; - // Create pipeline stages and assign order + // Stage 1. Create pipeline stages and assign order for (auto &pinfo : pipeline_stage_infos) { // Skip elements that must be in first stage: - // 1. Copy stages (with active last_use_stage) - // 2. Condition preparation stages - if ((pinfo.copy_stage && pinfo.last_use_stage != -1) || - pinfo.prepare_for_condition) + // 1. Copy stages (with active last_use_stmt_index) - these need special + // handling + // because they have consumers that depend on their data + // 2. All Producer stages for copy stages. 
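+      // Stages skipped here get their order assigned when their last
+      // consumer is processed (see the inner loop below); a first-stage op
+      // with no recorded consumer falls through and is scheduled like an
+      // ordinary stage.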
+ if (pinfo.is_first_stage() && pinfo.is_last_use_stmt_index_valid()) { continue; + } // Main logic stage assignment: // - Increment order index @@ -378,34 +471,15 @@ class PipelinePlanner : public StmtExprMutator { pinfo.order = order_idx++; pinfo.stage = num_stages; + // Schedule copy stages that have this stage as their last consumer + // This ensures copy operations are placed right before their final + // consumer for optimal pipeline efficiency for (auto &pinfo_1 : pipeline_stage_infos) { - if ((pinfo_1.copy_stage && - pinfo_1.last_use_stage == pinfo.original_order)) { + if ((pinfo_1.is_first_stage() && + pinfo_1.last_use_stmt_index == pinfo.original_stmt_index)) { pinfo_1.order = order_idx++; - pinfo_1.stage = 0; - } - } - } - - // Handle trailing unassigned copy stages: - // These are typically final copy operations needing post-main-stage - // insertion - auto &head_pinfo = pipeline_stage_infos.at(0); - int unassigned_order_elem = -1; - - // Process dependent copy stages: - // Insert copy stages after current stage but assign to stage 0 - // and adjust the order index - for (auto &pinfo : pipeline_stage_infos) { - if (pinfo.order == unassigned_order_elem) { - pinfo.order = unassigned_order_elem++; - // traverse the from the next info - for (auto it = pipeline_stage_infos.begin() + unassigned_order_elem; - it != pipeline_stage_infos.end(); it++) { - it->order += 1; + pinfo_1.stage = 0; // Copy stages are typically assigned to stage 0 } - pinfo.stage = 0; - order_idx++; } } @@ -415,14 +489,14 @@ class PipelinePlanner : public StmtExprMutator { << "Got " << order_idx << " stages and " << pipeline_stage_infos.size() << " pipeline stages."; - // if all the copy is at the end of the order, we can move these copy to the - // beginning of the order and shrink the stage offset by 1. + // Step 2. if all the copy is at the end of the order, we can move these + // copy to the beginning of the order and shrink the stage offset by 1. int copy_stage_at_end = [&]() { int copy_stage_cnt = 0; int copy_order_min = pipeline_stage_infos.size(); int non_copy_order_max = 0; for (auto &pinfo : pipeline_stage_infos) { - if (pinfo.copy_stage || pinfo.prepare_for_condition) { + if (pinfo.is_first_stage()) { copy_stage_cnt++; copy_order_min = std::min(copy_order_min, pinfo.order); } else { @@ -437,13 +511,13 @@ class PipelinePlanner : public StmtExprMutator { for (auto &pinfo : pipeline_stage_infos) { // move copy to the beginning pinfo.order = (pinfo.order + copy_stage_at_end) % pipeline_stage_infos.size(); - if (!pinfo.copy_stage && !pinfo.prepare_for_condition) + if (!pinfo.is_copy_stage() && !pinfo.is_producer_for_copy()) pinfo.stage--; } } // Finally, make the pipeline annotation - Map annotations; + Map annotations; for (const auto &[key, value] : loop->annotations) { if (key != "num_stages") { annotations.Set(key, value); @@ -496,8 +570,10 @@ tvm::transform::Pass PipelinePlanning() { return CreatePrimFuncPass(pass_func, 0, "tl.PipelinePlanning", {}); } -TVM_REGISTER_GLOBAL("tl.transform.PipelinePlanning") - .set_body_typed(PipelinePlanning); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.PipelinePlanning", PipelinePlanning); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/simplify.cc b/src/transform/simplify.cc index bdde70ad2..0cc6baf87 100644 --- a/src/transform/simplify.cc +++ b/src/transform/simplify.cc @@ -1,8 +1,10 @@ /*! * \file simplify.cc - * \brief Remove useless parameters of TL PrimFunc. 
+ * \brief Statement simplifier based on analyzer and remove useless parameters + * of TL PrimFunc. */ +#include #include #include #include @@ -19,39 +21,45 @@ namespace tl { using namespace tir; using namespace arith; -struct SimplifyConfigNode : public tvm::AttrsNode { +struct SimplifyConfigNode : public AttrsNodeReflAdapter { bool transitively_prove_inequalities; bool propagate_knowns_to_prove_conditional; bool propagate_knowns_to_simplify_expressions; bool convert_boolean_to_and_of_ors; bool apply_constraints_to_boolean_branches; - TVM_DECLARE_ATTRS(SimplifyConfigNode, "tl.transform.SimplifyConfig") { - TVM_ATTR_FIELD(transitively_prove_inequalities) - .describe("If true, simplify conditionals with transitive combinations " - "of scoped constraints") - .set_default(false); - - TVM_ATTR_FIELD(propagate_knowns_to_prove_conditional) - .describe("If true, known buffer values are propagated and used to " - "statically prove conditionals") - .set_default(false); - - TVM_ATTR_FIELD(propagate_knowns_to_simplify_expressions) - .describe("If true, known buffer values are propagated and used to " - "replace BufferLoad wherever " - "possible") - .set_default(false); - - TVM_ATTR_FIELD(convert_boolean_to_and_of_ors) - .describe("If true, simplify conditionals into an AND of ORs") - .set_default(false); - - TVM_ATTR_FIELD(apply_constraints_to_boolean_branches) - .describe("If true, simplify each branch of AND/OR " - "under a constraints provided by the other branch") - .set_default(false); + static void RegisterReflection() { + namespace refl = tvm::ffi::reflection; + refl::ObjectDef() + .def_ro("transitively_prove_inequalities", + &SimplifyConfigNode::transitively_prove_inequalities, + "If true, simplify conditionals with transitive combinations " + "of scoped constraints", + refl::DefaultValue(false)) + .def_ro("propagate_knowns_to_prove_conditional", + &SimplifyConfigNode::propagate_knowns_to_prove_conditional, + "If true, known buffer values are propagated and used to " + "statically prove conditionals", + refl::DefaultValue(false)) + .def_ro("propagate_knowns_to_simplify_expressions", + &SimplifyConfigNode::propagate_knowns_to_simplify_expressions, + "If true, known buffer values are propagated and used to " + "replace BufferLoad wherever " + "possible", + refl::DefaultValue(false)) + .def_ro("convert_boolean_to_and_of_ors", + &SimplifyConfigNode::convert_boolean_to_and_of_ors, + "If true, simplify conditionals into an AND of ORs", + refl::DefaultValue(false)) + .def_ro("apply_constraints_to_boolean_branches", + &SimplifyConfigNode::apply_constraints_to_boolean_branches, + "If true, simplify each branch of AND/OR under a constraints " + "provided by the other " + "branch", + refl::DefaultValue(false)); } + static constexpr const char *_type_key = "tl.transform.SimplifyConfig"; + TVM_FFI_DECLARE_FINAL_OBJECT_INFO(SimplifyConfigNode, BaseAttrsNode); RewriteSimplifier::Extension GetEnabledExtensions() const { RewriteSimplifier::Extension flags = RewriteSimplifier::kNone; @@ -200,6 +208,7 @@ class SimplifyConfig : public Attrs { TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(SimplifyConfig, Attrs, SimplifyConfigNode); }; +TVM_FFI_STATIC_INIT_BLOCK({ SimplifyConfigNode::RegisterReflection(); }); TVM_REGISTER_NODE_TYPE(SimplifyConfigNode); TVM_REGISTER_PASS_CONFIG_OPTION("tl.Simplify", SimplifyConfig); @@ -207,7 +216,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tl.Simplify", SimplifyConfig); class StmtSimplifier : public IRMutatorWithAnalyzer { public: static PrimFunc Apply(PrimFunc func, Analyzer *analyzer, - 
Optional config_opt = NullOpt, + Optional config_opt = std::nullopt, bool simplify_arguments = false) { auto config = config_opt.value_or(AttrsWithDefaultValues()); analyzer->rewrite_simplify.SetEnabledExtensions( @@ -229,6 +238,7 @@ class StmtSimplifier : public IRMutatorWithAnalyzer { // Begin to remove useless var and buffer // First get used buffers simplifier.used_buffers_ = CollectUsedBuffers(func); + bool param_updated = false; Array new_params; Map new_buffer_map; @@ -239,13 +249,18 @@ class StmtSimplifier : public IRMutatorWithAnalyzer { simplifier.used_buffers_.end()) { new_params.push_back(var); new_buffer_map.Set(var, func->buffer_map[var]); + } else if (simplifier.used_in_buffer_def_.find( + func->buffer_map[var]->data.get()) != + simplifier.used_in_buffer_def_.end()) { + new_params.push_back(var); + new_buffer_map.Set(var, func->buffer_map[var]); } else { param_updated = true; } } } - if (simplify_arguments && param_updated) { + if (param_updated) { return PrimFunc(new_params, func.CopyOnWrite()->body, func->ret_type, new_buffer_map, func->attrs, func->span); } else { @@ -444,7 +459,7 @@ class StmtSimplifier : public IRMutatorWithAnalyzer { arith::ProofStrength::kSymbolicBound)) { return Bool(true); } - return NullOpt; + return std::nullopt; } } @@ -452,7 +467,7 @@ class StmtSimplifier : public IRMutatorWithAnalyzer { std::optional touch_pattern_; Map non_inlined_bindings_; - Optional current_stmt_{NullOpt}; + Optional current_stmt_{std::nullopt}; std::unordered_set used_in_buffer_def_; std::unordered_set used_vars_; std::unordered_set used_buffers_; @@ -469,7 +484,10 @@ tvm::transform::Pass Simplify(bool simplify_arguments = true) { return CreatePrimFuncPass(pass_func, 0, "tl.Simplify", {}); } -TVM_REGISTER_GLOBAL("tl.transform.Simplify").set_body_typed(Simplify); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.Simplify", Simplify); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/storage_rewrite.cc b/src/transform/storage_rewrite.cc new file mode 100644 index 000000000..56d9d4ac0 --- /dev/null +++ b/src/transform/storage_rewrite.cc @@ -0,0 +1,1968 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file storage_rewrite.cc + * \brief Memory access pattern analysis and optimization. + * Re-write data access to enable memory sharing when possible. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "arith/int_operator.h" +#include "runtime/thread_storage_scope.h" +#include "tir/ir/buffer_common.h" +#include "tir/transforms/ir_utils.h" + +namespace tvm { +namespace tl { + +using runtime::StorageRank; +using runtime::StorageScope; +using namespace tir; + +/*! + * \brief Perform data type legalization on the given BufferLoadNode pointer. + * Equal to BufferLoadNode::LegalizeDType, but operates on a pointer. + * \param n A pointer to a writable BufferLoadNode. + */ +static void LegalizeBufferLoadDType(BufferLoadNode *n) { + // Check that all indices except the last one have a scalar dtype + for (int i = 0; i < static_cast(n->indices.size()) - 1; i++) { + ICHECK(n->indices[i].dtype().is_scalar()) + << "Only the last index of a buffer access may be a vector type."; + } + + // If there are no indices, set the dtype to the buffer's dtype + if (n->indices.empty()) { + n->dtype = n->buffer->dtype; + } else { + auto index_dtype = n->indices.back().dtype(); + bool is_buffer_dtype_scalable = n->buffer->dtype.is_scalable_vector(); + bool is_index_scalable = index_dtype.is_scalable_vector(); + + // Do not allow both index dtype and buffer dtype to be scalable vectors + ICHECK(!(is_index_scalable && is_buffer_dtype_scalable)) + << "Index dtype and buffer dtype cannot both be scalable."; + + if (is_index_scalable) { + // Index is a scalable vector, while the buffer is not + n->dtype = n->buffer->dtype.with_scalable_vscale_factor( + index_dtype.vscale_factor() * n->buffer->dtype.lanes()); + } else if (is_buffer_dtype_scalable) { + // The buffer is a scalable vector, while the index is not + n->dtype = n->buffer->dtype.with_scalable_vscale_factor( + n->buffer->dtype.vscale_factor() * index_dtype.lanes()); + } else { + // Neither side is a scalable vector, multiply lanes + n->dtype = n->buffer->dtype.with_lanes(index_dtype.lanes() * + n->buffer->dtype.lanes()); + } + } +} + +/*! + * \brief collect the mapping from the buffer var to its allocate + */ +class AllocateCollector : public StmtExprVisitor { +private: + bool IsDynamicSharedMemory(Var buffer_var) { + StorageScope storage_scope = + runtime::StorageScope::Create(GetPtrStorageScope(buffer_var)); + return storage_scope.rank == runtime::StorageRank::kShared && + storage_scope.tag == ".dyn"; + } + + bool IsStaticSharedMemory(Var buffer_var) { + StorageScope storage_scope = + runtime::StorageScope::Create(GetPtrStorageScope(buffer_var)); + return storage_scope.rank == runtime::StorageRank::kShared && + storage_scope.tag == ""; + } + +public: + void VisitStmt_(const AllocateNode *op) final { + if (IsDynamicSharedMemory(op->buffer_var)) { + dyn_shmem_allocs_[op->buffer_var.get()] = op; + } else if (IsStaticSharedMemory(op->buffer_var)) { + static_shmem_allocs_[op->buffer_var.get()] = op; + } + StmtExprVisitor::VisitStmt_(op); + } + // The dynamic mapping from the original buffer var to its allocate + std::unordered_map dyn_shmem_allocs_; + // The static mapping from the original buffer var to its allocate + std::unordered_map + static_shmem_allocs_; +}; + +// Find a linear pattern of storage access +// Used for liveness analysis. +// Composite scopes(loop/thread_launch/IfThen) is represented by two points: +// before_scope -> scope_body -> after_scope +// +// The linear_seq_ stores before_scope and after_scope. +// The access to the arrays are stored at the after_scope point. 
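//
// Worked example (illustrative): a single For scope contributes a pair of
// entries to linear_seq_,
//   [k] before_scope(for)   scope_pair_offset = +(m - k)
//   [m] after_scope(for)    scope_pair_offset = -(m - k)
// with the entries for the loop body in between, so either end of a scope
// reaches its partner by adding scope_pair_offset to its own index. Touches
// of a buffer are recorded at the scope level where that buffer was
// allocated, which is what makes the after_scope entry the point where they
// become visible to the liveness scan.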
+// +// Define "scope" as the body of For/thread_launch/IfThenElse +// This pass tries to detect last point that we need to keep memory +// alive under the same scope as allocate. +// The storage need to be kept alive between allocate and last access. +// The free point is only inserted at the same scope of allocate. +// +class LinearAccessPatternFinder final : public StmtExprVisitor { +public: + /*! \brief record the touch hist of statement. */ + struct StmtEntry { + // The statement + const Object *stmt; + // The index in the linear_seq_ to point to end of the nested scope. + // This is only set to non-zero if stmt is a nested scope. + // if offset > 0, means this is the begin, the end entry is current_index + + // offset if offset < 0, means this is the end, the begin entry is + // current_index + offset + int64_t scope_pair_offset{0}; + // The buffer variables this statement touched. + std::vector touched; + }; + // The scope of each allocation + struct AllocEntry { + // The physical dimension of the allocation. + size_t num_physical_dimensions{0}; + // scope level + size_t level{0}; + // allocation stmt + const AllocateNode *alloc{nullptr}; + }; + + void VisitStmt_(const AllocateNode *op) final { + size_t level = scope_.size(); + const VarNode *buf = op->buffer_var.get(); + + AllocEntry entry; + entry.alloc = op; + entry.level = level; + // Since StorageRewrite occurs after StorageFlatten/FlattenBuffer, + // all allocations specify the extent of physical dimensions, and + // is 1 for flat memory spaces. + entry.num_physical_dimensions = op->extents.size(); + alloc_info_[buf] = entry; + + StmtExprVisitor::VisitStmt_(op); + } + + void VisitStmt_(const BufferStoreNode *op) final { + scope_.push_back(StmtEntry()); + // visit subexpr + StmtExprVisitor::VisitStmt_(op); + all_buffers_accessed_.insert(op->buffer.get()); + + // Add write access. + const VarNode *buffer_var = op->buffer->data.get(); + auto it = alloc_info_.find(buffer_var); + if (it != alloc_info_.end() && it->second.alloc) { + ICHECK_LT(it->second.level, scope_.size()); + scope_[it->second.level].touched.push_back(buffer_var); + + ICHECK_EQ(op->buffer->axis_separators.size() + 1, + it->second.num_physical_dimensions) + << "Buffer " << op->buffer->name << " is allocated with " + << it->second.num_physical_dimensions + << " physical dimensions, but is accessed as having " + << op->buffer->axis_separators.size() + 1 << " physical dimensions" + << std::endl; + } + StmtEntry e = scope_.back(); + scope_.pop_back(); + if (e.touched.size() != 0) { + e.stmt = op; + linear_seq_.push_back(e); + } + } + + void VisitExpr_(const BufferLoadNode *op) final { + // Add write access. 
+ StmtExprVisitor::VisitExpr_(op); + + all_buffers_accessed_.insert(op->buffer.get()); + + const VarNode *buffer_var = op->buffer->data.get(); + auto it = alloc_info_.find(buffer_var); + if (it != alloc_info_.end() && it->second.alloc) { + ICHECK_LT(it->second.level, scope_.size()) + << "Load memory in places other than store."; + scope_[it->second.level].touched.push_back(buffer_var); + + ICHECK_EQ(op->buffer->axis_separators.size() + 1, + it->second.num_physical_dimensions) + << "Buffer " << op->buffer->name << " is allocated with " + << it->second.num_physical_dimensions + << " physical dimensions, but is accessed as having " + << op->buffer->axis_separators.size() + 1 << " physical dimensions" + << std::endl; + } + } + + void VisitStmt_(const EvaluateNode *op) final { + scope_.push_back(StmtEntry()); + // visit subexpr + StmtExprVisitor::VisitStmt_(op); + StmtEntry e = scope_.back(); + scope_.pop_back(); + if (e.touched.size() != 0) { + e.stmt = op; + linear_seq_.push_back(e); + } + } + + void VisitExpr_(const VarNode *buf) final { + // Directly reference to the variable count as a read. + auto it = alloc_info_.find(buf); + if (it != alloc_info_.end() && it->second.alloc) { + ICHECK_LT(it->second.level, scope_.size()) << " buf=" << buf->name_hint; + scope_[it->second.level].touched.push_back(buf); + } + } + + template void VisitNewScope(const T *op) { + scope_.push_back(StmtEntry()); + StmtEntry e; + e.stmt = op; + int64_t begin_index = static_cast(linear_seq_.size()); + // before scope. + linear_seq_.push_back(e); + StmtExprVisitor::VisitStmt_(op); + // after scope. + e.touched = std::move(scope_.back().touched); + scope_.pop_back(); + int64_t end_index = static_cast(linear_seq_.size()); + ICHECK_GT(end_index, begin_index); + e.scope_pair_offset = begin_index - end_index; + linear_seq_.push_back(e); + // record the pointer to end index. + ICHECK_NE(end_index, 0U); + linear_seq_[begin_index].scope_pair_offset = end_index - begin_index; + } + + void VisitStmt_(const AttrStmtNode *op) final { + // Only record the outer most thread extent. + if (op->attr_key == tir::attr::thread_extent && !in_thread_env_) { + in_thread_env_ = true; + VisitNewScope(op); + in_thread_env_ = false; + } else if (op->attr_key == tir::attr::extern_scope) { + VisitNewScope(op); + } else if (op->attr_key == tir::attr::virtual_thread) { + VisitNewScope(op); + } else { + StmtExprVisitor::VisitStmt_(op); + } + } + + void VisitStmt_(const IfThenElseNode *op) final { VisitNewScope(op); } + + void VisitStmt_(const ForNode *op) final { VisitNewScope(op); } + + void VisitStmt_(const WhileNode *op) final { VisitNewScope(op); } + + void VisitStmt_(const AssertStmtNode *op) final { VisitNewScope(op); } + + void VisitStmt_(const LetStmtNode *op) final { VisitNewScope(op); } + + // linearized access sequence. + std::vector linear_seq_; + // The storage scope of each buffer + std::unordered_map alloc_info_; + // A record of which Buffer objects have been accessed, to prune + // unused DeclBuffer instances. + std::unordered_set all_buffers_accessed_; + +private: + // Whether already in thread env. + bool in_thread_env_{false}; + // The scope stack. 
+ std::vector scope_; +}; + +// Verify if the statement can be run safely via inplace fashion +// +// Detect pattern: dst[index] = f(src[index]) +// +// WARNING: the current detection algorithm cannot handle the case +// when a location in an array is written multiple times +// +// For example, the following program will pass the check, +// but we cannot make A and B to be the same array. +// +// A[0] = B[0] + 1 +// A[0] = B[0] + 1 +// +// The high level code generator needs to ensure that the generated +// code only write each location of the target array once. +// +// This is the case with IR generated by the current compute schedule. +// We explicitly return false if we find there is an extern block +// which can be arbitrary IR. +// +// Neve-the-less, inplace detector should be used with care in mind. +// We may also consider introduce a condition checker that checks +// if every index only visited once for an absolute sufficient condition. +// +// The code after inplace transformation is no longer idempotent. +// +class InplaceOpVerifier : public StmtExprVisitor { +public: + bool Check(const Object *stmt, const VarNode *dst, const VarNode *src) { + dst_ = dst; + src_ = src; + result_ = true; + if (stmt->IsInstance()) { + VisitStmt_(static_cast(stmt)); + } else if (stmt->IsInstance()) { + VisitStmt_(static_cast(stmt)); + } else if (stmt->IsInstance()) { + VisitStmt_(static_cast(stmt)); + } else if (stmt->IsInstance()) { + VisitStmt_(static_cast(stmt)); + } else if (stmt->IsInstance()) { + VisitStmt_(static_cast(stmt)); + } else { + return false; + } + return result_; + } + + using StmtExprVisitor::VisitStmt_; + + void VisitStmt(const Stmt &n) final { + if (!result_) + return; + StmtExprVisitor::VisitStmt(n); + } + void VisitExpr(const PrimExpr &n) final { + if (!result_) + return; + StmtExprVisitor::VisitExpr(n); + } + + void VisitExpr_(const VarNode *op) final { + // assume all opaque access is unsafe + if (op == dst_ || op == src_) { + result_ = false; + return; + } + } + + void VisitStmt_(const BufferStoreNode *op) final { + ++mem_nest_; + for (const auto &index : op->indices) { + this->VisitExpr(index); + } + --mem_nest_; + if (op->buffer->data.get() == dst_) { + store_ = op; + this->VisitExpr(op->value); + store_ = nullptr; + } else { + this->VisitExpr(op->value); + } + } + + void VisitStmt_(const AttrStmtNode *op) final { + // always reject extern code + if (op->attr_key == tir::attr::extern_scope || + op->attr_key == tir::attr::volatile_scope) { + result_ = false; + return; + } + StmtExprVisitor::VisitStmt_(op); + } + + void VisitExpr_(const BufferLoadNode *op) final { + const VarNode *buf = op->buffer->data.get(); + // cannot read from dst_ (no reduction) + if (buf == dst_) { + result_ = false; + return; + } + // do not allow indirect memory load + if (mem_nest_ != 0) { + result_ = false; + return; + } + if (src_ == buf) { + if (store_ == nullptr || store_->value.dtype() != op->dtype) { + result_ = false; + return; + } + ICHECK_EQ(store_->indices.size(), op->indices.size()) + << "Store/Load occur to the same buffer " << buf->name_hint + << " with differing number of indices"; + for (size_t i = 0; i < store_->indices.size(); i++) { + if (!tir::ExprDeepEqual()(store_->indices[i], op->indices[i])) { + result_ = false; + return; + } + } + } + ++mem_nest_; + StmtExprVisitor::VisitExpr_(op); + --mem_nest_; + } + +private: + // result of the check + bool result_{true}; + // destination memory + const VarNode *dst_; + // source variable + const VarNode *src_; + // counter of load, + // 
it is not safe to inplace when there is nested load like A[B[i]] + int mem_nest_{0}; + // The current store to be inspected + const BufferStoreNode *store_{nullptr}; +}; + +/* \brief Rewrite and merge memory allocation. + * + * Using LinearAccessPatternFinder, determines which buffers could share an + * allocation. This includes both sequential usage of the same buffer and + * merging small allocations at the same scope into a single larger allocation. + * The merging of small allocations requires the codegen to cast the resulting + * value from the storage type to the output type after access. + */ +class StoragePlanRewriter : public StmtExprMutator { +public: + using StmtEntry = LinearAccessPatternFinder::StmtEntry; + using AllocEntry = LinearAccessPatternFinder::AllocEntry; + + Stmt Rewrite(Stmt stmt, bool detect_inplace, bool enable_reuse, + bool reuse_require_exact_matched_dtype) { + detect_inplace_ = detect_inplace; + // plan the rewrite + LinearAccessPatternFinder finder; + finder(stmt); + this->LivenessAnalysis(finder.linear_seq_); + this->PlanMemory(finder.linear_seq_, finder.alloc_info_, enable_reuse, + reuse_require_exact_matched_dtype); + all_buffers_accessed_ = finder.all_buffers_accessed_; + this->PrepareNewAlloc(); + // start rewrite + stmt = operator()(std::move(stmt)); + if (attach_map_.count(nullptr)) { + return MakeAttach(attach_map_.at(nullptr), stmt); + } + return stmt; + } + + template Node VisitBufferAccess(Node node) { + auto it = alloc_map_.find(node->buffer->data.get()); + if (it != alloc_map_.end()) { + Buffer buf = RemapBuffer(node->buffer, it->second->alloc_var); + + Array indices = node->indices; + indices.Set(indices.size() - 1, + RemapIndex(node->buffer->dtype, indices[indices.size() - 1], + it->second)); + + auto writer = node.CopyOnWrite(); + writer->buffer = buf; + writer->indices = indices; + } + return node; + } + + Buffer RemapBuffer(Buffer buf, Var new_backing_array) { + auto key = buf.get(); + auto it = buffer_remap_.find(key); + if (it != buffer_remap_.end()) { + ICHECK_EQ(it->second->data.get(), new_backing_array.get()) + << "Cannot remap buffer " << buf->name << " to use backing array " + << new_backing_array->name_hint << ", previously used backing array " + << it->second->data->name_hint; + return it->second; + } + + Buffer remapped = Buffer( + new_backing_array, buf->dtype, buf->shape, buf->strides, + buf->elem_offset, new_backing_array->name_hint, buf->data_alignment, + buf->offset_factor, buf->buffer_type, buf->axis_separators, buf->span); + buffer_remap_[key] = remapped; + return remapped; + } + + Stmt VisitStmt_(const BufferStoreNode *op) final { + auto node = Downcast(StmtExprMutator::VisitStmt_(op)); + return VisitBufferAccess(std::move(node)); + } + + PrimExpr VisitExpr_(const BufferLoadNode *op) final { + auto node = Downcast(StmtExprMutator::VisitExpr_(op)); + return VisitBufferAccess(std::move(node)); + } + + PrimExpr VisitExpr_(const VarNode *op) final { + auto it = alloc_map_.find(op); + if (it != alloc_map_.end()) { + if (it->second->bits_offset != 0) { + LOG(WARNING) + << "Use a merged buffer variable address, could cause error"; + } + return it->second->alloc_var; + } else { + return GetRef(op); + } + } + PrimExpr VisitExpr_(const CallNode *op) final { + if (op->op.same_as(builtin::tvm_access_ptr())) { + ICHECK_EQ(op->args.size(), 5U); + DataType dtype = op->args[0].dtype(); + const VarNode *buffer = op->args[1].as(); + auto it = alloc_map_.find(buffer); + if (it == alloc_map_.end()) { + return StmtExprMutator::VisitExpr_(op); + 
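// Offset remapping sketch (illustrative numbers): if a float16 buffer was
// folded into a merged allocation at bits_offset = 2048, then elem_bits = 16
// and the tvm_access_ptr offset handled below is shifted by 2048 / 16 = 128
// elements of the merged backing buffer.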
} + const StorageEntry *se = it->second; + PrimExpr offset = this->VisitExpr(op->args[2]); + PrimExpr extent = this->VisitExpr(op->args[3]); + uint64_t elem_bits = dtype.bits() * dtype.lanes(); + ICHECK_EQ(se->bits_offset % elem_bits, 0U); + if (se->bits_offset != 0) { + offset = + make_const(offset.dtype(), se->bits_offset / elem_bits) + offset; + } + return Call(op->dtype, op->op, + {op->args[0], se->alloc_var, offset, extent, op->args[4]}); + } else { + return StmtExprMutator::VisitExpr_(op); + } + } + + Stmt VisitStmt_(const AttrStmtNode *op) final { + if (op->attr_key == tir::attr::thread_extent || + op->attr_key == tir::attr::virtual_thread || + tir::attr::IsPragmaKey(op->attr_key)) { + // remake all the allocation at the attach scope. + if (attach_map_.count(op)) { + auto &svec = attach_map_[op]; + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + return AttrStmt(op->node, op->attr_key, op->value, + MakeAttach(svec, op->body)); + } else { + return StmtExprMutator::VisitStmt_(op); + } + } else if (op->attr_key == tir::attr::volatile_scope) { + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + auto it = alloc_map_.find(op->node.as()); + if (it == alloc_map_.end()) + return stmt; + return AttrStmt(it->second->alloc_var, op->attr_key, op->value, op->body); + } else { + return StmtExprMutator::VisitStmt_(op); + } + } + + Stmt VisitStmt_(const ForNode *op) final { + ICHECK(op->kind != ForKind::kVectorized) + << "VectorizeLoop before LiftStorageAlloc"; + // remake all the allocation at the attach scope. + if (attach_map_.count(op)) { + auto &svec = attach_map_[op]; + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + return For(op->loop_var, op->min, op->extent, op->kind, + MakeAttach(svec, op->body), op->thread_binding, + op->annotations); + } else { + return StmtExprMutator::VisitStmt_(op); + } + } + + Stmt VisitStmt_(const AllocateNode *op) final { + return this->VisitStmt(op->body); + } + + Stmt VisitStmt_(const DeclBufferNode *op) final { + if (hoisted_buffer_decls_.count(op->buffer.get()) || + !all_buffers_accessed_.count(op->buffer.get())) { + return this->VisitStmt(op->body); + } + auto node = Downcast(StmtExprMutator::VisitStmt_(op)); + + if (auto it = alloc_map_.find(op->buffer->data.get()); + it != alloc_map_.end()) { + Buffer buf = RemapBuffer(op->buffer, it->second->alloc_var); + node.CopyOnWrite()->buffer = buf; + } + return std::move(node); + } + +private: + struct StorageEntry { + // The scope that this alloc attaches after + // For shared/local memory it is beginning of the thread extent. + // for global memory it is nullptr, means beginning of everything. + const Object *attach_scope_{nullptr}; + // The constant size of the buffer in bits, only used if it is constant + uint64_t const_nbits{0}; + // The storage scope. + StorageScope scope; + // The physical dimensionality of the allocations. Since + // StorageRewrite is applied after StorageFlatten/FlattenBuffer, + // this is size of `AllocateNode::extents`. If moved + size_t ndim; + // Allocs that shares this entry. + std::vector allocs; + // The children of this entry, not including itself. + std::vector merged_children; + // The replacement Allocate, if any. May also include associated + // DeclBuffer statement. + std::vector alloc_nest; + // The var expr of new allocation. + Var alloc_var; + // The allocation element type. 
+ DataType elem_type; + // This is non-zero if this allocate is folded into another one + // the address(in bits) becomes alloc_var + bits_offset; + // can be effectively converted to the element type. + // We need to convert bit_offset to offset of specific element type later. + // + // We use bits(instead of bytes) to support non-conventional indexing in + // hardware. When we are merging buffer together, the bits_offset are set to + // be aligned to certain value given by the max_simd_bits property of the + // special memory. + // + // This allows effective sharing among different types as long as their + // alignment requirement fits into the max_simd_bits. + uint64_t bits_offset{0}; + }; + + // Checks whether the storage_scope is especially tagged for a specific + // memory. Special memory is all combined into a single allocation. + bool IsSpecialTaggedMemory(const StorageScope &scope) { + return scope.tag.length() != 0 && scope.tag != ".dyn" && + scope.tag != ".workspace" && scope.tag != ".vtcm"; + } + + // Allocate entry of node. + // Event entry in liveness analysis + struct EventEntry { + // variables we generate + std::vector gen; + // variables we kill + std::vector kill; + }; + + Stmt MakeAttach(const std::vector &svec, Stmt body) { + for (auto it = svec.rbegin(); it != svec.rend(); it++) { + body = MergeNest((*it)->alloc_nest, body); + } + return body; + } + // Remap the index + PrimExpr RemapIndex(DataType dtype, PrimExpr index, StorageEntry *e) { + if (e->bits_offset == 0) + return index; + uint64_t elem_bits = dtype.bits(); + ICHECK_EQ(e->bits_offset % elem_bits, 0U); + return make_const(index.dtype(), e->bits_offset / elem_bits) + index; + } + // Prepare the new allocations + void PrepareNewAlloc() { + for (size_t i = 0; i < alloc_vec_.size(); ++i) { + StorageEntry *e = alloc_vec_[i].get(); + attach_map_[e->attach_scope_].push_back(e); + } + // find allocation via attach map. + for (auto &kv : attach_map_) { + // find the element with the most amount of bytes. + std::vector &vec = kv.second; + // try to find merge, for tagged memory + for (size_t i = 0; i < vec.size(); ++i) { + StorageEntry *e = vec[i]; + if (IsSpecialTaggedMemory(e->scope)) { + ICHECK_NE(e->const_nbits, 0U) + << "Special tagged memory must be const size"; + for (size_t j = 0; j < i; ++j) { + if (e->scope == vec[j]->scope) { + vec[j]->merged_children.push_back(e); + break; + } + } + } + } + // Start allocation + for (size_t i = 0; i < vec.size(); ++i) { + StorageEntry *e = vec[i]; + // already merged + if (e->bits_offset != 0) + continue; + if (e->merged_children.size() != 0) { + NewAllocTagMerged(e); + continue; + } + // Get the allocation size; + e->alloc_var = e->allocs[0]->buffer_var; + DataType alloc_type = e->allocs[0]->dtype; + for (const AllocateNode *op : e->allocs) { + if (op->dtype.lanes() > alloc_type.lanes()) { + alloc_type = op->dtype; + } + } + + bool all_allocs_identical = std::all_of( + e->allocs.begin() + 1, e->allocs.end(), + [&](const AllocateNode *op) -> bool { + const AllocateNode *first = *e->allocs.begin(); + if (op->dtype != first->dtype) { + return false; + } + if (op->extents.size() != first->extents.size()) { + return false; + } + ExprDeepEqual expr_equal; + for (size_t i = 0; i < op->extents.size(); i++) { + if (!expr_equal(op->extents[i], first->extents[i])) { + return false; + } + } + return true; + }); + + if (all_allocs_identical) { + // simply use the original allocation. 
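// For instance (illustrative), if every AllocateNode sharing this entry is
// float16[2048], the first Allocate is reused as-is below; if extents or
// dtypes differ, the else-branch instead takes the maximum size in bits over
// all allocs and re-expresses it in units of the chosen allocation dtype,
// rounding up when the bit count does not divide evenly.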
+ e->alloc_nest.push_back( + Allocate(e->alloc_var, alloc_type, e->allocs[0]->extents, + e->allocs[0]->condition, Evaluate(0))); + if (auto ptr = e->allocs[0]->body.as()) { + e->alloc_nest.push_back(DeclBuffer( + RemapBuffer(ptr->buffer, e->alloc_var), Evaluate(0))); + hoisted_buffer_decls_.insert(ptr->buffer.get()); + } + if (IsSpecialTaggedMemory(e->scope)) { + MemoryInfo info = GetMemoryInfo(e->scope.to_string()); + if (info.defined()) { + uint64_t total_elem = e->const_nbits / e->elem_type.bits(); + ICHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits) + << "Allocation exceed bound of memory tag " + << e->scope.to_string(); + } + } + } else { + // Build a merged allocation + PrimExpr combo_size; + for (const AllocateNode *op : e->allocs) { + ICHECK_EQ(op->extents.size(), 1) + << "Buffer var " << op->buffer_var->name_hint + << " was identified as a reusable allocation, but has " + << op->extents.size() << " physical dimensions. " + << "Currently, only flat 1-d memory spaces should be " + "identified as reusable " + "allocations."; + PrimExpr sz = op->extents[0]; + auto nbits = op->dtype.bits() * op->dtype.lanes(); + if (const auto *imm = sz.as()) { + if (imm->value > std::numeric_limits::max() / nbits) { + LOG(WARNING) << "The allocation requires : " << imm->value + << " * " << nbits + << " bits, which is greater than the maximum of" + " int32. The size is cast to int64." + << "\n"; + sz = make_const(DataType::Int(64), imm->value); + } + } + // transform to bits + auto sz_nbits = sz * nbits; + if (combo_size.defined()) { + combo_size = max(combo_size, sz_nbits); + } else { + combo_size = sz_nbits; + } + } + // transform to alloc bytes + auto type_bits = alloc_type.bits() * alloc_type.lanes(); + bool divided = + analyzer_.CanProve(indexmod(combo_size, type_bits) == 0); + combo_size = indexdiv(combo_size, type_bits); + // round up for can not divided + if (!divided) { + combo_size = combo_size + make_const(DataType::Int(32), 1); + } + combo_size = analyzer_.Simplify(combo_size); + e->alloc_nest.push_back(Allocate(e->alloc_var, alloc_type, + {combo_size}, const_true(), + Evaluate(0))); + if (IsSpecialTaggedMemory(e->scope)) { + MemoryInfo info = GetMemoryInfo(e->scope.to_string()); + if (info.defined()) { + uint64_t total_elem = e->const_nbits / e->elem_type.bits(); + ICHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits) + << "Allocation exceed bound of memory tag " + << e->scope.to_string(); + } + } + } + } + } + } + // New allocation for merged data + void NewAllocTagMerged(StorageEntry *e) { + ICHECK_NE(e->scope.tag.length(), 0U); + // allocate with element type. + ICHECK_NE(e->const_nbits, 0U); + MemoryInfo info = GetMemoryInfo(e->scope.to_string()); + uint64_t total_bits = e->const_nbits; + // By default, align to 32 bits. 
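// Layout sketch for merged tagged memory (assumed numbers): with
// max_simd_bits = 128, a parent entry of 96 bits is padded to 128, a first
// child of 200 bits lands at bits_offset = 128 (total padded to 384), and a
// second child of 64 bits at bits_offset = 384 (total padded to 512), so the
// single Allocate emitted below covers ceil(512 / type_bits) elements.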
+ size_t align = 32; + if (info.defined()) { + align = info->max_simd_bits; + } + // Always align to max_simd_bits + // so we can remap types by keeping this property + if (total_bits % align != 0) { + total_bits += align - (total_bits % align); + } + e->alloc_var = e->allocs[0]->buffer_var; + for (StorageEntry *child : e->merged_children) { + ICHECK_NE(child->const_nbits, 0U); + ICHECK_NE(total_bits, 0U); + child->bits_offset = total_bits; + child->alloc_var = e->alloc_var; + total_bits += child->const_nbits; + if (total_bits % align != 0) { + total_bits += align - (total_bits % align); + } + } + uint64_t type_bits = e->elem_type.bits() * e->elem_type.lanes(); + PrimExpr alloc_size = make_const(e->allocs[0]->extents[0].dtype(), + (total_bits + type_bits - 1) / type_bits); + e->alloc_nest.push_back(Allocate(e->alloc_var, e->elem_type, {alloc_size}, + const_true(), Evaluate(0))); + if (info.defined()) { + ICHECK_LE(total_bits, info->max_num_bits) + << "Allocation exceed bound of memory tag " << e->scope.to_string(); + } + } + // Liveness analysis to find gen and kill point of each variable. + void LivenessAnalysis(const std::vector &seq) { + // find kill point, do a reverse linear scan. + std::unordered_set touched; + for (size_t i = seq.size(); i != 0; --i) { + const StmtEntry &s = seq[i - 1]; + for (const VarNode *buffer : s.touched) { + if (!touched.count(buffer)) { + touched.insert(buffer); + event_map_[s.stmt].kill.push_back(buffer); + } + } + } + // find gen point, do forward scan + touched.clear(); + for (size_t i = 0; i < seq.size(); ++i) { + int64_t offset = seq[i].scope_pair_offset; + if (offset < 0) + continue; + const StmtEntry &s = seq[i + offset]; + for (const VarNode *buffer : s.touched) { + if (!touched.count(buffer)) { + touched.insert(buffer); + event_map_[s.stmt].gen.push_back(buffer); + } + } + } + } + void PlanNewScope(const Object *op) { + if (thread_scope_ != nullptr) { + ICHECK(thread_scope_ == op); + // erase all memory attached to this scope. 
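// Explanatory note: entries released inside one kernel launch should not be
// recycled by allocations made under a different thread extent, so when a
// thread scope is closed the loops below purge every free-list entry that
// was attached to this scope.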
+ for (auto it = const_free_map_.begin(); it != const_free_map_.end();) { + if (it->second->attach_scope_ == op) { + it = const_free_map_.erase(it); + } else { + ++it; + } + } + for (auto it = sym_free_list_.begin(); it != sym_free_list_.end();) { + if ((*it)->attach_scope_ == op) { + it = sym_free_list_.erase(it); + } else { + ++it; + } + } + thread_scope_ = nullptr; + } else { + thread_scope_ = op; + } + } + + // Memory plan algorithm + void + PlanMemory(const std::vector &seq, + const std::unordered_map &alloc_info, + bool enable_reuse, bool reuse_require_exact_matched_dtype) { + std::unordered_set inplace_flag; + + for (size_t i = 0; i < seq.size(); ++i) { + const StmtEntry &s = seq[i]; + auto it = event_map_.find(seq[i].stmt); + + // scope_pair_offset >= 0 means it is either + // - leaf stmt(offset = 0) + // - beginning of scope(offset < 0) + // In both cases, we need to handle the gen event correctly + if (it != event_map_.end() && seq[i].scope_pair_offset >= 0) { + // Inplace operation detection + // specially handle this + bool detect_inplace = detect_inplace_ && (it->second.gen.size() <= 2); + + for (const VarNode *var : it->second.gen) { + ICHECK(alloc_info.count(var)); + const AllocEntry &entry = alloc_info.at(var); + const AllocateNode *alloc = entry.alloc; + auto storage_scope = + StorageScope::Create(GetPtrStorageScope(GetRef(var))); + StorageEntry *dst_entry = nullptr; + // inplace detection + if (detect_inplace) { + // only one inplace var for s.stmt + bool inplace_found = false; + for (const VarNode *src : it->second.kill) { + if (!inplace_flag.count(src) && alloc_map_.count(src)) { + InplaceOpVerifier visitor; + StorageEntry *src_entry = alloc_map_.at(src); + if (src_entry->scope == storage_scope && + src_entry->attach_scope_ == thread_scope_ && + src_entry->elem_type == alloc->dtype.element_of() && + visitor.Check(s.stmt, var, src)) { + uint64_t const_nbits = + static_cast(alloc->ConstantAllocationSize()) * + alloc->dtype.bits() * alloc->dtype.lanes(); + if (src_entry->const_nbits == const_nbits && !inplace_found) { + // successfully inplace + dst_entry = src_entry; + inplace_flag.insert(src); + inplace_found = true; + } + } + } + } + } + if (dst_entry == nullptr) { + dst_entry = FindAlloc(alloc, thread_scope_, storage_scope, + entry.num_physical_dimensions, enable_reuse, + reuse_require_exact_matched_dtype); + } + dst_entry->allocs.emplace_back(alloc); + alloc_map_[var] = dst_entry; + } + } + // enter/exit new scope + if (s.stmt->IsInstance()) { + const auto *op = static_cast(s.stmt); + if (op->attr_key == tir::attr::thread_extent || + op->attr_key == tir::attr::virtual_thread || + tir::attr::IsPragmaKey(op->attr_key)) { + PlanNewScope(op); + } else { + ICHECK(op->attr_key == tir::attr::extern_scope); + } + } else if (s.stmt->IsInstance()) { + const auto *op = static_cast(s.stmt); + if (op->kind == ForKind::kParallel) { + if (thread_scope_ == nullptr || thread_scope_ == op) { + PlanNewScope(op); + } + } + } + // scope_pair_offset <= 0 means it is either + // - leaf stmt(offset = 0) + // - end of scope(offset < 0) + // In both cases, we need to handle the kill event correctly + if (it != event_map_.end() && seq[i].scope_pair_offset <= 0) { + for (const VarNode *var : it->second.kill) { + // skip space which are already replaced by inplace + if (!inplace_flag.count(var)) { + this->Free(var); + } + } + } + } + } + // Allocate new storage entry. 
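// The two helpers below implement the reuse policy: NewAlloc always creates
// a fresh StorageEntry, while FindAlloc first searches the free lists. As an
// illustrative example, a 1024-bit constant-size request scans entries
// between 1024 / 16 = 64 and 1024 * 16 = 16384 bits in const_free_map_,
// preferring a released entry at least as large before falling back to
// smaller ones.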
+ StorageEntry *NewAlloc(const AllocateNode *op, const Object *attach_scope, + const StorageScope &scope, size_t const_nbits) { + ICHECK(op != nullptr); + // Reuse not successful, allocate a new buffer. + auto entry = std::make_unique(); + entry->attach_scope_ = attach_scope; + entry->scope = scope; + entry->elem_type = op->dtype.element_of(); + entry->const_nbits = const_nbits; + StorageEntry *e = entry.get(); + alloc_vec_.emplace_back(std::move(entry)); + return e; + } + + StorageEntry *FindAlloc(const AllocateNode *op, const Object *attach_scope, + const StorageScope &scope, + size_t num_physical_dimensions, bool enable_reuse, + bool reuse_require_exact_matched_dtype) { + ICHECK(op != nullptr); + // skip plan for local variable, + // compiler can do a better job with register allocation. + const uint64_t match_range = 16; + uint64_t op_elem_bits = op->dtype.bits() * op->dtype.lanes(); + uint64_t const_nbits = + static_cast(op->ConstantAllocationSize() * op_elem_bits); + + // If the size of the array isn't known at compile-time, it must + // have its own allocation with size determined at runtime. + bool is_known_size = (const_nbits != 0); + + // Currently, only flat memory spaces can be reused. Packing + // into N-d space (e.g. 2-d texture memory on GPUs) will require + // more in-depth algorithms. + bool is_flat_memory_space = (num_physical_dimensions == 1); + + // disable reuse of small arrays, they will be lowered to registers in LLVM + // This rules only apply if we are using non special memory + bool is_small_array = + (scope.tag.length() == 0) && + (scope.rank >= StorageRank::kWarp || op->dtype.is_handle() || + (is_known_size && const_nbits <= 32)); + + if (!enable_reuse || is_small_array || !is_flat_memory_space) { + return NewAlloc(op, attach_scope, scope, const_nbits); + } + + if (is_known_size) { + // constant allocation. + auto begin = const_free_map_.lower_bound(const_nbits / match_range); + auto mid = const_free_map_.lower_bound(const_nbits); + auto end = const_free_map_.upper_bound(const_nbits * match_range); + // start looking at the buffer that is bigger than the required size first + for (auto it = mid; it != end; ++it) { + StorageEntry *e = it->second; + if (e->attach_scope_ != attach_scope) + continue; + if (e->scope != scope) + continue; + // when not divided, no reuse, eg, float4 vs float3 + if (e->bits_offset % op_elem_bits != 0) + continue; + if (reuse_require_exact_matched_dtype && e->elem_type != op->dtype) { + continue; + } + e->const_nbits = std::max(const_nbits, e->const_nbits); + const_free_map_.erase(it); + return e; + } + // then start looking at smaller buffers. + for (auto it = mid; it != begin;) { + --it; + StorageEntry *e = it->second; + if (e->attach_scope_ != attach_scope) + continue; + if (e->scope != scope) + continue; + if (e->elem_type != op->dtype.element_of()) + continue; + if (reuse_require_exact_matched_dtype && e->elem_type != op->dtype) { + continue; + } + e->const_nbits = std::max(const_nbits, e->const_nbits); + const_free_map_.erase(it); + return e; + } + } else { + // Simple strategy: round roubin. + for (auto it = sym_free_list_.begin(); it != sym_free_list_.end(); ++it) { + StorageEntry *e = *it; + if (e->attach_scope_ != attach_scope) + continue; + if (e->scope != scope) + continue; + if (e->elem_type != op->dtype.element_of()) + continue; + sym_free_list_.erase(it); + return e; + } + } + return NewAlloc(op, attach_scope, scope, const_nbits); + } + // simulated free. 
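// Explanatory note: Free() below only returns an entry to the reuse pools
// when sharing is worthwhile. Untagged small constant buffers (<= 32 bits),
// handles, and warp-or-narrower scopes are left alone since they are
// typically promoted to registers; constant-size entries go to
// const_free_map_ keyed by size, symbolic-size ones to sym_free_list_.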
+ void Free(const VarNode *var) { + auto it = alloc_map_.find(var); + ICHECK(it != alloc_map_.end()); + StorageEntry *e = it->second; + ICHECK_NE(e->allocs.size(), 0U); + + // disable reuse of small arrays, they will be lowered to registers in LLVM + // This rules only apply if we are using non special memory + if (e->scope.tag.length() == 0) { + // Disable sharing of local memory. + if (e->scope.rank >= StorageRank::kWarp || + e->allocs[0]->dtype.is_handle()) + return; + // disable reuse of small arrays + if (e->const_nbits > 0 && e->const_nbits <= 32) + return; + } + // normal free. + if (e->const_nbits != 0) { + const_free_map_.insert({e->const_nbits, e}); + } else { + sym_free_list_.push_back(e); + } + } + // thread scope. + const Object *thread_scope_{nullptr}; + // whether enable inplace detection. + bool detect_inplace_{false}; + // Locations of free ops. + std::unordered_map event_map_; + // constant size free map. + std::multimap const_free_map_; + // symbolic free list, for non constant items. + std::list sym_free_list_; + // The allocation attach map + std::unordered_map> attach_map_; + // The allocation assign map + std::unordered_map alloc_map_; + // The allocations + std::vector> alloc_vec_; + // The buffer objects being remapped + std::unordered_map buffer_remap_; + // Buffers whose DeclBuffer has been hoisted to be adjacent to the new + // Allocate location + std::unordered_set hoisted_buffer_decls_; + // Any buffers that is accessed at some point. DeclBuffer instances + // that do not appear in this list may be removed. + std::unordered_set all_buffers_accessed_; + // analyzer + arith::Analyzer analyzer_; +}; + +/* Helper struct containing information on how a buffer is declared and used + * + */ +struct BufferVarInfo { + enum DeclarationLocation { + kPrimFuncParam = (1 << 0), + kPrimFuncBufferMap = (1 << 1), + kAllocateNode = (1 << 2), + kAllocateConstNode = (1 << 3), + kLetNode = (1 << 4), + }; + + // The tir::Var that represents this buffer. + Var var; + + // The data type of an element of the buffer. + DataType element_dtype; + + /* The extent of the buffer. + * + * If multidimensional, the extent of the last dimension of the buffer. If + * the size is unknown (e.g. pointer arguments to PrimFunc with no + * corresponding entry in buffer_map), then extent is zero. + */ + PrimExpr extent; + + // Where the buffer was declared + DeclarationLocation declaration_location; + + // When accessed, which element type is it accessed as. This may + // differ both in base type (e.g. int32* cast to float32* after + // packing in StorageRewrite) or in number of lanes (e.g. float16* + // cast to float16x4*). + std::unordered_set access_dtype; + // Data types used for scalar reads. This is used to record vectorized read + // dtypes that can be shuffled for scalar reads when + // rewrite_scalar_read_to_vector_shuffle is enabled. + std::unordered_set scalar_read_dtype; + + DataType get_preferred_dtype() const { + std::unordered_set base_access_dtype; + for (auto dtype : access_dtype) { + base_access_dtype.insert(dtype.element_of()); + } + for (auto dtype : scalar_read_dtype) { + base_access_dtype.insert(dtype.element_of()); + } + // If the array is accessed as multiple base types within a + // function, no point in changing the declared type. CodeGenC can + // handle this with a type-cast prior to indexing. Vulkan will + // raise an error at code-gen time, if a later pass doesn't split + // it out. 
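// Worked example (assumed accesses): a buffer declared float16* whose every
// load/store is float16x4 and whose last-dimension extent is provably a
// multiple of 4 gets preferred dtype float16x4; if the same buffer is also
// accessed as int32, more than one base type is seen and the declared
// element type is kept unchanged.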
+ if (base_access_dtype.size() != 1) { + return element_dtype; + } + + DataType preferred_base_type = *base_access_dtype.begin(); + + // If there is only one vectorizable size used to access the + // buffer, and if that access size is compatible with the array + // size, then the buffer is vectorizable. In the future, this + // could be improved to allow vectorized buffer access of size + // GCD(*lanes_used), if necessary. + // When there are scalar reads and no writes, access_dtype can be empty and + // we should avoid rewriting. + int preferred_lanes = element_dtype.lanes(); + if (element_dtype.lanes() == 1 && (access_dtype.size() == 1)) { + int lanes = access_dtype.begin()->lanes(); + // Check the scalar read dtypes are compatible with the vectorized access + // dtype. + for (auto dtype : scalar_read_dtype) { + if (dtype.lanes() % lanes != 0) { + return element_dtype; + } + } + arith::Analyzer analyzer_; + arith::ModularSet me = analyzer_.modular_set(extent); + if ((me->coeff % lanes == 0) && (me->base % lanes == 0)) { + preferred_lanes = lanes; + } + } + + return preferred_base_type.with_lanes(preferred_lanes); + } +}; + +/* Checks whether buffers are accessed as scalar or vector parameters in a + * function. + * + */ +class VectorTypeAccessChecker : public StmtExprVisitor { +public: + /* Constructor + * + * @param params The parameters passed to a PrimFunc + * + * @param buffer_map The buffer_map associated with a PrimFunc + * + * @param allow_untyped_handles If a buffer or pointer variable is + * missing a type annotation, assume that it has the same underlying + * type as it is later accessed, with scalar element types. + */ + VectorTypeAccessChecker(const Array ¶ms, + const Map &buffer_map, + bool allow_untyped_pointers = false, + bool detect_scalar_read_patterns = true) + : allow_untyped_pointers_(allow_untyped_pointers), + detect_scalar_read_patterns_(detect_scalar_read_patterns) { + // If a parameter is in the buffer map, we want to track the + // version in the map. + for (auto it : buffer_map) { + Buffer &buffer = it.second; + Var buffer_var = buffer->data; + DataType dtype = buffer->dtype; + PrimExpr extent = + buffer->shape.size() ? buffer->shape[buffer->shape.size() - 1] : 0; + OnArrayDeclaration(buffer_var, dtype, extent, + BufferVarInfo::kPrimFuncParam); + } + + // If a pointer parameter isn't in the buffer map, then we want to + // track the parameter itself. 
+ for (Var buffer_var : params) { + auto pointer_type = GetPointerType(buffer_var->type_annotation); + if (pointer_type.has_value() && (buffer_map.count(buffer_var) == 0)) { + DataType dtype = pointer_type.value(); + PrimExpr extent = 0; + OnArrayDeclaration(buffer_var, dtype, extent, + BufferVarInfo::kPrimFuncBufferMap); + } + } + } + + void VisitExpr_(const BufferLoadNode *op) final { + OnArrayAccess(op->dtype, op->buffer->data.get(), op->indices, + /*is_buffer_load=*/true); + StmtExprVisitor::VisitExpr_(op); + } + + void VisitStmt_(const BufferStoreNode *op) final { + OnArrayAccess(op->value.dtype(), op->buffer->data.get(), op->indices, + /*is_buffer_load=*/false); + StmtExprVisitor::VisitStmt_(op); + } + + void VisitExpr_(const CallNode *op) final { + if (op->op.same_as(builtin::tvm_access_ptr())) { + DataType dtype = op->args[0].dtype(); + const VarNode *buffer = op->args[1].as(); + PrimExpr index = op->args[2]; + OnArrayAccess(dtype, buffer, {index}, false); + } else if (op->op.same_as(builtin::address_of())) { + if (auto load = op->args[0].as()) { + OnArrayAccess(load->dtype, load->buffer->data.get(), load->indices, + /*is_buffer_load=*/false); + } + } + StmtExprVisitor::VisitExpr_(op); + } + + void VisitStmt_(const AllocateNode *op) final { + const Array &extents = op->extents; + PrimExpr extent = extents[extents.size() - 1]; + OnArrayDeclaration(op->buffer_var, op->dtype, extent, + BufferVarInfo::kAllocateNode); + + StmtExprVisitor::VisitStmt_(op); + } + + void VisitStmt_(const AllocateConstNode *op) final { + const Array &extents = op->extents; + PrimExpr extent = + extents.size() ? extents[extents.size() - 1] : NullValue(); + OnArrayDeclaration(op->buffer_var, op->dtype, extent, + BufferVarInfo::kAllocateConstNode); + + StmtExprVisitor::VisitStmt_(op); + } + + void VisitExpr_(const LetNode *op) final { + HandleLetNode(op->var); + StmtExprVisitor::VisitExpr_(op); + } + + void VisitStmt_(const LetStmtNode *op) final { + HandleLetNode(op->var); + StmtExprVisitor::VisitStmt_(op); + } + + void HandleLetNode(Var let_var) { + if (let_var->dtype.is_handle()) { + auto pointer_type = GetPointerType(let_var->type_annotation); + if (pointer_type.has_value()) { + OnArrayDeclaration(let_var, pointer_type.value(), 0, + BufferVarInfo::kLetNode); + } else if (allow_untyped_pointers_) { + OnArrayDeclaration(let_var, let_var->dtype, 0, BufferVarInfo::kLetNode); + } else { + LOG(FATAL) << "Let statement of variable " << let_var->name_hint + << " is missing a type annotation, " + << "or type annotation is not a pointer to primitive"; + } + } + } + + /* Update the type map for a buffer based on its declaration + * + * @param buffer The VarNode representing the buffer. + * + * @param element_dtype The dtype of a single element of the buffer. + * If unknown, when used with the allow_untyped_handles option, + * should be a handle dtype. + * + * @param extent The extent of the buffer. Zero if size is unknown. + * + * @param declaration_location How the buffer was allocated, so that + * some locations can be rewritten without others. 
+ */ + void + OnArrayDeclaration(Var buffer, DataType element_dtype, PrimExpr extent, + BufferVarInfo::DeclarationLocation declaration_location) { + ICHECK(info_map_.find(buffer.get()) == info_map_.end()) + << "Array declaration of " << buffer->name_hint + << " occurred multiple times."; + + if (element_dtype == DataType::Bool()) { + element_dtype = DataType::Int(8).with_lanes(element_dtype.lanes()); + } + info_map_[buffer.get()] = + BufferVarInfo{buffer, element_dtype, extent, declaration_location}; + } + + /* Update the type map for a buffer based on its usage + * + * @param value_dtype The dtype of the value being stored to or + * loaded from the buffer. + * + * @param buffer The VarNode representing the buffer. + * + * @param indices The index at which the value is being stored/loaded. + * + * @param is_buffer_load Whether the access is BufferLoad + */ + void OnArrayAccess(DataType value_dtype, const VarNode *buffer, + const Array &indices, bool is_buffer_load) { + auto it = info_map_.find(buffer); + ICHECK(it != info_map_.end()) + << "Load/Store of buffer " << buffer->name_hint << " (" << buffer + << ") occurred before its declaration."; + + if (value_dtype.is_scalable_vector()) { + // Scalable types are not currently supported in storage_rewrite. Scalable + // buffer accesses are not currently checked and therefore are not + // rewritten. + return; + } + + BufferVarInfo &var_info = it->second; + + if (value_dtype.element_of() == DataType::Bool()) { + value_dtype = DataType::Int(8).with_lanes(value_dtype.lanes()); + } + + if (var_info.element_dtype.is_handle()) { + ICHECK(allow_untyped_pointers_) + << "Variable " << buffer->name_hint + << " was missing a type annotation in its declaration"; + var_info.element_dtype = value_dtype.element_of(); + } + + for (int i = 0; i < static_cast(indices.size()) - 1; i++) { + ICHECK(indices[i].dtype().is_scalar()) + << "Only the last index of a buffer access may be a vector type."; + } + int index_lanes = indices.size() ? indices.back().dtype().lanes() : 1; + + DataType access_dtype = value_dtype; + + int lanes_used = var_info.element_dtype.lanes(); + + // This can happen due to a previous pass that had rewrite_store_load = + // false. This occurs from the StorageRewrite in tvm::lower, followed by + // the PointerValueTypeRewrite in BuildSPIRV. The rewrite_store_load = + // false is necessary because the C-based codegens do not yet support + // vectorized pointer types (e.g. float16x4*). Once they do, this if + // statement should instead be replaced by the below ICHECK_EQ. + if (index_lanes * var_info.element_dtype.lanes() != value_dtype.lanes()) { + ICHECK_EQ(index_lanes, value_dtype.lanes()); + lanes_used = 1; + var_info.element_dtype = var_info.element_dtype.with_lanes(1); + } + + // TODO(Lunderberg): Uncomment this check once it can be applied. + // See https://discuss.tvm.apache.org/t/pre-rfc-vectorized-tir-buffers/10615 + // for discussion. + + // ICHECK_EQ(index_lanes * var_info.element_dtype.lanes(), + // value_dtype.lanes()) + // << "Attempting to retrieve " << value_dtype.lanes() << " lanes of + // data with " + // << index_lanes << " indices into an array whose elements have " + // << var_info.element_dtype.lanes() << " lanes. " + // << "Expected output with " << index_lanes * + // var_info.element_dtype.lanes() + // << " lanes."; + + // If the index is a RampNode with stride of 1 and offset + // divisible by the number of number of lanes, and the predicate + // does not apply any masking, then this array access could be + // vectorized. 
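// Concrete case (illustrative): an access A[Ramp(base = 8 * i, stride = 1,
// lanes = 4)] has modular_set(base) = {coeff = 8, base = 0}, both divisible
// by 4, so lanes_used becomes 4 and the buffer is recorded as accessed as a
// 4-lane vector.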
+ if (indices.size()) { + const RampNode *ramp_index = indices[indices.size() - 1].as(); + if (ramp_index && is_one(ramp_index->stride)) { + if (ramp_index->lanes->IsInstance()) { + int lanes = + static_cast(Downcast(ramp_index->lanes)->value); + arith::ModularSet me = analyzer_.modular_set(ramp_index->base); + if ((me->coeff % lanes == 0) && (me->base % lanes == 0)) { + lanes_used = lanes; + } + } + } + } + + if (detect_scalar_read_patterns_ && is_buffer_load && indices.size()) { + const PrimExpr last_dim_index = indices[indices.size() - 1]; + if (last_dim_index.dtype().lanes() == 1) { + arith::ModularSet me = analyzer_.modular_set(last_dim_index); + var_info.scalar_read_dtype.emplace(access_dtype.with_lanes(me->coeff)); + return; + } + } + var_info.access_dtype.insert(access_dtype.with_lanes(lanes_used)); + } + + // Map of buffer variable information determined + std::unordered_map info_map_; + + // + bool allow_untyped_pointers_{false}; + // Whether to detect scalar read patterns for rewriting to vector shuffle + bool detect_scalar_read_patterns_{true}; + + // internal analyzer + arith::Analyzer analyzer_; +}; + +/* \brief Rewrites buffer/pointer variables from scalar types to vectorized + * types. + * + * Some runtimes do not allow casting between composite types and the underlying + * base type (e.g. Vulkan, casting from 1-lane float16* to 4-lane float16x4*). + * In these cases, in order to have vectorized load/store on an array, the + * element type of that array must be vectorized. This is in contrast to + * C-style runtimes, in which `float16x4* vec = *(float16x4*)(float_arr + + * offset)` is valid. + * + * By default, VectorTypeRewriter will attempt to rewrite all buffer variables + * to vectorized access, if the load/store occurring in the PrimFunc are all + * vectorized. This includes adjusting the indices being used to access the + * array. (e.g. If `float16* scalar_arr` is being converted to `float16x4* + * vec_arr`, then `scalar_arr[Ramp(offset, 1, 4)]` will be converted to + * `vec_arr[offset/4]`.) + * + * Currently, several of the C-style runtimes do not support buffers whose + * elements are vectorized types, or rely on the presence of the Ramp nodes to + * identify vectorized loads. The boolean parameters in the constructor are to + * mimic the previous behavior of VectorTypeRewriter, to avoid breaking these + * runtimes. Once all runtimes support vectorized buffer elements, these + * parameters can be removed. + */ +class VectorTypeRewriter : public StmtExprMutator { +public: + /* Constructor + * + * @param checker The VectorTypeAccessChecker that has previously read out + * information from the PrimFunc + * + * @param rewrite_params Whether pointer-type parameters passed into the + * function should be rewritten from scalar types to vectorized types. + * + * @param rewrite_buffer_map Whether buffers present in the buffer_map should + * have their data variable be rewritten from scalar types to vectorized + * types. + * + * @param rewrite_allocate_node Whether the buffer variable associated with + * AllocateNodes should be rewritten from scalar types to vectorized types. + * + * @param rewrite_indices Whether the indices to the Load and Store nodes + * should be rewritten to correspond to the new buffer_var type. + * + * @param rewrite_let_node Whether pointer declarations in let nodes + * should be re-written. 
+ */ + VectorTypeRewriter( + const std::unordered_map &info_map, + bool rewrite_params = true, bool rewrite_buffer_map = true, + bool rewrite_allocate_node = true, bool rewrite_indices = true, + bool rewrite_let_node = true, bool rewrite_allocate_const_node = true, + bool rewrite_scalar_read_to_vector_shuffle = true) + : rewrite_indices_(rewrite_indices) { + int rewrite_mask = 0; + if (rewrite_params) { + rewrite_mask |= BufferVarInfo::kPrimFuncParam; + } + if (rewrite_buffer_map) { + rewrite_mask |= BufferVarInfo::kPrimFuncBufferMap; + } + if (rewrite_allocate_node) { + rewrite_mask |= BufferVarInfo::kAllocateNode; + } + if (rewrite_let_node) { + rewrite_mask |= BufferVarInfo::kLetNode; + } + if (rewrite_allocate_const_node) { + rewrite_mask |= BufferVarInfo::kAllocateConstNode; + } + + // Rewrite any buffer variables whose preferred type isn't their current + // type. + for (const auto &pair : info_map) { + const auto &var_info = pair.second; + DataType preferred = var_info.get_preferred_dtype(); + if (preferred != var_info.element_dtype && + (rewrite_mask & var_info.declaration_location)) { + Var old_buffer_var = var_info.var; + Var new_buffer_var(old_buffer_var->name_hint, + PointerType(PrimType(preferred), + GetPtrStorageScope(old_buffer_var)), + old_buffer_var->span); + + rewrite_map_[var_info.var.get()] = {var_info.var, new_buffer_var, + var_info.element_dtype, preferred}; + } + } + } + + /*! + * \brief Mutator for BufferLoad or BufferStore. + * \return The rewritten node and the shuffle index. (Only for BufferLoad) + * When the shuffle index is non-negative, the caller should generate Shuffle + * to extract the element from the vector. + */ + template std::pair VisitBufferAccess(Node node) { + int shuffle_index = -1; + if (!rewrite_indices_) { + return {node, shuffle_index}; + } + + auto it = rewrite_map_.find(node->buffer->data.get()); + if (it == rewrite_map_.end()) { + return {node, shuffle_index}; + } + const auto &info = it->second; + + Array indices = node->indices; + const PrimExpr &last_dim_index = indices[indices.size() - 1]; + const RampNode *ramp_index = indices[indices.size() - 1].as(); + + if (node->buffer->dtype.is_scalable_vector() || + last_dim_index.dtype().is_scalable_vector()) { + // Scalable types are not currently supported in storage_rewrite. Scalable + // buffer accesses are not currently checked and therefore are not + // rewritten. 
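// Rewrite sketch (illustrative indices): once float16* arr is retyped to
// float16x4* varr (factor = 4), a vectorized access arr[Ramp(4 * k, 1, 4)]
// becomes varr[k]; a scalar read arr[4 * k + 1] becomes varr[k] with
// shuffle_index = 1, which the BufferLoad visitor turns into
// Shuffle::ExtractElement(varr[k], 1).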
+ return {node, shuffle_index}; + } + + if (ramp_index && is_one(ramp_index->stride) && + ramp_index->lanes->IsInstance()) { + int lanes = static_cast(Downcast(ramp_index->lanes)->value); + PrimExpr new_index = + ramp_index->base / make_const(ramp_index->base.dtype(), lanes); + if (lanes != info.factor()) { + ICHECK(info.factor() && lanes % info.factor() == 0); + int new_lanes = lanes / info.factor(); + new_index = Ramp(new_index * new_lanes, ramp_index->stride, new_lanes, + ramp_index->span); + } + indices.Set(indices.size() - 1, new_index); + } else if (last_dim_index.dtype().lanes() == 1 && info.factor() > 1) { + arith::ModularSet me = analyzer_.modular_set(last_dim_index); + ICHECK(me->coeff == 0 || info.factor() % me->coeff == 0); + PrimExpr new_index = + last_dim_index / make_const(last_dim_index.dtype(), info.factor()); + shuffle_index = me->base % info.factor(); + ; + indices.Set(indices.size() - 1, new_index); + } + + auto writer = node.CopyOnWrite(); + writer->buffer = RemapBuffer(node->buffer); + writer->indices = indices; + return {node, shuffle_index}; + } + + PrimExpr VisitExpr_(const BufferLoadNode *op) final { + auto node = Downcast(StmtExprMutator::VisitExpr_(op)); + auto [modified, shuffle_index] = VisitBufferAccess(node); + + // Not needed for BufferStoreNode, so we can't just call + // LegalizeDtype() in VisitBufferAccess. + if (node.same_as(modified)) { + return std::move(node); + } else { + auto writer = modified.CopyOnWrite(); + // writer->LegalizeDType(); + LegalizeBufferLoadDType(writer); + if (shuffle_index >= 0) { + return Shuffle::ExtractElement(std::move(modified), shuffle_index); + } + return std::move(modified); + } + } + + Stmt VisitStmt_(const BufferStoreNode *op) final { + auto node = Downcast(StmtExprMutator::VisitStmt_(op)); + auto [modified, shuffle_index] = VisitBufferAccess(std::move(node)); + ICHECK(shuffle_index < 0); + return std::move(modified); + } + + Stmt VisitStmt_(const LetStmtNode *op) final { + auto it = rewrite_map_.find(op->var.get()); + PrimExpr value = this->VisitExpr(op->value); + Stmt body = this->VisitStmt(op->body); + Var var = (it == rewrite_map_.end()) ? 
op->var : it->second.new_buffer_var; + if (var.same_as(op->var) && value.same_as(op->value) && + body.same_as(op->body)) { + return GetRef(op); + } + return LetStmt(var, value, body); + } + + Buffer RemapBuffer(Buffer buf) { + auto cache_key = buf.get(); + + auto cache_it = buffer_map_.find(cache_key); + if (cache_it != buffer_map_.end()) { + return cache_it->second; + } + + auto info_it = rewrite_map_.find(buf->data.get()); + if (info_it != rewrite_map_.end()) { + auto &info = info_it->second; + + Array shape = buf->shape; + PrimExpr last_dim = shape[shape.size() - 1]; + shape.Set(shape.size() - 1, + last_dim / make_const(last_dim.dtype(), info.factor())); + + auto writer = buf.CopyOnWrite(); + writer->data = info.new_buffer_var; + writer->dtype = info.new_element_dtype; + writer->shape = shape; + } + + buffer_map_[cache_key] = buf; + return buf; + } + + PrimExpr VisitExpr_(const CallNode *op) final { + if (op->op.same_as(builtin::tvm_access_ptr())) { + PrimExpr expr = StmtExprMutator::VisitExpr_(op); + op = expr.as(); + + if (!rewrite_indices_) { + return expr; + } + + const VarNode *buffer_var = op->args[1].as(); + auto it = rewrite_map_.find(buffer_var); + if (it == rewrite_map_.end()) { + return expr; + } + const auto &info = it->second; + + PrimExpr index = op->args[2]; + PrimExpr extent = op->args[3]; + PrimExpr flag = op->args[4]; + + PrimExpr e_dtype = tir::TypeAnnotation(info.new_element_dtype); + int factor = info.factor(); + extent = extent / make_const(extent.dtype(), factor); + index = index / make_const(index.dtype(), factor); + Array acc_args{e_dtype, info.new_buffer_var, index, extent, + flag}; + return Call(info.new_element_dtype, builtin::tvm_access_ptr(), acc_args); + + } else { + return StmtExprMutator::VisitExpr_(op); + } + } + + Stmt VisitStmt_(const AllocateNode *op) final { + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + + auto it = rewrite_map_.find(op->buffer_var.get()); + if (it == rewrite_map_.end()) { + return stmt; + } + + const auto &info = it->second; + + Var new_buffer_var = info.new_buffer_var; + + Array extents = op->extents; + PrimExpr last_extent = extents[extents.size() - 1]; + extents.Set(extents.size() - 1, + last_extent / make_const(last_extent.dtype(), info.factor())); + return Allocate(new_buffer_var, info.new_element_dtype, extents, + op->condition, op->body); + } + + Stmt VisitStmt_(const AllocateConstNode *op) final { + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + + auto it = rewrite_map_.find(op->buffer_var.get()); + if (it == rewrite_map_.end()) { + return stmt; + } + + const auto &info = it->second; + + Var new_buffer_var = info.new_buffer_var; + + int factor = info.new_element_dtype.lanes() / op->dtype.lanes(); + + Array extents = op->extents; + extents.Set(extents.size() - 1, extents[extents.size() - 1] / + make_const(extents[0].dtype(), factor)); + return AllocateConst(new_buffer_var, info.new_element_dtype, extents, + op->data, op->body); + } + + /* Update the parameters and all remaining variable references + * + * Should be called after calling operator() on the body of the + * function. + * + * @param func A pointer to the PrimFunc being modified. 
+ */ + void Finalize(PrimFunc *func_ptr) { + ICHECK(func_ptr) << "Finalize expects a non-null pointer"; + auto &func = *func_ptr; + auto *n = func.CopyOnWrite(); + + // Remap any remaining references to the old buffer variables + Map var_remap; + for (const auto &pair : rewrite_map_) { + const auto &info = pair.second; + var_remap.Set(info.old_buffer_var, info.new_buffer_var); + } + n->body = Substitute(n->body, var_remap); + + // Remap the argument list to use the new buffer variables. + Array new_params; + for (const auto &old_param : n->params) { + auto it = rewrite_map_.find(old_param.get()); + if (it == rewrite_map_.end()) { + new_params.push_back(old_param); + } else { + const auto &info = it->second; + new_params.push_back(info.new_buffer_var); + } + } + n->params = new_params; + + // Remap the Buffer objects in PrimFunc::buffer_map so that the + // buffers use the new buffer variables + Map new_buffer_map; + for (const auto &pair : n->buffer_map) { + Var key = pair.first; + Buffer old_buffer = pair.second; + Var old_var = old_buffer->data; + Buffer new_buffer = RemapBuffer(old_buffer); + new_buffer_map.Set(key, new_buffer); + } + n->buffer_map = new_buffer_map; + } + +private: + struct RewriteInfo { + Var old_buffer_var; + Var new_buffer_var; + DataType old_element_dtype; + DataType new_element_dtype; + + int factor() const { + int old_lanes = old_element_dtype.lanes(); + int new_lanes = new_element_dtype.lanes(); + ICHECK_EQ(new_lanes % old_lanes, 0); + return new_lanes / old_lanes; + } + }; + + bool rewrite_indices_{true}; + std::unordered_map rewrite_map_; + std::unordered_map buffer_map_; + arith::Analyzer analyzer_; +}; + +// Rewrite allocates, pointer parameters, and buffer map into vectorized +// versions if each access into a buffer is the same vector type. +PrimFunc PointerValueTypeRewrite( + PrimFunc f, bool allow_untyped_pointers = false, bool rewrite_params = true, + bool rewrite_buffer_map = true, bool rewrite_allocate_node = true, + bool rewrite_indices = true, bool rewrite_let_node = true, + bool rewrite_allocate_const_node = true, + bool rewrite_scalar_read_to_vector_shuffle = true) { + VectorTypeAccessChecker checker(f->params, f->buffer_map, + allow_untyped_pointers, + rewrite_scalar_read_to_vector_shuffle); + checker(f->body); + + VectorTypeRewriter rewriter( + checker.info_map_, rewrite_params, rewrite_buffer_map, + rewrite_allocate_node, rewrite_indices, rewrite_let_node, + rewrite_allocate_const_node, rewrite_scalar_read_to_vector_shuffle); + PrimFuncNode *n = f.CopyOnWrite(); + n->body = rewriter(std::move(n->body)); + rewriter.Finalize(&f); + + return f; +} + +using namespace tir::transform; +namespace transform { +Pass StorageRewrite() { + auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) { + bool enable_reuse = true; + bool reuse_require_exact_matched_dtype = false; + bool merge_static_smem = + ctx->GetConfig("tir.merge_static_smem", Bool(false)).value(); + AllocateCollector collector; + collector(f->body); + bool has_dynamic = collector.dyn_shmem_allocs_.size() > 1; + if (has_dynamic || merge_static_smem) { + // For IRModule utilizing dynamic shared memory, reuse is not enabled + // Because dynamic doesn't require maintaining the readability and + // it benefits from a more optimized allocation strategy through the + // Pass `MergeSharedMemoryAllocations`. + // When `merge_static_smem` is true, we will reuse and merge shared + // memory in a dedicated pass `MergeSharedMemoryAllocations`. + // And so we don't enable reuse in this pass. 
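+      // Summary of the two triggers checked above:
+      //   has_dynamic       - more than one dynamic shared allocation was
+      //                       collected by AllocateCollector
+      //   merge_static_smem - the "tir.merge_static_smem" PassContext flag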
+ enable_reuse = false; + } + + Optional target = f->GetAttr("target"); + if (target.defined() && (target.value()->kind->name == "vulkan" || + target.value()->kind->name == "webgpu")) { + // Require exactly same-dtype matching in smem reuse for Vulkan and WebGPU + reuse_require_exact_matched_dtype = true; + } + auto *n = f.CopyOnWrite(); + n->body = + StoragePlanRewriter().Rewrite(std::move(n->body), true, enable_reuse, + reuse_require_exact_matched_dtype); + // Parameters may not be rewritten, but internal allocations may. + // Vectorization of AllocateConst is currently disabled, as it has + // indexing issues for types that include padding (e.g. int8x3 + // padded out to 32 bits) would require either rewriting + // AllocateConst::data, or would require the code generators to + // handle vectorized constants. + return PointerValueTypeRewrite(std::move(f), true, false, false, false, + true, true, false, false); + }; + return CreatePrimFuncPass(pass_func, 0, "tir.StorageRewrite", {}); +} + +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.StorageRewrite", StorageRewrite); +}); + +Pass PointerValueTypeRewrite() { + auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) { + return tl::PointerValueTypeRewrite(std::move(f)); + }; + return CreatePrimFuncPass(pass_func, 0, "tl.PointerValueTypeRewrite", {}); +} + +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.PointerValueTypeRewrite", + PointerValueTypeRewrite); +}); + +} // namespace transform +} // namespace tl +} // namespace tvm diff --git a/src/transform/thread_partial_sync.cc b/src/transform/thread_partial_sync.cc index 8ffb30000..0d6aa0e9d 100644 --- a/src/transform/thread_partial_sync.cc +++ b/src/transform/thread_partial_sync.cc @@ -1,7 +1,8 @@ /*! 
* \file thread_storage_sync.cc */ -#include +#include +#include #include #include #include @@ -28,7 +29,8 @@ class TileLangThreadPartialSyncPlanner : public TileLangStorageAccessVisitor { // The syncs inserted before each statement std::unordered_set syncs_inserted_; - std::unordered_map partial_syncs_inserted_; + std::unordered_map> + partial_syncs_inserted_; protected: bool Enabled(const VarNode *buf, const StorageScope &scope) const final { @@ -256,20 +258,27 @@ class TileLangThreadPartialSyncPlanner : public TileLangStorageAccessVisitor { scope_.push_back(std::vector()); num_partial_threads_ = partitions[0]; + barrier_id_ += 1; this->VisitStmt(body->then_case); StmtEntry s; s.stmt = op; s.access = Summarize(std::move(scope_.back()), nullptr); scope_.pop_back(); - + if (!has_sync_) + barrier_id_ -= 1; + has_sync_ = false; num_partial_threads_ = partitions[1]; scope_.push_back(std::vector()); + barrier_id_ += 1; VisitStmt(body->else_case.value()); auto v = Summarize(std::move(scope_.back()), nullptr); scope_.pop_back(); + if (!has_sync_) + barrier_id_ -= 1; + has_sync_ = false; s.access.insert(s.access.end(), v.begin(), v.end()); - num_partial_threads_ = NullOpt; + num_partial_threads_ = std::nullopt; } else { TileLangStorageAccessVisitor::VisitStmt_(op); } @@ -280,10 +289,12 @@ class TileLangThreadPartialSyncPlanner : public TileLangStorageAccessVisitor { // condition"; if (syncs_inserted_.count(obj)) return; - if (num_partial_threads_.defined()) { + if (num_partial_threads_.defined() && barrier_id_ >= 0 && + barrier_id_ < 16) { syncs_inserted_.insert(obj); - partial_syncs_inserted_[obj] = - static_cast(num_partial_threads_.value()->value); + partial_syncs_inserted_[obj] = std::make_tuple( + static_cast(num_partial_threads_.value()->value), barrier_id_); + has_sync_ = true; } else { syncs_inserted_.insert(obj); } @@ -293,6 +304,8 @@ class TileLangThreadPartialSyncPlanner : public TileLangStorageAccessVisitor { Optional num_partial_threads_; // synchronization scope StorageScope sync_scope_; + int barrier_id_{-1}; + bool has_sync_{false}; }; // There are cases where necessary syncthreads is not inserted by @@ -317,7 +330,7 @@ class ThreadPartialSyncInserter : public StmtExprMutator { public: ThreadPartialSyncInserter( StorageScope sync_scope, const std::unordered_set &syncs, - std::unordered_map partial_syncs) + std::unordered_map> partial_syncs) : sync_scope_(sync_scope), syncs_(syncs), partial_syncs_(partial_syncs) {} Stmt VisitStmt(const Stmt &stmt) final { @@ -328,8 +341,10 @@ class ThreadPartialSyncInserter : public StmtExprMutator { if (partial_syncs_.count(stmt.get())) { auto iter = partial_syncs_.find(stmt.get()); ICHECK(sync_scope_.rank == StorageRank::kShared); - barrier = Evaluate( - Call(DataType::Int(32), tl::sync_thread_partial(), {iter->second})); + int num_threads, barrier_id; + std::tie(num_threads, barrier_id) = iter->second; + barrier = Evaluate(Call(DataType::Int(32), tl::sync_thread_partial(), + {num_threads, barrier_id})); } else { return StmtExprMutator::VisitStmt(stmt); } @@ -346,7 +361,8 @@ class ThreadPartialSyncInserter : public StmtExprMutator { // data structure. 
StorageScope sync_scope_; const std::unordered_set &syncs_; - const std::unordered_map &partial_syncs_; + const std::unordered_map> + &partial_syncs_; }; Stmt TileLangThreadPartialSync(Stmt stmt, std::string storage_scope) { @@ -371,8 +387,11 @@ Pass TileLangThreadPartialSync(String storage_scope) { return CreatePrimFuncPass(pass_func, 0, "tl.ThreadPartialSync", {}); } -TVM_REGISTER_GLOBAL("tl.transform.ThreadPartialSync") - .set_body_typed(TileLangThreadPartialSync); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.ThreadPartialSync", + TileLangThreadPartialSync); +}); } // namespace transform } // namespace tl diff --git a/src/transform/thread_storage_sync.cc b/src/transform/thread_storage_sync.cc index fadba4c45..019ef294e 100644 --- a/src/transform/thread_storage_sync.cc +++ b/src/transform/thread_storage_sync.cc @@ -20,7 +20,8 @@ /*! * \file thread_storage_sync.cc */ -#include +#include +#include #include #include #include @@ -188,7 +189,7 @@ class TileLangThreadSyncPlanner : public TileLangStorageAccessVisitor { } } } - // return the exposed entries, remove unecessary ones. + // return the exposed entries, remove unnecessary ones. int sync_count = 0; // head are before first sync, tail are after last sync std::vector head, tail; @@ -367,7 +368,7 @@ class TileLangThreadSyncPlanner : public TileLangStorageAccessVisitor { scope_.pop_back(); s.access.insert(s.access.end(), v.begin(), v.end()); - num_partial_threads_ = NullOpt; + num_partial_threads_ = std::nullopt; } else { TileLangStorageAccessVisitor::VisitStmt_(op); } @@ -786,7 +787,10 @@ tvm::transform::Pass ThreadSync(String storage_scope) { return CreatePrimFuncPass(pass_func, 0, "tl.ThreadSync", {}); } -TVM_REGISTER_GLOBAL("tl.transform.ThreadSync").set_body_typed(ThreadSync); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.ThreadSync", ThreadSync); +}); } // namespace transform } // namespace tl diff --git a/src/transform/vectorize_loop.cc b/src/transform/vectorize_loop.cc index 5addd040d..248c12498 100644 --- a/src/transform/vectorize_loop.cc +++ b/src/transform/vectorize_loop.cc @@ -22,7 +22,8 @@ */ // Loop vectorizer as in Halide pipeline. #include -#include +#include +#include #include #include #include @@ -526,7 +527,7 @@ class TLVectorizer : public StmtMutator, // A single var can be binded in multiple lets // but they have to bind to the same value. // This is used to allow cases when we reuse a single let - // expression to cosntruct a nested expr. + // expression to construct a nested expr. 
// (let x = 1 in x + 1) * (let x = 1 in x + 1) auto it = let_binding_.find(op->var); if (it != let_binding_.end()) { @@ -631,7 +632,7 @@ class TLVectorizer : public StmtMutator, return Scalarize(GetRef(op)); } Stmt then_case = this->VisitStmt(op->then_case); - Optional else_case = NullOpt; + Optional else_case = std::nullopt; if (op->else_case) { else_case = this->VisitStmt(op->else_case.value()); } @@ -682,16 +683,12 @@ class TLVectorizer : public StmtMutator, return StmtMutator::VisitStmt_(op); } - // scalarize the statment + // scalarize the statement Stmt Scalarize(Stmt stmt) { Var idx(var_->name_hint + ".s", var_->dtype); stmt = Substitute(stmt, {{var_, idx}}); return For(idx, IntImm(var_->dtype, 0), var_lanes_, ForKind::kSerial, stmt); } - // ProducerStore - Stmt VisitStmt_(const ProducerStoreNode *op) final { - LOG(FATAL) << "ProducerProvide cannot appear in a TIR PrimFunc"; - } private: // analyzer @@ -704,7 +701,7 @@ class TLVectorizer : public StmtMutator, PrimExpr var_lanes_; // ramp representing the var. PrimExpr ramp_; - // flag to mark requirment of scalarization. + // flag to mark requirement of scalarization. bool need_scalarize_{false}; // Let binding std::unordered_map let_binding_; @@ -787,6 +784,10 @@ class TLVectorizer : public StmtMutator, } }; +inline bool TargetHasSVE() { + return Target::Current()->GetFeature("has_sve").value_or(false); +} + class LoopVectorizer : public StmtMutator { public: Stmt VisitStmt_(const ForNode *op) final { @@ -796,7 +797,7 @@ class LoopVectorizer : public StmtMutator { if (!extent_as_int || extent_as_int->value < 1) { bool is_scalable_expr = CheckContains::ExprContains(op->extent, arith::IsVScaleCall); - ICHECK(is_scalable_expr && arith::TargetHasSVE()) + ICHECK(is_scalable_expr && TargetHasSVE()) << "Failed to vectorize loop with extent " << op->extent << " for target " << Target::Current(); } @@ -837,7 +838,10 @@ tvm::transform::Pass VectorizeLoop(bool enable_vectorize = true) { return CreatePrimFuncPass(pass_func, 0, "tl.VectorizeLoop", {}); } -TVM_REGISTER_GLOBAL("tl.transform.VectorizeLoop").set_body_typed(VectorizeLoop); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.VectorizeLoop", VectorizeLoop); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/warp_specialized_rewriter.cc b/src/transform/warp_specialized_rewriter.cc index c8ba56949..b17db4bec 100644 --- a/src/transform/warp_specialized_rewriter.cc +++ b/src/transform/warp_specialized_rewriter.cc @@ -5,6 +5,7 @@ #include "arith/ir_visitor_with_analyzer.h" #include "tir/analysis/var_use_def_analysis.h" +#include #include #include #include @@ -22,24 +23,45 @@ using arith::IRVisitorWithAnalyzer; enum class Role { kConsumer, kProducer, kBoth }; -class TMAFinder : public StmtExprVisitor { +class ProducerBufferDetector : public StmtExprVisitor { public: - void clear() { has_tma_load_ = false; } + ProducerBufferDetector( + std::unordered_set cur_producer_buffers) + : cur_producer_buffers_(cur_producer_buffers) {} + + void clear() { has_producer_buffer_ = false; } void VisitExpr_(const CallNode *call) final { if (call->op.same_as(tma_load()) || call->op.same_as(tma_load_im2col())) { - has_tma_load_ = true; + has_producer_buffer_ = true; } + StmtExprVisitor::VisitExpr_(call); } - bool has_tma_load_ = false; + void VisitExpr_(const BufferLoadNode *op) final { + if (cur_producer_buffers_.count(op->buffer.get())) { + has_producer_buffer_ = true; + } + StmtExprVisitor::VisitExpr_(op); + } + + bool 
has_producer_buffer_ = false; + std::unordered_set cur_producer_buffers_; }; class ProducerUsedBufferFinder : public StmtExprVisitor { public: auto FindProducerusedBuffer(Stmt stmt) { - VisitStmt(stmt); - return used_in_producer_cond_; + producer_buffers_.clear(); + std::unordered_set last_producer_buffers_; + for (;;) { + VisitStmt(stmt); + if (producer_buffers_ == last_producer_buffers_) { + break; + } + last_producer_buffers_ = producer_buffers_; + } + return producer_buffers_; } void InsertBuffer(const PrimExpr &expr) { @@ -47,36 +69,51 @@ class ProducerUsedBufferFinder : public StmtExprVisitor { VarUseDefAnalyzer usage(Array{}); usage(expr); for (const auto &buffer : usage.buffer_use_count_) { - used_in_producer_cond_.insert(buffer.first); - } - for (const auto &buffer : used_in_producer_cond_) { + producer_buffers_.insert(buffer.first); } } void VisitStmt_(const IfThenElseNode *op) final { - TMAFinder tma_finder; - tma_finder(op->then_case); + ProducerBufferDetector producer_buffer_detector(producer_buffers_); + producer_buffer_detector(op->then_case); if (op->else_case.defined()) { - tma_finder(op->else_case.value()); + producer_buffer_detector(op->else_case.value()); } - if (tma_finder.has_tma_load_) { + if (producer_buffer_detector.has_producer_buffer_) { InsertBuffer(op->condition); } StmtExprVisitor::VisitStmt_(op); } void VisitStmt_(const ForNode *op) final { - TMAFinder tma_finder; - tma_finder(op->body); - if (tma_finder.has_tma_load_) { + ProducerBufferDetector producer_buffer_detector(producer_buffers_); + producer_buffer_detector(op->body); + if (producer_buffer_detector.has_producer_buffer_) { InsertBuffer(op->min); InsertBuffer(op->extent); } StmtExprVisitor::VisitStmt_(op); } + void VisitStmt_(const BufferStoreNode *op) final { + if (producer_buffers_.count(op->buffer.get())) { + InsertBuffer(op->value); + } + StmtExprVisitor::VisitStmt_(op); + } + + void VisitExpr_(const CallNode *op) final { + if (op->op.same_as(tma_load()) || op->op.same_as(tma_load_im2col())) { + for (auto arg : op->args) { + if (auto buffer_load = arg.as()) { + producer_buffers_.insert(buffer_load->buffer.get()); + } + } + } + } + private: - std::unordered_set used_in_producer_cond_; + std::unordered_set producer_buffers_; }; class WarpSpecializedRoleMarker : public StmtVisitor { @@ -86,7 +123,7 @@ class WarpSpecializedRoleMarker : public StmtVisitor { void Prepare(const Stmt &stmt) { ProducerUsedBufferFinder finder; - used_in_producer_cond_ = finder.FindProducerusedBuffer(stmt); + producer_buffers_ = finder.FindProducerusedBuffer(stmt); } Role GetRole(const StmtNode *stmt) const { @@ -114,7 +151,7 @@ class WarpSpecializedRoleMarker : public StmtVisitor { void VisitStmt_(const BufferStoreNode *op) final { bool is_shared_store = op->buffer.scope() == "shared.dyn" || op->buffer.scope() == "shared"; - if (used_in_producer_cond_.count(op->buffer.get())) { + if (producer_buffers_.count(op->buffer.get())) { SetRole(op, Role::kBoth); return; } @@ -198,17 +235,22 @@ class WarpSpecializedRoleMarker : public StmtVisitor { std::unordered_map map_; bool has_simt_copy_ = false; bool has_bulk_copy_ = false; - std::unordered_set used_in_producer_cond_; + std::unordered_set producer_buffers_; }; static PrimExpr makeGetBarrier(PrimExpr barrier_id) { return Call(DataType::Handle(), get_mbarrier(), {barrier_id}); } -static Stmt makeArriveBarrier(PrimExpr barrier_id) { - auto call = Call(DataType::Handle(), builtin::ptx_arrive_barrier(), - {makeGetBarrier(barrier_id)}); - return Evaluate(call); +static Stmt 
makeArriveBarrier(PrimExpr barrier_id, int cta_id = -1, + PrimExpr pred = 1) { + Array args = {makeGetBarrier(barrier_id)}; + if (cta_id != -1) { + args.push_back(cta_id); + args.push_back(pred); + } + return Evaluate( + Call(DataType::Handle(), builtin::ptx_arrive_barrier(), args)); } static Stmt makeCpAsyncBarrier(PrimExpr barrier_id) { @@ -281,14 +323,18 @@ class MbarrierRewriter : public StmtExprMutator { class ThreadIdxRewriter : public StmtExprMutator { public: - static Stmt Rewrite(Stmt stmt, Var thread_var, PrimExpr replaced) { - auto rewriter = ThreadIdxRewriter(thread_var, replaced); + static Stmt Rewrite(Stmt stmt, Var thread_var, PrimExpr replaced, + PrimExpr thread_extent, bool do_shuffle = false) { + auto rewriter = + ThreadIdxRewriter(thread_var, replaced, thread_extent, do_shuffle); return rewriter(stmt); } private: - ThreadIdxRewriter(Var thread_var, PrimExpr replaced) - : thread_var_(thread_var), replaced_(replaced) {} + ThreadIdxRewriter(Var thread_var, PrimExpr replaced, PrimExpr thread_extent, + bool do_shuffle) + : thread_var_(thread_var), replaced_(replaced), + thread_extent_(thread_extent), do_shuffle_(do_shuffle) {} PrimExpr VisitExpr_(const VarNode *var) final { if (var == thread_var_.get()) { @@ -298,8 +344,34 @@ class ThreadIdxRewriter : public StmtExprMutator { } } + Stmt VisitStmt_(const IfThenElseNode *op) final { + auto f_uses_thread_index = [=](const tvm::tir::VarNode *parameter) { + return parameter == thread_var_.get(); + }; + maybe_thread_opt_ = false; + if (!op->else_case.defined() && op->condition.as() && + UsesVar(op->condition, f_uses_thread_index) && + !(UsesVar(op->then_case, f_uses_thread_index))) { + auto eq_op = Downcast(op->condition); + if (eq_op->a.as() == thread_var_.get() || + eq_op->b.as() == thread_var_.get()) { + maybe_thread_opt_ = true; + } + maybe_thread_opt_ = do_shuffle_ && maybe_thread_opt_; + } + if (maybe_thread_opt_) + return IfThenElse( + Call(DataType::Bool(), tl_shuffle_elect(), {thread_extent_}), + StmtExprMutator::VisitStmt(op->then_case), std::nullopt); + else + return StmtExprMutator::VisitStmt_(op); + } + Var thread_var_; PrimExpr replaced_; + PrimExpr thread_extent_; + bool maybe_thread_opt_ = false; + bool do_shuffle_; }; Block MakeGroupBlock(const Stmt &stmt, @@ -447,7 +519,7 @@ class GroupOpRewriter : public StmtExprMutator { order_anno.push_back(Integer(op_info.order)); stage_anno.push_back(Integer(op_info.stage)); } - Map for_annotations = op->annotations; + Map for_annotations = op->annotations; for_annotations.erase("tl_pipeline_group"); for_annotations.Set("software_pipeline_order", order_anno); for_annotations.Set("software_pipeline_stage", stage_anno); @@ -460,15 +532,84 @@ class GroupOpRewriter : public StmtExprMutator { PipelineInfo pipeline_info_; }; + +class WgMMACollector : public StmtExprVisitor { +public: + WgMMACollector() = default; + + void VisitExpr_(const CallNode *op) final { + if (op->op.same_as(tl_gemm()) || op->op.same_as(tl_gemm_sp())) { + auto op_name = std::string(op->args[0].as()->value); + if (has_wgmma_) { + has_wgmma_ = + op_name.find("false") == std::string::npos && !in_if_scope_; + } + } + StmtExprVisitor::VisitExpr_(op); + } + + void VisitStmt_(const IfThenElseNode *op) final { + in_if_scope_ = true; + StmtExprVisitor::VisitStmt(op->then_case); + if (op->else_case.defined()) { + StmtExprVisitor::VisitStmt(op->else_case.value()); + } + in_if_scope_ = false; + } + + static bool HasWgMMA(Stmt stmt) { + auto collector = WgMMACollector(); + collector(stmt); + return collector.has_wgmma_; + 
} + + bool has_wgmma_{true}; + bool in_if_scope_{false}; +}; + class WSCodeEmitter : public StmtMutator { public: - WSCodeEmitter(bool is_emitting_producer, IterVar thread_iv, + /** + * @brief Construct a warp-specialized code emitter configured for producer or consumer emission. + * + * Initializes a WSCodeEmitter that will emit barrier-aware, role-filtered code for a single + * warp-specialized block. The emitter is configured with the loop/thread iteration variable, + * buffer mapping, role marker used to classify statements, and two flags that control emission + * behavior: + * + * - `mbarrier_only`: when true, emission is restricted to barrier-related operations only. + * - `only_has_wgmma`: when true, the emitter will account for the presence of WgMMA + * (workgroup MMA) operations when computing barrier/thread gating behavior. + * + * @param is_emitting_producer True to emit producer-side groups; false to emit consumer-side groups. + * @param thread_iv IterVar representing the thread iteration variable (threadIdx.*) whose Var is used + * for thread-index rewrites and gating. + * @param buffer_data_to_buffer Map from buffer data Var to the corresponding Buffer (used to resolve + * buffer references during emission). + * @param marker Role marker that classifies statements as producer/consumer/both; used to filter + * which statements are emitted on this path. + * @param mbarrier_only If true, restrict emission to mbarrier-related statements and helpers. + * @param only_has_wgmma If true, adjust emission and barrier-thread-count logic for blocks that + * contain WgMMA operations. + */ + WSCodeEmitter(bool is_emitting_producer, IterVar thread_iv, Map buffer_data_to_buffer, const WarpSpecializedRoleMarker &marker, - bool mbarrier_only = false) + bool mbarrier_only = false, bool only_has_wgmma = false) : is_emitting_producer_(is_emitting_producer), buffer_data_to_buffer_(buffer_data_to_buffer), marker_(marker), - thread_var_(thread_iv->var), mbarrier_only_(mbarrier_only) {} + thread_var_(thread_iv->var), mbarrier_only_(mbarrier_only), + only_has_wgmma_(only_has_wgmma) {} + + /** + * @brief Whether a SIMT-style bulk copy was detected. + * + * Returns true when a simulated SIMT (thread-parallel) copy pattern was observed + * during analysis/emission, which can affect barrier insertion and copy emission. + * + * @return true if a SIMT copy was detected; false otherwise. + */ +bool hasSimtCopy() const { return has_simt_copy_; } private: template Stmt FilterByRole(const NodeType *op) { @@ -486,7 +627,47 @@ class WSCodeEmitter : public StmtMutator { } } - // TODO: only need to add block for ops in the loop + /** + * @brief Visit and transform a SeqStmt node, emitting grouped blocks with barrier + * synchronization according to producer/consumer roles. + * + * This method examines the sequence to determine whether producer-side + * synchronization is required (based on marker_ roles). If no producer sync is + * needed it delegates to FilterByRole. Otherwise it: + * - Recursively visits and transforms each child statement. + * - Extracts an acquire/release sync pattern for the sequence via + * ExtractSyncPattern. + * - For producer emission (is_emitting_producer_ == true): + * - Skips consumer-only statements unless marker_ marks a statement as Both, + * in which case the statement is emitted as its own group. 
+ * - For each statement, inserts parity waits for acquire patterns, rewrites + * release statements with MbarrierRewriter using a computed barrier id, + * collects SimT-copy presence (setting has_simt_copy_ and inserting + * cp.async barriers when found), optionally emits arrive barriers for + * release-after events, and emits each resulting set of statements as a + * group block annotated with "stmt_group". + * - For consumer emission (is_emitting_producer_ == false): + * - Skips producer-only statements. + * - Inserts parity waits for acquire patterns, appends the transformed + * statement, and emits arrive barriers for release-after events. When + * only_has_wgmma_ is set, the arrive barrier uses a per-thread predicate + * (FloorMod(thread_var_,128)==0) with CTA=0; otherwise a full arrive is + * emitted. + * - Recomputes pipeline_info_ to drop producer-only ops. + * + * Side effects / state updates: + * - Increments num_barriers_ by (number of extracted patterns * num_stages_). + * - May set has_simt_copy_ when a SimT copy is detected in producer rewrites. + * - Inserts barrier ids into released_barrier_ for release-after events. + * - Updates pipeline_info_ for the consumer path to remove producer ops. + * + * The resulting statements are emitted as grouped blocks (via MakeGroupBlock) + * with the annotation "stmt_group" and returned as either a single Stmt (if + * there's only one group) or a SeqStmt containing the grouped blocks. + * + * @return Stmt The transformed statement (either a single group block or a + * SeqStmt of group blocks). + */ Stmt VisitStmt_(const SeqStmtNode *op) final { bool has_producer = false; @@ -505,6 +686,7 @@ class WSCodeEmitter : public StmtMutator { op->seq.Map([&](Stmt stmt) { return VisitStmt(stmt); }); auto map = ExtractSyncPattern(op->seq); + /* std::cout << "Print ExtractSyncPattern" << std::endl; for (int i = 0; i < static_cast(op->seq.size()); i++) { @@ -557,8 +739,9 @@ class WSCodeEmitter : public StmtMutator { MbarrierRewriter::Rewrite(seq_transformed[i], release_barrier_id); collector.Collect(stmt); block_stmt.push_back(stmt); - if (collector.HasSimtCopy() > 0) { + if (collector.HasSimtCopy()) { block_stmt.push_back(makeCpAsyncBarrier(release_barrier_id)); + has_simt_copy_ = true; } if (map.release_after[i][j]) { block_stmt.push_back(makeArriveBarrier(release_barrier_id)); @@ -593,7 +776,11 @@ class WSCodeEmitter : public StmtMutator { int pattern_idx = map.release[i][j]; PrimExpr release_barrier_id = stage_ + num_barriers_ + num_stages_ * pattern_idx; - block_stmt.push_back(makeArriveBarrier(release_barrier_id)); + if (only_has_wgmma_) + block_stmt.push_back(makeArriveBarrier( + release_barrier_id, 0, EQ(FloorMod(thread_var_, 128), 0))); + else + block_stmt.push_back(makeArriveBarrier(release_barrier_id)); for (int s = 0; s < num_stages_; s++) { released_barrier_.insert(s + num_barriers_ + num_stages_ * pattern_idx); @@ -636,9 +823,9 @@ class WSCodeEmitter : public StmtMutator { Stmt VisitStmt_(const ForNode *op) final { int num_stages = 1; auto num_stages_anno = op->annotations.Get("num_stages"); - if (num_stages_anno.defined()) { - ICHECK(num_stages_anno.as()); - num_stages = static_cast(num_stages_anno.as()->value); + if (num_stages_anno) { + ICHECK(num_stages_anno->as()); + num_stages = static_cast(num_stages_anno->as()->value); ICHECK(num_stages_ == 1) << "Nested pipeline not supported."; } loop_stack_.emplace_back(op->loop_var, op->extent); @@ -648,16 +835,16 @@ class WSCodeEmitter : public StmtMutator { Array stage_info_array; auto 
group_anno = op->annotations.Get("tl_pipeline_group"); - if (group_anno.defined()) { - group_info_array = Downcast>>(group_anno); + if (group_anno) { + group_info_array = Downcast>>(group_anno.value()); } auto order_anno = op->annotations.Get("tl_pipeline_order"); - if (order_anno.defined()) { - order_info_array = Downcast>(order_anno); + if (order_anno) { + order_info_array = Downcast>(order_anno.value()); } auto stage_anno = op->annotations.Get("tl_pipeline_stage"); - if (stage_anno.defined()) { - stage_info_array = Downcast>(stage_anno); + if (stage_anno) { + stage_info_array = Downcast>(stage_anno.value()); } PipelineInfo pipeline_info(group_info_array, order_info_array, @@ -686,8 +873,8 @@ class WSCodeEmitter : public StmtMutator { auto result = FilterByRole(op); Stmt grouped_for_node; - if (result.as() && group_anno.defined() && - group_info_array.size() > 0 && !is_emitting_producer_) { + if (result.as() && group_anno && group_info_array.size() > 0 && + !is_emitting_producer_) { GroupOpRewriter group_op_rewriter(pipeline_info_); auto for_node = Downcast(result); grouped_for_node = group_op_rewriter(for_node); @@ -707,7 +894,7 @@ class WSCodeEmitter : public StmtMutator { for_node.CopyOnWrite()->annotations.erase("tl_pipeline_order"); for_node.CopyOnWrite()->annotations.erase("tl_pipeline_stage"); } - if (is_emitting_producer_ || !group_anno.defined() || + if (is_emitting_producer_ || !group_anno || group_info_array.size() == 0) { loop_stack_.pop_back(); return for_node; @@ -945,6 +1132,8 @@ class WSCodeEmitter : public StmtMutator { bool mbarrier_only_ = false; PipelineInfo pipeline_info_; friend class WarpSpecializedRewriter; + bool only_has_wgmma_ = false; + bool has_simt_copy_ = false; }; class SetMaxNRegCollector : public StmtExprVisitor { @@ -985,9 +1174,12 @@ class SetMaxNRegCollector : public StmtExprVisitor { class WarpSpecializedRewriter : public StmtExprMutator { public: - WarpSpecializedRewriter(bool disable_warp_specialized) - : disable_warp_specialized_(disable_warp_specialized) {} - static PrimFunc Substitute(PrimFunc f, bool disable_warp_specialized) { + WarpSpecializedRewriter(bool disable_warp_specialized, + bool disable_shuffle_elect) + : disable_warp_specialized_(disable_warp_specialized), + disable_shuffle_elect_(disable_shuffle_elect) {} + static PrimFunc Substitute(PrimFunc f, bool disable_warp_specialized, + bool disable_shuffle_elect) { // Check if function only uses threadIdx.x before proceeding if (!ThreadTagChecker::HasOnlyThreadIdxX(f)) { LOG(WARNING) << "WarpSpecialize will be disabled because the program " @@ -998,7 +1190,8 @@ class WarpSpecializedRewriter : public StmtExprMutator { return f; } - auto T = WarpSpecializedRewriter(disable_warp_specialized); + auto T = WarpSpecializedRewriter(disable_warp_specialized, + disable_shuffle_elect); T.nreg_ = SetMaxNRegCollector::Collect(f); T.buffer_lca_ = DetectBufferAccessLCA(f); for (auto [buffer, _] : T.buffer_lca_) @@ -1048,12 +1241,44 @@ class WarpSpecializedRewriter : public StmtExprMutator { ICHECK(thread_tag == "threadIdx.x") << "Only support threadIdx.x"; Var thread_iv = Downcast(for_node->loop_var); Stmt new_body = - ThreadIdxRewriter::Rewrite(for_node->body, thread_iv, thread_iv_); + ThreadIdxRewriter::Rewrite(for_node->body, thread_iv, thread_iv_, 0); return new_body; } return for_node; } + /** + * @brief Rewrite a BlockRealize for warp specialization, inserting barriers and + * emitting producer/consumer bodies. 
+ * + * This visitor handles BlockRealize nodes when a thread IterVar (thread_iv_) + * is defined and warp-specialization is applicable. It: + * - Determines producer/consumer roles via WarpSpecializedRoleMarker and + * returns the original block if no producer is detected. + * - If warp specialization is disabled, emits only mbarrier initialization and + * the mbarrier-only transformed body. + * - Otherwise, detects WgMMA usage for the block body and constructs separate + * WSCodeEmitter instances for producer and consumer paths (propagating the + * WgMMA flag to the consumer emitter). + * - Generates producer/consumer code, applies register hint calls (set_max_nreg) + * when available, and rewrites thread indices with ThreadIdxRewriter to + * partition threads between producer and consumer roles. + * - Computes and initializes a list of mbarrier handles with per-barrier + * arrive thread counts (taking SIMT-copy and WgMMA cases into account). + * - Wraps the transformed body in an IfThenElse that dispatches producer vs + * consumer based on thread index, and annotates the region with the + * "kWarpSpecializationScope" attribute that contains producer/consumer + * thread extents. + * + * Side effects: + * - May update member state: only_has_wgmma_, updated_thread_extent_, + * need_update_thread_extent_. + * - May abort via ICHECK if invariants (e.g., matching barrier counts) are + * violated. + * + * @return The possibly rewritten BlockRealize statement (original when no + * warp-specialization is applied or thread_iv_ is undefined). + */ Stmt VisitStmt_(const BlockRealizeNode *op) final { BlockRealize block_realize = Downcast(StmtExprMutator::VisitStmt_(op)); @@ -1087,8 +1312,10 @@ class WarpSpecializedRewriter : public StmtExprMutator { block_realize.CopyOnWrite()->block = block; return block_realize; } + only_has_wgmma_ = WgMMACollector::HasWgMMA(block->body); WSCodeEmitter producer(true, thread_iv_, buffer_data_to_buffer_, marker); - WSCodeEmitter consumer(false, thread_iv_, buffer_data_to_buffer_, marker); + WSCodeEmitter consumer(false, thread_iv_, buffer_data_to_buffer_, marker, + false, only_has_wgmma_); Stmt producer_code = producer(block->body); Stmt consumer_code = consumer(block->body); PrimExpr consumer_thread_extent = thread_iv_->dom->extent; @@ -1103,7 +1330,7 @@ class WarpSpecializedRewriter : public StmtExprMutator { auto inc_reg_stmt = Evaluate(0); auto dec_reg_stmt = Evaluate(0); - if (dec_reg >= 0 && inc_reg >= 0) { + if (dec_reg >= 0 && inc_reg >= 0 && !marker.HasSimtCopy()) { inc_reg_stmt = Evaluate(Call(DataType::Handle(), set_max_nreg(), {inc_reg == 0 ? 
240 : inc_reg, 1})); dec_reg_stmt = Evaluate(Call(DataType::Handle(), set_max_nreg(), @@ -1113,10 +1340,15 @@ class WarpSpecializedRewriter : public StmtExprMutator { producer_code = SeqStmt({dec_reg_stmt, producer_code}); consumer_code = SeqStmt({inc_reg_stmt, consumer_code}); - producer_code = - ThreadIdxRewriter::Rewrite(producer_code, thread_iv_->var, - thread_iv_->var - consumer_thread_extent); updated_thread_extent_ = consumer_thread_extent + producer_thread_extent; + + producer_code = ThreadIdxRewriter::Rewrite( + producer_code, thread_iv_->var, + thread_iv_->var - consumer_thread_extent, producer_thread_extent, + !disable_shuffle_elect_); + consumer_code = ThreadIdxRewriter::Rewrite( + consumer_code, thread_iv_->var, thread_iv_->var, consumer_thread_extent, + !disable_shuffle_elect_); need_update_thread_extent_ = true; ICHECK(producer.num_barriers_ == consumer.num_barriers_) @@ -1125,9 +1357,11 @@ class WarpSpecializedRewriter : public StmtExprMutator { Array barrier_num_threads; barrier_num_threads.reserve(num_barriers); for (int i = 0; i < num_barriers; i++) { - PrimExpr arrive_thread_count = producer.released_barrier_.count(i) - ? producer_thread_extent - : consumer_thread_extent; + PrimExpr arrive_thread_count = + producer.released_barrier_.count(i) + ? (producer.hasSimtCopy() ? producer_thread_extent : 1) + : (only_has_wgmma_ ? FloorDiv(consumer_thread_extent, 128) + : consumer_thread_extent); barrier_num_threads.push_back(arrive_thread_count); } @@ -1154,7 +1388,9 @@ class WarpSpecializedRewriter : public StmtExprMutator { Optional updated_thread_extent_; bool need_update_thread_extent_ = false; bool disable_warp_specialized_ = false; + bool disable_shuffle_elect_ = false; Array nreg_; + bool only_has_wgmma_ = false; }; class WarpSpecializedDetector : public IRVisitorWithAnalyzer { @@ -1220,18 +1456,23 @@ tvm::transform::Pass WarpSpecialized() { auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { bool disable_warp_specialized = ctx->GetConfig(kDisableWarpSpecialized, Bool(false)).value(); + bool disable_shuffle_elect = + ctx->GetConfig(kDisableShuffleElect, Bool(false)).value(); bool warp_specialized = WarpSpecializedDetector::Detect(f->body); if (!warp_specialized) { - return WarpSpecializedRewriter::Substitute(f, disable_warp_specialized); + return WarpSpecializedRewriter::Substitute(f, disable_warp_specialized, + disable_shuffle_elect); } return f; }; return CreatePrimFuncPass(pass_func, 0, "tl.WarpSpecialized", {}); } -TVM_REGISTER_GLOBAL("tl.transform.WarpSpecialized") - .set_body_typed(WarpSpecialized); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.WarpSpecialized", WarpSpecialized); +}); } // namespace tl } // namespace tvm diff --git a/src/transform/wgmma_sync_rewriter.cc b/src/transform/wgmma_sync_rewriter.cc index eae3efe2d..4b6614af0 100644 --- a/src/transform/wgmma_sync_rewriter.cc +++ b/src/transform/wgmma_sync_rewriter.cc @@ -1,27 +1,9 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - /*! * \file warp_specialized_pipeline.cc * \brief Warp specialized Pipeline for cuda GPU (sm90+) */ +#include #include #include #include @@ -131,7 +113,7 @@ class WgmmaSyncRewriter : public StmtExprMutator { Stmt VisitStmt_(const ForNode *op) final { auto order_anno = op->annotations.Get("tl_pipeline_order"); - if (!order_anno.defined()) { + if (!order_anno) { return StmtExprMutator::VisitStmt_(op); } @@ -281,8 +263,10 @@ tvm::transform::Pass RewriteWgmmaSync() { return CreatePrimFuncPass(pass_func, 0, "tl.RewriteWgmmaSync", {}); } -TVM_REGISTER_GLOBAL("tl.transform.RewriteWgmmaSync") - .set_body_typed(RewriteWgmmaSync); +TVM_FFI_STATIC_INIT_BLOCK({ + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tl.transform.RewriteWgmmaSync", RewriteWgmmaSync); +}); } // namespace tl } // namespace tvm diff --git a/testing/python/amd/test_tilelang_gemm_mfma_intrinsic.py b/testing/python/amd/test_tilelang_gemm_mfma_intrinsic.py index 8244b173f..8b66d5dab 100644 --- a/testing/python/amd/test_tilelang_gemm_mfma_intrinsic.py +++ b/testing/python/amd/test_tilelang_gemm_mfma_intrinsic.py @@ -1,5 +1,4 @@ import torch -import torch.backends import tilelang.testing from tilelang import tvm as tvm import tilelang.language as T diff --git a/testing/python/cpu/test_tilelang_cpu_gemm.py b/testing/python/cpu/test_tilelang_cpu_gemm.py index 2b53a047c..42e7a8158 100644 --- a/testing/python/cpu/test_tilelang_cpu_gemm.py +++ b/testing/python/cpu/test_tilelang_cpu_gemm.py @@ -4,6 +4,8 @@ import tilelang.language as T import torch +tilelang.disable_cache() + def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): num_stages = 0 diff --git a/testing/python/kernel/test_tilelang_kernel_bf16_gemm_mma.py b/testing/python/kernel/test_tilelang_kernel_bf16_gemm_mma.py index 331c4e4a5..b4509fadc 100644 --- a/testing/python/kernel/test_tilelang_kernel_bf16_gemm_mma.py +++ b/testing/python/kernel/test_tilelang_kernel_bf16_gemm_mma.py @@ -40,8 +40,8 @@ def tl_matmul( assert in_dtype in [ "float16", "bfloat16", - "e4m3_float8", - "e5m2_float8", + "float8_e4m3", + "float8_e5m2", "int8", ], "Currently only float16 and int8 are supported" assert out_dtype in [ @@ -52,7 +52,7 @@ def tl_matmul( micro_size_x = micro_size_y = micro_size_k = 16 - is_float8 = in_dtype in ["e4m3_float8", "e5m2_float8"] + is_float8 = in_dtype in ["float8_e4m3", "float8_e5m2"] if out_dtype == "int32" or is_float8: micro_size_k = 32 @@ -220,4 +220,5 @@ def test_assert_tl_matmul_bfloat16(): if __name__ == "__main__": - tilelang.testing.main() + # tilelang.testing.main() + test_assert_tl_matmul_bfloat16() diff --git a/testing/python/kernel/test_tilelang_kernel_deepseek_nsa.py b/testing/python/kernel/test_tilelang_kernel_deepseek_nsa.py deleted file mode 100644 index c7ff2d641..000000000 --- a/testing/python/kernel/test_tilelang_kernel_deepseek_nsa.py +++ /dev/null @@ -1,324 +0,0 @@ -# ruff: noqa -from tilelang import tvm as tvm -import tilelang.testing -import tilelang.language as T -import torch -from typing import Optional, Union -from einops import rearrange, repeat - 
-tilelang.testing.set_random_seed(42) - - -def naive_nsa_ref(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: - - if scale is None: - scale = k.shape[-1]**-0.5 - if cu_seqlens is not None: - assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" - if head_first: - raise RuntimeError( - "Sequences with variable lengths are not supported for head-first mode") - if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) - if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') - - dtype = q.dtype - G = q.shape[2] // k.shape[2] - BS = block_size - S = block_indices.shape[-1] - k, v, block_indices = (repeat(x, 'b t h d -> b t (h g) d', g=G) for x in (k, v, block_indices)) - if isinstance(block_counts, torch.Tensor): - block_counts = repeat(block_counts, 'b t h -> b t (h g)', g=G) - c = torch.arange(S).repeat_interleave(BS).unsqueeze(1).expand(-1, q.shape[2]).to(q.device) - q, k, v = map(lambda x: x.float(), (q, k, v)) - - o_slc = torch.zeros_like(v) - o_swa = torch.zeros_like(v) if window_size > 0 else None - varlen = True - if cu_seqlens is None: - varlen = False - B, T = q.shape[:2] - cu_seqlens = torch.cat( - [block_indices.new_tensor(range(0, B * T, T)), - block_indices.new_tensor([B * T])]) - - for i in range(len(cu_seqlens) - 1): - if not varlen: - q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = q[i], k[i], v[i], g_slc[i], g_swa[ - i], block_indices[i] - if isinstance(block_counts, torch.Tensor): - s_b = block_counts[i] - else: - s_b = block_counts - else: - T = cu_seqlens[i + 1] - cu_seqlens[i] - q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = map( - lambda x: x[0][cu_seqlens[i]:cu_seqlens[i + 1]], - (q, k, v, g_slc, g_swa, block_indices)) - if isinstance(block_counts, torch.Tensor): - s_b = block_counts[0][cu_seqlens[i]:cu_seqlens[i + 1]] - else: - s_b = block_counts - - i_b = i_b.unsqueeze(-1) * BS + i_b.new_tensor(range(BS)) - # [T, S*BS, HQ] - i_b = i_b.view(T, block_indices.shape[2], -1).transpose(1, 2) - for i_q in range(T): - # [HQ, D] - q_i = q_b[i_q] * scale - # [HQ] - g_slc_i = g_slc_b[i_q] - # [HQ] - g_swa_i = g_swa_b[i_q] - # [S*BS, HQ] - i_i = i_b[i_q] - # [HQ] - if isinstance(block_counts, torch.Tensor): - s_i = s_b[i_q] - else: - s_i = s_b - # [S*BS, HQ, -1] - k_i_slc, v_i_slc = map( - lambda x: x.gather( - 0, - i_i.clamp(0, T - 1).unsqueeze(-1).expand(*i_i.shape, x.shape[-1])), (k_b, v_b)) - # [S*BS, HQ] - attn_slc = torch.einsum('h d, n h d -> n h', q_i, k_i_slc).masked_fill( - torch.logical_or(i_i < 0, i_i > i_q) | - (c >= s_i if block_counts is not None else False), float('-inf')).softmax(0) - if not varlen: - o_slc[i, i_q] = torch.einsum('n h, n h v -> h v', attn_slc, - v_i_slc) * g_slc_i.unsqueeze(-1) - else: - o_slc[0][cu_seqlens[i] + i_q] = torch.einsum('n h, n h v -> h v', attn_slc, - v_i_slc) * g_slc_i.unsqueeze(-1) - if window_size > 0: - k_i_swa, v_i_swa = map(lambda x: x[max(0, i_q - window_size + 1):i_q + 1], - (k_b, v_b)) - attn_swa = torch.einsum('h d, n h d -> n h', q_i, k_i_swa).softmax(0) - if not varlen: - o_swa[i, i_q] = torch.einsum('n h, n h v -> 
h v', attn_swa, - v_i_swa) * g_swa_i.unsqueeze(-1) - else: - o_swa[0][cu_seqlens[i] + i_q] = torch.einsum('n h, n h v -> h v', attn_swa, - v_i_swa) * g_swa_i.unsqueeze(-1) - - if head_first: - o_slc = rearrange(o_slc, 'b t h d -> b h t d') - o_swa = rearrange(o_swa, 'b t h d -> b h t d') - - return o_slc.to(dtype) + o_swa.to(dtype) if o_swa is not None else o_slc.to(dtype) - - -def native_sparse_attention(batch, - heads, - seq_len, - dim, - is_causal, - scale=None, - block_size=64, - groups=16, - selected_blocks=16, - num_stages=0, - threads=32): - if scale is None: - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) - else: - scale = scale * 1.44269504 # log2(e) - - head_kv = heads // groups - q_shape = [batch, seq_len, heads, dim] - kv_shape = [batch, seq_len, head_kv, dim] - block_indices_shape = [batch, seq_len, head_kv, selected_blocks] - block_indices_dtype = "int32" - dtype = "float16" - accum_dtype = "float" - block_S = block_size - block_T = min(128, tilelang.math.next_power_of_2(dim)) - - NK = tilelang.cdiv(dim, block_T) - NV = tilelang.cdiv(dim, block_T) - assert NK == 1, "The key dimension can not be larger than 256" - - S = selected_blocks - G = groups - BS = block_S - BK = BV = block_T - - @T.prim_func - def native_sparse_attention( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), - Output: T.Tensor(q_shape, dtype), - ): - with T.Kernel(seq_len, NV, batch * head_kv, threads=threads) as (bx, by, bz): - Q_shared = T.alloc_shared([G, BK], dtype) - K_shared = T.alloc_shared([BS, BK], dtype) - V_shared = T.alloc_shared([BS, BV], dtype) - O_shared = T.alloc_shared([G, BV], dtype) - - acc_s = T.alloc_fragment([G, BS], accum_dtype) - acc_s_cast = T.alloc_fragment([G, BS], dtype) - acc_o = T.alloc_fragment([G, BV], accum_dtype) - scores_max = T.alloc_fragment([G], accum_dtype) - scores_max_prev = T.alloc_fragment([G], accum_dtype) - scores_scale = T.alloc_fragment([G], accum_dtype) - scores_sum = T.alloc_fragment([G], accum_dtype) - logsum = T.alloc_fragment([G], accum_dtype) - - i_t, i_v, i_bh = bx, by, bz - i_b, i_h = i_bh // head_kv, i_bh % head_kv - - NS = S - T.copy(Q[i_b, i_t, i_h * G:(i_h + 1) * G, :], Q_shared) - - T.fill(acc_o, 0) - T.fill(logsum, 0) - T.fill(scores_max, -T.infinity(accum_dtype)) - - for i in T.Pipelined(NS, num_stages=num_stages): - i_s = BlockIndices[i_b, i_t, i_h, i] * BS - if i_s <= i_t and i_s >= 0: - # [BS, BK] - T.copy(K[i_b, i_s:i_s + BS, i_h, :], K_shared) - - if is_causal: - for i, j in T.Parallel(G, BS): - acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, - -T.infinity(acc_s.dtype)) - else: - T.clear(acc_s) - - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) - - # Softmax - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=True) - for i in T.Parallel(G): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(G, BS): - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(G): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - # Rescale - for i, j in T.Parallel(G, BV): - acc_o[i, j] *= scores_scale[i] - - # V * softmax(Q * K) - T.copy(V[i_b, i_s:i_s + BS, i_h, i_v * BV:(i_v + 1) * BV], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, 
policy=T.GemmWarpPolicy.FullRow) - - for i, j in T.Parallel(G, BV): - acc_o[i, j] /= logsum[i] - T.copy(acc_o, O_shared) - T.copy(O_shared, Output[i_b, i_t, i_h * G:(i_h + 1) * G, i_v * BV:(i_v + 1) * BV]) - - return native_sparse_attention - - -def run_native_sparse_attention(batch, - heads, - seq_len, - dim, - is_causal, - scale=None, - block_size=64, - groups=16, - selected_blocks=16, - num_stages=0, - threads=32): - dtype = torch.float16 - head_kv = heads // groups - program = native_sparse_attention(batch, heads, seq_len, dim, is_causal, scale, block_size, - groups, selected_blocks, num_stages, threads) - kernel = tilelang.compile(program, out_idx=-1) - Q = torch.randn((batch, seq_len, heads, dim), dtype=dtype).cuda() - K = torch.randn((batch, seq_len, head_kv, dim), dtype=dtype).cuda() - V = torch.randn((batch, seq_len, head_kv, dim), dtype=dtype).cuda() - g_slc = torch.ones((batch, seq_len, heads), dtype=dtype).cuda() - g_swa = torch.ones((batch, seq_len, heads), dtype=dtype).cuda() - - block_indices = torch.full((batch, seq_len, head_kv, selected_blocks), - seq_len, - dtype=torch.long, - device='cuda') - for b in range(batch): - for t in range(seq_len): - for h in range(head_kv): - i_i = torch.randperm(max(1, (t // block_size)))[:selected_blocks] - block_indices[b, t, h, :len(i_i)] = i_i - block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, selected_blocks + 1, (batch, seq_len, head_kv), device='cuda') - - out = kernel(Q, K, V, block_indices.to(torch.int32)) - - ref = naive_nsa_ref( - q=Q, - k=K, - v=V, - g_slc=g_slc, - g_swa=g_swa, - block_indices=block_indices, - block_counts=block_counts, - block_size=block_size, - scale=scale, - ) - torch.testing.assert_close(ref, out, atol=1e-2, rtol=1e-2) - - -def test_tilelang_kernel_deepseek_nsa(): - # disable pipeline - run_native_sparse_attention( - batch=2, - heads=64, - seq_len=1, - dim=16, - is_causal=True, - scale=None, - block_size=32, - groups=16, - selected_blocks=16, - num_stages=0, - threads=32) - # enable pipeline - run_native_sparse_attention( - batch=2, - heads=64, - seq_len=1, - dim=16, - is_causal=True, - scale=None, - block_size=32, - groups=16, - selected_blocks=16, - num_stages=2, - threads=32) - - -if __name__ == "__main__": - tilelang.testing.main() diff --git a/testing/python/kernel/test_tilelang_kernel_dequantize_gemm.py b/testing/python/kernel/test_tilelang_kernel_dequantize_gemm.py index 2f0394941..c4df8fa67 100644 --- a/testing/python/kernel/test_tilelang_kernel_dequantize_gemm.py +++ b/testing/python/kernel/test_tilelang_kernel_dequantize_gemm.py @@ -97,7 +97,7 @@ def test_fp4_fp16_convert_close(): block_K, "float16", ) - + print(program.script()) kernel = tilelang.compile(program, out_idx=[1]) B = torch.randint(0, 16, (N, K // 2), dtype=torch.uint8, device="cuda").to(torch.uint8) @@ -642,4 +642,5 @@ def test_assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4(): if __name__ == "__main__": - tilelang.testing.main() + # tilelang.testing.main() + test_fp4_fp16_convert_close() diff --git a/testing/python/kernel/test_tilelang_kernel_fp8_gemm.py b/testing/python/kernel/test_tilelang_kernel_fp8_gemm.py index a785ad7b2..19f327d66 100644 --- a/testing/python/kernel/test_tilelang_kernel_fp8_gemm.py +++ b/testing/python/kernel/test_tilelang_kernel_fp8_gemm.py @@ -56,8 +56,8 @@ def assert_matmul_correctness(M, N, K, block_M, block_N, block_K, in_dtype, out_ @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version(9) def test_assert_matmul(): - 
assert_matmul_correctness(1024, 1024, 1024, 128, 128, 64, "e4m3_float8", "float32", "float32") - assert_matmul_correctness(1024, 1024, 1024, 128, 128, 64, "e5m2_float8", "float32", "float32") + assert_matmul_correctness(1024, 1024, 1024, 128, 128, 64, "float8_e4m3", "float32", "float32") + assert_matmul_correctness(1024, 1024, 1024, 128, 128, 64, "float8_e5m2", "float32", "float32") if __name__ == "__main__": diff --git a/testing/python/kernel/test_tilelang_kernel_fp8_gemm_mma.py b/testing/python/kernel/test_tilelang_kernel_fp8_gemm_mma.py index a1ccf2f42..34def174d 100644 --- a/testing/python/kernel/test_tilelang_kernel_fp8_gemm_mma.py +++ b/testing/python/kernel/test_tilelang_kernel_fp8_gemm_mma.py @@ -39,8 +39,8 @@ def tl_matmul( ): assert in_dtype in [ "float16", - "e4m3_float8", - "e5m2_float8", + "float8_e4m3", + "float8_e5m2", "int8", ], "Currently only float16 and int8 are supported" assert out_dtype in [ @@ -51,7 +51,7 @@ def tl_matmul( micro_size_x = micro_size_y = micro_size_k = 16 - is_float8 = in_dtype in ["e4m3_float8", "e5m2_float8"] + is_float8 = in_dtype in ["float8_e4m3", "float8_e5m2"] if out_dtype == "int32" or is_float8: micro_size_k = 32 @@ -216,8 +216,8 @@ def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype): @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version(8, 9) def test_assert_tl_matmul(): - assert_tl_matmul_correctness(128, 128, 128, "e4m3_float8", "float32", "float32") - assert_tl_matmul_correctness(128, 128, 128, "e5m2_float8", "float32", "float32") + assert_tl_matmul_correctness(128, 128, 128, "float8_e4m3", "float32", "float32") + assert_tl_matmul_correctness(128, 128, 128, "float8_e5m2", "float32", "float32") if __name__ == "__main__": diff --git a/testing/python/kernel/test_tilelang_kernel_fp8_gemv_simt.py b/testing/python/kernel/test_tilelang_kernel_fp8_gemv_simt.py index 010af763f..afd01f337 100644 --- a/testing/python/kernel/test_tilelang_kernel_fp8_gemv_simt.py +++ b/testing/python/kernel/test_tilelang_kernel_fp8_gemv_simt.py @@ -166,8 +166,8 @@ def evaluate_gemv_simt( @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version(8, 9) def test_gemv_simt(): - evaluate_gemv_simt(1, 1024, 1024, "e4m3_float8", "float32", "float32", with_bias=False) - evaluate_gemv_simt(1, 1024, 1024, "e5m2_float8", "float32", "float32", with_bias=False) + evaluate_gemv_simt(1, 1024, 1024, "float8_e4m3", "float32", "float32", with_bias=False) + evaluate_gemv_simt(1, 1024, 1024, "float8_e5m2", "float32", "float32", with_bias=False) if __name__ == "__main__": diff --git a/testing/python/kernel/test_tilelang_kernel_gemm_mma_intrinsic.py b/testing/python/kernel/test_tilelang_kernel_gemm_mma_intrinsic.py index acf8d1765..da2e12cdc 100644 --- a/testing/python/kernel/test_tilelang_kernel_gemm_mma_intrinsic.py +++ b/testing/python/kernel/test_tilelang_kernel_gemm_mma_intrinsic.py @@ -40,8 +40,8 @@ def tl_matmul( assert in_dtype in [ "float16", "bfloat16", - "e4m3_float8", - "e5m2_float8", + "float8_e4m3", + "float8_e5m2", "int8", ], "Currently only float16 and int8 are supported" assert out_dtype in [ @@ -52,7 +52,7 @@ def tl_matmul( micro_size_x = micro_size_y = micro_size_k = 16 - is_float8 = in_dtype in ["e4m3_float8", "e5m2_float8"] + is_float8 = in_dtype in ["float8_e4m3", "float8_e5m2"] if out_dtype == "int32" or is_float8: micro_size_k = 32 @@ -228,8 +228,8 @@ def test_assert_tl_matmul_bfloat16(): @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version(8, 9) def 
test_assert_tl_matmul_fp8(): - assert_tl_matmul_correctness(128, 128, 128, "e4m3_float8", "float32", "float32") - assert_tl_matmul_correctness(128, 128, 128, "e5m2_float8", "float32", "float32") + assert_tl_matmul_correctness(128, 128, 128, "float8_e4m3", "float32", "float32") + assert_tl_matmul_correctness(128, 128, 128, "float8_e5m2", "float32", "float32") if __name__ == "__main__": diff --git a/testing/python/kernel/test_tilelang_kernel_gemm_with_stride.py b/testing/python/kernel/test_tilelang_kernel_gemm_with_stride.py new file mode 100644 index 000000000..bbc2e79e2 --- /dev/null +++ b/testing/python/kernel/test_tilelang_kernel_gemm_with_stride.py @@ -0,0 +1,86 @@ +import tilelang.testing +import tilelang +import tilelang.language as T +import torch + + +def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): + + @T.prim_func + def main( + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), + ): + # Initialize Kernel Context + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K * 2), dtype, scope="shared") + B_shared = T.alloc_shared((block_K, block_N * 2), dtype, scope="shared") + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + + # Clear local accumulation + T.clear(C_local) + T.clear(B_shared) + T.clear(A_shared) + + for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=0): + # Copy tile of A + # T.copy(A[by * block_M, ko * block_K], A_shared) + for i, k in T.Parallel(block_M, block_K): + A_shared[i, k + block_K] = A[by * block_M + i, ko * block_K + k] + + # Copy tile of B + # T.copy(B[ko * block_K, bx * block_N], B_shared) + for i, k in T.Parallel(block_K, block_N): + B_shared[i, k] = B[ko * block_K + i, bx * block_N + k] + + # Perform a tile-level GEMM on the shared buffers + # Currently we dispatch to the cute/hip on Nvidia/AMD GPUs + T.gemm(A_shared[:, block_K:], B_shared[0:block_K, 0:block_N], C_local) + + # Copy result back to global memory + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def run_gemm_with_stride_ss(M: int, N: int, K: int, block_M: int, block_N: int, block_K: int): + # 1. Define the kernel (matmul) and compile/lower it into an executable module + func = matmul(M, N, K, block_M, block_N, block_K) + + # 2. Compile the kernel into a torch function + # out_idx specifies the index of the output buffer in the argument list + # if out_idx is specified, the tensor will be created during runtime + # target currently can be "cuda" or "hip" or "cpu". 
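+    # Note: the pass_configs below disable TMA lowering and warp
+    # specialization (presumably so the strided shared-memory copies are
+    # exercised on the generic lowering path).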
+ jit_kernel = tilelang.compile( + func, + out_idx=[2], + target="cuda", + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + }) + # Create random input tensors on the GPU + a = torch.randn(M, K, device="cuda", dtype=torch.float16) + b = torch.randn(K, N, device="cuda", dtype=torch.float16) + + # Run the kernel through the Profiler + c = jit_kernel(a, b) + + print(c) + # Reference multiplication using PyTorch + ref_c = a @ b + + # Validate correctness + torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) + print("Kernel output matches PyTorch reference.") + + +@tilelang.testing.requires_cuda +@tilelang.testing.requires_cuda_compute_version_ge(7, 5) +def test_tilelang_kernel_gemm_with_stride(): + run_gemm_with_stride_ss(128, 128, 64, 32, 32, 32) + + +if __name__ == "__main__": + tilelang.testing.main() diff --git a/testing/python/kernel/test_tilelang_kernel_gemv_simt.py b/testing/python/kernel/test_tilelang_kernel_gemv_simt.py index 9e68de9d9..86d6acbda 100644 --- a/testing/python/kernel/test_tilelang_kernel_gemv_simt.py +++ b/testing/python/kernel/test_tilelang_kernel_gemv_simt.py @@ -173,8 +173,8 @@ def test_gemv_simt(): @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version(8, 9) def test_gemv_simt_fp8(): - evaluate_gemv_simt(1, 1024, 1024, "e4m3_float8", "float32", "float32", with_bias=False) - evaluate_gemv_simt(1, 1024, 1024, "e5m2_float8", "float32", "float32", with_bias=False) + evaluate_gemv_simt(1, 1024, 1024, "float8_e4m3", "float32", "float32", with_bias=False) + evaluate_gemv_simt(1, 1024, 1024, "float8_e5m2", "float32", "float32", with_bias=False) if __name__ == "__main__": diff --git a/testing/python/kernel/test_tilelang_kernel_int4_gemm_mma.py b/testing/python/kernel/test_tilelang_kernel_int4_gemm_mma.py index 7319e0d1f..b11abefd1 100644 --- a/testing/python/kernel/test_tilelang_kernel_int4_gemm_mma.py +++ b/testing/python/kernel/test_tilelang_kernel_int4_gemm_mma.py @@ -14,9 +14,10 @@ from tilelang.transform import simplify_prim_func tilelang.testing.set_random_seed(42) +tilelang.disable_cache() -@simplify_prim_func +# @simplify_prim_func def tl_matmul( M, N, @@ -164,7 +165,13 @@ def main( def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype): matmul = tl_matmul(M, N, K, in_dtype, out_dtype, accum_dtype) - kernel = tilelang.compile(matmul, out_idx=[2]) + kernel = tilelang.compile( + matmul, + out_idx=[2], + pass_configs={ + tilelang.PassConfigKey.TL_DEBUG_MERGE_SHARED_MEMORY_ALLOCATIONS: True, + }) + print(kernel.get_kernel_source()) profiler = kernel.get_profiler() src_code = kernel.get_kernel_source() @@ -400,4 +407,5 @@ def test_assert_tl_matmul_weight_only_transform(): if __name__ == "__main__": - tilelang.testing.main() + # tilelang.testing.main() + assert_tl_matmul_correctness(128, 128, 128, "int8", "int32", "int32") diff --git a/testing/python/language/test_tilelang_language_alias.py b/testing/python/language/test_tilelang_language_alias.py index 038474fce..c99d36102 100644 --- a/testing/python/language/test_tilelang_language_alias.py +++ b/testing/python/language/test_tilelang_language_alias.py @@ -27,7 +27,9 @@ def main( for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=0): # Copy tile of A # This is a sugar syntax for parallelized copy - T.copy(A[by * block_M, ko * block_K], X_shared) + aliased_offset = T.int32() + T.let(aliased_offset, ko * block_K) + T.copy(A[by * block_M, aliased_offset], X_shared) # Demonstrate parallelized 
copy from global to shared for B T.copy(B[bx * block_N, ko * block_K], B_shared[:block_N, :block_K]) diff --git a/testing/python/language/test_tilelang_language_annotate_pad.py b/testing/python/language/test_tilelang_language_annotate_pad.py index 3cfc69615..7717db339 100644 --- a/testing/python/language/test_tilelang_language_annotate_pad.py +++ b/testing/python/language/test_tilelang_language_annotate_pad.py @@ -39,7 +39,6 @@ def run_tilelang_copy(M=1024, N=1024, block_M=128, block_N=128, dtype="float16", "tl.disable_warp_specialized": True, "tl.disable_tma_lower": True }) - print(kernel.get_kernel_source()) a = torch.randn(M, N, device="cuda", dtype=getattr(torch, dtype)) b = kernel(a) ref_b = torch.zeros_like(a) diff --git a/testing/python/language/test_tilelang_language_copy.py b/testing/python/language/test_tilelang_language_copy.py index d44b25f03..953f1b0b4 100644 --- a/testing/python/language/test_tilelang_language_copy.py +++ b/testing/python/language/test_tilelang_language_copy.py @@ -1,6 +1,7 @@ import tilelang import tilelang.language as T import torch +import tilelang.testing # add decorator @tilelang.jit if you want to return a torch function @@ -27,8 +28,8 @@ def run_tilelang_copy(M=1024, N=1024, block_M=128, block_N=128, dtype="float16") out_idx=[1], target="cuda", pass_configs={ - "tl.disable_warp_specialized": True, - "tl.disable_tma_lower": True + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True }) a = torch.randn(M, N, device="cuda", dtype=getattr(torch, dtype)) b = kernel(a) @@ -41,5 +42,49 @@ def test_tilelang_copy(): run_tilelang_copy(M=1024, N=576, block_M=32, block_N=576, dtype="float") +def tilelang_copy_with_stride(M, N, NN, block_M, block_N, dtype="float16"): + + @T.prim_func + def main( + A: T.StridedTensor((M, N), (NN, 1), dtype), + B: T.Tensor((M, N), dtype), + ): + # Initialize Kernel Context + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): + for i, j in T.Parallel(block_M, block_N): + B[by * block_M + i, bx * block_N + j] = A[by * block_M + i, bx * block_N + j] + + return main + + +def run_tilelang_copy_with_stride(M=1024, + N=1024, + NN=2048, + block_M=128, + block_N=128, + dtype="float16"): + if isinstance(NN, int): + assert NN > N, "NN must be greater than N" + program = tilelang_copy_with_stride(M, N, NN, block_M, block_N, dtype) + kernel = tilelang.compile( + program, + out_idx=[1], + target="cuda", + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + }) + if isinstance(NN, T.Var): + NN = N * 2 + a = torch.randn(M, NN, device="cuda", dtype=getattr(torch, dtype)) + b = kernel(a[:, :N]) + torch.testing.assert_close(b, a[:, :N], rtol=1e-2, atol=1e-2) + + +def test_tilelang_copy_with_stride(): + run_tilelang_copy_with_stride(M=1024, N=1024, NN=2048, block_M=128, block_N=128) + run_tilelang_copy_with_stride(M=1024, N=1024, NN=T.symbolic("NN"), block_M=128, block_N=128) + + if __name__ == "__main__": tilelang.testing.main() diff --git a/testing/python/language/test_tilelang_language_pipeline.py b/testing/python/language/test_tilelang_language_pipeline.py new file mode 100644 index 000000000..212f281ea --- /dev/null +++ b/testing/python/language/test_tilelang_language_pipeline.py @@ -0,0 +1,224 @@ +from tilelang import tvm as tvm +import tilelang.testing + + +def matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + 
threads, + order, + stage, +): + A_shape = (K, M) if trans_A else (M, K) + B_shape = (N, K) if trans_B else (K, N) + A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) + B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) + + import tilelang.language as T + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, in_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_local) + for k in T.Pipelined(T.ceildiv(K, block_K), order=order, stage=stage): + if trans_A: + T.copy(A[k * block_K, by * block_M], A_shared) + else: + T.copy(A[by * block_M, k * block_K], A_shared) + if trans_B: + T.copy(B[bx * block_N, k * block_K], B_shared) + else: + T.copy(B[k * block_K, bx * block_N], B_shared) + T.gemm(A_shared, B_shared, C_local, trans_A, trans_B) + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def run_gemm( + order, + stage, +): + M = 1024 + N = 1024 + K = 1024 + block_M = 128 + block_N = 128 + block_K = 32 + trans_A = False + trans_B = False + in_dtype = "float16" + out_dtype = "float16" + dtypeAccum = "float32" + num_threads = 128 + program = matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_threads, + order, + stage, + ) + + kernel = tilelang.compile( + program, + out_idx=[2], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + }) + profiler = kernel.get_profiler() + + def ref_program(A, B): + import torch + + if trans_A: + A = A.T + if trans_B: + B = B.T + if in_dtype == "float32": + # Convert float32 to tfloat32 because tfloat32 mma cannot truncate + # float32 automatically, -0x1000 meas + A = ((A.view(torch.int32) - 0x1000)).view(torch.float32) + B = ((B.view(torch.int32) - 0x1000)).view(torch.float32) + C = torch.matmul(A.to(torch.float), B.to(torch.float)) + C = C.to(torch.__getattribute__(out_dtype)) + return C + + profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2) + + +def test_pipeline_order_stage(): + run_gemm(order=[0, 1, 2], stage=[0, 0, 1]) + run_gemm(order=[0, 1, 2], stage=[0, 0, 2]) + run_gemm(order=[1, 2, 0], stage=[0, 0, 2]) + run_gemm(order=[1, 2, 0], stage=[0, 0, 1]) + + +@tilelang.jit( + out_idx=[-1], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + }) +def blocksparse_matmul(M, + N, + K, + block_M, + block_N, + block_K, + num_stages, + dtype="float16", + accum_dtype="float"): + + block_mask_shape = (M // block_M, N // block_N, K // block_K) + + import tilelang.language as T + + @T.prim_func + def block_sparse_matmul( + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + BlockMask: T.Tensor(block_mask_shape, "bool"), + C: T.Tensor((M, N), dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_K, block_N), dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + block_mask = T.alloc_local((1,), "bool") + C_shared = T.alloc_shared((block_M, block_N), dtype) + + T.clear(C_local) + + for k in T.Pipelined(T.ceildiv(K, block_K), 
num_stages=num_stages): + block_mask[0] = BlockMask[by, bx, k] + if block_mask[0]: + T.copy(A[by * block_M, k * block_K], A_shared) + T.copy(B[k * block_K, bx * block_N], B_shared) + T.gemm(A_shared, B_shared, C_local) + + T.copy(C_local, C_shared) + T.copy(C_shared, C[by * block_M, bx * block_N]) + + return block_sparse_matmul + + +def run_blocksparse_matmul(num_stages): + import torch + + M = 256 + N = 256 + K = 256 + block_M = 128 + block_N = 128 + block_K = 32 + sparsity = 0.5 + + # Initialize input matrices A and B on the GPU with half precision + a = torch.randn(M, K).cuda().half() + b = torch.randn(K, N).cuda().half() + + kernel = blocksparse_matmul( + M, N, K, block_M=block_M, block_N=block_N, block_K=block_K, num_stages=num_stages) + print(kernel.get_kernel_source()) + # Create block mask with desired sparsity + mask_shape = (M // block_M, N // block_N, K // block_K) + block_mask = torch.rand(mask_shape).cuda() > sparsity + + # Run the compiled kernel (either tuned or default) with the inputs + c = kernel(a, b, block_mask) + + def ref_program(A, B, BlockMask, block_M, block_N, block_K): + ref_c = torch.zeros((M, N), dtype=torch.float16, device=A.device) + for i in range(M // block_M): + for j in range(N // block_N): + accu = torch.zeros((block_M, block_N), dtype=torch.float32, device=A.device) + for k in range(K // block_K): + if BlockMask[i, j, k]: + accu += ( + A[i * block_M:(i + 1) * block_M, k * block_K:(k + 1) * block_K].to( + torch.float32) @ B[k * block_K:(k + 1) * block_K, + j * block_N:(j + 1) * block_N].to(torch.float32)) + ref_c[i * block_M:(i + 1) * block_M, + j * block_N:(j + 1) * block_N] = accu.to(torch.float16) + return ref_c + + # Compute the reference result using the naive PyTorch implementation + ref_c = ref_program(a, b, block_mask, block_M, block_N, block_K) + + torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) + + +def test_blocksparse_matmul(): + run_blocksparse_matmul(num_stages=1) + run_blocksparse_matmul(num_stages=2) + run_blocksparse_matmul(num_stages=3) + + +if __name__ == "__main__": + tilelang.testing.main() diff --git a/testing/python/language/test_tilelang_language_reshape.py b/testing/python/language/test_tilelang_language_reshape.py index fb56365b7..279ba1016 100644 --- a/testing/python/language/test_tilelang_language_reshape.py +++ b/testing/python/language/test_tilelang_language_reshape.py @@ -35,7 +35,7 @@ def test_reshape_smem(): run_reshape(2048, 64, "float16") -def reshape_test_smem(N, M, dtype): +def reshape_test_smem_1d_2_2d(N, M, dtype): import tilelang.language as T @T.prim_func @@ -45,19 +45,17 @@ def main( ): with T.Kernel(1) as _: A_shared = T.alloc_shared((N,), dtype) - for i in range(N): + for i in T.Parallel(N): A_shared[i] = A[i] A_smem_reshaped = T.reshape(A_shared, [N // M, M]) - for i in range(N // M): - for j in range(M): - B[i, j] = A_smem_reshaped[i, j] + T.copy(A_smem_reshaped, B) return main -def run_reshape_smem(N, M, dtype): - program = reshape_test_smem(N, M, dtype) +def run_reshape_smem_1d_2_2d(N, M, dtype): + program = reshape_test_smem_1d_2_2d(N, M, dtype) jit_kernel = tl.compile(program, out_idx=-1) profiler = jit_kernel.get_profiler() @@ -67,9 +65,44 @@ def ref_program(A): profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2) -def test_reshape_smem_shared(): - run_reshape_smem(1024, 32, "float32") - run_reshape_smem(2048, 64, "float16") +def test_reshape_smem_1d_2_2d(): + run_reshape_smem_1d_2_2d(1024, 32, "float32") + run_reshape_smem_1d_2_2d(2048, 64, "float16") + + +def 
reshape_test_smem_2d_2_1d(N, M, dtype): + import tilelang.language as T + + @T.prim_func + def main( + A: T.Tensor((N // M, M), dtype), + B: T.Tensor((N,), dtype), + ): + with T.Kernel(1) as _: + A_shared = T.alloc_shared((N // M, M), dtype) + for i, j in T.Parallel(N // M, M): + A_shared[i, j] = A[i, j] + + A_smem_reshaped = T.reshape(A_shared, [N]) + T.copy(A_smem_reshaped, B) + + return main + + +def run_reshape_smem_2d_2_1d(N, M, dtype): + program = reshape_test_smem_2d_2_1d(N, M, dtype) + jit_kernel = tl.compile(program, out_idx=-1) + profiler = jit_kernel.get_profiler() + + def ref_program(A): + return A.reshape(N) + + profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2) + + +def test_reshape_smem_2d_2_1d(): + run_reshape_smem_2d_2_1d(1024, 32, "float32") + run_reshape_smem_2d_2_1d(2048, 64, "float16") if __name__ == "__main__": diff --git a/testing/python/primitives/test_tilelang_primitives_mma.py b/testing/python/primitives/test_tilelang_primitives_mma.py index b3033359c..4447151b5 100644 --- a/testing/python/primitives/test_tilelang_primitives_mma.py +++ b/testing/python/primitives/test_tilelang_primitives_mma.py @@ -83,7 +83,6 @@ def run_matmul_ssr( ) kernel = tilelang.compile(program, out_idx=[2]) profiler = kernel.get_profiler() - print(kernel.get_kernel_source()) def ref_program(A, B): import torch @@ -204,7 +203,6 @@ def run_matmul_rsr( ) kernel = tilelang.compile(program, out_idx=[2]) profiler = kernel.get_profiler() - print(kernel.get_kernel_source()) def ref_program(A, B): import torch diff --git a/testing/python/tilelibrary/test_tilelang_tilelibrary_gemm_sp.py b/testing/python/tilelibrary/test_tilelang_tilelibrary_gemm_sp.py new file mode 100644 index 000000000..31ed7a7e0 --- /dev/null +++ b/testing/python/tilelibrary/test_tilelang_tilelibrary_gemm_sp.py @@ -0,0 +1,237 @@ +import torch +import tilelang +import tilelang.testing + +from tilelang.utils.sparse import compress_sm90 +from tilelang.layout import make_metadata_layout + +torch.set_printoptions(threshold=float('inf'), edgeitems=float('inf'), linewidth=10000) +torch.manual_seed(42) + +STR_TO_TYPE = { + "float16": torch.float16, + "bfloat16": torch.bfloat16, + "float8_e4m3": torch.float8_e4m3fn, + "int8": torch.int8, +} + +SPARSITY_MAP = { + torch.float16: (2, 4), + torch.bfloat16: (2, 4), + torch.float8_e4m3fn: (2, 4), + torch.int8: (2, 4), +} + + +def matmul_sp( + M, + N, + K, + block_M, + block_N, + block_K, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, + trans_A, + trans_B, +): + E_factor = 4 if in_dtype == "float32" else 8 + A_sparse_shape = (M, K // 2) if not trans_A else (K // 2, M) + B_shape = (K, N) if not trans_B else (N, K) + A_shared_shape = (block_M, block_K // 2) if not trans_A else (block_K // 2, block_M) + B_shared_shape = (block_K, block_N) if not trans_B else (block_N, block_K) + + import tilelang.language as T + + @T.prim_func + def main( + A_sparse: T.Tensor(A_sparse_shape, in_dtype), + E: T.Tensor((M, K // E_factor), 'uint8'), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, in_dtype) + E_shared = T.alloc_shared((block_M, block_K // E_factor), 'uint8') + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.annotate_layout({ + E: + make_metadata_layout( + E, mma_dtype="float16", arch="sm90", backend="cutlass", block_k=block_K), + E_shared: + 
make_metadata_layout( + E_shared, + mma_dtype="float16", + arch="sm90", + backend="cutlass", + block_k=block_K), + }) + T.no_set_max_nreg() + T.clear(C_local) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + T.copy(E[by * block_M, k * block_K // E_factor], E_shared) + if trans_A: + T.copy(A_sparse[k * block_K // 2, by * block_M], A_shared) + else: + T.copy(A_sparse[by * block_M, k * block_K // 2], A_shared) + if trans_B: + T.copy(B[bx * block_N, k * block_K], B_shared) + else: + T.copy(B[k * block_K, bx * block_N], B_shared) + T.gemm_sp(A_shared, E_shared, B_shared, C_local, trans_A, trans_B) + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def generate_sparse_tensor_float32(M: int, K: int, dtype: torch.dtype, device='cpu', trans_A=False): + elem, group = SPARSITY_MAP[dtype] + if K % group != 0: + raise ValueError( + f"Last dimension must be divisible by {group} for {elem}:{group} sparsity.") + + if trans_A: + full_tensor = torch.randn(K * M, dtype=torch.float32, device=device).view(K, M) + mask = torch.zeros_like(full_tensor, dtype=torch.bool) + for j in range(M): + for i in range(0, K, group): + flat_idx = torch.randint(0, group, (elem,), dtype=torch.int64) + for k in range(1, len(flat_idx)): + while flat_idx[k] in flat_idx[:k]: + flat_idx[k] = torch.randint(0, group, (1,), dtype=torch.int64) + for idx in flat_idx: + mask[i + idx, j] = True + else: + full_tensor = torch.randn((M, K), dtype=torch.float32, device=device).view(M, K) + mask = torch.zeros_like(full_tensor, dtype=torch.bool) + for i in range(M): + for j in range(0, K, group): + flat_idx = torch.randint(0, group, (elem,), dtype=torch.int64) + for k in range(1, len(flat_idx)): + while flat_idx[k] in flat_idx[:k]: + flat_idx[k] = torch.randint(0, group, (1,), dtype=torch.int64) + for idx in flat_idx: + mask[i, j + idx] = True + + return full_tensor * mask + + +def normalize(tensor, max_range=100.0): + assert max_range <= 448.0 + max_v = tensor.abs().max().clamp(1e-4) + scaler = max_range / max_v + return tensor * scaler + + +def calc_diff(x, y): + x, y = x.double(), y.double() + denominator = (x * x + y * y).sum() + sim = 2 * (x * y).sum() / denominator + return 1 - sim + + +def run_gemm_sp( + M, + N, + K, + in_dtype, + out_dtype, + accum_dtype, + block_M, + block_N, + block_K, + num_stages, + num_threads, + trans_A=False, + trans_B=False, +): + program = matmul_sp( + M, + N, + K, + block_M, + block_N, + block_K, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + num_threads, + trans_A, + trans_B, + ) + if in_dtype == "float32": + torch.backends.cuda.matmul.allow_tf32 = True + + kernel = tilelang.compile( + program, + out_idx=[-1], + ) + A = generate_sparse_tensor_float32( + M, K, dtype=STR_TO_TYPE[in_dtype], device='cuda', trans_A=trans_A) + if trans_B: + B = torch.randn((N, K), device='cuda', dtype=torch.float32) + else: + B = torch.randn((K, N), device='cuda', dtype=torch.float32) + + if "float8" in in_dtype or "int8" in in_dtype: + A = normalize(A) + B = normalize(B) + + A = A.to(STR_TO_TYPE[in_dtype]) + B = B.to(STR_TO_TYPE[in_dtype]) + + A_sparse, E = compress_sm90(A, block_K, trans_A) + + C_sp = kernel(A_sparse, E, B) + + def _matmul(A, B): + if trans_A: + A = A.T + if trans_B: + B = B.T + if "float8" in in_dtype or "int8" in in_dtype: + A = A.to(torch.float32) + B = B.to(torch.float32) + return torch.matmul(A, B).to(STR_TO_TYPE[out_dtype]) + + C = _matmul(A, B) + if 'float8' in in_dtype: + diff = calc_diff(C_sp, C) + assert diff < 1e-3, f"{diff=}" + else: + 
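# --- Editor's note (illustrative only, not part of this patch): generate_sparse_tensor_float32
# above builds a 2:4 structured-sparse operand, i.e. every contiguous group of four elements
# along the reduction dimension keeps at most two non-zeros, which is what compress_sm90
# expects before it splits A into packed values (A_sparse) and metadata (E). A minimal,
# hypothetical checker for that invariant, valid for the non-transposed layout where the
# groups run along the last dimension, could look like:
#
#     def _check_2_to_4(t):
#         groups = t.reshape(*t.shape[:-1], t.shape[-1] // 4, 4)
#         return bool(((groups != 0).sum(dim=-1) <= 2).all())
#
#     assert _check_2_to_4(A)  # A as generated above, before compress_sm90
# --- end editor's note ---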
torch.testing.assert_close(C_sp, C, atol=1e-3, rtol=1e-3) + print("pass") + + +@tilelang.testing.requires_cuda +@tilelang.testing.requires_cuda_compute_version(9, 0) +def test_gemm_sp(): + run_gemm_sp(512, 1024, 768, "float16", "float16", "float32", 64, 64, 32, 2, 128) + run_gemm_sp(512, 1024, 768, "float16", "float16", "float32", 64, 64, 32, 0, 256) + + run_gemm_sp(512, 1024, 768, "float16", "float16", "float32", 64, 64, 64, 0, 128) + run_gemm_sp(512, 1024, 768, "float16", "float16", "float32", 64, 64, 64, 2, 128) + + run_gemm_sp(512, 1024, 768, "float16", "float16", "float32", 128, 128, 128, 0, 128) + run_gemm_sp(512, 1024, 768, "float16", "float16", "float32", 128, 128, 128, 2, 128) + + run_gemm_sp(512, 1024, 768, "float16", "float16", "float32", 64, 128, 256, 0, 128) + run_gemm_sp(512, 1024, 768, "float16", "float16", "float32", 64, 128, 256, 2, 128) + + run_gemm_sp(512, 1024, 768, "float16", "float16", "float32", 64, 64, 64, 0, 128, False, True) + run_gemm_sp(512, 1024, 768, "float16", "float16", "float32", 64, 64, 64, 0, 128, True, False) + run_gemm_sp(512, 1024, 768, "float16", "float16", "float32", 64, 64, 64, 0, 128, True, True) + + run_gemm_sp(512, 1024, 768, "float8_e4m3", "float16", "float16", 64, 64, 64, 2, 128, False, + True) + + run_gemm_sp(512, 1024, 768, "int8", "int8", "int32", 64, 64, 64, 2, 128, False, True) + + +if __name__ == "__main__": + tilelang.testing.main() diff --git a/testing/python/transform/test_tilelang_transform_Inject_software_pipeline.py b/testing/python/transform/test_tilelang_transform_Inject_software_pipeline.py index 8057dd34c..c0444043d 100644 --- a/testing/python/transform/test_tilelang_transform_Inject_software_pipeline.py +++ b/testing/python/transform/test_tilelang_transform_Inject_software_pipeline.py @@ -9,6 +9,7 @@ def _check(original, transformed): mod = tvm.IRModule.from_expr(func.with_attr("global_symbol", "main")) mod = tl.transform.InjectSoftwarePipeline()(mod) mod = tl.transform.Simplify()(mod) + mod = tl.transform.LowerOpaqueBlock()(mod) tvm.ir.assert_structural_equal(mod["main"], transformed.with_attr("global_symbol", "main"), True) @@ -39,32 +40,16 @@ def before(A: T.Tensor((16, 1), "float32"), C: T.Tensor((16, 1), "float32")): C[tx, i] = B[tx, 0] + T.float32(1) @T.prim_func - def expected(A: T.Tensor((16, 1), "float32"), C: T.Tensor((16, 1), "float32")) -> None: - for tx in T.thread_binding(16, thread="threadIdx.x"): - with T.block(): - T.reads(A[tx, 0]) - T.writes(C[tx, 0]) - B = T.alloc_buffer([2, 16, 1], dtype="float32", scope="shared") - with T.block(): - T.reads(A[tx, 0]) - T.writes(B[0, tx, 0]) - B[0, tx, 0] = A[tx, 0] * T.float32(2) - with T.block(): - T.reads(A[tx, 1:1], B[0:2, tx, 0]) - T.writes(B[1:1, tx, 0], C[tx, 0:0]) - for i in range(0): - with T.block(""): - T.reads(A[tx, i + 1]) - T.writes(B[i + 1, tx, 0]) - B[i + 1, tx, 0] = A[tx, i + 1] * T.float32(2) - with T.block(""): - T.reads(B[i, tx, 0]) - T.writes(C[tx, i]) - C[tx, i] = B[i, tx, 0] + T.float32(1) - with T.block(): - T.reads(B[0, tx, 0]) - T.writes(C[tx, 0]) - C[tx, 0] = B[0, tx, 0] + T.float32(1) + def expected(A_handle: T.handle, C_handle: T.handle): + A = T.match_buffer(A_handle, (16, 1), strides=(1, 1)) + C = T.match_buffer(C_handle, (16, 1), strides=(1, 1)) + tx = T.launch_thread("threadIdx.x", 16) + B = T.decl_buffer((2, 16, 1), scope="shared") + B[0, tx, 0] = A[tx, 0] * T.float32(2.0) + for i in range(0): + B[i + 1, tx, 0] = A[tx, i + 1] * T.float32(2.0) + C[tx, i] = B[i, tx, 0] + T.float32(1.0) + C[tx, 0] = B[0, tx, 0] + T.float32(1.0) _check(before, 
expected) diff --git a/testing/python/transform/test_tilelang_transform_cluster_planning.py b/testing/python/transform/test_tilelang_transform_cluster_planning.py index c2f880242..8029305ae 100644 --- a/testing/python/transform/test_tilelang_transform_cluster_planning.py +++ b/testing/python/transform/test_tilelang_transform_cluster_planning.py @@ -43,7 +43,7 @@ def before(A: T.Tensor((1024, 32), "float16"), B: T.Tensor((32, 1024), "float16" @T.prim_func def after(A: T.Tensor((1024, 32), "float16"), B: T.Tensor((32, 1024), "float16"), C: T.Tensor( (1024, 1024), "float16")): - T.func_attr({"clusterIdx.y": 2}) + T.func_attr({"clusterIdx.y": T.int32(2)}) with T.Kernel(8, 8, threads=128) as (bx, by): A_shared = T.alloc_shared((128, 32), "float16") B_shared = T.alloc_shared((32, 128), "float16") diff --git a/testing/python/transform/test_tilelang_transform_make_packed_api.py b/testing/python/transform/test_tilelang_transform_make_packed_api.py index f502cb3cd..ff4487326 100644 --- a/testing/python/transform/test_tilelang_transform_make_packed_api.py +++ b/testing/python/transform/test_tilelang_transform_make_packed_api.py @@ -1,34 +1,29 @@ -import pytest +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
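# --- Editor's note (illustrative only, not part of this patch): the expected-IR fixtures touched
# by this change (e.g. the cluster-planning hunk above) now spell integer attribute and loop
# annotation values with an explicit dtype, T.int32(2) / T.int32(3), instead of bare Python
# literals, presumably so that tvm.ir.assert_structural_equal compares the same boxed integer
# type that the passes emit. A minimal sketch of the updated spelling (the function name is
# hypothetical):
#
#     from tvm.script import tir as T
#
#     @T.prim_func
#     def _annotated():
#         T.func_attr({"clusterIdx.y": T.int32(2)})                       # explicit dtype on the attribute
#         for k in T.serial(16, annotations={"num_stages": T.int32(3)}):  # and on the annotation
#             T.evaluate(k)
# --- end editor's note ---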
+# ruff: noqa +import pytest +import numpy as np import tilelang -import tilelang.testing from tilelang import tvm as tvm -from tvm import te, tir -from tilelang import language as T -from tvm.script import ir as I -from tvm.driver.build_module import schedule_to_module - - -def test_makeapi(): - """Not yet working, mock design""" - n = te.size_var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") - s = te.create_schedule(C.op) - - mod = schedule_to_module(s, [n, A, B, C]) - mod = tvm.tir.transform.StorageFlatten(64)(mod) - mod = tvm.tir.transform.Apply(lambda f: f.with_attr({ - "target": tvm.target.Target("llvm", host="llvm"), - "global_symbol": "main", - }))( - mod) - - before = mod - after = tilelang.transform.MakePackedAPI()(before) - f = after["main"] - assert len(f.params) == 6 +import tvm +import tilelang.testing +from tvm import tir +from tvm.script import tir as T, ir as I def _find_assignment(stmt, var_name): @@ -41,21 +36,6 @@ def _find_assignment(stmt, var_name): return stmt -def _find_next(stmt, type): - search_stack = [stmt] - - while search_stack: - stmt = search_stack.pop() - if isinstance(stmt, type): - return stmt - elif isinstance(stmt, tvm.tir.SeqStmt): - search_stack.extend(reversed(stmt)) - else: - search_stack.append(stmt.body) - - return None - - def _find_compute_scope(func): result = None @@ -69,91 +49,7 @@ def _visitor(stmt): return result -def test_variable_passed_from_args(): - ib = tvm.tir.ir_builder.create() - - input_buffer = tvm.tir.decl_buffer(name="input_buffer", shape=[1]) - not_device_context = tvm.tir.Var("not_device_context", dtype="handle") - - ib.emit( - tvm.tir.call_extern("float32", "some_external_call", input_buffer.data, - not_device_context),) - stmt = ib.get() - - mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([input_buffer, not_device_context], stmt)) - mod = tvm.tir.transform.Apply( - lambda f: f.with_attr("target", tvm.target.Target("llvm", host="llvm")))( - mod) - mod = tvm.tir.transform.Apply(lambda f: f.with_attr("global_symbol", "main"))(mod) - func = tilelang.transform.MakePackedAPI()(mod)["main"] - - num_args = func.params[2] - - # num_args assertion - assert func.body.condition.a == num_args - assert func.body.condition.b == 2 - - # Arguments unpacking - assignment = _find_assignment(func.body, "input_buffer") - assert str(assignment.value) == 'T.tvm_struct_get(args, 0, 12, "handle")' - - assignment = _find_assignment(assignment.body, "input_buffer") - assert str(assignment.value) == 'T.tvm_struct_get(input_buffer, 0, 1, "handle")' - unpacked_input_buffer = assignment.var - - assignment = _find_assignment(func.body, "not_device_context") - assert str(assignment.value) == 'T.tvm_struct_get(args, 1, 12, "handle")' - unpacked_not_device_context = assignment.var - - seq_stmt = _find_next(assignment, tvm.tir.SeqStmt) - call = _find_next(seq_stmt[1], tvm.tir.Evaluate) - call_extern = call.value - - assert call_extern.args[1] == unpacked_input_buffer - assert call_extern.args[2] == unpacked_not_device_context - - -def test_device_api_context_implicit_resource_handle(): - ib = tvm.tir.ir_builder.create() - - input_buffer = tvm.tir.decl_buffer(name="input_buffer", shape=[1]) - device_context = tvm.tir.Var("device_api_context", dtype="handle") - - ib.emit( - tvm.tir.call_extern("float32", "some_external_call", input_buffer.data, device_context),) - stmt = ib.get() - - mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([input_buffer, device_context], stmt)) - 
mod = tvm.tir.transform.Apply( - lambda f: f.with_attr("target", tvm.target.Target("llvm", host="llvm")))( - mod) - mod = tvm.tir.transform.Apply(lambda f: f.with_attr("global_symbol", "main"))(mod) - func = tilelang.transform.MakePackedAPI()(mod)["main"] - - num_args = func.params[2] - device_context_in_resource_handle = func.params[5] - - # num_args assertion - assert func.body.condition.a == num_args - assert func.body.condition.b == 1 - - # Arguments unpacking - assignment = _find_assignment(func.body, "input_buffer") - assert str(assignment.value) == 'T.tvm_struct_get(args, 0, 12, "handle")' - - assignment = _find_assignment(assignment.body, "input_buffer") - assert str(assignment.value) == 'T.tvm_struct_get(input_buffer, 0, 1, "handle")' - unpacked_input_buffer = assignment.var - - seq_stmt = _find_next(assignment, tvm.tir.SeqStmt) - call = _find_next(seq_stmt[1], tvm.tir.Evaluate) - call_extern = call.value - - assert call_extern.args[1] == unpacked_input_buffer - assert call_extern.args[2] == device_context_in_resource_handle - - -@pytest.mark.parametrize("use_global_symbol", [True, False]) +@pytest.mark.parametrize("use_global_symbol", [False]) def test_no_op_when_global_symbol_is_absent(use_global_symbol): func_attr = {"target": tvm.target.Target("llvm", host="llvm")} @@ -167,7 +63,7 @@ def before(): after = tilelang.transform.MakePackedAPI()(tvm.IRModule.from_expr(before))["main"] if use_global_symbol: - assert len(after.params) == 6 + assert len(after.params) == 4 else: tvm.ir.assert_structural_equal(before, after) @@ -186,7 +82,7 @@ def test_target_host_removed(): class before: @T.prim_func - def main(A: T.Tensor(1, "float32")): + def main(A: T.Buffer(1, "float32")): T.func_attr({"global_symbol": "main", "target": T.target("cuda", host=host)}) T.evaluate(0) @@ -208,7 +104,7 @@ def test_internal_subroutine_call(): class before: @T.prim_func - def main(A: T.Tensor(1, "float32")): + def main(A: T.Buffer(1, "float32")): T.func_attr({"target": T.target("llvm", host="llvm")}) before.subroutine(A.data) @@ -241,7 +137,7 @@ def test_subroutine_call_to_externally_visible_subroutine(): class before: @T.prim_func - def main(A: T.Tensor(1, "float32")): + def main(A: T.Buffer(1, "float32")): T.func_attr({"global_symbol": "main", "target": T.target("llvm", host="llvm")}) before.subroutine(A.data) @@ -271,14 +167,14 @@ def test_function_call_with_wrong_argument_count(): @T.prim_func def func( - A: T.Tensor([16, 16], "int32"), - B: T.Tensor([16, 16], "int32"), - C: T.Tensor([16, 16], "int32"), - D: T.Tensor([16, 16], "int32"), + A: T.Buffer([16, 16], "int32"), + B: T.Buffer([16, 16], "int32"), + C: T.Buffer([16, 16], "int32"), + D: T.Buffer([16, 16], "int32"), ): pass - built = tvm.build(func, target="llvm") + built = tvm.compile(func, target="llvm") with pytest.raises(tvm.TVMError): built() @@ -289,10 +185,10 @@ def test_function_call_with_wrong_type_code(): """Type codes must be checked before accessing the arguments""" @T.prim_func - def func(A: T.Tensor([16, 16], "int32")): + def func(A: T.Buffer([16, 16], "int32")): pass - built = tvm.build(func, target="llvm") + built = tvm.compile(func, target="llvm") with pytest.raises(tvm.TVMError): built(0) @@ -303,17 +199,15 @@ def test_function_call_with_null_data_pointer(): """The data pointer must be checked before accessing the array""" @T.prim_func - def func(A: T.Tensor([16, 16], "int32"), B: T.Tensor([16, 16], "int32")): + def func(A: T.Buffer([16, 16], "int32"), B: T.Buffer([16, 16], "int32")): for i, j in T.grid(16, 16): B[i, j] = A[i, j] - 
built = tvm.build(func, target="llvm") + built = tvm.compile(func, target="llvm") - A = tvm.nd.empty([16, 16], "int32", tvm.cpu()) + A = tvm.nd.array(np.zeros([16], dtype="int32")) B = tvm.nd.empty([16, 16], "int32", tvm.cpu()) - A.handle.contents.data = 0 - with pytest.raises(tvm.TVMError): built(A, B) @@ -323,17 +217,15 @@ def test_function_call_with_wrong_dimensionality(): """The dimensionality must be checked before validating the shape""" @T.prim_func - def func(A: T.Tensor([16, 16], "int32"), B: T.Tensor([16, 16], "int32")): + def func(A: T.Buffer([16, 16], "int32"), B: T.Buffer([16, 16], "int32")): for i, j in T.grid(16, 16): B[i, j] = A[i, j] - built = tvm.build(func, target="llvm") + built = tvm.compile(func, target="llvm") - A = tvm.nd.empty([16], "int32", tvm.cpu()) + A = tvm.nd.array(np.zeros([16], dtype="int32")) B = tvm.nd.empty([16], "int32", tvm.cpu()) - A.handle.contents.data = 0 - with pytest.raises(tvm.TVMError): built(A, B) diff --git a/testing/python/transform/test_tilelang_transform_multi_version_buffer.py b/testing/python/transform/test_tilelang_transform_multi_version_buffer.py index 582ea8b37..a8e4a45f4 100644 --- a/testing/python/transform/test_tilelang_transform_multi_version_buffer.py +++ b/testing/python/transform/test_tilelang_transform_multi_version_buffer.py @@ -46,7 +46,7 @@ def before(A: T.Tensor((M, K), dtype), B: T.Tensor((K, N), dtype)): for i in T.unroll(16, annotations={"pragma_unroll_explicit": T.bool(False)}): for vec in T.vectorized(2): C_local[i * 2 + vec] = T.float32(0) - for k in T.serial(16, annotations={"num_stages": 3}): + for k in T.serial(16, annotations={"num_stages": T.int32(3)}): if v == 0: T.tma_load( T.create_tma_descriptor(6, 2, A.data, 512, 512, 2, 1024, 32, 64, 1, 1, 0, 2, @@ -79,7 +79,7 @@ def after(A: T.Tensor((M, K), dtype), B: T.Tensor((K, N), dtype)): for i in T.unroll(16, annotations={"pragma_unroll_explicit": T.bool(False)}): for vec in T.vectorized(2): C_local[i * 2 + vec] = T.float32(0) - for k in T.serial(16, annotations={"num_stages": 3}): + for k in T.serial(16, annotations={"num_stages": T.int32(3)}): if v == 0: T.tma_load( T.create_tma_descriptor(6, 2, A.data, 512, 512, 2, 1024, 32, 64, 1, 1, 0, 2, diff --git a/testing/python/transform/test_tilelang_transform_pipeline_planning.py b/testing/python/transform/test_tilelang_transform_pipeline_planning.py index 3c01115a7..b7448a204 100644 --- a/testing/python/transform/test_tilelang_transform_pipeline_planning.py +++ b/testing/python/transform/test_tilelang_transform_pipeline_planning.py @@ -51,9 +51,11 @@ def after(A: T.Tensor((1024, 32), "float32"), B: T.Tensor((32, 1024), "float32") for ko in T.serial( 32, annotations={ - "software_pipeline_async_stages": [0], - "software_pipeline_order": [0, 1, 2], - "software_pipeline_stage": [3, 3, 3] + "software_pipeline_async_stages": [T.int32(0)], + "software_pipeline_order": [T.int32(0), T.int32(1), + T.int32(2)], + "software_pipeline_stage": [T.int32(3), T.int32(3), + T.int32(3)] }): T.copy(A[by * 128, ko * 32], A_shared) T.copy(B[ko * 32, bx * 128], B_shared) diff --git a/testing/python/transform/test_tilelang_transform_thread_sync.py b/testing/python/transform/test_tilelang_transform_thread_sync.py index 33d4cc476..11916671f 100644 --- a/testing/python/transform/test_tilelang_transform_thread_sync.py +++ b/testing/python/transform/test_tilelang_transform_thread_sync.py @@ -1,30 +1,13 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import tilelang -import tilelang.testing +# ruff: noqa + from tilelang import tvm as tvm -from tvm import te +import tilelang.testing from tvm.script import tir as T +from tvm import te def run_passes(func: tvm.tir.PrimFunc): mod = tvm.IRModule.from_expr(func) - mod = tvm.tir.transform.StorageFlatten(64)(mod) cuda_target = tvm.target.Target("cuda", host="llvm") @@ -42,7 +25,7 @@ def run_passes(func: tvm.tir.PrimFunc): @tilelang.testing.requires_cuda def test_sync_if_with_same_index(): - @T.prim_func + @T.prim_func(check_well_formed=False) def func(p0_arg: T.Buffer((1, 2, 1, 1), "float32"), p1: T.Buffer(2, "float32")) -> None: threadIdx_x = T.env_thread("threadIdx.x") threadIdx_y = T.env_thread("threadIdx.y") @@ -62,42 +45,6 @@ def func(p0_arg: T.Buffer((1, 2, 1, 1), "float32"), p1: T.Buffer(2, "float32")) assert "T.tvm_storage_sync" in str(mod) -@tilelang.testing.requires_cuda -def test_sync_else_branch(): - - def ir(A, B): - ib = tvm.tir.ir_builder.create() - Aptr = ib.buffer_ptr(A) - Bptr = ib.buffer_ptr(B) - - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", 1) - - local = ib.allocate(A.dtype, (8,), name="buf_local", scope="local") - shared = ib.allocate(A.dtype, (8,), name="buf_shared", scope="shared") - - with ib.for_range(0, 8) as i: - with ib.if_scope(Aptr[i] < 0): - local[i] = Aptr[i] - with ib.else_scope(): - shared[i] = Aptr[i] - - with ib.for_range(0, 8) as i: - with ib.if_scope(Aptr[i] < 0): - Bptr[i] = local[i] - with ib.else_scope(): - Bptr[i] = shared[i] - - return ib.get() - - A = tvm.tir.decl_buffer((8,), "float32") - B = tvm.tir.decl_buffer((8,), "float32") - stmt = ir(A, B) - func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, B], stmt, None) - mod = run_passes(func) - assert "T.tvm_storage_sync" in str(mod) - - @tilelang.testing.requires_cuda def test_sync_read_thread_id_independent_location(): @@ -123,6 +70,48 @@ def func(p0_arg: T.Buffer((1, 2, 1, 1), "float32"), p1: T.Buffer(2, "float32")) @tilelang.testing.requires_cuda +def test_sync_shared_dyn(): + + @T.prim_func(private=True) + def func(A: T.Buffer((4, 4), "float32"), E: T.Buffer((4, 4), "float32")): + blockIdx_x = T.launch_thread("blockIdx.x", 1) + B = T.allocate([24], "float32", "shared.dyn") + C = T.allocate([1], "float32", "local") + D = T.allocate([16], "float32", "shared.dyn") + threadIdx_x = T.launch_thread("threadIdx.x", 16) + B_1 = T.Buffer((24,), data=B, scope="shared.dyn") + A_1 = T.Buffer((16,), data=A.data) + B_1[threadIdx_x // 4 * 6 + threadIdx_x % 4] = A_1[threadIdx_x] + C_1 = T.Buffer((1,), data=C, scope="local") + C_1[0] = B_1[threadIdx_x // 4 * 6 + threadIdx_x % 4] + D_1 = T.Buffer((16,), data=D, scope="shared.dyn") + D_1[threadIdx_x] = C_1[0] + E_1 = T.Buffer((16,), data=E.data) + E_1[threadIdx_x] = D_1[threadIdx_x] + + @T.prim_func(private=True) + def expected(A: T.Buffer((4, 4), 
"float32"), E: T.Buffer((4, 4), "float32")): + blockIdx_x = T.launch_thread("blockIdx.x", 1) + B_1 = T.allocate([24], "float32", "shared.dyn") + C_1 = T.allocate([1], "float32", "local") + D_1 = T.allocate([16], "float32", "shared.dyn") + threadIdx_x = T.launch_thread("threadIdx.x", 16) + B_1_1 = T.Buffer((24,), data=B_1, scope="shared.dyn") + A_1 = T.Buffer((16,), data=A.data) + B_1_1[threadIdx_x // 4 * 6 + threadIdx_x % 4] = A_1[threadIdx_x] + C_1_1 = T.Buffer((1,), data=C_1, scope="local") + C_1_1[0] = B_1_1[threadIdx_x // 4 * 6 + threadIdx_x % 4] + D_1_1 = T.Buffer((16,), data=D_1, scope="shared.dyn") + D_1_1[threadIdx_x] = C_1_1[0] + E_1 = T.Buffer((16,), data=E.data) + E_1[threadIdx_x] = D_1_1[threadIdx_x] + + mod = tvm.IRModule({"main": func}) + mod = tilelang.transform.ThreadSync("shared.dyn")(mod) + tvm.ir.assert_structural_equal(mod["main"], expected) + + +@tvm.testing.requires_cuda def test_sync_let_stmt(): @T.prim_func(private=True) diff --git a/testing/python/transform/test_tilelang_transform_vectorize_loop.py b/testing/python/transform/test_tilelang_transform_vectorize_loop.py deleted file mode 100644 index edf0d4986..000000000 --- a/testing/python/transform/test_tilelang_transform_vectorize_loop.py +++ /dev/null @@ -1,538 +0,0 @@ -# ruff: noqa -import tilelang -from tilelang import tvm as tvm -import tilelang.testing -from tvm import te -from tvm.script import ir as I -from tilelang import language as T -import pytest - -simple_target = tvm.target.Target("llvm -mtriple=x86_64-linux-gnu") -sve_target = tvm.target.Target("llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+v8.2a,+sve") - - -@tilelang.testing.requires_llvm -@pytest.mark.parametrize("extent, target", [(4, simple_target), (T.vscale() * 4, sve_target)]) -def test_vectorize_loop(extent, target): - - @I.ir_module - class Before: - - @T.prim_func - def main(A: T.Tensor((16,), "float32")): - for j in T.vectorized(0, extent): - A[j] = 1 - - @I.ir_module - class After: - - @T.prim_func - def main(A: T.Tensor((16,), "float32")): - A[T.Ramp(0, 1, extent)] = T.Broadcast(1, extent) - - with tvm.target.Target(target): - mod = tilelang.transform.VectorizeLoop()(Before) - tvm.ir.assert_structural_equal(mod, After) - - -@tilelang.testing.requires_llvm -def test_vectorize_vector(): - n = te.var("n") - ib = tvm.tir.ir_builder.create() - A = ib.pointer("float32x4", name="A") - with ib.for_range(0, n) as i: - with ib.for_range(0, 4, kind="vectorize") as j: - A[j] = tvm.tir.const(1, A.dtype) - stmt = ib.get() - assert isinstance(stmt.body, tvm.tir.For) - - mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A, n], stmt)) - stmt = tilelang.transform.VectorizeLoop()(mod)["main"].body - - assert isinstance(stmt, tvm.tir.For) - assert not isinstance(stmt.body, tvm.tir.For) - assert len(stmt.body.indices) == 1 - assert isinstance(stmt.body.indices[0], tvm.tir.Ramp) - assert isinstance(stmt.body.value, tvm.tir.Broadcast) - - -@tilelang.testing.requires_llvm -def test_vectorize_vector_scalable_error(): - - @I.ir_module - class Module: - - @T.prim_func - def main(A: T.Tensor((25,), "float32")): - for j in T.vectorized(T.vscale() * 4): - A[j * 4:j * 4 + 4] = T.Broadcast(T.float32(1), 4) - - error_msg = f"Creating scalable vectors from existing vectors is not supported." 
- with tvm.target.Target(sve_target): - with pytest.raises(tvm.error.InternalError, match=error_msg): - tilelang.transform.VectorizeLoop()(Module) - - -@tilelang.testing.requires_llvm -def test_vectorize_vector_scalable_error2(): - - @I.ir_module - class Module: - - @T.prim_func - def main(A: T.Tensor((25,), "float32xvscalex4")): - for j in T.vectorized(4): - A[j] = T.Broadcast(T.float32(1), T.vscale() * 4) - - error_msg = f"Vectorizing over scalable buffer elements is not supported in vectorizer." - with pytest.raises(tvm.error.InternalError, match=error_msg): - tilelang.transform.VectorizeLoop()(Module) - - -@tilelang.testing.requires_llvm -def test_vectorize_vector_scalable_error3(): - - @I.ir_module - class Module: - - @T.prim_func - def main(A: T.Tensor((25,), "float32")): - for j in T.vectorized(4): - A[j * T.vscale() * 4:j * T.vscale() * 4 + T.vscale() * 4] = T.Broadcast( - T.float32(1), - T.vscale() * 4) - - error_msg = f"Vectorizing over existing scalable vectors is not supported." - with pytest.raises(tvm.error.InternalError, match=error_msg): - with tvm.target.Target(sve_target): - tilelang.transform.VectorizeLoop()(Module) - - -@tilelang.testing.requires_llvm -def test_vectorize_vector_scalable_error4(): - - @I.ir_module - class Module: - - @T.prim_func(private=True) - def main(A: T.Tensor((25,), "float32")): - for j in T.vectorized(T.vscale() * 4): - A[j * T.vscale() * 4:j * T.vscale() * 4 + T.vscale() * 4] = T.Broadcast( - T.float32(1), - T.vscale() * 4) - - error_msg = f"Creating scalable vectors from existing vectors is not supported." - with pytest.raises(tvm.error.InternalError, match=error_msg): - with tvm.target.Target(sve_target): - tilelang.transform.VectorizeLoop()(Module) - - -@tilelang.testing.requires_llvm -@pytest.mark.parametrize("extent, target", [(4, simple_target), (T.vscale() * 4, sve_target)]) -def test_vectorize_with_if(extent, target): - - @I.ir_module - class Before: - - @T.prim_func - def main(A: T.Tensor((25,), "float32"), n: T.int32, x: T.int32): - for i in T.vectorized(extent): - if x < n: - A[i] = A[i] + T.float32(1) - else: - if i < n: - A[i] = T.float32(2) - - @I.ir_module - class After: - - @T.prim_func - def main(A: T.Tensor((25,), "float32"), n: T.int32, x: T.int32): - if x < n: - A[T.Ramp(0, 1, - extent)] = A[T.Ramp(0, 1, extent)] + T.Broadcast(T.float32(1), extent) - else: - for i_s in range(extent): - if i_s < n: - A[i_s] = T.float32(2) - - with tvm.target.Target(target): - mod = tilelang.transform.VectorizeLoop()(Before) - tvm.ir.assert_structural_equal(mod, After) - - -@tilelang.testing.requires_llvm -def test_vectorize_with_if_cond_int64(): - m = te.size_var("m", dtype="int64") - A = te.placeholder((m,), name="A", dtype="float32") - B = te.compute((m,), lambda i: te.if_then_else(i < 2, A[i], A[i] * 2), name="B") - s = te.create_schedule(B.op) - x, y = s[B].split(B.op.axis[0], factor=4) - s[B].vectorize(y) - f = tvm.build(s, [A, B], "llvm") - - -@tilelang.testing.requires_llvm -@pytest.mark.parametrize("extent, target", [(4, simple_target), (T.vscale() * 4, sve_target)]) -def test_vectorize_let(extent, target): - - @I.ir_module - class Before: - - @T.prim_func - def main(A: T.Tensor((25,), "float32")): - for i in T.vectorized(extent): - v = A[i] + T.float32(1) - A[i] = v + T.float32(2) - - @I.ir_module - class After: - - @T.prim_func - def main(A: T.Tensor((25,), "float32")): - v = A[T.Ramp(0, 1, extent)] + T.Broadcast(T.float32(1), extent) - A[T.Ramp(0, 1, extent)] = v + T.Broadcast(T.float32(2), extent) - - with 
tvm.target.Target(target): - mod = tilelang.transform.VectorizeLoop()(Before) - tvm.ir.assert_structural_equal(mod, After) - - -@tilelang.testing.requires_llvm -@pytest.mark.parametrize("extent, target", [(4, simple_target), (tvm.tir.vscale() * 4, sve_target)]) -def test_vectorize_with_le_cond(extent, target): - n = te.var("n") - ib = tvm.tir.ir_builder.create() - A = ib.pointer("float32", name="A") - with ib.for_range(0, extent, kind="vectorize") as i: - with ib.if_scope(i <= n): - A[i] = A[i] + 1 - stmt = ib.get() - - mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A, n], stmt)) - - with tvm.target.Target(target): - stmt = tilelang.transform.VectorizeLoop()(mod)["main"].body - - # Check that the loop wasn't vectorised - assert isinstance(stmt, tvm.tir.For) - - -@tilelang.testing.requires_llvm -@pytest.mark.parametrize("extent, target", [(4, simple_target), (tvm.tir.vscale() * 4, sve_target)]) -def test_vectorize_with_ge_cond(extent, target): - n = te.var("n") - ib = tvm.tir.ir_builder.create() - A = ib.pointer("float32", name="A") - with ib.for_range(0, extent, kind="vectorize") as i: - with ib.if_scope(i >= n): - A[i] = A[i] + 1 - stmt = ib.get() - - mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([A, n], stmt)) - - with tvm.target.Target(target): - stmt = tilelang.transform.VectorizeLoop()(mod)["main"].body - - # Check that the loop wasn't vectorised - assert isinstance(stmt, tvm.tir.For) - - -@tilelang.testing.requires_llvm -@pytest.mark.parametrize("extent, target", [(4, simple_target), (T.vscale() * 4, sve_target)]) -def test_vectorize_if_then_else_scalarize(extent, target): - - @I.ir_module - class Before: - - @T.prim_func - def main(A: T.Tensor((25,), "float32")): - for i in T.vectorized(extent): - A[i] = T.if_then_else(i > 0, A[i] + T.float32(1), A[i]) - - @I.ir_module - class After: - - @T.prim_func - def main(A: T.Tensor((25,), "float32")): - for i_s in range(extent): - A[i_s] = T.if_then_else(i_s > 0, A[i_s] + T.float32(1), A[i_s]) - - with tvm.target.Target(target): - mod = tilelang.transform.VectorizeLoop()(Before) - tvm.ir.assert_structural_equal(mod, After) - - -@tilelang.testing.requires_llvm -@pytest.mark.parametrize("extent, target", [(4, simple_target), (T.vscale() * 4, sve_target)]) -def test_vectorize_if_then_else_vector(extent, target): - - @I.ir_module - class Before: - - @T.prim_func - def main(A: T.Tensor((25,), "float32"), n: T.int32): - for i in range(n): - for j in T.vectorized(extent): - A[i * extent + j] = T.if_then_else(i > 0, A[i * extent + j], 0) - - @I.ir_module - class After: - - @T.prim_func - def main(A: T.Tensor((25,), "float32"), n: T.int32): - for i in range(n): - A[T.Ramp(i * extent, 1, extent)] = T.if_then_else(i > 0, - A[T.Ramp(i * extent, 1, extent)], - T.Broadcast(0, extent)) - - with tvm.target.Target(target): - mod = tilelang.transform.VectorizeLoop()(Before) - tvm.ir.assert_structural_equal(mod, After) - - -@tilelang.testing.requires_llvm -def test_vectorize_while_fail(): - """A while loop inside a vectorized loop should fail.""" - - n = 64 - num_iter = 10 - - def test_ir(A, B, C): - ib = tvm.tir.ir_builder.create() - n = C.shape[0] - A = ib.buffer_ptr(A) - B = ib.buffer_ptr(B) - C = ib.buffer_ptr(C) - i = ib.allocate("int32", (1,), name="i", scope="local") - i[0] = 0 - - with ib.for_range(0, n) as j: - C[j] = 0.0 - - with ib.for_range(0, n, kind="vectorize") as j: - with ib.while_loop(i[0] < num_iter): - C[j] += A[j] + B[j] - i[0] += 1 - - return ib.get() - - dtype = "float32" - A = te.placeholder((n,), name="A", dtype=dtype) - B = 
te.placeholder((n,), name="B", dtype=dtype) - - C = te.extern( - (n,), - [A, B], - lambda ins, outs: test_ir(ins[0], ins[1], outs[0]), - name="while_vectorize", - dtype=dtype, - ) - s = te.create_schedule(C.op) - - try: - tvm.lower(s, [A, B, C], "llvm") - assert False - except tvm.error.TVMError as e: - error_msg = str(e).split("\n")[-1] - expected = "A while loop inside a vectorized loop not supported" - assert expected in error_msg - - -@tilelang.testing.requires_llvm -def test_vectorize_dtype_mismatch(): - n = tvm.tir.IntImm("int64", 4) - A = te.compute((n,), lambda i: tvm.tir.IntImm("int64", 2**31 - 1) + i, name="A") - s = te.create_schedule(A.op) - s[A].vectorize(A.op.axis[0]) - tvm.lower(s, [A], "llvm", simple_mode=True) - - -@tilelang.testing.requires_llvm -@pytest.mark.parametrize( - "extent, vec_str, target", - [(16, "float32x16", simple_target), (T.vscale() * 8, "float32xvscalex8", sve_target)], -) -def test_vectorize_with_reinterpret(extent, vec_str, target): - - @I.ir_module - class Before: - - @T.prim_func - def main(A: T.Tensor((16,), "int32"), B: T.Tensor((16,), "float32")): - for i in T.vectorized(0, extent): - B[i] = T.reinterpret("float32", A[i]) - - @I.ir_module - class After: - - @T.prim_func - def main(A: T.Tensor((16,), "int32"), B: T.Tensor((16,), "float32")): - B[T.Ramp(0, 1, extent)] = T.reinterpret(vec_str, A[T.Ramp(0, 1, extent)]) - - with tvm.target.Target(target): - mod = tilelang.transform.VectorizeLoop()(Before) - tvm.ir.assert_structural_equal(mod, After) - - -@tilelang.testing.requires_llvm -@pytest.mark.parametrize("extent, target", [(4, simple_target), (T.vscale() * 4, sve_target)]) -@pytest.mark.parametrize( - "op", - ( - T.Mul, - T.Add, - T.Sub, - T.Div, - T.Mod, - T.FloorDiv, - T.FloorMod, - T.Min, - T.Max, - T.EQ, - T.LT, - T.LE, - T.GE, - T.GT, - T.NE, - ), -) -def test_vectorize_binary(op, extent, target): - - @I.ir_module - class Before: - - @T.prim_func - def main(A: T.Tensor((25,), "float32"), B: T.Tensor((25,), "float32")): - for j in T.vectorized(extent): - A[j] = op(T.float32(3), B[j]) - - @I.ir_module - class After: - - @T.prim_func - def main(A: T.Tensor((25,), "float32"), B: T.Tensor((25,), "float32")): - A[T.Ramp(0, 1, extent)] = op(T.Broadcast(T.float32(3), extent), B[T.Ramp(0, 1, extent)]) - - with tvm.target.Target(target): - mod = tilelang.transform.VectorizeLoop()(Before) - tvm.ir.assert_structural_equal(mod, After) - - -@tilelang.testing.requires_llvm -@pytest.mark.parametrize("extent, target", [(4, simple_target), (T.vscale() * 4, sve_target)]) -@pytest.mark.parametrize("op", (T.And, T.Or)) -def test_vectorize_logical(op, extent, target): - - @I.ir_module - class Before: - - @T.prim_func - def main(A: T.Tensor((25,), "bool"), B: T.Tensor((25,), "bool")): - for j in T.vectorized(extent): - A[j] = op(T.bool(1), B[j]) - - @I.ir_module - class After: - - @T.prim_func - def main(A: T.Tensor((25,), "bool"), B: T.Tensor((25,), "bool")): - A[T.Ramp(0, 1, extent)] = op(T.Broadcast(T.bool(1), extent), B[T.Ramp(0, 1, extent)]) - - with tvm.target.Target(target): - mod = tilelang.transform.VectorizeLoop()(Before) - tvm.ir.assert_structural_equal(mod, After) - - -@tilelang.testing.requires_llvm -@pytest.mark.parametrize("extent, target", [(4, simple_target), (T.vscale() * 4, sve_target)]) -def test_vectorize_select(extent, target): - - @I.ir_module - class Before: - - @T.prim_func - def main(A: T.Tensor((25,), "float32"), B: T.Tensor((25,), "float32")): - for j in T.vectorized(extent): - A[j] = T.Select(T.bool(True), A[j], B[j]) - - @I.ir_module 
- class After: - - @T.prim_func - def main(A: T.Tensor((25,), "float32"), B: T.Tensor((25,), "float32")): - A[T.Ramp(0, 1, extent)] = T.Select( - T.Broadcast(T.bool(True), extent), - A[T.Ramp(0, 1, extent)], - B[T.Ramp(0, 1, extent)], - ) - - with tvm.target.Target(target): - mod = tilelang.transform.VectorizeLoop()(Before) - tvm.ir.assert_structural_equal(mod, After) - - -@tilelang.testing.requires_llvm -@pytest.mark.parametrize( - "extent, vec_str, target", - [(4, "int32x4", simple_target), (T.vscale() * 4, "int32xvscalex4", sve_target)], -) -def test_vectorize_cast(extent, vec_str, target): - - @I.ir_module - class Before: - - @T.prim_func - def main(A: T.Tensor((25,), "int32"), B: T.Tensor((25,), "float32")): - for j in T.vectorized(extent): - A[j] = T.Cast("int32", B[j]) - - @I.ir_module - class After: - - @T.prim_func - def main(A: T.Tensor((25,), "int32"), B: T.Tensor((25,), "float32")): - A[T.Ramp(0, 1, extent)] = T.Cast(vec_str, B[T.Ramp(0, 1, extent)]) - - with tvm.target.Target(target): - mod = tilelang.transform.VectorizeLoop()(Before) - tvm.ir.assert_structural_equal(mod, After) - - -@tilelang.testing.requires_llvm -def test_illegal_extent(): - - @I.ir_module(check_well_formed=False) - class Mod: - - @T.prim_func - def main(A: T.Tensor((25,), "int32")): - n = T.Var("n", dtype="int32") - for j in T.vectorized(n): - A[j] = 3 - - error_msg = f"Failed to vectorize loop with extent n for target \\(nullptr\\)" - with pytest.raises(tvm.error.InternalError, match=error_msg): - tilelang.transform.VectorizeLoop()(Mod) - - -@tilelang.testing.requires_llvm -def test_illegal_vscale_in_non_sve_compilation(): - - @I.ir_module - class Mod: - - @T.prim_func - def main(A: T.Tensor((16,), "float32")): - for j in T.vectorized(0, 4 * T.vscale()): - A[j] = 13 - - msg = (f"Failed to vectorize loop with extent T.vscale\\(\\) \\* 4 for target " - f"llvm -keys=cpu -mtriple=x86_64-linux-gnu") - with tvm.target.Target(simple_target): - with pytest.raises(tvm.error.InternalError, match=msg): - tilelang.transform.VectorizeLoop()(Mod) - - -if __name__ == "__main__": - tilelang.testing.main() diff --git a/testing/python/transform/test_tilelang_transform_warp_specialized.py b/testing/python/transform/test_tilelang_transform_warp_specialized.py index bd787621a..b075d04f9 100644 --- a/testing/python/transform/test_tilelang_transform_warp_specialized.py +++ b/testing/python/transform/test_tilelang_transform_warp_specialized.py @@ -44,7 +44,7 @@ def before(A: T.Tensor((M, K), dtype), B: T.Tensor((K, N), dtype)): A_shared = T.alloc_buffer((3, 1, 8, 256), "float16", scope="shared.dyn") B_shared = T.alloc_buffer((3, 1, 4, 512), "float16", scope="shared.dyn") C_local = T.alloc_buffer((32,), scope="local") - for k in T.serial(16, annotations={"num_stages": 3}): + for k in T.serial(16, annotations={"num_stages": T.int32(3)}): if v == 0: T.tma_load( T.create_tma_descriptor(6, 2, A.data, 512, 512, 2, 1024, 32, 64, 1, 1, 0, 2, @@ -118,4 +118,4 @@ def after(A: T.Tensor((M, K), dtype), B: T.Tensor((K, N), dtype)): if __name__ == "__main__": - test_warp_specialized() + tilelang.testing.main() \ No newline at end of file diff --git a/testing/python/utils/test_compress_utils.py b/testing/python/utils/test_compress_utils.py new file mode 100644 index 000000000..ce88a3a09 --- /dev/null +++ b/testing/python/utils/test_compress_utils.py @@ -0,0 +1,62 @@ +import torch +import tilelang +from tilelang.utils.sparse import compress_sm90 + + +def generate_2_to_4_sparse_tensor(shape, dtype=torch.float32, device='cpu'): + if shape[-1] % 4 
!= 0: + raise ValueError("Last dimension must be divisible by 4 for 2:4 sparsity.") + + full_tensor = torch.randn(shape, dtype=torch.float32, device=device) + mask = torch.zeros_like(full_tensor, dtype=torch.bool) + + group_count = shape[-1] // 4 + group_shape = shape[:-1] + (group_count, 4) + + reshaped = full_tensor.view(*group_shape) + + for idx in range(reshaped.numel() // 4): + flat_idx = torch.randint(0, 4, (2,), dtype=torch.int64) + while flat_idx[0] == flat_idx[1]: + flat_idx[1] = torch.randint(0, 4, (1,), dtype=torch.int64) + i = idx // group_count + j = idx % group_count + mask.view(*group_shape)[i, j, flat_idx[0]] = True + mask.view(*group_shape)[i, j, flat_idx[1]] = True + + sparse_tensor = full_tensor * mask + return sparse_tensor.to(dtype) + + +def _test_compress_sm90(M, K, block_k, dtype): + A = generate_2_to_4_sparse_tensor((M, K), dtype=dtype, device='cuda') + A_sparse, E = compress_sm90(A, block_k, False) + + +@tilelang.testing.requires_cuda +@tilelang.testing.requires_cuda_compute_version(9, 0) +def test_compress_sm90(): + _test_compress_sm90(1024, 1024, 128, torch.float16) + _test_compress_sm90(1024, 1024, 64, torch.float16) + _test_compress_sm90(1024, 1024, 32, torch.float16) + + _test_compress_sm90(1024, 1024, 128, torch.bfloat16) + _test_compress_sm90(1024, 1024, 64, torch.bfloat16) + _test_compress_sm90(1024, 1024, 32, torch.bfloat16) + + _test_compress_sm90(1024, 1024, 64, torch.float32) + _test_compress_sm90(1024, 1024, 32, torch.float32) + _test_compress_sm90(1024, 1024, 16, torch.float32) + + _test_compress_sm90(1024, 1024, 256, torch.float8_e4m3fn) + _test_compress_sm90(1024, 1024, 128, torch.float8_e4m3fn) + _test_compress_sm90(1024, 1024, 64, torch.float8_e4m3fn) + + _test_compress_sm90(1024, 1024, 256, torch.float8_e5m2) + _test_compress_sm90(1024, 1024, 128, torch.float8_e5m2) + _test_compress_sm90(1024, 1024, 64, torch.float8_e5m2) + + +if __name__ == "__main__": + test_compress_sm90() + print("All tests passed.") diff --git a/tilelang/__init__.py b/tilelang/__init__.py index 8fe53c2bb..0c0146bdc 100644 --- a/tilelang/__init__.py +++ b/tilelang/__init__.py @@ -57,7 +57,7 @@ def _init_logger(): from .env import enable_cache, disable_cache, is_cache_enabled # noqa: F401 import tvm -import tvm._ffi.base +import tvm.base from tvm import DataType # noqa: F401 from . 
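
As a quick illustration of the 2:4 constraint that generate_2_to_4_sparse_tensor and compress_sm90 exercise in the new test above: every contiguous group of four elements along the last dimension keeps at most two nonzeros. The sketch below is an alternative, assumption-light generator (keep the two largest-magnitude entries per group); it is illustrative only and not the helper used by the test.

import torch

def prune_2_to_4(dense: torch.Tensor) -> torch.Tensor:
    # Keep the two largest-magnitude values in every contiguous group of four,
    # zeroing the rest, which satisfies the 2:4 structured-sparsity pattern.
    assert dense.shape[-1] % 4 == 0, "last dimension must be divisible by 4"
    groups = dense.reshape(*dense.shape[:-1], -1, 4)            # (..., K/4, 4)
    keep = groups.abs().topk(2, dim=-1).indices                 # two survivors per group
    mask = torch.zeros_like(groups, dtype=torch.bool).scatter_(-1, keep, True)
    return (groups * mask).reshape(dense.shape)

x = torch.randn(8, 16)
y = prune_2_to_4(x)
assert (y.reshape(8, -1, 4) != 0).sum(dim=-1).le(2).all()
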
import libinfo @@ -69,7 +69,7 @@ def _load_tile_lang_lib(): for path in libinfo.get_dll_directories(): os.add_dll_directory(path) # pylint: disable=protected-access - lib_name = "tilelang" if tvm._ffi.base._RUNTIME_ONLY else "tilelang_module" + lib_name = "tilelang" if tvm.base._RUNTIME_ONLY else "tilelang_module" # pylint: enable=protected-access lib_path = libinfo.find_lib_path(lib_name, optional=False) return ctypes.CDLL(lib_path[0]), lib_path[0] diff --git a/tilelang/_ffi_api.py b/tilelang/_ffi_api.py index 550601f94..d4fb0be49 100644 --- a/tilelang/_ffi_api.py +++ b/tilelang/_ffi_api.py @@ -1,6 +1,6 @@ """FFI APIs for tilelang""" -import tvm._ffi +import tvm.ffi # TVM_REGISTER_GLOBAL("tl.name").set_body_typed(func); -tvm._ffi._init_api("tl", __name__) # pylint: disable=protected-access +tvm.ffi._init_api("tl", __name__) # pylint: disable=protected-access diff --git a/tilelang/autotuner/param.py b/tilelang/autotuner/param.py index 93c72c18d..fcf9eb7ff 100644 --- a/tilelang/autotuner/param.py +++ b/tilelang/autotuner/param.py @@ -149,14 +149,14 @@ class AutotuneResult: func: Optional[Callable] = None kernel: Optional[Callable] = None - def _save_kernel_to_disk(self, cache_path: Path, kernel: JITKernel): + def _save_kernel_to_disk(self, cache_path: Path, kernel: JITKernel, verbose: bool = False): """ Persists a compiled kernel to disk cache. Args: - key (str): The hash key identifying the kernel. + cache_path (Path): The root path for the cache files. kernel (JITKernel): The compiled kernel to be saved. - func (Callable, optional): The original function. + verbose (bool): Enable verbose log messages. Note: Saves the following files: @@ -170,6 +170,8 @@ def _save_kernel_to_disk(self, cache_path: Path, kernel: JITKernel): # Save kernel source code try: kernel_path = os.path.join(cache_path, KERNEL_PATH) + if verbose: + logger.debug(f"Saving kernel source code to file: {kernel_path}") if kernel.artifact.kernel_source is not None: with open(kernel_path, "w") as f: f.write(kernel.artifact.kernel_source) @@ -179,6 +181,8 @@ def _save_kernel_to_disk(self, cache_path: Path, kernel: JITKernel): # Save wrapped kernel source code try: wrapped_kernel_path = os.path.join(cache_path, WRAPPED_KERNEL_PATH) + if verbose: + logger.debug(f"Saving wrapped kernel source code to file: {wrapped_kernel_path}") with open(wrapped_kernel_path, "w") as f: f.write(kernel.get_kernel_source()) except Exception as e: @@ -188,6 +192,8 @@ def _save_kernel_to_disk(self, cache_path: Path, kernel: JITKernel): try: kernel_lib_path = os.path.join(cache_path, KERNEL_LIB_PATH) src_lib_path = kernel.adapter.libpath + if verbose: + logger.debug(f"Saving kernel library to file: {kernel_lib_path}") shutil.copy(src_lib_path, kernel_lib_path) except Exception as e: logger.error(f"Error saving kernel library to disk: {e}") @@ -195,6 +201,8 @@ def _save_kernel_to_disk(self, cache_path: Path, kernel: JITKernel): # Save kernel parameters try: params_path = os.path.join(cache_path, PARAMS_PATH) + if verbose: + logger.debug(f"Saving kernel parameters to disk: {params_path}") with open(params_path, "wb") as f: cloudpickle.dump(kernel.params, f) except Exception as e: @@ -209,6 +217,7 @@ def _load_kernel_from_disk( execution_backend: Literal["dlpack", "ctypes", "cython"] = "cython", pass_configs: dict = None, func: Callable = None, + verbose: bool = False, ) -> JITKernel: """ Loads a previously compiled kernel from disk cache. @@ -221,6 +230,7 @@ def _load_kernel_from_disk( execution_backend (Literal): Backend type for execution. 
Defaults to "cython". pass_configs (dict, optional): Configuration for compiler passes. func (Callable, optional): The original function. + verbose (bool): Enable verbose log messages. Returns: JITKernel: The loaded kernel if found, None otherwise. @@ -234,6 +244,8 @@ def _load_kernel_from_disk( try: wrapped_kernel_path = os.path.join(cache_path, WRAPPED_KERNEL_PATH) + if verbose: + logger.debug(f"Loading wrapped kernel source code from file: {wrapped_kernel_path}") with open(wrapped_kernel_path, "r") as f: kernel_global_source = f.read() except Exception as e: @@ -244,6 +256,8 @@ def _load_kernel_from_disk( # Load kernel parameters try: params_path = os.path.join(cache_path, PARAMS_PATH) + if verbose: + logger.debug(f"Loading kernel parameters from file: {params_path}") with open(params_path, "rb") as f: kernel_params = cloudpickle.load(f) except Exception as e: @@ -264,19 +278,25 @@ def _load_kernel_from_disk( else: return None - def save_to_disk(self, path: Path): + def save_to_disk(self, path: Path, verbose: bool = False): if not os.path.exists(path): os.makedirs(path) # save best config + if verbose: + logger.debug(f"Saving best config to file: {path / BEST_CONFIG_PATH}") with open(path / BEST_CONFIG_PATH, "w") as f: json.dump(self.config, f) # save function + if verbose: + logger.debug(f"Saving function to file: {path / FUNCTION_PATH}") with open(path / FUNCTION_PATH, "wb") as f: cloudpickle.dump(self.func, f) # save ref latency + if verbose: + logger.debug(f"Saving latency to file: {path / LATENCY_PATH}") with open(path / LATENCY_PATH, "w") as f: json.dump({ "latency": self.latency, @@ -291,15 +311,22 @@ def load_from_disk(cls, path: Path, compile_args: CompileArgs) -> 'AutotuneResul if not os.path.exists(path): return None + verbose = compile_args.verbose # load best config + if verbose: + logger.debug(f"Loading best config from file: {path / BEST_CONFIG_PATH}") with open(path / BEST_CONFIG_PATH, "r") as f: config = json.load(f) # load function + if verbose: + logger.debug(f"Loading function from file: {path / FUNCTION_PATH}") with open(path / FUNCTION_PATH, "rb") as f: func = cloudpickle.load(f) # load latency + if verbose: + logger.debug(f"Loading latency from file: {path / LATENCY_PATH}") with open(path / LATENCY_PATH, "r") as f: latency = json.load(f) latency, ref_latency = latency["latency"], latency["ref_latency"] diff --git a/tilelang/autotuner/tuner.py b/tilelang/autotuner/tuner.py index c2a0b1a15..008807a79 100644 --- a/tilelang/autotuner/tuner.py +++ b/tilelang/autotuner/tuner.py @@ -203,7 +203,7 @@ def set_profile_args(self, logger.warning( "`supply_prog` will be ignored as this program is under `with set_autotune_inputs` context." 
) - supply_prog = lambda _: get_autotune_inputs() # noqa: E731· + supply_prog = lambda _: get_autotune_inputs() # noqa: E731 self.profile_args = ProfileArgs( supply_type=supply_type, @@ -257,7 +257,7 @@ def generate_cache_key(self, parameters: Dict[str, Any]) -> Optional[AutotuneRes return hashlib.sha256(key_string.encode()).hexdigest() def _save_result_to_disk(self, key, result: AutotuneResult): - result.save_to_disk(self.cache_dir / key) + result.save_to_disk(self.cache_dir / key, self.compile_args.verbose) def _load_result_from_disk(self, key) -> AutotuneResult: result = AutotuneResult.load_from_disk(self.cache_dir / key, self.compile_args) diff --git a/tilelang/cache/kernel_cache.py b/tilelang/cache/kernel_cache.py index bd483b8d7..02b1e0086 100644 --- a/tilelang/cache/kernel_cache.py +++ b/tilelang/cache/kernel_cache.py @@ -165,7 +165,7 @@ def cached( # Then check disk cache kernel = self._load_kernel_from_disk(key, target, target_host, out_idx, - execution_backend, pass_configs, func) + execution_backend, pass_configs, func, verbose) if kernel is not None: if verbose: self.logger.debug( @@ -174,6 +174,8 @@ def cached( self._memory_cache[key] = kernel return kernel + if verbose: + self.logger.debug(f"No cached kernel for {func.attrs['global_symbol']}") # Compile kernel if cache miss; leave critical section kernel = JITKernel( func, @@ -189,7 +191,7 @@ def cached( else: with self._lock: if is_cache_enabled(): - self._save_kernel_to_disk(key, kernel, func) + self._save_kernel_to_disk(key, kernel, func, verbose) # Store in memory cache after compilation self._memory_cache[key] = kernel @@ -231,7 +233,11 @@ def _safe_write_file(path: str, mode: str, operation: Callable): # Use atomic POSIX replace, so other processes cannot see a partial write os.replace(temp_path, path) - def _save_kernel_to_disk(self, key: str, kernel: JITKernel, func: Callable = None): + def _save_kernel_to_disk(self, + key: str, + kernel: JITKernel, + func: Callable = None, + verbose: bool = False): """ Persists a compiled kernel to disk cache. @@ -239,6 +245,7 @@ def _save_kernel_to_disk(self, key: str, kernel: JITKernel, func: Callable = Non key (str): The hash key identifying the kernel. kernel (JITKernel): The compiled kernel to be saved. func (Callable, optional): The original function. + verbose (bool): Enable verbose log messages. 
Note: Saves the following files: @@ -253,6 +260,8 @@ def _save_kernel_to_disk(self, key: str, kernel: JITKernel, func: Callable = Non # Save kernel source code try: kernel_path = os.path.join(cache_path, KERNEL_PATH) + if verbose: + self.logger.debug(f"Saving kernel source code to file: {kernel_path}") if kernel.artifact.kernel_source is not None: KernelCache._safe_write_file(kernel_path, "w", lambda file: file.write(kernel.artifact.kernel_source)) @@ -262,6 +271,9 @@ def _save_kernel_to_disk(self, key: str, kernel: JITKernel, func: Callable = Non # Save wrapped kernel source code try: wrapped_kernel_path = os.path.join(cache_path, WRAPPED_KERNEL_PATH) + if verbose: + self.logger.debug( + f"Saving wrapped kernel source code to file: {wrapped_kernel_path}") KernelCache._safe_write_file( wrapped_kernel_path, "w", lambda file: file.write(kernel.adapter.get_kernel_source())) @@ -274,6 +286,8 @@ def _save_kernel_to_disk(self, key: str, kernel: JITKernel, func: Callable = Non kernel_lib_path = KERNEL_CUBIN_PATH if self.execution_backend == "nvrtc" else KERNEL_LIB_PATH kernel_lib_path = os.path.join(cache_path, kernel_lib_path) src_lib_path = kernel.adapter.libpath + if verbose: + self.logger.debug(f"Saving kernel library to file: {kernel_lib_path}") KernelCache._safe_write_file( kernel_lib_path, "wb", lambda file: file.write(KernelCache._load_binary(src_lib_path))) @@ -282,6 +296,8 @@ def _save_kernel_to_disk(self, key: str, kernel: JITKernel, func: Callable = Non if self.execution_backend == "nvrtc": kernel_py_path = os.path.join(cache_path, KERNEL_PY_PATH) src_lib_path = src_lib_path.replace(".cubin", ".py") + if verbose: + self.logger.debug(f"Saving kernel nvrtc python code to file: {kernel_py_path}") KernelCache._safe_write_file( kernel_py_path, "wb", lambda file: file.write(KernelCache._load_binary(src_lib_path))) @@ -291,6 +307,8 @@ def _save_kernel_to_disk(self, key: str, kernel: JITKernel, func: Callable = Non # Save kernel parameters try: params_path = os.path.join(cache_path, PARAMS_PATH) + if verbose: + self.logger.debug(f"Saving kernel parameters to disk: {params_path}") KernelCache._safe_write_file(params_path, "wb", lambda file: cloudpickle.dump(kernel.params, file)) except Exception as e: @@ -305,6 +323,7 @@ def _load_kernel_from_disk( execution_backend: Literal["dlpack", "ctypes", "cython", "nvrtc"] = "cython", pass_configs: dict = None, func: Callable = None, + verbose: bool = False, ) -> Optional[JITKernel]: """ Loads a previously compiled kernel from disk cache. @@ -317,6 +336,7 @@ def _load_kernel_from_disk( execution_backend (Literal): Backend type for execution. Defaults to "cython". pass_configs (dict, optional): Configuration for compiler passes. func (Callable, optional): The original function. + verbose (bool): Enable verbose log messages. Returns: JITKernel: The loaded kernel if found, None otherwise. 
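
For context on the _safe_write_file helper referenced above: cache writes go through a temporary file followed by an atomic os.replace, so a concurrent reader never observes a half-written kernel artifact. A minimal standalone sketch of that pattern (names and the tempfile-based implementation here are illustrative, not the cache's actual code):

import os
import tempfile

def safe_write_file(path: str, mode: str, operation):
    # Write into a temp file in the destination directory, then atomically
    # replace the target so other processes cannot see a partial write.
    dirname = os.path.dirname(path) or "."
    fd, temp_path = tempfile.mkstemp(dir=dirname)
    try:
        with os.fdopen(fd, mode) as f:
            operation(f)
        os.replace(temp_path, path)
    except BaseException:
        if os.path.exists(temp_path):
            os.unlink(temp_path)
        raise

safe_write_file("params.bin", "wb", lambda f: f.write(b"\x00" * 16))
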
@@ -334,6 +354,9 @@ def _load_kernel_from_disk( # Load the kernel source file (optional) try: + if verbose: + self.logger.debug( + f"Loading wrapped kernel source code from file: {wrapped_kernel_path}") with open(wrapped_kernel_path, "r") as f: kernel_global_source = f.read() except Exception as e: @@ -341,6 +364,8 @@ def _load_kernel_from_disk( # Load kernel parameters try: + if verbose: + self.logger.debug(f"Loading kernel parameters from file: {params_path}") with open(params_path, "rb") as f: kernel_params = cloudpickle.load(f) except Exception as e: diff --git a/tilelang/carver/analysis.py b/tilelang/carver/analysis.py index e37a39f8c..653392df7 100644 --- a/tilelang/carver/analysis.py +++ b/tilelang/carver/analysis.py @@ -3,7 +3,7 @@ from typing_extensions import Literal from tvm import ir, tir, DataType -from tvm._ffi import get_global_func +from tvm.ffi import get_global_func from tvm.target.target import Target from tvm.tir import Schedule, IterVar from tvm.tir.schedule import BlockRV diff --git a/tilelang/carver/arch/__init__.py b/tilelang/carver/arch/__init__.py index 8e4361340..d14645e24 100644 --- a/tilelang/carver/arch/__init__.py +++ b/tilelang/carver/arch/__init__.py @@ -6,6 +6,7 @@ from tvm.target import Target import torch + def get_arch(target: Union[str, Target] = "cuda") -> TileDevice: if isinstance(target, str): target = Target(target) diff --git a/tilelang/carver/arch/cuda.py b/tilelang/carver/arch/cuda.py index c778b1679..82952f38d 100644 --- a/tilelang/carver/arch/cuda.py +++ b/tilelang/carver/arch/cuda.py @@ -68,15 +68,15 @@ def has_mma_support(arch: TileDevice) -> bool: ("float16", "float32"), ("float16", "float16"), ("int8", "int32"), - ("e5m2_float8", "float32"), - ("e4m3_float8", "float32"), + ("float8_e5m2", "float32"), + ("float8_e4m3", "float32"), ] hopper_tensorcore_supported = ada_tensorcore_supported # TODO(lei): we should consider the dtype of the input a and b # instead of assuming both a and b share the same dtype. 
-# As the tensorcore may supports e4m3_float8 * e5m2_float8 +# As the tensorcore may supports float8_e4m3 * float8_e5m2 def is_tensorcore_supported_precision(in_dtype: str, accum_dtype: str, arch: TileDevice) -> bool: if is_volta_arch(arch): diff --git a/tilelang/carver/matmul_analysis.py b/tilelang/carver/matmul_analysis.py index 5f687437e..dfc1a53e9 100644 --- a/tilelang/carver/matmul_analysis.py +++ b/tilelang/carver/matmul_analysis.py @@ -695,14 +695,14 @@ def get_propagate_map(trans: bool = True, dtype="float16", matrix_name="A", inde "bfloat16", "float16", "int8", - "e4m3_float8", - "e5m2_float8", - ], "Only support bfloat16, float16, int8, e4m3_float8, e5m2_float8" + "float8_e4m3", + "float8_e5m2", + ], "Only support bfloat16, float16, int8, float8_e4m3, float8_e5m2" # TODO(lei): actually should analyze based on bits instead of dtype if dtype in ["bfloat16", "float16"]: ldmatrix_layout = ldmatrix_32x8_to_shared_16x16_layout ldmatrix_layout_trans = ldmatrix_trans_32x8_to_shared_16x16_layout - elif dtype in ["int8", "e4m3_float8", "e5m2_float8"]: + elif dtype in ["int8", "float8_e4m3", "float8_e5m2"]: # int8 mma only support 32x16 to 16x32 layout if matrix_name == "A" and trans is False: ldmatrix_layout = ldmatrix_32x16_to_shared_16x32_layout_a @@ -760,12 +760,12 @@ def shared_32x16_to_mma_32x16_layout(i, j): "bfloat16", "float16", "int8", - "e4m3_float8", - "e5m2_float8", - ], "Only support float16, int8, e4m3_float8, e5m2_float8" + "float8_e4m3", + "float8_e5m2", + ], "Only support float16, int8, float8_e4m3, float8_e5m2" if dtype in ["bfloat16", "float16"]: stage3_layout = shared_32x8_to_mma_32x8_layout - elif dtype in ["int8", "e4m3_float8", "e5m2_float8"]: + elif dtype in ["int8", "float8_e4m3", "float8_e5m2"]: stage3_layout = shared_32x16_to_mma_32x16_layout else: raise ValueError("Unknown dtype ", dtype) diff --git a/tilelang/carver/roller/policy/tensorcore.py b/tilelang/carver/roller/policy/tensorcore.py index 2a042c833..60edc930e 100644 --- a/tilelang/carver/roller/policy/tensorcore.py +++ b/tilelang/carver/roller/policy/tensorcore.py @@ -281,10 +281,9 @@ def _assign_block_size(self, node: PrimFuncNode, td: TileDict, block_size: int): factors = factorize(np.prod(space) // warps) - def _score(node, thread): # small is better + def _score(node, warp_tile): # small is better score = 0 - block_tile = [int(np.ceil(tile[i] / thread[i])) for i in range(ndim)] - shape = node.propagate_inputs_on_reduction(block_tile) + shape = node.propagate_inputs_on_reduction(warp_tile) input_buffers = node.block_analyzer.get_input_buffers(node.reduction_block) for i, _ in enumerate(input_buffers): score += np.prod(shape[i]) / self.arch.bandwidth[1] diff --git a/tilelang/contrib/cc.py b/tilelang/contrib/cc.py index 5c168a3ac..d833d4a9e 100644 --- a/tilelang/contrib/cc.py +++ b/tilelang/contrib/cc.py @@ -24,7 +24,7 @@ import sys from typing import Dict -from tvm._ffi.base import py_str +from tvm.base import py_str from tvm.contrib import tar as _tar from tvm.contrib import utils as _utils diff --git a/tilelang/contrib/dlpack.py b/tilelang/contrib/dlpack.py index 1a3c72638..58e82f8b1 100644 --- a/tilelang/contrib/dlpack.py +++ b/tilelang/contrib/dlpack.py @@ -37,10 +37,10 @@ def convert_func(tvm_func, tensor_type, to_dlpack_func): import torch float8_dtype_map = { - torch.float8_e4m3fn: "e4m3_float8", + torch.float8_e4m3fn: "float8_e4m3", torch.float8_e4m3fnuz: "float8_e4m3fnuz", - torch.float8_e5m2: "e5m2_float8", - torch.float8_e5m2fnuz: "e5m2_float8", + torch.float8_e5m2: "float8_e5m2", + 
torch.float8_e5m2fnuz: "float8_e5m2", } def adapt_tensor(arg): diff --git a/tilelang/contrib/hipcc.py b/tilelang/contrib/hipcc.py index 7ecb0c13b..afd381223 100644 --- a/tilelang/contrib/hipcc.py +++ b/tilelang/contrib/hipcc.py @@ -9,10 +9,10 @@ import subprocess -import tvm._ffi +import tvm.ffi from tvm.contrib import utils -from tvm._ffi.base import py_str +from tvm.base import py_str from tvm.contrib.rocm import get_rocm_arch, find_rocm_path @@ -96,7 +96,7 @@ def compile_hip(code, return data -@tvm._ffi.register_func("tilelang_callback_hip_compile", override=True) +@tvm.ffi.register_func("tilelang_callback_hip_compile", override=True) def tilelang_callback_hip_compile(code, target): """use hipcc to generate fatbin code for better optimization""" hsaco = compile_hip(code, target_format="hsaco") diff --git a/tilelang/contrib/nvcc.py b/tilelang/contrib/nvcc.py index 8022389b5..5cfe90ced 100644 --- a/tilelang/contrib/nvcc.py +++ b/tilelang/contrib/nvcc.py @@ -8,10 +8,10 @@ import warnings from ..env import CUDA_HOME -import tvm._ffi +import tvm.ffi from tvm.target import Target -from tvm._ffi.base import py_str +from tvm.base import py_str from tvm.contrib import utils @@ -52,9 +52,9 @@ def compile_cuda(code, # "-gencode", "arch=compute_52,code=sm_52", # "-gencode", "arch=compute_70,code=sm_70" # ] - compute_version = "".join( - get_target_compute_version(Target.current(allow_none=True)).split(".")) - arch = ["-gencode", f"arch=compute_{compute_version},code=sm_{compute_version}"] + compute_version = get_target_compute_version(Target.current(allow_none=True)) + target_arch = get_target_arch(compute_version) + arch = ["-gencode", f"arch=compute_{target_arch},code=sm_{target_arch}"] temp = utils.tempdir() file_name = "tvm_kernels" @@ -181,14 +181,14 @@ def get_cuda_version(cuda_path=None): raise RuntimeError("Cannot read cuda version file") -@tvm._ffi.register_func("tilelang_callback_cuda_compile", override=True) +@tvm.ffi.register_func("tilelang_callback_cuda_compile", override=True) def tilelang_callback_cuda_compile(code, target): # pylint: disable=unused-argument """use nvcc to generate fatbin code for better optimization""" ptx = compile_cuda(code, target_format="fatbin") return ptx -@tvm._ffi.register_func("tilelang_callback_libdevice_path", override=True) +@tvm.ffi.register_func("tilelang_callback_libdevice_path", override=True) def find_libdevice_path(arch): """Utility function to find libdevice @@ -253,7 +253,7 @@ def callback_libdevice_path(arch): return "" -@tvm._ffi.register_func("tvm.contrib.nvcc.get_compute_version", override=True) +@tvm.ffi.register_func("tvm.contrib.nvcc.get_compute_version", override=True) def get_target_compute_version(target=None): """Utility function to get compute capability of compilation target. 
@@ -298,7 +298,7 @@ def get_target_compute_version(target=None): "Try specifying it by adding '-arch=sm_xx' to your target.") -def parse_compute_version(compute_version): +def parse_compute_version(compute_version) -> tuple[int, int]: """Parse compute capability string to divide major and minor version Parameters @@ -323,6 +323,14 @@ def parse_compute_version(compute_version): raise RuntimeError("Compute version parsing error") from err +def get_target_arch(compute_version) -> str: + major, minor = parse_compute_version(compute_version) + target_arch = str(major * 10 + minor) + if major >= 9: + target_arch += "a" + return target_arch + + def have_fp16(compute_version): """Either fp16 support is provided in the compute capability or not @@ -391,7 +399,7 @@ def have_cudagraph(): return False -@tvm._ffi.register_func("tvm.contrib.nvcc.supports_bf16", override=True) +@tvm.ffi.register_func("tvm.contrib.nvcc.supports_bf16", override=True) def have_bf16(compute_version): """Either bf16 support is provided in the compute capability or not @@ -404,7 +412,7 @@ def have_bf16(compute_version): return major >= 8 -@tvm._ffi.register_func("tvm.contrib.nvcc.supports_fp8", override=True) +@tvm.ffi.register_func("tvm.contrib.nvcc.supports_fp8", override=True) def have_fp8(compute_version): """Whether fp8 support is provided in the specified compute capability or not @@ -421,7 +429,7 @@ def have_fp8(compute_version): return any(conditions) -@tvm._ffi.register_func("tvm.contrib.nvcc.supports_tma", override=True) +@tvm.ffi.register_func("tvm.contrib.nvcc.supports_tma", override=True) def have_tma(target): """Whether TMA support is provided in the specified compute capability or not diff --git a/tilelang/contrib/nvrtc.py b/tilelang/contrib/nvrtc.py index 97371701a..0f07022c9 100644 --- a/tilelang/contrib/nvrtc.py +++ b/tilelang/contrib/nvrtc.py @@ -1,7 +1,7 @@ import cuda.bindings.nvrtc as nvrtc from typing import Literal, Union, List, Optional, Tuple from tvm.target import Target -from .nvcc import get_target_compute_version +from .nvcc import get_target_compute_version, parse_compute_version def get_nvrtc_version() -> Tuple[int, int]: @@ -42,9 +42,9 @@ def compile_cuda(code: str, if arch is None: # If None, then it will use `tvm.target.Target.current().arch`. # Target arch could be a str like "80", "90", "90a", etc. 
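
The new get_target_arch helper centralizes the compute-capability-to-sm_XX mapping that used to be open-coded at each call site, including the Hopper-specific "a" suffix. A standalone restatement of that mapping for illustration, assuming a "major.minor" compute-version string:

def parse_compute_version(compute_version: str) -> tuple:
    major, minor = compute_version.split(".")[:2]
    return int(major), int(minor)

def get_target_arch(compute_version: str) -> str:
    major, minor = parse_compute_version(compute_version)
    arch = str(major * 10 + minor)
    if major >= 9:
        # SM 9.x targets use the "a" architecture suffix (e.g. sm_90a).
        arch += "a"
    return arch

assert get_target_arch("8.6") == "86"
assert get_target_arch("9.0") == "90a"
# nvcc then receives flags such as: -gencode arch=compute_90a,code=sm_90a
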
- compute_version = "".join( - get_target_compute_version(Target.current(allow_none=True)).split(".")) - arch = int(compute_version) + major, minor = parse_compute_version( + get_target_compute_version(Target.current(allow_none=True))) + arch = major * 10 + minor prefix = "compute" if target_format == "ptx" else "sm" suffix = "a" if arch >= 90 else "" arch_option = f"--gpu-architecture={prefix}_{arch}{suffix}" diff --git a/tilelang/contrib/rocm.py b/tilelang/contrib/rocm.py index a5ad87d56..8bb9e1d85 100644 --- a/tilelang/contrib/rocm.py +++ b/tilelang/contrib/rocm.py @@ -21,8 +21,8 @@ import os from os.path import join, exists -import tvm._ffi -from tvm._ffi.base import py_str +import tvm.ffi +from tvm.base import py_str import tvm.runtime import tvm.target @@ -100,7 +100,7 @@ def rocm_link(in_file, out_file, lld=None): raise RuntimeError(msg) -@tvm._ffi.register_func("tvm_callback_rocm_link", override=True) +@tvm.ffi.register_func("tvm_callback_rocm_link", override=True) def callback_rocm_link(obj_bin): """Links object file generated from LLVM to HSA Code Object @@ -124,7 +124,7 @@ def callback_rocm_link(obj_bin): return cobj_bin -@tvm._ffi.register_func("tvm_callback_rocm_bitcode_path", override=True) +@tvm.ffi.register_func("tvm_callback_rocm_bitcode_path", override=True) def callback_rocm_bitcode_path(rocdl_dir=None): """Utility function to find ROCm device library bitcodes @@ -226,7 +226,7 @@ def have_matrixcore(compute_version=None): return False -@tvm._ffi.register_func("tvm_callback_rocm_get_arch", override=True) +@tvm.ffi.register_func("tvm_callback_rocm_get_arch", override=True) def get_rocm_arch(rocm_path="/opt/rocm"): """Utility function to get the AMD GPU architecture diff --git a/tilelang/engine/lower.py b/tilelang/engine/lower.py index a242f33b2..65a14e6e6 100644 --- a/tilelang/engine/lower.py +++ b/tilelang/engine/lower.py @@ -29,9 +29,11 @@ def has_device_kernel_launch(attrs) -> bool: def is_device_call_c_device(func: tir.PrimFunc): attrs = func.attrs + calling_conv = attrs.get("calling_conv", CallingConv.DEFAULT) + is_cpacked = (calling_conv == CallingConv.C_PACKED_FUNC) # Check if it's a C target - if "target" in attrs and attrs["target"].kind.name == "c": + if "target" in attrs and attrs["target"].kind.name == "c" and not is_cpacked: return True return has_device_kernel_launch(attrs) @@ -62,15 +64,10 @@ def tilelang_callback_cuda_compile(code, target): cutlass_path = os.environ["TL_CUTLASS_PATH"] else: cutlass_path = osp.abspath(osp.join(project_root, "3rdparty/cutlass/include")) - compute_version = "".join(nvcc.get_target_compute_version(target).split(".")) + target_arch = nvcc.get_target_arch(nvcc.get_target_compute_version(target)) - # special handle for Hopper - if compute_version == "90": - arch = ["-arch=sm_90a"] - format = "cubin" - else: - arch = [f"-arch=sm_{compute_version}"] - format = "cubin" + arch = [f"-arch=sm_{target_arch}"] + format = "cubin" # printing out number of registers debug_option = "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage" @@ -130,7 +127,7 @@ def extrac_params(func: tir.PrimFunc) -> List[KernelParam]: def canon_target_host(target: Union[str, Target], target_host: Optional[Union[str, Target]]): if not target_host: - target_host = "llvm" if tvm.runtime.enabled("llvm") else "stackvm" + target_host = "llvm" if tvm.runtime.enabled("llvm") else "c" return target_host @@ -145,9 +142,9 @@ def host_codegen(host_mod: tvm.IRModule, target_host: Target) -> tvm.IRModule: host_mod = 
tilelang.transform.LowerDeviceStorageAccessInfo()(host_mod) host_mod = tir.transform.CombineContextCall()(host_mod) if target_host.kind.name == "llvm": - host_mod = tvm._ffi.get_global_func("target.build.llvm")(host_mod, target_host) + host_mod = tvm.ffi.get_global_func("target.build.llvm")(host_mod, target_host) elif target_host.kind.name == "c": - host_mod = tvm._ffi.get_global_func("target.build.c")(host_mod, target_host) + host_mod = tvm.ffi.get_global_func("target.build.c")(host_mod, target_host) else: raise ValueError(f"Target host {target_host.kind.name} is not supported") return host_mod @@ -159,9 +156,9 @@ def device_codegen(device_mod: tvm.IRModule, target: Target) -> tvm.IRModule: device_mod = tir.transform.Simplify()(device_mod) if target.kind.name == "cuda": - device_mod = tvm._ffi.get_global_func("target.build.tilelang_cuda")(device_mod, target) + device_mod = tvm.ffi.get_global_func("target.build.tilelang_cuda")(device_mod, target) elif target.kind.name == "hip": - device_mod = tvm._ffi.get_global_func("target.build.tilelang_hip")(device_mod, target) + device_mod = tvm.ffi.get_global_func("target.build.tilelang_hip")(device_mod, target) else: raise ValueError(f"Target {target.kind.name} is not supported") @@ -173,17 +170,17 @@ def device_codegen_without_compile(device_mod: tvm.IRModule, target: Target) -> device_mod = tir.transform.LowerIntrin()(device_mod) device_mod = tir.transform.Simplify()(device_mod) if target.kind.name == "cuda": - device_mod = tvm._ffi.get_global_func("target.build.tilelang_cuda_without_compile")( + device_mod = tvm.ffi.get_global_func("target.build.tilelang_cuda_without_compile")( device_mod, target) elif target.kind.name == "hip": - device_mod = tvm._ffi.get_global_func("target.build.tilelang_hip_without_compile")( + device_mod = tvm.ffi.get_global_func("target.build.tilelang_hip_without_compile")( device_mod, target) elif target.kind.name == "c": - device_mod = tvm._ffi.get_global_func("target.build.tilelang_cpp")(device_mod, target) + device_mod = tvm.ffi.get_global_func("target.build.tilelang_cpp")(device_mod, target) elif target.kind.name == "llvm": - device_mod = tvm._ffi.get_global_func("target.build.llvm")(device_mod, target) + device_mod = tvm.ffi.get_global_func("target.build.llvm")(device_mod, target) elif target.kind.name == "webgpu": - device_mod = tvm._ffi.get_global_func("target.build.tilelang_webgpu")(device_mod, target) + device_mod = tvm.ffi.get_global_func("target.build.tilelang_webgpu")(device_mod, target) else: raise ValueError(f"Target {target.kind.name} is not supported") diff --git a/tilelang/engine/phase.py b/tilelang/engine/phase.py index cfbbfded8..17bc2c0b8 100644 --- a/tilelang/engine/phase.py +++ b/tilelang/engine/phase.py @@ -13,8 +13,7 @@ def allow_warp_specialized(pass_ctx: Optional[PassContext] = None, if pass_ctx is None: pass_ctx = tilelang.transform.get_pass_context() - # Warp specialized pass is recommended for Hopper or later architectures - if not is_cuda_target(target) or not have_tma(target): + if (not is_cuda_target(target)) or (not have_tma(target)): return False disable_warp_specialized = pass_ctx.config.get("tl.disable_warp_specialized", False) return not disable_warp_specialized @@ -80,7 +79,6 @@ def LowerAndLegalize(mod: IRModule, target: Target) -> IRModule: mod = tilelang.transform.LegalizeVectorizedLoop()(mod) # Add safety checks for memory accesses mod = tilelang.transform.LegalizeSafeMemoryAccess()(mod) - # Align dynamic shared memory allocations # Simplify again to clean up any duplicated 
conditions # that may have been introduced by safety checks # use an enhanced pass to simplify the dynamic symbolics @@ -109,7 +107,7 @@ def OptimizeForTarget(mod: IRModule, target: Target) -> IRModule: mod = tilelang.transform.InjectSoftwarePipeline()(mod) # warp_specialized pass will pack the if stmt into the block # so we need to lower the opaque block first - mod = tir.transform.LowerOpaqueBlock()(mod) + mod = tilelang.transform.LowerOpaqueBlock()(mod) mod = tilelang.transform.MergeIfStmt()(mod) mod = tilelang.transform.RewriteWgmmaSync()(mod) mod = tilelang.transform.InjectFenceProxy()(mod) @@ -124,15 +122,16 @@ def OptimizeForTarget(mod: IRModule, target: Target) -> IRModule: # in hopper device, wgmma is an async proxy # so we need to inject a fence proxy before it mod = tilelang.transform.InjectFenceProxy()(mod) - - mod = tir.transform.LowerOpaqueBlock()(mod) + mod = tilelang.transform.LowerOpaqueBlock()(mod) mod = tir.transform.NarrowDataType(32)(mod) - mod = tilelang.transform.ConfigIndexBitwidth()(mod) mod = tilelang.transform.FlattenBuffer()(mod) + # ConfigIndexBitwidth must be applied after FlattenBuffer + # as it will flatten index computing + mod = tilelang.transform.ConfigIndexBitwidth()(mod) mod = tir.transform.Simplify()(mod) mod = tilelang.transform.VectorizeLoop(enable_vectorize=allow_vectorize(pass_ctx=pass_ctx))(mod) - mod = tir.transform.StorageRewrite()(mod) + mod = tilelang.transform.StorageRewrite()(mod) mod = tir.transform.UnrollLoop()(mod) mod = tir.transform.RenormalizeSplitPattern()(mod) mod = tir.transform.Simplify()(mod) @@ -153,7 +152,7 @@ def OptimizeForTarget(mod: IRModule, target: Target) -> IRModule: # the Legalization. mod = tilelang.transform.ThreadPartialSync("shared.dyn")(mod) mod = tir.transform.InferFragment()(mod) - mod = tir.transform.LowerThreadAllreduce()(mod) + mod = tilelang.transform.LowerThreadAllreduce()(mod) mod = tilelang.transform.LowerHopperIntrin()(mod) @@ -166,21 +165,17 @@ def OptimizeForTarget(mod: IRModule, target: Target) -> IRModule: # MergeSharedMemoryAllocations must be applied after SplitHostDevice # because the merged allocation site is at the beginning of each device function enable_aggressive_merge = should_enable_aggressive_merge(pass_ctx=pass_ctx, target=target) - # Hopper Swizzling requires dynamic shared memory address to be aligned to 1024 bytes - # For other devices, we align to 16 bytes - smem_align_bytes = 1024 if have_tma(target) else 16 - # Workaround, wait for a element wise synchronization pass mod = tilelang.transform.MergeSharedMemoryAllocations( - enable_aggressive_merge=enable_aggressive_merge, align_bytes=smem_align_bytes)( + enable_aggressive_merge=enable_aggressive_merge)( mod) mod = tilelang.transform.ThreadSync("shared")(mod) mod = tilelang.transform.ThreadSync("shared.dyn")(mod) # Inject PTX async copy must behind the thread sync pass # as ptx async copy won't be recognized as a valid buffer load mod = tilelang.transform.InjectPTXAsyncCopy()(mod) - mod = tilelang.transform.MakePackedAPI()(mod) - mod = tir.transform.LowerDeviceKernelLaunch()(mod) + mod = tilelang.transform.LowerDeviceKernelLaunch()(mod) + # Transform threadblock to persistent threadblock mod = tilelang.transform.PersistThreadblock()(mod) diff --git a/tilelang/env.py b/tilelang/env.py index d2488e311..adc8860e9 100644 --- a/tilelang/env.py +++ b/tilelang/env.py @@ -53,11 +53,10 @@ def _initialize_torch_cuda_arch_flags(): target = determine_target(return_object=True) # create tmp source file for torch cpp extension - compute_version = 
"".join(nvcc.get_target_compute_version(target).split(".")) - # set TORCH_CUDA_ARCH_LIST - major = compute_version[0] - minor = compute_version[1] + compute_version = nvcc.get_target_compute_version(target) + major, minor = nvcc.parse_compute_version(compute_version) + # set TORCH_CUDA_ARCH_LIST os.environ["TORCH_CUDA_ARCH_LIST"] = f"{major}.{minor}" @@ -75,6 +74,9 @@ def _initialize_torch_cuda_arch_flags(): os.path.expanduser("~/.tilelang/cache")) TILELANG_TMP_DIR: str = os.path.join(TILELANG_CACHE_DIR, "tmp") +# Print the kernel name on every compilation +TILELANG_PRINT_ON_COMPILATION: str = os.environ.get("TILELANG_PRINT_COMPILATION", "0") + # Auto-clear cache if environment variable is set TILELANG_CLEAR_CACHE = os.environ.get("TILELANG_CLEAR_CACHE", "0") diff --git a/tilelang/intrinsics/mfma_macro_generator.py b/tilelang/intrinsics/mfma_macro_generator.py index 7314f6b50..4bd68cec0 100644 --- a/tilelang/intrinsics/mfma_macro_generator.py +++ b/tilelang/intrinsics/mfma_macro_generator.py @@ -25,8 +25,8 @@ class MatrixCoreIntrinEmitter(object): "float32": "fp32", "int8": "int8", "int32": "int32", - "e4m3_float8": "e4m3", - "e5m2_float8": "e5m2", + "float8_e4m3": "e4m3", + "float8_e5m2": "e5m2", "float8_e4m3fnuz": "e4m3fnuz", } diff --git a/tilelang/intrinsics/mma_macro_generator.py b/tilelang/intrinsics/mma_macro_generator.py index 19e9f357b..8d4d43ebc 100644 --- a/tilelang/intrinsics/mma_macro_generator.py +++ b/tilelang/intrinsics/mma_macro_generator.py @@ -28,8 +28,8 @@ class TensorCoreIntrinEmitter(object): "float32": "fp32", "int8": "int8", "int32": "int32", - "e4m3_float8": "e4m3", - "e5m2_float8": "e5m2", + "float8_e4m3": "e4m3", + "float8_e5m2": "e5m2", } # Represent the thread binding in the form of (tx, warp_n, warp_m) diff --git a/tilelang/intrinsics/utils.py b/tilelang/intrinsics/utils.py index 50bec0cc0..157a967be 100644 --- a/tilelang/intrinsics/utils.py +++ b/tilelang/intrinsics/utils.py @@ -78,7 +78,7 @@ def get_mma_micro_size(dtype: Literal["float16", "int8"]): # Basic Tensor Core Matrix Multiply operation Unit micro_size_x = micro_size_y = 16 micro_size_k = 16 - if dtype in {"e4m3_float8", "e5m2_float8", "int8"}: + if dtype in {"float8_e4m3", "float8_e5m2", "int8"}: micro_size_k = 32 return micro_size_x, micro_size_y, micro_size_k diff --git a/tilelang/jit/__init__.py b/tilelang/jit/__init__.py index b57d5101b..8f9a4a381 100644 --- a/tilelang/jit/__init__.py +++ b/tilelang/jit/__init__.py @@ -235,7 +235,7 @@ def jit( # This is the new public interface out_idx: Any = None, target: Union[str, Target] = "auto", target_host: Union[str, Target] = None, - execution_backend: Literal["dlpack", "ctypes", "cython"] = "cython", + execution_backend: Literal["dlpack", "ctypes", "cython", "nvrtc"] = "cython", verbose: bool = False, pass_configs: Optional[Dict[str, Any]] = None, debug_root_path: Optional[str] = None, diff --git a/tilelang/jit/adapter/ctypes/adapter.py b/tilelang/jit/adapter/ctypes/adapter.py index f38e32109..e13a1da47 100644 --- a/tilelang/jit/adapter/ctypes/adapter.py +++ b/tilelang/jit/adapter/ctypes/adapter.py @@ -6,7 +6,7 @@ from typing import List, Optional, Union, Callable, Dict, Tuple, Any from tilelang import tvm as tvm from tvm.target import Target -from tvm.relay import TensorType +from tvm.relax import TensorType from tvm import tir from tilelang.jit.adapter.wrapper import TLWrapper from tilelang.jit.adapter.libgen import LibraryGenerator @@ -88,7 +88,7 @@ def __init__(self, self.target = Target.canon_target(determine_target(target)) self.verbose = verbose 
self.wrapper = TLWrapper(self.target) - self.lib_generator = LibraryGenerator(self.target) + self.lib_generator = LibraryGenerator(self.target, verbose=verbose) self.lib_generator.assign_pass_configs(pass_configs) self.lib_generator.assign_compile_flags(compile_flags) @@ -146,7 +146,7 @@ def from_database(cls, adapter.target = Target.canon_target(determine_target(target)) adapter.verbose = verbose - adapter.lib_generator = LibraryGenerator(adapter.target) + adapter.lib_generator = LibraryGenerator(adapter.target, verbose=verbose) adapter.lib_generator.assign_pass_configs(pass_configs) adapter.lib_generator.assign_compile_flags(compile_flags) adapter.lib = adapter.lib_generator.load_lib(lib_path=kernel_lib_path) @@ -155,21 +155,31 @@ def from_database(cls, adapter._post_init() return adapter - def _process_dynamic_symbolic(self): + def _process_dynamic_symbolic(self) -> Dict[tir.Var, Tuple[int, int, int]]: """Extract information about dynamic shapes from the TIR function. - Maps symbolic variables to their corresponding (buffer_index, shape_dimension) + Maps symbolic variables to their corresponding (id, buffer_index, dimension) for runtime shape resolution. + id represents shape or stride, 0 represents shape, 1 represents stride """ func = self.prim_func params = func.params buffer_map = func.buffer_map dynamic_symbolic_map = {} for i, param in enumerate(params): - buffer = buffer_map[param] - for j, shape in enumerate(buffer.shape): - if isinstance(shape, tir.Var) and (shape not in dynamic_symbolic_map): - dynamic_symbolic_map[shape] = (i, j) + if param in buffer_map: + buffer = buffer_map[param] + for j, shape in enumerate(buffer.shape): + if (isinstance(shape, tir.Var) and (shape not in dynamic_symbolic_map) and + (shape not in params)): + dynamic_symbolic_map[shape] = (0, i, j) + for i, param in enumerate(params): + if param in buffer_map: + buffer = buffer_map[param] + for j, stride in enumerate(buffer.strides): + if (isinstance(stride, tir.Var) and (stride not in dynamic_symbolic_map) and + (stride not in params)): + dynamic_symbolic_map[stride] = (1, i, j) return dynamic_symbolic_map def _forward_from_prebuild_lib(self, *args, stream: Optional[int] = None): @@ -228,8 +238,11 @@ def _wrap_forward_from_prebuild_lib(self, args.append(tensor) # dynamic symbolics - for _, (buffer_idx, shape_idx) in self.dynamic_symbolic_map.items(): - args.append(ins[buffer_idx].shape[shape_idx]) + for _, (ref_id, buffer_idx, shape_idx) in self.dynamic_symbolic_map.items(): + if ref_id == 0: + args.append(ins[buffer_idx].shape[shape_idx]) + else: + args.append(ins[buffer_idx].stride(shape_idx)) # if stream is not None, we need to pass the stream to the library if stream is None: diff --git a/tilelang/jit/adapter/cython/adapter.py b/tilelang/jit/adapter/cython/adapter.py index 102ca4c27..12623906b 100644 --- a/tilelang/jit/adapter/cython/adapter.py +++ b/tilelang/jit/adapter/cython/adapter.py @@ -1,13 +1,24 @@ """The profiler and convert to torch utils""" -from ..base import BaseKernelAdapter import ctypes +import fcntl +import hashlib +import logging +import site +import sys +import sysconfig +import torch +import os +from pathlib import Path + from typing import List, Optional, Union, Callable, Dict, Tuple, Any from tilelang import tvm as tvm from tvm.target import Target from tilelang.engine.param import KernelParam from tvm import tir -from tvm.relay import TensorType +from tvm.relax import TensorType + +from tilelang.jit.adapter.base import BaseKernelAdapter from tilelang.jit.adapter.wrapper import 
TLWrapper from tilelang.jit.adapter.libgen import LibraryGenerator from tilelang.jit.adapter.utils import is_cuda_target, is_hip_target, is_cpu_target @@ -15,15 +26,6 @@ from tilelang.utils.language import retrieve_func_from_module from tilelang.utils.tensor import map_torch_type from tilelang.contrib.cc import get_cplus_compiler -import torch -import sys -import sysconfig -import hashlib -import os -import fcntl -from pathlib import Path -import logging -import site logger = logging.getLogger(__name__) @@ -116,15 +118,15 @@ def get_cached_lib(source_code: str) -> Tuple[Optional[ctypes.CDLL], Path]: with open(md5_path, "r") as f: cached_hash = f.read().strip() if cached_hash == code_hash: - logger.debug("Cython jit adapter is up to date, no need to compile...") + logger.debug("Cython JIT adapter is up to date, no need to compile...") need_compile = False else: - logger.info("Cython jit adapter is out of date, need to recompile...") + logger.info("Cython JIT adapter is out of date, need to recompile...") else: - logger.info("No cached version found for cython jit adapter, need to compile...") + logger.info("No cached version found for Cython JIT adapter, need to compile...") if need_compile: - logger.info("Waiting for lock to compile cython jit adapter...") + logger.info("Waiting for lock to compile Cython JIT adapter...") with open(lock_file, 'w') as lock: fcntl.flock(lock.fileno(), fcntl.LOCK_EX) try: @@ -138,7 +140,7 @@ def get_cached_lib(source_code: str) -> Tuple[Optional[ctypes.CDLL], Path]: need_compile = False if need_compile: - logger.info("Compiling cython jit adapter...") + logger.info("Compiling Cython JIT adapter...") temp_path = cache_dir / f"temp_{code_hash}.so" with open(md5_path, "w") as f: @@ -159,7 +161,7 @@ def get_cached_lib(source_code: str) -> Tuple[Optional[ctypes.CDLL], Path]: except Exception as e: if 'temp_path' in locals() and temp_path.exists(): temp_path.unlink() - raise Exception(f"Failed to compile cython jit adapter: {e}") from e + raise Exception(f"Failed to compile Cython JIT adapter: {e}") from e finally: if lock_file.exists(): lock_file.unlink() @@ -195,11 +197,14 @@ class CythonKernelAdapter(BaseKernelAdapter): ptr_map: Optional[Dict[int, str]] = None # Maps buffer variables to their corresponding dtypes buffer_dtype_map: Optional[Dict[tir.Var, Tuple[int, torch.dtype]]] = None - # Maps buffer variables to their corresponding static shapes - # { - # "A": [(0, 16), (1, 16)] -> represents A.shape = (16, 16) + # Maps buffer variables to their corresponding static shapes and strides, + # e.g., { + # "A": [(0, 16), (1, 16)] -> represents A.shape/strides = (16, 16) # } static_shape_map: Optional[Dict[tir.Var, Tuple[int, List[Tuple[int, int]]]]] = None + static_strides_map: Optional[Dict[tir.Var, Tuple[int, List[Tuple[int, int]]]]] = None + # Contains contiguous buffers + static_contiguous_list: Optional[List[tir.Var]] = None # Maps buffer variables to their corresponding devices buffer_device_map: Optional[Dict[tir.Var, Tuple[int, torch.device]]] = None # Pass configs for the compiler @@ -239,12 +244,16 @@ def __init__(self, self.dynamic_symbolic_map = self._process_dynamic_symbolic() self.buffer_dtype_map = self._process_buffer_dtype() self.ptr_map = self._process_ptr_map() - self.static_shape_map = self._process_static_shape() self.buffer_device_map = self._process_buffer_device() + static_buffer_infos = self._process_static_buffer_infos() + self.static_shape_map = static_buffer_infos[0] + self.static_strides_map = static_buffer_infos[1] + 
self.static_contiguous_list = static_buffer_infos[2] + self.verbose = verbose self.wrapper = TLWrapper(self.target) - self.lib_generator = LibraryGenerator(self.target) + self.lib_generator = LibraryGenerator(self.target, verbose=verbose) self.lib_generator.assign_pass_configs(pass_configs) self.lib_generator.assign_compile_flags(compile_flags) @@ -269,6 +278,8 @@ def __init__(self, self.cython_wrapper.set_dynamic_symbolic_map(self.dynamic_symbolic_map) self.cython_wrapper.set_buffer_dtype_map(self.buffer_dtype_map) self.cython_wrapper.set_static_shape_map(self.static_shape_map) + self.cython_wrapper.set_static_strides_map(self.static_strides_map) + self.cython_wrapper.set_static_contiguous_list(self.static_contiguous_list) self.cython_wrapper.set_buffer_device_map(self.buffer_device_map) self.cython_wrapper.set_ptr_map(self.ptr_map) self._post_init() @@ -301,12 +312,16 @@ def from_database(cls, adapter.dynamic_symbolic_map = adapter._process_dynamic_symbolic() adapter.buffer_dtype_map = adapter._process_buffer_dtype() - adapter.static_shape_map = adapter._process_static_shape() adapter.ptr_map = adapter._process_ptr_map() adapter.buffer_device_map = adapter._process_buffer_device() + static_buffer_infos = adapter._process_static_buffer_infos() + adapter.static_shape_map = static_buffer_infos[0] + adapter.static_strides_map = static_buffer_infos[1] + adapter.static_contiguous_list = static_buffer_infos[2] + adapter.verbose = verbose - adapter.lib_generator = LibraryGenerator(adapter.target) + adapter.lib_generator = LibraryGenerator(adapter.target, verbose=verbose) adapter.lib_generator.assign_pass_configs(pass_configs) adapter.lib_generator.assign_compile_flags(compile_flags) adapter.lib = adapter.lib_generator.load_lib(lib_path=kernel_lib_path) @@ -322,17 +337,20 @@ def from_database(cls, adapter.cython_wrapper.set_dynamic_symbolic_map(adapter.dynamic_symbolic_map) adapter.cython_wrapper.set_buffer_dtype_map(adapter.buffer_dtype_map) adapter.cython_wrapper.set_static_shape_map(adapter.static_shape_map) + adapter.cython_wrapper.set_static_strides_map(adapter.static_strides_map) + adapter.cython_wrapper.set_static_contiguous_list(adapter.static_contiguous_list) adapter.cython_wrapper.set_buffer_device_map(adapter.buffer_device_map) adapter.cython_wrapper.set_ptr_map(adapter.ptr_map) adapter._post_init() return adapter - def _process_dynamic_symbolic(self) -> Dict[tir.Var, Tuple[int, int]]: + def _process_dynamic_symbolic(self) -> Dict[tir.Var, Tuple[int, int, int]]: """Extract information about dynamic shapes from the TIR function. - Maps symbolic variables to their corresponding (buffer_index, shape_dimension) + Maps symbolic variables to their corresponding (id, buffer_index, dimension) for runtime shape resolution. 
+ id represents shape or stride, 0 represents shape, 1 represents stride """ func = self.prim_func params = func.params @@ -344,7 +362,14 @@ def _process_dynamic_symbolic(self) -> Dict[tir.Var, Tuple[int, int]]: for j, shape in enumerate(buffer.shape): if (isinstance(shape, tir.Var) and (shape not in dynamic_symbolic_map) and (shape not in params)): - dynamic_symbolic_map[shape] = (i, j) + dynamic_symbolic_map[shape] = (0, i, j) + for i, param in enumerate(params): + if param in buffer_map: + buffer = buffer_map[param] + for j, stride in enumerate(buffer.strides): + if (isinstance(stride, tir.Var) and (stride not in dynamic_symbolic_map) and + (stride not in params)): + dynamic_symbolic_map[stride] = (1, i, j) return dynamic_symbolic_map def _process_buffer_dtype(self) -> Dict[tir.Var, Tuple[int, torch.dtype]]: @@ -377,7 +402,10 @@ def _process_ptr_map(self) -> Dict[int, str]: ptr_map[i] = param.name return ptr_map - def _process_static_shape(self) -> Dict[tir.Var, List[Tuple[int, int]]]: + def _process_static_buffer_infos(self) -> \ + Tuple[Dict[tir.Var, Tuple[int, List[Tuple[int, int]]]], + Dict[tir.Var, Tuple[int, List[Tuple[int, int]]]], + List[Tuple[tir.Var]]]: """Extract information about static shapes from the TIR function. Maps buffer variables to their corresponding static shapes. @@ -386,17 +414,27 @@ def _process_static_shape(self) -> Dict[tir.Var, List[Tuple[int, int]]]: params = func.params buffer_map = func.buffer_map static_shape_map = {} + static_strides_map = {} + static_contiguous_list = list() for i, param in enumerate(params): if param in buffer_map: buffer = buffer_map[param] - name = buffer.name - shape = buffer.shape - static_shape = [] - for j, s in enumerate(shape): + static_shape, static_strides = [], [] + for j, s in enumerate(buffer.shape): if isinstance(s, tir.IntImm): static_shape.append((j, s.value)) - static_shape_map[name] = (i, static_shape) - return static_shape_map + for j, s in enumerate(buffer.strides): + if isinstance(s, tir.IntImm): + static_strides.append((j, s.value)) + is_contiguous, prod = True, 1 + for dim, stride in reversed(list(zip(buffer.shape, buffer.strides))): + is_contiguous &= bool(stride == prod) + prod *= dim + static_shape_map[buffer.name] = (i, static_shape) + static_strides_map[buffer.name] = (i, static_strides) + if is_contiguous: + static_contiguous_list.append((i, buffer.name)) + return static_shape_map, static_strides_map, static_contiguous_list def _process_buffer_device(self) -> Dict[tir.Var, Tuple[int, torch.device]]: """Extract information about buffer devices from the TIR function. 
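
The new _process_static_buffer_infos records, besides static shapes, the static strides and whether a buffer is declared contiguous; contiguity is derived by walking dimensions from innermost to outermost and checking each stride against the running product of the inner extents. A standalone restatement of that rule (illustrative only):

def is_row_major_contiguous(shape, strides) -> bool:
    expected = 1
    for dim, stride in reversed(list(zip(shape, strides))):
        if stride != expected:
            return False
        expected *= dim
    return True

assert is_row_major_contiguous((4, 8), (8, 1))        # dense row-major buffer
assert not is_row_major_contiguous((4, 8), (16, 1))   # padded leading stride
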
@@ -473,7 +511,7 @@ def lib_code(self): @property def is_dynamic(self): """Indicates whether the kernel handles dynamic shapes.""" - return (self.dynamic_symbolic_map is not None and len(self.dynamic_symbolic_map) > 0) + return self.dynamic_symbolic_map is not None and len(self.dynamic_symbolic_map) > 0 def get_kernel_source(self, kernel_only: bool = False): """Returns the source code of the compiled kernel.""" diff --git a/tilelang/jit/adapter/cython/cython_wrapper.pyx b/tilelang/jit/adapter/cython/cython_wrapper.pyx index 6e80765dd..8b06b58d1 100644 --- a/tilelang/jit/adapter/cython/cython_wrapper.pyx +++ b/tilelang/jit/adapter/cython/cython_wrapper.pyx @@ -11,17 +11,19 @@ from tilelang.utils.tensor import map_torch_type cdef class CythonKernelWrapper: # Class attributes to store kernel configuration and library reference cdef: - object dynamic_symbolic_map # Maps dynamic dimensions to their corresponding tensor indices - object buffer_device_map # Maps buffer variables to their corresponding devices - object buffer_dtype_map # Maps buffer variables to their corresponding dtypes - object static_shape_map # Maps buffer variables to their corresponding static shapes - object ptr_map # Maps pointer arguments to their corresponding buffer indices - list result_idx # Indices of output tensors in the params list - list params # List of parameter specifications (includes both inputs and outputs) - object lib # Reference to the compiled library containing the kernel + object dynamic_symbolic_map # Maps dynamic dimensions to their corresponding tensor indices + object buffer_device_map # Maps buffer variables to their corresponding devices + object buffer_dtype_map # Maps buffer variables to their corresponding dtypes + object static_shape_map # Maps buffer variables to their corresponding static shapes + object static_strides_map # Maps buffer variables to their corresponding static strides + object static_contiguous_list # A list contains contiguous buffers + object ptr_map # Maps pointer arguments to their corresponding buffer indices + list result_idx # Indices of output tensors in the params list + list params # List of parameter specifications (includes both inputs and outputs) + object lib # Reference to the compiled library containing the kernel # Add new cache attributes - list param_dtypes # Cache for parameter dtypes - list param_shapes # Cache for parameter shapes as native Python lists + list param_dtypes # Cache for parameter dtypes + list param_shapes # Cache for parameter shapes as native Python lists object get_current_device def __cinit__(self, result_idx, params, lib): @@ -57,6 +59,14 @@ cdef class CythonKernelWrapper: self.static_shape_map = static_shape_map return self + def set_static_strides_map(self, static_strides_map): + self.static_strides_map = static_strides_map + return self + + def set_static_contiguous_list(self, static_contiguous_list): + self.static_contiguous_list = static_contiguous_list + return self + def set_ptr_map(self, ptr_map): self.ptr_map = ptr_map return self @@ -94,15 +104,41 @@ cdef class CythonKernelWrapper: cpdef void _check_static_shape(self, list tensor_list): for param, (buffer_idx, shape_list) in self.static_shape_map.items(): tensor = tensor_list[buffer_idx] - if isinstance(tensor, torch.Tensor): - for shape_idx, expected_shape in shape_list: - actual_shape = tensor.shape[shape_idx] - if actual_shape != expected_shape: - raise ValueError( - f"Static shape mismatch for parameter {param}: " - f"expected {expected_shape} at index {shape_idx}, " 
- f"got {actual_shape}" - ) + if not isinstance(tensor, torch.Tensor): + # otherwise, maybe torch.data_ptr() for T.ptr inputs + continue + for shape_idx, expected_shape in shape_list: + actual_shape = tensor.shape[shape_idx] + if actual_shape != expected_shape: + raise ValueError( + f"Static shape mismatch for parameter {param}: " + f"expected {expected_shape} at index {shape_idx}, " + f"got {actual_shape}" + ) + + cpdef void _check_static_strides(self, list tensor_list): + for param, (buffer_idx, strides_list) in self.static_strides_map.items(): + tensor = tensor_list[buffer_idx] + if not isinstance(tensor, torch.Tensor): + # otherwise, maybe torch.data_ptr() for T.ptr inputs + continue + for stride_idx, expected_stride in strides_list: + actual_stride = tensor.stride(stride_idx) + if actual_stride != expected_stride: + raise ValueError( + f"Static stride mismatch for parameter {param}: " + f"expected {expected_stride} at index {stride_idx}, " + f"got {actual_stride}" + ) + + cpdef void _check_static_contiguous(self, list tensor_list): + for buffer_idx, param in self.static_contiguous_list: + tensor = tensor_list[buffer_idx] + if not isinstance(tensor, torch.Tensor): + # otherwise, maybe torch.data_ptr() for T.ptr inputs + continue + if not tensor.is_contiguous(): + raise ValueError(f"Expected parameter {param} to be a contiguous tensor") cpdef forward(self, list inputs, int64_t stream = -1, bint skip_tensor_validation = False): # Validate input dimensions and prepare for kernel execution @@ -140,7 +176,7 @@ cdef class CythonKernelWrapper: if isinstance(s, tir.Var): for key in self.dynamic_symbolic_map: if(str(s) == str(key)): - ref_tensor_idx, ref_shape_idx = self.dynamic_symbolic_map[key] + ref_id, ref_tensor_idx, ref_shape_idx = self.dynamic_symbolic_map[key] shape.append(tensor_list[ref_tensor_idx].shape[ref_shape_idx]) else: # Already converted to Python int during initialization shape.append(s) @@ -155,6 +191,13 @@ cdef class CythonKernelWrapper: else: tensor = inputs[ins_idx] ins_idx += 1 + # TODO(chenggang): remove this check or rewrite by ourselves? 
+ if isinstance(tensor, torch.Tensor) and tensor._base is not None and not tensor.is_contiguous(): + base_tensor = tensor._base.as_strided(tensor._base.shape, tensor.stride()) + if torch._debug_has_internal_overlap(base_tensor): + raise ValueError(f"Cannot use an overlapping tensor" + f"(shape={tensor.shape}, strides={tensor.stride()}, " + f"overlap={torch._debug_has_internal_overlap(base_tensor)}) as the kernel input") tensor_list.append(tensor) # Convert tensor pointers to C void pointers for kernel call @@ -172,8 +215,6 @@ cdef class CythonKernelWrapper: call_args = [] for i, tensor in enumerate(tensor_list): if isinstance(tensor, torch.Tensor): - if not tensor.is_contiguous(): - raise ValueError(f"Input tensor at index {i} must be contiguous") call_args.append(ctypes.c_void_p(tensor.data_ptr())) elif isinstance(tensor, (int, float, bool)): if i in self.ptr_map: @@ -191,10 +232,15 @@ cdef class CythonKernelWrapper: self._check_buffer_device(tensor_list) self._check_buffer_dtype(tensor_list) self._check_static_shape(tensor_list) + self._check_static_strides(tensor_list) + self._check_static_contiguous(tensor_list) # Add dynamic dimension values to kernel arguments - for _, (buffer_idx, shape_idx) in self.dynamic_symbolic_map.items(): - call_args.append(tensor_list[buffer_idx].shape[shape_idx]) + for _, (ref_id, buffer_idx, shape_idx) in self.dynamic_symbolic_map.items(): + if ref_id == 0: + call_args.append(tensor_list[buffer_idx].shape[shape_idx]) + else: + call_args.append(tensor_list[buffer_idx].stride(shape_idx)) # Add CUDA stream to kernel arguments call_args.append(ctypes.c_void_p(stream)) diff --git a/tilelang/jit/adapter/libgen.py b/tilelang/jit/adapter/libgen.py index bb93984f0..6c7317fdb 100644 --- a/tilelang/jit/adapter/libgen.py +++ b/tilelang/jit/adapter/libgen.py @@ -11,7 +11,7 @@ from tilelang import tvm as tvm from tilelang.transform import PassConfigKey -from tilelang.contrib.nvcc import get_nvcc_compiler, get_target_compute_version +from tilelang.contrib.nvcc import get_nvcc_compiler, get_target_arch, get_target_compute_version from tilelang.contrib.rocm import find_rocm_path, get_rocm_arch from tilelang.env import TILELANG_TEMPLATE_PATH @@ -38,8 +38,9 @@ class LibraryGenerator(object): pass_configs: Optional[Dict[str, Any]] = None compile_flags: Optional[List[str]] = None - def __init__(self, target: Target): + def __init__(self, target: Target, verbose: bool = False): self.target = target + self.verbose = verbose def assign_pass_configs(self, pass_configs: Optional[Dict[str, Any]] = None): self.pass_configs = pass_configs @@ -62,15 +63,16 @@ def load_lib(self, lib_path: Optional[str] = None): def compile_lib(self, timeout: float = None): target = self.target + verbose = self.verbose if is_cuda_target(target): from tilelang.env import CUTLASS_INCLUDE_DIR src = tempfile.NamedTemporaryFile(mode="w", suffix=".cu", delete=False) - compute_version = "".join(get_target_compute_version(target).split(".")) - if compute_version == "90": - compute_version = "90a" + target_arch = get_target_arch(get_target_compute_version(target)) libpath = src.name.replace(".cu", ".so") disable_fast_math = self.pass_configs.get(PassConfigKey.TL_DISABLE_FAST_MATH, False) + ptxas_usage_level = self.pass_configs.get(PassConfigKey.TL_PTXAS_REGISTER_USAGE_LEVEL, + None) verbose_ptxas_output = self.pass_configs.get( PassConfigKey.TL_ENABLE_PTXAS_VERBOSE_OUTPUT, False) @@ -87,14 +89,14 @@ def compile_lib(self, timeout: float = None): src.name, "-lcuda", "-gencode", - 
f"arch=compute_{compute_version},code=sm_{compute_version}", + f"arch=compute_{target_arch},code=sm_{target_arch}", ] if not disable_fast_math: command += ["--use_fast_math"] + if ptxas_usage_level is not None: + command += [f"--ptxas-options=--register-usage-level={ptxas_usage_level}"] if verbose_ptxas_output: - command += ["--ptxas-options", "-v"] - if compute_version == "90a": - command += ["-D", "CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED"] + command += ["--ptxas-options=--verbose"] command += [ "-I" + CUTLASS_INCLUDE_DIR, ] @@ -143,6 +145,8 @@ def compile_lib(self, timeout: float = None): src.flush() try: + if verbose: + print(f"compile_lib compilation command: {' '.join(command)}") ret = subprocess.run(command, timeout=timeout) except Exception as e: raise RuntimeError(f"Compile kernel failed because of {e}") from e @@ -177,10 +181,10 @@ class PyLibraryGenerator(LibraryGenerator): culib = None pymodule = None - def __init__(self, target: Target): + def __init__(self, target: Target, verbose: bool = False): if not is_nvrtc_available: raise ImportError(NVRTC_UNAVAILABLE_WARNING) - super().__init__(target) + super().__init__(target, verbose) @staticmethod def import_from_file(module_name, file_path): @@ -211,6 +215,7 @@ def load_lib(self, lib_path: Optional[str] = None): def compile_lib(self, timeout: float = None): target = self.target + verbose = self.verbose if is_cuda_target(target): from tilelang.env import (CUDA_HOME, CUTLASS_INCLUDE_DIR, TILELANG_TEMPLATE_PATH) src = tempfile.NamedTemporaryFile(mode="w", suffix=".cu", delete=False) @@ -237,7 +242,7 @@ def compile_lib(self, timeout: float = None): ] cubin_bytes = compile_cuda( - self.lib_code, target_format="cubin", options=options, verbose=True) + self.lib_code, target_format="cubin", options=options, verbose=verbose) with open(libpath, "wb") as f: f.write(cubin_bytes) diff --git a/tilelang/jit/adapter/nvrtc/adapter.py b/tilelang/jit/adapter/nvrtc/adapter.py index aca64a2ff..d44108580 100644 --- a/tilelang/jit/adapter/nvrtc/adapter.py +++ b/tilelang/jit/adapter/nvrtc/adapter.py @@ -81,7 +81,7 @@ def __init__(self, self.wrapper.assign_device_module(device_mod) self.host_func, self.function_names = self.wrapper.wrap(kernel_global_source) - self.lib_generator = PyLibraryGenerator(self.target) + self.lib_generator = PyLibraryGenerator(self.target, self.verbose) self.lib_generator.update_lib_code(self.kernel_global_source) self.lib_generator.update_host_func(self.host_func) self.lib_generator.assign_compile_flags(compile_flags) @@ -105,7 +105,8 @@ def from_database(cls, kernel_global_source: str, kernel_lib_path: str, verbose: bool = False, - pass_configs: Optional[Dict[str, Any]] = None): + pass_configs: Optional[Dict[str, Any]] = None, + compile_flags: Optional[List[str]] = None): adapter = cls.__new__(cls) adapter.params = params adapter.result_idx = adapter._legalize_result_idx(result_idx) @@ -135,7 +136,8 @@ def from_database(cls, adapter.target = Target.canon_target(determine_target(target)) adapter.verbose = verbose - adapter.lib_generator = PyLibraryGenerator(adapter.target) + adapter.lib_generator = PyLibraryGenerator(adapter.target, adapter.verbose) + adapter.lib_generator.assign_compile_flags(compile_flags) adapter.lib_generator.load_lib(lib_path=kernel_lib_path) adapter.pymodule = adapter.lib_generator.pymodule adapter.function_names = adapter.pymodule._function_names diff --git a/tilelang/jit/adapter/wrapper.py b/tilelang/jit/adapter/wrapper.py index 7c3a87b1e..f1b0ff3ae 100644 --- a/tilelang/jit/adapter/wrapper.py +++ 
b/tilelang/jit/adapter/wrapper.py @@ -180,8 +180,8 @@ class TLCUDASourceWrapper(object): "float32": "float", "float16": "half_t", "bfloat16": "bfloat16_t", - "e4m3_float8": "fp8_e4_t", - "e5m2_float8": "fp8_e5_t", + "float8_e4m3": "fp8_e4_t", + "float8_e5m2": "fp8_e5_t", "float64": "double", "int64": "int64_t", "int32": "int", @@ -234,7 +234,10 @@ def create_dispatch_func(self, code, function_informations): dynamic_symbolic_set = self.get_dynamic_symbolic_set(self.prim_func) function_args = [] + # Collect function arguments based on primary function's parameters and buffer mappings + # QA(@lei): Why not use device_mod.params? + # device func lack buffer map (to convert buffer handle to buffer) for param in self.prim_func.params: if param in self.prim_func.buffer_map: buffer = self.prim_func.buffer_map[param] @@ -484,12 +487,26 @@ def parse_source_information(self): def get_dynamic_symbolic_set(self, prim_func): # Determine the set of dynamic symbols used in the function dynamic_symbolic_set: List[str] = [] + + def unique_push_back(name: str): + if name not in dynamic_symbolic_set: + dynamic_symbolic_set.append(name) + for param in prim_func.params: if param in prim_func.buffer_map: buffer = prim_func.buffer_map[param] for dim in buffer.shape: - if isinstance(dim, tvm.tir.Var) and (dim.name not in dynamic_symbolic_set): - dynamic_symbolic_set.append(dim.name) + if isinstance(dim, tvm.tir.Var): + unique_push_back(dim.name) + + # Note: In buffer definitions, any dynamic symbols appearing in strides are listed after those in the shape. + for param in prim_func.params: + if param in prim_func.buffer_map: + buffer = prim_func.buffer_map[param] + for stride in buffer.strides: + if isinstance(stride, tvm.tir.Var): + unique_push_back(stride.name) + return dynamic_symbolic_set def get_init_func(self): @@ -549,6 +566,19 @@ def prim_func(self): return function raise ValueError("Cannot find primary function in the module.") + @property + def device_func(self): + if len(self.device_mod.get_global_vars()) == 1: + return self.device_mod[self.device_mod.get_global_vars()[0]] + elif "main" in self.device_mod: + return self.device_mod["main"] + else: + for _, function in self.device_mod.functions.items(): + attr = function.attrs + if "tir.is_global_func" in attr and attr["tir.is_global_func"]: + return function + raise ValueError("Cannot find primary function in the module.") + class TLNVRTCSourceWrapper(TLCUDASourceWrapper): """ @@ -559,8 +589,8 @@ class TLNVRTCSourceWrapper(TLCUDASourceWrapper): "float32": "ctypes.c_float", "float16": "ctypes.c_uint16", "bfloat16": "ctypes.c_uint16", - "e4m3_float8": "ctypes.c_uint8", - "e5m2_float8": "ctypes.c_uint8", + "float8_e4m3": "ctypes.c_uint8", + "float8_e5m2": "ctypes.c_uint8", "float64": "ctypes.c_double", "int64": "ctypes.c_int64", "int32": "ctypes.c_int32", @@ -766,8 +796,8 @@ class TLHIPSourceWrapper(TLCUDASourceWrapper): "float32": "float", "float16": "half_t", "bfloat16": "bfloat16_t", - "e4m3_float8": "fp8_e4_t", - "e5m2_float8": "fp8_e5_t", + "float8_e4m3": "fp8_e4_t", + "float8_e5m2": "fp8_e5_t", "float8_e4m3fnuz": "fp8_e4_t", "e4m3fnuz_float8": "fp8_e4_t", "float64": "double", diff --git a/tilelang/jit/env.py b/tilelang/jit/env.py index 0870a66a1..78983ed27 100644 --- a/tilelang/jit/env.py +++ b/tilelang/jit/env.py @@ -36,11 +36,7 @@ def _get_workspace_dir_name() -> pathlib.Path: target = determine_target(return_object=True) # create tmp source file for torch cpp extension - compute_version = "".join(nvcc.get_target_compute_version(target).split(".")) - 
# set TORCH_CUDA_ARCH_LIST - major = compute_version[0] - minor = compute_version[1] - arch = f"{major}_{minor}" + arch = nvcc.get_target_arch(nvcc.get_target_compute_version(target)) except Exception: arch = "noarch" # e.g.: $HOME/.cache/tilelang/75_80_89_90/ diff --git a/tilelang/jit/kernel.py b/tilelang/jit/kernel.py index f5a3198ad..3a2de02ef 100644 --- a/tilelang/jit/kernel.py +++ b/tilelang/jit/kernel.py @@ -6,6 +6,7 @@ import tilelang from tilelang import tvm as tvm from tilelang.engine.param import CompiledArtifact, KernelParam +from tilelang.env import TILELANG_PRINT_ON_COMPILATION from tilelang.jit.adapter import (BaseKernelAdapter, CtypesKernelAdapter, CythonKernelAdapter, NVRTCKernelAdapter, TorchDLPackKernelAdapter) from tilelang.profiler import Profiler, TensorSupplyType @@ -110,6 +111,12 @@ def __init__( if from_database: return + # Print log on compilation starts + # NOTE(Chenggang): printing could let the training/inference framework easier to know + # whether the communication timeout is from compilation + if TILELANG_PRINT_ON_COMPILATION.lower() in ("1", "true", "yes", "on"): + print(f"TileLang begins to compile kernel `{func.__name__}` with `{out_idx=}`") + # Compile the TileLang function and create a kernel adapter for execution. adapter = self._compile_and_create_adapter(func, out_idx) diff --git a/tilelang/language/__init__.py b/tilelang/language/__init__.py index c369b101e..57508d5f0 100644 --- a/tilelang/language/__init__.py +++ b/tilelang/language/__init__.py @@ -17,6 +17,7 @@ make_tensor, # noqa: F401 Buffer, # noqa: F401 Tensor, # noqa: F401 + StridedTensor, # noqa: F401 FragmentBuffer, # noqa: F401 SharedBuffer, # noqa: F401 LocalBuffer, # noqa: F401 diff --git a/tilelang/language/ast/_ffi_api.py b/tilelang/language/ast/_ffi_api.py index 96b41de8e..518d57ea8 100644 --- a/tilelang/language/ast/_ffi_api.py +++ b/tilelang/language/ast/_ffi_api.py @@ -17,6 +17,6 @@ # This file is modified from the original version, # which is part of the TVM project (https://tvm.apache.org/). 
"""FFI APIs""" -import tvm._ffi +import tvm.ffi -tvm._ffi._init_api("script.ir_builder.tir", __name__) # pylint: disable=protected-access +tvm.ffi._init_api("script.ir_builder.tir", __name__) # pylint: disable=protected-access diff --git a/tilelang/language/ast/ir.py b/tilelang/language/ast/ir.py index 781b8f489..e49e6d5c3 100644 --- a/tilelang/language/ast/ir.py +++ b/tilelang/language/ast/ir.py @@ -1428,19 +1428,19 @@ def func( float32x64 = func_gen(("Float32x64")) float64x64 = func_gen(("Float64x64")) -e4m3_float8 = func_gen(("E4M3Float8")) -e4m3_float8x4 = func_gen(("E4M3Float8x4")) -e4m3_float8x8 = func_gen(("E4M3Float8x8")) -e4m3_float8x16 = func_gen(("E4M3Float8x16")) -e4m3_float8x32 = func_gen(("E4M3Float8x32")) -e4m3_float8x64 = func_gen(("E4M3Float8x64")) - -e5m2_float8 = func_gen(("E5M2Float8")) -e5m2_float8x4 = func_gen(("E5M2Float8x4")) -e5m2_float8x8 = func_gen(("E5M2Float8x8")) -e5m2_float8x16 = func_gen(("E5M2Float8x16")) -e5m2_float8x32 = func_gen(("E5M2Float8x32")) -e5m2_float8x64 = func_gen(("E5M2Float8x64")) +float8_e4m3 = func_gen(("E4M3Float8")) +float8_e4m3x4 = func_gen(("E4M3Float8x4")) +float8_e4m3x8 = func_gen(("E4M3Float8x8")) +float8_e4m3x16 = func_gen(("E4M3Float8x16")) +float8_e4m3x32 = func_gen(("E4M3Float8x32")) +float8_e4m3x64 = func_gen(("E4M3Float8x64")) + +float8_e5m2 = func_gen(("E5M2Float8")) +float8_e5m2x4 = func_gen(("E5M2Float8x4")) +float8_e5m2x8 = func_gen(("E5M2Float8x8")) +float8_e5m2x16 = func_gen(("E5M2Float8x16")) +float8_e5m2x32 = func_gen(("E5M2Float8x32")) +float8_e5m2x64 = func_gen(("E5M2Float8x64")) # pylint: enable=invalid-name @@ -1964,33 +1964,33 @@ def wrapped(*args, **kwargs): "uint16x64", "uint32x64", "uint64x64", - "e4m3_float8", - "e5m2_float8", + "float8_e4m3", + "float8_e5m2", "float16", "float32", "float64", - "e4m3_float8x4", - "e5m2_float8x4", + "float8_e4m3x4", + "float8_e5m2x4", "float16x4", "float32x4", "float64x4", - "e4m3_float8x8", - "e5m2_float8x8", + "float8_e4m3x8", + "float8_e5m2x8", "float16x8", "float32x8", "float64x8", - "e4m3_float8x16", - "e5m2_float8x16", + "float8_e4m3x16", + "float8_e5m2x16", "float16x16", "float32x16", "float64x16", - "e4m3_float8x32", - "e5m2_float8x32", + "float8_e4m3x32", + "float8_e5m2x32", "float16x32", "float32x32", "float64x32", - "e4m3_float8x64", - "e5m2_float8x64", + "float8_e4m3x64", + "float8_e5m2x64", "float16x64", "float32x64", "float64x64", diff --git a/tilelang/language/copy.py b/tilelang/language/copy.py index f492c1bc9..c08ca3836 100644 --- a/tilelang/language/copy.py +++ b/tilelang/language/copy.py @@ -1,7 +1,8 @@ """The language interface for tl programs.""" -from typing import Union, List, Optional +from typing import Union, List, Optional, Literal from tilelang import language as T +from tilelang.utils.language import get_buffer_region_from_load from tvm import ir, tir @@ -80,12 +81,11 @@ def buffer_region_to_tile_region(buffer_region: tir.BufferRegion, access_type: s return region(T.BufferLoad(buffer_region.buffer, mins), access_type, *region_extents) -def copy( - src: Union[tir.Buffer, tir.BufferLoad, tir.BufferRegion], - dst: Union[tir.Buffer, tir.BufferLoad], - coalesced_width: Optional[int] = None, - disable_tma: bool = False, -): +def copy(src: Union[tir.Buffer, tir.BufferLoad, tir.BufferRegion], + dst: Union[tir.Buffer, tir.BufferLoad], + coalesced_width: Optional[int] = None, + disable_tma: bool = False, + eviction_policy: Optional[Literal["evict_normal", "evict_first", "evict_last"]] = None): """Copy data between memory regions. 
Args: @@ -109,6 +109,11 @@ def get_extent(data): return data.shape elif isinstance(data, tir.BufferRegion): return [x.extent for x in data.region] + elif isinstance(data, tir.BufferLoad): + region = get_buffer_region_from_load(data) + if region is None: + return None + return [x.extent for x in region.region] else: return None @@ -126,6 +131,11 @@ def _to_region(data, access_type): return buffer_to_tile_region(data, access_type) elif isinstance(data, tir.BufferRegion): return buffer_region_to_tile_region(data, access_type, extent) + elif isinstance(data, tir.BufferLoad): + region = get_buffer_region_from_load(data) + if region is None: + return buffer_load_to_tile_region(data, access_type, extent) + return buffer_region_to_tile_region(region, access_type, extent) else: return buffer_load_to_tile_region(data, access_type, extent) @@ -134,20 +144,24 @@ def _to_region(data, access_type): if coalesced_width is None: coalesced_width = -1 # PrimExpr can not be None + if eviction_policy is None: + eviction_policy = 0 + else: + eviction_policy = {"evict_normal": 0, "evict_first": 1, "evict_last": 2}[eviction_policy] return tir.call_intrin("handle", tir.op.Op.get("tl.copy"), src, dst, coalesced_width, - disable_tma) - - -def c2d_im2col( - img: tir.Buffer, - col: tir.Buffer, - nhw_step: tir.PrimExpr, - c_step: tir.PrimExpr, - kernel: int, - stride: int, - dilation: int, - pad: int, -): + disable_tma, eviction_policy) + + +def c2d_im2col(img: tir.Buffer, + col: tir.Buffer, + nhw_step: tir.PrimExpr, + c_step: tir.PrimExpr, + kernel: int, + stride: int, + dilation: int, + pad: int, + eviction_policy: Optional[Literal["evict_normal", "evict_first", + "evict_last"]] = None): """Perform im2col transformation for 2D convolution. Args: @@ -163,15 +177,10 @@ def c2d_im2col( Returns: tir.Call: A handle to the im2col operation """ - return tir.call_intrin( - "handle", - tir.op.Op.get("tl.c2d_im2col"), - img.access_ptr("r"), - col.access_ptr("w"), - nhw_step, - c_step, - kernel, - stride, - dilation, - pad, - ) + if eviction_policy is None: + eviction_policy = 0 + else: + eviction_policy = {"evict_normal": 0, "evict_first": 1, "evict_last": 2}[eviction_policy] + return tir.call_intrin("handle", tir.op.Op.get("tl.c2d_im2col"), img.access_ptr("r"), + col.access_ptr("w"), nhw_step, c_step, kernel, stride, dilation, pad, + eviction_policy) diff --git a/tilelang/language/customize.py b/tilelang/language/customize.py index 1e87a70be..3e99ccf79 100644 --- a/tilelang/language/customize.py +++ b/tilelang/language/customize.py @@ -1,10 +1,88 @@ +# Copyright (c) Tile-AI Corporation. +# Licensed under the MIT License. """The language interface for tl programs.""" import tilelang.language as T -from tvm.tir import PrimExpr, Buffer +from tvm import ir +from tvm.tir import PrimExpr, Buffer, BufferLoad, BufferRegion, Var, op from typing import List, Union +def region(buffer: BufferLoad, access_type: str, *args: PrimExpr): + """Create a memory region descriptor for tile operations. 
+ + Args: + buffer (tir.BufferLoad): The buffer to create a region for + access_type (str): Type of access - 'r' for read, 'w' for write, 'rw' for read-write + *args (tir.PrimExpr): Extent expressions defining the region size + + Returns: + tir.Call: A region descriptor for tile operations + """ + access_type = {"r": 1, "w": 2, "rw": 3}[access_type] + return T.call_intrin("handle", op.Op.get("tl.region"), buffer, access_type, *args) + + +def buffer_to_tile_region(buffer: Buffer, access_type: str): + """Convert a TVM buffer to a tile region descriptor. + + Args: + buffer (tir.Buffer): The buffer to convert + access_type (str): Type of access - 'r' for read, 'w' for write, 'rw' for read-write + + Returns: + tir.Call: A region descriptor covering the entire buffer + """ + mins = [0 for _ in buffer.shape] + extents = [x for x in buffer.shape] + return region(T.BufferLoad(buffer, mins), access_type, *extents) + + +def buffer_load_to_tile_region(load: BufferLoad, access_type: str, extents: List[PrimExpr]): + """Convert a buffer load operation to a tile region descriptor. + + Args: + load (tir.BufferLoad): The buffer load operation + access_type (str): Type of access - 'r' for read, 'w' for write, 'rw' for read-write + extents (List[tir.PrimExpr]): List of expressions defining the region size + + Returns: + tir.Call: A region descriptor for the loaded area + """ + indices = load.indices + if len(indices) > len(extents): + # (f"mismatch between indices and extents for buffer load {load}: indices = {indices}, extents = {extents}, " + # f"region will be expanded in the last 2 dimensions") + new_extents = [] + for _ in range(len(indices) - len(extents)): + new_extents.append(1) + for extent in extents: + new_extents.append(extent) + extents = new_extents + assert len(indices) == len(extents), f"indices = {indices}, extents = {extents}" + return region(load, access_type, *extents) + + +def buffer_region_to_tile_region(buffer_region: BufferRegion, access_type: str, + extents: List[PrimExpr]): + """Convert a buffer region to a tile region descriptor. + + Args: + buffer_region (tir.BufferRegion): The buffer region to convert + access_type (str): Type of access - 'r' for read, 'w' for write, 'rw' for read-write + + Returns: + tir.Call: A region descriptor for the specified buffer region + """ + mins = [x.min for x in buffer_region.region] + region_extents = [x.extent for x in buffer_region.region] + assert len(region_extents) >= len( + extents + ), f"region_extents must be >= extents, region_extents = {region_extents}, extents = {extents}" + + return region(T.BufferLoad(buffer_region.buffer, mins), access_type, *region_extents) + + def atomic_add(dst: Buffer, value: PrimExpr) -> PrimExpr: """Perform an atomic addition operation. 
@@ -15,7 +93,41 @@ def atomic_add(dst: Buffer, value: PrimExpr) -> PrimExpr: Returns: PrimExpr: Handle to the atomic addition operation """ - return T.call_extern("handle", "AtomicAdd", T.address_of(dst), value) + if isinstance(dst, BufferLoad) and isinstance(value, BufferLoad): + return T.call_extern("handle", "AtomicAdd", T.address_of(dst), value) + if isinstance(dst, Buffer) and isinstance(value, Buffer): + ir.assert_structural_equal(dst.shape, value.shape) + + def get_extent(data): + if isinstance(data, Var) and T.has_let_value(data): + data = T.get_let_value(data) + if isinstance(data, Buffer): + return data.shape + elif isinstance(data, BufferRegion): + return [x.extent for x in data.region] + else: + return None + + src_extent = get_extent(value) + dst_extent = get_extent(dst) + assert src_extent or dst_extent, "Can't deduce atomicadd extents from args" + src_extent = list(src_extent) if src_extent else [1] * len(dst_extent) + dst_extent = list(dst_extent) if dst_extent else [1] * len(src_extent) + extent = max(src_extent, dst_extent) + + def _to_region(data, access_type): + if isinstance(data, Var) and T.has_let_value(data): + data = T.get_let_value(data) + if isinstance(data, Buffer): + return buffer_to_tile_region(data, access_type) + elif isinstance(data, BufferRegion): + return buffer_region_to_tile_region(data, access_type, extent) + else: + return buffer_load_to_tile_region(data, access_type, extent) + + value = _to_region(value, "r") + dst = _to_region(dst, "w") + return T.call_intrin("handle", op.Op.get("tl.atomicadd"), value, dst) def atomic_addx2(dst: Buffer, value: PrimExpr) -> PrimExpr: @@ -32,14 +144,14 @@ def atomic_addx2(dst: Buffer, value: PrimExpr) -> PrimExpr: def atomic_addx4(dst: Buffer, value: PrimExpr) -> PrimExpr: - """Perform an atomic addition operation with double-width operands. + """Perform an atomic addition operation with quad-width operands. 
Args: dst (Buffer): Destination buffer where the atomic addition will be performed - value (PrimExpr): Value to be atomically added (double-width) + value (PrimExpr): Value to be atomically added (quad-width) Returns: - PrimExpr: Handle to the double-width atomic addition operation + PrimExpr: Handle to the quad-width atomic addition operation """ return T.call_extern("handle", "AtomicAddx4", T.address_of(dst), T.address_of(value)) diff --git a/tilelang/language/fill.py b/tilelang/language/fill.py index 123c9026f..a1482f501 100644 --- a/tilelang/language/fill.py +++ b/tilelang/language/fill.py @@ -3,6 +3,7 @@ from tvm import tir from typing import Union from tilelang.language import has_let_value, get_let_value +from tilelang.utils.language import get_buffer_region_from_load def fill(buffer: Union[tir.Buffer, tir.BufferRegion], value: tir.PrimExpr): @@ -36,6 +37,12 @@ def clear(buffer: Union[tir.Buffer, tir.Var]): buffer_region = get_let_value(buffer) # Get the actual buffer region from variable if isinstance(buffer_region, tir.BufferRegion): return fill(buffer_region, 0) + elif isinstance(buffer_region, tir.BufferLoad): + region = get_buffer_region_from_load(buffer_region) + if region is None: + raise ValueError( + f"Invalid buffer region: {buffer_region}, type: {type(buffer_region)}") + return fill(region, 0) else: - raise ValueError(f"Invalid buffer region: {buffer_region}") + raise ValueError(f"Invalid buffer region: {buffer_region}, type: {type(buffer_region)}") return fill(buffer, 0) diff --git a/tilelang/language/frame.py b/tilelang/language/frame.py index ebc2ee673..b82cfe5ef 100644 --- a/tilelang/language/frame.py +++ b/tilelang/language/frame.py @@ -1,6 +1,6 @@ """Override the LetFrame to print a message when entering the frame.""" -from tvm._ffi import register_object as _register_object +from tvm.ffi import register_object as _register_object from tvm.tir import Var, PrimExpr, BufferLoad, BufferRegion from tvm.ir import Range from tvm import DataType diff --git a/tilelang/language/gemm.py b/tilelang/language/gemm.py index 209aac47a..aab540ed2 100644 --- a/tilelang/language/gemm.py +++ b/tilelang/language/gemm.py @@ -69,10 +69,32 @@ def retrieve_shape(object: Union[tir.Buffer, tir.BufferRegion]) -> List[int]: else: raise ValueError(f"Unsupported argument type: {type(object)} for buffer {object}") + def retrieve_stride(object: Union[tir.Buffer, tir.BufferRegion]) -> List[int]: + if isinstance(object, tir.Buffer): + strides = [] + stride = 1 + for s in reversed(object.shape): + strides.insert(0, stride) + stride *= s + return strides + elif isinstance(object, tir.BufferRegion): + buffer, _ = object.buffer, object.region + strides = [] + stride = 1 + for s in reversed(buffer.shape): + strides.insert(0, stride) + stride *= s + return strides + else: + raise ValueError(f"Unsupported argument type: {type(object)} for buffer {object}") + A_shape = retrieve_shape(A) B_shape = retrieve_shape(B) C_shape = retrieve_shape(C) + A_stride = retrieve_stride(A) + B_stride = retrieve_stride(B) + assert len(C_shape) == 2, "current only support C as a 2D tensor" assert len(A_shape) >= 2, "current only support A as a 2D or higher-order tensor" assert len(B_shape) >= 2, "current only support B as a 2D or higher-order tensor" @@ -90,6 +112,9 @@ def retrieve_shape(object: Union[tir.Buffer, tir.BufferRegion]) -> List[int]: K_B = B_shape[-1] if transpose_B else B_shape[-2] assert K == K_B, f"T.gemm K shape check failed: K_A = {K}, K_B = {K_B}" + stride_a = A_stride[-2] + stride_b = B_stride[-2] + def 
retrieve_ptr(object: Union[tir.Buffer, tir.BufferRegion], access_type: str = "r") -> tir.PrimExpr: if isinstance(object, tir.Buffer): @@ -105,12 +130,33 @@ def retrieve_ptr(object: Union[tir.Buffer, tir.BufferRegion], strides.insert(0, stride) stride *= s offset = 0 - for i in range(len(indices)): + # not offset the last two dimension + for i in range(len(indices) - 2): offset += indices[i] * strides[i] return buffer.access_ptr(access_mask=access_type, offset=offset) else: raise ValueError(f"Unsupported argument type: {type(object)} for buffer {object}") + def retrieve_offset(object: Union[tir.Buffer, tir.BufferRegion]) -> tir.PrimExpr: + """Retrieve the offset of the buffer or buffer region.""" + if isinstance(object, tir.Buffer): + return [0] * len(object.shape) + elif isinstance(object, tir.BufferRegion): + _, region = object.buffer, object.region + indices = [] + for r in region: + indices.append(r.min) + return indices + else: + raise ValueError(f"Unsupported argument type: {type(object)} for buffer {object}") + + A_offset = retrieve_offset(A) + B_offset = retrieve_offset(B) + assert A_offset[-2] == 0, "The offset of the first dimension of A must be 0" + assert B_offset[-2] == 0, "The offset of the first dimension of B must be 0" + offset_a = A_offset[-1] + offset_b = B_offset[-1] + Aptr = retrieve_ptr(A, "r") Bptr = retrieve_ptr(B, "r") Cptr = retrieve_ptr(C, "rw") @@ -127,6 +173,10 @@ def retrieve_ptr(object: Union[tir.Buffer, tir.BufferRegion], K, policy, clear_accum, + stride_a, + stride_b, + offset_a, + offset_b, k_pack, wg_wait, ) diff --git a/tilelang/language/kernel.py b/tilelang/language/kernel.py index deddfb4ce..0ce6e6ece 100644 --- a/tilelang/language/kernel.py +++ b/tilelang/language/kernel.py @@ -5,7 +5,7 @@ from tvm import tir from tvm.tir import Var from tvm.script.ir_builder.tir.frame import TIRFrame, BlockFrame -from tvm._ffi import register_object +from tvm.ffi import register_object from tilelang import _ffi_api import threading diff --git a/tilelang/language/logical.py b/tilelang/language/logical.py index 1af6f04cc..b98f291c9 100644 --- a/tilelang/language/logical.py +++ b/tilelang/language/logical.py @@ -1,8 +1,7 @@ """The language interface for tl programs.""" from tilelang import language as T -from tvm.tir import Buffer, BufferRegion -from tvm.ir import Range +from tvm.tir import Buffer, BufferRegion, BufferLoad from tvm import tir from typing import Union from tilelang.utils.language import get_buffer_elems @@ -28,16 +27,17 @@ def any_of(buffer: Union[T.Tensor, BufferRegion]): for i, r in enumerate(region): extent = r.extent if extent == 1: - new_region.append(r) + new_region.append(r.min) else: # check the idx is the last dimension if i != len(region) - 1: raise ValueError( "Only support the last dimension to be for T.any currently, please contact us if you need this feature" ) - new_region.append(Range(r.min, 1)) - buffer = BufferRegion(buffer, new_region) - return T.call_intrin(return_type, tir.op.Op.get("tl.any_of"), T.address_of(buffer), extent) + new_region.append(r.min) + buffer_load = BufferLoad(buffer, new_region) + return T.call_intrin(return_type, tir.op.Op.get("tl.any_of"), T.address_of(buffer_load), + extent) else: raise ValueError(f"Invalid buffer type: {type(buffer)}") @@ -62,15 +62,16 @@ def all_of(buffer: Union[T.Tensor, BufferRegion]): for i, r in enumerate(region): extent = r.extent if extent == 1: - new_region.append(r) + new_region.append(r.min) else: # check the idx is the last dimension if i != len(region) - 1: raise ValueError( "Only 
support the last dimension to be for T.any currently, please contact us if you need this feature" ) - new_region.append(Range(r.min, 1)) - buffer = BufferRegion(buffer, new_region) - return T.call_intrin(return_type, tir.op.Op.get("tl.all_of"), T.address_of(buffer), extent) + new_region.append(r.min) + buffer_load = BufferLoad(buffer, new_region) + return T.call_intrin(return_type, tir.op.Op.get("tl.all_of"), T.address_of(buffer_load), + extent) else: raise ValueError(f"Invalid buffer type: {type(buffer)}") diff --git a/tilelang/language/memscope.py b/tilelang/language/memscope.py index 15535388c..3999f5cee 100644 --- a/tilelang/language/memscope.py +++ b/tilelang/language/memscope.py @@ -1,4 +1,4 @@ -from tvm._ffi.registry import register_func +from tvm.ffi.registry import register_func from tvm.ir import make_node @@ -10,7 +10,7 @@ def mem_info_local_var(): tvm.ir.make_node: A node containing memory information """ return make_node( - "MemoryInfo", + "target.MemoryInfo", unit_bits=8, max_num_bits=64, max_simd_bits=128, diff --git a/tilelang/language/parser/operation.py b/tilelang/language/parser/operation.py index 9b5a67a7a..e16fa261b 100644 --- a/tilelang/language/parser/operation.py +++ b/tilelang/language/parser/operation.py @@ -21,7 +21,7 @@ from typing import Type from tvm import tir -from tvm._ffi.runtime_ctypes import DataType, DataTypeCode +from tvm.ffi.runtime_ctypes import DataType, DataTypeCode from tvm.tir import IntImm from tvm.tir.expr import FloatImm @@ -88,10 +88,10 @@ def _auto_broadcast(a, b, op): if DataType(a.dtype).lanes == DataType(b.dtype).lanes: return op(a, b) - elif DataType(a.dtype).lanes == 1 and DataType(a.dtype).lanes != DataType(b.dtype).lanes: + elif (DataType(a.dtype).lanes == 1 and DataType(a.dtype).lanes != DataType(b.dtype).lanes): broadcast_a = tir.Broadcast(a, DataType(b.dtype).lanes) return op(broadcast_a, b) - elif DataType(b.dtype).lanes == 1 and DataType(a.dtype).lanes != DataType(b.dtype).lanes: + elif (DataType(b.dtype).lanes == 1 and DataType(a.dtype).lanes != DataType(b.dtype).lanes): broadcast_b = tir.Broadcast(b, DataType(a.dtype).lanes) return op(a, broadcast_b) else: diff --git a/tilelang/language/proxy.py b/tilelang/language/proxy.py index d6559f49b..7f74aa5d3 100644 --- a/tilelang/language/proxy.py +++ b/tilelang/language/proxy.py @@ -1,7 +1,7 @@ """The language interface for tl programs.""" from __future__ import annotations -from typing import Any, Optional, Sequence, SupportsIndex, TYPE_CHECKING +from typing import Any, Optional, Sequence, SupportsIndex, TYPE_CHECKING, Tuple, Union from typing_extensions import Self from tvm import tir @@ -53,7 +53,8 @@ def __getitem__(self, keys) -> tir.Buffer: def from_ptr(self, pointer_var: Var, shape: tuple[PrimExpr, ...], - dtype: str = "float32") -> Buffer: + dtype: str = "float32", + strides: tuple[PrimExpr, ...] = None) -> Buffer: """Create a buffer from a pointer, shape, and data type. 
Args: @@ -64,7 +65,7 @@ def from_ptr(self, Returns: A buffer created from the given parameters """ - return match_buffer(pointer_var, shape, dtype=dtype) + return match_buffer(pointer_var, shape, dtype=dtype, strides=strides) class BaseTensorProxy: @@ -110,16 +111,17 @@ def __call__( ) def __getitem__(self, keys) -> tir.Buffer: - if not isinstance(keys, tuple): - return self(keys) - if len(keys) >= 2 and not isinstance(keys[1], str): - return self(keys) + assert isinstance(keys, tuple) + # Single argument (the shape) + if all([type(s) not in (tuple, str, list) for s in keys]): + keys = (keys,) return self(*keys) def from_ptr(self, pointer_var: Var, shape: tuple[PrimExpr, ...], - dtype: str = "float32") -> tir.Buffer: + dtype: str = "float32", + strides: tuple[PrimExpr, ...] = None) -> tir.Buffer: """Create a buffer from a pointer, shape, and data type. Args: @@ -130,16 +132,51 @@ def from_ptr(self, Returns: A buffer created from the given parameters """ - return match_buffer(pointer_var, shape, dtype=dtype) + return match_buffer(pointer_var, shape, dtype=dtype, strides=strides) class TensorProxy(BaseTensorProxy): """Main tensor proxy class for global scope buffers. This class implements the default tensor proxy with global memory scope, - inheriting all functionality from BaseTensorProxy without modifications. + the tensor should be by default contiguous. """ + @staticmethod + def _construct_strides(shape: Tuple[Any]): + s, strides = 1, [1] + for dim in shape[:0:-1]: + s *= dim + strides.append(s) + return tuple(reversed(strides)) + + def __call__(self, + shape: Union[Tuple[Any], PrimExpr, int], + dtype: str = "float32", + data=None) -> tir.Buffer: + if isinstance(shape, (int, PrimExpr)): + shape = (shape,) + return super().__call__( + shape, dtype=dtype, strides=TensorProxy._construct_strides(shape), data=data) + + +class StridedTensorProxy(BaseTensorProxy): + """Main tensor proxy class for global scope buffers, with strides supported. + + This class implements the default tensor proxy with global memory scope, with the stride information required. + """ + + def __call__(self, + shape: Tuple[Any], + strides: Tuple[Any], + dtype: str = "float32") -> tir.Buffer: + if len(shape) != len(strides): + raise ValueError("Invalid shape/strides' dimensions") + if not bool(strides[-1] == 1): + # TODO(chenggang): shall we support non-contiguous even for the last dimension? + raise ValueError("The stride of the last dimension must be 1 (contiguous)") + return super().__call__(shape, dtype=dtype, strides=strides) + class FragmentBufferProxy(BaseTensorProxy): """Proxy class for fragment memory buffers. @@ -204,12 +241,16 @@ def __init__( def from_ptr(cls, pointer_var: Var, shape: Sequence[PrimExpr, ...], - dtype: str = "float32") -> Self: + dtype: str = "float32", + strides: tuple[PrimExpr, ...] = None) -> Self: ... class Tensor(BaseTensor): ... + class StridedTensor(BaseTensor): + ... + class FragmentBuffer(BaseTensor): ... @@ -220,6 +261,7 @@ class LocalBuffer(BaseTensor): ... 
else: Tensor = TensorProxy() # pylint: disable=invalid-name + StridedTensor = StridedTensorProxy() # pylint: disable=invalid-name FragmentBuffer = FragmentBufferProxy() # pylint: disable=invalid-name SharedBuffer = SharedBufferProxy() # pylint: disable=invalid-name LocalBuffer = LocalBufferProxy() # pylint: disable=invalid-name @@ -250,5 +292,8 @@ def ptr(dtype: Optional[str] = None, return handle(dtype=dtype, storage_scope=storage_scope, is_size_var=is_size_var) -def make_tensor(ptr: Var, shape: tuple[PrimExpr, ...], dtype: str = "float32") -> tir.Buffer: - return Tensor.from_ptr(ptr, shape, dtype) +def make_tensor(ptr: Var, + shape: tuple[PrimExpr, ...], + dtype: str = "float32", + strides: tuple[PrimExpr, ...] = None) -> tir.Buffer: + return Tensor.from_ptr(ptr, shape, dtype, strides) diff --git a/tilelang/language/tir/entry.py b/tilelang/language/tir/entry.py index d663ee11e..86edad811 100644 --- a/tilelang/language/tir/entry.py +++ b/tilelang/language/tir/entry.py @@ -1,14 +1,14 @@ +import inspect from typing import Callable, Optional, Union -from tvm.tir.function import PrimFunc import tvm.script.parser.tir.entry as _tir_entry -import inspect +from tvm.tir.function import PrimFunc from tvm.script.parser._core import parse, scan_macro, utils def prim_func(func: Optional[Callable] = None, private: bool = False, - check_well_formed=True) -> Union[PrimFunc, Callable]: + check_well_formed: bool = False) -> Union[PrimFunc, Callable]: """The parsing method for tir prim func, by using `@prim_func` as decorator. Parameters diff --git a/tilelang/language/tir/op.py b/tilelang/language/tir/op.py index 77be7e123..b6cc55fc8 100644 --- a/tilelang/language/tir/op.py +++ b/tilelang/language/tir/op.py @@ -2602,7 +2602,7 @@ def isinf(x, span=None): def pow_of_int(x: PrimExpr, y: int) -> PrimExpr: """Fast power operation than pow(float, float). 
- + Args: x (PrimExpr): Base value y (int): Exponent value diff --git a/tilelang/language/warpgroup.py b/tilelang/language/warpgroup.py index 0d994be63..2e64d66fa 100644 --- a/tilelang/language/warpgroup.py +++ b/tilelang/language/warpgroup.py @@ -1,7 +1,7 @@ """The language interface for tl programs.""" from tvm.script.ir_builder.tir.frame import TIRFrame -from tvm._ffi import register_object +from tvm.ffi import register_object from tilelang import _ffi_api from .kernel import get_thread_bindings, get_thread_extents from typing import List diff --git a/tilelang/layout/fragment.py b/tilelang/layout/fragment.py index 8b2312bd0..2cd64563e 100644 --- a/tilelang/layout/fragment.py +++ b/tilelang/layout/fragment.py @@ -9,7 +9,7 @@ from typing import List -@tvm._ffi.register_object("tl.Fragment") +@tvm.ffi.register_object("tl.Fragment") class Fragment(Layout): """ A Fragment layout object that encapsulates iteration variables (forward_vars), @@ -90,7 +90,9 @@ def __init__(self, forward_thread = forward_thread_fn(*vars) # Ensure forward_index is an array if it isn't None - if forward_index is not None and not isinstance(forward_index, tvm.ir.container.Array): + if forward_index is None: + forward_index = [] + elif not isinstance(forward_index, tvm.ir.container.Array): forward_index = [forward_index] # Call TVM FFI constructor to set up internal data structures diff --git a/tilelang/layout/layout.py b/tilelang/layout/layout.py index ef5d5d1e3..ee0bd8ea3 100644 --- a/tilelang/layout/layout.py +++ b/tilelang/layout/layout.py @@ -9,7 +9,7 @@ # Register the Layout class as a TVM object under the name "tl.Layout" -@tvm._ffi.register_object("tl.Layout") +@tvm.ffi.register_object("tl.Layout") class Layout(Node): def __init__(self, shape, forward_fn): diff --git a/tilelang/quantize/quantization.py b/tilelang/quantize/quantization.py index 4cc931c46..92f288cde 100644 --- a/tilelang/quantize/quantization.py +++ b/tilelang/quantize/quantization.py @@ -180,7 +180,7 @@ def _tir_u8_to_f8_e4m3_to_f16(nbit: int, val: tir.PrimExpr, dtype: str): def _tir_u8_to_f8_e5m2_to_f16(nbit: int, val: tir.PrimExpr, dtype: str): assert nbit == 8 assert dtype == "float16" - return tir.reinterpret("e5m2_float8", val).astype("float16") + return tir.reinterpret("float8_e5m2", val).astype("float16") def _tir_packed_to_signed_convert(storage_type="uint", storage_nbit=8): diff --git a/tilelang/transform/__init__.py b/tilelang/transform/__init__.py index 326266bac..001f2a9a7 100644 --- a/tilelang/transform/__init__.py +++ b/tilelang/transform/__init__.py @@ -87,8 +87,8 @@ def LowerHopperIntrin(): fpass : tvm.transform.Pass The result pass """ - return _ffi_api.LowerHopperIntrin() \ - if hasattr(_ffi_api, "LowerHopperIntrin") else lambda f: f # type: ignore + return (_ffi_api.LowerHopperIntrin() if hasattr(_ffi_api, "LowerHopperIntrin") else lambda f: f + ) # type: ignore def WarpSpecializedPipeline(): @@ -375,3 +375,32 @@ def LowerSharedBarrier(): """LowerSharedBarrier """ return _ffi_api.LowerSharedBarrier() # type: ignore + + +def StorageRewrite(): + """StorageRewrite + + Returns + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.StorageRewrite() # type: ignore + + +def LowerOpaqueBlock(): + """LowerOpaqueBlock + """ + return _ffi_api.LowerOpaqueBlock() # type: ignore + + +def LowerThreadAllreduce(): + """LowerThreadAllreduce + """ + return _ffi_api.LowerThreadAllreduce() # type: ignore + + +def LowerDeviceKernelLaunch(): + """LowerDeviceKernelLaunch + """ + return _ffi_api.LowerDeviceKernelLaunch() # 
type: ignore diff --git a/tilelang/transform/_ffi_api.py b/tilelang/transform/_ffi_api.py index 26284ebcd..c89dddda1 100644 --- a/tilelang/transform/_ffi_api.py +++ b/tilelang/transform/_ffi_api.py @@ -1,6 +1,6 @@ """FFI APIs for tilelang""" -import tvm._ffi +import tvm.ffi # TVM_REGISTER_GLOBAL("tl.name").set_body_typed(func); -tvm._ffi._init_api("tl.transform", __name__) # pylint: disable=protected-access +tvm.ffi._init_api("tl.transform", __name__) # pylint: disable=protected-access diff --git a/tilelang/transform/pass_config.py b/tilelang/transform/pass_config.py index 5db5e928d..861abea76 100644 --- a/tilelang/transform/pass_config.py +++ b/tilelang/transform/pass_config.py @@ -21,6 +21,10 @@ class PassConfigKey(str, Enum): TL_DISABLE_FAST_MATH = "tl.disable_fast_math" """Disable fast math optimization. Default: False""" + TL_PTXAS_REGISTER_USAGE_LEVEL = "tl.ptxas_register_usage_level" + """The PTXAS register usage level in [0, 10], which controls the + aggressiveness of optimizations that affect register usage. Default: None""" + TL_ENABLE_PTXAS_VERBOSE_OUTPUT = "tl.enable_ptxas_verbose_output" """Enable ptxas verbose output. Default: False""" @@ -39,6 +43,9 @@ class PassConfigKey(str, Enum): TL_ENABLE_AGGRESSIVE_SHARED_MEMORY_MERGE = "tl.enable_aggressive_shared_memory_merge" """Enable aggressive merge of shared memory allocations. Default: False""" + TL_DISABLE_SHUFFLE_ELECT = "tl.disable_shuffle_elect" + """Disable shuffle election optimization. Default: False""" + # TIR related configs TIR_ENABLE_EQUIV_TERMS_IN_CSE = "tir.enable_equiv_terms_in_cse_tir" """Enable equivalent terms in TIR Common Subexpression Elimination. Default: True""" diff --git a/tilelang/utils/language.py b/tilelang/utils/language.py index b9da8a1c1..ab24d5161 100644 --- a/tilelang/utils/language.py +++ b/tilelang/utils/language.py @@ -1,8 +1,9 @@ from tvm.tir import Buffer -from typing import List +from typing import List, Optional from functools import reduce from tvm import IRModule from tvm.tir import PrimFunc +from tvm import ir, tir # Scope Checkers for TVM Buffers # These utility functions check the memory scope of a given TVM buffer. @@ -118,3 +119,20 @@ def retrieve_func_from_module(ir_module: IRModule) -> PrimFunc: "The optimized module should only have one global variable for default schedule.") func = list(ir_module.functions.values())[0] return func + + +def get_buffer_region_from_load(buffer_load: tir.BufferLoad) -> Optional[tir.BufferRegion]: + """ + Get the buffer region from a buffer load. 
+ + May encounter buffer load like C[0:128, 0:32], ref to pull request + for buffer wise op: https://github.com/apache/tvm/pull/14693 + convert load to region + """ + buffer, indices = buffer_load.buffer, buffer_load.indices + regions = [] + for indice in indices: + if not isinstance(indice, tir.Ramp): + return None + regions.append(ir.Range.from_min_extent(indice.base, indice.lanes)) + return tir.BufferRegion(buffer, regions) diff --git a/tilelang/utils/tensor.py b/tilelang/utils/tensor.py index 403f92a0e..bab967a85 100644 --- a/tilelang/utils/tensor.py +++ b/tilelang/utils/tensor.py @@ -19,12 +19,12 @@ class TensorSupplyType(Enum): def map_torch_type(intype: str) -> torch.dtype: - if intype == "e4m3_float8": + if intype == "float8_e4m3": assert hasattr(torch, "float8_e4m3fn"), \ "torch.float8_e4m3fn is not supported in this version of torch" \ "Please upgrade torch >= 2.1.0" return torch.float8_e4m3fn - elif intype == "e5m2_float8": + elif intype == "float8_e5m2": assert hasattr(torch, "float8_e5m2"), \ "torch.float8_e5m2 is not supported in this version of torch" \ "Please upgrade torch >= 2.1.0" @@ -40,10 +40,10 @@ def map_torch_type(intype: str) -> torch.dtype: def adapt_torch2tvm(arg): float8_dtype_map = { - torch.float8_e4m3fn: "e4m3_float8", - torch.float8_e4m3fnuz: "e4m3_float8", - torch.float8_e5m2: "e5m2_float8", - torch.float8_e5m2fnuz: "e5m2_float8", + torch.float8_e4m3fn: "float8_e4m3", + torch.float8_e4m3fnuz: "float8_e4m3", + torch.float8_e5m2: "float8_e5m2", + torch.float8_e5m2fnuz: "float8_e5m2", } if isinstance(arg, torch.Tensor): if arg.dtype in { diff --git a/tilelang/version.py b/tilelang/version.py index 0efd0f11c..ac3b792f9 100644 --- a/tilelang/version.py +++ b/tilelang/version.py @@ -26,13 +26,10 @@ def get_git_commit_id() -> Union[str, None]: - """Get the current git commit hash. - - Returns: - str | None: The git commit hash if available, None otherwise. - """ + """Get the current git commit hash by running git in the current file's directory.""" try: return subprocess.check_output(['git', 'rev-parse', 'HEAD'], + cwd=os.path.dirname(os.path.abspath(__file__)), stderr=subprocess.DEVNULL, encoding='utf-8').strip() except subprocess.SubprocessError: @@ -40,6 +37,9 @@ def get_git_commit_id() -> Union[str, None]: # Append git commit hash to version if not already present +# NOTE(lei): Although the local commit id cannot capture locally staged changes, +# the local commit id can help mitigate issues caused by incorrect cache to some extent, +# so it should still be kept. 
if "+" not in __version__ and (commit_id := get_git_commit_id()): __version__ = f"{__version__}+{commit_id}" From cf99bef98649a23958131eb79d9f3fa9229749f6 Mon Sep 17 00:00:00 2001 From: xinyxiao Date: Mon, 18 Aug 2025 13:31:12 +0000 Subject: [PATCH 10/11] Remove redundant tool cache cleanup step in AMD CI workflow --- .github/workflows/amd_ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/amd_ci.yml b/.github/workflows/amd_ci.yml index b45cb7c74..2ef300b66 100644 --- a/.github/workflows/amd_ci.yml +++ b/.github/workflows/amd_ci.yml @@ -84,7 +84,6 @@ jobs: run: | echo "Running on AMD GPU" set -e - rm -rf "${{ runner.tool_cache }}" REQS_HASH=$(sha256sum requirements-rocm.txt | cut -d ' ' -f 1) MARKER="${{ runner.tool_cache }}/.venv_marker_${{ env.PYTHON_VERSION }}_${REQS_HASH:0:8}" From e839192d262909de49da053f463b688b0296f837 Mon Sep 17 00:00:00 2001 From: xinyxiao Date: Mon, 18 Aug 2025 13:50:18 +0000 Subject: [PATCH 11/11] Remove `torch` dependency from `requirements-rocm.txt` to streamline requirements. --- requirements-rocm.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements-rocm.txt b/requirements-rocm.txt index 4c8df9c67..bdf1aa985 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -20,7 +20,6 @@ requests cloudpickle ml_dtypes psutil -torch tabulate wheel setuptools