@@ -1,5 +1,5 @@
 import torch
-import tilelang as tl
+import tilelang
 import tilelang.language as T
 from tilelang.profiler import do_bench
 import argparse
@@ -9,10 +9,11 @@
 from typing import Optional, Tuple


-@tl.jit(pass_configs={
-    tl.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
-    tl.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
-})
+@tilelang.jit(
+    pass_configs={
+        tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
+        tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+    })
 def tl_fused_chunk_bwd_kernel(
         B,
         S,
@@ -30,12 +31,12 @@ def tl_fused_chunk_bwd_kernel(
     chunk_size = 64
     BK = BV = 64  # Set to 128 can be faster, but has some numerical differences with FLA
     assert S % chunk_size == 0 and DK % BK == 0 and DV % BV == 0
-    NK = tl.cdiv(DK, BK)
-    NV = tl.cdiv(DV, BV)
-    NT = tl.cdiv(S, chunk_size)
+    NK = tilelang.cdiv(DK, BK)
+    NV = tilelang.cdiv(DV, BV)
+    NT = tilelang.cdiv(S, chunk_size)

     @T.prim_func
-    def chunk_linear_attn_bwd(
+    def fused_chunk_linear_attn_bwd(
             Q: T.Tensor([B, S, H, DK], dtype),  # type: ignore
             K: T.Tensor([B, S, H, DK], dtype),  # type: ignore
             V: T.Tensor([B, S, H, DV], dtype),  # type: ignore
@@ -64,18 +65,19 @@ def chunk_linear_attn_bwd(
             h_shared = T.alloc_shared([BV, BK], dtype)
             dh = T.alloc_fragment([BK, BV], accum_dtype)
             dh_shared = T.alloc_shared([BK, BV], dtype)
-            T.clear(h)
-            T.clear(dh)

             T.annotate_layout({
-                dq_shared: tl.layout.make_swizzled_layout(dq_shared),
-                dk_shared: tl.layout.make_swizzled_layout(dk_shared),
-                dv_shared: tl.layout.make_swizzled_layout(dv_shared)
+                dq_shared: tilelang.layout.make_swizzled_layout(dq_shared),
+                dk_shared: tilelang.layout.make_swizzled_layout(dk_shared),
+                dv_shared: tilelang.layout.make_swizzled_layout(dv_shared)
             })
             T.use_swizzle(10)

+            T.clear(h)
+            T.clear(dh)
+
             # Calculate dQ
-            for i in T.Pipelined(0, NT, num_stages=1):
+            for i in T.Pipelined(0, NT):
                 T.copy(K[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_k * BK:(i_k + 1) * BK], k)
                 T.copy(V[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_v * BV:(i_v + 1) * BV], v)
                 T.copy(dO[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_v * BV:(i_v + 1) * BV],
@@ -97,7 +99,7 @@ def chunk_linear_attn_bwd(
                        dq_shared)

             # Calculate dK, dV (reversely)
-            for i in T.Pipelined(1, NT + 1, num_stages=1):
+            for i in T.Pipelined(1, NT + 1):
                 start = NT - i
                 for row, col in T.Parallel(chunk_size, BK):
                     q[row, col] = Q[i_b, start * chunk_size + row, i_h, i_k * BK + col] * scale
@@ -139,9 +141,8 @@ def chunk_linear_attn_bwd(
                 T.atomic_add(
                     dV[i_b, start * chunk_size:(start + 1) * chunk_size, i_h,
                        i_v * BV:(i_v + 1) * BV], dv_shared)
-                # TODO: consider using vectorized atomic add or tma reduce for sm90

-    return chunk_linear_attn_bwd
+    return fused_chunk_linear_attn_bwd


 def tl_fused_chunk_bwd(Q, K, V, dO):
@@ -188,6 +189,7 @@ def main(B=1, S=1024, H=16, D=128):
     k = l2norm_fwd(k)[0].requires_grad_(True)

     dq, dk, dv = tl_fused_chunk_bwd(q, k, v, do)
+    q.grad = k.grad = v.grad = None
     o_ref, _ = ref_program(q, k, v)
     o_ref.backward(do, retain_graph=True)

@@ -202,9 +204,8 @@ def main(B=1, S=1024, H=16, D=128):
     # Benchmark
     q.grad = k.grad = v.grad = None
     o_ref, _ = fused_chunk_linear_attn(q, k, v, output_final_state=True, normalize=False)
-    t1 = do_bench(
-        lambda: o_ref.backward(do, retain_graph=True), warmup=25, rep=100, backend='cupti')
-    t2 = do_bench(lambda: tl_fused_chunk_bwd(q, k, v, do), warmup=25, rep=100, backend='cupti')
+    t1 = do_bench(lambda: o_ref.backward(do, retain_graph=True), backend='cupti')
+    t2 = do_bench(lambda: tl_fused_chunk_bwd(q, k, v, do), backend='cupti')
     print(f'Triton latency: {t1:.3f} ms')
     print(f'TileLang latency: {t2:.3f} ms')
     print(f'Speedup: {t1 / t2:.3f}x')
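
Context for the added `q.grad = k.grad = v.grad = None` line before the reference backward pass: PyTorch accumulates into `.grad` across `backward()` calls, so clearing the gradients keeps the correctness check and the benchmark from comparing against stale, accumulated values. A minimal standalone sketch of that behavior (not part of this change, for illustration only):

    import torch

    # Gradients accumulate across backward() calls unless cleared.
    x = torch.ones(3, requires_grad=True)
    (x * 2).sum().backward()
    print(x.grad)   # tensor([2., 2., 2.])
    (x * 2).sum().backward()
    print(x.grad)   # tensor([4., 4., 4.])  -- accumulated, not overwritten

    x.grad = None   # reset, as the updated example does before the reference pass
    (x * 2).sum().backward()
    print(x.grad)   # tensor([2., 2., 2.])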