
Commit 9acab61

Commit message: fmt
1 parent: b47bb95

File tree

331 files changed: +6642 -7990 lines

Note: large commits have some content hidden by default, so only a subset of the 331 changed files is shown below.


benchmark/blocksparse_attention/benchmark_library_dense_fmha.py

Lines changed: 4 additions & 3 deletions
@@ -7,9 +7,10 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False):
     bsz, num_head, downsample_len, _ = x.shape
     # N_CTX = downsample_len * BLOCK
     sparse_index = torch.topk(x, topk, dim=-1).indices
-    dense_mask = torch.full(
-        [bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device
-    )
+    dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len],
+                            False,
+                            dtype=torch.bool,
+                            device=x.device)
     dense_mask.scatter_(-1, sparse_index, True)
     if use_dense_for_last_block:
         dense_mask[:, :, -2:, :] = True
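For readers skimming the diff: the reformatted call above builds the top-k block mask used throughout these benchmarks. A small standalone sketch (not part of the commit; the sizes and topk value are illustrative assumptions) of what the helper computes:

import torch

# Illustrative sizes, not the benchmark defaults
bsz, num_head, downsample_len, topk = 1, 2, 8, 3

# Block-level scores; in the benchmarks these come from a downsampled score map
x = torch.randn(bsz, num_head, downsample_len, downsample_len)

# Indices of the top-k scoring key blocks for each query block
sparse_index = torch.topk(x, topk, dim=-1).indices

# Boolean block mask: True marks key blocks that attention may visit
dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len],
                        False,
                        dtype=torch.bool,
                        device=x.device)
dense_mask.scatter_(-1, sparse_index, True)

# Exactly topk key blocks are kept per query block
assert dense_mask.sum(-1).eq(topk).all()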

benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py

Lines changed: 38 additions & 39 deletions
@@ -15,9 +15,10 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False):
     bsz, num_head, downsample_len, _ = x.shape
     # N_CTX = downsample_len * BLOCK
     sparse_index = torch.topk(x, topk, dim=-1).indices
-    dense_mask = torch.full(
-        [bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device
-    )
+    dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len],
+                            False,
+                            dtype=torch.bool,
+                            device=x.device)
     dense_mask.scatter_(-1, sparse_index, True)
     if use_dense_for_last_block:
         dense_mask[:, :, -2:, :] = True
@@ -38,7 +39,7 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal)
     block_N = 64
     num_stages = 2
     threads = 128
-    scale = (1.0 / dim) ** 0.5 * 1.44269504  # log2(e)
+    scale = (1.0 / dim)**0.5 * 1.44269504  # log2(e)
     shape = [batch, heads, seq_len, dim]
     block_mask_shape = [batch, heads, downsample_len, downsample_len]
@@ -47,6 +48,7 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal)
     block_mask_dtype = "bool"

     def kernel_func(block_M, block_N, num_stages, threads):
+
         @T.macro
         def MMA0(
             K: T.Tensor(shape, dtype),
@@ -58,12 +60,11 @@ def MMA0(
             by: T.int32,
             bz: T.int32,
         ):
-            T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared)
+            T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared)
             if is_causal:
                 for i, j in T.Parallel(block_M, block_N):
-                    acc_s[i, j] = T.if_then_else(
-                        bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)
-                    )
+                    acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0,
+                                                 -T.infinity(acc_s.dtype))
             else:
                 T.clear(acc_s)
             T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow)
@@ -78,18 +79,18 @@ def MMA1(
             by: T.int32,
             bz: T.int32,
         ):
-            T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared)
+            T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared)
             T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow)

         @T.macro
         def Softmax(
-            acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype),
-            acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype),
-            scores_max: T.FragmentBuffer([block_M], accum_dtype),
-            scores_max_prev: T.FragmentBuffer([block_M], accum_dtype),
-            scores_scale: T.FragmentBuffer([block_M], accum_dtype),
-            scores_sum: T.FragmentBuffer([block_M], accum_dtype),
-            logsum: T.FragmentBuffer([block_M], accum_dtype),
+                acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype),
+                acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype),
+                scores_max: T.FragmentBuffer([block_M], accum_dtype),
+                scores_max_prev: T.FragmentBuffer([block_M], accum_dtype),
+                scores_scale: T.FragmentBuffer([block_M], accum_dtype),
+                scores_sum: T.FragmentBuffer([block_M], accum_dtype),
+                logsum: T.FragmentBuffer([block_M], accum_dtype),
         ):
             T.copy(scores_max, scores_max_prev)
             T.fill(scores_max, -T.infinity(accum_dtype))
@@ -113,25 +114,26 @@ def Softmax(

         @T.macro
         def Rescale(
-            acc_o: T.FragmentBuffer([block_M, dim], accum_dtype),
-            scores_scale: T.FragmentBuffer([block_M], accum_dtype),
+                acc_o: T.FragmentBuffer([block_M, dim], accum_dtype),
+                scores_scale: T.FragmentBuffer([block_M], accum_dtype),
         ):
             for i, j in T.Parallel(block_M, dim):
                 acc_o[i, j] *= scores_scale[i]

         @T.prim_func
         def main(
-            Q: T.Tensor(shape, dtype),
-            K: T.Tensor(shape, dtype),
-            V: T.Tensor(shape, dtype),
-            BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype),
-            Output: T.Tensor(shape, dtype),
+                Q: T.Tensor(shape, dtype),
+                K: T.Tensor(shape, dtype),
+                V: T.Tensor(shape, dtype),
+                BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype),
+                Output: T.Tensor(shape, dtype),
         ):
-            with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (
-                bx,
-                by,
-                bz,
-            ):
+            with T.Kernel(
+                    T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (
+                        bx,
+                        by,
+                        bz,
+                    ):
                 Q_shared = T.alloc_shared([block_M, dim], dtype)
                 K_shared = T.alloc_shared([block_N, dim], dtype)
                 V_shared = T.alloc_shared([block_N, dim], dtype)
@@ -146,7 +148,7 @@ def main(
                 logsum = T.alloc_fragment([block_M], accum_dtype)
                 block_mask = T.alloc_local([downsample_len], block_mask_dtype)

-                T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared)
+                T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared)
                 T.fill(acc_o, 0)
                 T.fill(logsum, 0)
                 T.fill(scores_max, -T.infinity(accum_dtype))
@@ -155,10 +157,8 @@ def main(
                     block_mask[vj] = BlockSparseMask[bz, by, bx, vj]

                 loop_range = (
-                    T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N))
-                    if is_causal
-                    else T.ceildiv(seq_len, block_N)
-                )
+                    T.min(T.ceildiv(seq_len, block_N), T.ceildiv(
+                        (bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N))

                 for k in T.Pipelined(loop_range, num_stages=num_stages):
                     if block_mask[k]:
@@ -177,7 +177,7 @@ def main(
                 for i, j in T.Parallel(block_M, dim):
                     acc_o[i, j] /= logsum[i]
                 T.copy(acc_o, O_shared)
-                T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :])
+                T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :])

         return main

@@ -199,14 +199,13 @@ def benchmark_topk_sparse_attention():
     # Create sparse mask (downsampled to block level)
     downsample_factor = BLOCK
     downsample_len = math.ceil(SEQ_LEN / downsample_factor)
-    x_ds = torch.randn(
-        [BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16
-    )
+    x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len],
+                       device="cuda",
+                       dtype=torch.bfloat16)
     x_ds[:, :, :, 0] = 100
     block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK)
     program = blocksparse_flashattn(
-        BATCH, N_HEADS, SEQ_LEN, D_HEAD, downsample_len, is_causal=True
-    )
+        BATCH, N_HEADS, SEQ_LEN, D_HEAD, downsample_len, is_causal=True)
     kernel = tilelang.compile(program, out_idx=4)

     def benchmark_fn():
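The scale change in the second hunk only drops the spaces around the power operator; the constant 1.44269504 is log2(e), which flash-attention-style kernels commonly fold into the softmax scale so base-2 exponentials can be used. A tiny illustrative check of the identity involved (an aside, not code from this commit):

import math

x = 1.2345                 # arbitrary score value
log2_e = 1.44269504        # the constant appearing in the kernel, approx. log2(e)

# exp(x) == 2**(x * log2(e)); pre-scaling scores by log2(e) lets exp2 replace exp
assert math.isclose(math.exp(x), 2 ** (x * log2_e), rel_tol=1e-6)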

benchmark/blocksparse_attention/benchmark_torch_block_sparse_fmha.py

Lines changed: 7 additions & 6 deletions
@@ -10,9 +10,10 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False):
     bsz, num_head, downsample_len, _ = x.shape
     # N_CTX = downsample_len * BLOCK
     sparse_index = torch.topk(x, topk, dim=-1).indices
-    dense_mask = torch.full(
-        [bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device
-    )
+    dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len],
+                            False,
+                            dtype=torch.bool,
+                            device=x.device)
     dense_mask.scatter_(-1, sparse_index, True)
     if use_dense_for_last_block:
         dense_mask[:, :, -2:, :] = True
@@ -45,9 +46,9 @@ def benchmark_topk_sparse_attention():
     # Create sparse mask (downsampled to block level)
     downsample_factor = BLOCK
     downsample_len = math.ceil(SEQ_LEN / downsample_factor)
-    x_ds = torch.randn(
-        [BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16
-    )
+    x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len],
+                       device="cuda",
+                       dtype=torch.bfloat16)
     x_ds[:, :, :, 0] = 100
     block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK)
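As background for this torch baseline, a dense PyTorch sketch (illustrative only, with assumed small sizes; not the benchmark's actual implementation) of how a block-level mask can be expanded to token granularity and applied to attention:

import math
import torch

BATCH, N_HEADS, SEQ_LEN, D_HEAD, BLOCK = 1, 2, 256, 64, 64  # assumed sizes

q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD)
k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD)
v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD)

downsample_len = math.ceil(SEQ_LEN / BLOCK)
block_mask = torch.rand(BATCH, N_HEADS, downsample_len, downsample_len) > 0.5
block_mask[..., 0] = True  # keep the first key block, echoing x_ds[:, :, :, 0] = 100 above

# Expand each block entry to a BLOCK x BLOCK tile of token-level mask entries
token_mask = block_mask.repeat_interleave(BLOCK, dim=-2).repeat_interleave(BLOCK, dim=-1)
token_mask = token_mask[..., :SEQ_LEN, :SEQ_LEN]

scores = (q @ k.transpose(-1, -2)) / math.sqrt(D_HEAD)
scores = scores.masked_fill(~token_mask, float("-inf"))
out = torch.softmax(scores, dim=-1) @ v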

benchmark/blocksparse_attention/benchmark_triton_block_sparse_fmha.py

Lines changed: 12 additions & 14 deletions
@@ -15,9 +15,10 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False):
     bsz, num_head, downsample_len, _ = x.shape
     # N_CTX = downsample_len * BLOCK
     sparse_index = torch.topk(x, topk, dim=-1).indices
-    dense_mask = torch.full(
-        [bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device
-    )
+    dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len],
+                            False,
+                            dtype=torch.bool,
+                            device=x.device)
     dense_mask.scatter_(-1, sparse_index, True)
     if use_dense_for_last_block:
         dense_mask[:, :, -2:, :] = True
@@ -70,9 +71,8 @@ def _fwd_kernel_inner(

     # the following is needed only when LAST_K_BLOCK or BLOCK_M < BLOCK_N
     if LAST_K_BLOCK:
-        qk += tl.where(
-            offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, float("-inf")
-        )
+        qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0,
+                       float("-inf"))

     m_ij = tl.maximum(m_i, tl.max(qk, 1))
     qk -= m_ij[:, None]
@@ -191,11 +191,8 @@ def _fwd_kernel(
     acc = acc.to(Out.dtype.element_ty)

     off_o = (
-        off_z * stride_oz
-        + off_h * stride_oh
-        + offs_m[:, None] * stride_om
-        + offs_d[None, :] * stride_od
-    )
+        off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om +
+        offs_d[None, :] * stride_od)
     out_ptrs = Out + off_o
     tl.store(out_ptrs, acc, mask=offs_m[:, None] < N_CTX)

@@ -257,6 +254,7 @@ def _forward(


 class _sparse_attention(torch.autograd.Function):
+
     @staticmethod
     def forward(ctx, q, k, v, block_sparse_dense, sm_scale):
         # shape constraints
@@ -289,9 +287,9 @@ def benchmark_topk_sparse_attention():
     # Create sparse mask (downsampled to block level)
     downsample_factor = BLOCK
     downsample_len = math.ceil(SEQ_LEN / downsample_factor)
-    x_ds = torch.randn(
-        [BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16
-    )
+    x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len],
+                       device="cuda",
+                       dtype=torch.bfloat16)
     x_ds[:, :, :, 0] = 100
     block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK)
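The off_o hunk above only reflows the strided offset arithmetic. As an aside (a sketch with assumed sizes, not code from this commit), the same formula can be checked against plain tensor indexing for a contiguous output:

import torch

BATCH, N_HEADS, SEQ_LEN, D_HEAD = 2, 3, 8, 4  # assumed small sizes
out = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD)
stride_oz, stride_oh, stride_om, stride_od = out.stride()

off_z, off_h, m, d = 1, 2, 5, 3  # arbitrary in-range indices
off_o = off_z * stride_oz + off_h * stride_oh + m * stride_om + d * stride_od

# For a contiguous tensor, the flat offset addresses the same element as 4-D indexing
assert out.reshape(-1)[off_o] == out[off_z, off_h, m, d]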

benchmark/matmul/benchmark_matmul.py

Lines changed: 7 additions & 10 deletions
@@ -101,10 +101,9 @@ def get_configs(args, kwargs):
             policy=[T.GemmWarpPolicy.Square],
             enable_rasteration=[True, False],
         )
-        return [
-            {k: v for k, v in zip(iter_params, values)}
-            for values in itertools.product(*iter_params.values())
-        ]
+        return [{
+            k: v for k, v in zip(iter_params, values)
+        } for values in itertools.product(*iter_params.values())]
     return configs

@@ -113,9 +112,7 @@ def get_configs(args, kwargs):
     warmup=3,
     rep=20,
 )
-@jit(
-    out_idx=[2],
-)
+@jit(out_idx=[2],)
 def matmul(
     M,
     N,
@@ -162,9 +159,9 @@ def matmul(

     @T.prim_func
     def main(
-        A: T.Tensor((M, K), dtype),
-        B: T.Tensor((N, K), dtype),
-        C: T.Tensor((M, N), dtype),
+            A: T.Tensor((M, K), dtype),
+            B: T.Tensor((N, K), dtype),
+            C: T.Tensor((M, N), dtype),
     ):
         """
         The compiled TVM function for block-level matrix multiplication.
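The get_configs reflow above keeps the same enumeration: one dict per point in the Cartesian product of the tuning parameters. A self-contained sketch of that pattern (parameter names and values here are illustrative, not the benchmark's full search space):

import itertools

iter_params = dict(
    block_M=[64, 128],
    block_N=[64, 128],
    num_stages=[2, 3],
    enable_rasteration=[True, False],
)

# One dict per combination, e.g. {'block_M': 64, 'block_N': 64, 'num_stages': 2, ...}
configs = [{
    k: v for k, v in zip(iter_params, values)
} for values in itertools.product(*iter_params.values())]

print(len(configs))  # 2 * 2 * 2 * 2 = 16 candidate configurations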

benchmark/matmul/benchmark_matmul_intrinsic.py

Lines changed: 13 additions & 20 deletions
@@ -6,8 +6,7 @@
 import tilelang.language as T
 from tilelang.intrinsics import get_swizzle_layout
 from tilelang.intrinsics.mma_macro_generator import (
-    TensorCoreIntrinEmitter,
-)
+    TensorCoreIntrinEmitter,)
 from tilelang.transform import simplify_prim_func
 from tilelang.autotuner import autotune
 import itertools
@@ -104,9 +103,9 @@ def tl_matmul(

     @T.prim_func
     def main(
-        A: T.Tensor(A_shape, in_dtype),
-        B: T.Tensor(B_shape, in_dtype),
-        C: T.Tensor((M, N), out_dtype),
+            A: T.Tensor(A_shape, in_dtype),
+            B: T.Tensor(B_shape, in_dtype),
+            C: T.Tensor((M, N), out_dtype),
     ):
         with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by):
             A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope)
@@ -116,12 +115,10 @@ def main(
             B_local = T.alloc_local((warp_cols * local_size_b), in_dtype)
             C_local = T.alloc_local((warp_rows * warp_cols * local_size_c), accum_dtype)

-            T.annotate_layout(
-                {
-                    A_shared: make_swizzle_layout(A_shared),
-                    B_shared: make_swizzle_layout(B_shared),
-                }
-            )
+            T.annotate_layout({
+                A_shared: make_swizzle_layout(A_shared),
+                B_shared: make_swizzle_layout(B_shared),
+            })

             # Improve L2 Cache
             T.use_swizzle(panel_size=10, enable=enable_rasteration)
@@ -232,10 +229,9 @@ def get_configs(args, kwargs):
             stage=[0, 2],
             enable_rasteration=[True, False],
         )
-        return [
-            {k: v for k, v in zip(iter_params, values)}
-            for values in itertools.product(*iter_params.values())
-        ]
+        return [{
+            k: v for k, v in zip(iter_params, values)
+        } for values in itertools.product(*iter_params.values())]

     return configs

@@ -247,9 +243,7 @@ def get_configs(args, kwargs):
     ref_prog=ref_program,
     skip_check=True,
 )
-@tl.jit(
-    out_idx=[2],
-)
+@tl.jit(out_idx=[2],)
 def matmul(
     M,
     N,
@@ -300,8 +294,7 @@ def kernel():
         help="Whether to use roller to deduce search spaces",
     )
     parser.add_argument(
-        "--dtype", type=str, default="float16", choices=["float16", "int8"], help="Input data type"
-    )
+        "--dtype", type=str, default="float16", choices=["float16", "int8"], help="Input data type")
     args = parser.parse_args()

     M, N, K = args.m, args.n, args.k
