tile-ai
diff --git a/‎.clang-tidy‎
Lines changed: 10 additions & 5 deletions b/‎.clang-tidy‎
Lines changed: 10 additions & 5 deletions
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 30 additions & 12 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 30 additions & 12 deletions
diff --git a/‎.gitignore‎
Lines changed: 4 additions & 0 deletions b/‎.gitignore‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 2 additions & 2 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎benchmark/mamba2/README.md‎
Lines changed: 53 additions & 0 deletions b/‎benchmark/mamba2/README.md‎
Lines changed: 53 additions & 0 deletions
diff --git a/‎benchmark/mamba2/benchmark_mamba_chunk_scan.py‎
Lines changed: 223 additions & 0 deletions b/‎benchmark/mamba2/benchmark_mamba_chunk_scan.py‎
Lines changed: 223 additions & 0 deletions
diff --git a/‎benchmark/mamba2/mamba_benchmark_result.png‎
85.6 KB b/‎benchmark/mamba2/mamba_benchmark_result.png‎
85.6 KB
diff --git a/‎docs/conf.py‎
Lines changed: 2 additions & 4 deletions b/‎docs/conf.py‎
Lines changed: 2 additions & 4 deletions
@@ -1,4 +1,13 @@
-Checks: >
+---
+InheritParentConfig: true  # merge with a .clang-tidy found in a parent directory, if any
+ExtraArgs: ['-v']  # extra argument appended to each compiler command line
+FormatStyle: file  # format applied fixes with the repository's .clang-format
+UseColor: true
+WarningsAsErrors: '*'  # every diagnostic from an enabled check is reported as an error
+ExcludeHeaderFilterRegex: '^(3rdparty|tvm)/.*$'  # NOTE(review): anchored at '^'; confirm it matches the (often absolute) header paths clang-tidy sees
+
+# NOTE: there must be no spaces before the '-', so put the comma last.
+Checks: >-
   # 1. Retained categories: easier to find bugs/performance issues
   clang-analyzer-*,
   cppcoreguidelines-pro-type-static-cast-downcast,
@@ -47,7 +56,3 @@ Checks: >
   -clang-analyzer-deadcode.DeadStores,
   -clang-analyzer-optin.cplusplus.VirtualCall,
   -clang-diagnostic-tautological-constant-compare,
-
-WarningsAsErrors: '*'
-
-HeaderFilterRegex: '^(?!.*(3rdparty|build)).*$'
 
@@ -287,21 +287,39 @@ jobs:
           echo "Clearing uv cache at ${UV_CACHE_DIR} due to failure."
           uv cache clean
 
-      - name: Run format check
-        id: format-check
+      - name: Run clang-tidy
+        id: clang-tidy
+        if: runner.os == 'Linux'
         run: |
-          mkdir -p build
+          echo "\$ $(command -v clang-tidy) --version" && clang-tidy --version
+
+          if [[ -x "$(command -v run-clang-tidy)" ]]; then
+            echo "Using run-clang-tidy from $(command -v run-clang-tidy)"
+            CLANG_TIDY=(run-clang-tidy)
+          else
+            echo "Downloading run-clang-tidy script"
+            wget -O run-clang-tidy.py https://raw.githubusercontent.com/llvm/llvm-project/refs/heads/release/21.x/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py
+            CLANG_TIDY=(uv run --no-project --script -- run-clang-tidy.py)
+          fi
+          if [[ -x "$(command -v clang-apply-replacements)" ]]; then
+            echo "Using clang-apply-replacements from $(command -v clang-apply-replacements)"
+            CLANG_TIDY+=(-fix -clang-apply-replacements-binary="$(command -v clang-apply-replacements)")
+          else
+            echo "::warning::clang-apply-replacements not found in PATH, automatic fixing disabled."
+          fi
+
           # Run cmake to create the build directory with compile_commands.json
-          (
-            cd build
-            cmake .. ${CLANG_TIDY_CMAKE_OPTIONS}  # no quotes here
-          )
+          cmake -S . -B cmake-build --fresh ${CLANG_TIDY_CMAKE_OPTIONS}  # no quotes here
+          # Group the name tests: without \( \), -type f binds only to the first -iname.
+          CXX_FILES=$(find src -type f \( -iname "*.[ch]pp" -o -iname "*.cc" -o -iname "*.c" -o -iname "*.h" \))
           rc=0
-          bash format.sh || rc="$?"
-          rm -rf build
-          if [[ "${rc}" -ne 0 ]]; then
-            echo "::error::Format check failed. Please run 'bash format.sh' locally to fix the issues."
-            exit 1
+          "${CLANG_TIDY[@]}" -clang-tidy-binary="$(command -v clang-tidy)" \
+            -p="cmake-build" ${CXX_FILES} || rc="$?"
+          rm -rf cmake-build run-clang-tidy.py
+          if (( rc != 0 )); then
+            echo "::error::clang-tidy found issues (exit code: ${rc}). Please run 'clang-tidy --fix' locally to fix them."
+            git diff --color=always || true
+            exit "${rc}"
           fi
 
       - name: Enable core dump generation (Linux / GitHub-hosted runners)
 
@@ -97,3 +97,7 @@ tilelang/jit/adapter/cython/.cycache
 
 # claude
 **/.claude
+
+# CMake
+cmake-build/
+cmake-build-*/
@@ -32,7 +32,7 @@ repos:
         args: [--ignore-case]
         files: ^docs/spelling_wordlist\.txt$
   - repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v15.0.7  # sync with requirements-lint.txt
+    rev: v21.1.2  # sync with requirements-lint.txt
     hooks:
       - id: clang-format
         exclude: |
@@ -41,7 +41,7 @@ repos:
             ^.+\.json$
           )
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.14.0  # sync with requirements-lint.txt
+    rev: v0.14.1  # sync with requirements-lint.txt
     hooks:
       - id: ruff-check
         args: [--fix, --exit-non-zero-on-fix]
 
@@ -0,0 +1,53 @@
+# Mamba2_chunk_scan Benchmark
+
+This document records the throughput achieved by `benchmark_mamba_chunk_scan.py` with `batch = 8`, `heads = 80`, `groups = 1`, `chunk_size = 256`, `dim = 64`, and `dstate = 128` across a range of `seq_len` values, using the default autotuning search space.
+
+## Environment
+
+- Repository commit: `8a5eb569704bfea64478c29adcfe3a09e3c2b12c`
+- GPUs: `NVIDIA H800 SXM` on driver `560.35.05`
+
+## How to Reproduce
+
+```bash
+cd benchmark/mamba2
+python - <<'PY'
+from benchmark_mamba_chunk_scan import chunk_scan_fwd
+
+batch = 8
+heads = 80
+groups = 1
+chunk_size = 256
+dim = 64
+dstate = 128
+for seq_len in [1024, 2048, 4096, 8192, 16384, 32768]:
+    res = chunk_scan_fwd(
+            batch,
+            seq_len,
+            chunk_size,
+            groups,
+            heads,
+            dim,
+            dstate)
+    tflops = (2 * batch * seq_len * chunk_size * heads * dim * 0.5 + 2 * batch * seq_len * heads * dim * dstate) / res.latency * 1e-9
+    print(f"seq_len={seq_len:5d}  latency={res.latency:.6f}ms  TFlops={tflops:.3f}")
+PY
+```
+
+## Results
+
+| Seq_len | Latency (ms) | Throughput (TFLOPS) |
+|---------|--------------|---------------------|
+|    1024 | 0.169        | 126.477             |
+|    2048 | 0.329        | 130.195             |
+|    4096 | 0.645        | 133.054             |
+|    8192 | 1.278        | 134.362             |
+|   16384 | 2.531        | 135.711             |
+|   32768 | 5.076        | 135.379             |
+
+<figure style="text-align: center">
+  <a href="mamba_benchmark_result.png">
+    <img src="mamba_benchmark_result.png" alt="Mamba2_chunk_scan Performance Comparison on H100">
+   </a>
+  <figcaption style="text-align: center;">Performance comparison across compilers on NVIDIA H100</figcaption>
+</figure>
@@ -0,0 +1,223 @@
+import argparse
+import torch
+import tilelang
+from tilelang.autotuner import *
+import tilelang.language as T
+from einops import rearrange, repeat
+import itertools
+
+
+def ref_program(cb, x, dt, dA_cumsum, C, prev_states, D):
+    """PyTorch reference for the chunk-scan forward pass.
+
+    Argument:
+        cb: (batch, nchunks, ngroups, chunk_size, chunk_size)
+        x: (batch, seqlen, nheads, headdim)
+        dt: (batch, nheads, nchunks, chunk_size)
+        dA_cumsum: (batch, nheads, nchunks, chunk_size)
+        C: (batch, seqlen, ngroups, dstate)
+        prev_states: (batch, nchunks, nheads, headdim, dstate)
+        D: (nheads, headdim) or (nheads,); None skips the `x * D` term
+    Return:
+        out: (batch, seqlen, nheads, headdim)
+    """
+    _, _, ngroups, _, _ = cb.shape
+    batch, seqlen, nheads, headdim = x.shape
+    # _, _, ngroups, dstate = B.shape
+    # assert B.shape == (batch, seqlen, ngroups, dstate)
+    _, _, nchunks, chunk_size = dt.shape
+    assert seqlen == nchunks * chunk_size
+    # assert C.shape == B.shape
+    # B = repeat(B, "b l g d -> b l (g h) d", h=nheads // ngroups)
+    C = repeat(C, "b l g d -> b l (g h) d", h=nheads // ngroups)  # expand groups to heads
+    cb = repeat(cb, "b c g l s -> b c (g h) l s", h=nheads // ngroups)  # expand groups to heads
+    # CB = torch.einsum("bclhn,bcshn->bchls", rearrange(C, "b (c l) h n -> b c l h n", c=nchunks),
+    #                   rearrange(B, "b (c s) h n -> b c s h n", c=nchunks))
+    # (batch, nheads, nchunks, chunksize, chunksize)
+    dt_segment_sum = dA_cumsum[:, :, :, :, None] - dA_cumsum[:, :, :, None, :]
+    decay = torch.exp(dt_segment_sum)
+    scores_decay = cb * rearrange(decay, "b h c l s -> b c h l s")
+    causal_mask = torch.tril(
+        torch.ones(chunk_size, chunk_size, device=x.device, dtype=bool), diagonal=0)
+    scores_decay = scores_decay.masked_fill(~causal_mask, 0)  # zero entries above the diagonal
+    out = torch.einsum('bchls,bhcs,bcshp->bclhp', scores_decay.to(x.dtype), dt.to(x.dtype),
+                       rearrange(x, "b (c s) h p -> b c s h p", c=nchunks))
+    state_decay_out = torch.exp(rearrange(dA_cumsum, "b h c l -> b c l h 1"))
+    out_prev = torch.einsum('bclhn,bchpn->bclhp', rearrange(
+        C, "b (c l) h n -> b c l h n", c=nchunks), prev_states.to(C.dtype)) * state_decay_out
+    out = out + out_prev  # intra-chunk term plus the term from prev_states
+    out = rearrange(out, "b c l h p -> b (c l) h p")
+    if D is not None:
+        if D.dim() == 1:
+            D = rearrange(D, "h -> h 1")  # add a trailing axis so it broadcasts over headdim
+        out = out + x * D
+    return out
+
+
+def get_configs():
+    iter_params = dict(
+        block_M=[64, 128, 256],
+        block_N=[32, 64],
+        block_K=[64, 128, 256],
+        block_Dstate=[128],
+        num_stages=[1, 2, 3, 4, 5])
+    return [dict(zip(iter_params, values)) for values in itertools.product(*iter_params.values())]
+
+
+@autotune(configs=get_configs(), warmup=10, rep=10)
+@tilelang.jit(
+    out_idx=[7],  # position 7 in main's parameter list is Output
+    pass_configs={
+        tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True,
+    },
+)
+def chunk_scan_fwd(batch,
+                   seqlen,
+                   chunk_size,
+                   ngroups,
+                   nheads,
+                   headdim,
+                   dstate,
+                   block_M=64,
+                   block_N=64,
+                   block_K=64,
+                   block_Dstate=128,
+                   num_stages=2,
+                   threads=128):
+    dtype = "float16"
+    accum_dtype = "float"
+    nchunks = T.ceildiv(seqlen, chunk_size)
+    p = 1.44269504  # log2(e), so exp(v) can be evaluated as exp2(v * p)
+
+    @T.prim_func
+    def main(
+            cb: T.Tensor((batch, nchunks, ngroups, chunk_size, chunk_size), dtype),  # type: ignore
+            x: T.Tensor((batch, seqlen, nheads, headdim), dtype),  # type: ignore
+            dt: T.Tensor((batch, nheads, nchunks, chunk_size), dtype),  # type: ignore
+            dA_cumsum: T.Tensor((batch, nheads, nchunks, chunk_size), dtype),  # type: ignore
+            C: T.Tensor((batch, seqlen, ngroups, dstate), dtype),  # type: ignore
+            prev_states: T.Tensor((batch, nchunks, nheads, headdim, dstate), dtype),  # type: ignore
+            D: T.Tensor((nheads), dtype),  # type: ignore
+            Output: T.Tensor((batch, seqlen, nheads, headdim), dtype)  # type: ignore
+    ):
+        with T.Kernel(
+                nheads,
+                T.ceildiv(chunk_size, block_M) * T.ceildiv(headdim, block_N),
+                batch * nchunks,
+                threads=threads) as (bz, bx, by):  # bz: head, bx: (m, n) tile, by: (batch, chunk)
+            acc_o = T.alloc_fragment((block_M, block_N), accum_dtype)
+            acc_o_shared = T.alloc_shared((block_M, block_N), dtype)
+            cb_shared = T.alloc_shared((block_M, block_K), dtype, scope="shared.dyn")
+            cb_local = T.alloc_fragment((block_M, block_K), dtype)
+            dA_cs_k_shared = T.alloc_shared((block_K), dtype, scope="shared")
+            dA_cs_k_local = T.alloc_fragment((block_K), accum_dtype)
+            dA_cs_m_local = T.alloc_fragment((block_M), accum_dtype)
+            dt_shared = T.alloc_shared((block_K), dtype, scope="shared")
+            dt_local = T.alloc_fragment((block_K), accum_dtype)
+            x_shared = T.alloc_shared((block_K, block_N), dtype, scope="shared.dyn")
+            dA_cs_m_shared = T.alloc_shared((block_M), dtype, scope="shared")
+            scale_m_local = T.alloc_fragment((block_M), accum_dtype)
+            C_shared = T.alloc_shared((block_M, block_Dstate), dtype)
+            prev_state_shared = T.alloc_shared((block_N, block_Dstate), dtype)
+            D_local = T.alloc_fragment((1), accum_dtype)
+            x_residual_shared = T.alloc_shared((block_M, block_N), dtype, scope="shared.dyn")
+            x_residual_local = T.alloc_fragment((block_M, block_N), accum_dtype)
+
+            batch_idx = by % batch
+            chunk_idx = by // batch
+            # m_idx: tile index along chunk_size (rows of the output tile)
+            # n_idx: tile index along headdim (columns of the output tile)
+            m_idx = bx // T.ceildiv(headdim, block_N)
+            n_idx = bx % T.ceildiv(headdim, block_N)
+
+            T.annotate_layout({
+                acc_o_shared: tilelang.layout.make_swizzled_layout(acc_o_shared),
+                cb_shared: tilelang.layout.make_swizzled_layout(cb_shared),
+                x_residual_shared: tilelang.layout.make_swizzled_layout(x_residual_shared)
+            })
+
+            T.no_set_max_nreg()
+
+            T.copy(dA_cumsum[batch_idx, bz, chunk_idx, m_idx * block_M:(m_idx + 1) * block_M],
+                   dA_cs_m_shared)
+            T.copy(dA_cs_m_shared, dA_cs_m_local)
+            T.clear(acc_o)
+
+            for i in T.Parallel(block_M):
+                scale_m_local[i] = T.exp2(dA_cs_m_local[i] * p)  # exp(dA_cs_m) via exp2
+            T.copy(
+                C[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size +
+                  (m_idx + 1) * block_M, bz // (nheads // ngroups), 0:block_Dstate], C_shared)
+            T.copy(
+                prev_states[batch_idx, chunk_idx, bz, n_idx * block_N:(n_idx + 1) * block_N,
+                            0:block_Dstate], prev_state_shared)
+            T.gemm(C_shared, prev_state_shared, acc_o, transpose_B=True)  # acc_o += C_tile @ prev_state_tile^T
+            for i, j in T.Parallel(block_M, block_N):
+                acc_o[i, j] *= scale_m_local[i]
+
+            loop_range = T.ceildiv((m_idx + 1) * block_M, block_K)  # later K tiles are fully masked out below
+
+            for k in T.Pipelined(loop_range, num_stages=num_stages):
+                T.copy(
+                    cb[batch_idx, chunk_idx, bz // (nheads // ngroups),
+                       m_idx * block_M:(m_idx + 1) * block_M, k * block_K:(k + 1) * block_K],
+                    cb_shared)
+                T.copy(cb_shared, cb_local)
+                T.copy(dA_cumsum[batch_idx, bz, chunk_idx, k * block_K:(k + 1) * block_K],
+                       dA_cs_k_shared)
+                T.copy(dA_cs_k_shared, dA_cs_k_local)
+                for i, j in T.Parallel(block_M, block_K):
+                    cb_local[i,
+                             j] = cb_local[i,
+                                           j] * T.exp2(dA_cs_m_local[i] * p - dA_cs_k_local[j] * p)
+                T.copy(dt[batch_idx, bz, chunk_idx, k * block_K:(k + 1) * block_K], dt_shared)
+                T.copy(dt_shared, dt_local)
+                for i, j in T.Parallel(block_M, block_K):
+                    cb_local[i, j] *= dt_local[j]
+                for i, j in T.Parallel(block_M, block_K):  # causal mask: keep row >= column
+                    cb_local[i, j] = T.if_then_else(m_idx * block_M + i >= k * block_K + j,
+                                                    cb_local[i, j], 0)
+                T.copy(
+                    x[batch_idx, chunk_idx * chunk_size + k * block_K:chunk_idx * chunk_size +
+                      (k + 1) * block_K, bz, n_idx * block_N:(n_idx + 1) * block_N], x_shared)
+                T.gemm(cb_local, x_shared, acc_o)
+
+            D_local[0] = D[bz]  # per-head scalar for the residual x * D term
+            T.copy(
+                x[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size +
+                  (m_idx + 1) * block_M, bz, n_idx * block_N:(n_idx + 1) * block_N],
+                x_residual_shared)
+            T.copy(x_residual_shared, x_residual_local)
+            for i, j in T.Parallel(block_M, block_N):
+                acc_o[i, j] += x_residual_local[i, j] * D_local[0]
+
+            T.copy(acc_o, acc_o_shared)
+            T.copy(
+                acc_o_shared,
+                Output[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size +
+                       (m_idx + 1) * block_M, bz, n_idx * block_N:(n_idx + 1) * block_N])
+
+    return main
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--batch', type=int, default=8, help='batch size')
+    parser.add_argument('--heads', type=int, default=80, help='heads')
+    parser.add_argument('--groups', type=int, default=1, help='groups')
+    parser.add_argument('--seq_len', type=int, default=4096, help='sequence length')
+    parser.add_argument('--chunk_size', type=int, default=256, help='chunk size')
+    parser.add_argument('--dim', type=int, default=64, help='dim')
+    parser.add_argument('--dstate', type=int, default=128, help='dstate')
+    parser.add_argument('--tune', action='store_true', help='tune configs')  # NOTE(review): parsed but never read below
+    args = parser.parse_args()
+    batch, heads, groups, seq_len, chunk_size, dim, dstate = args.batch, args.heads, args.groups, args.seq_len, args.chunk_size, args.dim, args.dstate
+    total_flops = 2 * batch * seq_len * chunk_size * heads * dim * 0.5 + 2 * batch * seq_len * heads * dim * dstate  # 0.5: presumably the causal half of the first product - confirm
+
+    kernel = chunk_scan_fwd(batch, seq_len, chunk_size, groups, heads, dim, dstate)
+    best_latency = kernel.latency
+    best_config = kernel.config
+    ref_latency = kernel.ref_latency  # NOTE(review): assigned but never used below
+    print(f"Best latency: {best_latency}")
+    print(f"Best TFlops: {total_flops / best_latency * 1e-9}")  # TFLOPS only if latency is in ms - confirm
+    print(f"Best config: {best_config}")
@@ -1,12 +1,10 @@
-# -*- coding: utf-8 -*-
-
 # General information about the project.
 project = "Tile Language <br>"
 author = "Tile Lang Contributors"
-copyright = "2025-2025, %s" % author
+copyright = f"2025-2025, {author}"  # same text as the old "%s" % author form
 
 # Version information.
-with open("../VERSION", "r") as f:
+with open("../VERSION") as f:  # open() defaults to mode "r", so behavior is unchanged
     version = f.read().strip()
 release = version