 import tilelang.language as T
 from einops import rearrange, repeat
 import itertools
+import math
+from tilelang.profiler import do_bench
+
+try:
+    from mamba_ssm.ops.triton.ssd_chunk_scan import _chunk_scan_fwd
+except ImportError as err:
+    raise ImportError("Please install mamba-ssm to use the triton chunk scan operator.") from err
+
+try:
+    import helion
+    from helion._testing import run_example
+    import helion.language as hl
+except ImportError as err:
+    raise ImportError("Please install helion to use the helion chunk scan operator.") from err


 def ref_program(cb, x, dt, dA_cumsum, C, prev_states, D):
@@ -54,6 +68,119 @@ def ref_program(cb, x, dt, dA_cumsum, C, prev_states, D):
     return out


+def chunk_scan_triton(cb, x, dt, dA_cumsum, C, states, D):
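+    # _chunk_scan_fwd returns a second, auxiliary tensor; only the main output is kept.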
+    out, _ = _chunk_scan_fwd(cb, x, dt, dA_cumsum, C, states, D)
+    return out
+
+
+def chunk_scan_helion(cb, x, dt, dA_cumsum, C, states, D):
+
+    @helion.kernel()
+    def helion_mamba2_chunk_scan_kernel(
+        cb: torch.Tensor,
+        x: torch.Tensor,
+        dt: torch.Tensor,
+        dA_cumsum: torch.Tensor,
+        C: torch.Tensor,
+        prev_states: torch.Tensor,
+        D: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Arguments:
+            cb: (batch, nchunks, ngroups, chunk_size, chunk_size)
+            x: (batch, seqlen, nheads, headdim)
+            dt: (batch, nheads, nchunks, chunk_size)
+            dA_cumsum: (batch, nheads, nchunks, chunk_size)
+            C: (batch, seqlen, ngroups, dstate)
+            prev_states: (batch, nchunks, nheads, headdim, dstate)
+            D: (nheads,)
+        Returns:
+            out: (batch, seqlen, nheads, headdim)
+        """
+
+        batch, nchunks, ngroups, chunk_size, _ = cb.shape
+        _, seqlen, nheads, headdim = x.shape
+        _, _, _, dstate = C.shape
+        assert nchunks == (seqlen + chunk_size - 1) // chunk_size
+
+        block_m = hl.register_block_size(chunk_size)
+        block_n = hl.register_block_size(headdim)
+        block_k = hl.register_block_size(64, 64)
+        dstate = hl.specialize(dstate)
+
+        assert cb.shape == (batch, nchunks, ngroups, chunk_size, chunk_size)
+        assert x.shape == (batch, seqlen, nheads, headdim)
+        assert dt.shape == (batch, nheads, nchunks, chunk_size)
+        assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
+        assert C.shape == (batch, seqlen, ngroups, dstate)
+        assert prev_states.shape == (batch, nchunks, nheads, headdim, dstate)
+        assert D.shape == (nheads,)
+
+        dtype = cb.dtype
+        accum_dtype = torch.float32
+        assert (x.dtype == dt.dtype == dA_cumsum.dtype == C.dtype == prev_states.dtype == D.dtype ==
+                dtype)
+
+        out = torch.empty_like(x)
+
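+        # log2(e): rewrites exp(x) as exp2(x * log2(e)), which maps to the faster hardware exp2.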
+        p = 1.44269504
+
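+        # Launch grid: one tile per (head, chunk-row block, headdim block, batch, chunk).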
+        for tile_h, tile_m, tile_n, tile_b, tile_c in hl.tile(
+                [nheads, chunk_size, headdim, batch, nchunks],
+                block_size=[1, block_m, block_n, 1, 1],
+        ):
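+            # Inter-chunk term: C_m @ prev_state, scaled by exp(dA_cumsum[m]).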
+            acc_o = hl.zeros([tile_m, tile_n], dtype=accum_dtype)
+            dA_cumsum_local_m = dA_cumsum[tile_b.begin, tile_h.begin, tile_c.begin,
+                                          tile_m].to(torch.float32)
+            scale_m_local = torch.exp2(dA_cumsum_local_m * p)
+
+            C_local = C[
+                tile_b.begin,
+                tile_m.index + tile_c.begin * chunk_size,
+                tile_h.begin // (nheads // ngroups),
+                :,
+            ]
+            prev_states_local = prev_states[tile_b.begin, tile_c.begin, tile_h.begin, tile_n, :]
+            acc_o = hl.dot(C_local, prev_states_local.T, acc=acc_o)
+            acc_o *= scale_m_local[:, None]
+
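+            # Intra-chunk term: causal matmul of (cb * dt) against x over the chunk prefix.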
+            for tile_k in hl.tile((tile_m.id + 1) * block_m, block_size=block_k):
+                cb_local = cb[
+                    tile_b.begin,
+                    tile_c.begin,
+                    tile_h.begin // (nheads // ngroups),
+                    tile_m,
+                    tile_k,
+                ]
+                dA_cumsum_local_k = dA_cumsum[tile_b.begin, tile_h.begin, tile_c.begin,
+                                              tile_k].to(torch.float32)
+                cb_local *= torch.exp2(dA_cumsum_local_m[:, None] * p -
+                                       dA_cumsum_local_k[None, :] * p)
+                dt_local = dt[tile_b.begin, tile_h.begin, tile_c.begin, tile_k].to(torch.float32)
+                cb_local = (cb_local * dt_local[None, :]).to(dtype)
+                pred = (tile_m.index + 0)[:, None] >= (tile_k.index + 0)[None, :]
+                cb_local = torch.where(pred, cb_local, torch.zeros_like(cb_local))
+                x_local = x[
+                    tile_b.begin,
+                    tile_c.begin * chunk_size + tile_k.index,
+                    tile_h.begin,
+                    tile_n,
+                ]
+                acc_o = hl.dot(cb_local, x_local, acc=acc_o)
+
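+            # Skip connection: add the D * x residual, then store the output tile.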
+            D_local = D[tile_h.begin].to(torch.float32)
+            x_residual = x[tile_b.begin, tile_c.begin * chunk_size + tile_m.index, tile_h.begin,
+                           tile_n].to(torch.float32)
+            acc_o += x_residual * D_local
+            out[tile_b.begin, tile_c.begin * chunk_size + tile_m.index, tile_h.begin,
+                tile_n] = acc_o.to(dtype=dtype)
+
+        return out
+
+    args = (cb, x, dt, dA_cumsum, C, states, D)
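+    # run_example validates the Helion kernel against ref_program and benchmarks both.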
+    run_example(helion_mamba2_chunk_scan_kernel, ref_program, args)
+
+
 def get_configs():
     iter_params = dict(
         block_M=[64, 128, 256],
@@ -212,12 +339,30 @@ def main(
     parser.add_argument('--tune', action='store_true', help='tune configs')
     args = parser.parse_args()
     batch, heads, groups, seq_len, chunk_size, dim, dstate = args.batch, args.heads, args.groups, args.seq_len, args.chunk_size, args.dim, args.dstate
+    nchunks = math.ceil(seq_len / chunk_size)
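+    # 0.5 factor: the intra-chunk (cb @ x) matmul is causal, so only ~half its entries are computed.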
     total_flops = 2 * batch * seq_len * chunk_size * heads * dim * 0.5 + 2 * batch * seq_len * heads * dim * dstate

+    print("Benchmarking TileLang...")
     kernel = chunk_scan_fwd(batch, seq_len, chunk_size, groups, heads, dim, dstate)
     best_latency = kernel.latency
     best_config = kernel.config
     ref_latency = kernel.ref_latency
     print(f"Best latency: {best_latency}")
     print(f"Best TFlops: {total_flops / best_latency * 1e-9}")
     print(f"Best config: {best_config}")
+
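+    # Random half-precision CUDA inputs shaped as the kernels expect.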
+    cb = torch.randn(batch, nchunks, groups, chunk_size, chunk_size).half().cuda()
+    x = torch.randn(batch, seq_len, heads, dim).half().cuda()
+    dt = torch.randn(batch, heads, nchunks, chunk_size).half().cuda()
+    dA_cumsum = torch.randn(batch, heads, nchunks, chunk_size).half().cuda()
+    C = torch.randn(batch, seq_len, groups, dstate).half().cuda()
+    states = torch.randn(batch, nchunks, heads, dim, dstate).half().cuda()
+    D = torch.randn(heads).half().cuda()
+
+    print("Benchmarking Triton...")
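+    # do_bench reports latency in milliseconds; flops / ms * 1e-9 converts to TFLOPs.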
+    triton_latency = do_bench(
+        lambda: chunk_scan_triton(cb, x, dt, dA_cumsum, C, states, D), _n_warmup=10, _n_repeat=10)
+    print(f"Triton TFlops: {total_flops / triton_latency * 1e-9}")
+
+    print("Benchmarking Helion...")
+    chunk_scan_helion(cb, x, dt, dA_cumsum, C, states, D)