53 changes: 48 additions & 5 deletions .github/workflows/pr-perfbench-bot.yml
@@ -6,7 +6,7 @@ on:
- created

permissions:
-  contents: read
+  contents: write

concurrency:
group: "${{ github.workflow }}-${{ github.ref }}"
@@ -16,7 +16,9 @@ env:
PYTHONDEVMODE: "1"
PYTHONUNBUFFERED: "1"
PYTHONPATH: "" # explicit cleanup
-  PIP_USER: "" # explicit cleanup
+  PIP_USER: "0"
+  PIP_NO_USER: "1"
+  PIP_DISABLE_PIP_VERSION_CHECK: "1"
COLUMNS: "100"
FORCE_COLOR: "1"
CLICOLOR_FORCE: "1"
@@ -72,17 +74,58 @@ jobs:
        run: |
          source tl/bin/activate
          python maint/scripts/ci_performance.py
      - name: Read markdown table
        id: read_md
        run: |
          echo "content<<EOF" >> $GITHUB_OUTPUT
          cat bench.md >> $GITHUB_OUTPUT
          echo "EOF" >> $GITHUB_OUTPUT
      - name: Upload PNG to GitHub and get URL
Comment on lines +77 to +83
Contributor

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

Image upload step will not produce a valid embeddable URL

There are three correctness issues in the new PNG upload/comment flow:

  1. Filename mismatch between tree entry and URL (definite bug).

    • Tree entry path: perf_plot_${context.runId}.png (Line 101).
    • Raw URL path: bench_${context.runId}.png (Line 108).
      The URL points to a file that does not exist in the created tree, so the image link in the PR comment will 404.
  2. Insufficient token permissions for git APIs.

    • Top-level workflow permissions set contents: read only, but git.createBlob / git.createTree need write-level repo contents permissions. As written, the upload step will likely fail with a 403.
  3. ci_performance.py is executed from main, not from the PR merge commit.

    • You checkout refs/pull/.../merge, then install the merged version, but later git checkout main and never switch back before running python maint/scripts/ci_performance.py. That means the script version used at runtime is the one from main, not the PR. The new bench.md/bench.png generation introduced in this PR won’t be exercised for open-PR /perf runs.

Suggested fixes

  • Make the tree path and URL consistent (choose one naming scheme). For example:

```diff
             const tree = await github.rest.git.createTree({
               owner: context.repo.owner,
               repo: context.repo.repo,
               tree: [{
-                path: `perf_plot_${context.runId}.png`,
+                path: `bench_${context.runId}.png`,
                 mode: '100644',
                 type: 'blob',
                 sha: blob.data.sha
               }]
             });
             const url = `https://raw.githubusercontent.com/${context.repo.owner}/${context.repo.repo}/${tree.data.sha}/bench_${context.runId}.png`
```

  • Relax or adjust the workflow permissions so the job can create blobs/trees, e.g.:

```diff
 permissions:
-  contents: read
+  contents: write
```

  • Ensure ci_performance.py runs from the PR merge commit so the new bench.md/bench.png logic is used. For instance:
    • Move git checkout main earlier and then re-checkout refs/pull/.../merge before the "Run performance test" step, or
    • Run the script with ./tl/bin/python and import it from the installed package instead of the working tree, or
    • After "Install original version", explicitly git checkout refs/pull/${{ github.event.issue.number }}/merge before the "Run performance test" step.

Also applies to: 81-109, 116-128


🏁 Script executed:

```shell
#!/bin/bash
# Locate and display the workflow file with line numbers
if [ -f ".github/workflows/pr-perfbench-bot.yml" ]; then
  echo "=== Workflow file found ==="
  cat -n ".github/workflows/pr-perfbench-bot.yml" | head -150
else
  echo "Workflow file not found at expected path"
  find . -name "pr-perfbench-bot.yml" -type f 2>/dev/null
fi
```

Length of output: 5204


🏁 Script executed:

```shell
#!/bin/bash
# Search for git.createBlob and git.createTree calls to understand the upload step
rg -n "createBlob|createTree|perf_plot|bench_\$\{context" ".github/workflows/" -A 2 -B 2
```

Length of output: 1189


🏁 Script executed:

```shell
#!/bin/bash
# Check for permissions block and git checkout sequences
rg -n "permissions:|git checkout|ci_performance.py" ".github/workflows/pr-perfbench-bot.yml" -B 1 -A 1
```

Length of output: 273


Image upload step will not produce a valid embeddable URL and will fail with permissions error

All three correctness issues in the new PNG upload/comment flow are confirmed and must be fixed:

  1. Filename mismatch between tree entry and URL (confirmed bug).

    • Line 101: Tree entry path uses perf_plot_${context.runId}.png
    • Line 108: Raw URL references bench_${context.runId}.png
    • The URL points to a file that does not exist in the created tree, causing a 404 on the embedded image.
  2. Insufficient token permissions for git APIs (confirmed bug).

    • Line 8-9: Workflow permissions set contents: read only
    • Lines 90 and 97: Calls to github.rest.git.createBlob and github.rest.git.createTree require write-level permissions
    • The upload step will fail with a 403 error.
  3. ci_performance.py executes from main, not the PR merge commit (confirmed bug).

    • Line 64: git checkout main switches to main branch
    • Lines 73-74: Script runs from the main branch working directory via the tl venv
    • The new bench.md/bench.png generation logic introduced in this PR will not be exercised for /perf runs.

Required fixes:

  • Make the tree path and URL consistent (both should use the bench_ prefix):

```diff
-                path: `perf_plot_${context.runId}.png`,
+                path: `bench_${context.runId}.png`,
```

  • Upgrade workflow permissions to allow blob/tree creation:

```diff
 permissions:
-  contents: read
+  contents: write
```
  • Ensure ci_performance.py runs from the PR merge commit: after line 68, add git checkout refs/pull/${{ github.event.issue.number }}/merge before the "Run performance test" step.
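The third fix can be sketched as an extra workflow step; the step name and placement are assumptions based on the diff above, not taken from the actual file:

```yaml
      # Hypothetical step: re-checkout the PR merge commit immediately before
      # "Run performance test", so ci_performance.py comes from the PR, not main.
      - name: Re-checkout PR merge commit
        run: |
          git fetch origin "refs/pull/${{ github.event.issue.number }}/merge"
          git checkout FETCH_HEAD
```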

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
.github/workflows/pr-perfbench-bot.yml lines 75-81 and surrounding steps: the
image upload flow has three confirmed issues — the tree entry filename and the
constructed raw URL are inconsistent (use perf_plot_ vs bench_), the workflow
permissions only grant contents: read while createBlob/createTree need write,
and the perf script is run from main instead of the PR merge commit; to fix,
make the filename used when creating the blob/tree and the URL construction use
the same bench_${{ github.run_id }} (or context.runId) prefix, update workflow
permissions to grant contents: write for the job (or at least for the step that
calls git APIs), and before the "Run performance test" step add a checkout to
refs/pull/${{ github.event.issue.number }}/merge so ci_performance.py runs
against the PR merge commit.

        id: upload_png
        uses: actions/github-script@v8
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            const fs = require('fs');
            const content = fs.readFileSync('bench.png').toString('base64');
            // Create blob in the repo
            const blob = await github.rest.git.createBlob({
              owner: context.repo.owner,
              repo: context.repo.repo,
              content: content,
              encoding: "base64",
            });
            // Attach blob as a tree item
            const tree = await github.rest.git.createTree({
              owner: context.repo.owner,
              repo: context.repo.repo,
              tree: [{
                path: `bench_${context.runId}.png`,
                mode: '100644',
                type: 'blob',
                sha: blob.data.sha
              }]
            });
            // Raw file URL (works for embedding image)
            const url = `https://raw.githubusercontent.com/${context.repo.owner}/${context.repo.repo}/${tree.data.sha}/bench_${context.runId}.png`
            core.setOutput("url", url);

      - name: Post test results as PR comment
        uses: actions/github-script@v8
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            const md = `${{ steps.read_md.outputs.content }}`;
            const img = `${{ steps.upload_png.outputs.url }}`;
            github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
-             body: '📊 **Performance Test Results** (triggered by @' + context.payload.comment.user.login + '):\n\n' +
-               'Run listed here: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}\n\n' +
-               "${{ steps.perfbench.outputs.stdout }}"
+             body:
+               '📊 **Performance Test Results** (triggered by @' +
+               context.payload.comment.user.login + ')\n\n' +
+               'Run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}\n\n' +
+               md +
+               '\n\n📈 **Speedup Plot:**\n\n' +
+               `![Speedup Plot](${img})`
            })
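The "Read markdown table" step above relies on the heredoc-delimiter syntax GitHub Actions uses for multi-line values in $GITHUB_OUTPUT. A minimal local sketch of the same mechanism (temp files stand in for the real output file and bench.md, whose table content here is invented):

```shell
# Simulate the multi-line GITHUB_OUTPUT write from the workflow step.
GITHUB_OUTPUT="$(mktemp)"
BENCH_MD="$(mktemp)"
printf '| kernel | speedup |\n| --- | --- |\n' > "$BENCH_MD"
{
  echo "content<<EOF"   # open the delimited multi-line value
  cat "$BENCH_MD"       # the value itself (the markdown table)
  echo "EOF"            # close the delimiter
} >> "$GITHUB_OUTPUT"
cat "$GITHUB_OUTPUT"
```

GitHub Actions parses everything between `content<<EOF` and the closing `EOF` line as the value of `steps.read_md.outputs.content`.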
15 changes: 15 additions & 0 deletions examples/analyze/bench_example_analyze.py
@@ -0,0 +1,15 @@
import tilelang.tools.bench
import example_conv_analyze
import example_gemm_analyze


def bench_example_gemm_analyze():
tilelang.tools.bench.process_func(example_gemm_analyze.main)


def bench_example_conv_analyze():
tilelang.tools.bench.process_func(example_conv_analyze.main)


if globals().get("__name__") == "__main__":
tilelang.tools.bench.main()
52 changes: 52 additions & 0 deletions examples/attention_sink/bench_example_attention_sink.py
@@ -0,0 +1,52 @@
import tilelang.tools.bench
import example_gqa_sink_bwd_bhsd
import example_gqa_sink_fwd_bhsd_wgmma_pipelined
import example_mha_sink_bwd_bhsd
import example_mha_sink_fwd_bhsd
import example_mha_sink_fwd_bhsd_wgmma_pipelined


def bench_example_mha_sink_fwd_bhsd():
tilelang.tools.bench.process_func(example_mha_sink_fwd_bhsd.main)


def bench_example_mha_sink_fwd_bhsd_sliding_window():
tilelang.tools.bench.process_func(example_mha_sink_fwd_bhsd.main, window_size=128)


def bench_example_mha_sink_fwd_bhsd_wgmma_pipelined():
tilelang.tools.bench.process_func(example_mha_sink_fwd_bhsd_wgmma_pipelined.main)


def bench_example_mha_sink_fwd_bhsd_wgmma_pipelined_sliding_window():
tilelang.tools.bench.process_func(
example_mha_sink_fwd_bhsd_wgmma_pipelined.main, window_size=128)


def bench_example_gqa_sink_fwd_bhsd_wgmma_pipelined():
tilelang.tools.bench.process_func(example_gqa_sink_fwd_bhsd_wgmma_pipelined.main)


def bench_example_gqa_sink_fwd_bhsd_wgmma_pipelined_sliding_window():
tilelang.tools.bench.process_func(
example_gqa_sink_fwd_bhsd_wgmma_pipelined.main, window_size=128)


def bench_example_mha_sink_bwd_bhsd():
tilelang.tools.bench.process_func(example_mha_sink_bwd_bhsd.main)


def bench_example_mha_sink_bwd_bhsd_sliding_window():
tilelang.tools.bench.process_func(example_mha_sink_bwd_bhsd.main, window_size=128)


def bench_example_gqa_sink_bwd_bhsd():
tilelang.tools.bench.process_func(example_gqa_sink_bwd_bhsd.main)


def bench_example_gqa_sink_bwd_bhsd_sliding_window():
tilelang.tools.bench.process_func(example_gqa_sink_bwd_bhsd.main, window_size=128)


if globals().get("__name__") == "__main__":
tilelang.tools.bench.main()
@@ -0,0 +1,55 @@
import tilelang.tools.bench
import block_sparse_attn_triton
import example_tilelang_block_sparse_attn
import example_tilelang_sparse_gqa_decode_varlen_indice
import example_tilelang_sparse_gqa_decode_varlen_mask
import example_triton_sparse_gqa_decode_varlen_indice
import example_triton_sparse_gqa_decode_varlen_mask


def bench_block_sparse_attn_triton():
tilelang.tools.bench.process_func(block_sparse_attn_triton.main)


def bench_example_tilelang_block_sparse_attn():
tilelang.tools.bench.process_func(example_tilelang_block_sparse_attn.main)


def bench_example_tilelang_sparse_gqa_decode_varlen_indice():
tilelang.tools.bench.process_func(
example_tilelang_sparse_gqa_decode_varlen_indice.main, batch=1, max_cache_seqlen=2048)


def bench_example_tilelang_sparse_gqa_decode_varlen_mask():
tilelang.tools.bench.process_func(
example_tilelang_sparse_gqa_decode_varlen_mask.main, batch=1, max_cache_seqlen=2048)


def bench_example_triton_sparse_gqa_decode_varlen_indice():
tilelang.tools.bench.process_func(
example_triton_sparse_gqa_decode_varlen_indice.main,
batch=8,
heads=8,
heads_kv=4,
max_cache_seqlen=2048,
dim=128,
dim_v=128,
sparse_ratio=0.8,
block_size=32)


def bench_example_triton_sparse_gqa_decode_varlen_mask():
tilelang.tools.bench.process_func(
example_triton_sparse_gqa_decode_varlen_mask.main,
batch=8,
heads=8,
heads_kv=4,
max_cache_seqlen=2048,
dim=128,
dim_v=128,
sparse_ratio=0.8,
block_size=32)


if globals().get("__name__") == "__main__":
tilelang.tools.bench.main()
10 changes: 10 additions & 0 deletions examples/blocksparse_gemm/bench_example_blocksparse_gemm.py
@@ -0,0 +1,10 @@
import tilelang.tools.bench
import example_blocksparse_gemm


def bench_example_blocksparse_gemm():
tilelang.tools.bench.process_func(example_blocksparse_gemm.main)


if globals().get("__name__") == "__main__":
tilelang.tools.bench.main()
21 changes: 21 additions & 0 deletions examples/cast/bench_example_cast.py
@@ -0,0 +1,21 @@
import tilelang.tools.bench
import example_group_per_split_token_cast_to_fp8
import example_per_token_cast_to_fp8


def bench_example_group_per_split_token_cast_to_fp8():
tilelang.tools.bench.process_func(
example_group_per_split_token_cast_to_fp8.main,
M=1024,
N=1024,
BG=2,
blk_m=4,
batch_sizes=[128, 896])


def bench_example_per_token_cast_to_fp8():
tilelang.tools.bench.process_func(example_per_token_cast_to_fp8.main, M=2048, N=512, blk_m=8)


if globals().get("__name__") == "__main__":
tilelang.tools.bench.main()
15 changes: 15 additions & 0 deletions examples/convolution/bench_example_convolution.py
@@ -0,0 +1,15 @@
import tilelang.tools.bench
import example_convolution
import example_convolution_autotune


def bench_example_convolution():
tilelang.tools.bench.process_func(example_convolution.main)


def bench_example_convolution_autotune():
tilelang.tools.bench.process_func(example_convolution_autotune.main)


if globals().get("__name__") == "__main__":
tilelang.tools.bench.main()
10 changes: 10 additions & 0 deletions examples/deepseek_deepgemm/bench_example_deepgemm_fp8_2xAcc.py
@@ -0,0 +1,10 @@
import tilelang.tools.bench
import example_deepgemm_fp8_2xAcc
Comment on lines +1 to +2
Contributor

⚠️ Potential issue | 🔴 Critical

Missing import for decorator module.

Lines 5-6 use @tilelang.testing.requires_cuda and @tilelang.testing.requires_cuda_compute_version_eq decorators, but tilelang.testing is not imported.

Add the missing import:

```diff
 import tilelang.tools.bench
+import tilelang.testing
 import example_deepgemm_fp8_2xAcc
```
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change

```diff
 import tilelang.tools.bench
+import tilelang.testing
 import example_deepgemm_fp8_2xAcc
```
🤖 Prompt for AI Agents
In examples/deepseek_deepgemm/bench_example_deepgemm_fp8_2xAcc.py around lines 1
to 6, the decorators @tilelang.testing.requires_cuda and
@tilelang.testing.requires_cuda_compute_version_eq are used but tilelang.testing
is not imported; add an import for the testing module (e.g., import
tilelang.testing or from tilelang import testing) near the top of the file
before the decorator usage so the decorators resolve correctly.



def bench_example_deepgemm_fp8_2xAcc():
tilelang.tools.bench.process_func(example_deepgemm_fp8_2xAcc.main)


if globals().get("__name__") == "__main__":
tilelang.tools.bench.main()
10 changes: 10 additions & 0 deletions examples/deepseek_mla/bench_example_mla_decode.py
@@ -0,0 +1,10 @@
import tilelang.tools.bench
import example_mla_decode
Comment on lines +1 to +2
Contributor

⚠️ Potential issue | 🔴 Critical

Critical: Missing import for decorators.

The decorators on lines 5-6 reference tilelang.testing but the module is not imported, which will cause a NameError when the module loads.

Apply this diff to add the missing import:

```diff
 import tilelang.tools.bench
+import tilelang.testing
 import example_mla_decode
```
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change

```diff
 import tilelang.tools.bench
+import tilelang.testing
 import example_mla_decode
```
🤖 Prompt for AI Agents
In examples/deepseek_mla/bench_example_mla_decode.py around lines 1 to 2, the
decorators later reference tilelang.testing but the module isn't imported,
causing a NameError; add an import for the testing module (e.g., import
tilelang.testing) at the top of the file so the decorators can resolve
correctly.



def bench_example_mla_decode():
tilelang.tools.bench.process_func(example_mla_decode.main)


if globals().get("__name__") == "__main__":
tilelang.tools.bench.main()
15 changes: 15 additions & 0 deletions examples/deepseek_nsa/bench_example_tilelang_nsa.py
@@ -0,0 +1,15 @@
import tilelang.tools.bench
import example_tilelang_nsa_fwd
import example_tilelang_nsa_decode
Comment on lines +2 to +3
Contributor

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

Verify that the example modules are importable.

The relative imports assume the example modules are in the same directory or on PYTHONPATH. Ensure that these modules exist and can be imported when the script is run.

Run the following script to verify the modules exist:


🏁 Script executed:

```shell
#!/bin/bash
# Description: Verify the example modules exist in the expected location

# Check if the example modules exist
fd -t f "example_tilelang_nsa_fwd.py" examples/deepseek_nsa/
fd -t f "example_tilelang_nsa_decode.py" examples/deepseek_nsa/
```

Length of output: 102



---

🏁 Script executed:

```shell
#!/bin/bash
# Check package structure and verify imports
fd -t f "__init__.py" examples/deepseek_nsa/
fd -t f "__init__.py" examples/

# Check the actual bench_example file structure
head -20 examples/deepseek_nsa/bench_example_tilelang_nsa.py
```

Length of output: 400



---

🏁 Script executed:

```shell
#!/bin/bash
# Check for setup files or documentation about running examples
fd -t f "setup.py|pyproject.toml|README|run" examples/deepseek_nsa/ -o examples/

# Check if there's any sys.path manipulation in the script or parent modules
rg -n "sys.path" examples/deepseek_nsa/
```

Length of output: 137



---

🏁 Script executed:

```shell
#!/bin/bash
# Search for setup/documentation files
find examples/deepseek_nsa -type f -name "*.py" | head -20
find examples -maxdepth 2 -type f \( -name "setup.py" -o -name "pyproject.toml" -o -name "README*" \)

# Check for sys.path manipulation
rg -n "sys.path" examples/
```

Length of output: 2643



---

🏁 Script executed:

```shell
#!/bin/bash
# Check how test_example_tilelang_nsa.py handles imports
head -10 examples/deepseek_nsa/test_example_tilelang_nsa.py

# Also check if any other benchmark files exist and how they handle imports
find examples/deepseek_nsa -name "bench_*.py" -type f
head -10 examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py 2>/dev/null || echo "File not found or error"
```

Length of output: 464


Fix bare imports to use relative syntax or add sys.path setup.

Lines 2-3 use bare absolute imports that will fail at runtime unless examples/deepseek_nsa/ is on sys.path. Since this directory is not a package (no __init__.py), use relative imports instead:

```python
from . import example_tilelang_nsa_fwd
from . import example_tilelang_nsa_decode
```

Alternatively, add to the top of the file:

```python
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
```

The test file (test_example_tilelang_nsa.py) in the same directory has the same issue.
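To illustrate why the sys.path variant works, a minimal self-contained sketch (the module name example_mod is hypothetical, standing in for a sibling example file):

```python
import importlib
import os
import sys
import tempfile

# Create a throwaway directory with a module in it, mimicking a sibling
# example file that is not on sys.path.
d = tempfile.mkdtemp()
with open(os.path.join(d, "example_mod.py"), "w") as f:
    f.write("def main():\n    return 'ok'\n")

# A bare `import example_mod` would fail here; inserting the directory
# at the front of sys.path is what makes the import resolvable.
sys.path.insert(0, d)
example_mod = importlib.import_module("example_mod")
print(example_mod.main())  # → ok
```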

🤖 Prompt for AI Agents
In examples/deepseek_nsa/bench_example_tilelang_nsa.py around lines 2-3 the bare
absolute imports will fail because the directory is not on sys.path and isn’t a
package; change to relative imports (from . import example_tilelang_nsa_fwd and
from . import example_tilelang_nsa_decode) or, if you prefer module-style
imports, add at the top: import sys, os and sys.path.insert(0,
os.path.dirname(__file__)) so the local modules can be found (apply the same fix
to test_example_tilelang_nsa.py).



def bench_example_tilelang_nsa_fwd():
tilelang.tools.bench.process_func(example_tilelang_nsa_fwd.main)


def bench_example_tilelang_nsa_fwd_decode():
tilelang.tools.bench.process_func(example_tilelang_nsa_decode.main)


if globals().get("__name__") == "__main__":
tilelang.tools.bench.main()
64 changes: 64 additions & 0 deletions examples/deepseek_v32/bench_tilelang_example_deepseek_v32.py
@@ -0,0 +1,64 @@
import tilelang.tools.bench
import fp8_lighting_indexer
import sparse_mla_bwd
import sparse_mla_fwd
import sparse_mla_fwd_pipelined
import topk_selector
Comment on lines +1 to +6
Contributor

⚠️ Potential issue | 🔴 Critical

Missing import for decorator module.

Lines 17-18, 23-24, and 29-30 use @tilelang.testing decorators, but tilelang.testing is not imported.

Add the missing import:

```diff
 import tilelang.tools.bench
+import tilelang.testing
 import fp8_lighting_indexer
 import sparse_mla_bwd
 import sparse_mla_fwd
 import sparse_mla_fwd_pipelined
 import topk_selector
```
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change

```diff
 import tilelang.tools.bench
+import tilelang.testing
 import fp8_lighting_indexer
 import sparse_mla_bwd
 import sparse_mla_fwd
 import sparse_mla_fwd_pipelined
 import topk_selector
```
🤖 Prompt for AI Agents
In examples/deepseek_v32/bench_tilelang_example_deepseek_v32.py around lines 1-6
(decorators used at lines ~17-30), the module providing the @tilelang.testing
decorator isn't imported; add an import for the testing module (e.g., import
tilelang.testing) near the other top-level imports so the @tilelang.testing
decorators resolve.



def bench_topk_selector():
tilelang.tools.bench.process_func(topk_selector.test_topk_selector)


def bench_fp8_lighting_indexer():
tilelang.tools.bench.process_func(
fp8_lighting_indexer.test_fp8_lighting_indexer,
S=512,
SKV=1024,
H=32,
HKV=1,
D=64,
kv_stride=1)


def bench_sparse_mla_fwd():
tilelang.tools.bench.process_func(
sparse_mla_fwd.test_sparse_mla_fwd,
S=256,
SKV=1024,
H=64,
HKV=1,
DQK=576,
DV=512,
topk=256,
check_correctness=False)


def bench_sparse_mla_fwd_pipelined():
tilelang.tools.bench.process_func(
sparse_mla_fwd_pipelined.test_sparse_mla_fwd_pipelined,
S=256,
SKV=512,
H=64,
HKV=1,
DQK=576,
DV=512,
topk=256,
check_correctness=False)


def bench_sparse_mla_bwd():
tilelang.tools.bench.process_func(
sparse_mla_bwd.test_sparse_mla_bwd,
S=256,
SKV=512,
H=64,
HKV=1,
DQKV=576,
DV=512,
topk=256,
check_correctness=False)


if globals().get("__name__") == "__main__":
tilelang.tools.bench.main()
35 changes: 35 additions & 0 deletions examples/dequantize_gemm/bench_example_dequantize_gemm.py
@@ -0,0 +1,35 @@
import tilelang.tools.bench
import example_dequant_gemm_bf16_mxfp4_hopper
import example_dequant_gemm_bf16_mxfp4_hopper_tma
import example_dequant_gemm_fp4_hopper
import example_dequant_gemm_w4a8
import example_dequant_gemv_fp16xint4
import example_dequant_groupedgemm_bf16_mxfp4_hopper


def bench_example_dequant_gemv_fp16xint4():
tilelang.tools.bench.process_func(example_dequant_gemv_fp16xint4.main)


def bench_example_dequant_gemm_fp4_hopper():
tilelang.tools.bench.process_func(example_dequant_gemm_fp4_hopper.main)


def bench_example_dequant_gemm_bf16_mxfp4_hopper():
tilelang.tools.bench.process_func(example_dequant_gemm_bf16_mxfp4_hopper.main)


def bench_example_dequant_gemm_bf16_mxfp4_hopper_tma():
tilelang.tools.bench.process_func(example_dequant_gemm_bf16_mxfp4_hopper_tma.main)


def bench_example_dequant_groupedgemm_bf16_mxfp4_hopper():
tilelang.tools.bench.process_func(example_dequant_groupedgemm_bf16_mxfp4_hopper.main)


def bench_example_dequant_gemm_w4a8():
tilelang.tools.bench.process_func(example_dequant_gemm_w4a8.main)


if globals().get("__name__") == "__main__":
tilelang.tools.bench.main()