diff --git a/.github/workflows/pr-perfbench-bot.yml b/.github/workflows/pr-perfbench-bot.yml index 37da4e3c8..c278391d9 100644 --- a/.github/workflows/pr-perfbench-bot.yml +++ b/.github/workflows/pr-perfbench-bot.yml @@ -6,7 +6,7 @@ on: - created permissions: - contents: read + contents: write concurrency: group: "${{ github.workflow }}-${{ github.ref }}" @@ -16,7 +16,9 @@ env: PYTHONDEVMODE: "1" PYTHONUNBUFFERED: "1" PYTHONPATH: "" # explicit cleanup - PIP_USER: "" # explicit cleanup + PIP_USER: "0" + PIP_NO_USER: "1" + PIP_DISABLE_PIP_VERSION_CHECK: "1" COLUMNS: "100" FORCE_COLOR: "1" CLICOLOR_FORCE: "1" @@ -72,17 +74,58 @@ jobs: run: | source tl/bin/activate python maint/scripts/ci_performance.py + - name: Read markdown table + id: read_md + run: | + echo "content<<EOF" >> $GITHUB_OUTPUT + cat bench.md >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + - name: Upload PNG to GitHub and get URL + id: upload_png + uses: actions/github-script@v8 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const fs = require('fs'); + const content = fs.readFileSync('bench.png').toString('base64'); + // Create blob in the repo + const blob = await github.rest.git.createBlob({ + owner: context.repo.owner, + repo: context.repo.repo, + content: content, + encoding: "base64", + }); + // Attach blob as a tree item + const tree = await github.rest.git.createTree({ + owner: context.repo.owner, + repo: context.repo.repo, + tree: [{ + path: `bench_${context.runId}.png`, + mode: '100644', + type: 'blob', + sha: blob.data.sha + }] + }); + // Raw file URL (works for embedding image) + const url = `https://raw.githubusercontent.com/${context.repo.owner}/${context.repo.repo}/${tree.data.sha}/bench_${context.runId}.png` + core.setOutput("url", url); - name: Post test results as PR comment uses: actions/github-script@v8 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | + const md = `${{ steps.read_md.outputs.content }}`; + const img = `${{ steps.upload_png.outputs.url }}`; 
github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, - body: 'šŸ“Š ​**Performance Test Results** (triggered by @' + context.payload.comment.user.login + '):\n\n' + - 'Run listed here: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}\n\n' + - "${{ steps.perfbench.outputs.stdout }}" + body: + 'šŸ“Š **Performance Test Results** (triggered by @' + + context.payload.comment.user.login + ')\n\n' + + 'Run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}\n\n' + + md + + '\n\nšŸ“ˆ **Speedup Plot:**\n\n' + + `![Speedup Plot](${img})` }) diff --git a/examples/analyze/bench_example_analyze.py b/examples/analyze/bench_example_analyze.py new file mode 100644 index 000000000..775b9bccd --- /dev/null +++ b/examples/analyze/bench_example_analyze.py @@ -0,0 +1,15 @@ +import tilelang.tools.bench +import example_conv_analyze +import example_gemm_analyze + + +def bench_example_gemm_analyze(): + tilelang.tools.bench.process_func(example_gemm_analyze.main) + + +def bench_example_conv_analyze(): + tilelang.tools.bench.process_func(example_conv_analyze.main) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/attention_sink/bench_example_attention_sink.py b/examples/attention_sink/bench_example_attention_sink.py new file mode 100644 index 000000000..2c281229b --- /dev/null +++ b/examples/attention_sink/bench_example_attention_sink.py @@ -0,0 +1,52 @@ +import tilelang.tools.bench +import example_gqa_sink_bwd_bhsd +import example_gqa_sink_fwd_bhsd_wgmma_pipelined +import example_mha_sink_bwd_bhsd +import example_mha_sink_fwd_bhsd +import example_mha_sink_fwd_bhsd_wgmma_pipelined + + +def bench_example_mha_sink_fwd_bhsd(): + tilelang.tools.bench.process_func(example_mha_sink_fwd_bhsd.main) + + +def bench_example_mha_sink_fwd_bhsd_sliding_window(): + 
tilelang.tools.bench.process_func(example_mha_sink_fwd_bhsd.main, window_size=128) + + +def bench_example_mha_sink_fwd_bhsd_wgmma_pipelined(): + tilelang.tools.bench.process_func(example_mha_sink_fwd_bhsd_wgmma_pipelined.main) + + +def bench_example_mha_sink_fwd_bhsd_wgmma_pipelined_sliding_window(): + tilelang.tools.bench.process_func( + example_mha_sink_fwd_bhsd_wgmma_pipelined.main, window_size=128) + + +def bench_example_gqa_sink_fwd_bhsd_wgmma_pipelined(): + tilelang.tools.bench.process_func(example_gqa_sink_fwd_bhsd_wgmma_pipelined.main) + + +def bench_example_gqa_sink_fwd_bhsd_wgmma_pipelined_sliding_window(): + tilelang.tools.bench.process_func( + example_gqa_sink_fwd_bhsd_wgmma_pipelined.main, window_size=128) + + +def bench_example_mha_sink_bwd_bhsd(): + tilelang.tools.bench.process_func(example_mha_sink_bwd_bhsd.main) + + +def bench_example_mha_sink_bwd_bhsd_sliding_window(): + tilelang.tools.bench.process_func(example_mha_sink_bwd_bhsd.main, window_size=128) + + +def bench_example_gqa_sink_bwd_bhsd(): + tilelang.tools.bench.process_func(example_gqa_sink_bwd_bhsd.main) + + +def bench_example_gqa_sink_bwd_bhsd_sliding_window(): + tilelang.tools.bench.process_func(example_gqa_sink_bwd_bhsd.main, window_size=128) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/blocksparse_attention/bench_example_blocksparse_attention.py b/examples/blocksparse_attention/bench_example_blocksparse_attention.py new file mode 100644 index 000000000..1085b6a5b --- /dev/null +++ b/examples/blocksparse_attention/bench_example_blocksparse_attention.py @@ -0,0 +1,55 @@ +import tilelang.tools.bench +import block_sparse_attn_triton +import example_tilelang_block_sparse_attn +import example_tilelang_sparse_gqa_decode_varlen_indice +import example_tilelang_sparse_gqa_decode_varlen_mask +import example_triton_sparse_gqa_decode_varlen_indice +import example_triton_sparse_gqa_decode_varlen_mask + + +def bench_block_sparse_attn_triton(): + 
tilelang.tools.bench.process_func(block_sparse_attn_triton.main) + + +def bench_example_tilelang_block_sparse_attn(): + tilelang.tools.bench.process_func(example_tilelang_block_sparse_attn.main) + + +def bench_example_tilelang_sparse_gqa_decode_varlen_indice(): + tilelang.tools.bench.process_func( + example_tilelang_sparse_gqa_decode_varlen_indice.main, batch=1, max_cache_seqlen=2048) + + +def bench_example_tilelang_sparse_gqa_decode_varlen_mask(): + tilelang.tools.bench.process_func( + example_tilelang_sparse_gqa_decode_varlen_mask.main, batch=1, max_cache_seqlen=2048) + + +def bench_example_triton_sparse_gqa_decode_varlen_indice(): + tilelang.tools.bench.process_func( + example_triton_sparse_gqa_decode_varlen_indice.main, + batch=8, + heads=8, + heads_kv=4, + max_cache_seqlen=2048, + dim=128, + dim_v=128, + sparse_ratio=0.8, + block_size=32) + + +def bench_example_triton_sparse_gqa_decode_varlen_mask(): + tilelang.tools.bench.process_func( + example_triton_sparse_gqa_decode_varlen_mask.main, + batch=8, + heads=8, + heads_kv=4, + max_cache_seqlen=2048, + dim=128, + dim_v=128, + sparse_ratio=0.8, + block_size=32) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/blocksparse_gemm/bench_example_blocksparse_gemm.py b/examples/blocksparse_gemm/bench_example_blocksparse_gemm.py new file mode 100644 index 000000000..13d8ba983 --- /dev/null +++ b/examples/blocksparse_gemm/bench_example_blocksparse_gemm.py @@ -0,0 +1,10 @@ +import tilelang.tools.bench +import example_blocksparse_gemm + + +def bench_example_blocksparse_gemm(): + tilelang.tools.bench.process_func(example_blocksparse_gemm.main) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/cast/bench_example_cast.py b/examples/cast/bench_example_cast.py new file mode 100644 index 000000000..7f2a6b2ea --- /dev/null +++ b/examples/cast/bench_example_cast.py @@ -0,0 +1,21 @@ +import tilelang.tools.bench +import 
example_group_per_split_token_cast_to_fp8 +import example_per_token_cast_to_fp8 + + +def bench_example_group_per_split_token_cast_to_fp8(): + tilelang.tools.bench.process_func( + example_group_per_split_token_cast_to_fp8.main, + M=1024, + N=1024, + BG=2, + blk_m=4, + batch_sizes=[128, 896]) + + +def bench_example_per_token_cast_to_fp8(): + tilelang.tools.bench.process_func(example_per_token_cast_to_fp8.main, M=2048, N=512, blk_m=8) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/convolution/bench_example_convolution.py b/examples/convolution/bench_example_convolution.py new file mode 100644 index 000000000..45f87366e --- /dev/null +++ b/examples/convolution/bench_example_convolution.py @@ -0,0 +1,15 @@ +import tilelang.tools.bench +import example_convolution +import example_convolution_autotune + + +def bench_example_convolution(): + tilelang.tools.bench.process_func(example_convolution.main) + + +def bench_example_convolution_autotune(): + tilelang.tools.bench.process_func(example_convolution_autotune.main) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/deepseek_deepgemm/bench_example_deepgemm_fp8_2xAcc.py b/examples/deepseek_deepgemm/bench_example_deepgemm_fp8_2xAcc.py new file mode 100644 index 000000000..7b95d727f --- /dev/null +++ b/examples/deepseek_deepgemm/bench_example_deepgemm_fp8_2xAcc.py @@ -0,0 +1,10 @@ +import tilelang.tools.bench +import example_deepgemm_fp8_2xAcc + + +def bench_example_deepgemm_fp8_2xAcc(): + tilelang.tools.bench.process_func(example_deepgemm_fp8_2xAcc.main) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/deepseek_mla/bench_example_mla_decode.py b/examples/deepseek_mla/bench_example_mla_decode.py new file mode 100644 index 000000000..05dce252b --- /dev/null +++ b/examples/deepseek_mla/bench_example_mla_decode.py @@ -0,0 +1,10 @@ +import tilelang.tools.bench +import 
example_mla_decode + + +def bench_example_mla_decode(): + tilelang.tools.bench.process_func(example_mla_decode.main) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/deepseek_nsa/bench_example_tilelang_nsa.py b/examples/deepseek_nsa/bench_example_tilelang_nsa.py new file mode 100644 index 000000000..6dc5b7248 --- /dev/null +++ b/examples/deepseek_nsa/bench_example_tilelang_nsa.py @@ -0,0 +1,15 @@ +import tilelang.tools.bench +import example_tilelang_nsa_fwd +import example_tilelang_nsa_decode + + +def bench_example_tilelang_nsa_fwd(): + tilelang.tools.bench.process_func(example_tilelang_nsa_fwd.main) + + +def bench_example_tilelang_nsa_fwd_decode(): + tilelang.tools.bench.process_func(example_tilelang_nsa_decode.main) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/deepseek_v32/bench_tilelang_example_deepseek_v32.py b/examples/deepseek_v32/bench_tilelang_example_deepseek_v32.py new file mode 100644 index 000000000..bc4f54922 --- /dev/null +++ b/examples/deepseek_v32/bench_tilelang_example_deepseek_v32.py @@ -0,0 +1,64 @@ +import tilelang.tools.bench +import fp8_lighting_indexer +import sparse_mla_bwd +import sparse_mla_fwd +import sparse_mla_fwd_pipelined +import topk_selector + + +def bench_topk_selector(): + tilelang.tools.bench.process_func(topk_selector.test_topk_selector) + + +def bench_fp8_lighting_indexer(): + tilelang.tools.bench.process_func( + fp8_lighting_indexer.test_fp8_lighting_indexer, + S=512, + SKV=1024, + H=32, + HKV=1, + D=64, + kv_stride=1) + + +def bench_sparse_mla_fwd(): + tilelang.tools.bench.process_func( + sparse_mla_fwd.test_sparse_mla_fwd, + S=256, + SKV=1024, + H=64, + HKV=1, + DQK=576, + DV=512, + topk=256, + check_correctness=False) + + +def bench_sparse_mla_fwd_pipelined(): + tilelang.tools.bench.process_func( + sparse_mla_fwd_pipelined.test_sparse_mla_fwd_pipelined, + S=256, + SKV=512, + H=64, + HKV=1, + DQK=576, + DV=512, + 
topk=256, + check_correctness=False) + + +def bench_sparse_mla_bwd(): + tilelang.tools.bench.process_func( + sparse_mla_bwd.test_sparse_mla_bwd, + S=256, + SKV=512, + H=64, + HKV=1, + DQKV=576, + DV=512, + topk=256, + check_correctness=False) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/dequantize_gemm/bench_example_dequantize_gemm.py b/examples/dequantize_gemm/bench_example_dequantize_gemm.py new file mode 100644 index 000000000..6f840da16 --- /dev/null +++ b/examples/dequantize_gemm/bench_example_dequantize_gemm.py @@ -0,0 +1,35 @@ +import tilelang.tools.bench +import example_dequant_gemm_bf16_mxfp4_hopper +import example_dequant_gemm_bf16_mxfp4_hopper_tma +import example_dequant_gemm_fp4_hopper +import example_dequant_gemm_w4a8 +import example_dequant_gemv_fp16xint4 +import example_dequant_groupedgemm_bf16_mxfp4_hopper + + +def bench_example_dequant_gemv_fp16xint4(): + tilelang.tools.bench.process_func(example_dequant_gemv_fp16xint4.main) + + +def bench_example_dequant_gemm_fp4_hopper(): + tilelang.tools.bench.process_func(example_dequant_gemm_fp4_hopper.main) + + +def bench_example_dequant_gemm_bf16_mxfp4_hopper(): + tilelang.tools.bench.process_func(example_dequant_gemm_bf16_mxfp4_hopper.main) + + +def bench_example_dequant_gemm_bf16_mxfp4_hopper_tma(): + tilelang.tools.bench.process_func(example_dequant_gemm_bf16_mxfp4_hopper_tma.main) + + +def bench_example_dequant_groupedgemm_bf16_mxfp4_hopper(): + tilelang.tools.bench.process_func(example_dequant_groupedgemm_bf16_mxfp4_hopper.main) + + +def bench_example_dequant_gemm_w4a8(): + tilelang.tools.bench.process_func(example_dequant_gemm_w4a8.main) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/dynamic_shape/bench_example_dynamic.py b/examples/dynamic_shape/bench_example_dynamic.py new file mode 100644 index 000000000..8f735c477 --- /dev/null +++ b/examples/dynamic_shape/bench_example_dynamic.py @@ -0,0 
+1,10 @@ +import tilelang.tools.bench +import example_dynamic + + +def bench_example_dynamic(): + tilelang.tools.bench.process_func(example_dynamic.main, M=1024, N=1024, K=1024) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/elementwise/bench_example_elementwise.py b/examples/elementwise/bench_example_elementwise.py new file mode 100644 index 000000000..1a70419f9 --- /dev/null +++ b/examples/elementwise/bench_example_elementwise.py @@ -0,0 +1,10 @@ +import tilelang.tools.bench +import example_elementwise_add + + +def bench_example_elementwise_add(): + tilelang.tools.bench.process_func(example_elementwise_add.main) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/flash_attention/bench_example_flash_attention.py b/examples/flash_attention/bench_example_flash_attention.py new file mode 100644 index 000000000..5a5e4122e --- /dev/null +++ b/examples/flash_attention/bench_example_flash_attention.py @@ -0,0 +1,108 @@ +import tilelang.tools.bench +import example_gqa_bwd +import example_gqa_bwd_tma_reduce_varlen +import example_gqa_bwd_wgmma_pipelined +import example_gqa_fwd_bshd +import example_gqa_fwd_bshd_wgmma_pipelined +import example_mha_bwd_bshd +import example_mha_bwd_bhsd +import example_mha_bwd_bshd_wgmma_pipelined +import example_mha_fwd_bhsd +import example_mha_fwd_bhsd_wgmma_pipelined +import example_mha_fwd_bshd +import example_mha_fwd_bshd_wgmma_pipelined +import example_mha_fwd_varlen + + +def bench_example_gqa_bwd_tma_reduce_varlen(): + tilelang.tools.bench.process_func(example_gqa_bwd_tma_reduce_varlen.main) + + +def bench_example_gqa_bwd(): + tilelang.tools.bench.process_func(example_gqa_bwd.main) + + +def bench_example_gqa_bwd_wgmma_pipelined(): + tilelang.tools.bench.process_func(example_gqa_bwd_wgmma_pipelined.main) + + +def bench_example_mha_bwd_bshd(): + tilelang.tools.bench.process_func( + example_mha_bwd_bshd.main, + BATCH=1, + H=16, + 
N_CTX=512, + D_HEAD=64, + causal=False, + ) + + +def bench_example_mha_bwd_bhsd(): + tilelang.tools.bench.process_func( + example_mha_bwd_bhsd.main, + BATCH=1, + H=16, + N_CTX=512, + D_HEAD=64, + causal=False, + ) + + +def bench_example_mha_bwd_bshd_wgmma_pipelined(): + tilelang.tools.bench.process_func( + example_mha_bwd_bshd_wgmma_pipelined.main, + BATCH=1, + H=32, + N_CTX=256, + D_HEAD=64, + causal=False) + + +def bench_example_gqa_fwd_bshd_wgmma_pipelined(): + tilelang.tools.bench.process_func( + example_gqa_fwd_bshd_wgmma_pipelined.main, + batch=1, + heads=16, + seq_len=1024, + dim=128, + is_causal=False, + groups=16, + tune=False) + + +def bench_example_gqa_fwd_bshd(): + tilelang.tools.bench.process_func( + example_gqa_fwd_bshd.main, + batch=1, + heads=16, + seq_len=1024, + dim=128, + is_causal=False, + groups=16, + tune=False) + + +def bench_example_mha_fwd_bhsd_wgmma_pipelined(): + tilelang.tools.bench.process_func(example_mha_fwd_bhsd_wgmma_pipelined.main) + + +def bench_example_mha_fwd_bhsd(): + tilelang.tools.bench.process_func(example_mha_fwd_bhsd.main) + + +def bench_example_mha_fwd_bshd_wgmma_pipelined(): + tilelang.tools.bench.process_func( + example_mha_fwd_bshd_wgmma_pipelined.main, batch=1, heads=32, seq_len=256) + + +def bench_example_mha_fwd_bshd(): + tilelang.tools.bench.process_func(example_mha_fwd_bshd.main, batch=1, seq_len=256) + + +def bench_example_mha_fwd_varlen(): + tilelang.tools.bench.process_func( + example_mha_fwd_varlen.main, batch=4, heads=16, seq_len=512, dim=64) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/flash_decoding/bench_example_flash_decoding.py b/examples/flash_decoding/bench_example_flash_decoding.py new file mode 100644 index 000000000..e2fd30742 --- /dev/null +++ b/examples/flash_decoding/bench_example_flash_decoding.py @@ -0,0 +1,16 @@ +import tilelang.tools.bench +import example_gqa_decode +import example_mha_inference + + +def bench_example_gqa_decode(): + 
tilelang.tools.bench.process_func(example_gqa_decode.main) + + +def bench_example_mha_inference(): + tilelang.tools.bench.process_func( + example_mha_inference.main, BATCH=1, H=32, Q_CTX=128, KV_CTX=2048, D_HEAD=128, causal=False) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/fusedmoe/bench_example_fusedmoe.py b/examples/fusedmoe/bench_example_fusedmoe.py new file mode 100644 index 000000000..dcc4b0598 --- /dev/null +++ b/examples/fusedmoe/bench_example_fusedmoe.py @@ -0,0 +1,18 @@ +import tilelang.tools.bench +import example_fusedmoe_tilelang + + +def bench_example_fusedmoe_tilelang(): + tilelang.tools.bench.process_func( + example_fusedmoe_tilelang.main, + d_hidden=1024, + d_expert=256, + n_routed_experts=8, + n_shared_experts=1, + n_experts_per_token=4, + batch_size=1, + seq_len=1024) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/gemm/bench_example_gemm.py b/examples/gemm/bench_example_gemm.py new file mode 100644 index 000000000..c58073490 --- /dev/null +++ b/examples/gemm/bench_example_gemm.py @@ -0,0 +1,26 @@ +import tilelang.tools.bench +import example_gemm +import example_gemm_autotune +import example_gemm_intrinsics +import example_gemm_schedule + + +def bench_example_gemm_autotune(): + tilelang.tools.bench.process_func( + example_gemm_autotune.main, M=1024, N=1024, K=1024, with_roller=True) + + +def bench_example_gemm_intrinsics(): + tilelang.tools.bench.process_func(example_gemm_intrinsics.main, M=1024, N=1024, K=1024) + + +def bench_example_gemm_schedule(): + tilelang.tools.bench.process_func(example_gemm_schedule.main) + + +def bench_example_gemm(): + tilelang.tools.bench.process_func(example_gemm.main) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/gemm_fp8/bench_example_gemm_fp8.py b/examples/gemm_fp8/bench_example_gemm_fp8.py new file mode 100644 index 000000000..f3a09cc46 --- /dev/null 
+++ b/examples/gemm_fp8/bench_example_gemm_fp8.py @@ -0,0 +1,20 @@ +import tilelang.tools.bench +import example_tilelang_gemm_fp8 +import example_tilelang_gemm_fp8_2xAcc +import example_tilelang_gemm_fp8_intrinsic + + +def bench_example_tilelang_gemm_fp8_2xAcc(): + tilelang.tools.bench.process_func(example_tilelang_gemm_fp8_2xAcc.main) + + +def bench_example_tilelang_gemm_fp8_intrinsic(): + tilelang.tools.bench.process_func(example_tilelang_gemm_fp8_intrinsic.main) + + +def bench_example_tilelang_gemm_fp8(): + tilelang.tools.bench.process_func(example_tilelang_gemm_fp8.main) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/gemm_splitk/bench_example_gemm_splitk.py b/examples/gemm_splitk/bench_example_gemm_splitk.py new file mode 100644 index 000000000..1892d1dc8 --- /dev/null +++ b/examples/gemm_splitk/bench_example_gemm_splitk.py @@ -0,0 +1,15 @@ +import tilelang.tools.bench +import example_tilelang_gemm_splitk +import example_tilelang_gemm_splitk_vectorize_atomicadd + + +def bench_example_tilelang_gemm_splitk(): + tilelang.tools.bench.process_func(example_tilelang_gemm_splitk.main) + + +def bench_example_tilelang_gemm_splitk_vectorize_atomicadd(): + tilelang.tools.bench.process_func(example_tilelang_gemm_splitk_vectorize_atomicadd.main) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/gemm_streamk/bench_example_tilelang_gemm_streamk.py b/examples/gemm_streamk/bench_example_tilelang_gemm_streamk.py new file mode 100644 index 000000000..0c0b98f65 --- /dev/null +++ b/examples/gemm_streamk/bench_example_tilelang_gemm_streamk.py @@ -0,0 +1,10 @@ +import tilelang.tools.bench +import example_tilelang_gemm_streamk + + +def bench_example_tilelang_gemm_streamk(): + tilelang.tools.bench.process_func(example_tilelang_gemm_streamk.main) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/gemv/bench_example_gemv.py 
b/examples/gemv/bench_example_gemv.py new file mode 100644 index 000000000..ad5fe8562 --- /dev/null +++ b/examples/gemv/bench_example_gemv.py @@ -0,0 +1,10 @@ +import tilelang.tools.bench +import example_gemv + + +def bench_example_gemv(): + tilelang.tools.bench.process_func(example_gemv.main, do_bench=False) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/linear_attention/bench_linear_attn.py b/examples/linear_attention/bench_linear_attn.py new file mode 100644 index 000000000..c5b554b07 --- /dev/null +++ b/examples/linear_attention/bench_linear_attn.py @@ -0,0 +1,15 @@ +import tilelang.tools.bench +import example_linear_attn_bwd +import example_linear_attn_fwd + + +def bench_example_linear_attn_fwd(): + tilelang.tools.bench.process_func(example_linear_attn_fwd.main) + + +def bench_example_linear_attn_bwd(): + tilelang.tools.bench.process_func(example_linear_attn_bwd.main) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/minference/bench_vs_sparse_attn.py b/examples/minference/bench_vs_sparse_attn.py new file mode 100644 index 000000000..d925379ff --- /dev/null +++ b/examples/minference/bench_vs_sparse_attn.py @@ -0,0 +1,10 @@ +import tilelang.tools.bench +import example_vertical_slash_sparse_attn + + +def bench_example_vertical_slash_sparse_attn(): + tilelang.tools.bench.process_func(example_vertical_slash_sparse_attn.main, argv=[]) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/seer_attention/bench_block_sparse_attn_tilelang.py b/examples/seer_attention/bench_block_sparse_attn_tilelang.py new file mode 100644 index 000000000..5d0375f09 --- /dev/null +++ b/examples/seer_attention/bench_block_sparse_attn_tilelang.py @@ -0,0 +1,10 @@ +import tilelang.tools.bench +import block_sparse_attn_tilelang + + +def bench_block_sparse_attn_tilelang(): + tilelang.tools.bench.process_func(block_sparse_attn_tilelang.main) 
+ + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/sparse_tensorcore/bench_example_sparse_tensorcore.py b/examples/sparse_tensorcore/bench_example_sparse_tensorcore.py new file mode 100644 index 000000000..669ebfa16 --- /dev/null +++ b/examples/sparse_tensorcore/bench_example_sparse_tensorcore.py @@ -0,0 +1,11 @@ +import tilelang.tools.bench +import tilelang +import tilelang_example_sparse_tensorcore + + +def bench_example_sparse_tensorcore(): + tilelang.tools.bench.process_func(tilelang_example_sparse_tensorcore.main) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/topk/bench_topk_tilelang.py b/examples/topk/bench_topk_tilelang.py new file mode 100644 index 000000000..12c8ce609 --- /dev/null +++ b/examples/topk/bench_topk_tilelang.py @@ -0,0 +1,10 @@ +import tilelang.tools.bench +import example_topk + + +def bench_example_topk(): + tilelang.tools.bench.process_func(example_topk.main) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/examples/warp_specialize/bench_example_warp_specialize.py b/examples/warp_specialize/bench_example_warp_specialize.py new file mode 100644 index 000000000..a463526ba --- /dev/null +++ b/examples/warp_specialize/bench_example_warp_specialize.py @@ -0,0 +1,29 @@ +import tilelang.tools.bench +import example_warp_specialize_gemm_barrierpipe_stage2 +import example_warp_specialize_gemm_copy_0_gemm_1 +import example_warp_specialize_gemm_copy_1_gemm_0 +import example_warp_specialize_gemm_softpipe_stage2 + + +def bench_example_warp_specialize_gemm_barrierpipe_stage2(): + tilelang.tools.bench.process_func( + example_warp_specialize_gemm_barrierpipe_stage2.main, M=1024, N=1024, K=1024) + + +def bench_example_warp_specialize_gemm_copy_0_gemm_1(): + tilelang.tools.bench.process_func( + example_warp_specialize_gemm_copy_0_gemm_1.main, M=1024, N=1024, K=1024) + + +def 
bench_example_warp_specialize_gemm_copy_1_gemm_0(): + tilelang.tools.bench.process_func( + example_warp_specialize_gemm_copy_1_gemm_0.main, M=1024, N=1024, K=1024) + + +def bench_example_warp_specialize_gemm_softpipe_stage2(): + tilelang.tools.bench.process_func( + example_warp_specialize_gemm_softpipe_stage2.main, M=1024, N=1024, K=1024) + + +if globals().get("__name__") == "__main__": + tilelang.tools.bench.main() diff --git a/log.txt b/log.txt new file mode 100644 index 000000000..42387935d --- /dev/null +++ b/log.txt @@ -0,0 +1,477 @@ +/opt/conda/lib/python3.10/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you. + import pynvml # type: ignore[import] +2025-11-15 04:16:16 [TileLang:tilelang.env:WARNING]: Loading tilelang libs from dev root: /root/TileLang/build +WARNING:fla.utils:Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience. +[04:19:55] /root/TileLang/src/op/copy.cc:1322: Warning: TMA bulk copy cannot support a non-swizzled global layout, fallback to normal copy. 
+[04:20:15] /root/TileLang/src/transform/warp_specialized_rewriter.h:43: Warning: Auto warp specialization will be disabled because TMA and mbarrier are both present +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 174, in main + kernel = tl_matmul_streamk( + File "/root/TileLang/tilelang/jit/__init__.py", line 273, in __call__ + self._kernel_cache[key] = self.compile(*args, **kwargs, **tune_params) + File "/root/TileLang/tilelang/jit/__init__.py", line 223, in compile + func = self.get_tir(*args, **kwargs) + File "/root/TileLang/tilelang/jit/__init__.py", line 192, in get_tir + program_result = program_result_source(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 152, in tl_matmul_streamk + def main( + File "/root/TileLang/tilelang/language/v2/builder.py", line 721, in prim_func + return impl(func) if func is not None else impl + File "/root/TileLang/tilelang/language/v2/builder.py", line 708, in impl + return prim_func_generator(**annot) + File "/root/TileLang/tilelang/language/v2/builder.py", line 693, in prim_func_generator + ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 165, in main + compute_first_wave(pid, A, A_shared, B, B_shared, C, C_local) + File "/root/TileLang/tilelang/language/v2/builder.py", line 529, in __call__ + res = self.ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 126, in compute_first_wave + start_iter[0] = end_iter[0] + File "/root/TileLang/tilelang/language/v2/builder.py", line 295, in ctx_while + raise RuntimeError("while loops are not supported in TileLang builder") +RuntimeError: while loops are not supported in TileLang builder +Traceback (most recent call last): + 
File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 174, in main + kernel = tl_matmul_streamk( + File "/root/TileLang/tilelang/jit/__init__.py", line 273, in __call__ + self._kernel_cache[key] = self.compile(*args, **kwargs, **tune_params) + File "/root/TileLang/tilelang/jit/__init__.py", line 223, in compile + func = self.get_tir(*args, **kwargs) + File "/root/TileLang/tilelang/jit/__init__.py", line 192, in get_tir + program_result = program_result_source(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 152, in tl_matmul_streamk + def main( + File "/root/TileLang/tilelang/language/v2/builder.py", line 721, in prim_func + return impl(func) if func is not None else impl + File "/root/TileLang/tilelang/language/v2/builder.py", line 708, in impl + return prim_func_generator(**annot) + File "/root/TileLang/tilelang/language/v2/builder.py", line 693, in prim_func_generator + ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 165, in main + compute_first_wave(pid, A, A_shared, B, B_shared, C, C_local) + File "/root/TileLang/tilelang/language/v2/builder.py", line 529, in __call__ + res = self.ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 126, in compute_first_wave + start_iter[0] = end_iter[0] + File "/root/TileLang/tilelang/language/v2/builder.py", line 295, in ctx_while + raise RuntimeError("while loops are not supported in TileLang builder") +RuntimeError: while loops are not supported in TileLang builder +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 174, in main + kernel = 
tl_matmul_streamk( + File "/root/TileLang/tilelang/jit/__init__.py", line 273, in __call__ + self._kernel_cache[key] = self.compile(*args, **kwargs, **tune_params) + File "/root/TileLang/tilelang/jit/__init__.py", line 223, in compile + func = self.get_tir(*args, **kwargs) + File "/root/TileLang/tilelang/jit/__init__.py", line 192, in get_tir + program_result = program_result_source(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 152, in tl_matmul_streamk + def main( + File "/root/TileLang/tilelang/language/v2/builder.py", line 721, in prim_func + return impl(func) if func is not None else impl + File "/root/TileLang/tilelang/language/v2/builder.py", line 708, in impl + return prim_func_generator(**annot) + File "/root/TileLang/tilelang/language/v2/builder.py", line 693, in prim_func_generator + ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 165, in main + compute_first_wave(pid, A, A_shared, B, B_shared, C, C_local) + File "/root/TileLang/tilelang/language/v2/builder.py", line 529, in __call__ + res = self.ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 126, in compute_first_wave + start_iter[0] = end_iter[0] + File "/root/TileLang/tilelang/language/v2/builder.py", line 295, in ctx_while + raise RuntimeError("while loops are not supported in TileLang builder") +RuntimeError: while loops are not supported in TileLang builder +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 174, in main + kernel = tl_matmul_streamk( + File "/root/TileLang/tilelang/jit/__init__.py", line 273, in __call__ + self._kernel_cache[key] = self.compile(*args, **kwargs, **tune_params) + File 
"/root/TileLang/tilelang/jit/__init__.py", line 223, in compile + func = self.get_tir(*args, **kwargs) + File "/root/TileLang/tilelang/jit/__init__.py", line 192, in get_tir + program_result = program_result_source(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 152, in tl_matmul_streamk + def main( + File "/root/TileLang/tilelang/language/v2/builder.py", line 721, in prim_func + return impl(func) if func is not None else impl + File "/root/TileLang/tilelang/language/v2/builder.py", line 708, in impl + return prim_func_generator(**annot) + File "/root/TileLang/tilelang/language/v2/builder.py", line 693, in prim_func_generator + ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 165, in main + compute_first_wave(pid, A, A_shared, B, B_shared, C, C_local) + File "/root/TileLang/tilelang/language/v2/builder.py", line 529, in __call__ + res = self.ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 126, in compute_first_wave + start_iter[0] = end_iter[0] + File "/root/TileLang/tilelang/language/v2/builder.py", line 295, in ctx_while + raise RuntimeError("while loops are not supported in TileLang builder") +RuntimeError: while loops are not supported in TileLang builder +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 174, in main + kernel = tl_matmul_streamk( + File "/root/TileLang/tilelang/jit/__init__.py", line 273, in __call__ + self._kernel_cache[key] = self.compile(*args, **kwargs, **tune_params) + File "/root/TileLang/tilelang/jit/__init__.py", line 223, in compile + func = self.get_tir(*args, **kwargs) + File "/root/TileLang/tilelang/jit/__init__.py", line 192, in get_tir + program_result = 
program_result_source(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 152, in tl_matmul_streamk + def main( + File "/root/TileLang/tilelang/language/v2/builder.py", line 721, in prim_func + return impl(func) if func is not None else impl + File "/root/TileLang/tilelang/language/v2/builder.py", line 708, in impl + return prim_func_generator(**annot) + File "/root/TileLang/tilelang/language/v2/builder.py", line 693, in prim_func_generator + ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 165, in main + compute_first_wave(pid, A, A_shared, B, B_shared, C, C_local) + File "/root/TileLang/tilelang/language/v2/builder.py", line 529, in __call__ + res = self.ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 126, in compute_first_wave + start_iter[0] = end_iter[0] + File "/root/TileLang/tilelang/language/v2/builder.py", line 295, in ctx_while + raise RuntimeError("while loops are not supported in TileLang builder") +RuntimeError: while loops are not supported in TileLang builder +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 174, in main + kernel = tl_matmul_streamk( + File "/root/TileLang/tilelang/jit/__init__.py", line 273, in __call__ + self._kernel_cache[key] = self.compile(*args, **kwargs, **tune_params) + File "/root/TileLang/tilelang/jit/__init__.py", line 223, in compile + func = self.get_tir(*args, **kwargs) + File "/root/TileLang/tilelang/jit/__init__.py", line 192, in get_tir + program_result = program_result_source(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 152, in tl_matmul_streamk + def main( + File 
"/root/TileLang/tilelang/language/v2/builder.py", line 721, in prim_func + return impl(func) if func is not None else impl + File "/root/TileLang/tilelang/language/v2/builder.py", line 708, in impl + return prim_func_generator(**annot) + File "/root/TileLang/tilelang/language/v2/builder.py", line 693, in prim_func_generator + ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 165, in main + compute_first_wave(pid, A, A_shared, B, B_shared, C, C_local) + File "/root/TileLang/tilelang/language/v2/builder.py", line 529, in __call__ + res = self.ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 126, in compute_first_wave + start_iter[0] = end_iter[0] + File "/root/TileLang/tilelang/language/v2/builder.py", line 295, in ctx_while + raise RuntimeError("while loops are not supported in TileLang builder") +RuntimeError: while loops are not supported in TileLang builder +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 174, in main + kernel = tl_matmul_streamk( + File "/root/TileLang/tilelang/jit/__init__.py", line 273, in __call__ + self._kernel_cache[key] = self.compile(*args, **kwargs, **tune_params) + File "/root/TileLang/tilelang/jit/__init__.py", line 223, in compile + func = self.get_tir(*args, **kwargs) + File "/root/TileLang/tilelang/jit/__init__.py", line 192, in get_tir + program_result = program_result_source(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 152, in tl_matmul_streamk + def main( + File "/root/TileLang/tilelang/language/v2/builder.py", line 721, in prim_func + return impl(func) if func is not None else impl + File "/root/TileLang/tilelang/language/v2/builder.py", line 708, in impl + return 
prim_func_generator(**annot) + File "/root/TileLang/tilelang/language/v2/builder.py", line 693, in prim_func_generator + ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 165, in main + compute_first_wave(pid, A, A_shared, B, B_shared, C, C_local) + File "/root/TileLang/tilelang/language/v2/builder.py", line 529, in __call__ + res = self.ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 126, in compute_first_wave + start_iter[0] = end_iter[0] + File "/root/TileLang/tilelang/language/v2/builder.py", line 295, in ctx_while + raise RuntimeError("while loops are not supported in TileLang builder") +RuntimeError: while loops are not supported in TileLang builder +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 174, in main + kernel = tl_matmul_streamk( + File "/root/TileLang/tilelang/jit/__init__.py", line 273, in __call__ + self._kernel_cache[key] = self.compile(*args, **kwargs, **tune_params) + File "/root/TileLang/tilelang/jit/__init__.py", line 223, in compile + func = self.get_tir(*args, **kwargs) + File "/root/TileLang/tilelang/jit/__init__.py", line 192, in get_tir + program_result = program_result_source(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 152, in tl_matmul_streamk + def main( + File "/root/TileLang/tilelang/language/v2/builder.py", line 721, in prim_func + return impl(func) if func is not None else impl + File "/root/TileLang/tilelang/language/v2/builder.py", line 708, in impl + return prim_func_generator(**annot) + File "/root/TileLang/tilelang/language/v2/builder.py", line 693, in prim_func_generator + ir_gen.gen(builder)(*args, **kwargs) + File 
"/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 165, in main + compute_first_wave(pid, A, A_shared, B, B_shared, C, C_local) + File "/root/TileLang/tilelang/language/v2/builder.py", line 529, in __call__ + res = self.ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 126, in compute_first_wave + start_iter[0] = end_iter[0] + File "/root/TileLang/tilelang/language/v2/builder.py", line 295, in ctx_while + raise RuntimeError("while loops are not supported in TileLang builder") +RuntimeError: while loops are not supported in TileLang builder +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 174, in main + kernel = tl_matmul_streamk( + File "/root/TileLang/tilelang/jit/__init__.py", line 273, in __call__ + self._kernel_cache[key] = self.compile(*args, **kwargs, **tune_params) + File "/root/TileLang/tilelang/jit/__init__.py", line 223, in compile + func = self.get_tir(*args, **kwargs) + File "/root/TileLang/tilelang/jit/__init__.py", line 192, in get_tir + program_result = program_result_source(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 152, in tl_matmul_streamk + def main( + File "/root/TileLang/tilelang/language/v2/builder.py", line 721, in prim_func + return impl(func) if func is not None else impl + File "/root/TileLang/tilelang/language/v2/builder.py", line 708, in impl + return prim_func_generator(**annot) + File "/root/TileLang/tilelang/language/v2/builder.py", line 693, in prim_func_generator + ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 165, in main + compute_first_wave(pid, A, A_shared, B, B_shared, C, C_local) + File 
"/root/TileLang/tilelang/language/v2/builder.py", line 529, in __call__ + res = self.ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 126, in compute_first_wave + start_iter[0] = end_iter[0] + File "/root/TileLang/tilelang/language/v2/builder.py", line 295, in ctx_while + raise RuntimeError("while loops are not supported in TileLang builder") +RuntimeError: while loops are not supported in TileLang builder +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 174, in main + kernel = tl_matmul_streamk( + File "/root/TileLang/tilelang/jit/__init__.py", line 273, in __call__ + self._kernel_cache[key] = self.compile(*args, **kwargs, **tune_params) + File "/root/TileLang/tilelang/jit/__init__.py", line 223, in compile + func = self.get_tir(*args, **kwargs) + File "/root/TileLang/tilelang/jit/__init__.py", line 192, in get_tir + program_result = program_result_source(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 152, in tl_matmul_streamk + def main( + File "/root/TileLang/tilelang/language/v2/builder.py", line 721, in prim_func + return impl(func) if func is not None else impl + File "/root/TileLang/tilelang/language/v2/builder.py", line 708, in impl + return prim_func_generator(**annot) + File "/root/TileLang/tilelang/language/v2/builder.py", line 693, in prim_func_generator + ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 165, in main + compute_first_wave(pid, A, A_shared, B, B_shared, C, C_local) + File "/root/TileLang/tilelang/language/v2/builder.py", line 529, in __call__ + res = self.ir_gen.gen(builder)(*args, **kwargs) + File "/root/TileLang/examples/gemm_streamk/example_tilelang_gemm_streamk.py", line 
126, in compute_first_wave + start_iter[0] = end_iter[0] + File "/root/TileLang/tilelang/language/v2/builder.py", line 295, in ctx_while + raise RuntimeError("while loops are not supported in TileLang builder") +RuntimeError: while loops are not supported in TileLang builder +/root/TileLang/examples/gemm_streamk/bench_example_tilelang_gemm_splitk.py:6: RuntimeWarning: benchmark for example_tilelang_gemm_streamk failed in all repeats (no valid run) + tilelang.tools.bench.process_func(example_tilelang_gemm_streamk.main)Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 282, in test_sparse_mla_fwd + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 244, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 54.43 GiB is allocated by PyTorch, and 15.41 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 282, in test_sparse_mla_fwd + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 244, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 54.43 GiB is allocated by PyTorch, and 15.41 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 282, in test_sparse_mla_fwd + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 244, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 54.43 GiB is allocated by PyTorch, and 15.41 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 282, in test_sparse_mla_fwd + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 244, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 54.43 GiB is allocated by PyTorch, and 15.41 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 282, in test_sparse_mla_fwd + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 244, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 54.43 GiB is allocated by PyTorch, and 15.41 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 282, in test_sparse_mla_fwd + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 244, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 54.43 GiB is allocated by PyTorch, and 15.41 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 282, in test_sparse_mla_fwd + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 244, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 54.43 GiB is allocated by PyTorch, and 15.41 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 282, in test_sparse_mla_fwd + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 244, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 54.43 GiB is allocated by PyTorch, and 15.41 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 282, in test_sparse_mla_fwd + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 244, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 54.43 GiB is allocated by PyTorch, and 15.41 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 282, in test_sparse_mla_fwd + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd.py", line 244, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 54.43 GiB is allocated by PyTorch, and 15.41 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +/root/TileLang/examples/deepseek_v32/bench_tilelang_example_deepseek_v32.py:18: RuntimeWarning: benchmark for sparse_mla_fwd failed in all repeats (no valid run) + tilelang.tools.bench.process_func(sparse_mla_fwd.test_sparse_mla_fwd) +[05:04:44] /root/TileLang/src/transform/warp_specialized_rewriter.h:43: Warning: Auto warp specialization will be disabled because TMA and mbarrier are both present +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 438, in test_sparse_mla_fwd_pipelined + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices, q_start_s_index, KV_stride) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 391, in ref_sparse_mla_fwd_interface + score = 
score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 57.22 GiB is allocated by PyTorch, and 12.62 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 438, in test_sparse_mla_fwd_pipelined + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices, q_start_s_index, KV_stride) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 391, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 57.22 GiB is allocated by PyTorch, and 12.62 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 438, in test_sparse_mla_fwd_pipelined + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices, q_start_s_index, KV_stride) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 391, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 57.22 GiB is allocated by PyTorch, and 12.62 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 438, in test_sparse_mla_fwd_pipelined + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices, q_start_s_index, KV_stride) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 391, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 57.22 GiB is allocated by PyTorch, and 12.62 GiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 438, in test_sparse_mla_fwd_pipelined + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices, q_start_s_index, KV_stride) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 391, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 57.22 GiB is allocated by PyTorch, and 12.62 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 438, in test_sparse_mla_fwd_pipelined + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices, q_start_s_index, KV_stride) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 391, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. 
Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 57.22 GiB is allocated by PyTorch, and 12.62 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 438, in test_sparse_mla_fwd_pipelined + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices, q_start_s_index, KV_stride) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 391, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 57.22 GiB is allocated by PyTorch, and 12.62 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 438, in test_sparse_mla_fwd_pipelined + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices, q_start_s_index, KV_stride) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 391, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 57.22 GiB is allocated by PyTorch, and 12.62 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 438, in test_sparse_mla_fwd_pipelined + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices, q_start_s_index, KV_stride) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 391, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 57.22 GiB is allocated by PyTorch, and 12.62 GiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 438, in test_sparse_mla_fwd_pipelined + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices, q_start_s_index, KV_stride) + File "/root/TileLang/examples/deepseek_v32/sparse_mla_fwd_pipelined.py", line 391, in ref_sparse_mla_fwd_interface + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 79.10 GiB of which 8.16 GiB is free. Process 1079578 has 70.93 GiB memory in use. Of the allocated memory 57.22 GiB is allocated by PyTorch, and 12.62 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +/root/TileLang/examples/deepseek_v32/bench_tilelang_example_deepseek_v32.py:22: RuntimeWarning: benchmark for sparse_mla_fwd_pipelined failed in all repeats (no valid run) + tilelang.tools.bench.process_func(sparse_mla_fwd_pipelined.test_sparse_mla_fwd_pipelined) +Traceback (most recent call last): + File "", line 1, in + File "/root/TileLang/tilelang/tools/bench.py", line 146, in bench_all + func() + File "/root/TileLang/examples/dequantize_gemm/bench_example_dequantize_gemm.py", line 15, in bench_example_dequant_gemm_fp4_hopper + tilelang.tools.bench.process_func(example_dequant_gemm_fp4_hopper.main) + File "/root/TileLang/tilelang/tools/bench.py", line 48, in process_func + func(*args, **kwargs) + File "/root/TileLang/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py", line 277, in main + profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01) + File "/root/TileLang/tilelang/profiler/__init__.py", line 95, in assert_allclose + ref_outs = reference_program(*ins) + File "/root/TileLang/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py", line 263, in ref_program + B = torch_convert(qB) + File "/root/TileLang/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py", line 57, in torch_convert + new_tensor[i][j] = _convert(tensor[i][j // 2], j % 2) + File "/root/TileLang/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py", line 49, in _convert + lower_16_bits = (val_f16 & 0xFFFF).to(torch.uint16) +KeyboardInterrupt +Running BlockSparse MatMul Benchmark for M=1024, N=1024, K=1024 +Target Block Sparsity: 0.5 +Using Autotuner: False + +total_tiles=128 +iters_per_tile=16 diff --git a/maint/scripts/ci_performance.py b/maint/scripts/ci_performance.py index 998e7b650..3a1ac876c 100644 --- a/maint/scripts/ci_performance.py +++ b/maint/scripts/ci_performance.py @@ -1,49 +1,75 @@ import subprocess import re from tabulate import 
"""CI performance comparison: benchmark two builds and emit bench.md + bench.png.

Runs ``tilelang.tools.bench.bench_all`` under two virtualenvs
(``./tl`` = original build, ``./tll`` = current PR build), parses the
markdown latency table each run prints, and writes a speedup table
(``bench.md``) plus a bar chart (``bench.png``) for the PR comment bot.
"""
import re
import subprocess

# One github-style table row: "| <name> | <latency> |".  The name group is
# non-greedy so the cell's trailing padding spaces are NOT captured into the
# key (the greedy form leaked "name " keys into the table and plot labels).
_ROW_RE = re.compile(r"\|\s*([^|]+?)\s*\|\s*([0-9.]+)\s*\|")


def parse_output(output):
    """Parse benchmark stdout into ``{function name: avg latency in ms}``.

    Only lines shaped like markdown table rows with a numeric second cell
    contribute; all other output (logs, tracebacks, warnings) is ignored.
    """
    data = {}
    for line in output.split("\n"):
        m = _ROW_RE.match(line.strip())
        if m is not None:
            data[m.group(1)] = float(m.group(2))
    return data


def _run_bench(python_bin):
    """Run ``bench_all`` under *python_bin* and return its parsed latency table."""
    proc = subprocess.run(
        [python_bin, "-c", "import tilelang.tools.bench as b; b.bench_all()"],
        capture_output=True,
        text=True,
    )
    return parse_output(proc.stdout)


def main():
    # Heavy/optional dependencies are imported here so that importing this
    # module (e.g. to reuse parse_output) does not require the plotting stack.
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns
    import tilelang
    from tabulate import tabulate

    tilelang.disable_cache()

    data_v1 = _run_bench("./tl/bin/python")
    data_v2 = _run_bench("./tll/bin/python")

    table = []
    # Compare only entries present in BOTH runs: a benchmark that crashed or
    # OOMed in one build (see CI logs) must not raise KeyError here, and a
    # zero latency must not raise ZeroDivisionError.
    for key in data_v1.keys() & data_v2.keys():
        if data_v2[key] == 0:
            continue
        table.append([key, data_v1[key], data_v2[key], data_v1[key] / data_v2[key]])
    table.sort(key=lambda x: x[-1])

    headers = ["File", "Original Latency", "Current Latency", "Speedup"]
    with open("bench.md", "w") as f:
        f.write(
            tabulate(table, headers=headers, tablefmt="github", stralign="left",
                     numalign="decimal"))
        f.write("\n")

    df = pd.DataFrame(table, columns=headers)
    df = df.sort_values("Speedup", ascending=False).reset_index(drop=True)
    # Floor the width at 6in so a small benchmark set still renders a readable
    # figure; the previous max(0, ...) floor was a no-op.
    plt.figure(figsize=(max(6, len(df) * 0.35), 8))
    sns.set_theme(style="whitegrid", font_scale=0.9)
    plt.bar(
        range(len(df)),
        df["Speedup"],
        color=sns.color_palette("magma", len(df)),
        edgecolor="black")

    # Annotate only the three best and three worst speedups to keep the chart legible.
    label_idx = set(df.nlargest(3, "Speedup").index) | set(df.nsmallest(3, "Speedup").index)
    for i, val in enumerate(df["Speedup"]):
        if i in label_idx:
            plt.text(
                i,
                val + 0.02,
                f"{val:.2f}x",
                ha="center",
                va="bottom",
                color="red",
                fontsize=8,
                fontweight="bold")

    plt.xticks(range(len(df)), df["File"], rotation=70, ha='right', fontsize=12)
    plt.ylabel("Current Speedup vs Original", fontsize=14)
    plt.title("Current Speedup vs Original", fontsize=14, fontweight="bold")
    if not df.empty:  # max() of an empty column would raise
        plt.ylim(0, df["Speedup"].max() * 1.2)
    sns.despine()
    plt.tight_layout()
    plt.savefig("bench.png", dpi=300)


if __name__ == "__main__":
    main()
"""Lightweight benchmark runner for the example suite.

``process_func`` times a callable and records its average latency into the
module-level ``_RECORDS`` list; ``main`` auto-discovers and runs every
``bench_*`` function in the calling file; ``bench_all`` walks ``examples/``
and runs every ``bench_*`` function of every ``bench_*.py`` module.
"""
import contextlib
import importlib.util
import inspect
import os
import re
import sys
import time
import traceback
import warnings

# bench_all is invoked by maint/scripts/ci_performance.py, so it is part of
# the public API alongside main and process_func.
__all__ = ["main", "process_func", "bench_all"]

# Accumulated (module name, average latency in ms) tuples across all runs.
_RECORDS = []


@contextlib.contextmanager
def suppress_output():
    """Silence ``sys.stdout``/``sys.stderr`` by pointing them at os.devnull.

    A real file object (not io.StringIO) is used so benchmarked code that
    calls ``.fileno()`` on the streams keeps working.
    """
    with open(os.devnull, "w") as devnull:
        saved_out, saved_err = sys.stdout, sys.stderr
        sys.stdout = sys.stderr = devnull
        try:
            yield
        finally:
            sys.stdout, sys.stderr = saved_out, saved_err


def process_func(func, *args, repeat=10, warmup=3, **kwargs):
    """Run *func* ``repeat`` times and record its average latency in ms.

    ``warmup`` un-timed calls are issued first.  Failed repeats are skipped
    (with the traceback echoed to stderr); if at least one repeat succeeds
    the average over the successful runs is appended to ``_RECORDS``,
    otherwise only a RuntimeWarning is emitted.
    """
    try:
        with suppress_output():
            for _ in range(warmup):
                func(*args, **kwargs)
    except Exception:
        # Warm-up is best-effort: a real failure will resurface (and be
        # reported) in the timed loop below.
        pass

    times = []
    fail_count = 0
    for _ in range(repeat):
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        try:
            with suppress_output():
                func(*args, **kwargs)
            times.append((time.perf_counter() - start) * 1000)
        except Exception:
            fail_count += 1
            traceback.print_exc(file=sys.stderr)

    if not times:
        warnings.warn(
            f"benchmark for {func.__module__} failed in all repeats (no valid run)",
            RuntimeWarning,
            stacklevel=2,
        )
        return

    if fail_count:
        warnings.warn(
            f"benchmark for {func.__module__} failed {fail_count} times in {repeat} repeats",
            RuntimeWarning,
            stacklevel=2,
        )
    _RECORDS.append((func.__module__, sum(times) / len(times)))


def analyze_records(records, out_dir):
    """Print *records* as a markdown table and save a bar chart under *out_dir*."""
    # Plotting/tabulation deps are imported lazily so merely importing the
    # bench runner does not require them.
    from tabulate import tabulate
    import matplotlib.pyplot as plt

    records.sort(key=lambda x: x[1])
    headers = ["Functions", "Avg Latency (ms)"]
    # Tabulate the records argument, not the _RECORDS global, so the function
    # actually honours what the caller passed in.
    print(tabulate(records, headers=headers, tablefmt="github", stralign="left",
                   numalign="decimal"))

    names = [r[0] for r in records]
    lats = [r[1] for r in records]
    plt.figure(figsize=(max(len(names) * 2.2, 6), 6))
    plt.bar(names, lats)
    # Latency is on the y-axis of a vertical bar chart.
    plt.ylabel("Latency (ms)")
    plt.title("Benchmark Results")
    out_path = os.path.join(out_dir, "bench_result.png")

    plt.tight_layout()
    plt.savefig(out_path, dpi=200)
    plt.close()

    print(f"Saved Bar chart to {out_path}")


def main():
    """Entry point: run every ``bench_*`` function defined in the caller's file.

    The caller's source file is re-executed in a scratch namespace so the
    bench functions can be collected without importing the file as a module.
    """
    test_file = inspect.getsourcefile(sys._getframe(1))
    out_dir = os.path.dirname(test_file)
    namespace = {}
    with open(test_file) as f:
        # Trusted input: only our own example files reach this exec.
        exec(f.read(), namespace)

    for name, func in namespace.items():
        if name.startswith("bench_") and callable(func):
            func()

    analyze_records(_RECORDS, out_dir)


def bench_all():
    """Run every ``bench_*`` function of every ``examples/**/bench_*.py`` module
    and print the accumulated latency table."""
    from tabulate import tabulate

    def _load_module(full_path):
        # Load a file as a real module (preserves sys.path, __file__, imports).
        module_name = os.path.splitext(os.path.basename(full_path))[0]
        spec = importlib.util.spec_from_file_location(module_name, full_path)
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)
        return mod

    current_dir = os.path.dirname(os.path.abspath(__file__))
    examples_root = os.path.abspath(os.path.join(current_dir, "../../examples"))

    bench_funcs = []
    added_roots = set()

    for root, _, files in os.walk(examples_root):
        for file_name in files:
            if re.match(r"^bench_.*\.py$", file_name):
                full_path = os.path.join(root, file_name)
                # Each example dir goes on sys.path once so sibling example
                # modules can be imported by the bench file.
                if root not in added_roots:
                    sys.path.insert(0, root)
                    added_roots.add(root)
                mod = _load_module(full_path)
                for name in dir(mod):
                    func = getattr(mod, name)
                    if name.startswith("bench_") and callable(func):
                        bench_funcs.append(func)

    for func in bench_funcs:
        func()

    print(tabulate(_RECORDS, tablefmt="github", stralign="left", numalign="decimal"))