@@ -87,8 +87,8 @@ def flash_mla():
 
 
 @torch.inference_mode()
-def run_flash_infer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens,
-                    h_q, h_kv, d, dv, causal, dtype):
+def run_flashinfer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens,
+                   h_q, h_kv, d, dv, causal, dtype):
     # pip install flashinfer-python
     import flashinfer
     assert d > dv, "mla with rope dim should be larger than no rope dim"
@@ -128,7 +128,7 @@ def run_flash_infer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_
         blocked_k.dtype,
     )
 
-    def flash_infer():
+    def flashinfer():
         output, lse = mla_wrapper.run(
             q_nope.view(-1, h_q, dv),
             q_pe.view(-1, h_q, d - dv),
@@ -137,8 +137,8 @@ def flash_infer():
             return_lse=True)
         return output.view(b, -1, h_q, dv), lse.view(b, h_q, 1)
 
-    out_flash, lse_flash = flash_infer()
-    t = triton.testing.do_bench(flash_infer)
+    out_flash, lse_flash = flashinfer()
+    t = triton.testing.do_bench(flashinfer)
     return out_flash, lse_flash, t
 
 
@@ -459,7 +459,7 @@ def flash_mla_tilelang():
     "torch": run_torch_mla,
     "tilelang": run_flash_mla_tilelang,
     "flash_mla": run_flash_mla,
-    "flash_infer": run_flash_infer,
+    "flashinfer": run_flashinfer,
     "flash_mla_triton": run_flash_mla_triton,
 }
 
@@ -496,9 +496,9 @@ def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal
                                          s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype)
 
     torch.testing.assert_close(out_b.float(), out_a.float(), atol=1e-2, rtol=1e-2), "out"
-    if target not in ["flash_infer", "flash_mla_triton", "tilelang"
-                      ] and baseline not in ["flash_infer", "flash_mla_triton", "tilelang"]:
-        # flash_infer has a different lse return value
+    if target not in ["flashinfer", "flash_mla_triton", "tilelang"
+                      ] and baseline not in ["flashinfer", "flash_mla_triton", "tilelang"]:
+        # flashinfer has a different lse return value
         # flash_mla_triton and flash_mla_tilelang doesn't return lse
         torch.testing.assert_close(lse_b.float(), lse_a.float(), atol=1e-2, rtol=1e-2), "lse"
 
@@ -554,7 +554,7 @@ def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype):
     "torch",
     "tilelang",
     "flash_mla",
-    "flash_infer",
+    "flashinfer",
     "flash_mla_triton",
 ]
 
0 commit comments