@@ -206,7 +206,6 @@ def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared, Scale_shared, k):
     B_local = T.alloc_fragment(B_shared_shape, storage_dtype)
     B_dequantize_local = T.alloc_fragment(B_dequantize_shared_shape, out_dtype)

-    bx = T.get_block_binding(0)  # noqa: F841
     T.copy(B_shared, B_local)
     for i, j in T.Parallel(block_N, block_K):
         B_dequantize_local[i, j] = _tir_u8_to_f4_to_bf16(
@@ -244,7 +243,7 @@ def main(
     C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
     C_shared = T.alloc_shared((block_M, block_N), out_dtype)
     topk_weights_shared = T.alloc_shared((block_M), out_dtype)
-    sorted_token_ids_shared = T.alloc_shared((block_M), "int32")  # todo: frag?
+    sorted_token_ids_shared = T.alloc_shared((block_M), "int32")
     expert_id = T.alloc_local((1), "int32")  # the expert id for the current block
     # To use 1D TMA, the last dim of Scale_shared must have stride=1
     # May use much more shared memory than necessary
@@ -462,4 +461,4 @@ def main(m=256, n=256, k=256, scale_size=32, fast_dequant=True, with_bias=False,
     scale_size = 32
     topk = 4
     E = 32
-    main(M, N, K, scale_size, fast_dequant=True, with_bias=True, topk=topk, E=E)
+    main(M, N, K, scale_size, fast_dequant=True, with_bias=True, topk=topk, E=E)
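
For context on the dequantization path touched above: a minimal reference sketch, in plain Python, of the per-element conversion that `_tir_u8_to_f4_to_bf16` performs, assuming FP4 is E2M1 (1 sign bit, 2 exponent bits with bias 1, 1 mantissa bit) packed two values per uint8. The nibble order and helper names are illustrative assumptions, not taken from this commit, and the per-group Scale multiply done in the kernel is omitted here.

def fp4_e2m1_to_float(nibble: int) -> float:
    # Assumed E2M1 layout: bit 3 = sign, bits 2-1 = exponent (bias 1), bit 0 = mantissa.
    sign = -1.0 if (nibble >> 3) & 0x1 else 1.0
    exp = (nibble >> 1) & 0x3
    man = nibble & 0x1
    if exp == 0:
        return sign * man * 0.5  # subnormal: 0.0 or +/-0.5
    return sign * (1.0 + man * 0.5) * 2.0 ** (exp - 1)

def unpack_fp4_pair(byte: int) -> tuple:
    # Assumed packing: low nibble holds element 2*i, high nibble holds element 2*i+1.
    return fp4_e2m1_to_float(byte & 0xF), fp4_e2m1_to_float(byte >> 4)

# The representable E2M1 magnitudes are exactly {0, 0.5, 1, 1.5, 2, 3, 4, 6};
# every magnitude fits losslessly in bf16, so the widening conversion is exact.
assert sorted({abs(fp4_e2m1_to_float(v)) for v in range(16)}) == [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]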