 import torch

 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    w8a8_block_fp8_matmul,
+    apply_w8a8_block_fp8_linear,
+)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    CUTLASS_BLOCK_FP8_SUPPORTED,
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton as vllm_triton
 ]


-def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
+def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
     """Build runner function for w8a8 block fp8 matmul."""
     factor_for_scale = 1e-2

     fp8_info = torch.finfo(torch.float8_e4m3fn)
     fp8_max, fp8_min = fp8_info.max, fp8_info.min

     # Create random FP8 tensors
-    A_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
-    A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max

-    B_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
-    B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
+    B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)

     # Create scales
     block_n, block_k = block_size[0], block_size[1]
     n_tiles = (N + block_n - 1) // block_n
     k_tiles = (K + block_k - 1) // block_k

-    As = torch.rand(M, k_tiles, dtype=torch.float32, device=device) * factor_for_scale
     Bs = (
         torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
         * factor_for_scale
     )

+    # SM90 CUTLASS requires row-major format for scales
+    if use_cutlass and current_platform.is_device_capability(90):
+        Bs = Bs.T.contiguous()
+
     def run():
-        return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, torch.bfloat16)
+        if use_cutlass:
+            return apply_w8a8_block_fp8_linear(
+                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
+            )
+        else:
+            return apply_w8a8_block_fp8_linear(
+                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
+            )

     return run


+# Determine available providers
+available_providers = ["torch-bf16", "w8a8-block-fp8-triton"]
+plot_title = "BF16 vs W8A8 Block FP8 GEMMs"
+
+if CUTLASS_BLOCK_FP8_SUPPORTED:
+    available_providers.append("w8a8-block-fp8-cutlass")
+
+
 @vllm_triton.testing.perf_report(
     vllm_triton.testing.Benchmark(
         x_names=["batch_size"],
         x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
         x_log=False,
         line_arg="provider",
-        line_vals=["torch-bf16", "w8a8-block-fp8"],
-        line_names=["torch-bf16", "w8a8-block-fp8"],
+        line_vals=available_providers,
+        line_names=available_providers,
         ylabel="TFLOP/s (larger is better)",
         plot_name="BF16 vs W8A8 Block FP8 GEMMs",
         args={},
@@ -85,11 +105,22 @@ def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
         ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
             lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
         )
-    else:  # w8a8-block-fp8
-        run_w8a8 = build_w8a8_block_fp8_runner(M, N, K, block_size, device)
+    elif provider == "w8a8-block-fp8-triton":
+        run_w8a8_triton = build_w8a8_block_fp8_runner(
+            M, N, K, block_size, device, use_cutlass=False
+        )
+        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+            lambda: run_w8a8_triton(), quantiles=quantiles
+        )
+    elif provider == "w8a8-block-fp8-cutlass":
+        run_w8a8_cutlass = build_w8a8_block_fp8_runner(
+            M, N, K, block_size, device, use_cutlass=True
+        )
         ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
-            lambda: run_w8a8(), quantiles=quantiles
+            lambda: run_w8a8_cutlass(), quantiles=quantiles
         )
+    else:
+        raise ValueError(f"Unknown provider: {provider}")

     to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
     return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
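
Note: a minimal driver sketch for the benchmark above, assuming triton's standard perf_report/Mark.run interface; the __main__ block and the (N, K) shapes here are illustrative only and are not part of this diff:

if __name__ == "__main__":
    # Hypothetical weight shapes; the actual script may sweep a different set of (N, K) pairs.
    for N, K in [(4096, 4096), (8192, 8192)]:
        print(f"Benchmarking N={N}, K={K}")
        # perf_report wraps benchmark_tflops in a Mark object whose run() forwards
        # extra kwargs (here N and K) to the benchmarked function.
        benchmark_tflops.run(print_data=True, show_plots=False, N=N, K=K)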