Commit d18a342

Add Sparsify overhead benchmark (#3021)

Summary: This PR adds the sparsify overhead benchmark that was omitted from the ICLR workshop paper (https://arxiv.org/abs/2503.16672). The paper's benchmark has two parts: 1) sparsify operation overhead, and 2) sparse-GEMM kernel performance. Part 1) was missing from the original benchmark script, so this PR adds the sparsify-only benchmark comparing `torchao.sparse24_sm90_sparsify` against the `torch._cslt_compress` (cuSPARSELt) baseline.

Test plan: CI

* remove lambda, scale for fair comparison
* rename attributes to prevent duplicate naming
1 parent 6c1f503 commit d18a342
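For context, `sparse24_sm90_sparsify` produces 2:4 semi-structured sparsity, and the `"largest"` selection keeps the two highest-magnitude values in each group of four. Below is a minimal NumPy sketch of that pruning rule, illustrative only: the real op is a fused SM90 CUDA kernel that also packs metadata and can cast the output to FP8, and the function name here is invented for the example.

```python
import numpy as np

def sparsify24_largest(x):
    # Illustrative 2:4 sparsification: in each contiguous group of 4
    # elements, keep the 2 with the largest magnitude and zero the rest.
    x = np.asarray(x, dtype=np.float32)
    groups = x.reshape(-1, 4)
    # Indices of the 2 smallest-magnitude entries in each group of 4.
    drop = np.argsort(np.abs(groups), axis=1)[:, :2]
    out = groups.copy()
    np.put_along_axis(out, drop, 0.0, axis=1)
    return out.reshape(x.shape)

# For [1.0, -3.0, 0.5, 2.0], the two largest magnitudes are -3.0 and 2.0.
print(sparsify24_largest([1.0, -3.0, 0.5, 2.0]))  # → [0. -3. 0. 2.]
```

The structured 50% sparsity is what lets the subsequent GEMM use the sparse tensor cores; the benchmark in this PR measures only the cost of producing that pattern.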

File tree

1 file changed: +15 −0 lines changed

benchmarks/benchmark_e2e_fp8_sparse_linear.py

Lines changed: 15 additions & 0 deletions
```diff
@@ -40,6 +40,18 @@ def benchmark(num_tokens, hidden_size=8192, intermediate_size=8192):
     input_tensor = torch.randn(num_tokens, hidden_size).to(torch.bfloat16).cuda()
     fp16_time = benchmark_microseconds(ffn_ref, input_tensor)
 
+    # Sparsify-only benchmarks
+    ao_fast_sparsification_time = benchmark_microseconds(
+        torch.ops.torchao.sparse24_sm90_sparsify(
+            input_tensor,
+            "cutlass",
+            "identity",
+            "largest",
+            dtype=torch.float8_e4m3fn,
+        )
+    )
+    cusparselt_time = benchmark_microseconds(torch._cslt_compress, input_tensor)
+
     # bf16
     ffn_clone = (
         nn.Sequential(
@@ -117,7 +129,10 @@ def benchmark(num_tokens, hidden_size=8192, intermediate_size=8192):
         "fp8_c_time (us)": fp8_c_time,
         "fp8_c_sparse_time (us)": fp8_c_sparse_time,
         "fp8_c_activation_sparse_time (us)": fp8_c_activation_sparse_time,
+        "ao_fast_sparsification_time (us)": ao_fast_sparsification_time,
+        "cusparselt_compress_time (us)": cusparselt_time,
         "speedup": fp8_c_time / fp8_c_activation_sparse_time,
+        "sparsify_speedup": cusparselt_time / ao_fast_sparsification_time,
     }
```
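The `benchmark_microseconds` helper itself is not part of this hunk. As a rough idea of what such a helper does, here is a hypothetical CPU-only stand-in, assuming it takes a callable plus its arguments and returns a median latency in microseconds; the real helper must additionally synchronize the CUDA device so kernel launches are actually timed.

```python
import statistics
import time

def benchmark_microseconds(fn, *args, iters=100, warmup=10, **kwargs):
    # Hypothetical stand-in: warm up, then report the median
    # wall-clock time of fn(*args, **kwargs) in microseconds.
    for _ in range(warmup):
        fn(*args, **kwargs)
    samples = []
    for _ in range(iters):
        start = time.perf_counter()
        fn(*args, **kwargs)
        samples.append((time.perf_counter() - start) * 1e6)
    return statistics.median(samples)

# Usage mirrors the diff's speedup column: baseline time over candidate time.
cusparselt_time = benchmark_microseconds(sorted, list(range(1000)))
ao_time = benchmark_microseconds(sorted, list(range(100)))
sparsify_speedup = cusparselt_time / ao_time
```

A median is the usual choice here because microbenchmark samples are skewed by occasional scheduler or cache noise.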
