BYTES_PER_EL_FLOAT4 = 0.5
BYTES_PER_EL_FLOAT8 = 1
BYTES_PER_EL_BF16 = 2
+BYTES_PER_EL_FLOAT8_E8M0 = 1
+BYTES_PER_EL_FLOAT32 = 4

gpu_name_to_specs = {
    "NVIDIA H100": {
@@ -241,7 +243,7 @@ def get_individual_gemm_time_sympy(
    elif dtype is torch.float4_e2m1fn_x2:
        peak_tops = specs["fp4_peak_tops"]
    else:
-        assert False, "unsupported"
+        assert False, f"unsupported dtype: {dtype}"
    compute_gemm_time_s = gemm_ops / peak_tops / specs["pct_achievable_gemm_tops"]

    # memory bound
@@ -274,7 +276,7 @@ def get_individual_gemm_time_sympy(
    elif dtype is torch.float4_e2m1fn_x2:
        bytes_rw = num_reads * BYTES_PER_EL_FLOAT4 + num_writes * BYTES_PER_EL_BF16
    else:
-        assert False, "unsupported"
+        assert False, f"unsupported dtype: {dtype}"
    mem_gemm_time_s = (
        bytes_rw / specs["peak_mem_bw_bytes_sec"] / specs["pct_achievable_mem_bw"]
    )
@@ -376,27 +378,56 @@ def get_inference_tensor_memory_traffic_ovhd_s(
    dim1,
    tensor_role: str,
    float8_recipe_name: Optional[str],
+    mx_recipe_name: Optional[str],
    fuse_with_prev=False,
) -> List[Union[sympy.Symbol, float]]:
    """
    Inference version of `get_tensor_memory_traffic_ovhd_s`.
    The only thing happening here is we quantize the activation.
    """
-    assert float8_recipe_name == "rowwise", "unsupported"
    assert fuse_with_prev is False, "unsupported"
+    assert tensor_role == "input", "inference only quantizes input activations"

    # assumes input bf16, output f8
    numel = dim0 * dim1

    res_bytes = None

-    assert tensor_role == "input"
-    # x_bf16 = ...
-    # kernel 1:               x_bf16 -> x_fp8
-    kernel_1_rw = BYTES_PER_EL_BF16 * numel + BYTES_PER_EL_FLOAT8 * numel
-    res_bytes = [
-        kernel_1_rw,
-    ]
+    if float8_recipe_name == "tensorwise":
+        # x_bf16 = ...
+        # kernel 1:               x_bf16 -> max_abs_stage_1 -> tmp
+        # kernel 2 (mem traffic not modeled): tmp -> max_abs_stage_2 -> max_abs
+        # kernel 3:               x_bf16, max_abs -> to_float8 -> x_fp8
+        # kernel 1: read numel, write 0 (assume size(tmp) ~ 0)
+        kernel_1_rw = BYTES_PER_EL_BF16 * numel
+        # kernel 3: read in bf16, write in float8
+        kernel_3_rw = BYTES_PER_EL_BF16 * numel + BYTES_PER_EL_FLOAT8 * numel
+        res_bytes = [kernel_1_rw, kernel_3_rw]
+
+    elif float8_recipe_name == "rowwise":
+        # x_bf16 = ...
+        # kernel 1:               x_bf16 -> x_fp8 (with per-row scaling)
+        kernel_1_rw = BYTES_PER_EL_BF16 * numel + BYTES_PER_EL_FLOAT8 * numel
+        # add in the bytes for scale writes
+        kernel_1_rw += BYTES_PER_EL_FLOAT32 * dim0
+        res_bytes = [kernel_1_rw]
+
+    elif mx_recipe_name in ("mxfp8_emulated", "mxfp8_cublas", "mxfp8_cublas_rceil"):
+        # x_bf16 = ...
+        # kernel 1:               x_bf16 -> x_mxfp8 (block-wise scaling for inference)
+        kernel_1_rw = BYTES_PER_EL_BF16 * numel + BYTES_PER_EL_FLOAT8 * numel
+        # add in the bytes for scale writes
+        kernel_1_rw += BYTES_PER_EL_FLOAT8_E8M0 * dim0 * (dim1 // 32)
+        res_bytes = [kernel_1_rw]
+
+    else:
+        # For NVFP4, assume minimal overhead since it's primarily a compute format
+        # x_bf16 = ...
+        # kernel 1:               x_bf16 -> x_nvfp4 (per-tensor scaling for inference)
+        kernel_1_rw = BYTES_PER_EL_BF16 * numel + BYTES_PER_EL_FLOAT4 * numel
+        # add minimal scaling overhead (per-tensor scale)
+        kernel_1_rw += BYTES_PER_EL_FLOAT32  # single scale factor
+        res_bytes = [kernel_1_rw]

    # convert from bytes to seconds
    res_s = [
@@ -415,6 +446,8 @@ def get_inference_float8_mem_sympy(
    K,
    N,
    float8_recipe_name: Optional[str],
+    mx_recipe_name: Optional[str] = None,
+    nvfp4_recipe_name: Optional[str] = None,
    gpu_name: Optional[str] = None,
):
    specs = get_specs(gpu_name)
@@ -426,6 +459,7 @@ def get_inference_float8_mem_sympy(
        K,
        tensor_role="input",
        float8_recipe_name=float8_recipe_name,
+        mx_recipe_name=mx_recipe_name,
        fuse_with_prev=False,
    )
    res = sum([*fwd_fp8_input_mem])
@@ -438,9 +472,9 @@ def get_inference_gemm_time_sympy(
    N: sympy.Symbol,
    dtype,
    float8_recipe_name: Optional[str],
-    gpu_name: Optional[str],
+    nvfp4_recipe_name: Optional[str] = None,
+    gpu_name: Optional[str] = None,
):
-    assert float8_recipe_name == "rowwise" or float8_recipe_name is None, "unsupported"
    # note: this function is currently not super accurate for small shapes:
    # when M,K,N <= 1k,1k,1k it undercounts by around 2x
    gemm_output_time_s = get_individual_gemm_time_sympy(M, K, N, dtype, None, gpu_name)
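As a quick sanity check of the quantization-overhead modeling in `get_inference_tensor_memory_traffic_ovhd_s`, here is a minimal standalone sketch that reproduces the per-recipe byte counts for a bf16 activation of shape `(dim0, dim1)`. The function name `quantize_traffic_bytes` and the short recipe labels are illustrative only; the constants mirror the `BYTES_PER_EL_*` values defined at the top of the file, and the mxfp8 branch assumes a 32-element scaling block, as in the change above.

```python
# Sketch: quantization memory traffic (bytes), mirroring the branches above.
# Assumes a bf16 input of shape (dim0, dim1).
BYTES_PER_EL_FLOAT4 = 0.5
BYTES_PER_EL_FLOAT8 = 1
BYTES_PER_EL_BF16 = 2
BYTES_PER_EL_FLOAT8_E8M0 = 1
BYTES_PER_EL_FLOAT32 = 4


def quantize_traffic_bytes(dim0: int, dim1: int, recipe: str) -> float:
    numel = dim0 * dim1
    if recipe == "tensorwise":
        # kernel 1: max-abs pass reads bf16; kernel 3: bf16 read + fp8 write
        kernel_1 = BYTES_PER_EL_BF16 * numel
        kernel_3 = BYTES_PER_EL_BF16 * numel + BYTES_PER_EL_FLOAT8 * numel
        return kernel_1 + kernel_3
    if recipe == "rowwise":
        # bf16 read + fp8 write, plus one fp32 scale per row
        return (
            BYTES_PER_EL_BF16 * numel
            + BYTES_PER_EL_FLOAT8 * numel
            + BYTES_PER_EL_FLOAT32 * dim0
        )
    if recipe == "mxfp8":
        # bf16 read + fp8 write, plus one e8m0 scale per 32-element block
        return (
            BYTES_PER_EL_BF16 * numel
            + BYTES_PER_EL_FLOAT8 * numel
            + BYTES_PER_EL_FLOAT8_E8M0 * dim0 * (dim1 // 32)
        )
    # nvfp4 (as modeled here): bf16 read + fp4 write, plus one per-tensor scale
    return (
        BYTES_PER_EL_BF16 * numel
        + BYTES_PER_EL_FLOAT4 * numel
        + BYTES_PER_EL_FLOAT32
    )


for recipe in ("tensorwise", "rowwise", "mxfp8", "nvfp4"):
    print(recipe, quantize_traffic_bytes(4096, 4096, recipe))
```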