Commit e7b310b

Float8Tensor per row quantization pass bias to fbgemm kernel (#2884)
Summary: Previously, bias was not passed to the fbgemm kernel for float8 per-row quantization; this PR adds it. The result is a faster float8 per-row quantized path, without changing numerics or anything else.

Test Plan:
```
python test/quantization/quantize_/workflows/float8/test_float8_tensor.py -k test_kernel_preference_numerical_equivalence
python test/quantization/quantize_/workflows/float8/test_float8_tensor.py -k test_expected_gpu_kernel_fbgemm
```

Reviewers:

Subscribers:

Tasks:

Tags:

stack-info: PR: #2884, branch: jerryzh168/stack/60
1 parent 2dacd7f commit e7b310b
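In effect, the change moves the bias add out of a separate elementwise kernel and into the epilogue of the fbgemm rowwise gemm. The sketch below only illustrates that before/after call pattern and is not code from this PR: it assumes fbgemm_gpu_genai is installed on an SM90+ GPU, and the shapes, dtypes, and pre-quantized inputs are made up for the example.

```python
import torch

# Hypothetical pre-quantized float8 inputs with per-row scales (illustration only).
M, K, N = 128, 256, 512
xq = torch.randn(M, K, device="cuda").to(torch.float8_e4m3fn)   # activation
wq = torch.randn(N, K, device="cuda").to(torch.float8_e4m3fn)   # weight
x_scale = torch.rand(M, device="cuda", dtype=torch.float32)
w_scale = torch.rand(N, device="cuda", dtype=torch.float32)
bias = torch.randn(N, device="cuda", dtype=torch.bfloat16)

# Before this PR: the rowwise gemm ran without bias, and bias was added afterwards
# in a separate elementwise kernel.
res_before = torch.ops.fbgemm.f8f8bf16_rowwise(xq, wq, x_scale, w_scale) + bias

# After this PR: bias is passed into the gemm kernel, so no extra add kernel is launched.
res_after = torch.ops.fbgemm.f8f8bf16_rowwise(xq, wq, x_scale, w_scale, bias=bias)

# The two should agree up to float8/bfloat16 rounding.
print((res_after.float() - res_before.float()).abs().max())
```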

File tree: 2 files changed, +18 -6 lines changed


test/quantization/quantize_/workflows/float8/test_float8_tensor.py

Lines changed: 11 additions & 4 deletions
```diff
@@ -418,7 +418,9 @@ def test_moe_weight_reshape_ops(self):
     # https://github.com/pytorch/ao/issues/2649
     @unittest.skipIf(not is_sm_at_least_90(), "Nedd sm90+")
     def test_expected_gpu_kernel_fbgemm(self):
-        """Making sure KernelPreference.FBGEMM calls correct quantize and gemm kernels"""
+        """Making sure KernelPreference.FBGEMM calls correct quantize and gemm kernels
+        and the bias add happens in the gemm kernel for per row quantization
+        """
         torch.compiler.reset()
 
         M, K, N = 128, 256, 512
@@ -434,10 +436,15 @@ def test_expected_gpu_kernel_fbgemm(self):
         x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
         out, code = run_and_get_code(m, x)
 
-        # check at least one occurrence of the quantize op and rowwise gemm op
+        # 1. check at least one occurrence of the quantize op and rowwise gemm op
+        # 2. check that there are no additional kernels like `triton_poi_fused_add_0`
+        # are run, since the bias add should happen in the `f8f8bf16_rowwise.default`
+        # op instead of separately
         FileCheck().check_count(
-            "torch.ops.triton.quantize_fp8_row.default", 1
-        ).check_count("torch.ops.fbgemm.f8f8bf16_rowwise.default", 1).run(code[0])
+            "torch.ops.triton.quantize_fp8_row.default(", 1
+        ).check_count("torch.ops.fbgemm.f8f8bf16_rowwise.default(", 1).check_not(
+            ".run("
+        ).run(code[0])
 
 
 common_utils.instantiate_parametrized_tests(TestFloat8Tensor)
```
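For readers unfamiliar with the checking pattern above: run_and_get_code compiles and runs the module while capturing the Inductor-generated source, and FileCheck then asserts which op calls do and do not appear in it. Below is a standalone sketch of that pattern on a toy module; the checked substrings (`extern_kernels.mm`, `triton_poi_fused_add`) are assumptions about typical Inductor output and vary by device, backend, and PyTorch version.

```python
import torch
from torch._inductor.utils import run_and_get_code
from torch.testing import FileCheck

class Toy(torch.nn.Module):
    def forward(self, x):
        return torch.mm(x, x)

m = torch.compile(Toy())
x = torch.randn(64, 64)

# Run the compiled module and capture the Inductor-generated wrapper code.
_, code = run_and_get_code(m, x)

# Assert the gemm call appears exactly once and no fused add kernel was emitted.
# The exact strings are backend-dependent; this only mirrors the style of the test above.
FileCheck().check_count("extern_kernels.mm(", 1).check_not(
    "triton_poi_fused_add"
).run(code[0])
```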

torchao/quantization/quantize_/workflows/float8/float8_tensor.py

Lines changed: 7 additions & 2 deletions
```diff
@@ -285,6 +285,8 @@ def _(func, types, args, kwargs):
             "Expected fbgemm_gpu_genai package to be installed"
         )
         assert is_sm_at_least_90(), "Expected SM90+ for fbgemm_gpu_genai"
+        mm_config = weight_tensor.mm_config
+        assert mm_config is not None
 
         out_shape = get_out_shape(input_tensor.shape, weight_tensor.shape)
         xq = input_tensor.qdata.reshape(-1, input_tensor.qdata.shape[-1])
@@ -300,6 +302,8 @@ def _(func, types, args, kwargs):
                 wq,
                 x_scale,
                 w_scale,
+                bias=bias,
+                use_fast_accum=mm_config.use_fast_accum,
             ).reshape(out_shape)
         else:
             assert _is_tensorwise_scaled(weight_tensor)
@@ -308,9 +312,10 @@ def _(func, types, args, kwargs):
                 xq,
                 wq,
                 x_scale * w_scale,
+                use_fast_accum=mm_config.use_fast_accum,
             ).reshape(out_shape)
-        if bias is not None:
-            res = res + bias
+            if bias is not None:
+                res = res + bias
         return res
     else:
         assert kernel_choice == "torch"
```
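For context, here is a rough end-to-end sketch of how this code path is reached from the user-facing API. It assumes a recent torchao build with fbgemm_gpu_genai installed on an SM90+ GPU; the KernelPreference import path and the exact config arguments are assumptions and may differ between torchao versions.

```python
import torch
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerRow,
    quantize_,
)
# Assumed import path; KernelPreference may live elsewhere depending on the torchao version.
from torchao.quantization.quantize_.common import KernelPreference

m = torch.nn.Linear(256, 512, bias=True, device="cuda", dtype=torch.bfloat16)
quantize_(
    m,
    Float8DynamicActivationFloat8WeightConfig(
        granularity=PerRow(),
        kernel_preference=KernelPreference.FBGEMM,  # route matmuls to the fbgemm kernels above
    ),
)

x = torch.randn(128, 256, device="cuda", dtype=torch.bfloat16)
# With this change, the layer's bias is passed into f8f8bf16_rowwise rather than
# being added in a separate elementwise kernel after the gemm.
out = torch.compile(m)(x)
```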
