
Commit 4516f6e

Fix Float8Tensor quantize op kernel preference dispatch
Summary:

Previously, if the user specified kernel_preference == "fbgemm", we would still use torch ops such as `_choose_scale_float8` and `_quantize_affine_float8` to quantize the high precision Tensor into a float8 Tensor. This PR makes sure we use fbgemm kernels when kernel_preference is "fbgemm": `torch.ops.triton.quantize_fp8_row` for per row quantization, and `torch.ops.fbgemm.quantize_fp8_per_tensor` for per tensor quantization (the per tensor op has some issues right now, so it will be enabled later once it is fixed).

This has no impact on BC: previously serialized models can still be loaded and run. The only change is the kernel choice for the fbgemm kernel preference, so users who requested the FBGEMM kernel preference now actually run the fbgemm quantize op instead of the torch op.

Test Plan:

python test/quantization/quantize_/workflows/float8/test_float8_tensor.py -k test_expected_gpu_kernel_fbgemm

Reviewers:

Subscribers:

Tasks:

Tags:

stack-info: PR: #2883, branch: jerryzh168/stack/59
1 parent 2a53216 commit 4516f6e
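
For context, a minimal usage sketch of the path this commit fixes, mirroring the new test below: a user who requests KernelPreference.FBGEMM should now actually hit the fbgemm quantize kernel (`torch.ops.triton.quantize_fp8_row` for per row) instead of the torch ops. The import paths are assumed from the files touched in this commit and may differ from the actual public exports.

import torch

from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerRow,
    quantize_,
)
# assumed export location, based on the file path in this commit
from torchao.quantization.quantize_.common.kernel_preference import KernelPreference

# bf16 linear layer on an SM 9.0+ GPU with fbgemm_gpu_genai installed
m = torch.nn.Sequential(torch.nn.Linear(256, 512, device="cuda", dtype=torch.bfloat16))

# explicitly request the fbgemm quantize and mm kernels
config = Float8DynamicActivationFloat8WeightConfig(
    granularity=PerRow(),
    kernel_preference=KernelPreference.FBGEMM,
)
quantize_(m, config)

# after this fix, the per row quantize step dispatches to torch.ops.triton.quantize_fp8_row;
# the per tensor path (torch.ops.fbgemm.quantize_fp8_per_tensor) is still disabled
x = torch.randn(128, 256, device="cuda", dtype=torch.bfloat16)
out = m(x)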

File tree

3 files changed: +86, -17 lines

test/quantization/quantize_/workflows/float8/test_float8_tensor.py
torchao/quantization/quantize_/common/kernel_preference.py
torchao/quantization/quantize_/workflows/float8/float8_tensor.py

test/quantization/quantize_/workflows/float8/test_float8_tensor.py

Lines changed: 41 additions & 1 deletion
@@ -10,6 +10,8 @@
 from typing import Tuple
 
 import torch
+from torch._inductor.utils import run_and_get_code
+from torch.testing import FileCheck
 from torch.testing._internal import common_utils
 from torch.testing._internal.common_utils import (
     run_tests,
@@ -85,6 +87,14 @@ def test_fp8_linear_variants(
         kernel_preference: KernelPreference,
         sizes: Tuple,
     ):
+        if (
+            isinstance(granularity, PerTensor)
+            and kernel_preference == KernelPreference.FBGEMM
+        ):
+            return unittest.skip(
+                "per tensor with fbgemm kernel preferece does not work yet"
+            )
+
         error_message = None
         if isinstance(granularity, PerRow):
             if mode == "dynamic" and dtype != torch.bfloat16:
@@ -237,7 +247,11 @@ def test_kernel_preference_numerical_equivalence(self, granularity, sizes):
         other_kernel_preferences = [
             KernelPreference.AUTO,
         ]
-        if _is_fbgemm_genai_gpu_available() and is_sm_at_least_90():
+        if (
+            _is_fbgemm_genai_gpu_available()
+            and is_sm_at_least_90()
+            and not isinstance(granularity, PerTensor)
+        ):
             other_kernel_preferences.append(KernelPreference.FBGEMM)
 
         quantized_outputs = {}
@@ -399,6 +413,32 @@ def test_moe_weight_reshape_ops(self):
         config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
         self._test_moe_weight_reshape_ops(config)
 
+    # TODO: we have some other tests living in https://github.com/pytorch/ao/blob/4ecc89edd7b5cfc12e6f80854c85d04c472a0eb0/test/dtypes/test_affine_quantized_float.py#L743
+    # that should be moved here after v1 config is deprecated:
+    # https://github.com/pytorch/ao/issues/2649
+    @unittest.skipIf(not is_sm_at_least_90(), "Nedd sm90+")
+    def test_expected_gpu_kernel_fbgemm(self):
+        """Making sure KernelPreference.FBGEMM calls correct quantize and gemm kernels"""
+        torch.compiler.reset()
+
+        M, K, N = 128, 256, 512
+        m = torch.nn.Sequential(
+            torch.nn.Linear(K, N, device="cuda", dtype=torch.bfloat16)
+        )
+        config = Float8DynamicActivationFloat8WeightConfig(
+            granularity=PerRow(),
+            kernel_preference=KernelPreference.FBGEMM,
+        )
+        quantize_(m, config)
+        m = torch.compile(m)
+        x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+        out, code = run_and_get_code(m, x)
+
+        # check at least one occurrence of the quantize op and rowwise gemm op
+        FileCheck().check_count(
+            "torch.ops.triton.quantize_fp8_row.default", 1
+        ).check_count("torch.ops.fbgemm.f8f8bf16_rowwise.default", 1).run(code[0])
+
 
 common_utils.instantiate_parametrized_tests(TestFloat8Tensor)
 

torchao/quantization/quantize_/common/kernel_preference.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ class KernelPreference(str, Enum):
     """
     TORCH = "torch"
 
-    """Use fbgemm quantize and quantized mm kernels, requires fbgemm_gpu_genai library
+    """Use quantize and quantized mm kernels from fbgemm_gpu_genai library, requires fbgemm_gpu_genai library
     """
     FBGEMM = "fbgemm"
 
torchao/quantization/quantize_/workflows/float8/float8_tensor.py

Lines changed: 44 additions & 15 deletions
@@ -22,7 +22,7 @@
     preprocess_data,
     preprocess_scale,
 )
-from torchao.quantization.granularity import PerRow
+from torchao.quantization.granularity import PerRow, PerTensor
 from torchao.quantization.observer import get_block_size
 from torchao.quantization.quant_primitives import (
     _choose_scale_float8,
@@ -177,32 +177,61 @@ def to_float8(
         block_size = get_block_size(hp_tensor.shape, granularity)
         block_size = list(block_size)
 
-        # for per row quantization and kernel_preference default setting, we'll use triton kernel for best performance
+        kernel_choice = None
         if (
             kernel_preference == KernelPreference.AUTO
             and _is_fbgemm_genai_gpu_available()
-            and (
-                tuple(block_size)
-                == (1,) * (hp_tensor.ndim - 1) + (hp_tensor.shape[-1],)
-            )
+            and is_sm_at_least_90()
+            and isinstance(granularity, PerRow)
+            and float8_dtype == torch.float8_e4m3fn
+            and hp_value_lb is None
         ):
-            assert float8_dtype == torch.float8_e4m3fn, (
-                f"Only torch.float8_e4m3fn is supported, got: {float8_dtype}"
+            # if kernel_preference is AUTO and per row quantization
+            # we'll use fbgemm quantize kernel for best performance
+            kernel_choice = "fbgemm"
+        elif kernel_preference == KernelPreference.FBGEMM:
+            # if user explicitly chose FBGEMM kernel preference, we'll also use fbgemm kernel
+            assert _is_fbgemm_genai_gpu_available() and is_sm_at_least_90(), (
+                "Specified fbgemm but fbgemm_gpu_genai is not installed or hardware is not >= SM 9.0 (>= H100)"
+            )
+            assert hp_value_lb is None, (
+                "hp_value_lb should not be specified if with KerenelPreference.FBGEMM"
             )
+            kernel_choice = "fbgemm"
+        else:
+            # fallback quantize kernel for everything else will be torch
+            kernel_choice = "torch"
+
+        if kernel_choice == "fbgemm":
+            assert hp_value_lb is None, f"{hp_value_lb=} is not supported"
             if hp_value_ub is not None:
                 maybe_hp_value_ub_tensor = torch.tensor(
                     hp_value_ub, dtype=torch.float, device=hp_tensor.device
                 )
             else:
                 maybe_hp_value_ub_tensor = None
-            data, scale = torch.ops.triton.quantize_fp8_row(
-                hp_tensor, scale_ub=maybe_hp_value_ub_tensor
-            )
-            scale_shape = []
-            for i in range(hp_tensor.ndim):
-                scale_shape.append(hp_tensor.shape[i] // block_size[i])
-            scale = scale.reshape(*scale_shape)
+            if isinstance(granularity, PerRow):
+                data, scale = torch.ops.triton.quantize_fp8_row(
+                    hp_tensor, scale_ub=maybe_hp_value_ub_tensor
+                )
+                scale_shape = []
+                for i in range(hp_tensor.ndim):
+                    scale_shape.append(hp_tensor.shape[i] // block_size[i])
+                scale = scale.reshape(*scale_shape)
+            else:
+                assert isinstance(granularity, PerTensor), (
+                    f"Expected per tensor, got {granularity}"
+                )
+                # current error: torch.AcceleratorError: CUDA error: an illegal memory access was encountered
+                # TODO: enable after this is working
+                # data, scale = torch.ops.fbgemm.quantize_fp8_per_tensor(
+                #     hp_tensor, num_tokens, scale_ub=maybe_hp_value_ub_tensor
+                # )
+                raise NotImplementedError(
+                    "Currently KernelPreference.FBGEMM does not work for per tensor float8 quant"
+                )
         else:
+            assert kernel_choice == "torch", f"Expected torch, got {kernel_choice}"
             scale = _choose_scale_float8(
                 hp_tensor,
                 float8_dtype=float8_dtype,