@@ -54,7 +54,12 @@
 from torchao.dtypes.utils import Layout
 from torchao.float8.config import e4m3_dtype, e5m2_dtype
 from torchao.float8.float8_linear import Float8Linear
-from torchao.float8.inference import Float8MMConfig
+from torchao.float8.inference import (
+    Float8MMConfig,
+    FP8Granularity,
+    _check_hardware_support,
+    _normalize_granularity,
+)
 from torchao.quantization.linear_activation_weight_observed_tensor import (
     LinearActivationWeightObservedTensor,
 )
@@ -1431,56 +1436,9 @@ def _float8_weight_only_transform(
     return module


-_fp8_granularities = Union[PerTensor, PerRow]
-
-
-# Validate and process granularity input
-def _normalize_granularity(
-    granularity: Optional[
-        Union[_fp8_granularities, Tuple[_fp8_granularities, _fp8_granularities]]
-    ],
-) -> Tuple[_fp8_granularities, _fp8_granularities]:
-    processed_granularity = None
-    if granularity is None:
-        processed_granularity = (PerTensor(), PerTensor())
-    elif isinstance(granularity, (PerTensor, PerRow)):
-        processed_granularity = (granularity, granularity)
-    elif isinstance(granularity, tuple) and len(granularity) == 2:
-        if not (
-            isinstance(granularity[0], (PerTensor, PerRow))
-            and isinstance(granularity[1], (PerTensor, PerRow))
-        ):
-            raise ValueError(
-                f"Invalid granularity types: {granularity}, only PerTensor or PerRow are supported."
-            )
-        if not isinstance(granularity[0], type(granularity[1])):
-            raise ValueError(
-                f"Different granularities for activation and weight are not supported: {granularity}, only PerTensor or PerRow are supported."
-            )
-        processed_granularity = granularity
-    else:
-        raise ValueError(
-            f"Invalid granularity specification: {granularity}, only PerTensor or PerRow are supported."
-        )
-    # Validate granularity with supported Hardware
-    for _granularity in processed_granularity:
-        if isinstance(_granularity, PerTensor):
-            assert is_sm_at_least_89() or is_MI300(), (
-                "PerTensor quantization only works for CUDA>=8.9 and MI300+"
-            )
-        elif isinstance(_granularity, PerRow):
-            assert is_sm_at_least_90() or is_MI300(), (
-                "PerRow quantization only works for CUDA>=9.0 and MI300+"
-            )
-        else:
-            raise ValueError(f"Invalid granularity type: {_granularity}")
-
-    return processed_granularity
-
-
 def _input_activation_quant_func_fp8(
     x: torch.Tensor,
-    activation_granularity: _fp8_granularities,
+    activation_granularity: FP8Granularity,
     activation_dtype: torch.dtype,
     scale: Optional[torch.Tensor] = None,
     zero_point: Optional[torch.Tensor] = None,
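
For reference, the block deleted above is a move, not a rewrite: `_normalize_granularity` now lives in `torchao.float8.inference`, alongside the new `_check_hardware_support` helper that takes over the hardware assertions. A minimal sketch of the relocated contract, assuming the relocated functions keep the semantics shown in the removed code:

    from torchao.float8.inference import _check_hardware_support, _normalize_granularity
    from torchao.quantization.granularity import PerTensor

    # None normalizes to per-tensor scaling for both activation and weight
    # (per the removed code's `granularity is None` branch).
    act_gran, wgt_gran = _normalize_granularity(None)
    assert isinstance(act_gran, PerTensor) and isinstance(wgt_gran, PerTensor)

    # The hardware check is now a separate step (per the removed assertions:
    # PerTensor needs SM 8.9+ or MI300+, PerRow needs SM 9.0+ or MI300+).
    _check_hardware_support((act_gran, wgt_gran))
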
@@ -1567,7 +1525,7 @@ class Float8DynamicActivationFloat8WeightConfig(AOBaseConfig):
     activation_dtype: torch.dtype = e4m3_dtype
     weight_dtype: torch.dtype = e4m3_dtype
     granularity: Optional[
-        Union[_fp8_granularities, Tuple[_fp8_granularities, _fp8_granularities]]
+        Union[FP8Granularity, Tuple[FP8Granularity, FP8Granularity]]
     ] = None
     mm_config: Optional[Float8MMConfig] = None
     set_inductor_config: bool = True
@@ -1576,6 +1534,11 @@ def __post_init__(self):
         if self.mm_config is None:
             self.mm_config = Float8MMConfig(use_fast_accum=True)

+        activation_granularity, weight_granularity = _normalize_granularity(
+            self.granularity
+        )
+        self.granularity = (activation_granularity, weight_granularity)
+


 # for bc
 float8_dynamic_activation_float8_weight = Float8DynamicActivationFloat8WeightConfig
@@ -1587,7 +1550,9 @@ def _float8_dynamic_activation_float8_weight_quantize_tensor(weight, config):
     granularity = config.granularity
     mm_config = config.mm_config

-    activation_granularity, weight_granularity = _normalize_granularity(granularity)
+    # Ensure the configured granularity is supported on the current hardware
+    _check_hardware_support(granularity)
+    activation_granularity, weight_granularity = granularity

     if not _fp8_mm_compat(weight):
         # TODO(future PR): this should really throw an exception instead of silently
@@ -1704,7 +1669,7 @@ class Float8StaticActivationFloat8WeightConfig(AOBaseConfig):
     activation_dtype: torch.dtype = e4m3_dtype
     weight_dtype: torch.dtype = e4m3_dtype
     granularity: Optional[
-        Union[_fp8_granularities, Tuple[_fp8_granularities, _fp8_granularities]]
+        Union[FP8Granularity, Tuple[FP8Granularity, FP8Granularity]]
     ] = None
     mm_config: Optional[Float8MMConfig] = None
     set_inductor_config: bool = True
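
Net effect of the diff: granularity normalization now happens once, at config construction, and the quantize path only re-validates hardware support. A minimal usage sketch under that assumption; the model, shapes, and dtype below are illustrative, not taken from the PR:

    import torch
    from torchao.quantization import quantize_
    from torchao.quantization.granularity import PerRow
    from torchao.quantization.quant_api import Float8DynamicActivationFloat8WeightConfig

    model = torch.nn.Sequential(torch.nn.Linear(128, 256)).to(torch.bfloat16).cuda()

    # __post_init__ now normalizes a single granularity into an
    # (activation, weight) pair, so config.granularity is always a tuple.
    config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
    assert isinstance(config.granularity, tuple)

    # _check_hardware_support runs inside the transform
    # (PerRow requires SM 9.0+ or MI300+).
    quantize_(model, config)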