@@ -297,21 +297,53 @@ def test_fp8_weight_dimension_warning(self):
     @unittest.skipIf(
         not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
     )
-    def test_mm_float8dq(self):
+    @common_utils.parametrize(
+        "in_features,out_features", [(512, 1024), (256, 768), (1024, 512)]
+    )
+    @common_utils.parametrize(
+        "leading_shape", [(1,), (8,), (16,), (2, 8,), (2, 2, 16,)]
+    )  # fmt: skip
+    @common_utils.parametrize("bias", [True, False])
+    def test_mm_float8dq(self, in_features, out_features, leading_shape, bias: bool):
         device = "cuda"
         dtype = torch.bfloat16
-        weight = torch.randn(512, 1024).to(device).to(dtype)
-        weight = weight.t()
-
-        l = torch.nn.Linear(512, 1024).to(device).to(dtype)
-        l.weight = torch.nn.Parameter(weight)
-        quantize_(l, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
-        # weight shape: 1024 x 512
-        weight = l.weight
-
-        input = torch.randn(1, 512, device=device, dtype=dtype)
-        # make sure it runs
-        torch.nn.functional.linear(input, weight)
+        input_shape = leading_shape + (in_features,)
+
+        ref_linear = (
+            torch.nn.Linear(in_features, out_features, bias=bias).to(device).to(dtype)
+        )
+        test_linear = copy.deepcopy(ref_linear)
+        quantize_(
+            test_linear, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+        )
+
+        quant_weight = test_linear.weight
+
+        self.assertTrue(hasattr(quant_weight, "original_weight_tensor"))
+        weight_impl = quant_weight.original_weight_tensor.tensor_impl
+
+        self.assertTrue(hasattr(weight_impl, "float8_data"))
+        self.assertTrue(hasattr(weight_impl, "scale"))
+        self.assertFalse(weight_impl.transposed)
+
+        # Verify scale shape for row-wise quantization
+        expected_scale_shape = (out_features, 1)
+        actual_scale_shape = weight_impl.scale.shape
+        self.assertEqual(actual_scale_shape, expected_scale_shape)
+
+        self.assertEqual(weight_impl.float8_data.shape, (out_features, in_features))
+
+        input_tensor = torch.randn(*input_shape, device=device, dtype=dtype)
+
+        with torch.no_grad():
+            ref_output = ref_linear(input_tensor)
+            quant_output = torch.nn.functional.linear(input_tensor, quant_weight)
+
+        expected_output_shape = input_tensor.shape[:-1] + (out_features,)
+        self.assertEqual(quant_output.shape, expected_output_shape)
+
+        error = compute_error(ref_output, quant_output)
+        assert error > 20, f"Quantization error is too high, got a SQNR of {error}"
 
 
 common_utils.instantiate_parametrized_tests(TestAffineQuantizedFloat8Compile)