**Summary:** #2253 added a step
in `quantize_affine_float8` to expand the scales for blockwise
quantization. The purpose of this step is to make the scales
always broadcastable with the input tensor. However, this is
unnecessary for rowwise quantization, which already has
broadcastable shapes, e.g.
```
scale = [32, 1]
input = [32, 16]
```
Today, we `repeat_interleave` the above scale tensor to
expand it to `[32, 16]`, which adds non-trivial memory and
latency overhead. This commit adds a fast path that skips
the expansion step when we detect rowwise quantization.
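For illustration, here is a minimal sketch of what such a fast path could look like (the helper name is borrowed from the test name in the Test Plan below; the actual signature and logic in torchao may differ):
```
import torch

def maybe_expand_scale_to_tensor_shape(
    scale: torch.Tensor,
    target_shape: torch.Size,
) -> torch.Tensor:
    # Fast path (rowwise): every scale dim is either 1 or already matches
    # the target, so elementwise ops broadcast for free and no copy is made
    if scale.ndim == len(target_shape) and all(
        s == 1 or s == t for s, t in zip(scale.shape, target_shape)
    ):
        return scale
    # Slow path (blockwise): repeat each scale entry along every mismatched
    # dim until the shape matches the target, e.g. [4, 2] -> [32, 16]
    for dim, (s, t) in enumerate(zip(scale.shape, target_shape)):
        if s != t:
            scale = scale.repeat_interleave(t // s, dim=dim)
    return scale
```
With the rowwise shapes above, this sketch returns the `[32, 1]` scale untouched instead of materializing a `[32, 16]` copy.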
**Test Plan:**
```
python test/quantization/test_quant_primitives.py -k test_maybe_expand_scale_to_tensor_shape
```
Also compared fine-tuning Qwen3-1.7B with fp8-fp8 QAT using
batch size 32 on a single H100 GPU:
- Before: 25.34 GB peak memory, 3047.25 tok/s
- After: 22.53 GB peak memory, 3358.49 tok/s
- This PR uses 11.1% less memory and is 10.2% faster