 )
 from torchao.quantization.qat.fake_quantize_config import (
     Float8FakeQuantizeConfig,
+    Int4WeightPreshuffledFakeQuantizeConfig,
     IntxFakeQuantizeConfig,
 )
 from torchao.quantization.qat.fake_quantizer import (
@@ -1929,7 +1930,7 @@ def test_quantize_api_fp8_int4(self):
         """
         self._test_quantize_api_against_ptq(
             Float8DynamicActivationInt4WeightConfig(),
-            target_prepare_sqnr=12,
+            target_prepare_sqnr=22,
             target_convert_sqnr=float("inf"),
         )

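The target_prepare_sqnr values above are in dB and come from compute_error. As a rough reference, here is a minimal SQNR sketch assuming the usual 20·log10 ratio-of-norms definition (not necessarily torchao's exact implementation); higher values mean the prepared (fake-quantized) model tracks the quantized baseline more closely:

import torch

def sqnr_db(reference: torch.Tensor, candidate: torch.Tensor) -> torch.Tensor:
    # Signal-to-quantization-noise ratio in dB.
    signal = torch.linalg.norm(reference.float())
    noise = torch.linalg.norm(reference.float() - candidate.float())
    return 20 * torch.log10(signal / noise)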
@@ -1950,6 +1951,19 @@ def test_quantize_api_int4(self, version: int):
             target_convert_sqnr=float("inf"),
         )

+    @unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available")
+    def test_quantize_api_int8_int4(self):
+        """
+        Test the following:
+            quantize_(model, QATConfig(Int8DynamicActivationInt4WeightConfig(), step="prepare"))
+            quantize_(model, QATConfig(Int8DynamicActivationInt4WeightConfig(), step="convert"))
+        """
+        self._test_quantize_api_against_ptq(
+            Int8DynamicActivationInt4WeightConfig(group_size=32),
+            target_prepare_sqnr=30,
+            target_convert_sqnr=float("inf"),
+        )
+
     def test_infer_fp8_int4_config(self):
         """
         Test that fake quantize configs are correctly inferred from
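For context, a minimal sketch of the prepare/convert flow the new test exercises, assuming torchao's public quantize_/QATConfig API (import paths and the model/training loop are placeholders, not part of this change):

import torch
from torchao.quantization import Int8DynamicActivationInt4WeightConfig, quantize_
from torchao.quantization.qat import QATConfig

model = torch.nn.Sequential(torch.nn.Linear(256, 256)).cuda().to(torch.bfloat16)
base_config = Int8DynamicActivationInt4WeightConfig(group_size=32)

# Step 1 (prepare): swap in fake-quantized modules used during fine-tuning
quantize_(model, QATConfig(base_config, step="prepare"))
# ... fine-tune the model here ...
# Step 2 (convert): replace fake quantization with real quantized weights
quantize_(model, QATConfig(base_config, step="convert"))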
@@ -1964,10 +1978,9 @@ def test_infer_fp8_int4_config(self):
         self.assertIsInstance(act_config, Float8FakeQuantizeConfig)
         self.assertEqual(act_config.dtype, torch.float8_e4m3fn)
         self.assertIsInstance(act_config.granularity, PerRow)
-        self.assertIsInstance(weight_config, IntxFakeQuantizeConfig)
-        self.assertEqual(weight_config.dtype, torch.int4)
+        self.assertIsInstance(weight_config, Int4WeightPreshuffledFakeQuantizeConfig)
         self.assertEqual(weight_config.group_size, 128)
-        self.assertTrue(weight_config.is_symmetric)
+        self.assertEqual(weight_config.activation_dtype, torch.float8_e4m3fn)

     def test_infer_int4_weight_only_config(self):
         """
@@ -2033,6 +2046,126 @@ def test_qat_nvfp4(self, use_per_tensor_scale: bool):
         sqnr = compute_error(out, baseline_out).item()
         self.assertGreater(sqnr, 24)

+    @unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available")
+    @unittest.skipIf(
+        not _is_fbgemm_genai_gpu_available(), "Requires fbgemm-gpu-genai >= 1.2.0"
+    )
+    def test_fbgemm_fp8_primitives(self):
+        """
+        Compare numerics between:
+            (1) fbgemm_gpu.experimental.gen_ai.quantize.quantize_fp8_row
+            (2) Our reference QAT version in `Float8FakeQuantizer`
+        """
+        from fbgemm_gpu.experimental.gen_ai.quantize import quantize_fp8_row
+
+        from torchao.quantization.quant_primitives import (
+            _choose_scale_float8,
+            _quantize_affine_float8,
+        )
+
+        x1 = torch.randn([128, 256], dtype=torch.bfloat16).cuda()
+        x2 = copy.deepcopy(x1)
+
+        # (1) Just call `quantize_fp8_row`
+        (q1, scale1) = quantize_fp8_row(x1)
+
+        # (2) Our reference implementation for QAT without the dequantize
+        scale2 = _choose_scale_float8(
+            x2,
+            (1, x2.shape[-1]),
+            torch.float8_e4m3fn,
+            hp_value_lb=1e-12,
+        )
+        q2 = _quantize_affine_float8(x2, scale2, torch.float8_e4m3fn)
+        sqnr = compute_error(q1.to(torch.float32), q2.to(torch.float32))
+        scale_sqnr = compute_error(
+            scale1.to(torch.float32).flatten(),
+            scale2.to(torch.float32).flatten(),
+        )
+        self.assertGreater(sqnr, 40)
+        self.assertGreater(scale_sqnr, 50)
+
+    @unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available")
+    @unittest.skipIf(
+        not _is_fbgemm_genai_gpu_available(), "Requires fbgemm-gpu-genai >= 1.2.0"
+    )
+    def test_fbgemm_int4_preshuffled_primitives(self):
+        """
+        Compare numerics between:
+            (1) fbgemm_gpu.experimental.gen_ai.quantize.quantize_int4_preshuffle
+            (2) Our reference QAT version in `Int4WeightPreshuffledFakeQuantizer`
+        """
+        from fbgemm_gpu.experimental.gen_ai.quantize import (
+            int4_row_quantize,
+            pack_int4,
+            quantize_fp8_row,
+            quantize_int4_preshuffle,
+        )
+
+        from torchao.quantization.quant_primitives import (
+            _choose_scale_float8,
+            _quantize_affine_float8,
+            _quantize_affine_no_dtype_cast,
+        )
+
+        group_size = 128
+        x1 = torch.randn([128, 256], dtype=torch.bfloat16).cuda()
+        x2 = copy.deepcopy(x1)
+        x3 = copy.deepcopy(x1)
+
+        # (1) Just call `quantize_int4_preshuffle`
+        (q1, (scale1, _)) = quantize_int4_preshuffle(x1, group_size, dtype="fp8")
+
+        # (2) Call `quantize_int4_preshuffle` but skip packing and shuffling
+        (q2, _) = quantize_fp8_row(x2)
+        (q2, scale2) = int4_row_quantize(q2, group_size)
+
+        # (3) Reference implementation for QAT without the dequantize
+        fp8_scale = _choose_scale_float8(
+            x3,
+            (1, x3.shape[-1]),
+            torch.float8_e4m3fn,
+            hp_value_lb=1e-12,
+        )
+        x3_fp8 = _quantize_affine_float8(x3, fp8_scale, torch.float8_e4m3fn)
+        x3_fp8 = x3_fp8.to(torch.float32)
+        x3_fp8_grouped = x3_fp8.view(x3_fp8.shape[0], -1, group_size)
+        max_abs = torch.amax(torch.abs(x3_fp8_grouped), dim=-1, keepdim=False)
+        scale = torch.clamp(max_abs / 8, min=1e-6)
+        zero_point = torch.zeros_like(scale)
+        q3 = _quantize_affine_no_dtype_cast(
+            x3_fp8,
+            (1, group_size),
+            scale,
+            zero_point,
+            quant_min=-8,
+            quant_max=7,
+        )
+        scale3 = scale
+
+        def shuffle_and_pack(t: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+            t = pack_int4(t.to(torch.int8))
+            return torch.ops.fbgemm.preshuffle_i4(t, scale.to(torch.float8_e4m3fn))[0]
+
+        # First, sanity check that shuffle_and_pack(q2) == q1
+        torch.testing.assert_close(q1, shuffle_and_pack(q2, scale2), atol=0, rtol=0)
+
+        # Now check q2 vs q3 with and without shuffle
+        sqnr_q2_q3 = compute_error(q2.to(torch.float32), q3.to(torch.float32))
+        sqnr_q2_q3_preshuffle = compute_error(
+            shuffle_and_pack(q2, scale2).to(torch.float32),
+            shuffle_and_pack(q3, scale3).to(torch.float32),
+        )
+        self.assertGreater(sqnr_q2_q3, 32)
+        self.assertGreater(sqnr_q2_q3_preshuffle, 32)
+
+        # Now check shuffle_and_pack(q3) vs q1
+        sqnr_q1_q3_preshuffle = compute_error(
+            q1.to(torch.float32),
+            shuffle_and_pack(q3, scale3).to(torch.float32),
+        )
+        self.assertGreater(sqnr_q1_q3_preshuffle, 32)
+

 instantiate_parametrized_tests(TestQAT)

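For reference, a standalone sketch of the per-row fp8 quantization that `_choose_scale_float8` and `_quantize_affine_float8` are used for in the tests above. The max-abs-over-fp8-max scale rule is an assumption for illustration, not torchao's exact code:

import torch

FP8_E4M3_MAX = 448.0  # largest finite value representable in torch.float8_e4m3fn

def rowwise_fp8_quantize_ref(x: torch.Tensor, hp_value_lb: float = 1e-12):
    # One scale per row: scale = max(|row|) / fp8_max, with a lower bound on
    # max(|row|) mirroring the hp_value_lb argument passed in the tests above.
    max_abs = x.abs().amax(dim=-1, keepdim=True).float()
    scale = torch.clamp(max_abs, min=hp_value_lb) / FP8_E4M3_MAX
    q = torch.clamp(x.float() / scale, -FP8_E4M3_MAX, FP8_E4M3_MAX)
    return q.to(torch.float8_e4m3fn), scale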