Commit a024e29

Update on "Add NVFP4 QAT"
**Summary:** This commit adds a QAT flow for NVFP4, following the numerics in `NVFP4Tensor` closely but without the dtype casting, swizzling, and packing/unpacking. Users can call this flow as follows:

```
from torchao.quantization import quantize_
from torchao.quantization.qat import NVFP4FakeQuantizeConfig, QATConfig

qat_config = QATConfig(
    activation_config=NVFP4FakeQuantizeConfig(),
    weight_config=NVFP4FakeQuantizeConfig(),
    step="prepare",
)
quantize_(model, qat_config)
```

**Test Plan:**

```
python test/quantization/test_qat.py -k test_qat_nvfp4
```

Initial benchmarks from fine-tuning Qwen3-1.7B on alpaca for 3 epochs:

```
# Without QAT
| Tasks  |Version|Filter|n-shot|    Metric     |   | Value |   |Stderr|
|--------|------:|------|------|---------------|---|------:|---|------|
|wikitext|      2|none  |None  |bits_per_byte  |↓  | 0.8322|±  |   N/A|
|        |       |none  |None  |byte_perplexity|↓  | 1.7804|±  |   N/A|
|        |       |none  |None  |word_perplexity|↓  |21.8611|±  |   N/A|

# With QAT
| Tasks  |Version|Filter|n-shot|    Metric     |   | Value |   |Stderr|
|--------|------:|------|------|---------------|---|------:|---|------|
|wikitext|      2|none  |None  |bits_per_byte  |↓  | 0.8271|±  |   N/A|
|        |       |none  |None  |byte_perplexity|↓  | 1.7741|±  |   N/A|
|        |       |none  |None  |word_perplexity|↓  |21.4467|±  |   N/A|
```

[ghstack-poisoned]
2 parents 80cc501 + bc2a059 commit a024e29
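Since this update moves the NVFP4 QAT classes into `torchao.prototype.qat` (see the file diffs below), a minimal end-to-end sketch of the prepare step using the new import path may help; the toy model and the training-loop comment are illustrative assumptions, not part of this diff:

```python
import torch
from torchao.quantization import quantize_
from torchao.quantization.qat import QATConfig
from torchao.prototype.qat import NVFP4FakeQuantizeConfig  # new location in this commit

# Toy model for illustration; in-features divisible by the NVFP4
# block size of 16 so the fake quantizer can reshape cleanly
model = torch.nn.Sequential(torch.nn.Linear(64, 64)).cuda()

# Prepare: swap in fake-quantized linears that apply NVFP4
# quantize->dequantize numerics to weights and activations
qat_config = QATConfig(
    activation_config=NVFP4FakeQuantizeConfig(),
    weight_config=NVFP4FakeQuantizeConfig(),
    step="prepare",
)
quantize_(model, qat_config)

# ... fine-tune as usual, then follow the standard QAT convert step ...
```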

File tree

7 files changed, +93 −71 lines changed


docs/source/api_ref_qat.rst

Lines changed: 0 additions & 2 deletions

```diff
@@ -62,5 +62,3 @@ Prototype
     :nosignatures:
 
     initialize_fake_quantizers
-    NVFP4FakeQuantizeConfig
-    NVFP4FakeQuantizer
```

test/quantization/test_qat.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -50,7 +50,6 @@
 from torchao.quantization.qat.fake_quantize_config import (
     Float8FakeQuantizeConfig,
     IntxFakeQuantizeConfig,
-    NVFP4FakeQuantizeConfig,
 )
 from torchao.quantization.qat.fake_quantizer import (
     Float8FakeQuantizer,
@@ -1974,6 +1973,8 @@ def test_qat_nvfp4(self, use_per_tensor_scale: bool):
         """
         Test QAT with `NVFP4FakeQuantizeConfig`.
         """
+        from torchao.prototype.qat import NVFP4FakeQuantizeConfig
+
         torch.manual_seed(self.SEED)
         m = M().cuda()
         baseline_model = copy.deepcopy(m)
```

torchao/prototype/qat/__init__.py

Lines changed: 12 additions & 0 deletions

```diff
@@ -0,0 +1,12 @@
+# Temporary location for prototype QAT features that will
+# eventually live in torchao/quantization/qat
+
+from .nvfp4 import (
+    NVFP4FakeQuantizeConfig,
+    NVFP4FakeQuantizer,
+)
+
+__all__ = [
+    "NVFP4FakeQuantizeConfig",
+    "NVFP4FakeQuantizer",
+]
```

torchao/prototype/qat/nvfp4.py

Lines changed: 69 additions & 0 deletions

```diff
@@ -0,0 +1,69 @@
+from dataclasses import dataclass
+
+import torch
+
+from torchao.prototype.mx_formats.nvfp4_tensor import (
+    _nvfp4_quantize,
+    per_tensor_amax_to_scale,
+)
+from torchao.quantization.qat import (
+    FakeQuantizeConfigBase,
+    FakeQuantizerBase,
+)
+
+
+@dataclass
+class NVFP4FakeQuantizeConfig(FakeQuantizeConfigBase):
+    """
+    Config for fake quantizing weights or activations to NVIDIA's NVFP4 format
+    according to https://developer.nvidia.com/blog/introducing-nvfp4-for-efficient-and-accurate-low-precision-inference/.
+
+    Fake quantization numerics follow `NVFP4Tensor` closely: https://github.com/pytorch/ao/blob/main/torchao/prototype/mx_formats/nvfp4_tensor.py.
+
+    Args:
+        use_per_tensor_scale (bool): Whether to use two-level per-tensor fp32 scaling
+            after the initial fp8 (e4m3) block-wise scaling (default True)
+    """
+
+    use_per_tensor_scale: bool = True
+
+
+class NVFP4FakeQuantizer(FakeQuantizerBase):
+    """
+    (Prototype) Generic module for applying NVFP4 fake quantization to a tensor, as specified in the config.
+    """
+
+    def __init__(self, config: NVFP4FakeQuantizeConfig):
+        super().__init__()
+        torch._C._log_api_usage_once("torchao.quantization.qat.NVFP4FakeQuantizer")
+        self.config = config
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        block_size = 16
+        original_shape = x.shape
+        if x.dim() == 3:
+            x = x.view(-1, x.shape[-1])
+        if self.config.use_per_tensor_scale:
+            tensor_amax = torch.max(torch.abs(x))
+            per_tensor_scale = per_tensor_amax_to_scale(tensor_amax)
+        else:
+            per_tensor_scale = None
+
+        # quantize
+        scale, q = _nvfp4_quantize(
+            x,
+            block_size=block_size,
+            per_tensor_scale=per_tensor_scale,
+            skip_dtype_cast_and_packing=True,
+        )
+        if self.config.use_per_tensor_scale:
+            scale = scale * per_tensor_scale
+        assert q.dtype == x.dtype
+        assert scale.dtype == torch.float32
+
+        # dequantize
+        M, K = q.shape[0], q.shape[1]
+        q = q.view(M, K // block_size, block_size)
+        scale = scale.view(M, K // block_size, 1)
+        dq = q * scale
+        return dq.view(original_shape).to(x.dtype)
```
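For reference, a short hedged sketch of exercising the new module directly; the shape and dtype are arbitrary example values, with the last dimension kept divisible by the hard-coded block size of 16:

```python
import torch
from torchao.prototype.qat import NVFP4FakeQuantizeConfig, NVFP4FakeQuantizer

config = NVFP4FakeQuantizeConfig(use_per_tensor_scale=True)
fake_quantizer = NVFP4FakeQuantizer(config)

# Last dim must be divisible by block_size=16 for the views in forward()
x = torch.randn(32, 64, dtype=torch.bfloat16)
dq = fake_quantizer(x)

# Output round-trips shape and dtype: values pass through NVFP4
# quantize->dequantize numerics but stay in the original dtype
assert dq.shape == x.shape and dq.dtype == x.dtype
```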

torchao/quantization/qat/__init__.py

Lines changed: 0 additions & 4 deletions

```diff
@@ -17,14 +17,12 @@
     FakeQuantizeConfigBase,
     Float8FakeQuantizeConfig,
     IntxFakeQuantizeConfig,
-    NVFP4FakeQuantizeConfig,
 )
 from .fake_quantizer import (
     FakeQuantizer,
     FakeQuantizerBase,
     Float8FakeQuantizer,
     IntxFakeQuantizer,
-    NVFP4FakeQuantizer,
 )
 from .linear import (
     FakeQuantizedLinear,
@@ -42,8 +40,6 @@
     "Float8FakeQuantizer",
     "IntxFakeQuantizeConfig",
     "IntxFakeQuantizer",
-    "NVFP4FakeQuantizeConfig",
-    "NVFP4FakeQuantizer",
     "FakeQuantizedLinear",
     "FakeQuantizedEmbedding",
     # Prototype
```
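One consequence of these removals, sketched briefly: the NVFP4 names can no longer be imported from `torchao.quantization.qat` and must come from the prototype namespace instead:

```python
# New import location added in this commit
from torchao.prototype.qat import NVFP4FakeQuantizeConfig, NVFP4FakeQuantizer

# The old imports were removed from torchao/quantization/qat/__init__.py
# and would now raise ImportError:
#   from torchao.quantization.qat import NVFP4FakeQuantizeConfig
#   from torchao.quantization.qat import NVFP4FakeQuantizer
```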

torchao/quantization/qat/fake_quantize_config.py

Lines changed: 4 additions & 17 deletions

```diff
@@ -77,22 +77,6 @@ def __post_init__(self):
         )
 
 
-@dataclass
-class NVFP4FakeQuantizeConfig(FakeQuantizeConfigBase):
-    """
-    (Prototype) Config for fake quantizing weights or activations to NVIDIA's NVFP4 format
-    according to https://developer.nvidia.com/blog/introducing-nvfp4-for-efficient-and-accurate-low-precision-inference/.
-
-    Fake quantization numerics follow `NVFP4Tensor` closely: https://github.com/pytorch/ao/blob/main/torchao/prototype/mx_formats/nvfp4_tensor.py.
-
-    Args:
-        use_per_tensor_scale (bool): Whether to use two-level per-tensor fp32 scaling
-            after the initial fp8 (e4m3) block-wise scaling (default True)
-    """
-
-    use_per_tensor_scale: bool = True
-
-
 @dataclass
 class IntxFakeQuantizeConfig(FakeQuantizeConfigBase):
     """
@@ -336,7 +320,6 @@ def __post_init__(self):
     _log_deprecation_warning(self)
 
 
-# TODO: rewrite using registration API?
 def _infer_fake_quantize_configs(
     base_config: AOBaseConfig,
 ) -> Tuple[Optional[FakeQuantizeConfigBase], Optional[FakeQuantizeConfigBase]]:
@@ -347,11 +330,15 @@ def _infer_fake_quantize_configs(
 
     Return a 2-tuple of (activation_config, weight_config) for fake quantization.
     """
+    # TODO: rewrite using registration API so we don't need to import here
     # avoid circular imports
     from torchao.prototype.mx_formats import (
         NVFP4InferenceConfig,
         NVFP4MMConfig,
     )
+    from torchao.prototype.qat import (
+        NVFP4FakeQuantizeConfig,
+    )
     from torchao.quantization import (
         Float8DynamicActivationFloat8WeightConfig,
         Float8DynamicActivationInt4WeightConfig,
```
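The NVFP4 branch of `_infer_fake_quantize_configs` falls outside this hunk; presumably it now builds the prototype config from `NVFP4InferenceConfig`, roughly along these lines (a hypothetical sketch: the `mm_config` check is inferred from the `NVFP4MMConfig` import above, not from anything shown in this diff):

```python
# Hypothetical sketch of the NVFP4 branch, not shown in this hunk
if isinstance(base_config, NVFP4InferenceConfig):
    # Assumption: only fake-quantize activations when the inference
    # config quantizes activations dynamically
    if base_config.mm_config == NVFP4MMConfig.DYNAMIC:
        act_config = NVFP4FakeQuantizeConfig()
    else:
        act_config = None
    weight_config = NVFP4FakeQuantizeConfig()
    return (act_config, weight_config)
```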

torchao/quantization/qat/fake_quantizer.py

Lines changed: 6 additions & 47 deletions

```diff
@@ -34,7 +34,6 @@
     FakeQuantizeConfigBase,
     Float8FakeQuantizeConfig,
     IntxFakeQuantizeConfig,
-    NVFP4FakeQuantizeConfig,
 )
 from .utils import (
     _fake_quantize_per_channel_group,
@@ -58,6 +57,12 @@ def __repr__(self) -> str:
 
     @staticmethod
     def from_config(config: FakeQuantizeConfigBase) -> "FakeQuantizerBase":
+        # TODO: rewrite using registration API so we don't need to import here
+        from torchao.prototype.qat import (
+            NVFP4FakeQuantizeConfig,
+            NVFP4FakeQuantizer,
+        )
+
         if isinstance(config, IntxFakeQuantizeConfig):
             return IntxFakeQuantizer(config)
         elif isinstance(config, Float8FakeQuantizeConfig):
@@ -95,52 +100,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return dq
 
 
-class NVFP4FakeQuantizer(FakeQuantizerBase):
-    """
-    (Prototype) Generic module for applying NVFP4 fake quantization to a tensor, as specified in the config.
-    """
-
-    def __init__(self, config: NVFP4FakeQuantizeConfig):
-        super().__init__()
-        torch._C._log_api_usage_once("torchao.quantization.qat.NVFP4FakeQuantizer")
-        self.config = config
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        from torchao.prototype.mx_formats.nvfp4_tensor import (
-            _nvfp4_quantize,
-            per_tensor_amax_to_scale,
-        )
-
-        block_size = 16
-        original_shape = x.shape
-        if x.dim() == 3:
-            x = x.view(-1, x.shape[-1])
-        if self.config.use_per_tensor_scale:
-            tensor_amax = torch.max(torch.abs(x))
-            per_tensor_scale = per_tensor_amax_to_scale(tensor_amax)
-        else:
-            per_tensor_scale = None
-
-        # quantize
-        scale, q = _nvfp4_quantize(
-            x,
-            block_size=block_size,
-            per_tensor_scale=per_tensor_scale,
-            skip_dtype_cast_and_packing=True,
-        )
-        if self.config.use_per_tensor_scale:
-            scale = scale * per_tensor_scale
-        assert q.dtype == x.dtype
-        assert scale.dtype == torch.float32
-
-        # dequantize
-        M, K = q.shape[0], q.shape[1]
-        q = q.view(M, K // block_size, block_size)
-        scale = scale.view(M, K // block_size, 1)
-        dq = q * scale
-        return dq.view(original_shape).to(x.dtype)
-
-
 class IntxFakeQuantizer(FakeQuantizerBase):
     """
     Generic module for applying integer fake quantization to a tensor, as specified in the config.
```
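From the caller's side, these local imports presumably let `from_config` dispatch the prototype config to the prototype quantizer; a hedged sketch of the expected behavior:

```python
from torchao.quantization.qat import FakeQuantizerBase
from torchao.prototype.qat import NVFP4FakeQuantizeConfig, NVFP4FakeQuantizer

# from_config resolves the config type to its matching quantizer module
fake_quantizer = FakeQuantizerBase.from_config(NVFP4FakeQuantizeConfig())
assert isinstance(fake_quantizer, NVFP4FakeQuantizer)
```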
