
Commit adb966c

[draft] Pass QAT learned qparams in convert
**Summary:** Draft prototype to pass scales and zero points learned during QAT range learning to the PTQ base config.

**Test Plan:** TBD
1 parent 9e5059e commit adb966c
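
For context, a rough end-to-end sketch of the flow this prototype targets: prepare a linear layer for QAT with a range-learning Intx weight fake quantizer, train, then convert against a PTQ base config so the learned scale/zero_point are reused instead of being re-derived from the weights. Constructor arguments and import paths below are assumptions for illustration; only QATConfig, IntxFakeQuantizeConfig, Int8DynamicActivationIntxWeightConfig, and the custom_scale/custom_zero_point plumbing come from this commit.

import torch

# Import paths and constructor arguments are assumptions for illustration.
from torchao.quantization import quantize_, Int8DynamicActivationIntxWeightConfig
from torchao.quantization.qat import QATConfig, IntxFakeQuantizeConfig

model = torch.nn.Sequential(torch.nn.Linear(128, 256))

# Weight fake quantizer with range learning enabled (argument names assumed).
weight_fq = IntxFakeQuantizeConfig(torch.int4, group_size=32, range_learning=True)

# Prepare: Linear -> FakeQuantizedLinear; training updates the learned scale/zero_point.
quantize_(model, QATConfig(weight_config=weight_fq, step="prepare"))
# ... QAT training loop (range learning may also require initializing the
# fake quantizers before training; omitted here) ...

# Convert: with this change, the learned scale/zero_point on the weight fake
# quantizer config are forwarded to the base config handler as
# custom_scale / custom_zero_point instead of being recomputed.
base_config = Int8DynamicActivationIntxWeightConfig(weight_dtype=torch.int4)
quantize_(model, QATConfig(base_config, step="convert"))

The diffs below implement the convert-side plumbing for this flow.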

File tree: 3 files changed, +44 −14 lines

torchao/quantization/qat/api.py

Lines changed: 13 additions & 1 deletion
@@ -21,6 +21,7 @@
 from .fake_quantize_config import (
     FakeQuantizeConfig,  # noqa: F401, for BC
     FakeQuantizeConfigBase,
+    IntxFakeQuantizeConfig,
     _infer_fake_quantize_configs,
 )
 from .linear import FakeQuantizedLinear
@@ -227,15 +228,26 @@ def _qat_config_transform(
         assert step == QATStep.CONVERT, "unexpected step '%s' in QATConfig" % step
         assert config.activation_config is None, "unexpected `activation_config`"
         assert config.weight_config is None, "unexpected `weight_config`"
+        kwargs = {}
         if isinstance(module, FakeQuantizedLinear):
+            # Optionally pass custom scales and zero points to base config handler
+            weight_config = module.weight_fake_quantizer.config
+            if (
+                isinstance(weight_config, IntxFakeQuantizeConfig)
+                and weight_config.range_learning
+            ):
+                kwargs["custom_scale"] = weight_config.scale
+                kwargs["custom_zero_point"] = weight_config.zero_point
             module = module.to_linear()
         elif isinstance(module, FakeQuantizedEmbedding):
             module = module.to_embedding()
         else:
             # Unrelated module, ignore
             return module
         if base_config is not None:
-            return _QUANTIZE_CONFIG_HANDLER[type(base_config)](module, base_config)
+            return _QUANTIZE_CONFIG_HANDLER[type(base_config)](
+                module, base_config, **kwargs
+            )
         else:
             return module
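
In effect, for a FakeQuantizedLinear whose weight fake quantizer is an IntxFakeQuantizeConfig with range_learning enabled, the convert step now calls the registered base-config handler with the learned qparams attached. A simplified sketch of the resulting call, where learned_scale and learned_zero_point stand in for weight_config.scale and weight_config.zero_point:

# Simplified view of the new dispatch above; not a literal excerpt.
handler = _QUANTIZE_CONFIG_HANDLER[type(base_config)]
quantized = handler(
    module,                                # plain nn.Linear from FakeQuantizedLinear.to_linear()
    base_config,
    custom_scale=learned_scale,            # weight_config.scale learned during QAT
    custom_zero_point=learned_zero_point,  # weight_config.zero_point learned during QAT
)
# When range learning is off (or the weight config is not Intx), kwargs stays
# empty and this reduces to handler(module, base_config) as before.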

torchao/quantization/quant_api.py

Lines changed: 13 additions & 4 deletions
@@ -809,7 +809,9 @@ def __post_init__(self):
         )


-def _int8_dynamic_activation_intx_weight_quantize_tensor(weight, bias, config):
+def _int8_dynamic_activation_intx_weight_quantize_tensor(
+    weight, bias, config, **kwargs
+):
     weight_dtype = config.weight_dtype
     weight_granularity = config.weight_granularity
     weight_mapping_type = config.weight_mapping_type
@@ -853,6 +855,7 @@ def _int8_dynamic_activation_intx_weight_quantize_tensor(weight, bias, config):
         weight_dtype,
         mapping_type=weight_mapping_type,
         activation_quantization="int8_asym_per_token",
+        **kwargs,
     )
     if weight_scale_dtype is not None and weight_scale_dtype != weight.dtype:
         _adjust_scale_dtype_in_intx_unpacked_tensor(
@@ -939,10 +942,15 @@ def _int8_dynamic_activation_intx_weight_quantize_tensor(weight, bias, config):

 @register_quantize_module_handler(Int8DynamicActivationIntxWeightConfig)
 def _int8_dynamic_activation_intx_weight_transform(
-    module: torch.nn.Module, config: Int8DynamicActivationIntxWeightConfig
+    module: torch.nn.Module,
+    config: Int8DynamicActivationIntxWeightConfig,
+    **kwargs,
 ) -> torch.nn.Module:
     new_weight, new_bias = _int8_dynamic_activation_intx_weight_quantize_tensor(
-        module.weight, module.bias, config
+        module.weight,
+        module.bias,
+        config,
+        **kwargs,
     )
     module.weight = torch.nn.Parameter(new_weight, requires_grad=False)
     if new_bias is None:
@@ -2177,7 +2185,7 @@ def __post_init__(self):
         )


-def _intx_weight_only_quantize_tensor(weight, config):
+def _intx_weight_only_quantize_tensor(weight, config, **kwargs):
     weight_dtype = config.weight_dtype
     granularity = config.granularity
     mapping_type = config.mapping_type
@@ -2207,6 +2215,7 @@ def _intx_weight_only_quantize_tensor(weight, config):
         block_size,
         weight_dtype,
         mapping_type=mapping_type,
+        **kwargs,
     )
     if scale_dtype is not None and scale_dtype != weight.dtype:
         _adjust_scale_dtype_in_intx_unpacked_tensor(
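
Because the QAT convert path now forwards **kwargs unconditionally, any base-config handler reachable from that path needs to accept (and ideally forward) the extra keyword arguments; a handler with a strict (module, config) signature would raise a TypeError when custom qparams are passed. A minimal sketch of that handler shape, using hypothetical names:

# Hypothetical config/handler names for illustration only; the pattern mirrors
# the Int8DynamicActivationIntxWeightConfig changes above.
@register_quantize_module_handler(SomeOtherWeightConfig)
def _some_other_weight_transform(
    module: torch.nn.Module,
    config: SomeOtherWeightConfig,
    **kwargs,  # e.g. custom_scale / custom_zero_point from the QAT convert step
) -> torch.nn.Module:
    new_weight = _some_other_weight_quantize_tensor(module.weight, config, **kwargs)
    module.weight = torch.nn.Parameter(new_weight, requires_grad=False)
    return module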

torchao/quantization/quantize_/workflows/intx/intx_unpacked_to_int8_tensor.py

Lines changed: 18 additions & 9 deletions
@@ -177,20 +177,29 @@ def from_hp(
         activation_quantization: Optional[
             IntxUnpackedToInt8TensorActivationQuantization
         ] = None,
+        custom_scale: Optional[torch.Tensor] = None,
+        custom_zero_point: Optional[torch.Tensor] = None,
     ):
         """
         Create an IntxUnpackedToInt8Tensor from a high-precision tensor
         """
         qmin, qmax = _DTYPE_TO_QVALUE_BOUNDS[target_dtype]
-        scale, zero_point = choose_qparams_affine(
-            hp_tensor,
-            mapping_type,
-            block_size,
-            target_dtype=torch.int8,
-            quant_min=qmin,
-            quant_max=qmax,
-            zero_point_dtype=torch.int8,
-        )
+        if custom_scale is not None and custom_zero_point is not None:
+            scale, zero_point = custom_scale, custom_zero_point
+        elif custom_scale is None and custom_zero_point is None:
+            scale, zero_point = choose_qparams_affine(
+                hp_tensor,
+                mapping_type,
+                block_size,
+                target_dtype=torch.int8,
+                quant_min=qmin,
+                quant_max=qmax,
+                zero_point_dtype=torch.int8,
+            )
+        else:
+            raise ValueError(
+                "`custom_scale` and `custom_zero_point` must be both defined or both None"
+            )
         qdata = quantize_affine(
             hp_tensor,
             block_size,
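
The new arguments are all-or-nothing: pass both custom_scale and custom_zero_point to skip choose_qparams_affine, pass neither to keep the existing behavior, and any other combination raises ValueError. A rough sketch of calling from_hp directly with learned qparams; the positional argument order, shapes, and import path are assumptions inferred from the call sites above:

import torch

# Import path inferred from the file location in this diff (assumption).
from torchao.quantization.quantize_.workflows.intx.intx_unpacked_to_int8_tensor import (
    IntxUnpackedToInt8Tensor,
)
from torchao.quantization.quant_primitives import MappingType

hp_weight = torch.randn(256, 128)
block_size = (1, 32)  # per-group along the last dim (illustrative)

# Learned qparams, e.g. taken from an IntxFakeQuantizeConfig after range
# learning; one scale/zero_point per (row, group) block for this block_size.
custom_scale = torch.full((256, 4), 0.02)
custom_zero_point = torch.zeros(256, 4, dtype=torch.int8)

t = IntxUnpackedToInt8Tensor.from_hp(
    hp_weight,
    block_size,
    torch.int4,
    mapping_type=MappingType.SYMMETRIC,
    custom_scale=custom_scale,
    custom_zero_point=custom_zero_point,
)

# Passing only one of custom_scale / custom_zero_point raises ValueError.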
