
Commit 2f0b504

[draft] Pass QAT learned qparams in convert
**Summary:** Draft prototype to pass scales and zero points learned during QAT range learning through to the PTQ base config at convert time.

**Test Plan:** TBD
Parent: 9e5059e

3 files changed (+27 −14)
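For context, a minimal sketch of the end-to-end flow this draft targets. The API names (`QATConfig`, `IntxFakeQuantizeConfig`, `quantize_`) follow the torchao QAT README; the specific dtype, group size, and the `initialize_fake_quantizers` call are illustrative assumptions, not taken from this commit:

```python
import torch
from torchao.quantization import Int8DynamicActivationIntxWeightConfig, quantize_
from torchao.quantization.granularity import PerGroup
from torchao.quantization.qat import (
    IntxFakeQuantizeConfig,
    QATConfig,
    initialize_fake_quantizers,
)

model = torch.nn.Sequential(torch.nn.Linear(64, 64))

# Prepare: swap nn.Linear for FakeQuantizedLinear. With range_learning=True,
# the weight scales/zero points become trainable parameters instead of being
# recomputed from min/max statistics on every step.
weight_config = IntxFakeQuantizeConfig(
    torch.int4, group_size=32, is_dynamic=False, range_learning=True
)
quantize_(model, QATConfig(weight_config=weight_config, step="prepare"))
initialize_fake_quantizers(model, (torch.randn(1, 64),))

# ... fine-tune; the optimizer updates the learned qparams ...

# Convert: with this draft, the learned scales/zero points are forwarded to
# the PTQ base config handler instead of being re-derived from the weights.
base_config = Int8DynamicActivationIntxWeightConfig(
    weight_dtype=torch.int4, weight_granularity=PerGroup(32)
)
quantize_(model, QATConfig(base_config, step="convert"))
```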

torchao/quantization/qat/api.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -227,15 +227,21 @@ def _qat_config_transform(
         assert step == QATStep.CONVERT, "unexpected step '%s' in QATConfig" % step
         assert config.activation_config is None, "unexpected `activation_config`"
         assert config.weight_config is None, "unexpected `weight_config`"
+        kwargs = {}
         if isinstance(module, FakeQuantizedLinear):
+            # Optionally pass custom scales and zero points to the base config handler
+            weight_config = module.weight_fake_quantizer.config
+            if isinstance(weight_config, IntxFakeQuantizeConfig) and weight_config.range_learning:
+                kwargs["custom_scale"] = weight_config.scale
+                kwargs["custom_zero_point"] = weight_config.zero_point
             module = module.to_linear()
         elif isinstance(module, FakeQuantizedEmbedding):
             module = module.to_embedding()
         else:
             # Unrelated module, ignore
             return module
         if base_config is not None:
-            return _QUANTIZE_CONFIG_HANDLER[type(base_config)](module, base_config)
+            return _QUANTIZE_CONFIG_HANDLER[type(base_config)](module, base_config, **kwargs)
         else:
             return module
```
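Two details in this hunk are worth calling out. The learned qparams are harvested only when the weight fake quantizer was configured with `range_learning=True`; dynamic fake-quantize configs derive fresh scales from each batch, so they have no persistent values to carry into convert. Passing the values through `**kwargs` also keeps the base-config handlers backward compatible: for any module that does not hit the range-learning branch, `kwargs` stays empty and the handler call is unchanged.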

torchao/quantization/quant_api.py

Lines changed: 6 additions & 4 deletions
```diff
@@ -809,7 +809,7 @@ def __post_init__(self):
         )


-def _int8_dynamic_activation_intx_weight_quantize_tensor(weight, bias, config):
+def _int8_dynamic_activation_intx_weight_quantize_tensor(weight, bias, config, **kwargs):
     weight_dtype = config.weight_dtype
     weight_granularity = config.weight_granularity
     weight_mapping_type = config.weight_mapping_type
@@ -853,6 +853,7 @@ def _int8_dynamic_activation_intx_weight_quantize_tensor(weight, bias, config):
         weight_dtype,
         mapping_type=weight_mapping_type,
         activation_quantization="int8_asym_per_token",
+        **kwargs,
     )
     if weight_scale_dtype is not None and weight_scale_dtype != weight.dtype:
         _adjust_scale_dtype_in_intx_unpacked_tensor(
@@ -939,10 +940,10 @@ def _int8_dynamic_activation_intx_weight_quantize_tensor(weight, bias, config):

 @register_quantize_module_handler(Int8DynamicActivationIntxWeightConfig)
 def _int8_dynamic_activation_intx_weight_transform(
-    module: torch.nn.Module, config: Int8DynamicActivationIntxWeightConfig
+    module: torch.nn.Module, config: Int8DynamicActivationIntxWeightConfig, **kwargs
 ) -> torch.nn.Module:
     new_weight, new_bias = _int8_dynamic_activation_intx_weight_quantize_tensor(
-        module.weight, module.bias, config
+        module.weight, module.bias, config, **kwargs
     )
     module.weight = torch.nn.Parameter(new_weight, requires_grad=False)
     if new_bias is None:
@@ -2177,7 +2178,7 @@ def __post_init__(self):
         )


-def _intx_weight_only_quantize_tensor(weight, config):
+def _intx_weight_only_quantize_tensor(weight, config, **kwargs):
     weight_dtype = config.weight_dtype
     granularity = config.granularity
     mapping_type = config.mapping_type
@@ -2207,6 +2208,7 @@ def _intx_weight_only_quantize_tensor(weight, config):
         block_size,
         weight_dtype,
         mapping_type=mapping_type,
+        **kwargs,
     )
     if scale_dtype is not None and scale_dtype != weight.dtype:
         _adjust_scale_dtype_in_intx_unpacked_tensor(
```
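Both transforms above are looked up through torchao's config-to-handler registry. A stripped-down sketch of that mechanism (simplified for illustration; the real registry and decorator live in torchao/quantization/quant_api.py):

```python
from typing import Callable, Dict, Type

# Maps a quantization config class to the function that transforms a module.
# Simplified stand-in for the registry in torchao/quantization/quant_api.py.
_QUANTIZE_CONFIG_HANDLER: Dict[Type, Callable] = {}


def register_quantize_module_handler(config_cls: Type) -> Callable:
    """Decorator that registers a handler as the transform for config_cls."""

    def decorator(handler: Callable) -> Callable:
        _QUANTIZE_CONFIG_HANDLER[config_cls] = handler
        return handler

    return decorator


# Because the handlers now accept **kwargs, a caller such as
# _qat_config_transform can look a handler up by config type and thread
# extra arguments through it:
#   handler = _QUANTIZE_CONFIG_HANDLER[type(base_config)]
#   handler(module, base_config, custom_scale=..., custom_zero_point=...)
```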

torchao/quantization/quantize_/workflows/intx/intx_unpacked_to_int8_tensor.py

Lines changed: 14 additions & 9 deletions
```diff
@@ -177,20 +177,25 @@ def from_hp(
         activation_quantization: Optional[
             IntxUnpackedToInt8TensorActivationQuantization
         ] = None,
+        custom_scale: Optional[torch.Tensor] = None,
+        custom_zero_point: Optional[torch.Tensor] = None,
     ):
         """
         Create an IntxUnpackedToInt8Tensor from a high-precision tensor
         """
         qmin, qmax = _DTYPE_TO_QVALUE_BOUNDS[target_dtype]
-        scale, zero_point = choose_qparams_affine(
-            hp_tensor,
-            mapping_type,
-            block_size,
-            target_dtype=torch.int8,
-            quant_min=qmin,
-            quant_max=qmax,
-            zero_point_dtype=torch.int8,
-        )
+        if custom_scale is None or custom_zero_point is None:
+            scale, zero_point = choose_qparams_affine(
+                hp_tensor,
+                mapping_type,
+                block_size,
+                target_dtype=torch.int8,
+                quant_min=qmin,
+                quant_max=qmax,
+                zero_point_dtype=torch.int8,
+            )
+        else:
+            scale, zero_point = custom_scale, custom_zero_point
         qdata = quantize_affine(
             hp_tensor,
             block_size,
```
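A hypothetical call into the new escape hatch. The positional order `(hp_tensor, block_size, target_dtype)` is inferred from the call sites in quant_api.py above, and the toy tensor values and qparam shapes are assumptions for illustration:

```python
import torch
from torchao.quantization.quantize_.workflows.intx.intx_unpacked_to_int8_tensor import (
    IntxUnpackedToInt8Tensor,
)

w = torch.randn(8, 32)
block_size = (1, 32)  # one quantization group per output row (assumed)

# Default path: choose_qparams_affine derives scale/zero_point from the weight.
q_default = IntxUnpackedToInt8Tensor.from_hp(w, block_size, torch.int4)

# New path: reuse qparams learned during QAT range learning. Both tensors must
# be provided; otherwise from_hp falls back to choose_qparams_affine.
scale = torch.full((8, 1), 0.05)
zero_point = torch.zeros(8, 1, dtype=torch.int8)
q_learned = IntxUnpackedToInt8Tensor.from_hp(
    w,
    block_size,
    torch.int4,
    custom_scale=scale,
    custom_zero_point=zero_point,
)
```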
