Commit 5301a7e

Bump int4 weight only config version to 2
Summary: Int4WeightOnlyConfig currently supports version 1 and version 2, with version 1 as the default. This PR changes the default to version 2 and updates the call sites accordingly. Call sites that still rely on the old configuration now pass an explicit `version=1`; migrating those call sites to version 2 can be done separately. READMEs are migrated to version 2 directly.

Deprecation: TODO

Test Plan:
Regression tests:
python test/dtypes/test_affine_quantized.py
python test/quantization/test_quant_api.py
python test/quantization/quantize_/workflows/int4/test_int4_marlin_sparse_tensor.py
python test/quantization/quantize_/workflows/int4/test_int4_opaque_tensor.py
python test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py
python test/quantization/quantize_/workflows/int4/test_int4_preshuffled_tensor.py
python test/quantization/quantize_/workflows/int4/test_int4_tensor.py
python test/quantization/quantize_/workflows/int4/test_int4_tile_packed_to_4d_tensor.py

Reviewers:

Subscribers:

Tasks:

Tags:
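To illustrate the call-site change, here is a minimal sketch (assuming the torchao `quantize_` API and a CUDA device; the toy module is illustrative and not taken from this repo): after this commit, `Int4WeightOnlyConfig` defaults to `version=2`, so new call sites drop the explicit `version=2` argument, while call sites that still need the old behavior pin `version=1`.

import torch
from torchao.quantization import Int4WeightOnlyConfig, quantize_

# Toy module for illustration only.
model = torch.nn.Sequential(torch.nn.Linear(256, 512, dtype=torch.bfloat16, device="cuda"))

# After this commit the config defaults to version=2, so no version argument is needed.
quantize_(model, Int4WeightOnlyConfig(group_size=128))

# Call sites that still depend on the old behavior pin the previous version explicitly.
legacy = torch.nn.Sequential(torch.nn.Linear(256, 512, dtype=torch.bfloat16, device="cuda"))
quantize_(legacy, Int4WeightOnlyConfig(group_size=128, version=1))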
1 parent c452495 commit 5301a7e

File tree

18 files changed, +133 -49 lines

.github/scripts/torchao_model_releases/quantize_and_upload.py

Lines changed: 3 additions & 4 deletions
@@ -206,7 +206,7 @@ def _untie_weights_and_save_locally(model_id):
 
 _int4_quant_code = """
 from torchao.quantization import Int4WeightOnlyConfig
-quant_config = Int4WeightOnlyConfig(group_size=128, packing_format="tile_packed_to_4d", int4_choose_qparams_algorithm="hqq", version=2)
+quant_config = Int4WeightOnlyConfig(group_size=128, packing_format="tile_packed_to_4d", int4_choose_qparams_algorithm="hqq")
 quantization_config = TorchAoConfig(quant_type=quant_config)
 quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -256,7 +256,7 @@ def _untie_weights_and_save_locally(model_id):
 )
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-base_config = Int4WeightOnlyConfig(group_size=128, version=2)
+base_config = Int4WeightOnlyConfig(group_size=128)
 quant_config = AWQConfig(base_config, step="prepare")
 quantize_(
     model,
@@ -635,7 +635,6 @@ def quantize_and_upload(
             group_size=128,
             packing_format="tile_packed_to_4d",
             int4_choose_qparams_algorithm="hqq",
-            version=2,
         ),
         "INT8-INT4": ModuleFqnToConfig(
             {
@@ -669,7 +668,7 @@ def quantize_and_upload(
         )
         tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-        base_config = Int4WeightOnlyConfig(group_size=128, version=2)
+        base_config = Int4WeightOnlyConfig(group_size=128)
         quant_config = AWQConfig(base_config, step="prepare")
         quantize_(
             model,
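For orientation, a condensed sketch of the AWQ flow these hunks touch, now relying on the version-2 default. This assumes `AWQConfig` is imported from `torchao.prototype.awq` (the import is not shown in these hunks), and the calibration pass and final convert step are assumptions based on the usual AWQ prepare/convert workflow rather than lines from this diff.

import torch
from torchao.prototype.awq import AWQConfig  # assumed import path for the prototype AWQ workflow
from torchao.quantization import Int4WeightOnlyConfig, quantize_

# Toy stand-in for the model being released; illustrative only.
model = torch.nn.Sequential(torch.nn.Linear(256, 256, dtype=torch.bfloat16, device="cuda"))

base_config = Int4WeightOnlyConfig(group_size=128)  # now picks up the version=2 default
quantize_(model, AWQConfig(base_config, step="prepare"))

# Run a few calibration batches so the AWQ observers see activations.
model(torch.randn(4, 256, dtype=torch.bfloat16, device="cuda"))

# Assumed follow-up step of the AWQ workflow; not part of this hunk.
quantize_(model, AWQConfig(base_config, step="convert"))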

test/dtypes/test_affine_quantized.py

Lines changed: 4 additions & 5 deletions
@@ -28,7 +28,6 @@
 from torchao.quantization import (
     FbgemmConfig,
     GemliteUIntXWeightOnlyConfig,
-    Int4WeightOnlyConfig,
     Int8DynamicActivationInt8WeightConfig,
     float8_weight_only,
     int4_dynamic_activation_int4_weight,
@@ -354,7 +353,7 @@ def test_slice_int4wo(self, device, dtype):
         # out_feature not divisible by 8
         # to test slice + padding for int4 weight only quantization
         dummy = nn.Linear(256, 321, dtype=dtype, device=device)
-        quantize_(dummy, Int4WeightOnlyConfig(version=1))
+        quantize_(dummy, int4_weight_only(version=1))
         # make sure these run without error
         _ = dummy.weight.narrow(0, 0, 64)
         _ = dummy.weight.narrow(1, 0, 128)
@@ -468,7 +467,7 @@ def test_slice_and_copy_int4wo(self, device, dtype):
         l.weight = torch.nn.Parameter(
             torch.zeros(1024, 1024, dtype=torch.bfloat16, device="cuda")
         )
-        quantize_(l, Int4WeightOnlyConfig(version=1))
+        quantize_(l, int4_weight_only(version=1))
         param = l.weight
         param_data = param.data
         param_data = param_data.narrow(0, 0, 512)
@@ -484,7 +483,7 @@ def test_slice_and_copy_int4wo(self, device, dtype):
 
         # dummy_l has random input (shouldn't be 0)
         dummy_l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16)
-        quantize_(dummy_l, Int4WeightOnlyConfig(version=1))
+        quantize_(dummy_l, int4_weight_only(version=1))
         quantized = dummy_l.weight
         quantized = quantized.narrow(0, 0, 512)
 
@@ -503,7 +502,7 @@ def test_mm_int4wo(self, device, dtype):
 
         l = torch.nn.Linear(512, 1024).to(device).to(dtype)
         l.weight = torch.nn.Parameter(weight)
-        quantize_(l, Int4WeightOnlyConfig(version=1))
+        quantize_(l, int4_weight_only(version=1))
         # weight shape: 1024 x 512
         weight = l.weight
 

test/integration/test_load_and_run_checkpoint.py

Lines changed: 93 additions & 18 deletions
@@ -24,9 +24,22 @@
 
 # please check model card for how to generate these models
 
-_DEPRECATED_SINGLE_LINEAR_MODEL_NAMES = [
+# high precision model, used for testing config deprecation warning
+_HIGH_PRECISION_MODEL = "facebook/opt-125m"
+
+_DEPRECATED_SINGLE_LINEAR_MODEL_INFO = [
     # model card: https://huggingface.co/torchao-testing/single-linear-Float8DynamicActivationFloat8WeightConfig-v1-0.13.dev
-    "torchao-testing/single-linear-Float8DynamicActivationFloat8WeightConfig-v1-0.13.dev"
+    (
+        "torchao-testing/single-linear-Float8DynamicActivationFloat8WeightConfig-v1-0.13.dev",
+        1,
+        "Float8DynamicActivationFloat8WeightConfig",
+    ),
+    # model card: https://huggingface.co/torchao-testing/single-linear-Int4WeightOnlyConfig-v1-0.14.dev
+    (
+        "torchao-testing/single-linear-Int4WeightOnlyConfig-v1-0.14.dev",
+        1,
+        "Int4WeightOnlyConfig",
+    ),
 ]
 
 _DEPRECATED_MODEL_INFO = [
@@ -36,15 +49,33 @@
         1,
         "Float8DynamicActivationFloat8WeightConfig",
     ),
+    # model card: https://huggingface.co/torchao-testing/opt-125m-Int4WeightOnlyConfig-v1-0.14.dev
+    (
+        "torchao-testing/opt-125m-Int4WeightOnlyConfig-v1-0.14.dev",
+        1,
+        "Int4WeightOnlyConfig",
+    ),
 ]
 
-_SINGLE_LINEAR_MODEL_NAMES = [
+_SINGLE_LINEAR_MODEL_INFO = [
     # model card: https://huggingface.co/torchao-testing/single-linear-Float8DynamicActivationFloat8WeightConfig-v2-0.13.dev
-    "torchao-testing/single-linear-Float8DynamicActivationFloat8WeightConfig-v2-0.13.dev",
+    (
+        "torchao-testing/single-linear-Float8DynamicActivationFloat8WeightConfig-v2-0.13.dev",
+        2,
+        "Float8DynamicActivationFloat8WeightConfig",
+    ),
     # model card: https://huggingface.co/torchao-testing/single-linear-Int4WeightOnlyConfig-v2-0.13.dev
-    "torchao-testing/single-linear-Int4WeightOnlyConfig-v2-0.13.dev",
+    (
+        "torchao-testing/single-linear-Int4WeightOnlyConfig-v2-0.13.dev",
+        2,
+        "Int4WeightOnlyConfig",
+    ),
     # model card: https://huggingface.co/torchao-testing/single-linear-Int4WeightOnlyConfig-preshuffled-v2-0.13.dev
-    "torchao-testing/single-linear-Int4WeightOnlyConfig-preshuffled-v2-0.13.dev",
+    (
+        "torchao-testing/single-linear-Int4WeightOnlyConfig-preshuffled-v2-0.13.dev",
+        2,
+        "Int4WeightOnlyConfig",
+    ),
 ]
 
 
@@ -55,7 +86,9 @@
     "Skipping the test in fbcode for now, not sure how to download from transformers",
 )
 class TestLoadAndRunCheckpoint(TestCase):
-    def _test_single_linear_helper(self, model_name):
+    def _test_single_linear_helper(
+        self, model_name, version, config_name, is_deprecated
+    ):
         from huggingface_hub import hf_hub_download
 
         downloaded_model = hf_hub_download(model_name, filename="model.pt")
@@ -69,8 +102,20 @@ def _test_single_linear_helper(self, model_name):
         model = torch.nn.Sequential(
             torch.nn.Linear(32, 256, dtype=torch.bfloat16, device="cuda")
         )
-        with open(downloaded_model, "rb") as f:
+
+        with (
+            open(downloaded_model, "rb") as f,
+            warnings.catch_warnings(record=True) as caught_warnings,
+        ):
             model.load_state_dict(torch.load(f), assign=True)
+            if is_deprecated:
+                assert any(
+                    f"Models quantized with version {version} of {config_name} is deprecated"
+                    in str(w.message)
+                    for w in caught_warnings
+                ), (
+                    f"Didn't get expected warning message for deprecation for model: {model_name}"
+                )
 
         downloaded_example_inputs = hf_hub_download(
             model_name, filename="model_inputs.pt"
@@ -84,17 +129,23 @@ def _test_single_linear_helper(self, model_name):
         output = model(*example_inputs)
         self.assertTrue(torch.equal(output, ref_output))
 
-    @common_utils.parametrize("model_name", _DEPRECATED_SINGLE_LINEAR_MODEL_NAMES)
-    def test_deprecated_single_linear(self, model_name):
-        self._test_single_linear_helper(model_name)
+    @common_utils.parametrize("model_info", _DEPRECATED_SINGLE_LINEAR_MODEL_INFO)
+    def test_deprecated_single_linear(self, model_info):
+        model_name, version, config_name = model_info
+        self._test_single_linear_helper(
+            model_name, version, config_name, is_deprecated=True
+        )
 
-    @common_utils.parametrize("model_name", _SINGLE_LINEAR_MODEL_NAMES)
-    def test_single_linear(self, model_name):
+    @common_utils.parametrize("model_info", _SINGLE_LINEAR_MODEL_INFO)
+    def test_single_linear(self, model_info):
         """Test that we can load and run the quantized linear checkpoint with saved sample input
         and match the saved output, to make sure there is no BC breaking changes
         when we make changes to tensor subclass implementations
         """
-        self._test_single_linear_helper(model_name)
+        model_name, version, config_name = model_info
+        self._test_single_linear_helper(
+            model_name, version, config_name, is_deprecated=False
+        )
 
     @common_utils.parametrize("model_info", _DEPRECATED_MODEL_INFO)
     def test_deprecated_hf_models(self, model_info):
@@ -109,17 +160,23 @@ def test_deprecated_hf_models(self, model_info):
             torch_dtype="bfloat16",
             device_map="cuda:0",
         )
+        # version mismatch check in config.py
         assert any(
             "Stored version is not the same as current default version of the config"
             in str(w.message)
             for w in caught_warnings
-        ), "Didn't get expected warning message for version mismatch"
+        ), (
+            f"Didn't get expected warning message for version mismatch for config {config_name}, model {model_name}"
+        )
 
+        # checkpoint deprecation
         assert any(
-            f"Models quantized with version 1 of {config_name} is deprecated"
+            f"Models quantized with version {version} of {config_name} is deprecated"
             in str(w.message)
             for w in caught_warnings
-        ), "Didn't get expected warning message for deprecation"
+        ), (
+            f"Didn't get expected warning message for deprecation for model {model_name}"
+        )
         assert isinstance(quantized_model.config.quantization_config, TorchAoConfig)
         assert (
             quantized_model.config.quantization_config.quant_type.version == version
@@ -139,7 +196,8 @@ def test_deprecated_hf_models(self, model_info):
             return_tensors="pt",
         ).to("cuda")
         generated_ids = quantized_model.generate(
-            **inputs, max_new_tokens=128, temperature=0
+            **inputs,
+            max_new_tokens=128,
         )
 
         downloaded_output = hf_hub_download(model_name, filename="model_output.pt")
@@ -153,6 +211,23 @@ def test_deprecated_hf_models(self, model_info):
             generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )
 
+        # make sure we throw warning for config deprecation
+        with warnings.catch_warnings(record=True) as caught_warnings:
+            _ = AutoModelForCausalLM.from_pretrained(
+                _HIGH_PRECISION_MODEL,
+                torch_dtype="bfloat16",
+                device_map="cuda:0",
+                quantization_config=quantized_model.config.quantization_config,
+            )
+            # config version deprecation in quant_api.py
+            assert any(
+                f"Config Deprecation: version {version} of {config_name} is deprecated and will no longer be supported in a future release"
+                in str(w.message)
+                for w in caught_warnings
+            ), (
+                f"Didn't get expected warning message for version deprecation for config {config_name}, model {model_name}"
+            )
+
 
 common_utils.instantiate_parametrized_tests(TestLoadAndRunCheckpoint)
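For context, a minimal sketch of what these assertions exercise when one of the version-1 checkpoints listed above is loaded (warning strings and the model name are copied from the hunks; this assumes the transformers + torchao integration used by the test and a CUDA device):

import warnings
from transformers import AutoModelForCausalLM

with warnings.catch_warnings(record=True) as caught_warnings:
    warnings.simplefilter("always")
    quantized_model = AutoModelForCausalLM.from_pretrained(
        "torchao-testing/opt-125m-Int4WeightOnlyConfig-v1-0.14.dev",
        torch_dtype="bfloat16",
        device_map="cuda:0",
    )

# Expected, per the assertions above:
#   "Stored version is not the same as current default version of the config"
#   "Models quantized with version 1 of Int4WeightOnlyConfig is deprecated"
for w in caught_warnings:
    print(w.message)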
test/prototype/test_awq.py

Lines changed: 3 additions & 3 deletions
@@ -73,7 +73,7 @@ def test_awq_functionality(self):
         m = ToyLinearModel(l1, l2, l3).eval().to(original_dtype).to(device)
 
         # baseline quantization
-        base_config = Int4WeightOnlyConfig(group_size=group_size, version=2)
+        base_config = Int4WeightOnlyConfig(group_size=group_size)
         m_baseline = copy.deepcopy(m)
         quantize_(m_baseline, base_config)
 
@@ -123,7 +123,7 @@ def test_awq_loading(self):
         calibration_data = dataset[:n_calibration_examples]
 
         # calibrate
-        base_config = Int4WeightOnlyConfig(group_size=group_size, version=2)
+        base_config = Int4WeightOnlyConfig(group_size=group_size)
         quant_config = AWQConfig(base_config, step=AWQStep.PREPARE)
         quantize_(m, quant_config)
 
@@ -177,7 +177,7 @@ def test_awq_loading_vllm(self):
         calibration_data = dataset[:n_calibration_examples]
 
         # calibrate
-        base_config = Int4WeightOnlyConfig(group_size=group_size, version=2)
+        base_config = Int4WeightOnlyConfig(group_size=group_size)
         quant_config = AWQConfig(base_config, step=AWQStep.PREPARE)
         quantize_(m, quant_config)
 

test/quantization/quantize_/workflows/int4/test_int4_marlin_sparse_tensor.py

Lines changed: 0 additions & 1 deletion
@@ -27,7 +27,6 @@
 BF16_ACT_CONFIG = Int4WeightOnlyConfig(
     group_size=128,
     int4_packing_format="marlin_sparse",
-    version=2,
 )
 

test/quantization/quantize_/workflows/int4/test_int4_opaque_tensor.py

Lines changed: 0 additions & 1 deletion
@@ -29,7 +29,6 @@ def get_config(group_size):
     return Int4WeightOnlyConfig(
         group_size=group_size,
         int4_packing_format="opaque",
-        version=2,
     )
 

test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py

Lines changed: 0 additions & 1 deletion
@@ -29,7 +29,6 @@ def get_config(group_size):
     return Int4WeightOnlyConfig(
         group_size=group_size,
         int4_packing_format="plain_int32",
-        version=2,
     )
 

test/quantization/quantize_/workflows/int4/test_int4_preshuffled_tensor.py

Lines changed: 0 additions & 1 deletion
@@ -30,7 +30,6 @@
 BF16_ACT_CONFIG = Int4WeightOnlyConfig(
     group_size=128,
     int4_packing_format="preshuffled",
-    version=2,
 )
 
 # only 128 group_size is supported

test/quantization/quantize_/workflows/int4/test_int4_tensor.py

Lines changed: 0 additions & 1 deletion
@@ -35,7 +35,6 @@ def setUp(self):
         self.config = Int4WeightOnlyConfig(
             group_size=128,
             int4_packing_format="plain",
-            version=2,
         )
         self.GPU_DEVICES = ["cuda"] if torch.cuda.is_available() else []
 

test/quantization/quantize_/workflows/int4/test_int4_tile_packed_to_4d_tensor.py

Lines changed: 0 additions & 2 deletions
@@ -25,14 +25,12 @@
 INT4_CONFIG = Int4WeightOnlyConfig(
     group_size=128,
     int4_packing_format="tile_packed_to_4d",
-    version=2,
 )
 
 INT4_HQQ_CONFIG = Int4WeightOnlyConfig(
     group_size=128,
     int4_packing_format="tile_packed_to_4d",
     int4_choose_qparams_algorithm="hqq",
-    version=2,
 )
 
