Commit 0ad98af

Bump int4 weight only config version to 2
Summary:
Int4WeightOnlyConfig currently supports versions 1 and 2, with version 1 as the default. This PR changes the default to version 2 and updates the affected call sites. Call sites that still rely on the old configuration now pass an explicit `version=1`; migrating those call sites to version 2 can be done separately. READMEs are migrated to the version 2 usage directly.

Deprecation: TODO

Test Plan:
Regression tests:
python test/dtypes/test_affine_quantized.py
python test/quantization/test_quant_api.py
python test/quantization/quantize_/workflows/int4/test_int4_marlin_sparse_tensor.py
python test/quantization/quantize_/workflows/int4/test_int4_opaque_tensor.py
python test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py
python test/quantization/quantize_/workflows/int4/test_int4_preshuffled_tensor.py
python test/quantization/quantize_/workflows/int4/test_int4_tensor.py
python test/quantization/quantize_/workflows/int4/test_int4_tile_packed_to_4d_tensor.py

Reviewers:

Subscribers:

Tasks:

Tags:
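What the default bump means for callers, as a minimal sketch (assuming a CUDA + bfloat16 setup like the tests in this PR; the keyword combinations are taken from the diffs below, and the single-linear model is a hypothetical example, not code from this repo):

import torch
from torchao.quantization import Int4WeightOnlyConfig, quantize_

# hypothetical toy model for illustration
model = torch.nn.Sequential(
    torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
)

# Before this PR, Int4WeightOnlyConfig defaulted to version=1.
# Call sites that still need the old flow now pin it explicitly:
legacy_config = Int4WeightOnlyConfig(group_size=128, version=1)

# After this PR, plain construction picks up version=2, e.g. the
# HQQ + tile_packed_to_4d combination used in quantize_and_upload.py:
new_config = Int4WeightOnlyConfig(
    group_size=128,
    packing_format="tile_packed_to_4d",
    int4_choose_qparams_algorithm="hqq",
)

quantize_(model, new_config)  # quantizes the linear weight in place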
1 parent 4872c4f commit 0ad98af

24 files changed: +174 −74 lines

.github/scripts/torchao_model_releases/quantize_and_upload.py

Lines changed: 3 additions & 4 deletions
@@ -206,7 +206,7 @@ def _untie_weights_and_save_locally(model_id):
 
 _int4_quant_code = """
 from torchao.quantization import Int4WeightOnlyConfig
-quant_config = Int4WeightOnlyConfig(group_size=128, packing_format="tile_packed_to_4d", int4_choose_qparams_algorithm="hqq", version=2)
+quant_config = Int4WeightOnlyConfig(group_size=128, packing_format="tile_packed_to_4d", int4_choose_qparams_algorithm="hqq")
 quantization_config = TorchAoConfig(quant_type=quant_config)
 quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -256,7 +256,7 @@ def _untie_weights_and_save_locally(model_id):
 )
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-base_config = Int4WeightOnlyConfig(group_size=128, version=2)
+base_config = Int4WeightOnlyConfig(group_size=128)
 quant_config = AWQConfig(base_config, step="prepare")
 quantize_(
     model,
@@ -635,7 +635,6 @@ def quantize_and_upload(
             group_size=128,
             packing_format="tile_packed_to_4d",
             int4_choose_qparams_algorithm="hqq",
-            version=2,
         ),
         "INT8-INT4": ModuleFqnToConfig(
             {
@@ -669,7 +668,7 @@ def quantize_and_upload(
 )
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-base_config = Int4WeightOnlyConfig(group_size=128, version=2)
+base_config = Int4WeightOnlyConfig(group_size=128)
 quant_config = AWQConfig(base_config, step="prepare")
 quantize_(
     model,

benchmarks/microbenchmarks/test/test_benchmark_inference.py

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config):
 
         # Test with semi-sparse config
         mock_string_to_config.return_value = Int4WeightOnlyConfig(
-            layout=MarlinSparseLayout()
+            layout=MarlinSparseLayout(), version=1
         )
         config = BenchmarkConfig(
             quantization="marlin",

benchmarks/microbenchmarks/utils.py

Lines changed: 1 addition & 1 deletion
@@ -206,7 +206,7 @@ def string_to_config(
             128,
             256,
         ], f"int4wo group_size needs to be one of [32,64,128,256] but got {group_size}"
-        return Int4WeightOnlyConfig(group_size=group_size, use_hqq=use_hqq)
+        return Int4WeightOnlyConfig(group_size=group_size, use_hqq=True, version=1)
     elif "int8adq-int4w-symm" in quantization:
         from torchao.dtypes import CutlassInt4PackedLayout

docs/source/torchao_vllm_integration.md

Lines changed: 2 additions & 1 deletion
@@ -45,6 +45,7 @@ from torchao.quantization import Int4WeightOnlyConfig
 config = Int4WeightOnlyConfig(
     group_size=128,
     use_hqq=True,
+    version=1,
 )
 assert isinstance(config, AOBaseConfig)
 ```
@@ -81,7 +82,7 @@ from torchao.quantization import Int4WeightOnlyConfig
 
 # Create quantization configuration
 quantization_config = TorchAoConfig(
-    quant_type=Int4WeightOnlyConfig(group_size=128, use_hqq=True)
+    quant_type=Int4WeightOnlyConfig(group_size=128, use_hqq=True, version=1)
 )
 
 # Load and automatically quantize the model
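Note that the vLLM integration doc above keeps the legacy `use_hqq=True` spelling and therefore now pins `version=1`. For comparison, a hedged sketch of the version-2 style used elsewhere in this commit, where HQQ is requested via `int4_choose_qparams_algorithm` instead of `use_hqq` (shown for illustration only, not necessarily a drop-in replacement for the vLLM flow):

from torchao.quantization import Int4WeightOnlyConfig

# version 2 (now the default): HQQ qparams selection is named explicitly,
# and no layout/use_hqq arguments are passed
config = Int4WeightOnlyConfig(
    group_size=128,
    int4_choose_qparams_algorithm="hqq",
)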

test/dtypes/test_affine_quantized.py

Lines changed: 10 additions & 10 deletions
@@ -32,7 +32,6 @@
     Int8DynamicActivationInt8WeightConfig,
     float8_weight_only,
     int4_dynamic_activation_int4_weight,
-    int4_weight_only,
     int8_dynamic_activation_int4_weight,
     int8_dynamic_activation_int8_weight,
     int8_weight_only,
@@ -66,22 +65,23 @@ def get_quantization_functions(
     if do_int4:
         if check_cpu_version(device):
             base_functions.append(
-                int4_weight_only(group_size=32, layout=Int4CPULayout())
+                Int4WeightOnlyConfig(group_size=32, layout=Int4CPULayout(), version=1)
             )
         elif check_xpu_version(device):
            base_functions.append(
-                int4_weight_only(group_size=32, layout=Int4XPULayout())
+                Int4WeightOnlyConfig(group_size=32, layout=Int4XPULayout(), version=1)
             )
             if int4_zp_int:
                 base_functions.append(
-                    int4_weight_only(
+                    Int4WeightOnlyConfig(
                         group_size=32,
                         layout=Int4XPULayout(),
                         zero_point_domain=ZeroPointDomain.INT,
+                        version=1,
                     )
                 )
         else:
-            base_functions.append(int4_weight_only(group_size=32))
+            base_functions.append(Int4WeightOnlyConfig(group_size=32, version=1))
     if device == "cuda" and not is_ROCM():
         base_functions.append(
             int8_dynamic_activation_int4_weight(
@@ -118,7 +118,7 @@ def test_tensor_core_layout_transpose(self):
         linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
         t = linear.weight
         shape = t.shape
-        apply_int4_weight_only_quant = int4_weight_only(group_size=32)
+        apply_int4_weight_only_quant = Int4WeightOnlyConfig(group_size=32, version=1)
         quantize_(linear, apply_int4_weight_only_quant)
         ql = linear
         aqt = ql.weight
@@ -353,7 +353,7 @@ def test_slice_int4wo(self, device, dtype):
         # out_feature not divisible by 8
         # to test slice + padding for int4 weight only quantization
         dummy = nn.Linear(256, 321, dtype=dtype, device=device)
-        quantize_(dummy, Int4WeightOnlyConfig())
+        quantize_(dummy, Int4WeightOnlyConfig(version=1))
         # make sure these run without error
         _ = dummy.weight.narrow(0, 0, 64)
         _ = dummy.weight.narrow(1, 0, 128)
@@ -467,7 +467,7 @@ def test_slice_and_copy_int4wo(self, device, dtype):
         l.weight = torch.nn.Parameter(
             torch.zeros(1024, 1024, dtype=torch.bfloat16, device="cuda")
         )
-        quantize_(l, Int4WeightOnlyConfig())
+        quantize_(l, Int4WeightOnlyConfig(version=1))
         param = l.weight
         param_data = param.data
         param_data = param_data.narrow(0, 0, 512)
@@ -483,7 +483,7 @@ def test_slice_and_copy_int4wo(self, device, dtype):
 
         # dummy_l has random input (shouldn't be 0)
         dummy_l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16)
-        quantize_(dummy_l, Int4WeightOnlyConfig())
+        quantize_(dummy_l, Int4WeightOnlyConfig(version=1))
         quantized = dummy_l.weight
         quantized = quantized.narrow(0, 0, 512)
 
@@ -502,7 +502,7 @@ def test_mm_int4wo(self, device, dtype):
 
         l = torch.nn.Linear(512, 1024).to(device).to(dtype)
         l.weight = torch.nn.Parameter(weight)
-        quantize_(l, Int4WeightOnlyConfig())
+        quantize_(l, Int4WeightOnlyConfig(version=1))
         # weight shape: 1024 x 512
         weight = l.weight

test/integration/test_integration.py

Lines changed: 10 additions & 4 deletions
@@ -135,17 +135,23 @@ def _int4wo_api(mod, use_hqq=False):
         quantize_(
             mod,
             int4_weight_only(
-                layout=Int4CPULayout(), use_hqq=use_hqq, set_inductor_config=False
+                layout=Int4CPULayout(),
+                use_hqq=use_hqq,
+                set_inductor_config=False,
+                version=1,
             ),
         )
         unwrap_tensor_subclass(mod)
     elif check_xpu_version(next(mod.parameters()).device):
         quantize_(
-            mod, int4_weight_only(layout=Int4XPULayout()), set_inductor_config=False
+            mod,
+            int4_weight_only(layout=Int4XPULayout()),
+            set_inductor_config=False,
+            version=1,
         )
         unwrap_tensor_subclass(mod)
     else:
-        quantize_(mod, int4_weight_only(set_inductor_config=False))
+        quantize_(mod, int4_weight_only(set_inductor_config=False, version=1))
 
 
 def _int8da_int4w_api(mod):
@@ -1077,7 +1083,7 @@ def test_int4_weight_only_quant_subclass_api_grouped(self, device, dtype):
         ):
             for groupsize in [64, 32]:
                 for layout in layout_list:
-                    kwargs = {"groupsize": groupsize, "layout": layout}
+                    kwargs = {"groupsize": groupsize, "layout": layout, "version": 1}
 
                     def api(mod):
                         kwargs_copy = kwargs.copy()

test/integration/test_load_and_run_checkpoint.py

Lines changed: 93 additions & 18 deletions
@@ -24,9 +24,22 @@
 
 # please check model card for how to generate these models
 
-_DEPRECATED_SINGLE_LINEAR_MODEL_NAMES = [
+# high precision model, used for testing config deprecation warning
+_HIGH_PRECISION_MODEL = "facebook/opt-125m"
+
+_DEPRECATED_SINGLE_LINEAR_MODEL_INFO = [
     # model card: https://huggingface.co/torchao-testing/single-linear-Float8DynamicActivationFloat8WeightConfig-v1-0.13.dev
-    "torchao-testing/single-linear-Float8DynamicActivationFloat8WeightConfig-v1-0.13.dev"
+    (
+        "torchao-testing/single-linear-Float8DynamicActivationFloat8WeightConfig-v1-0.13.dev",
+        1,
+        "Float8DynamicActivationFloat8WeightConfig",
+    ),
+    # model card: https://huggingface.co/torchao-testing/single-linear-Int4WeightOnlyConfig-v1-0.14.dev
+    (
+        "torchao-testing/single-linear-Int4WeightOnlyConfig-v1-0.14.dev",
+        1,
+        "Int4WeightOnlyConfig",
+    ),
 ]
 
 _DEPRECATED_MODEL_INFO = [
@@ -36,15 +49,33 @@
         1,
         "Float8DynamicActivationFloat8WeightConfig",
     ),
+    # model card: https://huggingface.co/torchao-testing/opt-125m-Int4WeightOnlyConfig-v1-0.14.dev
+    (
+        "torchao-testing/opt-125m-Int4WeightOnlyConfig-v1-0.14.dev",
+        1,
+        "Int4WeightOnlyConfig",
+    ),
 ]
 
-_SINGLE_LINEAR_MODEL_NAMES = [
+_SINGLE_LINEAR_MODEL_INFO = [
     # model card: https://huggingface.co/torchao-testing/single-linear-Float8DynamicActivationFloat8WeightConfig-v2-0.13.dev
-    "torchao-testing/single-linear-Float8DynamicActivationFloat8WeightConfig-v2-0.13.dev",
+    (
+        "torchao-testing/single-linear-Float8DynamicActivationFloat8WeightConfig-v2-0.13.dev",
+        2,
+        "Float8DynamicActivationFloat8WeightConfig",
+    ),
     # model card: https://huggingface.co/torchao-testing/single-linear-Int4WeightOnlyConfig-v2-0.13.dev
-    "torchao-testing/single-linear-Int4WeightOnlyConfig-v2-0.13.dev",
+    (
+        "torchao-testing/single-linear-Int4WeightOnlyConfig-v2-0.13.dev",
+        2,
+        "Int4WeightOnlyConfig",
+    ),
     # model card: https://huggingface.co/torchao-testing/single-linear-Int4WeightOnlyConfig-preshuffled-v2-0.13.dev
-    "torchao-testing/single-linear-Int4WeightOnlyConfig-preshuffled-v2-0.13.dev",
+    (
+        "torchao-testing/single-linear-Int4WeightOnlyConfig-preshuffled-v2-0.13.dev",
+        2,
+        "Int4WeightOnlyConfig",
+    ),
 ]
 
 
@@ -55,7 +86,9 @@
     "Skipping the test in fbcode for now, not sure how to download from transformers",
 )
 class TestLoadAndRunCheckpoint(TestCase):
-    def _test_single_linear_helper(self, model_name):
+    def _test_single_linear_helper(
+        self, model_name, version, config_name, is_deprecated
+    ):
         from huggingface_hub import hf_hub_download
 
         downloaded_model = hf_hub_download(model_name, filename="model.pt")
@@ -69,8 +102,20 @@ def _test_single_linear_helper(self, model_name):
         model = torch.nn.Sequential(
             torch.nn.Linear(32, 256, dtype=torch.bfloat16, device="cuda")
         )
-        with open(downloaded_model, "rb") as f:
+
+        with (
+            open(downloaded_model, "rb") as f,
+            warnings.catch_warnings(record=True) as caught_warnings,
+        ):
             model.load_state_dict(torch.load(f), assign=True)
+        if is_deprecated:
+            assert any(
+                f"Models quantized with version {version} of {config_name} is deprecated"
+                in str(w.message)
+                for w in caught_warnings
+            ), (
+                f"Didn't get expected warning message for deprecation for model: {model_name}"
+            )
 
         downloaded_example_inputs = hf_hub_download(
             model_name, filename="model_inputs.pt"
@@ -84,17 +129,23 @@ def _test_single_linear_helper(self, model_name):
         output = model(*example_inputs)
         self.assertTrue(torch.equal(output, ref_output))
 
-    @common_utils.parametrize("model_name", _DEPRECATED_SINGLE_LINEAR_MODEL_NAMES)
-    def test_deprecated_single_linear(self, model_name):
-        self._test_single_linear_helper(model_name)
+    @common_utils.parametrize("model_info", _DEPRECATED_SINGLE_LINEAR_MODEL_INFO)
+    def test_deprecated_single_linear(self, model_info):
+        model_name, version, config_name = model_info
+        self._test_single_linear_helper(
+            model_name, version, config_name, is_deprecated=True
+        )
 
-    @common_utils.parametrize("model_name", _SINGLE_LINEAR_MODEL_NAMES)
-    def test_single_linear(self, model_name):
+    @common_utils.parametrize("model_info", _SINGLE_LINEAR_MODEL_INFO)
+    def test_single_linear(self, model_info):
         """Test that we can load and run the quantized linear checkpoint with saved sample input
         and match the saved output, to make sure there is no BC breaking changes
         when we make changes to tensor subclass implementations
         """
-        self._test_single_linear_helper(model_name)
+        model_name, version, config_name = model_info
+        self._test_single_linear_helper(
+            model_name, version, config_name, is_deprecated=False
+        )
 
     @common_utils.parametrize("model_info", _DEPRECATED_MODEL_INFO)
     def test_deprecated_hf_models(self, model_info):
@@ -109,17 +160,23 @@ def test_deprecated_hf_models(self, model_info):
             torch_dtype="bfloat16",
             device_map="cuda:0",
         )
+        # version mismatch check in config.py
        assert any(
             "Stored version is not the same as current default version of the config"
             in str(w.message)
             for w in caught_warnings
-        ), "Didn't get expected warning message for version mismatch"
+        ), (
+            f"Didn't get expected warning message for version mismatch for config {config_name}, model {model_name}"
+        )
 
+        # checkpoint deprecation
         assert any(
-            f"Models quantized with version 1 of {config_name} is deprecated"
+            f"Models quantized with version {version} of {config_name} is deprecated"
             in str(w.message)
             for w in caught_warnings
-        ), "Didn't get expected warning message for deprecation"
+        ), (
+            f"Didn't get expected warning message for deprecation for model {model_name}"
+        )
         assert isinstance(quantized_model.config.quantization_config, TorchAoConfig)
         assert (
             quantized_model.config.quantization_config.quant_type.version == version
@@ -139,7 +196,8 @@ def test_deprecated_hf_models(self, model_info):
             return_tensors="pt",
         ).to("cuda")
         generated_ids = quantized_model.generate(
-            **inputs, max_new_tokens=128, temperature=0
+            **inputs,
+            max_new_tokens=128,
         )
 
         downloaded_output = hf_hub_download(model_name, filename="model_output.pt")
@@ -153,6 +211,23 @@ def test_deprecated_hf_models(self, model_info):
             generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )
 
+        # make sure we throw warning for config deprecation
+        with warnings.catch_warnings(record=True) as caught_warnings:
+            _ = AutoModelForCausalLM.from_pretrained(
+                _HIGH_PRECISION_MODEL,
+                torch_dtype="bfloat16",
+                device_map="cuda:0",
+                quantization_config=quantized_model.config.quantization_config,
+            )
+        # config version deprecation in quant_api.py
+        assert any(
+            f"Config Deprecation: version {version} of {config_name} is deprecated and will no longer be supported in a future release"
+            in str(w.message)
+            for w in caught_warnings
+        ), (
+            f"Didn't get expected warning message for version deprecation for config {config_name}, model {model_name}"
+        )
+
 
 common_utils.instantiate_parametrized_tests(TestLoadAndRunCheckpoint)
