77import tempfile
88
99import torch
10- from parameterized import parameterized
1110from torch .testing ._internal .common_utils import (
1211 TestCase ,
12+ instantiate_parametrized_tests ,
13+ parametrize ,
1314 run_tests ,
1415)
1516
@@ -55,6 +56,17 @@ def forward(self, x):
5556 devices .append ("xpu" )
5657
5758
59+ device_to_base_configs = {
60+ "cuda" : [
61+ Int4WeightOnlyConfig (group_size = 128 ),
62+ # Note: the functionality unit test doesn't work for hqq
63+ Int4WeightOnlyConfig (group_size = 128 , int4_packing_format = "tile_packed_to_4d" ),
64+ ],
65+ "cpu" : [Int4WeightOnlyConfig (group_size = 128 , int4_packing_format = "opaque" )],
66+ "xpu" : [Int4WeightOnlyConfig (group_size = 128 , int4_packing_format = "plain_int32" )],
67+ }
68+
69+
5870class TestAWQ (TestCase ):
5971 def test_awq_config (self ):
6072 base_config = Int4WeightOnlyConfig ()
@@ -69,190 +81,178 @@ def test_awq_config(self):
6981 with self .assertRaisesRegex (ValueError , "is not one of" ):
7082 AWQConfig (base_config , step = "not_supported" )
7183
72- @parameterized . expand ([( device ,) for device in devices ] )
84+ @parametrize ( " device" , devices )
7385 def test_awq_functionality (self , device ):
74- dataset_size = 100
86+ dataset_size = 10
7587 l1 , l2 , l3 = 512 , 256 , 128
7688 original_dtype = torch .bfloat16 # tinygemm kernel only uses bfloat16 inputs
77- group_size = 128
78- n_calibration_examples = 10
7989 sequence_length = 5
8090
81- m = ToyLinearModel (l1 , l2 , l3 ).eval ().to (original_dtype ).to (device )
91+ assert device in device_to_base_configs , "Unsupported device: {}" .format (device )
92+ base_configs = device_to_base_configs [device ]
8293
83- # baseline quantization
84- if device == "cuda" :
85- base_config = Int4WeightOnlyConfig (group_size = group_size )
86- elif device == "xpu" :
87- base_config = Int4WeightOnlyConfig (
88- group_size = group_size , int4_packing_format = "plain_int32"
89- )
90- elif device == "cpu" :
91- base_config = Int4WeightOnlyConfig (
92- group_size = group_size , int4_packing_format = "opaque"
93- )
94- torch .manual_seed (1234 )
95- else :
96- assert False , "Unsupported device: {}" .format (device )
97- m_baseline = copy .deepcopy (m )
98- quantize_ (m_baseline , base_config )
94+ for base_config in base_configs :
95+ m = ToyLinearModel (l1 , l2 , l3 ).eval ().to (original_dtype ).to (device )
96+ m_baseline = copy .deepcopy (m )
9997
100- # awq quantization
101- dataset = m .example_inputs (
102- dataset_size ,
103- sequence_length = sequence_length ,
104- dtype = original_dtype ,
105- device = device ,
106- )
107- ref_out = torch .cat ([m (d .squeeze (0 )) for d in dataset ])
98+ dataset = m .example_inputs (
99+ dataset_size ,
100+ sequence_length = sequence_length ,
101+ dtype = original_dtype ,
102+ device = device ,
103+ )
104+ # for test, we use calibration_data = dataset so that awq is
105+ # guaranteed to be better than baseline
106+ # in reality, calibration_data will be a small subset or a different
107+ # dataset
108+ calibration_data = dataset
109+ # concatenated inputs
110+ input_cat = torch .cat (calibration_data , dim = - 2 )
111+ ref_out = m (input_cat )
108112
109- calibration_data = dataset [:n_calibration_examples ]
113+ # baseline quantization
114+ quantize_ (m_baseline , base_config )
110115
111- quant_config = AWQConfig (base_config , step = AWQStep .PREPARE )
112- quantize_ (m , quant_config )
116+ # awq quantization
117+ quant_config = AWQConfig (base_config , step = AWQStep .PREPARE )
118+ quantize_ (m , quant_config )
113119
114- for example in calibration_data :
115- m (example )
120+ for example in calibration_data :
121+ m (example )
116122
117- quant_config = AWQConfig (base_config , step = AWQStep .CONVERT )
118- quantize_ (m , quant_config )
123+ quant_config = AWQConfig (base_config , step = AWQStep .CONVERT )
124+ quantize_ (m , quant_config )
119125
120- awq_out = torch .cat ([m (d .squeeze (0 )) for d in dataset ])
121- baseline_out = torch .cat ([m_baseline (d .squeeze (0 )) for d in dataset ])
126+ # evaluating on calibration data set to remove any uncertainty
127+ awq_out = m (input_cat )
128+ baseline_out = m_baseline (input_cat )
122129
123- loss_awq = (ref_out - awq_out ).pow (2 ).mean ().item ()
124- loss_base = (ref_out - baseline_out ).pow (2 ).mean ().item ()
125- assert loss_awq < loss_base
130+ loss_awq = (ref_out - awq_out ).pow (2 ).mean ().item ()
131+ loss_base = (ref_out - baseline_out ).pow (2 ).mean ().item ()
132+ assert loss_awq <= loss_base
126133
127- @parameterized . expand ([( device ,) for device in devices ] )
134+ @parametrize ( " device" , devices )
128135 def test_awq_loading (self , device ):
129- dataset_size = 100
136+ dataset_size = 10
130137 l1 , l2 , l3 = 512 , 256 , 128
131138 original_dtype = torch .bfloat16 # tinygemm kernel only uses bfloat16 inputs
132- group_size = 128
133- n_calibration_examples = 10
134139 sequence_length = 5
135140
136- m = ToyLinearModel (l1 , l2 , l3 ).eval ().to (original_dtype ).to (device )
137- dataset = m .example_inputs (
138- dataset_size ,
139- sequence_length = sequence_length ,
140- dtype = original_dtype ,
141- device = device ,
142- )
143- calibration_data = dataset [:n_calibration_examples ]
144-
145- # calibrate
146- if device == "cuda" :
147- base_config = Int4WeightOnlyConfig (group_size = group_size )
148- elif device == "xpu" :
149- base_config = Int4WeightOnlyConfig (
150- group_size = group_size , int4_packing_format = "plain_int32"
151- )
152- elif device == "cpu" :
153- base_config = Int4WeightOnlyConfig (
154- group_size = group_size , int4_packing_format = "opaque"
141+ assert device in device_to_base_configs , "Unsupported device: {}" .format (device )
142+ base_configs = device_to_base_configs [device ]
143+
144+ for base_config in base_configs :
145+ m = ToyLinearModel (l1 , l2 , l3 ).eval ().to (original_dtype ).to (device )
146+ dataset = m .example_inputs (
147+ dataset_size ,
148+ sequence_length = sequence_length ,
149+ dtype = original_dtype ,
150+ device = device ,
155151 )
156- else :
157- assert False , "Unsupported device: {}" . format ( device )
158- quant_config = AWQConfig ( base_config , step = AWQStep . PREPARE )
159- quantize_ ( m , quant_config )
152+ # for test purpose, we don't need to get a subset
153+ calibration_data = dataset
154+ # concatenated inputs
155+ input_cat = torch . cat ( calibration_data , dim = - 2 )
160156
161- for example in calibration_data :
162- m (example )
157+ # calibrate
163158
164- # quantize
165- quant_config = AWQConfig (base_config , step = AWQStep .CONVERT )
166- quantize_ (m , quant_config )
159+ quant_config = AWQConfig (base_config , step = AWQStep .PREPARE )
160+ quantize_ (m , quant_config )
167161
168- with tempfile .NamedTemporaryFile () as f :
169- torch .save (m .state_dict (), f )
170- f .seek (0 )
171- state_dict = torch .load (f )
162+ for example in calibration_data :
163+ m (example )
172164
173- loaded_model = ToyLinearModel (l1 , l2 , l3 ).eval ().to (original_dtype ).to (device )
174- loaded_model .load_state_dict (state_dict , assign = True )
165+ # quantize
166+ quant_config = AWQConfig (base_config , step = AWQStep .CONVERT )
167+ quantize_ (m , quant_config )
175168
176- m = torch .compile (m , fullgraph = True )
177- loaded_model = torch .compile (loaded_model , fullgraph = True )
169+ with tempfile .NamedTemporaryFile () as f :
170+ torch .save (m .state_dict (), f )
171+ f .seek (0 )
172+ state_dict = torch .load (f )
178173
179- awq_out = torch .cat ([m (d .squeeze (0 )) for d in dataset ])
180- awq_save_load_out = torch .cat ([loaded_model (d .squeeze (0 )) for d in dataset ])
174+ loaded_model = (
175+ ToyLinearModel (l1 , l2 , l3 ).eval ().to (original_dtype ).to (device )
176+ )
177+ loaded_model .load_state_dict (state_dict , assign = True )
181178
182- assert awq_out is not None
183- assert awq_save_load_out is not None
184- assert torch .allclose (awq_out , awq_save_load_out , atol = 1e-2 )
179+ m = torch .compile (m , fullgraph = True )
180+ loaded_model = torch .compile (loaded_model , fullgraph = True )
185181
186- @parameterized .expand ([(device ,) for device in devices ])
182+ awq_out = m (input_cat )
183+ awq_save_load_out = loaded_model (input_cat )
184+
185+ assert awq_out is not None
186+ assert awq_save_load_out is not None
187+ assert torch .allclose (awq_out , awq_save_load_out , atol = 1e-2 )
188+
189+ @parametrize ("device" , devices )
187190 def test_awq_loading_vllm (self , device ):
188191 """Simulate weight loading in vllm:
189192 * prepare model weight to the same format (awq weight)
190193 * use weight.copy_(state_dict["weight"]) to copy over the quantized weights from checkpoint
191194
192195 There is also a slicing op that is omitted here, overall e2e is tested in tests in vllm repo
193196 """
194- dataset_size = 100
197+ dataset_size = 10
195198 l1 , l2 , l3 = 512 , 256 , 128
196199 original_dtype = torch .bfloat16 # tinygemm kernel only uses bfloat16 inputs
197- group_size = 128
198- n_calibration_examples = 10
199200 sequence_length = 5
200201
201- m = ToyLinearModel (l1 , l2 , l3 ).eval ().to (original_dtype ).to (device )
202- dataset = m .example_inputs (
203- dataset_size ,
204- sequence_length = sequence_length ,
205- dtype = original_dtype ,
206- device = device ,
207- )
208- calibration_data = dataset [:n_calibration_examples ]
209-
210- # calibrate
211- if device == "cuda" :
212- base_config = Int4WeightOnlyConfig (group_size = group_size )
213- elif device == "xpu" :
214- base_config = Int4WeightOnlyConfig (
215- group_size = group_size , int4_packing_format = "plain_int32"
216- )
217- elif device == "cpu" :
218- base_config = Int4WeightOnlyConfig (
219- group_size = group_size , int4_packing_format = "opaque"
202+ assert device in device_to_base_configs , "Unsupported device: {}" .format (device )
203+ base_configs = device_to_base_configs [device ]
204+
205+ for base_config in base_configs :
206+ m = ToyLinearModel (l1 , l2 , l3 ).eval ().to (original_dtype ).to (device )
207+ dataset = m .example_inputs (
208+ dataset_size ,
209+ sequence_length = sequence_length ,
210+ dtype = original_dtype ,
211+ device = device ,
220212 )
221- else :
222- assert False , "Unsupported device: {}" . format ( device )
223- quant_config = AWQConfig ( base_config , step = AWQStep . PREPARE )
224- quantize_ ( m , quant_config )
213+ # for test purpose, we don't need to get a subset
214+ calibration_data = dataset
215+ # concatenated inputs
216+ input_cat = torch . cat ( calibration_data , dim = - 2 )
225217
226- for example in calibration_data :
227- m (example )
218+ # calibrate
219+ quant_config = AWQConfig (base_config , step = AWQStep .PREPARE )
220+ quantize_ (m , quant_config )
228221
229- # quantize
230- quant_config = AWQConfig (base_config , step = AWQStep .CONVERT )
231- quantize_ (m , quant_config )
222+ for example in calibration_data :
223+ m (example )
232224
233- with tempfile .NamedTemporaryFile () as f :
234- torch .save (m .state_dict (), f )
235- f .seek (0 )
236- state_dict = torch .load (f )
225+ # quantize
226+ quant_config = AWQConfig (base_config , step = AWQStep .CONVERT )
227+ quantize_ (m , quant_config )
228+
229+ with tempfile .NamedTemporaryFile () as f :
230+ torch .save (m .state_dict (), f )
231+ f .seek (0 )
232+ state_dict = torch .load (f )
233+
234+ loaded_model = (
235+ ToyLinearModel (l1 , l2 , l3 ).eval ().to (original_dtype ).to (device )
236+ )
237+ quant_config = AWQConfig (base_config , step = AWQStep .PREPARE_FOR_LOADING )
238+ quantize_ (loaded_model , quant_config )
237239
238- loaded_model = ToyLinearModel ( l1 , l2 , l3 ). eval (). to ( original_dtype ). to ( device )
239- quant_config = AWQConfig ( base_config , step = AWQStep . PREPARE_FOR_LOADING )
240- quantize_ ( loaded_model , quant_config )
240+ loaded_model . linear1 . weight . copy_ ( state_dict [ "linear1.weight" ] )
241+ loaded_model . linear2 . weight . copy_ ( state_dict [ "linear2.weight" ] )
242+ loaded_model . linear3 . weight . copy_ ( state_dict [ "linear3.weight" ] )
241243
242- loaded_model .linear1 .weight .copy_ (state_dict ["linear1.weight" ])
243- loaded_model .linear2 .weight .copy_ (state_dict ["linear2.weight" ])
244- loaded_model .linear3 .weight .copy_ (state_dict ["linear3.weight" ])
244+ m = torch .compile (m , fullgraph = True )
245+ loaded_model = torch .compile (loaded_model , fullgraph = True )
245246
246- m = torch . compile ( m , fullgraph = True )
247- loaded_model = torch . compile ( loaded_model , fullgraph = True )
247+ awq_out = m ( input_cat )
248+ awq_save_load_out = loaded_model ( input_cat )
248249
249- awq_out = torch .cat ([m (d .squeeze (0 )) for d in dataset ])
250- awq_save_load_out = torch .cat ([loaded_model (d .squeeze (0 )) for d in dataset ])
250+ assert awq_out is not None
251+ assert awq_save_load_out is not None
252+ assert torch .allclose (awq_out , awq_save_load_out , atol = 1e-2 )
251253
252- assert awq_out is not None
253- assert awq_save_load_out is not None
254- assert torch .allclose (awq_out , awq_save_load_out , atol = 1e-2 )
255254
255+ instantiate_parametrized_tests (TestAWQ )
256256
257257if __name__ == "__main__" :
258258 run_tests ()
0 commit comments