Refactor int4 weight only quantization with call to quantize

Summary:
This is similar to #294, but applied to int4 weight-only quantization.

Test Plan:
unit perf test:

python test/quantization/test_quant_api.py -k test_quantized_tensor_subclass_int4_wo_quant_perf
elapsed time: 0.2166275215148926, ref elapsed time: 0.2191881561279297
elapsed time: 0.2376406478881836, ref elapsed time: 0.22721023559570314
elapsed time: 0.21919679641723633, ref elapsed time: 0.2154969596862793

integration perf test:
reference: elapsed_time: 2.5900126953125 milliseconds
after refactor: elapsed_time: 2.56680078125 milliseconds
diff: no diff

TORCH_LOGS='output_code' python tutorials/quantize_vit/run_vit_b_quant.py
Before:
After:
generated code diff:

Reviewers:
Subscribers:
Tasks:
Tags:
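For context: before this change, int4 weight-only quantization went through the subclass-swapping helper change_linear_weights_to_int4_woqtensors; after it, the path routes through the unified quantize entry point introduced for int8 in #294. A minimal sketch of the call pattern follows; apply_int4wo is a placeholder for whatever apply-function this commit registers, so treat that name and its groupsize argument as assumptions, not the confirmed API at this commit.

import torch
from torchao.quantization.quant_api import quantize  # unified entry point from #294

model = torch.nn.Sequential(torch.nn.Linear(1024, 1024, bias=False)).to(torch.bfloat16).to("cuda")

# Deprecated path, kept in the benchmark below as the numerics/perf reference:
#   change_linear_weights_to_int4_woqtensors(model, groupsize=32)
# Refactored path: a single quantize() call. `apply_int4wo` is a hypothetical
# stand-in for the int4 weight-only apply-function this commit wires up.
# quantize(model, apply_int4wo(groupsize=32))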
1 parent 55a4676 · commit b250618
Showing 7 changed files with 353 additions and 205 deletions.
"""Benchmarks for affine quantized tensor, this includes int8 dynamic quant, int8 weight only quant and int4 weight only quant APIs | ||
""" | ||
import torch | ||
from torchao.quantization.subclass import ( | ||
Int8WeightOnlyQuantizedLinearWeight, | ||
Int4WeightOnlyQuantizedLinearWeight, | ||
) | ||
from torchao.quantization.utils import ( | ||
TORCH_VERSION_AFTER_2_4, | ||
) | ||
from torchao.quantization.quant_api import ( | ||
_replace_with_custom_fn_if_matches_filter, | ||
) | ||
import copy | ||

class ToyLinearModel(torch.nn.Module):
    def __init__(self, m=64, n=32, k=64):
        super().__init__()
        self.linear1 = torch.nn.Linear(m, n, bias=False).to(torch.float)
        self.linear2 = torch.nn.Linear(n, k, bias=False).to(torch.float)

    def example_inputs(self, batch_size=1, dtype=torch.float, device="cpu"):
        return (torch.randn(batch_size, self.linear1.in_features, dtype=dtype, device=device),)

    def forward(self, x):
        x = self.linear1(x)
        x = self.linear2(x)
        return x

def _ref_change_linear_weights_to_int8_dqtensors(model, filter_fn=None, **kwargs):
    """
    The deprecated implementation of the int8 dynamic quant API, used as a
    reference for numerics and performance.
    """
    from torchao.quantization.quant_api import (
        _in_features_greater_than_16,
        _is_linear,
        _get_subclass_inserter,
    )
    from torchao.quantization.subclass import Int8DynamicallyQuantizedLinearWeight

    if filter_fn is None:
        filter_fn = lambda *args: _is_linear(*args) and _in_features_greater_than_16(*args)

    _replace_with_custom_fn_if_matches_filter(
        model,
        _get_subclass_inserter(Int8DynamicallyQuantizedLinearWeight, enable_parametrization=False, **kwargs),
        filter_fn,
    )

def _get_ref_change_linear_weights_to_woqtensors(deprecated_tensor_subclass):
    def _ref_change_linear_weights_to_woqtensors(model, filter_fn=None, **kwargs):
        """
        The deprecated implementation of the weight-only quant APIs, used as a
        reference for numerics and performance.
        """
        from torchao.quantization.quant_api import _is_linear, _get_subclass_inserter

        if filter_fn is None:
            filter_fn = _is_linear

        _replace_with_custom_fn_if_matches_filter(
            model,
            _get_subclass_inserter(deprecated_tensor_subclass, enable_parametrization=True, **kwargs),
            filter_fn,
        )

    return _ref_change_linear_weights_to_woqtensors

_ref_change_linear_weights_to_int8_woqtensors = _get_ref_change_linear_weights_to_woqtensors(Int8WeightOnlyQuantizedLinearWeight)
_ref_change_linear_weights_to_int4_woqtensors = _get_ref_change_linear_weights_to_woqtensors(Int4WeightOnlyQuantizedLinearWeight)

def _bench_quantized_tensor_subclass_perf(api, ref_api, kwargs=None):
    if kwargs is None:
        kwargs = {}

    m = ToyLinearModel(1024, 1024, 1024).eval().to(torch.bfloat16).to("cuda")
    m_ref = copy.deepcopy(m)
    # setting batch_size to 20 to be compatible with the kernel
    example_inputs = m.example_inputs(batch_size=20, dtype=torch.bfloat16, device="cuda")

    # quantize one copy with the new API and the other with the deprecated reference API
    api(m, **kwargs)
    ref_api(m_ref, **kwargs)

    # numerics: the refactored path must match the reference exactly
    res = m(*example_inputs)
    ref = m_ref(*example_inputs)
    assert torch.equal(res, ref)

    # perf comparison
    from torchao.utils import benchmark_model

    WARMUP = 5
    RUNS = 100
    input_tensor = example_inputs[0]

    m = torch.compile(m, mode='max-autotune', fullgraph=True)
    benchmark_model(m, WARMUP, input_tensor)  # warmup
    elapsed_time = benchmark_model(m, RUNS, input_tensor)

    m_ref = torch.compile(m_ref, mode='max-autotune', fullgraph=True)
    benchmark_model(m_ref, WARMUP, input_tensor)  # warmup
    ref_elapsed_time = benchmark_model(m_ref, RUNS, input_tensor)

    print(f"elapsed time: {elapsed_time}, ref elapsed time: {ref_elapsed_time}")
    assert elapsed_time < 1.05 * ref_elapsed_time

if __name__ == "__main__" and TORCH_VERSION_AFTER_2_4 and torch.cuda.is_available():
    from torchao.quantization.quant_api import change_linear_weights_to_int8_dqtensors
    _bench_quantized_tensor_subclass_perf(change_linear_weights_to_int8_dqtensors, _ref_change_linear_weights_to_int8_dqtensors)

    from torchao.quantization.quant_api import change_linear_weights_to_int8_woqtensors
    _bench_quantized_tensor_subclass_perf(change_linear_weights_to_int8_woqtensors, _ref_change_linear_weights_to_int8_woqtensors)

    kwargs = {"groupsize": 32}
    from torchao.quantization.quant_api import change_linear_weights_to_int4_woqtensors
    _bench_quantized_tensor_subclass_perf(change_linear_weights_to_int4_woqtensors, _ref_change_linear_weights_to_int4_woqtensors, kwargs)
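As a usage note, the same harness can compare any new/reference API pair or sweep other configurations; the group size below is an illustrative value, not one exercised by the test plan above.

# Example (hypothetical sweep): rerun the int4 comparison at a different group size.
# _bench_quantized_tensor_subclass_perf(
#     change_linear_weights_to_int4_woqtensors,
#     _ref_change_linear_weights_to_int4_woqtensors,
#     {"groupsize": 128},
# )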