Refactor int4 weight only quantization with call to quantize
Summary:
This is similar to pytorch#294, but applied to int4 weight-only quantization.
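
For context, a minimal sketch of the two paths this change compares (the `quantize`-based implementation itself lives in torchao.quantization.quant_api and is not shown in this diff; the setup mirrors the benchmark and test code below):

import copy
import torch
from torchao.quantization.quant_api import change_linear_weights_to_int4_woqtensors

# bf16 linear layer on CUDA, matching the toy-model setup used below
model = torch.nn.Linear(1024, 1024, bias=False).to(torch.bfloat16).to("cuda")
model_ref = copy.deepcopy(model)

# New path: the public int4 weight-only API, now implemented internally as a
# call to `quantize` (mirroring what pytorch#294 did for int8 weight-only quant)
change_linear_weights_to_int4_woqtensors(model, groupsize=32)

# Reference path: the deprecated subclass-inserter implementation, preserved
# below as _ref_change_linear_weights_to_int4_woqtensors(model_ref, groupsize=32);
# the tests assert the two paths produce identical outputs.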

Test Plan:

unit perf test:
python test/quantization/test_quant_api.py -k test_quantized_tensor_subclass_int4_wo_quant_perf
elapsed time: 0.2166275215148926, ref elapsed time: 0.2191881561279297
elapsed time: 0.2376406478881836, ref elapsed time: 0.22721023559570314
elapsed time: 0.21919679641723633, ref elapsed time: 0.2154969596862793

integration perf test:

reference: elapsed_time:  2.5900126953125  milliseconds
after refactor: elapsed_time:  2.56680078125  milliseconds
diff: no diff

TORCH_LOGS='output_code' python tutorials/quantize_vit/run_vit_b_quant.py

Before:
After:
generated code diff:

Reviewers:

Subscribers:

Tasks:

Tags:
jerryzh168 committed Jun 4, 2024
1 parent 55a4676 commit 0069e53
Showing 7 changed files with 363 additions and 208 deletions.
118 changes: 118 additions & 0 deletions benchmarks/benchmark_aq.py
@@ -0,0 +1,118 @@
"""Benchmarks for affine quantized tensor, this includes int8 dynamic quant, int8 weight only quant and int4 weight only quant APIs
"""
import torch
from torchao.quantization.subclass import (
Int8WeightOnlyQuantizedLinearWeight,
Int4WeightOnlyQuantizedLinearWeight,
)
from torchao.quantization.utils import (
TORCH_VERSION_AFTER_2_4,
)
from torchao.quantization.quant_api import (
_replace_with_custom_fn_if_matches_filter,
)
import copy

class ToyLinearModel(torch.nn.Module):
def __init__(self, m=64, n=32, k=64):
super().__init__()
self.linear1 = torch.nn.Linear(m, n, bias=False).to(torch.float)
self.linear2 = torch.nn.Linear(n, k, bias=False).to(torch.float)

def example_inputs(self, batch_size=1, dtype=torch.float, device="cpu"):
return (torch.randn(batch_size, self.linear1.in_features, dtype=dtype, device=device),)

def forward(self, x):
x = self.linear1(x)
x = self.linear2(x)
return x

def _ref_change_linear_weights_to_int8_dqtensors(model, filter_fn=None, **kwargs):
"""
The deprecated implementation for int8 dynamic quant API, used as a reference for
numerics and performance
"""
from torchao.quantization.quant_api import _in_features_greater_than_16
from torchao.quantization.quant_api import _is_linear
from torchao.quantization.quant_api import _get_subclass_inserter
from torchao.quantization.subclass import Int8DynamicallyQuantizedLinearWeight

if filter_fn is None:
filter_fn = lambda *args: _is_linear(*args) and _in_features_greater_than_16(
*args
)

_replace_with_custom_fn_if_matches_filter(
model, _get_subclass_inserter(Int8DynamicallyQuantizedLinearWeight, enable_parametrization=False, **kwargs), filter_fn
)

def _get_ref_change_linear_weights_to_woqtensors(deprecated_tenosr_subclass):
def _ref_change_linear_weights_to_woqtensors(model, filter_fn=None, **kwargs):
"""
The deprecated implementation for weight only quant API, used as a reference for
numerics and performance
"""
from torchao.quantization.quant_api import _is_linear
from torchao.quantization.quant_api import _get_subclass_inserter

filter_fn = kwargs.pop("filter_fn", _is_linear)

_replace_with_custom_fn_if_matches_filter(
model,
_get_subclass_inserter(deprecated_tenosr_subclass, enable_parametrization=True, **kwargs),
filter_fn,
)

return _ref_change_linear_weights_to_woqtensors

_ref_change_linear_weights_to_int8_woqtensors = _get_ref_change_linear_weights_to_woqtensors(Int8WeightOnlyQuantizedLinearWeight)
_ref_change_linear_weights_to_int4_woqtensors = _get_ref_change_linear_weights_to_woqtensors(Int4WeightOnlyQuantizedLinearWeight)


def _bench_quantized_tensor_subclass_perf(api, ref_api, kwargs=None):
if kwargs is None:
kwargs = {}

m = ToyLinearModel(1024, 1024, 1024).eval().to(torch.bfloat16).to("cuda")
m_ref = copy.deepcopy(m)
# setting batch_size to 20 to be compatible with the kernel
example_inputs = m.example_inputs(batch_size=20, dtype=torch.bfloat16, device="cuda")

api(m, **kwargs)

# reference
ref_api(m_ref, **kwargs)

res = m(*example_inputs)
ref = m_ref(*example_inputs)

assert torch.equal(res, ref)

# perf comparison
from torchao.utils import benchmark_model
# warmup
WARMUP = 5
RUNS = 100
input_tensor = example_inputs[0]
m = torch.compile(m, mode='max-autotune', fullgraph=True)

benchmark_model(m, WARMUP, input_tensor)
elapsed_time = benchmark_model(m, RUNS, input_tensor)

m_ref = torch.compile(m_ref, mode='max-autotune', fullgraph=True)
benchmark_model(m_ref, WARMUP, input_tensor)
ref_elapsed_time = benchmark_model(m_ref, RUNS, input_tensor)

print(f"elapsed time: {elapsed_time}, ref elapsed time: {ref_elapsed_time}")
assert elapsed_time < 1.05 * ref_elapsed_time

if __name__ == "__main__" and TORCH_VERSION_AFTER_2_4 and torch.cuda.is_available():
from torchao.quantization.quant_api import change_linear_weights_to_int8_dqtensors
_bench_quantized_tensor_subclass_perf(change_linear_weights_to_int8_dqtensors, _ref_change_linear_weights_to_int8_dqtensors)

from torchao.quantization.quant_api import change_linear_weights_to_int8_woqtensors
_bench_quantized_tensor_subclass_perf(change_linear_weights_to_int8_woqtensors, _ref_change_linear_weights_to_int8_woqtensors)

kwargs = {"groupsize": 32}
from torchao.quantization.quant_api import change_linear_weights_to_int4_woqtensors
_bench_quantized_tensor_subclass_perf(change_linear_weights_to_int4_woqtensors, _ref_change_linear_weights_to_int4_woqtensors, kwargs)
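
The benchmark doubles as a local sanity check: assuming a CUDA device and PyTorch 2.4+ (the conditions in the __main__ guard above), it can be run directly with

python benchmarks/benchmark_aq.py

It prints elapsed times in the same format as the test plan above and asserts that each refactored API stays within 5% of its deprecated reference implementation.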
116 changes: 22 additions & 94 deletions test/quantization/test_quant_api.py
@@ -29,6 +29,8 @@
 from torchao.quantization.subclass import (
     to_laq,
     LinearActQuantizedTensor,
+    Int8WeightOnlyQuantizedLinearWeight,
+    Int4WeightOnlyQuantizedLinearWeight,
 )
 from torchao.quantization.quant_api import (
     _replace_with_custom_fn_if_matches_filter,
@@ -138,39 +140,27 @@ def _ref_change_linear_weights_to_int8_dqtensors(model, filter_fn=None, **kwargs):
         model, _get_subclass_inserter(Int8DynamicallyQuantizedLinearWeight, enable_parametrization=False, **kwargs), filter_fn
     )
 
-def _ref_change_linear_weights_to_int8_woqtensors(model, filter_fn=None, **kwargs):
-    """
-    The deprecated implementation for int8 weight only quant API, used as a reference for
-    numerics and performance
-    """
-    from torchao.quantization.quant_api import _is_linear
-    from torchao.quantization.quant_api import _get_subclass_inserter
-    from torchao.quantization.subclass import Int8WeightOnlyQuantizedLinearWeight
-
-    filter_fn = kwargs.pop("filter_fn", _is_linear)
+def _get_ref_change_linear_weights_to_woqtensors(deprecated_tensor_subclass):
+    def _ref_change_linear_weights_to_woqtensors(model, filter_fn=None, **kwargs):
+        """
+        The deprecated implementation for weight only quant API, used as a reference for
+        numerics and performance
+        """
+        from torchao.quantization.quant_api import _is_linear
+        from torchao.quantization.quant_api import _get_subclass_inserter
 
-    _replace_with_custom_fn_if_matches_filter(
-        model,
-        _get_subclass_inserter(Int8WeightOnlyQuantizedLinearWeight, enable_parametrization=True, **kwargs),
-        filter_fn,
-    )
+        filter_fn = kwargs.pop("filter_fn", _is_linear)
 
-def _ref_change_linear_weights_to_int4_woqtensors(model, **kwargs):
-    """
-    The deprecated implementation for int4 weight only quant API, used as a reference for
-    numerics and performance
-    """
-    from torchao.quantization.quant_api import _is_linear
-    from torchao.quantization.quant_api import _get_subclass_inserter
-    from torchao.quantization.subclass import Int4WeightOnlyQuantizedLinearWeight
+        _replace_with_custom_fn_if_matches_filter(
+            model,
+            _get_subclass_inserter(deprecated_tensor_subclass, enable_parametrization=True, **kwargs),
+            filter_fn,
+        )
 
-    filter_fn = kwargs.pop("filter_fn", _is_linear)
+    return _ref_change_linear_weights_to_woqtensors
 
-    _replace_with_custom_fn_if_matches_filter(
-        model,
-        _get_subclass_inserter(Int4WeightOnlyQuantizedLinearWeight, enable_parametrization=False, **kwargs),
-        filter_fn,
-    )
+_ref_change_linear_weights_to_int8_woqtensors = _get_ref_change_linear_weights_to_woqtensors(Int8WeightOnlyQuantizedLinearWeight)
+_ref_change_linear_weights_to_int4_woqtensors = _get_ref_change_linear_weights_to_woqtensors(Int4WeightOnlyQuantizedLinearWeight)
 
 class TestQuantFlow(unittest.TestCase):
     def test_dynamic_quant_gpu_singleline(self):
@@ -512,8 +502,7 @@ def test_quantized_tensor_subclass_int4(self):
         assert isinstance(m.linear2.weight, AffineQuantizedTensor)
 
         # reference
-        from torchao.quantization.quant_api import change_linear_weights_to_int4_woqtensors
-        change_linear_weights_to_int4_woqtensors(m_copy, groupsize=groupsize)
+        _ref_change_linear_weights_to_int4_woqtensors(m_copy, groupsize=groupsize)
 
         res = m(*example_inputs)
         ref = m_copy(*example_inputs)
@@ -534,9 +523,9 @@ def test_quantized_tensor_subclass_int8_wo(self):
         assert isinstance(m.linear2.weight, AffineQuantizedTensor)
 
         # reference
-        from torchao.quantization.quant_api import change_linear_weights_to_int8_woqtensors
         _ref_change_linear_weights_to_int8_woqtensors(m_copy)
 
         res = m(*example_inputs)
         ref = m_copy(*example_inputs)
@@ -559,8 +548,7 @@ def test_quantized_tensor_subclass_int8_dyn_quant(self):
         assert isinstance(m.linear2.weight.original_weight_tensor, AffineQuantizedTensor)
 
         # reference
-        from torchao.quantization.quant_api import change_linear_weights_to_int8_dqtensors
-        change_linear_weights_to_int8_dqtensors(m_copy)
+        _ref_change_linear_weights_to_int8_dqtensors(m_copy)
 
         res = m(*example_inputs)
         ref = m_copy(*example_inputs)
@@ -579,65 +567,5 @@ def test_quantized_tensor_subclass_int8_dyn_quant(self):
         # make sure it compiles
         torch._export.aot_compile(m_unwrapped, example_inputs)
 
-
-    def _test_quantized_tensor_subclass_perf(self, api, ref_api, kwargs=None):
-        if kwargs is None:
-            kwargs = {}
-
-        m = ToyLinearModel(1024, 1024, 1024).eval().to(torch.bfloat16).to("cuda")
-        m_ref = copy.deepcopy(m)
-        # setting batch_size to 20 to be compatible with the kernel
-        example_inputs = m.example_inputs(batch_size=20, dtype=torch.bfloat16, device="cuda")
-
-        api(m, **kwargs)
-
-        # reference
-        ref_api(m_ref, **kwargs)
-
-        res = m(*example_inputs)
-        ref = m_ref(*example_inputs)
-
-        self.assertTrue(torch.equal(res, ref))
-
-        # perf comparison
-        from torchao.utils import benchmark_model
-        # warmup
-        WARMUP = 5
-        RUNS = 100
-        input_tensor = example_inputs[0]
-        m = torch.compile(m, mode='max-autotune', fullgraph=True)
-
-        benchmark_model(m, WARMUP, input_tensor)
-        elapsed_time = benchmark_model(m, RUNS, input_tensor)
-
-        m_ref = torch.compile(m_ref, mode='max-autotune', fullgraph=True)
-        benchmark_model(m_ref, WARMUP, input_tensor)
-        ref_elapsed_time = benchmark_model(m_ref, RUNS, input_tensor)
-
-        print(f"elapsed time: {elapsed_time}, ref elapsed time: {ref_elapsed_time}")
-        self.assertTrue(elapsed_time < 1.05 * ref_elapsed_time)
-
-    @unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "Test only enabled for 2.4+")
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @unittest.skip("This perf test is supposed to be run locally for sanity check performance when there is a change of int8 dynamic quant implementation")
-    def test_quantized_tensor_subclass_int8_dyn_quant_perf(self):
-        from torchao.quantization.quant_api import change_linear_weights_to_int8_dqtensors
-        self._test_quantized_tensor_subclass_perf(change_linear_weights_to_int8_dqtensors, _ref_change_linear_weights_to_int8_dqtensors)
-
-    @unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "Test only enabled for 2.4+")
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @unittest.skip("This perf test is supposed to be run locally for sanity check performance when there is a change of int8 weight only quant implementation")
-    def test_quantized_tensor_subclass_int8_wo_quant_perf(self):
-        from torchao.quantization.quant_api import change_linear_weights_to_int8_woqtensors
-        self._test_quantized_tensor_subclass_perf(change_linear_weights_to_int8_woqtensors, _ref_change_linear_weights_to_int8_woqtensors)
-
-    @unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "Test only enabled for 2.4+")
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @unittest.skip("This perf test is supposed to be run locally for sanity check performance when there is a change of int4 weight only quant implementation")
-    def test_quantized_tensor_subclass_int4_wo_quant_perf(self):
-        kwargs = {"groupsize": 32}
-        from torchao.quantization.quant_api import change_linear_weights_to_int4_woqtensors
-        self._test_quantized_tensor_subclass_perf(change_linear_weights_to_int4_woqtensors, _ref_change_linear_weights_to_int4_woqtensors, kwargs)
-
 if __name__ == "__main__":
     unittest.main()