Refactor int8 weight only quant to use quantize
Summary:
Similar to pytorch#294, we replaced the implementation of int8 weight-only quant
with the newly added `quantize` function, as part of the unification effort
for affine quantization.
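
For context, a minimal sketch of the new code path this commit switches to (function names are taken from the quant_api.py hunk below; the import locations are an assumption, so treat this as an illustration rather than the exact API):

import torch
from torch import nn
# assumed import locations; in this commit these calls live inside
# change_linear_weights_to_int8_woqtensors in torchao/quantization/quant_api.py
from torchao.quantization.quant_api import quantize, get_apply_int8wo_quant, unwrap_tensor_subclass

model = nn.Sequential(nn.Linear(1024, 1024)).eval().to(torch.bfloat16)

# PyTorch 2.4+ path: swap each linear weight for the unified quantized tensor subclass,
# then unwrap so the module can still be compiled / exported
quantize(model, get_apply_int8wo_quant())
unwrap_tensor_subclass(model)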

Test Plan:
1. unit perf test:
python test/quantization/test_quant_api.py -k test_quantized_tensor_subclass_int8_wo_quant_perf

elapsed time: 0.23909856796264647, ref elapsed time: 0.25150911331176756
elapsed time: 0.24894208908081056, ref elapsed time: 0.2570047950744629
elapsed time: 0.21607391357421876, ref elapsed time: 0.22809568405151368

2. integration test:

TORCH_LOGS='output_code' python tutorials/quantize_vit/run_vit_b_quant.py

Reference: elapsed_time:  1.355208740234375  milliseconds
After refactor: elapsed_time:  1.32778857421875  milliseconds

code diff (gist): https://gist.github.com/jerryzh168/921a722cf20d476c8fc5888482e722dc
code diff (meta-only paste): https://www.internalfb.com/phabricator/paste/view/P1387333845

Reviewers:

Subscribers:

Tasks:

Tags:
jerryzh168 committed May 31, 2024
1 parent cd1ebc8 commit 09a6c2d
Showing 4 changed files with 93 additions and 24 deletions.
74 changes: 64 additions & 10 deletions test/quantization/test_quant_api.py
@@ -138,6 +138,40 @@ def _ref_change_linear_weights_to_int8_dqtensors(model, filter_fn=None, **kwargs
        model, _get_subclass_inserter(Int8DynamicallyQuantizedLinearWeight, enable_parametrization=False, **kwargs), filter_fn
    )

def _ref_change_linear_weights_to_int8_woqtensors(model, filter_fn=None, **kwargs):
    """
    The deprecated implementation for int8 weight only quant API, used as a reference for
    numerics and performance
    """
    from torchao.quantization.quant_api import _is_linear
    from torchao.quantization.quant_api import _get_subclass_inserter
    from torchao.quantization.subclass import Int8WeightOnlyQuantizedLinearWeight

    filter_fn = kwargs.pop("filter_fn", _is_linear)

    _replace_with_custom_fn_if_matches_filter(
        model,
        _get_subclass_inserter(Int8WeightOnlyQuantizedLinearWeight, enable_parametrization=True, **kwargs),
        filter_fn,
    )

def _ref_change_linear_weights_to_int4_woqtensors(model, **kwargs):
    """
    The deprecated implementation for int4 weight only quant API, used as a reference for
    numerics and performance
    """
    from torchao.quantization.quant_api import _is_linear
    from torchao.quantization.quant_api import _get_subclass_inserter
    from torchao.quantization.subclass import Int4WeightOnlyQuantizedLinearWeight

    filter_fn = kwargs.pop("filter_fn", _is_linear)

    _replace_with_custom_fn_if_matches_filter(
        model,
        _get_subclass_inserter(Int4WeightOnlyQuantizedLinearWeight, enable_parametrization=False, **kwargs),
        filter_fn,
    )

class TestQuantFlow(unittest.TestCase):
    def test_dynamic_quant_gpu_singleline(self):
        m = ToyLinearModel().eval()
@@ -489,7 +523,7 @@ def test_quantized_tensor_subclass_int4(self):

    @unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "Test only enabled for 2.4+")
    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
    def test_quantized_tensor_subclass_int8(self):
    def test_quantized_tensor_subclass_int8_wo(self):
        m = ToyLinearModel().eval().to(torch.bfloat16)
        m_copy = copy.deepcopy(m)
        example_inputs = tuple(map(lambda x: x.to(torch.bfloat16), m.example_inputs()))
@@ -501,12 +535,12 @@ def test_quantized_tensor_subclass_int8(self):

        # reference
        from torchao.quantization.quant_api import change_linear_weights_to_int8_woqtensors
        change_linear_weights_to_int8_woqtensors(m_copy)
        _ref_change_linear_weights_to_int8_woqtensors(m_copy)

        res = m(*example_inputs)
        ref = m_copy(*example_inputs)

        torch.testing.assert_close(res, ref, rtol=0.00001, atol=1e-2)
        self.assertTrue(torch.equal(res, ref))


    @unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "Test only enabled for 2.4+")
@@ -545,20 +579,20 @@ def test_quantized_tensor_subclass_int8_dyn_quant(self):
        # make sure it compiles
        torch._export.aot_compile(m_unwrapped, example_inputs)

    @unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "Test only enabled for 2.4+")
    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
    @unittest.skip("This perf test is supposed to be run locally for sanity check performance when there is a change of int8 dynamic quant implementation")
    def test_quantized_tensor_subclass_int8_dyn_quant_perf(self):

    def _test_quantized_tensor_subclass_perf(self, api, ref_api, kwargs=None):
        if kwargs is None:
            kwargs = {}

        m = ToyLinearModel(1024, 1024, 1024).eval().to(torch.bfloat16).to("cuda")
        m_ref = copy.deepcopy(m)
        # setting batch_size to 20 to be compatible with the kernel
        example_inputs = m.example_inputs(batch_size=20, dtype=torch.bfloat16, device="cuda")

        from torchao.quantization.quant_api import change_linear_weights_to_int8_dqtensors
        change_linear_weights_to_int8_dqtensors(m)
        api(m, **kwargs)

        # reference
        _ref_change_linear_weights_to_int8_dqtensors(m_ref)
        ref_api(m_ref, **kwargs)

        res = m(*example_inputs)
        ref = m_ref(*example_inputs)
@@ -583,7 +617,27 @@ def test_quantized_tensor_subclass_int8_dyn_quant_perf(self):
print(f"elapsed time: {elapsed_time}, ref elapsed time: {ref_elapsed_time}")
self.assertTrue(elapsed_time < 1.05 * ref_elapsed_time)

@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "Test only enabled for 2.4+")
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@unittest.skip("This perf test is supposed to be run locally for sanity check performance when there is a change of int8 dynamic quant implementation")
def test_quantized_tensor_subclass_int8_dyn_quant_perf(self):
from torchao.quantization.quant_api import change_linear_weights_to_int8_dqtensors
self._test_quantized_tensor_subclass_perf(change_linear_weights_to_int8_dqtensors, _ref_change_linear_weights_to_int8_dqtensors)

@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "Test only enabled for 2.4+")
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
# @unittest.skip("This perf test is supposed to be run locally for sanity check performance when there is a change of int8 weight only quant implementation")
def test_quantized_tensor_subclass_int8_wo_quant_perf(self):
from torchao.quantization.quant_api import change_linear_weights_to_int8_woqtensors
self._test_quantized_tensor_subclass_perf(change_linear_weights_to_int8_woqtensors, _ref_change_linear_weights_to_int8_woqtensors)

@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "Test only enabled for 2.4+")
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@unittest.skip("This perf test is supposed to be run locally for sanity check performance when there is a change of int4 weight only quant implementation")
def test_quantized_tensor_subclass_int4_wo_quant_perf(self):
kwargs = {"groupsize": 32}
from torchao.quantization.quant_api import change_linear_weights_to_int4_woqtensors
self._test_quantized_tensor_subclass_perf(change_linear_weights_to_int4_woqtensors, _ref_change_linear_weights_to_int4_woqtensors, kwargs)

if __name__ == "__main__":
    unittest.main()
9 changes: 5 additions & 4 deletions torchao/dtypes/aqt.py
@@ -574,29 +574,30 @@ def _quantized_linear_op(input_tensor, weight_qtensor, bias):
        scale_and_zero = weight_qtensor.layout_tensor.scale_and_zero
        return torch.ops.aten._weight_int4pack_mm(input_tensor.contiguous(), packed_weight, groupsize, scale_and_zero)
    elif (
        is_cpu and
        weight_is_int8 and
        len(weight_qtensor.shape) == 2 and
        len(weight_qtensor.block_size) == 2 and
        weight_qtensor.block_size[0] == 1 and
        weight_qtensor.block_size[1] == weight_qtensor.shape[1] and
        weight_qtensor.zero_point_domain == ZeroPointDomain.INT and
        weight_qtensor.layout == "plain"
    ):
        # TODO: enable cpu and mps efficient path
        # per channel int8 weight only quantizated mm
        w_vals_int8_t = weight_qtensor.layout_tensor.int_data.t().contiguous()
        w_vals_int8_t = weight_qtensor.layout_tensor.int_data.t()
        scale = weight_qtensor.layout_tensor.scale
        orig_dtype = input_tensor.dtype
        y = (
            torch.mm(
                input_tensor.reshape(-1, input_tensor.shape[-1]),
                w_vals_int8_t.to(input_tensor.dtype),
            )
            * weight_qtensor.scale
            * scale
        )
        y = y.reshape(*input_tensor.shape[:-1], y.shape[-1])
        if bias is not None:
            y += bias
        return y.to(orig_dtype)
        return y.to(orig_dtype)

        # is_cpu and is_mps only, some issue with is_contiguous() currently
        # return torch.ops.aten._weight_int8pack_mm(input_tensor.contiguous(), w_vals_int8_t, weight_qtensor.layout_tensor.scale)
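
For intuition, a small self-contained sketch of the math this branch implements (plain PyTorch in float32, independent of the AffineQuantizedTensor / layout_tensor machinery above; the tensor names are made up for illustration):

import torch

torch.manual_seed(0)
x = torch.randn(20, 64)            # activations (batch, in_features)
w = torch.randn(128, 64)           # linear weight (out_features, in_features)

# per-output-channel symmetric int8 quantization of the weight
scale = w.abs().amax(dim=1) / 127.0                                    # one scale per output channel
w_int8 = torch.clamp(torch.round(w / scale.unsqueeze(1)), -127, 127).to(torch.int8)

# weight-only quantized matmul, mirroring the torch.mm(...) * scale expression above:
# cast the int8 weight to the activation dtype, matmul, then rescale per channel
y = torch.mm(x, w_int8.t().to(x.dtype)) * scale
y_ref = torch.nn.functional.linear(x, w)
print((y - y_ref).abs().max())     # small error, coming from weight quantization only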
17 changes: 11 additions & 6 deletions torchao/quantization/quant_api.py
@@ -206,13 +206,18 @@ def change_linear_weights_to_int8_woqtensors(model, filter_fn=None, **kwargs):
    Converts all linear weight tensors to the
    `Int8WeightOnlyQuantizedLinearWeight` tensor subclass,
    effectively applying the same form of quantization
    as apply_dynamic_quant while not modifying the linear modules.
    as apply_weight_only_int8_quant while not modifying the linear modules.
    """
    _replace_with_custom_fn_if_matches_filter(
        model,
        _get_subclass_inserter(Int8WeightOnlyQuantizedLinearWeight, enable_parametrization=TORCH_VERSION_AFTER_2_4, **kwargs),
        _is_linear if filter_fn is None else filter_fn,
    )

    if TORCH_VERSION_AFTER_2_4:
        quantize(model, get_apply_int8wo_quant(), filter_fn)
        unwrap_tensor_subclass(model, filter_fn)
    else:
        _replace_with_custom_fn_if_matches_filter(
            model,
            _get_subclass_inserter(Int8WeightOnlyQuantizedLinearWeight, enable_parametrization=False, **kwargs),
            _is_linear if filter_fn is None else filter_fn,
        )


def change_linear_weights_to_int4_woqtensors(model, **kwargs):
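
For usage, the public entry point is unchanged by this refactor; a minimal sketch of exercising it (assuming a CUDA device and the torchao build from this commit — on 2.4+ it routes through quantize(), otherwise through the subclass inserter):

import copy
import torch
from torch import nn
from torchao.quantization.quant_api import change_linear_weights_to_int8_woqtensors

model = nn.Sequential(nn.Linear(1024, 1024), nn.ReLU(), nn.Linear(1024, 1024))
model = model.eval().to(torch.bfloat16).to("cuda")
float_model = copy.deepcopy(model)

# int8 weight-only quantization: weights become int8, activations stay bfloat16
change_linear_weights_to_int8_woqtensors(model)

x = torch.randn(20, 1024, dtype=torch.bfloat16, device="cuda")
with torch.no_grad():
    out = model(x)
    ref = float_model(x)
print((out - ref).abs().max())   # expect only a small weight-quantization error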
17 changes: 13 additions & 4 deletions tutorials/quantize_vit/run_vit_b_quant.py
@@ -14,18 +14,27 @@
input_tensor = torch.randn(1, 3, 224, 224, dtype=torch.bfloat16, device='cuda')

## Quantization code - start
# int8 act, int8 weight dynamic quantization
torchao.apply_dynamic_quant(model)
from torch._inductor import config as inductorconfig
inductorconfig.force_fuse_int_mm_with_mul = True

# int8 weight only quantization
# torchao.quantization.change_linear_weights_to_int8_woqtensors(model)
## Quantization code - end


## compilation configs
torch._dynamo.config.automatic_dynamic_shapes = False
torch._inductor.config.force_fuse_int_mm_with_mul = True
torch._inductor.config.use_mixed_mm = True
## compilation configs end

model = torch.compile(model, mode='max-autotune', fullgraph=True)

# Must run with no_grad when optimizing for inference
with torch.no_grad():
    # warmup
    benchmark_model(model, 5, input_tensor)
    benchmark_model(model, 20, input_tensor)
    # benchmark
    print("elapsed_time: ", benchmark_model(model, 100, input_tensor), " milliseconds")
    print("elapsed_time: ", benchmark_model(model, 1000, input_tensor), " milliseconds")
    # Create a trace
    profiler_runner("quant.json.gz", benchmark_model, model, 5, input_tensor)
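
benchmark_model and profiler_runner come from the tutorial's utilities and are not defined in this diff; purely as an illustration of the warmup-then-measure pattern used above, a stand-in timer (an assumption, not the tutorial's actual implementation) could look like:

import torch

def benchmark_model_sketch(model, num_runs, input_tensor):
    # average forward latency in milliseconds, measured with CUDA events
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    with torch.no_grad():
        for _ in range(num_runs):
            model(input_tensor)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / num_runs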
