Refactor int4 weight only quantization with call to quantize
Summary:
This is similar to pytorch#294, but applied to int4 weight-only quantization.
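For context, a minimal sketch of the user-facing call after this refactor (assuming a CUDA device and a bfloat16 model; the toy model and groupsize below are illustrative, not taken from this commit):

import torch
import torchao

# the int4 tinygemm path requires bfloat16 weights and CUDA
model = torch.nn.Sequential(torch.nn.Linear(1024, 1024)).to(torch.bfloat16).cuda()

# on torch 2.4+ this now routes through quantize(model, get_apply_int4wo_quant(...)) and
# unwrap_tensor_subclass(model, ...) internally; older torch keeps the previous
# subclass-inserter path (see torchao/quantization/quant_api.py below)
torchao.quantization.change_linear_weights_to_int4_woqtensors(model, groupsize=32)
model = torch.compile(model, mode='max-autotune')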

Test Plan:

unit perf test:
python test/quantization/test_quant_api.py -k test_quantized_tensor_subclass_int4_wo_quant_perf
elapsed time: 0.2166275215148926, ref elapsed time: 0.2191881561279297
elapsed time: 0.2376406478881836, ref elapsed time: 0.22721023559570314
elapsed time: 0.21919679641723633, ref elapsed time: 0.2154969596862793

integration perf test:

reference: elapsed_time:  2.5900126953125  milliseconds
after refactor: elapsed_time:  2.56680078125  milliseconds
diff: no diff

TORCH_LOGS='output_code' python tutorials/quantize_vit/run_vit_b_quant.py

Before:
After:
generated code diff:

Reviewers:

Subscribers:

Tasks:

Tags:
jerryzh168 committed Jun 1, 2024
1 parent 55a4676 commit f87cc12
Showing 4 changed files with 150 additions and 48 deletions.
4 changes: 2 additions & 2 deletions test/quantization/test_quant_api.py
@@ -168,7 +168,7 @@ def _ref_change_linear_weights_to_int4_woqtensors(model, **kwargs):

_replace_with_custom_fn_if_matches_filter(
model,
_get_subclass_inserter(Int4WeightOnlyQuantizedLinearWeight, enable_parametrization=False, **kwargs),
_get_subclass_inserter(Int4WeightOnlyQuantizedLinearWeight, enable_parametrization=True, **kwargs),
filter_fn,
)

@@ -633,7 +633,7 @@ def test_quantized_tensor_subclass_int8_wo_quant_perf(self):

@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "Test only enabled for 2.4+")
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@unittest.skip("This perf test is supposed to be run locally for sanity check performance when there is a change of int4 weight only quant implementation")
# @unittest.skip("This perf test is supposed to be run locally for sanity check performance when there is a change of int4 weight only quant implementation")
def test_quantized_tensor_subclass_int4_wo_quant_perf(self):
kwargs = {"groupsize": 32}
from torchao.quantization.quant_api import change_linear_weights_to_int4_woqtensors
170 changes: 134 additions & 36 deletions torchao/dtypes/aqt.py
@@ -74,11 +74,11 @@ def implements_aqt_torch_function(torch_function):
def register_aqt_layout_cls(extended_layout: str):
def decorator(layout_cls):
layout_cls.extended_layout = extended_layout
_EXTENDED_LAYOUT_TO_AQT_LAYOUT_CLS[extended_layout] = layout_cls
_EXTENDED_LAYOUT_TO_AQT_LAYOUT_CLS[extended_layout] = layout_cls.from_plain
return layout_cls
return decorator

def get_aqt_layout_cls(extended_layout: str) -> Callable:
def get_aqt_layout_cls_ctr(extended_layout: str) -> Callable:
if extended_layout not in _EXTENDED_LAYOUT_TO_AQT_LAYOUT_CLS:
raise ValueError(f"extended_layout: {extended_layout} is not supported yet")
return _EXTENDED_LAYOUT_TO_AQT_LAYOUT_CLS.get(extended_layout)
@@ -90,17 +90,18 @@ class AQTLayout(torch.Tensor):
# this should be set for each layout class during registration
extended_layout: Optional[str] = None

def __init__(
self,
def get_plain() -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
pass

@classmethod
def from_plain(
cls,
int_data: torch.Tensor,
scale: torch.Tensor,
zero_point: torch.Tensor,
):
pass

def get_plain() -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
pass

def _get_to_kwargs(self, *args, **kwargs):
device, dtype, _, memory_format = torch._C._nn._parse_to(*args, **kwargs)
device = self.device if device is None else device
@@ -205,6 +206,14 @@ def __torch_dispatch__(cls, func, types, args, kwargs):
def get_plain(self):
return self.int_data, self.scale, self.zero_point

@classmethod
def from_plain(
cls,
int_data: torch.Tensor,
scale: torch.Tensor,
zero_point: torch.Tensor,
):
return cls(int_data, scale, zero_point)

@register_aqt_layout_cls("tensor_core_tiled")
class TensorCoreTiledAQTLayout(AQTLayout):
@@ -222,40 +231,45 @@ class TensorCoreTiledAQTLayout(AQTLayout):

def __new__(
cls,
int_data: torch.Tensor,
scale: torch.Tensor,
zero_point: torch.Tensor,
packed_weight: torch.Tensor,
scale_and_zero: torch.Tensor,
):
kwargs = {}
kwargs["device"] = int_data.device
kwargs["device"] = packed_weight.device
kwargs["layout"] = (
kwargs.get("layout") if kwargs.get("layout", False) else int_data.layout
kwargs.get("layout") if kwargs.get("layout", False) else packed_weight.layout
)
kwargs["dtype"] = int_data.dtype
kwargs["dtype"] = packed_weight.dtype
kwargs["requires_grad"] = False
shape = int_data.shape
shape = packed_weight.shape
return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs) # type: ignore[attr-defined]

def __init__(
self,
int_data: torch.Tensor,
scale: torch.Tensor,
zero_point: torch.Tensor,
packed_weight: torch.Tensor,
scale_and_zero: torch.Tensor,
):
# TODO: expose the arg
innerKTiles = 8
self.packed_weight = torch.ops.aten._convert_weight_to_int4pack(int_data.to(torch.int32), innerKTiles)
self.scale_and_zero = pack_tinygemm_scales_and_zeros(scale, zero_point)
self.packed_weight = packed_weight
self.scale_and_zero = scale_and_zero

def __tensor_flatten__(self):
return ["packed_weight", "scale_and_zero"]
return ["packed_weight", "scale_and_zero"], []

@classmethod
def __tensor_unflatten__(
cls, tensor_data_dict, tensor_attributes, outer_size, outer_stride
):
packed_weight, scale_and_zero = tensor_data_dict["packed_weight"], tensor_data_dict["scale_and_zero"]
# TODO: fix the unflatten logic
return cls(packed_weight, scale_and_zero)

@classmethod
def from_plain(cls, int_data, scale, zero_point):
# TODO: expose the arg
innerKTiles = 8
packed_weight = torch.ops.aten._convert_weight_to_int4pack(int_data.to(torch.int32), innerKTiles)
scale = scale.reshape(int_data.shape[0], -1)
zero_point = zero_point.reshape(int_data.shape[0], -1)
scale_and_zero = pack_tinygemm_scales_and_zeros(scale, zero_point)
return cls(packed_weight, scale_and_zero)

def to(self, *args, **kwargs):
@@ -273,6 +287,14 @@ def _apply_fn_to_data(self, fn):
self.scale_and_zero = fn(self.scale_and_zero)
return self

def _change_shape(self, shape):
# int_data, scale, zero = self.get_plain()
# int_data = int_data.view(shape)
# changed = self.from_plain(int_data, scale, zero)
# return changed
# TODO: changing shape is no-op for int4 packed weight right now
return self

@classmethod
def __torch_dispatch__(cls, func, types, args, kwargs):
kwargs = {} if kwargs is None else kwargs
@@ -282,16 +304,47 @@ def __torch_dispatch__(cls, func, types, args, kwargs):
func, args, kwargs, args[0]._apply_fn_to_data(torch.detach)
)

if func is aten.view.default:
assert len(args) == 2
new = args[0]._change_shape(args[1])
return return_and_correct_aliasing(func, args, kwargs, new)

raise NotImplementedError(
f"PlainAQTLayout dispatch: attempting to run {func}, this is not supported"
f"TensorCoreTiledAQTLayout dispatch: attempting to run {func}, this is not supported"
)

__torch_function__ = torch._C._disabled_torch_function_impl

def get_plain(self):
raise NotImplementedError(
f"Unpacking for tensor core tiled storage is not yet implemented"
from torchao.quantization.quant_primitives import (
ZeroPointDomain,
unpack_tinygemm_scales_and_zeros,
quantize_affine,
)
cur_shape = self.shape
assert len(cur_shape) == 4
# TODO: expose the arg
innerKTiles = 8
original_shape = (cur_shape[0] * 8, cur_shape[1] * (innerKTiles * 16))
eye_shape = original_shape[1]
block_size = (1, 32)
device = self.device
original_dtype = torch.bfloat16
groupsize = 32
target_dtype = torch.int32
quant_min = 0
quant_max = 15
zero_point_domain = ZeroPointDomain.FLOAT
assert len(block_size) == 2 and block_size[0] == 1
groupsize = block_size[-1]
dequantized = torch.ops.aten._weight_int4pack_mm(torch.eye(eye_shape, device=device, dtype=original_dtype), self.packed_weight, groupsize, self.scale_and_zero)
dequantized = dequantized.t().contiguous()
scale, zero = unpack_tinygemm_scales_and_zeros(self.scale_and_zero)
# TODO: move this to `unpack_tinygemm_scales_and_zeros`?
scale = scale.reshape(scale.shape[:-1]).contiguous()
zero = zero.reshape(zero.shape[:-1]).contiguous()
int_data = quantize_affine(dequantized, block_size, scale, zero, target_dtype, quant_min, quant_max, zero_point_domain)
return int_data, scale, zero

class AffineQuantizedTensor(torch.Tensor):
"""
@@ -413,15 +466,26 @@ def from_float(
zero_point_domain: ZeroPointDomain = ZeroPointDomain.INT,
extended_layout: str = "plain",
):
original_shape = input_float.shape
if extended_layout == "tensor_core_tiled":
from torchao.quantization.utils import find_multiple
orig_out_features, orig_in_features = input_float.shape
in_features = find_multiple(orig_in_features, 1024)
out_features = find_multiple(orig_out_features, 8)
input_float = torch.nn.functional.pad(
input_float,
(0, in_features - orig_in_features, 0, out_features - orig_out_features),
)

scale, zero_point = choose_qparams_affine(input_float, mapping_type, block_size, target_dtype, quant_min, quant_max, eps, scale_dtype, zero_point_dtype, preserve_zero, zero_point_domain)
int_data = quantize_affine(input_float, block_size, scale, zero_point, target_dtype, quant_min, quant_max, zero_point_domain)

layout_cls = get_aqt_layout_cls(extended_layout)
layout_tensor = layout_cls(int_data, scale, zero_point)
layout_cls_ctr = get_aqt_layout_cls_ctr(extended_layout)
layout_tensor = layout_cls_ctr(int_data, scale, zero_point)
return cls(
layout_tensor,
block_size,
input_float.shape,
original_shape,
quant_min,
quant_max,
zero_point_domain,
@@ -507,7 +571,13 @@ def __torch_dispatch__(cls, func, types, args, kwargs):
f"AffineQuantizedTensor dispatch: attempting to run {func}, this is not supported"
)

def _quantized_linear_op(input_tensor, weight_qtensor, bias):
def _quantized_linear_op(input_tensor, weight_qtensor, bias, _from_flinear=True):
# TODO: the old tensor subclass can use a single implementation for both the F.linear dispatch
# and the aten.addmm/aten.mm dispatch because `_change_shape` is not implemented correctly (it is
# ignored for the int_data); this makes the dimension of weight_qtensor nondeterministic. We need to
# fix the issue and make sure we have a clear accepted dimension for `_quantized_linear_op`,
# after which we can remove the `_from_flinear` flag

is_cuda = weight_qtensor.is_cuda
is_cpu = weight_qtensor.device == torch.device("cpu")
if isinstance(weight_qtensor, AffineQuantizedTensor):
@@ -564,15 +634,42 @@ def _quantized_linear_op(input_tensor, weight_qtensor, bias):
weight_is_uint4 and
weight_qtensor.dtype == torch.bfloat16 and
len(weight_qtensor.shape) == 2 and
weight_qtensor.block_size[0] == 1 and
weight_qtensor.zero_point_domain == ZeroPointDomain.FLOAT and
weight_qtensor.layout == "tensor_core_tiled"
):
# groupwise int4 quantization
groupsize = weight_qtensor.block_size[-1]
if not _from_flinear:
weight_qtensor = weight_qtensor.t()
assert weight_qtensor.block_size[0] == 1, f"Requires groupwise quantization, got block_size: {block_size}"

# TODO: check groupsize quantization
# avoid circular dep, TODO: move this to a common util.py
from torchao.quantization.utils import find_multiple
act_mat = input_tensor
# weight is packed from padded (out_features, in_features) weight tensor
# (same dimension requirement as F.linear weight)
packed_weight = weight_qtensor.layout_tensor.packed_weight
scale_and_zero = weight_qtensor.layout_tensor.scale_and_zero
return torch.ops.aten._weight_int4pack_mm(input_tensor.contiguous(), packed_weight, groupsize, scale_and_zero)

orig_act_size = act_mat.size()
orig_dtype = act_mat.dtype

# reshape and pad activation
act_mat = act_mat.reshape(-1, act_mat.shape[-1]).to(torch.bfloat16)
pad_size = find_multiple(act_mat.shape[-1], 1024)
act_mat = torch.nn.functional.pad(act_mat, (0, pad_size - act_mat.shape[-1]))

# groupwise int4 quantization
groupsize = weight_qtensor.block_size[1]
y = torch.ops.aten._weight_int4pack_mm(act_mat.contiguous(), packed_weight, groupsize, scale_and_zero)

# remove out_feature padding
orig_out_features = weight_qtensor.shape[-2]
y = y[:, :orig_out_features]
y = y.reshape(*orig_act_size[:-1], orig_out_features)

if bias is not None:
y += bias
return y.to(orig_dtype)
elif (
weight_is_int8 and
len(weight_qtensor.shape) == 2 and
@@ -602,6 +699,7 @@ def _quantized_linear_op(input_tensor, weight_qtensor, bias):
# is_cpu and is_mps only, some issue with is_contiguous() currently
# return torch.ops.aten._weight_int8pack_mm(input_tensor.contiguous(), w_vals_int8_t, weight_qtensor.layout_tensor.scale)

breakpoint()
raise NotImplementedError("No specialized dispatch found for quantized linear op")


@@ -639,7 +737,7 @@ def aten_mm(func, *args, **kwargs):
args[0],
)
try:
return _quantized_linear_op(input_tensor, weight_tensor, bias)
return _quantized_linear_op(input_tensor, weight_tensor, bias, _from_flinear=False)
except:
if isinstance(input_tensor, AffineQuantizedTensor):
input_tensor = input_tensor.dequantize()
@@ -653,7 +751,7 @@ def aten_mm(func, *args, **kwargs):
None
)
try:
return _quantized_linear_op(input_tensor, weight_tensor, bias)
return _quantized_linear_op(input_tensor, weight_tensor, bias, _from_flinear=False)
except:
if isinstance(input_tensor, AffineQuantizedTensor):
input_tensor = input_tensor.dequantize()
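To make the new layout registry above easier to follow, here is a standalone toy sketch of the pattern (plain Python, no torch or torchao; all names are illustrative, not the torchao API): the registry now maps an extended_layout string to the layout class's from_plain constructor, so AffineQuantizedTensor.from_float can build a layout tensor from plain (int_data, scale, zero_point) without knowing how each layout packs its data.

from typing import Callable, Dict, Tuple

_LAYOUT_REGISTRY: Dict[str, Callable] = {}

def register_layout(extended_layout: str):
    def decorator(layout_cls):
        # mirror register_aqt_layout_cls: store the classmethod constructor, not the class
        _LAYOUT_REGISTRY[extended_layout] = layout_cls.from_plain
        return layout_cls
    return decorator

def get_layout_ctr(extended_layout: str) -> Callable:
    # mirror get_aqt_layout_cls_ctr
    if extended_layout not in _LAYOUT_REGISTRY:
        raise ValueError(f"extended_layout: {extended_layout} is not supported yet")
    return _LAYOUT_REGISTRY[extended_layout]

@register_layout("plain")
class ToyPlainLayout:
    def __init__(self, int_data, scale, zero_point):
        self.int_data, self.scale, self.zero_point = int_data, scale, zero_point

    @classmethod
    def from_plain(cls, int_data, scale, zero_point):
        # a "plain" layout keeps the tensors as-is; a tensor_core_tiled layout would pack them here
        return cls(int_data, scale, zero_point)

    def get_plain(self) -> Tuple:
        return self.int_data, self.scale, self.zero_point

layout_tensor = get_layout_ctr("plain")(int_data=[3, 1, 2], scale=0.1, zero_point=0)
print(layout_tensor.get_plain())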
16 changes: 9 additions & 7 deletions torchao/quantization/quant_api.py
@@ -37,7 +37,6 @@
Int8WeightOnlyQuantizedLinearWeight,
QuantizedLinearWeightBase,
to_laq,
LinearActQuantizedTensor,
)

from .quant_primitives import (
@@ -229,11 +228,15 @@ def change_linear_weights_to_int4_woqtensors(model, **kwargs):
"""
filter_fn = kwargs.pop("filter_fn", _is_linear)

_replace_with_custom_fn_if_matches_filter(
model,
_get_subclass_inserter(Int4WeightOnlyQuantizedLinearWeight, enable_parametrization=TORCH_VERSION_AFTER_2_4, **kwargs),
filter_fn,
)
if TORCH_VERSION_AFTER_2_4:
quantize(model, get_apply_int4wo_quant(**kwargs), filter_fn)
unwrap_tensor_subclass(model, filter_fn)
else:
_replace_with_custom_fn_if_matches_filter(
model,
_get_subclass_inserter(Int4WeightOnlyQuantizedLinearWeight, enable_parametrization=False, **kwargs),
filter_fn,
)

def swap_conv2d_1x1_to_linear(model, filter_fn=None):
"""
@@ -352,7 +355,6 @@ def apply_int4wo_quant(weight):
# avoid circular dep
from torchao.dtypes.aqt import to_aq

groupsize = 32
mapping_type = MappingType.ASYMMETRIC
block_size = (1, groupsize)
target_dtype = torch.int32
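As the change_linear_weights_to_int4_woqtensors branch above shows, on torch 2.4+ the old API now just delegates to the quantize flow. A rough sketch of calling that path directly (the import locations are assumed from this file and may differ; the toy model is illustrative):

import torch
from torchao.quantization.quant_api import (
    quantize,
    get_apply_int4wo_quant,
    unwrap_tensor_subclass,
    _is_linear,
)

model = torch.nn.Sequential(torch.nn.Linear(1024, 1024)).to(torch.bfloat16).cuda()

# same two steps the TORCH_VERSION_AFTER_2_4 branch performs
quantize(model, get_apply_int4wo_quant(groupsize=32), _is_linear)
unwrap_tensor_subclass(model, _is_linear)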
8 changes: 5 additions & 3 deletions tutorials/quantize_vit/run_vit_b_quant.py
@@ -15,20 +15,22 @@

## Quantization code - start
# int8 act, int8 weight dynamic quantization
torchao.apply_dynamic_quant(model)
# torchao.apply_dynamic_quant(model)

# int8 weight only quantization
# torchao.quantization.change_linear_weights_to_int8_woqtensors(model)
## Quantization code - end

# int4 weight only quantization
torchao.quantization.change_linear_weights_to_int4_woqtensors(model, groupsize=32)
## Quantization code - end

## compilation configs
torch._dynamo.config.automatic_dynamic_shapes = False
torch._inductor.config.force_fuse_int_mm_with_mul = True
torch._inductor.config.use_mixed_mm = True
## compilation configs end

model = torch.compile(model, mode='max-autotune', fullgraph=True)
model = torch.compile(model, mode='max-autotune')

# Must run with no_grad when optimizing for inference
with torch.no_grad():