neuralmagic · Satrat · May 3, 2024 · Apr 19, 2024 · Apr 19, 2024 · Apr 23, 2024
diff --git a/src/compressed_tensors/quantization/lifecycle/forward.py b/src/compressed_tensors/quantization/lifecycle/forward.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from functools import wraps
+from math import ceil
 
 import torch
 from compressed_tensors.quantization.quant_args import QuantizationArgs
@@ -33,9 +34,7 @@ def quantize(
     q_max: torch.Tensor,
 ) -> torch.Tensor:
     return torch.clamp(
-        torch.round(
-            x / scale + zero_point,
-        ),
+        torch.round(x / scale + zero_point),
         q_min,
         q_max,
     )
@@ -57,12 +56,61 @@ def fake_quantize(
     zero_point: torch.Tensor,
     args: QuantizationArgs,
 ) -> torch.Tensor:
+    """
+    Fake quantize the input tensor x depending on the group_size.
+    if group_size is greater than 0, then q/dq by groups. The column
+    size must be divisible by group_size
+    if group_size is -1, then channel wise q/dq. The input scale and
+    zero_points are reshaped to support vectorization (Assumes 1 is
+    the channel dimension)
+
+    :param x: Input tensor
+    :param scale: scale tensor
+    :param zero_point: zero point tensor
+    :param args: quantization args that contain group_size info
+    :return: fake quantized tensor
+
+    """
     bit_range = 2**args.num_bits
     max_q = torch.tensor(bit_range / 2 - 1, device=x.device)
     min_q = torch.tensor(-bit_range / 2, device=x.device)
-    Q = torch.zeros_like(x)
-    Q = quantize(x, scale, zero_point, min_q, max_q)
-    return dequantize(Q, scale, zero_point)
+
+    group_size = args.group_size
+
+    if group_size is None or group_size == 0:
+        Q = quantize(x, scale, zero_point, min_q, max_q)
+        DQ = dequantize(Q, scale, zero_point)
+
+    # group
+    elif group_size > 0:
+
+        DQ = torch.zeros_like(x)
+
+        # TODO: vectorize the for loop
+        # TODO: fix generic assumption about the tensor size for computing group
+        columns = x.shape[1]
+
+        # TODO: make validation step for inputs
+        assert columns % group_size == 0
+        for i in range(ceil(columns / group_size)):
+            sc = scale[i]
+            zp = zero_point[i]
+
+            idx = i * group_size
+            Q = quantize(x[:, idx : (idx + group_size)], sc, zp, min_q, max_q)
+            DQ[:, idx : (idx + group_size)] = dequantize(Q, sc, zp)
+
+    # channel-wise
+    else:  # group_size == -1
+        # before: scale shape = [channel_size]
+        # after: scale shape = [1, channel_size]
+        scale = scale.unsqueeze(0)
+        zero_point = zero_point.unsqueeze(0)
+
+        Q = quantize(x, scale, zero_point, min_q, max_q)
+        DQ = dequantize(Q, scale, zero_point)
+
+    return DQ
 
 
 def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):

diff --git a/src/compressed_tensors/quantization/observers/base.py b/src/compressed_tensors/quantization/observers/base.py
@@ -14,6 +14,7 @@
 
 from typing import Optional, Tuple
 
+import torch
 from compressed_tensors.quantization.quant_args import QuantizationArgs
 from compressed_tensors.registry.registry import RegistryMixin
 from torch import FloatTensor, IntTensor, Tensor
@@ -52,6 +53,12 @@ def calculate_qparams(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
         """
         raise NotImplementedError(f"{self.__class__} must implement calculate_qparams")
 
+    def post_calculate_qparams(self) -> None:
+        """
+        Run any observer-specific logic after running calculate_qparams
+        """
+        ...
+
     def get_qparams(
         self, observed: Optional[Tensor] = None
     ) -> Tuple[FloatTensor, IntTensor]:
@@ -64,6 +71,51 @@ def get_qparams(
         :return: tuple of scale and zero point based on last observed value
         """
         if observed is not None:
-            # re-calcualte scale and zero point, update the stored value
-            self._scale, self._zero_point = self.calculate_qparams(observed)
+            group_size = self.quantization_args.group_size
+            if group_size is None:
+
+                # re-calculate scale and zero point, update the stored value
+                self._scale, self._zero_point = self.calculate_qparams(observed)
+
+            elif group_size > 0:  # quantize by groups
+                columns = observed.shape[1]
+                scales, zero_points = [], []
+                for i in range(0, columns, self.quantization_args.group_size):
+                    scale, zero_point = self.calculate_qparams(
+                        observed[:, i : (i + group_size)]
+                    )
+                    scales.append(scale)
+                    zero_points.append(zero_point)
+
+                self._scale = torch.cat(scales)
+                self._zero_point = torch.cat(zero_points)
+
+            elif group_size < 0:  # channel-wise quantization
+
+                # TODO: make a generic way to get the channel
+                channel = 1
+                self._scale, self._zero_point = self.get_qparams_per_channel(
+                    observed, channel
+                )
+
+        self.post_calculate_qparams()
         return self._scale, self._zero_point
+
+    def get_qparams_per_channel(self, observed, channel: int):
+        # TODO: add documentation that specifies the shape must
+        #   be padded with 1-dims so the scales are along the right channel
+        # TODO: generalize the logic for reduce_dims
+        scales, zero_points = [], []
+
+        # TODO: make a more generic way to get the channel
+        num_channels = observed.shape[channel]
+
+        for channel_idx in range(num_channels):
+            scale, zero_point = self.calculate_qparams(
+                observed.select(dim=channel, index=channel_idx)
+            )
+
+            scales.append(scale)
+            zero_points.append(zero_point)
+
+        return torch.cat(scales), torch.cat(zero_points)
diff --git a/src/compressed_tensors/quantization/observers/min_max.py b/src/compressed_tensors/quantization/observers/min_max.py
@@ -40,6 +40,7 @@ def __init__(
         self.max_val = -float("inf")
         self.averaging_constant = averaging_constant
 
+
     def calculate_qparams(self, observed: Tensor) -> Tuple[FloatTensor, IntTensor]:
         """
         Updates the observed min and max using a moving average smoothed by the