[Misc] Add channel-wise quantization support for w8a8 dynamic per token activation quantization #5542

Merged

Changes from 5 commits
15 changes: 1 addition & 14 deletions vllm/model_executor/layers/linear.py
@@ -468,13 +468,6 @@ def weight_loader(self,
"MergedColumnParallelLinear, assume the weight is "
"the same for all partitions.")

if fp8_scales_shard_indexer is None:
Collaborator: Can you explain why this is being removed?

Contributor (author): The models we're getting now have shapes defined for the scales. We were reshaping initially because x.shape used to come back empty.

Collaborator: Makes sense. This is much cleaner.

if len(param_data.shape) == 0:
param_data = param_data.reshape(1)

if len(loaded_weight.shape) == 0:
loaded_weight = loaded_weight.reshape(1)

assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)
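
For context on the thread above, the deleted guard only mattered when a per-tensor scale was serialized as a 0-d tensor, whose empty shape fails the shape-equality assert. A minimal standalone sketch (illustrative values, not vLLM code):

```python
import torch

# Older checkpoints stored a per-tensor scale as a 0-d tensor, so its shape
# is torch.Size([]) and the assert in weight_loader would fail without a reshape.
param_data = torch.empty(1)
loaded_weight = torch.tensor(0.02)            # 0-d scale, shape == torch.Size([])

if len(loaded_weight.shape) == 0:             # the guard this PR removes
    loaded_weight = loaded_weight.reshape(1)  # -> shape (1,)

assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)               # newer checkpoints ship shaped scales,
                                              # so this reshape is no longer needed
```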

@@ -686,12 +679,6 @@ def weight_loader(self,
"QKVParallelLinear, assume the weight is the same "
"for all partitions.")

if len(param_data.shape) == 0:
param_data = param_data.reshape(1)

if len(loaded_weight.shape) == 0:
loaded_weight = loaded_weight.reshape(1)

assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)

@@ -784,7 +771,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
shard_id=0)

if fp8_scales_shard_indexer is None and len(loaded_weight.shape) == 0:
loaded_weight = loaded_weight.reshape(1)
loaded_weight = loaded_weight.reshape(1, 1)

assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)
@@ -88,14 +88,15 @@ def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
def _is_dynamic_token_w8a8(self, weight_quant: BaseModel,
input_quant: BaseModel) -> bool:
is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
is_token_tensor = (weight_quant.strategy
== QuantizationStrategy.TENSOR.value) and (
input_quant.strategy
== QuantizationStrategy.TOKEN.value)
weight_strategy = (
weight_quant.strategy == QuantizationStrategy.TENSOR.value
or weight_quant.strategy == QuantizationStrategy.CHANNEL.value)
is_token = (weight_strategy and input_quant.strategy
== QuantizationStrategy.TOKEN.value)
is_symmetric = weight_quant.symmetric and input_quant.symmetric
is_dynamic = not weight_quant.dynamic and input_quant.dynamic

return is_8_bits and is_token_tensor and is_symmetric and is_dynamic
return is_8_bits and is_token and is_symmetric and is_dynamic

def _is_w4a16(self, weight_quant: BaseModel,
input_quant: BaseModel) -> bool:
@@ -118,7 +119,8 @@ def _get_schema(self, weight_quant: BaseModel,
return CompressedTensorsW8A8StaticTensor()

if self._is_dynamic_token_w8a8(weight_quant, input_quant):
return CompressedTensorsW8A8DynamicToken()
return CompressedTensorsW8A8DynamicToken(
strategy=weight_quant.strategy)

raise NotImplementedError("Scheme not supported.")
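
To illustrate the combination the updated predicate now accepts, here is a hedged, standalone sketch; the stand-in objects and the lowercase strategy strings are assumptions standing in for the compressed-tensors quantization args, not the library API:

```python
from types import SimpleNamespace

# Channel-wise weights + dynamic per-token activations, mirroring the checks above.
weight_quant = SimpleNamespace(num_bits=8, strategy="channel",
                               symmetric=True, dynamic=False)
input_quant = SimpleNamespace(num_bits=8, strategy="token",
                              symmetric=True, dynamic=True)

is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
weight_strategy = weight_quant.strategy in ("tensor", "channel")
is_token = weight_strategy and input_quant.strategy == "token"
is_symmetric = weight_quant.symmetric and input_quant.symmetric
is_dynamic = not weight_quant.dynamic and input_quant.dynamic

assert is_8_bits and is_token and is_symmetric and is_dynamic
```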

@@ -6,13 +6,18 @@
from vllm import _custom_ops as custom_ops
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
CompressedTensorsScheme)
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
    QuantizationStrategy)

Collaborator: Do we need to do the same thing for static per tensor?

Contributor (author): Do what?

Collaborator: Do we support channel-wise for static per tensor already?

Collaborator: These changes only impact dynamic per token, it seems, so I was just wondering.

Contributor (author): Yup - just dynamic per token so far. It's an easy addition to the static per tensor scheme; I can add it as part of this PR. It will only be a few lines of code to update.

Collaborator: Let's do it as a follow-up.
from vllm.model_executor.utils import set_weight_attrs

__all__ = ["CompressedTensorsW8A8DynamicToken"]


class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):

def __init__(self, strategy: str):
self.strategy = strategy

def _shard_id_as_int(self, shard_id: Union[str, int]) -> int:
if isinstance(shard_id, int):
return shard_id
@@ -31,6 +36,9 @@ def scales_shard_splitter(
size = logical_widths[shard_id]
# update loaded weight with copies for broadcast.
loaded_weight = loaded_weight.repeat(size)
# parameter defined for scale is 2D; expand
if len(loaded_weight.shape) == 1:
loaded_weight = torch.unsqueeze(loaded_weight, -1)
return param[offset:offset + size], loaded_weight

def create_weights(self, layer: torch.nn.Module,
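
A small standalone sketch of what the splitter now does for a per-tensor shard scale (values are made up): the scale is repeated across the shard's logical width and then unsqueezed so it lines up with the 2-D weight_scale parameter.

```python
import torch

loaded_weight = torch.tensor(0.05)    # one scale covering a shard of width 4
size = 4                              # logical width of this shard

loaded_weight = loaded_weight.repeat(size)               # shape (4,)
if len(loaded_weight.shape) == 1:
    loaded_weight = torch.unsqueeze(loaded_weight, -1)   # shape (4, 1), matches param
```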
@@ -45,13 +53,17 @@ def create_weights(self, layer: torch.nn.Module,
# CompressedTensorsW8A8StaticTensor::create_weights for further
# information.
is_tensor_partitioned = len(output_partition_sizes) != 1
weight_scale_dim = sum(
output_partition_sizes) if is_tensor_partitioned else 1
# when doing channel-wise quantization, number of scales
# is equal to output_dim
weight_scale_dim = sum(output_partition_sizes) if (
is_tensor_partitioned
or self.strategy == QuantizationStrategy.CHANNEL) else 1

weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8),
requires_grad=False)

weight_scale = Parameter(torch.empty(weight_scale_dim,
1,
dtype=torch.float32),
requires_grad=False)
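
Roughly, the scale count now works out as follows; this is an illustrative sketch with made-up sizes and a plain string in place of QuantizationStrategy, with names mirroring the diff:

```python
# Channel-wise quantization needs one scale per output channel, so the scale
# parameter is sized by the output dimension even when the weight is not
# partitioned; per-tensor quantization still collapses to a single scale.
output_partition_sizes = [4096]               # a single, unpartitioned output
is_tensor_partitioned = len(output_partition_sizes) != 1
strategy = "channel"                          # vs. "tensor"

weight_scale_dim = sum(output_partition_sizes) if (
    is_tensor_partitioned or strategy == "channel") else 1
print(weight_scale_dim)   # 4096 for "channel"; would be 1 for "tensor"
```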

@@ -67,11 +79,19 @@ def create_weights(self, layer: torch.nn.Module,

layer.register_parameter("weight_scale", weight_scale)
set_weight_attrs(weight_scale, {"weight_loader": weight_loader})
set_weight_attrs(
weight_scale, {
"shard_splitter": self.scales_shard_splitter,
"logical_widths": output_partition_sizes

# Don't need a shard_splitter for channel-wise quantization
# Use the default loading method
if self.strategy == QuantizationStrategy.CHANNEL:
set_weight_attrs(weight_scale, {
"output_dim": 0,
})
else:
set_weight_attrs(
weight_scale, {
"logical_widths": output_partition_sizes,
"shard_splitter": self.scales_shard_splitter,
})

layer.register_parameter("weight_zero_point", weight_zero_point)
set_weight_attrs(weight_zero_point, {"weight_loader": weight_loader})
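
In the channel-wise branch the scale rides the default weight loader: tagging the parameter with output_dim=0 lets each tensor-parallel rank copy its own slice of rows, much like the weight itself, which is why no shard_splitter is registered. A hedged, standalone sketch of that slicing (not the actual vLLM loader):

```python
import torch

# Full checkpoint scale: one value per output channel, stored as (out_features, 1).
full_scale = torch.rand(3072, 1)

tp_size, tp_rank, output_dim = 2, 0, 0
shard = full_scale.shape[output_dim] // tp_size            # 1536 rows per rank
local_scale = full_scale.narrow(output_dim, tp_rank * shard, shard)
assert local_scale.shape == (1536, 1)
```

Reusing the default output_dim path keeps the channel-wise scales consistent with how the quantized weight itself is sharded, while per-tensor scales still go through scales_shard_splitter above.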