Commit c74730a
clean-up; add test cases
1 parent 6d6f320

2 files changed: +30 -17 lines

tests/quantization/test_compressed_tensors.py

Lines changed: 15 additions & 6 deletions

```diff
@@ -261,16 +261,23 @@ def check_model(model):
 @pytest.mark.parametrize(
     "wNa16_args",
-    [
-        ("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8),
-        ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8),
-        ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4),
-    ],
+    [("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8,
+      True, False),
+     ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8, True,
+      False),
+     ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4,
+      True, False),
+     ("nm-testing/TinyLlama-1.1B-Chat-v1.0-awq-group128-asym256", "group", 128,
+      4, False, False),
+     ("nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-Channel",
+      "channel", None, 4, False, False),
+     ("nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder",
+      "group", 128, 4, False, True)],
 )
 @pytest.mark.skipif(not current_platform.is_cuda(),
                     reason="The tests are skipped on non-CUDA platform.")
 def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
-    model, strategy, group, pack_factor = wNa16_args
+    model, strategy, group, pack_factor, symmetric, has_g_idx = wNa16_args
     with vllm_runner(model) as llm:

         def check_model(model):

@@ -286,6 +293,8 @@ def check_model(model):
                               if group is None else group)

             assert qkv_proj.scheme.pack_factor == pack_factor
+            assert qkv_proj.scheme.symmetric == symmetric
+            assert qkv_proj.scheme.has_g_idx == has_g_idx

         llm.apply_model(check_model)
```
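Each `wNa16_args` tuple now carries six fields instead of four. A minimal sketch of what they encode, using a hypothetical `WNA16Args` NamedTuple whose field names simply mirror the unpacking in the test (the helper itself is not part of the commit):

```python
# Hypothetical helper mirroring the tuple unpacked in
# test_compressed_tensors_wNa16; illustrative only.
from typing import NamedTuple, Optional

class WNA16Args(NamedTuple):
    model: str            # Hugging Face model stub loaded via vllm_runner
    strategy: str         # quantization strategy: "channel" or "group"
    group: Optional[int]  # group size when strategy == "group", else None
    pack_factor: int      # quantized values packed per storage word
    symmetric: bool       # False -> asymmetric scheme that stores zero points
    has_g_idx: bool       # True -> group index present (activation reordering)

# The new ActOrder case exercises asymmetric quantization *and* a group index:
case = WNA16Args(
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder",
    "group", 128, 4, False, True)
assert not case.symmetric and case.has_g_idx
```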

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py

Lines changed: 15 additions & 11 deletions

```diff
@@ -61,7 +61,6 @@ def __init__(self,
         self.quant_type = (WNA16_ZP_SUPPORTED_TYPES_MAP[num_bits]
                            if zero_points else
                            WNA16_SUPPORTED_TYPES_MAP[num_bits])
-        self.zero_points = zero_points

     @classmethod
     def get_min_capability(cls) -> int:
```
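For context on the branch above: the symmetric wNa16 types fold the zero offset into a biased integer type, while the zero-point variants pair a plain unsigned type with explicitly stored zero points. A rough sketch of the two maps' shape; the real entries are scalar types defined inside vLLM, and the names below are assumptions for illustration:

```python
# Assumed shape of the lookup tables (illustrative; vLLM defines the real
# entries as scalar types, not strings).
WNA16_SUPPORTED_TYPES_MAP = {
    4: "uint4b8",    # symmetric: 4-bit unsigned with an implicit bias of 8
    8: "uint8b128",  # symmetric: 8-bit unsigned with an implicit bias of 128
}
WNA16_ZP_SUPPORTED_TYPES_MAP = {
    4: "uint4",      # asymmetric: plain unsigned, zero point stored alongside
    8: "uint8",
}
```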
```diff
@@ -143,19 +142,22 @@ def create_weights(self, layer: torch.nn.Module, output_size: int,
         if not partition_scales:
             weight_scale = ChannelQuantScaleParameter(output_dim=0,
                                                       **weight_scale_args)
-            qzeros = PackedColumnParameter(output_dim=0,
-                                           packed_dim=0,
-                                           packed_factor=self.pack_factor,
-                                           **zeros_args)
+
+            if not self.symmetric:
+                qzeros = PackedColumnParameter(output_dim=0,
+                                               packed_dim=0,
+                                               packed_factor=self.pack_factor,
+                                               **zeros_args)
         else:
             weight_scale = GroupQuantScaleParameter(output_dim=0,
                                                     input_dim=1,
                                                     **weight_scale_args)
-            qzeros = PackedvLLMParameter(input_dim=1,
-                                         output_dim=0,
-                                         packed_dim=0,
-                                         packed_factor=self.pack_factor,
-                                         **zeros_args)
+            if not self.symmetric:
+                qzeros = PackedvLLMParameter(input_dim=1,
+                                             output_dim=0,
+                                             packed_dim=0,
+                                             packed_factor=self.pack_factor,
+                                             **zeros_args)

         # A 2D array defining the original shape of the weights
         # before packing
```
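The `if not self.symmetric` guard reflects the underlying math: a symmetric scheme maps 0.0 to integer 0 and only needs scales, while an asymmetric scheme shifts values by a zero point that must itself be stored and packed. A minimal plain-PyTorch sketch of the two modes (illustrative, not vLLM's kernels):

```python
# Minimal sketch of symmetric vs. asymmetric group quantization; the helper
# name and structure are illustrative, not vLLM code.
import torch

def quantize_group(w: torch.Tensor, num_bits: int, symmetric: bool):
    if symmetric:
        qmax = 2 ** (num_bits - 1) - 1
        scale = w.abs().max() / qmax
        q = torch.clamp(torch.round(w / scale), -qmax - 1, qmax)
        return q, scale, None  # no zero point to store
    qmin, qmax = 0, 2 ** num_bits - 1
    scale = (w.max() - w.min()) / (qmax - qmin)
    zero_point = torch.round(-w.min() / scale)
    q = torch.clamp(torch.round(w / scale) + zero_point, qmin, qmax)
    return q, scale, zero_point  # zero point must be stored per group

w = torch.randn(128)
q, s, zp = quantize_group(w, num_bits=4, symmetric=False)
dequant = (q - zp) * s  # reconstruction uses the stored zero point
```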
```diff
@@ -166,7 +168,9 @@ def create_weights(self, layer: torch.nn.Module, output_size: int,
         layer.register_parameter("weight_packed", weight)
         layer.register_parameter("weight_scale", weight_scale)
         layer.register_parameter("weight_shape", weight_shape)
-        layer.register_parameter("weight_zero_point", qzeros)
+
+        if not self.symmetric:
+            layer.register_parameter("weight_zero_point", qzeros)

         # group index (for activation reordering)
         if self.has_g_idx:
```
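The trailing context (`if self.has_g_idx:`) ties into the new `has_g_idx` test flag: with activation reordering (GPTQ-style act_order), a group index maps each input channel to its quantization group after channels have been permuted, e.g. by activation magnitude, so groups are no longer contiguous in the original channel order. A small sketch of the idea, with an assumed permutation; this is not vLLM's exact construction:

```python
# Sketch of what a g_idx encodes under activation reordering; perm is an
# assumed importance ranking, not derived from real activations.
import torch

in_features, group_size = 8, 4
perm = torch.tensor([3, 0, 6, 1, 7, 2, 5, 4])  # channels in importance order
g_idx = torch.empty(in_features, dtype=torch.int32)
g_idx[perm] = torch.arange(in_features, dtype=torch.int32) // group_size

# Channel 3 was ranked first, so it lands in group 0 despite its position:
assert g_idx[3] == 0
```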
