From ffec0dafc22b9319ccc0323b9f6692399f0f526d Mon Sep 17 00:00:00 2001 From: Mickael Seznec Date: Wed, 27 Aug 2025 18:27:06 +0000 Subject: [PATCH 1/3] fix: awq x mergedreplicatedlinear Signed-off-by: Mickael Seznec --- vllm/model_executor/layers/linear.py | 6 +++++- vllm/model_executor/parameter.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index c0fcacd1e6ee..c96ddb70ad7a 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -438,7 +438,11 @@ def weight_loader(self, shard_offset = sum(self.output_sizes[:loaded_shard_id]) shard_size = self.output_sizes[loaded_shard_id] - param.data[shard_offset:shard_offset + shard_size] = loaded_weight + param.load_merged_column_weight(loaded_weight=loaded_weight, + shard_id=loaded_shard_id, + shard_offset=shard_offset, + shard_size=shard_size, + tp_rank=0) @CustomOp.register("column_parallel_linear") diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py index 750ee7850268..5466ed50ca42 100644 --- a/vllm/model_executor/parameter.py +++ b/vllm/model_executor/parameter.py @@ -122,7 +122,7 @@ def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): param_data = self.data - tp_rank = get_tensor_model_parallel_rank() + tp_rank = kwargs.get("tp_rank", get_tensor_model_parallel_rank()) param_data = param_data.narrow(self.output_dim, shard_offset, shard_size) loaded_weight = loaded_weight.narrow(self.output_dim, @@ -456,4 +456,4 @@ def _adjust_shard_indexes_for_packing(shard_size, shard_offset, packed_factor, shard_offset=shard_offset, bitblas_tile_size=bitblas_tile_size) - return shard_size, shard_offset \ No newline at end of file + return shard_size, shard_offset From 0a4568ec6d1100c61ce282a2b79920724b578bfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Seznec?= Date: Wed, 27 Aug 2025 21:19:13 +0200 Subject: [PATCH 2/3] nice catch 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: Mickaël Seznec --- vllm/model_executor/layers/linear.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index c96ddb70ad7a..b3a1a82d5afe 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -438,11 +438,14 @@ def weight_loader(self, shard_offset = sum(self.output_sizes[:loaded_shard_id]) shard_size = self.output_sizes[loaded_shard_id] - param.load_merged_column_weight(loaded_weight=loaded_weight, - shard_id=loaded_shard_id, - shard_offset=shard_offset, - shard_size=shard_size, - tp_rank=0) + if isinstance(param, BasevLLMParameter): + param.load_merged_column_weight(loaded_weight=loaded_weight, + shard_id=loaded_shard_id, + shard_offset=shard_offset, + shard_size=shard_size, + tp_rank=0) + else: + param.data[shard_offset:shard_offset + shard_size] = loaded_weight @CustomOp.register("column_parallel_linear") From f6bded1361d37e255f745aec0d2390a018ace118 Mon Sep 17 00:00:00 2001 From: Mickael Seznec Date: Fri, 29 Aug 2025 09:38:28 +0000 Subject: [PATCH 3/3] nit: no need for specific PerTensorScale path Signed-off-by: Mickael Seznec --- vllm/model_executor/layers/linear.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index b3a1a82d5afe..ebc4f67f1ad7 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -431,9 +431,6 @@ def weight_loader(self, block_n) shard_size = ((self.output_sizes[loaded_shard_id] + block_n - 1) // block_n) - elif isinstance(param, PerTensorScaleParameter): - shard_offset = loaded_shard_id - shard_size = 1 else: shard_offset = sum(self.output_sizes[:loaded_shard_id])
shard_size = self.output_sizes[loaded_shard_id]