From 6f58c568b6709a057bddac45c073029e50b60d2d Mon Sep 17 00:00:00 2001
From: SzymonOzog
Date: Wed, 12 Mar 2025 08:30:28 +0000
Subject: [PATCH 1/2] fix chunked prefill for GGUF

Signed-off-by: SzymonOzog
---
 vllm/model_executor/layers/quantization/gguf.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index 5d4c1c6ec893..c92bcbea540a 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -98,6 +98,13 @@ def get_quant_method(self, layer: torch.nn.Module,
 
 def _fuse_mul_mat(x: torch.Tensor, qweight: torch.Tensor,
                   qweight_type: int) -> torch.Tensor:
+    # HACK: when doing chunked prefill we don't generate output tokens, so
+    # the input to the logits generator is empty, causing an invalid parameter
+    if x.shape[0] == 0:
+        return torch.empty(x.shape[0],
+                           qweight.shape[0],
+                           dtype=x.dtype,
+                           device=x.device)
     # there is no need to call any kernel for fp16/bf16
     if qweight_type in UNQUANTIZED_TYPES:
         return x @ qweight.T

From 252fa995caae6f54ab16ce6bfcb7a87e053bbefc Mon Sep 17 00:00:00 2001
From: SzymonOzog
Date: Wed, 12 Mar 2025 09:10:21 +0000
Subject: [PATCH 2/2] rerun ci

Signed-off-by: SzymonOzog
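
For context, the guard added in patch 1/2 short-circuits the fused matmul
when the activation tensor has zero rows, which happens when a chunked-prefill
step produces no output tokens. Below is a minimal standalone sketch of the
same idea; the function name fused_mul_mat_with_empty_guard and the
unquantized fallback are illustrative assumptions, not the vLLM source,
which instead dispatches on qweight_type to GGUF dequantize/matmul kernels.

import torch

def fused_mul_mat_with_empty_guard(x: torch.Tensor,
                                   qweight: torch.Tensor) -> torch.Tensor:
    # Zero-row input: return an empty (0, out_features) tensor directly
    # instead of invoking a kernel that rejects empty inputs.
    if x.shape[0] == 0:
        return torch.empty(x.shape[0],
                           qweight.shape[0],
                           dtype=x.dtype,
                           device=x.device)
    # Plain fp16/bf16 path shown for illustration only; the real
    # _fuse_mul_mat selects a quantized kernel based on qweight_type.
    return x @ qweight.T

# A zero-row batch now yields a zero-row result instead of an error.
x = torch.empty(0, 16, dtype=torch.float16)
w = torch.randn(32, 16, dtype=torch.float16)
assert fused_mul_mat_with_empty_guard(x, w).shape == (0, 32)

Returning an empty tensor of the correct output shape keeps downstream
shape logic intact while skipping the kernel launch entirely for the
degenerate batch.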