From 6f3169ad8ce39ce60462286a698e96478dda104f Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Sat, 15 Jun 2024 23:59:36 +0900 Subject: [PATCH] [misc] Do not allow to use lora with chunked prefill. (#5538) Co-authored-by: Cyrus Leung --- vllm/config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index 403959cb79d22..87c4a48adcb5e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1124,6 +1124,8 @@ def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): "Due to limitations of the custom LoRA CUDA kernel, " "max_num_batched_tokens must be <= 65528 when " "LoRA is enabled.") + if scheduler_config.chunked_prefill_enabled: + raise ValueError("LoRA is not supported with chunked prefill yet.") @dataclass