From 6f3169ad8ce39ce60462286a698e96478dda104f Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Sat, 15 Jun 2024 23:59:36 +0900
Subject: [PATCH] [misc] Do not allow LoRA to be used with chunked prefill. (#5538)

Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 vllm/config.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/config.py b/vllm/config.py
index 403959cb79d22..87c4a48adcb5e 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1124,6 +1124,8 @@ def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
                 "Due to limitations of the custom LoRA CUDA kernel, "
                 "max_num_batched_tokens must be <= 65528 when "
                 "LoRA is enabled.")
+        if scheduler_config.chunked_prefill_enabled:
+            raise ValueError("LoRA is not supported with chunked prefill yet.")
 
 
 @dataclass