From b9930c06faf9fafb90ebcbc64d4d63178daf6a39 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Thu, 25 Sep 2025 15:49:01 +0000
Subject: [PATCH] Done

Signed-off-by: Jee Jee Li
---
 vllm/lora/punica_wrapper/punica_gpu.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py
index 2db0e9fee142..467f50050eb2 100644
--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -11,7 +11,6 @@
 
 import torch
 
-import vllm.envs as envs
 from vllm.lora.layers import LoRAMapping
 from vllm.triton_utils import HAS_TRITON
 
@@ -41,14 +40,8 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int,
                                                       max_num_batched_tokens,
                                                       device=device)
 
-        # When cudagraph capture size is greater than max_num_seqs (max_batches,
-        # here), V0 captures the graph as if max_num_seqs is set to
-        # the capture size.
-        # V1 doesn't have this problem and always respects max_num_seqs.
-        max_num_prompts = (max_batches
-                           if envs.VLLM_USE_V1 else max_num_batched_tokens)
         self.prompt_mapping_meta = LoRAKernelMeta.make(self.max_loras,
-                                                       max_num_prompts,
+                                                       max_batches,
                                                        device=device)
 
     def update_metadata(self, mapping: LoRAMapping,
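
For context, a minimal illustrative sketch of the sizing decision this patch removes. The helper name prompt_mapping_rows and the use_v1 flag are invented for illustration only; they stand in for the envs.VLLM_USE_V1 branch that the diff deletes and are not part of vLLM's API.

# Hypothetical sketch, not vLLM code: shows why the V0 path over-allocated the
# prompt-mapping metadata and why the V1-only path can size it by max_batches.
def prompt_mapping_rows(max_batches: int,
                        max_num_batched_tokens: int,
                        use_v1: bool) -> int:
    if use_v1:
        # V1 always respects max_num_seqs (max_batches here), so the
        # prompt-mapping metadata never needs more than max_batches rows.
        return max_batches
    # V0 could capture CUDA graphs at a batch size larger than max_num_seqs,
    # so the metadata had to be over-allocated to max_num_batched_tokens.
    return max_num_batched_tokens


# After this patch only the V1 path remains, so the constructor simply passes
# max_batches to LoRAKernelMeta.make for the prompt mapping.
print(prompt_mapping_rows(max_batches=256,
                          max_num_batched_tokens=8192,
                          use_v1=True))  # -> 256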