
Commit 8038595

jeejeelee authored and yewentao256 committed
[V0 deprecation] Clean up LoRA (#25686)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
1 parent a355561 commit 8038595

1 file changed (+1, -8 lines)


vllm/lora/punica_wrapper/punica_gpu.py

Lines changed: 1 addition & 8 deletions
@@ -11,7 +11,6 @@
 
 import torch
 
-import vllm.envs as envs
 from vllm.lora.layers import LoRAMapping
 from vllm.triton_utils import HAS_TRITON
 
@@ -41,14 +40,8 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int,
                                                        max_num_batched_tokens,
                                                        device=device)
 
-        # When cudagraph capture size is greater than max_num_seqs (max_batches,
-        # here), V0 captures the graph as if max_num_seqs is set to
-        # the capture size.
-        # V1 doesn't have this problem and always respects max_num_seqs.
-        max_num_prompts = (max_batches
-                           if envs.VLLM_USE_V1 else max_num_batched_tokens)
         self.prompt_mapping_meta = LoRAKernelMeta.make(self.max_loras,
-                                                       max_num_prompts,
+                                                       max_batches,
                                                        device=device)
 
     def update_metadata(self, mapping: LoRAMapping,
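
For reference, a minimal standalone sketch of the sizing decision this commit removes: before the V0 deprecation, the number of prompt-mapping metadata rows depended on VLLM_USE_V1; with V0 gone, max_batches always suffices. Only max_batches, max_num_batched_tokens, and VLLM_USE_V1 come from the diff; the helper functions below are illustrative assumptions, not part of vLLM's API.

# Standalone illustration (assumption: not vLLM code) of why the
# VLLM_USE_V1 branch could be dropped once V0 was deprecated.

def prompt_mapping_rows_before(max_batches: int,
                               max_num_batched_tokens: int,
                               use_v1: bool) -> int:
    # V0 could capture CUDA graphs as if max_num_seqs equalled the capture
    # size, so the prompt-mapping metadata had to be padded up to the
    # token budget whenever V0 was in use.
    return max_batches if use_v1 else max_num_batched_tokens

def prompt_mapping_rows_after(max_batches: int) -> int:
    # V1 always respects max_num_seqs (max_batches here), so that bound suffices.
    return max_batches

if __name__ == "__main__":
    assert prompt_mapping_rows_before(8, 2048, use_v1=True) == 8
    assert prompt_mapping_rows_before(8, 2048, use_v1=False) == 2048
    assert prompt_mapping_rows_after(8) == 8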
