14 changes: 8 additions & 6 deletions tests/lora/test_llama_tp.py
@@ -84,12 +84,14 @@ def v1(run_with_both_engines_lora):
 @create_new_process_for_each_test()
 def test_llama_lora(sql_lora_files):
 
-    llm = vllm.LLM(MODEL_PATH,
-                   enable_lora=True,
-                   max_num_seqs=16,
-                   max_loras=4,
-                   tensor_parallel_size=1,
-                   enable_chunked_prefill=True)
+    llm = vllm.LLM(
+        MODEL_PATH,
+        enable_lora=True,
+        # also test odd max_num_seqs
+        max_num_seqs=13,
+        max_loras=4,
+        tensor_parallel_size=1,
+        enable_chunked_prefill=True)
     generate_and_test(llm, sql_lora_files)


10 changes: 9 additions & 1 deletion vllm/lora/punica_wrapper/punica_gpu.py
@@ -10,6 +10,7 @@

 import torch
 
+import vllm.envs as envs
 from vllm.lora.layers import LoRAMapping
 from vllm.triton_utils import HAS_TRITON

@@ -42,8 +43,15 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int,
         self.token_mapping_meta = LoRAKernelMeta.make(self.max_loras,
                                                       max_num_batched_tokens,
                                                       device=device)
+
+        # When cudagraph capture size is greater than max_num_seqs (max_batches,
+        # here), V0 captures the graph as if max_num_seqs is set to
+        # the capture size.
+        # V1 doesn't have this problem and always respects max_num_seqs.
+        max_num_prompts = (max_batches
+                           if envs.VLLM_USE_V1 else max_num_batched_tokens)
         self.prompt_mapping_meta = LoRAKernelMeta.make(self.max_loras,
-                                                       max_batches,
+                                                       max_num_prompts,
                                                        device=device)
 
     def update_metadata(
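For context on the comment in the hunk above, here is a minimal sketch (not part of this PR, and using assumed example capture sizes rather than vLLM's actual list) of why the V0 path needs the larger prompt-mapping buffer: cudagraph replay pads the batch up to the nearest capture size, so an odd max_num_seqs such as the 13 used in the updated test can be padded past max_num_seqs.

# Illustrative only: assumed example capture sizes, not vLLM's actual list.
CUDAGRAPH_CAPTURE_SIZES = [1, 2, 4, 8, 16, 32]

def padded_batch_size(batch_size: int) -> int:
    # Round the runtime batch size up to the nearest capture size,
    # mirroring how a captured graph is replayed with padding.
    return next(size for size in CUDAGRAPH_CAPTURE_SIZES if size >= batch_size)

# With max_num_seqs=13, V0 may capture/replay at a padded batch of 16 (> 13),
# which is why its prompt-mapping metadata is sized by max_num_batched_tokens
# rather than max_batches.
assert padded_batch_size(13) == 16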