# SPDX-License-Identifier: Apache-2.0
"""
Script to test add_lora, remove_lora, pin_lora, list_loras functions.
"""

import os
from pathlib import Path
from typing import List

import pytest
from huggingface_hub import snapshot_download

from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.llm import LLM
from vllm.lora.request import LoRARequest

MODEL_PATH = "meta-llama/Llama-2-7b-hf"
LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
LORA_RANK = 8
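# LORA_RANK is passed to the engine as max_lora_rank below; it is assumed to
# match (or exceed) the rank of the test adapter above.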


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test.
    # This can be promoted up to conftest.py to run for every test
    # in a package.
    pass


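# All requests below reference the same adapter on the HF Hub; only the name
# and integer id differ, and distinct ids are treated as distinct adapters by
# the engine's LoRA cache.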
def make_lora_request(lora_id: int):
    return LoRARequest(lora_name=f"{lora_id}",
                       lora_int_id=lora_id,
                       lora_path=LORA_MODULE_PATH)


def test_lora_functions_sync():
    max_loras = 4
    # Create the engine in eager mode. With a high max_loras, the CI can
    # OOM during CUDA graph capture.
    engine_args = EngineArgs(model=MODEL_PATH,
                             enable_lora=True,
                             max_loras=max_loras,
                             max_lora_rank=LORA_RANK,
                             max_model_len=128,
                             gpu_memory_utilization=0.8,  # avoid OOM
                             enforce_eager=True)

    llm = LLM.get_engine_class().from_engine_args(engine_args)
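    # get_engine_class() returns the V0 or V1 engine class depending on the
    # fixture; instantiating it directly exposes the engine-level add_lora,
    # remove_lora, pin_lora and list_loras functions exercised below.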

    def run_check(fn, args, expected: List):
        fn(args)
        assert set(llm.list_loras()) == set(expected)

    run_check(llm.add_lora, make_lora_request(1), [1])
    run_check(llm.add_lora, make_lora_request(2), [1, 2])

    # Pin LoRA 1 and test that it is never removed on subsequent adds.
    run_check(llm.pin_lora, 1, [1, 2])
    run_check(llm.add_lora, make_lora_request(3), [1, 2, 3])
    run_check(llm.add_lora, make_lora_request(4), [1, 2, 3, 4])
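    # The cache is now full (max_loras = 4). Each further add evicts the
    # least-recently-used unpinned adapter, so the expected sets below rotate
    # while pinned LoRA 1 stays resident.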
    run_check(llm.add_lora, make_lora_request(5), [1, 5, 3, 4])
    run_check(llm.add_lora, make_lora_request(6), [1, 5, 6, 4])
    run_check(llm.add_lora, make_lora_request(7), [1, 5, 6, 7])
    run_check(llm.add_lora, make_lora_request(8), [1, 8, 6, 7])
    run_check(llm.add_lora, make_lora_request(9), [1, 8, 9, 7])
    run_check(llm.add_lora, make_lora_request(10), [1, 8, 9, 10])

    # Remove LoRA 1 and continue adding.
    run_check(llm.remove_lora, 1, [8, 9, 10])
    run_check(llm.add_lora, make_lora_request(11), [8, 9, 10, 11])
    run_check(llm.add_lora, make_lora_request(12), [12, 9, 10, 11])
    run_check(llm.add_lora, make_lora_request(13), [12, 13, 10, 11])

    # Remove all LoRAs.
    run_check(llm.remove_lora, 13, [12, 10, 11])
    run_check(llm.remove_lora, 12, [10, 11])
    run_check(llm.remove_lora, 11, [10])
    run_check(llm.remove_lora, 10, [])


@pytest.mark.asyncio
async def test_lora_functions_async():
    if os.getenv("VLLM_USE_V1") == "0":
        pytest.skip(reason="V0 AsyncLLMEngine does not expose the "
                    "remove/list/pin LoRA functions.")

    # The run_with_both_engines_lora fixture sets the `VLLM_USE_V1`
    # environment variable. Reload vllm.engine.async_llm_engine because
    # vllm.engine.async_llm_engine.AsyncLLMEngine changes depending on the
    # env var.
    import importlib

    import vllm.engine.async_llm_engine
    importlib.reload(vllm.engine.async_llm_engine)
    from vllm.entrypoints.openai.api_server import (
        build_async_engine_client_from_engine_args)

    max_loras = 4
    engine_args = AsyncEngineArgs(model=MODEL_PATH,
                                  enable_lora=True,
                                  max_loras=max_loras,
                                  max_lora_rank=LORA_RANK,
                                  max_model_len=128,
                                  gpu_memory_utilization=0.8,
                                  enforce_eager=True)

    async def run_check(fn, args, expected: List):
        await fn(args)
        assert set(await llm.list_loras()) == set(expected)

    async with build_async_engine_client_from_engine_args(engine_args) as llm:
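        # `llm` is the async engine client; the calls below mirror the sync
        # test, but every LoRA management function is awaited.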
        await run_check(llm.add_lora, make_lora_request(1), [1])
        await run_check(llm.add_lora, make_lora_request(2), [1, 2])

        # Pin LoRA 1 and test that it is never removed on subsequent adds.
        await run_check(llm.pin_lora, 1, [1, 2])
        await run_check(llm.add_lora, make_lora_request(3), [1, 2, 3])
        await run_check(llm.add_lora, make_lora_request(4), [1, 2, 3, 4])
        await run_check(llm.add_lora, make_lora_request(5), [1, 5, 3, 4])
        await run_check(llm.add_lora, make_lora_request(6), [1, 5, 6, 4])
        await run_check(llm.add_lora, make_lora_request(7), [1, 5, 6, 7])
        await run_check(llm.add_lora, make_lora_request(8), [1, 8, 6, 7])
        await run_check(llm.add_lora, make_lora_request(9), [1, 8, 9, 7])
        await run_check(llm.add_lora, make_lora_request(10), [1, 8, 9, 10])

        # Remove LoRA 1 and continue adding.
        await run_check(llm.remove_lora, 1, [8, 9, 10])
        await run_check(llm.add_lora, make_lora_request(11), [8, 9, 10, 11])
        await run_check(llm.add_lora, make_lora_request(12), [12, 9, 10, 11])
        await run_check(llm.add_lora, make_lora_request(13), [12, 13, 10, 11])

        # Remove all LoRAs.
        await run_check(llm.remove_lora, 13, [12, 10, 11])
        await run_check(llm.remove_lora, 12, [10, 11])
        await run_check(llm.remove_lora, 11, [10])
        await run_check(llm.remove_lora, 10, [])