From 140fbcfdcd19caf1348a579f1abc90caa680aec7 Mon Sep 17 00:00:00 2001 From: Jiaxin Shan Date: Wed, 24 Jul 2024 11:40:33 -0700 Subject: [PATCH] [Frontend] Support load & unload api endpoint --- .../llm/test_generate_multiple_loras.py | 2 +- vllm/entrypoints/openai/api_server.py | 34 ++++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py index 35eabf079964a..9f5727ecd0406 100644 --- a/tests/entrypoints/llm/test_generate_multiple_loras.py +++ b/tests/entrypoints/llm/test_generate_multiple_loras.py @@ -50,7 +50,7 @@ def zephyr_lora_files(): @pytest.mark.skip_global_cleanup def test_multiple_lora_requests(llm: LLM, zephyr_lora_files): lora_request = [ - LoRARequest(LORA_NAME, idx + 1, zephyr_lora_files) + LoRARequest(LORA_NAME + str(idx), idx + 1, zephyr_lora_files) for idx in range(len(PROMPTS)) ] # Multiple SamplingParams should be matched with each prompt diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index add5c91900b23..f0af261a5ff49 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -29,8 +29,10 @@ DetokenizeRequest, DetokenizeResponse, EmbeddingRequest, ErrorResponse, + LoadLoraAdapterRequest, TokenizeRequest, - TokenizeResponse) + TokenizeResponse, + UnloadLoraAdapterRequest) # yapf: enable from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion @@ -165,6 +167,36 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): return JSONResponse(content=generator.model_dump()) +@router.post("/v1/load_lora_adapter") +async def load_lora_adapter(request: LoadLoraAdapterRequest): + response = await openai_serving_chat.load_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) + + response = await openai_serving_completion.load_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) + + return Response(status_code=200) + + +@router.post("/v1/unload_lora_adapter") +async def unload_lora_adapter(request: UnloadLoraAdapterRequest): + response = await openai_serving_chat.unload_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) + + response = await openai_serving_completion.unload_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) + + return Response(status_code=200) + + def build_app(args): app = fastapi.FastAPI(lifespan=lifespan) app.include_router(router)