[Frontend] Support load & unload api endpoint
Jeffwan committed Jul 24, 2024
1 parent f20f147 · commit 140fbcf
Showing 2 changed files with 34 additions and 2 deletions.
2 changes: 1 addition & 1 deletion tests/entrypoints/llm/test_generate_multiple_loras.py
@@ -50,7 +50,7 @@ def zephyr_lora_files():
 @pytest.mark.skip_global_cleanup
 def test_multiple_lora_requests(llm: LLM, zephyr_lora_files):
     lora_request = [
-        LoRARequest(LORA_NAME, idx + 1, zephyr_lora_files)
+        LoRARequest(LORA_NAME + str(idx), idx + 1, zephyr_lora_files)
         for idx in range(len(PROMPTS))
     ]
     # Multiple SamplingParams should be matched with each prompt
34 changes: 33 additions & 1 deletion vllm/entrypoints/openai/api_server.py
@@ -29,8 +29,10 @@
                                               DetokenizeRequest,
                                               DetokenizeResponse,
                                               EmbeddingRequest, ErrorResponse,
+                                              LoadLoraAdapterRequest,
                                               TokenizeRequest,
-                                              TokenizeResponse)
+                                              TokenizeResponse,
+                                              UnloadLoraAdapterRequest)
 # yapf: enable
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
@@ -165,6 +167,36 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
     return JSONResponse(content=generator.model_dump())


+@router.post("/v1/load_lora_adapter")
+async def load_lora_adapter(request: LoadLoraAdapterRequest):
+    response = await openai_serving_chat.load_lora_adapter(request)
+    if isinstance(response, ErrorResponse):
+        return JSONResponse(content=response.model_dump(),
+                            status_code=response.code)
+
+    response = await openai_serving_completion.load_lora_adapter(request)
+    if isinstance(response, ErrorResponse):
+        return JSONResponse(content=response.model_dump(),
+                            status_code=response.code)
+
+    return Response(status_code=200)
+
+
+@router.post("/v1/unload_lora_adapter")
+async def unload_lora_adapter(request: UnloadLoraAdapterRequest):
+    response = await openai_serving_chat.unload_lora_adapter(request)
+    if isinstance(response, ErrorResponse):
+        return JSONResponse(content=response.model_dump(),
+                            status_code=response.code)
+
+    response = await openai_serving_completion.unload_lora_adapter(request)
+    if isinstance(response, ErrorResponse):
+        return JSONResponse(content=response.model_dump(),
+                            status_code=response.code)
+
+    return Response(status_code=200)
+
+
 def build_app(args):
     app = fastapi.FastAPI(lifespan=lifespan)
     app.include_router(router)
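For context, a minimal client sketch of the two new endpoints. This is not part of the commit: the server address is assumed, and the lora_name/lora_path field names come from the LoadLoraAdapterRequest and UnloadLoraAdapterRequest schemas in vllm/entrypoints/openai/protocol.py, which this diff imports but does not show.

# Hypothetical usage sketch (not part of this commit). Assumes a vLLM
# OpenAI-compatible server on localhost:8000 started with LoRA enabled,
# and that the request schemas accept lora_name (and lora_path for load).
import requests

BASE_URL = "http://localhost:8000"

# Dynamically register a LoRA adapter with the running server.
resp = requests.post(
    f"{BASE_URL}/v1/load_lora_adapter",
    json={"lora_name": "zephyr-lora",
          "lora_path": "/path/to/zephyr_lora_files"},
)
resp.raise_for_status()

# The adapter can now be referenced by name as the model in
# completion/chat requests until it is unloaded.
resp = requests.post(
    f"{BASE_URL}/v1/unload_lora_adapter",
    json={"lora_name": "zephyr-lora"},
)
resp.raise_for_status()

As the handlers above show, success is a bare 200 response with an empty body; failures return an ErrorResponse serialized as JSON with a matching status code.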
