2 changes: 1 addition & 1 deletion requirements-neuron.txt
@@ -5,5 +5,5 @@ torch-neuronx >= 2.1.0
neuronx-cc
fastapi
uvicorn[standard]
-pydantic == 1.10.13 # Required for OpenAI server.
+pydantic >= 2.0 # Required for OpenAI server.
aioprometheus[starlette]
2 changes: 1 addition & 1 deletion requirements-rocm.txt
@@ -9,5 +9,5 @@ tokenizers>=0.15.0
transformers >= 4.36.0 # Required for Mixtral.
fastapi
uvicorn[standard]
-pydantic == 1.10.13 # Required for OpenAI server.
+pydantic >= 2.0 # Required for OpenAI server.
aioprometheus[starlette]
2 changes: 1 addition & 1 deletion requirements.txt
@@ -8,5 +8,5 @@ transformers >= 4.36.0 # Required for Mixtral.
xformers == 0.0.23.post1 # Required for CUDA 12.1.
fastapi
uvicorn[standard]
-pydantic == 1.10.13 # Required for OpenAI server.
+pydantic >= 2.0 # Required for OpenAI server.
aioprometheus[starlette]
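
Note: all three requirements files above replace the pydantic == 1.10.13 pin with pydantic >= 2.0; the remaining files in this diff swap the pydantic v1 serialization calls for their v2 equivalents (.dict() -> .model_dump(), .json(...) -> .model_dump_json(...)). A minimal sketch of the renamed API, using a throwaway model that is not part of this PR:

# Illustrative only -- a toy model, not one of vLLM's protocol classes.
from pydantic import BaseModel


class Demo(BaseModel):
    name: str
    score: float = 0.0


d = Demo(name="mixtral")

# pydantic v1 spelling            pydantic v2 spelling
# d.dict()                    ->  d.model_dump()
# d.json(exclude_unset=True)  ->  d.model_dump_json(exclude_unset=True)
print(d.model_dump())                         # {'name': 'mixtral', 'score': 0.0}
print(d.model_dump_json(exclude_unset=True))  # {"name":"mixtral"}
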
18 changes: 12 additions & 6 deletions vllm/entrypoints/openai/api_server.py
@@ -106,7 +106,7 @@ def parse_args():
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(_, exc):
err = openai_serving_chat.create_error_response(message=str(exc))
-return JSONResponse(err.dict(), status_code=HTTPStatus.BAD_REQUEST)
+return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST)


@app.get("/health")
@@ -118,30 +118,36 @@ async def health() -> Response:
@app.get("/v1/models")
async def show_available_models():
models = await openai_serving_chat.show_available_models()
-return JSONResponse(content=models.dict())
+return JSONResponse(content=models.model_dump())


@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
raw_request: Request):
generator = await openai_serving_chat.create_chat_completion(
request, raw_request)
-if request.stream and not isinstance(generator, ErrorResponse):
+if isinstance(generator, ErrorResponse):
+    return JSONResponse(content=generator.model_dump(),
+                        status_code=generator.code)
+if request.stream:
return StreamingResponse(content=generator,
media_type="text/event-stream")
else:
-return JSONResponse(content=generator.dict())
+return JSONResponse(content=generator.model_dump())


@app.post("/v1/completions")
async def create_completion(request: CompletionRequest, raw_request: Request):
generator = await openai_serving_completion.create_completion(
request, raw_request)
-if request.stream and not isinstance(generator, ErrorResponse):
+if isinstance(generator, ErrorResponse):
+    return JSONResponse(content=generator.model_dump(),
+                        status_code=generator.code)
+if request.stream:
return StreamingResponse(content=generator,
media_type="text/event-stream")
else:
-return JSONResponse(content=generator.dict())
+return JSONResponse(content=generator.model_dump())


if __name__ == "__main__":
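
Note: beyond the .dict() -> .model_dump() rename, the two handlers above now check for ErrorResponse before the stream check, so a failed request is answered with JSON and the error's own status code instead of falling through the old "request.stream and not isinstance(...)" condition. A self-contained sketch of that branch order, with toy names (ToyError, /toy/completions) standing in for vLLM's:

# Sketch of the new branch order: error first (with its own status code),
# then streaming, then plain JSON. ToyError and the endpoint are stand-ins,
# not vLLM code.
from typing import AsyncGenerator, Union

from fastapi import FastAPI
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel

app = FastAPI()


class ToyError(BaseModel):
    message: str
    code: int  # mirrors ErrorResponse.code being used as the HTTP status


async def toy_event_stream() -> AsyncGenerator[str, None]:
    yield "data: {\"text\": \"hello\"}\n\n"
    yield "data: [DONE]\n\n"


@app.post("/toy/completions")
async def toy_completions(prompt: str = "", stream: bool = False):
    result: Union[ToyError, str] = (
        ToyError(message="prompt must not be empty", code=400)
        if not prompt else "hello")
    if isinstance(result, ToyError):          # handled before the stream check
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)
    if stream:
        return StreamingResponse(content=toy_event_stream(),
                                 media_type="text/event-stream")
    return JSONResponse(content={"text": result})
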
7 changes: 3 additions & 4 deletions vllm/entrypoints/openai/protocol.py
@@ -14,7 +14,7 @@ class ErrorResponse(BaseModel):
message: str
type: str
param: Optional[str] = None
-code: Optional[str] = None
+code: int


class ModelPermission(BaseModel):
@@ -189,7 +189,7 @@ class CompletionStreamResponse(BaseModel):
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[CompletionResponseStreamChoice]
-usage: Optional[UsageInfo]
+usage: Optional[UsageInfo] = Field(default=None)


class ChatMessage(BaseModel):
@@ -229,5 +229,4 @@ class ChatCompletionStreamResponse(BaseModel):
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseStreamChoice]
-usage: Optional[UsageInfo] = Field(
-    default=None, description="data about request and response")
+usage: Optional[UsageInfo] = Field(default=None)
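
Note: the usage changes above track a pydantic v2 behavior change: in v1 an Optional[X] annotation implicitly defaulted to None, while in v2 it is a required field unless a default is supplied, hence the explicit Field(default=None). A small illustration with a toy model (not from this PR):

# Under pydantic >= 2.0, Optional[...] without a default is *required*.
from typing import Optional

from pydantic import BaseModel, Field, ValidationError


class StreamChunk(BaseModel):
    model: str
    usage: Optional[dict] = Field(default=None)  # explicit default, as in the PR


class StrictChunk(BaseModel):
    model: str
    usage: Optional[dict]  # no default -> required under v2


print(StreamChunk(model="demo").usage)  # None; the field may be omitted

try:
    StrictChunk(model="demo")
except ValidationError as e:
    print("usage is required without an explicit default:", e.errors()[0]["type"])
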
11 changes: 5 additions & 6 deletions vllm/entrypoints/openai/serving_chat.py
@@ -102,7 +102,7 @@ async def chat_completion_stream_generator(
created=created_time,
choices=[choice_data],
model=model_name)
-data = chunk.json(exclude_unset=True, ensure_ascii=False)
+data = chunk.model_dump_json(exclude_unset=True)
yield f"data: {data}\n\n"

# Send response to echo the input portion of the last message
@@ -125,7 +125,7 @@ async def chat_completion_stream_generator(
created=created_time,
choices=[choice_data],
model=model_name)
-data = chunk.json(exclude_unset=True, ensure_ascii=False)
+data = chunk.model_dump_json(exclude_unset=True)
yield f"data: {data}\n\n"

# Send response for each token for each request.n (index)
@@ -156,7 +156,7 @@ async def chat_completion_stream_generator(
created=created_time,
choices=[choice_data],
model=model_name)
-data = chunk.json(exclude_unset=True, ensure_ascii=False)
+data = chunk.model_dump_json(exclude_unset=True)
yield f"data: {data}\n\n"
else:
# Send the finish response for each request.n only once
@@ -178,9 +178,8 @@
model=model_name)
if final_usage is not None:
chunk.usage = final_usage
-data = chunk.json(exclude_unset=True,
-                  exclude_none=True,
-                  ensure_ascii=False)
+data = chunk.model_dump_json(exclude_unset=True,
+                             exclude_none=True)
yield f"data: {data}\n\n"
finish_reason_sent[i] = True
# Send the final done message after all response.n are finished
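
Note: model_dump_json has no ensure_ascii argument; pydantic v2 serializes through pydantic-core, which (to the best of my understanding) emits non-ASCII characters verbatim rather than \u-escaping them, so dropping ensure_ascii=False above should not change the streamed bytes. A quick throwaway check, not part of the PR:

# Sanity check: pydantic v2 JSON output keeps non-ASCII text unescaped.
from pydantic import BaseModel


class Delta(BaseModel):
    content: str


chunk = Delta(content="こんにちは")
print(chunk.model_dump_json())  # expected: {"content":"こんにちは"}, no \u escapes
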
6 changes: 3 additions & 3 deletions vllm/entrypoints/openai/serving_completion.py
@@ -74,7 +74,7 @@ async def completion_stream_generator(
logprobs=logprobs,
finish_reason=finish_reason,
)
-]).json(exclude_unset=True, ensure_ascii=False)
+]).model_dump_json(exclude_unset=True)
yield f"data: {response_json}\n\n"

if output.finish_reason is not None:
@@ -99,7 +99,7 @@
)
],
usage=final_usage,
-).json(exclude_unset=True, ensure_ascii=False)
+).model_dump_json(exclude_unset=True)
yield f"data: {response_json}\n\n"

yield "data: [DONE]\n\n"
@@ -279,7 +279,7 @@ async def create_completion(self, request: CompletionRequest,
# When user requests streaming but we don't stream, we still need to
# return a streaming response with a single event.
if request.stream:
-response_json = response.json(ensure_ascii=False)
+response_json = response.model_dump_json()

async def fake_stream_generator() -> AsyncGenerator[str, None]:
yield f"data: {response_json}\n\n"
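
Note: the streamed completion above is plain server-sent events: each chunk is a "data: <json>" line followed by a blank line, and the stream ends with "data: [DONE]". A hedged client-side sketch for consuming it; the URL, payload fields, and the requests dependency are illustrative assumptions, not part of this PR:

# Toy SSE consumer for a streaming /v1/completions request.
import json

import requests  # assumed available; any streaming HTTP client works

resp = requests.post(
    "http://localhost:8000/v1/completions",   # assumed server address
    json={"model": "some-model", "prompt": "Hello", "stream": True},
    stream=True,
)
for line in resp.iter_lines(decode_unicode=True):
    if not line or not line.startswith("data: "):
        continue                              # skip blank separators
    payload = line[len("data: "):]
    if payload == "[DONE]":                   # end-of-stream sentinel
        break
    chunk = json.loads(payload)
    print(chunk["choices"][0]["text"], end="", flush=True)
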