diff --git a/requirements-neuron.txt b/requirements-neuron.txt
index 6dbfa71eb99f..3f30ed08f037 100644
--- a/requirements-neuron.txt
+++ b/requirements-neuron.txt
@@ -5,5 +5,5 @@ torch-neuronx >= 2.1.0
 neuronx-cc
 fastapi
 uvicorn[standard]
-pydantic == 1.10.13 # Required for OpenAI server.
+pydantic >= 2.0 # Required for OpenAI server.
 aioprometheus[starlette]
diff --git a/requirements-rocm.txt b/requirements-rocm.txt
index fd537f9cd461..6b10eec4801a 100644
--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -9,5 +9,5 @@ tokenizers>=0.15.0
 transformers >= 4.36.0 # Required for Mixtral.
 fastapi
 uvicorn[standard]
-pydantic == 1.10.13 # Required for OpenAI server.
+pydantic >= 2.0 # Required for OpenAI server.
 aioprometheus[starlette]
diff --git a/requirements.txt b/requirements.txt
index cee7f190db31..a4819061bd67 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,5 +8,5 @@ transformers >= 4.36.0 # Required for Mixtral.
 xformers == 0.0.23.post1 # Required for CUDA 12.1.
 fastapi
 uvicorn[standard]
-pydantic == 1.10.13 # Required for OpenAI server.
+pydantic >= 2.0 # Required for OpenAI server.
 aioprometheus[starlette]
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index d652045c8ad7..b10e83903737 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -106,7 +106,7 @@ def parse_args():
 @app.exception_handler(RequestValidationError)
 async def validation_exception_handler(_, exc):
     err = openai_serving_chat.create_error_response(message=str(exc))
-    return JSONResponse(err.dict(), status_code=HTTPStatus.BAD_REQUEST)
+    return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST)
 
 
 @app.get("/health")
@@ -118,7 +118,7 @@ async def health() -> Response:
 @app.get("/v1/models")
 async def show_available_models():
     models = await openai_serving_chat.show_available_models()
-    return JSONResponse(content=models.dict())
+    return JSONResponse(content=models.model_dump())
 
 
 @app.post("/v1/chat/completions")
@@ -126,22 +126,28 @@ async def create_chat_completion(request: ChatCompletionRequest,
                                  raw_request: Request):
     generator = await openai_serving_chat.create_chat_completion(
         request, raw_request)
-    if request.stream and not isinstance(generator, ErrorResponse):
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(content=generator.model_dump(),
+                            status_code=generator.code)
+    if request.stream:
         return StreamingResponse(content=generator,
                                  media_type="text/event-stream")
     else:
-        return JSONResponse(content=generator.dict())
+        return JSONResponse(content=generator.model_dump())
 
 
 @app.post("/v1/completions")
 async def create_completion(request: CompletionRequest, raw_request: Request):
     generator = await openai_serving_completion.create_completion(
         request, raw_request)
-    if request.stream and not isinstance(generator, ErrorResponse):
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(content=generator.model_dump(),
+                            status_code=generator.code)
+    if request.stream:
         return StreamingResponse(content=generator,
                                  media_type="text/event-stream")
     else:
-        return JSONResponse(content=generator.dict())
+        return JSONResponse(content=generator.model_dump())
 
 
 if __name__ == "__main__":
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 6a24e7e9e951..fff94366a670 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -14,7 +14,7 @@ class ErrorResponse(BaseModel):
     message: str
     type: str
     param: Optional[str] = None
-    code: Optional[str] = None
+    code: int
 
 
 class ModelPermission(BaseModel):
@@ -189,7 +189,7 @@ class CompletionStreamResponse(BaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: List[CompletionResponseStreamChoice]
-    usage: Optional[UsageInfo]
+    usage: Optional[UsageInfo] = Field(default=None)
 
 
 class ChatMessage(BaseModel):
@@ -229,5 +229,4 @@ class ChatCompletionStreamResponse(BaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: List[ChatCompletionResponseStreamChoice]
-    usage: Optional[UsageInfo] = Field(
-        default=None, description="data about request and response")
+    usage: Optional[UsageInfo] = Field(default=None)
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 83d70e02919d..a9e4c355560b 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -102,7 +102,7 @@ async def chat_completion_stream_generator(
                 created=created_time,
                 choices=[choice_data],
                 model=model_name)
-            data = chunk.json(exclude_unset=True, ensure_ascii=False)
+            data = chunk.model_dump_json(exclude_unset=True)
             yield f"data: {data}\n\n"
 
         # Send response to echo the input portion of the last message
@@ -125,7 +125,7 @@
                     created=created_time,
                     choices=[choice_data],
                     model=model_name)
-                data = chunk.json(exclude_unset=True, ensure_ascii=False)
+                data = chunk.model_dump_json(exclude_unset=True)
                 yield f"data: {data}\n\n"
 
         # Send response for each token for each request.n (index)
@@ -156,7 +156,7 @@
                     created=created_time,
                     choices=[choice_data],
                     model=model_name)
-                data = chunk.json(exclude_unset=True, ensure_ascii=False)
+                data = chunk.model_dump_json(exclude_unset=True)
                 yield f"data: {data}\n\n"
             else:
                 # Send the finish response for each request.n only once
@@ -178,9 +178,8 @@
                     created=created_time,
                     model=model_name)
                 if final_usage is not None:
                     chunk.usage = final_usage
-                data = chunk.json(exclude_unset=True,
-                                  exclude_none=True,
-                                  ensure_ascii=False)
+                data = chunk.model_dump_json(exclude_unset=True,
+                                             exclude_none=True)
                 yield f"data: {data}\n\n"
                 finish_reason_sent[i] = True
         # Send the final done message after all response.n are finished
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index d668ed501b5c..7eaa7de4a8ac 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -74,7 +74,7 @@ async def completion_stream_generator(
                         logprobs=logprobs,
                         finish_reason=finish_reason,
                     )
-                ]).json(exclude_unset=True, ensure_ascii=False)
+                ]).model_dump_json(exclude_unset=True)
             yield f"data: {response_json}\n\n"
 
             if output.finish_reason is not None:
@@ -99,7 +99,7 @@
                         )
                     ],
                     usage=final_usage,
-                ).json(exclude_unset=True, ensure_ascii=False)
+                ).model_dump_json(exclude_unset=True)
                 yield f"data: {response_json}\n\n"
 
         yield "data: [DONE]\n\n"
@@ -279,7 +279,7 @@ async def create_completion(self, request: CompletionRequest,
         # When user requests streaming but we don't stream, we still need to
         # return a streaming response with a single event.
         if request.stream:
-            response_json = response.json(ensure_ascii=False)
+            response_json = response.model_dump_json()
 
             async def fake_stream_generator() -> AsyncGenerator[str, None]:
                 yield f"data: {response_json}\n\n"
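Most of the diff above is a mechanical pydantic v1 → v2 migration: `.dict()` becomes `.model_dump()`, `.json(...)` becomes `.model_dump_json(...)`, and `ensure_ascii=False` disappears because pydantic v2 emits UTF-8 JSON without ASCII escaping by default. Alongside that, `ErrorResponse.code` becomes an `int` so the API server can return it as the HTTP status code. The sketch below illustrates only the renamed serialization calls on a stand-in model; `DemoError` is a hypothetical class for this example and is not the `ErrorResponse` defined in `vllm/entrypoints/openai/protocol.py`.

```python
# Minimal sketch of the pydantic v2 calls the patch switches to.
# Assumes pydantic >= 2.0, matching the bumped requirements files.
from typing import Optional

from pydantic import BaseModel


class DemoError(BaseModel):  # hypothetical stand-in, not vLLM's ErrorResponse
    message: str
    type: str
    param: Optional[str] = None
    code: int = 400


err = DemoError(message="bad request", type="invalid_request_error")

# v1: err.dict()  ->  v2: err.model_dump()
print(err.model_dump())
# {'message': 'bad request', 'type': 'invalid_request_error', 'param': None, 'code': 400}

# v1: err.json(exclude_unset=True, ensure_ascii=False)
# v2: err.model_dump_json(exclude_unset=True)
#     (non-ASCII characters are emitted as-is, so ensure_ascii is no longer needed)
print(err.model_dump_json(exclude_unset=True))
# {"message":"bad request","type":"invalid_request_error"}
```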