diff --git a/src/litserve/specs/openai.py b/src/litserve/specs/openai.py
index 558689f6..ebcd2880 100644
--- a/src/litserve/specs/openai.py
+++ b/src/litserve/specs/openai.py
@@ -365,7 +365,7 @@ async def chat_completion(self, request: ChatCompletionRequest, background_tasks
         if request.stream:
             return StreamingResponse(
                 self.streaming_completion(request, responses),
-                media_type="application/x-ndjson",
+                media_type="text/event-stream",
                 background=background_tasks,
             )
 
@@ -394,9 +394,9 @@ async def streaming_completion(self, request: ChatCompletionRequest, pipe_respon
 
             # Only use the last item from encode_response
             usage_info = sum(usage_infos)
-            chunk = ChatCompletionChunk(model=model, choices=choices, usage=None).json()
+            chunk = ChatCompletionChunk(model=model, choices=choices, usage=None)
             logger.debug(chunk)
-            yield f"data: {chunk}\n\n"
+            yield f"data: {chunk.model_dump_json()}\n\n"
 
         choices = [
             ChatCompletionStreamingChoice(
@@ -410,8 +410,8 @@ async def streaming_completion(self, request: ChatCompletionRequest, pipe_respon
             model=model,
             choices=choices,
             usage=usage_info,
-        ).json()
-        yield f"data: {last_chunk}\n\n"
+        )
+        yield f"data: {last_chunk.model_dump_json()}\n\n"
         yield "data: [DONE]\n\n"
 
     async def non_streaming_completion(self, request: ChatCompletionRequest, generator_list: List[AsyncGenerator]):
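
For context, the patch above switches the streaming response to the standard Server-Sent Events media type (text/event-stream) and replaces the Pydantic v1 .json() calls with the v2 model_dump_json() method, serializing each chunk only at the point it is yielded. The sketch below is a minimal, illustrative client for the resulting stream, not part of the patch: the host/port, model name, and message payload are assumptions, and the OpenAI-style /v1/chat/completions route is assumed to be the one this spec exposes.

# Minimal client-side sketch (illustration only): consume the SSE stream
# emitted by streaming_completion(). URL, model name, and payload below
# are assumptions, not values taken from this repository.
import json

import httpx

payload = {
    "model": "my-model",  # hypothetical model name
    "messages": [{"role": "user", "content": "Hello"}],
    "stream": True,
}

# Assumes the server runs locally and exposes the OpenAI-style route.
url = "http://localhost:8000/v1/chat/completions"

with httpx.stream("POST", url, json=payload, timeout=None) as response:
    for line in response.iter_lines():
        # Each event is framed as "data: <json>" followed by a blank line.
        if not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":  # sentinel yielded after the final usage chunk
            break
        chunk = json.loads(data)
        delta = chunk["choices"][0]["delta"].get("content") or ""
        print(delta, end="", flush=True)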