From d0b07bb82ebfe825c616bc95d6da35103ea942c9 Mon Sep 17 00:00:00 2001
From: Simon Mo
Date: Wed, 27 Aug 2025 13:27:17 -0700
Subject: [PATCH 1/2] refactor streaming output loops

---
 vllm/entrypoints/cli/openai.py | 68 +++++++++++++++++++++-------------
 1 file changed, 42 insertions(+), 26 deletions(-)

diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py
index 7c01de94a343..cf125e31a725 100644
--- a/vllm/entrypoints/cli/openai.py
+++ b/vllm/entrypoints/cli/openai.py
@@ -45,6 +45,28 @@ def _interactive_cli(args: argparse.Namespace) -> tuple[str, OpenAI]:
     return model_name, openai_client
 
 
+def _print_chat_stream(stream) -> str:
+    output = ""
+    for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.content:
+            output += delta.content
+            print(delta.content, end="", flush=True)
+    print()
+    return output
+
+
+def _print_completion_stream(stream) -> str:
+    output = ""
+    for chunk in stream:
+        text = chunk.choices[0].text
+        if text is not None:
+            output += text
+            print(text, end="", flush=True)
+    print()
+    return output
+
+
 def chat(system_prompt: str | None, model_name: str, client: OpenAI) -> None:
     conversation: list[ChatCompletionMessageParam] = []
     if system_prompt is not None:
@@ -58,14 +80,10 @@ def chat(system_prompt: str | None, model_name: str, client: OpenAI) -> None:
             break
         conversation.append({"role": "user", "content": input_message})
 
-        chat_completion = client.chat.completions.create(model=model_name,
-                                                         messages=conversation)
-
-        response_message = chat_completion.choices[0].message
-        output = response_message.content
-
-        conversation.append(response_message)  # type: ignore
-        print(output)
+        stream = client.chat.completions.create(
+            model=model_name, messages=conversation, stream=True)
+        output = _print_chat_stream(stream)
+        conversation.append({"role": "assistant", "content": output})
 
 
 def _add_query_options(
@@ -108,9 +126,10 @@ def cmd(args: argparse.Namespace) -> None:
         if args.quick:
             conversation.append({"role": "user", "content": args.quick})
 
-            chat_completion = client.chat.completions.create(
-                model=model_name, messages=conversation)
-            print(chat_completion.choices[0].message.content)
+            stream = client.chat.completions.create(
+                model=model_name, messages=conversation, stream=True)
+            output = _print_chat_stream(stream)
+            conversation.append({"role": "assistant", "content": output})
             return
 
         print("Please enter a message for the chat model:")
@@ -121,14 +140,10 @@ def cmd(args: argparse.Namespace) -> None:
                 break
             conversation.append({"role": "user", "content": input_message})
 
-            chat_completion = client.chat.completions.create(
-                model=model_name, messages=conversation)
-
-            response_message = chat_completion.choices[0].message
-            output = response_message.content
-
-            conversation.append(response_message)  # type: ignore
-            print(output)
+            stream = client.chat.completions.create(
+                model=model_name, messages=conversation, stream=True)
+            output = _print_chat_stream(stream)
+            conversation.append({"role": "assistant", "content": output})
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
@@ -168,9 +183,10 @@ def cmd(args: argparse.Namespace) -> None:
         model_name, client = _interactive_cli(args)
 
         if args.quick:
-            completion = client.completions.create(model=model_name,
-                                                   prompt=args.quick)
-            print(completion.choices[0].text)
+            stream = client.completions.create(model=model_name,
+                                               prompt=args.quick,
+                                               stream=True)
+            _print_completion_stream(stream)
             return
 
         print("Please enter prompt to complete:")
@@ -179,10 +195,10 @@ def cmd(args: argparse.Namespace) -> None:
                 input_prompt = input("> ")
             except EOFError:
                 break
-            completion = client.completions.create(model=model_name,
-                                                   prompt=input_prompt)
-            output = completion.choices[0].text
-            print(output)
+            stream = client.completions.create(model=model_name,
+                                               prompt=input_prompt,
+                                               stream=True)
+            _print_completion_stream(stream)
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:

From 70fcd699a4621c73723e0b89179ee6f80f52fe91 Mon Sep 17 00:00:00 2001
From: simon-mo
Date: Wed, 17 Sep 2025 16:12:50 -0700
Subject: [PATCH 2/2] lint

Signed-off-by: simon-mo
---
 vllm/entrypoints/cli/openai.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py
index cf125e31a725..1929d6a7f77a 100644
--- a/vllm/entrypoints/cli/openai.py
+++ b/vllm/entrypoints/cli/openai.py
@@ -80,8 +80,9 @@ def chat(system_prompt: str | None, model_name: str, client: OpenAI) -> None:
             break
         conversation.append({"role": "user", "content": input_message})
 
-        stream = client.chat.completions.create(
-            model=model_name, messages=conversation, stream=True)
+        stream = client.chat.completions.create(model=model_name,
+                                                messages=conversation,
+                                                stream=True)
         output = _print_chat_stream(stream)
         conversation.append({"role": "assistant", "content": output})
 
@@ -126,8 +127,9 @@ def cmd(args: argparse.Namespace) -> None:
         if args.quick:
             conversation.append({"role": "user", "content": args.quick})
 
-            stream = client.chat.completions.create(
-                model=model_name, messages=conversation, stream=True)
+            stream = client.chat.completions.create(model=model_name,
+                                                    messages=conversation,
+                                                    stream=True)
             output = _print_chat_stream(stream)
             conversation.append({"role": "assistant", "content": output})
             return
@@ -140,8 +142,9 @@ def cmd(args: argparse.Namespace) -> None:
                 break
             conversation.append({"role": "user", "content": input_message})
 
-            stream = client.chat.completions.create(
-                model=model_name, messages=conversation, stream=True)
+            stream = client.chat.completions.create(model=model_name,
+                                                    messages=conversation,
+                                                    stream=True)
             output = _print_chat_stream(stream)
             conversation.append({"role": "assistant", "content": output})
 