From 749fb52d20ae091516366ee63c8fb7dadfebc3c2 Mon Sep 17 00:00:00 2001
From: Alvaro Mateos
Date: Sat, 28 Oct 2023 20:16:49 +0200
Subject: [PATCH 1/4] Enable streaming support for openai ChatCompletion #217

---
 autogen/oai/chat_completion_proxy.py | 57 ++++++++++++++++++++++++++++
 autogen/oai/completion.py            |  3 +-
 setup.py                             |  1 +
 3 files changed, 60 insertions(+), 1 deletion(-)
 create mode 100644 autogen/oai/chat_completion_proxy.py

diff --git a/autogen/oai/chat_completion_proxy.py b/autogen/oai/chat_completion_proxy.py
new file mode 100644
index 000000000000..5ac64e5ce9af
--- /dev/null
+++ b/autogen/oai/chat_completion_proxy.py
@@ -0,0 +1,57 @@
+import openai
+import tiktoken
+
+class ChatCompletionProxy():
+    @classmethod
+    def _prompt_tokens(cls, messages):
+        # Get the encoding for OpenAI's "cl100k_base" model
+        encoding = tiktoken.get_encoding("cl100k_base")
+
+        # Calculate the total number of tokens in the prompt
+        # by iterating over each message in the 'messages' list,
+        # encoding its content, and summing up the token counts.
+        return sum([len(encoding.encode(msg['content'])) for msg in messages])
+
+    @classmethod
+    def create(cls, *args, **kwargs):
+        # Check if streaming is enabled in the function arguments
+        if kwargs.get("stream", False):
+            response_content = ""
+            completion_tokens = 0
+
+            # Set the terminal text color to green for better visibility
+            print("\033[32m", end='')
+
+            # Send the chat completion request to OpenAI's API and process the response in chunks
+            for chunk in openai.ChatCompletion.create(*args, **kwargs):
+                if chunk["choices"]:
+                    content = chunk["choices"][0].get("delta", {}).get("content")
+                    # If content is present, print it to the terminal and update response variables
+                    if content is not None:
+                        print(content, end='', flush=True)
+                        response_content += content
+                        completion_tokens += 1
+
+            # Reset the terminal text color
+            print("\033[0m\n")
+
+            # Prepare the final response object based on the accumulated data
+            response = chunk
+            response["choices"][0]["message"] = {
+                'role': 'assistant',
+                'content': response_content
+            }
+
+            prompt_tokens = cls._prompt_tokens(kwargs["messages"])
+            # Add usage information to the response
+            response["usage"] = {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens
+            }
+        else:
+            # If streaming is not enabled, send a regular chat completion request
+            response = openai.ChatCompletion.create(*args, **kwargs)
+
+        # Return the final response object
+        return response
diff --git a/autogen/oai/completion.py b/autogen/oai/completion.py
index a720ccc24466..38d8cde4dee9 100644
--- a/autogen/oai/completion.py
+++ b/autogen/oai/completion.py
@@ -9,6 +9,7 @@
 from flaml.tune.space import is_constant
 from flaml.automl.logger import logger_formatter
 from .openai_utils import get_key
+from .chat_completion_proxy import ChatCompletionProxy
 from collections import defaultdict
 
 try:
@@ -207,7 +208,7 @@ def _get_response(cls, config: Dict, raise_on_ratelimit_or_timeout=False, use_ca
                 cls._book_keeping(config, response)
                 return response
         openai_completion = (
-            openai.ChatCompletion
+            ChatCompletionProxy  # Support streaming for chat models
             if config["model"].replace("gpt-35-turbo", "gpt-3.5-turbo") in cls.chat_models
             or issubclass(cls, ChatCompletion)
             else openai.Completion
diff --git a/setup.py b/setup.py
index d3308bf1ccc6..2b89fc8e291d 100644
--- a/setup.py
+++ b/setup.py
@@ -15,6 +15,7 @@
 install_requires = [
     "openai<1",
+    "tiktoken",
     "diskcache",
     "termcolor",
     "flaml",

From d72a86828b2f95116f92b56a5c9f213803ec834f Mon Sep 17 00:00:00 2001
From: Alvaro Mateos
Date: Mon, 30 Oct 2023 22:27:33 +0100
Subject: [PATCH 2/4] Improve streaming implementation (openai v0.28)

---
 autogen/oai/chat_completion_proxy.py | 50 +++++++++++++++++++---------
 1 file changed, 34 insertions(+), 16 deletions(-)

diff --git a/autogen/oai/chat_completion_proxy.py b/autogen/oai/chat_completion_proxy.py
index 5ac64e5ce9af..5a1d105715f1 100644
--- a/autogen/oai/chat_completion_proxy.py
+++ b/autogen/oai/chat_completion_proxy.py
@@ -1,6 +1,8 @@
 import openai
 import tiktoken
 
+from openai.openai_object import OpenAIObject
+
 class ChatCompletionProxy():
     @classmethod
     def _prompt_tokens(cls, messages):
@@ -15,8 +17,10 @@ def _prompt_tokens(cls, messages):
     @classmethod
     def create(cls, *args, **kwargs):
         # Check if streaming is enabled in the function arguments
-        if kwargs.get("stream", False):
-            response_content = ""
+        if kwargs.get('stream', False) and 'functions' not in kwargs:
+            # Prepare response array based on parameter 'n'
+            response_contents = [""] * kwargs.get('n', 1)
+            finish_reasons = [""] * kwargs.get('n', 1)
             completion_tokens = 0
 
             # Set the terminal text color to green for better visibility
@@ -25,32 +29,46 @@ def create(cls, *args, **kwargs):
             # Send the chat completion request to OpenAI's API and process the response in chunks
             for chunk in openai.ChatCompletion.create(*args, **kwargs):
                 if chunk["choices"]:
-                    content = chunk["choices"][0].get("delta", {}).get("content")
-                    # If content is present, print it to the terminal and update response variables
-                    if content is not None:
-                        print(content, end='', flush=True)
-                        response_content += content
-                        completion_tokens += 1
+                    for choice in chunk["choices"]:
+                        content = choice.get("delta", {}).get("content")
+                        # If content is present, print it to the terminal and update response variables
+                        if content is not None:
+                            print(content, end='', flush=True)
+                            response_contents[choice.index] += content
+                            finish_reasons[choice.index] = choice.get("finish_reasons", None)
+                            completion_tokens += 1
+                        else:
+                            print()
 
             # Reset the terminal text color
             print("\033[0m\n")
 
             # Prepare the final response object based on the accumulated data
-            response = chunk
-            response["choices"][0]["message"] = {
-                'role': 'assistant',
-                'content': response_content
-            }
-
             prompt_tokens = cls._prompt_tokens(kwargs["messages"])
-            # Add usage information to the response
-            response["usage"] = {
+            response = OpenAIObject()
+            response.id = chunk['id']
+            response.object = 'chat.completion'
+            response.created = chunk['created']
+            response.model = chunk['model']
+            response.choices = []
+            response.usage = {
                 "prompt_tokens": prompt_tokens,
                 "completion_tokens": completion_tokens,
                 "total_tokens": prompt_tokens + completion_tokens
             }
+            for i in range(len(response_contents)):
+                response["choices"].append({
+                    "index": i,
+                    "finish_reason": finish_reasons[i],
+                    "message": {
+                        'role': 'assistant',
+                        'content': response_contents[i]
+                    }
+                })
         else:
             # If streaming is not enabled, send a regular chat completion request
+            # Ensure streaming is disabled
+            kwargs['stream'] = False
             response = openai.ChatCompletion.create(*args, **kwargs)
 
         # Return the final response object

From 1ee148c998891262919f0ec26cc01445f73a254b Mon Sep 17 00:00:00 2001
From: Alvaro Mateos
Date: Mon, 30 Oct 2023 22:31:42 +0100
Subject: [PATCH 3/4] Improve streaming implementation (openai v0.28)

---
 autogen/oai/chat_completion_proxy.py | 50 +++++++++++++++++++---------
 1 file changed, 34 insertions(+), 16 deletions(-)

diff --git a/autogen/oai/chat_completion_proxy.py b/autogen/oai/chat_completion_proxy.py
index 5ac64e5ce9af..5a1d105715f1 100644
--- a/autogen/oai/chat_completion_proxy.py
+++ b/autogen/oai/chat_completion_proxy.py
@@ -1,6 +1,8 @@
 import openai
 import tiktoken
 
+from openai.openai_object import OpenAIObject
+
 class ChatCompletionProxy():
     @classmethod
     def _prompt_tokens(cls, messages):
@@ -15,8 +17,10 @@ def _prompt_tokens(cls, messages):
     @classmethod
     def create(cls, *args, **kwargs):
         # Check if streaming is enabled in the function arguments
-        if kwargs.get("stream", False):
-            response_content = ""
+        if kwargs.get('stream', False) and 'functions' not in kwargs:
+            # Prepare response array based on parameter 'n'
+            response_contents = [""] * kwargs.get('n', 1)
+            finish_reasons = [""] * kwargs.get('n', 1)
             completion_tokens = 0
 
             # Set the terminal text color to green for better visibility
@@ -25,32 +29,46 @@ def create(cls, *args, **kwargs):
             # Send the chat completion request to OpenAI's API and process the response in chunks
             for chunk in openai.ChatCompletion.create(*args, **kwargs):
                 if chunk["choices"]:
-                    content = chunk["choices"][0].get("delta", {}).get("content")
-                    # If content is present, print it to the terminal and update response variables
-                    if content is not None:
-                        print(content, end='', flush=True)
-                        response_content += content
-                        completion_tokens += 1
+                    for choice in chunk["choices"]:
+                        content = choice.get("delta", {}).get("content")
+                        # If content is present, print it to the terminal and update response variables
+                        if content is not None:
+                            print(content, end='', flush=True)
+                            response_contents[choice.index] += content
+                            finish_reasons[choice.index] = choice.get("finish_reasons", None)
+                            completion_tokens += 1
+                        else:
+                            print()
 
             # Reset the terminal text color
             print("\033[0m\n")
 
             # Prepare the final response object based on the accumulated data
-            response = chunk
-            response["choices"][0]["message"] = {
-                'role': 'assistant',
-                'content': response_content
-            }
-
             prompt_tokens = cls._prompt_tokens(kwargs["messages"])
-            # Add usage information to the response
-            response["usage"] = {
+            response = OpenAIObject()
+            response.id = chunk['id']
+            response.object = 'chat.completion'
+            response.created = chunk['created']
+            response.model = chunk['model']
+            response.choices = []
+            response.usage = {
                 "prompt_tokens": prompt_tokens,
                 "completion_tokens": completion_tokens,
                 "total_tokens": prompt_tokens + completion_tokens
             }
+            for i in range(len(response_contents)):
+                response["choices"].append({
+                    "index": i,
+                    "finish_reason": finish_reasons[i],
+                    "message": {
+                        'role': 'assistant',
+                        'content': response_contents[i]
+                    }
+                })
         else:
             # If streaming is not enabled, send a regular chat completion request
+            # Ensure streaming is disabled
+            kwargs['stream'] = False
             response = openai.ChatCompletion.create(*args, **kwargs)
 
         # Return the final response object

From 3afc70dafe58bfc01dbde35d28c3fbc990179b38 Mon Sep 17 00:00:00 2001
From: Alvaro Mateos
Date: Tue, 31 Oct 2023 11:23:47 +0100
Subject: [PATCH 4/4] Fixed code formatting issues with pre-commit

---
 autogen/oai/chat_completion_proxy.py | 52 ++++++++++++++--------------
 autogen/oai/completion.py            |  2 +-
 2 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/autogen/oai/chat_completion_proxy.py b/autogen/oai/chat_completion_proxy.py
index 5a1d105715f1..a8f6c443400a 100644
--- a/autogen/oai/chat_completion_proxy.py
+++ b/autogen/oai/chat_completion_proxy.py
@@ -3,29 +3,30 @@
 
 from openai.openai_object import OpenAIObject
 
-class ChatCompletionProxy():
+
+class ChatCompletionProxy:
     @classmethod
     def _prompt_tokens(cls, messages):
         # Get the encoding for OpenAI's "cl100k_base" model
         encoding = tiktoken.get_encoding("cl100k_base")
-
+
         # Calculate the total number of tokens in the prompt
         # by iterating over each message in the 'messages' list,
        # encoding its content, and summing up the token counts.
-        return sum([len(encoding.encode(msg['content'])) for msg in messages])
+        return sum([len(encoding.encode(msg["content"])) for msg in messages])
 
     @classmethod
     def create(cls, *args, **kwargs):
         # Check if streaming is enabled in the function arguments
-        if kwargs.get('stream', False) and 'functions' not in kwargs:
+        if kwargs.get("stream", False) and "functions" not in kwargs:
             # Prepare response array based on parameter 'n'
-            response_contents = [""] * kwargs.get('n', 1)
-            finish_reasons = [""] * kwargs.get('n', 1)
+            response_contents = [""] * kwargs.get("n", 1)
+            finish_reasons = [""] * kwargs.get("n", 1)
             completion_tokens = 0
-
+
             # Set the terminal text color to green for better visibility
-            print("\033[32m", end='')
-
+            print("\033[32m", end="")
+
             # Send the chat completion request to OpenAI's API and process the response in chunks
             for chunk in openai.ChatCompletion.create(*args, **kwargs):
                 if chunk["choices"]:
@@ -33,43 +34,42 @@ def create(cls, *args, **kwargs):
                         content = choice.get("delta", {}).get("content")
                         # If content is present, print it to the terminal and update response variables
                         if content is not None:
-                            print(content, end='', flush=True)
+                            print(content, end="", flush=True)
                             response_contents[choice.index] += content
                             finish_reasons[choice.index] = choice.get("finish_reasons", None)
                             completion_tokens += 1
                         else:
                             print()
-
+
             # Reset the terminal text color
             print("\033[0m\n")
-
+
             # Prepare the final response object based on the accumulated data
             prompt_tokens = cls._prompt_tokens(kwargs["messages"])
             response = OpenAIObject()
-            response.id = chunk['id']
-            response.object = 'chat.completion'
-            response.created = chunk['created']
-            response.model = chunk['model']
+            response.id = chunk["id"]
+            response.object = "chat.completion"
+            response.created = chunk["created"]
+            response.model = chunk["model"]
             response.choices = []
             response.usage = {
                 "prompt_tokens": prompt_tokens,
                 "completion_tokens": completion_tokens,
-                "total_tokens": prompt_tokens + completion_tokens
+                "total_tokens": prompt_tokens + completion_tokens,
             }
             for i in range(len(response_contents)):
-                response["choices"].append({
-                    "index": i,
-                    "finish_reason": finish_reasons[i],
-                    "message": {
-                        'role': 'assistant',
-                        'content': response_contents[i]
+                response["choices"].append(
+                    {
+                        "index": i,
+                        "finish_reason": finish_reasons[i],
+                        "message": {"role": "assistant", "content": response_contents[i]},
                     }
-                })
+                )
         else:
             # If streaming is not enabled, send a regular chat completion request
             # Ensure streaming is disabled
-            kwargs['stream'] = False
+            kwargs["stream"] = False
             response = openai.ChatCompletion.create(*args, **kwargs)
-
+
         # Return the final response object
         return response
diff --git a/autogen/oai/completion.py b/autogen/oai/completion.py
index 38d8cde4dee9..18239842a324 100644
--- a/autogen/oai/completion.py
+++ b/autogen/oai/completion.py
@@ -208,7 +208,7 @@ def _get_response(cls, config: Dict, raise_on_ratelimit_or_timeout=False, use_ca
                 cls._book_keeping(config, response)
                 return response
         openai_completion = (
-            ChatCompletionProxy # Support streaming for chat models
+            ChatCompletionProxy  # Support streaming for chat models
             if config["model"].replace("gpt-35-turbo", "gpt-3.5-turbo") in cls.chat_models
             or issubclass(cls, ChatCompletion)
             else openai.Completion
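
A minimal usage sketch of the streaming proxy introduced by this series, not part of the patches themselves. It assumes openai<1 and tiktoken are installed and OPENAI_API_KEY is set in the environment; the model name and prompt are illustrative only.

    from autogen.oai.chat_completion_proxy import ChatCompletionProxy

    # With stream=True (and no "functions" in kwargs) the proxy prints tokens to the
    # terminal as they arrive, then returns a chat.completion-shaped object whose
    # usage is estimated: prompt tokens via tiktoken, completion tokens per streamed chunk.
    response = ChatCompletionProxy.create(
        model="gpt-3.5-turbo",  # illustrative model name
        messages=[{"role": "user", "content": "Say hello in one short sentence."}],
        stream=True,
    )

    print(response["choices"][0]["message"]["content"])
    print(response["usage"])

Calls without stream=True, or with "functions" present, fall through to a regular non-streaming openai.ChatCompletion.create call, so existing callers behave as before.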