diff --git a/.gitignore b/.gitignore
index 96b53ca2e..067e81903 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,9 +19,6 @@ goex/docker/misc/images.json
 
 ################## Berkley Function Call Leaderboard ##########################
 
-# Ignore API keys
-berkeley-function-call-leaderboard/function_credential_config.json
-
 # Ignore tree-sitter
 berkeley-function-call-leaderboard/eval_checker/tree-sitter-java
 berkeley-function-call-leaderboard/eval_checker/tree-sitter-javascript
diff --git a/berkeley-function-call-leaderboard/README.md b/berkeley-function-call-leaderboard/README.md
index ed7d67d38..85a905fcf 100644
--- a/berkeley-function-call-leaderboard/README.md
+++ b/berkeley-function-call-leaderboard/README.md
@@ -138,6 +138,8 @@ Below is *a table of models we support* to run our leaderboard evaluation agains
 |gpt-4o-2024-05-13| Prompt|
 |gpt-4o-mini-2024-07-18-FC | Function Calling|
 |gpt-4o-mini-2024-07-18 | Prompt|
+|o1-mini-2024-09-12 | Prompt|
+|o1-preview-2024-09-12 | Prompt|
 |google/gemma-7b-it 💻| Prompt|
 |meetkai/functionary-medium-v3.1-FC| Function Calling|
 |meetkai/functionary-small-{v3.1,v3.2}-FC| Function Calling|
@@ -269,6 +271,7 @@ Some companies have proposed some optimization strategies in their models' handl
 ## Changelog
 
 * [Sept 13, 2024] [#638](https://github.com/ShishirPatil/gorilla/pull/638): Fix prompt formatting issue for `THUDM/glm-4-9b-chat`.
+* [Sept 12, 2024] [#635](https://github.com/ShishirPatil/gorilla/pull/635): Add new models `o1-preview-2024-09-12` and `o1-mini-2024-09-12` to the leaderboard.
 * [Sept 8, 2024] [#627](https://github.com/ShishirPatil/gorilla/pull/627) Add new model `MadeAgents/Hammer-7b` to the leaderboard.
 * [Sept 7, 2024] [#626](https://github.com/ShishirPatil/gorilla/pull/626): Fix prompt format for Llama models.
 * [Sept 4, 2024] [#623](https://github.com/ShishirPatil/gorilla/pull/623): Fix decoding issue in the `NvidiaHandler`; remove duplicate `ArcticHandler` class.
diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py
index 3504862d8..9daa51f7d 100644
--- a/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py
+++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py
@@ -1,6 +1,6 @@
 class NoAPIKeyError(Exception):
     def __init__(self):
-        self.message = "❗️Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate."
+        self.message = "❗️Please fill in the API keys in the .env file. If you do not provide the API keys, the executable test category results will be inaccurate."
         super().__init__(self.message)
 
 
diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py
index 8c7876cdd..3f812d47c 100644
--- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py
+++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py
@@ -89,6 +89,18 @@
 ]
 
 MODEL_METADATA_MAPPING = {
+    "o1-preview-2024-09-12": [
+        "o1-preview-2024-09-12 (Prompt)",
+        "https://openai.com/index/introducing-openai-o1-preview/",
+        "OpenAI",
+        "Proprietary",
+    ],
+    "o1-mini-2024-09-12": [
+        "o1-mini-2024-09-12 (Prompt)",
+        "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
+        "OpenAI",
+        "Proprietary",
+    ],
     "gpt-4o-2024-08-06": [
         "GPT-4o-2024-08-06 (Prompt)",
         "https://openai.com/index/hello-gpt-4o/",
@@ -585,6 +597,8 @@
     "mistral-small-2402-FC-Auto": 1,
     "mistral-small-2402": 1,
     "mistral-tiny-2312": 0.25,
+    "o1-preview-2024-09-12": 15,
+    "o1-mini-2024-09-12": 3,
     "gpt-4o-2024-05-13-FC": 5,
     "gpt-4o-2024-05-13": 5,
     "gpt-4o-2024-08-06-FC": 2.5,
@@ -639,6 +653,8 @@
     "mistral-small-2402-FC-Any": 3,
     "mistral-small-2402-FC-Auto": 3,
     "mistral-tiny-2312": 0.25,
+    "o1-preview-2024-09-12": 60,
+    "o1-mini-2024-09-12": 12,
     "gpt-4o-2024-05-13-FC": 15,
     "gpt-4o-2024-05-13": 15,
     "gpt-4o-2024-08-06-FC": 10,
diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py b/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py
index ee28fe082..d583b3723 100644
--- a/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py
+++ b/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py
@@ -24,6 +24,8 @@
 handler_map = {
     "gorilla-openfunctions-v0": GorillaHandler,
     "gorilla-openfunctions-v2": GorillaHandler,
+    "o1-preview-2024-09-12": OpenAIHandler,
+    "o1-mini-2024-09-12": OpenAIHandler,
     "gpt-4o-2024-08-06": OpenAIHandler,
     "gpt-4o-2024-08-06-FC": OpenAIHandler,
     "gpt-4o-2024-05-13": OpenAIHandler,
diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py
index 5b7de41c2..8562faf09 100644
--- a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py
+++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py
@@ -6,6 +6,8 @@
     system_prompt_pre_processing_chat_model,
     func_doc_language_specific_pre_processing,
     ast_parse,
+    convert_system_prompt_into_user_prompt,
+    combine_consecutive_user_prompr,
 )
 from bfcl.model_handler.constant import (
     GORILLA_TO_OPENAPI,
@@ -31,16 +33,31 @@ def inference(self, prompt, functions, test_category):
         prompt = system_prompt_pre_processing_chat_model(
             prompt, DEFAULT_SYSTEM_PROMPT, functions
         )
+        # Special handling for o1-preview and o1-mini as they don't support system prompts yet
+        if "o1-preview" in self.model_name or "o1-mini" in self.model_name:
+            prompt = convert_system_prompt_into_user_prompt(prompt)
+            prompt = combine_consecutive_user_prompr(prompt)
         message = prompt
 
         start_time = time.time()
-        response = self.client.chat.completions.create(
-            messages=message,
-            model=self.model_name,
-            temperature=self.temperature,
-            max_tokens=self.max_tokens,
-            top_p=self.top_p,
-        )
+        # These two models have temperature and top_p fixed to 1, and max_tokens is not supported
+        # Beta limitation: https://platform.openai.com/docs/guides/reasoning/beta-limitations
+        if "o1-preview" in self.model_name or "o1-mini" in self.model_name:
+            response = self.client.chat.completions.create(
+                messages=message,
+                model=self.model_name,
+                temperature=1,
+                # max_tokens=self.max_tokens,
+                top_p=1,
+            )
+        else:
+            response = self.client.chat.completions.create(
+                messages=message,
+                model=self.model_name,
+                temperature=self.temperature,
+                max_tokens=self.max_tokens,
+                top_p=self.top_p,
+            )
         latency = time.time() - start_time
         result = response.choices[0].message.content
         metadata = {}
diff --git a/berkeley-function-call-leaderboard/function_credential_config.json b/berkeley-function-call-leaderboard/function_credential_config.json
deleted file mode 100644
index 9d36e9bbd..000000000
--- a/berkeley-function-call-leaderboard/function_credential_config.json
+++ /dev/null
@@ -1 +0,0 @@
-[{"RAPID-API-KEY" : ""},{"EXCHANGERATE-API-KEY" : ""},{"OMDB-API-KEY" : ""}, {"GEOCODE-API-KEY": ""}]
\ No newline at end of file
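
Note: the two helpers imported at the top of openai.py, convert_system_prompt_into_user_prompt and combine_consecutive_user_prompr (the trailing typo is how the identifier is actually spelled in bfcl/model_handler/utils.py), are not defined in this diff. Below is a minimal sketch of their intended behavior, assuming OpenAI-style message dicts of the form {"role": ..., "content": ...}; only the names come from the diff, the bodies are illustrative rather than the repository's actual implementation.

def convert_system_prompt_into_user_prompt(prompts: list[dict]) -> list[dict]:
    # o1-preview and o1-mini reject the "system" role during the beta
    # period, so re-label any system message as a user message.
    return [
        {"role": "user", "content": p["content"]} if p["role"] == "system" else p
        for p in prompts
    ]


def combine_consecutive_user_prompr(prompts: list[dict]) -> list[dict]:
    # After the conversion above, two user messages can end up adjacent;
    # merge them so the conversation alternates roles cleanly.
    combined: list[dict] = []
    for p in prompts:
        if combined and combined[-1]["role"] == "user" and p["role"] == "user":
            combined[-1]["content"] += "\n\n" + p["content"]
        else:
            combined.append(dict(p))
    return combined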
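Note: with function_credential_config.json deleted, the executable-category API keys move to a .env file (per the updated NoAPIKeyError message). A hedged sketch of the equivalent check follows, assuming the .env entries reuse the key names from the deleted JSON and that python-dotenv loads them; the helper name and this flow are assumptions, as the diff itself does not show how the .env file is read.

import os

from dotenv import load_dotenv

from bfcl.eval_checker.custom_exception import NoAPIKeyError

# The same four keys that lived in the deleted function_credential_config.json.
REQUIRED_KEYS = [
    "RAPID-API-KEY",
    "EXCHANGERATE-API-KEY",
    "OMDB-API-KEY",
    "GEOCODE-API-KEY",
]


def check_executable_api_keys() -> None:
    # Load .env from the working directory into os.environ, then verify every
    # key is present and non-empty, mirroring the intent of NoAPIKeyError.
    load_dotenv()
    if any(not os.getenv(key) for key in REQUIRED_KEYS):
        raise NoAPIKeyError()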