From c96f1fe3321d3a6621a86da57e5036363ced1a39 Mon Sep 17 00:00:00 2001
From: Huanzhi Mao
Date: Thu, 12 Sep 2024 13:25:12 -0700
Subject: [PATCH 1/5] update openai handler

---
 .../model_handler/proprietary_model/openai.py | 31 ++++++++++++++-----
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py
index 5b7de41c2..4b9024f5a 100644
--- a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py
+++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py
@@ -6,6 +6,8 @@
     system_prompt_pre_processing_chat_model,
     func_doc_language_specific_pre_processing,
     ast_parse,
+    convert_system_prompt_into_user_prompt,
+    combine_consecutive_user_prompr,
 )
 from bfcl.model_handler.constant import (
     GORILLA_TO_OPENAPI,
@@ -31,16 +33,31 @@ def inference(self, prompt, functions, test_category):
         prompt = system_prompt_pre_processing_chat_model(
             prompt, DEFAULT_SYSTEM_PROMPT, functions
         )
+        # Special handling for o1-preview and o1-mini as they don't support system prompts yet
+        if self.model_name in ["o1-preview", "o1-mini"]:
+            prompt = convert_system_prompt_into_user_prompt(prompt)
+            prompt = combine_consecutive_user_prompr(prompt)
         message = prompt
 
         start_time = time.time()
-        response = self.client.chat.completions.create(
-            messages=message,
-            model=self.model_name,
-            temperature=self.temperature,
-            max_tokens=self.max_tokens,
-            top_p=self.top_p,
-        )
+        # These two models have temperature and top_p fixed to 1
+        # Beta limitation: https://platform.openai.com/docs/guides/reasoning/beta-limitations
+        if self.model_name in ["o1-preview", "o1-mini"]:
+            response = self.client.chat.completions.create(
+                messages=message,
+                model=self.model_name,
+                temperature=1,
+                max_tokens=self.max_tokens,
+                top_p=1,
+            )
+        else:
+            response = self.client.chat.completions.create(
+                messages=message,
+                model=self.model_name,
+                temperature=self.temperature,
+                max_tokens=self.max_tokens,
+                top_p=self.top_p,
+            )
         latency = time.time() - start_time
         result = response.choices[0].message.content
         metadata = {}
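A note on the two helpers patch 1 imports: o1-preview and o1-mini reject the system role, so the system prompt has to be demoted to a user turn and adjacent user turns merged before the request is sent. The sketch below is an illustrative reimplementation of those helpers, not the BFCL source (details such as the join separator are assumptions); the `combine_consecutive_user_prompr` spelling matches the imported name.

```python
# Illustrative reimplementations of the two helpers imported in PATCH 1/5.
# Messages use the OpenAI chat format: [{"role": ..., "content": ...}, ...].

def convert_system_prompt_into_user_prompt(messages: list) -> list:
    # Demote every "system" message to a "user" message, preserving order,
    # for models that reject the system role.
    return [
        {**m, "role": "user"} if m["role"] == "system" else m for m in messages
    ]

def combine_consecutive_user_prompr(messages: list) -> list:  # sic, name as imported
    # Merge runs of adjacent "user" messages into a single turn, since the
    # API rejects two user messages in a row once the system prompt is demoted.
    combined = []
    for m in messages:
        if combined and combined[-1]["role"] == "user" and m["role"] == "user":
            combined[-1]["content"] += "\n\n" + m["content"]
        else:
            combined.append(dict(m))
    return combined

messages = [
    {"role": "system", "content": "You are a function-calling assistant."},
    {"role": "user", "content": "What is the weather in Berkeley?"},
]
print(combine_consecutive_user_prompr(convert_system_prompt_into_user_prompt(messages)))
# -> a single user message containing both texts
```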
"gpt-4o-2024-05-13-FC": 5, "gpt-4o-2024-05-13": 5, "gpt-4o-2024-08-06-FC": 2.5, @@ -639,6 +653,8 @@ "mistral-small-2402-FC-Any": 3, "mistral-small-2402-FC-Auto": 3, "mistral-tiny-2312": 0.25, + "o1-preview-2024-09-12": 60, + "o1-mini-2024-09-12": 12, "gpt-4o-2024-05-13-FC": 15, "gpt-4o-2024-05-13": 15, "gpt-4o-2024-08-06-FC": 10, diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py b/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py index ee28fe082..d583b3723 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py @@ -24,6 +24,8 @@ handler_map = { "gorilla-openfunctions-v0": GorillaHandler, "gorilla-openfunctions-v2": GorillaHandler, + "o1-preview-2024-09-12": OpenAIHandler, + "o1-mini-2024-09-12": OpenAIHandler, "gpt-4o-2024-08-06": OpenAIHandler, "gpt-4o-2024-08-06-FC": OpenAIHandler, "gpt-4o-2024-05-13": OpenAIHandler, From 6c6abbb636b4270c41a03ba731e7f85ebb9b10b0 Mon Sep 17 00:00:00 2001 From: Huanzhi Mao Date: Thu, 12 Sep 2024 13:33:22 -0700 Subject: [PATCH 3/5] remove outdated function_credential_config.json --- .gitignore | 3 --- .../bfcl/eval_checker/custom_exception.py | 2 +- .../bfcl/model_handler/proprietary_model/openai.py | 2 ++ .../function_credential_config.json | 1 - 4 files changed, 3 insertions(+), 5 deletions(-) delete mode 100644 berkeley-function-call-leaderboard/function_credential_config.json diff --git a/.gitignore b/.gitignore index 96b53ca2e..067e81903 100644 --- a/.gitignore +++ b/.gitignore @@ -19,9 +19,6 @@ goex/docker/misc/images.json ################## Berkley Function Call Leaderboard ########################## -# Ignore API keys -berkeley-function-call-leaderboard/function_credential_config.json - # Ignore tree-sitter berkeley-function-call-leaderboard/eval_checker/tree-sitter-java berkeley-function-call-leaderboard/eval_checker/tree-sitter-javascript diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py index 3504862d8..9daa51f7d 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py @@ -1,6 +1,6 @@ class NoAPIKeyError(Exception): def __init__(self): - self.message = "❗️Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate." + self.message = "❗️Please fill in the API keys in the .env file. If you do not provide the API keys, the executable test category results will be inaccurate." 
From 6c6abbb636b4270c41a03ba731e7f85ebb9b10b0 Mon Sep 17 00:00:00 2001
From: Huanzhi Mao
Date: Thu, 12 Sep 2024 13:33:22 -0700
Subject: [PATCH 3/5] remove outdated function_credential_config.json

---
 .gitignore                                         | 3 ---
 .../bfcl/eval_checker/custom_exception.py          | 2 +-
 .../bfcl/model_handler/proprietary_model/openai.py | 2 ++
 .../function_credential_config.json                | 1 -
 4 files changed, 3 insertions(+), 5 deletions(-)
 delete mode 100644 berkeley-function-call-leaderboard/function_credential_config.json

diff --git a/.gitignore b/.gitignore
index 96b53ca2e..067e81903 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,9 +19,6 @@ goex/docker/misc/images.json
 
 ################## Berkley Function Call Leaderboard ##########################
 
-# Ignore API keys
-berkeley-function-call-leaderboard/function_credential_config.json
-
 # Ignore tree-sitter
 berkeley-function-call-leaderboard/eval_checker/tree-sitter-java
 berkeley-function-call-leaderboard/eval_checker/tree-sitter-javascript

diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py
index 3504862d8..9daa51f7d 100644
--- a/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py
+++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py
@@ -1,6 +1,6 @@
 class NoAPIKeyError(Exception):
     def __init__(self):
-        self.message = "❗️Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate."
+        self.message = "❗️Please fill in the API keys in the .env file. If you do not provide the API keys, the executable test category results will be inaccurate."
         super().__init__(self.message)

diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py
index 4b9024f5a..b04104cee 100644
--- a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py
+++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py
@@ -43,6 +43,8 @@ def inference(self, prompt, functions, test_category):
         # These two models have temperature and top_p fixed to 1
         # Beta limitation: https://platform.openai.com/docs/guides/reasoning/beta-limitations
         if self.model_name in ["o1-preview", "o1-mini"]:
+            # Rate limit workaround, 20 requests per minute even for top tier users
+            time.sleep(4)
             response = self.client.chat.completions.create(
                 messages=message,
                 model=self.model_name,

diff --git a/berkeley-function-call-leaderboard/function_credential_config.json b/berkeley-function-call-leaderboard/function_credential_config.json
deleted file mode 100644
index 9d36e9bbd..000000000
--- a/berkeley-function-call-leaderboard/function_credential_config.json
+++ /dev/null
@@ -1 +0,0 @@
-[{"RAPID-API-KEY" : ""},{"EXCHANGERATE-API-KEY" : ""},{"OMDB-API-KEY" : ""}, {"GEOCODE-API-KEY": ""}]
\ No newline at end of file
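Patch 3 finishes the migration from the checked-in function_credential_config.json to a git-ignored .env file, as the updated NoAPIKeyError message says. Below is a minimal sketch of the .env pattern, assuming python-dotenv; the key names are copied from the deleted JSON, and whether BFCL expects exactly these names in its .env is not shown in this diff.

```python
# Minimal sketch of .env-based key loading (an assumption; BFCL may read the
# file differently). Key names come from the deleted JSON file.
import os

from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv()  # reads KEY=value pairs from a local .env file into os.environ

REQUIRED_KEYS = [
    "RAPID-API-KEY",
    "EXCHANGERATE-API-KEY",
    "OMDB-API-KEY",
    "GEOCODE-API-KEY",
]

missing = [key for key in REQUIRED_KEYS if not os.getenv(key)]
if missing:
    # Same spirit as NoAPIKeyError: executable test categories need real keys.
    raise RuntimeError(f"Please fill in these API keys in the .env file: {missing}")
```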
From 64ae02b74c69fa93ca480fbe45bc187bafd7cdbe Mon Sep 17 00:00:00 2001
From: Huanzhi Mao
Date: Thu, 12 Sep 2024 13:38:22 -0700
Subject: [PATCH 4/5] update change log

---
 berkeley-function-call-leaderboard/README.md       | 3 +++
 .../bfcl/model_handler/proprietary_model/openai.py | 8 ++++----
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/berkeley-function-call-leaderboard/README.md b/berkeley-function-call-leaderboard/README.md
index ef84d6fd0..6823ad2af 100644
--- a/berkeley-function-call-leaderboard/README.md
+++ b/berkeley-function-call-leaderboard/README.md
@@ -138,6 +138,8 @@ Below is *a table of models we support* to run our leaderboard evaluation agains
 |gpt-4o-2024-05-13| Prompt|
 |gpt-4o-mini-2024-07-18-FC | Function Calling|
 |gpt-4o-mini-2024-07-18 | Prompt|
+|o1-mini-2024-09-12 | Prompt|
+|o1-preview-2024-09-12 | Prompt|
 |google/gemma-7b-it 💻| Prompt|
 |meetkai/functionary-medium-v3.1-FC| Function Calling|
 |meetkai/functionary-small-{v3.1,v3.2}-FC| Function Calling|
@@ -268,6 +270,7 @@ Some companies have proposed some optimization strategies in their models' handl
 
 ## Changelog
 
+* [Sept 12, 2024] [#635](https://github.com/ShishirPatil/gorilla/pull/635): Add new model `o1-preview-2024-09-12` and `o1-mini-2024-09-12` to the leaderboard.
 * [Sept 8, 2024] [#627](https://github.com/ShishirPatil/gorilla/pull/627) Add new model `MadeAgents/Hammer-7b` to the leaderboard.
 * [Sept 7, 2024] [#626](https://github.com/ShishirPatil/gorilla/pull/626): Fix prompt format for Llama models.
 * [Sept 4, 2024] [#623](https://github.com/ShishirPatil/gorilla/pull/623): Fix decoding issue in the `NvidiaHandler`; remove duplicate `ArcticHandler` class.

diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py
index b04104cee..e1ee701f1 100644
--- a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py
+++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py
@@ -34,22 +34,22 @@ def inference(self, prompt, functions, test_category):
             prompt, DEFAULT_SYSTEM_PROMPT, functions
         )
         # Special handling for o1-preview and o1-mini as they don't support system prompts yet
-        if self.model_name in ["o1-preview", "o1-mini"]:
+        if "o1-preview" in self.model_name or "o1-mini" in self.model_name:
             prompt = convert_system_prompt_into_user_prompt(prompt)
             prompt = combine_consecutive_user_prompr(prompt)
         message = prompt
 
         start_time = time.time()
-        # These two models have temperature and top_p fixed to 1
+        # These two models have temperature and top_p fixed to 1, and max_tokens is not supported
         # Beta limitation: https://platform.openai.com/docs/guides/reasoning/beta-limitations
-        if self.model_name in ["o1-preview", "o1-mini"]:
+        if "o1-preview" in self.model_name or "o1-mini" in self.model_name:
             # Rate limit workaround, 20 requests per minute even for top tier users
             time.sleep(4)
             response = self.client.chat.completions.create(
                 messages=message,
                 model=self.model_name,
                 temperature=1,
-                max_tokens=self.max_tokens,
+                # max_tokens=self.max_tokens,
                 top_p=1,
             )
         else:
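The switch from equality to substring checks in patch 4 matters because the leaderboard registers dated IDs such as o1-preview-2024-09-12, which `self.model_name in ["o1-preview", "o1-mini"]` never matched; the same patch also drops max_tokens, which these models reject. A hedged sketch of the resulting parameter selection follows — the helper itself is hypothetical, while the kwargs mirror the OpenAI chat.completions API.

```python
# Hypothetical helper illustrating the branching PATCH 4/5 arrives at.

def build_request_kwargs(
    model_name: str, temperature: float, top_p: float, max_tokens: int
) -> dict:
    kwargs = {"model": model_name}
    if "o1-preview" in model_name or "o1-mini" in model_name:
        # Beta limitations: temperature and top_p are fixed to 1, and
        # max_tokens is rejected outright, so it is omitted entirely.
        kwargs.update(temperature=1, top_p=1)
    else:
        kwargs.update(temperature=temperature, top_p=top_p, max_tokens=max_tokens)
    return kwargs

# Dated o1 IDs now hit the restricted branch; other models are unaffected.
assert "max_tokens" not in build_request_kwargs("o1-mini-2024-09-12", 0.7, 1.0, 1024)
assert build_request_kwargs("gpt-4o-2024-08-06", 0.7, 1.0, 1024)["max_tokens"] == 1024
```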
From 88fc6f1f99954272c1f0eee95634e32f2f2b672d Mon Sep 17 00:00:00 2001
From: Huanzhi Mao
Date: Thu, 12 Sep 2024 13:52:33 -0700
Subject: [PATCH 5/5] remove rate limit workaround

---
 berkeley-function-call-leaderboard/README.md       | 2 +-
 .../bfcl/model_handler/proprietary_model/openai.py | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/berkeley-function-call-leaderboard/README.md b/berkeley-function-call-leaderboard/README.md
index 6823ad2af..9a1a27ec5 100644
--- a/berkeley-function-call-leaderboard/README.md
+++ b/berkeley-function-call-leaderboard/README.md
@@ -270,7 +270,7 @@ Some companies have proposed some optimization strategies in their models' handl
 
 ## Changelog
 
-* [Sept 12, 2024] [#635](https://github.com/ShishirPatil/gorilla/pull/635): Add new model `o1-preview-2024-09-12` and `o1-mini-2024-09-12` to the leaderboard.
+* [Sept 12, 2024] [#635](https://github.com/ShishirPatil/gorilla/pull/635): Add new models `o1-preview-2024-09-12` and `o1-mini-2024-09-12` to the leaderboard.
 * [Sept 8, 2024] [#627](https://github.com/ShishirPatil/gorilla/pull/627) Add new model `MadeAgents/Hammer-7b` to the leaderboard.
 * [Sept 7, 2024] [#626](https://github.com/ShishirPatil/gorilla/pull/626): Fix prompt format for Llama models.
 * [Sept 4, 2024] [#623](https://github.com/ShishirPatil/gorilla/pull/623): Fix decoding issue in the `NvidiaHandler`; remove duplicate `ArcticHandler` class.

diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py
index e1ee701f1..8562faf09 100644
--- a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py
+++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py
@@ -43,8 +43,6 @@ def inference(self, prompt, functions, test_category):
         # These two models have temperature and top_p fixed to 1, and max_tokens is not supported
         # Beta limitation: https://platform.openai.com/docs/guides/reasoning/beta-limitations
         if "o1-preview" in self.model_name or "o1-mini" in self.model_name:
-            # Rate limit workaround, 20 requests per minute even for top tier users
-            time.sleep(4)
             response = self.client.chat.completions.create(
                 messages=message,
                 model=self.model_name,
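With the fixed time.sleep(4) removed in patch 5, requests go out at full speed and any 429 responses are left to the caller. If the 20-requests-per-minute beta limit cited in patch 3 still applies, a client-side limiter is one alternative to a hard-coded sleep; the sketch below is purely illustrative and not part of the BFCL codebase.

```python
# Illustrative minimum-interval limiter: spaces calls evenly instead of
# sleeping a fixed amount before every request.
import threading
import time

class MinIntervalLimiter:
    """Blocks so successive acquire() calls are at least `interval` seconds apart."""

    def __init__(self, requests_per_minute: int):
        self.interval = 60.0 / requests_per_minute
        self._lock = threading.Lock()
        self._next_allowed = 0.0

    def acquire(self) -> None:
        with self._lock:
            now = time.monotonic()
            wait = self._next_allowed - now
            # Reserve the next slot before sleeping so concurrent callers queue up.
            self._next_allowed = max(now, self._next_allowed) + self.interval
        if wait > 0:
            time.sleep(wait)

limiter = MinIntervalLimiter(requests_per_minute=20)  # the limit cited in PATCH 3/5
limiter.acquire()  # call once before each chat.completions.create(...)
```

Unlike the fixed sleep, this pays no penalty on the first request and adapts automatically if the rate limit constant changes.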