Commit

[BFCL] Add New Model o1-preview-2024-09-12 and o1-mini-2024-09-12 (ShishirPatil#635)

This PR adds new models `o1-preview-2024-09-12` and `o1-mini-2024-09-12`
to the leaderboard.
HuanzhiMao authored Sep 15, 2024
1 parent 46c3e85 commit 3f5ace7
Showing 7 changed files with 46 additions and 12 deletions.
3 changes: 0 additions & 3 deletions .gitignore
@@ -19,9 +19,6 @@ goex/docker/misc/images.json
 
 ################## Berkley Function Call Leaderboard ##########################
 
-# Ignore API keys
-berkeley-function-call-leaderboard/function_credential_config.json
-
 # Ignore tree-sitter
 berkeley-function-call-leaderboard/eval_checker/tree-sitter-java
 berkeley-function-call-leaderboard/eval_checker/tree-sitter-javascript
3 changes: 3 additions & 0 deletions berkeley-function-call-leaderboard/README.md
@@ -138,6 +138,8 @@ Below is *a table of models we support* to run our leaderboard evaluation agains
 |gpt-4o-2024-05-13| Prompt|
 |gpt-4o-mini-2024-07-18-FC | Function Calling|
 |gpt-4o-mini-2024-07-18 | Prompt|
+|o1-mini-2024-09-12 | Prompt|
+|o1-preview-2024-09-12 | Prompt|
 |google/gemma-7b-it 💻| Prompt|
 |meetkai/functionary-medium-v3.1-FC| Function Calling|
 |meetkai/functionary-small-{v3.1,v3.2}-FC| Function Calling|
@@ -269,6 +271,7 @@ Some companies have proposed some optimization strategies in their models' handl
 ## Changelog
 
 * [Sept 13, 2024] [#638](https://github.com/ShishirPatil/gorilla/pull/638): Fix prompt formatting issue for `THUDM/glm-4-9b-chat`.
+* [Sept 12, 2024] [#635](https://github.com/ShishirPatil/gorilla/pull/635): Add new models `o1-preview-2024-09-12` and `o1-mini-2024-09-12` to the leaderboard.
 * [Sept 8, 2024] [#627](https://github.com/ShishirPatil/gorilla/pull/627): Add new model `MadeAgents/Hammer-7b` to the leaderboard.
 * [Sept 7, 2024] [#626](https://github.com/ShishirPatil/gorilla/pull/626): Fix prompt format for Llama models.
 * [Sept 4, 2024] [#623](https://github.com/ShishirPatil/gorilla/pull/623): Fix decoding issue in the `NvidiaHandler`; remove duplicate `ArcticHandler` class.
@@ -1,6 +1,6 @@
 class NoAPIKeyError(Exception):
     def __init__(self):
-        self.message = "❗️Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate."
+        self.message = "❗️Please fill in the API keys in the .env file. If you do not provide the API keys, the executable test category results will be inaccurate."
         super().__init__(self.message)
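The hunk above moves API-key storage from `function_credential_config.json` to a `.env` file (typically loaded into the environment with a tool such as python-dotenv). A minimal sketch of how a key check around `NoAPIKeyError` might look — the `require_api_key` helper is an illustration, not code from this PR:

```python
import os

class NoAPIKeyError(Exception):
    def __init__(self):
        self.message = (
            "❗️Please fill in the API keys in the .env file. If you do not provide "
            "the API keys, the executable test category results will be inaccurate."
        )
        super().__init__(self.message)

def require_api_key(var_name: str) -> str:
    # Hypothetical helper: read a key from the environment (populated from .env)
    # and fail loudly if it is missing, so executable tests are not silently skipped.
    value = os.getenv(var_name, "")
    if not value:
        raise NoAPIKeyError()
    return value
```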
@@ -89,6 +89,18 @@
 ]
 
 MODEL_METADATA_MAPPING = {
+    "o1-preview-2024-09-12": [
+        "o1-preview-2024-09-12 (Prompt)",
+        "https://openai.com/index/introducing-openai-o1-preview/",
+        "OpenAI",
+        "Proprietary",
+    ],
+    "o1-mini-2024-09-12": [
+        "o1-mini-2024-09-12 (Prompt)",
+        "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
+        "OpenAI",
+        "Proprietary",
+    ],
     "gpt-4o-2024-08-06": [
         "GPT-4o-2024-08-06 (Prompt)",
         "https://openai.com/index/hello-gpt-4o/",
@@ -585,6 +597,8 @@
     "mistral-small-2402-FC-Auto": 1,
     "mistral-small-2402": 1,
     "mistral-tiny-2312": 0.25,
+    "o1-preview-2024-09-12": 15,
+    "o1-mini-2024-09-12": 3,
     "gpt-4o-2024-05-13-FC": 5,
     "gpt-4o-2024-05-13": 5,
     "gpt-4o-2024-08-06-FC": 2.5,
@@ -639,6 +653,8 @@
     "mistral-small-2402-FC-Any": 3,
     "mistral-small-2402-FC-Auto": 3,
     "mistral-tiny-2312": 0.25,
+    "o1-preview-2024-09-12": 60,
+    "o1-mini-2024-09-12": 12,
     "gpt-4o-2024-05-13-FC": 15,
     "gpt-4o-2024-05-13": 15,
     "gpt-4o-2024-08-06-FC": 10,
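The two dictionaries extended above map model names to USD prices per million input and output tokens (15/60 for `o1-preview`, 3/12 for `o1-mini`). A sketch of how a per-call cost could be derived from such tables — the dict and function names below are illustrative, not the leaderboard's actual API:

```python
# Illustrative per-million-token prices (USD), mirroring the entries added in the diff.
INPUT_PRICE_PER_MILLION = {
    "o1-preview-2024-09-12": 15,
    "o1-mini-2024-09-12": 3,
}
OUTPUT_PRICE_PER_MILLION = {
    "o1-preview-2024-09-12": 60,
    "o1-mini-2024-09-12": 12,
}

def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Estimate the USD cost of a single call from its token counts."""
    return (
        input_tokens * INPUT_PRICE_PER_MILLION[model]
        + output_tokens * OUTPUT_PRICE_PER_MILLION[model]
    ) / 1_000_000
```

For example, a call to `o1-preview-2024-09-12` with 1,000 input and 2,000 output tokens would cost about $0.135 under these prices.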
@@ -24,6 +24,8 @@
 handler_map = {
     "gorilla-openfunctions-v0": GorillaHandler,
     "gorilla-openfunctions-v2": GorillaHandler,
+    "o1-preview-2024-09-12": OpenAIHandler,
+    "o1-mini-2024-09-12": OpenAIHandler,
     "gpt-4o-2024-08-06": OpenAIHandler,
     "gpt-4o-2024-08-06-FC": OpenAIHandler,
     "gpt-4o-2024-05-13": OpenAIHandler,
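`handler_map` routes each model name to the handler class that knows how to call it; both new o1 models reuse the existing `OpenAIHandler`. A minimal sketch of this dispatch pattern with stand-in classes (the real handlers' constructor signatures are not shown in this diff):

```python
# Stand-in handler classes to illustrate the name-to-class dispatch in handler_map.
class OpenAIHandler:
    def __init__(self, model_name: str):
        self.model_name = model_name

class GorillaHandler:
    def __init__(self, model_name: str):
        self.model_name = model_name

handler_map = {
    "gorilla-openfunctions-v2": GorillaHandler,
    "o1-preview-2024-09-12": OpenAIHandler,
    "o1-mini-2024-09-12": OpenAIHandler,
}

def build_handler(model_name: str):
    # Look up the handler class registered for this model and instantiate it.
    handler_cls = handler_map[model_name]
    return handler_cls(model_name)
```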
@@ -6,6 +6,8 @@
     system_prompt_pre_processing_chat_model,
     func_doc_language_specific_pre_processing,
     ast_parse,
+    convert_system_prompt_into_user_prompt,
+    combine_consecutive_user_prompr,
 )
 from bfcl.model_handler.constant import (
     GORILLA_TO_OPENAPI,
@@ -31,16 +33,31 @@ def inference(self, prompt, functions, test_category):
         prompt = system_prompt_pre_processing_chat_model(
             prompt, DEFAULT_SYSTEM_PROMPT, functions
         )
+        # Special handling for o1-preview and o1-mini as they don't support system prompts yet
+        if "o1-preview" in self.model_name or "o1-mini" in self.model_name:
+            prompt = convert_system_prompt_into_user_prompt(prompt)
+            prompt = combine_consecutive_user_prompr(prompt)
         message = prompt
 
         start_time = time.time()
-        response = self.client.chat.completions.create(
-            messages=message,
-            model=self.model_name,
-            temperature=self.temperature,
-            max_tokens=self.max_tokens,
-            top_p=self.top_p,
-        )
+        # These two models have temperature and top_p fixed to 1, and max_tokens is not supported
+        # Beta limitation: https://platform.openai.com/docs/guides/reasoning/beta-limitations
+        if "o1-preview" in self.model_name or "o1-mini" in self.model_name:
+            response = self.client.chat.completions.create(
+                messages=message,
+                model=self.model_name,
+                temperature=1,
+                # max_tokens=self.max_tokens,
+                top_p=1,
+            )
+        else:
+            response = self.client.chat.completions.create(
+                messages=message,
+                model=self.model_name,
+                temperature=self.temperature,
+                max_tokens=self.max_tokens,
+                top_p=self.top_p,
+            )
         latency = time.time() - start_time
         result = response.choices[0].message.content
         metadata = {}
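Because the o1 beta rejects system messages, the handler imports two helpers to rewrite the prompt before sending it. Their bodies are not shown in this diff; the sketch below is a plausible implementation under that assumption (note the repo spells the second helper `combine_consecutive_user_prompr`):

```python
def convert_system_prompt_into_user_prompt(messages):
    # Re-label every system message as a user message, preserving order,
    # since o1-preview/o1-mini do not accept the "system" role.
    return [
        {**m, "role": "user"} if m["role"] == "system" else m
        for m in messages
    ]

def combine_consecutive_user_prompts(messages):
    # Merge runs of adjacent user messages into one message, since the API
    # expects alternating roles rather than back-to-back user turns.
    combined = []
    for m in messages:
        if combined and combined[-1]["role"] == "user" and m["role"] == "user":
            combined[-1]["content"] += "\n\n" + m["content"]
        else:
            combined.append(dict(m))
    return combined
```

Applied in sequence, a `[system, user]` conversation collapses into a single user message carrying both texts.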

This file was deleted.
