BFCL May 14th Release (GPT-4o and Gemini) (ShishirPatil#426)
This PR makes 3 models (4 entries) available for inference on BFCL:
- gpt-4o-2024-05-13 (Function Calling Mode and Prompting Mode)
- gemini-1.5-pro-preview-0514 (Function Calling Mode)
- gemini-1.5-flash-preview-0514 (Function Calling Mode)

You can start the evaluation by running `python
openfunctions_evaluation.py --model MODEL_NAME` and get the score by running
`python ./eval_runner.py --model MODEL_NAME`. For more details, refer to
the README under the BFCL page.

Score changes are reflected in ShishirPatil#428 . 

This PR also updates pricing for several models:
- For Gemini, the new Gemini series' prices are roughly halved when
prompts are under 128K tokens (https://ai.google.dev/pricing). All BFCL
test cases are under 128K tokens.
- For Anthropic models, the prices of claude-2.1 and claude-instant-1.2
have decreased and are updated accordingly.
- For Mistral models, the prices of Mistral-large and Mistral-Small have
been halved.
- For OpenAI models, the price of GPT-3.5-turbo-0125 has been corrected.
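The per-million-token prices above feed directly into the leaderboard's cost calculation. As a rough sketch of how the updated rates translate into a per-request cost (the dict literals copy two of the updated values from this PR; `estimate_cost` is an illustrative helper, not part of the repo):

```python
# Per-million-token prices (USD) for two of the models touched by this PR,
# copied from the updated leaderboard constants.
INPUT_PRICE_PER_MILLION_TOKEN = {
    "gpt-4o-2024-05-13": 5,
    "gemini-1.5-pro-preview-0514": 3.5,
}
OUTPUT_PRICE_PER_MILLION_TOKEN = {
    "gpt-4o-2024-05-13": 15,
    "gemini-1.5-pro-preview-0514": 10.50,
}


def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Illustrative helper: dollar cost of a single request."""
    return (
        input_tokens / 1e6 * INPUT_PRICE_PER_MILLION_TOKEN[model]
        + output_tokens / 1e6 * OUTPUT_PRICE_PER_MILLION_TOKEN[model]
    )


# e.g. a 1,000-token prompt with a 500-token completion on gpt-4o
print(estimate_cost("gpt-4o-2024-05-13", 1000, 500))
```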

---------

Co-authored-by: Huanzhi Mao <huanzhimao@gmail.com>
Fanjia-Yan and HuanzhiMao authored May 15, 2024
1 parent a3de1d9 commit 4a36ac7
Showing 5 changed files with 92 additions and 43 deletions.
16 changes: 15 additions & 1 deletion berkeley-function-call-leaderboard/README.md
@@ -198,12 +198,15 @@ Below is *a table of models we support* to run our leaderboard evaluation against
|deepseek-ai/deepseek-coder-6.7b-instruct 💻| Prompt|
|fire-function-v1-FC | Function Calling|
|gemini-1.0-pro | Function Calling|
|gemini-1.5-pro-preview-0409 | Function Calling|
|gemini-1.5-pro-preview-{0409,0514} | Function Calling|
|gemini-1.5-flash-preview-0514 | Function Calling|
|glaiveai/glaive-function-calling-v1 💻| Function Calling|
|gpt-3.5-turbo-0125-FC| Function Calling|
|gpt-3.5-turbo-0125| Prompt|
|gpt-4-{0613,1106-preview,0125-preview,turbo-2024-04-09}-FC| Function Calling|
|gpt-4-{0613,1106-preview,0125-preview,turbo-2024-04-09}| Prompt|
|gpt-4o-2024-05-13-FC | Function Calling|
|gpt-4o-2024-05-13| Prompt|
|google/gemma-7b-it 💻| Prompt|
|meetkai/functionary-{small,medium}-v2.4-FC| Function Calling|
|meetkai/functionary-small-v2.2-FC| Function Calling|
@@ -232,6 +235,17 @@ For inferencing `Databrick-DBRX-instruct`, you need to create a Databrick Azure

## Changelog

* [May 14, 2024] [#426](https://github.com/ShishirPatil/gorilla/pull/426):
- Add the following new models to the leaderboard:
+ `gpt-4o-2024-05-13`
+ `gpt-4o-2024-05-13-FC`
+ `gemini-1.5-pro-preview-0514`
+ `gemini-1.5-flash-preview-0514`
- Update price for the following models:
+ All Gemini Series
+ `Claude-2.1 (Prompt)` and `Claude-instant-1.2 (Prompt)`
+ `Mistral-large` and `Mistral-Small`
+ `GPT-3.5-Turbo-0125`
* [May 8, 2024] [#406](https://github.com/ShishirPatil/gorilla/pull/406) and [#421](https://github.com/ShishirPatil/gorilla/pull/421): Update the `gemini_handler.py` to better handle parallel function calls for Gemini models.
* [May 6, 2024] [#412](https://github.com/ShishirPatil/gorilla/pull/412): Bug fix in evaluation dataset for AST categories. This includes updates to both prompts and function docs.
* [May 2, 2024] [#405](https://github.com/ShishirPatil/gorilla/pull/405): Bug fix in the possible answers for the AST Simple evaluation dataset. Prompt and function docs are not affected.
@@ -44,6 +44,18 @@

# Note that we don't need to substitute `_` with `/` in the model name here.
MODEL_METADATA_MAPPING = {
"gpt-4o-2024-05-13-FC": [
"GPT-4o-2024-05-13 (FC)",
"https://openai.com/index/hello-gpt-4o/",
"OpenAI",
"Proprietary",
],
"gpt-4o-2024-05-13": [
"GPT-4o-2024-05-13 (Prompt)",
"https://openai.com/index/hello-gpt-4o/",
"OpenAI",
"Proprietary",
],
"gpt-4-1106-preview-FC": [
"GPT-4-1106-Preview (FC)",
"https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
@@ -230,8 +242,26 @@
"Fireworks",
"Apache 2.0",
],
"gemini-1.5-pro-preview-0514": [
"Gemini-1.5-Pro-Preview-0514 (FC)",
"https://deepmind.google/technologies/gemini/pro/",
"Google",
"Proprietary",
],
"gemini-1.5-flash-preview-0514": [
"Gemini-1.5-Flash-Preview-0514 (FC)",
"https://deepmind.google/technologies/gemini/flash/",
"Google",
"Proprietary",
],
"gemini-1.5-pro-preview-0409": [
"Gemini-1.5-Pro-Preview-0409 (FC)",
"https://deepmind.google/technologies/gemini/#introduction",
"Google",
"Proprietary",
],
"gemini-1.0-pro": [
"Gemini-1.0-Pro (FC)",
"Gemini-1.0-Pro-001 (FC)",
"https://deepmind.google/technologies/gemini/#introduction",
"Google",
"Proprietary",
@@ -278,12 +308,6 @@
"NousResearch",
"apache-2.0",
],
"gemini-1.5-pro-preview-0409": [
"Gemini-1.5-Pro (FC)",
"https://deepmind.google/technologies/gemini/#introduction",
"Google",
"Proprietary",
],
"meta-llama_Meta-Llama-3-8B-Instruct": [
"Meta-Llama-3-8B-Instruct (Prompt)",
"https://llama.meta.com/llama3",
@@ -335,15 +359,17 @@
"claude-3-sonnet-20240229": 3,
"claude-3-haiku-20240307-FC": 0.25,
"claude-3-haiku-20240307": 0.25,
"claude-2.1": 11.02,
"claude-instant-1.2": 1.63,
"mistral-large-2402-FC-Any": 8,
"mistral-large-2402-FC-Auto": 8,
"claude-2.1": 8,
"claude-instant-1.2": 0.8,
"mistral-large-2402-FC-Any": 4,
"mistral-large-2402-FC-Auto": 4,
"mistral-medium-2312": 2.7,
"mistral-small-2402-FC-Any": 2,
"mistral-small-2402-FC-Auto": 2,
"mistral-small-2402": 2,
"mistral-small-2402-FC-Any": 1,
"mistral-small-2402-FC-Auto": 1,
"mistral-small-2402": 1,
"mistral-tiny-2312": 0.25,
"gpt-4o-2024-05-13-FC": 5,
"gpt-4o-2024-05-13": 5,
"gpt-4-1106-preview-FC": 10,
"gpt-4-1106-preview": 10,
"gpt-4-0125-preview": 10,
@@ -352,10 +378,12 @@
"gpt-4-turbo-2024-04-09": 10,
"gpt-4-0613": 30,
"gpt-4-0613-FC": 30,
"gpt-3.5-turbo-0125": 1.5,
"gpt-3.5-turbo-0125-FC": 1.5,
"gemini-1.0-pro": 1,
"gemini-1.5-pro-preview-0409": 7,
"gpt-3.5-turbo-0125": 0.5,
"gpt-3.5-turbo-0125-FC": 0.5,
"gemini-1.0-pro": 0.5,
"gemini-1.5-pro-preview-0409": 3.5,
"gemini-1.5-pro-preview-0514": 3.5,
"gemini-1.5-flash-preview-0514": 0.35,
"databricks-dbrx-instruct": 2.25,
"command-r-plus-FC": 3,
"command-r-plus": 3,
@@ -370,15 +398,17 @@
"claude-3-sonnet-20240229": 15,
"claude-3-haiku-20240307-FC": 1.25,
"claude-3-haiku-20240307": 1.25,
"claude-2.1": 32.68,
"claude-instant-1.2": 5.51,
"mistral-large-2402-FC-Any": 24,
"mistral-large-2402-FC-Auto": 24,
"mistral-small-2402": 24,
"claude-2.1": 24,
"claude-instant-1.2": 2.4,
"mistral-large-2402-FC-Any": 12,
"mistral-large-2402-FC-Auto": 12,
"mistral-small-2402": 3,
"mistral-medium-2312": 8.1,
"mistral-small-2402-FC-Any": 6,
"mistral-small-2402-FC-Auto": 6,
"mistral-small-2402-FC-Any": 3,
"mistral-small-2402-FC-Auto": 3,
"mistral-tiny-2312": 0.25,
"gpt-4o-2024-05-13-FC": 15,
"gpt-4o-2024-05-13": 15,
"gpt-4-turbo-2024-04-09-FC": 30,
"gpt-4-turbo-2024-04-09": 30,
"gpt-4-1106-preview": 30,
@@ -387,10 +417,12 @@
"gpt-4-0125-preview": 30,
"gpt-4-0613": 60,
"gpt-4-0613-FC": 60,
"gpt-3.5-turbo-0125": 2,
"gpt-3.5-turbo-0125-FC": 2,
"gemini-1.0-pro": 2,
"gemini-1.5-pro-preview-0409": 14,
"gpt-3.5-turbo-0125": 1.5,
"gpt-3.5-turbo-0125-FC": 1.5,
"gemini-1.0-pro": 1.5,
"gemini-1.5-pro-preview-0409": 10.50,
"gemini-1.5-pro-preview-0514": 10.50,
"gemini-1.5-flash-preview-0514": 0.53,
"databricks-dbrx-instruct": 6.75,
"command-r-plus-FC": 15,
"command-r-plus": 15,
3 changes: 3 additions & 0 deletions berkeley-function-call-leaderboard/model_handler/constant.py
@@ -116,6 +116,7 @@

# If there is any underscore in folder name, you should change it to `/` in the following strings
UNDERSCORE_TO_DOT = [
"gpt-4o-2024-05-13-FC",
"gpt-4-turbo-2024-04-09-FC",
"gpt-4-1106-preview-FC",
"gpt-4-0125-preview-FC",
@@ -132,6 +133,8 @@
"mistral-small-2402-FC",
"gemini-1.0-pro",
"gemini-1.5-pro-preview-0409",
"gemini-1.5-pro-preview-0514",
"gemini-1.5-flash-preview-0514",
"meetkai/functionary-small-v2.2-FC",
"meetkai/functionary-medium-v2.2-FC",
"meetkai/functionary-small-v2.4-FC",
22 changes: 9 additions & 13 deletions berkeley-function-call-leaderboard/model_handler/gemini_handler.py
@@ -76,25 +76,21 @@ def _query_gemini(self, user_query, functions):
else:
parts.append(part["text"])
result = parts
# This try-except is necessary because sometimes `result["candidates"][0]` does not have the key "content"
except Exception as e:
result = f"Parsing error: {e}"

metatdata = {}
try:
metatdata = {}
metatdata["input_tokens"] = json.loads(response.content)["usageMetadata"][
"promptTokenCount"
]
except:
metatdata["input_tokens"] = 0 # We special handle the 0 value when aggregating the results. 0 token will be ignored and not be counted in the average.
try:
metatdata["output_tokens"] = json.loads(response.content)["usageMetadata"][
"candidatesTokenCount"
]
except:
metatdata["output_tokens"] = 0 # We special handle the 0 value when aggregating the results. 0 token will be ignored and not be counted in the average.

metatdata["latency"] = latency
metatdata["latency"] = latency
except Exception as e:
result = "Parsing error: " + json.dumps(result)
metatdata = {
"input_tokens": 0,
"output_tokens": 0,
"latency": latency,
}
return result, metatdata

def inference(self, prompt, functions, test_category):
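The handler change above standardizes how parsing failures are recorded: token counts fall back to 0, and, per the in-line comments, those zeros are skipped when aggregating results so failed requests do not drag down the averages. A minimal sketch of that aggregation rule (`mean_ignoring_zeros` is an illustrative helper, not a function in the repo):

```python
def mean_ignoring_zeros(values):
    """Average token counts, skipping the 0 sentinel recorded when
    usage metadata is missing from a response."""
    nonzero = [v for v in values if v != 0]
    return sum(nonzero) / len(nonzero) if nonzero else 0


print(mean_ignoring_zeros([0, 100, 200]))  # the failed request's 0 is skipped
```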
@@ -20,6 +20,8 @@
handler_map = {
"gorilla-openfunctions-v0": GorillaHandler,
"gorilla-openfunctions-v2": GorillaHandler,
"gpt-4o-2024-05-13": OpenAIHandler,
"gpt-4o-2024-05-13-FC": OpenAIHandler,
"gpt-4-turbo-2024-04-09-FC": OpenAIHandler,
"gpt-4-turbo-2024-04-09": OpenAIHandler,
"gpt-4-1106-preview-FC": OpenAIHandler,
@@ -50,6 +52,8 @@
"Nexusflow-Raven-v2": NexusHandler,
"gemini-1.0-pro": GeminiHandler,
"gemini-1.5-pro-preview-0409": GeminiHandler,
"gemini-1.5-pro-preview-0514": GeminiHandler,
"gemini-1.5-flash-preview-0514": GeminiHandler,
"gemma": OSSHandler,
"google/gemma-7b-it": GemmaHandler,
"glaiveai/glaive-function-calling-v1": GlaiveHandler,
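The `handler_map` entries added above route each model name to its handler class, so dispatch is a plain dict lookup. A self-contained sketch of that pattern (the handler classes here are stand-ins, not the repo's real imports):

```python
# Stand-in handler classes; in the repo these come from model_handler modules.
class OpenAIHandler:
    def __init__(self, model_name):
        self.model_name = model_name


class GeminiHandler:
    def __init__(self, model_name):
        self.model_name = model_name


# Mirrors the mapping style in the diff above, with two of the new entries.
handler_map = {
    "gpt-4o-2024-05-13-FC": OpenAIHandler,
    "gemini-1.5-flash-preview-0514": GeminiHandler,
}


def get_handler(model_name):
    """Look up the handler class for a model and instantiate it."""
    handler_cls = handler_map[model_name]
    return handler_cls(model_name)


h = get_handler("gpt-4o-2024-05-13-FC")
print(type(h).__name__)  # OpenAIHandler
```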
