Merge remote-tracking branch 'upstream/main' into pr/Cppowboy/718
HuanzhiMao committed Nov 26, 2024
2 parents af2b2bb + 7d3bf66 commit 3c87d3e
Showing 10 changed files with 339 additions and 59 deletions.
7 changes: 7 additions & 0 deletions berkeley-function-call-leaderboard/CHANGELOG.md
@@ -2,6 +2,13 @@
 
 All notable changes to the Berkeley Function Calling Leaderboard will be documented in this file.
 
+- [Nov 25, 2024] [#697](https://github.com/ShishirPatil/gorilla/pull/697): Add the following new models to the leaderboard:
+  - `deepseek-ai/DeepSeek-V2.5`
+  - `deepseek-ai/DeepSeek-Coder-V2-Instruct-0724`
+  - `deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct`
+  - `deepseek-ai/DeepSeek-V2-Chat-0628`
+  - `deepseek-ai/DeepSeek-V2-Lite-Chat`
+- [Nov 25, 2024] [#787](https://github.com/ShishirPatil/gorilla/pull/787): Add new model `Qwen/Qwen2.5-72B-Instruct` to the leaderboard.
 - [Nov 24, 2024] [#743](https://github.com/ShishirPatil/gorilla/pull/743): Add support for regeneration, specific test entry IDs, and custom directory locations:
   - Introduce the `--allow-overwrite` flag for the `generate` command to enable regeneration of test entries even if they already exist.
   - Add a new `--run-ids` flag for the `generate` command, allowing execution of specific test entry IDs from `test_case_ids_to_generate.json`.
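To make the #743 flags above concrete, here is a hedged usage sketch. It assumes the `bfcl` CLI entry point and a `--test-category` flag as documented in the README; the category name, test IDs, and the layout of `test_case_ids_to_generate.json` are illustrative assumptions, not taken from this commit.

```python
# Hedged sketch: regenerate two specific multi-turn entries, overwriting
# any existing results. The flag names come from the changelog entry above;
# the JSON layout and category name are assumptions.
import json
import subprocess

# Illustrative IDs; test_case_ids_to_generate.json is the file named in the changelog.
with open("test_case_ids_to_generate.json", "w") as f:
    json.dump({"multi_turn_base": ["multi_turn_base_0", "multi_turn_base_1"]}, f)

subprocess.run(
    [
        "bfcl", "generate",
        "--model", "deepseek-ai/DeepSeek-V2.5",
        "--test-category", "multi_turn_base",
        "--run-ids",          # read entry IDs from test_case_ids_to_generate.json
        "--allow-overwrite",  # regenerate even if results already exist
    ],
    check=True,
)
```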
4 changes: 3 additions & 1 deletion berkeley-function-call-leaderboard/README.md
@@ -158,7 +158,9 @@ Below is _a table of models we support_ to run our leaderboard evaluation against
 |command-r-plus-FC | Function Calling|
 |command-r-plus | Prompt|
 |databrick-dbrx-instruct | Prompt|
-|deepseek-ai/deepseek-coder-6.7b-instruct 💻| Prompt|
+|deepseek-ai/DeepSeek-V2.5 💻| Function Calling|
+|deepseek-ai/DeepSeek-V2-{Chat-0628,Lite-Chat} 💻| Prompt|
+|deepseek-ai/DeepSeek-Coder-V2-{Instruct-0724,Lite-Instruct} 💻| Function Calling|
 |firefunction-{v1,v2}-FC | Function Calling|
 |gemini-1.0-pro-{001,002}-FC | Function Calling|
 |gemini-1.0-pro-{001,002} | Prompt|
@@ -125,20 +125,18 @@ def collect_test_cases(
         for test_case in all_test_entries_involved
         if test_case["id"] not in existing_ids
     ]
-    test_cases_to_generate = process_multi_turn_test_case(
-        test_cases_to_generate, test_category
-    )
+    test_cases_to_generate = process_multi_turn_test_case(test_cases_to_generate)
 
     return sorted(test_cases_to_generate, key=sort_key)
 
 
-def process_multi_turn_test_case(test_cases, test_category):
+def process_multi_turn_test_case(test_cases):
     """
     Multi-turn test cases don't include the function docs in the prompt. We need to add them here.
     """
-    if not is_multi_turn(test_category):
-        return test_cases
     for entry in test_cases:
+        if not is_multi_turn(entry["id"]):
+            continue
         involved_classes = entry["involved_classes"]
         entry["function"] = []
         for func_collection in involved_classes:
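For clarity, here is a self-contained sketch of what the refactored helper above now does: it filters multi-turn entries per entry ID rather than per test category. The `FUNC_DOC_BY_CLASS` table and the ID-prefix check are illustrative stand-ins for the repo's actual function-doc loading, not its real API.

```python
# Illustrative stand-in for the per-class function docs the real code loads
# from the repo's function-doc files.
FUNC_DOC_BY_CLASS = {
    "GorillaFileSystem": [{"name": "ls", "description": "List directory contents."}],
    "MessageAPI": [{"name": "send_message", "description": "Send a message."}],
}

def is_multi_turn(test_id: str) -> bool:
    # Stand-in: assume multi-turn entry IDs start with "multi_turn".
    return test_id.startswith("multi_turn")

def process_multi_turn_test_case(test_cases: list[dict]) -> list[dict]:
    """Attach function docs to multi-turn entries; leave other entries untouched."""
    for entry in test_cases:
        if not is_multi_turn(entry["id"]):
            continue
        entry["function"] = []
        for class_name in entry["involved_classes"]:
            entry["function"].extend(FUNC_DOC_BY_CLASS[class_name])
    return test_cases

demo = process_multi_turn_test_case(
    [{"id": "multi_turn_base_0", "involved_classes": ["MessageAPI"]}]
)
print(demo[0]["function"])  # -> [{'name': 'send_message', ...}]
```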
@@ -355,11 +355,35 @@
         "OpenAI",
         "Proprietary",
     ],
-    "deepseek-ai/deepseek-coder-6.7b-instruct": [
-        "Deepseek-v1.5 (Prompt)",
-        "https://huggingface.co/deepseek-ai/deepseek-coder-7b-instruct-v1.5",
-        "Deepseek",
-        "Deepseek License",
-    ],
+    "deepseek-ai/DeepSeek-V2.5": [
+        "DeepSeek-V2.5 (FC)",
+        "https://huggingface.co/deepseek-ai/DeepSeek-V2.5",
+        "DeepSeek",
+        "DeepSeek License"
+    ],
+    "deepseek-ai/DeepSeek-Coder-V2-Instruct-0724": [
+        "DeepSeek-Coder-V2 (FC)",
+        "https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct-0724",
+        "DeepSeek",
+        "DeepSeek License"
+    ],
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
+        "DeepSeek-Coder-V2-Lite-Instruct (FC)",
+        "https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
+        "DeepSeek",
+        "DeepSeek License"
+    ],
+    "deepseek-ai/DeepSeek-V2-Chat-0628": [
+        "DeepSeek-V2 (Prompt)",
+        "https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628",
+        "DeepSeek",
+        "DeepSeek License",
+    ],
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": [
+        "DeepSeek-V2-Lite (Prompt)",
+        "https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat",
+        "DeepSeek",
+        "DeepSeek License",
+    ],
     "google/gemma-7b-it": [
         "Gemma-7b-it (Prompt)",
@@ -649,6 +673,12 @@
"Qwen",
"apache-2.0",
],
"Qwen/Qwen2.5-72B-Instruct": [
"Qwen2.5-72B-Instruct (Prompt)",
"https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
"Qwen",
"apache-2.0",
],
"Team-ACE/ToolACE-8B": [
"ToolACE-8B (FC)",
"https://huggingface.co/Team-ACE/ToolACE-8B",
@@ -815,17 +845,4 @@
"meetkai/functionary-medium-v3.1-FC",
"snowflake/arctic",
"nvidia/nemotron-4-340b-instruct",
"ibm-granite/granite-20b-functioncalling",
"THUDM/glm-4-9b-chat",
"Salesforce/xLAM-1b-fc-r",
"Salesforce/xLAM-7b-fc-r",
"Salesforce/xLAM-7b-r",
"Salesforce/xLAM-8x7b-r",
"Salesforce/xLAM-8x22b-r",
"Team-ACE/ToolACE-8B",
"MadeAgents/Hammer2.0-7b",
"MadeAgents/Hammer2.0-3b",
"MadeAgents/Hammer2.0-1.5b",
"MadeAgents/Hammer2.0-0.5b",
"BitAgent/GoGoAgent",
]
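Each entry in the metadata table above is a four-element list: display name, model card URL, organization, and license. A small hedged sketch of consuming that shape follows; the mapping name is an assumption, and only the one entry shown is copied from the diff.

```python
# Assumed name for the mapping edited above; the entry itself is taken from the diff.
MODEL_METADATA_MAPPING: dict[str, list[str]] = {
    "deepseek-ai/DeepSeek-V2.5": [
        "DeepSeek-V2.5 (FC)",                                # display name
        "https://huggingface.co/deepseek-ai/DeepSeek-V2.5",  # model card URL
        "DeepSeek",                                          # organization
        "DeepSeek License",                                  # license
    ],
}

def leaderboard_cell(model_id: str) -> str:
    # Unpack the four positional fields into a markdown-style leaderboard cell.
    display_name, url, org, license_name = MODEL_METADATA_MAPPING[model_id]
    return f"[{display_name}]({url}) ({org}, {license_name})"

print(leaderboard_cell("deepseek-ai/DeepSeek-V2.5"))
```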
@@ -1,4 +1,5 @@
 from bfcl.model_handler.oss_model.deepseek import DeepseekHandler
+from bfcl.model_handler.oss_model.deepseek_coder import DeepseekCoderHandler
 from bfcl.model_handler.oss_model.gemma import GemmaHandler
 from bfcl.model_handler.oss_model.glaive import GlaiveHandler
 from bfcl.model_handler.oss_model.glm import GLMHandler
@@ -7,26 +8,26 @@
 from bfcl.model_handler.oss_model.hermes import HermesHandler
 from bfcl.model_handler.oss_model.llama import LlamaHandler
 from bfcl.model_handler.oss_model.llama_fc import LlamaFCHandler
-from bfcl.model_handler.oss_model.phi import PhiHandler
-from bfcl.model_handler.oss_model.salesforce import SalesforceHandler
-from bfcl.model_handler.oss_model.qwen import QwenHandler
 from bfcl.model_handler.oss_model.minicpm import MiniCPMHandler
 from bfcl.model_handler.oss_model.minicpm_fc import MiniCPMFCHandler
+from bfcl.model_handler.oss_model.phi import PhiHandler
+from bfcl.model_handler.oss_model.qwen import QwenHandler
+from bfcl.model_handler.oss_model.salesforce import SalesforceHandler
 from bfcl.model_handler.proprietary_model.claude import ClaudeHandler
 from bfcl.model_handler.proprietary_model.cohere import CohereHandler
 from bfcl.model_handler.proprietary_model.databricks import DatabricksHandler
 from bfcl.model_handler.proprietary_model.fireworks import FireworksHandler
 from bfcl.model_handler.proprietary_model.functionary import FunctionaryHandler
 from bfcl.model_handler.proprietary_model.gemini import GeminiHandler
+from bfcl.model_handler.proprietary_model.gogoagent import GoGoAgentHandler
 from bfcl.model_handler.proprietary_model.gorilla import GorillaHandler
 from bfcl.model_handler.proprietary_model.mistral import MistralHandler
 from bfcl.model_handler.proprietary_model.nexus import NexusHandler
 from bfcl.model_handler.proprietary_model.nvidia import NvidiaHandler
 from bfcl.model_handler.proprietary_model.openai import OpenAIHandler
 from bfcl.model_handler.proprietary_model.yi import YiHandler
-from bfcl.model_handler.proprietary_model.gogoagent import GoGoAgentHandler
 
-# TODO: Add Deepseek V2, meta-llama/Llama-3.1-405B-Instruct
+# TODO: Add meta-llama/Llama-3.1-405B-Instruct
 
 # Inference through API calls
 api_inference_handler_map = {
@@ -129,9 +130,15 @@
"Qwen/Qwen2-7B-Instruct": QwenHandler,
"Qwen/Qwen2.5-1.5B-Instruct": QwenHandler,
"Qwen/Qwen2.5-7B-Instruct": QwenHandler,
"Qwen/Qwen2.5-72B-Instruct": QwenHandler,
"Team-ACE/ToolACE-8B": LlamaHandler,
"openbmb/MiniCPM3-4B": MiniCPMHandler,
"openbmb/MiniCPM3-4B-FC": MiniCPMFCHandler,
"deepseek-ai/DeepSeek-V2.5": DeepseekCoderHandler,
"deepseek-ai/DeepSeek-Coder-V2-Instruct-0724": DeepseekCoderHandler,
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": DeepseekCoderHandler,
"deepseek-ai/DeepSeek-V2-Chat-0628": DeepseekHandler,
"deepseek-ai/DeepSeek-V2-Lite-Chat": DeepseekHandler,
}

# Deprecated/outdated models, no longer on the leaderboard
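The maps in this file drive handler dispatch: a model name resolves to a handler class, which is then instantiated. Below is a minimal sketch of that lookup with a stub class and stub maps standing in for the real imports; the merged-lookup helper and the local map's name are assumptions, not the repo's actual API (`api_inference_handler_map` does appear in the diff).

```python
class DeepseekHandler:
    """Stub standing in for the imported handler class."""
    def __init__(self, model_name: str, temperature: float) -> None:
        self.model_name = model_name
        self.temperature = temperature

# Tiny stand-ins for the real maps defined in this file.
api_inference_handler_map: dict[str, type] = {}
local_inference_handler_map = {
    "deepseek-ai/DeepSeek-V2-Lite-Chat": DeepseekHandler,
}

def get_handler(model_name: str, temperature: float = 0.001):
    # Look the model up across both maps and instantiate its handler class.
    handler_cls = {**api_inference_handler_map, **local_inference_handler_map}.get(model_name)
    if handler_cls is None:
        raise ValueError(f"Unknown model: {model_name}")
    return handler_cls(model_name, temperature)

handler = get_handler("deepseek-ai/DeepSeek-V2-Lite-Chat")
print(type(handler).__name__)  # -> DeepseekHandler
```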
@@ -52,17 +52,17 @@ def batch_inference(
         backend: str,
         include_input_log: bool,
         include_state_log: bool,
-        overwrite: bool,
+        update_mode: bool,
         result_dir=RESULT_PATH,
     ):
         """
         Batch inference for OSS models.
         """
         from transformers import AutoConfig, AutoTokenizer
 
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_huggingface)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_huggingface, trust_remote_code=True)
 
-        config = AutoConfig.from_pretrained(self.model_name_huggingface)
+        config = AutoConfig.from_pretrained(self.model_name_huggingface, trust_remote_code=True)
         if hasattr(config, "max_position_embeddings"):
             self.max_context_length = config.max_position_embeddings
         elif self.tokenizer.model_max_length is not None:
@@ -194,10 +194,7 @@ def log_subprocess_output(pipe, stop_event):
             for future in futures:
                 # This will wait for the task to complete, so that we are always writing in order
                 result = future.result()
-                if overwrite:
-                    self.overwrite(result, result_dir)
-                else:
-                    self.write(result, result_dir)
+                self.write(result, result_dir, update_mode=update_mode)
                 pbar.update()


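The hunk above threads `trust_remote_code=True` through both loads (DeepSeek-V2-style checkpoints ship custom modeling code that transformers will only execute with this flag) and keeps the config-then-tokenizer fallback for the context length. Here is a standalone sketch of that probe; the helper name and the model ID are illustrative.

```python
from transformers import AutoConfig, AutoTokenizer

def probe_max_context_length(model_id: str) -> int:
    # trust_remote_code=True allows transformers to run the custom code
    # bundled with checkpoints such as DeepSeek-V2.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
    # Prefer the model config; fall back to the tokenizer's own limit.
    if hasattr(config, "max_position_embeddings"):
        return config.max_position_embeddings
    if tokenizer.model_max_length is not None:
        return tokenizer.model_max_length
    raise ValueError(f"Cannot determine max context length for {model_id}")

if __name__ == "__main__":
    print(probe_max_context_length("deepseek-ai/DeepSeek-V2-Lite-Chat"))
```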
@@ -1,11 +1,29 @@
from bfcl.model_handler.oss_model.base_oss_handler import OSSHandler



class DeepseekHandler(OSSHandler):
"""
This is the handler for the Deepseek model. Deepseek-Coder models should use the DeepseekCoderHandler instead.
Note: `deepseek-ai/DeepSeek-V2.5` DO NOT use this handler, but the DeepseekCoderHandler, because it share the same chat template with the DeepSeek-Coder models.
"""
def __init__(self, model_name, temperature) -> None:
super().__init__(model_name, temperature)

def decode_ast(self, result, language="Python"):
result = result.strip()
if result.startswith("```json"):
result = result[len("```json"):]
if result.startswith("```python"):
result = result[len("```python"):]
return super().decode_ast(result, language)

def decode_execute(self, result):
if result.startswith("```json"):
result = result[len("```json"):]
if result.startswith("```python"):
result = result[len("```python"):]
return super().decode_execute(result)

def _format_prompt(self, messages, function):
"""
"bos_token": {
@@ -15,30 +33,50 @@ def _format_prompt(self, messages, function):
"normalized": true,
"rstrip": false,
"single_word": false
}
"chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
},
"eos_token": {
"__type": "AddedToken",
"content": "<|end▁of▁sentence|>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
},
"chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
"""

formatted_prompt = "<|begin▁of▁sentence|>"

for message in messages:
formatted_prompt += "\n "
if message["role"] == "system":
formatted_prompt += f"\n{message['content']}\n "
else:
formatted_prompt += "\n "
if message["role"] == "user":
formatted_prompt += (
f"\n### Instruction:\\n{message['content']}\\n\n "
)
else:
formatted_prompt += (
f"\n### Response:\\n{message['content']}\\n<|EOT|>\\n\n "
)
formatted_prompt += "\n "
formatted_prompt += "\n"

formatted_prompt += "\n### Response:\n"
if message["role"] == "user":
formatted_prompt += f"User: {message['content']}\n\n"
elif message["role"] == "assistant":
formatted_prompt += f"Assistant: {message['content']}<|end▁of▁sentence|>"
elif message["role"] == "system":
formatted_prompt += f"{message['content']}\n\n"

formatted_prompt += "Assistant:"

return formatted_prompt

+    def _add_execution_results_prompting(
+        self, inference_data: dict, execution_results: list[str], model_response_data: dict
+    ) -> dict:
+        # Deepseek doesn't accept the tool role, so we use the user role to send back the tool output
+        tool_message = {
+            "role": "user",
+            "content": [],
+        }
+        for execution_result, decoded_model_response in zip(
+            execution_results, model_response_data["model_responses_decoded"]
+        ):
+            tool_message["content"].append(
+                {
+                    "role": "tool",
+                    "name": decoded_model_response,
+                    "content": execution_result,
+                }
+            )
+
+        inference_data["message"].append(tool_message)
+
+        return inference_data
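To make the new chat template concrete, here is a standalone rendering of the format `_format_prompt` produces. The logic is copied from the method above; the sample messages are illustrative.

```python
def format_deepseek_v2_prompt(messages: list[dict]) -> str:
    # Mirrors DeepseekHandler._format_prompt above: system content is inserted
    # bare, user turns get a "User:" prefix, assistant turns get "Assistant:"
    # plus the EOS token, and the prompt ends with a generation cue.
    prompt = "<|begin▁of▁sentence|>"
    for message in messages:
        if message["role"] == "user":
            prompt += f"User: {message['content']}\n\n"
        elif message["role"] == "assistant":
            prompt += f"Assistant: {message['content']}<|end▁of▁sentence|>"
        elif message["role"] == "system":
            prompt += f"{message['content']}\n\n"
    return prompt + "Assistant:"

print(format_deepseek_v2_prompt([
    {"role": "system", "content": "You are a helpful function-calling assistant."},
    {"role": "user", "content": "What's the weather in Berkeley?"},
]))
# <|begin▁of▁sentence|>You are a helpful function-calling assistant.
#
# User: What's the weather in Berkeley?
#
# Assistant:
```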