FEAT: Support gorilla openfunctions v1 #760

Merged · 6 commits · Dec 14, 2023
11 changes: 6 additions & 5 deletions xinference/api/restful_api.py
@@ -773,7 +773,7 @@ async def create_chat_completion(
         is_chatglm_ggml = desc.get(
             "model_format"
         ) == "ggmlv3" and "chatglm" in desc.get("model_name", "")
-        is_chatglm3 = "chatglm3" == desc.get("model_name", "")
+        function_call_models = ["chatglm3", "gorilla-openfunctions-v1"]

         is_qwen = desc.get("model_format") == "ggmlv3" and "qwen" in desc.get(
             "model_name", ""
@@ -783,13 +783,14 @@
             raise HTTPException(
                 status_code=400, detail="ChatGLM ggml does not have system prompt"
             )
-        if is_chatglm3 and body.tools and body.stream:
+        if body.tools and desc.get("model_name", "") not in function_call_models:
             raise HTTPException(
-                status_code=400, detail="ChatGLM3 tool calls does not support stream"
+                status_code=400,
+                detail=f"Only {function_call_models} support tool calls",
             )
-        if body.tools and not is_chatglm3:
+        if body.tools and body.stream:
             raise HTTPException(
-                status_code=400, detail="Only ChatGLM3 support tool calls"
+                status_code=400, detail="Tool calls does not support stream"
             )

         if body.stream:
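How the new checks behave from the caller's side, as a minimal sketch; the endpoint address, model identifiers, and tool schema below are illustrative placeholders, not part of this PR:

import requests

endpoint = "http://127.0.0.1:9997"  # placeholder address
messages = [{"role": "user", "content": "hi"}]
tools = [{"type": "function", "function": {"name": "noop", "parameters": {"type": "object", "properties": {}}}}]

# A model outside function_call_models is now rejected up front:
# 400, "Only ['chatglm3', 'gorilla-openfunctions-v1'] support tool calls".
r = requests.post(
    f"{endpoint}/v1/chat/completions",
    json={"model": "some-other-model", "messages": messages, "tools": tools},
)
assert r.status_code == 400

# Combining tools with streaming is rejected for every model.
r = requests.post(
    f"{endpoint}/v1/chat/completions",
    json={"model": "test_tool", "messages": messages, "tools": tools, "stream": True},
)
assert r.status_code == 400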
4 changes: 2 additions & 2 deletions xinference/client/tests/test_client.py
@@ -568,7 +568,7 @@ def setup_cluster():
     supervisor_address = f"localhost:{xo.utils.get_next_port()}"
     local_cluster = supervisor_run_in_subprocess(supervisor_address, TEST_LOGGING_CONF)

-    if not health_check(address=supervisor_address, max_attempts=10, sleep_interval=1):
+    if not health_check(address=supervisor_address, max_attempts=20, sleep_interval=1):
         raise RuntimeError("Supervisor is not available after multiple attempts")

     try:
@@ -610,7 +610,7 @@ def test_auto_recover(setup_cluster):

     model_proc.kill()

-    for _ in range(10):
+    for _ in range(60):
         try:
             completion = model.generate(
                 "Once upon a time, there was a very old computer", {"max_tokens": 64}
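Both edits only widen flaky polling windows (waiting for the supervisor, then for the auto-recovered model) rather than change behavior. A generic poll-until-ready helper in the same spirit -- a sketch, not code from this repository:

import time
from typing import Callable

def wait_until(check: Callable[[], bool], max_attempts: int = 60, sleep_interval: float = 1.0) -> bool:
    # Poll `check` until it succeeds or the attempts are exhausted; exceptions
    # are swallowed because the service may still be restarting in between.
    for _ in range(max_attempts):
        try:
            if check():
                return True
        except Exception:
            pass
        time.sleep(sleep_interval)
    return False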
95 changes: 95 additions & 0 deletions xinference/core/tests/test_restful_api.py
@@ -500,6 +500,101 @@ def test_restful_api_for_tool_calls(setup, model_format, quantization):
    assert arg == {"symbol": "10111"}


@pytest.mark.parametrize(
    "model_format, quantization", [("ggufv2", "Q4_K_S"), ("pytorch", None)]
)
@pytest.mark.skip(reason="Cost too many resources.")
def test_restful_api_for_gorilla_openfunctions_tool_calls(
    setup, model_format, quantization
):
    model_name = "gorilla-openfunctions-v1"

    endpoint, _ = setup
    url = f"{endpoint}/v1/models"

    # list
    response = requests.get(url)
    response_data = response.json()
    assert len(response_data) == 0

    # launch
    payload = {
        "model_uid": "test_tool",
        "model_name": model_name,
        "model_size_in_billions": 7,
        "model_format": model_format,
        "quantization": quantization,
    }

    response = requests.post(url, json=payload)
    response_data = response.json()
    model_uid_res = response_data["model_uid"]
    assert model_uid_res == "test_tool"

    response = requests.get(url)
    response_data = response.json()
    assert len(response_data) == 1

    # tool
    tools = [
        {
            "type": "function",
            "function": {
                "name": "uber_ride",
                "description": "Find suitable ride for customers given the location, "
                "type of ride, and the amount of time the customer is "
                "willing to wait as parameters",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "loc": {
                            "type": "int",
                            "description": "Location of the starting place of the Uber ride",
                        },
                        "type": {
                            "type": "string",
                            "enum": ["plus", "comfort", "black"],
                            "description": "Types of Uber ride user is ordering",
                        },
                        "time": {
                            "type": "int",
                            "description": "The amount of time in minutes the customer is willing to wait",
                        },
                    },
                },
            },
        }
    ]
    url = f"{endpoint}/v1/chat/completions"
    payload = {
        "model": model_uid_res,
        "messages": [
            {
                "role": "user",
                "content": 'Call me an Uber ride type "Plus" in Berkeley at zipcode 94704 in 10 minutes',
            },
        ],
        "tools": tools,
        "stop": ["\n"],
        "max_tokens": 200,
        "temperature": 0,
    }
    response = requests.post(url, json=payload)
    completion = response.json()

    assert "content" in completion["choices"][0]["message"]
    assert "tool_calls" == completion["choices"][0]["finish_reason"]
    assert (
        "uber_ride"
        == completion["choices"][0]["message"]["tool_calls"][0]["function"]["name"]
    )
    arguments = completion["choices"][0]["message"]["tool_calls"][0]["function"][
        "arguments"
    ]
    arg = json.loads(arguments)
    assert arg == {"loc": 94704, "time": 10, "type": "plus"}


def test_restful_api_with_request_limits(setup):
    model_name = "gte-base"

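A completion that passes these assertions is consumed by parsing function.arguments and dispatching to the named callable. A short client-side sketch; the uber_ride stub is hypothetical:

import json

def uber_ride(loc: int, type: str, time: int) -> str:
    # Hypothetical stub standing in for a real ride-hailing API call.
    return f"Ordered an Uber {type} near {loc}, pickup within {time} minutes"

AVAILABLE_TOOLS = {"uber_ride": uber_ride}

def run_tool_call(completion: dict) -> str:
    call = completion["choices"][0]["message"]["tool_calls"][0]["function"]
    kwargs = json.loads(call["arguments"])  # e.g. {"loc": 94704, "time": 10, "type": "plus"}
    return AVAILABLE_TOOLS[call["name"]](**kwargs)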
8 changes: 6 additions & 2 deletions xinference/model/llm/ggml/llamacpp.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import datetime
 import logging
 import os
@@ -301,7 +300,8 @@ def chat(

         chat_history = chat_history or []
         assert prompt_style is not None
-        full_prompt = self.get_prompt(prompt, chat_history, prompt_style)
+        tools = generate_config.pop("tools", []) if generate_config else None
+        full_prompt = self.get_prompt(prompt, chat_history, prompt_style, tools=tools)

         generate_config = self._sanitize_generate_config(generate_config)

@@ -313,4 +313,8 @@
         else:
             c = self.generate(full_prompt, generate_config)
             assert not isinstance(c, Iterator)
+        if tools:
+            return self._tool_calls_completion(
+                self.model_family.model_name, self.model_uid, c, tools
+            )
         return self._to_chat_completion(c)
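With tools supplied, the model's raw completion is a call string such as uber_ride(loc=94704, type="plus", time=10), and the shared _tool_calls_completion helper (defined outside this diff) converts it into the OpenAI-style tool_calls payload. A hedged sketch of what that conversion involves; parse_function_call is illustrative, not the helper's actual implementation:

import ast
import json

def parse_function_call(text: str) -> dict:
    # Parse 'name(kw=literal, ...)' into an OpenAI-style function payload,
    # assuming keyword arguments with literal values.
    call = ast.parse(text.strip(), mode="eval").body
    if not (isinstance(call, ast.Call) and isinstance(call.func, ast.Name)):
        raise ValueError(f"not a simple function call: {text!r}")
    arguments = {kw.arg: ast.literal_eval(kw.value) for kw in call.keywords}
    return {"name": call.func.id, "arguments": json.dumps(arguments)}

parse_function_call('uber_ride(loc=94704, type="plus", time=10)')
# -> {'name': 'uber_ride', 'arguments': '{"loc": 94704, "type": "plus", "time": 10}'}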
59 changes: 59 additions & 0 deletions xinference/model/llm/llm_family.json
@@ -2541,5 +2541,64 @@
        "</s>"
      ]
    }
  },
  {
    "version": 1,
    "context_length": 4096,
    "model_name": "gorilla-openfunctions-v1",
    "model_lang": [
      "en"
    ],
    "model_ability": [
      "chat"
    ],
    "model_description": "OpenFunctions is designed to extend Large Language Model (LLM) Chat Completion feature to formulate executable APIs call given natural language instructions and API context.",
    "model_specs": [
      {
        "model_format": "pytorch",
        "model_size_in_billions": 7,
        "quantizations": [
          "4-bit",
          "8-bit",
          "none"
        ],
        "model_id": "gorilla-llm/gorilla-openfunctions-v1",
        "model_revision": "74615f614ee845eab114e71541fd5098d1709958"
      },
      {
        "model_format": "ggufv2",
        "model_size_in_billions": 7,
        "quantizations": [
          "Q2_K",
          "Q3_K_L",
          "Q3_K_M",
          "Q3_K_S",
          "Q4_0",
          "Q4_K_M",
          "Q4_K_S",
          "Q5_0",
          "Q5_K_M",
          "Q5_K_S",
          "Q6_K",
          "Q8_0"
        ],
        "model_id": "TheBloke/gorilla-openfunctions-v1-GGUF",
        "model_file_name_template": "gorilla-openfunctions-v1.{quantization}.gguf"
      }
    ],
    "prompt_style": {
      "style_name": "GORILLA_OPENFUNCTIONS",
      "system_prompt": "",
      "roles": [
        "",
        ""
      ],
      "intra_message_sep": "\n",
      "inter_message_sep": "\n",
      "stop_token_ids": [
      ],
      "stop": [
      ]
    }
  }
]
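With this entry registered, the model can be launched like any other built-in family. A sketch using the Python client; the address is a placeholder, and the chosen format and quantization must match one of the model_specs above:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # placeholder endpoint
model_uid = client.launch_model(
    model_name="gorilla-openfunctions-v1",
    model_format="ggufv2",  # or "pytorch"
    model_size_in_billions=7,
    quantization="Q4_K_S",
)
model = client.get_model(model_uid)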
42 changes: 3 additions & 39 deletions xinference/model/llm/pytorch/chatglm.py
@@ -11,9 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import json
-import time
-import uuid
 from typing import Iterator, List, Optional, Union

 from ....types import (
@@ -99,41 +96,6 @@ def _handle_tools(generate_config) -> Optional[dict]:
             "tools": chatglm_tools,
         }

-    @staticmethod
-    def _tool_calls_completion(msg, model_name) -> ChatCompletion:
-        _id = str(uuid.uuid4())
-        return {
-            "id": "chat" + f"cmpl-{_id}",
-            "model": model_name,
-            "object": "chat.completion",
-            "created": int(time.time()),
-            "choices": [
-                {
-                    "index": 0,
-                    "message": {
-                        "role": "assistant",
-                        "content": None,
-                        "tool_calls": [
-                            {
-                                "id": f"call_{_id}",
-                                "type": "function",
-                                "function": {
-                                    "name": msg["name"],
-                                    "arguments": json.dumps(msg["parameters"]),
-                                },
-                            }
-                        ],
-                    },
-                    "finish_reason": "tool_calls",
-                }
-            ],
-            "usage": {
-                "prompt_tokens": -1,
-                "completion_tokens": -1,
-                "total_tokens": -1,
-            },
-        }
-
     def chat(
         self,
         prompt: str,
@@ -156,7 +118,9 @@ def chat(
             if max_length is not None:
                 kwargs["max_length"] = int(max_length)
             msg = self._model.chat(self._tokenizer, prompt, [tools], **kwargs)
-            return self._tool_calls_completion(msg[0], self.model_uid)
+            return self._tool_calls_completion(
+                self.model_family.model_name, self.model_uid, msg, tools
+            )
         else:
             return super().chat(
                 prompt=prompt,
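The deleted static method usefully pins down the OpenAI-style envelope that the shared _tool_calls_completion replacement must keep producing. Condensed for reference; the shape is copied from the removed code above, and only the wrapper function is new:

import json
import time
import uuid

def tool_calls_envelope(model_name: str, name: str, parameters: dict) -> dict:
    # The response shape the removed helper built, kept as a reference sketch.
    _id = str(uuid.uuid4())
    return {
        "id": f"chatcmpl-{_id}",
        "model": model_name,
        "object": "chat.completion",
        "created": int(time.time()),
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": None,
                    "tool_calls": [
                        {
                            "id": f"call_{_id}",
                            "type": "function",
                            "function": {"name": name, "arguments": json.dumps(parameters)},
                        }
                    ],
                },
                "finish_reason": "tool_calls",
            }
        ],
        "usage": {"prompt_tokens": -1, "completion_tokens": -1, "total_tokens": -1},
    }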
7 changes: 6 additions & 1 deletion xinference/model/llm/pytorch/core.py
@@ -483,7 +483,8 @@ def chat(
         if system_prompt:
             prompt_style.system_prompt = system_prompt
         chat_history = chat_history or []
-        full_prompt = self.get_prompt(prompt, chat_history, prompt_style)
+        tools = generate_config.pop("tools", []) if generate_config else None
+        full_prompt = self.get_prompt(prompt, chat_history, prompt_style, tools=tools)

         generate_config = self._sanitize_generate_config(generate_config)

@@ -495,4 +496,8 @@
         else:
             c = self.generate(full_prompt, generate_config)
             assert not isinstance(c, Iterator)
+        if tools:
+            return self._tool_calls_completion(
+                self.model_family.model_name, self.model_uid, c, tools
+            )
         return self._to_chat_completion(c)
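In both backends, tools now travel inside generate_config at the model layer and are popped before the config is sanitized. A hedged usage sketch, assuming model is an already-launched chat-model handle and tools is an OpenAI-style tool list (the REST layer normally does this plumbing for the caller):

completion = model.chat(
    prompt='Call me an Uber ride type "Plus" in Berkeley at zipcode 94704 in 10 minutes',
    chat_history=[],
    generate_config={"tools": tools, "max_tokens": 200, "temperature": 0},
)
assert completion["choices"][0]["finish_reason"] == "tool_calls"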