ENH: qwen switch to llama cpp (xorbitsai#778)
codingl2k1 authored and Bojun-Feng committed Dec 27, 2023
1 parent 164a9cc commit 9ae71f6
Showing 12 changed files with 44 additions and 582 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python.yaml
@@ -102,7 +102,7 @@ jobs:
           MODULE: ${{ matrix.module }}
         if: ${{ matrix.module != 'gpu' }}
         run: |
-          pip install "llama-cpp-python>=0.2.0,<0.2.12"
+          pip install "llama-cpp-python>=0.2.23"
           pip install transformers
           pip install torch
           pip install accelerate
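
Note: the CI floor, previously capped below 0.2.12, now requires 0.2.23, presumably the first release line whose bundled llama.cpp can load qwen-family GGUF models. A minimal smoke test of that capability, assuming a local qwen GGUF file (the path and prompt are illustrative, not part of this commit):

# Sketch: load a qwen GGUF model under llama-cpp-python>=0.2.23.
from llama_cpp import Llama

llm = Llama(
    model_path="./qwen-chat-7b.Q4_K_M.gguf",  # hypothetical local file
    n_ctx=2048,
)
out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Hello"}],
    max_tokens=32,
)
print(out["choices"][0]["message"]["content"])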
4 changes: 2 additions & 2 deletions setup.cfg
@@ -73,7 +73,7 @@ dev =
 all =
     chatglm-cpp>=0.3.0
     ctransformers
-    llama-cpp-python>=0.2.0
+    llama-cpp-python>=0.2.23
     transformers>=4.34.1
     torch
     accelerate>=0.20.3
@@ -91,7 +91,7 @@ all =
     auto-gptq ; sys_platform!='darwin'
     optimum
 ggml =
-    llama-cpp-python>=0.2.0
+    llama-cpp-python>=0.2.23
     ctransformers
     chatglm-cpp>=0.3.0
 transformers =
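
The same 0.2.23 floor is pinned in both the "all" and "ggml" extras. A quick runtime sanity check, assuming the packaging library is installed (not part of this commit):

# Fail fast if an older llama-cpp-python is still in the environment.
from importlib.metadata import version
from packaging.version import Version

assert Version(version("llama-cpp-python")) >= Version("0.2.23")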
1 change: 1 addition & 0 deletions xinference/core/tests/test_restful_api.py
@@ -649,6 +649,7 @@ def test_restful_api_for_gorilla_openfunctions_tool_calls(
     "model_format, quantization",
     [
         ("pytorch", None),
+        ("ggufv2", "Q4_K_M"),
     ],
 )
 @pytest.mark.skip(reason="Cost too many resources.")
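
The tool-call test now also exercises the GGUF v2 path (the test itself remains skipped by default). Launching the matching model through the client would look roughly like this; the endpoint is a placeholder and the kwargs are assumed from the parametrization:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # hypothetical local endpoint
model_uid = client.launch_model(
    model_name="qwen-chat",
    model_format="ggufv2",
    quantization="Q4_K_M",
)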
6 changes: 0 additions & 6 deletions xinference/model/llm/__init__.py
@@ -40,7 +40,6 @@ def _install():
     from .ggml.chatglm import ChatglmCppChatModel
     from .ggml.ctransformers import CtransformersModel
     from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
-    from .ggml.qwen import QWenModel
     from .pytorch.baichuan import BaichuanPytorchChatModel
     from .pytorch.chatglm import ChatglmPytorchChatModel
     from .pytorch.core import PytorchChatModel, PytorchModel
@@ -61,11 +60,6 @@ def _install():
             ChatglmCppChatModel,
         ]
     )
-    LLM_CLASSES.extend(
-        [
-            QWenModel,
-        ]
-    )
     LLM_CLASSES.extend(
         [
             CtransformersModel,
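
With QWenModel unregistered, qwen ggml specs fall through to LlamaCppChatModel, whose match() no longer rejects them (see the llamacpp.py hunk below). A rough sketch of the registry's first-match dispatch, not the actual xinference code:

def pick_llm_cls(llm_family, llm_spec, quantization):
    # The first registered class whose match() accepts the spec wins,
    # so removing QWenModel lets LlamaCppChatModel claim qwen models.
    for cls in LLM_CLASSES:
        if cls.match(llm_family, llm_spec, quantization):
            return cls
    return None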
13 changes: 11 additions & 2 deletions xinference/model/llm/ggml/llamacpp.py
@@ -14,7 +14,7 @@
 import datetime
 import logging
 import os
-from typing import Iterator, List, Optional, Union
+from typing import Iterable, Iterator, List, Optional, Union
 
 from ....types import (
     ChatCompletion,
@@ -272,7 +272,6 @@ def match(
             return False
         if (
             "chatglm" in llm_family.model_name
-            or "qwen" in llm_family.model_name
             or llm_family.model_name in CTRANSFORMERS_SUPPORTED_MODEL
         ):
             return False
@@ -306,6 +305,16 @@ def chat(
         full_prompt = self.get_prompt(prompt, chat_history, prompt_style, tools=tools)
 
         generate_config = self._sanitize_generate_config(generate_config)
+        # TODO(codingl2k1): Hack: qwen needs "Observation:" as a stop word
+        # for function calls.
+        if tools and self.model_family.model_name == "qwen-chat":
+            stop = generate_config.get("stop")
+            if isinstance(stop, str):
+                generate_config["stop"] = [stop, "Observation:"]
+            elif isinstance(stop, Iterable):
+                generate_config["stop"] = list(stop) + ["Observation:"]
+            else:
+                generate_config["stop"] = "Observation:"
 
         stream = generate_config.get("stream", False)
         if stream:
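
The "Observation:" stop word follows qwen's ReAct-style function-calling format (Thought / Action / Action Input / Observation): generation must stop before the model hallucinates the tool's observation. The branching above is essentially a normalizer over llama.cpp's str-or-list stop parameter; as a standalone sketch (hypothetical helper, not in this commit):

from typing import Iterable, List, Optional, Union

def add_stop_word(
    stop: Optional[Union[str, Iterable[str]]], word: str = "Observation:"
) -> Union[str, List[str]]:
    # Preserve any caller-supplied stop words and append the ReAct marker.
    if stop is None:
        return word
    if isinstance(stop, str):
        return [stop, word]
    return list(stop) + [word]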
291 changes: 0 additions & 291 deletions xinference/model/llm/ggml/qwen.py

This file was deleted; the dedicated ggml QWenModel implementation it contained is superseded by the llama.cpp code path above.

