diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index adcdb2bbf2..1d2e6ba6b2 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -102,7 +102,7 @@ jobs:
           MODULE: ${{ matrix.module }}
         if: ${{ matrix.module != 'gpu' }}
         run: |
-          pip install "llama-cpp-python>=0.2.0,<0.2.12"
+          pip install "llama-cpp-python>=0.2.23"
           pip install transformers
           pip install torch
           pip install accelerate
diff --git a/setup.cfg b/setup.cfg
index 03512d0144..db81cfe9e7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -73,7 +73,7 @@ dev =
 all =
     chatglm-cpp>=0.3.0
     ctransformers
-    llama-cpp-python>=0.2.0
+    llama-cpp-python>=0.2.23
     transformers>=4.34.1
     torch
     accelerate>=0.20.3
@@ -91,7 +91,7 @@ all =
     auto-gptq ; sys_platform!='darwin'
     optimum
 ggml =
-    llama-cpp-python>=0.2.0
+    llama-cpp-python>=0.2.23
     ctransformers
     chatglm-cpp>=0.3.0
 transformers =
diff --git a/xinference/core/tests/test_restful_api.py b/xinference/core/tests/test_restful_api.py
index 9101c7bf21..92cfe012c3 100644
--- a/xinference/core/tests/test_restful_api.py
+++ b/xinference/core/tests/test_restful_api.py
@@ -649,6 +649,7 @@ def test_restful_api_for_gorilla_openfunctions_tool_calls(
     "model_format, quantization",
     [
         ("pytorch", None),
+        ("ggufv2", "Q4_K_M"),
     ],
 )
 @pytest.mark.skip(reason="Cost too many resources.")
diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py
index 8e8ce93241..c30b1517fb 100644
--- a/xinference/model/llm/__init__.py
+++ b/xinference/model/llm/__init__.py
@@ -40,7 +40,6 @@ def _install():
     from .ggml.chatglm import ChatglmCppChatModel
     from .ggml.ctransformers import CtransformersModel
     from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
-    from .ggml.qwen import QWenModel
     from .pytorch.baichuan import BaichuanPytorchChatModel
     from .pytorch.chatglm import ChatglmPytorchChatModel
     from .pytorch.core import PytorchChatModel, PytorchModel
@@ -61,11 +60,6 @@ def _install():
             ChatglmCppChatModel,
         ]
     )
-    LLM_CLASSES.extend(
-        [
-            QWenModel,
-        ]
-    )
     LLM_CLASSES.extend(
         [
             CtransformersModel,
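The dependency floor moves from the old ggmlv3-era pin to `llama-cpp-python>=0.2.23`, which is what loads the new GGUF v2 Qwen builds. A minimal sanity check along these lines can catch a stale wheel before model load (this sketch assumes `llama_cpp` exposes `__version__`, as recent releases do, and that `packaging` is available; it is not part of the patch itself):

```python
# Hedged sketch: verify the installed llama-cpp-python meets the new floor above.
from packaging.version import Version

import llama_cpp

if Version(llama_cpp.__version__) < Version("0.2.23"):
    raise RuntimeError(
        f"llama-cpp-python {llama_cpp.__version__} is too old for the GGUF v2 Qwen specs; "
        "run `pip install -U 'llama-cpp-python>=0.2.23'`"
    )
```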
diff --git a/xinference/model/llm/ggml/llamacpp.py b/xinference/model/llm/ggml/llamacpp.py
index 006d93e970..a3e95bb545 100644
--- a/xinference/model/llm/ggml/llamacpp.py
+++ b/xinference/model/llm/ggml/llamacpp.py
@@ -14,7 +14,7 @@
 import datetime
 import logging
 import os
-from typing import Iterator, List, Optional, Union
+from typing import Iterable, Iterator, List, Optional, Union

 from ....types import (
     ChatCompletion,
@@ -272,7 +272,6 @@ def match(
             return False
         if (
             "chatglm" in llm_family.model_name
-            or "qwen" in llm_family.model_name
             or llm_family.model_name in CTRANSFORMERS_SUPPORTED_MODEL
         ):
             return False
@@ -306,6 +305,16 @@ def chat(
         full_prompt = self.get_prompt(prompt, chat_history, prompt_style, tools=tools)

         generate_config = self._sanitize_generate_config(generate_config)
+        # TODO(codingl2k1): qwen hacky to set stop for function call.
+        if tools and self.model_family.model_name == "qwen-chat":
+            stop = generate_config.get("stop")
+            if isinstance(stop, str):
+                generate_config["stop"] = [stop, "Observation:"]
+            elif isinstance(stop, Iterable):
+                assert not isinstance(stop, str)
+                generate_config["stop"] = stop + ["Observation:"]
+            else:
+                generate_config["stop"] = "Observation:"

         stream = generate_config.get("stream", False)
         if stream:
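The new block in `chat` merges the ReAct-style `Observation:` stop word into whatever `stop` value the caller supplied, so qwen-chat tool calls terminate before the model starts inventing an observation. A standalone sketch of the same normalization (the helper name is illustrative, not part of the codebase):

```python
from typing import Iterable, List, Optional, Union


def merge_observation_stop(
    stop: Optional[Union[str, Iterable[str]]]
) -> Union[str, List[str]]:
    # Mirrors the qwen-chat tool-call branch added above.
    if isinstance(stop, str):
        return [stop, "Observation:"]
    elif isinstance(stop, Iterable):
        return list(stop) + ["Observation:"]
    else:  # stop was not set at all
        return "Observation:"


assert merge_observation_stop(None) == "Observation:"
assert merge_observation_stop("###") == ["###", "Observation:"]
assert merge_observation_stop(["\n\n"]) == ["\n\n", "Observation:"]
```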
", - "You can install it by running the following command in the terminal:\n", - "pip install -U qwen_cpp\n\n", - "Or visit the original git repo if the above command fails:\n", - "https://github.com/QwenLM/qwen.cpp", - ] - - raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") - - model_file_path = os.path.join( - self.model_path, - self.model_spec.model_file_name_template.format( - quantization=self.quantization - ), - ) - - tiktoken_path = os.path.join(os.path.dirname(model_file_path), "qwen.tiktoken") - assert os.path.exists(tiktoken_path) - - self._llm = qwen_cpp.Pipeline(model_file_path, tiktoken_path) - - @classmethod - def match( - cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - if llm_spec.model_format != "ggmlv3": - return False - if "qwen" not in llm_family.model_name: - return False - if "chat" not in llm_family.model_ability: - return False - return True - - @staticmethod - def _convert_raw_text_chunks_to_chat( - tokens: Iterator[str], model_name: str - ) -> Iterator[ChatCompletionChunk]: - yield { - "id": "chat" + f"cmpl-{str(uuid.uuid4())}", - "model": model_name, - "object": "chat.completion.chunk", - "created": int(time.time()), - "choices": [ - { - "index": 0, - "delta": { - "role": "assistant", - }, - "finish_reason": None, - } - ], - } - for token in enumerate(tokens): - yield { - "id": "chat" + f"cmpl-{str(uuid.uuid4())}", - "model": model_name, - "object": "chat.completion.chunk", - "created": int(time.time()), - "choices": [ - { - "index": 0, - "delta": { - "content": token[1], - }, - "finish_reason": None, - } - ], - } - - @staticmethod - def _convert_raw_text_completion_to_chat( - text: str, model_name: str - ) -> ChatCompletion: - return { - "id": "chat" + f"cmpl-{str(uuid.uuid4())}", - "model": model_name, - "object": "chat.completion", - "created": int(time.time()), - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": text, - }, - "finish_reason": None, - } - ], - "usage": { - "prompt_tokens": -1, - "completion_tokens": -1, - "total_tokens": -1, - }, - } - - def chat( - self, - prompt: str, - chat_history: Optional[List[ChatCompletionMessage]] = None, - generate_config: Optional[QWenCppGenerateConfig] = None, - ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - if chat_history is not None: - chat_history_list = [message["content"] for message in chat_history] - else: - chat_history_list = [] - - chat_history_list.append(prompt) - logger.debug("Full conversation history:\n%s", str(chat_history_list)) - - generate_config = self._sanitize_generate_config(generate_config) - - params = { - "max_length": generate_config.get("max_tokens"), - "max_context_length": generate_config.get("max_tokens"), - "top_k": generate_config.get("top_k"), - "top_p": generate_config.get("top_p"), - "temperature": generate_config.get("temperature"), - "stream": generate_config.get("stream", False), - } - - # Remove None values to exclude missing keys from params - params = {k: v for k, v in params.items() if v is not None} - - assert self._llm is not None - - if generate_config["stream"]: - it = self._llm.chat( - chat_history_list, - **params, - ) - assert not isinstance(it, str) - return self._convert_raw_text_chunks_to_chat(it, self.model_uid) - else: - c = self._llm.chat( - chat_history_list, - **params, - ) - assert not isinstance(c, Iterator) - return self._convert_raw_text_completion_to_chat(c, self.model_uid) - - @staticmethod - def _convert_str_to_completion(data: str, model_name: 
-    @staticmethod
-    def _convert_raw_text_completion_to_chat(
-        text: str, model_name: str
-    ) -> ChatCompletion:
-        return {
-            "id": "chat" + f"cmpl-{str(uuid.uuid4())}",
-            "model": model_name,
-            "object": "chat.completion",
-            "created": int(time.time()),
-            "choices": [
-                {
-                    "index": 0,
-                    "message": {
-                        "role": "assistant",
-                        "content": text,
-                    },
-                    "finish_reason": None,
-                }
-            ],
-            "usage": {
-                "prompt_tokens": -1,
-                "completion_tokens": -1,
-                "total_tokens": -1,
-            },
-        }
-
-    def chat(
-        self,
-        prompt: str,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
-        generate_config: Optional[QWenCppGenerateConfig] = None,
-    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        if chat_history is not None:
-            chat_history_list = [message["content"] for message in chat_history]
-        else:
-            chat_history_list = []
-
-        chat_history_list.append(prompt)
-        logger.debug("Full conversation history:\n%s", str(chat_history_list))
-
-        generate_config = self._sanitize_generate_config(generate_config)
-
-        params = {
-            "max_length": generate_config.get("max_tokens"),
-            "max_context_length": generate_config.get("max_tokens"),
-            "top_k": generate_config.get("top_k"),
-            "top_p": generate_config.get("top_p"),
-            "temperature": generate_config.get("temperature"),
-            "stream": generate_config.get("stream", False),
-        }
-
-        # Remove None values to exclude missing keys from params
-        params = {k: v for k, v in params.items() if v is not None}
-
-        assert self._llm is not None
-
-        if generate_config["stream"]:
-            it = self._llm.chat(
-                chat_history_list,
-                **params,
-            )
-            assert not isinstance(it, str)
-            return self._convert_raw_text_chunks_to_chat(it, self.model_uid)
-        else:
-            c = self._llm.chat(
-                chat_history_list,
-                **params,
-            )
-            assert not isinstance(c, Iterator)
-            return self._convert_raw_text_completion_to_chat(c, self.model_uid)
-
-    @staticmethod
-    def _convert_str_to_completion(data: str, model_name: str) -> Completion:
-        return {
-            "id": "generate" + f"-{str(uuid.uuid4())}",
-            "model": model_name,
-            "object": "text_completion",
-            "created": int(time.time()),
-            "choices": [
-                {"index": 0, "text": data, "finish_reason": None, "logprobs": None}
-            ],
-            "usage": {
-                "prompt_tokens": -1,
-                "completion_tokens": -1,
-                "total_tokens": -1,
-            },
-        }
-
-    @staticmethod
-    def _convert_str_to_completion_chunk(
-        tokens: Iterator[str], model_name: str
-    ) -> Iterator[CompletionChunk]:
-        for token in tokens:
-            yield {
-                "id": "generate" + f"-{str(uuid.uuid4())}",
-                "model": model_name,
-                "object": "text_completion",
-                "created": int(time.time()),
-                "choices": [
-                    {"index": 0, "text": token, "finish_reason": None, "logprobs": None}
-                ],
-            }
-
-    def generate(
-        self,
-        prompt: str,
-        generate_config: Optional[QWenCppGenerateConfig] = None,
-    ) -> Union[Completion, Iterator[CompletionChunk]]:
-        logger.debug(f"Prompt for generate:\n{prompt}")
-
-        generate_config = self._sanitize_generate_config(generate_config)
-
-        params = {
-            "max_length": generate_config.get("max_tokens"),
-            "max_context_length": generate_config.get("max_tokens"),
-            "top_k": generate_config.get("top_k"),
-            "top_p": generate_config.get("top_p"),
-            "temperature": generate_config.get("temperature"),
-            "stream": generate_config.get("stream", False),
-        }
-
-        # Remove None values to exclude missing keys from params
-        params = {k: v for k, v in params.items() if v is not None}
-
-        assert self._llm is not None
-
-        # See source code in qwen.cpp
-        input_ids = self._get_input_ids_by_prompt(
-            prompt, params.get("max_context_length", 512)  # type: ignore
-        )
-
-        if generate_config["stream"]:
-            it = self._llm._generate(
-                input_ids,
-                **params,
-            )
-            assert not isinstance(it, str)
-            return self._convert_str_to_completion_chunk(it, self.model_uid)
-        else:
-            c = self._llm._generate(
-                input_ids,
-                **params,
-            )
-            assert not isinstance(c, Iterator)
-            return self._convert_str_to_completion(c, self.model_uid)
diff --git a/xinference/model/llm/ggml/tests/test_qwen.py b/xinference/model/llm/ggml/tests/test_qwen.py
deleted file mode 100644
index 550039fc37..0000000000
--- a/xinference/model/llm/ggml/tests/test_qwen.py
+++ /dev/null
@@ -1,228 +0,0 @@
-# Copyright 2022-2023 XProbe Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import random
-import string
-from typing import Any, List
-
-import pytest
-
-from ...ggml.qwen import QWenModel
-from ...llm_family import GgmlLLMSpecV1, LLMFamilyV1
-
-
-class MockPipeline:
-    def __init__(self) -> None:
-        pass
-
-    def chat(self, *args, **kwargs) -> Any:
-        stream = kwargs.get("stream", False)
-        res = (
-            "qwen_test_chat"
-            if not stream
-            else iter([f"qwen_test_chat_{i}" for i in range(5)])
-        )
-        return res
-
-    def _generate(self, *args, **kwargs) -> Any:
-        stream = kwargs.get("stream", False)
-        res = (
-            "qwen_test_gen"
-            if not stream
-            else iter([f"qwen_test_gen_{i}" for i in range(5)])
-        )
-        return res
-
-
-class MockChatglmCppChatModel(QWenModel):
-    def load(self):
-        self._llm = MockPipeline()
-
-    def _get_input_ids_by_prompt(
-        self, prompt: str, max_context_length: int
-    ) -> List[int]:
-        return []
-
-
-mock_model_spec = GgmlLLMSpecV1(
-    model_format="ggmlv3",
-    model_size_in_billions=7,
-    quantizations=["q4_0"],
-    model_id="test_id",
-    model_file_name_template="qwen7b-ggml-{quantization}.bin",
-)
-
-serialized = """{
-   "version":1,
-   "context_length":2048,
-   "model_name":"TestModel",
-   "model_lang":[
-      "en"
-   ],
-   "model_ability":[
-      "embed", "generate"
-   ],
-   "model_specs":[
-      {
-         "model_format":"ggmlv3",
-         "model_size_in_billions":7,
-         "quantizations": ["q4_0"],
-         "model_id":"test_id",
-         "model_file_name_template":"qwen7b-ggml-{quantization}.bin"
-      },
-      {
-         "model_format":"pytorch",
-         "model_size_in_billions":3,
-         "quantizations": ["int8", "int4", "none"],
-         "model_id":"example/TestModel"
-      }
-   ],
-   "prompt_style": {
-      "style_name": "ADD_COLON_SINGLE",
-      "system_prompt": "TEST",
-      "roles": ["user", "assistant"],
-      "intra_message_sep": "\\n### ",
-      "inter_message_sep": "\\n### ",
-      "stop": null,
-      "stop_token_ids": null
-   }
-}"""
-
-mock_model_family = LLMFamilyV1.parse_raw(serialized)
-
-
-@pytest.mark.parametrize(
-    "model_spec, model_family", [(mock_model_spec, mock_model_family)]
-)
-def test_model_init(model_spec, model_family):
-    quantization = "q4_0"
-    uid = "".join(random.choice(string.digits) for i in range(100))
-    path = "".join(
-        random.choice(string.ascii_letters + string.punctuation) for i in range(100)
-    )
-    model = MockChatglmCppChatModel(
-        model_uid=uid,
-        model_family=model_family,
-        model_spec=model_spec,
-        quantization=quantization,
-        model_path=path,
-    )
-
-    assert model.model_uid == uid
-    assert model.quantization == quantization
-    assert model.model_path == path
-
-    assert isinstance(model.model_spec, GgmlLLMSpecV1)
-    assert isinstance(model.model_family, LLMFamilyV1)
-    assert isinstance(model.model_family.model_specs[0], GgmlLLMSpecV1)
-
-    assert (
-        model.model_family.model_specs[0].model_format == model.model_spec.model_format
-    )
-    assert model.model_family.model_specs[0].model_format == model_spec.model_format
-    assert (
-        model.model_family.model_specs[0].model_size_in_billions
-        == model.model_spec.model_size_in_billions
-    )
-    assert (
-        model.model_family.model_specs[0].model_size_in_billions
-        == model_spec.model_size_in_billions
-    )
-    assert (
-        model.model_family.model_specs[0].quantizations
-        == model.model_spec.quantizations
-    )
-    assert model.model_family.model_specs[0].quantizations == model_spec.quantizations
-    assert model.model_family.model_specs[0].model_id == model.model_spec.model_id
-    assert model.model_family.model_specs[0].model_id == model_spec.model_id
-    assert (
-        model.model_family.model_specs[0].model_file_name_template
-        == model.model_spec.model_file_name_template
-    )
-    assert (
-        model.model_family.model_specs[0].model_file_name_template
-        == model_spec.model_file_name_template
-    )
-    assert model.model_family.model_specs[0].model_uri == model.model_spec.model_uri
-    assert model.model_family.model_specs[0].model_uri == model_spec.model_uri
-
-    assert model._llm is None
-    assert model._model_config is None
-    model._model_config = model._sanitize_generate_config(None)
-    assert not model._model_config["stream"]
-
-
-@pytest.mark.parametrize(
-    "model_spec, model_family", [(mock_model_spec, mock_model_family)]
-)
-def test_model_chat(model_spec, model_family):
-    quantization = "q4_0"
-    uid = "".join(random.choice(string.digits) for i in range(100))
-    path = "".join(
-        random.choice(string.ascii_letters + string.punctuation) for i in range(100)
-    )
-    model = MockChatglmCppChatModel(
-        model_uid=uid,
-        model_family=model_family,
-        model_spec=model_spec,
-        quantization=quantization,
-        model_path=path,
-    )
-
-    assert model._llm is None
-
-    model.load()
-    assert isinstance(model._llm, MockPipeline)
-
-    responses_stream = list(model.chat("Hello", generate_config={"stream": True}))
-    assert responses_stream[0]["choices"][0]["delta"] == {"role": "assistant"}
-    for i in range(3):
-        assert responses_stream[i + 1]["choices"][0]["delta"] == {
-            "content": f"qwen_test_chat_{i}"
-        }
-
-    responses_non_stream = model.chat("Hello", generate_config={"stream": False})
-    assert responses_non_stream["choices"][0]["message"] == {
-        "role": "assistant",
-        "content": "qwen_test_chat",
-    }
-
-
-@pytest.mark.parametrize(
-    "model_spec, model_family", [(mock_model_spec, mock_model_family)]
-)
-def test_model_generate(model_spec, model_family):
-    quantization = "q4_0"
-    uid = "".join(random.choice(string.digits) for i in range(100))
-    path = "".join(
-        random.choice(string.ascii_letters + string.punctuation) for i in range(100)
-    )
-    model = MockChatglmCppChatModel(
-        model_uid=uid,
-        model_family=model_family,
-        model_spec=model_spec,
-        quantization=quantization,
-        model_path=path,
-    )
-    assert model._llm is None
-
-    model.load()
-    assert isinstance(model._llm, MockPipeline)
-
-    responses_stream = list(model.generate("Hello", generate_config={"stream": True}))
-    for i in range(5):
-        assert responses_stream[i]["choices"][0]["text"] == f"qwen_test_gen_{i}"
-
-    responses_non_stream = model.generate("Hello", generate_config={"stream": False})
-    assert responses_non_stream["choices"][0]["text"] == "qwen_test_gen"
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 01100ff4ca..9385b80e8e 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -1097,24 +1097,22 @@
     "model_description": "Qwen-chat is a fine-tuned version of the Qwen LLM trained with alignment techniques, specializing in chatting.",
     "model_specs": [
       {
-        "model_format": "ggmlv3",
+        "model_format": "ggufv2",
         "model_size_in_billions": 7,
         "quantizations": [
-          "q4_0"
+          "Q4_K_M"
         ],
-        "model_id": "Xorbits/qwen-chat-7B-ggml",
-        "model_file_name_template": "qwen7b-ggml-{quantization}.bin",
-        "model_revision": "19f87d09e5ad58333aeeb7daf4b83b055c7a309e"
+        "model_id": "Xorbits/Qwen-7B-Chat-GGUF",
+        "model_file_name_template": "Qwen-7B-Chat.{quantization}.gguf"
       },
       {
-        "model_format": "ggmlv3",
+        "model_format": "ggufv2",
         "model_size_in_billions": 14,
         "quantizations": [
-          "q4_0"
+          "Q4_K_M"
        ],
-        "model_id": "Xorbits/qwen-chat-14B-ggml",
-        "model_file_name_template": "qwen14b-ggml-{quantization}.bin",
-        "model_revision": "11efca556af372b6f3c730322a4962e9900a2990"
+        "model_id": "Xorbits/Qwen-14B-Chat-GGUF",
+        "model_file_name_template": "Qwen-14B-Chat.{quantization}.gguf"
       },
       {
         "model_format": "pytorch",
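With the spec switched to GGUF v2, qwen-chat is pulled from the `Xorbits/Qwen-7B-Chat-GGUF` and `Xorbits/Qwen-14B-Chat-GGUF` repos and served through llama-cpp-python. A hedged usage sketch against a running endpoint (the address is a placeholder, and the client calls assume the RESTful client API as documented; none of this is part of the patch):

```python
# Sketch only: assumes a local supervisor at 127.0.0.1:9997 and the documented
# RESTfulClient.launch_model / get_model signatures.
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="qwen-chat",
    model_format="ggufv2",       # previously "ggmlv3"
    model_size_in_billions=7,
    quantization="Q4_K_M",       # previously "q4_0"
)
model = client.get_model(model_uid)
print(model.chat("Briefly introduce the Qwen model family."))
```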
"model_id": "Xorbits/Qwen-14B-Chat-GGUF", + "model_file_name_template": "Qwen-14B-Chat.{quantization}.gguf" }, { "model_format": "pytorch", diff --git a/xinference/model/llm/llm_family.py b/xinference/model/llm/llm_family.py index 9fcff01180..44b7e6d2a4 100644 --- a/xinference/model/llm/llm_family.py +++ b/xinference/model/llm/llm_family.py @@ -510,20 +510,6 @@ def cache_from_modelscope( revision=llm_spec.model_revision, ) symlink_local_file(download_path, cache_dir, filename) - # need to download another file named "qwen.tiktoken" for qwen model - if "qwen" in llm_family.model_name: - tiktoken_path = retry_download( - model_file_download, - llm_family.model_name, - { - "model_size": llm_spec.model_size_in_billions, - "model_format": llm_spec.model_format, - }, - llm_spec.model_id, - "qwen.tiktoken", - revision=llm_spec.model_revision, - ) - symlink_local_file(tiktoken_path, cache_dir, "qwen.tiktoken") else: raise ValueError(f"Unsupported format: {llm_spec.model_format}") @@ -586,21 +572,6 @@ def cache_from_huggingface( local_dir=cache_dir, local_dir_use_symlinks=True, ) - # need to download another file named "qwen.tiktoken" for qwen model - if "qwen" in llm_family.model_name: - retry_download( - huggingface_hub.hf_hub_download, - llm_family.model_name, - { - "model_size": llm_spec.model_size_in_billions, - "model_format": llm_spec.model_format, - }, - llm_spec.model_id, - revision=llm_spec.model_revision, - filename="qwen.tiktoken", - local_dir=cache_dir, - local_dir_use_symlinks=True, - ) else: raise ValueError(f"Unsupported model format: {llm_spec.model_format}") diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index bf6d8a522d..5669bcbbeb 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -1345,26 +1345,24 @@ "model_description": "Qwen-chat is a fine-tuned version of the Qwen LLM trained with alignment techniques, specializing in chatting.", "model_specs": [ { - "model_format": "ggmlv3", + "model_format": "ggufv2", "model_size_in_billions": 7, "quantizations": [ - "q4_0" + "Q4_K_M" ], "model_hub": "modelscope", - "model_id": "Xorbits/qwen-chat-7B-ggml", - "model_file_name_template": "qwen7b-ggml-{quantization}.bin", - "model_revision": "v0.0.4" + "model_id": "Xorbits/Qwen-7B-Chat-GGUF", + "model_file_name_template": "Qwen-7B-Chat.{quantization}.gguf" }, { - "model_format": "ggmlv3", + "model_format": "ggufv2", "model_size_in_billions": 14, "quantizations": [ - "q4_0" + "Q4_K_M" ], "model_hub": "modelscope", - "model_id": "Xorbits/qwen-chat-14B-ggml", - "model_file_name_template": "qwen14b-ggml-{quantization}.bin", - "model_revision": "v0.0.2" + "model_id": "Xorbits/Qwen-14B-Chat-GGUF", + "model_file_name_template": "Qwen-14B-Chat.{quantization}.gguf" }, { "model_format": "pytorch", diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py index b4fc94df20..9c600c4785 100644 --- a/xinference/model/llm/utils.py +++ b/xinference/model/llm/utils.py @@ -436,6 +436,7 @@ def _tool_calls_completion(cls, model_name, model_uid, c, tools): content, func, args = cls._eval_qwen_chat_arguments(c, tools) else: raise Exception(f"Model {model_name} is not support tool calls.") + logger.debug("Tool call content: %s, func: %s, args: %s", content, func, args) if content: m = {"role": "assistant", "content": content, "tool_calls": []} diff --git a/xinference/model/utils.py b/xinference/model/utils.py index e1ee21d117..9ec82c6c4f 100644 --- 
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index e1ee21d117..9ec82c6c4f 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -80,11 +80,20 @@ def retry_download(
     *args,
     **kwargs,
 ):
+    last_ex = None
     for current_attempt in range(1, MAX_ATTEMPTS + 1):
         try:
             return download_func(*args, **kwargs)
-        except:
+        except Exception as e:
             remaining_attempts = MAX_ATTEMPTS - current_attempt
+            last_ex = e
+            logger.debug(
+                "Download failed: %s, download func: %s, download args: %s, kwargs: %s",
+                e,
+                download_func,
+                args,
+                kwargs,
+            )
             logger.warning(
                 f"Attempt {current_attempt} failed. Remaining attempts: {remaining_attempts}"
             )
@@ -101,11 +110,11 @@ def retry_download(
             f"Failed to download model '{model_name}' "
             f"(size: {model_size}, format: {model_format}) "
             f"after multiple retries"
-        )
+        ) from last_ex
     else:
         # Embedding models
         raise RuntimeError(
             f"Failed to download model '{model_name}' after multiple retries"
-        )
+        ) from last_ex

 def valid_model_revision(
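`retry_download` now records the last exception and chains it into the final `RuntimeError` via `raise ... from last_ex`, so the root cause stays visible in the traceback instead of being swallowed by the former bare `except`. A minimal standalone illustration of the pattern (function and logger names here are illustrative, not the module's API):

```python
import logging

logger = logging.getLogger(__name__)
MAX_ATTEMPTS = 3


def retry(download_func, *args, **kwargs):
    last_ex = None
    for current_attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            return download_func(*args, **kwargs)
        except Exception as e:
            last_ex = e
            logger.debug("Attempt %d failed: %s", current_attempt, e)
    # `from last_ex` preserves the original error as __cause__ in the traceback.
    raise RuntimeError("Failed to download after multiple retries") from last_ex
```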