diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index adcdb2bbf2..1d2e6ba6b2 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -102,7 +102,7 @@ jobs:
           MODULE: ${{ matrix.module }}
         if: ${{ matrix.module != 'gpu' }}
         run: |
-          pip install "llama-cpp-python>=0.2.0,<0.2.12"
+          pip install "llama-cpp-python>=0.2.23"
           pip install transformers
           pip install torch
           pip install accelerate
diff --git a/setup.cfg b/setup.cfg
index 03512d0144..db81cfe9e7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -73,7 +73,7 @@ dev =
 all =
     chatglm-cpp>=0.3.0
     ctransformers
-    llama-cpp-python>=0.2.0
+    llama-cpp-python>=0.2.23
     transformers>=4.34.1
     torch
     accelerate>=0.20.3
@@ -91,7 +91,7 @@ all =
     auto-gptq ; sys_platform!='darwin'
     optimum
 ggml =
-    llama-cpp-python>=0.2.0
+    llama-cpp-python>=0.2.23
     ctransformers
     chatglm-cpp>=0.3.0
 transformers =
diff --git a/xinference/core/tests/test_restful_api.py b/xinference/core/tests/test_restful_api.py
index 9101c7bf21..92cfe012c3 100644
--- a/xinference/core/tests/test_restful_api.py
+++ b/xinference/core/tests/test_restful_api.py
@@ -649,6 +649,7 @@ def test_restful_api_for_gorilla_openfunctions_tool_calls(
     "model_format, quantization",
     [
         ("pytorch", None),
+        ("ggufv2", "Q4_K_M"),
     ],
 )
 @pytest.mark.skip(reason="Cost too many resources.")
diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py
index 8e8ce93241..c30b1517fb 100644
--- a/xinference/model/llm/__init__.py
+++ b/xinference/model/llm/__init__.py
@@ -40,7 +40,6 @@ def _install():
     from .ggml.chatglm import ChatglmCppChatModel
     from .ggml.ctransformers import CtransformersModel
     from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
-    from .ggml.qwen import QWenModel
     from .pytorch.baichuan import BaichuanPytorchChatModel
     from .pytorch.chatglm import ChatglmPytorchChatModel
     from .pytorch.core import PytorchChatModel, PytorchModel
@@ -61,11 +60,6 @@ def _install():
             ChatglmCppChatModel,
         ]
     )
-    LLM_CLASSES.extend(
-        [
-            QWenModel,
-        ]
-    )
     LLM_CLASSES.extend(
         [
             CtransformersModel,
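The dependency floor moves from the old ggmlv3-era pin to `llama-cpp-python>=0.2.23`, which is what loads the new GGUF v2 Qwen builds. A minimal sanity check along these lines can catch a stale wheel before model load (this sketch assumes `llama_cpp` exposes `__version__`, as recent releases do, and that `packaging` is available; it is not part of the patch itself):

```python
# Hedged sketch: verify the installed llama-cpp-python meets the new floor above.
from packaging.version import Version

import llama_cpp

if Version(llama_cpp.__version__) < Version("0.2.23"):
    raise RuntimeError(
        f"llama-cpp-python {llama_cpp.__version__} is too old for the GGUF v2 Qwen specs; "
        "run `pip install -U 'llama-cpp-python>=0.2.23'`"
    )
```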
diff --git a/xinference/model/llm/ggml/llamacpp.py b/xinference/model/llm/ggml/llamacpp.py
index 006d93e970..a3e95bb545 100644
--- a/xinference/model/llm/ggml/llamacpp.py
+++ b/xinference/model/llm/ggml/llamacpp.py
@@ -14,7 +14,7 @@
 import datetime
 import logging
 import os
-from typing import Iterator, List, Optional, Union
+from typing import Iterable, Iterator, List, Optional, Union

 from ....types import (
     ChatCompletion,
@@ -272,7 +272,6 @@ def match(
             return False
         if (
             "chatglm" in llm_family.model_name
-            or "qwen" in llm_family.model_name
             or llm_family.model_name in CTRANSFORMERS_SUPPORTED_MODEL
         ):
             return False
@@ -306,6 +305,16 @@ def chat(
         full_prompt = self.get_prompt(prompt, chat_history, prompt_style, tools=tools)

         generate_config = self._sanitize_generate_config(generate_config)
+        # TODO(codingl2k1): qwen hacky to set stop for function call.
+        if tools and self.model_family.model_name == "qwen-chat":
+            stop = generate_config.get("stop")
+            if isinstance(stop, str):
+                generate_config["stop"] = [stop, "Observation:"]
+            elif isinstance(stop, Iterable):
+                assert not isinstance(stop, str)
+                generate_config["stop"] = stop + ["Observation:"]
+            else:
+                generate_config["stop"] = "Observation:"

         stream = generate_config.get("stream", False)
         if stream:
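The new block in `chat` merges the ReAct-style `Observation:` stop word into whatever `stop` value the caller supplied, so qwen-chat tool calls terminate before the model starts inventing an observation. A standalone sketch of the same normalization (the helper name is illustrative, not part of the codebase):

```python
from typing import Iterable, List, Optional, Union


def merge_observation_stop(
    stop: Optional[Union[str, Iterable[str]]]
) -> Union[str, List[str]]:
    # Mirrors the qwen-chat tool-call branch added above.
    if isinstance(stop, str):
        return [stop, "Observation:"]
    elif isinstance(stop, Iterable):
        return list(stop) + ["Observation:"]
    else:  # stop was not set at all
        return "Observation:"


assert merge_observation_stop(None) == "Observation:"
assert merge_observation_stop("###") == ["###", "Observation:"]
assert merge_observation_stop(["\n\n"]) == ["\n\n", "Observation:"]
```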
", - "You can install it by running the following command in the terminal:\n", - "pip install -U qwen_cpp\n\n", - "Or visit the original git repo if the above command fails:\n", - "https://github.com/QwenLM/qwen.cpp", - ] - - raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}") - - model_file_path = os.path.join( - self.model_path, - self.model_spec.model_file_name_template.format( - quantization=self.quantization - ), - ) - - tiktoken_path = os.path.join(os.path.dirname(model_file_path), "qwen.tiktoken") - assert os.path.exists(tiktoken_path) - - self._llm = qwen_cpp.Pipeline(model_file_path, tiktoken_path) - - @classmethod - def match( - cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str - ) -> bool: - if llm_spec.model_format != "ggmlv3": - return False - if "qwen" not in llm_family.model_name: - return False - if "chat" not in llm_family.model_ability: - return False - return True - - @staticmethod - def _convert_raw_text_chunks_to_chat( - tokens: Iterator[str], model_name: str - ) -> Iterator[ChatCompletionChunk]: - yield { - "id": "chat" + f"cmpl-{str(uuid.uuid4())}", - "model": model_name, - "object": "chat.completion.chunk", - "created": int(time.time()), - "choices": [ - { - "index": 0, - "delta": { - "role": "assistant", - }, - "finish_reason": None, - } - ], - } - for token in enumerate(tokens): - yield { - "id": "chat" + f"cmpl-{str(uuid.uuid4())}", - "model": model_name, - "object": "chat.completion.chunk", - "created": int(time.time()), - "choices": [ - { - "index": 0, - "delta": { - "content": token[1], - }, - "finish_reason": None, - } - ], - } - - @staticmethod - def _convert_raw_text_completion_to_chat( - text: str, model_name: str - ) -> ChatCompletion: - return { - "id": "chat" + f"cmpl-{str(uuid.uuid4())}", - "model": model_name, - "object": "chat.completion", - "created": int(time.time()), - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": text, - }, - "finish_reason": None, - } - ], - "usage": { - "prompt_tokens": -1, - "completion_tokens": -1, - "total_tokens": -1, - }, - } - - def chat( - self, - prompt: str, - chat_history: Optional[List[ChatCompletionMessage]] = None, - generate_config: Optional[QWenCppGenerateConfig] = None, - ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - if chat_history is not None: - chat_history_list = [message["content"] for message in chat_history] - else: - chat_history_list = [] - - chat_history_list.append(prompt) - logger.debug("Full conversation history:\n%s", str(chat_history_list)) - - generate_config = self._sanitize_generate_config(generate_config) - - params = { - "max_length": generate_config.get("max_tokens"), - "max_context_length": generate_config.get("max_tokens"), - "top_k": generate_config.get("top_k"), - "top_p": generate_config.get("top_p"), - "temperature": generate_config.get("temperature"), - "stream": generate_config.get("stream", False), - } - - # Remove None values to exclude missing keys from params - params = {k: v for k, v in params.items() if v is not None} - - assert self._llm is not None - - if generate_config["stream"]: - it = self._llm.chat( - chat_history_list, - **params, - ) - assert not isinstance(it, str) - return self._convert_raw_text_chunks_to_chat(it, self.model_uid) - else: - c = self._llm.chat( - chat_history_list, - **params, - ) - assert not isinstance(c, Iterator) - return self._convert_raw_text_completion_to_chat(c, self.model_uid) - - @staticmethod - def _convert_str_to_completion(data: str, model_name: 
-    @staticmethod
-    def _convert_raw_text_completion_to_chat(
-        text: str, model_name: str
-    ) -> ChatCompletion:
-        return {
-            "id": "chat" + f"cmpl-{str(uuid.uuid4())}",
-            "model": model_name,
-            "object": "chat.completion",
-            "created": int(time.time()),
-            "choices": [
-                {
-                    "index": 0,
-                    "message": {
-                        "role": "assistant",
-                        "content": text,
-                    },
-                    "finish_reason": None,
-                }
-            ],
-            "usage": {
-                "prompt_tokens": -1,
-                "completion_tokens": -1,
-                "total_tokens": -1,
-            },
-        }
-
-    def chat(
-        self,
-        prompt: str,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
-        generate_config: Optional[QWenCppGenerateConfig] = None,
-    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        if chat_history is not None:
-            chat_history_list = [message["content"] for message in chat_history]
-        else:
-            chat_history_list = []
-
-        chat_history_list.append(prompt)
-        logger.debug("Full conversation history:\n%s", str(chat_history_list))
-
-        generate_config = self._sanitize_generate_config(generate_config)
-
-        params = {
-            "max_length": generate_config.get("max_tokens"),
-            "max_context_length": generate_config.get("max_tokens"),
-            "top_k": generate_config.get("top_k"),
-            "top_p": generate_config.get("top_p"),
-            "temperature": generate_config.get("temperature"),
-            "stream": generate_config.get("stream", False),
-        }
-
-        # Remove None values to exclude missing keys from params
-        params = {k: v for k, v in params.items() if v is not None}
-
-        assert self._llm is not None
-
-        if generate_config["stream"]:
-            it = self._llm.chat(
-                chat_history_list,
-                **params,
-            )
-            assert not isinstance(it, str)
-            return self._convert_raw_text_chunks_to_chat(it, self.model_uid)
-        else:
-            c = self._llm.chat(
-                chat_history_list,
-                **params,
-            )
-            assert not isinstance(c, Iterator)
-            return self._convert_raw_text_completion_to_chat(c, self.model_uid)
-
-    @staticmethod
-    def _convert_str_to_completion(data: str, model_name: str) -> Completion:
-        return {
-            "id": "generate" + f"-{str(uuid.uuid4())}",
-            "model": model_name,
-            "object": "text_completion",
-            "created": int(time.time()),
-            "choices": [
-                {"index": 0, "text": data, "finish_reason": None, "logprobs": None}
-            ],
-            "usage": {
-                "prompt_tokens": -1,
-                "completion_tokens": -1,
-                "total_tokens": -1,
-            },
-        }
-
-    @staticmethod
-    def _convert_str_to_completion_chunk(
-        tokens: Iterator[str], model_name: str
-    ) -> Iterator[CompletionChunk]:
-        for token in tokens:
-            yield {
-                "id": "generate" + f"-{str(uuid.uuid4())}",
-                "model": model_name,
-                "object": "text_completion",
-                "created": int(time.time()),
-                "choices": [
-                    {"index": 0, "text": token, "finish_reason": None, "logprobs": None}
-                ],
-            }
-
-    def generate(
-        self,
-        prompt: str,
-        generate_config: Optional[QWenCppGenerateConfig] = None,
-    ) -> Union[Completion, Iterator[CompletionChunk]]:
-        logger.debug(f"Prompt for generate:\n{prompt}")
-
-        generate_config = self._sanitize_generate_config(generate_config)
-
-        params = {
-            "max_length": generate_config.get("max_tokens"),
-            "max_context_length": generate_config.get("max_tokens"),
-            "top_k": generate_config.get("top_k"),
-            "top_p": generate_config.get("top_p"),
-            "temperature": generate_config.get("temperature"),
-            "stream": generate_config.get("stream", False),
-        }
-
-        # Remove None values to exclude missing keys from params
-        params = {k: v for k, v in params.items() if v is not None}
-
-        assert self._llm is not None
-
-        # See source code in qwen.cpp
-        input_ids = self._get_input_ids_by_prompt(
-            prompt, params.get("max_context_length", 512)  # type: ignore
-        )
-
-        if generate_config["stream"]:
-            it = self._llm._generate(
-                input_ids,
-                **params,
-            )
-            assert not isinstance(it, str)
-            return self._convert_str_to_completion_chunk(it, self.model_uid)
-        else:
-            c = self._llm._generate(
-                input_ids,
-                **params,
-            )
-            assert not isinstance(c, Iterator)
-            return self._convert_str_to_completion(c, self.model_uid)
diff --git a/xinference/model/llm/ggml/tests/test_qwen.py b/xinference/model/llm/ggml/tests/test_qwen.py
deleted file mode 100644
index 550039fc37..0000000000
--- a/xinference/model/llm/ggml/tests/test_qwen.py
+++ /dev/null
@@ -1,228 +0,0 @@
-# Copyright 2022-2023 XProbe Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import random
-import string
-from typing import Any, List
-
-import pytest
-
-from ...ggml.qwen import QWenModel
-from ...llm_family import GgmlLLMSpecV1, LLMFamilyV1
-
-
-class MockPipeline:
-    def __init__(self) -> None:
-        pass
-
-    def chat(self, *args, **kwargs) -> Any:
-        stream = kwargs.get("stream", False)
-        res = (
-            "qwen_test_chat"
-            if not stream
-            else iter([f"qwen_test_chat_{i}" for i in range(5)])
-        )
-        return res
-
-    def _generate(self, *args, **kwargs) -> Any:
-        stream = kwargs.get("stream", False)
-        res = (
-            "qwen_test_gen"
-            if not stream
-            else iter([f"qwen_test_gen_{i}" for i in range(5)])
-        )
-        return res
-
-
-class MockChatglmCppChatModel(QWenModel):
-    def load(self):
-        self._llm = MockPipeline()
-
-    def _get_input_ids_by_prompt(
-        self, prompt: str, max_context_length: int
-    ) -> List[int]:
-        return []
-
-
-mock_model_spec = GgmlLLMSpecV1(
-    model_format="ggmlv3",
-    model_size_in_billions=7,
-    quantizations=["q4_0"],
-    model_id="test_id",
-    model_file_name_template="qwen7b-ggml-{quantization}.bin",
-)
-
-serialized = """{
-   "version":1,
-   "context_length":2048,
-   "model_name":"TestModel",
-   "model_lang":[
-      "en"
-   ],
-   "model_ability":[
-      "embed", "generate"
-   ],
-   "model_specs":[
-      {
-         "model_format":"ggmlv3",
-         "model_size_in_billions":7,
-         "quantizations": ["q4_0"],
-         "model_id":"test_id",
-         "model_file_name_template":"qwen7b-ggml-{quantization}.bin"
-      },
-      {
-         "model_format":"pytorch",
-         "model_size_in_billions":3,
-         "quantizations": ["int8", "int4", "none"],
-         "model_id":"example/TestModel"
-      }
-   ],
-   "prompt_style": {
-      "style_name": "ADD_COLON_SINGLE",
-      "system_prompt": "TEST",
-      "roles": ["user", "assistant"],
-      "intra_message_sep": "\\n### ",
-      "inter_message_sep": "\\n### ",
-      "stop": null,
-      "stop_token_ids": null
-   }
-}"""
-
-mock_model_family = LLMFamilyV1.parse_raw(serialized)
-
-
-@pytest.mark.parametrize(
-    "model_spec, model_family", [(mock_model_spec, mock_model_family)]
-)
-def test_model_init(model_spec, model_family):
-    quantization = "q4_0"
-    uid = "".join(random.choice(string.digits) for i in range(100))
-    path = "".join(
-        random.choice(string.ascii_letters + string.punctuation) for i in range(100)
-    )
-    model = MockChatglmCppChatModel(
-        model_uid=uid,
-        model_family=model_family,
-        model_spec=model_spec,
-        quantization=quantization,
-        model_path=path,
-    )
-
-    assert model.model_uid == uid
-    assert model.quantization == quantization
-    assert model.model_path == path
-
-    assert isinstance(model.model_spec, GgmlLLMSpecV1)
-    assert isinstance(model.model_family, LLMFamilyV1)
-    assert isinstance(model.model_family.model_specs[0], GgmlLLMSpecV1)
-
-    assert (
-        model.model_family.model_specs[0].model_format == model.model_spec.model_format
-    )
-    assert model.model_family.model_specs[0].model_format == model_spec.model_format
-    assert (
-        model.model_family.model_specs[0].model_size_in_billions
-        == model.model_spec.model_size_in_billions
-    )
-    assert (
-        model.model_family.model_specs[0].model_size_in_billions
-        == model_spec.model_size_in_billions
-    )
-    assert (
-        model.model_family.model_specs[0].quantizations
-        == model.model_spec.quantizations
-    )
-    assert model.model_family.model_specs[0].quantizations == model_spec.quantizations
-    assert model.model_family.model_specs[0].model_id == model.model_spec.model_id
-    assert model.model_family.model_specs[0].model_id == model_spec.model_id
-    assert (
-        model.model_family.model_specs[0].model_file_name_template
-        == model.model_spec.model_file_name_template
-    )
-    assert (
-        model.model_family.model_specs[0].model_file_name_template
-        == model_spec.model_file_name_template
-    )
-    assert model.model_family.model_specs[0].model_uri == model.model_spec.model_uri
-    assert model.model_family.model_specs[0].model_uri == model_spec.model_uri
-
-    assert model._llm is None
-    assert model._model_config is None
-    model._model_config = model._sanitize_generate_config(None)
-    assert not model._model_config["stream"]
-
-
-@pytest.mark.parametrize(
-    "model_spec, model_family", [(mock_model_spec, mock_model_family)]
-)
-def test_model_chat(model_spec, model_family):
-    quantization = "q4_0"
-    uid = "".join(random.choice(string.digits) for i in range(100))
-    path = "".join(
-        random.choice(string.ascii_letters + string.punctuation) for i in range(100)
-    )
-    model = MockChatglmCppChatModel(
-        model_uid=uid,
-        model_family=model_family,
-        model_spec=model_spec,
-        quantization=quantization,
-        model_path=path,
-    )
-
-    assert model._llm is None
-
-    model.load()
-    assert isinstance(model._llm, MockPipeline)
-
-    responses_stream = list(model.chat("Hello", generate_config={"stream": True}))
-    assert responses_stream[0]["choices"][0]["delta"] == {"role": "assistant"}
-    for i in range(3):
-        assert responses_stream[i + 1]["choices"][0]["delta"] == {
-            "content": f"qwen_test_chat_{i}"
-        }
-
-    responses_non_stream = model.chat("Hello", generate_config={"stream": False})
-    assert responses_non_stream["choices"][0]["message"] == {
-        "role": "assistant",
-        "content": "qwen_test_chat",
-    }
-
-
-@pytest.mark.parametrize(
-    "model_spec, model_family", [(mock_model_spec, mock_model_family)]
-)
-def test_model_generate(model_spec, model_family):
-    quantization = "q4_0"
-    uid = "".join(random.choice(string.digits) for i in range(100))
-    path = "".join(
-        random.choice(string.ascii_letters + string.punctuation) for i in range(100)
-    )
-    model = MockChatglmCppChatModel(
-        model_uid=uid,
-        model_family=model_family,
-        model_spec=model_spec,
-        quantization=quantization,
-        model_path=path,
-    )
-    assert model._llm is None
-
-    model.load()
-    assert isinstance(model._llm, MockPipeline)
-
-    responses_stream = list(model.generate("Hello", generate_config={"stream": True}))
-    for i in range(5):
-        assert responses_stream[i]["choices"][0]["text"] == f"qwen_test_gen_{i}"
-
-    responses_non_stream = model.generate("Hello", generate_config={"stream": False})
-    assert responses_non_stream["choices"][0]["text"] == "qwen_test_gen"
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 01100ff4ca..9385b80e8e 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -1097,24 +1097,22 @@
     "model_description": "Qwen-chat is a fine-tuned version of the Qwen LLM trained with alignment techniques, specializing in chatting.",
     "model_specs": [
       {
-        "model_format": "ggmlv3",
+        "model_format": "ggufv2",
         "model_size_in_billions": 7,
         "quantizations": [
-          "q4_0"
+          "Q4_K_M"
         ],
-        "model_id": "Xorbits/qwen-chat-7B-ggml",
-        "model_file_name_template": "qwen7b-ggml-{quantization}.bin",
-        "model_revision": "19f87d09e5ad58333aeeb7daf4b83b055c7a309e"
+        "model_id": "Xorbits/Qwen-7B-Chat-GGUF",
+        "model_file_name_template": "Qwen-7B-Chat.{quantization}.gguf"
       },
       {
-        "model_format": "ggmlv3",
+        "model_format": "ggufv2",
         "model_size_in_billions": 14,
         "quantizations": [
-          "q4_0"
+          "Q4_K_M"
        ],
-        "model_id": "Xorbits/qwen-chat-14B-ggml",
-        "model_file_name_template": "qwen14b-ggml-{quantization}.bin",
-        "model_revision": "11efca556af372b6f3c730322a4962e9900a2990"
+        "model_id": "Xorbits/Qwen-14B-Chat-GGUF",
+        "model_file_name_template": "Qwen-14B-Chat.{quantization}.gguf"
       },
       {
         "model_format": "pytorch",
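With the spec switched to GGUF v2, qwen-chat is pulled from the `Xorbits/Qwen-7B-Chat-GGUF` and `Xorbits/Qwen-14B-Chat-GGUF` repos and served through llama-cpp-python. A hedged usage sketch against a running endpoint (the address is a placeholder, and the client calls assume the RESTful client API as documented; none of this is part of the patch):

```python
# Sketch only: assumes a local supervisor at 127.0.0.1:9997 and the documented
# RESTfulClient.launch_model / get_model signatures.
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="qwen-chat",
    model_format="ggufv2",       # previously "ggmlv3"
    model_size_in_billions=7,
    quantization="Q4_K_M",       # previously "q4_0"
)
model = client.get_model(model_uid)
print(model.chat("Briefly introduce the Qwen model family."))
```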
"model_id": "Xorbits/Qwen-14B-Chat-GGUF", + "model_file_name_template": "Qwen-14B-Chat.{quantization}.gguf" }, { "model_format": "pytorch", diff --git a/xinference/model/llm/llm_family.py b/xinference/model/llm/llm_family.py index 9fcff01180..44b7e6d2a4 100644 --- a/xinference/model/llm/llm_family.py +++ b/xinference/model/llm/llm_family.py @@ -510,20 +510,6 @@ def cache_from_modelscope( revision=llm_spec.model_revision, ) symlink_local_file(download_path, cache_dir, filename) - # need to download another file named "qwen.tiktoken" for qwen model - if "qwen" in llm_family.model_name: - tiktoken_path = retry_download( - model_file_download, - llm_family.model_name, - { - "model_size": llm_spec.model_size_in_billions, - "model_format": llm_spec.model_format, - }, - llm_spec.model_id, - "qwen.tiktoken", - revision=llm_spec.model_revision, - ) - symlink_local_file(tiktoken_path, cache_dir, "qwen.tiktoken") else: raise ValueError(f"Unsupported format: {llm_spec.model_format}") @@ -586,21 +572,6 @@ def cache_from_huggingface( local_dir=cache_dir, local_dir_use_symlinks=True, ) - # need to download another file named "qwen.tiktoken" for qwen model - if "qwen" in llm_family.model_name: - retry_download( - huggingface_hub.hf_hub_download, - llm_family.model_name, - { - "model_size": llm_spec.model_size_in_billions, - "model_format": llm_spec.model_format, - }, - llm_spec.model_id, - revision=llm_spec.model_revision, - filename="qwen.tiktoken", - local_dir=cache_dir, - local_dir_use_symlinks=True, - ) else: raise ValueError(f"Unsupported model format: {llm_spec.model_format}") diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index bf6d8a522d..5669bcbbeb 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -1345,26 +1345,24 @@ "model_description": "Qwen-chat is a fine-tuned version of the Qwen LLM trained with alignment techniques, specializing in chatting.", "model_specs": [ { - "model_format": "ggmlv3", + "model_format": "ggufv2", "model_size_in_billions": 7, "quantizations": [ - "q4_0" + "Q4_K_M" ], "model_hub": "modelscope", - "model_id": "Xorbits/qwen-chat-7B-ggml", - "model_file_name_template": "qwen7b-ggml-{quantization}.bin", - "model_revision": "v0.0.4" + "model_id": "Xorbits/Qwen-7B-Chat-GGUF", + "model_file_name_template": "Qwen-7B-Chat.{quantization}.gguf" }, { - "model_format": "ggmlv3", + "model_format": "ggufv2", "model_size_in_billions": 14, "quantizations": [ - "q4_0" + "Q4_K_M" ], "model_hub": "modelscope", - "model_id": "Xorbits/qwen-chat-14B-ggml", - "model_file_name_template": "qwen14b-ggml-{quantization}.bin", - "model_revision": "v0.0.2" + "model_id": "Xorbits/Qwen-14B-Chat-GGUF", + "model_file_name_template": "Qwen-14B-Chat.{quantization}.gguf" }, { "model_format": "pytorch", diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py index b4fc94df20..9c600c4785 100644 --- a/xinference/model/llm/utils.py +++ b/xinference/model/llm/utils.py @@ -436,6 +436,7 @@ def _tool_calls_completion(cls, model_name, model_uid, c, tools): content, func, args = cls._eval_qwen_chat_arguments(c, tools) else: raise Exception(f"Model {model_name} is not support tool calls.") + logger.debug("Tool call content: %s, func: %s, args: %s", content, func, args) if content: m = {"role": "assistant", "content": content, "tool_calls": []} diff --git a/xinference/model/utils.py b/xinference/model/utils.py index e1ee21d117..9ec82c6c4f 100644 --- 
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index e1ee21d117..9ec82c6c4f 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -80,11 +80,20 @@ def retry_download(
     *args,
     **kwargs,
 ):
+    last_ex = None
     for current_attempt in range(1, MAX_ATTEMPTS + 1):
         try:
             return download_func(*args, **kwargs)
-        except:
+        except Exception as e:
             remaining_attempts = MAX_ATTEMPTS - current_attempt
+            last_ex = e
+            logger.debug(
+                "Download failed: %s, download func: %s, download args: %s, kwargs: %s",
+                e,
+                download_func,
+                args,
+                kwargs,
+            )
             logger.warning(
                 f"Attempt {current_attempt} failed. Remaining attempts: {remaining_attempts}"
             )
@@ -101,11 +110,11 @@ def retry_download(
             f"Failed to download model '{model_name}' "
             f"(size: {model_size}, format: {model_format}) "
             f"after multiple retries"
-        )
+        ) from last_ex
     else:
         # Embedding models
         raise RuntimeError(
             f"Failed to download model '{model_name}' after multiple retries"
-        )
+        ) from last_ex

 def valid_model_revision(
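`retry_download` now records the last exception and chains it into the final `RuntimeError` via `raise ... from last_ex`, so the root cause stays visible in the traceback instead of being swallowed by the former bare `except`. A minimal standalone illustration of the pattern (function and logger names here are illustrative, not the module's API):

```python
import logging

logger = logging.getLogger(__name__)
MAX_ATTEMPTS = 3


def retry(download_func, *args, **kwargs):
    last_ex = None
    for current_attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            return download_func(*args, **kwargs)
        except Exception as e:
            last_ex = e
            logger.debug("Attempt %d failed: %s", current_attempt, e)
    # `from last_ex` preserves the original error as __cause__ in the traceback.
    raise RuntimeError("Failed to download after multiple retries") from last_ex
```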