diff --git a/doc/source/getting_started/installation.rst b/doc/source/getting_started/installation.rst
index c82f2698c8..a92e3d5e93 100644
--- a/doc/source/getting_started/installation.rst
+++ b/doc/source/getting_started/installation.rst
@@ -58,6 +58,7 @@ Currently, supported models include:
 - ``codegeex4``
 - ``qwen1.5-chat``, ``qwen1.5-moe-chat``
 - ``qwen2-instruct``, ``qwen2-moe-instruct``
+- ``QwQ-32B-Preview``
 - ``gemma-it``, ``gemma-2-it``
 - ``orion-chat``, ``orion-chat-rag``
 - ``c4ai-command-r-v01``
diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst
index 34018b8ffe..4d55123965 100644
--- a/doc/source/models/builtin/llm/index.rst
+++ b/doc/source/models/builtin/llm/index.rst
@@ -446,6 +446,11 @@ The following is a list of built-in LLM in Xinference:
      - 32768
      - Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.
 
+   * - :ref:`qwq-32b-preview <models_llm_qwq-32b-preview>`
+     - chat
+     - 32768
+     - QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities.
+
    * - :ref:`seallm_v2 <models_llm_seallm_v2>`
      - generate
      - 8192
@@ -723,6 +728,8 @@ The following is a list of built-in LLM in Xinference:
 
    qwen2.5-instruct
 
+   qwq-32b-preview
+
    seallm_v2
 
    seallm_v2.5
diff --git a/doc/source/models/builtin/llm/qwq-32b-preview.rst b/doc/source/models/builtin/llm/qwq-32b-preview.rst
new file mode 100644
index 0000000000..e8c62f6add
--- /dev/null
+++ b/doc/source/models/builtin/llm/qwq-32b-preview.rst
@@ -0,0 +1,111 @@
+.. _models_llm_qwq-32b-preview:
+
+========================================
+QwQ-32B-Preview
+========================================
+
+- **Context Length:** 32768
+- **Model Name:** QwQ-32B-Preview
+- **Languages:** en, zh
+- **Abilities:** chat
+- **Description:** QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 32 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 32
+- **Quantizations:** 4-bit, 8-bit, none
+- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang only available for quantization none)
+- **Model ID:** Qwen/QwQ-32B-Preview
+- **Model Hubs**:  `Hugging Face <https://huggingface.co/Qwen/QwQ-32B-Preview>`__, `ModelScope <https://modelscope.cn/models/Qwen/QwQ-32B-Preview>`__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name QwQ-32B-Preview --size-in-billions 32 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (awq, 32 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 32
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers, SGLang
+- **Model ID:** KirillR/QwQ-32B-Preview-AWQ
+- **Model Hubs**:  `Hugging Face <https://huggingface.co/KirillR/QwQ-32B-Preview-AWQ>`__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name QwQ-32B-Preview --size-in-billions 32 --model-format awq --quantization ${quantization}
+
+
+Model Spec 3 (ggufv2, 32 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 32
+- **Quantizations:** Q3_K_L, Q4_K_M, Q6_K, Q8_0
+- **Engines**: llama.cpp
+- **Model ID:** lmstudio-community/QwQ-32B-Preview-GGUF
+- **Model Hubs**:  `Hugging Face <https://huggingface.co/lmstudio-community/QwQ-32B-Preview-GGUF>`__, `ModelScope <https://modelscope.cn/models/AI-ModelScope/QwQ-32B-Preview-GGUF>`__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name QwQ-32B-Preview --size-in-billions 32 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 4 (mlx, 32 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 32
+- **Quantizations:** 4-bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen_QwQ-32B-Preview_MLX-4bit
+- **Model Hubs**:  `Hugging Face <https://huggingface.co/mlx-community/Qwen_QwQ-32B-Preview_MLX-4bit>`__, `ModelScope <https://modelscope.cn/models/okwinds/QwQ-32B-Preview-MLX-4bit>`__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name QwQ-32B-Preview --size-in-billions 32 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 5 (mlx, 32 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 32
+- **Quantizations:** 8-bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/Qwen_QwQ-32B-Preview_MLX-8bit
+- **Model Hubs**:  `Hugging Face <https://huggingface.co/mlx-community/Qwen_QwQ-32B-Preview_MLX-8bit>`__, `ModelScope <https://modelscope.cn/models/okwinds/QwQ-32B-Preview-MLX-8bit>`__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name QwQ-32B-Preview --size-in-billions 32 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 6 (mlx, 32 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 32
+- **Quantizations:** none
+- **Engines**: MLX
+- **Model ID:** mlx-community/QwQ-32B-Preview-bf16
+- **Model Hubs**:  `Hugging Face <https://huggingface.co/mlx-community/QwQ-32B-Preview-bf16>`__, `ModelScope <https://modelscope.cn/models/mlx-community/QwQ-32B-Preview-bf16>`__
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name QwQ-32B-Preview --size-in-billions 32 --model-format mlx --quantization ${quantization}
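Once launched, QwQ-32B-Preview behaves like any other Xinference chat model. Below is a minimal sketch using the Python client; it assumes a local server at the default ``http://localhost:9997``, the Transformers engine, and a recent client API in which ``chat`` accepts OpenAI-style ``messages`` (details may vary across Xinference versions)::

   from xinference.client import Client

   client = Client("http://localhost:9997")

   # Programmatic equivalent of the ``xinference launch`` command above.
   model_uid = client.launch_model(
       model_engine="transformers",
       model_name="QwQ-32B-Preview",
       model_format="pytorch",
       model_size_in_billions=32,
       quantization="none",
   )

   model = client.get_model(model_uid)
   # QwQ is a reasoning model; leave generous room for its chain of thought.
   response = model.chat(
       messages=[{"role": "user", "content": "How many r's are in the word 'strawberry'?"}],
       generate_config={"max_tokens": 2048},
   )
   print(response["choices"][0]["message"]["content"])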
diff --git a/doc/source/user_guide/backends.rst b/doc/source/user_guide/backends.rst
index b8a669e0fb..4f4c09f4cd 100644
--- a/doc/source/user_guide/backends.rst
+++ b/doc/source/user_guide/backends.rst
@@ -65,6 +65,7 @@ Currently, supported model includes:
 - ``codegeex4``
 - ``qwen1.5-chat``, ``qwen1.5-moe-chat``
 - ``qwen2-instruct``, ``qwen2-moe-instruct``
+- ``QwQ-32B-Preview``
 - ``gemma-it``, ``gemma-2-it``
 - ``orion-chat``, ``orion-chat-rag``
 - ``c4ai-command-r-v01``
diff --git a/setup.cfg b/setup.cfg
index 4066032215..72385a1fa6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -24,7 +24,7 @@ zip_safe = False
 include_package_data = True
 packages = find:
 install_requires =
-    xoscar>=0.3.0
+    xoscar>=0.3.0,<0.4.2
     torch
     gradio
     pillow
@@ -39,7 +39,7 @@ install_requires =
     typing_extensions
     modelscope>=1.10.0
     sse_starlette>=1.6.5  # ensure_bytes API break change: https://github.com/sysid/sse-starlette/issues/65
-    openai>1  # For typing
+    openai>=1.40.0  # For typing
     python-jose[cryptography]
     passlib[bcrypt]
     aioprometheus[starlette]>=23.12.0
@@ -71,7 +71,7 @@ dev =
     jieba>=0.42.0
     flake8>=3.8.0
     black
-    openai>1
+    openai>=1.40.0
     langchain
     langchain-community
     orjson
@@ -121,7 +121,7 @@ all =
     gdown  # For CosyVoice, matcha
     pyarrow  # For CosyVoice, matcha
     HyperPyYAML  # For CosyVoice
-    onnxruntime==1.16.0  # For CosyVoice, use onnxruntime-gpu==1.16.0 if possible
+    onnxruntime>=1.16.0  # For CosyVoice, use onnxruntime-gpu==1.16.0 if possible
     boto3>=1.28.55,<1.28.65  # For tensorizer
     tensorizer~=2.9.0
     eva-decord  # For video in VL
@@ -209,7 +209,7 @@ audio =
     gdown  # For CosyVoice, matcha
     pyarrow  # For CosyVoice, matcha
     HyperPyYAML  # For CosyVoice
-    onnxruntime==1.16.0  # For CosyVoice, use onnxruntime-gpu==1.16.0 if possible
+    onnxruntime>=1.16.0  # For CosyVoice, use onnxruntime-gpu==1.16.0 if possible
     loguru  # For Fish Speech
     natsort  # For Fish Speech
     loralib  # For Fish Speech
diff --git a/xinference/deploy/docker/requirements.txt b/xinference/deploy/docker/requirements.txt
index 79d4e2defd..b968aeb476 100644
--- a/xinference/deploy/docker/requirements.txt
+++ b/xinference/deploy/docker/requirements.txt
@@ -1,5 +1,5 @@
 # required
-xoscar>=0.3.0
+xoscar>=0.3.0,<0.4.2
 gradio==4.26.0
 typer[all]<0.12.0  # fix typer required by gradio
 pillow
@@ -14,7 +14,7 @@ huggingface-hub>=0.19.4
 typing_extensions
 modelscope>=1.10.0
 sse_starlette>=1.6.5  # ensure_bytes API break change: https://github.com/sysid/sse-starlette/issues/65
-openai>1  # For typing
+openai>=1.40.0  # For typing
 python-jose[cryptography]
 passlib[bcrypt]
 aioprometheus[starlette]>=23.12.0
@@ -25,7 +25,7 @@ opencv-contrib-python-headless
 setproctitle
 
 # all
-transformers>=4.43.2
+transformers>=4.45.0
 accelerate>=0.28.0
 sentencepiece
 transformers_stream_generator
diff --git a/xinference/deploy/docker/requirements_cpu.txt b/xinference/deploy/docker/requirements_cpu.txt
index 4105a2e709..8fcb24711d 100644
--- a/xinference/deploy/docker/requirements_cpu.txt
+++ b/xinference/deploy/docker/requirements_cpu.txt
@@ -1,4 +1,4 @@
-xoscar>=0.3.0
+xoscar>=0.3.0,<0.4.2
 gradio==4.26.0
 typer[all]<0.12.0
 pillow
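As a quick sanity check of the tightened pins above, an existing environment can be verified against the new constraints. A small sketch using ``packaging`` (the distribution names and constraints are copied from ``setup.cfg``; adjust if the pins move again)::

   from importlib.metadata import PackageNotFoundError, version
   from packaging.specifiers import SpecifierSet

   # Constraints introduced or tightened by this change.
   pins = {
       "xoscar": SpecifierSet(">=0.3.0,<0.4.2"),
       "openai": SpecifierSet(">=1.40.0"),
       "transformers": SpecifierSet(">=4.45.0"),
       "onnxruntime": SpecifierSet(">=1.16.0"),
   }

   for name, spec in pins.items():
       try:
           installed = version(name)
           status = "ok" if installed in spec else f"violates {spec}"
           print(f"{name} {installed}: {status}")
       except PackageNotFoundError:
           print(f"{name}: not installed (required {spec})")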
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 21701b9031..93da847654 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -8516,5 +8516,85 @@
         "<|im_start|>",
         "<|im_end|>"
       ]
-    }
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "QwQ-32B-Preview",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "Qwen/QwQ-32B-Preview"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "KirillR/QwQ-32B-Preview-AWQ"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "Q3_K_L",
+                    "Q4_K_M",
+                    "Q6_K",
+                    "Q8_0"
+                ],
+                "model_id": "lmstudio-community/QwQ-32B-Preview-GGUF",
+                "model_file_name_template": "QwQ-32B-Preview-{quantization}.gguf"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "4-bit"
+                ],
+                "model_id": "mlx-community/Qwen_QwQ-32B-Preview_MLX-4bit"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "8-bit"
+                ],
+                "model_id": "mlx-community/Qwen_QwQ-32B-Preview_MLX-8bit"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "none"
+                ],
+                "model_id": "mlx-community/QwQ-32B-Preview-bf16"
+            }
+        ],
+        "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+        "stop_token_ids": [
+            151643,
+            151644,
+            151645
+        ],
+        "stop": [
+            "<|endoftext|>",
+            "<|im_start|>",
+            "<|im_end|>"
+        ]
+    }
 ]
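The ``chat_template`` registered above is the standard Qwen2.5 ChatML template, including the ``<tools>``/``<tool_call>`` tool-calling protocol. To inspect the prompt it produces, the template can be rendered directly with Jinja2 (a sketch; Xinference performs this rendering internally, and the registry path is relative to the repository root)::

   import json

   from jinja2 import Template

   with open("xinference/model/llm/llm_family.json") as f:
       families = json.load(f)
   entry = next(e for e in families if e["model_name"] == "QwQ-32B-Preview")

   prompt = Template(entry["chat_template"]).render(
       messages=[{"role": "user", "content": "Hello!"}],
       tools=None,
       add_generation_prompt=True,
   )
   print(prompt)
   # <|im_start|>system
   # You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
   # <|im_start|>user
   # Hello!<|im_end|>
   # <|im_start|>assistant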
diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index f8598d3602..d225b2abcb 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -6267,5 +6267,72 @@
         "<|im_start|>",
         "<|im_end|>"
       ]
-    }
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "QwQ-32B-Preview",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "Qwen/QwQ-32B-Preview",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "4-bit"
+                ],
+                "model_id": "okwinds/QwQ-32B-Preview-MLX-4bit",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "8-bit"
+                ],
+                "model_id": "okwinds/QwQ-32B-Preview-MLX-8bit",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "Q3_K_L",
+                    "Q4_K_M",
+                    "Q6_K",
+                    "Q8_0"
+                ],
+                "model_id": "AI-ModelScope/QwQ-32B-Preview-GGUF",
+                "model_file_name_template": "QwQ-32B-Preview-{quantization}.gguf"
+            }
+        ],
+        "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+        "stop_token_ids": [
+            151643,
+            151644,
+            151645
+        ],
+        "stop": [
+            "<|endoftext|>",
+            "<|im_start|>",
+            "<|im_end|>"
+        ]
+    }
 ]
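The ModelScope entries mirror the Hugging Face registration so the weights can be fetched from ModelScope instead. The hub is chosen by the process that downloads the model, typically via the ``XINFERENCE_MODEL_SRC`` environment variable on the server side. A sketch, assuming the server was started with that variable set::

   # Server side, before launching any model:
   #   XINFERENCE_MODEL_SRC=modelscope xinference-local --host 0.0.0.0 --port 9997
   from xinference.client import Client

   client = Client("http://localhost:9997")
   # With the registration above, the GGUF weights now resolve to
   # AI-ModelScope/QwQ-32B-Preview-GGUF on ModelScope.
   model_uid = client.launch_model(
       model_engine="llama.cpp",
       model_name="QwQ-32B-Preview",
       model_format="ggufv2",
       model_size_in_billions=32,
       quantization="Q4_K_M",
   )
   print(model_uid)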
diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py
index a413f2ad0f..016103a867 100644
--- a/xinference/model/llm/sglang/core.py
+++ b/xinference/model/llm/sglang/core.py
@@ -89,6 +89,7 @@ class SGLANGGenerateConfig(TypedDict, total=False):
     "deepseek-v2-chat-0628",
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
+    "QwQ-32B-Preview",
 ]
diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index 89e14ae496..bafd84ab1f 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -153,6 +153,7 @@ class VLLMGenerateConfig(TypedDict, total=False):
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct")
     VLLM_SUPPORTED_MODELS.append("qwen2.5-coder")
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
+    VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
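With the model registered for the vLLM and SGLang backends, a launched QwQ-32B-Preview instance is also reachable through Xinference's OpenAI-compatible endpoint. A sketch using the ``openai`` client pinned above (``api_key`` can be any non-empty string on an unsecured server; the ``model`` value is the UID returned by ``xinference launch``)::

   import openai

   client = openai.Client(api_key="not empty", base_url="http://localhost:9997/v1")
   response = client.chat.completions.create(
       model="QwQ-32B-Preview",  # replace with the model UID returned at launch time
       messages=[{"role": "user", "content": "Which is larger, 9.11 or 9.8?"}],
       max_tokens=1024,
   )
   print(response.choices[0].message.content)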