FEAT: support QwQ-32B-Preview (#2602)
qinxuye authored Nov 29, 2024
1 parent 23f09f9 commit 7e87d14
Showing 11 changed files with 278 additions and 9 deletions.
1 change: 1 addition & 0 deletions doc/source/getting_started/installation.rst
@@ -58,6 +58,7 @@ Currently, supported models include:
- ``codegeex4``
- ``qwen1.5-chat``, ``qwen1.5-moe-chat``
- ``qwen2-instruct``, ``qwen2-moe-instruct``
- ``QwQ-32B-Preview``
- ``gemma-it``, ``gemma-2-it``
- ``orion-chat``, ``orion-chat-rag``
- ``c4ai-command-r-v01``
7 changes: 7 additions & 0 deletions doc/source/models/builtin/llm/index.rst
@@ -446,6 +446,11 @@ The following is a list of built-in LLM in Xinference:
- 32768
- Qwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters.

* - :ref:`qwq-32b-preview <models_llm_qwq-32b-preview>`
- chat
- 32768
- QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities.

* - :ref:`seallm_v2 <models_llm_seallm_v2>`
- generate
- 8192
@@ -723,6 +728,8 @@ The following is a list of built-in LLM in Xinference:

qwen2.5-instruct

qwq-32b-preview

seallm_v2

seallm_v2.5
111 changes: 111 additions & 0 deletions doc/source/models/builtin/llm/qwq-32b-preview.rst
@@ -0,0 +1,111 @@
.. _models_llm_qwq-32b-preview:

========================================
QwQ-32B-Preview
========================================

- **Context Length:** 32768
- **Model Name:** QwQ-32B-Preview
- **Languages:** en, zh
- **Abilities:** chat
- **Description:** QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities.

Specifications
^^^^^^^^^^^^^^


Model Spec 1 (pytorch, 32 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 32
- **Quantizations:** 4-bit, 8-bit, none
- **Engines**: vLLM, Transformers, SGLang (vLLM and SGLang are only available with quantization ``none``)
- **Model ID:** Qwen/QwQ-32B-Preview
- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/QwQ-32B-Preview>`__, `ModelScope <https://modelscope.cn/models/Qwen/QwQ-32B-Preview>`__

Execute the following command to launch the model; replace ``${engine}`` with one of the engines listed above and ``${quantization}`` with your chosen quantization method::

xinference launch --model-engine ${engine} --model-name QwQ-32B-Preview --size-in-billions 32 --model-format pytorch --quantization ${quantization}
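
For example, to serve the unquantized weights on the vLLM engine (which, per the note above, requires quantization ``none``)::

xinference launch --model-engine vLLM --model-name QwQ-32B-Preview --size-in-billions 32 --model-format pytorch --quantization none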


Model Spec 2 (awq, 32 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** awq
- **Model Size (in billions):** 32
- **Quantizations:** Int4
- **Engines**: vLLM, Transformers, SGLang
- **Model ID:** KirillR/QwQ-32B-Preview-AWQ
- **Model Hubs**: `Hugging Face <https://huggingface.co/KirillR/QwQ-32B-Preview-AWQ>`__

Execute the following command to launch the model; replace ``${engine}`` with one of the engines listed above and ``${quantization}`` with your chosen quantization method::

xinference launch --model-engine ${engine} --model-name QwQ-32B-Preview --size-in-billions 32 --model-format awq --quantization ${quantization}


Model Spec 3 (ggufv2, 32 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** ggufv2
- **Model Size (in billions):** 32
- **Quantizations:** Q3_K_L, Q4_K_M, Q6_K, Q8_0
- **Engines**: llama.cpp
- **Model ID:** lmstudio-community/QwQ-32B-Preview-GGUF
- **Model Hubs**: `Hugging Face <https://huggingface.co/lmstudio-community/QwQ-32B-Preview-GGUF>`__, `ModelScope <https://modelscope.cn/models/AI-ModelScope/QwQ-32B-Preview-GGUF>`__

Execute the following command to launch the model; replace ``${engine}`` with one of the engines listed above and ``${quantization}`` with your chosen quantization method::

xinference launch --model-engine ${engine} --model-name QwQ-32B-Preview --size-in-billions 32 --model-format ggufv2 --quantization ${quantization}


Model Spec 4 (mlx, 32 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** mlx
- **Model Size (in billions):** 32
- **Quantizations:** 4-bit
- **Engines**: MLX
- **Model ID:** mlx-community/Qwen_QwQ-32B-Preview_MLX-4bit
- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/Qwen_QwQ-32B-Preview_MLX-4bit>`__, `ModelScope <https://modelscope.cn/models/okwinds/QwQ-32B-Preview-MLX-4bit>`__

Execute the following command to launch the model; replace ``${engine}`` with one of the engines listed above and ``${quantization}`` with your chosen quantization method::

xinference launch --model-engine ${engine} --model-name QwQ-32B-Preview --size-in-billions 32 --model-format mlx --quantization ${quantization}


Model Spec 5 (mlx, 32 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** mlx
- **Model Size (in billions):** 32
- **Quantizations:** 8-bit
- **Engines**: MLX
- **Model ID:** mlx-community/Qwen_QwQ-32B-Preview_MLX-8bit
- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/Qwen_QwQ-32B-Preview_MLX-8bit>`__, `ModelScope <https://modelscope.cn/models/okwinds/QwQ-32B-Preview-MLX-8bit>`__

Execute the following command to launch the model; replace ``${engine}`` with one of the engines listed above and ``${quantization}`` with your chosen quantization method::

xinference launch --model-engine ${engine} --model-name QwQ-32B-Preview --size-in-billions 32 --model-format mlx --quantization ${quantization}


Model Spec 6 (mlx, 32 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** mlx
- **Model Size (in billions):** 32
- **Quantizations:** none
- **Engines**: MLX
- **Model ID:** mlx-community/QwQ-32B-Preview-bf16
- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/QwQ-32B-Preview-bf16>`__

Execute the following command to launch the model; replace ``${engine}`` with one of the engines listed above and ``${quantization}`` with your chosen quantization method::

xinference launch --model-engine ${engine} --model-name QwQ-32B-Preview --size-in-billions 32 --model-format mlx --quantization ${quantization}
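
Once launched, the model can be queried like any other chat model in Xinference. Below is a minimal Python sketch using the RESTful client; it assumes a local supervisor at the default ``http://127.0.0.1:9997`` and the model UID printed by ``xinference launch``, and the exact ``chat`` signature may vary between Xinference releases::

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
model = client.get_model("QwQ-32B-Preview")  # replace with the UID printed at launch
response = model.chat(
    messages=[{"role": "user", "content": "How many r's are in the word strawberry?"}],
    generate_config={"max_tokens": 512},
)
print(response["choices"][0]["message"]["content"])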

1 change: 1 addition & 0 deletions doc/source/user_guide/backends.rst
@@ -65,6 +65,7 @@ Currently, supported models include:
- ``codegeex4``
- ``qwen1.5-chat``, ``qwen1.5-moe-chat``
- ``qwen2-instruct``, ``qwen2-moe-instruct``
- ``QwQ-32B-Preview``
- ``gemma-it``, ``gemma-2-it``
- ``orion-chat``, ``orion-chat-rag``
- ``c4ai-command-r-v01``
10 changes: 5 additions & 5 deletions setup.cfg
@@ -24,7 +24,7 @@ zip_safe = False
include_package_data = True
packages = find:
install_requires =
xoscar>=0.3.0
xoscar>=0.3.0,<0.4.2
torch
gradio
pillow
@@ -39,7 +39,7 @@ install_requires =
typing_extensions
modelscope>=1.10.0
sse_starlette>=1.6.5 # ensure_bytes API break change: https://github.com/sysid/sse-starlette/issues/65
openai>1 # For typing
openai>=1.40.0 # For typing
python-jose[cryptography]
passlib[bcrypt]
aioprometheus[starlette]>=23.12.0
@@ -71,7 +71,7 @@ dev =
jieba>=0.42.0
flake8>=3.8.0
black
openai>1
openai>=1.40.0
langchain
langchain-community
orjson
@@ -121,7 +121,7 @@ all =
gdown # For CosyVoice, matcha
pyarrow # For CosyVoice, matcha
HyperPyYAML # For CosyVoice
onnxruntime==1.16.0 # For CosyVoice, use onnxruntime-gpu==1.16.0 if possible
onnxruntime>=1.16.0 # For CosyVoice, use onnxruntime-gpu==1.16.0 if possible
boto3>=1.28.55,<1.28.65 # For tensorizer
tensorizer~=2.9.0
eva-decord # For video in VL
@@ -209,7 +209,7 @@ audio =
gdown # For CosyVoice, matcha
pyarrow # For CosyVoice, matcha
HyperPyYAML # For CosyVoice
onnxruntime==1.16.0 # For CosyVoice, use onnxruntime-gpu==1.16.0 if possible
onnxruntime>=1.16.0 # For CosyVoice, use onnxruntime-gpu==1.16.0 if possible
loguru # For Fish Speech
natsort # For Fish Speech
loralib # For Fish Speech
6 changes: 3 additions & 3 deletions xinference/deploy/docker/requirements.txt
@@ -1,5 +1,5 @@
# required
xoscar>=0.3.0
xoscar>=0.3.0,<0.4.2
gradio==4.26.0
typer[all]<0.12.0 # fix typer required by gradio
pillow
@@ -14,7 +14,7 @@ huggingface-hub>=0.19.4
typing_extensions
modelscope>=1.10.0
sse_starlette>=1.6.5 # ensure_bytes API break change: https://github.com/sysid/sse-starlette/issues/65
openai>1 # For typing
openai>=1.40.0 # For typing
python-jose[cryptography]
passlib[bcrypt]
aioprometheus[starlette]>=23.12.0
@@ -25,7 +25,7 @@ opencv-contrib-python-headless
setproctitle

# all
transformers>=4.43.2
transformers>=4.45.0
accelerate>=0.28.0
sentencepiece
transformers_stream_generator
2 changes: 1 addition & 1 deletion xinference/deploy/docker/requirements_cpu.txt
@@ -1,4 +1,4 @@
xoscar>=0.3.0
xoscar>=0.3.0,<0.4.2
gradio==4.26.0
typer[all]<0.12.0
pillow
80 changes: 80 additions & 0 deletions xinference/model/llm/llm_family.json
@@ -8516,5 +8516,85 @@
"<|im_start|>",
"<|im_end|>"
]
},
{
"version": 1,
"context_length": 32768,
"model_name": "QwQ-32B-Preview",
"model_lang": [
"en",
"zh"
],
"model_ability": [
"chat"
],
"model_description": "QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities.",
"model_specs": [
{
"model_format": "pytorch",
"model_size_in_billions": 32,
"quantizations": [
"4-bit",
"8-bit",
"none"
],
"model_id": "Qwen/QwQ-32B-Preview"
},
{
"model_format": "awq",
"model_size_in_billions": 32,
"quantizations": [
"Int4"
],
"model_id": "KirillR/QwQ-32B-Preview-AWQ"
},
{
"model_format": "ggufv2",
"model_size_in_billions": 32,
"quantizations": [
"Q3_K_L",
"Q4_K_M",
"Q6_K",
"Q8_0"
],
"model_id": "lmstudio-community/QwQ-32B-Preview-GGUF",
"model_file_name_template": "QwQ-32B-Preview-{quantization}.gguf"
},
{
"model_format": "mlx",
"model_size_in_billions": 32,
"quantizations": [
"4-bit"
],
"model_id": "mlx-community/Qwen_QwQ-32B-Preview_MLX-4bit"
},
{
"model_format": "mlx",
"model_size_in_billions": 32,
"quantizations": [
"8-bit"
],
"model_id": "mlx-community/Qwen_QwQ-32B-Preview_MLX-8bit"
},
{
"model_format": "mlx",
"model_size_in_billions": 32,
"quantizations": [
"none"
],
"model_id": "mlx-community/QwQ-32B-Preview-bf16"
}
],
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
"stop_token_ids": [
151643,
151644,
151645
],
"stop": [
"<|endoftext|>",
"<|im_start|>",
"<|im_end|>"
]
}
]
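
The ``chat_template`` above is an ordinary Jinja2 template in Qwen's ChatML style. A rough, self-contained sketch of what it produces (assuming ``jinja2`` is installed and the script runs from the repository root; Xinference's actual rendering path may add filters and options on top of this)::

import json
from jinja2 import Environment

with open("xinference/model/llm/llm_family.json") as f:
    families = json.load(f)
family = next(fam for fam in families if fam["model_name"] == "QwQ-32B-Preview")

# jinja2 ships the `tojson` filter that the template relies on
template = Environment().from_string(family["chat_template"])
prompt = template.render(
    messages=[{"role": "user", "content": "Hello"}],
    tools=None,
    add_generation_prompt=True,
)
print(prompt)  # ChatML: <|im_start|>system ... <|im_start|>assistant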
67 changes: 67 additions & 0 deletions xinference/model/llm/llm_family_modelscope.json
@@ -6267,5 +6267,72 @@
"<|im_start|>",
"<|im_end|>"
]
},
{
"version": 1,
"context_length": 32768,
"model_name": "QwQ-32B-Preview",
"model_lang": [
"en",
"zh"
],
"model_ability": [
"chat"
],
"model_description": "QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities.",
"model_specs": [
{
"model_format": "pytorch",
"model_size_in_billions": 32,
"quantizations": [
"4-bit",
"8-bit",
"none"
],
"model_id": "Qwen/QwQ-32B-Preview",
"model_hub": "modelscope"
},
{
"model_format": "mlx",
"model_size_in_billions": 32,
"quantizations": [
"4-bit"
],
"model_id": "okwinds/QwQ-32B-Preview-MLX-4bit",
"model_hub": "modelscope"
},
{
"model_format": "mlx",
"model_size_in_billions": 32,
"quantizations": [
"8-bit"
],
"model_id": "okwinds/QwQ-32B-Preview-MLX-8bit",
"model_hub": "modelscope"
},
{
"model_format": "ggufv2",
"model_size_in_billions": 32,
"quantizations": [
"Q3_K_L",
"Q4_K_M",
"Q6_K",
"Q8_0"
],
"model_id": "AI-ModelScope/QwQ-32B-Preview-GGUF",
"model_file_name_template": "QwQ-32B-Preview-{quantization}.gguf"
}
],
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
"stop_token_ids": [
151643,
151644,
151645
],
"stop": [
"<|endoftext|>",
"<|im_start|>",
"<|im_end|>"
]
}
]
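
The ``stop_token_ids`` in both registries are the Qwen2 tokenizer's special-token IDs. As a quick sanity check (a sketch assuming ``transformers`` is installed and the Hugging Face repo is reachable), they should map to the listed ``stop`` strings::

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/QwQ-32B-Preview")
for tid in (151643, 151644, 151645):
    print(tid, tok.convert_ids_to_tokens(tid))
# expected: <|endoftext|>, <|im_start|>, <|im_end|>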
1 change: 1 addition & 0 deletions xinference/model/llm/sglang/core.py
@@ -89,6 +89,7 @@ class SGLANGGenerateConfig(TypedDict, total=False):
"deepseek-v2-chat-0628",
"qwen2.5-instruct",
"qwen2.5-coder-instruct",
"QwQ-32B-Preview",
]


1 change: 1 addition & 0 deletions xinference/model/llm/vllm/core.py
@@ -153,6 +153,7 @@ class VLLMGenerateConfig(TypedDict, total=False):
VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct")
VLLM_SUPPORTED_MODELS.append("qwen2.5-coder")
VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview")


if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
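
Both the SGLang and vLLM backends gate support on plain allow-lists of model names like the ones extended above. As a small sketch (assuming the module imports cleanly even without vLLM installed, since it guards the ``vllm`` import), you can check what the installed build will accept::

from xinference.model.llm.vllm.core import VLLM_SUPPORTED_CHAT_MODELS

print("QwQ-32B-Preview" in VLLM_SUPPORTED_CHAT_MODELS)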
