FEAT: support codeqwen1.5-chat (#1322)
qinxuye authored Apr 19, 2024
1 parent 3ce85ae commit f19e85b
Showing 11 changed files with 280 additions and 0 deletions.
2 changes: 2 additions & 0 deletions doc/source/getting_started/installation.rst
@@ -42,13 +42,15 @@ Currently, supported models include:
- ``mistral-v0.1``, ``mistral-instruct-v0.1``, ``mistral-instruct-v0.2``
- ``Yi``, ``Yi-chat``
- ``code-llama``, ``code-llama-python``, ``code-llama-instruct``
- ``c4ai-command-r-v01``, ``c4ai-command-r-v01-4bit``
- ``vicuna-v1.3``, ``vicuna-v1.5``
- ``internlm2-chat``
- ``qwen-chat``
- ``mixtral-instruct-v0.1``
- ``chatglm3``, ``chatglm3-32k``, ``chatglm3-128k``
- ``deepseek-chat``, ``deepseek-coder-instruct``
- ``qwen1.5-chat``, ``qwen1.5-moe-chat``
- ``codeqwen1.5-chat``
- ``gemma-it``
- ``orion-chat``, ``orion-chat-rag``
.. vllm_end
6 changes: 6 additions & 0 deletions doc/source/models/builtin/embedding/index.rst
@@ -51,6 +51,12 @@ The following is a list of built-in embedding models in Xinference:

jina-embeddings-v2-small-en

m3e-base

m3e-large

m3e-small

multilingual-e5-large

text2vec-base-chinese
21 changes: 21 additions & 0 deletions doc/source/models/builtin/embedding/m3e-base.rst
@@ -0,0 +1,21 @@
.. _models_builtin_m3e-base:

========
m3e-base
========

- **Model Name:** m3e-base
- **Languages:** zh, en
- **Abilities:** embed

Specifications
^^^^^^^^^^^^^^

- **Dimensions:** 768
- **Max Tokens:** 512
- **Model ID:** moka-ai/m3e-base
- **Model Hubs**: `Hugging Face <https://huggingface.co/moka-ai/m3e-base>`__, `ModelScope <https://modelscope.cn/models/AI-ModelScope/m3e-base>`__

Execute the following command to launch the model::

xinference launch --model-name m3e-base --model-type embedding
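Once launched, m3e-base returns 768-dimensional embedding vectors (per the specifications above). As an illustration of how such vectors are typically compared, here is a minimal cosine-similarity sketch in pure Python; the short sample vectors are stand-ins, not real model output:

```python
import math

def cosine_similarity(a, b):
    """Cosine similarity between two equal-length embedding vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)

# Stand-in vectors; a real m3e-base embedding has 768 dimensions.
v1 = [1.0, 0.0, 1.0]
v2 = [1.0, 0.0, 1.0]
v3 = [0.0, 1.0, 0.0]

print(cosine_similarity(v1, v2))  # identical vectors -> approximately 1.0
print(cosine_similarity(v1, v3))  # orthogonal vectors -> 0.0
```

The same comparison applies unchanged to m3e-large (1024 dimensions) and m3e-small (512 dimensions), since cosine similarity only requires the two vectors to have equal length.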
21 changes: 21 additions & 0 deletions doc/source/models/builtin/embedding/m3e-large.rst
@@ -0,0 +1,21 @@
.. _models_builtin_m3e-large:

=========
m3e-large
=========

- **Model Name:** m3e-large
- **Languages:** zh, en
- **Abilities:** embed

Specifications
^^^^^^^^^^^^^^

- **Dimensions:** 1024
- **Max Tokens:** 512
- **Model ID:** moka-ai/m3e-large
- **Model Hubs**: `Hugging Face <https://huggingface.co/moka-ai/m3e-large>`__, `ModelScope <https://modelscope.cn/models/AI-ModelScope/m3e-large>`__

Execute the following command to launch the model::

xinference launch --model-name m3e-large --model-type embedding
21 changes: 21 additions & 0 deletions doc/source/models/builtin/embedding/m3e-small.rst
@@ -0,0 +1,21 @@
.. _models_builtin_m3e-small:

=========
m3e-small
=========

- **Model Name:** m3e-small
- **Languages:** zh, en
- **Abilities:** embed

Specifications
^^^^^^^^^^^^^^

- **Dimensions:** 512
- **Max Tokens:** 512
- **Model ID:** moka-ai/m3e-small
- **Model Hubs**: `Hugging Face <https://huggingface.co/moka-ai/m3e-small>`__, `ModelScope <https://modelscope.cn/models/AI-ModelScope/m3e-small>`__

Execute the following command to launch the model::

xinference launch --model-name m3e-small --model-type embedding
60 changes: 60 additions & 0 deletions doc/source/models/builtin/llm/codeqwen1.5-chat.rst
@@ -0,0 +1,60 @@
.. _models_llm_codeqwen1.5-chat:

========================================
codeqwen1.5-chat
========================================

- **Context Length:** 65536
- **Model Name:** codeqwen1.5-chat
- **Languages:** en, zh
- **Abilities:** chat
- **Description:** CodeQwen1.5 is the Code-Specific version of Qwen1.5. It is a transformer-based decoder-only language model pretrained on a large amount of code data.

Specifications
^^^^^^^^^^^^^^


Model Spec 1 (ggufv2, 7 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** ggufv2
- **Model Size (in billions):** 7
- **Quantizations:** q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, q8_0
- **Model ID:** Qwen/CodeQwen1.5-7B-Chat-GGUF
- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/CodeQwen1.5-7B-Chat-GGUF>`__, `ModelScope <https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat-GGUF>`__

Execute the following command to launch the model; remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-name codeqwen1.5-chat --size-in-billions 7 --model-format ggufv2 --quantization ${quantization}


Model Spec 2 (pytorch, 7 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 7
- **Quantizations:** 4-bit, 8-bit, none
- **Model ID:** Qwen/CodeQwen1.5-7B-Chat
- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/CodeQwen1.5-7B-Chat>`__, `ModelScope <https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat>`__

Execute the following command to launch the model; remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-name codeqwen1.5-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization}


Model Spec 3 (awq, 7 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** awq
- **Model Size (in billions):** 7
- **Quantizations:** Int4
- **Model ID:** Qwen/CodeQwen1.5-7B-Chat-AWQ
- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/CodeQwen1.5-7B-Chat-AWQ>`__, `ModelScope <https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat-AWQ>`__

Execute the following command to launch the model; remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-name codeqwen1.5-chat --size-in-billions 7 --model-format awq --quantization ${quantization}
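To try several of the quantizations listed above, the launch command can be templated in a small script. This is only an illustrative sketch (the ``launch_command`` helper is hypothetical, not part of Xinference); the quantization names are copied from the ggufv2 spec above:

```python
# Quantizations listed in the ggufv2 model spec above.
QUANTIZATIONS = ["q2_k", "q3_k_m", "q4_0", "q4_k_m", "q5_0", "q5_k_m", "q6_k", "q8_0"]

def launch_command(quantization: str, model_format: str = "ggufv2") -> str:
    """Build the `xinference launch` command line for one quantization."""
    return (
        "xinference launch --model-name codeqwen1.5-chat "
        f"--size-in-billions 7 --model-format {model_format} "
        f"--quantization {quantization}"
    )

for q in QUANTIZATIONS:
    print(launch_command(q))
```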

7 changes: 7 additions & 0 deletions doc/source/models/builtin/llm/index.rst
@@ -106,6 +106,11 @@ The following is a list of built-in LLMs in Xinference:
- 100000
- Code-Llama-Python is a fine-tuned version of the Code-Llama LLM, specializing in Python.

* - :ref:`codeqwen1.5-chat <models_llm_codeqwen1.5-chat>`
- chat
- 65536
- CodeQwen1.5 is the Code-Specific version of Qwen1.5. It is a transformer-based decoder-only language model pretrained on a large amount of code data.

* - :ref:`codeshell <models_llm_codeshell>`
- generate
- 8194
@@ -467,6 +472,8 @@ The following is a list of built-in LLMs in Xinference:

code-llama-python

codeqwen1.5-chat

codeshell

codeshell-chat
2 changes: 2 additions & 0 deletions doc/source/user_guide/backends.rst
@@ -51,13 +51,15 @@ Currently, supported models include:
- ``mistral-v0.1``, ``mistral-instruct-v0.1``, ``mistral-instruct-v0.2``
- ``Yi``, ``Yi-chat``
- ``code-llama``, ``code-llama-python``, ``code-llama-instruct``
- ``c4ai-command-r-v01``, ``c4ai-command-r-v01-4bit``
- ``vicuna-v1.3``, ``vicuna-v1.5``
- ``internlm2-chat``
- ``qwen-chat``
- ``mixtral-instruct-v0.1``
- ``chatglm3``, ``chatglm3-32k``, ``chatglm3-128k``
- ``deepseek-chat``, ``deepseek-coder-instruct``
- ``qwen1.5-chat``, ``qwen1.5-moe-chat``
- ``codeqwen1.5-chat``
- ``gemma-it``
- ``orion-chat``, ``orion-chat-rag``
.. vllm_end
68 changes: 68 additions & 0 deletions xinference/model/llm/llm_family.json
@@ -1930,6 +1930,74 @@
]
}
},
{
"version": 1,
"context_length": 65536,
"model_name": "codeqwen1.5-chat",
"model_lang": [
"en",
"zh"
],
"model_ability": [
"chat"
],
      "model_description": "CodeQwen1.5 is the Code-Specific version of Qwen1.5. It is a transformer-based decoder-only language model pretrained on a large amount of code data.",
"model_specs": [
{
"model_format": "ggufv2",
"model_size_in_billions": 7,
"quantizations": [
"q2_k",
"q3_k_m",
"q4_0",
"q4_k_m",
"q5_0",
"q5_k_m",
"q6_k",
"q8_0"
],
"model_id": "Qwen/CodeQwen1.5-7B-Chat-GGUF",
"model_file_name_template": "codeqwen-1_5-7b-chat-{quantization}.gguf"
},
{
"model_format": "pytorch",
"model_size_in_billions": 7,
"quantizations": [
"4-bit",
"8-bit",
"none"
],
"model_id": "Qwen/CodeQwen1.5-7B-Chat"
},
{
"model_format": "awq",
"model_size_in_billions": 7,
"quantizations": [
"Int4"
],
"model_id": "Qwen/CodeQwen1.5-7B-Chat-AWQ"
}
],
"prompt_style": {
"style_name": "QWEN",
"system_prompt": "You are a helpful assistant.",
"roles": [
"user",
"assistant"
],
"intra_message_sep": "\n",
"stop_token_ids": [
151643,
151644,
151645
],
"stop": [
"<|endoftext|>",
"<|im_start|>",
"<|im_end|>"
]
}
},
{
"version": 1,
"context_length": 8192,
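The ``model_file_name_template`` field in the ggufv2 spec above determines which GGUF file is downloaded for a chosen quantization. A sketch of how such a template resolves, assuming it is expanded with plain ``str.format`` as the ``{quantization}`` placeholder suggests:

```python
# Template string copied from the ggufv2 spec above.
TEMPLATE = "codeqwen-1_5-7b-chat-{quantization}.gguf"

def resolve_file_name(template: str, quantization: str) -> str:
    """Fill the {quantization} placeholder to get the concrete GGUF file name."""
    return template.format(quantization=quantization)

print(resolve_file_name(TEMPLATE, "q4_k_m"))
# codeqwen-1_5-7b-chat-q4_k_m.gguf
```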
71 changes: 71 additions & 0 deletions xinference/model/llm/llm_family_modelscope.json
@@ -2175,6 +2175,77 @@
]
}
},
{
"version": 1,
"context_length": 65536,
"model_name": "codeqwen1.5-chat",
"model_lang": [
"en",
"zh"
],
"model_ability": [
"chat"
],
      "model_description": "CodeQwen1.5 is the Code-Specific version of Qwen1.5. It is a transformer-based decoder-only language model pretrained on a large amount of code data.",
"model_specs": [
{
"model_format": "ggufv2",
"model_size_in_billions": 7,
"quantizations": [
"q2_k",
"q3_k_m",
"q4_0",
"q4_k_m",
"q5_0",
"q5_k_m",
"q6_k",
"q8_0"
],
"model_id": "qwen/CodeQwen1.5-7B-Chat-GGUF",
"model_hub": "modelscope",
"model_file_name_template": "codeqwen-1_5-7b-chat-{quantization}.gguf"
},
{
"model_format": "pytorch",
"model_size_in_billions": 7,
"quantizations": [
"4-bit",
"8-bit",
"none"
],
"model_id": "qwen/CodeQwen1.5-7B-Chat",
"model_hub": "modelscope"
},
{
"model_format": "awq",
"model_size_in_billions": 7,
"quantizations": [
"Int4"
],
"model_id": "qwen/CodeQwen1.5-7B-Chat-AWQ",
"model_hub": "modelscope"
}
],
"prompt_style": {
"style_name": "QWEN",
"system_prompt": "You are a helpful assistant.",
"roles": [
"user",
"assistant"
],
"intra_message_sep": "\n",
"stop_token_ids": [
151643,
151644,
151645
],
"stop": [
"<|endoftext|>",
"<|im_start|>",
"<|im_end|>"
]
}
},
{
"version": 1,
"context_length": 4096,
1 change: 1 addition & 0 deletions xinference/model/llm/vllm/core.py
@@ -116,6 +116,7 @@ class VLLMGenerateConfig(TypedDict, total=False):
]
if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat")
VLLM_SUPPORTED_CHAT_MODELS.append("codeqwen1.5-chat")

if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
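One caveat worth noting about the gating above: ``vllm.__version__ >= "0.3.0"`` is a lexicographic string comparison, which misorders versions with multi-digit components such as ``0.10.0``. A hedged sketch of the more robust alternative using ``packaging.version`` (assuming the ``packaging`` library is available, as it commonly is in pip-based environments):

```python
from packaging.version import Version

# String comparison misorders multi-digit components: '1' < '3' lexically,
# so "0.10.0" compares as *older* than "0.3.0".
assert not ("0.10.0" >= "0.3.0")

# Version-aware comparison orders them correctly.
assert Version("0.10.0") >= Version("0.3.0")
assert Version("0.3.2") >= Version("0.3.0")
```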
