diff --git a/README.md b/README.md
index e704be542f..22971ea84f 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,7 @@ potential of cutting-edge AI models.
 - Speculative decoding: [#509](https://github.com/xorbitsai/inference/pull/509)
 - Incorporate vLLM: [#445](https://github.com/xorbitsai/inference/pull/445)
 ### New Models
+- Built-in support for [phi-2](https://huggingface.co/microsoft/phi-2): [#828](https://github.com/xorbitsai/inference/pull/828)
 - Built-in support for [mistral-instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2): [#796](https://github.com/xorbitsai/inference/pull/796)
 - Built-in support for [deepseek-llm](https://huggingface.co/deepseek-ai) and [deepseek-coder](https://huggingface.co/deepseek-ai): [#786](https://github.com/xorbitsai/inference/pull/786)
 - Built-in support for [Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1): [#782](https://github.com/xorbitsai/inference/pull/782)
diff --git a/README_zh_CN.md b/README_zh_CN.md
index 4e18c320ac..11bd07e1c6 100644
--- a/README_zh_CN.md
+++ b/README_zh_CN.md
@@ -30,6 +30,7 @@ Xorbits Inference(Xinference)是一个性能强大且功能全面的分布
 - 投机采样: [#509](https://github.com/xorbitsai/inference/pull/509)
 - 引入 vLLM: [#445](https://github.com/xorbitsai/inference/pull/445)
 ### 新模型
+- 内置 [phi-2](https://huggingface.co/microsoft/phi-2): [#828](https://github.com/xorbitsai/inference/pull/828)
 - 内置 [mistral-instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2): [#796](https://github.com/xorbitsai/inference/pull/796)
 - 内置 [deepseek-llm](https://huggingface.co/deepseek-ai) 与 [deepseek-coder](https://huggingface.co/deepseek-ai): [#786](https://github.com/xorbitsai/inference/pull/786)
 - 内置 [Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1): [#782](https://github.com/xorbitsai/inference/pull/782)
diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst
index 89913618c4..21b57b8c86 100644
--- a/doc/source/models/builtin/llm/index.rst
+++ b/doc/source/models/builtin/llm/index.rst
@@ -79,6 +79,8 @@ The following is a list of built-in LLM in Xinference:
 
    orca
 
+   phi-2
+
    qwen-chat
 
    skywork
diff --git a/doc/source/models/builtin/llm/phi-2.rst b/doc/source/models/builtin/llm/phi-2.rst
new file mode 100644
index 0000000000..080071ab52
--- /dev/null
+++ b/doc/source/models/builtin/llm/phi-2.rst
@@ -0,0 +1,43 @@
+.. _models_llm_phi-2:
+
+========================================
+phi-2
+========================================
+
+- **Context Length:** 2048
+- **Model Name:** phi-2
+- **Languages:** en
+- **Abilities:** generate
+- **Description:** Phi-2 is a 2.7B Transformer based LLM used for research on model safety, trained with data similar to Phi-1.5 but augmented with synthetic texts and curated websites.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (ggufv2, 2 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 2
+- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_0, Q4_K_S, Q4_K_M, Q5_0, Q5_K_S, Q5_K_M, Q6_K, Q8_0
+- **Model ID:** TheBloke/phi-2-GGUF
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name phi-2 --size-in-billions 2 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 2 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 2
+- **Quantizations:** 4-bit, 8-bit, none
+- **Model ID:** microsoft/phi-2
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name phi-2 --size-in-billions 2 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/skywork-math.rst b/doc/source/models/builtin/llm/skywork-math.rst
index b71b11f5d4..307be888fb 100644
--- a/doc/source/models/builtin/llm/skywork-math.rst
+++ b/doc/source/models/builtin/llm/skywork-math.rst
@@ -26,3 +26,4 @@ Execute the following command to launch the model, remember to replace ``${quant
 chosen quantization method from the options listed above::
 
    xinference launch --model-name Skywork-Math --size-in-billions 13 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/skywork.rst b/doc/source/models/builtin/llm/skywork.rst
index 442b32ae43..51d52d7e0a 100644
--- a/doc/source/models/builtin/llm/skywork.rst
+++ b/doc/source/models/builtin/llm/skywork.rst
@@ -26,3 +26,4 @@ Execute the following command to launch the model, remember to replace ``${quant
 chosen quantization method from the options listed above::
 
    xinference launch --model-name Skywork --size-in-billions 13 --model-format pytorch --quantization ${quantization}
+
diff --git a/setup.cfg b/setup.cfg
index f8d8f506bd..a1a4f2b02c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -73,7 +73,7 @@ dev =
 all =
     chatglm-cpp>=0.3.0
     ctransformers
-    llama-cpp-python>=0.2.23
+    llama-cpp-python>=0.2.25
     transformers>=4.34.1
     torch
     accelerate>=0.20.3
@@ -91,7 +91,7 @@ all =
     auto-gptq ; sys_platform!='darwin'
     optimum
 ggml =
-    llama-cpp-python>=0.2.23
+    llama-cpp-python>=0.2.25
     ctransformers
     chatglm-cpp>=0.3.0
 transformers =
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 0a1d9c6541..05a081f4a7 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -350,6 +350,51 @@
       "intra_message_sep": "\n\n### "
     }
   },
+  {
+    "version": 1,
+    "context_length": 2048,
+    "model_name": "phi-2",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "Phi-2 is a 2.7B Transformer based LLM used for research on model safety, trained with data similar to Phi-1.5 but augmented with synthetic texts and curated websites.",
+    "model_specs": [
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_S",
+          "Q3_K_M",
+          "Q3_K_L",
+          "Q4_0",
+          "Q4_K_S",
+          "Q4_K_M",
+          "Q5_0",
+          "Q5_K_S",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_id": "TheBloke/phi-2-GGUF",
+        "model_file_name_template": "phi-2.{quantization}.gguf"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "microsoft/phi-2",
+        "model_revision": "d3186761bf5c4409f7679359284066c25ab668ee"
+      }
+    ]
+  },
   {
     "version": 1,
     "context_length": 2048,