diff --git a/doc/source/gen_docs.py b/doc/source/gen_docs.py
new file mode 100644
index 0000000000..fef4a4131f
--- /dev/null
+++ b/doc/source/gen_docs.py
@@ -0,0 +1,57 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+
+from jinja2 import Environment, FileSystemLoader
+
+
+def gen_model_docs(env, spec_path, template_name, index_template_name, output_dir):
+    """Render one .rst page per model plus an index.rst into output_dir."""
+    with open(spec_path, 'r') as spec_file:
+        models = json.load(spec_file)
+
+    sorted_models = sorted(models, key=lambda x: x['model_name'].lower())
+    os.makedirs(output_dir, exist_ok=True)
+
+    # One page per model, named after the lower-cased model name.
+    for model in sorted_models:
+        rendered = env.get_template(template_name).render(model)
+        output_file_path = os.path.join(output_dir, f"{model['model_name'].lower()}.rst")
+        with open(output_file_path, 'w') as output_file:
+            output_file.write(rendered)
+
+    # A single index.rst linking every page rendered above.
+    rendered_index = env.get_template(index_template_name).render(models=sorted_models)
+    with open(os.path.join(output_dir, 'index.rst'), 'w') as index_file:
+        index_file.write(rendered_index)
+
+
+def main():
+    env = Environment(loader=FileSystemLoader('../templates'))
+
+    gen_model_docs(env, '../../xinference/model/llm/llm_family.json',
+                   'llm.rst.jinja', 'llm_index.rst.jinja',
+                   './models/builtin/llm')
+    gen_model_docs(env, '../../xinference/model/embedding/model_spec.json',
+                   'embedding.rst.jinja', 'embedding_index.rst.jinja',
+                   './models/builtin/embedding')
+    gen_model_docs(env, '../../xinference/model/rerank/model_spec.json',
+                   'rerank.rst.jinja', 'rerank_index.rst.jinja',
+                   './models/builtin/rerank')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/doc/source/models/builtin/Yi-chat.rst b/doc/source/models/builtin/Yi-chat.rst
deleted file mode 100644
index dcfc74cf47..0000000000
--- a/doc/source/models/builtin/Yi-chat.rst
+++ /dev/null
@@ -1,33 +0,0 @@
-.. 
_models_builtin_Yi_chat: - - -======= -Yi-chat -======= - -- **Context Length:** 4096 -- **Model Name:** Yi -- **Languages:** en, zh -- **Abilities:** generate -- **Description:** The Yi series models are large language models trained from scratch by developers at 01.AI. The first public release contains two bilingual (English/Chinese) base models with the parameter sizes of 6B and 34B. Both of them are trained with 4K sequence length and can be extended to 32K during inference time. - -Specifications -^^^^^^^^^^^^^^ - -Model Spec 1 (pytorch, 34 Billion) -+++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 34 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** 01-ai/Yi-34B - -Execute the following command to launch the model, remember to replace `${quantization}` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name Yi-chat --size-in-billions 34 --model-format pytorch --quantization ${quantization} - - -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/code-llama-instruct.rst b/doc/source/models/builtin/code-llama-instruct.rst deleted file mode 100644 index 1e914e3b87..0000000000 --- a/doc/source/models/builtin/code-llama-instruct.rst +++ /dev/null @@ -1,67 +0,0 @@ -.. _models_builtin_code_llama_instruct: - - -=================== -Code-Llama-Instruct -=================== - -- **Context Length:** 100000 -- **Model Name:** code-llama-instruct -- **Languages:** en -- **Abilities:** chat - -Specifications -^^^^^^^^^^^^^^ - -Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 7 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** codellama/CodeLlama-7b-Instruct-hf - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name code-llama-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. - -Model Spec 2 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 13 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** codellama/CodeLlama-13b-Instruct-hf - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name code-llama-instruct --size-in-billions 13 --model-format pytorch --quantization ${quantization} - - -.. note:: - - 4-bit quantization is not supported on macOS. - -Model Spec 3 (pytorch, 34 Billion) -++++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 34 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** codellama/CodeLlama-34b-Instruct-hf - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name code-llama-instruct --size-in-billions 34 --model-format pytorch --quantization ${quantization} - - -.. note:: - - 4-bit quantization is not supported on macOS. 
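The Jinja templates that ``gen_docs.py`` consumes are not included in this patch. Judging from the
regenerated pages later in this diff (``code-llama-instruct.rst``, for example), ``llm.rst.jinja``
plausibly looks like the sketch below. Because the script calls ``render(model)``, each spec dict's
keys become top-level template variables; the names used here (``model_name``, ``context_length``,
``model_lang``, ``model_ability``, ``model_description``, ``model_specs``) are assumptions inferred
from the rendered output, not taken from this patch::

   .. _models_llm_{{ model_name }}:

   ========================================
   {{ model_name }}
   ========================================

   - **Context Length:** {{ context_length }}
   - **Model Name:** {{ model_name }}
   - **Languages:** {{ model_lang|join(', ') }}
   - **Abilities:** {{ model_ability|join(', ') }}
   - **Description:** {{ model_description }}

   Specifications
   ^^^^^^^^^^^^^^

   {% for spec in model_specs %}
   Model Spec {{ loop.index }} ({{ spec.model_format }}, {{ spec.model_size_in_billions }} Billion)
   ++++++++++++++++++++++++++++++++++++++++

   - **Model Format:** {{ spec.model_format }}
   - **Model Size (in billions):** {{ spec.model_size_in_billions }}
   - **Quantizations:** {{ spec.quantizations|join(', ') }}
   - **Model ID:** {{ spec.model_id }}

   Execute the following command to launch the model, remember to replace ``${quantization}`` with your
   chosen quantization method from the options listed above::

      xinference launch --model-name {{ model_name }} --size-in-billions {{ spec.model_size_in_billions }} --model-format {{ spec.model_format }} --quantization ${quantization}
   {% endfor %}

A plain ``{% for %}`` loop rendered without ``trim_blocks``, as sketched here, would also account for
the extra blank lines visible in the generated files throughout this diff.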
diff --git a/doc/source/models/builtin/code-llama-python.rst b/doc/source/models/builtin/code-llama-python.rst deleted file mode 100644 index 3fa44f769d..0000000000 --- a/doc/source/models/builtin/code-llama-python.rst +++ /dev/null @@ -1,65 +0,0 @@ -.. _models_builtin_code_llama_python: - - -================= -Code-Llama-Python -================= - -- **Context Length:** 100000 -- **Model Name:** code-llama-python -- **Languages:** en -- **Abilities:** generate - -Specifications -^^^^^^^^^^^^^^ - -Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 7 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** TheBloke/CodeLlama-7B-Python-fp16 - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name code-llama-python --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. - -Model Spec 2 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 13 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** TheBloke/CodeLlama-13B-Python-fp16 - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name code-llama-python --size-in-billions 13 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. - -Model Spec 3 (pytorch, 34 Billion) -++++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 34 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** TheBloke/CodeLlama-34B-Python-fp16 - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name code-llama-python --size-in-billions 34 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/code-llama.rst b/doc/source/models/builtin/code-llama.rst deleted file mode 100644 index 81bdd660bc..0000000000 --- a/doc/source/models/builtin/code-llama.rst +++ /dev/null @@ -1,64 +0,0 @@ -.. _models_builtin_code_llama: - -========== -Code-Llama -========== - -- **Context Length:** 100000 -- **Model Name:** code-llama -- **Languages:** en -- **Abilities:** generate - -Specifications -^^^^^^^^^^^^^^ - -Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 7 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** TheBloke/CodeLlama-7B-fp16 - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name code-llama --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. 
- -Model Spec 2 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 13 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** TheBloke/CodeLlama-13B-fp16 - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name code-llama --size-in-billions 13 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. - -Model Spec 3 (pytorch, 34 Billion) -++++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 34 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** TheBloke/CodeLlama-34B-fp16 - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name code-llama --size-in-billions 34 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/bge-base-en-v1.5.rst b/doc/source/models/builtin/embedding/bge-base-en-v1.5.rst similarity index 88% rename from doc/source/models/builtin/bge-base-en-v1.5.rst rename to doc/source/models/builtin/embedding/bge-base-en-v1.5.rst index d25f7e6728..014160b96e 100644 --- a/doc/source/models/builtin/bge-base-en-v1.5.rst +++ b/doc/source/models/builtin/embedding/bge-base-en-v1.5.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_base_en_v1.5: +.. _models_builtin_bge-base-en-v1.5: ================ bge-base-en-v1.5 @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-base-en-v1.5 --model-type embedding - + xinference launch --model-name bge-base-en-v1.5 --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-base-en.rst b/doc/source/models/builtin/embedding/bge-base-en.rst similarity index 89% rename from doc/source/models/builtin/bge-base-en.rst rename to doc/source/models/builtin/embedding/bge-base-en.rst index 952131eda8..b22a16cbed 100644 --- a/doc/source/models/builtin/bge-base-en.rst +++ b/doc/source/models/builtin/embedding/bge-base-en.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_base_en: +.. _models_builtin_bge-base-en: =========== bge-base-en @@ -17,6 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-base-en --model-type embedding - - + xinference launch --model-name bge-base-en --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-base-zh-v1.5.rst b/doc/source/models/builtin/embedding/bge-base-zh-v1.5.rst similarity index 88% rename from doc/source/models/builtin/bge-base-zh-v1.5.rst rename to doc/source/models/builtin/embedding/bge-base-zh-v1.5.rst index 262cc03faa..ff3862189c 100644 --- a/doc/source/models/builtin/bge-base-zh-v1.5.rst +++ b/doc/source/models/builtin/embedding/bge-base-zh-v1.5.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_base_zh_v1.5: +.. 
_models_builtin_bge-base-zh-v1.5: ================ bge-base-zh-v1.5 @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-base-zh-v1.5 --model-type embedding - + xinference launch --model-name bge-base-zh-v1.5 --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-base-zh.rst b/doc/source/models/builtin/embedding/bge-base-zh.rst similarity index 83% rename from doc/source/models/builtin/bge-base-zh.rst rename to doc/source/models/builtin/embedding/bge-base-zh.rst index 5b00cd3879..c9c910812f 100644 --- a/doc/source/models/builtin/bge-base-zh.rst +++ b/doc/source/models/builtin/embedding/bge-base-zh.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_base_zh: +.. _models_builtin_bge-base-zh: =========== bge-base-zh @@ -11,11 +11,10 @@ bge-base-zh Specifications ^^^^^^^^^^^^^^ -- **Dimensions:** 1024 +- **Dimensions:** 768 - **Max Tokens:** 512 - **Model ID:** BAAI/bge-base-zh Execute the following command to launch the model:: - xinference launch --model-name bge-base-zh --model-type embedding - + xinference launch --model-name bge-base-zh --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-large-en-v1.5.rst b/doc/source/models/builtin/embedding/bge-large-en-v1.5.rst similarity index 88% rename from doc/source/models/builtin/bge-large-en-v1.5.rst rename to doc/source/models/builtin/embedding/bge-large-en-v1.5.rst index ebd711ce92..d04f09c8ae 100644 --- a/doc/source/models/builtin/bge-large-en-v1.5.rst +++ b/doc/source/models/builtin/embedding/bge-large-en-v1.5.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_large_en_v1.5: +.. _models_builtin_bge-large-en-v1.5: ================= bge-large-en-v1.5 @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-large-en-v1.5 --model-type embedding - + xinference launch --model-name bge-large-en-v1.5 --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-large-en.rst b/doc/source/models/builtin/embedding/bge-large-en.rst similarity index 89% rename from doc/source/models/builtin/bge-large-en.rst rename to doc/source/models/builtin/embedding/bge-large-en.rst index ccb4e58046..f1588fa0f9 100644 --- a/doc/source/models/builtin/bge-large-en.rst +++ b/doc/source/models/builtin/embedding/bge-large-en.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_large_en: +.. _models_builtin_bge-large-en: ============ bge-large-en @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-large-en --model-type embedding - + xinference launch --model-name bge-large-en --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-large-zh-noinstruct.rst b/doc/source/models/builtin/embedding/bge-large-zh-noinstruct.rst similarity index 86% rename from doc/source/models/builtin/bge-large-zh-noinstruct.rst rename to doc/source/models/builtin/embedding/bge-large-zh-noinstruct.rst index 1071d6a0b3..c279f7f688 100644 --- a/doc/source/models/builtin/bge-large-zh-noinstruct.rst +++ b/doc/source/models/builtin/embedding/bge-large-zh-noinstruct.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_large_zh_noinstruct: +.. 
_models_builtin_bge-large-zh-noinstruct: ======================= bge-large-zh-noinstruct @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-large-zh-noinstruct --model-type embedding - + xinference launch --model-name bge-large-zh-noinstruct --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-large-zh-v1.5.rst b/doc/source/models/builtin/embedding/bge-large-zh-v1.5.rst similarity index 88% rename from doc/source/models/builtin/bge-large-zh-v1.5.rst rename to doc/source/models/builtin/embedding/bge-large-zh-v1.5.rst index 0d5289a9a0..42fab215a5 100644 --- a/doc/source/models/builtin/bge-large-zh-v1.5.rst +++ b/doc/source/models/builtin/embedding/bge-large-zh-v1.5.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_large_zh_v1.5: +.. _models_builtin_bge-large-zh-v1.5: ================= bge-large-zh-v1.5 @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-large-zh-v1.5 --model-type embedding - + xinference launch --model-name bge-large-zh-v1.5 --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-large-zh.rst b/doc/source/models/builtin/embedding/bge-large-zh.rst similarity index 86% rename from doc/source/models/builtin/bge-large-zh.rst rename to doc/source/models/builtin/embedding/bge-large-zh.rst index 847a69e508..65a96dcf07 100644 --- a/doc/source/models/builtin/bge-large-zh.rst +++ b/doc/source/models/builtin/embedding/bge-large-zh.rst @@ -1,7 +1,7 @@ -.. _models_builtin_bge_large_zh: +.. _models_builtin_bge-large-zh: ============ -bge-large-en +bge-large-zh ============ - **Model Name:** bge-large-zh @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-large-zh --model-type embedding - + xinference launch --model-name bge-large-zh --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-small-en-v1.5.rst b/doc/source/models/builtin/embedding/bge-small-en-v1.5.rst similarity index 88% rename from doc/source/models/builtin/bge-small-en-v1.5.rst rename to doc/source/models/builtin/embedding/bge-small-en-v1.5.rst index de5e4cbad3..665b11d49b 100644 --- a/doc/source/models/builtin/bge-small-en-v1.5.rst +++ b/doc/source/models/builtin/embedding/bge-small-en-v1.5.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_small_en_v1.5: +.. _models_builtin_bge-small-en-v1.5: ================= bge-small-en-v1.5 @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-small-en-v1.5 --model-type embedding - + xinference launch --model-name bge-small-en-v1.5 --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-small-zh-v1.5.rst b/doc/source/models/builtin/embedding/bge-small-zh-v1.5.rst similarity index 88% rename from doc/source/models/builtin/bge-small-zh-v1.5.rst rename to doc/source/models/builtin/embedding/bge-small-zh-v1.5.rst index a80edbd158..deadc73e4d 100644 --- a/doc/source/models/builtin/bge-small-zh-v1.5.rst +++ b/doc/source/models/builtin/embedding/bge-small-zh-v1.5.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_small_zh_v1.5: +.. 
_models_builtin_bge-small-zh-v1.5: ================= bge-small-zh-v1.5 @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-small-zh-v1.5 --model-type embedding - + xinference launch --model-name bge-small-zh-v1.5 --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-small-zh.rst b/doc/source/models/builtin/embedding/bge-small-zh.rst similarity index 52% rename from doc/source/models/builtin/bge-small-zh.rst rename to doc/source/models/builtin/embedding/bge-small-zh.rst index 489925b6dc..474596873d 100644 --- a/doc/source/models/builtin/bge-small-zh.rst +++ b/doc/source/models/builtin/embedding/bge-small-zh.rst @@ -1,10 +1,10 @@ -.. _models_builtin_bge_small_zh: +.. _models_builtin_bge-small-zh: ============ -bge-large-en +bge-small-zh ============ -- **Model Name:** bge_small_zh +- **Model Name:** bge-small-zh - **Languages:** zh - **Abilities:** embed @@ -13,9 +13,8 @@ Specifications - **Dimensions:** 512 - **Max Tokens:** 512 -- **Model ID:** BAAI/bge_small_zh +- **Model ID:** BAAI/bge-small-zh Execute the following command to launch the model:: - xinference launch --model-name bge_small_zh --model-type embedding - + xinference launch --model-name bge-small-zh --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/e5-large-v2.rst b/doc/source/models/builtin/embedding/e5-large-v2.rst similarity index 81% rename from doc/source/models/builtin/e5-large-v2.rst rename to doc/source/models/builtin/embedding/e5-large-v2.rst index 758e4cbebb..8737e8abca 100644 --- a/doc/source/models/builtin/e5-large-v2.rst +++ b/doc/source/models/builtin/embedding/e5-large-v2.rst @@ -1,8 +1,8 @@ -.. _models_builtin_e5_large_v2: +.. _models_builtin_e5-large-v2: -========= -gte-large -========= +=========== +e5-large-v2 +=========== - **Model Name:** e5-large-v2 - **Languages:** en @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name e5-large-v2 --model-type embedding - + xinference launch --model-name e5-large-v2 --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/gte-base.rst b/doc/source/models/builtin/embedding/gte-base.rst similarity index 81% rename from doc/source/models/builtin/gte-base.rst rename to doc/source/models/builtin/embedding/gte-base.rst index 0f379ee13f..211636e9ae 100644 --- a/doc/source/models/builtin/gte-base.rst +++ b/doc/source/models/builtin/embedding/gte-base.rst @@ -1,4 +1,4 @@ -.. _models_builtin_gte_base: +.. _models_builtin_gte-base: ======== gte-base @@ -13,9 +13,8 @@ Specifications - **Dimensions:** 768 - **Max Tokens:** 512 -- **Model ID:** thenlper/gte-large +- **Model ID:** thenlper/gte-base Execute the following command to launch the model:: - xinference launch --model-name gte-base --model-type embedding - + xinference launch --model-name gte-base --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/gte-large.rst b/doc/source/models/builtin/embedding/gte-large.rst similarity index 90% rename from doc/source/models/builtin/gte-large.rst rename to doc/source/models/builtin/embedding/gte-large.rst index 09afa2594c..f1bb7105f9 100644 --- a/doc/source/models/builtin/gte-large.rst +++ b/doc/source/models/builtin/embedding/gte-large.rst @@ -1,4 +1,4 @@ -.. _models_builtin_gte_large: +.. 
_models_builtin_gte-large: ========= gte-large @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name gte-large --model-type embedding - + xinference launch --model-name gte-large --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/embedding/index.rst b/doc/source/models/builtin/embedding/index.rst new file mode 100644 index 0000000000..26876c8529 --- /dev/null +++ b/doc/source/models/builtin/embedding/index.rst @@ -0,0 +1,49 @@ +.. _models_embedding_index: + +================ +Embedding Models +================ + +The following is a list of built-in embedding models in Xinference: + + +.. toctree:: + :maxdepth: 1 + + + bge-base-en + + bge-base-en-v1.5 + + bge-base-zh + + bge-base-zh-v1.5 + + bge-large-en + + bge-large-en-v1.5 + + bge-large-zh + + bge-large-zh-noinstruct + + bge-large-zh-v1.5 + + bge-small-en-v1.5 + + bge-small-zh + + bge-small-zh-v1.5 + + e5-large-v2 + + gte-base + + gte-large + + jina-embeddings-v2-base-en + + jina-embeddings-v2-small-en + + multilingual-e5-large + \ No newline at end of file diff --git a/doc/source/models/builtin/jina-embeddings-v2-base-en.rst b/doc/source/models/builtin/embedding/jina-embeddings-v2-base-en.rst similarity index 74% rename from doc/source/models/builtin/jina-embeddings-v2-base-en.rst rename to doc/source/models/builtin/embedding/jina-embeddings-v2-base-en.rst index 676a1885e8..627f8a61ac 100644 --- a/doc/source/models/builtin/jina-embeddings-v2-base-en.rst +++ b/doc/source/models/builtin/embedding/jina-embeddings-v2-base-en.rst @@ -1,8 +1,8 @@ -.. _models_builtin_jina_embeddings_v2_base_en: +.. _models_builtin_jina-embeddings-v2-base-en: -=========================== +========================== jina-embeddings-v2-base-en -=========================== +========================== - **Model Name:** jina-embeddings-v2-base-en - **Languages:** en @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name jina-embeddings-v2-base-en --model-type embedding - + xinference launch --model-name jina-embeddings-v2-base-en --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/jina-embeddings-v2-small-en.rst b/doc/source/models/builtin/embedding/jina-embeddings-v2-small-en.rst similarity index 85% rename from doc/source/models/builtin/jina-embeddings-v2-small-en.rst rename to doc/source/models/builtin/embedding/jina-embeddings-v2-small-en.rst index af1d9c72b0..7f35cda069 100644 --- a/doc/source/models/builtin/jina-embeddings-v2-small-en.rst +++ b/doc/source/models/builtin/embedding/jina-embeddings-v2-small-en.rst @@ -1,4 +1,4 @@ -.. _models_builtin_jina_embeddings_v2_small_en: +.. 
_models_builtin_jina-embeddings-v2-small-en: =========================== jina-embeddings-v2-small-en @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name jina-embeddings-v2-small-en --model-type embedding - + xinference launch --model-name jina-embeddings-v2-small-en --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/multilingual-e5-large.rst b/doc/source/models/builtin/embedding/multilingual-e5-large.rst similarity index 67% rename from doc/source/models/builtin/multilingual-e5-large.rst rename to doc/source/models/builtin/embedding/multilingual-e5-large.rst index 44cb618c5e..eb62dbf6db 100644 --- a/doc/source/models/builtin/multilingual-e5-large.rst +++ b/doc/source/models/builtin/embedding/multilingual-e5-large.rst @@ -1,8 +1,8 @@ -.. _models_builtin_multilingual_e5_large: +.. _models_builtin_multilingual-e5-large: -=========== -bge-base-zh -=========== +===================== +multilingual-e5-large +===================== - **Model Name:** multilingual-e5-large - **Languages:** zh @@ -12,10 +12,9 @@ Specifications ^^^^^^^^^^^^^^ - **Dimensions:** 1024 -- **Max Tokens:** 512 +- **Max Tokens:** 514 - **Model ID:** intfloat/multilingual-e5-large Execute the following command to launch the model:: - xinference launch --model-name multilingual-e5-large --model-type embedding - + xinference launch --model-name multilingual-e5-large --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/index.rst b/doc/source/models/builtin/index.rst index aae33432ee..5685f50006 100644 --- a/doc/source/models/builtin/index.rst +++ b/doc/source/models/builtin/index.rst @@ -1,180 +1,13 @@ .. _models_builtin_index: -=============== -Built-in Models -=============== +============== +Builtin Models +============== -Large Language Models -^^^^^^^^^^^^^^^^^^^^^ - -Text Generation Models -++++++++++++++++++++++ - -- :ref:`Baichuan ` -- :ref:`Baichuan-2 ` -- :ref:`Falcon ` -- :ref:`InternLM ` -- :ref:`InternLM 20B ` -- :ref:`Llama-2 ` -- :ref:`OPT ` -- :ref:`Yi ` -- :ref:`Yi-200k ` -- :ref:`Yi-chat ` -- :ref:`xverse ` - - -Chat & Instruction-following Models -+++++++++++++++++++++++++++++++++++ - -- :ref:`Baichuan Chat ` -- :ref:`Baichuan-2 Chat ` -- :ref:`ChatGLM ` -- :ref:`ChatGLM2 ` -- :ref:`ChatGLM2-32k ` -- :ref:`ChatGLM3 ` -- :ref:`ChatGLM3-32k ` -- :ref:`CodeLlama-Instruct ` -- :ref:`Falcon Instruct ` -- :ref:`InternLM Chat ` -- :ref:`InternLM Chat 20B ` -- :ref:`InternLM Chat 8K ` -- :ref:`Llama-2 Chat ` -- :ref:`OpenBuddy v11.1 ` -- :ref:`Orca Mini ` -- :ref:`Qwen Chat ` -- :ref:`Vicuna v1.3 ` -- :ref:`Vicuna v1.5 ` -- :ref:`Vicuna v1.5 16k ` -- :ref:`WizardLM v1.0 ` -- :ref:`WizardMath v1.0 ` -- :ref:`Zephyr-7B-α ` -- :ref:`Zephyr-7B-β ` -- :ref:`xverse-chat ` - - -Code Generation Models -++++++++++++++++++++++ -- :ref:`Starcoder ` -- :ref:`StarCoderPlus ` -- :ref:`Code-Llama ` -- :ref:`CodeLlama-Instruct ` -- :ref:`Code-Llama-Python ` -- :ref:`WizardCoder-Python-v1.0 ` - - -Code Assistant Models -+++++++++++++++++++++ -- :ref:`Starchat-beta ` - - -.. 
toctree:: - :maxdepth: 2 - :hidden: - - baichuan-chat - baichuan-2-chat - baichuan - baichuan-2 - chatglm - chatglm2-32k - chatglm2 - chatglm3-32k - chatglm3 - code-llama - code-llama-instruct - code-llama-python - falcon-instruct - falcon - internlm - internlm-20b - internlm-chat - internlm-chat-8k - internlm-chat-20b - llama-2-chat - llama-2 - openbuddy - opt - orca_mini - starchat-beta - starcoder - starcoderplus - qwen-chat - vicuna-v1.3 - vicuna-v1.5 - vicuna-v1.5-16k - wizardlm-v1.0 - wizardmath-v1.0 - wizardcoder-python-v1.0 - zephyr-7b-alpha - zephyr-7b-beta - Yi - Yi-200k - xverse - xverse-chat - Yi-chat - - -Embedding Models -^^^^^^^^^^^^^^^^^^^^^ - -Language: English -++++++++++++++++++++++ -- :ref:`bge-large-en ` -- :ref:`bge-large-en-v1.5 ` -- :ref:`bge-base-en ` -- :ref:`bge-base-en-v1.5 ` -- :ref:`gte-large ` -- :ref:`gte-base ` -- :ref:`e5-large-v2 ` -- :ref:`bge-small-en-v1.5 ` - - -Language: Chinese -+++++++++++++++++++++ -- :ref:`bge-large-zh ` -- :ref:`bge-large-zh-noinstruct ` -- :ref:`bge-large-zh-v1.5 ` -- :ref:`bge-base-zh ` -- :ref:`bge-base-zh-v1.5 ` -- :ref:`multilingual-e5-large ` -- :ref:`bge-small-zh ` -- :ref:`bge-small-zh-v1.5 ` -- :ref:`jina-embeddings-v2-small-en ` -- :ref:`jina-embeddings-v2-base-en ` - - -.. toctree:: - :maxdepth: 2 - :hidden: - - bge-large-en - bge-large-en-v1.5 - bge-base-en - bge-base-en-v1.5 - bge-small-en-v1.5 - gte-large - gte-base - e5-large-v2 - bge-large-zh - bge-large-zh-noinstruct - bge-large-zh-v1.5 - bge-base-zh - bge-base-zh-v1.5 - multilingual-e5-large - bge-small-zh - bge-small-zh-v1.5 - jina-embeddings-v2-small-en - jina-embeddings-v2-base-en - - -Rerank Models -++++++++++++++++++++++ -- :ref:`bge-reranker-base ` -- :ref:`bge-reranker-large ` .. toctree:: :maxdepth: 2 - :hidden: - bge-reranker-base - bge-reranker-large + llm/index + embedding/index + rerank/index \ No newline at end of file diff --git a/doc/source/models/builtin/internlm-20b.rst b/doc/source/models/builtin/internlm-20b.rst deleted file mode 100644 index 20d306927d..0000000000 --- a/doc/source/models/builtin/internlm-20b.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. _models_builtin_internlm_20b: - -================== -InternLM-20B Model -================== - -- **Context Length:** 16384 -- **Model Name:** internlm-20b -- **Languages:** en, zh -- **Abilities:** generate -- **Description:** Pre-trained on over 2.3T Tokens containing high-quality English, Chinese, and code data. - -Specifications -^^^^^^^^^^^^^^ - -Model Spec (pytorch, 20 Billion) -++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 20 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** internlm/internlm-20b -- **Model Revision:** f0433b0db933a9adfa169f756ab8547f67ccef1d diff --git a/doc/source/models/builtin/internlm-chat-20b.rst b/doc/source/models/builtin/internlm-chat-20b.rst deleted file mode 100644 index ce48b13f6e..0000000000 --- a/doc/source/models/builtin/internlm-chat-20b.rst +++ /dev/null @@ -1,22 +0,0 @@ -.. _models_builtin_internlm_chat_20b: - -================= -InternLM-Chat-20B -================= - -- **Context Length:** 16384 -- **Model Name:** internlm-chat-20b -- **Languages:** en, zh -- **Abilities:** chat -- **Description:** Pre-trained on over 2.3T Tokens containing high-quality English, Chinese, and code data. The Chat version has undergone SFT and RLHF training. 
- -Specifications -^^^^^^^^^^^^^^ - -Model Spec (pytorch, 20 Billion) -++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 20 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** internlm/internlm-chat-20b diff --git a/doc/source/models/builtin/internlm-chat-8k.rst b/doc/source/models/builtin/internlm-chat-8k.rst deleted file mode 100644 index a5edd5dd71..0000000000 --- a/doc/source/models/builtin/internlm-chat-8k.rst +++ /dev/null @@ -1,29 +0,0 @@ -.. _models_builtin_internlm_chat_8k: - - -================ -InternLM Chat 8K -================ - -- **Model Name:** internlm-chat-8k -- **Languages:** en, zh -- **Abilities:** embed, chat - -Specifications -^^^^^^^^^^^^^^ - -Model Spec (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 7 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** internlm/internlm-chat-7b-8k - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: - - xinference launch --model-name internlm-chat-8k --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. \ No newline at end of file diff --git a/doc/source/models/builtin/internlm-chat.rst b/doc/source/models/builtin/internlm-chat.rst deleted file mode 100644 index f151800cef..0000000000 --- a/doc/source/models/builtin/internlm-chat.rst +++ /dev/null @@ -1,28 +0,0 @@ -.. _models_builtin_internlm_chat: - -============= -InternLM Chat -============= - -- **Model Name:** internlm-chat -- **Languages:** en, zh -- **Abilities:** embed, chat - -Specifications -^^^^^^^^^^^^^^ - -Model Spec (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 7 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** internlm/internlm-chat-7b - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: - - xinference launch --model-name internlm-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. \ No newline at end of file diff --git a/doc/source/models/builtin/internlm.rst b/doc/source/models/builtin/internlm.rst deleted file mode 100644 index 9a09ecf06f..0000000000 --- a/doc/source/models/builtin/internlm.rst +++ /dev/null @@ -1,28 +0,0 @@ -.. _models_builtin_internlm: - -======== -InternLM -======== - -- **Model Name:** internlm -- **Languages:** en, zh -- **Abilities:** embed, generate - -Specifications -^^^^^^^^^^^^^^ - -Model Spec (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 7 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** internlm/internlm-7b - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: - - xinference launch --model-name internlm --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. 
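The regenerated LLM pages that follow are driven by ``xinference/model/llm/llm_family.json``, which
``gen_docs.py`` loads and sorts by ``model_name``. Matching the rendered ``baichuan-2-chat`` page
below against the template sketch above, a single entry presumably resembles the following; the field
names are assumptions, and only the values shown in this diff are certain::

   {
     "model_name": "baichuan-2-chat",
     "context_length": 4096,
     "model_lang": ["en", "zh"],
     "model_ability": ["chat"],
     "model_description": "Baichuan2-chat is a fine-tuned version of the Baichuan LLM, specializing in chatting.",
     "model_specs": [
       {
         "model_format": "pytorch",
         "model_size_in_billions": 7,
         "quantizations": ["4-bit", "8-bit", "none"],
         "model_id": "baichuan-inc/Baichuan2-7B-Chat"
       },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 13,
         "quantizations": ["4-bit", "8-bit", "none"],
         "model_id": "baichuan-inc/Baichuan2-13B-Chat"
       }
     ]
   }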
diff --git a/doc/source/models/builtin/baichuan-2-chat.rst b/doc/source/models/builtin/llm/baichuan-2-chat.rst similarity index 74% rename from doc/source/models/builtin/baichuan-2-chat.rst rename to doc/source/models/builtin/llm/baichuan-2-chat.rst index 9dfe2175a2..cd05690531 100644 --- a/doc/source/models/builtin/baichuan-2-chat.rst +++ b/doc/source/models/builtin/llm/baichuan-2-chat.rst @@ -1,51 +1,43 @@ -.. _models_builtin_baichuan_2_chat: +.. _models_llm_baichuan-2-chat: -=============== -Baichuan-2-Chat -=============== +======================================== +baichuan-2-chat +======================================== - **Context Length:** 4096 - **Model Name:** baichuan-2-chat - **Languages:** en, zh -- **Abilities:** embed, generate, chat +- **Abilities:** chat - **Description:** Baichuan2-chat is a fine-tuned version of the Baichuan LLM, specializing in chatting. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 - **Quantizations:** 4-bit, 8-bit, none - **Model ID:** baichuan-inc/Baichuan2-7B-Chat -- **Model Revision:** 2ce891951e000c36c65442608a0b95fd09b405dc Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name baichuan-2-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - Not supported on macOS. - Model Spec 2 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 - **Quantizations:** 4-bit, 8-bit, none - **Model ID:** baichuan-inc/Baichuan2-13B-Chat -- **Model Revision:** a56c793eb7a721ab6c270f779024e0375e8afd4a Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name baichuan-2-chat --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - Not supported on macOS. diff --git a/doc/source/models/builtin/baichuan-2.rst b/doc/source/models/builtin/llm/baichuan-2.rst similarity index 82% rename from doc/source/models/builtin/baichuan-2.rst rename to doc/source/models/builtin/llm/baichuan-2.rst index 9de361a66a..6df3d8e303 100644 --- a/doc/source/models/builtin/baichuan-2.rst +++ b/doc/source/models/builtin/llm/baichuan-2.rst @@ -1,20 +1,21 @@ -.. _models_builtin_baichuan_2: +.. _models_llm_baichuan-2: -========== -Baichuan-2 -========== +======================================== +baichuan-2 +======================================== - **Context Length:** 4096 - **Model Name:** baichuan-2 - **Languages:** en, zh -- **Abilities:** embed, generate +- **Abilities:** generate - **Description:** Baichuan2 is an open-source Transformer based LLM that is trained on both Chinese and English data. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -26,12 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name baichuan-2 --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - Not supported on macOS. 
Model Spec 2 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 @@ -43,6 +41,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name baichuan-2 --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - Not supported on macOS. diff --git a/doc/source/models/builtin/baichuan-chat.rst b/doc/source/models/builtin/llm/baichuan-chat.rst similarity index 59% rename from doc/source/models/builtin/baichuan-chat.rst rename to doc/source/models/builtin/llm/baichuan-chat.rst index 149e8d021d..ef707b12e8 100644 --- a/doc/source/models/builtin/baichuan-chat.rst +++ b/doc/source/models/builtin/llm/baichuan-chat.rst @@ -1,16 +1,22 @@ -.. _models_builtin_baichuan_chat: +.. _models_llm_baichuan-chat: -============= -Baichuan Chat -============= +======================================== +baichuan-chat +======================================== +- **Context Length:** 4096 - **Model Name:** baichuan-chat - **Languages:** en, zh -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** Baichuan-chat is a fine-tuned version of the Baichuan LLM, specializing in chatting. Specifications ^^^^^^^^^^^^^^ + +Model Spec 1 (pytorch, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + - **Model Format:** pytorch - **Model Size (in billions):** 13 - **Quantizations:** 4-bit, 8-bit, none @@ -21,6 +27,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name baichuan-chat --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - Not supported on macOS. diff --git a/doc/source/models/builtin/baichuan.rst b/doc/source/models/builtin/llm/baichuan.rst similarity index 72% rename from doc/source/models/builtin/baichuan.rst rename to doc/source/models/builtin/llm/baichuan.rst index 87499ff72c..1470bdf6c8 100644 --- a/doc/source/models/builtin/baichuan.rst +++ b/doc/source/models/builtin/llm/baichuan.rst @@ -1,35 +1,35 @@ -.. _models_builtin_baichuan: +.. _models_llm_baichuan: -======== -Baichuan -======== +======================================== +baichuan +======================================== +- **Context Length:** 4096 - **Model Name:** baichuan - **Languages:** en, zh -- **Abilities:** embed, generate +- **Abilities:** generate +- **Description:** Baichuan is an open-source Transformer based LLM that is trained on both Chinese and English data. Specifications ^^^^^^^^^^^^^^ -Model Spec 1 (ggmlv3) -+++++++++++++++++++++ + +Model Spec 1 (ggmlv3, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 7 - **Quantizations:** q2_K, q3_K_L, q3_K_M, q3_K_S, q4_0, q4_1, q4_K_M, q4_K_S, q5_0, q5_1, q5_K_M, q5_K_S, q6_K, q8_0 - **Model ID:** TheBloke/baichuan-llama-7B-GGML -Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: xinference launch --model-name baichuan --size-in-billions 7 --model-format ggmlv3 --quantization ${quantization} -.. note:: - - For utilizing the Apple Metal GPU for acceleration, select the q4_0 and q4_1 quantizations. 
- Model Spec 2 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -41,12 +41,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name baichuan --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - Not supported on macOS. Model Spec 3 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 @@ -58,6 +55,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name baichuan --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - Not supported on macOS. diff --git a/doc/source/models/builtin/chatglm.rst b/doc/source/models/builtin/llm/chatglm.rst similarity index 72% rename from doc/source/models/builtin/chatglm.rst rename to doc/source/models/builtin/llm/chatglm.rst index 9053841839..86a88b8c53 100644 --- a/doc/source/models/builtin/chatglm.rst +++ b/doc/source/models/builtin/llm/chatglm.rst @@ -1,18 +1,21 @@ -.. _models_builtin_chatglm: +.. _models_llm_chatglm: -======= -ChatGLM -======= +======================================== +chatglm +======================================== +- **Context Length:** 2048 - **Model Name:** chatglm - **Languages:** en, zh -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** ChatGLM is an open-source General Language Model (GLM) based LLM trained on both Chinese and English data. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (ggmlv3, 6 Billion) -++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 6 @@ -24,8 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name chatglm --size-in-billions 6 --model-format ggmlv3 --quantization ${quantization} + Model Spec 2 (pytorch, 6 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 6 @@ -37,6 +41,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name chatglm --size-in-billions 6 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/chatglm2-32k.rst b/doc/source/models/builtin/llm/chatglm2-32k.rst similarity index 62% rename from doc/source/models/builtin/chatglm2-32k.rst rename to doc/source/models/builtin/llm/chatglm2-32k.rst index d61865dc46..bd06f55dbc 100644 --- a/doc/source/models/builtin/chatglm2-32k.rst +++ b/doc/source/models/builtin/llm/chatglm2-32k.rst @@ -1,18 +1,21 @@ -.. _models_builtin_chatglm2_32k: +.. _models_llm_chatglm2-32k: -============ -ChatGLM2 32k -============ +======================================== +chatglm2-32k +======================================== +- **Context Length:** 32768 - **Model Name:** chatglm2-32k - **Languages:** en, zh -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** ChatGLM2-32k is a special version of ChatGLM2, with a context window of 32k tokens instead of 8k. 
Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 6 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 6 @@ -24,6 +27,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name chatglm2-32k --size-in-billions 6 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/chatglm2.rst b/doc/source/models/builtin/llm/chatglm2.rst similarity index 72% rename from doc/source/models/builtin/chatglm2.rst rename to doc/source/models/builtin/llm/chatglm2.rst index 57e44b8a08..77cd408009 100644 --- a/doc/source/models/builtin/chatglm2.rst +++ b/doc/source/models/builtin/llm/chatglm2.rst @@ -1,18 +1,21 @@ -.. _models_builtin_chatglm2: +.. _models_llm_chatglm2: -======== -ChatGLM2 -======== +======================================== +chatglm2 +======================================== +- **Context Length:** 8192 - **Model Name:** chatglm2 - **Languages:** en, zh -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** ChatGLM2 is the second generation of ChatGLM, still open-source and trained on Chinese and English data. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (ggmlv3, 6 Billion) -++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 6 @@ -24,8 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name chatglm2 --size-in-billions 6 --model-format ggmlv3 --quantization ${quantization} + Model Spec 2 (pytorch, 6 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 6 @@ -37,6 +41,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name chatglm2 --size-in-billions 6 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/chatglm3-32k.rst b/doc/source/models/builtin/llm/chatglm3-32k.rst similarity index 77% rename from doc/source/models/builtin/chatglm3-32k.rst rename to doc/source/models/builtin/llm/chatglm3-32k.rst index ca626bd2cc..74fe11bcd5 100644 --- a/doc/source/models/builtin/chatglm3-32k.rst +++ b/doc/source/models/builtin/llm/chatglm3-32k.rst @@ -1,9 +1,8 @@ -.. _models_builtin_chatglm3_32k: +.. _models_llm_chatglm3-32k: - -============ -ChatGLM3-32K -============ +======================================== +chatglm3-32k +======================================== - **Context Length:** 32768 - **Model Name:** chatglm3-32k @@ -14,8 +13,9 @@ ChatGLM3-32K Specifications ^^^^^^^^^^^^^^ -Model Spec (pytorch, 6 Billion) -+++++++++++++++++++++++++++++++ + +Model Spec 1 (pytorch, 6 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 6 @@ -27,6 +27,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name chatglm3-32k --size-in-billions 6 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. 
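Since ``gen_docs.py`` reads the spec files via ``../../xinference/model/...`` and writes into
``./models/builtin/...``, the pages in this diff were presumably regenerated by running the script
from ``doc/source``::

   cd doc/source
   python gen_docs.py

Any change to a description or model spec therefore belongs in the JSON files; an edit made by hand
to a generated ``.rst`` page would be overwritten on the next run.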
diff --git a/doc/source/models/builtin/chatglm3.rst b/doc/source/models/builtin/llm/chatglm3.rst similarity index 50% rename from doc/source/models/builtin/chatglm3.rst rename to doc/source/models/builtin/llm/chatglm3.rst index fc68c9f0f2..dc153b4191 100644 --- a/doc/source/models/builtin/chatglm3.rst +++ b/doc/source/models/builtin/llm/chatglm3.rst @@ -1,9 +1,8 @@ -.. _models_builtin_chatglm3: +.. _models_llm_chatglm3: - -======== -ChatGLM3 -======== +======================================== +chatglm3 +======================================== - **Context Length:** 8192 - **Model Name:** chatglm3 @@ -14,8 +13,23 @@ ChatGLM3 Specifications ^^^^^^^^^^^^^^ -Model Spec (pytorch, 6 Billion) -+++++++++++++++++++++++++++++++ + +Model Spec 1 (ggmlv3, 6 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggmlv3 +- **Model Size (in billions):** 6 +- **Quantizations:** q4_0 +- **Model ID:** Xorbits/chatglm3-6B-GGML + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name chatglm3 --size-in-billions 6 --model-format ggmlv3 --quantization ${quantization} + + +Model Spec 2 (pytorch, 6 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 6 @@ -27,6 +41,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name chatglm3 --size-in-billions 6 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/llm/code-llama-instruct.rst b/doc/source/models/builtin/llm/code-llama-instruct.rst new file mode 100644 index 0000000000..cab055ab7e --- /dev/null +++ b/doc/source/models/builtin/llm/code-llama-instruct.rst @@ -0,0 +1,99 @@ +.. _models_llm_code-llama-instruct: + +======================================== +code-llama-instruct +======================================== + +- **Context Length:** 100000 +- **Model Name:** code-llama-instruct +- **Languages:** en +- **Abilities:** chat +- **Description:** Code-Llama-Instruct is an instruct-tuned version of the Code-Llama LLM. 
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** codellama/CodeLlama-7b-Instruct-hf + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 13 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** codellama/CodeLlama-13b-Instruct-hf + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-instruct --size-in-billions 13 --model-format pytorch --quantization ${quantization} + + +Model Spec 3 (pytorch, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 34 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** codellama/CodeLlama-34b-Instruct-hf + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-instruct --size-in-billions 34 --model-format pytorch --quantization ${quantization} + + +Model Spec 4 (ggufv2, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 7 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/CodeLlama-7B-Instruct-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-instruct --size-in-billions 7 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 5 (ggufv2, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 13 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/CodeLlama-13B-Instruct-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-instruct --size-in-billions 13 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 6 (ggufv2, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 34 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/CodeLlama-34B-Instruct-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-instruct --size-in-billions 34 --model-format ggufv2 --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/code-llama-python.rst 
b/doc/source/models/builtin/llm/code-llama-python.rst new file mode 100644 index 0000000000..53b38487ea --- /dev/null +++ b/doc/source/models/builtin/llm/code-llama-python.rst @@ -0,0 +1,99 @@ +.. _models_llm_code-llama-python: + +======================================== +code-llama-python +======================================== + +- **Context Length:** 100000 +- **Model Name:** code-llama-python +- **Languages:** en +- **Abilities:** generate +- **Description:** Code-Llama-Python is a fine-tuned version of the Code-Llama LLM, specializing in Python. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** TheBloke/CodeLlama-7B-Python-fp16 + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-python --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 13 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** TheBloke/CodeLlama-13B-Python-fp16 + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-python --size-in-billions 13 --model-format pytorch --quantization ${quantization} + + +Model Spec 3 (pytorch, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 34 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** TheBloke/CodeLlama-34B-Python-fp16 + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-python --size-in-billions 34 --model-format pytorch --quantization ${quantization} + + +Model Spec 4 (ggufv2, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 7 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/CodeLlama-7B-Python-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-python --size-in-billions 7 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 5 (ggufv2, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 13 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/CodeLlama-13B-Python-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-python --size-in-billions 13 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 6 (ggufv2, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 34 +- **Quantizations:** 
Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/CodeLlama-34B-Python-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-python --size-in-billions 34 --model-format ggufv2 --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/code-llama.rst b/doc/source/models/builtin/llm/code-llama.rst new file mode 100644 index 0000000000..dea196ca9b --- /dev/null +++ b/doc/source/models/builtin/llm/code-llama.rst @@ -0,0 +1,99 @@ +.. _models_llm_code-llama: + +======================================== +code-llama +======================================== + +- **Context Length:** 100000 +- **Model Name:** code-llama +- **Languages:** en +- **Abilities:** generate +- **Description:** Code-Llama is an open-source LLM trained by fine-tuning LLaMA2 for generating and discussing code. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** TheBloke/CodeLlama-7B-fp16 + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 13 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** TheBloke/CodeLlama-13B-fp16 + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama --size-in-billions 13 --model-format pytorch --quantization ${quantization} + + +Model Spec 3 (pytorch, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 34 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** TheBloke/CodeLlama-34B-fp16 + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama --size-in-billions 34 --model-format pytorch --quantization ${quantization} + + +Model Spec 4 (ggufv2, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 7 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/CodeLlama-7B-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama --size-in-billions 7 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 5 (ggufv2, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 13 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/CodeLlama-13B-GGUF + +Execute the following command to launch the model, 
remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama --size-in-billions 13 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 6 (ggufv2, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 34 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/CodeLlama-34B-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama --size-in-billions 34 --model-format ggufv2 --quantization ${quantization} + diff --git a/doc/source/models/builtin/falcon-instruct.rst b/doc/source/models/builtin/llm/falcon-instruct.rst similarity index 73% rename from doc/source/models/builtin/falcon-instruct.rst rename to doc/source/models/builtin/llm/falcon-instruct.rst index 6b348e4fcf..69e2a91c82 100644 --- a/doc/source/models/builtin/falcon-instruct.rst +++ b/doc/source/models/builtin/llm/falcon-instruct.rst @@ -1,18 +1,21 @@ -.. _models_builtin_falcon_instruct: +.. _models_llm_falcon-instruct: -=============== -Falcon Instruct -=============== +======================================== +falcon-instruct +======================================== +- **Context Length:** 2048 - **Model Name:** falcon-instruct - **Languages:** en -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** Falcon-instruct is a fine-tuned version of the Falcon LLM, specializing in chatting. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -24,12 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name falcon-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. Model Spec 2 (pytorch, 40 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 40 @@ -41,6 +41,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name falcon-instruct --size-in-billions 40 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/falcon.rst b/doc/source/models/builtin/llm/falcon.rst similarity index 73% rename from doc/source/models/builtin/falcon.rst rename to doc/source/models/builtin/llm/falcon.rst index 9c206e29cd..ae5d185cbf 100644 --- a/doc/source/models/builtin/falcon.rst +++ b/doc/source/models/builtin/llm/falcon.rst @@ -1,46 +1,43 @@ -.. _models_builtin_falcon: +.. _models_llm_falcon: -====== -Falcon -====== +======================================== +falcon +======================================== +- **Context Length:** 2048 - **Model Name:** falcon - **Languages:** en -- **Abilities:** embed, generate +- **Abilities:** generate +- **Description:** Falcon is an open-source Transformer based LLM trained on the RefinedWeb dataset. 
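+In addition to the CLI commands shown in the specifications below, a launched model
+can be driven from Python. The following is a minimal sketch, assuming a locally
+running Xinference endpoint at ``http://127.0.0.1:9997`` and the ``xinference``
+Python client; adjust the endpoint, size, and quantization to your deployment::
+
+    from xinference.client import Client
+
+    # Connect to a running Xinference supervisor (endpoint is an assumption).
+    client = Client("http://127.0.0.1:9997")
+
+    # Mirrors the CLI: pick one of the quantizations listed in the spec.
+    model_uid = client.launch_model(
+        model_name="falcon",
+        model_format="pytorch",
+        model_size_in_billions=7,
+        quantization="8-bit",
+    )
+
+    # "generate"-ability models expose a completion-style interface.
+    model = client.get_model(model_uid)
+    print(model.generate("Falcons are", generate_config={"max_tokens": 64}))
+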
Specifications ^^^^^^^^^^^^^^ -Model Spec 2 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ + +Model Spec 1 (pytorch, 40 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch -- **Model Size (in billions):** 7 +- **Model Size (in billions):** 40 - **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** tiiuae/falcon-7b +- **Model ID:** tiiuae/falcon-40b Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: - xinference launch --model-name falcon --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: + xinference launch --model-name falcon --size-in-billions 40 --model-format pytorch --quantization ${quantization} - 4-bit quantization is not supported on macOS. -Model Spec 1 (pytorch, 40 Billion) -++++++++++++++++++++++++++++++++++ +Model Spec 2 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch -- **Model Size (in billions):** 40 +- **Model Size (in billions):** 7 - **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** tiiuae/falcon-40b +- **Model ID:** tiiuae/falcon-7b Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: - xinference launch --model-name falcon --size-in-billions 40 --model-format pytorch --quantization ${quantization} - -.. note:: + xinference launch --model-name falcon --size-in-billions 7 --model-format pytorch --quantization ${quantization} - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/llm/glaive-coder.rst b/doc/source/models/builtin/llm/glaive-coder.rst new file mode 100644 index 0000000000..e7084293e0 --- /dev/null +++ b/doc/source/models/builtin/llm/glaive-coder.rst @@ -0,0 +1,29 @@ +.. _models_llm_glaive-coder: + +======================================== +glaive-coder +======================================== + +- **Context Length:** 100000 +- **Model Name:** glaive-coder +- **Languages:** en +- **Abilities:** chat +- **Description:** A code model trained on a dataset of ~140k programming-related problems and solutions generated from Glaive’s synthetic data generation platform. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** glaiveai/glaive-coder-7b + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name glaive-coder --size-in-billions 7 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/gpt-2.rst b/doc/source/models/builtin/llm/gpt-2.rst new file mode 100644 index 0000000000..713c932128 --- /dev/null +++ b/doc/source/models/builtin/llm/gpt-2.rst @@ -0,0 +1,29 @@ +.. _models_llm_gpt-2: + +======================================== +gpt-2 +======================================== + +- **Context Length:** 1024 +- **Model Name:** gpt-2 +- **Languages:** en +- **Abilities:** generate +- **Description:** GPT-2 is a Transformer-based LLM that is trained on WebText, a 40 GB dataset of web pages linked from Reddit posts with at least 3 karma.
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (ggmlv3, 1 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggmlv3 +- **Model Size (in billions):** 1 +- **Quantizations:** none +- **Model ID:** marella/gpt-2-ggml + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name gpt-2 --size-in-billions 1 --model-format ggmlv3 --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst new file mode 100644 index 0000000000..d2728e51da --- /dev/null +++ b/doc/source/models/builtin/llm/index.rst @@ -0,0 +1,103 @@ +.. _models_llm_index: + +===================== +Large Language Models +===================== + +The following is a list of built-in LLMs in Xinference: + + +.. toctree:: + :maxdepth: 3 + + + baichuan + + baichuan-2 + + baichuan-2-chat + + baichuan-chat + + chatglm + + chatglm2 + + chatglm2-32k + + chatglm3 + + chatglm3-32k + + code-llama + + code-llama-instruct + + code-llama-python + + falcon + + falcon-instruct + + glaive-coder + + gpt-2 + + internlm-20b + + internlm-7b + + internlm-chat-20b + + internlm-chat-7b + + llama-2 + + llama-2-chat + + mistral-instruct-v0.1 + + mistral-v0.1 + + openbuddy + + opt + + orca + + qwen-chat + + starchat-beta + + starcoder + + starcoderplus + + tiny-llama + + vicuna-v1.3 + + vicuna-v1.5 + + vicuna-v1.5-16k + + wizardcoder-python-v1.0 + + wizardlm-v1.0 + + wizardmath-v1.0 + + xverse + + xverse-chat + + yi + + yi-200k + + yi-chat + + zephyr-7b-alpha + + zephyr-7b-beta + \ No newline at end of file diff --git a/doc/source/models/builtin/llm/internlm-20b.rst b/doc/source/models/builtin/llm/internlm-20b.rst new file mode 100644 index 0000000000..5a6b98128e --- /dev/null +++ b/doc/source/models/builtin/llm/internlm-20b.rst @@ -0,0 +1,29 @@ +.. _models_llm_internlm-20b: + +======================================== +internlm-20b +======================================== + +- **Context Length:** 16384 +- **Model Name:** internlm-20b +- **Languages:** en, zh +- **Abilities:** generate +- **Description:** Pre-trained on over 2.3T tokens containing high-quality English, Chinese, and code data. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 20 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 20 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** internlm/internlm-20b + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name internlm-20b --size-in-billions 20 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/internlm-7b.rst b/doc/source/models/builtin/llm/internlm-7b.rst new file mode 100644 index 0000000000..cb28a7f6fc --- /dev/null +++ b/doc/source/models/builtin/llm/internlm-7b.rst @@ -0,0 +1,29 @@ +.. _models_llm_internlm-7b: + +======================================== +internlm-7b +======================================== + +- **Context Length:** 8192 +- **Model Name:** internlm-7b +- **Languages:** en, zh +- **Abilities:** generate +- **Description:** InternLM is a Transformer-based LLM that is trained on both Chinese and English data, focusing on practical scenarios.
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** internlm/internlm-7b + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name internlm-7b --size-in-billions 7 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/internlm-chat-20b.rst b/doc/source/models/builtin/llm/internlm-chat-20b.rst new file mode 100644 index 0000000000..acc30e1d2e --- /dev/null +++ b/doc/source/models/builtin/llm/internlm-chat-20b.rst @@ -0,0 +1,29 @@ +.. _models_llm_internlm-chat-20b: + +======================================== +internlm-chat-20b +======================================== + +- **Context Length:** 16384 +- **Model Name:** internlm-chat-20b +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** Pre-trained on over 2.3T Tokens containing high-quality English, Chinese, and code data. The Chat version has undergone SFT and RLHF training. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 20 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 20 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** internlm/internlm-chat-20b + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name internlm-chat-20b --size-in-billions 20 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/internlm-chat-7b.rst b/doc/source/models/builtin/llm/internlm-chat-7b.rst new file mode 100644 index 0000000000..9b925279ae --- /dev/null +++ b/doc/source/models/builtin/llm/internlm-chat-7b.rst @@ -0,0 +1,29 @@ +.. _models_llm_internlm-chat-7b: + +======================================== +internlm-chat-7b +======================================== + +- **Context Length:** 4096 +- **Model Name:** internlm-chat-7b +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** Internlm-chat is a fine-tuned version of the Internlm LLM, specializing in chatting. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** internlm/internlm-chat-7b + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name internlm-chat-7b --size-in-billions 7 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llama-2-chat.rst b/doc/source/models/builtin/llm/llama-2-chat.rst similarity index 85% rename from doc/source/models/builtin/llama-2-chat.rst rename to doc/source/models/builtin/llm/llama-2-chat.rst index 7903297796..85891b5806 100644 --- a/doc/source/models/builtin/llama-2-chat.rst +++ b/doc/source/models/builtin/llm/llama-2-chat.rst @@ -1,18 +1,21 @@ -.. _models_builtin_llama_2_chat: +.. 
_models_llm_llama-2-chat: -============ -Llama-2 Chat -============ +======================================== +llama-2-chat +======================================== +- **Context Length:** 4096 - **Model Name:** llama-2-chat - **Languages:** en -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** Llama-2-Chat is a fine-tuned version of the Llama-2 LLM, specializing in chatting. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (ggmlv3, 7 Billion) -++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 7 @@ -24,8 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2-chat --size-in-billions 7 --model-format ggmlv3 --quantization ${quantization} + Model Spec 2 (ggmlv3, 13 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 13 @@ -39,7 +43,7 @@ chosen quantization method from the options listed above:: Model Spec 3 (ggmlv3, 70 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 70 @@ -51,8 +55,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2-chat --size-in-billions 70 --model-format ggmlv3 --quantization ${quantization} + Model Spec 4 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -64,13 +69,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. - Model Spec 5 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 @@ -82,12 +83,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2-chat --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. Model Spec 6 (pytorch, 70 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 70 @@ -99,6 +97,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2-chat --size-in-billions 70 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. \ No newline at end of file diff --git a/doc/source/models/builtin/llama-2.rst b/doc/source/models/builtin/llm/llama-2.rst similarity index 85% rename from doc/source/models/builtin/llama-2.rst rename to doc/source/models/builtin/llm/llama-2.rst index 116a3d65ff..8fa544c92c 100644 --- a/doc/source/models/builtin/llama-2.rst +++ b/doc/source/models/builtin/llm/llama-2.rst @@ -1,18 +1,21 @@ -.. _models_builtin_llama_2: +.. _models_llm_llama-2: -======= -Llama-2 -======= +======================================== +llama-2 +======================================== +- **Context Length:** 4096 - **Model Name:** llama-2 - **Languages:** en -- **Abilities:** embed, generate +- **Abilities:** generate +- **Description:** Llama-2 is the second generation of Llama, open-source and trained on a larger amount of data. 
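+For long completions, token streaming can be enabled through ``generate_config``.
+A minimal sketch, assuming the Python client from the example above and that the
+chosen backend supports ``"stream": True`` (the model UID is hypothetical)::
+
+    from xinference.client import Client
+
+    client = Client("http://127.0.0.1:9997")
+    model = client.get_model("my-llama-2-uid")  # hypothetical UID from launch_model
+
+    # With streaming enabled, generate() yields incremental completion chunks.
+    for chunk in model.generate(
+        "Write a short poem about inference servers.",
+        generate_config={"stream": True, "max_tokens": 64},
+    ):
+        print(chunk["choices"][0]["text"], end="", flush=True)
+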
Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (ggmlv3, 7 Billion) -++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 7 @@ -24,8 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2 --size-in-billions 7 --model-format ggmlv3 --quantization ${quantization} + Model Spec 2 (ggmlv3, 13 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 13 @@ -37,8 +41,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2 --size-in-billions 13 --model-format ggmlv3 --quantization ${quantization} + Model Spec 3 (ggmlv3, 70 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 70 @@ -50,8 +55,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2 --size-in-billions 70 --model-format ggmlv3 --quantization ${quantization} + Model Spec 4 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -63,12 +69,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2 --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. Model Spec 5 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 @@ -80,12 +83,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2 --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. Model Spec 6 (pytorch, 70 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 70 @@ -97,6 +97,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2 --size-in-billions 70 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/llm/mistral-instruct-v0.1.rst b/doc/source/models/builtin/llm/mistral-instruct-v0.1.rst new file mode 100644 index 0000000000..6e31edc381 --- /dev/null +++ b/doc/source/models/builtin/llm/mistral-instruct-v0.1.rst @@ -0,0 +1,43 @@ +.. _models_llm_mistral-instruct-v0.1: + +======================================== +mistral-instruct-v0.1 +======================================== + +- **Context Length:** 8192 +- **Model Name:** mistral-instruct-v0.1 +- **Languages:** en +- **Abilities:** chat +- **Description:** Mistral-7B-Instruct is a version of the Mistral-7B LLM fine-tuned on public datasets, specializing in chatting.
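+Models with the ``chat`` ability, such as this one, return a chat-style handle from
+the Python client. A minimal sketch (endpoint and reply shape are assumptions; see
+the specifications below for valid formats and quantizations)::
+
+    from xinference.client import Client
+
+    client = Client("http://127.0.0.1:9997")
+    model_uid = client.launch_model(
+        model_name="mistral-instruct-v0.1",
+        model_format="ggufv2",
+        model_size_in_billions=7,
+        quantization="Q4_K_M",
+    )
+
+    # chat() accepts a prompt plus optional history and generation settings.
+    model = client.get_model(model_uid)
+    reply = model.chat(
+        "Give me three tips for writing readable Python.",
+        chat_history=[],
+        generate_config={"max_tokens": 128},
+    )
+    print(reply["choices"][0]["message"]["content"])
+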
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** mistralai/Mistral-7B-Instruct-v0.1 + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name mistral-instruct-v0.1 --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (ggufv2, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 7 +- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_0, Q4_K_S, Q4_K_M, Q5_0, Q5_K_S, Q5_K_M, Q6_K, Q8_0 +- **Model ID:** TheBloke/Mistral-7B-Instruct-v0.1-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name mistral-instruct-v0.1 --size-in-billions 7 --model-format ggufv2 --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/mistral-v0.1.rst b/doc/source/models/builtin/llm/mistral-v0.1.rst new file mode 100644 index 0000000000..fdb7750962 --- /dev/null +++ b/doc/source/models/builtin/llm/mistral-v0.1.rst @@ -0,0 +1,43 @@ +.. _models_llm_mistral-v0.1: + +======================================== +mistral-v0.1 +======================================== + +- **Context Length:** 8192 +- **Model Name:** mistral-v0.1 +- **Languages:** en +- **Abilities:** generate +- **Description:** Mistral-7B is an unmoderated Transformer-based LLM that claims to outperform Llama 2 on all benchmarks. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** mistralai/Mistral-7B-v0.1 + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name mistral-v0.1 --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (ggufv2, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 7 +- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_0, Q4_K_S, Q4_K_M, Q5_0, Q5_K_S, Q5_K_M, Q6_K, Q8_0 +- **Model ID:** TheBloke/Mistral-7B-v0.1-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name mistral-v0.1 --size-in-billions 7 --model-format ggufv2 --quantization ${quantization} + diff --git a/doc/source/models/builtin/openbuddy.rst b/doc/source/models/builtin/llm/openbuddy.rst similarity index 66% rename from doc/source/models/builtin/openbuddy.rst rename to doc/source/models/builtin/llm/openbuddy.rst index 1d6c66ee98..b50eb16c9e 100644 --- a/doc/source/models/builtin/openbuddy.rst +++ b/doc/source/models/builtin/llm/openbuddy.rst @@ -1,18 +1,21 @@ -.. _models_builtin_openbuddy_v11.1: +.. 
_models_llm_openbuddy: -========= +======================================== OpenBuddy -========= +======================================== +- **Context Length:** 2048 - **Model Name:** OpenBuddy -- **Languages:** en, zh -- **Abilities:** embed, chat +- **Languages:** en +- **Abilities:** chat +- **Description:** OpenBuddy is a powerful open multilingual chatbot model aimed at global users. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (ggmlv3, 13 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 13 @@ -24,6 +27,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name OpenBuddy --size-in-billions 13 --model-format ggmlv3 --quantization ${quantization} -.. note:: - - Multiple rounds chat is disabled for better translation. diff --git a/doc/source/models/builtin/opt.rst b/doc/source/models/builtin/llm/opt.rst similarity index 62% rename from doc/source/models/builtin/opt.rst rename to doc/source/models/builtin/llm/opt.rst index 3bf7999a4c..1a57961a89 100644 --- a/doc/source/models/builtin/opt.rst +++ b/doc/source/models/builtin/llm/opt.rst @@ -1,18 +1,21 @@ -.. _models_builtin_opt: +.. _models_llm_opt: -=== -OPT -=== +======================================== +opt +======================================== +- **Context Length:** 2048 - **Model Name:** opt - **Languages:** en -- **Abilities:** embed, generate +- **Abilities:** generate +- **Description:** Opt is an open-source, decoder-only, Transformer based LLM that was designed to replicate GPT-3. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 1 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 1 @@ -24,6 +27,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name opt --size-in-billions 1 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/orca_mini.rst b/doc/source/models/builtin/llm/orca.rst similarity index 79% rename from doc/source/models/builtin/orca_mini.rst rename to doc/source/models/builtin/llm/orca.rst index aa8b92d7ab..0152c3259c 100644 --- a/doc/source/models/builtin/orca_mini.rst +++ b/doc/source/models/builtin/llm/orca.rst @@ -1,18 +1,21 @@ -.. _models_builtin_orca_mini: +.. _models_llm_orca: -========= -Orca Mini -========= +======================================== +orca +======================================== +- **Context Length:** 2048 - **Model Name:** orca - **Languages:** en -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** Orca is an LLM trained by fine-tuning LLaMA on explanation traces obtained from GPT-4. 
Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (ggmlv3, 3 Billion) -++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 3 @@ -24,8 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name orca --size-in-billions 3 --model-format ggmlv3 --quantization ${quantization} + Model Spec 2 (ggmlv3, 7 Billion) -++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 7 @@ -37,8 +41,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name orca --size-in-billions 7 --model-format ggmlv3 --quantization ${quantization} + Model Spec 3 (ggmlv3, 13 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 13 @@ -49,3 +54,4 @@ Execute the following command to launch the model, remember to replace ``${quant chosen quantization method from the options listed above:: xinference launch --model-name orca --size-in-billions 13 --model-format ggmlv3 --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/qwen-chat.rst b/doc/source/models/builtin/llm/qwen-chat.rst new file mode 100644 index 0000000000..4c27fb5e0f --- /dev/null +++ b/doc/source/models/builtin/llm/qwen-chat.rst @@ -0,0 +1,127 @@ +.. _models_llm_qwen-chat: + +======================================== +qwen-chat +======================================== + +- **Context Length:** 2048 +- **Model Name:** qwen-chat +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** Qwen-chat is a fine-tuned version of the Qwen LLM trained with alignment techniques, specializing in chatting. 
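+For the ``gptq`` specs below, the chosen quantization is also substituted into the
+``{quantization}`` placeholder of the Model ID, so ``Int4`` resolves to
+``Qwen/Qwen-7B-Chat-Int4``. A minimal sketch with the Python client (endpoint
+assumed, as in the earlier examples)::
+
+    from xinference.client import Client
+
+    client = Client("http://127.0.0.1:9997")
+    model_uid = client.launch_model(
+        model_name="qwen-chat",
+        model_format="gptq",
+        model_size_in_billions=7,
+        quantization="Int4",  # fills the {quantization} slot of the Model ID
+    )
+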
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (ggmlv3, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggmlv3 +- **Model Size (in billions):** 7 +- **Quantizations:** q4_0 +- **Model ID:** Xorbits/qwen-chat-7B-ggml + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen-chat --size-in-billions 7 --model-format ggmlv3 --quantization ${quantization} + + +Model Spec 2 (ggmlv3, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggmlv3 +- **Model Size (in billions):** 14 +- **Quantizations:** q4_0 +- **Model ID:** Xorbits/qwen-chat-14B-ggml + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen-chat --size-in-billions 14 --model-format ggmlv3 --quantization ${quantization} + + +Model Spec 3 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** Qwen/Qwen-7B-Chat + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 4 (pytorch, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 14 +- **Quantizations:** none +- **Model ID:** Qwen/Qwen-14B-Chat + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen-chat --size-in-billions 14 --model-format pytorch --quantization ${quantization} + + +Model Spec 5 (pytorch, 72 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 72 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** Qwen/Qwen-72B-Chat + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen-chat --size-in-billions 72 --model-format pytorch --quantization ${quantization} + + +Model Spec 6 (gptq, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 7 +- **Quantizations:** Int4, Int8 +- **Model ID:** Qwen/Qwen-7B-Chat-{quantization} + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen-chat --size-in-billions 7 --model-format gptq --quantization ${quantization} + + +Model Spec 7 (gptq, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 14 +- **Quantizations:** Int4, Int8 +- **Model ID:** Qwen/Qwen-14B-Chat-{quantization} + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen-chat --size-in-billions 14 --model-format gptq --quantization 
${quantization} + + +Model Spec 8 (gptq, 72 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 72 +- **Quantizations:** Int4, Int8 +- **Model ID:** Qwen/Qwen-72B-Chat-{quantization} + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen-chat --size-in-billions 72 --model-format gptq --quantization ${quantization} + diff --git a/doc/source/models/builtin/starchat-beta.rst b/doc/source/models/builtin/llm/starchat-beta.rst similarity index 58% rename from doc/source/models/builtin/starchat-beta.rst rename to doc/source/models/builtin/llm/starchat-beta.rst index 6f260ed7dd..a0637061c8 100644 --- a/doc/source/models/builtin/starchat-beta.rst +++ b/doc/source/models/builtin/llm/starchat-beta.rst @@ -1,18 +1,21 @@ -.. _models_builtin_starchat_beta: +.. _models_llm_starchat-beta: -============= -Starchat-beta -============= +======================================== +starchat-beta +======================================== +- **Context Length:** 8192 - **Model Name:** starchat-beta - **Languages:** en -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** Starchat-beta is a fine-tuned version of the Starcoderplus LLM, specializing in coding assistance. Specifications ^^^^^^^^^^^^^^ -Model Spec (pytorch, 16 Billion) -++++++++++++++++++++++++++++++++ + +Model Spec 1 (pytorch, 16 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 16 @@ -24,6 +27,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name starchat-beta --size-in-billions 16 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. \ No newline at end of file diff --git a/doc/source/models/builtin/llm/starcoder.rst b/doc/source/models/builtin/llm/starcoder.rst new file mode 100644 index 0000000000..d49d8640c9 --- /dev/null +++ b/doc/source/models/builtin/llm/starcoder.rst @@ -0,0 +1,29 @@ +.. _models_llm_starcoder: + +======================================== +starcoder +======================================== + +- **Context Length:** 8192 +- **Model Name:** starcoder +- **Languages:** en +- **Abilities:** generate +- **Description:** Starcoder is an open-source Transformer based LLM that is trained on permissively licensed data from GitHub. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (ggmlv3, 16 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggmlv3 +- **Model Size (in billions):** 16 +- **Quantizations:** q4_0, q4_1, q5_0, q5_1, q8_0 +- **Model ID:** TheBloke/starcoder-GGML + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name starcoder --size-in-billions 16 --model-format ggmlv3 --quantization ${quantization} + diff --git a/doc/source/models/builtin/starcoderplus.rst b/doc/source/models/builtin/llm/starcoderplus.rst similarity index 57% rename from doc/source/models/builtin/starcoderplus.rst rename to doc/source/models/builtin/llm/starcoderplus.rst index e3819fe01f..6dfaa7b6b2 100644 --- a/doc/source/models/builtin/starcoderplus.rst +++ b/doc/source/models/builtin/llm/starcoderplus.rst @@ -1,18 +1,21 @@ -.. _models_builtin_starcoderplus: +.. 
_models_llm_starcoderplus: -============= -StarCoderPlus -============= +======================================== +starcoderplus +======================================== +- **Context Length:** 8192 - **Model Name:** starcoderplus - **Languages:** en -- **Abilities:** embed, generate +- **Abilities:** generate +- **Description:** Starcoderplus is an open-source LLM trained by fine-tuning Starcoder on the RefinedWeb and StarCoderData datasets. Specifications ^^^^^^^^^^^^^^ -Model Spec (pytorch, 16 Billion) -++++++++++++++++++++++++++++++++ + +Model Spec 1 (pytorch, 16 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 16 @@ -24,6 +27,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name starcoderplus --size-in-billions 16 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/llm/tiny-llama.rst b/doc/source/models/builtin/llm/tiny-llama.rst new file mode 100644 index 0000000000..57e6bf27a5 --- /dev/null +++ b/doc/source/models/builtin/llm/tiny-llama.rst @@ -0,0 +1,29 @@ +.. _models_llm_tiny-llama: + +======================================== +tiny-llama +======================================== + +- **Context Length:** 2048 +- **Model Name:** tiny-llama +- **Languages:** en +- **Abilities:** generate +- **Description:** The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (ggufv2, 1 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 1 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name tiny-llama --size-in-billions 1 --model-format ggufv2 --quantization ${quantization} + diff --git a/doc/source/models/builtin/vicuna-v1.3.rst b/doc/source/models/builtin/llm/vicuna-v1.3.rst similarity index 82% rename from doc/source/models/builtin/vicuna-v1.3.rst rename to doc/source/models/builtin/llm/vicuna-v1.3.rst index 43cff0a586..1fa72fea2a 100644 --- a/doc/source/models/builtin/vicuna-v1.3.rst +++ b/doc/source/models/builtin/llm/vicuna-v1.3.rst @@ -1,77 +1,77 @@ -.. _models_builtin_vicuna_v1_3: +.. _models_llm_vicuna-v1.3: -=========== -Vicuna v1.3 -=========== +======================================== +vicuna-v1.3 +======================================== +- **Context Length:** 2048 - **Model Name:** vicuna-v1.3 - **Languages:** en -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** Vicuna is an open-source LLM trained by fine-tuning LLaMA on data collected from ShareGPT. 
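+Running models can be inspected and stopped from the same client once launched.
+A minimal sketch, assuming the endpoint from the earlier examples and a
+hypothetical model UID::
+
+    from xinference.client import Client
+
+    client = Client("http://127.0.0.1:9997")
+
+    # List the currently running models, keyed by model UID.
+    print(client.list_models())
+
+    # Free the resources of a model that is no longer needed.
+    client.terminate_model("my-vicuna-uid")  # hypothetical UID
+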
Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (ggmlv3, 7 Billion) -++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 7 - **Quantizations:** q2_K, q3_K_L, q3_K_M, q3_K_S, q4_0, q4_1, q4_K_M, q4_K_S, q5_0, q5_1, q5_K_M, q5_K_S, q6_K, q8_0 - **Model ID:** TheBloke/vicuna-7B-v1.3-GGML -- **File Name Template:** vicuna-7b-v1.3.ggmlv3.{quantization}.bin Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name vicuna-v1.3 --size-in-billions 7 --model-format ggmlv3 --quantization ${quantization} + Model Spec 2 (ggmlv3, 13 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 13 - **Quantizations:** q2_K, q3_K_L, q3_K_M, q3_K_S, q4_0, q4_1, q4_K_M, q4_K_S, q5_0, q5_1, q5_K_M, q5_K_S, q6_K, q8_0 - **Model ID:** TheBloke/vicuna-13b-v1.3.0-GGML -- **File Name Template:** vicuna-13b-v1.3.0.ggmlv3.{quantization}.bin Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name vicuna-v1.3 --size-in-billions 13 --model-format ggmlv3 --quantization ${quantization} + Model Spec 3 (ggmlv3, 33 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 33 - **Quantizations:** q2_K, q3_K_L, q3_K_M, q3_K_S, q4_0, q4_1, q4_K_M, q4_K_S, q5_0, q5_1, q5_K_M, q5_K_S, q6_K, q8_0 - **Model ID:** TheBloke/vicuna-33B-GGML -- **File Name Template:** vicuna-33b.ggmlv3.{quantization}.bin Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name vicuna-v1.3 --size-in-billions 33 --model-format ggmlv3 --quantization ${quantization} -Model Spec 6 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ + +Model Spec 4 (pytorch, 33 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch -- **Model Size (in billions):** 7 +- **Model Size (in billions):** 33 - **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** lmsys/vicuna-7b-v1.3 +- **Model ID:** lmsys/vicuna-33b-v1.3 Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: - xinference launch --model-name vicuna-v1.3 --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: + xinference launch --model-name vicuna-v1.3 --size-in-billions 33 --model-format pytorch --quantization ${quantization} - 4-bit quantization is not supported on macOS. Model Spec 5 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 @@ -83,23 +83,17 @@ chosen quantization method from the options listed above:: xinference launch --model-name vicuna-v1.3 --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. 
-Model Spec 4 (pytorch, 33 Billion) -++++++++++++++++++++++++++++++++++ +Model Spec 6 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch -- **Model Size (in billions):** 33 +- **Model Size (in billions):** 7 - **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** lmsys/vicuna-33b-v1.3 +- **Model ID:** lmsys/vicuna-7b-v1.3 Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: - xinference launch --model-name vicuna-v1.3 --size-in-billions 33 --model-format pytorch --quantization ${quantization} - -.. note:: + xinference launch --model-name vicuna-v1.3 --size-in-billions 7 --model-format pytorch --quantization ${quantization} - 4-bit quantization is not supported on macOS. \ No newline at end of file diff --git a/doc/source/models/builtin/vicuna-v1.5-16k.rst b/doc/source/models/builtin/llm/vicuna-v1.5-16k.rst similarity index 72% rename from doc/source/models/builtin/vicuna-v1.5-16k.rst rename to doc/source/models/builtin/llm/vicuna-v1.5-16k.rst index 651665cef6..5833f5c295 100644 --- a/doc/source/models/builtin/vicuna-v1.5-16k.rst +++ b/doc/source/models/builtin/llm/vicuna-v1.5-16k.rst @@ -1,18 +1,21 @@ -.. _models_builtin_vicuna_v1_5_16k: +.. _models_llm_vicuna-v1.5-16k: -=============== -Vicuna v1.5-16k -=============== +======================================== +vicuna-v1.5-16k +======================================== +- **Context Length:** 16384 - **Model Name:** vicuna-v1.5-16k - **Languages:** en -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** Vicuna-v1.5-16k is a special version of Vicuna-v1.5, with a context window of 16k tokens instead of 4k. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -24,12 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name vicuna-v1.5-16k --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. Model Spec 2 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 @@ -41,6 +41,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name vicuna-v1.5-16k --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/vicuna-v1.5.rst b/doc/source/models/builtin/llm/vicuna-v1.5.rst similarity index 73% rename from doc/source/models/builtin/vicuna-v1.5.rst rename to doc/source/models/builtin/llm/vicuna-v1.5.rst index 428ccf82e6..a2211231c9 100644 --- a/doc/source/models/builtin/vicuna-v1.5.rst +++ b/doc/source/models/builtin/llm/vicuna-v1.5.rst @@ -1,18 +1,21 @@ -.. _models_builtin_vicuna_v1_5: +.. _models_llm_vicuna-v1.5: -=========== -Vicuna v1.5 -=========== +======================================== +vicuna-v1.5 +======================================== +- **Context Length:** 4096 - **Model Name:** vicuna-v1.5 - **Languages:** en -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** Vicuna is an open-source LLM trained by fine-tuning LLaMA on data collected from ShareGPT. 
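+Besides the CLI and the Python client, the supervisor serves OpenAI-compatible
+HTTP routes, so launched chat models can also be queried with plain HTTP. A
+minimal sketch using ``requests`` (endpoint, route, and model UID are assumptions
+following the OpenAI convention)::
+
+    import requests
+
+    resp = requests.post(
+        "http://127.0.0.1:9997/v1/chat/completions",
+        json={
+            "model": "my-vicuna-uid",  # the UID returned by launch_model
+            "messages": [{"role": "user", "content": "Hello!"}],
+            "max_tokens": 64,
+        },
+    )
+    print(resp.json()["choices"][0]["message"]["content"])
+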
Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -24,12 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name vicuna-v1.5 --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. Model Spec 2 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 @@ -41,6 +41,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name vicuna-v1.5 --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/wizardcoder-python-v1.0.rst b/doc/source/models/builtin/llm/wizardcoder-python-v1.0.rst similarity index 82% rename from doc/source/models/builtin/wizardcoder-python-v1.0.rst rename to doc/source/models/builtin/llm/wizardcoder-python-v1.0.rst index 8265bbda7c..4871fdb00c 100644 --- a/doc/source/models/builtin/wizardcoder-python-v1.0.rst +++ b/doc/source/models/builtin/llm/wizardcoder-python-v1.0.rst @@ -1,19 +1,21 @@ -.. _models_builtin_wizardcoder_python_v1_0: +.. _models_llm_wizardcoder-python-v1.0: -======================= -WizardCoder-Python-v1.0 -======================= +======================================== +wizardcoder-python-v1.0 +======================================== - **Context Length:** 100000 - **Model Name:** wizardcoder-python-v1.0 - **Languages:** en -- **Abilities:** generate, chat +- **Abilities:** chat +- **Description:** WizardCoder-Python is an open-source LLM trained by fine-tuning Llama2 with Evol-Instruct, specializing in Python code. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -25,13 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name wizardcoder-python-v1.0 --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. - Model Spec 2 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 @@ -43,12 +41,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name wizardcoder-python-v1.0 --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. Model Spec 3 (pytorch, 34 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 34 @@ -60,12 +55,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name wizardcoder-python-v1.0 --size-in-billions 34 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. 
Model Spec 4 (ggufv2, 7 Billion) -++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggufv2 - **Model Size (in billions):** 7 @@ -77,30 +69,31 @@ chosen quantization method from the options listed above:: xinference launch --model-name wizardcoder-python-v1.0 --size-in-billions 7 --model-format ggufv2 --quantization ${quantization} + Model Spec 5 (ggufv2, 13 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggufv2 - **Model Size (in billions):** 13 - **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 - **Model ID:** TheBloke/WizardCoder-Python-13B-V1.0-GGUF -- **File Name Template:** wizardcoder-python-13b-v1.0.{quantization}.gguf Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name wizardcoder-python-v1.0 --size-in-billions 13 --model-format ggufv2 --quantization ${quantization} + Model Spec 6 (ggufv2, 34 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggufv2 - **Model Size (in billions):** 34 - **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 - **Model ID:** TheBloke/WizardCoder-Python-34B-V1.0-GGUF -- **File Name Template:** wizardcoder-python-34b-v1.0.{quantization}.gguf Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: - xinference launch --model-name wizardcoder-python-v1.0 --size-in-billions 34 --model-format ggufv2 --quantization ${quantization} \ No newline at end of file + xinference launch --model-name wizardcoder-python-v1.0 --size-in-billions 34 --model-format ggufv2 --quantization ${quantization} + diff --git a/doc/source/models/builtin/wizardlm-v1.0.rst b/doc/source/models/builtin/llm/wizardlm-v1.0.rst similarity index 76% rename from doc/source/models/builtin/wizardlm-v1.0.rst rename to doc/source/models/builtin/llm/wizardlm-v1.0.rst index 297bf30f23..679b18b497 100644 --- a/doc/source/models/builtin/wizardlm-v1.0.rst +++ b/doc/source/models/builtin/llm/wizardlm-v1.0.rst @@ -1,18 +1,21 @@ -.. _models_builtin_wizardlm_v1_0: +.. _models_llm_wizardlm-v1.0: -============= -WizardLM v1.0 -============= +======================================== +wizardlm-v1.0 +======================================== +- **Context Length:** 2048 - **Model Name:** wizardlm-v1.0 - **Languages:** en -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** WizardLM is an open-source LLM trained by fine-tuning LLaMA with Evol-Instruct. 
Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (ggmlv3, 7 Billion) -++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 7 @@ -24,8 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name wizardlm-v1.0 --size-in-billions 7 --model-format ggmlv3 --quantization ${quantization} + Model Spec 2 (ggmlv3, 13 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 13 @@ -36,3 +40,4 @@ Execute the following command to launch the model, remember to replace ``${quant chosen quantization method from the options listed above:: xinference launch --model-name wizardlm-v1.0 --size-in-billions 13 --model-format ggmlv3 --quantization ${quantization} + diff --git a/doc/source/models/builtin/wizardmath-v1.0.rst b/doc/source/models/builtin/llm/wizardmath-v1.0.rst similarity index 77% rename from doc/source/models/builtin/wizardmath-v1.0.rst rename to doc/source/models/builtin/llm/wizardmath-v1.0.rst index eda7e4da7e..e711dde922 100644 --- a/doc/source/models/builtin/wizardmath-v1.0.rst +++ b/doc/source/models/builtin/llm/wizardmath-v1.0.rst @@ -1,18 +1,21 @@ -.. _models_builtin_wizardmath_v1_0: +.. _models_llm_wizardmath-v1.0: -=============== -WizardMath v1.0 -=============== +======================================== +wizardmath-v1.0 +======================================== +- **Context Length:** 2048 - **Model Name:** wizardmath-v1.0 - **Languages:** en -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** WizardMath is an open-source LLM trained by fine-tuning Llama2 with Evol-Instruct, specializing in math. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -24,12 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name wizardmath-v1.0 --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. Model Spec 2 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 @@ -41,12 +41,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name wizardmath-v1.0 --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. Model Spec 3 (pytorch, 70 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 70 @@ -58,6 +55,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name wizardmath-v1.0 --size-in-billions 70 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/llm/xverse-chat.rst b/doc/source/models/builtin/llm/xverse-chat.rst new file mode 100644 index 0000000000..9d5ce3281a --- /dev/null +++ b/doc/source/models/builtin/llm/xverse-chat.rst @@ -0,0 +1,43 @@ +.. 
_models_llm_xverse-chat: + +======================================== +xverse-chat +======================================== + +- **Context Length:** 2048 +- **Model Name:** xverse-chat +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** XVERSE-Chat is the aligned version of model XVERSE. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** xverse/XVERSE-7B-Chat + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name xverse-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 13 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** xverse/XVERSE-13B-Chat + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name xverse-chat --size-in-billions 13 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/xverse.rst b/doc/source/models/builtin/llm/xverse.rst new file mode 100644 index 0000000000..a02ce6f2bb --- /dev/null +++ b/doc/source/models/builtin/llm/xverse.rst @@ -0,0 +1,57 @@ +.. _models_llm_xverse: + +======================================== +xverse +======================================== + +- **Context Length:** 2048 +- **Model Name:** xverse +- **Languages:** en, zh +- **Abilities:** generate +- **Description:** XVERSE is a multilingual large language model, independently developed by Shenzhen Yuanxiang Technology. 
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** xverse/XVERSE-7B + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name xverse --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 13 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** xverse/XVERSE-13B + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name xverse --size-in-billions 13 --model-format pytorch --quantization ${quantization} + + +Model Spec 3 (pytorch, 65 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 65 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** xverse/XVERSE-65B + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name xverse --size-in-billions 65 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/Yi-200k.rst b/doc/source/models/builtin/llm/yi-200k.rst similarity index 67% rename from doc/source/models/builtin/Yi-200k.rst rename to doc/source/models/builtin/llm/yi-200k.rst index 7fa8739a44..125a1a8deb 100644 --- a/doc/source/models/builtin/Yi-200k.rst +++ b/doc/source/models/builtin/llm/yi-200k.rst @@ -1,47 +1,43 @@ -.. _models_builtin_Yi_200k: +.. _models_llm_yi-200k: - -======= +======================================== Yi-200k -======= +======================================== - **Context Length:** 204800 - **Model Name:** Yi-200k - **Languages:** en, zh - **Abilities:** generate -- **Description:** The Yi series models are large language models trained from scratch by developers at 01.AI. The first public release contains two bilingual (English/Chinese) base models with the parameter sizes of 6B and 34B. Both of them are trained with 4K sequence length and can be extended to 32K during inference time. +- **Description:** The Yi series models are large language models trained from scratch by developers at 01.AI. 
Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 6 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 6 - **Quantizations:** 4-bit, 8-bit, none - **Model ID:** 01-ai/Yi-6B-200K -Execute the following command to launch the model, remember to replace `${quantization}` with your +Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name Yi-200k --size-in-billions 6 --model-format pytorch --quantization ${quantization} Model Spec 2 (pytorch, 34 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 34 - **Quantizations:** 4-bit, 8-bit, none - **Model ID:** 01-ai/Yi-34B-200K -Execute the following command to launch the model, remember to replace `${quantization}` with your +Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name Yi-200k --size-in-billions 34 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/llm/yi-chat.rst b/doc/source/models/builtin/llm/yi-chat.rst new file mode 100644 index 0000000000..9e42ef7aaf --- /dev/null +++ b/doc/source/models/builtin/llm/yi-chat.rst @@ -0,0 +1,43 @@ +.. _models_llm_yi-chat: + +======================================== +Yi-chat +======================================== + +- **Context Length:** 204800 +- **Model Name:** Yi-chat +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** The Yi series models are large language models trained from scratch by developers at 01.AI. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 34 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** 01-ai/Yi-34B-Chat + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name Yi-chat --size-in-billions 34 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (ggufv2, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 34 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/Yi-34B-Chat-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name Yi-chat --size-in-billions 34 --model-format ggufv2 --quantization ${quantization} + diff --git a/doc/source/models/builtin/Yi.rst b/doc/source/models/builtin/llm/yi.rst similarity index 50% rename from doc/source/models/builtin/Yi.rst rename to doc/source/models/builtin/llm/yi.rst index 20416b3a07..cf17a759aa 100644 --- a/doc/source/models/builtin/Yi.rst +++ b/doc/source/models/builtin/llm/yi.rst @@ -1,47 +1,57 @@ -.. _models_builtin_Yi: +.. 
_models_llm_yi: - -== +======================================== Yi -== +======================================== - **Context Length:** 4096 - **Model Name:** Yi - **Languages:** en, zh - **Abilities:** generate -- **Description:** The Yi series models are large language models trained from scratch by developers at 01.AI. The first public release contains two bilingual (English/Chinese) base models with the parameter sizes of 6B and 34B. Both of them are trained with 4K sequence length and can be extended to 32K during inference time. +- **Description:** The Yi series models are large language models trained from scratch by developers at 01.AI. Specifications ^^^^^^^^^^^^^^ -Model Spec 1 (pytorch, 6 Billion) -+++++++++++++++++++++++++++++++++ + +Model Spec 1 (ggufv2, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 34 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/Yi-34B-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name Yi --size-in-billions 34 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 2 (pytorch, 6 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 6 - **Quantizations:** 4-bit, 8-bit, none - **Model ID:** 01-ai/Yi-6B -Execute the following command to launch the model, remember to replace `${quantization}` with your +Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name Yi --size-in-billions 6 --model-format pytorch --quantization ${quantization} -Model Spec 2 (pytorch, 34 Billion) -++++++++++++++++++++++++++++++++++ +Model Spec 3 (pytorch, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 34 - **Quantizations:** 4-bit, 8-bit, none - **Model ID:** 01-ai/Yi-34B -Execute the following command to launch the model, remember to replace `${quantization}` with your +Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name Yi --size-in-billions 34 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/zephyr-7b-alpha.rst b/doc/source/models/builtin/llm/zephyr-7b-alpha.rst similarity index 77% rename from doc/source/models/builtin/zephyr-7b-alpha.rst rename to doc/source/models/builtin/llm/zephyr-7b-alpha.rst index 700e76ff67..953d797e11 100644 --- a/doc/source/models/builtin/zephyr-7b-alpha.rst +++ b/doc/source/models/builtin/llm/zephyr-7b-alpha.rst @@ -1,8 +1,8 @@ -.. _models_builtin_zephyr_7b_alpha: +.. 
_models_llm_zephyr-7b-alpha: -=============== -Zephyr-7B-alpha -=============== +======================================== +zephyr-7b-alpha +======================================== - **Context Length:** 8192 - **Model Name:** zephyr-7b-alpha @@ -13,8 +13,9 @@ Zephyr-7B-alpha Specifications ^^^^^^^^^^^^^^ -Model Spec (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++ + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -26,6 +27,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name zephyr-7b-alpha --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/zephyr-7b-beta.rst b/doc/source/models/builtin/llm/zephyr-7b-beta.rst similarity index 66% rename from doc/source/models/builtin/zephyr-7b-beta.rst rename to doc/source/models/builtin/llm/zephyr-7b-beta.rst index 96fd26d558..7410a6ae95 100644 --- a/doc/source/models/builtin/zephyr-7b-beta.rst +++ b/doc/source/models/builtin/llm/zephyr-7b-beta.rst @@ -1,32 +1,29 @@ -.. _models_builtin_zephyr_7b_beta: +.. _models_llm_zephyr-7b-beta: -============== -Zephyr-7B-beta -============== +======================================== +zephyr-7b-beta +======================================== - **Context Length:** 8192 - **Model Name:** zephyr-7b-beta - **Languages:** en - **Abilities:** chat -- **Description:** Zephyr-7B-β is the second model in the series, and is a fine-tuned version of mistralai/Mistral-7B-v0.1. +- **Description:** Zephyr-7B-β is the second model in the series, and is a fine-tuned version of mistralai/Mistral-7B-v0.1. Specifications ^^^^^^^^^^^^^^ -Model Spec (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++ + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 - **Quantizations:** 4-bit, 8-bit, none - **Model ID:** HuggingFaceH4/zephyr-7b-beta -- **Model Revision:** 3bac358730f8806e5c3dc7c7e19eb36e045bf720 Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name zephyr-7b-beta --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/qwen-chat.rst b/doc/source/models/builtin/qwen-chat.rst deleted file mode 100644 index 87b3b85161..0000000000 --- a/doc/source/models/builtin/qwen-chat.rst +++ /dev/null @@ -1,103 +0,0 @@ -.. _models_builtin_qwen_chat: - -========= -Qwen Chat -========= - -- **Model Name:** qwen-chat - **Languages:** en, zh -- **Abilities:** embed, chat - -Specifications -^^^^^^^^^^^^^^ - -Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 7 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** Qwen/Qwen-7B-Chat - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name qwen-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit and 8-bit quantization are not supported on macOS. 
- -Model Spec 2 (pytorch, 14 Billion) -++++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 14 -- **Quantizations:** none -- **Model ID:** Qwen/Qwen-14B-Chat - -Execute the following command to launch the model:: - - xinference launch --model-name qwen-chat --size-in-billions 14 --model-format pytorch - -.. note:: - - 4-bit and 8-bit quantization are not supported on macOS. - -Model Spec 3 (ggmlv3, 7 Billion) -++++++++++++++++++++++++++++++++ - -- **Model Format:** ggmlv3 -- **Model Size (in billions):** 7 -- **Quantizations:** q4_0 -- **Model ID:** Xorbits/qwen-chat-7B-ggml - -You need to install ``qwen-cpp`` first: - -.. code-block:: bash - - pip install -U qwen-cpp - - -If you want to use BLAS to accelerate: - -- OpenBLAS: - -.. code-block:: bash - - CMAKE_ARGS="-DGGML_OPENBLAS=ON" pip install -U qwen-cpp - - -- cuBLAS: - -.. code-block:: bash - - CMAKE_ARGS="-DGGML_CUBLAS=ON" pip install -U qwen-cpp - - -- Metal: - -.. code-block:: bash - - CMAKE_ARGS="-DGGML_METAL=ON" pip install -U qwen-cpp - - -Execute the following command to launch the model:: - - xinference launch --model-name qwen-chat --size-in-billions 7 --model-format ggmlv3 - - -Model Spec 4 (ggmlv3, 14 Billion) -+++++++++++++++++++++++++++++++++ - -- **Model Format:** ggmlv3 -- **Model Size (in billions):** 14 -- **Quantizations:** q4_0 -- **Model ID:** Xorbits/qwen-chat-14B-ggml - -Install ``qwen-cpp`` as above. - -Execute the following command to launch the model:: - - xinference launch --model-name qwen-chat --size-in-billions 14 --model-format ggmlv3 - diff --git a/doc/source/models/builtin/bge-reranker-base.rst b/doc/source/models/builtin/rerank/bge-reranker-base.rst similarity index 74% rename from doc/source/models/builtin/bge-reranker-base.rst rename to doc/source/models/builtin/rerank/bge-reranker-base.rst index 48123ba205..2d619e5355 100644 --- a/doc/source/models/builtin/bge-reranker-base.rst +++ b/doc/source/models/builtin/rerank/bge-reranker-base.rst @@ -1,12 +1,12 @@ -.. _models_builtin_bge_rerank_base: +.. _models_builtin_bge-reranker-base: ================= bge-reranker-base ================= - **Model Name:** bge-reranker-base -- **Languages:** [zh, en] -- **Abilities:** rerank +- **Languages:** en, zh +- **Abilities:** rerank Specifications ^^^^^^^^^^^^^^ @@ -15,5 +15,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-reranker-base --model-type rerank - + xinference launch --model-name bge-reranker-base --model-type rerank \ No newline at end of file diff --git a/doc/source/models/builtin/bge-reranker-large.rst b/doc/source/models/builtin/rerank/bge-reranker-large.rst similarity index 74% rename from doc/source/models/builtin/bge-reranker-large.rst rename to doc/source/models/builtin/rerank/bge-reranker-large.rst index ae85d71694..2be39bfa57 100644 --- a/doc/source/models/builtin/bge-reranker-large.rst +++ b/doc/source/models/builtin/rerank/bge-reranker-large.rst @@ -1,12 +1,12 @@ -.. _models_builtin_bge_rerank_large: +.. 
_models_builtin_bge-reranker-large: ================== bge-reranker-large ================== - **Model Name:** bge-reranker-large -- **Languages:** [zh, en] -- **Abilities:** rerank +- **Languages:** en, zh +- **Abilities:** rerank Specifications ^^^^^^^^^^^^^^ @@ -15,5 +15,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-reranker-large --model-type rerank - + xinference launch --model-name bge-reranker-large --model-type rerank \ No newline at end of file diff --git a/doc/source/models/builtin/rerank/index.rst b/doc/source/models/builtin/rerank/index.rst new file mode 100644 index 0000000000..8e6fde00a6 --- /dev/null +++ b/doc/source/models/builtin/rerank/index.rst @@ -0,0 +1,17 @@ +.. _models_rerank_index: + +================ +Rerank Models +================ + +The following is a list of built-in rerank models in Xinference: + + +.. toctree:: + :maxdepth: 1 + + + bge-reranker-base + + bge-reranker-large + \ No newline at end of file diff --git a/doc/source/models/builtin/starcoder.rst b/doc/source/models/builtin/starcoder.rst deleted file mode 100644 index 14bd165c5b..0000000000 --- a/doc/source/models/builtin/starcoder.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. _models_builtin_starcoder: - -========= -StarCoder -========= - -- **Model Name:** starcoder -- **Languages:** en -- **Abilities:** generate - -Specifications -^^^^^^^^^^^^^^ - -Model Spec (ggmlv3, 16 Billion) -+++++++++++++++++++++++++++++++ - -- **Model Format:** ggmlv3 -- **Model Size (in billions):** 16 -- **Quantizations:** q4_0, q4_1, q5_0, q5_1, q8_0 -- **Model ID:** TheBloke/starcoder-GGML diff --git a/doc/source/models/builtin/xverse-chat.rst b/doc/source/models/builtin/xverse-chat.rst deleted file mode 100644 index 2df5b57485..0000000000 --- a/doc/source/models/builtin/xverse-chat.rst +++ /dev/null @@ -1,58 +0,0 @@ -.. _models_builtin_xverse_chat: - -=========== -XVERSE-Chat -=========== - -- **Context Length:** 2048 -- **Model Name:** xverse-chat -- **Languages:** en, zh -- **Abilities:** chat -- **Description:** XVERSE-Chat is the aligned version of model XVERSE for chat-based applications. - -Specifications -^^^^^^^^^^^^^^ - -Model Specs (pytorch, Billions) -+++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 7, 13 -- **Quantizations:** 4-bit, 8-bit, none - -XVERSE-Chat Model Variants -------------------------- - -1. XVERSE-7B-Chat - - **Model ID:** xverse/XVERSE-7B-Chat - - **Model Revision:** 60acc8c453c067b54df88be98bfdf60585ab5441 - -2. XVERSE-13B-Chat - - **Model ID:** xverse/XVERSE-13B-Chat - - **Model Revision:** 1e4944aaa1d8c8d0cdca28bb8e3a003303d0781b - -To launch a specific XVERSE-Chat model, use the following command and replace `${quantization}` with your chosen quantization method: -chosen quantization method from the options listed above and the size:: - - xinference launch --model-name xverse-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. - -Model Details -^^^^^^^^^^^^^ - -- **Version:** 1 -- **Context Length:** 2048 -- **Model Name:** xverse-chat -- **Model Languages:** en, zh -- **Model Abilities:** chat -- **Model Description:** XVERSE-Chat is the aligned version of model XVERSE for chat-based applications. 
- -Prompt Style -^^^^^^^^^^^^ - -- **Style Name:** XVERSE -- **System Prompt:** N/A -- **Roles:** [user, assistant] diff --git a/doc/source/models/builtin/xverse.rst b/doc/source/models/builtin/xverse.rst deleted file mode 100644 index 49db28db67..0000000000 --- a/doc/source/models/builtin/xverse.rst +++ /dev/null @@ -1,56 +0,0 @@ -.. _models_builtin_xverse: - -====== -XVERSE -====== - -- **Context Length:** 2048 -- **Model Name:** xverse -- **Languages:** en, zh -- **Abilities:** generate -- **Description:** XVERSE is a multilingual large language model, independently developed by Shenzhen Yuanxiang Technology. - -Specifications -^^^^^^^^^^^^^^ - -Model Specs (pytorch, Billions) -+++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 7 -- **Quantizations:** 4-bit, 8-bit, none - -XVERSE Model Variants ---------------------- - -1. XVERSE-7B - - **Model ID:** xverse/XVERSE-7B - - **Model Revision:** 3778b254def675586e9218ccb15b78d6ef66a3a7 - -2. XVERSE-13B - - **Model ID:** xverse/XVERSE-13B - - **Model Revision:** 11ac840dda17af81046614229fdd0c658afff747 - -3. XVERSE-65B - - **Model ID:** xverse/XVERSE-65B - - **Model Revision:** 7f1b7394f74c630f50612a19ba90bd021c373989 - -To launch a specific XVERSE model, use the following command and replace `${quantization}` with your chosen quantization method: -chosen quantization method from the options listed above and the size:: - - xinference launch --model-name xverse --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. - -Model Details -^^^^^^^^^^^^^ - -- **Version:** 1 -- **Context Length:** 2048 -- **Model Name:** xverse -- **Model Languages:** en, zh -- **Model Abilities:** generate -- **Model Description:** XVERSE is a multilingual large language model, independently developed by Shenzhen Yuanxiang Technology. - diff --git a/doc/templates/embedding.rst.jinja b/doc/templates/embedding.rst.jinja new file mode 100644 index 0000000000..3c93fc7cd9 --- /dev/null +++ b/doc/templates/embedding.rst.jinja @@ -0,0 +1,20 @@ +.. _models_builtin_{{ model_name|lower }}: + +{{ "=" * model_name|length }} +{{ model_name }} +{{ "=" * model_name|length }} + +- **Model Name:** {{ model_name }} +- **Languages:** {{ ', '.join(language) }} +- **Abilities:** embed + +Specifications +^^^^^^^^^^^^^^ + +- **Dimensions:** {{ dimensions }} +- **Max Tokens:** {{ max_tokens }} +- **Model ID:** {{ model_id }} + +Execute the following command to launch the model:: + + xinference launch --model-name {{ model_name }} --model-type embedding \ No newline at end of file diff --git a/doc/templates/embedding_index.rst.jinja b/doc/templates/embedding_index.rst.jinja new file mode 100644 index 0000000000..5bb93511c4 --- /dev/null +++ b/doc/templates/embedding_index.rst.jinja @@ -0,0 +1,15 @@ +.. _models_embedding_index: + +================ +Embedding Models +================ + +The following is a list of built-in embedding models in Xinference: + + +.. toctree:: + :maxdepth: 1 + + {% for model in models %} + {{ model.model_name|lower }} + {% endfor %} \ No newline at end of file diff --git a/doc/templates/llm.rst.jinja b/doc/templates/llm.rst.jinja new file mode 100644 index 0000000000..22c4135339 --- /dev/null +++ b/doc/templates/llm.rst.jinja @@ -0,0 +1,30 @@ +.. 
_models_llm_{{ model_name|lower }}: + +{{ "=" * 40 }} +{{ model_name }} +{{ "=" * 40 }} + +- **Context Length:** {{ context_length }} +- **Model Name:** {{ model_name }} +- **Languages:** {{ model_lang | join(', ') }} +- **Abilities:** {{ model_ability | join(', ') }} +- **Description:** {{ model_description }} + +Specifications +^^^^^^^^^^^^^^ + +{% for spec in model_specs %} +Model Spec {{ loop.index }} ({{ spec.model_format }}, {{ spec.model_size_in_billions }} Billion) +{{ "+" * 40 }} + +- **Model Format:** {{ spec.model_format }} +- **Model Size (in billions):** {{ spec.model_size_in_billions }} +- **Quantizations:** {{ spec.quantizations | join(', ') }} +- **Model ID:** {{ spec.model_id }} + +Execute the following command to launch the model, remember to replace ``${{ '{' }}quantization{{ '}' }}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name {{ model_name }} --size-in-billions {{ spec.model_size_in_billions }} --model-format {{ spec.model_format }} --quantization ${{ '{' }}quantization{{ '}' }} + +{% endfor %} \ No newline at end of file diff --git a/doc/templates/llm_index.rst.jinja b/doc/templates/llm_index.rst.jinja new file mode 100644 index 0000000000..a58480b32e --- /dev/null +++ b/doc/templates/llm_index.rst.jinja @@ -0,0 +1,15 @@ +.. _models_llm_index: + +===================== +Large Language Models +===================== + +The following is a list of built-in LLMs in Xinference: + + +.. toctree:: + :maxdepth: 3 + + {% for model in models %} + {{ model.model_name|lower }} + {% endfor %} \ No newline at end of file diff --git a/doc/templates/rerank.rst.jinja b/doc/templates/rerank.rst.jinja new file mode 100644 index 0000000000..c898e7f5d5 --- /dev/null +++ b/doc/templates/rerank.rst.jinja @@ -0,0 +1,18 @@ +.. _models_builtin_{{ model_name|lower }}: + +{{ "=" * model_name|length }} +{{ model_name }} +{{ "=" * model_name|length }} + +- **Model Name:** {{ model_name }} +- **Languages:** {{ ', '.join(language) }} +- **Abilities:** rerank + +Specifications +^^^^^^^^^^^^^^ + +- **Model ID:** {{ model_id }} + +Execute the following command to launch the model:: + + xinference launch --model-name {{ model_name }} --model-type rerank \ No newline at end of file diff --git a/doc/templates/rerank_index.rst.jinja b/doc/templates/rerank_index.rst.jinja new file mode 100644 index 0000000000..5d30967b08 --- /dev/null +++ b/doc/templates/rerank_index.rst.jinja @@ -0,0 +1,15 @@ +.. _models_rerank_index: + +================ +Rerank Models +================ + +The following is a list of built-in rerank models in Xinference: + + +.. toctree:: + :maxdepth: 1 + + {% for model in models %} + {{ model.model_name|lower }} + {% endfor %} \ No newline at end of file
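
Reviewer note: a quick way to check these Jinja templates without regenerating every page is to render one of them against a hand-written model spec. The sketch below is a minimal preview harness, not part of the PR itself; the sample dict (``example-llm``, ``example/example-llm-7b``) is hypothetical and only uses the variables that ``llm.rst.jinja`` actually references, and it assumes it is run from the repository root with ``jinja2`` installed.

.. code-block:: python

    # Preview the output of llm.rst.jinja for a single, hand-written model spec.
    # The spec values below are made up for illustration only.
    from jinja2 import Environment, FileSystemLoader

    env = Environment(loader=FileSystemLoader("doc/templates"))

    sample_model = {
        "model_name": "example-llm",
        "context_length": 2048,
        "model_lang": ["en"],
        "model_ability": ["chat"],
        "model_description": "A made-up entry used only to preview the template.",
        "model_specs": [
            {
                "model_format": "pytorch",
                "model_size_in_billions": 7,
                "quantizations": ["4-bit", "8-bit", "none"],
                "model_id": "example/example-llm-7b",
            }
        ],
    }

    # Print the rendered reStructuredText instead of writing it under
    # doc/source/models/builtin/llm/, so the result can be inspected directly.
    print(env.get_template("llm.rst.jinja").render(sample_model))

Rendering a single spec this way makes it easy to eyeball the fixed-width heading underlines and the generated ``xinference launch`` command before the full documentation tree is rebuilt.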