diff --git a/doc/source/gen_docs.py b/doc/source/gen_docs.py
new file mode 100644
index 0000000000..fef4a4131f
--- /dev/null
+++ b/doc/source/gen_docs.py
@@ -0,0 +1,57 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+
+from jinja2 import Environment, FileSystemLoader
+
+
+def gen_model_docs(env, spec_path, template_name, index_template_name, output_dir):
+    """Render one .rst page per model plus an index.rst into output_dir."""
+    with open(spec_path, 'r') as spec_file:
+        models = json.load(spec_file)
+
+    sorted_models = sorted(models, key=lambda x: x['model_name'].lower())
+    os.makedirs(output_dir, exist_ok=True)
+
+    # One page per model, named after the lower-cased model name.
+    for model in sorted_models:
+        rendered = env.get_template(template_name).render(model)
+        output_file_path = os.path.join(output_dir, f"{model['model_name'].lower()}.rst")
+        with open(output_file_path, 'w') as output_file:
+            output_file.write(rendered)
+
+    # A single index.rst linking every page rendered above.
+    rendered_index = env.get_template(index_template_name).render(models=sorted_models)
+    with open(os.path.join(output_dir, 'index.rst'), 'w') as index_file:
+        index_file.write(rendered_index)
+
+
+def main():
+    env = Environment(loader=FileSystemLoader('../templates'))
+
+    gen_model_docs(env, '../../xinference/model/llm/llm_family.json',
+                   'llm.rst.jinja', 'llm_index.rst.jinja',
+                   './models/builtin/llm')
+    gen_model_docs(env, '../../xinference/model/embedding/model_spec.json',
+                   'embedding.rst.jinja', 'embedding_index.rst.jinja',
+                   './models/builtin/embedding')
+    gen_model_docs(env, '../../xinference/model/rerank/model_spec.json',
+                   'rerank.rst.jinja', 'rerank_index.rst.jinja',
+                   './models/builtin/rerank')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/doc/source/models/builtin/Yi-chat.rst b/doc/source/models/builtin/Yi-chat.rst
deleted file mode 100644
index dcfc74cf47..0000000000
--- a/doc/source/models/builtin/Yi-chat.rst
+++ /dev/null
@@ -1,33 +0,0 @@
-.. 
_models_builtin_Yi_chat: - - -======= -Yi-chat -======= - -- **Context Length:** 4096 -- **Model Name:** Yi -- **Languages:** en, zh -- **Abilities:** generate -- **Description:** The Yi series models are large language models trained from scratch by developers at 01.AI. The first public release contains two bilingual (English/Chinese) base models with the parameter sizes of 6B and 34B. Both of them are trained with 4K sequence length and can be extended to 32K during inference time. - -Specifications -^^^^^^^^^^^^^^ - -Model Spec 1 (pytorch, 34 Billion) -+++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 34 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** 01-ai/Yi-34B - -Execute the following command to launch the model, remember to replace `${quantization}` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name Yi-chat --size-in-billions 34 --model-format pytorch --quantization ${quantization} - - -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/code-llama-instruct.rst b/doc/source/models/builtin/code-llama-instruct.rst deleted file mode 100644 index 1e914e3b87..0000000000 --- a/doc/source/models/builtin/code-llama-instruct.rst +++ /dev/null @@ -1,67 +0,0 @@ -.. _models_builtin_code_llama_instruct: - - -=================== -Code-Llama-Instruct -=================== - -- **Context Length:** 100000 -- **Model Name:** code-llama-instruct -- **Languages:** en -- **Abilities:** chat - -Specifications -^^^^^^^^^^^^^^ - -Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 7 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** codellama/CodeLlama-7b-Instruct-hf - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name code-llama-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. - -Model Spec 2 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 13 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** codellama/CodeLlama-13b-Instruct-hf - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name code-llama-instruct --size-in-billions 13 --model-format pytorch --quantization ${quantization} - - -.. note:: - - 4-bit quantization is not supported on macOS. - -Model Spec 3 (pytorch, 34 Billion) -++++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 34 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** codellama/CodeLlama-34b-Instruct-hf - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name code-llama-instruct --size-in-billions 34 --model-format pytorch --quantization ${quantization} - - -.. note:: - - 4-bit quantization is not supported on macOS. 
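The Jinja templates that ``gen_docs.py`` consumes are not included in this patch. Judging from the
regenerated pages later in this diff (``code-llama-instruct.rst``, for example), ``llm.rst.jinja``
plausibly looks like the sketch below. Because the script calls ``render(model)``, each spec dict's
keys become top-level template variables; the names used here (``model_name``, ``context_length``,
``model_lang``, ``model_ability``, ``model_description``, ``model_specs``) are assumptions inferred
from the rendered output, not taken from this patch::

   .. _models_llm_{{ model_name }}:

   ========================================
   {{ model_name }}
   ========================================

   - **Context Length:** {{ context_length }}
   - **Model Name:** {{ model_name }}
   - **Languages:** {{ model_lang|join(', ') }}
   - **Abilities:** {{ model_ability|join(', ') }}
   - **Description:** {{ model_description }}

   Specifications
   ^^^^^^^^^^^^^^

   {% for spec in model_specs %}
   Model Spec {{ loop.index }} ({{ spec.model_format }}, {{ spec.model_size_in_billions }} Billion)
   ++++++++++++++++++++++++++++++++++++++++

   - **Model Format:** {{ spec.model_format }}
   - **Model Size (in billions):** {{ spec.model_size_in_billions }}
   - **Quantizations:** {{ spec.quantizations|join(', ') }}
   - **Model ID:** {{ spec.model_id }}

   Execute the following command to launch the model, remember to replace ``${quantization}`` with your
   chosen quantization method from the options listed above::

      xinference launch --model-name {{ model_name }} --size-in-billions {{ spec.model_size_in_billions }} --model-format {{ spec.model_format }} --quantization ${quantization}
   {% endfor %}

A plain ``{% for %}`` loop rendered without ``trim_blocks``, as sketched here, would also account for
the extra blank lines visible in the generated files throughout this diff.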
diff --git a/doc/source/models/builtin/code-llama-python.rst b/doc/source/models/builtin/code-llama-python.rst deleted file mode 100644 index 3fa44f769d..0000000000 --- a/doc/source/models/builtin/code-llama-python.rst +++ /dev/null @@ -1,65 +0,0 @@ -.. _models_builtin_code_llama_python: - - -================= -Code-Llama-Python -================= - -- **Context Length:** 100000 -- **Model Name:** code-llama-python -- **Languages:** en -- **Abilities:** generate - -Specifications -^^^^^^^^^^^^^^ - -Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 7 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** TheBloke/CodeLlama-7B-Python-fp16 - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name code-llama-python --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. - -Model Spec 2 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 13 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** TheBloke/CodeLlama-13B-Python-fp16 - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name code-llama-python --size-in-billions 13 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. - -Model Spec 3 (pytorch, 34 Billion) -++++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 34 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** TheBloke/CodeLlama-34B-Python-fp16 - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name code-llama-python --size-in-billions 34 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/code-llama.rst b/doc/source/models/builtin/code-llama.rst deleted file mode 100644 index 81bdd660bc..0000000000 --- a/doc/source/models/builtin/code-llama.rst +++ /dev/null @@ -1,64 +0,0 @@ -.. _models_builtin_code_llama: - -========== -Code-Llama -========== - -- **Context Length:** 100000 -- **Model Name:** code-llama -- **Languages:** en -- **Abilities:** generate - -Specifications -^^^^^^^^^^^^^^ - -Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 7 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** TheBloke/CodeLlama-7B-fp16 - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name code-llama --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. 
- -Model Spec 2 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 13 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** TheBloke/CodeLlama-13B-fp16 - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name code-llama --size-in-billions 13 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. - -Model Spec 3 (pytorch, 34 Billion) -++++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 34 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** TheBloke/CodeLlama-34B-fp16 - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name code-llama --size-in-billions 34 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/bge-base-en-v1.5.rst b/doc/source/models/builtin/embedding/bge-base-en-v1.5.rst similarity index 88% rename from doc/source/models/builtin/bge-base-en-v1.5.rst rename to doc/source/models/builtin/embedding/bge-base-en-v1.5.rst index d25f7e6728..014160b96e 100644 --- a/doc/source/models/builtin/bge-base-en-v1.5.rst +++ b/doc/source/models/builtin/embedding/bge-base-en-v1.5.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_base_en_v1.5: +.. _models_builtin_bge-base-en-v1.5: ================ bge-base-en-v1.5 @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-base-en-v1.5 --model-type embedding - + xinference launch --model-name bge-base-en-v1.5 --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-base-en.rst b/doc/source/models/builtin/embedding/bge-base-en.rst similarity index 89% rename from doc/source/models/builtin/bge-base-en.rst rename to doc/source/models/builtin/embedding/bge-base-en.rst index 952131eda8..b22a16cbed 100644 --- a/doc/source/models/builtin/bge-base-en.rst +++ b/doc/source/models/builtin/embedding/bge-base-en.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_base_en: +.. _models_builtin_bge-base-en: =========== bge-base-en @@ -17,6 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-base-en --model-type embedding - - + xinference launch --model-name bge-base-en --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-base-zh-v1.5.rst b/doc/source/models/builtin/embedding/bge-base-zh-v1.5.rst similarity index 88% rename from doc/source/models/builtin/bge-base-zh-v1.5.rst rename to doc/source/models/builtin/embedding/bge-base-zh-v1.5.rst index 262cc03faa..ff3862189c 100644 --- a/doc/source/models/builtin/bge-base-zh-v1.5.rst +++ b/doc/source/models/builtin/embedding/bge-base-zh-v1.5.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_base_zh_v1.5: +.. 
_models_builtin_bge-base-zh-v1.5: ================ bge-base-zh-v1.5 @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-base-zh-v1.5 --model-type embedding - + xinference launch --model-name bge-base-zh-v1.5 --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-base-zh.rst b/doc/source/models/builtin/embedding/bge-base-zh.rst similarity index 83% rename from doc/source/models/builtin/bge-base-zh.rst rename to doc/source/models/builtin/embedding/bge-base-zh.rst index 5b00cd3879..c9c910812f 100644 --- a/doc/source/models/builtin/bge-base-zh.rst +++ b/doc/source/models/builtin/embedding/bge-base-zh.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_base_zh: +.. _models_builtin_bge-base-zh: =========== bge-base-zh @@ -11,11 +11,10 @@ bge-base-zh Specifications ^^^^^^^^^^^^^^ -- **Dimensions:** 1024 +- **Dimensions:** 768 - **Max Tokens:** 512 - **Model ID:** BAAI/bge-base-zh Execute the following command to launch the model:: - xinference launch --model-name bge-base-zh --model-type embedding - + xinference launch --model-name bge-base-zh --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-large-en-v1.5.rst b/doc/source/models/builtin/embedding/bge-large-en-v1.5.rst similarity index 88% rename from doc/source/models/builtin/bge-large-en-v1.5.rst rename to doc/source/models/builtin/embedding/bge-large-en-v1.5.rst index ebd711ce92..d04f09c8ae 100644 --- a/doc/source/models/builtin/bge-large-en-v1.5.rst +++ b/doc/source/models/builtin/embedding/bge-large-en-v1.5.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_large_en_v1.5: +.. _models_builtin_bge-large-en-v1.5: ================= bge-large-en-v1.5 @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-large-en-v1.5 --model-type embedding - + xinference launch --model-name bge-large-en-v1.5 --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-large-en.rst b/doc/source/models/builtin/embedding/bge-large-en.rst similarity index 89% rename from doc/source/models/builtin/bge-large-en.rst rename to doc/source/models/builtin/embedding/bge-large-en.rst index ccb4e58046..f1588fa0f9 100644 --- a/doc/source/models/builtin/bge-large-en.rst +++ b/doc/source/models/builtin/embedding/bge-large-en.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_large_en: +.. _models_builtin_bge-large-en: ============ bge-large-en @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-large-en --model-type embedding - + xinference launch --model-name bge-large-en --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-large-zh-noinstruct.rst b/doc/source/models/builtin/embedding/bge-large-zh-noinstruct.rst similarity index 86% rename from doc/source/models/builtin/bge-large-zh-noinstruct.rst rename to doc/source/models/builtin/embedding/bge-large-zh-noinstruct.rst index 1071d6a0b3..c279f7f688 100644 --- a/doc/source/models/builtin/bge-large-zh-noinstruct.rst +++ b/doc/source/models/builtin/embedding/bge-large-zh-noinstruct.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_large_zh_noinstruct: +.. 
_models_builtin_bge-large-zh-noinstruct: ======================= bge-large-zh-noinstruct @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-large-zh-noinstruct --model-type embedding - + xinference launch --model-name bge-large-zh-noinstruct --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-large-zh-v1.5.rst b/doc/source/models/builtin/embedding/bge-large-zh-v1.5.rst similarity index 88% rename from doc/source/models/builtin/bge-large-zh-v1.5.rst rename to doc/source/models/builtin/embedding/bge-large-zh-v1.5.rst index 0d5289a9a0..42fab215a5 100644 --- a/doc/source/models/builtin/bge-large-zh-v1.5.rst +++ b/doc/source/models/builtin/embedding/bge-large-zh-v1.5.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_large_zh_v1.5: +.. _models_builtin_bge-large-zh-v1.5: ================= bge-large-zh-v1.5 @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-large-zh-v1.5 --model-type embedding - + xinference launch --model-name bge-large-zh-v1.5 --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-large-zh.rst b/doc/source/models/builtin/embedding/bge-large-zh.rst similarity index 86% rename from doc/source/models/builtin/bge-large-zh.rst rename to doc/source/models/builtin/embedding/bge-large-zh.rst index 847a69e508..65a96dcf07 100644 --- a/doc/source/models/builtin/bge-large-zh.rst +++ b/doc/source/models/builtin/embedding/bge-large-zh.rst @@ -1,7 +1,7 @@ -.. _models_builtin_bge_large_zh: +.. _models_builtin_bge-large-zh: ============ -bge-large-en +bge-large-zh ============ - **Model Name:** bge-large-zh @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-large-zh --model-type embedding - + xinference launch --model-name bge-large-zh --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-small-en-v1.5.rst b/doc/source/models/builtin/embedding/bge-small-en-v1.5.rst similarity index 88% rename from doc/source/models/builtin/bge-small-en-v1.5.rst rename to doc/source/models/builtin/embedding/bge-small-en-v1.5.rst index de5e4cbad3..665b11d49b 100644 --- a/doc/source/models/builtin/bge-small-en-v1.5.rst +++ b/doc/source/models/builtin/embedding/bge-small-en-v1.5.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_small_en_v1.5: +.. _models_builtin_bge-small-en-v1.5: ================= bge-small-en-v1.5 @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-small-en-v1.5 --model-type embedding - + xinference launch --model-name bge-small-en-v1.5 --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-small-zh-v1.5.rst b/doc/source/models/builtin/embedding/bge-small-zh-v1.5.rst similarity index 88% rename from doc/source/models/builtin/bge-small-zh-v1.5.rst rename to doc/source/models/builtin/embedding/bge-small-zh-v1.5.rst index a80edbd158..deadc73e4d 100644 --- a/doc/source/models/builtin/bge-small-zh-v1.5.rst +++ b/doc/source/models/builtin/embedding/bge-small-zh-v1.5.rst @@ -1,4 +1,4 @@ -.. _models_builtin_bge_small_zh_v1.5: +.. 
_models_builtin_bge-small-zh-v1.5: ================= bge-small-zh-v1.5 @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-small-zh-v1.5 --model-type embedding - + xinference launch --model-name bge-small-zh-v1.5 --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/bge-small-zh.rst b/doc/source/models/builtin/embedding/bge-small-zh.rst similarity index 52% rename from doc/source/models/builtin/bge-small-zh.rst rename to doc/source/models/builtin/embedding/bge-small-zh.rst index 489925b6dc..474596873d 100644 --- a/doc/source/models/builtin/bge-small-zh.rst +++ b/doc/source/models/builtin/embedding/bge-small-zh.rst @@ -1,10 +1,10 @@ -.. _models_builtin_bge_small_zh: +.. _models_builtin_bge-small-zh: ============ -bge-large-en +bge-small-zh ============ -- **Model Name:** bge_small_zh +- **Model Name:** bge-small-zh - **Languages:** zh - **Abilities:** embed @@ -13,9 +13,8 @@ Specifications - **Dimensions:** 512 - **Max Tokens:** 512 -- **Model ID:** BAAI/bge_small_zh +- **Model ID:** BAAI/bge-small-zh Execute the following command to launch the model:: - xinference launch --model-name bge_small_zh --model-type embedding - + xinference launch --model-name bge-small-zh --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/e5-large-v2.rst b/doc/source/models/builtin/embedding/e5-large-v2.rst similarity index 81% rename from doc/source/models/builtin/e5-large-v2.rst rename to doc/source/models/builtin/embedding/e5-large-v2.rst index 758e4cbebb..8737e8abca 100644 --- a/doc/source/models/builtin/e5-large-v2.rst +++ b/doc/source/models/builtin/embedding/e5-large-v2.rst @@ -1,8 +1,8 @@ -.. _models_builtin_e5_large_v2: +.. _models_builtin_e5-large-v2: -========= -gte-large -========= +=========== +e5-large-v2 +=========== - **Model Name:** e5-large-v2 - **Languages:** en @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name e5-large-v2 --model-type embedding - + xinference launch --model-name e5-large-v2 --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/gte-base.rst b/doc/source/models/builtin/embedding/gte-base.rst similarity index 81% rename from doc/source/models/builtin/gte-base.rst rename to doc/source/models/builtin/embedding/gte-base.rst index 0f379ee13f..211636e9ae 100644 --- a/doc/source/models/builtin/gte-base.rst +++ b/doc/source/models/builtin/embedding/gte-base.rst @@ -1,4 +1,4 @@ -.. _models_builtin_gte_base: +.. _models_builtin_gte-base: ======== gte-base @@ -13,9 +13,8 @@ Specifications - **Dimensions:** 768 - **Max Tokens:** 512 -- **Model ID:** thenlper/gte-large +- **Model ID:** thenlper/gte-base Execute the following command to launch the model:: - xinference launch --model-name gte-base --model-type embedding - + xinference launch --model-name gte-base --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/gte-large.rst b/doc/source/models/builtin/embedding/gte-large.rst similarity index 90% rename from doc/source/models/builtin/gte-large.rst rename to doc/source/models/builtin/embedding/gte-large.rst index 09afa2594c..f1bb7105f9 100644 --- a/doc/source/models/builtin/gte-large.rst +++ b/doc/source/models/builtin/embedding/gte-large.rst @@ -1,4 +1,4 @@ -.. _models_builtin_gte_large: +.. 
_models_builtin_gte-large: ========= gte-large @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name gte-large --model-type embedding - + xinference launch --model-name gte-large --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/embedding/index.rst b/doc/source/models/builtin/embedding/index.rst new file mode 100644 index 0000000000..26876c8529 --- /dev/null +++ b/doc/source/models/builtin/embedding/index.rst @@ -0,0 +1,49 @@ +.. _models_embedding_index: + +================ +Embedding Models +================ + +The following is a list of built-in embedding models in Xinference: + + +.. toctree:: + :maxdepth: 1 + + + bge-base-en + + bge-base-en-v1.5 + + bge-base-zh + + bge-base-zh-v1.5 + + bge-large-en + + bge-large-en-v1.5 + + bge-large-zh + + bge-large-zh-noinstruct + + bge-large-zh-v1.5 + + bge-small-en-v1.5 + + bge-small-zh + + bge-small-zh-v1.5 + + e5-large-v2 + + gte-base + + gte-large + + jina-embeddings-v2-base-en + + jina-embeddings-v2-small-en + + multilingual-e5-large + \ No newline at end of file diff --git a/doc/source/models/builtin/jina-embeddings-v2-base-en.rst b/doc/source/models/builtin/embedding/jina-embeddings-v2-base-en.rst similarity index 74% rename from doc/source/models/builtin/jina-embeddings-v2-base-en.rst rename to doc/source/models/builtin/embedding/jina-embeddings-v2-base-en.rst index 676a1885e8..627f8a61ac 100644 --- a/doc/source/models/builtin/jina-embeddings-v2-base-en.rst +++ b/doc/source/models/builtin/embedding/jina-embeddings-v2-base-en.rst @@ -1,8 +1,8 @@ -.. _models_builtin_jina_embeddings_v2_base_en: +.. _models_builtin_jina-embeddings-v2-base-en: -=========================== +========================== jina-embeddings-v2-base-en -=========================== +========================== - **Model Name:** jina-embeddings-v2-base-en - **Languages:** en @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name jina-embeddings-v2-base-en --model-type embedding - + xinference launch --model-name jina-embeddings-v2-base-en --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/jina-embeddings-v2-small-en.rst b/doc/source/models/builtin/embedding/jina-embeddings-v2-small-en.rst similarity index 85% rename from doc/source/models/builtin/jina-embeddings-v2-small-en.rst rename to doc/source/models/builtin/embedding/jina-embeddings-v2-small-en.rst index af1d9c72b0..7f35cda069 100644 --- a/doc/source/models/builtin/jina-embeddings-v2-small-en.rst +++ b/doc/source/models/builtin/embedding/jina-embeddings-v2-small-en.rst @@ -1,4 +1,4 @@ -.. _models_builtin_jina_embeddings_v2_small_en: +.. 
_models_builtin_jina-embeddings-v2-small-en: =========================== jina-embeddings-v2-small-en @@ -17,5 +17,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name jina-embeddings-v2-small-en --model-type embedding - + xinference launch --model-name jina-embeddings-v2-small-en --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/multilingual-e5-large.rst b/doc/source/models/builtin/embedding/multilingual-e5-large.rst similarity index 67% rename from doc/source/models/builtin/multilingual-e5-large.rst rename to doc/source/models/builtin/embedding/multilingual-e5-large.rst index 44cb618c5e..eb62dbf6db 100644 --- a/doc/source/models/builtin/multilingual-e5-large.rst +++ b/doc/source/models/builtin/embedding/multilingual-e5-large.rst @@ -1,8 +1,8 @@ -.. _models_builtin_multilingual_e5_large: +.. _models_builtin_multilingual-e5-large: -=========== -bge-base-zh -=========== +===================== +multilingual-e5-large +===================== - **Model Name:** multilingual-e5-large - **Languages:** zh @@ -12,10 +12,9 @@ Specifications ^^^^^^^^^^^^^^ - **Dimensions:** 1024 -- **Max Tokens:** 512 +- **Max Tokens:** 514 - **Model ID:** intfloat/multilingual-e5-large Execute the following command to launch the model:: - xinference launch --model-name multilingual-e5-large --model-type embedding - + xinference launch --model-name multilingual-e5-large --model-type embedding \ No newline at end of file diff --git a/doc/source/models/builtin/index.rst b/doc/source/models/builtin/index.rst index aae33432ee..5685f50006 100644 --- a/doc/source/models/builtin/index.rst +++ b/doc/source/models/builtin/index.rst @@ -1,180 +1,13 @@ .. _models_builtin_index: -=============== -Built-in Models -=============== +============== +Builtin Models +============== -Large Language Models -^^^^^^^^^^^^^^^^^^^^^ - -Text Generation Models -++++++++++++++++++++++ - -- :ref:`Baichuan ` -- :ref:`Baichuan-2 ` -- :ref:`Falcon ` -- :ref:`InternLM ` -- :ref:`InternLM 20B ` -- :ref:`Llama-2 ` -- :ref:`OPT ` -- :ref:`Yi ` -- :ref:`Yi-200k ` -- :ref:`Yi-chat ` -- :ref:`xverse ` - - -Chat & Instruction-following Models -+++++++++++++++++++++++++++++++++++ - -- :ref:`Baichuan Chat ` -- :ref:`Baichuan-2 Chat ` -- :ref:`ChatGLM ` -- :ref:`ChatGLM2 ` -- :ref:`ChatGLM2-32k ` -- :ref:`ChatGLM3 ` -- :ref:`ChatGLM3-32k ` -- :ref:`CodeLlama-Instruct ` -- :ref:`Falcon Instruct ` -- :ref:`InternLM Chat ` -- :ref:`InternLM Chat 20B ` -- :ref:`InternLM Chat 8K ` -- :ref:`Llama-2 Chat ` -- :ref:`OpenBuddy v11.1 ` -- :ref:`Orca Mini ` -- :ref:`Qwen Chat ` -- :ref:`Vicuna v1.3 ` -- :ref:`Vicuna v1.5 ` -- :ref:`Vicuna v1.5 16k ` -- :ref:`WizardLM v1.0 ` -- :ref:`WizardMath v1.0 ` -- :ref:`Zephyr-7B-α ` -- :ref:`Zephyr-7B-β ` -- :ref:`xverse-chat ` - - -Code Generation Models -++++++++++++++++++++++ -- :ref:`Starcoder ` -- :ref:`StarCoderPlus ` -- :ref:`Code-Llama ` -- :ref:`CodeLlama-Instruct ` -- :ref:`Code-Llama-Python ` -- :ref:`WizardCoder-Python-v1.0 ` - - -Code Assistant Models -+++++++++++++++++++++ -- :ref:`Starchat-beta ` - - -.. 
toctree:: - :maxdepth: 2 - :hidden: - - baichuan-chat - baichuan-2-chat - baichuan - baichuan-2 - chatglm - chatglm2-32k - chatglm2 - chatglm3-32k - chatglm3 - code-llama - code-llama-instruct - code-llama-python - falcon-instruct - falcon - internlm - internlm-20b - internlm-chat - internlm-chat-8k - internlm-chat-20b - llama-2-chat - llama-2 - openbuddy - opt - orca_mini - starchat-beta - starcoder - starcoderplus - qwen-chat - vicuna-v1.3 - vicuna-v1.5 - vicuna-v1.5-16k - wizardlm-v1.0 - wizardmath-v1.0 - wizardcoder-python-v1.0 - zephyr-7b-alpha - zephyr-7b-beta - Yi - Yi-200k - xverse - xverse-chat - Yi-chat - - -Embedding Models -^^^^^^^^^^^^^^^^^^^^^ - -Language: English -++++++++++++++++++++++ -- :ref:`bge-large-en ` -- :ref:`bge-large-en-v1.5 ` -- :ref:`bge-base-en ` -- :ref:`bge-base-en-v1.5 ` -- :ref:`gte-large ` -- :ref:`gte-base ` -- :ref:`e5-large-v2 ` -- :ref:`bge-small-en-v1.5 ` - - -Language: Chinese -+++++++++++++++++++++ -- :ref:`bge-large-zh ` -- :ref:`bge-large-zh-noinstruct ` -- :ref:`bge-large-zh-v1.5 ` -- :ref:`bge-base-zh ` -- :ref:`bge-base-zh-v1.5 ` -- :ref:`multilingual-e5-large ` -- :ref:`bge-small-zh ` -- :ref:`bge-small-zh-v1.5 ` -- :ref:`jina-embeddings-v2-small-en ` -- :ref:`jina-embeddings-v2-base-en ` - - -.. toctree:: - :maxdepth: 2 - :hidden: - - bge-large-en - bge-large-en-v1.5 - bge-base-en - bge-base-en-v1.5 - bge-small-en-v1.5 - gte-large - gte-base - e5-large-v2 - bge-large-zh - bge-large-zh-noinstruct - bge-large-zh-v1.5 - bge-base-zh - bge-base-zh-v1.5 - multilingual-e5-large - bge-small-zh - bge-small-zh-v1.5 - jina-embeddings-v2-small-en - jina-embeddings-v2-base-en - - -Rerank Models -++++++++++++++++++++++ -- :ref:`bge-reranker-base ` -- :ref:`bge-reranker-large ` .. toctree:: :maxdepth: 2 - :hidden: - bge-reranker-base - bge-reranker-large + llm/index + embedding/index + rerank/index \ No newline at end of file diff --git a/doc/source/models/builtin/internlm-20b.rst b/doc/source/models/builtin/internlm-20b.rst deleted file mode 100644 index 20d306927d..0000000000 --- a/doc/source/models/builtin/internlm-20b.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. _models_builtin_internlm_20b: - -================== -InternLM-20B Model -================== - -- **Context Length:** 16384 -- **Model Name:** internlm-20b -- **Languages:** en, zh -- **Abilities:** generate -- **Description:** Pre-trained on over 2.3T Tokens containing high-quality English, Chinese, and code data. - -Specifications -^^^^^^^^^^^^^^ - -Model Spec (pytorch, 20 Billion) -++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 20 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** internlm/internlm-20b -- **Model Revision:** f0433b0db933a9adfa169f756ab8547f67ccef1d diff --git a/doc/source/models/builtin/internlm-chat-20b.rst b/doc/source/models/builtin/internlm-chat-20b.rst deleted file mode 100644 index ce48b13f6e..0000000000 --- a/doc/source/models/builtin/internlm-chat-20b.rst +++ /dev/null @@ -1,22 +0,0 @@ -.. _models_builtin_internlm_chat_20b: - -================= -InternLM-Chat-20B -================= - -- **Context Length:** 16384 -- **Model Name:** internlm-chat-20b -- **Languages:** en, zh -- **Abilities:** chat -- **Description:** Pre-trained on over 2.3T Tokens containing high-quality English, Chinese, and code data. The Chat version has undergone SFT and RLHF training. 
- -Specifications -^^^^^^^^^^^^^^ - -Model Spec (pytorch, 20 Billion) -++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 20 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** internlm/internlm-chat-20b diff --git a/doc/source/models/builtin/internlm-chat-8k.rst b/doc/source/models/builtin/internlm-chat-8k.rst deleted file mode 100644 index a5edd5dd71..0000000000 --- a/doc/source/models/builtin/internlm-chat-8k.rst +++ /dev/null @@ -1,29 +0,0 @@ -.. _models_builtin_internlm_chat_8k: - - -================ -InternLM Chat 8K -================ - -- **Model Name:** internlm-chat-8k -- **Languages:** en, zh -- **Abilities:** embed, chat - -Specifications -^^^^^^^^^^^^^^ - -Model Spec (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 7 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** internlm/internlm-chat-7b-8k - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: - - xinference launch --model-name internlm-chat-8k --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. \ No newline at end of file diff --git a/doc/source/models/builtin/internlm-chat.rst b/doc/source/models/builtin/internlm-chat.rst deleted file mode 100644 index f151800cef..0000000000 --- a/doc/source/models/builtin/internlm-chat.rst +++ /dev/null @@ -1,28 +0,0 @@ -.. _models_builtin_internlm_chat: - -============= -InternLM Chat -============= - -- **Model Name:** internlm-chat -- **Languages:** en, zh -- **Abilities:** embed, chat - -Specifications -^^^^^^^^^^^^^^ - -Model Spec (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 7 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** internlm/internlm-chat-7b - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: - - xinference launch --model-name internlm-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. \ No newline at end of file diff --git a/doc/source/models/builtin/internlm.rst b/doc/source/models/builtin/internlm.rst deleted file mode 100644 index 9a09ecf06f..0000000000 --- a/doc/source/models/builtin/internlm.rst +++ /dev/null @@ -1,28 +0,0 @@ -.. _models_builtin_internlm: - -======== -InternLM -======== - -- **Model Name:** internlm -- **Languages:** en, zh -- **Abilities:** embed, generate - -Specifications -^^^^^^^^^^^^^^ - -Model Spec (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 7 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** internlm/internlm-7b - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: - - xinference launch --model-name internlm --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. 
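The regenerated LLM pages that follow are driven by ``xinference/model/llm/llm_family.json``, which
``gen_docs.py`` loads and sorts by ``model_name``. Matching the rendered ``baichuan-2-chat`` page
below against the template sketch above, a single entry presumably resembles the following; the field
names are assumptions, and only the values shown in this diff are certain::

   {
     "model_name": "baichuan-2-chat",
     "context_length": 4096,
     "model_lang": ["en", "zh"],
     "model_ability": ["chat"],
     "model_description": "Baichuan2-chat is a fine-tuned version of the Baichuan LLM, specializing in chatting.",
     "model_specs": [
       {
         "model_format": "pytorch",
         "model_size_in_billions": 7,
         "quantizations": ["4-bit", "8-bit", "none"],
         "model_id": "baichuan-inc/Baichuan2-7B-Chat"
       },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 13,
         "quantizations": ["4-bit", "8-bit", "none"],
         "model_id": "baichuan-inc/Baichuan2-13B-Chat"
       }
     ]
   }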
diff --git a/doc/source/models/builtin/baichuan-2-chat.rst b/doc/source/models/builtin/llm/baichuan-2-chat.rst similarity index 74% rename from doc/source/models/builtin/baichuan-2-chat.rst rename to doc/source/models/builtin/llm/baichuan-2-chat.rst index 9dfe2175a2..cd05690531 100644 --- a/doc/source/models/builtin/baichuan-2-chat.rst +++ b/doc/source/models/builtin/llm/baichuan-2-chat.rst @@ -1,51 +1,43 @@ -.. _models_builtin_baichuan_2_chat: +.. _models_llm_baichuan-2-chat: -=============== -Baichuan-2-Chat -=============== +======================================== +baichuan-2-chat +======================================== - **Context Length:** 4096 - **Model Name:** baichuan-2-chat - **Languages:** en, zh -- **Abilities:** embed, generate, chat +- **Abilities:** chat - **Description:** Baichuan2-chat is a fine-tuned version of the Baichuan LLM, specializing in chatting. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 - **Quantizations:** 4-bit, 8-bit, none - **Model ID:** baichuan-inc/Baichuan2-7B-Chat -- **Model Revision:** 2ce891951e000c36c65442608a0b95fd09b405dc Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name baichuan-2-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - Not supported on macOS. - Model Spec 2 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 - **Quantizations:** 4-bit, 8-bit, none - **Model ID:** baichuan-inc/Baichuan2-13B-Chat -- **Model Revision:** a56c793eb7a721ab6c270f779024e0375e8afd4a Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name baichuan-2-chat --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - Not supported on macOS. diff --git a/doc/source/models/builtin/baichuan-2.rst b/doc/source/models/builtin/llm/baichuan-2.rst similarity index 82% rename from doc/source/models/builtin/baichuan-2.rst rename to doc/source/models/builtin/llm/baichuan-2.rst index 9de361a66a..6df3d8e303 100644 --- a/doc/source/models/builtin/baichuan-2.rst +++ b/doc/source/models/builtin/llm/baichuan-2.rst @@ -1,20 +1,21 @@ -.. _models_builtin_baichuan_2: +.. _models_llm_baichuan-2: -========== -Baichuan-2 -========== +======================================== +baichuan-2 +======================================== - **Context Length:** 4096 - **Model Name:** baichuan-2 - **Languages:** en, zh -- **Abilities:** embed, generate +- **Abilities:** generate - **Description:** Baichuan2 is an open-source Transformer based LLM that is trained on both Chinese and English data. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -26,12 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name baichuan-2 --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - Not supported on macOS. 
Model Spec 2 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 @@ -43,6 +41,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name baichuan-2 --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - Not supported on macOS. diff --git a/doc/source/models/builtin/baichuan-chat.rst b/doc/source/models/builtin/llm/baichuan-chat.rst similarity index 59% rename from doc/source/models/builtin/baichuan-chat.rst rename to doc/source/models/builtin/llm/baichuan-chat.rst index 149e8d021d..ef707b12e8 100644 --- a/doc/source/models/builtin/baichuan-chat.rst +++ b/doc/source/models/builtin/llm/baichuan-chat.rst @@ -1,16 +1,22 @@ -.. _models_builtin_baichuan_chat: +.. _models_llm_baichuan-chat: -============= -Baichuan Chat -============= +======================================== +baichuan-chat +======================================== +- **Context Length:** 4096 - **Model Name:** baichuan-chat - **Languages:** en, zh -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** Baichuan-chat is a fine-tuned version of the Baichuan LLM, specializing in chatting. Specifications ^^^^^^^^^^^^^^ + +Model Spec 1 (pytorch, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + - **Model Format:** pytorch - **Model Size (in billions):** 13 - **Quantizations:** 4-bit, 8-bit, none @@ -21,6 +27,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name baichuan-chat --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - Not supported on macOS. diff --git a/doc/source/models/builtin/baichuan.rst b/doc/source/models/builtin/llm/baichuan.rst similarity index 72% rename from doc/source/models/builtin/baichuan.rst rename to doc/source/models/builtin/llm/baichuan.rst index 87499ff72c..1470bdf6c8 100644 --- a/doc/source/models/builtin/baichuan.rst +++ b/doc/source/models/builtin/llm/baichuan.rst @@ -1,35 +1,35 @@ -.. _models_builtin_baichuan: +.. _models_llm_baichuan: -======== -Baichuan -======== +======================================== +baichuan +======================================== +- **Context Length:** 4096 - **Model Name:** baichuan - **Languages:** en, zh -- **Abilities:** embed, generate +- **Abilities:** generate +- **Description:** Baichuan is an open-source Transformer based LLM that is trained on both Chinese and English data. Specifications ^^^^^^^^^^^^^^ -Model Spec 1 (ggmlv3) -+++++++++++++++++++++ + +Model Spec 1 (ggmlv3, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 7 - **Quantizations:** q2_K, q3_K_L, q3_K_M, q3_K_S, q4_0, q4_1, q4_K_M, q4_K_S, q5_0, q5_1, q5_K_M, q5_K_S, q6_K, q8_0 - **Model ID:** TheBloke/baichuan-llama-7B-GGML -Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: xinference launch --model-name baichuan --size-in-billions 7 --model-format ggmlv3 --quantization ${quantization} -.. note:: - - For utilizing the Apple Metal GPU for acceleration, select the q4_0 and q4_1 quantizations. 
- Model Spec 2 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -41,12 +41,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name baichuan --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - Not supported on macOS. Model Spec 3 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 @@ -58,6 +55,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name baichuan --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - Not supported on macOS. diff --git a/doc/source/models/builtin/chatglm.rst b/doc/source/models/builtin/llm/chatglm.rst similarity index 72% rename from doc/source/models/builtin/chatglm.rst rename to doc/source/models/builtin/llm/chatglm.rst index 9053841839..86a88b8c53 100644 --- a/doc/source/models/builtin/chatglm.rst +++ b/doc/source/models/builtin/llm/chatglm.rst @@ -1,18 +1,21 @@ -.. _models_builtin_chatglm: +.. _models_llm_chatglm: -======= -ChatGLM -======= +======================================== +chatglm +======================================== +- **Context Length:** 2048 - **Model Name:** chatglm - **Languages:** en, zh -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** ChatGLM is an open-source General Language Model (GLM) based LLM trained on both Chinese and English data. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (ggmlv3, 6 Billion) -++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 6 @@ -24,8 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name chatglm --size-in-billions 6 --model-format ggmlv3 --quantization ${quantization} + Model Spec 2 (pytorch, 6 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 6 @@ -37,6 +41,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name chatglm --size-in-billions 6 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/chatglm2-32k.rst b/doc/source/models/builtin/llm/chatglm2-32k.rst similarity index 62% rename from doc/source/models/builtin/chatglm2-32k.rst rename to doc/source/models/builtin/llm/chatglm2-32k.rst index d61865dc46..bd06f55dbc 100644 --- a/doc/source/models/builtin/chatglm2-32k.rst +++ b/doc/source/models/builtin/llm/chatglm2-32k.rst @@ -1,18 +1,21 @@ -.. _models_builtin_chatglm2_32k: +.. _models_llm_chatglm2-32k: -============ -ChatGLM2 32k -============ +======================================== +chatglm2-32k +======================================== +- **Context Length:** 32768 - **Model Name:** chatglm2-32k - **Languages:** en, zh -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** ChatGLM2-32k is a special version of ChatGLM2, with a context window of 32k tokens instead of 8k. 
Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 6 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 6 @@ -24,6 +27,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name chatglm2-32k --size-in-billions 6 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/chatglm2.rst b/doc/source/models/builtin/llm/chatglm2.rst similarity index 72% rename from doc/source/models/builtin/chatglm2.rst rename to doc/source/models/builtin/llm/chatglm2.rst index 57e44b8a08..77cd408009 100644 --- a/doc/source/models/builtin/chatglm2.rst +++ b/doc/source/models/builtin/llm/chatglm2.rst @@ -1,18 +1,21 @@ -.. _models_builtin_chatglm2: +.. _models_llm_chatglm2: -======== -ChatGLM2 -======== +======================================== +chatglm2 +======================================== +- **Context Length:** 8192 - **Model Name:** chatglm2 - **Languages:** en, zh -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** ChatGLM2 is the second generation of ChatGLM, still open-source and trained on Chinese and English data. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (ggmlv3, 6 Billion) -++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 6 @@ -24,8 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name chatglm2 --size-in-billions 6 --model-format ggmlv3 --quantization ${quantization} + Model Spec 2 (pytorch, 6 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 6 @@ -37,6 +41,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name chatglm2 --size-in-billions 6 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/chatglm3-32k.rst b/doc/source/models/builtin/llm/chatglm3-32k.rst similarity index 77% rename from doc/source/models/builtin/chatglm3-32k.rst rename to doc/source/models/builtin/llm/chatglm3-32k.rst index ca626bd2cc..74fe11bcd5 100644 --- a/doc/source/models/builtin/chatglm3-32k.rst +++ b/doc/source/models/builtin/llm/chatglm3-32k.rst @@ -1,9 +1,8 @@ -.. _models_builtin_chatglm3_32k: +.. _models_llm_chatglm3-32k: - -============ -ChatGLM3-32K -============ +======================================== +chatglm3-32k +======================================== - **Context Length:** 32768 - **Model Name:** chatglm3-32k @@ -14,8 +13,9 @@ ChatGLM3-32K Specifications ^^^^^^^^^^^^^^ -Model Spec (pytorch, 6 Billion) -+++++++++++++++++++++++++++++++ + +Model Spec 1 (pytorch, 6 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 6 @@ -27,6 +27,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name chatglm3-32k --size-in-billions 6 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. 
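Since ``gen_docs.py`` reads the spec files via ``../../xinference/model/...`` and writes into
``./models/builtin/...``, the pages in this diff were presumably regenerated by running the script
from ``doc/source``::

   cd doc/source
   python gen_docs.py

Any change to a description or model spec therefore belongs in the JSON files; an edit made by hand
to a generated ``.rst`` page would be overwritten on the next run.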
diff --git a/doc/source/models/builtin/chatglm3.rst b/doc/source/models/builtin/llm/chatglm3.rst similarity index 50% rename from doc/source/models/builtin/chatglm3.rst rename to doc/source/models/builtin/llm/chatglm3.rst index fc68c9f0f2..dc153b4191 100644 --- a/doc/source/models/builtin/chatglm3.rst +++ b/doc/source/models/builtin/llm/chatglm3.rst @@ -1,9 +1,8 @@ -.. _models_builtin_chatglm3: +.. _models_llm_chatglm3: - -======== -ChatGLM3 -======== +======================================== +chatglm3 +======================================== - **Context Length:** 8192 - **Model Name:** chatglm3 @@ -14,8 +13,23 @@ ChatGLM3 Specifications ^^^^^^^^^^^^^^ -Model Spec (pytorch, 6 Billion) -+++++++++++++++++++++++++++++++ + +Model Spec 1 (ggmlv3, 6 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggmlv3 +- **Model Size (in billions):** 6 +- **Quantizations:** q4_0 +- **Model ID:** Xorbits/chatglm3-6B-GGML + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name chatglm3 --size-in-billions 6 --model-format ggmlv3 --quantization ${quantization} + + +Model Spec 2 (pytorch, 6 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 6 @@ -27,6 +41,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name chatglm3 --size-in-billions 6 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/llm/code-llama-instruct.rst b/doc/source/models/builtin/llm/code-llama-instruct.rst new file mode 100644 index 0000000000..cab055ab7e --- /dev/null +++ b/doc/source/models/builtin/llm/code-llama-instruct.rst @@ -0,0 +1,99 @@ +.. _models_llm_code-llama-instruct: + +======================================== +code-llama-instruct +======================================== + +- **Context Length:** 100000 +- **Model Name:** code-llama-instruct +- **Languages:** en +- **Abilities:** chat +- **Description:** Code-Llama-Instruct is an instruct-tuned version of the Code-Llama LLM. 
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** codellama/CodeLlama-7b-Instruct-hf + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 13 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** codellama/CodeLlama-13b-Instruct-hf + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-instruct --size-in-billions 13 --model-format pytorch --quantization ${quantization} + + +Model Spec 3 (pytorch, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 34 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** codellama/CodeLlama-34b-Instruct-hf + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-instruct --size-in-billions 34 --model-format pytorch --quantization ${quantization} + + +Model Spec 4 (ggufv2, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 7 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/CodeLlama-7B-Instruct-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-instruct --size-in-billions 7 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 5 (ggufv2, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 13 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/CodeLlama-13B-Instruct-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-instruct --size-in-billions 13 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 6 (ggufv2, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 34 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/CodeLlama-34B-Instruct-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-instruct --size-in-billions 34 --model-format ggufv2 --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/code-llama-python.rst 
b/doc/source/models/builtin/llm/code-llama-python.rst new file mode 100644 index 0000000000..53b38487ea --- /dev/null +++ b/doc/source/models/builtin/llm/code-llama-python.rst @@ -0,0 +1,99 @@ +.. _models_llm_code-llama-python: + +======================================== +code-llama-python +======================================== + +- **Context Length:** 100000 +- **Model Name:** code-llama-python +- **Languages:** en +- **Abilities:** generate +- **Description:** Code-Llama-Python is a fine-tuned version of the Code-Llama LLM, specializing in Python. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** TheBloke/CodeLlama-7B-Python-fp16 + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-python --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 13 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** TheBloke/CodeLlama-13B-Python-fp16 + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-python --size-in-billions 13 --model-format pytorch --quantization ${quantization} + + +Model Spec 3 (pytorch, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 34 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** TheBloke/CodeLlama-34B-Python-fp16 + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-python --size-in-billions 34 --model-format pytorch --quantization ${quantization} + + +Model Spec 4 (ggufv2, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 7 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/CodeLlama-7B-Python-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-python --size-in-billions 7 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 5 (ggufv2, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 13 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/CodeLlama-13B-Python-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-python --size-in-billions 13 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 6 (ggufv2, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 34 +- **Quantizations:** 
Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/CodeLlama-34B-Python-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama-python --size-in-billions 34 --model-format ggufv2 --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/code-llama.rst b/doc/source/models/builtin/llm/code-llama.rst new file mode 100644 index 0000000000..dea196ca9b --- /dev/null +++ b/doc/source/models/builtin/llm/code-llama.rst @@ -0,0 +1,99 @@ +.. _models_llm_code-llama: + +======================================== +code-llama +======================================== + +- **Context Length:** 100000 +- **Model Name:** code-llama +- **Languages:** en +- **Abilities:** generate +- **Description:** Code-Llama is an open-source LLM trained by fine-tuning LLaMA2 for generating and discussing code. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** TheBloke/CodeLlama-7B-fp16 + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 13 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** TheBloke/CodeLlama-13B-fp16 + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama --size-in-billions 13 --model-format pytorch --quantization ${quantization} + + +Model Spec 3 (pytorch, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 34 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** TheBloke/CodeLlama-34B-fp16 + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama --size-in-billions 34 --model-format pytorch --quantization ${quantization} + + +Model Spec 4 (ggufv2, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 7 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/CodeLlama-7B-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama --size-in-billions 7 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 5 (ggufv2, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 13 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/CodeLlama-13B-GGUF + +Execute the following command to launch the model, 
remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama --size-in-billions 13 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 6 (ggufv2, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 34 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/CodeLlama-34B-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name code-llama --size-in-billions 34 --model-format ggufv2 --quantization ${quantization} + diff --git a/doc/source/models/builtin/falcon-instruct.rst b/doc/source/models/builtin/llm/falcon-instruct.rst similarity index 73% rename from doc/source/models/builtin/falcon-instruct.rst rename to doc/source/models/builtin/llm/falcon-instruct.rst index 6b348e4fcf..69e2a91c82 100644 --- a/doc/source/models/builtin/falcon-instruct.rst +++ b/doc/source/models/builtin/llm/falcon-instruct.rst @@ -1,18 +1,21 @@ -.. _models_builtin_falcon_instruct: +.. _models_llm_falcon-instruct: -=============== -Falcon Instruct -=============== +======================================== +falcon-instruct +======================================== +- **Context Length:** 2048 - **Model Name:** falcon-instruct - **Languages:** en -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** Falcon-instruct is a fine-tuned version of the Falcon LLM, specializing in chatting. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -24,12 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name falcon-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. Model Spec 2 (pytorch, 40 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 40 @@ -41,6 +41,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name falcon-instruct --size-in-billions 40 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/falcon.rst b/doc/source/models/builtin/llm/falcon.rst similarity index 73% rename from doc/source/models/builtin/falcon.rst rename to doc/source/models/builtin/llm/falcon.rst index 9c206e29cd..ae5d185cbf 100644 --- a/doc/source/models/builtin/falcon.rst +++ b/doc/source/models/builtin/llm/falcon.rst @@ -1,46 +1,43 @@ -.. _models_builtin_falcon: +.. _models_llm_falcon: -====== -Falcon -====== +======================================== +falcon +======================================== +- **Context Length:** 2048 - **Model Name:** falcon - **Languages:** en -- **Abilities:** embed, generate +- **Abilities:** generate +- **Description:** Falcon is an open-source Transformer based LLM trained on the RefinedWeb dataset. 
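+In addition to the CLI commands shown in the specifications below, a launched model
+can be driven from Python. The following is a minimal sketch, assuming a locally
+running Xinference endpoint at ``http://127.0.0.1:9997`` and the ``xinference``
+Python client; adjust the endpoint, size, and quantization to your deployment::
+
+    from xinference.client import Client
+
+    # Connect to a running Xinference supervisor (endpoint is an assumption).
+    client = Client("http://127.0.0.1:9997")
+
+    # Mirrors the CLI: pick one of the quantizations listed in the spec.
+    model_uid = client.launch_model(
+        model_name="falcon",
+        model_format="pytorch",
+        model_size_in_billions=7,
+        quantization="8-bit",
+    )
+
+    # "generate"-ability models expose a completion-style interface.
+    model = client.get_model(model_uid)
+    print(model.generate("Falcons are", generate_config={"max_tokens": 64}))
+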
Specifications ^^^^^^^^^^^^^^ -Model Spec 2 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ + +Model Spec 1 (pytorch, 40 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch -- **Model Size (in billions):** 7 +- **Model Size (in billions):** 40 - **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** tiiuae/falcon-7b +- **Model ID:** tiiuae/falcon-40b Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: - xinference launch --model-name falcon --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: + xinference launch --model-name falcon --size-in-billions 40 --model-format pytorch --quantization ${quantization} - 4-bit quantization is not supported on macOS. -Model Spec 1 (pytorch, 40 Billion) -++++++++++++++++++++++++++++++++++ +Model Spec 2 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch -- **Model Size (in billions):** 40 +- **Model Size (in billions):** 7 - **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** tiiuae/falcon-40b +- **Model ID:** tiiuae/falcon-7b Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: - xinference launch --model-name falcon --size-in-billions 40 --model-format pytorch --quantization ${quantization} - -.. note:: + xinference launch --model-name falcon --size-in-billions 7 --model-format pytorch --quantization ${quantization} - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/llm/glaive-coder.rst b/doc/source/models/builtin/llm/glaive-coder.rst new file mode 100644 index 0000000000..e7084293e0 --- /dev/null +++ b/doc/source/models/builtin/llm/glaive-coder.rst @@ -0,0 +1,29 @@ +.. _models_llm_glaive-coder: + +======================================== +glaive-coder +======================================== + +- **Context Length:** 100000 +- **Model Name:** glaive-coder +- **Languages:** en +- **Abilities:** chat +- **Description:** A code model trained on a dataset of ~140k programming-related problems and solutions generated from Glaive’s synthetic data generation platform. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** glaiveai/glaive-coder-7b + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name glaive-coder --size-in-billions 7 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/gpt-2.rst b/doc/source/models/builtin/llm/gpt-2.rst new file mode 100644 index 0000000000..713c932128 --- /dev/null +++ b/doc/source/models/builtin/llm/gpt-2.rst @@ -0,0 +1,29 @@ +.. _models_llm_gpt-2: + +======================================== +gpt-2 +======================================== + +- **Context Length:** 1024 +- **Model Name:** gpt-2 +- **Languages:** en +- **Abilities:** generate +- **Description:** GPT-2 is a Transformer-based LLM that is trained on WebText, a 40 GB dataset of web pages linked from Reddit posts with at least 3 karma.
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (ggmlv3, 1 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggmlv3 +- **Model Size (in billions):** 1 +- **Quantizations:** none +- **Model ID:** marella/gpt-2-ggml + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name gpt-2 --size-in-billions 1 --model-format ggmlv3 --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst new file mode 100644 index 0000000000..d2728e51da --- /dev/null +++ b/doc/source/models/builtin/llm/index.rst @@ -0,0 +1,103 @@ +.. _models_llm_index: + +===================== +Large Language Models +===================== + +The following is a list of built-in LLMs in Xinference: + + +.. toctree:: + :maxdepth: 3 + + + baichuan + + baichuan-2 + + baichuan-2-chat + + baichuan-chat + + chatglm + + chatglm2 + + chatglm2-32k + + chatglm3 + + chatglm3-32k + + code-llama + + code-llama-instruct + + code-llama-python + + falcon + + falcon-instruct + + glaive-coder + + gpt-2 + + internlm-20b + + internlm-7b + + internlm-chat-20b + + internlm-chat-7b + + llama-2 + + llama-2-chat + + mistral-instruct-v0.1 + + mistral-v0.1 + + openbuddy + + opt + + orca + + qwen-chat + + starchat-beta + + starcoder + + starcoderplus + + tiny-llama + + vicuna-v1.3 + + vicuna-v1.5 + + vicuna-v1.5-16k + + wizardcoder-python-v1.0 + + wizardlm-v1.0 + + wizardmath-v1.0 + + xverse + + xverse-chat + + yi + + yi-200k + + yi-chat + + zephyr-7b-alpha + + zephyr-7b-beta + \ No newline at end of file diff --git a/doc/source/models/builtin/llm/internlm-20b.rst b/doc/source/models/builtin/llm/internlm-20b.rst new file mode 100644 index 0000000000..5a6b98128e --- /dev/null +++ b/doc/source/models/builtin/llm/internlm-20b.rst @@ -0,0 +1,29 @@ +.. _models_llm_internlm-20b: + +======================================== +internlm-20b +======================================== + +- **Context Length:** 16384 +- **Model Name:** internlm-20b +- **Languages:** en, zh +- **Abilities:** generate +- **Description:** Pre-trained on over 2.3T tokens containing high-quality English, Chinese, and code data. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 20 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 20 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** internlm/internlm-20b + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name internlm-20b --size-in-billions 20 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/internlm-7b.rst b/doc/source/models/builtin/llm/internlm-7b.rst new file mode 100644 index 0000000000..cb28a7f6fc --- /dev/null +++ b/doc/source/models/builtin/llm/internlm-7b.rst @@ -0,0 +1,29 @@ +.. _models_llm_internlm-7b: + +======================================== +internlm-7b +======================================== + +- **Context Length:** 8192 +- **Model Name:** internlm-7b +- **Languages:** en, zh +- **Abilities:** generate +- **Description:** InternLM is a Transformer-based LLM that is trained on both Chinese and English data, focusing on practical scenarios.
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** internlm/internlm-7b + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name internlm-7b --size-in-billions 7 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/internlm-chat-20b.rst b/doc/source/models/builtin/llm/internlm-chat-20b.rst new file mode 100644 index 0000000000..acc30e1d2e --- /dev/null +++ b/doc/source/models/builtin/llm/internlm-chat-20b.rst @@ -0,0 +1,29 @@ +.. _models_llm_internlm-chat-20b: + +======================================== +internlm-chat-20b +======================================== + +- **Context Length:** 16384 +- **Model Name:** internlm-chat-20b +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** Pre-trained on over 2.3T Tokens containing high-quality English, Chinese, and code data. The Chat version has undergone SFT and RLHF training. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 20 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 20 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** internlm/internlm-chat-20b + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name internlm-chat-20b --size-in-billions 20 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/internlm-chat-7b.rst b/doc/source/models/builtin/llm/internlm-chat-7b.rst new file mode 100644 index 0000000000..9b925279ae --- /dev/null +++ b/doc/source/models/builtin/llm/internlm-chat-7b.rst @@ -0,0 +1,29 @@ +.. _models_llm_internlm-chat-7b: + +======================================== +internlm-chat-7b +======================================== + +- **Context Length:** 4096 +- **Model Name:** internlm-chat-7b +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** Internlm-chat is a fine-tuned version of the Internlm LLM, specializing in chatting. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** internlm/internlm-chat-7b + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name internlm-chat-7b --size-in-billions 7 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llama-2-chat.rst b/doc/source/models/builtin/llm/llama-2-chat.rst similarity index 85% rename from doc/source/models/builtin/llama-2-chat.rst rename to doc/source/models/builtin/llm/llama-2-chat.rst index 7903297796..85891b5806 100644 --- a/doc/source/models/builtin/llama-2-chat.rst +++ b/doc/source/models/builtin/llm/llama-2-chat.rst @@ -1,18 +1,21 @@ -.. _models_builtin_llama_2_chat: +.. 
_models_llm_llama-2-chat: -============ -Llama-2 Chat -============ +======================================== +llama-2-chat +======================================== +- **Context Length:** 4096 - **Model Name:** llama-2-chat - **Languages:** en -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** Llama-2-Chat is a fine-tuned version of the Llama-2 LLM, specializing in chatting. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (ggmlv3, 7 Billion) -++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 7 @@ -24,8 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2-chat --size-in-billions 7 --model-format ggmlv3 --quantization ${quantization} + Model Spec 2 (ggmlv3, 13 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 13 @@ -39,7 +43,7 @@ chosen quantization method from the options listed above:: Model Spec 3 (ggmlv3, 70 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 70 @@ -51,8 +55,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2-chat --size-in-billions 70 --model-format ggmlv3 --quantization ${quantization} + Model Spec 4 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -64,13 +69,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. - Model Spec 5 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 @@ -82,12 +83,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2-chat --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. Model Spec 6 (pytorch, 70 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 70 @@ -99,6 +97,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2-chat --size-in-billions 70 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. \ No newline at end of file diff --git a/doc/source/models/builtin/llama-2.rst b/doc/source/models/builtin/llm/llama-2.rst similarity index 85% rename from doc/source/models/builtin/llama-2.rst rename to doc/source/models/builtin/llm/llama-2.rst index 116a3d65ff..8fa544c92c 100644 --- a/doc/source/models/builtin/llama-2.rst +++ b/doc/source/models/builtin/llm/llama-2.rst @@ -1,18 +1,21 @@ -.. _models_builtin_llama_2: +.. _models_llm_llama-2: -======= -Llama-2 -======= +======================================== +llama-2 +======================================== +- **Context Length:** 4096 - **Model Name:** llama-2 - **Languages:** en -- **Abilities:** embed, generate +- **Abilities:** generate +- **Description:** Llama-2 is the second generation of Llama, open-source and trained on a larger amount of data. 
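+For long completions, token streaming can be enabled through ``generate_config``.
+A minimal sketch, assuming the Python client from the example above and that the
+chosen backend supports ``"stream": True`` (the model UID is hypothetical)::
+
+    from xinference.client import Client
+
+    client = Client("http://127.0.0.1:9997")
+    model = client.get_model("my-llama-2-uid")  # hypothetical UID from launch_model
+
+    # With streaming enabled, generate() yields incremental completion chunks.
+    for chunk in model.generate(
+        "Write a short poem about inference servers.",
+        generate_config={"stream": True, "max_tokens": 64},
+    ):
+        print(chunk["choices"][0]["text"], end="", flush=True)
+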
Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (ggmlv3, 7 Billion) -++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 7 @@ -24,8 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2 --size-in-billions 7 --model-format ggmlv3 --quantization ${quantization} + Model Spec 2 (ggmlv3, 13 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 13 @@ -37,8 +41,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2 --size-in-billions 13 --model-format ggmlv3 --quantization ${quantization} + Model Spec 3 (ggmlv3, 70 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 70 @@ -50,8 +55,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2 --size-in-billions 70 --model-format ggmlv3 --quantization ${quantization} + Model Spec 4 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -63,12 +69,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2 --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. Model Spec 5 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 @@ -80,12 +83,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2 --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. Model Spec 6 (pytorch, 70 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 70 @@ -97,6 +97,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name llama-2 --size-in-billions 70 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/llm/mistral-instruct-v0.1.rst b/doc/source/models/builtin/llm/mistral-instruct-v0.1.rst new file mode 100644 index 0000000000..6e31edc381 --- /dev/null +++ b/doc/source/models/builtin/llm/mistral-instruct-v0.1.rst @@ -0,0 +1,43 @@ +.. _models_llm_mistral-instruct-v0.1: + +======================================== +mistral-instruct-v0.1 +======================================== + +- **Context Length:** 8192 +- **Model Name:** mistral-instruct-v0.1 +- **Languages:** en +- **Abilities:** chat +- **Description:** Mistral-7B-Instruct is a version of the Mistral-7B LLM fine-tuned on public datasets, specializing in chatting.
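+Models with the ``chat`` ability, such as this one, return a chat-style handle from
+the Python client. A minimal sketch (endpoint and reply shape are assumptions; see
+the specifications below for valid formats and quantizations)::
+
+    from xinference.client import Client
+
+    client = Client("http://127.0.0.1:9997")
+    model_uid = client.launch_model(
+        model_name="mistral-instruct-v0.1",
+        model_format="ggufv2",
+        model_size_in_billions=7,
+        quantization="Q4_K_M",
+    )
+
+    # chat() accepts a prompt plus optional history and generation settings.
+    model = client.get_model(model_uid)
+    reply = model.chat(
+        "Give me three tips for writing readable Python.",
+        chat_history=[],
+        generate_config={"max_tokens": 128},
+    )
+    print(reply["choices"][0]["message"]["content"])
+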
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** mistralai/Mistral-7B-Instruct-v0.1 + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name mistral-instruct-v0.1 --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (ggufv2, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 7 +- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_0, Q4_K_S, Q4_K_M, Q5_0, Q5_K_S, Q5_K_M, Q6_K, Q8_0 +- **Model ID:** TheBloke/Mistral-7B-Instruct-v0.1-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name mistral-instruct-v0.1 --size-in-billions 7 --model-format ggufv2 --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/mistral-v0.1.rst b/doc/source/models/builtin/llm/mistral-v0.1.rst new file mode 100644 index 0000000000..fdb7750962 --- /dev/null +++ b/doc/source/models/builtin/llm/mistral-v0.1.rst @@ -0,0 +1,43 @@ +.. _models_llm_mistral-v0.1: + +======================================== +mistral-v0.1 +======================================== + +- **Context Length:** 8192 +- **Model Name:** mistral-v0.1 +- **Languages:** en +- **Abilities:** generate +- **Description:** Mistral-7B is an unmoderated Transformer-based LLM that claims to outperform Llama 2 on all benchmarks. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** mistralai/Mistral-7B-v0.1 + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name mistral-v0.1 --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (ggufv2, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 7 +- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_0, Q4_K_S, Q4_K_M, Q5_0, Q5_K_S, Q5_K_M, Q6_K, Q8_0 +- **Model ID:** TheBloke/Mistral-7B-v0.1-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name mistral-v0.1 --size-in-billions 7 --model-format ggufv2 --quantization ${quantization} + diff --git a/doc/source/models/builtin/openbuddy.rst b/doc/source/models/builtin/llm/openbuddy.rst similarity index 66% rename from doc/source/models/builtin/openbuddy.rst rename to doc/source/models/builtin/llm/openbuddy.rst index 1d6c66ee98..b50eb16c9e 100644 --- a/doc/source/models/builtin/openbuddy.rst +++ b/doc/source/models/builtin/llm/openbuddy.rst @@ -1,18 +1,21 @@ -.. _models_builtin_openbuddy_v11.1: +.. 
_models_llm_openbuddy: -========= +======================================== OpenBuddy -========= +======================================== +- **Context Length:** 2048 - **Model Name:** OpenBuddy -- **Languages:** en, zh -- **Abilities:** embed, chat +- **Languages:** en +- **Abilities:** chat +- **Description:** OpenBuddy is a powerful open multilingual chatbot model aimed at global users. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (ggmlv3, 13 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 13 @@ -24,6 +27,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name OpenBuddy --size-in-billions 13 --model-format ggmlv3 --quantization ${quantization} -.. note:: - - Multiple rounds chat is disabled for better translation. diff --git a/doc/source/models/builtin/opt.rst b/doc/source/models/builtin/llm/opt.rst similarity index 62% rename from doc/source/models/builtin/opt.rst rename to doc/source/models/builtin/llm/opt.rst index 3bf7999a4c..1a57961a89 100644 --- a/doc/source/models/builtin/opt.rst +++ b/doc/source/models/builtin/llm/opt.rst @@ -1,18 +1,21 @@ -.. _models_builtin_opt: +.. _models_llm_opt: -=== -OPT -=== +======================================== +opt +======================================== +- **Context Length:** 2048 - **Model Name:** opt - **Languages:** en -- **Abilities:** embed, generate +- **Abilities:** generate +- **Description:** Opt is an open-source, decoder-only, Transformer based LLM that was designed to replicate GPT-3. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 1 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 1 @@ -24,6 +27,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name opt --size-in-billions 1 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/orca_mini.rst b/doc/source/models/builtin/llm/orca.rst similarity index 79% rename from doc/source/models/builtin/orca_mini.rst rename to doc/source/models/builtin/llm/orca.rst index aa8b92d7ab..0152c3259c 100644 --- a/doc/source/models/builtin/orca_mini.rst +++ b/doc/source/models/builtin/llm/orca.rst @@ -1,18 +1,21 @@ -.. _models_builtin_orca_mini: +.. _models_llm_orca: -========= -Orca Mini -========= +======================================== +orca +======================================== +- **Context Length:** 2048 - **Model Name:** orca - **Languages:** en -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** Orca is an LLM trained by fine-tuning LLaMA on explanation traces obtained from GPT-4. 
Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (ggmlv3, 3 Billion) -++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 3 @@ -24,8 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name orca --size-in-billions 3 --model-format ggmlv3 --quantization ${quantization} + Model Spec 2 (ggmlv3, 7 Billion) -++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 7 @@ -37,8 +41,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name orca --size-in-billions 7 --model-format ggmlv3 --quantization ${quantization} + Model Spec 3 (ggmlv3, 13 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 13 @@ -49,3 +54,4 @@ Execute the following command to launch the model, remember to replace ``${quant chosen quantization method from the options listed above:: xinference launch --model-name orca --size-in-billions 13 --model-format ggmlv3 --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/qwen-chat.rst b/doc/source/models/builtin/llm/qwen-chat.rst new file mode 100644 index 0000000000..4c27fb5e0f --- /dev/null +++ b/doc/source/models/builtin/llm/qwen-chat.rst @@ -0,0 +1,127 @@ +.. _models_llm_qwen-chat: + +======================================== +qwen-chat +======================================== + +- **Context Length:** 2048 +- **Model Name:** qwen-chat +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** Qwen-chat is a fine-tuned version of the Qwen LLM trained with alignment techniques, specializing in chatting. 
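+For the ``gptq`` specs below, the chosen quantization is also substituted into the
+``{quantization}`` placeholder of the Model ID, so ``Int4`` resolves to
+``Qwen/Qwen-7B-Chat-Int4``. A minimal sketch with the Python client (endpoint
+assumed, as in the earlier examples)::
+
+    from xinference.client import Client
+
+    client = Client("http://127.0.0.1:9997")
+    model_uid = client.launch_model(
+        model_name="qwen-chat",
+        model_format="gptq",
+        model_size_in_billions=7,
+        quantization="Int4",  # fills the {quantization} slot of the Model ID
+    )
+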
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (ggmlv3, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggmlv3 +- **Model Size (in billions):** 7 +- **Quantizations:** q4_0 +- **Model ID:** Xorbits/qwen-chat-7B-ggml + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen-chat --size-in-billions 7 --model-format ggmlv3 --quantization ${quantization} + + +Model Spec 2 (ggmlv3, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggmlv3 +- **Model Size (in billions):** 14 +- **Quantizations:** q4_0 +- **Model ID:** Xorbits/qwen-chat-14B-ggml + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen-chat --size-in-billions 14 --model-format ggmlv3 --quantization ${quantization} + + +Model Spec 3 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** Qwen/Qwen-7B-Chat + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 4 (pytorch, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 14 +- **Quantizations:** none +- **Model ID:** Qwen/Qwen-14B-Chat + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen-chat --size-in-billions 14 --model-format pytorch --quantization ${quantization} + + +Model Spec 5 (pytorch, 72 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 72 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** Qwen/Qwen-72B-Chat + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen-chat --size-in-billions 72 --model-format pytorch --quantization ${quantization} + + +Model Spec 6 (gptq, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 7 +- **Quantizations:** Int4, Int8 +- **Model ID:** Qwen/Qwen-7B-Chat-{quantization} + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen-chat --size-in-billions 7 --model-format gptq --quantization ${quantization} + + +Model Spec 7 (gptq, 14 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 14 +- **Quantizations:** Int4, Int8 +- **Model ID:** Qwen/Qwen-14B-Chat-{quantization} + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen-chat --size-in-billions 14 --model-format gptq --quantization 
${quantization} + + +Model Spec 8 (gptq, 72 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** gptq +- **Model Size (in billions):** 72 +- **Quantizations:** Int4, Int8 +- **Model ID:** Qwen/Qwen-72B-Chat-{quantization} + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name qwen-chat --size-in-billions 72 --model-format gptq --quantization ${quantization} + diff --git a/doc/source/models/builtin/starchat-beta.rst b/doc/source/models/builtin/llm/starchat-beta.rst similarity index 58% rename from doc/source/models/builtin/starchat-beta.rst rename to doc/source/models/builtin/llm/starchat-beta.rst index 6f260ed7dd..a0637061c8 100644 --- a/doc/source/models/builtin/starchat-beta.rst +++ b/doc/source/models/builtin/llm/starchat-beta.rst @@ -1,18 +1,21 @@ -.. _models_builtin_starchat_beta: +.. _models_llm_starchat-beta: -============= -Starchat-beta -============= +======================================== +starchat-beta +======================================== +- **Context Length:** 8192 - **Model Name:** starchat-beta - **Languages:** en -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** Starchat-beta is a fine-tuned version of the Starcoderplus LLM, specializing in coding assistance. Specifications ^^^^^^^^^^^^^^ -Model Spec (pytorch, 16 Billion) -++++++++++++++++++++++++++++++++ + +Model Spec 1 (pytorch, 16 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 16 @@ -24,6 +27,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name starchat-beta --size-in-billions 16 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. \ No newline at end of file diff --git a/doc/source/models/builtin/llm/starcoder.rst b/doc/source/models/builtin/llm/starcoder.rst new file mode 100644 index 0000000000..d49d8640c9 --- /dev/null +++ b/doc/source/models/builtin/llm/starcoder.rst @@ -0,0 +1,29 @@ +.. _models_llm_starcoder: + +======================================== +starcoder +======================================== + +- **Context Length:** 8192 +- **Model Name:** starcoder +- **Languages:** en +- **Abilities:** generate +- **Description:** Starcoder is an open-source Transformer based LLM that is trained on permissively licensed data from GitHub. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (ggmlv3, 16 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggmlv3 +- **Model Size (in billions):** 16 +- **Quantizations:** q4_0, q4_1, q5_0, q5_1, q8_0 +- **Model ID:** TheBloke/starcoder-GGML + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name starcoder --size-in-billions 16 --model-format ggmlv3 --quantization ${quantization} + diff --git a/doc/source/models/builtin/starcoderplus.rst b/doc/source/models/builtin/llm/starcoderplus.rst similarity index 57% rename from doc/source/models/builtin/starcoderplus.rst rename to doc/source/models/builtin/llm/starcoderplus.rst index e3819fe01f..6dfaa7b6b2 100644 --- a/doc/source/models/builtin/starcoderplus.rst +++ b/doc/source/models/builtin/llm/starcoderplus.rst @@ -1,18 +1,21 @@ -.. _models_builtin_starcoderplus: +.. 
_models_llm_starcoderplus: -============= -StarCoderPlus -============= +======================================== +starcoderplus +======================================== +- **Context Length:** 8192 - **Model Name:** starcoderplus - **Languages:** en -- **Abilities:** embed, generate +- **Abilities:** generate +- **Description:** Starcoderplus is an open-source LLM trained by fine-tuning Starcoder on the RefinedWeb and StarCoderData datasets. Specifications ^^^^^^^^^^^^^^ -Model Spec (pytorch, 16 Billion) -++++++++++++++++++++++++++++++++ + +Model Spec 1 (pytorch, 16 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 16 @@ -24,6 +27,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name starcoderplus --size-in-billions 16 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/llm/tiny-llama.rst b/doc/source/models/builtin/llm/tiny-llama.rst new file mode 100644 index 0000000000..57e6bf27a5 --- /dev/null +++ b/doc/source/models/builtin/llm/tiny-llama.rst @@ -0,0 +1,29 @@ +.. _models_llm_tiny-llama: + +======================================== +tiny-llama +======================================== + +- **Context Length:** 2048 +- **Model Name:** tiny-llama +- **Languages:** en +- **Abilities:** generate +- **Description:** The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (ggufv2, 1 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 1 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name tiny-llama --size-in-billions 1 --model-format ggufv2 --quantization ${quantization} + diff --git a/doc/source/models/builtin/vicuna-v1.3.rst b/doc/source/models/builtin/llm/vicuna-v1.3.rst similarity index 82% rename from doc/source/models/builtin/vicuna-v1.3.rst rename to doc/source/models/builtin/llm/vicuna-v1.3.rst index 43cff0a586..1fa72fea2a 100644 --- a/doc/source/models/builtin/vicuna-v1.3.rst +++ b/doc/source/models/builtin/llm/vicuna-v1.3.rst @@ -1,77 +1,77 @@ -.. _models_builtin_vicuna_v1_3: +.. _models_llm_vicuna-v1.3: -=========== -Vicuna v1.3 -=========== +======================================== +vicuna-v1.3 +======================================== +- **Context Length:** 2048 - **Model Name:** vicuna-v1.3 - **Languages:** en -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** Vicuna is an open-source LLM trained by fine-tuning LLaMA on data collected from ShareGPT. 
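+Running models can be inspected and stopped from the same client once launched.
+A minimal sketch, assuming the endpoint from the earlier examples and a
+hypothetical model UID::
+
+    from xinference.client import Client
+
+    client = Client("http://127.0.0.1:9997")
+
+    # List the currently running models, keyed by model UID.
+    print(client.list_models())
+
+    # Free the resources of a model that is no longer needed.
+    client.terminate_model("my-vicuna-uid")  # hypothetical UID
+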
Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (ggmlv3, 7 Billion) -++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 7 - **Quantizations:** q2_K, q3_K_L, q3_K_M, q3_K_S, q4_0, q4_1, q4_K_M, q4_K_S, q5_0, q5_1, q5_K_M, q5_K_S, q6_K, q8_0 - **Model ID:** TheBloke/vicuna-7B-v1.3-GGML -- **File Name Template:** vicuna-7b-v1.3.ggmlv3.{quantization}.bin Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name vicuna-v1.3 --size-in-billions 7 --model-format ggmlv3 --quantization ${quantization} + Model Spec 2 (ggmlv3, 13 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 13 - **Quantizations:** q2_K, q3_K_L, q3_K_M, q3_K_S, q4_0, q4_1, q4_K_M, q4_K_S, q5_0, q5_1, q5_K_M, q5_K_S, q6_K, q8_0 - **Model ID:** TheBloke/vicuna-13b-v1.3.0-GGML -- **File Name Template:** vicuna-13b-v1.3.0.ggmlv3.{quantization}.bin Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name vicuna-v1.3 --size-in-billions 13 --model-format ggmlv3 --quantization ${quantization} + Model Spec 3 (ggmlv3, 33 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 33 - **Quantizations:** q2_K, q3_K_L, q3_K_M, q3_K_S, q4_0, q4_1, q4_K_M, q4_K_S, q5_0, q5_1, q5_K_M, q5_K_S, q6_K, q8_0 - **Model ID:** TheBloke/vicuna-33B-GGML -- **File Name Template:** vicuna-33b.ggmlv3.{quantization}.bin Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name vicuna-v1.3 --size-in-billions 33 --model-format ggmlv3 --quantization ${quantization} -Model Spec 6 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ + +Model Spec 4 (pytorch, 33 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch -- **Model Size (in billions):** 7 +- **Model Size (in billions):** 33 - **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** lmsys/vicuna-7b-v1.3 +- **Model ID:** lmsys/vicuna-33b-v1.3 Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: - xinference launch --model-name vicuna-v1.3 --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: + xinference launch --model-name vicuna-v1.3 --size-in-billions 33 --model-format pytorch --quantization ${quantization} - 4-bit quantization is not supported on macOS. Model Spec 5 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 @@ -83,23 +83,17 @@ chosen quantization method from the options listed above:: xinference launch --model-name vicuna-v1.3 --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. 
-Model Spec 4 (pytorch, 33 Billion) -++++++++++++++++++++++++++++++++++ +Model Spec 6 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch -- **Model Size (in billions):** 33 +- **Model Size (in billions):** 7 - **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** lmsys/vicuna-33b-v1.3 +- **Model ID:** lmsys/vicuna-7b-v1.3 Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: - xinference launch --model-name vicuna-v1.3 --size-in-billions 33 --model-format pytorch --quantization ${quantization} - -.. note:: + xinference launch --model-name vicuna-v1.3 --size-in-billions 7 --model-format pytorch --quantization ${quantization} - 4-bit quantization is not supported on macOS. \ No newline at end of file diff --git a/doc/source/models/builtin/vicuna-v1.5-16k.rst b/doc/source/models/builtin/llm/vicuna-v1.5-16k.rst similarity index 72% rename from doc/source/models/builtin/vicuna-v1.5-16k.rst rename to doc/source/models/builtin/llm/vicuna-v1.5-16k.rst index 651665cef6..5833f5c295 100644 --- a/doc/source/models/builtin/vicuna-v1.5-16k.rst +++ b/doc/source/models/builtin/llm/vicuna-v1.5-16k.rst @@ -1,18 +1,21 @@ -.. _models_builtin_vicuna_v1_5_16k: +.. _models_llm_vicuna-v1.5-16k: -=============== -Vicuna v1.5-16k -=============== +======================================== +vicuna-v1.5-16k +======================================== +- **Context Length:** 16384 - **Model Name:** vicuna-v1.5-16k - **Languages:** en -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** Vicuna-v1.5-16k is a special version of Vicuna-v1.5, with a context window of 16k tokens instead of 4k. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -24,12 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name vicuna-v1.5-16k --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. Model Spec 2 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 @@ -41,6 +41,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name vicuna-v1.5-16k --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/vicuna-v1.5.rst b/doc/source/models/builtin/llm/vicuna-v1.5.rst similarity index 73% rename from doc/source/models/builtin/vicuna-v1.5.rst rename to doc/source/models/builtin/llm/vicuna-v1.5.rst index 428ccf82e6..a2211231c9 100644 --- a/doc/source/models/builtin/vicuna-v1.5.rst +++ b/doc/source/models/builtin/llm/vicuna-v1.5.rst @@ -1,18 +1,21 @@ -.. _models_builtin_vicuna_v1_5: +.. _models_llm_vicuna-v1.5: -=========== -Vicuna v1.5 -=========== +======================================== +vicuna-v1.5 +======================================== +- **Context Length:** 4096 - **Model Name:** vicuna-v1.5 - **Languages:** en -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** Vicuna is an open-source LLM trained by fine-tuning LLaMA on data collected from ShareGPT. 
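+Besides the CLI and the Python client, the supervisor serves OpenAI-compatible
+HTTP routes, so launched chat models can also be queried with plain HTTP. A
+minimal sketch using ``requests`` (endpoint, route, and model UID are assumptions
+following the OpenAI convention)::
+
+    import requests
+
+    resp = requests.post(
+        "http://127.0.0.1:9997/v1/chat/completions",
+        json={
+            "model": "my-vicuna-uid",  # the UID returned by launch_model
+            "messages": [{"role": "user", "content": "Hello!"}],
+            "max_tokens": 64,
+        },
+    )
+    print(resp.json()["choices"][0]["message"]["content"])
+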
Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -24,12 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name vicuna-v1.5 --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. Model Spec 2 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 @@ -41,6 +41,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name vicuna-v1.5 --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/wizardcoder-python-v1.0.rst b/doc/source/models/builtin/llm/wizardcoder-python-v1.0.rst similarity index 82% rename from doc/source/models/builtin/wizardcoder-python-v1.0.rst rename to doc/source/models/builtin/llm/wizardcoder-python-v1.0.rst index 8265bbda7c..4871fdb00c 100644 --- a/doc/source/models/builtin/wizardcoder-python-v1.0.rst +++ b/doc/source/models/builtin/llm/wizardcoder-python-v1.0.rst @@ -1,19 +1,21 @@ -.. _models_builtin_wizardcoder_python_v1_0: +.. _models_llm_wizardcoder-python-v1.0: -======================= -WizardCoder-Python-v1.0 -======================= +======================================== +wizardcoder-python-v1.0 +======================================== - **Context Length:** 100000 - **Model Name:** wizardcoder-python-v1.0 - **Languages:** en -- **Abilities:** generate, chat +- **Abilities:** chat +- **Description:** WizardCoder-Python is an open-source LLM trained by fine-tuning Llama2 with Evol-Instruct, specializing in Python code. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -25,13 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name wizardcoder-python-v1.0 --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. - Model Spec 2 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 @@ -43,12 +41,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name wizardcoder-python-v1.0 --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. Model Spec 3 (pytorch, 34 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 34 @@ -60,12 +55,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name wizardcoder-python-v1.0 --size-in-billions 34 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. 
Model Spec 4 (ggufv2, 7 Billion) -++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggufv2 - **Model Size (in billions):** 7 @@ -77,30 +69,31 @@ chosen quantization method from the options listed above:: xinference launch --model-name wizardcoder-python-v1.0 --size-in-billions 7 --model-format ggufv2 --quantization ${quantization} + Model Spec 5 (ggufv2, 13 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggufv2 - **Model Size (in billions):** 13 - **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 - **Model ID:** TheBloke/WizardCoder-Python-13B-V1.0-GGUF -- **File Name Template:** wizardcoder-python-13b-v1.0.{quantization}.gguf Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name wizardcoder-python-v1.0 --size-in-billions 13 --model-format ggufv2 --quantization ${quantization} + Model Spec 6 (ggufv2, 34 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggufv2 - **Model Size (in billions):** 34 - **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 - **Model ID:** TheBloke/WizardCoder-Python-34B-V1.0-GGUF -- **File Name Template:** wizardcoder-python-34b-v1.0.{quantization}.gguf Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: - xinference launch --model-name wizardcoder-python-v1.0 --size-in-billions 34 --model-format ggufv2 --quantization ${quantization} \ No newline at end of file + xinference launch --model-name wizardcoder-python-v1.0 --size-in-billions 34 --model-format ggufv2 --quantization ${quantization} + diff --git a/doc/source/models/builtin/wizardlm-v1.0.rst b/doc/source/models/builtin/llm/wizardlm-v1.0.rst similarity index 76% rename from doc/source/models/builtin/wizardlm-v1.0.rst rename to doc/source/models/builtin/llm/wizardlm-v1.0.rst index 297bf30f23..679b18b497 100644 --- a/doc/source/models/builtin/wizardlm-v1.0.rst +++ b/doc/source/models/builtin/llm/wizardlm-v1.0.rst @@ -1,18 +1,21 @@ -.. _models_builtin_wizardlm_v1_0: +.. _models_llm_wizardlm-v1.0: -============= -WizardLM v1.0 -============= +======================================== +wizardlm-v1.0 +======================================== +- **Context Length:** 2048 - **Model Name:** wizardlm-v1.0 - **Languages:** en -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** WizardLM is an open-source LLM trained by fine-tuning LLaMA with Evol-Instruct. 
Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (ggmlv3, 7 Billion) -++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 7 @@ -24,8 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name wizardlm-v1.0 --size-in-billions 7 --model-format ggmlv3 --quantization ${quantization} + Model Spec 2 (ggmlv3, 13 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** ggmlv3 - **Model Size (in billions):** 13 @@ -36,3 +40,4 @@ Execute the following command to launch the model, remember to replace ``${quant chosen quantization method from the options listed above:: xinference launch --model-name wizardlm-v1.0 --size-in-billions 13 --model-format ggmlv3 --quantization ${quantization} + diff --git a/doc/source/models/builtin/wizardmath-v1.0.rst b/doc/source/models/builtin/llm/wizardmath-v1.0.rst similarity index 77% rename from doc/source/models/builtin/wizardmath-v1.0.rst rename to doc/source/models/builtin/llm/wizardmath-v1.0.rst index eda7e4da7e..e711dde922 100644 --- a/doc/source/models/builtin/wizardmath-v1.0.rst +++ b/doc/source/models/builtin/llm/wizardmath-v1.0.rst @@ -1,18 +1,21 @@ -.. _models_builtin_wizardmath_v1_0: +.. _models_llm_wizardmath-v1.0: -=============== -WizardMath v1.0 -=============== +======================================== +wizardmath-v1.0 +======================================== +- **Context Length:** 2048 - **Model Name:** wizardmath-v1.0 - **Languages:** en -- **Abilities:** embed, chat +- **Abilities:** chat +- **Description:** WizardMath is an open-source LLM trained by fine-tuning Llama2 with Evol-Instruct, specializing in math. Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -24,12 +27,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name wizardmath-v1.0 --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. Model Spec 2 (pytorch, 13 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 13 @@ -41,12 +41,9 @@ chosen quantization method from the options listed above:: xinference launch --model-name wizardmath-v1.0 --size-in-billions 13 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. Model Spec 3 (pytorch, 70 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 70 @@ -58,6 +55,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name wizardmath-v1.0 --size-in-billions 70 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/llm/xverse-chat.rst b/doc/source/models/builtin/llm/xverse-chat.rst new file mode 100644 index 0000000000..9d5ce3281a --- /dev/null +++ b/doc/source/models/builtin/llm/xverse-chat.rst @@ -0,0 +1,43 @@ +.. 
_models_llm_xverse-chat: + +======================================== +xverse-chat +======================================== + +- **Context Length:** 2048 +- **Model Name:** xverse-chat +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** XVERSE-Chat is the aligned version of model XVERSE. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** xverse/XVERSE-7B-Chat + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name xverse-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 13 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** xverse/XVERSE-13B-Chat + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name xverse-chat --size-in-billions 13 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/llm/xverse.rst b/doc/source/models/builtin/llm/xverse.rst new file mode 100644 index 0000000000..a02ce6f2bb --- /dev/null +++ b/doc/source/models/builtin/llm/xverse.rst @@ -0,0 +1,57 @@ +.. _models_llm_xverse: + +======================================== +xverse +======================================== + +- **Context Length:** 2048 +- **Model Name:** xverse +- **Languages:** en, zh +- **Abilities:** generate +- **Description:** XVERSE is a multilingual large language model, independently developed by Shenzhen Yuanxiang Technology. 
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 7 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** xverse/XVERSE-7B + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name xverse --size-in-billions 7 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 13 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 13 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** xverse/XVERSE-13B + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name xverse --size-in-billions 13 --model-format pytorch --quantization ${quantization} + + +Model Spec 3 (pytorch, 65 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 65 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** xverse/XVERSE-65B + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name xverse --size-in-billions 65 --model-format pytorch --quantization ${quantization} + diff --git a/doc/source/models/builtin/Yi-200k.rst b/doc/source/models/builtin/llm/yi-200k.rst similarity index 67% rename from doc/source/models/builtin/Yi-200k.rst rename to doc/source/models/builtin/llm/yi-200k.rst index 7fa8739a44..125a1a8deb 100644 --- a/doc/source/models/builtin/Yi-200k.rst +++ b/doc/source/models/builtin/llm/yi-200k.rst @@ -1,47 +1,43 @@ -.. _models_builtin_Yi_200k: +.. _models_llm_yi-200k: - -======= +======================================== Yi-200k -======= +======================================== - **Context Length:** 204800 - **Model Name:** Yi-200k - **Languages:** en, zh - **Abilities:** generate -- **Description:** The Yi series models are large language models trained from scratch by developers at 01.AI. The first public release contains two bilingual (English/Chinese) base models with the parameter sizes of 6B and 34B. Both of them are trained with 4K sequence length and can be extended to 32K during inference time. +- **Description:** The Yi series models are large language models trained from scratch by developers at 01.AI. 
Specifications ^^^^^^^^^^^^^^ + Model Spec 1 (pytorch, 6 Billion) -+++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 6 - **Quantizations:** 4-bit, 8-bit, none - **Model ID:** 01-ai/Yi-6B-200K -Execute the following command to launch the model, remember to replace `${quantization}` with your +Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name Yi-200k --size-in-billions 6 --model-format pytorch --quantization ${quantization} Model Spec 2 (pytorch, 34 Billion) -++++++++++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 34 - **Quantizations:** 4-bit, 8-bit, none - **Model ID:** 01-ai/Yi-34B-200K -Execute the following command to launch the model, remember to replace `${quantization}` with your +Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name Yi-200k --size-in-billions 34 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/llm/yi-chat.rst b/doc/source/models/builtin/llm/yi-chat.rst new file mode 100644 index 0000000000..9e42ef7aaf --- /dev/null +++ b/doc/source/models/builtin/llm/yi-chat.rst @@ -0,0 +1,43 @@ +.. _models_llm_yi-chat: + +======================================== +Yi-chat +======================================== + +- **Context Length:** 204800 +- **Model Name:** Yi-chat +- **Languages:** en, zh +- **Abilities:** chat +- **Description:** The Yi series models are large language models trained from scratch by developers at 01.AI. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 34 +- **Quantizations:** 4-bit, 8-bit, none +- **Model ID:** 01-ai/Yi-34B-Chat + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name Yi-chat --size-in-billions 34 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (ggufv2, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 34 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/Yi-34B-Chat-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name Yi-chat --size-in-billions 34 --model-format ggufv2 --quantization ${quantization} + diff --git a/doc/source/models/builtin/Yi.rst b/doc/source/models/builtin/llm/yi.rst similarity index 50% rename from doc/source/models/builtin/Yi.rst rename to doc/source/models/builtin/llm/yi.rst index 20416b3a07..cf17a759aa 100644 --- a/doc/source/models/builtin/Yi.rst +++ b/doc/source/models/builtin/llm/yi.rst @@ -1,47 +1,57 @@ -.. _models_builtin_Yi: +.. 
_models_llm_yi: - -== +======================================== Yi -== +======================================== - **Context Length:** 4096 - **Model Name:** Yi - **Languages:** en, zh - **Abilities:** generate -- **Description:** The Yi series models are large language models trained from scratch by developers at 01.AI. The first public release contains two bilingual (English/Chinese) base models with the parameter sizes of 6B and 34B. Both of them are trained with 4K sequence length and can be extended to 32K during inference time. +- **Description:** The Yi series models are large language models trained from scratch by developers at 01.AI. Specifications ^^^^^^^^^^^^^^ -Model Spec 1 (pytorch, 6 Billion) -+++++++++++++++++++++++++++++++++ + +Model Spec 1 (ggufv2, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** ggufv2 +- **Model Size (in billions):** 34 +- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0 +- **Model ID:** TheBloke/Yi-34B-GGUF + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name Yi --size-in-billions 34 --model-format ggufv2 --quantization ${quantization} + + +Model Spec 2 (pytorch, 6 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 6 - **Quantizations:** 4-bit, 8-bit, none - **Model ID:** 01-ai/Yi-6B -Execute the following command to launch the model, remember to replace `${quantization}` with your +Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name Yi --size-in-billions 6 --model-format pytorch --quantization ${quantization} -Model Spec 2 (pytorch, 34 Billion) -++++++++++++++++++++++++++++++++++ +Model Spec 3 (pytorch, 34 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 34 - **Quantizations:** 4-bit, 8-bit, none - **Model ID:** 01-ai/Yi-34B -Execute the following command to launch the model, remember to replace `${quantization}` with your +Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name Yi --size-in-billions 34 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/zephyr-7b-alpha.rst b/doc/source/models/builtin/llm/zephyr-7b-alpha.rst similarity index 77% rename from doc/source/models/builtin/zephyr-7b-alpha.rst rename to doc/source/models/builtin/llm/zephyr-7b-alpha.rst index 700e76ff67..953d797e11 100644 --- a/doc/source/models/builtin/zephyr-7b-alpha.rst +++ b/doc/source/models/builtin/llm/zephyr-7b-alpha.rst @@ -1,8 +1,8 @@ -.. _models_builtin_zephyr_7b_alpha: +.. 
_models_llm_zephyr-7b-alpha: -=============== -Zephyr-7B-alpha -=============== +======================================== +zephyr-7b-alpha +======================================== - **Context Length:** 8192 - **Model Name:** zephyr-7b-alpha @@ -13,8 +13,9 @@ Zephyr-7B-alpha Specifications ^^^^^^^^^^^^^^ -Model Spec (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++ + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 @@ -26,6 +27,3 @@ chosen quantization method from the options listed above:: xinference launch --model-name zephyr-7b-alpha --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/zephyr-7b-beta.rst b/doc/source/models/builtin/llm/zephyr-7b-beta.rst similarity index 66% rename from doc/source/models/builtin/zephyr-7b-beta.rst rename to doc/source/models/builtin/llm/zephyr-7b-beta.rst index 96fd26d558..7410a6ae95 100644 --- a/doc/source/models/builtin/zephyr-7b-beta.rst +++ b/doc/source/models/builtin/llm/zephyr-7b-beta.rst @@ -1,32 +1,29 @@ -.. _models_builtin_zephyr_7b_beta: +.. _models_llm_zephyr-7b-beta: -============== -Zephyr-7B-beta -============== +======================================== +zephyr-7b-beta +======================================== - **Context Length:** 8192 - **Model Name:** zephyr-7b-beta - **Languages:** en - **Abilities:** chat -- **Description:** Zephyr-7B-β is the second model in the series, and is a fine-tuned version of mistralai/Mistral-7B-v0.1. +- **Description:** Zephyr-7B-β is the second model in the series, and is a fine-tuned version of mistralai/Mistral-7B-v0.1. Specifications ^^^^^^^^^^^^^^ -Model Spec (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++ + +Model Spec 1 (pytorch, 7 Billion) +++++++++++++++++++++++++++++++++++++++++ - **Model Format:** pytorch - **Model Size (in billions):** 7 - **Quantizations:** 4-bit, 8-bit, none - **Model ID:** HuggingFaceH4/zephyr-7b-beta -- **Model Revision:** 3bac358730f8806e5c3dc7c7e19eb36e045bf720 Execute the following command to launch the model, remember to replace ``${quantization}`` with your chosen quantization method from the options listed above:: xinference launch --model-name zephyr-7b-beta --size-in-billions 7 --model-format pytorch --quantization ${quantization} -.. note:: - - 4-bit quantization is not supported on macOS. diff --git a/doc/source/models/builtin/qwen-chat.rst b/doc/source/models/builtin/qwen-chat.rst deleted file mode 100644 index 87b3b85161..0000000000 --- a/doc/source/models/builtin/qwen-chat.rst +++ /dev/null @@ -1,103 +0,0 @@ -.. _models_builtin_qwen_chat: - -========= -Qwen Chat -========= - -- **Model Name:** qwen-chat - **Languages:** en, zh -- **Abilities:** embed, chat - -Specifications -^^^^^^^^^^^^^^ - -Model Spec 1 (pytorch, 7 Billion) -+++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 7 -- **Quantizations:** 4-bit, 8-bit, none -- **Model ID:** Qwen/Qwen-7B-Chat - -Execute the following command to launch the model, remember to replace ``${quantization}`` with your -chosen quantization method from the options listed above:: - - xinference launch --model-name qwen-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit and 8-bit quantization are not supported on macOS. 
- -Model Spec 2 (pytorch, 14 Billion) -++++++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 14 -- **Quantizations:** none -- **Model ID:** Qwen/Qwen-14B-Chat - -Execute the following command to launch the model:: - - xinference launch --model-name qwen-chat --size-in-billions 14 --model-format pytorch - -.. note:: - - 4-bit and 8-bit quantization are not supported on macOS. - -Model Spec 3 (ggmlv3, 7 Billion) -++++++++++++++++++++++++++++++++ - -- **Model Format:** ggmlv3 -- **Model Size (in billions):** 7 -- **Quantizations:** q4_0 -- **Model ID:** Xorbits/qwen-chat-7B-ggml - -You need to install ``qwen-cpp`` first: - -.. code-block:: bash - - pip install -U qwen-cpp - - -If you want to use BLAS to accelerate: - -- OpenBLAS: - -.. code-block:: bash - - CMAKE_ARGS="-DGGML_OPENBLAS=ON" pip install -U qwen-cpp - - -- cuBLAS: - -.. code-block:: bash - - CMAKE_ARGS="-DGGML_CUBLAS=ON" pip install -U qwen-cpp - - -- Metal: - -.. code-block:: bash - - CMAKE_ARGS="-DGGML_METAL=ON" pip install -U qwen-cpp - - -Execute the following command to launch the model:: - - xinference launch --model-name qwen-chat --size-in-billions 7 --model-format ggmlv3 - - -Model Spec 4 (ggmlv3, 14 Billion) -+++++++++++++++++++++++++++++++++ - -- **Model Format:** ggmlv3 -- **Model Size (in billions):** 14 -- **Quantizations:** q4_0 -- **Model ID:** Xorbits/qwen-chat-14B-ggml - -Install ``qwen-cpp`` as above. - -Execute the following command to launch the model:: - - xinference launch --model-name qwen-chat --size-in-billions 14 --model-format ggmlv3 - diff --git a/doc/source/models/builtin/bge-reranker-base.rst b/doc/source/models/builtin/rerank/bge-reranker-base.rst similarity index 74% rename from doc/source/models/builtin/bge-reranker-base.rst rename to doc/source/models/builtin/rerank/bge-reranker-base.rst index 48123ba205..2d619e5355 100644 --- a/doc/source/models/builtin/bge-reranker-base.rst +++ b/doc/source/models/builtin/rerank/bge-reranker-base.rst @@ -1,12 +1,12 @@ -.. _models_builtin_bge_rerank_base: +.. _models_builtin_bge-reranker-base: ================= bge-reranker-base ================= - **Model Name:** bge-reranker-base -- **Languages:** [zh, en] -- **Abilities:** rerank +- **Languages:** en, zh +- **Abilities:** rerank Specifications ^^^^^^^^^^^^^^ @@ -15,5 +15,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-reranker-base --model-type rerank - + xinference launch --model-name bge-reranker-base --model-type rerank \ No newline at end of file diff --git a/doc/source/models/builtin/bge-reranker-large.rst b/doc/source/models/builtin/rerank/bge-reranker-large.rst similarity index 74% rename from doc/source/models/builtin/bge-reranker-large.rst rename to doc/source/models/builtin/rerank/bge-reranker-large.rst index ae85d71694..2be39bfa57 100644 --- a/doc/source/models/builtin/bge-reranker-large.rst +++ b/doc/source/models/builtin/rerank/bge-reranker-large.rst @@ -1,12 +1,12 @@ -.. _models_builtin_bge_rerank_large: +.. 
_models_builtin_bge-reranker-large: ================== bge-reranker-large ================== - **Model Name:** bge-reranker-large -- **Languages:** [zh, en] -- **Abilities:** rerank +- **Languages:** en, zh +- **Abilities:** rerank Specifications ^^^^^^^^^^^^^^ @@ -15,5 +15,4 @@ Specifications Execute the following command to launch the model:: - xinference launch --model-name bge-reranker-large --model-type rerank - + xinference launch --model-name bge-reranker-large --model-type rerank \ No newline at end of file diff --git a/doc/source/models/builtin/rerank/index.rst b/doc/source/models/builtin/rerank/index.rst new file mode 100644 index 0000000000..8e6fde00a6 --- /dev/null +++ b/doc/source/models/builtin/rerank/index.rst @@ -0,0 +1,17 @@ +.. _models_rerank_index: + +================ +Rerank Models +================ + +The following is a list of built-in rerank models in Xinference: + + +.. toctree:: + :maxdepth: 1 + + + bge-reranker-base + + bge-reranker-large + \ No newline at end of file diff --git a/doc/source/models/builtin/starcoder.rst b/doc/source/models/builtin/starcoder.rst deleted file mode 100644 index 14bd165c5b..0000000000 --- a/doc/source/models/builtin/starcoder.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. _models_builtin_starcoder: - -========= -StarCoder -========= - -- **Model Name:** starcoder -- **Languages:** en -- **Abilities:** generate - -Specifications -^^^^^^^^^^^^^^ - -Model Spec (ggmlv3, 16 Billion) -+++++++++++++++++++++++++++++++ - -- **Model Format:** ggmlv3 -- **Model Size (in billions):** 16 -- **Quantizations:** q4_0, q4_1, q5_0, q5_1, q8_0 -- **Model ID:** TheBloke/starcoder-GGML diff --git a/doc/source/models/builtin/xverse-chat.rst b/doc/source/models/builtin/xverse-chat.rst deleted file mode 100644 index 2df5b57485..0000000000 --- a/doc/source/models/builtin/xverse-chat.rst +++ /dev/null @@ -1,58 +0,0 @@ -.. _models_builtin_xverse_chat: - -=========== -XVERSE-Chat -=========== - -- **Context Length:** 2048 -- **Model Name:** xverse-chat -- **Languages:** en, zh -- **Abilities:** chat -- **Description:** XVERSE-Chat is the aligned version of model XVERSE for chat-based applications. - -Specifications -^^^^^^^^^^^^^^ - -Model Specs (pytorch, Billions) -+++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 7, 13 -- **Quantizations:** 4-bit, 8-bit, none - -XVERSE-Chat Model Variants -------------------------- - -1. XVERSE-7B-Chat - - **Model ID:** xverse/XVERSE-7B-Chat - - **Model Revision:** 60acc8c453c067b54df88be98bfdf60585ab5441 - -2. XVERSE-13B-Chat - - **Model ID:** xverse/XVERSE-13B-Chat - - **Model Revision:** 1e4944aaa1d8c8d0cdca28bb8e3a003303d0781b - -To launch a specific XVERSE-Chat model, use the following command and replace `${quantization}` with your chosen quantization method: -chosen quantization method from the options listed above and the size:: - - xinference launch --model-name xverse-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. - -Model Details -^^^^^^^^^^^^^ - -- **Version:** 1 -- **Context Length:** 2048 -- **Model Name:** xverse-chat -- **Model Languages:** en, zh -- **Model Abilities:** chat -- **Model Description:** XVERSE-Chat is the aligned version of model XVERSE for chat-based applications. 
- -Prompt Style -^^^^^^^^^^^^ - -- **Style Name:** XVERSE -- **System Prompt:** N/A -- **Roles:** [user, assistant] diff --git a/doc/source/models/builtin/xverse.rst b/doc/source/models/builtin/xverse.rst deleted file mode 100644 index 49db28db67..0000000000 --- a/doc/source/models/builtin/xverse.rst +++ /dev/null @@ -1,56 +0,0 @@ -.. _models_builtin_xverse: - -====== -XVERSE -====== - -- **Context Length:** 2048 -- **Model Name:** xverse -- **Languages:** en, zh -- **Abilities:** generate -- **Description:** XVERSE is a multilingual large language model, independently developed by Shenzhen Yuanxiang Technology. - -Specifications -^^^^^^^^^^^^^^ - -Model Specs (pytorch, Billions) -+++++++++++++++++++++++++++++++ - -- **Model Format:** pytorch -- **Model Size (in billions):** 7 -- **Quantizations:** 4-bit, 8-bit, none - -XVERSE Model Variants ---------------------- - -1. XVERSE-7B - - **Model ID:** xverse/XVERSE-7B - - **Model Revision:** 3778b254def675586e9218ccb15b78d6ef66a3a7 - -2. XVERSE-13B - - **Model ID:** xverse/XVERSE-13B - - **Model Revision:** 11ac840dda17af81046614229fdd0c658afff747 - -3. XVERSE-65B - - **Model ID:** xverse/XVERSE-65B - - **Model Revision:** 7f1b7394f74c630f50612a19ba90bd021c373989 - -To launch a specific XVERSE model, use the following command and replace `${quantization}` with your chosen quantization method: -chosen quantization method from the options listed above and the size:: - - xinference launch --model-name xverse --size-in-billions 7 --model-format pytorch --quantization ${quantization} - -.. note:: - - 4-bit quantization is not supported on macOS. - -Model Details -^^^^^^^^^^^^^ - -- **Version:** 1 -- **Context Length:** 2048 -- **Model Name:** xverse -- **Model Languages:** en, zh -- **Model Abilities:** generate -- **Model Description:** XVERSE is a multilingual large language model, independently developed by Shenzhen Yuanxiang Technology. - diff --git a/doc/templates/embedding.rst.jinja b/doc/templates/embedding.rst.jinja new file mode 100644 index 0000000000..3c93fc7cd9 --- /dev/null +++ b/doc/templates/embedding.rst.jinja @@ -0,0 +1,20 @@ +.. _models_builtin_{{ model_name|lower }}: + +{{ "=" * model_name|length }} +{{ model_name }} +{{ "=" * model_name|length }} + +- **Model Name:** {{ model_name }} +- **Languages:** {{ ', '.join(language) }} +- **Abilities:** embed + +Specifications +^^^^^^^^^^^^^^ + +- **Dimensions:** {{ dimensions }} +- **Max Tokens:** {{ max_tokens }} +- **Model ID:** {{ model_id }} + +Execute the following command to launch the model:: + + xinference launch --model-name {{ model_name }} --model-type embedding \ No newline at end of file diff --git a/doc/templates/embedding_index.rst.jinja b/doc/templates/embedding_index.rst.jinja new file mode 100644 index 0000000000..5bb93511c4 --- /dev/null +++ b/doc/templates/embedding_index.rst.jinja @@ -0,0 +1,15 @@ +.. _models_embedding_index: + +================ +Embedding Models +================ + +The following is a list of built-in embedding models in Xinference: + + +.. toctree:: + :maxdepth: 1 + + {% for model in models %} + {{ model.model_name|lower }} + {% endfor %} \ No newline at end of file diff --git a/doc/templates/llm.rst.jinja b/doc/templates/llm.rst.jinja new file mode 100644 index 0000000000..22c4135339 --- /dev/null +++ b/doc/templates/llm.rst.jinja @@ -0,0 +1,30 @@ +.. 
_models_llm_{{ model_name|lower }}: + +{{ "=" * 40 }} +{{ model_name }} +{{ "=" * 40 }} + +- **Context Length:** {{ context_length }} +- **Model Name:** {{ model_name }} +- **Languages:** {{ model_lang | join(', ') }} +- **Abilities:** {{ model_ability | join(', ') }} +- **Description:** {{ model_description }} + +Specifications +^^^^^^^^^^^^^^ + +{% for spec in model_specs %} +Model Spec {{ loop.index }} ({{ spec.model_format }}, {{ spec.model_size_in_billions }} Billion) +{{ "+" * 40 }} + +- **Model Format:** {{ spec.model_format }} +- **Model Size (in billions):** {{ spec.model_size_in_billions }} +- **Quantizations:** {{ spec.quantizations | join(', ') }} +- **Model ID:** {{ spec.model_id }} + +Execute the following command to launch the model, remember to replace ``${{ '{' }}quantization{{ '}' }}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-name {{ model_name }} --size-in-billions {{ spec.model_size_in_billions }} --model-format {{ spec.model_format }} --quantization ${{ '{' }}quantization{{ '}' }} + +{% endfor %} \ No newline at end of file diff --git a/doc/templates/llm_index.rst.jinja b/doc/templates/llm_index.rst.jinja new file mode 100644 index 0000000000..a58480b32e --- /dev/null +++ b/doc/templates/llm_index.rst.jinja @@ -0,0 +1,15 @@ +.. _models_llm_index: + +===================== +Large Language Models +===================== + +The following is a list of built-in LLMs in Xinference: + + +.. toctree:: + :maxdepth: 3 + + {% for model in models %} + {{ model.model_name|lower }} + {% endfor %} \ No newline at end of file diff --git a/doc/templates/rerank.rst.jinja b/doc/templates/rerank.rst.jinja new file mode 100644 index 0000000000..c898e7f5d5 --- /dev/null +++ b/doc/templates/rerank.rst.jinja @@ -0,0 +1,18 @@ +.. _models_builtin_{{ model_name|lower }}: + +{{ "=" * model_name|length }} +{{ model_name }} +{{ "=" * model_name|length }} + +- **Model Name:** {{ model_name }} +- **Languages:** {{ ', '.join(language) }} +- **Abilities:** rerank + +Specifications +^^^^^^^^^^^^^^ + +- **Model ID:** {{ model_id }} + +Execute the following command to launch the model:: + + xinference launch --model-name {{ model_name }} --model-type rerank \ No newline at end of file diff --git a/doc/templates/rerank_index.rst.jinja b/doc/templates/rerank_index.rst.jinja new file mode 100644 index 0000000000..5d30967b08 --- /dev/null +++ b/doc/templates/rerank_index.rst.jinja @@ -0,0 +1,15 @@ +.. _models_rerank_index: + +================ +Rerank Models +================ + +The following is a list of built-in rerank models in Xinference: + + +.. toctree:: + :maxdepth: 1 + + {% for model in models %} + {{ model.model_name|lower }} + {% endfor %} \ No newline at end of file
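
Reviewer note: a quick way to check these Jinja templates without regenerating every page is to render one of them against a hand-written model spec. The sketch below is a minimal preview harness, not part of the PR itself; the sample dict (``example-llm``, ``example/example-llm-7b``) is hypothetical and only uses the variables that ``llm.rst.jinja`` actually references, and it assumes it is run from the repository root with ``jinja2`` installed.

.. code-block:: python

    # Preview the output of llm.rst.jinja for a single, hand-written model spec.
    # The spec values below are made up for illustration only.
    from jinja2 import Environment, FileSystemLoader

    env = Environment(loader=FileSystemLoader("doc/templates"))

    sample_model = {
        "model_name": "example-llm",
        "context_length": 2048,
        "model_lang": ["en"],
        "model_ability": ["chat"],
        "model_description": "A made-up entry used only to preview the template.",
        "model_specs": [
            {
                "model_format": "pytorch",
                "model_size_in_billions": 7,
                "quantizations": ["4-bit", "8-bit", "none"],
                "model_id": "example/example-llm-7b",
            }
        ],
    }

    # Print the rendered reStructuredText instead of writing it under
    # doc/source/models/builtin/llm/, so the result can be inspected directly.
    print(env.get_template("llm.rst.jinja").render(sample_model))

Rendering a single spec this way makes it easy to eyeball the fixed-width heading underlines and the generated ``xinference launch`` command before the full documentation tree is rebuilt.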