diff --git a/README.md b/README.md
index e704be542f..22971ea84f 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,7 @@ potential of cutting-edge AI models.
 - Speculative decoding: [#509](https://github.com/xorbitsai/inference/pull/509)
 - Incorporate vLLM: [#445](https://github.com/xorbitsai/inference/pull/445)
 ### New Models
+- Built-in support for [phi-2](https://huggingface.co/microsoft/phi-2): [#828](https://github.com/xorbitsai/inference/pull/828)
 - Built-in support for [mistral-instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2): [#796](https://github.com/xorbitsai/inference/pull/796)
 - Built-in support for [deepseek-llm](https://huggingface.co/deepseek-ai) and [deepseek-coder](https://huggingface.co/deepseek-ai): [#786](https://github.com/xorbitsai/inference/pull/786)
 - Built-in support for [Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1): [#782](https://github.com/xorbitsai/inference/pull/782)
diff --git a/README_zh_CN.md b/README_zh_CN.md
index 4e18c320ac..11bd07e1c6 100644
--- a/README_zh_CN.md
+++ b/README_zh_CN.md
@@ -30,6 +30,7 @@ Xorbits Inference(Xinference)是一个性能强大且功能全面的分布
 - 投机采样: [#509](https://github.com/xorbitsai/inference/pull/509)
 - 引入 vLLM: [#445](https://github.com/xorbitsai/inference/pull/445)
 ### 新模型
+- 内置 [phi-2](https://huggingface.co/microsoft/phi-2): [#828](https://github.com/xorbitsai/inference/pull/828)
 - 内置 [mistral-instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2): [#796](https://github.com/xorbitsai/inference/pull/796)
 - 内置 [deepseek-llm](https://huggingface.co/deepseek-ai) 与 [deepseek-coder](https://huggingface.co/deepseek-ai): [#786](https://github.com/xorbitsai/inference/pull/786)
 - 内置 [Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1): [#782](https://github.com/xorbitsai/inference/pull/782)
diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst
index 89913618c4..21b57b8c86 100644
--- a/doc/source/models/builtin/llm/index.rst
+++ b/doc/source/models/builtin/llm/index.rst
@@ -79,6 +79,8 @@ The following is a list of built-in LLM in Xinference:
 
    orca
 
+   phi-2
+
    qwen-chat
 
    skywork
diff --git a/doc/source/models/builtin/llm/phi-2.rst b/doc/source/models/builtin/llm/phi-2.rst
new file mode 100644
index 0000000000..080071ab52
--- /dev/null
+++ b/doc/source/models/builtin/llm/phi-2.rst
@@ -0,0 +1,43 @@
+.. _models_llm_phi-2:
+
+========================================
+phi-2
+========================================
+
+- **Context Length:** 2048
+- **Model Name:** phi-2
+- **Languages:** en
+- **Abilities:** generate
+- **Description:** Phi-2 is a 2.7B Transformer based LLM used for research on model safety, trained with data similar to Phi-1.5 but augmented with synthetic texts and curated websites.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (ggufv2, 2 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 2
+- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_0, Q4_K_S, Q4_K_M, Q5_0, Q5_K_S, Q5_K_M, Q6_K, Q8_0
+- **Model ID:** TheBloke/phi-2-GGUF
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name phi-2 --size-in-billions 2 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 2 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 2
+- **Quantizations:** 4-bit, 8-bit, none
+- **Model ID:** microsoft/phi-2
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name phi-2 --size-in-billions 2 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/skywork-math.rst b/doc/source/models/builtin/llm/skywork-math.rst
index b71b11f5d4..307be888fb 100644
--- a/doc/source/models/builtin/llm/skywork-math.rst
+++ b/doc/source/models/builtin/llm/skywork-math.rst
@@ -26,3 +26,4 @@ Execute the following command to launch the model, remember to replace ``${quant
 chosen quantization method from the options listed above::
 
    xinference launch --model-name Skywork-Math --size-in-billions 13 --model-format pytorch --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/skywork.rst b/doc/source/models/builtin/llm/skywork.rst
index 442b32ae43..51d52d7e0a 100644
--- a/doc/source/models/builtin/llm/skywork.rst
+++ b/doc/source/models/builtin/llm/skywork.rst
@@ -26,3 +26,4 @@ Execute the following command to launch the model, remember to replace ``${quant
 chosen quantization method from the options listed above::
 
    xinference launch --model-name Skywork --size-in-billions 13 --model-format pytorch --quantization ${quantization}
+
diff --git a/setup.cfg b/setup.cfg
index f8d8f506bd..a1a4f2b02c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -73,7 +73,7 @@ dev =
 all =
     chatglm-cpp>=0.3.0
     ctransformers
-    llama-cpp-python>=0.2.23
+    llama-cpp-python>=0.2.25
     transformers>=4.34.1
     torch
     accelerate>=0.20.3
@@ -91,7 +91,7 @@ all =
     auto-gptq ; sys_platform!='darwin'
     optimum
 ggml =
-    llama-cpp-python>=0.2.23
+    llama-cpp-python>=0.2.25
     ctransformers
     chatglm-cpp>=0.3.0
 transformers =
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 0a1d9c6541..05a081f4a7 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -350,6 +350,51 @@
       "intra_message_sep": "\n\n### "
     }
   },
+  {
+    "version": 1,
+    "context_length": 2048,
+    "model_name": "phi-2",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "Phi-2 is a 2.7B Transformer based LLM used for research on model safety, trained with data similar to Phi-1.5 but augmented with synthetic texts and curated websites.",
+    "model_specs": [
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_S",
+          "Q3_K_M",
+          "Q3_K_L",
+          "Q4_0",
+          "Q4_K_S",
+          "Q4_K_M",
+          "Q5_0",
+          "Q5_K_S",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_id": "TheBloke/phi-2-GGUF",
+        "model_file_name_template": "phi-2.{quantization}.gguf"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "microsoft/phi-2",
+        "model_revision": "d3186761bf5c4409f7679359284066c25ab668ee"
+      }
+    ]
+  },
   {
     "version": 1,
     "context_length": 2048,