diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 47b960c7ae..4b1ab6ae1e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -20,7 +20,7 @@ repos:
       - id: isort
         args: [--sp, setup.cfg]
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.4.1
+    rev: v1.7.1
    hooks:
       - id: mypy
         additional_dependencies: ["tokenize-rt==3.2.0", "types-requests", "types-tabulate"]
diff --git a/doc/source/models/builtin/llm/qwen-chat.rst b/doc/source/models/builtin/llm/qwen-chat.rst
index 4c27fb5e0f..80856794e2 100644
--- a/doc/source/models/builtin/llm/qwen-chat.rst
+++ b/doc/source/models/builtin/llm/qwen-chat.rst
@@ -42,7 +42,21 @@ chosen quantization method from the options listed above::
 
    xinference launch --model-name qwen-chat --size-in-billions 14 --model-format ggmlv3 --quantization ${quantization}
 
-Model Spec 3 (pytorch, 7 Billion)
+Model Spec 3 (pytorch, 1_8 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 1_8
+- **Quantizations:** 4-bit, 8-bit, none
+- **Model ID:** Qwen/Qwen-1_8B-Chat
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen-chat --size-in-billions 1_8 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 4 (pytorch, 7 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
 - **Model Format:** pytorch
@@ -56,7 +70,7 @@ chosen quantization method from the options listed above::
 
    xinference launch --model-name qwen-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization}
 
-Model Spec 4 (pytorch, 14 Billion)
+Model Spec 5 (pytorch, 14 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
 - **Model Format:** pytorch
@@ -70,7 +84,7 @@ chosen quantization method from the options listed above::
 
    xinference launch --model-name qwen-chat --size-in-billions 14 --model-format pytorch --quantization ${quantization}
 
-Model Spec 5 (pytorch, 72 Billion)
+Model Spec 6 (pytorch, 72 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
 - **Model Format:** pytorch
@@ -84,7 +98,7 @@ chosen quantization method from the options listed above::
 
    xinference launch --model-name qwen-chat --size-in-billions 72 --model-format pytorch --quantization ${quantization}
 
-Model Spec 6 (gptq, 7 Billion)
+Model Spec 7 (gptq, 7 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
 - **Model Format:** gptq
@@ -98,7 +112,7 @@ chosen quantization method from the options listed above::
 
    xinference launch --model-name qwen-chat --size-in-billions 7 --model-format gptq --quantization ${quantization}
 
-Model Spec 7 (gptq, 14 Billion)
+Model Spec 8 (gptq, 14 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
 - **Model Format:** gptq
@@ -112,7 +126,7 @@ chosen quantization method from the options listed above::
 
    xinference launch --model-name qwen-chat --size-in-billions 14 --model-format gptq --quantization ${quantization}
 
-Model Spec 8 (gptq, 72 Billion)
+Model Spec 9 (gptq, 72 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
 - **Model Format:** gptq
diff --git a/doc/source/models/builtin/rerank/bge-reranker-base.rst b/doc/source/models/builtin/rerank/bge-reranker-base.rst
index 2d619e5355..c891b9ff39 100644
--- a/doc/source/models/builtin/rerank/bge-reranker-base.rst
+++ b/doc/source/models/builtin/rerank/bge-reranker-base.rst
@@ -6,7 +6,7 @@ bge-reranker-base
 
 - **Model Name:** bge-reranker-base
 - **Languages:** en, zh
-- **Abilities:** embed
+- **Abilities:** rerank
 
 Specifications
 ^^^^^^^^^^^^^^
diff --git a/doc/source/models/builtin/rerank/bge-reranker-large.rst b/doc/source/models/builtin/rerank/bge-reranker-large.rst
index 2be39bfa57..24da45f963 100644
--- a/doc/source/models/builtin/rerank/bge-reranker-large.rst
+++ b/doc/source/models/builtin/rerank/bge-reranker-large.rst
@@ -6,7 +6,7 @@ bge-reranker-large
 
 - **Model Name:** bge-reranker-large
 - **Languages:** en, zh
-- **Abilities:** embed
+- **Abilities:** rerank
 
 Specifications
 ^^^^^^^^^^^^^^
diff --git a/xinference/model/llm/core.py b/xinference/model/llm/core.py
index a4e337a502..884f986a2b 100644
--- a/xinference/model/llm/core.py
+++ b/xinference/model/llm/core.py
@@ -17,7 +17,7 @@
 import os
 import platform
 from abc import abstractmethod
-from typing import TYPE_CHECKING, List, Optional, Tuple
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
 
 from ...core.utils import parse_replica_model_uid
 from ..core import ModelDescription
@@ -51,6 +51,16 @@ def __init__(
         if kwargs:
             raise ValueError(f"Unrecognized keyword arguments: {kwargs}")
 
+    @staticmethod
+    def handle_model_size(model_size_in_billions: Union[str, int]) -> Union[int, float]:
+        if isinstance(model_size_in_billions, str):
+            if "_" in model_size_in_billions:
+                ms = model_size_in_billions.replace("_", ".")
+                return float(ms)
+            else:
+                raise ValueError("Invalid format for `model_size_in_billions`")
+        return model_size_in_billions
+
     @staticmethod
     def _is_darwin_and_apple_silicon():
         return platform.system() == "Darwin" and platform.processor() == "arm"
diff --git a/xinference/model/llm/ggml/ctransformers.py b/xinference/model/llm/ggml/ctransformers.py
index 43e23431f8..52184767a9 100644
--- a/xinference/model/llm/ggml/ctransformers.py
+++ b/xinference/model/llm/ggml/ctransformers.py
@@ -93,7 +93,9 @@ def __init__(
         self._model_type = None
         closest_size = min(
             SIZE_TO_GPU_LAYERS.keys(),
-            key=lambda x: abs(x - model_spec.model_size_in_billions),
+            key=lambda x: abs(
+                x - self.handle_model_size(model_spec.model_size_in_billions)
+            ),
         )
 
         self._model_family = model_family
diff --git a/xinference/model/llm/ggml/llamacpp.py b/xinference/model/llm/ggml/llamacpp.py
index ebe2e10c29..28448e83e1 100644
--- a/xinference/model/llm/ggml/llamacpp.py
+++ b/xinference/model/llm/ggml/llamacpp.py
@@ -59,7 +59,9 @@ def __init__(
 
         closest_size = min(
             SIZE_TO_GPU_LAYERS.keys(),
-            key=lambda x: abs(x - model_spec.model_size_in_billions),
+            key=lambda x: abs(
+                x - self.handle_model_size(model_spec.model_size_in_billions)
+            ),
         )
         self._gpu_layers = SIZE_TO_GPU_LAYERS[closest_size]
         self._llamacpp_model_config: LlamaCppModelConfig = self._sanitize_model_config(
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 5b8a9003dd..0b4621bd45 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -1116,6 +1116,17 @@
       "model_file_name_template": "qwen14b-ggml-{quantization}.bin",
       "model_revision": "11efca556af372b6f3c730322a4962e9900a2990"
     },
+    {
+      "model_format": "pytorch",
+      "model_size_in_billions": "1_8",
+      "quantizations": [
+        "4-bit",
+        "8-bit",
+        "none"
+      ],
+      "model_id": "Qwen/Qwen-1_8B-Chat",
+      "model_revision": "c3db8007171847931da7efa4b2ed4309afcce021"
+    },
     {
       "model_format": "pytorch",
       "model_size_in_billions": 7,
diff --git a/xinference/model/llm/llm_family.py b/xinference/model/llm/llm_family.py
index 4ec72f4a74..9fcff01180 100644
--- a/xinference/model/llm/llm_family.py
+++ b/xinference/model/llm/llm_family.py
@@ -19,7 +19,7 @@
 from threading import Lock
 from typing import Any, Dict, List, Optional, Tuple, Type, Union
 
-from pydantic import BaseModel, Field, Protocol, ValidationError
+from pydantic import BaseModel, Field, Protocol, ValidationError, validator
 from pydantic.error_wrappers import ErrorWrapper
 from pydantic.parse import load_str_bytes
 from pydantic.types import StrBytes
@@ -45,7 +45,8 @@
 
 class GgmlLLMSpecV1(BaseModel):
     model_format: Literal["ggmlv3", "ggufv2"]
-    model_size_in_billions: int
+    # `str` must come before `int` in the Union, otherwise pydantic coerces "1_8" to 18
+    model_size_in_billions: Union[str, int]
     quantizations: List[str]
     model_id: str
     model_file_name_template: str
@@ -53,16 +54,39 @@
     model_uri: Optional[str]
     model_revision: Optional[str]
 
+    @validator("model_size_in_billions", pre=False)
+    def validate_model_size_with_radix(cls, v: object) -> object:
+        if isinstance(v, str):
+            if (
+                "_" in v
+            ):  # e.g. "1_8" is kept as "1_8"; note that int("1_8") would yield 18
+                return v
+            else:
+                return int(v)
+        return v
+
 
 class PytorchLLMSpecV1(BaseModel):
     model_format: Literal["pytorch", "gptq"]
-    model_size_in_billions: int
+    # `str` must come before `int` in the Union, otherwise pydantic coerces "1_8" to 18
+    model_size_in_billions: Union[str, int]
     quantizations: List[str]
     model_id: str
     model_hub: str = "huggingface"
     model_uri: Optional[str]
     model_revision: Optional[str]
+
+    @validator("model_size_in_billions", pre=False)
+    def validate_model_size_with_radix(cls, v: object) -> object:
+        if isinstance(v, str):
+            if (
+                "_" in v
+            ):  # e.g. "1_8" is kept as "1_8"; note that int("1_8") would yield 18
+                return v
+            else:
+                return int(v)
+        return v
 
 
 class PromptStyleV1(BaseModel):
     style_name: str
@@ -152,7 +176,7 @@ def download_from_self_hosted_storage() -> bool:
 def get_legacy_cache_path(
     model_name: str,
     model_format: str,
-    model_size_in_billions: Optional[int] = None,
+    model_size_in_billions: Optional[Union[str, int]] = None,
     quantization: Optional[str] = None,
 ) -> str:
     full_name = f"{model_name}-{model_format}-{model_size_in_billions}b-{quantization}"
diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index fd0d3f59ac..9aaab67039 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -1366,6 +1366,18 @@
       "model_file_name_template": "qwen14b-ggml-{quantization}.bin",
       "model_revision": "v0.0.2"
     },
+    {
+      "model_format": "pytorch",
+      "model_size_in_billions": "1_8",
+      "quantizations": [
+        "4-bit",
+        "8-bit",
+        "none"
+      ],
+      "model_hub": "modelscope",
+      "model_id": "qwen/Qwen-1_8B-Chat",
+      "model_revision": "v1.0.0"
+    },
     {
       "model_format": "pytorch",
       "model_size_in_billions": 7,
diff --git a/xinference/web/ui/src/scenes/launch_model/modelCard.js b/xinference/web/ui/src/scenes/launch_model/modelCard.js
index bd2bfe7d4a..7a86277635 100644
--- a/xinference/web/ui/src/scenes/launch_model/modelCard.js
+++ b/xinference/web/ui/src/scenes/launch_model/modelCard.js
@@ -91,7 +91,8 @@ const ModelCard = ({ url, modelData, gpuAvailable, is_custom = false }) => {
       .filter(
         (spec) =>
           spec.model_format === modelFormat &&
-          spec.model_size_in_billions === parseFloat(modelSize)
+          spec.model_size_in_billions ===
+            (modelSize.includes('_') ? modelSize : parseFloat(modelSize))
       )
       .flatMap((spec) => spec.quantizations)
     ),
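
Note for reviewers: a minimal standalone sketch (not part of the patch; `SpecSketch` and `normalize` are hypothetical names) of how the new `Union[str, int]` field and `validator` are expected to behave under pydantic v1 semantics:

    from typing import Union

    from pydantic import BaseModel, validator


    class SpecSketch(BaseModel):
        # `str` first: pydantic v1 tries Union members left to right, so the
        # string "1_8" validates as str instead of being coerced via int() to 18.
        model_size_in_billions: Union[str, int]

        @validator("model_size_in_billions", pre=False)
        def normalize(cls, v):
            # Keep radix-style sizes ("1_8") as strings; turn plain numeric
            # strings back into ints ("14" -> 14; an int 14 coerced to "14"
            # by the str member also round-trips back to 14).
            if isinstance(v, str):
                return v if "_" in v else int(v)
            return v


    assert SpecSketch(model_size_in_billions="1_8").model_size_in_billions == "1_8"
    assert SpecSketch(model_size_in_billions=14).model_size_in_billions == 14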
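
The `handle_model_size` helper exists because the GGML backends compute a numeric distance against the `SIZE_TO_GPU_LAYERS` keys, and a raw "1_8" string would make `abs(x - ...)` raise a TypeError. A rough sketch of the intended behavior, assuming an illustrative layer table (the values below are placeholders, not the real mapping):

    from typing import Union


    def handle_model_size(model_size_in_billions: Union[str, int]) -> Union[int, float]:
        # Mirrors LLM.handle_model_size in this patch: "1_8" -> 1.8, ints pass through.
        if isinstance(model_size_in_billions, str):
            if "_" in model_size_in_billions:
                return float(model_size_in_billions.replace("_", "."))
            raise ValueError("Invalid format for `model_size_in_billions`")
        return model_size_in_billions


    SIZE_TO_GPU_LAYERS = {4: 20, 7: 32, 13: 40, 70: 80}  # illustrative only

    size = handle_model_size("1_8")  # 1.8
    closest = min(SIZE_TO_GPU_LAYERS, key=lambda x: abs(x - size))
    print(SIZE_TO_GPU_LAYERS[closest])  # 20, since 4 is the closest key to 1.8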