diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 47b960c7ae..4b1ab6ae1e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -20,7 +20,7 @@ repos:
       - id: isort
         args: [--sp, setup.cfg]
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.4.1
+    rev: v1.7.1
    hooks:
       - id: mypy
         additional_dependencies: ["tokenize-rt==3.2.0", "types-requests", "types-tabulate"]
diff --git a/doc/source/models/builtin/llm/qwen-chat.rst b/doc/source/models/builtin/llm/qwen-chat.rst
index 4c27fb5e0f..80856794e2 100644
--- a/doc/source/models/builtin/llm/qwen-chat.rst
+++ b/doc/source/models/builtin/llm/qwen-chat.rst
@@ -42,7 +42,21 @@ chosen quantization method from the options listed above::
 
    xinference launch --model-name qwen-chat --size-in-billions 14 --model-format ggmlv3 --quantization ${quantization}
 
-Model Spec 3 (pytorch, 7 Billion)
+Model Spec 3 (pytorch, 1_8 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 1_8
+- **Quantizations:** 4-bit, 8-bit, none
+- **Model ID:** Qwen/Qwen-1_8B-Chat
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name qwen-chat --size-in-billions 1_8 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 4 (pytorch, 7 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
 - **Model Format:** pytorch
@@ -56,7 +70,7 @@ chosen quantization method from the options listed above::
 
    xinference launch --model-name qwen-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization}
 
-Model Spec 4 (pytorch, 14 Billion)
+Model Spec 5 (pytorch, 14 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
 - **Model Format:** pytorch
@@ -70,7 +84,7 @@ chosen quantization method from the options listed above::
 
    xinference launch --model-name qwen-chat --size-in-billions 14 --model-format pytorch --quantization ${quantization}
 
-Model Spec 5 (pytorch, 72 Billion)
+Model Spec 6 (pytorch, 72 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
 - **Model Format:** pytorch
@@ -84,7 +98,7 @@ chosen quantization method from the options listed above::
 
    xinference launch --model-name qwen-chat --size-in-billions 72 --model-format pytorch --quantization ${quantization}
 
-Model Spec 6 (gptq, 7 Billion)
+Model Spec 7 (gptq, 7 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
 - **Model Format:** gptq
@@ -98,7 +112,7 @@ chosen quantization method from the options listed above::
 
    xinference launch --model-name qwen-chat --size-in-billions 7 --model-format gptq --quantization ${quantization}
 
-Model Spec 7 (gptq, 14 Billion)
+Model Spec 8 (gptq, 14 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
 - **Model Format:** gptq
@@ -112,7 +126,7 @@ chosen quantization method from the options listed above::
 
    xinference launch --model-name qwen-chat --size-in-billions 14 --model-format gptq --quantization ${quantization}
 
-Model Spec 8 (gptq, 72 Billion)
+Model Spec 9 (gptq, 72 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
 - **Model Format:** gptq
diff --git a/doc/source/models/builtin/rerank/bge-reranker-base.rst b/doc/source/models/builtin/rerank/bge-reranker-base.rst
index 2d619e5355..c891b9ff39 100644
--- a/doc/source/models/builtin/rerank/bge-reranker-base.rst
+++ b/doc/source/models/builtin/rerank/bge-reranker-base.rst
@@ -6,7 +6,7 @@ bge-reranker-base
 
 - **Model Name:** bge-reranker-base
 - **Languages:** en, zh
-- **Abilities:** embed
+- **Abilities:** rerank
 
 Specifications
 ^^^^^^^^^^^^^^
diff --git a/doc/source/models/builtin/rerank/bge-reranker-large.rst b/doc/source/models/builtin/rerank/bge-reranker-large.rst
index 2be39bfa57..24da45f963 100644
--- a/doc/source/models/builtin/rerank/bge-reranker-large.rst
+++ b/doc/source/models/builtin/rerank/bge-reranker-large.rst
@@ -6,7 +6,7 @@ bge-reranker-large
 
 - **Model Name:** bge-reranker-large
 - **Languages:** en, zh
-- **Abilities:** embed
+- **Abilities:** rerank
 
 Specifications
 ^^^^^^^^^^^^^^
diff --git a/xinference/model/llm/core.py b/xinference/model/llm/core.py
index a4e337a502..884f986a2b 100644
--- a/xinference/model/llm/core.py
+++ b/xinference/model/llm/core.py
@@ -17,7 +17,7 @@
 import os
 import platform
 from abc import abstractmethod
-from typing import TYPE_CHECKING, List, Optional, Tuple
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
 
 from ...core.utils import parse_replica_model_uid
 from ..core import ModelDescription
@@ -51,6 +51,16 @@ def __init__(
         if kwargs:
             raise ValueError(f"Unrecognized keyword arguments: {kwargs}")
 
+    @staticmethod
+    def handle_model_size(model_size_in_billions: Union[str, int]) -> Union[int, float]:
+        if isinstance(model_size_in_billions, str):
+            if "_" in model_size_in_billions:
+                ms = model_size_in_billions.replace("_", ".")
+                return float(ms)
+            else:
+                raise ValueError("Invalid format for `model_size_in_billions`")
+        return model_size_in_billions
+
     @staticmethod
     def _is_darwin_and_apple_silicon():
         return platform.system() == "Darwin" and platform.processor() == "arm"
diff --git a/xinference/model/llm/ggml/ctransformers.py b/xinference/model/llm/ggml/ctransformers.py
index 43e23431f8..52184767a9 100644
--- a/xinference/model/llm/ggml/ctransformers.py
+++ b/xinference/model/llm/ggml/ctransformers.py
@@ -93,7 +93,9 @@ def __init__(
         self._model_type = None
         closest_size = min(
             SIZE_TO_GPU_LAYERS.keys(),
-            key=lambda x: abs(x - model_spec.model_size_in_billions),
+            key=lambda x: abs(
+                x - self.handle_model_size(model_spec.model_size_in_billions)
+            ),
         )
 
         self._model_family = model_family
diff --git a/xinference/model/llm/ggml/llamacpp.py b/xinference/model/llm/ggml/llamacpp.py
index ebe2e10c29..28448e83e1 100644
--- a/xinference/model/llm/ggml/llamacpp.py
+++ b/xinference/model/llm/ggml/llamacpp.py
@@ -59,7 +59,9 @@ def __init__(
 
         closest_size = min(
             SIZE_TO_GPU_LAYERS.keys(),
-            key=lambda x: abs(x - model_spec.model_size_in_billions),
+            key=lambda x: abs(
+                x - self.handle_model_size(model_spec.model_size_in_billions)
+            ),
         )
         self._gpu_layers = SIZE_TO_GPU_LAYERS[closest_size]
         self._llamacpp_model_config: LlamaCppModelConfig = self._sanitize_model_config(
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 5b8a9003dd..0b4621bd45 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -1116,6 +1116,17 @@
       "model_file_name_template": "qwen14b-ggml-{quantization}.bin",
       "model_revision": "11efca556af372b6f3c730322a4962e9900a2990"
     },
+    {
+      "model_format": "pytorch",
+      "model_size_in_billions": "1_8",
+      "quantizations": [
+        "4-bit",
+        "8-bit",
+        "none"
+      ],
+      "model_id": "Qwen/Qwen-1_8B-Chat",
+      "model_revision": "c3db8007171847931da7efa4b2ed4309afcce021"
+    },
     {
       "model_format": "pytorch",
       "model_size_in_billions": 7,
diff --git a/xinference/model/llm/llm_family.py b/xinference/model/llm/llm_family.py
index 4ec72f4a74..9fcff01180 100644
--- a/xinference/model/llm/llm_family.py
+++ b/xinference/model/llm/llm_family.py
@@ -19,7 +19,7 @@
 from threading import Lock
 from typing import Any, Dict, List, Optional, Tuple, Type, Union
 
-from pydantic import BaseModel, Field, Protocol, ValidationError
+from pydantic import BaseModel, Field, Protocol, ValidationError, validator
 from pydantic.error_wrappers import ErrorWrapper
 from pydantic.parse import load_str_bytes
 from pydantic.types import StrBytes
@@ -45,7 +45,8 @@
 
 class GgmlLLMSpecV1(BaseModel):
     model_format: Literal["ggmlv3", "ggufv2"]
-    model_size_in_billions: int
+    # `str` must come before `int` in the Union, otherwise pydantic coerces "1_8" to 18
+    model_size_in_billions: Union[str, int]
     quantizations: List[str]
     model_id: str
     model_file_name_template: str
@@ -53,16 +54,39 @@
     model_uri: Optional[str]
     model_revision: Optional[str]
 
+    @validator("model_size_in_billions", pre=False)
+    def validate_model_size_with_radix(cls, v: object) -> object:
+        if isinstance(v, str):
+            if (
+                "_" in v
+            ):  # e.g. "1_8" is kept as "1_8"; note that int("1_8") would yield 18
+                return v
+            else:
+                return int(v)
+        return v
+
 
 class PytorchLLMSpecV1(BaseModel):
     model_format: Literal["pytorch", "gptq"]
-    model_size_in_billions: int
+    # `str` must come before `int` in the Union, otherwise pydantic coerces "1_8" to 18
+    model_size_in_billions: Union[str, int]
     quantizations: List[str]
     model_id: str
     model_hub: str = "huggingface"
     model_uri: Optional[str]
     model_revision: Optional[str]
+
+    @validator("model_size_in_billions", pre=False)
+    def validate_model_size_with_radix(cls, v: object) -> object:
+        if isinstance(v, str):
+            if (
+                "_" in v
+            ):  # e.g. "1_8" is kept as "1_8"; note that int("1_8") would yield 18
+                return v
+            else:
+                return int(v)
+        return v
 
 
 class PromptStyleV1(BaseModel):
     style_name: str
@@ -152,7 +176,7 @@ def download_from_self_hosted_storage() -> bool:
 def get_legacy_cache_path(
     model_name: str,
     model_format: str,
-    model_size_in_billions: Optional[int] = None,
+    model_size_in_billions: Optional[Union[str, int]] = None,
     quantization: Optional[str] = None,
 ) -> str:
     full_name = f"{model_name}-{model_format}-{model_size_in_billions}b-{quantization}"
diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index fd0d3f59ac..9aaab67039 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -1366,6 +1366,18 @@
       "model_file_name_template": "qwen14b-ggml-{quantization}.bin",
       "model_revision": "v0.0.2"
     },
+    {
+      "model_format": "pytorch",
+      "model_size_in_billions": "1_8",
+      "quantizations": [
+        "4-bit",
+        "8-bit",
+        "none"
+      ],
+      "model_hub": "modelscope",
+      "model_id": "qwen/Qwen-1_8B-Chat",
+      "model_revision": "v1.0.0"
+    },
     {
       "model_format": "pytorch",
       "model_size_in_billions": 7,
diff --git a/xinference/web/ui/src/scenes/launch_model/modelCard.js b/xinference/web/ui/src/scenes/launch_model/modelCard.js
index bd2bfe7d4a..7a86277635 100644
--- a/xinference/web/ui/src/scenes/launch_model/modelCard.js
+++ b/xinference/web/ui/src/scenes/launch_model/modelCard.js
@@ -91,7 +91,8 @@ const ModelCard = ({ url, modelData, gpuAvailable, is_custom = false }) => {
       .filter(
         (spec) =>
           spec.model_format === modelFormat &&
-          spec.model_size_in_billions === parseFloat(modelSize)
+          spec.model_size_in_billions ===
+            (modelSize.includes('_') ? modelSize : parseFloat(modelSize))
       )
       .flatMap((spec) => spec.quantizations)
     ),
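
Note for reviewers: a minimal standalone sketch (not part of the patch; `SpecSketch` and `normalize` are hypothetical names) of how the new `Union[str, int]` field and `validator` are expected to behave under pydantic v1 semantics:

    from typing import Union

    from pydantic import BaseModel, validator


    class SpecSketch(BaseModel):
        # `str` first: pydantic v1 tries Union members left to right, so the
        # string "1_8" validates as str instead of being coerced via int() to 18.
        model_size_in_billions: Union[str, int]

        @validator("model_size_in_billions", pre=False)
        def normalize(cls, v):
            # Keep radix-style sizes ("1_8") as strings; turn plain numeric
            # strings back into ints ("14" -> 14; an int 14 coerced to "14"
            # by the str member also round-trips back to 14).
            if isinstance(v, str):
                return v if "_" in v else int(v)
            return v


    assert SpecSketch(model_size_in_billions="1_8").model_size_in_billions == "1_8"
    assert SpecSketch(model_size_in_billions=14).model_size_in_billions == 14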
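
The `handle_model_size` helper exists because the GGML backends compute a numeric distance against the `SIZE_TO_GPU_LAYERS` keys, and a raw "1_8" string would make `abs(x - ...)` raise a TypeError. A rough sketch of the intended behavior, assuming an illustrative layer table (the values below are placeholders, not the real mapping):

    from typing import Union


    def handle_model_size(model_size_in_billions: Union[str, int]) -> Union[int, float]:
        # Mirrors LLM.handle_model_size in this patch: "1_8" -> 1.8, ints pass through.
        if isinstance(model_size_in_billions, str):
            if "_" in model_size_in_billions:
                return float(model_size_in_billions.replace("_", "."))
            raise ValueError("Invalid format for `model_size_in_billions`")
        return model_size_in_billions


    SIZE_TO_GPU_LAYERS = {4: 20, 7: 32, 13: 40, 70: 80}  # illustrative only

    size = handle_model_size("1_8")  # 1.8
    closest = min(SIZE_TO_GPU_LAYERS, key=lambda x: abs(x - size))
    print(SIZE_TO_GPU_LAYERS[closest])  # 20, since 4 is the closest key to 1.8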