diff --git a/doc/source/models/builtin/llm/cogvlm2.rst b/doc/source/models/builtin/llm/cogvlm2.rst new file mode 100644 index 0000000000..e907e1d157 --- /dev/null +++ b/doc/source/models/builtin/llm/cogvlm2.rst @@ -0,0 +1,47 @@ +.. _models_llm_cogvlm2: + +======================================== +cogvlm2 +======================================== + +- **Context Length:** 8192 +- **Model Name:** cogvlm2 +- **Languages:** en, zh +- **Abilities:** chat, vision +- **Description:** CogVLM2 have achieved good results in many lists compared to the previous generation of CogVLM open source models. Its excellent performance can compete with some non-open source models. + +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 20 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 20 +- **Quantizations:** none +- **Engines**: Transformers +- **Model ID:** THUDM/cogvlm2-llama3-chinese-chat-19B +- **Model Hubs**: `Hugging Face <https://huggingface.co/THUDM/cogvlm2-llama3-chinese-chat-19B>`__, `ModelScope <https://modelscope.cn/models/ZhipuAI/cogvlm2-llama3-chinese-chat-19B>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name cogvlm2 --size-in-billions 20 --model-format pytorch --quantization ${quantization} + + +Model Spec 2 (pytorch, 20 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 20 +- **Quantizations:** int4 +- **Engines**: Transformers +- **Model ID:** THUDM/cogvlm2-llama3-chinese-chat-19B-{quantization} +- **Model Hubs**: `Hugging Face <https://huggingface.co/THUDM/cogvlm2-llama3-chinese-chat-19B-{quantization}>`__, `ModelScope <https://modelscope.cn/models/ZhipuAI/cogvlm2-llama3-chinese-chat-19B-{quantization}>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name cogvlm2 --size-in-billions 20 --model-format pytorch --quantization ${quantization} + diff --git 
a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst index b14e7b4609..ce9c4b01c7 100644 --- a/doc/source/models/builtin/llm/index.rst +++ b/doc/source/models/builtin/llm/index.rst @@ -126,6 +126,11 @@ The following is a list of built-in LLM in Xinference: - 8194 - CodeShell is a multi-language code LLM developed by the Knowledge Computing Lab of Peking University. + * - :ref:`cogvlm2 ` + - chat, vision + - 8192 + - CogVLM2 have achieved good results in many lists compared to the previous generation of CogVLM open source models. Its excellent performance can compete with some non-open source models. + * - :ref:`deepseek ` - generate - 4096 @@ -236,11 +241,6 @@ The following is a list of built-in LLM in Xinference: - 8192 - The Llama 3 instruction tuned models are optimized for dialogue use cases and outperform many of the available open source chat models on common industry benchmarks.. - * - :ref:`mini-internvl-chat ` - - chat, vision - - 32768 - - InternVL 1.5 is an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding. 
- * - :ref:`minicpm-2b-dpo-bf16 ` - chat - 4096 @@ -550,6 +550,8 @@ The following is a list of built-in LLM in Xinference: codeshell-chat + cogvlm2 + deepseek deepseek-chat @@ -594,8 +596,6 @@ The following is a list of built-in LLM in Xinference: llama-3-instruct - mini-internvl-chat - minicpm-2b-dpo-bf16 minicpm-2b-dpo-fp16 diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py index 196d7fd686..d3674b3795 100644 --- a/xinference/model/llm/__init__.py +++ b/xinference/model/llm/__init__.py @@ -113,6 +113,7 @@ def _install(): from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel from .pytorch.baichuan import BaichuanPytorchChatModel from .pytorch.chatglm import ChatglmPytorchChatModel + from .pytorch.cogvlm2 import CogVLM2Model from .pytorch.core import PytorchChatModel, PytorchModel from .pytorch.deepseek_vl import DeepSeekVLChatModel from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel @@ -159,6 +160,7 @@ def _install(): DeepSeekVLChatModel, InternVLChatModel, PytorchModel, + CogVLM2Model, ] ) if OmniLMMModel: # type: ignore diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 17380549e2..4d853dd220 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -6247,5 +6247,57 @@ "<|im_end|>" ] } +}, + { + "version": 1, + "context_length": 8192, + "model_name": "cogvlm2", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "vision" + ], + "model_description": "CogVLM2 have achieved good results in many lists compared to the previous generation of CogVLM open source models. 
Its excellent performance can compete with some non-open source models.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 20, + "quantizations": [ + "none" + ], + "model_id": "THUDM/cogvlm2-llama3-chinese-chat-19B", + "model_revision": "d88b352bce5ee58a289b1ac8328553eb31efa2ef" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 20, + "quantizations": [ + "int4" + ], + "model_id": "THUDM/cogvlm2-llama3-chinese-chat-19B-{quantization}", + "model_revision": "7863e362174f4718c2fe9cba4befd0b580a3194f" + } + ], + "prompt_style": { + "style_name": "LLAMA3", + "system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.", + "roles": [ + "user", + "assistant" + ], + "intra_message_sep": "\n\n", + "inter_message_sep": "<|eot_id|>", + "stop_token_ids": [ + 128001, + 128009 + ], + "stop": [ + "<|end_of_text|>", + "<|eot_id|>" + ] + } } ] diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index 88ce61e5e3..97b63a2254 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -3860,5 +3860,60 @@ "<|im_end|>" ] } +}, + { + "version": 1, + "context_length": 8192, + "model_name": "cogvlm2", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "vision" + ], + "model_description": "CogVLM2 have achieved good results in many lists compared to the previous generation of CogVLM open source models. 
Its excellent performance can compete with some non-open source models.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 20, + "quantizations": [ + "none" + ], + "model_hub": "modelscope", + + "model_id": "ZhipuAI/cogvlm2-llama3-chinese-chat-19B", + "model_revision": "master" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 20, + "quantizations": [ + "int4" + ], + "model_hub": "modelscope", + "model_id": "ZhipuAI/cogvlm2-llama3-chinese-chat-19B-{quantization}", + "model_revision": "master" + } + ], + "prompt_style": { + "style_name": "LLAMA3", + "system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.", + "roles": [ + "user", + "assistant" + ], + "intra_message_sep": "\n\n", + "inter_message_sep": "<|eot_id|>", + "stop_token_ids": [ + 128001, + 128009 + ], + "stop": [ + "<|end_of_text|>", + "<|eot_id|>" + ] + } } ] diff --git a/xinference/model/llm/pytorch/cogvlm2.py b/xinference/model/llm/pytorch/cogvlm2.py new file mode 100644 index 0000000000..c3cc31b23a --- /dev/null +++ b/xinference/model/llm/pytorch/cogvlm2.py @@ -0,0 +1,257 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class CogVLM2Model(PytorchChatModel):
    """Chat model backed by a CogVLM2 vision-language checkpoint.

    Messages follow the OpenAI chat format: ``content`` is either a plain
    string or a list of ``{"type": "text", ...}`` / ``{"type": "image_url",
    ...}`` parts. At most one image is forwarded to the model per request;
    an image in the current prompt takes precedence over one found in the
    chat history. Streaming responses are not supported.
    """

    # Timeout (seconds) handed to ``requests.get`` when fetching a remote
    # image, so an unresponsive host cannot block a chat request forever.
    _IMAGE_REQUEST_TIMEOUT = 30

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # All of these are populated by ``load()``.
        self._torch_type = None
        self._device = None
        self._tokenizer = None
        self._model = None

    @classmethod
    def match(
        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
    ) -> bool:
        """Return True when the family (or, failing that, model) name contains "cogvlm"."""
        family = model_family.model_family or model_family.model_name
        return "cogvlm" in family.lower()

    def load(self, **kwargs):
        """Load tokenizer, model weights and generation config from ``self.model_path``."""
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from transformers.generation import GenerationConfig

        device = self._pytorch_model_config.get("device", "auto")
        self._device = select_device(device)
        # bfloat16 is only selected on CUDA devices reporting compute
        # capability >= 8; everything else falls back to float16.
        self._torch_type = (
            torch.bfloat16
            if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
            else torch.float16
        )

        self._tokenizer = AutoTokenizer.from_pretrained(
            self.model_path,
            trust_remote_code=True,
        )

        self._model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            torch_dtype=self._torch_type,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            device_map="auto",
        ).eval()

        # Specify hyperparameters for generation
        self._model.generation_config = GenerationConfig.from_pretrained(
            self.model_path,
            trust_remote_code=True,
        )

    @classmethod
    def _load_image(cls, url: str):
        """Load the image referenced by ``url`` and return it as an RGB PIL image.

        ``url`` may be a base64 ``data:`` URL, a remote URL fetched with
        ``requests``, or (when ``requests`` reports a missing schema) a local
        file path.

        Raises:
            ValueError: if a ``data:`` URL has no ``,``-separated payload.
            requests.HTTPError: if the remote server answers with an error status.
        """
        if url.startswith("data:"):
            logger.info("Parse url by base64 decoder.")
            # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
            # e.g. f"data:image/jpeg;base64,{base64_image}"
            # Split on the first comma so that extra ";param=value" segments
            # in the header do not break parsing.
            _header, sep, payload = url.partition(",")
            if not sep:
                raise ValueError("Malformed data URL: missing ',' before payload.")
            return Image.open(BytesIO(base64.b64decode(payload))).convert("RGB")

        try:
            response = requests.get(url, timeout=cls._IMAGE_REQUEST_TIMEOUT)
        except requests.exceptions.MissingSchema:
            # NOTE(review): a schema-less value is opened as a local file path,
            # so request content can name files on the serving host — confirm
            # this is intended for untrusted callers.
            return Image.open(url).convert("RGB")
        # Fail with a clear HTTP error instead of an opaque image-decoding error.
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert("RGB")

    def _message_content_to_cogvlm2(self, content):
        """Split one message's content into ``(text, images)``.

        Args:
            content: a plain string, or a list of OpenAI-style content parts.

        Returns:
            ``(text, None)`` when there is no image, otherwise
            ``(text, [image])`` with a single RGB PIL image. Text parts are
            joined with a single space.

        Raises:
            RuntimeError: if the message carries more than one image.
        """
        if isinstance(content, str):
            return content, None

        texts = []
        image_urls = []
        for part in content:
            part_type = part.get("type")
            if part_type == "text":
                texts.append(part["text"])
            elif part_type == "image_url":
                image_urls.append(part["image_url"]["url"])

        # Reject unsupported input before spending time downloading images.
        if len(image_urls) > 1:
            raise RuntimeError("Only one image per message is supported by CogVLM2.")

        text = " ".join(texts)
        if not image_urls:
            return text, None
        return text, [self._load_image(image_urls[0])]

    def _history_content_to_cogvlm2(
        self, system_prompt: str, chat_history: List[ChatCompletionMessage]
    ):
        """Fold ``chat_history`` into a running prompt for CogVLM2.

        The history is read as consecutive (user, assistant) pairs.

        Returns:
            ``(query, history, images)`` where ``query`` is the accumulated
            prompt text, ``history`` is a list of ``(query_so_far, assistant)``
            tuples, and ``images`` is ``[image]`` for the first image found in
            the history or ``None`` when the history contains no image.
        """
        query = system_prompt
        history: List[Tuple] = []
        image = None

        if len(chat_history) % 2:
            # Without an assistant reply the last message cannot form a pair.
            logger.warning(
                "Chat history has %d messages; ignoring the trailing unpaired one.",
                len(chat_history),
            )

        for user_msg, assistant_msg in zip(chat_history[0::2], chat_history[1::2]):
            user = user_msg["content"]
            if isinstance(user, list):
                texts = []
                for part in user:
                    part_type = part.get("type")
                    if part_type == "text":
                        texts.append(part["text"])
                    elif part_type == "image_url" and image is None:
                        # Only the first image in the history is kept.
                        image = self._load_image(part["image_url"]["url"])
                # Same joining rule as _message_content_to_cogvlm2.
                user = " ".join(texts)
            assistant = assistant_msg["content"]
            # NOTE(review): ``query`` is cumulative and each cumulative value is
            # also stored in ``history`` — confirm this is what
            # ``build_conversation_input_ids`` expects.
            query = query + f" USER: {user} ASSISTANT:"
            history.append((query, assistant))
            query = query + f" {assistant}"

        # Return None rather than [None] so callers can test for "no image"
        # with a simple truthiness / ``is None`` check.
        return query, history, ([image] if image is not None else None)

    def chat(
        self,
        prompt: Union[str, List[Dict]],
        system_prompt: Optional[str] = None,
        chat_history: Optional[List[ChatCompletionMessage]] = None,
        generate_config: Optional[PytorchGenerateConfig] = None,
    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
        """Generate a single (non-streaming) chat completion.

        Args:
            prompt: the current user message, as a string or a list of content parts.
            system_prompt: optional text placed in front of the conversation.
            chat_history: previous messages as alternating user/assistant entries.
            generate_config: only ``stream`` and ``max_tokens`` are consulted;
                ``max_tokens`` defaults to 512.

        Raises:
            Exception: if ``generate_config`` requests streaming.
            RuntimeError: if ``prompt`` carries more than one image.
        """
        system_prompt = system_prompt if system_prompt else ""
        if generate_config and generate_config.get("stream"):
            raise Exception(
                f"Chat with model {self.model_family.model_name} does not support stream."
            )

        sanitized_config = {
            "pad_token_id": 128002,
            "max_new_tokens": generate_config.get("max_tokens", 512)
            if generate_config
            else 512,
        }

        content, image = self._message_content_to_cogvlm2(prompt)

        history = []
        # Start from the system prompt so it is honoured even when there is
        # no chat history.
        query = system_prompt
        history_image = None
        if chat_history:
            query, history, history_image = self._history_content_to_cogvlm2(
                system_prompt, chat_history
            )

        if image and history_image:
            # Only one image can be sent: the new image wins and the earlier
            # conversation (which referred to the old image) is dropped.
            history = []
            query = system_prompt + f" USER: {content} ASSISTANT:"
        else:
            image = image if image else history_image
            query = query + f" USER: {content} ASSISTANT:"

        input_by_model = self._model.build_conversation_input_ids(
            self._tokenizer,
            query=query,
            history=history,
            images=image,
            template_version="chat",
        )

        inputs = {
            "input_ids": input_by_model["input_ids"].unsqueeze(0).to(self._device),
            "token_type_ids": input_by_model["token_type_ids"]
            .unsqueeze(0)
            .to(self._device),
            "attention_mask": input_by_model["attention_mask"]
            .unsqueeze(0)
            .to(self._device),
            "images": [
                [input_by_model["images"][0].to(self._device).to(self._torch_type)]
            ]
            if image is not None
            else None,
        }
        with torch.no_grad():
            outputs = self._model.generate(**inputs, **sanitized_config)
            # Drop the echoed prompt tokens; keep only the newly generated ones.
            outputs = outputs[:, inputs["input_ids"].shape[1] :]
            response = self._tokenizer.decode(outputs[0])
        response = response.split("<|end_of_text|>")[0]

        chunk = Completion(
            id=str(uuid.uuid1()),
            object="text_completion",
            created=int(time.time()),
            model=self.model_uid,
            choices=[
                CompletionChoice(
                    index=0, text=response, finish_reason="stop", logprobs=None
                )
            ],
            # Token accounting is not computed here; -1 marks the counts as unknown.
            usage=CompletionUsage(
                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
            ),
        )
        return self._to_chat_completion(chunk)
"deepseek-vl-chat", "internvl-chat", "mini-internvl-chat", + "cogvlm2", ]