From b854c5bb184069ee1a62dddb69a51a0a1366470c Mon Sep 17 00:00:00 2001 From: Peter Robicheaux Date: Wed, 6 Dec 2023 00:07:41 +0000 Subject: [PATCH 01/12] First cog commit --- inference/models/cogvlm/cog.py | 60 +++++++++++++++++++++++++++++++ requirements/requirements.gpu.txt | 8 ++++- 2 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 inference/models/cogvlm/cog.py diff --git a/inference/models/cogvlm/cog.py b/inference/models/cogvlm/cog.py new file mode 100644 index 000000000..ad7ca7899 --- /dev/null +++ b/inference/models/cogvlm/cog.py @@ -0,0 +1,60 @@ +import torch +import requests +from PIL import Image +from transformers import AutoModelForCausalLM, LlamaTokenizer + +tokenizer = LlamaTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5') +model = AutoModelForCausalLM.from_pretrained( + 'THUDM/cogvlm-chat-hf', + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + trust_remote_code=True, + load_in_8bit=True, + cache_dir="/cache" +).eval() + + +# chat example + +query = 'Describe this image' +image = Image.open(requests.get('https://github.com/THUDM/CogVLM/blob/main/examples/1.png?raw=true', stream=True).raw).convert('RGB') +inputs = model.build_conversation_input_ids(tokenizer, query=query, history=[], images=[image]) # chat mode +inputs = { + 'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'), + 'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'), + 'attention_mask': inputs['attention_mask'].unsqueeze(0).to('cuda'), + 'images': [[inputs['images'][0].to('cuda').to(torch.bfloat16)]], +} +gen_kwargs = {"max_length": 2048, "do_sample": False} + +with torch.no_grad(): + outputs = model.generate(**inputs, **gen_kwargs) + outputs = outputs[:, inputs['input_ids'].shape[1]:] + print(tokenizer.decode(outputs[0])) + +# This image captures a moment from a basketball game. Two players are prominently featured: one wearing a yellow jersey with the number +# 24 and the word 'Lakers' written on it, and the other wearing a navy blue jersey with the word 'Washington' and the number 34. The player +# in yellow is holding a basketball and appears to be dribbling it, while the player in navy blue is reaching out with his arm, possibly +# trying to block or defend. The background shows a filled stadium with spectators, indicating that this is a professional game. + + + +# vqa example + +query = 'How many houses are there in this cartoon?' 
+image = Image.open(requests.get('https://github.com/THUDM/CogVLM/blob/main/examples/3.jpg?raw=true', stream=True).raw).convert('RGB') +inputs = model.build_conversation_input_ids(tokenizer, query=query, history=[], images=[image], template_version='vqa') # vqa mode +inputs = { + 'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'), + 'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'), + 'attention_mask': inputs['attention_mask'].unsqueeze(0).to('cuda'), + 'images': [[inputs['images'][0].to('cuda').to(torch.bfloat16)]], +} +gen_kwargs = {"max_length": 2048, "do_sample": False} + +with torch.no_grad(): + outputs = model.generate(**inputs, **gen_kwargs) + outputs = outputs[:, inputs['input_ids'].shape[1]:] + print(tokenizer.decode(outputs[0])) + +# 4 \ No newline at end of file diff --git a/requirements/requirements.gpu.txt b/requirements/requirements.gpu.txt index 4347b46d4..23260e875 100644 --- a/requirements/requirements.gpu.txt +++ b/requirements/requirements.gpu.txt @@ -1 +1,7 @@ -onnxruntime-gpu<=1.15.1 \ No newline at end of file +onnxruntime-gpu<=1.15.1 +transformers +sentencepiece +einops +xformers +accelerate +bitsandbytes \ No newline at end of file From c117ff3f199221bb05a7bd3f76f4a23029c674db Mon Sep 17 00:00:00 2001 From: Peter Robicheaux Date: Wed, 6 Dec 2023 23:12:51 +0000 Subject: [PATCH 02/12] CogVLM in inference! --- docker/dockerfiles/Dockerfile.onnx.gpu | 2 + inference/core/entities/requests/cog.py | 40 ++++++ inference/core/entities/responses/cog.py | 10 ++ inference/core/env.py | 6 + inference/core/interfaces/http/http_api.py | 43 ++++++ inference/core/registries/roboflow.py | 1 + inference/models/__init__.py | 5 + inference/models/cogvlm/__init__.py | 1 + inference/models/cogvlm/cog.py | 153 +++++++++++++-------- inference/models/utils.py | 7 + requirements/requirements.cog.txt | 6 + requirements/requirements.gpu.txt | 8 +- 12 files changed, 219 insertions(+), 63 deletions(-) create mode 100644 inference/core/entities/requests/cog.py create mode 100644 inference/core/entities/responses/cog.py create mode 100644 inference/models/cogvlm/__init__.py create mode 100644 requirements/requirements.cog.txt diff --git a/docker/dockerfiles/Dockerfile.onnx.gpu b/docker/dockerfiles/Dockerfile.onnx.gpu index 849a4e32c..b2f92972f 100644 --- a/docker/dockerfiles/Dockerfile.onnx.gpu +++ b/docker/dockerfiles/Dockerfile.onnx.gpu @@ -20,6 +20,7 @@ COPY requirements/requirements.sam.txt \ requirements/requirements.waf.txt \ requirements/requirements.gaze.txt \ requirements/requirements.doctr.txt \ + requirements/requirements.cog.txt \ requirements/_requirements.txt \ ./ @@ -32,6 +33,7 @@ RUN pip3 install --upgrade pip && pip3 install \ -r requirements.waf.txt \ -r requirements.gaze.txt \ -r requirements.doctr.txt \ + -r requirements.cog.txt \ --upgrade \ && rm -rf ~/.cache/pip diff --git a/inference/core/entities/requests/cog.py b/inference/core/entities/requests/cog.py new file mode 100644 index 000000000..66b1e31a4 --- /dev/null +++ b/inference/core/entities/requests/cog.py @@ -0,0 +1,40 @@ +from typing import Dict, List, Optional, Union + +from pydantic import Field, validator + +from inference.core.entities.requests.inference import ( + BaseRequest, + InferenceRequestImage, +) +from inference.core.env import COG_VERSION_ID + + +class CogVLMInferenceRequest(BaseRequest): + """Request for CogVLM inference. + + Attributes: + api_key (Optional[str]): Roboflow API Key. + cog_version_id (Optional[str]): The version ID of CLIP to be used for this request. 
+ """ + + cogvlm_version_id: Optional[str] = Field( + default=COG_VERSION_ID, + example="cogvlm-chat-hf", + description="The version ID of CogVLM to be used for this request. See the huggingface model repo at THUDM.", + ) + model_id: Optional[str] = Field() + image: Optional[InferenceRequestImage] = Field( + description="Image for CogVLM to look at. Specify what you want it to do with this image. Don't provide an image to just use CogVLM as an LLM." + ) + prompt: str = Field( + description="Text to be passed to CogVLM. Use to prompt it to describe an image or provide only text to chat with the model.", + example="Describe this image.", + ) + + @validator("model_id", always=True) + def validate_model_id(cls, value, values): + if value is not None: + return value + if values.get("cogvlm_version_id") is None: + return None + return f"cogvlm/{values['cogvlm_version_id']}" diff --git a/inference/core/entities/responses/cog.py b/inference/core/entities/responses/cog.py new file mode 100644 index 000000000..415c06373 --- /dev/null +++ b/inference/core/entities/responses/cog.py @@ -0,0 +1,10 @@ +from typing import Optional + +from pydantic import BaseModel, Field + + +class CogVLMResponse(BaseModel): + response: str = Field(description="Text generated by CogVLM") + time: Optional[float] = Field( + description="The time in seconds it took to produce the response including preprocessing" + ) diff --git a/inference/core/env.py b/inference/core/env.py index de5594436..0360d1c72 100644 --- a/inference/core/env.py +++ b/inference/core/env.py @@ -38,6 +38,9 @@ # AWS secret access key, default is None AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", None) +COG_LOAD_4BIT = str2bool(os.getenv("COG_LOAD_4BIT", True)) +COG_LOAD_8BIT = str2bool(os.getenv("COG_LOAD_8BIT", False)) +COG_VERSION_ID = os.getenv("COG_VERSION_ID", "cogvlm-chat-hf") # CLIP version ID, default is "ViT-B-16" CLIP_VERSION_ID = os.getenv("CLIP_VERSION_ID", "ViT-B-16") @@ -83,6 +86,8 @@ # Flag to enable DocTR core model, default is True CORE_MODEL_DOCTR_ENABLED = str2bool(os.getenv("CORE_MODEL_DOCTR_ENABLED", True)) +CORE_MODEL_COGVLM_ENABLED = str2bool(os.getenv("CORE_MODEL_COGVLM_ENABLED", True)) + # ID of host device, default is None DEVICE_ID = os.getenv("DEVICE_ID", None) @@ -227,6 +232,7 @@ # SAM version ID, default is "vit_h" SAM_VERSION_ID = os.getenv("SAM_VERSION_ID", "vit_h") + # Device ID, default is "sample-device-id" INFERENCE_SERVER_ID = os.getenv("INFERENCE_SERVER_ID", None) diff --git a/inference/core/interfaces/http/http_api.py b/inference/core/interfaces/http/http_api.py index 863f86703..def23da8b 100644 --- a/inference/core/interfaces/http/http_api.py +++ b/inference/core/interfaces/http/http_api.py @@ -16,6 +16,7 @@ ClipImageEmbeddingRequest, ClipTextEmbeddingRequest, ) +from inference.core.entities.requests.cog import CogVLMInferenceRequest from inference.core.entities.requests.doctr import DoctrOCRInferenceRequest from inference.core.entities.requests.gaze import GazeDetectionInferenceRequest from inference.core.entities.requests.inference import ( @@ -38,6 +39,7 @@ ClipCompareResponse, ClipEmbeddingResponse, ) +from inference.core.entities.responses.cog import CogVLMResponse from inference.core.entities.responses.doctr import DoctrOCRInferenceResponse from inference.core.entities.responses.gaze import GazeDetectionInferenceResponse from inference.core.entities.responses.inference import ( @@ -60,6 +62,7 @@ from inference.core.env import ( ALLOW_ORIGINS, CORE_MODEL_CLIP_ENABLED, + CORE_MODEL_COGVLM_ENABLED, 
CORE_MODEL_DOCTR_ENABLED, CORE_MODEL_GAZE_ENABLED, CORE_MODEL_SAM_ENABLED, @@ -341,6 +344,7 @@ def load_core_model( Returns: The DocTR model ID. """ + load_cogvlm_model = partial(load_core_model, core_model="cogvlm") @app.get( "/info", @@ -838,6 +842,45 @@ async def gaze_detection( trackUsage(gaze_model_id, actor) return response + if CORE_MODEL_COGVLM_ENABLED: + + @app.post( + "/llm/cogvlm", + response_model=CogVLMResponse, + summary="CogVLM", + description="Run the CogVLM model to chat or describe an image.", + ) + @with_route_exceptions + async def cog_vlm( + inference_request: CogVLMInferenceRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + ): + """ + Chat with CogVLM or ask it about an image. + + Args: + inference_request (M.CogVLMInferenceRequest): The request containing the prompt and optional image to be described. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. + + Returns: + M.GazeDetectionResponse: The response containing all the detected faces and the corresponding gazes. + """ + cog_model_id = load_cogvlm_model(inference_request, api_key=api_key) + response = await self.model_manager.infer_from_request( + cog_model_id, inference_request + ) + if LAMBDA: + actor = request.scope["aws.event"]["requestContext"][ + "authorizer" + ]["lambda"]["actor"] + trackUsage(cog_model_id, actor) + return response + if LEGACY_ROUTE_ENABLED: # Legacy object detection inference path for backwards compatability @app.post( diff --git a/inference/core/registries/roboflow.py b/inference/core/registries/roboflow.py index 8f2cdeece..b4e45fc5e 100644 --- a/inference/core/registries/roboflow.py +++ b/inference/core/registries/roboflow.py @@ -23,6 +23,7 @@ "sam": ("embed", "sam"), "gaze": ("gaze", "l2cs"), "doctr": ("ocr", "doctr"), + "cogvlm": ("llm", "cogvlm"), } STUB_VERSION_ID = "0" diff --git a/inference/models/__init__.py b/inference/models/__init__.py index c5374e3a5..5746ebce2 100644 --- a/inference/models/__init__.py +++ b/inference/models/__init__.py @@ -18,6 +18,11 @@ except: pass +try: + from inference.models.cogvlm import CogVLM +except: + pass + from inference.models.vit import VitClassification from inference.models.yolact import YOLACT from inference.models.yolov5 import YOLOv5InstanceSegmentation, YOLOv5ObjectDetection diff --git a/inference/models/cogvlm/__init__.py b/inference/models/cogvlm/__init__.py new file mode 100644 index 000000000..1e0a3e205 --- /dev/null +++ b/inference/models/cogvlm/__init__.py @@ -0,0 +1 @@ +from inference.models.cogvlm.cog import CogVLM diff --git a/inference/models/cogvlm/cog.py b/inference/models/cogvlm/cog.py index ad7ca7899..29f30e4bb 100644 --- a/inference/models/cogvlm/cog.py +++ b/inference/models/cogvlm/cog.py @@ -1,60 +1,101 @@ -import torch +import os +from time import perf_counter +from typing import Any, List, Tuple, Union + +import numpy as np import requests +import torch from PIL import Image from transformers import AutoModelForCausalLM, LlamaTokenizer -tokenizer = LlamaTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5') -model = AutoModelForCausalLM.from_pretrained( - 'THUDM/cogvlm-chat-hf', - torch_dtype=torch.bfloat16, - low_cpu_mem_usage=True, - trust_remote_code=True, - load_in_8bit=True, - cache_dir="/cache" -).eval() - - -# chat example - -query = 'Describe this image' 
-image = Image.open(requests.get('https://github.com/THUDM/CogVLM/blob/main/examples/1.png?raw=true', stream=True).raw).convert('RGB') -inputs = model.build_conversation_input_ids(tokenizer, query=query, history=[], images=[image]) # chat mode -inputs = { - 'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'), - 'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'), - 'attention_mask': inputs['attention_mask'].unsqueeze(0).to('cuda'), - 'images': [[inputs['images'][0].to('cuda').to(torch.bfloat16)]], -} -gen_kwargs = {"max_length": 2048, "do_sample": False} - -with torch.no_grad(): - outputs = model.generate(**inputs, **gen_kwargs) - outputs = outputs[:, inputs['input_ids'].shape[1]:] - print(tokenizer.decode(outputs[0])) - -# This image captures a moment from a basketball game. Two players are prominently featured: one wearing a yellow jersey with the number -# 24 and the word 'Lakers' written on it, and the other wearing a navy blue jersey with the word 'Washington' and the number 34. The player -# in yellow is holding a basketball and appears to be dribbling it, while the player in navy blue is reaching out with his arm, possibly -# trying to block or defend. The background shows a filled stadium with spectators, indicating that this is a professional game. - - - -# vqa example - -query = 'How many houses are there in this cartoon?' -image = Image.open(requests.get('https://github.com/THUDM/CogVLM/blob/main/examples/3.jpg?raw=true', stream=True).raw).convert('RGB') -inputs = model.build_conversation_input_ids(tokenizer, query=query, history=[], images=[image], template_version='vqa') # vqa mode -inputs = { - 'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'), - 'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'), - 'attention_mask': inputs['attention_mask'].unsqueeze(0).to('cuda'), - 'images': [[inputs['images'][0].to('cuda').to(torch.bfloat16)]], -} -gen_kwargs = {"max_length": 2048, "do_sample": False} - -with torch.no_grad(): - outputs = model.generate(**inputs, **gen_kwargs) - outputs = outputs[:, inputs['input_ids'].shape[1]:] - print(tokenizer.decode(outputs[0])) - -# 4 \ No newline at end of file +from inference.core.entities.requests.cog import CogVLMInferenceRequest +from inference.core.entities.responses.cog import CogVLMResponse +from inference.core.env import ( + API_KEY, + COG_LOAD_4BIT, + COG_LOAD_8BIT, + COG_VERSION_ID, + MODEL_CACHE_DIR, +) +from inference.core.models.base import Model, PreprocessReturnMetadata +from inference.core.utils.image_utils import load_image_rgb + +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + + +class CogVLM(Model): + def __init__(self, model_id=f"cogvlm/{COG_VERSION_ID}", **kwargs): + self.model_id = model_id + self.endpoint = model_id + self.api_key = API_KEY + self.dataset_id, self.version_id = model_id.split("/") + if COG_LOAD_4BIT and COG_LOAD_8BIT: + raise ValueError( + "Only one of environment variable `COG_LOAD_4BIT` or `COG_LOAD_8BIT` can be true" + ) + self.cache_dir = os.path.join(MODEL_CACHE_DIR, self.endpoint) + with torch.inference_mode(): + self.tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5") + self.model = AutoModelForCausalLM.from_pretrained( + f"THUDM/{self.version_id}", + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + trust_remote_code=True, + load_in_4bit=COG_LOAD_4BIT, + load_in_8bit=COG_LOAD_8BIT, + cache_dir=self.cache_dir, + ).eval() + + def preprocess( + self, image: Any, **kwargs + ) -> Tuple[np.ndarray, PreprocessReturnMetadata]: + if image 
is None: + return None + pil_image = Image.fromarray(load_image_rgb(image)) + return pil_image, PreprocessReturnMetadata({}) + + def postprocess( + self, + predictions: Tuple[np.ndarray], + preprocess_return_metadata: PreprocessReturnMetadata, + **kwargs, + ) -> Any: + return predictions[0] + + def predict(self, image_in: np.ndarray, prompt="", **kwargs): + images = [image_in] + if image_in is None: + images = [] + + built_inputs = self.model.build_conversation_input_ids( + self.tokenizer, query=prompt, history=[], images=images + ) # chat mode + inputs = { + "input_ids": built_inputs["input_ids"].unsqueeze(0).to(DEVICE), + "token_type_ids": built_inputs["token_type_ids"].unsqueeze(0).to(DEVICE), + "attention_mask": built_inputs["attention_mask"].unsqueeze(0).to(DEVICE), + } + if images: + inputs["images"] = [ + [built_inputs["images"][0].to(DEVICE).to(torch.float16)] + ] + gen_kwargs = {"max_length": 2048, "do_sample": False} + + with torch.inference_mode(): + outputs = self.model.generate(**inputs, **gen_kwargs) + outputs = outputs[:, inputs["input_ids"].shape[1] :] + text = self.tokenizer.decode(outputs[0]) + if text.endswith("</s>"): + text = text[:-4] + + def infer_from_request(self, request: CogVLMInferenceRequest) -> CogVLMResponse: + t1 = perf_counter() + text = self.infer(**request.dict()) + response = CogVLMResponse(response=text) + response.time = perf_counter() - t1 + return response + + +if __name__ == "__main__": + m = CogVLM() + m.infer() diff --git a/inference/models/utils.py b/inference/models/utils.py index ced155129..9efd7f445 100644 --- a/inference/models/utils.py +++ b/inference/models/utils.py @@ -159,6 +159,13 @@ except: pass +try: + from inference.models import CogVLM + + ROBOFLOW_MODEL_TYPES[("llm", "cogvlm")] = CogVLM +except: + pass + def get_roboflow_model(model_id, api_key=API_KEY, **kwargs): task, model = get_model_type(model_id, api_key=api_key) diff --git a/requirements/requirements.cog.txt b/requirements/requirements.cog.txt new file mode 100644 index 000000000..18891eee4 --- /dev/null +++ b/requirements/requirements.cog.txt @@ -0,0 +1,6 @@ +transformers<=4.35.2 +sentencepiece<=0.1.99 +einops<=0.7.0 +xformers<=0.0.22 +accelerate<=0.25.0 +bitsandbytes<=0.41.2.post2 \ No newline at end of file diff --git a/requirements/requirements.gpu.txt b/requirements/requirements.gpu.txt index 23260e875..4347b46d4 100644 --- a/requirements/requirements.gpu.txt +++ b/requirements/requirements.gpu.txt @@ -1,7 +1 @@ -onnxruntime-gpu<=1.15.1 -transformers -sentencepiece -einops -xformers -accelerate -bitsandbytes \ No newline at end of file +onnxruntime-gpu<=1.15.1 \ No newline at end of file From 9d2e950ad9b16f02b3dc43f61944f931a9a1c105 Mon Sep 17 00:00:00 2001 From: Peter Robicheaux Date: Wed, 6 Dec 2023 23:14:30 +0000 Subject: [PATCH 03/12] bugfix --- inference/models/cogvlm/cog.py | 1 + 1 file changed, 1 insertion(+) diff --git a/inference/models/cogvlm/cog.py b/inference/models/cogvlm/cog.py index 29f30e4bb..a2e4e7823 100644 --- a/inference/models/cogvlm/cog.py +++ b/inference/models/cogvlm/cog.py @@ -87,6 +87,7 @@ def predict(self, image_in: np.ndarray, prompt="", **kwargs): text = self.tokenizer.decode(outputs[0]) if text.endswith("</s>"): text = text[:-4] + return text def infer_from_request(self, request: CogVLMInferenceRequest) -> CogVLMResponse: t1 = perf_counter() From 384e10a16a9c84f9ef7d850cdd592be10a508466 Mon Sep 17 00:00:00 2001 From: Peter Robicheaux Date: Wed, 6 Dec 2023 23:23:05 +0000 Subject: [PATCH 04/12] Bugfix --- inference/models/cogvlm/cog.py | 2 +- 1 file
changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/models/cogvlm/cog.py b/inference/models/cogvlm/cog.py index a2e4e7823..03b606e16 100644 --- a/inference/models/cogvlm/cog.py +++ b/inference/models/cogvlm/cog.py @@ -87,7 +87,7 @@ def predict(self, image_in: np.ndarray, prompt="", **kwargs): text = self.tokenizer.decode(outputs[0]) if text.endswith("</s>"): text = text[:-4] - return text + return (text,) def infer_from_request(self, request: CogVLMInferenceRequest) -> CogVLMResponse: t1 = perf_counter() From 84f36ab03e98c5f58aec58578d4b8d18195168c2 Mon Sep 17 00:00:00 2001 From: Peter Robicheaux Date: Wed, 6 Dec 2023 23:29:58 +0000 Subject: [PATCH 05/12] Bugfix --- inference/models/cogvlm/cog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/models/cogvlm/cog.py b/inference/models/cogvlm/cog.py index 03b606e16..0f4b88a57 100644 --- a/inference/models/cogvlm/cog.py +++ b/inference/models/cogvlm/cog.py @@ -50,7 +50,7 @@ def preprocess( self, image: Any, **kwargs ) -> Tuple[np.ndarray, PreprocessReturnMetadata]: if image is None: - return None + return None, PreprocessReturnMetadata({}) pil_image = Image.fromarray(load_image_rgb(image)) return pil_image, PreprocessReturnMetadata({}) From 2a3c8879cba487743736727e218983c11322484b Mon Sep 17 00:00:00 2001 From: Peter Robicheaux Date: Thu, 7 Dec 2023 00:25:19 +0000 Subject: [PATCH 06/12] History + need image to talk --- inference/core/entities/requests/cog.py | 10 +++++--- inference/core/interfaces/http/http_api.py | 4 ++-- inference/models/cogvlm/cog.py | 27 ++++++++++++---------- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/inference/core/entities/requests/cog.py b/inference/core/entities/requests/cog.py index 66b1e31a4..416d9ebe3 100644 --- a/inference/core/entities/requests/cog.py +++ b/inference/core/entities/requests/cog.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, Tuple from pydantic import Field, validator @@ -23,13 +23,17 @@ class CogVLMInferenceRequest(BaseRequest): description="The version ID of CogVLM to be used for this request. See the huggingface model repo at THUDM.", ) model_id: Optional[str] = Field() - image: Optional[InferenceRequestImage] = Field( - description="Image for CogVLM to look at. Specify what you want it to do with this image. Don't provide an image to just use CogVLM as an LLM." + image: Union[InferenceRequestImage, List[InferenceRequestImage]] = Field( - description="Image or list of images for CogVLM to look at. Use prompt to specify what you want it to do with the images." ) prompt: str = Field( description="Text to be passed to CogVLM. Use to prompt it to describe an image or provide only text to chat with the model.", example="Describe this image.", ) + history: Optional[List[Tuple[str, str]]] = Field( + description="Optional chat history, formatted as a list of 2-tuples where the first entry is the user prompt" + " and the second entry is the generated model response" + ) @validator("model_id", always=True) def validate_model_id(cls, value, values): diff --git a/inference/core/interfaces/http/http_api.py b/inference/core/interfaces/http/http_api.py index def23da8b..506bead8e 100644 --- a/inference/core/interfaces/http/http_api.py +++ b/inference/core/interfaces/http/http_api.py @@ -863,12 +863,12 @@ async def cog_vlm( Chat with CogVLM or ask it about an image. Args: - inference_request (M.CogVLMInferenceRequest): The request containing the prompt and optional image to be described.
+ inference_request (M.CogVLMInferenceRequest): The request containing the prompt and image to be described. api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. request (Request, default Body()): The HTTP request. Returns: - M.GazeDetectionResponse: The response containing all the detected faces and the corresponding gazes. + M.CogVLMResponse: The model's text response """ cog_model_id = load_cogvlm_model(inference_request, api_key=api_key) response = await self.model_manager.infer_from_request( diff --git a/inference/models/cogvlm/cog.py b/inference/models/cogvlm/cog.py index 0f4b88a57..3ff350cf0 100644 --- a/inference/models/cogvlm/cog.py +++ b/inference/models/cogvlm/cog.py @@ -47,12 +47,14 @@ def __init__(self, model_id=f"cogvlm/{COG_VERSION_ID}", **kwargs): ).eval() def preprocess( - self, image: Any, **kwargs - ) -> Tuple[np.ndarray, PreprocessReturnMetadata]: - if image is None: - return None, PreprocessReturnMetadata({}) - pil_image = Image.fromarray(load_image_rgb(image)) - return pil_image, PreprocessReturnMetadata({}) + self, images: Any, **kwargs + ) -> Tuple[List[np.ndarray], PreprocessReturnMetadata]: + out_images = [] + for image in images: + pil_image = Image.fromarray(load_image_rgb(image)) + out_images.append(pil_image) + + return out_images, PreprocessReturnMetadata({}) def postprocess( self, @@ -62,13 +64,12 @@ def postprocess( ) -> Any: return predictions[0] - def predict(self, image_in: np.ndarray, prompt="", **kwargs): - images = [image_in] - if image_in is None: - images = [] - + def predict(self, image_in: List[np.ndarray], prompt="", history=None, **kwargs): + images = image_in + if history is None: + history = [] built_inputs = self.model.build_conversation_input_ids( - self.tokenizer, query=prompt, history=[], images=images + self.tokenizer, query=prompt, history=history, images=images ) # chat mode inputs = { "input_ids": built_inputs["input_ids"].unsqueeze(0).to(DEVICE), @@ -91,6 +92,8 @@ def predict(self, image_in: np.ndarray, prompt="", **kwargs): def infer_from_request(self, request: CogVLMInferenceRequest) -> CogVLMResponse: t1 = perf_counter() + if not isinstance(request.image, list): + request.image = [request.image] text = self.infer(**request.dict()) response = CogVLMResponse(response=text) response.time = perf_counter() - t1 From 4956aa97eaa87f4d30b03638e0576d6b80803439 Mon Sep 17 00:00:00 2001 From: Peter Robicheaux Date: Thu, 7 Dec 2023 00:43:50 +0000 Subject: [PATCH 07/12] Style' --- inference/core/entities/requests/cog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/core/entities/requests/cog.py b/inference/core/entities/requests/cog.py index 416d9ebe3..d6a327024 100644 --- a/inference/core/entities/requests/cog.py +++ b/inference/core/entities/requests/cog.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict, List, Optional, Tuple, Union from pydantic import Field, validator From 60bc37569e7636a981e97d46223e12aba356c035 Mon Sep 17 00:00:00 2001 From: Peter Robicheaux Date: Thu, 7 Dec 2023 01:14:43 +0000 Subject: [PATCH 08/12] Multi images not currently supported by cogvlm :/ --- inference/core/entities/requests/cog.py | 4 ++-- inference/core/interfaces/http/http_api.py | 2 +- inference/models/cogvlm/cog.py | 22 +++++++--------------- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/inference/core/entities/requests/cog.py b/inference/core/entities/requests/cog.py index 
d6a327024..b3269ca4b 100644 --- a/inference/core/entities/requests/cog.py +++ b/inference/core/entities/requests/cog.py @@ -23,8 +23,8 @@ class CogVLMInferenceRequest(BaseRequest): description="The version ID of CogVLM to be used for this request. See the huggingface model repo at THUDM.", ) model_id: Optional[str] = Field() - image: Union[InferenceRequestImage, List[InferenceRequestImage]] = Field( - description="Image or list of images for CogVLM to look at. Use prompt to specify what you want it to do with the images." + image: InferenceRequestImage = Field( + description="Image for CogVLM to look at. Use prompt to specify what you want it to do with the image." ) prompt: str = Field( description="Text to be passed to CogVLM. Use to prompt it to describe an image or provide only text to chat with the model.", diff --git a/inference/core/interfaces/http/http_api.py b/inference/core/interfaces/http/http_api.py index 506bead8e..13b358e35 100644 --- a/inference/core/interfaces/http/http_api.py +++ b/inference/core/interfaces/http/http_api.py @@ -860,7 +860,7 @@ async def cog_vlm( ), ): """ - Chat with CogVLM or ask it about an image. + Chat with CogVLM or ask it about an image. Multi-image requests not currently supported. Args: inference_request (M.CogVLMInferenceRequest): The request containing the prompt and image to be described. diff --git a/inference/models/cogvlm/cog.py b/inference/models/cogvlm/cog.py index 3ff350cf0..5a8f6cef2 100644 --- a/inference/models/cogvlm/cog.py +++ b/inference/models/cogvlm/cog.py @@ -47,14 +47,11 @@ def __init__(self, model_id=f"cogvlm/{COG_VERSION_ID}", **kwargs): ).eval() def preprocess( - self, images: Any, **kwargs - ) -> Tuple[List[np.ndarray], PreprocessReturnMetadata]: - out_images = [] - for image in images: - pil_image = Image.fromarray(load_image_rgb(image)) - out_images.append(pil_image) + self, image: Any, **kwargs + ) -> Tuple[np.ndarray, PreprocessReturnMetadata]: + pil_image = Image.fromarray(load_image_rgb(image)) - return out_images, PreprocessReturnMetadata({}) + return pil_image, PreprocessReturnMetadata({}) def postprocess( self, @@ -64,8 +61,8 @@ def postprocess( ) -> Any: return predictions[0] - def predict(self, image_in: List[np.ndarray], prompt="", history=None, **kwargs): - images = image_in + def predict(self, image_in: np.ndarray, prompt="", history=None, **kwargs): + images = [image_in] if history is None: history = [] built_inputs = self.model.build_conversation_input_ids( @@ -75,11 +72,8 @@ def predict(self, image_in: List[np.ndarray], prompt="", history=None, **kwargs) "input_ids": built_inputs["input_ids"].unsqueeze(0).to(DEVICE), "token_type_ids": built_inputs["token_type_ids"].unsqueeze(0).to(DEVICE), "attention_mask": built_inputs["attention_mask"].unsqueeze(0).to(DEVICE), + "images": [[built_inputs["images"][0].to(DEVICE).to(torch.float16)]], } - if images: - inputs["images"] = [ - [built_inputs["images"][0].to(DEVICE).to(torch.float16)] - ] gen_kwargs = {"max_length": 2048, "do_sample": False} with torch.inference_mode(): @@ -92,8 +86,6 @@ def predict(self, image_in: List[np.ndarray], prompt="", history=None, **kwargs) def infer_from_request(self, request: CogVLMInferenceRequest) -> CogVLMResponse: t1 = perf_counter() - if not isinstance(request.image, list): - request.image = [request.image] text = self.infer(**request.dict()) response = CogVLMResponse(response=text) response.time = perf_counter() - t1 From 42877a4f325a1717c8f01dbe297b5fe4dc536142 Mon Sep 17 00:00:00 2001 From: Peter Robicheaux Date: Thu, 7 Dec 2023 
01:34:28 +0000 Subject: [PATCH 09/12] Changes --- examples/cogvlm/cog_client.py | 87 +++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 examples/cogvlm/cog_client.py diff --git a/examples/cogvlm/cog_client.py b/examples/cogvlm/cog_client.py new file mode 100644 index 000000000..6a5d4b951 --- /dev/null +++ b/examples/cogvlm/cog_client.py @@ -0,0 +1,87 @@ +import base64 +import asyncio +import aiohttp +import os +from PIL import Image +import requests + +PORT = 9001 +API_KEY = os.environ["API_KEY"] +IMAGE_PATH = "image.jpg" + + +def encode_bas64(image_path): + with open(image_path, "rb") as image: + x = image.read() + image_string = base64.b64encode(x) + + return image_string.decode("ascii") + + +async def do_cog_request(session): + api_key = API_KEY + prompt = "The player on the left's name is Moky." + " What round of the tournament is he in? Answer in one word." + + print(f"Starting") + infer_payload = { + "image": { + "type": "base64", + "value": encode_bas64(IMAGE_PATH), + }, + "api_key": api_key, + "prompt": prompt, + } + async with session.post( + f"http://localhost:{PORT}/llm/cogvlm", + json=infer_payload, + ) as response: + if response.status != 200: + print(response.status) + print(await response.json()) + raise RuntimeError + resp = await response.json() + res = resp["response"] + print(resp) + infer_payload = { + "image": { + "type": "base64", + "value": encode_bas64(IMAGE_PATH), + }, + "api_key": api_key, + "prompt": "What is the name of the player on the left?", + "history": [(prompt, res)], + } + async with session.post( + f"http://localhost:{PORT}/llm/cogvlm", + json=infer_payload, + ) as response: + if response.status != 200: + print(response.status) + print(await response.json()) + raise RuntimeError + resp = await response.json() + res = resp["response"] + print(resp) + + +async def main(): + import time + + start = time.perf_counter() + connector = aiohttp.TCPConnector(limit=100, limit_per_host=100) + async with aiohttp.ClientSession(read_timeout=0, connector=connector) as session: + await do_cog_request(session) + total = time.perf_counter() - start + print(f"Total time: {total:.2f} seconds") + + +if __name__ == "__main__": + Image.open( + requests.get( + "https://source.roboflow.com/ACrZ7Hz8DRUB1NBMMtDoQK84Hf22/0qUjAGRJQWWhT5j9hUOG/original.jpg", + stream=True, + ).raw + ).convert("RGB").save(IMAGE_PATH) + loop = asyncio.get_event_loop() + loop.run_until_complete(main()) From ad9283c2bbe47042b9c2c8bcca79a53dff122ef8 Mon Sep 17 00:00:00 2001 From: Peter Robicheaux Date: Thu, 7 Dec 2023 01:44:34 +0000 Subject: [PATCH 10/12] Bugfix --- examples/cogvlm/cog_client.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/cogvlm/cog_client.py b/examples/cogvlm/cog_client.py index 6a5d4b951..fddfb6a82 100644 --- a/examples/cogvlm/cog_client.py +++ b/examples/cogvlm/cog_client.py @@ -20,8 +20,10 @@ def encode_bas64(image_path): async def do_cog_request(session): api_key = API_KEY - prompt = "The player on the left's name is Moky." - " What round of the tournament is he in? Answer in one word." + prompt = ( + "The player on the left's name is Moky." + " What round of the tournament is he in? Answer in one word." 
+ ) print(f"Starting") infer_payload = { From 3bfccf54ba4a2f7e2684d3799576d3c85ee8b21c Mon Sep 17 00:00:00 2001 From: Peter Robicheaux Date: Thu, 7 Dec 2023 20:06:57 +0000 Subject: [PATCH 11/12] Fix typing --- inference/models/cogvlm/cog.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/inference/models/cogvlm/cog.py b/inference/models/cogvlm/cog.py index 5a8f6cef2..9b5ef21d8 100644 --- a/inference/models/cogvlm/cog.py +++ b/inference/models/cogvlm/cog.py @@ -48,20 +48,20 @@ def __init__(self, model_id=f"cogvlm/{COG_VERSION_ID}", **kwargs): def preprocess( self, image: Any, **kwargs - ) -> Tuple[np.ndarray, PreprocessReturnMetadata]: - pil_image = Image.fromarray(load_image_rgb(image)) + ) -> Tuple[Image.Image, PreprocessReturnMetadata]: + pil_image = Image.fromarray(load_image_rgb(image) return pil_image, PreprocessReturnMetadata({}) def postprocess( self, - predictions: Tuple[np.ndarray], + predictions: Tuple[str], preprocess_return_metadata: PreprocessReturnMetadata, **kwargs, ) -> Any: return predictions[0] - def predict(self, image_in: np.ndarray, prompt="", history=None, **kwargs): + def predict(self, image_in: Image.Image, prompt="", history=None, **kwargs): images = [image_in] if history is None: history = [] From 5c3a560ee80ebce278088a059b9a2dac27b00c0e Mon Sep 17 00:00:00 2001 From: Peter Robicheaux Date: Thu, 7 Dec 2023 20:29:37 +0000 Subject: [PATCH 12/12] Bugfix --- inference/models/cogvlm/cog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/models/cogvlm/cog.py b/inference/models/cogvlm/cog.py index 9b5ef21d8..25f78ef64 100644 --- a/inference/models/cogvlm/cog.py +++ b/inference/models/cogvlm/cog.py @@ -49,7 +49,7 @@ def __init__(self, model_id=f"cogvlm/{COG_VERSION_ID}", **kwargs): def preprocess( self, image: Any, **kwargs ) -> Tuple[Image.Image, PreprocessReturnMetadata]: - pil_image = Image.fromarray(load_image_rgb(image) + pil_image = Image.fromarray(load_image_rgb(image)) return pil_image, PreprocessReturnMetadata({})
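
For quick manual verification of the /llm/cogvlm route added in this series, a minimal synchronous client sketch is below. It is a sketch only, not part of the patches: it assumes an inference server built from these changes is already running locally on port 9001 (as in examples/cogvlm/cog_client.py), that a Roboflow API key is exported as API_KEY, and that image.jpg is a placeholder path to a local test image.

# Minimal synchronous sketch for exercising the /llm/cogvlm route.
# Assumes a local inference server on port 9001 and a test image at image.jpg.
import base64
import os

import requests

PORT = 9001
API_KEY = os.environ["API_KEY"]  # Roboflow API key, assumed to be exported

# Encode the local image as base64, matching the InferenceRequestImage schema
# used by CogVLMInferenceRequest.
with open("image.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("ascii")

payload = {
    "api_key": API_KEY,
    "prompt": "Describe this image.",
    "image": {"type": "base64", "value": image_b64},
}

resp = requests.post(f"http://localhost:{PORT}/llm/cogvlm", json=payload)
resp.raise_for_status()
result = resp.json()
print(result["response"])  # generated text; result["time"] holds latency in seconds

A follow-up turn can reuse the returned text by adding "history": [(prompt, result["response"])] to the next payload, mirroring the two-turn flow in examples/cogvlm/cog_client.py.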