From 17b8e38d1cddf0b0b7f30f7543cf56684b16cba4 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Thu, 8 Aug 2024 00:14:53 +0800
Subject: [PATCH 1/4] Rename MiniCPMVQwen2 to MiniCPMV2.6

---
 vllm/model_executor/models/minicpmv.py | 47 ++++++++++----------------
 1 file changed, 17 insertions(+), 30 deletions(-)

diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 0388259595628..e6c1a21bc8c88 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -216,16 +216,7 @@ def __init__(
 
         self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
         trunc_normal_(self.query, std=0.02)
-
-        if kv_dim is not None and kv_dim != embed_dim:
-            self.kv_proj = ReplicatedLinear(kv_dim, embed_dim, bias=False)
-        else:
-            # Maintain the same return value with ReplicatedLinear.forward
-            self.kv_proj = lambda *args, **kwargs: (
-                nn.Identity()(*args, **kwargs),
-                None,
-            )
-
+        self.kv_proj = ReplicatedLinear(kv_dim, embed_dim, bias=False)
         self.attn = nn.MultiheadAttention(embed_dim, num_heads)
         self.ln_q = norm_layer(embed_dim)
         self.ln_kv = norm_layer(embed_dim)
@@ -261,7 +252,6 @@ def __init__(
                          norm_layer)
 
         self.adaptive = adaptive
-
         pos_embed_arr = get_2d_sincos_pos_embed(embed_dim,
                                                 grid_size,
                                                 version=(2, 0))
@@ -717,7 +707,7 @@ def is_default_weight_loading(self, name: str) -> bool:
         raise NotImplementedError
 
 
-class MiniCPMV2(MiniCPMVBaseModel):
+class MiniCPMV2_0(MiniCPMVBaseModel):
 
     def __init__(
         self,
@@ -890,10 +880,7 @@ def is_default_weight_loading(self, name: str) -> bool:
         return "resampler" in name
 
 
-# NOTE: Currently, information about this model is unavailable. We are
-# temporarily using `MiniCPMVQwen2` as it's name. The name may need
-# to be modified in the future.
-class MiniCPMVQwen2(MiniCPMVBaseModel):
+class MiniCPMV2_6(MiniCPMVBaseModel):
 
     def __init__(
         self,
@@ -903,6 +890,7 @@ def __init__(
         quant_config: Optional[QuantizationConfig] = None,
    ):
         super().__init__(config, multimodal_config, cache_config, quant_config)
+        assert self.version == (2, 6)
 
     def init_llm(
         self,
@@ -930,6 +918,7 @@ def init_vision_module(self) -> nn.Module:
 
     def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module:
         with set_default_torch_dtype(torch.float16):
+            # The resampler in 2.6 remains consistent with the one in 2.5.
             resampler = Resampler2_5(
                 num_queries=self.config.query_num,
                 embed_dim=embed_dim,
@@ -989,6 +978,13 @@ def is_default_weight_loading(self, name: str) -> bool:
         return "resampler" in name or "vpm" in name
 
 
+_SUPPORT_VERSION = {
+    (2, 0): MiniCPMV2_0,
+    (2, 5): MiniCPMV2_5,
+    (2, 6): MiniCPMV2_6
+}
+
+
 @MULTIMODAL_REGISTRY.register_image_input_mapper()
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_minicpmv_image_tokens)
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_minicpmv)
@@ -1007,20 +1003,11 @@ def __new__(
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
-        if not hasattr(config, "version"):
-            if config.hidden_size == 2304 and config.query_num == 64:
-                version = (2, 0)
-            else:
-                version = (2, 5)
-        else:
-            version = str(config.version).split(".")
-            version = tuple([int(x) for x in version])
+        version = str(config.version).split(".")
+        version = tuple([int(x) for x in version])
         # Dispatch class based on version
-        if version == (2, 0):
-            instance_class = MiniCPMV2
-        elif version == (2, 5):
-            instance_class = MiniCPMV2_5
-        else:
-            instance_class = MiniCPMVQwen2
+        instance_class = _SUPPORT_VERSION.get(version, None)
+        if instance_class is None:
+            raise NotImplementedError
         return instance_class(config, multimodal_config, cache_config,
                               quant_config)

From 69fb396058b958a2174ef0ede57a0376706512d5 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Thu, 8 Aug 2024 00:49:12 +0800
Subject: [PATCH 2/4] fix bug

---
 docs/source/models/supported_models.rst |  2 +-
 vllm/model_executor/models/minicpmv.py  | 13 ++++++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index fd5d154006ae7..6fa81e8863073 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -222,7 +222,7 @@ Vision Language Models
     -
   * - :code:`MiniCPMV`
     - MiniCPM-V
-    - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc.
+    - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc.
     -
 
 .. note::
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index e6c1a21bc8c88..b55843924eb11 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -1003,11 +1003,18 @@ def __new__(
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
-        version = str(config.version).split(".")
-        version = tuple([int(x) for x in version])
+        if not hasattr(config, "version"):
+            if config.hidden_size == 2304 and config.query_num == 64:
+                version = (2, 0)
+            else:
+                version = (2, 5)
+        else:
+            version = str(config.version).split(".")
+            version = tuple([int(x) for x in version])
         # Dispatch class based on version
         instance_class = _SUPPORT_VERSION.get(version, None)
         if instance_class is None:
-            raise NotImplementedError
+            raise ValueError(
+                "Currently, MiniCPMV only supports versions 2.0, 2.5, and 2.6")
         return instance_class(config, multimodal_config, cache_config,
                               quant_config)

From 1a4bf1b96e611252112d1659f3299b421df08bb9 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Thu, 8 Aug 2024 10:36:53 +0800
Subject: [PATCH 3/4] Revert resampler's kv_proj logic

---
 vllm/model_executor/models/minicpmv.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index b55843924eb11..fc962434cab0b 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -216,7 +216,14 @@ def __init__(
 
         self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
         trunc_normal_(self.query, std=0.02)
-        self.kv_proj = ReplicatedLinear(kv_dim, embed_dim, bias=False)
+        if kv_dim is not None and kv_dim != embed_dim:
+            self.kv_proj = ReplicatedLinear(kv_dim, embed_dim, bias=False)
+        else:
+            # Maintain the same return value with ReplicatedLinear.forward
+            self.kv_proj = lambda *args, **kwargs: (
+                nn.Identity()(*args, **kwargs),
+                None,
+            )
         self.attn = nn.MultiheadAttention(embed_dim, num_heads)
         self.ln_q = norm_layer(embed_dim)
         self.ln_kv = norm_layer(embed_dim)

From 25aea1ca029bd38ff891ec4ab1737c187f272c94 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Thu, 8 Aug 2024 14:16:02 +0800
Subject: [PATCH 4/4] Modify VL example

---
 examples/offline_inference_vision_language.py | 51 +++++++++++++------
 1 file changed, 35 insertions(+), 16 deletions(-)

diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index ce9dc9e457c09..ea607fc2a1e51 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -22,8 +22,8 @@ def run_llava(question):
     prompt = f"USER: <image>\n{question}\nASSISTANT:"
 
     llm = LLM(model="llava-hf/llava-1.5-7b-hf")
-
-    return llm, prompt
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
 
 # LLaVA-1.6/LLaVA-NeXT
@@ -31,8 +31,8 @@ def run_llava_next(question):
     prompt = f"[INST] <image>\n{question} [/INST]"
 
     llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")
-
-    return llm, prompt
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
 
 # Fuyu
@@ -40,8 +40,8 @@ def run_fuyu(question):
     prompt = f"{question}\n"
 
     llm = LLM(model="adept/fuyu-8b")
-
-    return llm, prompt
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
 
 # Phi-3-Vision
@@ -59,7 +59,8 @@ def run_phi3v(question):
         trust_remote_code=True,
         max_num_seqs=5,
     )
-    return llm, prompt
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
 
 # PaliGemma
@@ -68,8 +69,8 @@ def run_paligemma(question):
     # PaliGemma has special prompt format for VQA
     prompt = "caption en"
     llm = LLM(model="google/paligemma-3b-mix-224")
-
-    return llm, prompt
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
 
 # Chameleon
@@ -77,7 +78,8 @@ def run_chameleon(question):
 
     prompt = f"{question}<image>"
     llm = LLM(model="facebook/chameleon-7b")
-    return llm, prompt
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
 
 # MiniCPM-V
@@ -89,13 +91,26 @@ def run_minicpmv(question):
     # model_name = "HwwwH/MiniCPM-V-2"
 
     # 2.5
-    model_name = "openbmb/MiniCPM-Llama3-V-2_5"
+    # model_name = "openbmb/MiniCPM-Llama3-V-2_5"
+
+    # 2.6
+    model_name = "openbmb/MiniCPM-V-2_6"
     tokenizer = AutoTokenizer.from_pretrained(model_name,
                                               trust_remote_code=True)
     llm = LLM(
         model=model_name,
         trust_remote_code=True,
     )
+    # NOTE: The stop_token_ids are different for various versions of MiniCPM-V
+    # 2.0
+    # stop_token_ids = [tokenizer.eos_id]
+
+    # 2.5
+    # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
+
+    # 2.6
+    stop_tokens = ['<|im_end|>', '<|endoftext|>']
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
 
     messages = [{
         'role': 'user',
         'content': f'(<image>./</image>)\n{question}'
     }]
     prompt = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)
-    return llm, prompt
+    return llm, prompt, stop_token_ids
 
 
 # InternVL
@@ -118,7 +133,8 @@ def run_internvl(question):
         trust_remote_code=True,
         max_num_seqs=5,
     )
-    return llm, prompt
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
 
 # BLIP-2
@@ -128,7 +144,8 @@ def run_blip2(question):
     # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
     prompt = f"Question: {question} Answer:"
     llm = LLM(model="Salesforce/blip2-opt-2.7b")
-    return llm, prompt
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
 
 model_example_map = {
@@ -149,11 +166,13 @@ def main(args):
     if model not in model_example_map:
         raise ValueError(f"Model type {model} is not supported.")
 
-    llm, prompt = model_example_map[model](question)
+    llm, prompt, stop_token_ids = model_example_map[model](question)
 
     # We set temperature to 0.2 so that outputs can be different
     # even when all prompts are identical when running batch inference.
-    sampling_params = SamplingParams(temperature=0.2, max_tokens=64)
+    sampling_params = SamplingParams(temperature=0.2,
+                                     max_tokens=64,
+                                     stop_token_ids=stop_token_ids)
 
     assert args.num_prompts > 0
     if args.num_prompts == 1:
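
Taken together, PATCH 1 and PATCH 2 leave MiniCPMV.__new__ resolving a checkpoint
to a concrete model class through the _SUPPORT_VERSION table, with a fallback for
MiniCPM-V-2 checkpoints whose config carries no `version` field. A minimal,
self-contained sketch of the combined resolution logic follows; the stub classes
are placeholders, not the real vLLM implementations.

class MiniCPMV2_0:  # stubs standing in for the real vLLM model classes
    pass


class MiniCPMV2_5:
    pass


class MiniCPMV2_6:
    pass


_SUPPORT_VERSION = {
    (2, 0): MiniCPMV2_0,
    (2, 5): MiniCPMV2_5,
    (2, 6): MiniCPMV2_6,
}


def resolve_minicpmv_class(config) -> type:
    # Old MiniCPM-V-2 checkpoints ship without a `version` field, so 2.0 is
    # identified by its hidden size and query count; any other config without
    # the field is treated as 2.5.
    if not hasattr(config, "version"):
        if config.hidden_size == 2304 and config.query_num == 64:
            version = (2, 0)
        else:
            version = (2, 5)
    else:
        version = tuple(int(x) for x in str(config.version).split("."))
    instance_class = _SUPPORT_VERSION.get(version)
    if instance_class is None:
        raise ValueError(
            "Currently, MiniCPMV only supports versions 2.0, 2.5, and 2.6")
    return instance_class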
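
For trying PATCH 4 end to end, a sketch of offline inference with MiniCPM-V 2.6
using the stop tokens the updated run_minicpmv resolves. The image path is a
placeholder, the question text is illustrative, and the multi-modal input dict
follows the format this example script already uses for its other models; the
image placeholder string follows MiniCPM-V's chat format.

from PIL import Image
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

model_name = "openbmb/MiniCPM-V-2_6"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
llm = LLM(model=model_name, trust_remote_code=True)

# 2.6 stops on <|im_end|> / <|endoftext|>; 2.0 and 2.5 use different ids, which
# is why run_minicpmv now returns stop_token_ids alongside the prompt.
stop_tokens = ['<|im_end|>', '<|endoftext|>']
stop_token_ids = [tokenizer.convert_tokens_to_ids(t) for t in stop_tokens]

messages = [{
    'role': 'user',
    'content': '(<image>./</image>)\nWhat is in this image?'
}]
prompt = tokenizer.apply_chat_template(messages,
                                       tokenize=False,
                                       add_generation_prompt=True)

outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {
            "image": Image.open("example.jpg")  # placeholder image path
        },
    },
    sampling_params=SamplingParams(temperature=0.2,
                                   max_tokens=64,
                                   stop_token_ids=stop_token_ids))
for o in outputs:
    print(o.outputs[0].text)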