From 133a17f0669d604376ae1339a81def9577c46386 Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Sat, 23 Dec 2023 17:17:33 +0800 Subject: [PATCH 01/11] refactor custom_prompt --- vlmeval/vlm/llava.py | 59 ++++++++++-------------- vlmeval/vlm/mplug_owl2.py | 62 +++++++++++-------------- vlmeval/vlm/transcore_m.py | 71 ++++++++++++----------------- vlmeval/vlm/utils/__init__.py | 1 + vlmeval/vlm/utils/custom_prompt.py | 31 +++++++++++++ vlmeval/vlm/xcomposer.py | 73 +++++++++++++----------------- 6 files changed, 144 insertions(+), 153 deletions(-) create mode 100644 vlmeval/vlm/utils/__init__.py create mode 100644 vlmeval/vlm/utils/custom_prompt.py diff --git a/vlmeval/vlm/llava.py b/vlmeval/vlm/llava.py index e8fafef5d..a39e90f5c 100644 --- a/vlmeval/vlm/llava.py +++ b/vlmeval/vlm/llava.py @@ -4,9 +4,10 @@ import os import os.path as osp from ..smp import * +from .utils import CustomPrompt from ..utils import DATASET_TYPE -class LLaVA: +class LLaVA(CustomPrompt): INSTALL_REQ = True @@ -50,46 +51,34 @@ def __init__(self, self.kwargs = kwargs_default warnings.warn(f"Following kwargs received: {self.kwargs}, will use as generation config. ") + def use_custom_prompt(self, dataset): + assert dataset is not None + if DATASET_TYPE(dataset) == 'multi-choice': + return True + return False + def build_prompt(self, line, dataset=None): - from ..utils import img_root_map + assert self.use_custom_prompt(dataset) assert dataset is None or isinstance(dataset, str) - img_root = osp.join('images', img_root_map[dataset]) - os.makedirs(img_root, exist_ok=True) + tgt_path = self.dump_image(line, dataset) - if isinstance(line['image'], list): - tgt_path = [] - for img, im_name in zip(line['image'], line['image_path']): - path = osp.join(img_root, im_name) - if not read_ok(path): - decode_base64_to_image_file(img, path) - tgt_path.append(path) - else: - tgt_path = osp.join(img_root, f"{line['index']}.jpg") - if not read_ok(tgt_path): - decode_base64_to_image_file(line['image'], tgt_path) - - if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice': - question = line['question'] - hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None - if hint is not None: - question + hint + '\n' + question + question = line['question'] + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + if hint is not None: + question + hint + '\n' + question - option_candidate = ['A', 'B', 'C', 'D', 'E'] - options = { - cand: line[cand] - for cand in option_candidate - if cand in line and not pd.isna(line[cand]) - } - for key, item in options.items(): - question += f'\n{key}. {item}' - prompt = question + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + for key, item in options.items(): + question += f'\n{key}. {item}' - if not cn_string(prompt): - prompt = prompt + "\n" + "Answer with the option's letter from the given choices directly." - else: - prompt = prompt + "\n" + "请直接回答选项字母。" + if not cn_string(question): + prompt = question + "\n" + "Answer with the option's letter from the given choices directly." else: - prompt = line['question'] + prompt = question + "\n" + "请直接回答选项字母。" return {'image': tgt_path, 'text': prompt} diff --git a/vlmeval/vlm/mplug_owl2.py b/vlmeval/vlm/mplug_owl2.py index 6847304b6..179036c09 100644 --- a/vlmeval/vlm/mplug_owl2.py +++ b/vlmeval/vlm/mplug_owl2.py @@ -1,10 +1,11 @@ import os, torch from PIL import Image from ..smp import * +from .utils import CustomPrompt from ..utils import DATASET_TYPE -class mPLUG_Owl2: +class mPLUG_Owl2(CustomPrompt): INSTALL_REQ = True @@ -34,46 +35,35 @@ def __init__(self, model_path='MAGAer13/mplug-owl2-llama2-7b', **kwargs): self.kwargs = kwargs_default warnings.warn(f"Following kwargs received: {self.kwargs}, will use as generation config. ") + def use_custom_prompt(self, dataset): + assert dataset is not None + if DATASET_TYPE(dataset) == 'multi-choice': + return True + return False + def build_prompt(self, line, dataset=None): - from ..utils import img_root_map assert dataset is None or isinstance(dataset, str) - img_root = osp.join('images', img_root_map[dataset]) + assert self.use_custom_prompt(dataset) + tgt_path = self.dump_image(line, dataset) - os.makedirs(img_root, exist_ok=True) prompt_tmpl = "USER: <|image|>{}\n{}\n{}\nAnswer with the option’s letter from the given choices directly. ASSISTANT:" - - if isinstance(line['image'], list): - tgt_path = [] - for img, im_name in zip(line['image'], line['image_path']): - path = osp.join(img_root, im_name) - if not read_ok(path): - decode_base64_to_image_file(img, path) - tgt_path.append(path) - else: - tgt_path = osp.join(img_root, f"{line['index']}.jpg") - if not read_ok(tgt_path): - decode_base64_to_image_file(line['image'], tgt_path) - if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice': - question = line['question'] - option_candidate = ['A', 'B', 'C', 'D', 'E'] - options = { - cand: line[cand] - for cand in option_candidate - if cand in line and not pd.isna(line[cand]) - } - options_prompt = '' - for key, item in options.items(): - options_prompt += f'{key}. {item}\n' - - hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else 'N/A' - prompt = prompt_tmpl.format(hint, question, options_prompt) - else: - prompt = line['question'] + question = line['question'] + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = '' + for key, item in options.items(): + options_prompt += f'{key}. {item}\n' + + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else 'N/A' + prompt = prompt_tmpl.format(hint, question, options_prompt) return {'image': tgt_path, 'text': prompt} - def vanilla_generate(self, image_path, prompt): + def generate_vanilla(self, image_path, prompt): from mplug_owl2.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN from mplug_owl2.conversation import conv_templates from mplug_owl2.mm_utils import process_images, tokenizer_image_token, KeywordsStoppingCriteria @@ -106,7 +96,7 @@ def vanilla_generate(self, image_path, prompt): outputs = self.tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip() return outputs.split('')[0] - def mmbench_generate(self, image_path, prompt): + def generate_multichoice(self, image_path, prompt): from mplug_owl2.constants import IMAGE_TOKEN_INDEX from mplug_owl2.mm_utils import process_images, tokenizer_image_token image = Image.open(image_path).convert('RGB') @@ -129,9 +119,9 @@ def mmbench_generate(self, image_path, prompt): def generate(self, image_path, prompt, dataset=None): if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice': - return self.mmbench_generate(image_path, prompt) + return self.generate_multichoice(image_path, prompt) else: - return self.vanilla_generate(image_path, prompt) + return self.generate_vanilla(image_path, prompt) def multi_generate(self, image_paths, prompt, dataset=None): from mplug_owl2.constants import IMAGE_TOKEN_INDEX diff --git a/vlmeval/vlm/transcore_m.py b/vlmeval/vlm/transcore_m.py index c01d11a5d..090b134a3 100644 --- a/vlmeval/vlm/transcore_m.py +++ b/vlmeval/vlm/transcore_m.py @@ -2,10 +2,12 @@ import sys import torch from abc import abstractproperty +import math from ..smp import * +from .utils import CustomPrompt from ..utils import DATASET_TYPE -class TransCoreM: +class TransCoreM(CustomPrompt): INSTALL_REQ = True @@ -44,7 +46,6 @@ def get_options(self,row, options): parsed_options.append(option_value) return parsed_options - def is_none(self,value): if value is None: return True @@ -55,48 +56,37 @@ def is_none(self,value): if type(value) is str and value.lower() == 'none': return True return False + + def use_custom_prompt(self, dataset): + assert dataset is not None + if DATASET_TYPE(dataset) == 'multi-choice': + return True + return False def build_prompt(self, line, dataset=None): - - from ..utils import img_root_map assert dataset is None or isinstance(dataset, str) - img_root = osp.join('images', img_root_map[dataset]) - os.makedirs(img_root, exist_ok=True) - - if isinstance(line['image'], list): - tgt_path = [] - for img, im_name in zip(line['image'], line['image_path']): - path = osp.join(img_root, im_name) - if not read_ok(path): - decode_base64_to_image_file(img, path) - tgt_path.append(path) - else: - tgt_path = osp.join(img_root, f"{line['index']}.jpg") - if not read_ok(tgt_path): - decode_base64_to_image_file(line['image'], tgt_path) - - if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice': - question = line['question'] - hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None - if hint is not None: - question = hint + '\n' + question - - option_candidate = ['A', 'B', 'C', 'D'] - options = { - cand: line[cand] - for cand in option_candidate - if cand in line and not pd.isna(line[cand]) - } - for key, item in options.items(): - question += f'\n{key}. {item}' - prompt = question - - if not cn_string(prompt): - prompt = prompt + "\n" + "Answer with the option's letter from the given choices directly." - else: - prompt = prompt + "\n" + "请直接回答选项字母。" + assert self.use_custom_prompt(dataset) + tgt_path = self.dump_image(line, dataset) + + question = line['question'] + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + if hint is not None: + question = hint + '\n' + question + + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + for key, item in options.items(): + question += f'\n{key}. {item}' + prompt = question + + if not cn_string(prompt): + prompt = prompt + "\n" + "Answer with the option's letter from the given choices directly." else: - prompt = line['question'] + prompt = prompt + "\n" + "请直接回答选项字母。" + return {'image': tgt_path, 'text': prompt} def generate(self, image_path, prompt, dataset=None): @@ -108,7 +98,6 @@ def generate(self, image_path, prompt, dataset=None): top_p=None num_beams=1 - image = Image.open(image_path).convert('RGB') args = abstractproperty() args.image_aspect_ratio = 'pad' diff --git a/vlmeval/vlm/utils/__init__.py b/vlmeval/vlm/utils/__init__.py new file mode 100644 index 000000000..f2d69523f --- /dev/null +++ b/vlmeval/vlm/utils/__init__.py @@ -0,0 +1 @@ +from .custom_prompt import CustomPrompt \ No newline at end of file diff --git a/vlmeval/vlm/utils/custom_prompt.py b/vlmeval/vlm/utils/custom_prompt.py new file mode 100644 index 000000000..fd045c65f --- /dev/null +++ b/vlmeval/vlm/utils/custom_prompt.py @@ -0,0 +1,31 @@ +from ...smp import * +from ...utils import img_root_map +from abc import abstractmethod + +class CustomPrompt: + + @abstractmethod + def use_custom_prompt(self, dataset): + raise NotImplementedError + + @abstractmethod + def build_prompt(self, line, dataset): + raise NotImplementedError + + def dump_image(self, line, dataset): + assert isinstance(dataset, str) + img_root = osp.join('images', img_root_map[dataset]) + os.makedirs(img_root, exist_ok=True) + if isinstance(line['image'], list): + tgt_path = [] + assert 'image_path' in line + for img, im_name in zip(line['image'], line['image_path']): + path = osp.join(img_root, im_name) + if not read_ok(path): + decode_base64_to_image_file(img, path) + tgt_path.append(path) + else: + tgt_path = osp.join(img_root, f"{line['index']}.jpg") + if not read_ok(tgt_path): + decode_base64_to_image_file(line['image'], tgt_path) + return tgt_path \ No newline at end of file diff --git a/vlmeval/vlm/xcomposer.py b/vlmeval/vlm/xcomposer.py index 014c6ea00..3a80c9675 100644 --- a/vlmeval/vlm/xcomposer.py +++ b/vlmeval/vlm/xcomposer.py @@ -4,6 +4,7 @@ from ..smp import * from transformers import StoppingCriteria, StoppingCriteriaList +from .utils import CustomPrompt from PIL import Image class StoppingCriteriaSub(StoppingCriteria): @@ -20,7 +21,7 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): from ..utils import DATASET_TYPE -class XComposer: +class XComposer(CustomPrompt): INSTALL_REQ = False @@ -39,10 +40,10 @@ def __init__(self, model_path='internlm/internlm-xcomposer-vl-7b'): ] self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)]) - def vanilla_generate(self, image_path, prompt): + def generate_vanilla(self, image_path, prompt): return self.model.generate(prompt, image_path) - def mmbench_generate(self, image_path, prompt): + def generate_multichoice(self, image_path, prompt): image = Image.open(image_path).convert("RGB") image = self.model.vis_processor(image).unsqueeze(0).to(self.device) img_embeds = self.model.encode_img(image) @@ -83,12 +84,12 @@ def mmbench_generate(self, image_path, prompt): def generate(self, image_path, prompt, dataset=None): if dataset is None: - return self.vanilla_generate(image_path, prompt) + return self.generate_vanilla(image_path, prompt) assert isinstance(dataset, str) if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice': - return self.mmbench_generate(image_path, prompt) + return self.generate_multichoice(image_path, prompt) else: - return self.vanilla_generate(image_path, prompt) + return self.generate_vanilla(image_path, prompt) def multi_generate(self, image_paths, prompt, dataset=None): img_embeds, img_prompt = [], '' @@ -137,43 +138,33 @@ def multi_generate(self, image_paths, prompt, dataset=None): output_text = output_text.split('<|Bot|>')[-1].strip() return output_text + def use_custom_prompt(self, dataset): + assert dataset is not None + if DATASET_TYPE(dataset) == 'multi-choice': + return True + return False + def build_prompt(self, line, dataset=None): - from ..utils import img_root_map assert dataset is None or isinstance(dataset, str) - img_root = osp.join('images', img_root_map[dataset]) - os.makedirs(img_root, exist_ok=True) + assert self.use_custom_prompt(dataset) + tgt_path = self.dump_image(line, dataset) - if isinstance(line['image'], list): - tgt_path = [] - for img, im_name in zip(line['image'], line['image_path']): - path = osp.join(img_root, im_name) - if not read_ok(path): - decode_base64_to_image_file(img, path) - tgt_path.append(path) - else: - tgt_path = osp.join(img_root, f"{line['index']}.jpg") - if not read_ok(tgt_path): - decode_base64_to_image_file(line['image'], tgt_path) + question = line['question'] + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = '' + for key, item in options.items(): + options_prompt += f'{key}. {item}\n' + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None - if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice': - question = line['question'] - option_candidate = ['A', 'B', 'C', 'D', 'E'] - options = { - cand: line[cand] - for cand in option_candidate - if cand in line and not pd.isna(line[cand]) - } - options_prompt = '' - for key, item in options.items(): - options_prompt += f'{key}. {item}\n' - hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None - - img_prompt = ' <|User|>:' - txt_prompt = 'Please answer this question by choosing the correct choice.' - context = 'N/A' if hint is None else hint - mid_prompt = 'Context: ' + context + '\nQuestion: ' + question + '\nOptions: ' + options_prompt - ans_prompt = ' <|Bot|>: Answer: The answer is' - prompt = img_prompt + txt_prompt + mid_prompt + '' + ans_prompt - else: - prompt = line['question'] + img_prompt = ' <|User|>:' + txt_prompt = 'Please answer this question by choosing the correct choice.' + context = 'N/A' if hint is None else hint + mid_prompt = 'Context: ' + context + '\nQuestion: ' + question + '\nOptions: ' + options_prompt + ans_prompt = ' <|Bot|>: Answer: The answer is' + prompt = img_prompt + txt_prompt + mid_prompt + '' + ans_prompt + return {'image': tgt_path, 'text': prompt} \ No newline at end of file From e3939d67dd4588cded31494d1b061893d807dcac Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Sat, 23 Dec 2023 17:22:33 +0800 Subject: [PATCH 02/11] update --- vlmeval/inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vlmeval/inference.py b/vlmeval/inference.py index 947159440..331a57b16 100644 --- a/vlmeval/inference.py +++ b/vlmeval/inference.py @@ -94,7 +94,7 @@ def infer_data(model_name, dataset_name, out_file, verbose=False, api_nproc=4): if idx in res: continue - if hasattr(model, 'build_prompt'): + if hasattr(model, 'use_custom_prompt') and model.use_custom_prompt(dataset_name): struct = model.build_prompt(data.iloc[i], dataset=dataset_name) else: struct = dataset.build_prompt(data.iloc[i]) From 76186aa52dbcaec6953af78cecdcf8f006b218ab Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Sat, 23 Dec 2023 17:56:50 +0800 Subject: [PATCH 03/11] update MME performance --- results/MME.md | 12 ++++++------ vlmeval/vlm/mplug_owl2.py | 12 ++++++++++-- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/results/MME.md b/results/MME.md index da0c96dcb..c685e60be 100644 --- a/results/MME.md +++ b/results/MME.md @@ -15,16 +15,16 @@ VLMs are sorted by the descending order of Total score. | qwen_chat | 1849 / 1860 | 1457 / 1468 | 392 | | sharegpt4v_7b | 1799 / 1808 | 1491 | 308 / 318 | | llava_v1.5_13b | 1800 / 1805 | 1485 / 1490 | 315 | -| llava_v1.5_7b | 1776 | 1490 | 285 | -| mPLUG-Owl2 | 1733 / 1735 | 1407 / 1409 | 326 | +| mPLUG-Owl2 | 1781 / 1786 | 1435 / 1436 | 346 / 350 | +| llava_v1.5_7b | 1775 | 1490 | 285 | | TransCore_M | 1682 / 1701 | 1427 / 1429 | 254 / 272 | | instructblip_13b | 1624 / 1646 | 1381 / 1383 | 243 / 263 | -| idefics_80b_instruct | 1508 / 1518 | 1276 / 1285 | 231 / 234 | +| idefics_80b_instruct | 1507 / 1519 | 1276 / 1285 | 231 / 234 | | instructblip_7b | 1313 / 1391 | 1084 / 1137 | 229 / 254 | | idefics_9b_instruct | 1177 | 942 | 235 | | PandaGPT_13B | 1072 | 826 | 246 | | MiniGPT-4-v1-13B | 648 / 1067 | 533 / 794 | 115 / 273 | -| MiniGPT-4-v1-7B | 806 / 1047 | 622 / 771 | 184 / 277 | +| MiniGPT-4-v1-7B | 806 / 1048 | 622 / 771 | 184 / 277 | | llava_v1_7b | 1027 / 1044 | 793 / 807 | 234 / 238 | | MiniGPT-4-v2 | 968 | 708 | 260 | | VisualGLM_6b | 738 | 628 | 110 | @@ -38,6 +38,6 @@ For most VLMs, using ChatGPT as the answer extractor or not may not significantl | MME Score Improvement with ChatGPT Answer Extractor | Models | | --------------------------------------------------- | ------------------------------------------------------------ | | **No (0)** | XComposer, llava_v1.5_7b, idefics_9b_instruct, PandaGPT_13B, MiniGPT-4-v2, VisualGLM_6b, flamingov2 | -| **Minor (1~20)** | qwen_chat (11), llava_v1.5_13b (5), mPLUG-Owl2 (2), idefics_80b_instruct (10), llava_v1_7b (17), sharegpt4v_7b (9), TransCore_M (19) | +| **Minor (1~20)** | qwen_chat (11), llava_v1.5_13b (5), mPLUG-Owl2 (5), idefics_80b_instruct (12), llava_v1_7b (17), sharegpt4v_7b (9), TransCore_M (19) | | **Moderate (21~100)** | instructblip_13b (22), instructblip_7b (78) | -| **Huge (> 100)** | MiniGPT-4-v1-7B (241), MiniGPT-4-v1-13B (419), qwen_base (477) | \ No newline at end of file +| **Huge (> 100)** | MiniGPT-4-v1-7B (242), MiniGPT-4-v1-13B (419), qwen_base (477) | \ No newline at end of file diff --git a/vlmeval/vlm/mplug_owl2.py b/vlmeval/vlm/mplug_owl2.py index 179036c09..64c235994 100644 --- a/vlmeval/vlm/mplug_owl2.py +++ b/vlmeval/vlm/mplug_owl2.py @@ -118,10 +118,18 @@ def generate_multichoice(self, image_path, prompt): return answer.split('')[0] def generate(self, image_path, prompt, dataset=None): + if dataset in ['MMVet']: + num_beams_old = self.kwargs.pop('num_beams') + self.kwargs['num_beams'] = 5 + + ret = None if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice': - return self.generate_multichoice(image_path, prompt) + ret = self.generate_multichoice(image_path, prompt) else: - return self.generate_vanilla(image_path, prompt) + ret = self.generate_vanilla(image_path, prompt) + + if dataset in ['MMVet']: + self.kwargs['num_beams'] = num_beams_old def multi_generate(self, image_paths, prompt, dataset=None): from mplug_owl2.constants import IMAGE_TOKEN_INDEX From 3e71f55cfb01db76aa752232db127ad693ea50a7 Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Sat, 23 Dec 2023 18:00:01 +0800 Subject: [PATCH 04/11] update --- vlmeval/vlm/mplug_owl2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vlmeval/vlm/mplug_owl2.py b/vlmeval/vlm/mplug_owl2.py index 64c235994..91be358f7 100644 --- a/vlmeval/vlm/mplug_owl2.py +++ b/vlmeval/vlm/mplug_owl2.py @@ -130,6 +130,7 @@ def generate(self, image_path, prompt, dataset=None): if dataset in ['MMVet']: self.kwargs['num_beams'] = num_beams_old + return ret def multi_generate(self, image_paths, prompt, dataset=None): from mplug_owl2.constants import IMAGE_TOKEN_INDEX From 6d22eab17b6467698483c8a5b0bed537fab625d8 Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Sat, 23 Dec 2023 18:09:51 +0800 Subject: [PATCH 05/11] update --- vlmeval/vlm/mplug_owl2.py | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/vlmeval/vlm/mplug_owl2.py b/vlmeval/vlm/mplug_owl2.py index 91be358f7..a6772ae07 100644 --- a/vlmeval/vlm/mplug_owl2.py +++ b/vlmeval/vlm/mplug_owl2.py @@ -116,21 +116,38 @@ def generate_multichoice(self, image_path, prompt): **self.kwargs) answer = self.tokenizer.decode(output_ids[0, input_ids.shape[1]: ]).strip() return answer.split('')[0] + + def generate_mmvet(self, image_path, prompt): + from mplug_owl2.constants import IMAGE_TOKEN_INDEX + from mplug_owl2.mm_utils import process_images, tokenizer_image_token + image = Image.open(image_path).convert('RGB') + max_edge = max(image.size) # We recommand you to resize to squared image for BEST performance. + image = image.resize((max_edge, max_edge)) + + image_tensor = process_images([image], self.image_processor) + image_tensor = image_tensor.to(self.device, dtype=torch.float16) + + input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device) + kwargs = cp.deepcopy(self.kwargs) + kwargs['num_beams'] = 5 + + with torch.inference_mode(): + output_ids = self.model.generate( + input_ids=input_ids, + images=image_tensor, + output_hidden_states=True, + use_cache=True, + **kwargs) + answer = self.tokenizer.decode(output_ids[0, input_ids.shape[1]: ]).strip() + return answer.split('')[0] def generate(self, image_path, prompt, dataset=None): - if dataset in ['MMVet']: - num_beams_old = self.kwargs.pop('num_beams') - self.kwargs['num_beams'] = 5 - - ret = None if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice': - ret = self.generate_multichoice(image_path, prompt) + return self.generate_multichoice(image_path, prompt) + elif dataset == 'MMVet': + return self.generate_mmvet(image_path, prompt) else: - ret = self.generate_vanilla(image_path, prompt) - - if dataset in ['MMVet']: - self.kwargs['num_beams'] = num_beams_old - return ret + return self.generate_vanilla(image_path, prompt) def multi_generate(self, image_paths, prompt, dataset=None): from mplug_owl2.constants import IMAGE_TOKEN_INDEX From c2764cc7e9ce0a22041005a4b3ca8f730c507982 Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Sat, 23 Dec 2023 18:12:00 +0800 Subject: [PATCH 06/11] update --- vlmeval/vlm/mplug_owl2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vlmeval/vlm/mplug_owl2.py b/vlmeval/vlm/mplug_owl2.py index a6772ae07..6ef6a55c5 100644 --- a/vlmeval/vlm/mplug_owl2.py +++ b/vlmeval/vlm/mplug_owl2.py @@ -130,6 +130,7 @@ def generate_mmvet(self, image_path, prompt): input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device) kwargs = cp.deepcopy(self.kwargs) kwargs['num_beams'] = 5 + kwargs['max_new_tokens'] = 64 with torch.inference_mode(): output_ids = self.model.generate( From f20dbf76d37fb1c6bdaeaafc71b7b98cd6beaf70 Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Sat, 23 Dec 2023 18:21:30 +0800 Subject: [PATCH 07/11] update --- vlmeval/vlm/mplug_owl2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vlmeval/vlm/mplug_owl2.py b/vlmeval/vlm/mplug_owl2.py index 6ef6a55c5..391e82259 100644 --- a/vlmeval/vlm/mplug_owl2.py +++ b/vlmeval/vlm/mplug_owl2.py @@ -131,6 +131,7 @@ def generate_mmvet(self, image_path, prompt): kwargs = cp.deepcopy(self.kwargs) kwargs['num_beams'] = 5 kwargs['max_new_tokens'] = 64 + kwargs['length_penalty'] = 0 with torch.inference_mode(): output_ids = self.model.generate( From 2b7e0452bc6a5b4452683d5da26a8e91b43394d0 Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Sat, 23 Dec 2023 20:31:44 +0800 Subject: [PATCH 08/11] update --- vlmeval/vlm/mplug_owl2.py | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/vlmeval/vlm/mplug_owl2.py b/vlmeval/vlm/mplug_owl2.py index 391e82259..b483e6261 100644 --- a/vlmeval/vlm/mplug_owl2.py +++ b/vlmeval/vlm/mplug_owl2.py @@ -37,7 +37,7 @@ def __init__(self, model_path='MAGAer13/mplug-owl2-llama2-7b', **kwargs): def use_custom_prompt(self, dataset): assert dataset is not None - if DATASET_TYPE(dataset) == 'multi-choice': + if DATASET_TYPE(dataset) == 'multi-choice' or dataset == 'MMVet': return True return False @@ -45,21 +45,25 @@ def build_prompt(self, line, dataset=None): assert dataset is None or isinstance(dataset, str) assert self.use_custom_prompt(dataset) tgt_path = self.dump_image(line, dataset) - - prompt_tmpl = "USER: <|image|>{}\n{}\n{}\nAnswer with the option’s letter from the given choices directly. ASSISTANT:" - - question = line['question'] - options = { - cand: line[cand] - for cand in string.ascii_uppercase - if cand in line and not pd.isna(line[cand]) - } - options_prompt = '' - for key, item in options.items(): - options_prompt += f'{key}. {item}\n' - - hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else 'N/A' - prompt = prompt_tmpl.format(hint, question, options_prompt) + + if dataset == 'MMVet': + prompt_tmpl = "USER: <|image|>{}\nAnswer the question directly. ASSISTANT:" + prompt = prompt_tmpl.format(line['question']) + elif DATASET_TYPE(dataset) == 'multi-choice': + prompt_tmpl = "USER: <|image|>{}\n{}\n{}\nAnswer with the option’s letter from the given choices directly. ASSISTANT:" + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = '' + for key, item in options.items(): + options_prompt += f'{key}. {item}\n' + + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else 'N/A' + prompt = prompt_tmpl.format(hint, line['question'], options_prompt) + else: + raise NotImplementedError return {'image': tgt_path, 'text': prompt} @@ -129,7 +133,6 @@ def generate_mmvet(self, image_path, prompt): input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device) kwargs = cp.deepcopy(self.kwargs) - kwargs['num_beams'] = 5 kwargs['max_new_tokens'] = 64 kwargs['length_penalty'] = 0 From 77d6e3adb1cc3fae90ee765eae484a335ce719ab Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Sat, 23 Dec 2023 20:33:56 +0800 Subject: [PATCH 09/11] update --- vlmeval/inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vlmeval/inference.py b/vlmeval/inference.py index 331a57b16..a3c2b2d18 100644 --- a/vlmeval/inference.py +++ b/vlmeval/inference.py @@ -183,7 +183,7 @@ def infer_data_job(model, model_name, dataset_name, verbose=False, api_nproc=4): data = load(result_file) failed_set = [] for idx, pred in zip(data['index'], data['prediction']): - if FAIL_MSG in pred: + if FAIL_MSG in str(pred): failed_set.append(idx) if len(failed_set): print(f'{len(failed_set)} records failed in the original result file {result_file}. ') From 9917e19830b6ece21bca604e70f61623e57fa759 Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Sat, 23 Dec 2023 20:55:32 +0800 Subject: [PATCH 10/11] update --- vlmeval/eval/mmvet_eval.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/vlmeval/eval/mmvet_eval.py b/vlmeval/eval/mmvet_eval.py index 254946e25..d754087f8 100644 --- a/vlmeval/eval/mmvet_eval.py +++ b/vlmeval/eval/mmvet_eval.py @@ -88,7 +88,9 @@ def MMVet_acc(result_file): def MMVet_eval(eval_file, model='gpt-4-turbo', nproc=4, verbose=False): logger = get_logger('Evaluation') - storage = eval_file.replace('.xlsx', f'_{model}.xlsx') + suffix = eval_file.split('.')[-1] + storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') + tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') if osp.exists(storage): logger.warning(f"GPT scoring file {storage} already exists, will reuse it in MMVet_eval. ") else: @@ -114,12 +116,26 @@ def MMVet_eval(eval_file, model='gpt-4-turbo', nproc=4, verbose=False): tups = [(model, line) for line in lines] indices = [line['index'] for line in lines] - res = track_progress_rich(MMVet_auxeval, tups, nproc=nproc, chunksize=nproc) - + ans = {} + if osp.exists(tmp_file): + ans = load(tmp_file) + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + new_results = track_progress_rich( + MMVet_auxeval, tups, nproc=nproc, chunksize=nproc, + keys=indices, save=tmp_file) + ans = load(tmp_file) + for k, v in zip(indices, new_results): + assert k in ans + assert ans[k]['log'] == v['log'] and ans[k]['score'] == v['score'] + log_map, score_map = {}, {} - for k, v in zip(indices, res): - log_map[k] = v['log'] - score_map[k] = v['score'] + all_inds = [line['index'] for line in lines] + for k in all_inds: + log_map[k] = ans[k]['log'] + score_map[k] = ans[k]['score'] data['score'] = [score_map[idx] for idx in data['index']] data['log'] = [log_map[idx] for idx in data['index']] dump(data, storage) From 724eefb40f04fe77f461bff4c2f8edc866b6e1cf Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Sat, 23 Dec 2023 21:03:01 +0800 Subject: [PATCH 11/11] update MMVet acc --- results/MMVet.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/results/MMVet.md b/results/MMVet.md index 2be8f74d0..5ec182025 100644 --- a/results/MMVet.md +++ b/results/MMVet.md @@ -10,6 +10,7 @@ | qwen_chat | 47.3 | N/A | 37.2 | 22.3 | 42.8 | 52.5 | 45.4 | 40.3 | | idefics_80b_instruct | 39.7 | N/A | 29.9 | 15 | 30.7 | 45.6 | 38.6 | 37.1 | | llava_v1.5_13b | 38.3 | 36.3±0.2 | 28.8 | 11.5 | 31.5 | 42 | 23.1 | 23 | +| mPLUG-Owl2 | 35.7 | 36.3±0.1 | 29.5 | 7.7 | 32.1 | 47.3 | 23.8 | 20.9 | | XComposer | 35.2 | N/A | 21.8 | 3.8 | 24.7 | 43.1 | 28.9 | 27.5 | | sharegpt4v_7b | 34.7 | 37.6 | 30.2 | 18.5 | 30 | 36.1 | 20.2 | 18.1 | | TransCore_M | 33.9 | N/A | 27.3 | 15.4 | 32.7 | 36.7 | 23 | 23.5 | @@ -18,7 +19,6 @@ | instructblip_13b | 30.1 | 25.6±0.3 | 25.4 | 11.2 | 26.9 | 33.4 | 19 | 18.2 | | idefics_9b_instruct | 30 | N/A | 21.7 | 11.5 | 22.4 | 34.6 | 27.4 | 26.9 | | llava_v1_7b (vicuna-v1.1) | 27.4 | 23.8±0.6 | 19 | 11.5 | 25.6 | 31.4 | 18.1 | 16.2 | -| mPLUG-Owl2 | 24.1 | 36.3±0.1 | 16.1 | 7.3 | 16.5 | 27.7 | 9 | 6.9 | | flamingov2 | 23.3 | 24.8±0.2 | 19.5 | 7.7 | 21.7 | 24.7 | 21.7 | 19 | | PandaGPT_13B | 19.6 | N/A | 6.8 | 6.5 | 16.5 | 26.3 | 13.7 | 13.9 | | MiniGPT-4-v1-13B | 16.9 | 24.4±0.4 | 10.3 | 7.7 | 12.5 | 19.9 | 14.9 | 13.8 |