open-compass · kennymckormick · Dec 23, 2023 · Dec 23, 2023 · Dec 23, 2023 · Dec 23, 2023
diff --git a/results/MME.md b/results/MME.md
@@ -15,16 +15,16 @@ VLMs are sorted by the descending order of Total score.
 | qwen_chat            | 1849 / 1860 | 1457 / 1468 |       392 |
 | sharegpt4v_7b        | 1799 / 1808 |        1491 | 308 / 318 |
 | llava_v1.5_13b       | 1800 / 1805 | 1485 / 1490 |       315 |
-| llava_v1.5_7b        |        1776 |        1490 |       285 |
-| mPLUG-Owl2           | 1733 / 1735 | 1407 / 1409 |       326 |
+| mPLUG-Owl2           | 1781 / 1786 | 1435 / 1436 | 346 / 350 |
+| llava_v1.5_7b        |        1775 |        1490 |       285 |
 | TransCore_M          | 1682 / 1701 | 1427 / 1429 | 254 / 272 |
 | instructblip_13b     | 1624 / 1646 | 1381 / 1383 | 243 / 263 |
-| idefics_80b_instruct | 1508 / 1518 | 1276 / 1285 | 231 / 234 |
+| idefics_80b_instruct | 1507 / 1519 | 1276 / 1285 | 231 / 234 |
 | instructblip_7b      | 1313 / 1391 | 1084 / 1137 | 229 / 254 |
 | idefics_9b_instruct  |        1177 |         942 |       235 |
 | PandaGPT_13B         |        1072 |         826 |       246 |
 | MiniGPT-4-v1-13B     |  648 / 1067 |   533 / 794 | 115 / 273 |
-| MiniGPT-4-v1-7B      |  806 / 1047 |   622 / 771 | 184 / 277 |
+| MiniGPT-4-v1-7B      |  806 / 1048 |   622 / 771 | 184 / 277 |
 | llava_v1_7b          | 1027 / 1044 |   793 / 807 | 234 / 238 |
 | MiniGPT-4-v2         |         968 |         708 |       260 |
 | VisualGLM_6b         |         738 |         628 |       110 |
@@ -38,6 +38,6 @@ For most VLMs, using ChatGPT as the answer extractor or not may not significantl
 | MME Score Improvement with ChatGPT Answer Extractor | Models                                                       |
 | --------------------------------------------------- | ------------------------------------------------------------ |
 | **No (0)**                                          | XComposer, llava_v1.5_7b, idefics_9b_instruct, PandaGPT_13B, MiniGPT-4-v2, VisualGLM_6b, flamingov2 |
-| **Minor (1~20)**                                    | qwen_chat (11), llava_v1.5_13b (5), mPLUG-Owl2 (2), idefics_80b_instruct (10), llava_v1_7b (17), sharegpt4v_7b (9), TransCore_M (19) |
+| **Minor (1~20)**                                    | qwen_chat (11), llava_v1.5_13b (5), mPLUG-Owl2 (5), idefics_80b_instruct (12), llava_v1_7b (17), sharegpt4v_7b (9), TransCore_M (19) |
 | **Moderate (21~100)**                               | instructblip_13b (22), instructblip_7b (78)                  |
-| **Huge (> 100)**                                    | MiniGPT-4-v1-7B (241), MiniGPT-4-v1-13B (419), qwen_base (477) |
+| **Huge (> 100)**                                    | MiniGPT-4-v1-7B (242), MiniGPT-4-v1-13B (419), qwen_base (477) |
diff --git a/results/MMVet.md b/results/MMVet.md
@@ -10,6 +10,7 @@
 | qwen_chat                 |    47.3 |                                                          N/A | 37.2 | 22.3 | 42.8 | 52.5 | 45.4 | 40.3 |
 | idefics_80b_instruct      |    39.7 |                                                          N/A | 29.9 |   15 | 30.7 | 45.6 | 38.6 | 37.1 |
 | llava_v1.5_13b            |    38.3 |                                                     36.3±0.2 | 28.8 | 11.5 | 31.5 |   42 | 23.1 |   23 |
+| mPLUG-Owl2                |    35.7 |                                                     36.3±0.1 | 29.5 |  7.7 | 32.1 | 47.3 | 23.8 | 20.9 |
 | XComposer                 |    35.2 |                                                          N/A | 21.8 |  3.8 | 24.7 | 43.1 | 28.9 | 27.5 |
 | sharegpt4v_7b             |    34.7 |                                                         37.6 | 30.2 | 18.5 |   30 | 36.1 | 20.2 | 18.1 |
 | TransCore_M               |    33.9 |                                                          N/A | 27.3 | 15.4 | 32.7 | 36.7 |   23 | 23.5 |
@@ -18,7 +19,6 @@
 | instructblip_13b          |    30.1 |                                                     25.6±0.3 | 25.4 | 11.2 | 26.9 | 33.4 |   19 | 18.2 |
 | idefics_9b_instruct       |      30 |                                                          N/A | 21.7 | 11.5 | 22.4 | 34.6 | 27.4 | 26.9 |
 | llava_v1_7b (vicuna-v1.1) |    27.4 |                                                     23.8±0.6 |   19 | 11.5 | 25.6 | 31.4 | 18.1 | 16.2 |
-| mPLUG-Owl2                |    24.1 |                                                     36.3±0.1 | 16.1 |  7.3 | 16.5 | 27.7 |    9 |  6.9 |
 | flamingov2                |    23.3 |                                                     24.8±0.2 | 19.5 |  7.7 | 21.7 | 24.7 | 21.7 |   19 |
 | PandaGPT_13B              |    19.6 |                                                          N/A |  6.8 |  6.5 | 16.5 | 26.3 | 13.7 | 13.9 |
 | MiniGPT-4-v1-13B          |    16.9 |                                                     24.4±0.4 | 10.3 |  7.7 | 12.5 | 19.9 | 14.9 | 13.8 |

diff --git a/vlmeval/eval/mmvet_eval.py b/vlmeval/eval/mmvet_eval.py
@@ -88,7 +88,9 @@ def MMVet_acc(result_file):
 def MMVet_eval(eval_file, model='gpt-4-turbo', nproc=4, verbose=False):
     logger = get_logger('Evaluation')
 
-    storage = eval_file.replace('.xlsx', f'_{model}.xlsx')
+    suffix = eval_file.split('.')[-1]
+    storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
+    tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
     if osp.exists(storage):
         logger.warning(f"GPT scoring file {storage} already exists, will reuse it in MMVet_eval. ")
     else:
@@ -114,12 +116,26 @@ def MMVet_eval(eval_file, model='gpt-4-turbo', nproc=4, verbose=False):
         tups = [(model, line) for line in lines]
         indices = [line['index'] for line in lines]
 
-        res = track_progress_rich(MMVet_auxeval, tups, nproc=nproc, chunksize=nproc)
-
+        ans = {}
+        if osp.exists(tmp_file):
+            ans = load(tmp_file)
+        tups = [x for x, i in zip(tups, indices) if i not in ans]
+        indices = [i for i in indices if i not in ans]
+
+        if len(indices):
+            new_results = track_progress_rich(
+                MMVet_auxeval, tups, nproc=nproc, chunksize=nproc,
+                keys=indices, save=tmp_file)
+            ans = load(tmp_file)
+            for k, v in zip(indices, new_results):
+                assert k in ans 
+                assert ans[k]['log'] == v['log'] and ans[k]['score'] == v['score']
+
         log_map, score_map = {}, {}
-        for k, v in zip(indices, res):
-            log_map[k] = v['log']
-            score_map[k] = v['score']
+        all_inds = [line['index'] for line in lines]
+        for k in all_inds:
+            log_map[k] = ans[k]['log']
+            score_map[k] = ans[k]['score']
         data['score'] = [score_map[idx] for idx in data['index']]
         data['log'] = [log_map[idx] for idx in data['index']]
         dump(data, storage)

diff --git a/vlmeval/inference.py b/vlmeval/inference.py
@@ -94,7 +94,7 @@ def infer_data(model_name, dataset_name, out_file, verbose=False, api_nproc=4):
         if idx in res:
             continue
 
-        if hasattr(model, 'build_prompt'):
+        if hasattr(model, 'use_custom_prompt') and model.use_custom_prompt(dataset_name):
             struct = model.build_prompt(data.iloc[i], dataset=dataset_name)
         else:
             struct = dataset.build_prompt(data.iloc[i])
@@ -183,7 +183,7 @@ def infer_data_job(model, model_name, dataset_name, verbose=False, api_nproc=4):
         data = load(result_file)
         failed_set = []
         for idx, pred in zip(data['index'], data['prediction']):
-            if FAIL_MSG in pred:
+            if FAIL_MSG in str(pred):
                 failed_set.append(idx)
         if len(failed_set):
             print(f'{len(failed_set)} records failed in the original result file {result_file}. ')

diff --git a/vlmeval/vlm/llava.py b/vlmeval/vlm/llava.py
@@ -4,9 +4,10 @@
 import os
 import os.path as osp
 from ..smp import *
+from .utils import CustomPrompt
 from ..utils import DATASET_TYPE
 
-class LLaVA:
+class LLaVA(CustomPrompt):
 
     INSTALL_REQ = True
 
@@ -50,46 +51,34 @@ def __init__(self,
         self.kwargs = kwargs_default
         warnings.warn(f"Following kwargs received: {self.kwargs}, will use as generation config. ")
 
+    def use_custom_prompt(self, dataset):
+        assert dataset is not None
+        if DATASET_TYPE(dataset) == 'multi-choice':
+            return True
+        return False
+
     def build_prompt(self, line, dataset=None):
-        from ..utils import img_root_map
+        assert self.use_custom_prompt(dataset)
         assert dataset is None or isinstance(dataset, str)
-        img_root = osp.join('images', img_root_map[dataset])
-        os.makedirs(img_root, exist_ok=True)
+        tgt_path = self.dump_image(line, dataset)
 
-        if isinstance(line['image'], list):
-            tgt_path = []
-            for img, im_name in zip(line['image'], line['image_path']):
-                path = osp.join(img_root, im_name)
-                if not read_ok(path):
-                    decode_base64_to_image_file(img, path)
-                tgt_path.append(path)
-        else:
-            tgt_path = osp.join(img_root, f"{line['index']}.jpg")
-            if not read_ok(tgt_path):
-                decode_base64_to_image_file(line['image'], tgt_path)
-
-        if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice':
-            question = line['question']
-            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
-            if hint is not None:
-                question + hint + '\n' + question
+        question = line['question']
+        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
+        if hint is not None:
+            question = hint + '\n' + question  # prepend the hint; a bare expression here would discard it
 
-            option_candidate = ['A', 'B', 'C', 'D', 'E']
-            options = {
-                cand: line[cand]
-                for cand in option_candidate
-                if cand in line and not pd.isna(line[cand])
-            }
-            for key, item in options.items():
-                question += f'\n{key}. {item}'
-            prompt = question
+        options = {
+            cand: line[cand]
+            for cand in string.ascii_uppercase
+            if cand in line and not pd.isna(line[cand])
+        }
+        for key, item in options.items():
+            question += f'\n{key}. {item}'
 
-            if not cn_string(prompt):
-                prompt = prompt + "\n" + "Answer with the option's letter from the given choices directly."
-            else:
-                prompt = prompt + "\n" + "请直接回答选项字母。"
+        if not cn_string(question):
+            prompt = question + "\n" + "Answer with the option's letter from the given choices directly."
         else:
-            prompt = line['question']
+            prompt = question + "\n" + "请直接回答选项字母。"
 
         return {'image': tgt_path, 'text': prompt}
 

diff --git a/vlmeval/vlm/mplug_owl2.py b/vlmeval/vlm/mplug_owl2.py
@@ -1,10 +1,11 @@
 import os, torch
 from PIL import Image
 from ..smp import *
+from .utils import CustomPrompt
 from ..utils import DATASET_TYPE
 
 
-class mPLUG_Owl2:
+class mPLUG_Owl2(CustomPrompt):
 
     INSTALL_REQ = True
 
@@ -34,46 +35,39 @@ def __init__(self, model_path='MAGAer13/mplug-owl2-llama2-7b', **kwargs):
         self.kwargs = kwargs_default
         warnings.warn(f"Following kwargs received: {self.kwargs}, will use as generation config. ")
 
+    def use_custom_prompt(self, dataset):
+        assert dataset is not None
+        if DATASET_TYPE(dataset) == 'multi-choice' or dataset == 'MMVet':
+            return True
+        return False
+
     def build_prompt(self, line, dataset=None):
-        from ..utils import img_root_map
         assert dataset is None or isinstance(dataset, str)
-        img_root = osp.join('images', img_root_map[dataset])
-
-        os.makedirs(img_root, exist_ok=True)
-        prompt_tmpl = "USER: <|image|>{}\n{}\n{}\nAnswer with the option’s letter from the given choices directly. ASSISTANT:"
-
-        if isinstance(line['image'], list):
-            tgt_path = []
-            for img, im_name in zip(line['image'], line['image_path']):
-                path = osp.join(img_root, im_name)
-                if not read_ok(path):
-                    decode_base64_to_image_file(img, path)
-                tgt_path.append(path)
-        else:
-            tgt_path = osp.join(img_root, f"{line['index']}.jpg")
-            if not read_ok(tgt_path):
-                decode_base64_to_image_file(line['image'], tgt_path)
-
-        if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice':
-            question = line['question']
-            option_candidate = ['A', 'B', 'C', 'D', 'E']
+        assert self.use_custom_prompt(dataset)
+        tgt_path = self.dump_image(line, dataset)
+
+        if dataset == 'MMVet':
+            prompt_tmpl = "USER: <|image|>{}\nAnswer the question directly. ASSISTANT:"
+            prompt = prompt_tmpl.format(line['question'])
+        elif DATASET_TYPE(dataset) == 'multi-choice':
+            prompt_tmpl = "USER: <|image|>{}\n{}\n{}\nAnswer with the option’s letter from the given choices directly. ASSISTANT:"
             options = {
                 cand: line[cand]
-                for cand in option_candidate
+                for cand in string.ascii_uppercase
                 if cand in line and not pd.isna(line[cand])
             }
             options_prompt = ''
             for key, item in options.items():
                 options_prompt += f'{key}. {item}\n'
 
             hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else 'N/A'
-            prompt = prompt_tmpl.format(hint, question, options_prompt)
+            prompt = prompt_tmpl.format(hint, line['question'], options_prompt)
         else:
-            prompt = line['question']
+            raise NotImplementedError
 
         return {'image': tgt_path, 'text': prompt}
 
-    def vanilla_generate(self, image_path, prompt):
+    def generate_vanilla(self, image_path, prompt):
         from mplug_owl2.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
         from mplug_owl2.conversation import conv_templates
         from mplug_owl2.mm_utils import process_images, tokenizer_image_token, KeywordsStoppingCriteria
@@ -106,7 +100,7 @@ def vanilla_generate(self, image_path, prompt):
         outputs = self.tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
         return outputs.split('</s>')[0]
 
-    def mmbench_generate(self, image_path, prompt):
+    def generate_multichoice(self, image_path, prompt):
         from mplug_owl2.constants import IMAGE_TOKEN_INDEX
         from mplug_owl2.mm_utils import process_images, tokenizer_image_token
         image = Image.open(image_path).convert('RGB')
@@ -126,12 +120,39 @@ def mmbench_generate(self, image_path, prompt):
                 **self.kwargs)
         answer = self.tokenizer.decode(output_ids[0, input_ids.shape[1]: ]).strip()
         return answer.split('</s>')[0]
+
+    def generate_mmvet(self, image_path, prompt):
+        from mplug_owl2.constants import IMAGE_TOKEN_INDEX
+        from mplug_owl2.mm_utils import process_images, tokenizer_image_token
+        image = Image.open(image_path).convert('RGB')
+        max_edge = max(image.size) # We recommend resizing to a square image for BEST performance.
+        image = image.resize((max_edge, max_edge))
+
+        image_tensor = process_images([image], self.image_processor)
+        image_tensor = image_tensor.to(self.device, dtype=torch.float16)
+
+        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
+        kwargs = cp.deepcopy(self.kwargs)
+        kwargs['max_new_tokens'] = 64
+        kwargs['length_penalty'] = 0
+
+        with torch.inference_mode():
+            output_ids = self.model.generate(
+                input_ids=input_ids, 
+                images=image_tensor, 
+                output_hidden_states=True, 
+                use_cache=True, 
+                **kwargs)
+        answer = self.tokenizer.decode(output_ids[0, input_ids.shape[1]: ]).strip()
+        return answer.split('</s>')[0]
 
     def generate(self, image_path, prompt, dataset=None):
         if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice':
-            return self.mmbench_generate(image_path, prompt)
+            return self.generate_multichoice(image_path, prompt)
+        elif dataset == 'MMVet':
+            return self.generate_mmvet(image_path, prompt)
         else:
-            return self.vanilla_generate(image_path, prompt)
+            return self.generate_vanilla(image_path, prompt)
 
     def multi_generate(self, image_paths, prompt, dataset=None):
         from mplug_owl2.constants import IMAGE_TOKEN_INDEX