From 133a17f0669d604376ae1339a81def9577c46386 Mon Sep 17 00:00:00 2001
From: kennymckormick <dhd@pku.edu.cn>
Date: Sat, 23 Dec 2023 17:17:33 +0800
Subject: [PATCH 01/11] refactor custom_prompt

---
 vlmeval/vlm/llava.py               | 59 ++++++++++--------------
 vlmeval/vlm/mplug_owl2.py          | 62 +++++++++++--------------
 vlmeval/vlm/transcore_m.py         | 71 ++++++++++++-----------------
 vlmeval/vlm/utils/__init__.py      |  1 +
 vlmeval/vlm/utils/custom_prompt.py | 31 +++++++++++++
 vlmeval/vlm/xcomposer.py           | 73 +++++++++++++-----------------
 6 files changed, 144 insertions(+), 153 deletions(-)
 create mode 100644 vlmeval/vlm/utils/__init__.py
 create mode 100644 vlmeval/vlm/utils/custom_prompt.py

diff --git a/vlmeval/vlm/llava.py b/vlmeval/vlm/llava.py
index e8fafef5d..a39e90f5c 100644
--- a/vlmeval/vlm/llava.py
+++ b/vlmeval/vlm/llava.py
@@ -4,9 +4,10 @@
 import os
 import os.path as osp
 from ..smp import *
+from .utils import CustomPrompt
 from ..utils import DATASET_TYPE
 
-class LLaVA:
+class LLaVA(CustomPrompt):
 
     INSTALL_REQ = True
 
@@ -50,46 +51,34 @@ def __init__(self,
         self.kwargs = kwargs_default
         warnings.warn(f"Following kwargs received: {self.kwargs}, will use as generation config. ")
 
+    def use_custom_prompt(self, dataset):
+        assert dataset is not None
+        if DATASET_TYPE(dataset) == 'multi-choice':
+            return True
+        return False
+    
     def build_prompt(self, line, dataset=None):
-        from ..utils import img_root_map
+        assert self.use_custom_prompt(dataset)
         assert dataset is None or isinstance(dataset, str)
-        img_root = osp.join('images', img_root_map[dataset])
-        os.makedirs(img_root, exist_ok=True)
+        tgt_path = self.dump_image(line, dataset)
 
-        if isinstance(line['image'], list):
-            tgt_path = []
-            for img, im_name in zip(line['image'], line['image_path']):
-                path = osp.join(img_root, im_name)
-                if not read_ok(path):
-                    decode_base64_to_image_file(img, path)
-                tgt_path.append(path)
-        else:
-            tgt_path = osp.join(img_root, f"{line['index']}.jpg")
-            if not read_ok(tgt_path):
-                decode_base64_to_image_file(line['image'], tgt_path)
-
-        if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice':
-            question = line['question']
-            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
-            if hint is not None:
-                question + hint + '\n' + question
+        question = line['question']
+        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
+        if hint is not None:
+            question = hint + '\n' + question  # prepend the hint; the bare expression discarded it
 
-            option_candidate = ['A', 'B', 'C', 'D', 'E']
-            options = {
-                cand: line[cand]
-                for cand in option_candidate
-                if cand in line and not pd.isna(line[cand])
-            }
-            for key, item in options.items():
-                question += f'\n{key}. {item}'
-            prompt = question
+        options = {
+            cand: line[cand]
+            for cand in string.ascii_uppercase
+            if cand in line and not pd.isna(line[cand])
+        }
+        for key, item in options.items():
+            question += f'\n{key}. {item}'
 
-            if not cn_string(prompt):
-                prompt = prompt + "\n" + "Answer with the option's letter from the given choices directly."
-            else:
-                prompt = prompt + "\n" + "请直接回答选项字母。"
+        if not cn_string(question):
+            prompt = question + "\n" + "Answer with the option's letter from the given choices directly."
         else:
-            prompt = line['question']
+            prompt = question + "\n" + "请直接回答选项字母。"
 
         return {'image': tgt_path, 'text': prompt}
 
diff --git a/vlmeval/vlm/mplug_owl2.py b/vlmeval/vlm/mplug_owl2.py
index 6847304b6..179036c09 100644
--- a/vlmeval/vlm/mplug_owl2.py
+++ b/vlmeval/vlm/mplug_owl2.py
@@ -1,10 +1,11 @@
 import os, torch
 from PIL import Image
 from ..smp import *
+from .utils import CustomPrompt
 from ..utils import DATASET_TYPE
 
 
-class mPLUG_Owl2:
+class mPLUG_Owl2(CustomPrompt):
 
     INSTALL_REQ = True
 
@@ -34,46 +35,35 @@ def __init__(self, model_path='MAGAer13/mplug-owl2-llama2-7b', **kwargs):
         self.kwargs = kwargs_default
         warnings.warn(f"Following kwargs received: {self.kwargs}, will use as generation config. ")
 
+    def use_custom_prompt(self, dataset):
+        assert dataset is not None
+        if DATASET_TYPE(dataset) == 'multi-choice':
+            return True
+        return False
+
     def build_prompt(self, line, dataset=None):
-        from ..utils import img_root_map
         assert dataset is None or isinstance(dataset, str)
-        img_root = osp.join('images', img_root_map[dataset])
+        assert self.use_custom_prompt(dataset)
+        tgt_path = self.dump_image(line, dataset)
         
-        os.makedirs(img_root, exist_ok=True)
         prompt_tmpl = "USER: <|image|>{}\n{}\n{}\nAnswer with the option’s letter from the given choices directly. ASSISTANT:"
-        
-        if isinstance(line['image'], list):
-            tgt_path = []
-            for img, im_name in zip(line['image'], line['image_path']):
-                path = osp.join(img_root, im_name)
-                if not read_ok(path):
-                    decode_base64_to_image_file(img, path)
-                tgt_path.append(path)
-        else:
-            tgt_path = osp.join(img_root, f"{line['index']}.jpg")
-            if not read_ok(tgt_path):
-                decode_base64_to_image_file(line['image'], tgt_path)
 
-        if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice':
-            question = line['question']
-            option_candidate = ['A', 'B', 'C', 'D', 'E']
-            options = {
-                cand: line[cand]
-                for cand in option_candidate
-                if cand in line and not pd.isna(line[cand])
-            }
-            options_prompt = ''
-            for key, item in options.items():
-                options_prompt += f'{key}. {item}\n'
-            
-            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else 'N/A'
-            prompt = prompt_tmpl.format(hint, question, options_prompt)
-        else:
-            prompt = line['question']
+        question = line['question']
+        options = {
+            cand: line[cand]
+            for cand in string.ascii_uppercase
+            if cand in line and not pd.isna(line[cand])
+        }
+        options_prompt = ''
+        for key, item in options.items():
+            options_prompt += f'{key}. {item}\n'
+        
+        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else 'N/A'
+        prompt = prompt_tmpl.format(hint, question, options_prompt)
 
         return {'image': tgt_path, 'text': prompt}
     
-    def vanilla_generate(self, image_path, prompt):
+    def generate_vanilla(self, image_path, prompt):
         from mplug_owl2.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
         from mplug_owl2.conversation import conv_templates
         from mplug_owl2.mm_utils import process_images, tokenizer_image_token, KeywordsStoppingCriteria
@@ -106,7 +96,7 @@ def vanilla_generate(self, image_path, prompt):
         outputs = self.tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
         return outputs.split('</s>')[0]
 
-    def mmbench_generate(self, image_path, prompt):
+    def generate_multichoice(self, image_path, prompt):
         from mplug_owl2.constants import IMAGE_TOKEN_INDEX
         from mplug_owl2.mm_utils import process_images, tokenizer_image_token
         image = Image.open(image_path).convert('RGB')
@@ -129,9 +119,9 @@ def mmbench_generate(self, image_path, prompt):
 
     def generate(self, image_path, prompt, dataset=None):
         if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice':
-            return self.mmbench_generate(image_path, prompt)
+            return self.generate_multichoice(image_path, prompt)
         else:
-            return self.vanilla_generate(image_path, prompt)
+            return self.generate_vanilla(image_path, prompt)
         
     def multi_generate(self, image_paths, prompt, dataset=None):
         from mplug_owl2.constants import IMAGE_TOKEN_INDEX
diff --git a/vlmeval/vlm/transcore_m.py b/vlmeval/vlm/transcore_m.py
index c01d11a5d..090b134a3 100644
--- a/vlmeval/vlm/transcore_m.py
+++ b/vlmeval/vlm/transcore_m.py
@@ -2,10 +2,12 @@
 import sys
 import torch
 from abc import abstractproperty
+import math
 from ..smp import *
+from .utils import CustomPrompt
 from ..utils import DATASET_TYPE
 
-class TransCoreM:
+class TransCoreM(CustomPrompt):
 
     INSTALL_REQ = True
 
@@ -44,7 +46,6 @@ def get_options(self,row, options):
             parsed_options.append(option_value)
         return parsed_options
 
-
     def is_none(self,value):
         if value is None:
             return True
@@ -55,48 +56,37 @@ def is_none(self,value):
         if type(value) is str and value.lower() == 'none':
             return True
         return False
+    
+    def use_custom_prompt(self, dataset):
+        assert dataset is not None
+        if DATASET_TYPE(dataset) == 'multi-choice':
+            return True
+        return False
 
     def build_prompt(self, line, dataset=None):
-
-        from ..utils import img_root_map
         assert dataset is None or isinstance(dataset, str)
-        img_root = osp.join('images', img_root_map[dataset])
-        os.makedirs(img_root, exist_ok=True)
-
-        if isinstance(line['image'], list):
-            tgt_path = []
-            for img, im_name in zip(line['image'], line['image_path']):
-                path = osp.join(img_root, im_name)
-                if not read_ok(path):
-                    decode_base64_to_image_file(img, path)
-                tgt_path.append(path)
-        else:
-            tgt_path = osp.join(img_root, f"{line['index']}.jpg")
-            if not read_ok(tgt_path):
-                decode_base64_to_image_file(line['image'], tgt_path)
-        
-        if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice':
-            question = line['question']
-            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
-            if hint is not None:
-                question = hint + '\n' + question
-
-            option_candidate = ['A', 'B', 'C', 'D']
-            options = {
-                cand: line[cand]
-                for cand in option_candidate
-                if cand in line and not pd.isna(line[cand])
-            }
-            for key, item in options.items():
-                question += f'\n{key}. {item}'
-            prompt = question
-
-            if not cn_string(prompt):
-                prompt = prompt + "\n" + "Answer with the option's letter from the given choices directly."
-            else:
-                prompt = prompt + "\n" + "请直接回答选项字母。"
+        assert self.use_custom_prompt(dataset)
+        tgt_path = self.dump_image(line, dataset)
+
+        question = line['question']
+        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
+        if hint is not None:
+            question = hint + '\n' + question
+
+        options = {
+            cand: line[cand]
+            for cand in string.ascii_uppercase
+            if cand in line and not pd.isna(line[cand])
+        }
+        for key, item in options.items():
+            question += f'\n{key}. {item}'
+        prompt = question
+
+        if not cn_string(prompt):
+            prompt = prompt + "\n" + "Answer with the option's letter from the given choices directly."
         else:
-            prompt = line['question']
+            prompt = prompt + "\n" + "请直接回答选项字母。"
+            
         return {'image': tgt_path, 'text': prompt}
 
     def generate(self, image_path, prompt, dataset=None):
@@ -108,7 +98,6 @@ def generate(self, image_path, prompt, dataset=None):
         top_p=None
         num_beams=1
 
-
         image = Image.open(image_path).convert('RGB')
         args = abstractproperty()
         args.image_aspect_ratio = 'pad'
diff --git a/vlmeval/vlm/utils/__init__.py b/vlmeval/vlm/utils/__init__.py
new file mode 100644
index 000000000..f2d69523f
--- /dev/null
+++ b/vlmeval/vlm/utils/__init__.py
@@ -0,0 +1 @@
+from .custom_prompt import CustomPrompt
\ No newline at end of file
diff --git a/vlmeval/vlm/utils/custom_prompt.py b/vlmeval/vlm/utils/custom_prompt.py
new file mode 100644
index 000000000..fd045c65f
--- /dev/null
+++ b/vlmeval/vlm/utils/custom_prompt.py
@@ -0,0 +1,31 @@
+from ...smp import *
+from ...utils import img_root_map
+from abc import abstractmethod
+
+class CustomPrompt:
+
+    @abstractmethod
+    def use_custom_prompt(self, dataset):
+        raise NotImplementedError
+    
+    @abstractmethod
+    def build_prompt(self, line, dataset):
+        raise NotImplementedError
+    
+    def dump_image(self, line, dataset):
+        assert isinstance(dataset, str)
+        img_root = osp.join('images', img_root_map[dataset])
+        os.makedirs(img_root, exist_ok=True)
+        if isinstance(line['image'], list):        
+            tgt_path = []
+            assert 'image_path' in line
+            for img, im_name in zip(line['image'], line['image_path']):
+                path = osp.join(img_root, im_name)
+                if not read_ok(path):
+                    decode_base64_to_image_file(img, path)
+                tgt_path.append(path)
+        else:
+            tgt_path = osp.join(img_root, f"{line['index']}.jpg")
+            if not read_ok(tgt_path):
+                decode_base64_to_image_file(line['image'], tgt_path)
+        return tgt_path
\ No newline at end of file
diff --git a/vlmeval/vlm/xcomposer.py b/vlmeval/vlm/xcomposer.py
index 014c6ea00..3a80c9675 100644
--- a/vlmeval/vlm/xcomposer.py
+++ b/vlmeval/vlm/xcomposer.py
@@ -4,6 +4,7 @@
 from ..smp import *
 
 from transformers import StoppingCriteria, StoppingCriteriaList
+from .utils import CustomPrompt
 from PIL import Image
 
 class StoppingCriteriaSub(StoppingCriteria):
@@ -20,7 +21,7 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
 
 from ..utils import DATASET_TYPE
 
-class XComposer:
+class XComposer(CustomPrompt):
 
     INSTALL_REQ = False
     
@@ -39,10 +40,10 @@ def __init__(self, model_path='internlm/internlm-xcomposer-vl-7b'):
         ]
         self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
 
-    def vanilla_generate(self, image_path, prompt):
+    def generate_vanilla(self, image_path, prompt):
         return self.model.generate(prompt, image_path)
     
-    def mmbench_generate(self, image_path, prompt):
+    def generate_multichoice(self, image_path, prompt):
         image = Image.open(image_path).convert("RGB")
         image = self.model.vis_processor(image).unsqueeze(0).to(self.device)
         img_embeds = self.model.encode_img(image)
@@ -83,12 +84,12 @@ def mmbench_generate(self, image_path, prompt):
     
     def generate(self, image_path, prompt, dataset=None):
         if dataset is None:
-            return self.vanilla_generate(image_path, prompt)
+            return self.generate_vanilla(image_path, prompt)
         assert isinstance(dataset, str)
         if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice':
-            return self.mmbench_generate(image_path, prompt)
+            return self.generate_multichoice(image_path, prompt)
         else:
-            return self.vanilla_generate(image_path, prompt)
+            return self.generate_vanilla(image_path, prompt)
     
     def multi_generate(self, image_paths, prompt, dataset=None):
         img_embeds, img_prompt = [], ''
@@ -137,43 +138,33 @@ def multi_generate(self, image_paths, prompt, dataset=None):
         output_text = output_text.split('<|Bot|>')[-1].strip()
         return output_text
     
+    def use_custom_prompt(self, dataset):
+        assert dataset is not None
+        if DATASET_TYPE(dataset) == 'multi-choice':
+            return True
+        return False
+    
     def build_prompt(self, line, dataset=None):
-        from ..utils import img_root_map
         assert dataset is None or isinstance(dataset, str)
-        img_root = osp.join('images', img_root_map[dataset])
-        os.makedirs(img_root, exist_ok=True)
+        assert self.use_custom_prompt(dataset)
+        tgt_path = self.dump_image(line, dataset)
 
-        if isinstance(line['image'], list):
-            tgt_path = []
-            for img, im_name in zip(line['image'], line['image_path']):
-                path = osp.join(img_root, im_name)
-                if not read_ok(path):
-                    decode_base64_to_image_file(img, path)
-                tgt_path.append(path)
-        else:
-            tgt_path = osp.join(img_root, f"{line['index']}.jpg")
-            if not read_ok(tgt_path):
-                decode_base64_to_image_file(line['image'], tgt_path)
+        question = line['question']
+        options = {
+            cand: line[cand]
+            for cand in string.ascii_uppercase
+            if cand in line and not pd.isna(line[cand])
+        }
+        options_prompt = ''
+        for key, item in options.items():
+            options_prompt += f'{key}. {item}\n'
+        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
 
-        if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice':
-            question = line['question']
-            option_candidate = ['A', 'B', 'C', 'D', 'E']
-            options = {
-                cand: line[cand]
-                for cand in option_candidate
-                if cand in line and not pd.isna(line[cand])
-            }
-            options_prompt = ''
-            for key, item in options.items():
-                options_prompt += f'{key}. {item}\n'
-            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
-
-            img_prompt = ' <|User|>:<ImageHere>'
-            txt_prompt = 'Please answer this question by choosing the correct choice.'
-            context = 'N/A' if hint is None else hint
-            mid_prompt = 'Context: ' + context + '\nQuestion: ' + question + '\nOptions: ' + options_prompt
-            ans_prompt = ' <|Bot|>: Answer: The answer is'
-            prompt = img_prompt + txt_prompt + mid_prompt + '<TOKENS_UNUSED_0>' + ans_prompt
-        else:
-            prompt = line['question']
+        img_prompt = ' <|User|>:<ImageHere>'
+        txt_prompt = 'Please answer this question by choosing the correct choice.'
+        context = 'N/A' if hint is None else hint
+        mid_prompt = 'Context: ' + context + '\nQuestion: ' + question + '\nOptions: ' + options_prompt
+        ans_prompt = ' <|Bot|>: Answer: The answer is'
+        prompt = img_prompt + txt_prompt + mid_prompt + '<TOKENS_UNUSED_0>' + ans_prompt
+        
         return {'image': tgt_path, 'text': prompt}
\ No newline at end of file

From e3939d67dd4588cded31494d1b061893d807dcac Mon Sep 17 00:00:00 2001
From: kennymckormick <dhd@pku.edu.cn>
Date: Sat, 23 Dec 2023 17:22:33 +0800
Subject: [PATCH 02/11] inference: build model prompt only when use_custom_prompt(dataset) is true

---
 vlmeval/inference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vlmeval/inference.py b/vlmeval/inference.py
index 947159440..331a57b16 100644
--- a/vlmeval/inference.py
+++ b/vlmeval/inference.py
@@ -94,7 +94,7 @@ def infer_data(model_name, dataset_name, out_file, verbose=False, api_nproc=4):
         if idx in res:
             continue
 
-        if hasattr(model, 'build_prompt'):
+        if hasattr(model, 'use_custom_prompt') and model.use_custom_prompt(dataset_name):
             struct = model.build_prompt(data.iloc[i], dataset=dataset_name)
         else:
             struct = dataset.build_prompt(data.iloc[i])

From 76186aa52dbcaec6953af78cecdcf8f006b218ab Mon Sep 17 00:00:00 2001
From: kennymckormick <dhd@pku.edu.cn>
Date: Sat, 23 Dec 2023 17:56:50 +0800
Subject: [PATCH 03/11] update MME performance

---
 results/MME.md            | 12 ++++++------
 vlmeval/vlm/mplug_owl2.py | 12 ++++++++++--
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/results/MME.md b/results/MME.md
index da0c96dcb..c685e60be 100644
--- a/results/MME.md
+++ b/results/MME.md
@@ -15,16 +15,16 @@ VLMs are sorted by the descending order of Total score.
 | qwen_chat            | 1849 / 1860 | 1457 / 1468 |       392 |
 | sharegpt4v_7b        | 1799 / 1808 |        1491 | 308 / 318 |
 | llava_v1.5_13b       | 1800 / 1805 | 1485 / 1490 |       315 |
-| llava_v1.5_7b        |        1776 |        1490 |       285 |
-| mPLUG-Owl2           | 1733 / 1735 | 1407 / 1409 |       326 |
+| mPLUG-Owl2           | 1781 / 1786 | 1435 / 1436 | 346 / 350 |
+| llava_v1.5_7b        |        1775 |        1490 |       285 |
 | TransCore_M          | 1682 / 1701 | 1427 / 1429 | 254 / 272 |
 | instructblip_13b     | 1624 / 1646 | 1381 / 1383 | 243 / 263 |
-| idefics_80b_instruct | 1508 / 1518 | 1276 / 1285 | 231 / 234 |
+| idefics_80b_instruct | 1507 / 1519 | 1276 / 1285 | 231 / 234 |
 | instructblip_7b      | 1313 / 1391 | 1084 / 1137 | 229 / 254 |
 | idefics_9b_instruct  |        1177 |         942 |       235 |
 | PandaGPT_13B         |        1072 |         826 |       246 |
 | MiniGPT-4-v1-13B     |  648 / 1067 |   533 / 794 | 115 / 273 |
-| MiniGPT-4-v1-7B      |  806 / 1047 |   622 / 771 | 184 / 277 |
+| MiniGPT-4-v1-7B      |  806 / 1048 |   622 / 771 | 184 / 277 |
 | llava_v1_7b          | 1027 / 1044 |   793 / 807 | 234 / 238 |
 | MiniGPT-4-v2         |         968 |         708 |       260 |
 | VisualGLM_6b         |         738 |         628 |       110 |
@@ -38,6 +38,6 @@ For most VLMs, using ChatGPT as the answer extractor or not may not significantl
 | MME Score Improvement with ChatGPT Answer Extractor | Models                                                       |
 | --------------------------------------------------- | ------------------------------------------------------------ |
 | **No (0)**                                          | XComposer, llava_v1.5_7b, idefics_9b_instruct, PandaGPT_13B, MiniGPT-4-v2, VisualGLM_6b, flamingov2 |
-| **Minor (1~20)**                                    | qwen_chat (11), llava_v1.5_13b (5), mPLUG-Owl2 (2), idefics_80b_instruct (10), llava_v1_7b (17), sharegpt4v_7b (9), TransCore_M (19) |
+| **Minor (1~20)**                                    | qwen_chat (11), llava_v1.5_13b (5), mPLUG-Owl2 (5), idefics_80b_instruct (12), llava_v1_7b (17), sharegpt4v_7b (9), TransCore_M (19) |
 | **Moderate (21~100)**                               | instructblip_13b (22), instructblip_7b (78)                  |
-| **Huge (> 100)**                                    | MiniGPT-4-v1-7B (241), MiniGPT-4-v1-13B (419), qwen_base (477) |
\ No newline at end of file
+| **Huge (> 100)**                                    | MiniGPT-4-v1-7B (242), MiniGPT-4-v1-13B (419), qwen_base (477) |
\ No newline at end of file
diff --git a/vlmeval/vlm/mplug_owl2.py b/vlmeval/vlm/mplug_owl2.py
index 179036c09..64c235994 100644
--- a/vlmeval/vlm/mplug_owl2.py
+++ b/vlmeval/vlm/mplug_owl2.py
@@ -118,10 +118,18 @@ def generate_multichoice(self, image_path, prompt):
         return answer.split('</s>')[0]
 
     def generate(self, image_path, prompt, dataset=None):
+        if dataset in ['MMVet']:
+            num_beams_old = self.kwargs.pop('num_beams')
+            self.kwargs['num_beams'] = 5
+            
+        ret = None
         if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice':
-            return self.generate_multichoice(image_path, prompt)
+            ret = self.generate_multichoice(image_path, prompt)
         else:
-            return self.generate_vanilla(image_path, prompt)
+            ret = self.generate_vanilla(image_path, prompt)
+        
+        if dataset in ['MMVet']:
+            self.kwargs['num_beams'] = num_beams_old
         
     def multi_generate(self, image_paths, prompt, dataset=None):
         from mplug_owl2.constants import IMAGE_TOKEN_INDEX

From 3e71f55cfb01db76aa752232db127ad693ea50a7 Mon Sep 17 00:00:00 2001
From: kennymckormick <dhd@pku.edu.cn>
Date: Sat, 23 Dec 2023 18:00:01 +0800
Subject: [PATCH 04/11] mplug_owl2: return the result from generate

---
 vlmeval/vlm/mplug_owl2.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vlmeval/vlm/mplug_owl2.py b/vlmeval/vlm/mplug_owl2.py
index 64c235994..91be358f7 100644
--- a/vlmeval/vlm/mplug_owl2.py
+++ b/vlmeval/vlm/mplug_owl2.py
@@ -130,6 +130,7 @@ def generate(self, image_path, prompt, dataset=None):
         
         if dataset in ['MMVet']:
             self.kwargs['num_beams'] = num_beams_old
+        return ret
         
     def multi_generate(self, image_paths, prompt, dataset=None):
         from mplug_owl2.constants import IMAGE_TOKEN_INDEX

From 6d22eab17b6467698483c8a5b0bed537fab625d8 Mon Sep 17 00:00:00 2001
From: kennymckormick <dhd@pku.edu.cn>
Date: Sat, 23 Dec 2023 18:09:51 +0800
Subject: [PATCH 05/11] mplug_owl2: add generate_mmvet instead of mutating self.kwargs

---
 vlmeval/vlm/mplug_owl2.py | 39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/vlmeval/vlm/mplug_owl2.py b/vlmeval/vlm/mplug_owl2.py
index 91be358f7..a6772ae07 100644
--- a/vlmeval/vlm/mplug_owl2.py
+++ b/vlmeval/vlm/mplug_owl2.py
@@ -116,21 +116,38 @@ def generate_multichoice(self, image_path, prompt):
                 **self.kwargs)
         answer = self.tokenizer.decode(output_ids[0, input_ids.shape[1]: ]).strip()
         return answer.split('</s>')[0]
+    
+    def generate_mmvet(self, image_path, prompt):
+        from mplug_owl2.constants import IMAGE_TOKEN_INDEX
+        from mplug_owl2.mm_utils import process_images, tokenizer_image_token
+        image = Image.open(image_path).convert('RGB')
+        max_edge = max(image.size) # We recommend resizing to a square image for BEST performance.
+        image = image.resize((max_edge, max_edge))
+
+        image_tensor = process_images([image], self.image_processor)
+        image_tensor = image_tensor.to(self.device, dtype=torch.float16)
+
+        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
+        kwargs = cp.deepcopy(self.kwargs)
+        kwargs['num_beams'] = 5
+
+        with torch.inference_mode():
+            output_ids = self.model.generate(
+                input_ids=input_ids, 
+                images=image_tensor, 
+                output_hidden_states=True, 
+                use_cache=True, 
+                **kwargs)
+        answer = self.tokenizer.decode(output_ids[0, input_ids.shape[1]: ]).strip()
+        return answer.split('</s>')[0]
 
     def generate(self, image_path, prompt, dataset=None):
-        if dataset in ['MMVet']:
-            num_beams_old = self.kwargs.pop('num_beams')
-            self.kwargs['num_beams'] = 5
-            
-        ret = None
         if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice':
-            ret = self.generate_multichoice(image_path, prompt)
+            return self.generate_multichoice(image_path, prompt)
+        elif dataset == 'MMVet':
+            return self.generate_mmvet(image_path, prompt)
         else:
-            ret = self.generate_vanilla(image_path, prompt)
-        
-        if dataset in ['MMVet']:
-            self.kwargs['num_beams'] = num_beams_old
-        return ret
+            return self.generate_vanilla(image_path, prompt)
         
     def multi_generate(self, image_paths, prompt, dataset=None):
         from mplug_owl2.constants import IMAGE_TOKEN_INDEX

From c2764cc7e9ce0a22041005a4b3ca8f730c507982 Mon Sep 17 00:00:00 2001
From: kennymckormick <dhd@pku.edu.cn>
Date: Sat, 23 Dec 2023 18:12:00 +0800
Subject: [PATCH 06/11] mplug_owl2: set max_new_tokens=64 for MMVet generation

---
 vlmeval/vlm/mplug_owl2.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vlmeval/vlm/mplug_owl2.py b/vlmeval/vlm/mplug_owl2.py
index a6772ae07..6ef6a55c5 100644
--- a/vlmeval/vlm/mplug_owl2.py
+++ b/vlmeval/vlm/mplug_owl2.py
@@ -130,6 +130,7 @@ def generate_mmvet(self, image_path, prompt):
         input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
         kwargs = cp.deepcopy(self.kwargs)
         kwargs['num_beams'] = 5
+        kwargs['max_new_tokens'] = 64
 
         with torch.inference_mode():
             output_ids = self.model.generate(

From f20dbf76d37fb1c6bdaeaafc71b7b98cd6beaf70 Mon Sep 17 00:00:00 2001
From: kennymckormick <dhd@pku.edu.cn>
Date: Sat, 23 Dec 2023 18:21:30 +0800
Subject: [PATCH 07/11] mplug_owl2: set length_penalty=0 for MMVet generation

---
 vlmeval/vlm/mplug_owl2.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vlmeval/vlm/mplug_owl2.py b/vlmeval/vlm/mplug_owl2.py
index 6ef6a55c5..391e82259 100644
--- a/vlmeval/vlm/mplug_owl2.py
+++ b/vlmeval/vlm/mplug_owl2.py
@@ -131,6 +131,7 @@ def generate_mmvet(self, image_path, prompt):
         kwargs = cp.deepcopy(self.kwargs)
         kwargs['num_beams'] = 5
         kwargs['max_new_tokens'] = 64
+        kwargs['length_penalty'] = 0
 
         with torch.inference_mode():
             output_ids = self.model.generate(

From 2b7e0452bc6a5b4452683d5da26a8e91b43394d0 Mon Sep 17 00:00:00 2001
From: kennymckormick <dhd@pku.edu.cn>
Date: Sat, 23 Dec 2023 20:31:44 +0800
Subject: [PATCH 08/11] mplug_owl2: add MMVet custom prompt and drop num_beams override

---
 vlmeval/vlm/mplug_owl2.py | 37 ++++++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/vlmeval/vlm/mplug_owl2.py b/vlmeval/vlm/mplug_owl2.py
index 391e82259..b483e6261 100644
--- a/vlmeval/vlm/mplug_owl2.py
+++ b/vlmeval/vlm/mplug_owl2.py
@@ -37,7 +37,7 @@ def __init__(self, model_path='MAGAer13/mplug-owl2-llama2-7b', **kwargs):
 
     def use_custom_prompt(self, dataset):
         assert dataset is not None
-        if DATASET_TYPE(dataset) == 'multi-choice':
+        if DATASET_TYPE(dataset) == 'multi-choice' or dataset == 'MMVet':
             return True
         return False
 
@@ -45,21 +45,25 @@ def build_prompt(self, line, dataset=None):
         assert dataset is None or isinstance(dataset, str)
         assert self.use_custom_prompt(dataset)
         tgt_path = self.dump_image(line, dataset)
-        
-        prompt_tmpl = "USER: <|image|>{}\n{}\n{}\nAnswer with the option’s letter from the given choices directly. ASSISTANT:"
-
-        question = line['question']
-        options = {
-            cand: line[cand]
-            for cand in string.ascii_uppercase
-            if cand in line and not pd.isna(line[cand])
-        }
-        options_prompt = ''
-        for key, item in options.items():
-            options_prompt += f'{key}. {item}\n'
-        
-        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else 'N/A'
-        prompt = prompt_tmpl.format(hint, question, options_prompt)
+
+        if dataset == 'MMVet':
+            prompt_tmpl = "USER: <|image|>{}\nAnswer the question directly. ASSISTANT:"
+            prompt = prompt_tmpl.format(line['question'])
+        elif DATASET_TYPE(dataset) == 'multi-choice':
+            prompt_tmpl = "USER: <|image|>{}\n{}\n{}\nAnswer with the option’s letter from the given choices directly. ASSISTANT:"
+            options = {
+                cand: line[cand]
+                for cand in string.ascii_uppercase
+                if cand in line and not pd.isna(line[cand])
+            }
+            options_prompt = ''
+            for key, item in options.items():
+                options_prompt += f'{key}. {item}\n'
+            
+            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else 'N/A'
+            prompt = prompt_tmpl.format(hint, line['question'], options_prompt)
+        else:
+            raise NotImplementedError
 
         return {'image': tgt_path, 'text': prompt}
     
@@ -129,7 +133,6 @@ def generate_mmvet(self, image_path, prompt):
 
         input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
         kwargs = cp.deepcopy(self.kwargs)
-        kwargs['num_beams'] = 5
         kwargs['max_new_tokens'] = 64
         kwargs['length_penalty'] = 0
 

From 77d6e3adb1cc3fae90ee765eae484a335ce719ab Mon Sep 17 00:00:00 2001
From: kennymckormick <dhd@pku.edu.cn>
Date: Sat, 23 Dec 2023 20:33:56 +0800
Subject: [PATCH 09/11] inference: cast prediction to str before FAIL_MSG check

---
 vlmeval/inference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vlmeval/inference.py b/vlmeval/inference.py
index 331a57b16..a3c2b2d18 100644
--- a/vlmeval/inference.py
+++ b/vlmeval/inference.py
@@ -183,7 +183,7 @@ def infer_data_job(model, model_name, dataset_name, verbose=False, api_nproc=4):
         data = load(result_file)
         failed_set = []
         for idx, pred in zip(data['index'], data['prediction']):
-            if FAIL_MSG in pred:
+            if FAIL_MSG in str(pred):
                 failed_set.append(idx)
         if len(failed_set):
             print(f'{len(failed_set)} records failed in the original result file {result_file}. ')

From 9917e19830b6ece21bca604e70f61623e57fa759 Mon Sep 17 00:00:00 2001
From: kennymckormick <dhd@pku.edu.cn>
Date: Sat, 23 Dec 2023 20:55:32 +0800
Subject: [PATCH 10/11] mmvet_eval: cache GPT scores in a tmp pkl and resume from it

---
 vlmeval/eval/mmvet_eval.py | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/vlmeval/eval/mmvet_eval.py b/vlmeval/eval/mmvet_eval.py
index 254946e25..d754087f8 100644
--- a/vlmeval/eval/mmvet_eval.py
+++ b/vlmeval/eval/mmvet_eval.py
@@ -88,7 +88,9 @@ def MMVet_acc(result_file):
 def MMVet_eval(eval_file, model='gpt-4-turbo', nproc=4, verbose=False):
     logger = get_logger('Evaluation')
 
-    storage = eval_file.replace('.xlsx', f'_{model}.xlsx')
+    suffix = eval_file.split('.')[-1]
+    storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
+    tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
     if osp.exists(storage):
         logger.warning(f"GPT scoring file {storage} already exists, will reuse it in MMVet_eval. ")
     else:
@@ -114,12 +116,26 @@ def MMVet_eval(eval_file, model='gpt-4-turbo', nproc=4, verbose=False):
         tups = [(model, line) for line in lines]
         indices = [line['index'] for line in lines]
 
-        res = track_progress_rich(MMVet_auxeval, tups, nproc=nproc, chunksize=nproc)
-
+        ans = {}
+        if osp.exists(tmp_file):
+            ans = load(tmp_file)
+        tups = [x for x, i in zip(tups, indices) if i not in ans]
+        indices = [i for i in indices if i not in ans]
+        
+        if len(indices):
+            new_results = track_progress_rich(
+                MMVet_auxeval, tups, nproc=nproc, chunksize=nproc,
+                keys=indices, save=tmp_file)
+            ans = load(tmp_file)
+            for k, v in zip(indices, new_results):
+                assert k in ans 
+                assert ans[k]['log'] == v['log'] and ans[k]['score'] == v['score']
+        
         log_map, score_map = {}, {}
-        for k, v in zip(indices, res):
-            log_map[k] = v['log']
-            score_map[k] = v['score']
+        all_inds = [line['index'] for line in lines]
+        for k in all_inds:
+            log_map[k] = ans[k]['log']
+            score_map[k] = ans[k]['score']
         data['score'] = [score_map[idx] for idx in data['index']]
         data['log'] = [log_map[idx] for idx in data['index']]
         dump(data, storage)

From 724eefb40f04fe77f461bff4c2f8edc866b6e1cf Mon Sep 17 00:00:00 2001
From: kennymckormick <dhd@pku.edu.cn>
Date: Sat, 23 Dec 2023 21:03:01 +0800
Subject: [PATCH 11/11] update MMVet acc

---
 results/MMVet.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/results/MMVet.md b/results/MMVet.md
index 2be8f74d0..5ec182025 100644
--- a/results/MMVet.md
+++ b/results/MMVet.md
@@ -10,6 +10,7 @@
 | qwen_chat                 |    47.3 |                                                          N/A | 37.2 | 22.3 | 42.8 | 52.5 | 45.4 | 40.3 |
 | idefics_80b_instruct      |    39.7 |                                                          N/A | 29.9 |   15 | 30.7 | 45.6 | 38.6 | 37.1 |
 | llava_v1.5_13b            |    38.3 |                                                     36.3±0.2 | 28.8 | 11.5 | 31.5 |   42 | 23.1 |   23 |
+| mPLUG-Owl2                |    35.7 |                                                     36.3±0.1 | 29.5 |  7.7 | 32.1 | 47.3 | 23.8 | 20.9 |
 | XComposer                 |    35.2 |                                                          N/A | 21.8 |  3.8 | 24.7 | 43.1 | 28.9 | 27.5 |
 | sharegpt4v_7b             |    34.7 |                                                         37.6 | 30.2 | 18.5 |   30 | 36.1 | 20.2 | 18.1 |
 | TransCore_M               |    33.9 |                                                          N/A | 27.3 | 15.4 | 32.7 | 36.7 |   23 | 23.5 |
@@ -18,7 +19,6 @@
 | instructblip_13b          |    30.1 |                                                     25.6±0.3 | 25.4 | 11.2 | 26.9 | 33.4 |   19 | 18.2 |
 | idefics_9b_instruct       |      30 |                                                          N/A | 21.7 | 11.5 | 22.4 | 34.6 | 27.4 | 26.9 |
 | llava_v1_7b (vicuna-v1.1) |    27.4 |                                                     23.8±0.6 |   19 | 11.5 | 25.6 | 31.4 | 18.1 | 16.2 |
-| mPLUG-Owl2                |    24.1 |                                                     36.3±0.1 | 16.1 |  7.3 | 16.5 | 27.7 |    9 |  6.9 |
 | flamingov2                |    23.3 |                                                     24.8±0.2 | 19.5 |  7.7 | 21.7 | 24.7 | 21.7 |   19 |
 | PandaGPT_13B              |    19.6 |                                                          N/A |  6.8 |  6.5 | 16.5 | 26.3 | 13.7 | 13.9 |
 | MiniGPT-4-v1-13B          |    16.9 |                                                     24.4±0.4 | 10.3 |  7.7 | 12.5 | 19.9 | 14.9 | 13.8 |