diff --git a/README.md b/README.md
index ea7b5302..c82ee6f8 100644
--- a/README.md
+++ b/README.md
@@ -197,6 +197,19 @@ We also provide the raw data exported from Weights & Biases for the detailed res
 - MMMU (mmmu)
   - MMMU Validation (mmmu_val)
   - MMMU Test (mmmu_test)
+- MMUPD (mmupd)
+  - MMUPD Base (mmupd_base)
+  - MMAAD Base (mmaad_base)
+  - MMIASD Base (mmiasd_base)
+  - MMIVQD Base (mmivqd_base)
+  - MMUPD Option (mmupd_option)
+  - MMAAD Option (mmaad_option)
+  - MMIASD Option (mmiasd_option)
+  - MMIVQD Option (mmivqd_option)
+  - MMUPD Instruction (mmupd_instruction)
+  - MMAAD Instruction (mmaad_instruction)
+  - MMIASD Instruction (mmiasd_instruction)
+  - MMIVQD Instruction (mmivqd_instruction)
 - MMVet (mmvet)
 - Multi-DocVQA (multidocvqa)
   - Multi-DocVQA Validation (multidocvqa_val)
diff --git a/lmms_eval/tasks/mmupd/_default_template_mmupd_yaml b/lmms_eval/tasks/mmupd/_default_template_mmupd_yaml
new file mode 100644
index 00000000..7aa8d812
--- /dev/null
+++ b/lmms_eval/tasks/mmupd/_default_template_mmupd_yaml
@@ -0,0 +1,17 @@
+dataset_path: MM-UPD/MM-UPD
+doc_to_target: "answer"
+doc_to_visual: !function utils.mmupd_doc_to_visual
+doc_to_text: !function utils.mmupd_doc_to_text
+process_results: !function utils.mmupd_process_results
+model_specific_generation_kwargs:
+  llava:
+    image_aspect_ratio: original
+output_type: generate_until
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
diff --git a/lmms_eval/tasks/mmupd/mmaad_base.yaml b/lmms_eval/tasks/mmupd/mmaad_base.yaml
new file mode 100644
index 00000000..9a66b3e8
--- /dev/null
+++ b/lmms_eval/tasks/mmupd/mmaad_base.yaml
@@ -0,0 +1,12 @@
+task: "mmaad_base"
+test_split: test
+dataset_name: mmaad_base
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\n"
+include: _default_template_mmupd_yaml
+metric_list:
+  - metric: gpt_eval_score
+    aggregation: !function utils.mmaad_base
+    higher_is_better: true
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmupd/mmaad_instruction.yaml b/lmms_eval/tasks/mmupd/mmaad_instruction.yaml
new file mode 100644
index 00000000..30a3bac1
--- /dev/null
+++ b/lmms_eval/tasks/mmupd/mmaad_instruction.yaml
@@ -0,0 +1,12 @@
+task: "mmaad_instruction"
+test_split: test
+dataset_name: mmaad_base
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nIf all the options are incorrect, answer \"F. None of the above\"."
+include: _default_template_mmupd_yaml
+metric_list:
+  - metric: gpt_eval_score
+    aggregation: !function utils.mmaad_instruction
+    higher_is_better: true
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmupd/mmaad_option.yaml b/lmms_eval/tasks/mmupd/mmaad_option.yaml
new file mode 100644
index 00000000..f110b822
--- /dev/null
+++ b/lmms_eval/tasks/mmupd/mmaad_option.yaml
@@ -0,0 +1,12 @@
+task: "mmaad_option"
+test_split: test
+dataset_name: mmaad_option
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer with the option's letter from the given choices directly."
+include: _default_template_mmupd_yaml
+metric_list:
+  - metric: gpt_eval_score
+    aggregation: !function utils.mmaad_option
+    higher_is_better: true
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmupd/mmiasd_base.yaml b/lmms_eval/tasks/mmupd/mmiasd_base.yaml
new file mode 100644
index 00000000..6a9159fd
--- /dev/null
+++ b/lmms_eval/tasks/mmupd/mmiasd_base.yaml
@@ -0,0 +1,12 @@
+task: "mmiasd_base"
+test_split: test
+dataset_name: mmiasd_base
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\n"
+include: _default_template_mmupd_yaml
+metric_list:
+  - metric: gpt_eval_score
+    aggregation: !function utils.mmiasd_base
+    higher_is_better: true
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmupd/mmiasd_instruction.yaml b/lmms_eval/tasks/mmupd/mmiasd_instruction.yaml
new file mode 100644
index 00000000..d0ac3a00
--- /dev/null
+++ b/lmms_eval/tasks/mmupd/mmiasd_instruction.yaml
@@ -0,0 +1,12 @@
+task: "mmiasd_instruction"
+test_split: test
+dataset_name: mmiasd_base
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nIf all the options are incorrect, answer \"F. None of the above\"."
+include: _default_template_mmupd_yaml
+metric_list:
+  - metric: gpt_eval_score
+    aggregation: !function utils.mmiasd_instruction
+    higher_is_better: true
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmupd/mmiasd_option.yaml b/lmms_eval/tasks/mmupd/mmiasd_option.yaml
new file mode 100644
index 00000000..a1a1aead
--- /dev/null
+++ b/lmms_eval/tasks/mmupd/mmiasd_option.yaml
@@ -0,0 +1,12 @@
+task: "mmiasd_option"
+test_split: test
+dataset_name: mmiasd_option
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer with the option's letter from the given choices directly."
+include: _default_template_mmupd_yaml
+metric_list:
+  - metric: gpt_eval_score
+    aggregation: !function utils.mmiasd_option
+    higher_is_better: true
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmupd/mmivqd_base.yaml b/lmms_eval/tasks/mmupd/mmivqd_base.yaml
new file mode 100644
index 00000000..82853fde
--- /dev/null
+++ b/lmms_eval/tasks/mmupd/mmivqd_base.yaml
@@ -0,0 +1,12 @@
+task: "mmivqd_base"
+test_split: test
+dataset_name: mmivqd_base
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\n"
+include: _default_template_mmupd_yaml
+metric_list:
+  - metric: gpt_eval_score
+    aggregation: !function utils.mmivqd_base
+    higher_is_better: true
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmupd/mmivqd_instruction.yaml b/lmms_eval/tasks/mmupd/mmivqd_instruction.yaml
new file mode 100644
index 00000000..a024022a
--- /dev/null
+++ b/lmms_eval/tasks/mmupd/mmivqd_instruction.yaml
@@ -0,0 +1,12 @@
+task: "mmivqd_instruction"
+test_split: test
+dataset_name: mmivqd_base
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nIf the given image is irrelevant to the question, answer \"F. The image and question are irrelevant.\"."
+include: _default_template_mmupd_yaml
+metric_list:
+  - metric: gpt_eval_score
+    aggregation: !function utils.mmivqd_instruction
+    higher_is_better: true
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmupd/mmivqd_option.yaml b/lmms_eval/tasks/mmupd/mmivqd_option.yaml
new file mode 100644
index 00000000..1363c345
--- /dev/null
+++ b/lmms_eval/tasks/mmupd/mmivqd_option.yaml
@@ -0,0 +1,12 @@
+task: "mmivqd_option"
+test_split: test
+dataset_name: mmivqd_option
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer with the option's letter from the given choices directly."
+include: _default_template_mmupd_yaml
+metric_list:
+  - metric: gpt_eval_score
+    aggregation: !function utils.mmivqd_option
+    higher_is_better: true
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmupd/mmupd.yaml b/lmms_eval/tasks/mmupd/mmupd.yaml
new file mode 100644
index 00000000..f40416a1
--- /dev/null
+++ b/lmms_eval/tasks/mmupd/mmupd.yaml
@@ -0,0 +1,15 @@
+group: mmupd
+task:
+  - mmaad_base
+  - mmaad_option
+  - mmaad_instruction
+  - mmiasd_base
+  - mmiasd_option
+  - mmiasd_instruction
+  - mmivqd_base
+  - mmivqd_option
+  - mmivqd_instruction
+metadata:
+  version: 0.0
+  sys_prompt: ""
+  gpt_eval_model_name: "gpt-3.5-turbo-0613"
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmupd/mmupd_base.yaml b/lmms_eval/tasks/mmupd/mmupd_base.yaml
new file mode 100644
index 00000000..988ad2e1
--- /dev/null
+++ b/lmms_eval/tasks/mmupd/mmupd_base.yaml
@@ -0,0 +1,10 @@
+group: mmupd_base
+task:
+  - mmaad_base
+  - mmiasd_base
+  - mmivqd_base
+metadata:
+  version: 0.0
+  sys_prompt: ""
+  gpt_eval_model_name: "gpt-3.5-turbo-0613"
+
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmupd/mmupd_evals.py b/lmms_eval/tasks/mmupd/mmupd_evals.py
new file mode 100644
index 00000000..4878763d
--- /dev/null
+++ b/lmms_eval/tasks/mmupd/mmupd_evals.py
@@ -0,0 +1,630 @@
+import time
+import random as rd
+import string
+from collections import defaultdict
+import requests
+import math
+import numpy as np
+import pandas as pd
+import pickle
+import logging
+import json
+
+eval_logger = logging.getLogger("lmms-eval")
+
+
+def dump(data, f):
+
+    def dump_pkl(data, pth):
+        pickle.dump(data, open(pth, 'wb'))
+
+    def dump_json(data, pth):
+        json.dump(data, open(pth, 'w'))
+
+    def dump_jsonl(data, f):
+        lines = [json.dumps(x, ensure_ascii=False) for x in data]
+        with open(f, 'w', encoding='utf8') as fout:
+            fout.write('\n'.join(lines))
+
+    def dump_xlsx(data, f):
+        data.to_excel(f, index=False)
+
+    def dump_csv(data, f):
+        data.to_csv(f, index=False)
+
+    def dump_tsv(data, f):
+        data.to_csv(f, sep='\t', index=False)
+
+    handlers = dict(pkl=dump_pkl,
+                    json=dump_json,
+                    jsonl=dump_jsonl,
+                    xlsx=dump_xlsx,
+                    csv=dump_csv,
+                    tsv=dump_tsv)
+    suffix = f.split('.')[-1]
+    return handlers[suffix](data, f)
+
+
+def load(f):
+    """
+    Loads data from various file formats.
+
+    Parameters:
+    - f: Path to the file to be loaded.
+
+    Returns:
+    - Loaded data.
+    """
+ """ + def load_pkl(pth): + return pickle.load(open(pth, 'rb')) + + def load_json(pth): + return json.load(open(pth, 'r', encoding='utf-8')) + + def load_jsonl(f): + lines = open(f, encoding='utf-8').readlines() + lines = [x.strip() for x in lines] + if lines[-1] == '': + lines = lines[:-1] + data = [json.loads(x) for x in lines] + return data + + def load_xlsx(f): + df = pd.read_excel(f) + df = df.dropna(subset=['prediction']) + + return df + + def load_csv(f): + return pd.read_csv(f) + + def load_tsv(f): + return pd.read_csv(f, sep='\t') + + handlers = dict(pkl=load_pkl, + json=load_json, + jsonl=load_jsonl, + xlsx=load_xlsx, + csv=load_csv, + tsv=load_tsv) + suffix = f.split('.')[-1] + return handlers[suffix](f) + + +class MMUPD_Evaluator: + def __init__(self, sys_prompt="There are several options:", API_KEY="", API_URL="", model_version="gpt-3.5-turbo-0613"): + self.sys_prompt = sys_prompt + self.model_version = model_version + self.API_KEY = API_KEY + self.API_URL = API_URL + + def create_options_prompt(self, row_data, option_candidate): + available_keys = set(row_data.keys()) & set(option_candidate) + options = {cand: row_data[cand] for cand in available_keys if row_data[cand]} + sorted_options = dict(sorted(options.items())) + options_prompt = f"{self.sys_prompt}\n" + for key, item in sorted_options.items(): + if pd.notna(item) and item != "nan": + options_prompt += f"{key}. {item}\n" + return options_prompt.rstrip("\n"), sorted_options + + def is_none(self, value): + if value is None: + return True + if pd.isna(value): + return True + if type(value) is float and math.isnan(value): + return True + if type(value) is str and value.lower() == 'nan': + return True + if type(value) is str and value.lower() == 'none': + return True + return False + + # Prompt Building + def build_option_str(self, option_list): + chars = string.ascii_uppercase + s = "There are several options: \n" + for c, opt in zip(chars, option_list): + if not pd.isna(opt): + s += f"{c}. {opt}\n" + else: + return s + return s + + def extract_options(self, item): + options = [] + for c in 'ABCDE': + try: + if self.is_none(item[c]) is False: + options.append(item[c]) + except KeyError: + continue + return options + + def build_choices(self, item): + ret = {} + for ch in 'ABCDE': + try: + if self.is_none(item[ch]) is False: + ret[ch] = item[ch] + except KeyError: + continue + return ret + + def build_option_str_w_gt(self, option_list, gt_text, eval_type, question_type, upd_type): + chars = string.ascii_uppercase + s = 'There are several options: \n' + valid_option = [] + answer_option = [] + + for c, opt in zip(chars, option_list): + if self.is_none(opt) is False: + s += f'{c}. {opt}\n' + valid_option.append(c) + if upd_type == "aad": + if eval_type == "aad": + gt_option = "" + for option in ['C', 'D', 'E', 'F']: + if option not in valid_option: + gt_option = option + break + none_option_mapping = {'C': 'D', 'D': 'E', 'E': 'F', 'F': 'G'} + s += f'{gt_option}. {gt_text}\n' + none_option = none_option_mapping[gt_option] + s += f'{none_option}. The correct answer is No answer, None of the above, \ + all provided options are incorrect, or I cannot answer.\n' + valid_option.append(gt_option) + valid_option.append(none_option) + answer_option = [gt_option, none_option] + if question_type == 'inst': + s += f'F. 
None of the above\n' + valid_option.append("F") + answer_option = [gt_option, none_option, "F"] + + if eval_type == "standard": + none_option = "" + for option in ['C', 'D', 'E', 'F']: + if option not in valid_option: + none_option = option + break + s += f'{none_option}. The correct answer is No answer, None of the above, \ + all provided options are incorrect, or I cannot answer.\n' + valid_option.append(none_option) + if question_type == 'inst': + s += 'F. None of the above\n' + valid_option.append("F") + elif upd_type == "iasd": + if eval_type == "iasd": + gt_option = "" + for option in ['C', 'D', 'E', 'F']: + if option not in valid_option: + gt_option = option + break + + s += f'{gt_option}. {gt_text}\n' + valid_option.append(gt_option) + + if question_type == 'inst': + if gt_option == 'E': + s += f'F. None of the above\n' + valid_option.append('F') + s += 'G. The correct answer is No answer, None of the above, all provided options are irrelevant or incorrect, or I cannot answer.\n' + valid_option.append('G') + answer_option = [gt_option, 'F', 'G'] + else: + none_option_mapping = {'C': 'D', 'D': 'E'} + none_option = none_option_mapping[gt_option] + s += f'{none_option}. The correct answer is No answer, None of the above, all provided options are irrelevant or incorrect, or I cannot answer.\n' + valid_option.append(none_option) + s += f'F. None of the above\n' + valid_option.append('F') + answer_option = [gt_option, none_option, 'F'] + else: + none_option_mapping = {'C': 'D', 'D': 'E', 'E': 'F', 'F': 'G'} + none_option = none_option_mapping[gt_option] + s += f'{none_option}. The correct answer is No answer, None of the above, all provided options are irrelevant or incorrect, or I cannot answer.\n' + valid_option.append(none_option) + answer_option = [gt_option, none_option] + + if eval_type == "standard": + none_option = "" + for option in ['C', 'D', 'E', 'F']: + if option not in valid_option: + none_option = option + break + s += f'{none_option}. The correct answer is No answer, None of the above, \ + all provided options are irrelevant or incorrect, or I cannot answer.\n' + valid_option.append(none_option) + if question_type == 'inst': + s += f'F. None of the above\n' + valid_option.append("F") + elif upd_type == "ivqd": + if eval_type == "ivqd": + none_option = "" + for option in ['C', 'D', 'E', 'F']: + if option not in valid_option: + none_option = option + break + s += f'{none_option}. The correct answer is that The image is incompatible with the question, or I cannot answer.\n' + valid_option.append(none_option) + answer_option = [none_option] + if question_type == 'inst': + s += f'F. The image and question are irrelevant.\n' + valid_option.append("F") + answer_option = [none_option, "F"] + + if eval_type == "standard": + none_option = "" + for option in ['C', 'D', 'E', 'F']: + if option not in valid_option: + none_option = option + break + s += f'{none_option}. The correct answer is that The image is incompatible with the question, or I cannot answer.\n' + valid_option.append(none_option) + if question_type == 'inst': + s += f'F. The image and question are irrelevant.\n' + valid_option.append("F") + return s, valid_option, answer_option + + def build_prompt(self, question, options, prediction, upd_type): + if upd_type == "aad": + tmpl = ( + 'You are an AI assistant who will help me to match an answer ' + 'with several options of a single-choice question. 
+    def build_prompt(self, question, options, prediction, upd_type):
+        if upd_type == "aad":
+            tmpl = (
+                'You are an AI assistant who will help me to match an answer '
+                'with several options of a single-choice question. '
+                'You are provided with a question, several options, and an answer, '
+                'and you need to find which option is most similar to the answer. '
+                'If the meanings of all options are significantly different '
+                'from the answer, output H. '
+                'You should output a single uppercase character in A, B, C, D, E, F, G '
+                '(if they are valid options), and H. \n'
+                'Example 1: \n'
+                'Question: What is the main object in image?\nOptions: A. teddy bear '
+                'B. rabbit C. cat D. dog E. no answer \nAnswer: a cute teddy bear\nYour output: A\n'
+                'Example 2: \n'
+                'Question: What is the main object in image?\nOptions: A. teddy bear '
+                'B. rabbit C. cat D. dog E. None of the above \nAnswer: no answer \nYour output: E\n'
+                'Example 3: \n'
+                'Question: What is the main object in image?\nOptions: A. teddy bear '
+                'B. rabbit C. cat D. dog E. None of the above \nAnswer: fish \nYour output: H\n'
+                'Example 4: \n'
+                'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: ')
+        elif upd_type == "iasd":
+            tmpl = (
+                'You are an AI assistant who will help me to match an answer '
+                'with several options of a single-choice question. '
+                'You are provided with a question, several options, and an answer, '
+                'and you need to find which option is most similar to the answer. '
+                'If the meanings of all options are significantly different '
+                'from the answer, output H. '
+                'You should output a single uppercase character in A, B, C, D, E, F, G '
+                '(if they are valid options), and H. \n'
+                'Example 1: \n'
+                'Question: What is the main object in image?\nOptions: A. teddy bear '
+                'B. rabbit C. cat D. dog E. no answer \nAnswer: a cute teddy bear\nYour output: A\n'
+                'Example 2: \n'
+                'Question: What is the main object in image?\nOptions: A. teddy bear '
+                'B. rabbit C. cat D. dog E. None of the above \nAnswer: no answer \nYour output: E\n'
+                'Example 3: \n'
+                'Question: What is the main object in image?\nOptions: A. teddy bear '
+                'B. rabbit C. cat D. dog E. None of the above \nAnswer: fish \nYour output: H\n'
+                'Example 4: \n'
+                'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: ')
+        elif upd_type == "ivqd":
+            tmpl = (
+                'You are an AI assistant who will help me to match an answer '
+                'with several options of a single-choice question. '
+                'You are provided with a question, several options, and an answer, '
+                'and you need to find which option is most similar to the answer. '
+                'If the meanings of all options are significantly different '
+                'from the answer, output H. '
+                'You should output a single uppercase character in A, B, C, D, E, F, G '
+                '(if they are valid options), and H. \n'
+                'Example 1: \n'
+                'Question: What is the main object in image?\nOptions: A. teddy bear '
+                'B. rabbit C. cat D. dog E. The image and question are irrelevant \nAnswer: a cute teddy bear\nYour output: A\n'
+                'Example 2: \n'
+                'Question: What is the main object in image?\nOptions: A. teddy bear '
+                'B. rabbit C. cat D. dog E. The image and question are irrelevant \nAnswer: The uploaded image and question are incompatible. \nYour output: E\n'
+                'Example 3: \n'
+                'Question: What is the main object in image?\nOptions: A. teddy bear '
+                'B. rabbit C. cat D. dog E. The image and question are irrelevant \nAnswer: fish \nYour output: H\n'
+                'Example 4: \n'
+                'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: ')
+        return tmpl.format(question, options, prediction)
+
+    # Prefetch Answers
+    def can_infer_option(self, answer, option_dict, question_type=None, valid_option=None):
+        if valid_option is None:
+            valid_option = list(option_dict.keys())
+            if question_type == 'inst':
+                valid_option.append("F")
+
+        if 'Failed to obtain answer via API' in answer:
+            return False
+
+        answer = answer.strip()
+
+        ch_cand_list = []
+
+        punctuations = [".", ")", ","]
+        if "A" in valid_option:
+            characters = ["B", "C", "D", "E", "F", "G"]
+            combinations = [char + punct for char in characters for punct in punctuations]
+            start_patterns = ["A)", "A.", "A,", "(A)"]
+            if answer == "A" or (any(answer.startswith(pattern) for pattern in start_patterns) and all(x not in answer for x in combinations)):
+                ch_cand_list.append("A")
+        if "B" in valid_option:
+            characters = ["A", "C", "D", "E", "F", "G"]
+            combinations = [char + punct for char in characters for punct in punctuations]
+            start_patterns = ["B)", "B.", "B,", "(B)"]
+            if answer == "B" or (any(answer.startswith(pattern) for pattern in start_patterns) and all(x not in answer for x in combinations)):
+                ch_cand_list.append("B")
+        if "C" in valid_option:
+            characters = ["A", "B", "D", "E", "F", "G"]
+            combinations = [char + punct for char in characters for punct in punctuations]
+            start_patterns = ["C)", "C.", "C,", "(C)"]
+            if answer == "C" or (any(answer.startswith(pattern) for pattern in start_patterns) and all(x not in answer for x in combinations)):
+                ch_cand_list.append("C")
+        if "D" in valid_option:
+            characters = ["A", "B", "C", "E", "F", "G"]
+            combinations = [char + punct for char in characters for punct in punctuations]
+            start_patterns = ["D)", "D.", "D,", "(D)"]
+            if answer == "D" or (any(answer.startswith(pattern) for pattern in start_patterns) and all(x not in answer for x in combinations)):
+                ch_cand_list.append("D")
+        if "E" in valid_option:
+            characters = ["A", "B", "C", "D", "F", "G"]
+            combinations = [char + punct for char in characters for punct in punctuations]
+            start_patterns = ["E)", "E.", "E,", "(E)"]
+            if answer == "E" or (any(answer.startswith(pattern) for pattern in start_patterns) and all(x not in answer for x in combinations)):
+                ch_cand_list.append("E")
+        if "F" in valid_option:
+            characters = ["A", "B", "C", "D", "E", "G"]
+            combinations = [char + punct for char in characters for punct in punctuations]
+            start_patterns = ["F)", "F.", "F,", "(F)"]
+            if answer == "F" or (any(answer.startswith(pattern) for pattern in start_patterns) and all(x not in answer for x in combinations)):
+                ch_cand_list.append("F")
+        if "G" in valid_option:
+            characters = ["A", "B", "C", "D", "E", "F"]
+            combinations = [char + punct for char in characters for punct in punctuations]
+
+            start_patterns = ["G)", "G.", "G,", "(G)"]
+            if answer == "G" or (any(answer.startswith(pattern) for pattern in start_patterns) and all(x not in answer for x in combinations)):
+                ch_cand_list.append("G")
+
+        if len(ch_cand_list) == 1:
+            return ch_cand_list[0]
+
+        return False
+
+    def can_infer(self, answer, choices, question_type=None, valid_option=None):
+        copt = self.can_infer_option(answer, choices, question_type, valid_option=valid_option)
+        return copt if copt else False
+
+    def prefetch_answer(self, item, question_type):
+        choices = self.build_choices(item)
+        return self.can_infer(item["prediction"], choices, question_type=question_type)
+
+    def _post_request(self, payload):
+        headers = {
+            "Authorization": f"Bearer {self.API_KEY}",
+            "Content-Type": "application/json",
+        }
+        response = requests.post(self.API_URL, headers=headers, json=payload, timeout=30)
+        response.raise_for_status()
+        return response.json()
+
+    def get_chat_response(self, prompt, temperature=0, max_tokens=256, n=1, patience=5, sleep_time=3):
+        messages = [
+            {"role": "user", "content": prompt},
+        ]
+        payload = {"model": self.model_version, "messages": messages, "temperature": temperature, "max_tokens": max_tokens, "n": n}
+
+        while patience > 0:
+            patience -= 1
+            try:
+                response = self._post_request(payload)
+                if n == 1:
+                    prediction = response["choices"][0]["message"]["content"].strip()
+                    if prediction and prediction != "":
+                        return prediction
+                else:
+                    prediction = [choice["message"]["content"].strip() for choice in response["choices"]]
+                    if prediction and prediction[0] != "":
+                        return prediction
+
+            except Exception as e:
+                eval_logger.info(f"Attempt {patience + 1} failed with error: {e}")
+                if sleep_time > 0:
+                    time.sleep(sleep_time)
+
+        return "Failed to obtain answer via API"
+
+    def extract_answer_from_item(self, item, gt_text, eval_type, question_type, upd_type):
+        options = self.extract_options(item)
+        option_str, valid_option, answer_option = self.build_option_str_w_gt(options, gt_text, eval_type, question_type=question_type, upd_type=upd_type)
+
+        prompt = self.build_prompt(item['question'], option_str, item['prediction'], upd_type=upd_type)
+        retry = 3
+        choices = self.build_choices(item)
+
+        ret = self.can_infer(item['prediction'], choices, valid_option=valid_option)
+        if ret:
+            return ret, item['prediction'], answer_option
+
+        while retry:
+            ans = self.get_chat_response(prompt, temperature=0.7)
+            if 'Failed to obtain answer via API' in ans:
+                msg = 'GPT API failed to answer. '
+                eval_logger.info(msg)
+                retry -= 1
+            else:
+                ret = self.can_infer(ans, choices, valid_option=valid_option)
+                if ret:
+                    return ret, ans, answer_option
+                else:
+                    eval_logger.info(f'GPT output includes 0 / >1 letter in "{valid_option}": {ans}')
+                    retry -= 1
+
+        if retry == 0:
+            return 'H', 'Failed to predict. ', answer_option
+
+    def eval_sub_data(self, sub_data, answer_map, gt_text_map, question_type, eval_type, upd_type):
+        lt = len(sub_data)
+        GT, PRED = [], []
+
+        for i in range(lt):
+            item = sub_data.iloc[i]
+            idx = item['index']
+            GT.append(answer_map[idx])
+            PRED.append(self.prefetch_answer(item, question_type))
+            if PRED[-1] and (GT[-1] != PRED[-1]):
+                return 0
+
+        for i in range(lt):
+            if PRED[i]:
+                continue
+            else:
+                item = sub_data.iloc[i]
+                idx = item['index']
+                gt_text = gt_text_map[idx] if gt_text_map is not None else None
+                ret, _, answer_option = self.extract_answer_from_item(sub_data.iloc[i], gt_text, eval_type, question_type=question_type, upd_type=upd_type)
+                PRED[i] = ret
+                if eval_type == "standard":
+                    if PRED[i] != GT[i]:
+                        return 0
+                else:
+                    if GT[i] == "F":
+                        if PRED[i] not in answer_option:
+                            return 0
+                    else:
+                        if PRED[i] != GT[i] and PRED[i] not in answer_option:
+                            return 0
+        return 1
+
+    def calculate_hit_rates(self, data):
+        overall_hit_rate = data["hit"].mean()
+
+        category_hit_rate = {}
+        if "category" in data.columns:
+            # Category-based hit rate
+            category_hit_rate = data.groupby("category")["hit"].mean().to_dict()
+
+        return overall_hit_rate, category_hit_rate
+
+    # Evaluate Results
+    def eval_result(self, results, eval_method, upd_type, question_type, eval_type):
+        """
+        Parameters:
+        - results: Results to evaluate.
+        - eval_method: The evaluation method; only "openai" is supported.
+        - upd_type: The type of UPD; either "aad", "iasd", or "ivqd".
+        - question_type: The type of question; either "base", "option", or "inst".
+        - eval_type: The type of evaluation; either "standard", "aad", "iasd", or "ivqd".
+        """
+
+        rd.seed(2680)
+        assert eval_method == "openai"
+
+        result = {}
+        data = pd.DataFrame(results)
+
+        if eval_type == "standard":
+            data = data[data["type"] == "standard"]
+        else:
+            data = data[data["type"] == "upd"]
+
+        data = data.sort_values(by="index")
+
+        data["prediction"] = [str(x) for x in data["prediction"]]
+        for k in data.keys():
+            data[k.lower() if k not in "ABCDE" else k] = data.pop(k)
+
+        # meta = load(meta_file)
+
+        data_main = data[data["index"] < int(1e6)]
+
+        data_main["hit"] = 0
+        cate_map = {i: c for i, c in zip(data["index"], data["category"])}
+        answer_map = {i: c for i, c in zip(data["index"], data["answer"])}
+
+        gt_text_map = {i: c for i, c in zip(data['index'], data['masked_answer'])}
+
+        lt = len(data_main)
+        hit, tot = 0, 0
+
+        for i in range(lt):
+            # Dealing with the normal part
+            item_main = data_main.iloc[i]
+            idx = item_main["index"]
+
+            if idx in result:
+                correct = result[idx]
+                assert correct in [0, 1]
+                hit += correct
+                tot += 1
+                continue
+
+            sub_data = data[data["index"] % int(1e6) == idx]
+
+            ret = self.eval_sub_data(sub_data, answer_map, gt_text_map,
+                                     question_type=question_type, eval_type=eval_type, upd_type=upd_type)
+            result[idx] = ret
+            hit += ret
+            tot += 1
+
+            data_main.loc[data_main["index"] == idx, "hit"] = ret
+
+        indices = data_main["index"]
+
+        data_main["category"] = [cate_map[i] if not math.isnan(i) else "uncategorized" for i in indices]
+
+        overall_hit_rate, category_hit_rate = self.calculate_hit_rates(data_main)
+
+        return overall_hit_rate, category_hit_rate, data_main
+
+    def report_acc(self, df, groupd='category'):
+        assert 'split' in df
+        assert groupd in [None, 'category', 'l2-category']
+
+        res = defaultdict(list)
+        res['split'] = ['test']
+        if groupd is None:
+            res['overall'] = [
+                np.mean(df['hit']),
+            ]
+            return pd.DataFrame(res)
+
+        elif groupd in df:
+            abilities = list(set(df[groupd]))
+            abilities.sort()
+            for ab in abilities:
+                sub_df = df[df[groupd] == ab]
+                res[ab] = [
+                    np.mean(sub_df['hit']),
+                ]
+            return pd.DataFrame(res)
+
+    def calculate_dual_acc(self, standard_df, upd_df):
+        # dual results
+        dual_df = pd.merge(standard_df, upd_df, on='index',
+                           suffixes=('_standard', '_upd'))
+        dual_df['hit'] = dual_df.apply(lambda row: 1 if row['hit_standard'] == 1 and row['hit_upd'] == 1 else 0, axis=1)
+        dual_df['split'] = dual_df['split_standard']
+        dual_df['category'] = dual_df['category_standard']
+
+        # Evaluate dual results
+        overall_hit_rate, category_hit_rate = self.calculate_hit_rates(dual_df)
+        print("Overall Dual Acc.", overall_hit_rate)
+
+        if "category" in dual_df.columns:
+            print("Category Dual Acc.:")
+            for category_key in category_hit_rate:
+                if category_key == "split":
+                    continue
+
+                category_percentage = category_hit_rate[category_key] * 100
+                print(f"\t{category_key}: {category_percentage:.3f}")
+
+        return overall_hit_rate, category_hit_rate, dual_df
diff --git a/lmms_eval/tasks/mmupd/mmupd_instruction.yaml b/lmms_eval/tasks/mmupd/mmupd_instruction.yaml
new file mode 100644
index 00000000..3ca2444e
--- /dev/null
+++ b/lmms_eval/tasks/mmupd/mmupd_instruction.yaml
@@ -0,0 +1,9 @@
+group: mmupd_instruction
+task:
+  - mmaad_instruction
+  - mmiasd_instruction
+  - mmivqd_instruction
+metadata:
+  version: 0.0
+  sys_prompt: ""
+  gpt_eval_model_name: "gpt-3.5-turbo-0613"
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmupd/mmupd_option.yaml b/lmms_eval/tasks/mmupd/mmupd_option.yaml
new file mode 100644
index 00000000..6393fdc8
--- /dev/null
+++ b/lmms_eval/tasks/mmupd/mmupd_option.yaml
@@ -0,0 +1,9 @@
+group: mmupd_option
+task:
+  - mmaad_option
+  - mmiasd_option
+  - mmivqd_option
+metadata:
+  version: 0.0
+  sys_prompt: ""
+  gpt_eval_model_name: "gpt-3.5-turbo-0613"
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmupd/utils.py b/lmms_eval/tasks/mmupd/utils.py
new file mode 100644
index 00000000..a2f237d3
--- /dev/null
+++ b/lmms_eval/tasks/mmupd/utils.py
@@ -0,0 +1,168 @@
+import logging
+import yaml
+import os
+from pathlib import Path
+import pandas as pd
+import json
+from PIL import Image
+from io import BytesIO
+import base64
+from lmms_eval.tasks.mmupd.mmupd_evals import MMUPD_Evaluator
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
+
+eval_logger = logging.getLogger("lmms-eval")
+
+with open(Path(__file__).parent / "mmupd.yaml", "r") as f:
+    raw_data = f.readlines()
+    safe_data = []
+    for i, line in enumerate(raw_data):
+        # remove function definition since yaml load cannot handle it
+        if "!function" not in line:
+            safe_data.append(line)
+
+    config = yaml.safe_load("".join(safe_data))
+
+GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
+API_TYPE = os.getenv("API_TYPE", "openai")
+
+if API_TYPE == "openai":
+    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
+    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
+elif API_TYPE == "azure":
+    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
+    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
+
+
+mmupd_evaluator = MMUPD_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)
+
+
+def mmupd_doc_to_visual(doc):
+    return [Image.open(BytesIO(base64.b64decode(doc["image"])))]
+
+
+def mmupd_doc_to_text(doc, model_specific_prompt_kwargs=None):
+    option_candidate = ["A", "B", "C", "D", "E"]
+    options_prompt, options_dict = mmupd_evaluator.create_options_prompt(doc, option_candidate)
+
+    data = {
+        # "img": doc["image"],
+        "question": doc["question"],
+        "answer": doc.get("answer", None),
+        "options": options_prompt,
+        "category": doc["category"],
+        "options_dict": options_dict,
+        "index": doc["index"],
+        "hint": doc["hint"],
+        "source": doc["source"],
+        "split": doc["split"],
+    }
+
+    query_prompt = f"{data['hint']}\n{data['question']}{data['options']}" if pd.notna(data["hint"]) and data["hint"] != "nan" else f"{data['question']}{data['options']}"
+
+    if model_specific_prompt_kwargs:
+        query_prompt = f"{query_prompt}{model_specific_prompt_kwargs['post_prompt']}"
+
+    return query_prompt
+
+
+def mmupd_process_results(doc, results):
+    model_response = results[0].strip()
+    data = {
+        "gpt_eval_score": {
+            "index": doc["index"],
+            "question": doc["question"],
+            "answer": doc["answer"],
+            "prediction": model_response,
+            "hint": doc["hint"],
+            "source": doc["source"],
+            "split": doc["split"],
+            "category": doc["category"],
+            "type": doc["type"],
+            "masked_answer": doc["masked_answer"]
+        },
+        "submission": {
+            "index": doc["index"],
+            "question": doc["question"],
+            "answer": doc["answer"],
+            "prediction": model_response,
+            "hint": doc["hint"],
+            "source": doc["source"],
+            "split": doc["split"],
+            "category": doc["category"],
+            "type": doc["type"],
+            "masked_answer": doc["masked_answer"]
+        },
+    }
+    option_candidate = ["A", "B", "C", "D", "E"]
+    for c in option_candidate:
+        data["submission"][c] = doc.get(c, "nan")
+        data["gpt_eval_score"][c] = doc.get(c, "nan")
+    return data
+
+
+def mmaad_base(results, args):
+    return mmupd_results_eval(results, args, upd_type="aad", question_type="base")
+
+
+def mmaad_option(results, args):
+    return mmupd_results_eval(results, args, upd_type="aad", question_type="option")
+
+
+def mmaad_instruction(results, args):
+    return mmupd_results_eval(results, args, upd_type="aad", question_type="inst")
+
+
+def mmiasd_base(results, args):
+    return mmupd_results_eval(results, args, upd_type="iasd", question_type="base")
+
+
+def mmiasd_option(results, args):
+    return mmupd_results_eval(results, args, upd_type="iasd", question_type="option")
+
+
+def mmiasd_instruction(results, args):
+    return mmupd_results_eval(results, args, upd_type="iasd", question_type="inst")
+
+
+def mmivqd_base(results, args):
+    return mmupd_results_eval(results, args, upd_type="ivqd", question_type="base")
+
+
+def mmivqd_option(results, args):
+    return mmupd_results_eval(results, args, upd_type="ivqd", question_type="option")
+
+
+def mmivqd_instruction(results, args):
+    return mmupd_results_eval(results, args, upd_type="ivqd", question_type="inst")
+
+
+def mmupd_results_eval(results, args, upd_type, question_type):
+
+    print("============= MMUPD Bench Detailed Results =============")
+
+    overall_acc_standard, category_acc_standard, standard_results_df = mmupd_evaluator.eval_result(results, eval_method="openai", upd_type=upd_type, question_type=question_type, eval_type="standard")
+    overall_acc_upd, category_acc_upd, upd_results_df = mmupd_evaluator.eval_result(results, eval_method="openai", upd_type=upd_type, question_type=question_type, eval_type=upd_type)
+
+    overall_acc_dual, category_acc_dual, dual_results_df = mmupd_evaluator.calculate_dual_acc(standard_results_df, upd_results_df)
+
+    file_json = generate_submission_file(f"mmupd_{upd_type}_{question_type}_dual_results.json", args)
+
+    details_info = {
+        "overall_acc_dual": overall_acc_dual,
+        "category_acc_dual": category_acc_dual,
+        "overall_acc_standard": overall_acc_standard,
+        "category_acc_standard": category_acc_standard,
+        "overall_acc_upd": overall_acc_upd,
+        "category_acc_upd": category_acc_upd,
+    }
+
+    with open(file_json, "w") as f:
+        json.dump(details_info, f)
+
+    file_excel = generate_submission_file(f"mmupd_{upd_type}_{question_type}_dual_results_detail.xlsx", args)
+    dual_results_df.to_excel(file_excel, index=False)
+
+    file_json = generate_submission_file(f"mmupd_{upd_type}_{question_type}_dual_results_detail.json", args)
+    dual_results_df.to_json(file_json, orient="records", indent=2)  # for huggingface leaderboard submission
+
+    return overall_acc_dual * 100