From 96820d0c79e45a176f7d67972b97a53965c28b36 Mon Sep 17 00:00:00 2001
From: Tianrui Guan <502112826@qq.com>
Date: Sun, 10 Dec 2023 13:35:58 -0500
Subject: [PATCH] update leaderboard and data

---
 README.md          |  29 ++++----
 gpt4v_benchmark.py |   4 ++
 random_guess.py    | 164 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 184 insertions(+), 13 deletions(-)
 create mode 100644 random_guess.py

diff --git a/README.md b/README.md
index 8f049f4..b484530 100644
--- a/README.md
+++ b/README.md
@@ -91,19 +91,22 @@ You can use your own API key for GPT4 evaluation by editing the code [here](./ut
 | Model | Question Pair Acc | Figure Acc | Easy Question Acc | Hard Question Acc | Question Acc | Json |
 | ----- | :----: | :----: | :----: | :----: | :----: | :----: |
-| **GPT4V** <br />Sep 25, 2023 Version <br />(Human Eval) | 31.42 | 44.22 | 79.56 | 38.37 | 67.58 | [VD](), [VS]() |
-| **GPT4V** <br />Sep 25, 2023 Version <br />(GPT Eval) | 28.79 | 39.88 | 75.60 | 37.67 | 65.28 | [VD](), [VS]() |
-| **LLaVA-1.5** <br />(Human Eval) | 9.45 | 25.43 | 50.77 | 29.07 | 47.12 | [VD](), [VS]() |
-| **LLaVA-1.5** <br />(GPT Eval) | 10.55 | 24.86 | 49.67 | 29.77 | 46.94 | [VD](), [VS]() |
-| **BLIP2-T5** <br />(GPT Eval) | 15.16 | 20.52 | 45.49 | 43.49 | 48.09 | [VD](), [VS]() |
-| **InstructBLIP** <br />(GPT Eval) | 9.45 | 10.11 | 35.60 | 45.12 | 45.26 | [VD](), [VS]() |
-| **Qwen-VL** <br />(GPT Eval) | 5.93 | 6.65 | 31.43 | 24.88 | 39.15 | [VD](), [VS]() |
-| **Open-Flamingo** <br />(GPT Eval) | 6.37 | 11.27 | 39.56 | 27.21 | 38.44 | [VD](), [VS]() |
-| **MiniGPT5** <br />(GPT Eval) |10.55 | 9.83 | 36.04| 28.37 | 40.30 | [VD](), [VS]() |
-| **MiniGPT4** <br />(GPT Eval) |8.79 | 10.12 | 31.87| 27.67 | 35.78 | [VD](), [VS]() |
-| **mPLUG_Owl-v2** <br />(GPT Eval) |13.85 | 19.94 | 44.84| 39.07 | 47.30 | [VD](), [VS]() |
-| **mPLUG_Owl-v1** <br />(GPT Eval) |9.45 | 10.40 | 39.34| 29.77 | 43.93 | [VD](), [VS]() |
-| **GiT** <br />(GPT Eval) |5.27 | 6.36 | 26.81| 31.86 | 34.37 | [VD](), [VS]() |
+| **GPT4V** <br />Sep 25, 2023 Version <br />(Human Eval) | 31.42 | 44.22 | 79.56 | 38.37 | 67.58 | [VD](https://drive.google.com/file/d/1E-YGJjGV9nhMear7MsHan99oSzs7aeuO/view?usp=sharing), [VS](https://drive.google.com/file/d/1rndpHxRZFocY7PFDr5R1pF_EkEyC-sZT/view?usp=sharing) |
+| **GPT4V** <br />Sep 25, 2023 Version <br />(GPT Eval) | 28.79 | 39.88 | 75.60 | 37.67 | 65.28 | [VD](https://drive.google.com/file/d/1E-YGJjGV9nhMear7MsHan99oSzs7aeuO/view?usp=sharing), [VS](https://drive.google.com/file/d/1rndpHxRZFocY7PFDr5R1pF_EkEyC-sZT/view?usp=sharing) |
+| **LLaVA-1.5** <br />(Human Eval) | 9.45 | 25.43 | 50.77 | 29.07 | 47.12 | [VD](https://drive.google.com/file/d/1_5P7lVileQMnRwg5GMWRQvABTD3HFj5_/view?usp=sharing), [VS](https://drive.google.com/file/d/1eB_v4Oe_FttGGcsrorOaOiNblHhBCVDV/view?usp=sharing) |
+| **LLaVA-1.5** <br />(GPT Eval) | 10.55 | 24.86 | 49.67 | 29.77 | 46.94 | [VD](https://drive.google.com/file/d/1_5P7lVileQMnRwg5GMWRQvABTD3HFj5_/view?usp=sharing), [VS](https://drive.google.com/file/d/1eB_v4Oe_FttGGcsrorOaOiNblHhBCVDV/view?usp=sharing) |
+| **BLIP2-T5** <br />(GPT Eval) | 15.16 | 20.52 | 45.49 | 43.49 | 48.09 | [VD](https://drive.google.com/file/d/1lpo2Lg0a2ruwuLoahtvhoZdnnprXKxUz/view?usp=sharing), [VS](https://drive.google.com/file/d/1aTt6TSXSbA-k6Mvv5kTvaBcgFvlKm1Ox/view?usp=sharing) |
+| **Qwen-VL** <br />(GPT Eval) | 5.93 | 6.65 | 31.43 | 24.88 | 39.15 | [VD](https://drive.google.com/file/d/1-gTi91aU5sI3vc4Yer6qT06dnXMDPHdP/view?usp=sharing), [VS](https://drive.google.com/file/d/1kSXoN_nNcRyaTL-7JhEqrrr5XTVLjOCb/view?usp=sharing) |
+| **Open-Flamingo** <br />(GPT Eval) | 6.37 | 11.27 | 39.56 | 27.21 | 38.44 | [VD](https://drive.google.com/file/d/1OsvE47tguXyaFdL2UEQdg14bSF7jV-by/view?usp=sharing), [VS](https://drive.google.com/file/d/1Klwy9hga_9V3S83Q5U-ofeLlWnYPwpMq/view?usp=sharing) |
+| **MiniGPT5** <br />(GPT Eval) | 10.55 | 9.83 | 36.04 | 28.37 | 40.30 | [VD](https://drive.google.com/file/d/1g6rCyUekv5nsZHDbzvZBg3VhwtUZPQk0/view?usp=sharing), [VS](https://drive.google.com/file/d/1fB4v00-FJ1ENR-paoAzajuGtRyy0ZivJ/view?usp=sharing) |
+| **MiniGPT4** <br />(GPT Eval) | 8.79 | 10.12 | 31.87 | 27.67 | 35.78 | [VD](https://drive.google.com/file/d/148AwFVXLCaw2HCbvaZ85hVjC_otjOWlB/view?usp=sharing), [VS](https://drive.google.com/file/d/1KQXiGSEQjm0JSSIKtQP21279jOhatJuq/view?usp=sharing) |
+| **InstructBLIP** <br />(GPT Eval) | 9.45 | 10.11 | 35.60 | 45.12 | 45.26 | [VD](https://drive.google.com/file/d/13jPzKKKvffxwZoegA0aBAOvmt0AXf2hU/view?usp=sharing), [VS](https://drive.google.com/file/d/1HUe0fay54owtOIJPuolF08mbT0uF0NYG/view?usp=sharing) |
+| **BLIP2** <br />(GPT Eval) | 5.05 | 12.43 | 33.85 | 40.70 | 40.48 | [VD](https://drive.google.com/file/d/175L9UwR7Cjet3MSOfxgn3pJeVSr3a2zc/view?usp=sharing), [VS](https://drive.google.com/file/d/104E27ReTdDcaKbDKS6LZJnH0PdMjfwcA/view?usp=sharing) |
+| **mPLUG_Owl-v2** <br />(GPT Eval) | 13.85 | 19.94 | 44.84 | 39.07 | 47.30 | [VD](https://drive.google.com/file/d/1pTq8lVnCnUbMjYf7SOXBF_iYXADyuniH/view?usp=sharing), [VS](https://drive.google.com/file/d/1ouaSAlcfm1kYOxa2MHrLHPLrGocYqtRJ/view?usp=sharing) |
+| **mPLUG_Owl-v1** <br />(GPT Eval) | 9.45 | 10.40 | 39.34 | 29.77 | 43.93 | [VD](https://drive.google.com/file/d/1zl-Omccz2gN295OJb-WJmWlKfx0YLPNo/view?usp=sharing), [VS](https://drive.google.com/file/d/1nNG0KLifWdzwE0qe7UnIlRDaosgPkBQF/view?usp=sharing) |
+| **LRV_Instruction** <br />(GPT Eval) | 8.79 | 13.01 | 39.78 | 27.44 | 42.78 | [VD](https://drive.google.com/file/d/1q0AUUrua-jVTSbL4oL2knCleopEmrGrQ/view?usp=sharing), [VS](https://drive.google.com/file/d/1cb7QILBox-78I9txzR70uWHmB38I13FA/view?usp=sharing) |
+| **ViLT** <br />(GPT Eval) | 8.35 | 11.27 | 37.80 | 45.35 | 44.46 | [VD](https://drive.google.com/file/d/1xpCi1utJ3tD97y0CD496bEaMDA3p2qxY/view?usp=sharing), [VS](https://drive.google.com/file/d/1ZZK2lvpGdoY_ntWpF23m-m4LIo-Jgxav/view?usp=sharing) |
+| **GiT** <br />(GPT Eval) | 5.27 | 6.36 | 26.81 | 31.86 | 34.37 | [VD](https://drive.google.com/file/d/14zkly-UCIjaMdRaYKyHQHgtJVM8enp9i/view?usp=sharing), [VS](https://drive.google.com/file/d/1aH6KCsiXKMMsh9pElnQrR1scIfOA3zJp/view?usp=sharing) |
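
> Note on the README hunk above: the section it edits starts with "You can use your own API key for GPT4 evaluation by editing the code [here](./ut…)" (the path is truncated in the hunk header, so the exact location in `utils.py` is not shown). A minimal sketch of what that edit typically looks like, assuming the pre-1.0 `openai` Python client that these scripts import; reading the key from `OPENAI_API_KEY` is our convention, not the repo's:

```python
# Hedged sketch: supply your own key to the GPT-4 evaluator.
# Assumes the pre-1.0 `openai` client; the env-var name is illustrative.
import os
import openai

openai.api_key = os.environ.get("OPENAI_API_KEY", "")
```
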
diff --git a/gpt4v_benchmark.py b/gpt4v_benchmark.py
index 46bdcda..afbe383 100644
--- a/gpt4v_benchmark.py
+++ b/gpt4v_benchmark.py
@@ -96,6 +96,8 @@ def generate_answer(data, model_output_entry):
     data_vs = assign_correctness(data_vs, correctness_entry=model_correctness_entry_human)
     data = data_vd + data_vs
 
+    # data = [i for i in data if i["subcategory"] == "illusion"]
+
     all_data = get_eval_all(data, model_correctness_entry_human)
     all_vd = get_eval_all(data_vd, model_correctness_entry_human)
     all_vs = get_eval_all(data_vs, model_correctness_entry_human)
@@ -163,6 +165,8 @@ def generate_answer(data, model_output_entry):
     data_vs = assign_correctness(data_vs, correctness_entry=model_correctness_entry)
     data = data_vd + data_vs
 
+    # data = [i for i in data if i["subcategory"] == "illusion"]
+
     all_data = get_eval_all(data, model_correctness_entry)
     all_vd = get_eval_all(data_vd, model_correctness_entry)
     all_vs = get_eval_all(data_vs, model_correctness_entry)
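
> The only change to `gpt4v_benchmark.py` is a commented-out filter that, when enabled, restricts evaluation to a single subcategory. A small generalization of that one-liner; the helper name `filter_subcategory` is ours, not the repo's, while `subcategory` is a field on each HallusionBench record as the diff shows:

```python
# Sketch generalizing the commented-out filter in the hunks above.
# `filter_subcategory` is a hypothetical helper, not part of the repo.
def filter_subcategory(data, subcategory=None):
    """Keep only records whose "subcategory" field matches; all if None."""
    if subcategory is None:
        return data
    return [i for i in data if i["subcategory"] == subcategory]

# Equivalent to uncommenting the line in the patch:
# data = filter_subcategory(data, "illusion")
```
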
diff --git a/random_guess.py b/random_guess.py
new file mode 100644
index 0000000..65efc1f
--- /dev/null
+++ b/random_guess.py
@@ -0,0 +1,164 @@
+
+
+import csv
+import json
+from tqdm import tqdm
+import numpy as np
+from prettytable import PrettyTable
+import os
+import time
+from utils import *
+import random
+import openai
+
+
+### to evaluate your method, implement and run the generate_answer function!
+
+root_dir = "."
+###
+input_file_name = "HallusionBench.json"
+save_json_path_vd = "./hallusion_output_vd_random_guess2.json"
+save_json_path_vs = "./hallusion_output_vs_random_guess2.json"
+###
+# load_json = False
+load_json = True
+model_output_entry = "model_prediction"
+model_correctness_entry = "gpt4v_output_gpt_check"
+
+
+def generate_answer(data, model_output_entry):
+
+    for i in data:
+        i[model_output_entry] = "Yes" if random.random() > 0.5 else "No"
+
+    ## TODO
+    ## implement this section with your model!
+    ## your_function(img_filename, question) -> "0" (No), "1" (Yes), "2" (Uncertain)
+    # for r in data:
+    #     r[model_output_entry] = your_function(r["filename"], r["question"])
+
+    return data
+
+
+if __name__ == "__main__":
+
+    data_vd = []
+    data_vs = []
+    with open(input_file_name) as json_file:
+        datas = json.load(json_file)
+
+    datas = generate_answer(datas, model_output_entry)
+
+    for data in tqdm(datas):
+        if data['category'] == 'VD':
+            data_vd.append(data)
+        if data['category'] == 'VS':
+            data_vs.append(data)
+
+    data_vd = evaluate_by_chatgpt(data_vd, model_output_entry, model_correctness_entry, load_json=load_json, save_json_path=save_json_path_vd)
+    data_vd = check_same_by_chatgpt(data_vd, model_output_entry, load_json=load_json, save_json_path=save_json_path_vd)
+    # time.sleep(60)
+    try:
+        data_vs = evaluate_by_chatgpt(data_vs, model_output_entry, model_correctness_entry, load_json=load_json, save_json_path=save_json_path_vs)
+        data_vs = check_same_by_chatgpt(data_vs, model_output_entry, load_json=load_json, save_json_path=save_json_path_vs)
+    except:
+        time.sleep(60)
+        data_vs = evaluate_by_chatgpt(data_vs, model_output_entry, model_correctness_entry, load_json=load_json, save_json_path=save_json_path_vs)
+        data_vs = check_same_by_chatgpt(data_vs, model_output_entry, load_json=load_json, save_json_path=save_json_path_vs)
+    print("##### GPT Evaluate #####")
+
+    data_vd = assign_correctness(data_vd, correctness_entry=model_correctness_entry)
+    data_vs = assign_correctness(data_vs, correctness_entry=model_correctness_entry)
+    data = data_vd + data_vs
+
+    all_data = get_eval_all(data, model_correctness_entry)
+    all_vd = get_eval_all(data_vd, model_correctness_entry)
+    all_vs = get_eval_all(data_vs, model_correctness_entry)
+
+    table1 = [["per question", "Total"],
+              ["VD", round(100 * all_vd["correct"]/all_vd["total"], 4)],
+              ["VS", round(100 * all_vs["correct"]/all_vs["total"], 4)],
+              ["Overall", round(100 * all_data["correct"]/all_data["total"], 4)]]
+    tab1 = PrettyTable(table1[0])
+    tab1.add_rows(table1[1:])
+
+
+    q_acc_gpt = round(100 * all_data["correct"]/all_data["total"], 4)
+
+    all_data = get_eval_pair_all(data, model_correctness_entry)
+    easy = get_eval_pair_easy(data)
+    hard = get_eval_pair_hard(data)
+    all_vd = get_eval_pair_all(data_vd, model_correctness_entry)
+    easy_vd = get_eval_pair_easy(data_vd)
+    hard_vd = get_eval_pair_hard(data_vd)
+    all_vs = get_eval_pair_all(data_vs, model_correctness_entry)
+    easy_vs = get_eval_pair_easy(data_vs)
+    hard_vs = get_eval_pair_hard(data_vs)
+
+    # question pair level
+    table3 = [["per question pair", "Easy", "Hard", "Total"],
+              ["VD", round(100 * easy_vd["correct"]/easy_vd["total"], 4), round(100 * hard_vd["correct"]/hard_vd["total"], 4), round(100 * all_vd["correct"]/all_vd["total"], 4)],
+              ["VS", round(100 * easy_vs["correct"]/easy_vs["total"], 4), round(100 * hard_vs["correct"]/hard_vs["total"], 4), round(100 * all_vs["correct"]/all_vs["total"], 4)],
+              ["Overall", round(100 * easy["correct"]/easy["total"], 4), round(100 * hard["correct"]/hard["total"], 4), round(100 * all_data["correct"]/all_data["total"], 4)]]
+    tab3 = PrettyTable(table3[0])
+    tab3.add_rows(table3[1:])
+    # print(tab3)
+
+
+    fig_all = get_eval_fig(data)
+    fig_vd = get_eval_fig(data_vd)
+    fig_vs = get_eval_fig(data_vs)
+
+    # image level
+    table2 = [["per figure", "Correct", "Wrong", "Score"],
+              ["VD", round(100 * fig_vd["correct"]/fig_vd["total"], 4), round(100 * fig_vd["inconsistent"]/fig_vd["total"], 4) + round(100 * fig_vd["wrong"]/fig_vd["total"], 4), round(fig_vd["score"], 4)],
+              ["VS", round(100 * fig_vs["correct"]/fig_vs["total"], 4), round(100 * fig_vs["inconsistent"]/fig_vs["total"], 4) + round(100 * fig_vs["wrong"]/fig_vs["total"], 4), round(fig_vs["score"], 4)],
+              ["Overall", round(100 * fig_all["correct"]/fig_all["total"], 4), round(100 * fig_all["inconsistent"]/fig_all["total"], 4) + round(100 * fig_all["wrong"]/fig_all["total"], 4), round(fig_all["score"], 4)]]
+    tab2 = PrettyTable(table2[0])
+    tab2.add_rows(table2[1:])
+
+    pair_acc_gpt = round(100 * all_data["correct"]/all_data["total"], 4)
+    figure_acc_gpt = round(100 * fig_all["correct"]/fig_all["total"], 4)
+    easy_acc_gpt = round(100 * easy["correct"]/easy["total"], 4)
+    hard_acc_gpt = round(100 * hard["correct"]/hard["total"], 4)
+
+
+
+    print("##### Question Stats #####")
+    print("Easy Questions: " + str(easy_vd["total_q"]) + " (Visual Dependent) + " + str(easy_vs["total_q"]) + " (Visual Supplement)")
+    print("Hard Questions: " + str(hard_vd["total_q"]) + " (Visual Dependent) + " + str(hard_vs["total_q"]) + " (Visual Supplement)")
+    print("Total Questions: " + str(all_data["total_q"]))
+
+
+    print("##### Figure Stats #####")
+    print("Visual Dependent Figures: " + str(fig_vd["total"]))
+    print("Visual Supplement Figures: " + str(fig_vs["total"]))
+    print("Total Figures: " + str(fig_all["total"]))
+
+    print("##### Leaderboard Stats #####")
+
+    table = [["", "Acc per question pair (qAcc)", "Acc per figure (fAcc)", "Acc per easy question (easy aAcc)", "Acc per hard question (hard aAcc)", "Acc per question (aAcc)"],
+             ["GPT Eval", pair_acc_gpt, figure_acc_gpt, easy_acc_gpt, hard_acc_gpt, q_acc_gpt]]
+    leaderboard = PrettyTable(table[0])
+    leaderboard.add_rows(table[1:])
+    print(leaderboard)
+
+
+    stats = yes_ratio_stats(data)
+
+    table = [["", "Yes/No Bias (Pct Diff)", "Yes/No Bias (FP Ratio)", "Consistency Test (correct)", "Consistency Test (inconsistent)", "Consistency Test (wrong)", "LH", "VI", "Mixed"],
+             ["GPT Eval", stats["diff"], stats["fp"], round(100 * fig_all["correct"]/fig_all["total"], 4), round(100 * fig_all["inconsistent"]/fig_all["total"], 4), round(100 * fig_all["wrong"]/fig_all["total"], 4), round(100 * all_data["LH_cg"]/(all_data["LH_cg"] + all_data["VI_cg"] + all_data["Mix_cg"]), 4), round(100 * all_data["VI_cg"]/(all_data["LH_cg"] + all_data["VI_cg"] + all_data["Mix_cg"]), 4), round(100 * all_data["Mix_cg"]/(all_data["LH_cg"] + all_data["VI_cg"] + all_data["Mix_cg"]), 4)]]
+    test = PrettyTable(table[0])
+    test.add_rows(table[1:])
+    print(test)
+
+    orig = [i for i in data if int(i["visual_input"]) == 1]
+
+    edit = [i for i in data if int(i["visual_input"]) == 2]
+
+    a = np.unique([i["category"] + "_" + i["subcategory"] + "_" + i["set_id"] + "_" + i["figure_id"] for i in orig])
+    b = np.unique([i["category"] + "_" + i["subcategory"] + "_" + i["set_id"] + "_" + i["figure_id"] for i in edit])
+    print(len(a))
+    print(len(b))
+
+
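
> As the TODO in `random_guess.py` notes, the random baseline is meant to be replaced by filling in `generate_answer` with real model calls. A minimal sketch under the interface the file itself documents, where `your_function(img_filename, question)` returns "0" for No, "1" for Yes, or "2" for Uncertain; `my_vqa_model` is a hypothetical stand-in for your model:

```python
# Hedged sketch of the TODO in random_guess.py: swap the coin flip for a
# real model. `my_vqa_model` is hypothetical; per the file's comment, the
# prediction should be "0" (No), "1" (Yes), or "2" (Uncertain).
def generate_answer(data, model_output_entry):
    for r in data:
        r[model_output_entry] = my_vqa_model(r["filename"], r["question"])
    return data
```
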
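> The script guards the VS evaluation with a single bare `except:` that sleeps 60 seconds and retries once, a common workaround for API rate limits. If rate limits bite more often, a bounded retry helper (our own sketch, not part of the repo) keeps the same behavior while catching only real exceptions:

```python
import time

# Hypothetical helper: bounded retries with a fixed pause, generalizing
# the one-shot try/except in random_guess.py.
def with_retries(fn, attempts=3, pause_s=60):
    for attempt in range(attempts):
        try:
            return fn()
        except Exception:
            if attempt == attempts - 1:
                raise  # give up after the final attempt
            time.sleep(pause_s)

# Usage mirroring the script (names from random_guess.py):
# data_vs = with_retries(lambda: evaluate_by_chatgpt(
#     data_vs, model_output_entry, model_correctness_entry,
#     load_json=load_json, save_json_path=save_json_path_vs))
```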