update leaderboard and data

Lichang-Chen · Dec 10, 2023 · 96820d0 · 96820d0
1 parent 050ab6c
commit 96820d0
Show file tree

Hide file tree

Showing 3 changed files with 184 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -91,19 +91,22 @@ You can use your own API key for GPT4 evaluation by editing the code [here](./ut
 
 | Model | Question Pair Acc | Figure Acc | Easy Question Acc | Hard Question Acc | Question Acc | Json |
 | ----- | :----: | :----: | :----: | :----: | :----: | :----: |
-| **GPT4V** <br />Sep 25, 2023 Version <br />(Human Eval) | 31.42 | 44.22 |  79.56 | 38.37 |  67.58 | [VD](), [VS]() |
-| **GPT4V** <br />Sep 25, 2023 Version <br />(GPT Eval) | 28.79 | 39.88 | 75.60 |  37.67 | 65.28 | [VD](), [VS]() |
-| **LLaVA-1.5** <br />(Human Eval) | 9.45 |  25.43 | 50.77 | 29.07 | 47.12 | [VD](), [VS]() |
-| **LLaVA-1.5** <br />(GPT Eval) | 10.55 |  24.86  | 49.67 | 29.77  | 46.94 | [VD](), [VS]() |
-| **BLIP2-T5** <br />(GPT Eval) | 15.16 |  20.52  | 45.49 | 43.49  | 48.09 | [VD](), [VS]() |
-| **InstructBLIP** <br />(GPT Eval) | 9.45 |  10.11  | 35.60 | 45.12 | 45.26 | [VD](), [VS]() |
-| **Qwen-VL** <br />(GPT Eval) | 5.93 |  6.65  | 31.43 | 24.88 | 39.15 | [VD](), [VS]() |
-| **Open-Flamingo** <br />(GPT Eval) | 6.37 |  11.27 | 39.56 | 27.21 | 38.44 | [VD](), [VS]() |
-| **MiniGPT5** <br />(GPT Eval) |10.55 | 9.83 | 36.04| 28.37 | 40.30 | [VD](), [VS]() |
-| **MiniGPT4** <br />(GPT Eval) |8.79 | 10.12 | 31.87| 27.67 | 35.78 | [VD](), [VS]() |
-| **mPLUG_Owl-v2** <br />(GPT Eval) |13.85 | 19.94 | 44.84| 39.07 | 47.30 | [VD](), [VS]() |
-| **mPLUG_Owl-v1** <br />(GPT Eval) |9.45 | 10.40 | 39.34| 29.77 | 43.93 | [VD](), [VS]() |
-| **GiT** <br />(GPT Eval) |5.27 | 6.36 | 26.81| 31.86 | 34.37 | [VD](), [VS]() |
+| **GPT4V** <br />Sep 25, 2023 Version <br />(Human Eval) | 31.42 | 44.22 |  79.56 | 38.37 |  67.58 | [VD](https://drive.google.com/file/d/1E-YGJjGV9nhMear7MsHan99oSzs7aeuO/view?usp=sharing), [VS](https://drive.google.com/file/d/1rndpHxRZFocY7PFDr5R1pF_EkEyC-sZT/view?usp=sharing) |
+| **GPT4V** <br />Sep 25, 2023 Version <br />(GPT Eval) | 28.79 | 39.88 | 75.60 |  37.67 | 65.28 | [VD](https://drive.google.com/file/d/1E-YGJjGV9nhMear7MsHan99oSzs7aeuO/view?usp=sharing), [VS](https://drive.google.com/file/d/1rndpHxRZFocY7PFDr5R1pF_EkEyC-sZT/view?usp=sharing) |
+| **LLaVA-1.5** <br />(Human Eval) | 9.45 |  25.43 | 50.77 | 29.07 | 47.12 | [VD](https://drive.google.com/file/d/1_5P7lVileQMnRwg5GMWRQvABTD3HFj5_/view?usp=sharing), [VS](https://drive.google.com/file/d/1eB_v4Oe_FttGGcsrorOaOiNblHhBCVDV/view?usp=sharing) |
+| **LLaVA-1.5** <br />(GPT Eval) | 10.55 |  24.86  | 49.67 | 29.77  | 46.94 | [VD](https://drive.google.com/file/d/1_5P7lVileQMnRwg5GMWRQvABTD3HFj5_/view?usp=sharing), [VS](https://drive.google.com/file/d/1eB_v4Oe_FttGGcsrorOaOiNblHhBCVDV/view?usp=sharing) |
+| **BLIP2-T5** <br />(GPT Eval) | 15.16 |  20.52  | 45.49 | 43.49  | 48.09 | [VD](https://drive.google.com/file/d/1lpo2Lg0a2ruwuLoahtvhoZdnnprXKxUz/view?usp=sharing), [VS](https://drive.google.com/file/d/1aTt6TSXSbA-k6Mvv5kTvaBcgFvlKm1Ox/view?usp=sharing) |
+| **Qwen-VL** <br />(GPT Eval) | 5.93 |  6.65  | 31.43 | 24.88 | 39.15 | [VD](https://drive.google.com/file/d/1-gTi91aU5sI3vc4Yer6qT06dnXMDPHdP/view?usp=sharing), [VS](https://drive.google.com/file/d/1kSXoN_nNcRyaTL-7JhEqrrr5XTVLjOCb/view?usp=sharing) |
+| **Open-Flamingo** <br />(GPT Eval) | 6.37 |  11.27 | 39.56 | 27.21 | 38.44 | [VD](https://drive.google.com/file/d/1OsvE47tguXyaFdL2UEQdg14bSF7jV-by/view?usp=sharing), [VS](https://drive.google.com/file/d/1Klwy9hga_9V3S83Q5U-ofeLlWnYPwpMq/view?usp=sharing) |
+| **MiniGPT5** <br />(GPT Eval) |10.55 | 9.83 | 36.04| 28.37 | 40.30 | [VD](https://drive.google.com/file/d/1g6rCyUekv5nsZHDbzvZBg3VhwtUZPQk0/view?usp=sharing), [VS](https://drive.google.com/file/d/1fB4v00-FJ1ENR-paoAzajuGtRyy0ZivJ/view?usp=sharing) |
+| **MiniGPT4** <br />(GPT Eval) |8.79 | 10.12 | 31.87| 27.67 | 35.78 | [VD](https://drive.google.com/file/d/148AwFVXLCaw2HCbvaZ85hVjC_otjOWlB/view?usp=sharing), [VS](https://drive.google.com/file/d/1KQXiGSEQjm0JSSIKtQP21279jOhatJuq/view?usp=sharing) |
+| **InstructBLIP** <br />(GPT Eval) | 9.45 |  10.11  | 35.60 | 45.12 | 45.26 | [VD](https://drive.google.com/file/d/13jPzKKKvffxwZoegA0aBAOvmt0AXf2hU/view?usp=sharing), [VS](https://drive.google.com/file/d/1HUe0fay54owtOIJPuolF08mbT0uF0NYG/view?usp=sharing) |
+| **BLIP2** <br />(GPT Eval) |5.05 | 12.43 | 33.85| 40.70 | 40.48 | [VD](https://drive.google.com/file/d/175L9UwR7Cjet3MSOfxgn3pJeVSr3a2zc/view?usp=sharing), [VS](https://drive.google.com/file/d/104E27ReTdDcaKbDKS6LZJnH0PdMjfwcA/view?usp=sharing) |
+| **mPLUG_Owl-v2** <br />(GPT Eval) |13.85 | 19.94 | 44.84| 39.07 | 47.30 | [VD](https://drive.google.com/file/d/1pTq8lVnCnUbMjYf7SOXBF_iYXADyuniH/view?usp=sharing), [VS](https://drive.google.com/file/d/1ouaSAlcfm1kYOxa2MHrLHPLrGocYqtRJ/view?usp=sharing) |
+| **mPLUG_Owl-v1** <br />(GPT Eval) |9.45 | 10.40 | 39.34| 29.77 | 43.93 | [VD](https://drive.google.com/file/d/1zl-Omccz2gN295OJb-WJmWlKfx0YLPNo/view?usp=sharing), [VS](https://drive.google.com/file/d/1nNG0KLifWdzwE0qe7UnIlRDaosgPkBQF/view?usp=sharing) |
+| **LRV_Instruction** <br />(GPT Eval) |8.79 | 13.01 | 39.78| 27.44 | 42.78 | [VD](https://drive.google.com/file/d/1q0AUUrua-jVTSbL4oL2knCleopEmrGrQ/view?usp=sharing), [VS](https://drive.google.com/file/d/1cb7QILBox-78I9txzR70uWHmB38I13FA/view?usp=sharing) |
+| **ViLT** <br />(GPT Eval) | 8.3516 | 11.2717 | 37.8022 | 45.3488  | 44.4641 | [VD](https://drive.google.com/file/d/1xpCi1utJ3tD97y0CD496bEaMDA3p2qxY/view?usp=sharing), [VS](https://drive.google.com/file/d/1ZZK2lvpGdoY_ntWpF23m-m4LIo-Jgxav/view?usp=sharing) |
+| **GiT** <br />(GPT Eval) |5.27 | 6.36 | 26.81| 31.86 | 34.37 | [VD](https://drive.google.com/file/d/14zkly-UCIjaMdRaYKyHQHgtJVM8enp9i/view?usp=sharing), [VS](https://drive.google.com/file/d/1aH6KCsiXKMMsh9pElnQrR1scIfOA3zJp/view?usp=sharing) |
 
 
 

diff --git a/gpt4v_benchmark.py b/gpt4v_benchmark.py
@@ -96,6 +96,8 @@ def generate_answer(data, model_output_entry):
     data_vs = assign_correctness(data_vs, correctness_entry=model_correctness_entry_human)
     data = data_vd + data_vs
 
+    # data = [i for i in data if i["subcategory"] == "illusion"]
+
     all_data = get_eval_all(data, model_correctness_entry_human)
     all_vd = get_eval_all(data_vd, model_correctness_entry_human)
     all_vs = get_eval_all(data_vs, model_correctness_entry_human)
@@ -163,6 +165,8 @@ def generate_answer(data, model_output_entry):
     data_vs = assign_correctness(data_vs, correctness_entry=model_correctness_entry)
     data = data_vd + data_vs
 
+    # data = [i for i in data if i["subcategory"] == "illusion"]
+
     all_data = get_eval_all(data, model_correctness_entry)
     all_vd = get_eval_all(data_vd, model_correctness_entry)
     all_vs = get_eval_all(data_vs, model_correctness_entry)

diff --git a/random_guess.py b/random_guess.py
@@ -0,0 +1,164 @@
+
+
+import csv
+import json
+from tqdm import tqdm
+import numpy as np
+from prettytable import PrettyTable
+import os
+import time
+from utils import *
+import random
+import openai
+
+
+### to evaluate your method, implement and run generate_answer function!
+
+root_dir = "."
+###
+input_file_name = "HallusionBench.json"
+save_json_path_vd = "./hallusion_output_vd_random_guess2.json"
+save_json_path_vs = "./hallusion_output_vs_random_guess2.json"
+### 
+# load_json = False
+load_json = True
+model_output_entry = "model_prediction"
+model_correctness_entry = "gpt4v_output_gpt_check"
+
+
+def generate_answer(data, model_output_entry):
+    """Random-guess baseline: set each record's model_output_entry to "Yes" or "No" by coin flip (in place) and return data."""
+    for i in data:
+        i[model_output_entry] = "Yes" if random.random() > 0.5 else "No"
+
+    ## TODO
+    ## implement this section with your model!
+    ## your_function(img_filename, question) -> "0" (No), "1" (Yes), "2" (Uncertain)  -- NOTE(review): the baseline above writes "Yes"/"No" instead; confirm which form the evaluator expects
+    # for r in data:
+        # r[model_output_entry] = your_function(r["filename"], r["question"])
+
+    return data
+
+
+if __name__ == "__main__":
+    # Script entry: load the benchmark JSON, attach predictions, then split the records by their 'category' field.
+    data_vd = []
+    data_vs = []
+    with open(input_file_name) as json_file:
+        datas = json.load(json_file)
+
+    datas = generate_answer(datas, model_output_entry)
+
+    for data in tqdm(datas):
+        if data['category'] == 'VD':
+            data_vd.append(data)
+        if data['category'] == 'VS':
+            data_vs.append(data)
+    # Score the VD split with the utils helpers; load_json and the VD save path are passed straight through to them.
+    data_vd = evaluate_by_chatgpt(data_vd, model_output_entry, model_correctness_entry, load_json=load_json, save_json_path=save_json_path_vd)
+    data_vd = check_same_by_chatgpt(data_vd, model_output_entry, load_json=load_json, save_json_path=save_json_path_vd)
+    #time.sleep(60) #
+    try:
+        data_vs = evaluate_by_chatgpt(data_vs, model_output_entry, model_correctness_entry, load_json=load_json, save_json_path=save_json_path_vs)
+        data_vs = check_same_by_chatgpt(data_vs, model_output_entry, load_json=load_json, save_json_path=save_json_path_vs)
+    except Exception:  # was a bare except, which also trapped Ctrl-C / SystemExit and forced the retry
+        time.sleep(60)  # pause for a minute, then retry the VS split exactly once (a second failure propagates)
+        data_vs = evaluate_by_chatgpt(data_vs, model_output_entry, model_correctness_entry, load_json=load_json, save_json_path=save_json_path_vs)
+        data_vs = check_same_by_chatgpt(data_vs, model_output_entry, load_json=load_json, save_json_path=save_json_path_vs)
+    print("##### GPT Evaluate #####")
+
+    data_vd = assign_correctness(data_vd, correctness_entry=model_correctness_entry)
+    data_vs = assign_correctness(data_vs, correctness_entry=model_correctness_entry)
+    data = data_vd + data_vs
+
+    all_data = get_eval_all(data, model_correctness_entry)
+    all_vd = get_eval_all(data_vd, model_correctness_entry)
+    all_vs = get_eval_all(data_vs, model_correctness_entry)
+    # Per-question accuracy in percent for each split; tab1 is built but never printed in this script.
+    table1 = [["per question", "Total"], 
+              ["VD", round(100 * all_vd["correct"]/all_vd["total"], 4)], 
+              ["VS", round(100 * all_vs["correct"]/all_vs["total"], 4)], 
+              ["Overall", round(100 * all_data["correct"]/all_data["total"], 4)]]
+    tab1 = PrettyTable(table1[0])
+    tab1.add_rows(table1[1:])
+
+    # Capture the per-question accuracy now: all_data / all_vd / all_vs are rebound to pair-level results just below.
+    q_acc_gpt = round(100 * all_data["correct"]/all_data["total"], 4)
+
+    all_data = get_eval_pair_all(data, model_correctness_entry)
+    easy = get_eval_pair_easy(data)
+    hard = get_eval_pair_hard(data)
+    all_vd = get_eval_pair_all(data_vd, model_correctness_entry)
+    easy_vd = get_eval_pair_easy(data_vd)
+    hard_vd = get_eval_pair_hard(data_vd)
+    all_vs = get_eval_pair_all(data_vs, model_correctness_entry)
+    easy_vs = get_eval_pair_easy(data_vs)
+    hard_vs = get_eval_pair_hard(data_vs)
+    # question pair level (tab3 is built, but its print below is commented out)
+    table3 = [["per question pair", "Easy", "Hard", "Total"], 
+              ["VD", round(100 * easy_vd["correct"]/easy_vd["total"], 4), round(100 * hard_vd["correct"]/hard_vd["total"], 4), round(100 * all_vd["correct"]/all_vd["total"], 4)], 
+              ["VS", round(100 * easy_vs["correct"]/easy_vs["total"], 4), round(100 * hard_vs["correct"]/hard_vs["total"], 4), round(100 * all_vs["correct"]/all_vs["total"], 4)], 
+              ["Overall", round(100 * easy["correct"]/easy["total"], 4), round(100 * hard["correct"]/hard["total"], 4), round(100 * all_data["correct"]/all_data["total"], 4)]]
+    tab3 = PrettyTable(table3[0])
+    tab3.add_rows(table3[1:])
+    #print(tab3)
+
+
+    fig_all = get_eval_fig(data)
+    fig_vd = get_eval_fig(data_vd)
+    fig_vs = get_eval_fig(data_vs)
+
+    # image level: the "Wrong" column adds the inconsistent and wrong percentages; tab2 is built but never printed
+    table2 = [["per figure", "Correct", "Wrong", "Score"], 
+              ["VD", round(100 * fig_vd["correct"]/fig_vd["total"], 4), round(100 * fig_vd["inconsistent"]/fig_vd["total"], 4) + round(100 * fig_vd["wrong"]/fig_vd["total"], 4), round(fig_vd["score"], 4)], 
+              ["VS", round(100 * fig_vs["correct"]/fig_vs["total"], 4), round(100 * fig_vs["inconsistent"]/fig_vs["total"], 4) + round(100 * fig_vs["wrong"]/fig_vs["total"], 4), round(fig_vs["score"], 4)], 
+              ["Overall", round(100 * fig_all["correct"]/fig_all["total"], 4), round(100 * fig_all["inconsistent"]/fig_all["total"], 4) + round(100 * fig_all["wrong"]/fig_all["total"], 4), round(fig_all["score"], 4)]]
+    tab2 = PrettyTable(table2[0])
+    tab2.add_rows(table2[1:])
+    # all_data holds the pair-level result at this point, so pair_acc_gpt is the question-pair accuracy.
+    pair_acc_gpt = round(100 * all_data["correct"]/all_data["total"], 4)
+    figure_acc_gpt = round(100 * fig_all["correct"]/fig_all["total"], 4)
+    easy_acc_gpt = round(100 * easy["correct"]/easy["total"], 4)
+    hard_acc_gpt = round(100 * hard["correct"]/hard["total"], 4)
+
+
+
+    print("##### Question Stats #####")
+    print("Easy Questions: " + str(easy_vd["total_q"]) + "(Visual Dependent) + " + str(easy_vs["total_q"]) + "(Visual Supplement)")
+    print("Hard Questions: " + str(hard_vd["total_q"]) + "(Visual Dependent) + " + str(hard_vs["total_q"]) + "(Visual Supplement)")
+    print("Total Questions: " + str(all_data["total_q"]))
+
+
+    print("##### Figure Stats #####")
+    print("Visual Dependent Figures: " + str(fig_vd["total"]))
+    print("Visual Supplement Figures: " + str(fig_vs["total"]))
+    print("Total Figures: " + str(fig_all["total"]))
+
+    print("##### Leaderboard Stats #####")
+
+    table = [["", "Acc per question pair (qAcc)", "Acc per figure (fAcc)", "Acc per easy question (easy aAcc)", "Acc per hard question (hard aAcc)", "Acc per question (aAcc)"], 
+              ["GPT Eval", pair_acc_gpt, figure_acc_gpt, easy_acc_gpt, hard_acc_gpt, q_acc_gpt]]
+    leaderboard = PrettyTable(table[0])
+    leaderboard.add_rows(table[1:])
+    print(leaderboard)
+
+
+    stats = yes_ratio_stats(data)
+
+    table = [["", "Yes/No Bias (Pct Diff)", "Yes/No Bias (FP Ratio)", "Consistency Test (correct)", "Consistency Test (inconsistent)", "Consistency Test (wrong)", "LH", "VI", "Mixed"], 
+              ["GPT Eval", stats["diff"], stats["fp"], round(100 * fig_all["correct"]/fig_all["total"], 4), round(100 * fig_all["inconsistent"]/fig_all["total"], 4), round(100 * fig_all["wrong"]/fig_all["total"], 4), round(100 * all_data["LH_cg"]/(all_data["LH_cg"] + all_data["VI_cg"] + all_data["Mix_cg"]), 4), round(100 * all_data["VI_cg"]/(all_data["LH_cg"] + all_data["VI_cg"] + all_data["Mix_cg"]), 4), round(100 * all_data["Mix_cg"]/(all_data["LH_cg"] + all_data["VI_cg"] + all_data["Mix_cg"]), 4)]]
+    test = PrettyTable(table[0])
+    test.add_rows(table[1:])
+    print(test)
+    # Count distinct category/subcategory/set_id/figure_id keys for visual_input == 1 vs == 2 (names suggest original vs edited figures -- TODO confirm).
+    orig = [i for i in data if int(i["visual_input"]) == 1]
+
+    edit = [i for i in data if int(i["visual_input"]) == 2]
+
+    a = np.unique([i["category"] + "_" + i["subcategory"] + "_" + i["set_id"] + "_" + i["figure_id"] for i in orig])
+    b = np.unique([i["category"] + "_" + i["subcategory"] + "_" + i["set_id"] + "_" + i["figure_id"] for i in edit])
+    print(len(a))
+    print(len(b))
+
+
+