From 908c2629ac21e8c57217536d9c2fdcc2950a35c1 Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Sun, 14 Jan 2024 16:28:08 +0800 Subject: [PATCH 1/2] support AI2D --- README.md | 10 ++-- run.py | 4 +- scripts/AI2D_preproc.ipynb | 95 +++++++++++++++++++++++++++++++++ scripts/report_missing.py | 6 ++- vlmeval/utils/dataset_config.py | 9 ++-- 5 files changed, 114 insertions(+), 10 deletions(-) create mode 100644 scripts/AI2D_preproc.ipynb diff --git a/README.md b/README.md index 17a7fc0b3..a94822f48 100644 --- a/README.md +++ b/README.md @@ -14,12 +14,12 @@ ## 🆕 News +- **[2024-01-14]** We have supported [**AI2D**](https://allenai.org/data/diagrams) and provided the [**script**](/scripts/AI2D_preproc.ipynb) for data pre-processing. 🔥🔥🔥 - **[2024-01-13]** We have supported [**EMU2 / EMU2-Chat**](https://github.com/baaivision/Emu) and [**DocVQA**](https://www.docvqa.org). 🔥🔥🔥 - **[2024-01-11]** We have supported [**Monkey**](https://github.com/Yuliang-Liu/Monkey). 🔥🔥🔥 - **[2024-01-09]** The performance numbers on our official multi-modal leaderboards can be downloaded in json files: [**MMBench Leaderboard**](http://opencompass.openxlab.space/utils/MMBench.json), [**OpenCompass Multi-Modal Leaderboard**](http://opencompass.openxlab.space/utils/MMLB.json). We also add a [notebook](scripts/visualize.ipynb) to visualize these results.🔥🔥🔥 - **[2024-01-03]** We support **ScienceQA (Img)** (Dataset Name: ScienceQA_[VAL/TEST], [**eval results**](results/ScienceQA.md)), **HallusionBench** (Dataset Name: HallusionBench, [**eval results**](/results/HallusionBench.md)), and **MathVista** (Dataset Name: MathVista_MINI, [**eval results**](/results/MathVista.md)). 🔥🔥🔥 - **[2023-12-31]** We release the [**preliminary results**](/results/VQA.md) of three VQA datasets (**OCRVQA**, **TextVQA**, **ChatVQA**). The results are obtained by exact matching and may not faithfully reflect the real performance of VLMs on the corresponding task. -- **[2023-12-29]** We release the evaluation results of [**COCO Caption**](results/Caption.md). 🔥🔥🔥 ## 📊 Datasets, Models, and Evaluation Results @@ -46,6 +46,7 @@ | [**TextVQA**](https://textvqa.org) | TextVQA_VAL | ✅ | ✅ | [**VQA**](/results/VQA.md) | | [**ChartQA**](https://github.com/vis-nlp/ChartQA) | ChartQA_VALTEST_HUMAN | ✅ | ✅ | [**VQA**](/results/VQA.md) | | [**DocVQA**](https://www.docvqa.org) | DocVQA_VAL | ✅ | ✅ | | +| [**AI2D**](https://allenai.org/data/diagrams) | AI2D | ✅ | ✅ | | | [**Core-MM**](https://github.com/core-mm/core-mm) | CORE_MM | ✅ | | | **Supported API Models** @@ -109,7 +110,6 @@ We use `run.py` for evaluation. To use the script, you can use `$VLMEvalKit/run. **Arguments** - `--data (list[str])`: Set the dataset names that are supported in VLMEvalKit (defined in `vlmeval/utils/data_util.py`). - - including: `MME, SEEDBench_IMG, MMBench_DEV_EN, MMBench_TEST_EN, MMBench_DEV_CN, MMBench_TEST_CN, CCBench, Core_MM, MMVet` - `--model (list[str])`: Set the VLM names that are supported in VLMEvalKit (defined in `supported_VLM` in `vlmeval/config.py`). - `--mode (str, default to 'all', choices are ['all', 'infer'])`: When `mode` set to "all", will perform both inference and evaluation; when set to "infer", will only perform the inference. - `--nproc (int, default to 4)`: The number of threads for OpenAI API calling. @@ -141,7 +141,11 @@ The evaluation results will be printed as logs, besides. 
**Result Files** will a ## 🛠️ Custom Benchmark or VLM -To implement a custom benchmark or VLM in VLMEvalKit, please refer to [Custom_Benchmark_and_Model](/Custom_Benchmark_and_Model.md). +To implement a custom benchmark or VLM in **VLMEvalKit**, please refer to [Custom_Benchmark_and_Model](/Custom_Benchmark_and_Model.md). + +Example PRs to follow: +- [**New Model**] Support Monkey ([#45](https://github.com/open-compass/VLMEvalKit/pull/45/files)) +- [**New Benchmark**] Support AI2D () ## 🎯 The Goal of VLMEvalKit diff --git a/run.py b/run.py index f4acdc819..8de401e86 100644 --- a/run.py +++ b/run.py @@ -60,7 +60,7 @@ def main(): if rank == 0: time.sleep(3) res = None - if listinstr(['SEEDBench_IMG', 'MMBench', 'CCBench', 'ScienceQA'], dataset_name): + if listinstr(['SEEDBench_IMG', 'MMBench', 'CCBench', 'ScienceQA', 'AI2D'], dataset_name): res = prefetch_acc(result_file) else: logger.warning(f'{dataset_name} is not handled by prefetch score calculator') @@ -70,7 +70,7 @@ def main(): dump(res, result_file.replace('.xlsx', '_prefetch.xlsx')) if rank == 0 and args.mode == 'all': - if listinstr(['MMBench', 'CCBench', 'SEEDBench_IMG', 'MMMU', 'ScienceQA'], dataset_name): + if listinstr(['MMBench', 'CCBench', 'SEEDBench_IMG', 'MMMU', 'ScienceQA', 'AI2D'], dataset_name): multiple_choice_eval(result_file, dataset=dataset_name, model='chatgpt-0613', nproc=args.nproc, verbose=args.verbose) elif listinstr(['MME', 'Hallusion'], dataset_name): YOrN_eval(result_file, model='chatgpt-0613', nproc=args.nproc, verbose=args.verbose, dataset=dataset_name) diff --git a/scripts/AI2D_preproc.ipynb b/scripts/AI2D_preproc.ipynb new file mode 100644 index 000000000..4f0ef779b --- /dev/null +++ b/scripts/AI2D_preproc.ipynb @@ -0,0 +1,95 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from collections import defaultdict\n", + "from vlmeval.smp import ls, load, dump, download_file, encode_image_file_to_base64, md5\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "download_file('https://ai2-public-datasets.s3.amazonaws.com/diagrams/ai2d-all.zip')\n", + "os.system('unzip -o ai2d-all.zip')\n", + "\n", + "images = ls('ai2d/images/')\n", + "questions = ls('ai2d/questions/')\n", + "cates = load('ai2d/categories.json')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_all = defaultdict(list)\n", + "for qfile in questions:\n", + " data = load(qfile)\n", + " image_pth = qfile.replace('questions', 'images').replace('.json', '')\n", + " cate = cates[image_pth.split('/')[-1]]\n", + " for q, qmeta in data['questions'].items():\n", + " assert '.png-' in qmeta['questionId']\n", + " main, sub = qmeta['questionId'].split('.png-')\n", + " idx = int(main) * 100 + int(sub)\n", + " data_all['index'].append(idx)\n", + " data_all['question'].append(q)\n", + " data_all['image_path'].append(image_pth)\n", + " answers = qmeta['answerTexts']\n", + " correct = qmeta['correctAnswer']\n", + " assert len(answers) == 4\n", + " for c, a in zip('ABCD', answers):\n", + " data_all[c].append(a)\n", + " data_all['answer'].append('ABCD'[qmeta['correctAnswer']])\n", + " data_all['category'].append(cate)\n", + "data_all = pd.DataFrame(data_all)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = data_all\n", + "print(data.keys())" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "images = []\n", + "image_seen = {}\n", + "for idx, pth in zip(data['index'], data['image_path']):\n", + " if pth in image_seen:\n", + " images.append(image_seen[pth])\n", + " else:\n", + " image_seen[pth] = idx\n", + " images.append(encode_image_file_to_base64(pth))\n", + "\n", + "data['image'] = images\n", + "dump(data, 'AI2D.tsv')\n", + "print(md5('AI2D.tsv'))" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/scripts/report_missing.py b/scripts/report_missing.py index b159a348a..cfe327431 100644 --- a/scripts/report_missing.py +++ b/scripts/report_missing.py @@ -6,12 +6,14 @@ dataset = [ 'MME', 'SEEDBench_IMG', 'MMBench', 'CCBench', 'MMBench_CN', 'MMVet', 'OCRVQA_TESTCORE', 'TextVQA_VAL', 'COCO_VAL', 'MMMU_DEV_VAL', - 'ChartQA_VALTEST_HUMAN', 'ScienceQA_VAL', 'ScienceQA_TEST', 'MathVista_MINI', 'HallusionBench' + 'ChartQA_VALTEST_HUMAN', 'ScienceQA_VAL', 'ScienceQA_TEST', 'MathVista_MINI', 'HallusionBench', + 'AI2D' ] suffix = [ 'score.csv', 'acc.csv', 'acc.csv', 'acc.csv', 'acc.csv', 'gpt-4-turbo_score.csv', 'acc.csv', 'acc.csv', 'score.json', 'acc.csv', - 'acc.csv', 'acc.csv', 'acc.csv', 'gpt-4-turbo_score.csv', 'score.csv' + 'acc.csv', 'acc.csv', 'acc.csv', 'gpt-4-turbo_score.csv', 'score.csv', + 'acc.csv' ] N = len(dataset) diff --git a/vlmeval/utils/dataset_config.py b/vlmeval/utils/dataset_config.py index bcd6f14c9..dabfc0b8d 100644 --- a/vlmeval/utils/dataset_config.py +++ b/vlmeval/utils/dataset_config.py @@ -23,6 +23,7 @@ 'ScienceQA_TEST': "https://opencompass.openxlab.space/utils/VLMEval/ScienceQA_TEST.tsv", 'HallusionBench': "https://opencompass.openxlab.space/utils/VLMEval/HallusionBench.tsv", "DocVQA_VAL": "https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv", + 'AI2D': "https://opencompass.openxlab.space/utils/VLMEval/AI2D.tsv" } dataset_md5_dict = { @@ -47,7 +48,8 @@ 'ScienceQA_VAL': '96320d05e142e585e7204e72affd29f3', 'ScienceQA_TEST': 'e42e9e00f9c59a80d8a5db35bc32b71f', 'HallusionBench': '0c23ac0dc9ef46832d7a24504f2a0c7c', - "DocVQA_VAL": '3744f5df4aaf2781c85fe7677ae0a411' + "DocVQA_VAL": '3744f5df4aaf2781c85fe7677ae0a411', + "AI2D": "53db8397adbe73e9cc0b4861227004d4" } img_root_map = { @@ -72,13 +74,14 @@ 'ScienceQA_VAL': 'ScienceQA_VAL', 'ScienceQA_TEST': 'ScienceQA_TEST', 'HallusionBench': 'Hallusion', - 'DocVQA_VAL': 'DocVQA' + 'DocVQA_VAL': 'DocVQA', + 'AI2D': 'AI2D' } assert set(dataset_URLs) == set(img_root_map) == set(dataset_md5_dict) def DATASET_TYPE(dataset): - if listinstr(['mmbench', 'seedbench', 'ccbench', 'mmmu', 'scienceqa'], dataset.lower()): + if listinstr(['mmbench', 'seedbench', 'ccbench', 'mmmu', 'scienceqa', 'ai2d'], dataset.lower()): return 'multi-choice' elif 'MME' in dataset: return 'Y/N' From 566aea9b2707377b39a78ffecc4aed391c247f4c Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Sun, 14 Jan 2024 16:29:21 +0800 Subject: [PATCH 2/2] update --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a94822f48..5e003a114 100644 --- a/README.md +++ b/README.md @@ -144,8 +144,9 @@ The evaluation results will be printed as logs, besides. **Result Files** will a To implement a custom benchmark or VLM in **VLMEvalKit**, please refer to [Custom_Benchmark_and_Model](/Custom_Benchmark_and_Model.md). 
Example PRs to follow:
+
- [**New Model**] Support Monkey ([#45](https://github.com/open-compass/VLMEvalKit/pull/45/files))
-- [**New Benchmark**] Support AI2D ()
+- [**New Benchmark**] Support AI2D ([#51](https://github.com/open-compass/VLMEvalKit/pull/51/files))

## 🎯 The Goal of VLMEvalKit
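
With the patch above applied, AI2D is handled like the other multiple-choice benchmarks: the TSV is downloaded from the URL registered in `vlmeval/utils/dataset_config.py`, `prefetch_acc` reports a prefetched accuracy, and `multiple_choice_eval` (using `chatgpt-0613` as the answer matcher) produces the `_acc.csv` file that `scripts/report_missing.py` now expects for AI2D. A minimal evaluation sketch (here `qwen_chat` is only a placeholder for any model key registered in `supported_VLM` in `vlmeval/config.py`):

```bash
# Hedged example: evaluate one registered VLM on the newly supported AI2D split.
# --mode all runs inference plus the multiple-choice evaluation; use --mode infer
# to stop after inference. --nproc sets the number of threads for OpenAI API calls.
python run.py --data AI2D --model qwen_chat --mode all --nproc 4 --verbose
```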