From 908c2629ac21e8c57217536d9c2fdcc2950a35c1 Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Sun, 14 Jan 2024 16:28:08 +0800 Subject: [PATCH 1/2] support AI2D --- README.md | 10 ++-- run.py | 4 +- scripts/AI2D_preproc.ipynb | 95 +++++++++++++++++++++++++++++++++ scripts/report_missing.py | 6 ++- vlmeval/utils/dataset_config.py | 9 ++-- 5 files changed, 114 insertions(+), 10 deletions(-) create mode 100644 scripts/AI2D_preproc.ipynb diff --git a/README.md b/README.md index 17a7fc0b3..a94822f48 100644 --- a/README.md +++ b/README.md @@ -14,12 +14,12 @@ ## 🆕 News +- **[2024-01-14]** We have supported [**AI2D**](https://allenai.org/data/diagrams) and provided the [**script**](/scripts/AI2D_preproc.ipynb) for data pre-processing. 🔥🔥🔥 - **[2024-01-13]** We have supported [**EMU2 / EMU2-Chat**](https://github.com/baaivision/Emu) and [**DocVQA**](https://www.docvqa.org). 🔥🔥🔥 - **[2024-01-11]** We have supported [**Monkey**](https://github.com/Yuliang-Liu/Monkey). 🔥🔥🔥 - **[2024-01-09]** The performance numbers on our official multi-modal leaderboards can be downloaded in json files: [**MMBench Leaderboard**](http://opencompass.openxlab.space/utils/MMBench.json), [**OpenCompass Multi-Modal Leaderboard**](http://opencompass.openxlab.space/utils/MMLB.json). We also add a [notebook](scripts/visualize.ipynb) to visualize these results.🔥🔥🔥 - **[2024-01-03]** We support **ScienceQA (Img)** (Dataset Name: ScienceQA_[VAL/TEST], [**eval results**](results/ScienceQA.md)), **HallusionBench** (Dataset Name: HallusionBench, [**eval results**](/results/HallusionBench.md)), and **MathVista** (Dataset Name: MathVista_MINI, [**eval results**](/results/MathVista.md)). 🔥🔥🔥 - **[2023-12-31]** We release the [**preliminary results**](/results/VQA.md) of three VQA datasets (**OCRVQA**, **TextVQA**, **ChatVQA**). The results are obtained by exact matching and may not faithfully reflect the real performance of VLMs on the corresponding task. -- **[2023-12-29]** We release the evaluation results of [**COCO Caption**](results/Caption.md). 🔥🔥🔥 ## 📊 Datasets, Models, and Evaluation Results @@ -46,6 +46,7 @@ | [**TextVQA**](https://textvqa.org) | TextVQA_VAL | ✅ | ✅ | [**VQA**](/results/VQA.md) | | [**ChartQA**](https://github.com/vis-nlp/ChartQA) | ChartQA_VALTEST_HUMAN | ✅ | ✅ | [**VQA**](/results/VQA.md) | | [**DocVQA**](https://www.docvqa.org) | DocVQA_VAL | ✅ | ✅ | | +| [**AI2D**](https://allenai.org/data/diagrams) | AI2D | ✅ | ✅ | | | [**Core-MM**](https://github.com/core-mm/core-mm) | CORE_MM | ✅ | | | **Supported API Models** @@ -109,7 +110,6 @@ We use `run.py` for evaluation. To use the script, you can use `$VLMEvalKit/run. **Arguments** - `--data (list[str])`: Set the dataset names that are supported in VLMEvalKit (defined in `vlmeval/utils/data_util.py`). - - including: `MME, SEEDBench_IMG, MMBench_DEV_EN, MMBench_TEST_EN, MMBench_DEV_CN, MMBench_TEST_CN, CCBench, Core_MM, MMVet` - `--model (list[str])`: Set the VLM names that are supported in VLMEvalKit (defined in `supported_VLM` in `vlmeval/config.py`). - `--mode (str, default to 'all', choices are ['all', 'infer'])`: When `mode` set to "all", will perform both inference and evaluation; when set to "infer", will only perform the inference. - `--nproc (int, default to 4)`: The number of threads for OpenAI API calling. @@ -141,7 +141,11 @@ The evaluation results will be printed as logs, besides. 
**Result Files** will a ## 🛠️ Custom Benchmark or VLM -To implement a custom benchmark or VLM in VLMEvalKit, please refer to [Custom_Benchmark_and_Model](/Custom_Benchmark_and_Model.md). +To implement a custom benchmark or VLM in **VLMEvalKit**, please refer to [Custom_Benchmark_and_Model](/Custom_Benchmark_and_Model.md). + +Example PRs to follow: +- [**New Model**] Support Monkey ([#45](https://github.com/open-compass/VLMEvalKit/pull/45/files)) +- [**New Benchmark**] Support AI2D () ## 🎯 The Goal of VLMEvalKit diff --git a/run.py b/run.py index f4acdc819..8de401e86 100644 --- a/run.py +++ b/run.py @@ -60,7 +60,7 @@ def main(): if rank == 0: time.sleep(3) res = None - if listinstr(['SEEDBench_IMG', 'MMBench', 'CCBench', 'ScienceQA'], dataset_name): + if listinstr(['SEEDBench_IMG', 'MMBench', 'CCBench', 'ScienceQA', 'AI2D'], dataset_name): res = prefetch_acc(result_file) else: logger.warning(f'{dataset_name} is not handled by prefetch score calculator') @@ -70,7 +70,7 @@ def main(): dump(res, result_file.replace('.xlsx', '_prefetch.xlsx')) if rank == 0 and args.mode == 'all': - if listinstr(['MMBench', 'CCBench', 'SEEDBench_IMG', 'MMMU', 'ScienceQA'], dataset_name): + if listinstr(['MMBench', 'CCBench', 'SEEDBench_IMG', 'MMMU', 'ScienceQA', 'AI2D'], dataset_name): multiple_choice_eval(result_file, dataset=dataset_name, model='chatgpt-0613', nproc=args.nproc, verbose=args.verbose) elif listinstr(['MME', 'Hallusion'], dataset_name): YOrN_eval(result_file, model='chatgpt-0613', nproc=args.nproc, verbose=args.verbose, dataset=dataset_name) diff --git a/scripts/AI2D_preproc.ipynb b/scripts/AI2D_preproc.ipynb new file mode 100644 index 000000000..4f0ef779b --- /dev/null +++ b/scripts/AI2D_preproc.ipynb @@ -0,0 +1,95 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from collections import defaultdict\n", + "from vlmeval.smp import ls, load, dump, download_file, encode_image_file_to_base64, md5\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "download_file('https://ai2-public-datasets.s3.amazonaws.com/diagrams/ai2d-all.zip')\n", + "os.system('unzip -o ai2d-all.zip')\n", + "\n", + "images = ls('ai2d/images/')\n", + "questions = ls('ai2d/questions/')\n", + "cates = load('ai2d/categories.json')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_all = defaultdict(list)\n", + "for qfile in questions:\n", + " data = load(qfile)\n", + " image_pth = qfile.replace('questions', 'images').replace('.json', '')\n", + " cate = cates[image_pth.split('/')[-1]]\n", + " for q, qmeta in data['questions'].items():\n", + " assert '.png-' in qmeta['questionId']\n", + " main, sub = qmeta['questionId'].split('.png-')\n", + " idx = int(main) * 100 + int(sub)\n", + " data_all['index'].append(idx)\n", + " data_all['question'].append(q)\n", + " data_all['image_path'].append(image_pth)\n", + " answers = qmeta['answerTexts']\n", + " correct = qmeta['correctAnswer']\n", + " assert len(answers) == 4\n", + " for c, a in zip('ABCD', answers):\n", + " data_all[c].append(a)\n", + " data_all['answer'].append('ABCD'[qmeta['correctAnswer']])\n", + " data_all['category'].append(cate)\n", + "data_all = pd.DataFrame(data_all)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = data_all\n", + "print(data.keys())" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "images = []\n", + "image_seen = {}\n", + "for idx, pth in zip(data['index'], data['image_path']):\n", + " if pth in image_seen:\n", + " images.append(image_seen[pth])\n", + " else:\n", + " image_seen[pth] = idx\n", + " images.append(encode_image_file_to_base64(pth))\n", + "\n", + "data['image'] = images\n", + "dump(data, 'AI2D.tsv')\n", + "print(md5('AI2D.tsv'))" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/scripts/report_missing.py b/scripts/report_missing.py index b159a348a..cfe327431 100644 --- a/scripts/report_missing.py +++ b/scripts/report_missing.py @@ -6,12 +6,14 @@ dataset = [ 'MME', 'SEEDBench_IMG', 'MMBench', 'CCBench', 'MMBench_CN', 'MMVet', 'OCRVQA_TESTCORE', 'TextVQA_VAL', 'COCO_VAL', 'MMMU_DEV_VAL', - 'ChartQA_VALTEST_HUMAN', 'ScienceQA_VAL', 'ScienceQA_TEST', 'MathVista_MINI', 'HallusionBench' + 'ChartQA_VALTEST_HUMAN', 'ScienceQA_VAL', 'ScienceQA_TEST', 'MathVista_MINI', 'HallusionBench', + 'AI2D' ] suffix = [ 'score.csv', 'acc.csv', 'acc.csv', 'acc.csv', 'acc.csv', 'gpt-4-turbo_score.csv', 'acc.csv', 'acc.csv', 'score.json', 'acc.csv', - 'acc.csv', 'acc.csv', 'acc.csv', 'gpt-4-turbo_score.csv', 'score.csv' + 'acc.csv', 'acc.csv', 'acc.csv', 'gpt-4-turbo_score.csv', 'score.csv', + 'acc.csv' ] N = len(dataset) diff --git a/vlmeval/utils/dataset_config.py b/vlmeval/utils/dataset_config.py index bcd6f14c9..dabfc0b8d 100644 --- a/vlmeval/utils/dataset_config.py +++ b/vlmeval/utils/dataset_config.py @@ -23,6 +23,7 @@ 'ScienceQA_TEST': "https://opencompass.openxlab.space/utils/VLMEval/ScienceQA_TEST.tsv", 'HallusionBench': "https://opencompass.openxlab.space/utils/VLMEval/HallusionBench.tsv", "DocVQA_VAL": "https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv", + 'AI2D': "https://opencompass.openxlab.space/utils/VLMEval/AI2D.tsv" } dataset_md5_dict = { @@ -47,7 +48,8 @@ 'ScienceQA_VAL': '96320d05e142e585e7204e72affd29f3', 'ScienceQA_TEST': 'e42e9e00f9c59a80d8a5db35bc32b71f', 'HallusionBench': '0c23ac0dc9ef46832d7a24504f2a0c7c', - "DocVQA_VAL": '3744f5df4aaf2781c85fe7677ae0a411' + "DocVQA_VAL": '3744f5df4aaf2781c85fe7677ae0a411', + "AI2D": "53db8397adbe73e9cc0b4861227004d4" } img_root_map = { @@ -72,13 +74,14 @@ 'ScienceQA_VAL': 'ScienceQA_VAL', 'ScienceQA_TEST': 'ScienceQA_TEST', 'HallusionBench': 'Hallusion', - 'DocVQA_VAL': 'DocVQA' + 'DocVQA_VAL': 'DocVQA', + 'AI2D': 'AI2D' } assert set(dataset_URLs) == set(img_root_map) == set(dataset_md5_dict) def DATASET_TYPE(dataset): - if listinstr(['mmbench', 'seedbench', 'ccbench', 'mmmu', 'scienceqa'], dataset.lower()): + if listinstr(['mmbench', 'seedbench', 'ccbench', 'mmmu', 'scienceqa', 'ai2d'], dataset.lower()): return 'multi-choice' elif 'MME' in dataset: return 'Y/N' From 566aea9b2707377b39a78ffecc4aed391c247f4c Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Sun, 14 Jan 2024 16:29:21 +0800 Subject: [PATCH 2/2] update --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a94822f48..5e003a114 100644 --- a/README.md +++ b/README.md @@ -144,8 +144,9 @@ The evaluation results will be printed as logs, besides. **Result Files** will a To implement a custom benchmark or VLM in **VLMEvalKit**, please refer to [Custom_Benchmark_and_Model](/Custom_Benchmark_and_Model.md). 
Example PRs to follow:
+
- [**New Model**] Support Monkey ([#45](https://github.com/open-compass/VLMEvalKit/pull/45/files))
-- [**New Benchmark**] Support AI2D ()
+- [**New Benchmark**] Support AI2D ([#51](https://github.com/open-compass/VLMEvalKit/pull/51/files))

## 🎯 The Goal of VLMEvalKit
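
With the patch above applied, AI2D is handled like the other multiple-choice benchmarks: the TSV is downloaded from the URL registered in `vlmeval/utils/dataset_config.py`, `prefetch_acc` reports a prefetched accuracy, and `multiple_choice_eval` (using `chatgpt-0613` as the answer matcher) produces the `_acc.csv` file that `scripts/report_missing.py` now expects for AI2D. A minimal evaluation sketch (here `qwen_chat` is only a placeholder for any model key registered in `supported_VLM` in `vlmeval/config.py`):

```bash
# Hedged example: evaluate one registered VLM on the newly supported AI2D split.
# --mode all runs inference plus the multiple-choice evaluation; use --mode infer
# to stop after inference. --nproc sets the number of threads for OpenAI API calls.
python run.py --data AI2D --model qwen_chat --mode all --nproc 4 --verbose
```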