From f1d0ce43fa62300cc07ac7c335d5926cd7b8acd9 Mon Sep 17 00:00:00 2001
From: Haodong Duan
Date: Thu, 28 Dec 2023 14:35:03 +0800
Subject: [PATCH] [Result] Update XTuner Performance (#31)

* update report_missing

* update results

* update

---
 results/MME.md            | 52 ++++++++++++++++++++-------------------
 results/MMMU.md           | 51 +++++++++++++++++++-------------------
 results/MMVet.md          | 50 ++++++++++++++++++++-----------------
 results/SEEDBench_IMG.md  | 49 +++++++++++++++++++-----------------
 scripts/report_missing.py |  8 +++---
 5 files changed, 110 insertions(+), 100 deletions(-)

diff --git a/results/MME.md b/results/MME.md
index 247ff0faa..84e52864d 100644
--- a/results/MME.md
+++ b/results/MME.md
@@ -8,30 +8,32 @@ In each cell, we list `vanilla score / ChatGPT Answer Extraction Score` if the t
 
 VLMs are sorted by the descending order of Total score.
 
-| Model | Total | perception | reasoning |
-| :------------------- | ----------: | ----------: | --------: |
-| Full | 2800 | 2000 | 800 |
-| GeminiProVision | 2131 / 2149 | 1601 / 1609 | 530 / 540 |
-| XComposer | 1874 | 1497 | 377 |
-| qwen_chat | 1849 / 1860 | 1457 / 1468 | 392 |
-| sharegpt4v_7b | 1799 / 1808 | 1491 | 308 / 318 |
-| llava_v1.5_13b | 1800 / 1805 | 1485 / 1490 | 315 |
-| mPLUG-Owl2 | 1781 / 1786 | 1435 / 1436 | 346 / 350 |
-| llava_v1.5_7b | 1775 | 1490 | 285 |
-| GPT-4v (detail: low) | 1737 / 1771 | 1300 / 1334 | 437 |
-| TransCore_M | 1682 / 1701 | 1427 / 1429 | 254 / 272 |
-| instructblip_13b | 1624 / 1646 | 1381 / 1383 | 243 / 263 |
-| idefics_80b_instruct | 1507 / 1519 | 1276 / 1285 | 231 / 234 |
-| instructblip_7b | 1313 / 1391 | 1084 / 1137 | 229 / 254 |
-| idefics_9b_instruct | 1177 | 942 | 235 |
-| PandaGPT_13B | 1072 | 826 | 246 |
-| MiniGPT-4-v1-13B | 648 / 1067 | 533 / 794 | 115 / 273 |
-| MiniGPT-4-v1-7B | 806 / 1048 | 622 / 771 | 184 / 277 |
-| llava_v1_7b | 1027 / 1044 | 793 / 807 | 234 / 238 |
-| MiniGPT-4-v2 | 968 | 708 | 260 |
-| VisualGLM_6b | 738 | 628 | 110 |
-| flamingov2 | 607 | 535 | 72 |
-| qwen_base | 6 / 483 | 0 / 334 | 6 / 149 |
+| Model | Total | Perception | Reasoning |
+|:------------------------------|:------------|:-------------|:------------|
+| GeminiProVision | 2131 / 2149 | 1601 / 1609 | 530 / 540 |
+| InternLM-XComposer-VL | 1874 | 1497 | 377 |
+| Qwen-VL-Chat | 1849 / 1860 | 1457 / 1468 | 392 |
+| ShareGPT4V-7B | 1799 / 1808 | 1491 | 308 / 317 |
+| LLaVA-v1.5-13B | 1800 / 1805 | 1485 / 1490 | 315 |
+| mPLUG-Owl2 | 1781 / 1786 | 1435 / 1436 | 346 / 350 |
+| LLaVA-v1.5-7B | 1775 | 1490 | 285 |
+| GPT-4v (detail: low) | 1737 / 1771 | 1300 / 1334 | 437 |
+| LLaVA-v1.5-13B (LoRA, XTuner) | 1766 | 1475 | 291 |
+| LLaVA-v1.5-7B (LoRA, XTuner) | 1716 | 1434 | 282 |
+| TransCore-M | 1681 / 1701 | 1427 / 1429 | 254 / 272 |
+| InstructBLIP-13B | 1624 / 1646 | 1381 / 1383 | 243 / 263 |
+| LLaVA-InternLM-7B (LoRA) | 1637 | 1393 | 244 |
+| IDEFICS-80B-Instruct | 1507 / 1519 | 1276 / 1285 | 231 / 234 |
+| InstructBLIP-7B | 1313 / 1391 | 1084 / 1137 | 229 / 254 |
+| IDEFICS-9B-Instruct | 1177 | 942 | 235 |
+| PandaGPT-13B | 1072 | 826 | 246 |
+| MiniGPT-4-v1-13B | 648 / 1067 | 533 / 794 | 115 / 273 |
+| MiniGPT-4-v1-7B | 806 / 1048 | 622 / 771 | 184 / 277 |
+| LLaVA-v1-7B | 1027 / 1044 | 793 / 807 | 234 / 237 |
+| MiniGPT-4-v2 | 968 | 708 | 260 |
+| VisualGLM | 738 | 628 | 110 |
+| OpenFlamingo v2 | 607 | 535 | 72 |
+| Qwen-VL | 6 / 483 | 0 / 334 | 6 / 149 |
 
 ### Comments
 
@@ -39,7 +41,7 @@ For most VLMs, using ChatGPT as the answer extractor or not may not significantl
 
 | MME Score Improvement with ChatGPT Answer <br>Extractor | Models |
 | --------------------------------------------------- | ------------------------------------------------------------ |
-| **No (0)** | XComposer, llava_v1.5_7b, idefics_9b_instruct, PandaGPT_13B, MiniGPT-4-v2, <br>VisualGLM_6b, flamingov2 |
+| **No (0)** | XComposer, llava_v1.5_7b, idefics_9b_instruct, PandaGPT_13B, MiniGPT-4-v2, <br>VisualGLM_6b, flamingov2, LLaVA-XTuner Series |
 | **Minor (1~20)** | qwen_chat (11), llava_v1.5_13b (5), mPLUG-Owl2 (5), idefics_80b_instruct (12), llava_v1_7b (17), <br>sharegpt4v_7b (9), TransCore_M (19), GeminiProVision (18) |
 | **Moderate (21~100)** | instructblip_13b (22), instructblip_7b (78), GPT-4v (34) |
 | **Huge (> 100)** | MiniGPT-4-v1-7B (242), MiniGPT-4-v1-13B (419), qwen_base (477) |
\ No newline at end of file
diff --git a/results/MMMU.md b/results/MMMU.md
index bebfee53b..5c4ec124f 100644
--- a/results/MMMU.md
+++ b/results/MMMU.md
@@ -11,28 +11,29 @@
 
 ### MMMU Scores
 
-| Model | Overall<br>(Val) | Art & Design<br>(Val) | Business<br>(Val) | Science<br>(Val) | Health & Medicine<br>(Val) | Humanities & Social Science<br>(Val) | Tech & Engineering<br>(Val) | Overall<br>(Dev) |
-|:---------------------|-------------------:|------------------------:|--------------------:|-------------------:|-----------------------------:|---------------------------------------:|------------------------------:|-------------------:|
-| GPT-4v | 53.8 | 66.7 | 60 | 46 | 54.7 | 71.7 | 36.7 | 52.7 |
-| GeminiProVision | 48.4 | 59.2 | 36 | 42 | 52 | 66.7 | 42.9 | 54 |
-| qwen_chat | 37.6 | 49.2 | 36 | 28 | 32.7 | 55.8 | 31.9 | 30 |
-| llava_v1.5_13b | 36.8 | 49.2 | 23.3 | 36 | 34 | 51.7 | 33.3 | 42 |
-| sharegpt4v_7b | 36.7 | 50 | 27.3 | 26.7 | 37.3 | 50 | 34.8 | 30 |
-| TransCore_M | 36.6 | 54.2 | 32 | 27.3 | 32 | 49.2 | 32.4 | 38.7 |
-| llava_v1.5_7b | 36.1 | 45.8 | 25.3 | 34 | 32 | 48.3 | 35.7 | 38.7 |
-| XComposer | 35.7 | 45.8 | 28.7 | 22.7 | 30.7 | 53.3 | 37.6 | 36.7 |
-| mPLUG-Owl2 | 34.6 | 47.5 | 26 | 21.3 | 37.3 | 50 | 31.9 | 40.7 |
-| instructblip_13b | 32.9 | 37.5 | 29.3 | 32 | 28.7 | 37.5 | 33.8 | 30 |
-| PandaGPT_13B | 32.7 | 42.5 | 35.3 | 30 | 29.3 | 45.8 | 21.9 | 26.7 |
-| llava_v1_7b | 32.1 | 31.7 | 24.7 | 31.3 | 32 | 37.5 | 35.2 | 33.3 |
-| instructblip_7b | 30.4 | 38.3 | 28 | 22 | 30.7 | 39.2 | 28.6 | 24 |
-| VisualGLM_6b | 28.9 | 30 | 24 | 28 | 28 | 40.8 | 26.2 | 28.7 |
-| qwen_base | 28.8 | 43.3 | 18.7 | 25.3 | 32.7 | 42.5 | 19.5 | 29.3 |
-| flamingov2 | 28.2 | 27.5 | 30 | 28.7 | 28 | 33.3 | 24.3 | 21.3 |
-| Frequent Choice | 26.8 | | | | | | | |
-| MiniGPT-4-v1-13B | 26.2 | 33.3 | 19.3 | 28.7 | 26 | 34.2 | 21 | 23.3 |
-| idefics_80b_instruct | 25.1 | 39.2 | 17.3 | 23.3 | 24 | 48.3 | 11.4 | 23.3 |
-| MiniGPT-4-v2 | 24.6 | 27.5 | 22.7 | 21.3 | 28 | 33.3 | 19 | 32 |
-| MiniGPT-4-v1-7B | 23 | 32.5 | 27.3 | 18.7 | 17.3 | 15 | 26.2 | 19.3 |
-| Random Choice | 22.1 | | | | | | | |
-| idefics_9b_instruct | 19.6 | 22.5 | 11.3 | 20.7 | 23.3 | 31.7 | 13.3 | 20 |
\ No newline at end of file
+| Model | Overall<br>(Val) | Art & Design<br>(Val) | Business<br>(Val) | Science<br>(Val) | Health & Medicine<br>(Val) | Humanities & Social Science<br>(Val) | Tech & Engineering<br>(Val) | Overall<br>(Dev) |
+|:------------------------------|-------------------:|------------------------:|--------------------:|-------------------:|-----------------------------:|---------------------------------------:|------------------------------:|-------------------:|
+| GPT-4v (detail: low) | 53.8 | 66.7 | 60 | 46 | 54.7 | 71.7 | 36.7 | 52.7 |
+| GeminiProVision | 48.4 | 59.2 | 36 | 42 | 52 | 66.7 | 42.9 | 54 |
+| Qwen-VL-Chat | 37.6 | 49.2 | 36 | 28 | 32.7 | 55.8 | 31.9 | 30 |
+| LLaVA-InternLM-7B (LoRA) | 37 | 44.2 | 32 | 29.3 | 38.7 | 47.5 | 34.8 | 43.3 |
+| LLaVA-v1.5-13B | 36.8 | 49.2 | 23.3 | 36 | 34 | 51.7 | 33.3 | 42 |
+| ShareGPT4V-7B | 36.7 | 50 | 27.3 | 26.7 | 37.3 | 50 | 34.8 | 30 |
+| TransCore-M | 36.6 | 54.2 | 32 | 27.3 | 32 | 49.2 | 32.4 | 38.7 |
+| LLaVA-v1.5-7B | 36.1 | 45.8 | 25.3 | 34 | 32 | 48.3 | 35.7 | 38.7 |
+| InternLM-XComposer-VL | 35.7 | 45.8 | 28.7 | 22.7 | 30.7 | 53.3 | 37.6 | 36.7 |
+| LLaVA-v1.5-13B (LoRA, XTuner) | 35.1 | 40.8 | 30.7 | 26.7 | 35.3 | 45 | 35.2 | 43.3 |
+| mPLUG-Owl2 | 34.6 | 47.5 | 26 | 21.3 | 37.3 | 50 | 31.9 | 40.7 |
+| LLaVA-v1.5-7B (LoRA, XTuner) | 33.7 | 48.3 | 23.3 | 30 | 32.7 | 46.7 | 28.6 | 37.3 |
+| InstructBLIP-13B | 32.9 | 37.5 | 29.3 | 32 | 28.7 | 37.5 | 33.8 | 30 |
+| PandaGPT-13B | 32.7 | 42.5 | 35.3 | 30 | 29.3 | 45.8 | 21.9 | 26.7 |
+| LLaVA-v1-7B | 32.1 | 31.7 | 24.7 | 31.3 | 32 | 37.5 | 35.2 | 33.3 |
+| InstructBLIP-7B | 30.4 | 38.3 | 28 | 22 | 30.7 | 39.2 | 28.6 | 24 |
+| VisualGLM | 28.9 | 30 | 24 | 28 | 28 | 40.8 | 26.2 | 28.7 |
+| Qwen-VL | 28.8 | 43.3 | 18.7 | 25.3 | 32.7 | 42.5 | 19.5 | 29.3 |
+| OpenFlamingo v2 | 28.2 | 27.5 | 30 | 28.7 | 28 | 33.3 | 24.3 | 21.3 |
+| MiniGPT-4-v1-13B | 26.2 | 33.3 | 19.3 | 28.7 | 26 | 34.2 | 21 | 23.3 |
+| IDEFICS-80B-Instruct | 25.1 | 39.2 | 17.3 | 23.3 | 24 | 48.3 | 11.4 | 23.3 |
+| MiniGPT-4-v2 | 24.6 | 27.5 | 22.7 | 21.3 | 28 | 33.3 | 19 | 32 |
+| MiniGPT-4-v1-7B | 23 | 32.5 | 27.3 | 18.7 | 17.3 | 15 | 26.2 | 19.3 |
+| IDEFICS-9B-Instruct | 19.6 | 22.5 | 11.3 | 20.7 | 23.3 | 31.7 | 13.3 | 20 |
\ No newline at end of file
diff --git a/results/MMVet.md b/results/MMVet.md
index c191cef3b..3027952ff 100644
--- a/results/MMVet.md
+++ b/results/MMVet.md
@@ -5,26 +5,30 @@
 
 ### MMVet Scores
 
-| Model | Overall | [Overall (Official Leaderboard)](https://paperswithcode.com/sota/visual-question-answering-on-mm-vet) | ocr | math | spat | rec | know | gen |
-| :------------------------ | ------: | -----------------------------------------------------------: | ---: | ---: | ---: | ---: | ---: | ---: |
-| GeminiProVision | 59.2 | 64.3±0.4 | 63.6 | 41.5 | 61.2 | 59.8 | 51.0 | 48.0 |
-| GPT-4v (detail: low) | 56.8 | 60.2±0.3 | 59.4 | 61.2 | 52.5 | 59.7 | 48.0 | 46.5 |
-| qwen_chat | 47.3 | N/A | 37.2 | 22.3 | 42.8 | 52.5 | 45.4 | 40.3 |
-| idefics_80b_instruct | 39.7 | N/A | 29.9 | 15 | 30.7 | 45.6 | 38.6 | 37.1 |
-| llava_v1.5_13b | 38.3 | 36.3±0.2 | 28.8 | 11.5 | 31.5 | 42 | 23.1 | 23 |
-| mPLUG-Owl2 | 35.7 | 36.3±0.1 | 29.5 | 7.7 | 32.1 | 47.3 | 23.8 | 20.9 |
-| XComposer | 35.2 | N/A | 21.8 | 3.8 | 24.7 | 43.1 | 28.9 | 27.5 |
-| sharegpt4v_7b | 34.7 | 37.6 | 30.2 | 18.5 | 30 | 36.1 | 20.2 | 18.1 |
-| TransCore_M | 33.9 | N/A | 27.3 | 15.4 | 32.7 | 36.7 | 23 | 23.5 |
-| instructblip_7b | 33.1 | 26.2±0.2 | 25.5 | 11.5 | 23.5 | 39.3 | 24.3 | 23.6 |
-| llava_v1.5_7b | 32.7 | 31.1±0.2 | 25 | 7.7 | 26.3 | 36.9 | 22 | 21.5 |
-| instructblip_13b | 30.1 | 25.6±0.3 | 25.4 | 11.2 | 26.9 | 33.4 | 19 | 18.2 |
-| idefics_9b_instruct | 30 | N/A | 21.7 | 11.5 | 22.4 | 34.6 | 27.4 | 26.9 |
-| llava_v1_7b (vicuna-v1.1) | 27.4 | 23.8±0.6 | 19 | 11.5 | 25.6 | 31.4 | 18.1 | 16.2 |
-| flamingov2 | 23.3 | 24.8±0.2 | 19.5 | 7.7 | 21.7 | 24.7 | 21.7 | 19 |
-| PandaGPT_13B | 19.6 | N/A | 6.8 | 6.5 | 16.5 | 26.3 | 13.7 | 13.9 |
-| MiniGPT-4-v1-13B | 16.9 | 24.4±0.4 | 10.3 | 7.7 | 12.5 | 19.9 | 14.9 | 13.8 |
-| MiniGPT-4-v1-7B | 15.6 | 22.1±0.1 | 9.2 | 3.8 | 10.1 | 19.4 | 13.3 | 12.5 |
-| VisualGLM_6b | 14.8 | N/A | 8.5 | 6.5 | 9.1 | 18 | 8.1 | 7.1 |
-| qwen_base | 13 | N/A | 7.4 | 0 | 3.9 | 16.5 | 18.6 | 18.1 |
-| MiniGPT-4-v2 | 10.5 | N/A | 7.1 | 7.3 | 9.6 | 12.2 | 9.2 | 8 |
\ No newline at end of file
+
+| Model | ocr | math | spat | rec | know | gen | Overall | [**Overall (Official)**](https://paperswithcode.com/sota/visual-question-answering-on-mm-vet) |
+| :---------------------------- | ---: | ---: | ---: | ---: | ---: | ---: | ------: | -----------------------------------------------------------: |
+| GeminiProVision | 63.6 | 41.5 | 61.2 | 59.8 | 51 | 48 | 59.2 | 64.3±0.4 |
+| GPT-4v (detail: low) | 59.4 | 61.2 | 52.5 | 59.7 | 48 | 46.5 | 56.8 | 60.2±0.3 |
+| Qwen-VL-Chat | 37.2 | 22.3 | 42.8 | 52.5 | 45.4 | 40.3 | 47.3 | N/A |
+| IDEFICS-80B-Instruct | 29.9 | 15 | 30.7 | 45.6 | 38.6 | 37.1 | 39.7 | N/A |
+| LLaVA-v1.5-13B | 28.8 | 11.5 | 31.5 | 42 | 23.1 | 23 | 38.3 | 36.3±0.2 |
+| LLaVA-v1.5-13B (LoRA, XTuner) | 31.3 | 15 | 28 | 46.3 | 25.6 | 27.3 | 35.9 | N/A |
+| mPLUG-Owl2 | 29.5 | 7.7 | 32.1 | 47.3 | 23.8 | 20.9 | 35.7 | 36.3±0.1 |
+| InternLM-XComposer-VL | 21.8 | 3.8 | 24.7 | 43.1 | 28.9 | 27.5 | 35.2 | N/A |
+| ShareGPT4V-7B | 30.2 | 18.5 | 30 | 36.1 | 20.2 | 18.1 | 34.7 | 37.6 |
+| TransCore-M | 27.3 | 15.4 | 32.7 | 36.7 | 23 | 23.5 | 33.9 | N/A |
+| InstructBLIP-7B | 25.5 | 11.5 | 23.5 | 39.3 | 24.3 | 23.6 | 33.1 | 26.2±0.2 |
+| LLaVA-v1.5-7B | 25 | 7.7 | 26.3 | 36.9 | 22 | 21.5 | 32.7 | 31.1±0.2 |
+| LLaVA-InternLM-7B (LoRA) | 29.2 | 7.7 | 27.5 | 41.1 | 21.7 | 18.5 | 32.4 | N/A |
+| LLaVA-v1.5-7B (LoRA, XTuner) | 28.2 | 11.5 | 26.8 | 41.1 | 21.7 | 17 | 32.2 | N/A |
+| InstructBLIP-13B | 25.4 | 11.2 | 26.9 | 33.4 | 19 | 18.2 | 30.1 | 25.6±0.3 |
+| IDEFICS-9B-Instruct | 21.7 | 11.5 | 22.4 | 34.6 | 27.4 | 26.9 | 30 | N/A |
+| LLaVA-v1-7B | 19 | 11.5 | 25.6 | 31.4 | 18.1 | 16.2 | 27.4 | 23.8±0.6 |
+| OpenFlamingo v2 | 19.5 | 7.7 | 21.7 | 24.7 | 21.7 | 19 | 23.3 | 24.8±0.2 |
+| PandaGPT-13B | 6.8 | 6.5 | 16.5 | 26.3 | 13.7 | 13.9 | 19.6 | N/A |
+| MiniGPT-4-v1-13B | 10.3 | 7.7 | 12.5 | 19.9 | 14.9 | 13.8 | 16.9 | 24.4±0.4 |
+| MiniGPT-4-v1-7B | 9.2 | 3.8 | 10.1 | 19.4 | 13.3 | 12.5 | 15.6 | 22.1±0.1 |
+| VisualGLM | 8.5 | 6.5 | 9.1 | 18 | 8.1 | 7.1 | 14.8 | N/A |
+| Qwen-VL | 7.4 | 0 | 3.9 | 16.5 | 18.6 | 18.1 | 13 | N/A |
+| MiniGPT-4-v2 | 7.1 | 7.3 | 9.6 | 12.2 | 9.2 | 8 | 10.5 | N/A |
diff --git a/results/SEEDBench_IMG.md b/results/SEEDBench_IMG.md
index 41accc127..69799de59 100644
--- a/results/SEEDBench_IMG.md
+++ b/results/SEEDBench_IMG.md
@@ -10,29 +10,32 @@
 - **LLMMatchAcc:** The overall accuracy across all questions with **ChatGPT answer matching**.
 - **OfficialAcc**: SEEDBench_IMG acc on the official leaderboard (if applicable).
 
-| Model | ExactMatchRate | MatchedAcc | ExactMatchAcc | LLMMatchAcc | [Official Leaderboard (Eval Method)](https://huggingface.co/spaces/AILab-CVC/SEED-Bench_Leaderboard) |
-| :------------------- | -------------: | ---------: | ------------: | ----------: | -----------------------------------------------------------: |
-| GPT-4v (detail: low) | 95.58 | 73.92 | 70.65 | 71.59 | 69.1 (Gen) |
-| GeminiProVision | 99.38 | 71.09 | 70.65 | 70.74 | NA |
-| sharegpt4v_7b | 100 | 69.25 | 69.25 | 69.25 | 69.7 (Gen) |
-| llava_v1.5_13b | 100 | 68.11 | 68.11 | 68.11 | 68.2 (Gen) |
-| TransCore_M | 100 | 66.77 | 66.77 | 66.77 | N/A |
-| XComposer | 100 | 66.07 | 66.07 | 66.07 | 66.9 (PPL) |
-| llava_v1.5_7b | 100 | 65.59 | 65.59 | 65.59 | N/A |
-| qwen_chat | 96.21 | 66.61 | 64.08 | 64.83 | 65.4 (PPL) |
-| mPLUG-Owl2 | 100 | 64.52 | 64.52 | 64.52 | 64.1 (Not Given) |
-| qwen_base | 99.28 | 52.69 | 52.31 | 52.53 | 62.3 (PPL) |
-| idefics_80b_instruct | 99.84 | 51.96 | 51.88 | 51.96 | 53.2 (Not Given) |
-| llava_v1_7b | 82.51 | 50.18 | 41.41 | 49.48 | N/A |
-| PandaGPT_13B | 82.02 | 48.41 | 39.71 | 47.63 | N/A |
-| instructblip_13b | 99.07 | 47.5 | 47.06 | 47.26 | N/A |
-| VisualGLM_6b | 74.15 | 47.66 | 35.34 | 47.02 | N/A |
-| idefics_9b_instruct | 99.52 | 44.97 | 44.75 | 45 | 44.5 (Not Given) |
-| instructblip_7b | 88.35 | 49.63 | 43.84 | 44.51 | 58.8 (PPL) |
-| MiniGPT-4-v1-13B | 67.71 | 39.37 | 26.66 | 34.91 | N/A |
-| MiniGPT-4-v1-7B | 69.25 | 33.62 | 23.29 | 31.56 | 47.4 (PPL) |
-| MiniGPT-4-v2 | 81.4 | 31.81 | 25.89 | 29.38 | N/A |
-| flamingov2 | 99.84 | 28.83 | 28.79 | 28.84 | 42.7 (PPL) |
+| Model | ExactMatchRate | MatchedAcc | ExactMatchAcc | LLMMatchAcc | [**Official Leaderboard (Eval Method)**](https://huggingface.co/spaces/AILab-CVC/SEED-Bench_Leaderboard) |
+| :---------------------------- | -------------: | ---------: | ------------: | ----------: | ------------------------------------------------------------ |
+| GPT-4v (detail: low) | 95.58 | 73.92 | 70.65 | 71.59 | 69.1 (Gen) |
+| GeminiProVision | 99.38 | 71.09 | 70.65 | 70.74 | N/A |
+| ShareGPT4V-7B | 100 | 69.25 | 69.25 | 69.25 | 69.7 (Gen) |
+| LLaVA-v1.5-13B | 100 | 68.11 | 68.11 | 68.11 | 68.2 (Gen) |
+| LLaVA-v1.5-13B (LoRA, XTuner) | 100 | 67.95 | 67.95 | 67.95 | N/A |
+| TransCore-M | 100 | 66.77 | 66.77 | 66.77 | N/A |
+| LLaVA-v1.5-7B (LoRA, XTuner) | 100 | 66.39 | 66.39 | 66.39 | N/A |
+| InternLM-XComposer-VL | 100 | 66.07 | 66.07 | 66.07 | 66.9 (PPL) |
+| LLaVA-InternLM-7B (LoRA) | 100 | 65.75 | 65.75 | 65.75 | N/A |
+| LLaVA-v1.5-7B | 100 | 65.59 | 65.59 | 65.59 | N/A |
+| Qwen-VL-Chat | 96.21 | 66.61 | 64.08 | 64.83 | 65.4 (PPL) |
+| mPLUG-Owl2 | 100 | 64.52 | 64.52 | 64.52 | 64.1 (Not Given) |
+| Qwen-VL | 99.28 | 52.69 | 52.31 | 52.53 | 62.3 (PPL) |
+| IDEFICS-80B-Instruct | 99.84 | 51.96 | 51.88 | 51.96 | 53.2 (Not Given) |
+| LLaVA-v1-7B | 82.51 | 50.18 | 41.41 | 49.48 | N/A |
+| PandaGPT-13B | 82.02 | 48.41 | 39.71 | 47.63 | N/A |
+| InstructBLIP-13B | 99.07 | 47.5 | 47.06 | 47.26 | N/A |
+| VisualGLM | 74.15 | 47.66 | 35.34 | 47.02 | N/A |
+| IDEFICS-9B-Instruct | 99.52 | 44.97 | 44.75 | 45 | 44.5 (Not Given) |
+| InstructBLIP-7B | 88.35 | 49.63 | 43.84 | 44.51 | 58.8 (PPL) |
+| MiniGPT-4-v1-13B | 67.71 | 39.37 | 26.66 | 34.91 | N/A |
+| MiniGPT-4-v1-7B | 69.25 | 33.62 | 23.29 | 31.56 | 47.4 (PPL) |
+| MiniGPT-4-v2 | 81.4 | 31.81 | 25.89 | 29.38 | N/A |
+| OpenFlamingo v2 | 99.84 | 28.83 | 28.79 | 28.84 | 42.7 (PPL) |
 
 ### Comments
 
diff --git a/scripts/report_missing.py b/scripts/report_missing.py
index 917e214d6..7f946884d 100644
--- a/scripts/report_missing.py
+++ b/scripts/report_missing.py
@@ -5,12 +5,12 @@
 logger = get_logger('Report Missing')
 
 dataset = [
-    'MME', 'SEEDBench_IMG', 'MMBench', 'CCBench', 'MMBench_CN',
-    'MMVet', 'OCRVQA_TESTCORE', 'TextVQA_VAL'
+    'MME', 'SEEDBench_IMG', 'MMBench', 'CCBench', 'MMBench_CN',
+    'MMVet', 'OCRVQA_TESTCORE', 'TextVQA_VAL', 'COCO_VAL', 'MMMU_DEV_VAL'
 ]
 suffix = [
-    'score.csv', 'acc.csv', 'acc.csv', 'acc.csv', 'acc.csv',
-    'gpt-4-turbo_score.csv', 'acc.csv', 'acc.csv'
+    'score.csv', 'acc.csv', 'acc.csv', 'acc.csv', 'acc.csv',
+    'gpt-4-turbo_score.csv', 'acc.csv', 'acc.csv', 'score.json', 'acc.csv'
 ]
 N = len(dataset)
 
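For reference, below is a minimal, hypothetical sketch of how the parallel `dataset` / `suffix` lists updated above might be consumed to report missing results. The results directory layout, the `<model>_<dataset>_<suffix>` file-naming convention, the `report_missing` helper, and the model names used here are illustrative assumptions, not part of this patch.

```python
# Hypothetical sketch only; layout and model names are assumed, not from the patch.
import os.path as osp

dataset = [
    'MME', 'SEEDBench_IMG', 'MMBench', 'CCBench', 'MMBench_CN',
    'MMVet', 'OCRVQA_TESTCORE', 'TextVQA_VAL', 'COCO_VAL', 'MMMU_DEV_VAL'
]
suffix = [
    'score.csv', 'acc.csv', 'acc.csv', 'acc.csv', 'acc.csv',
    'gpt-4-turbo_score.csv', 'acc.csv', 'acc.csv', 'score.json', 'acc.csv'
]

def report_missing(root, models):
    """Collect (model, dataset, path) triples whose expected result file is absent."""
    missing = []
    for model in models:
        # The two lists are parallel: each dataset is paired with the suffix of
        # its final result file, so zip() walks them together.
        for ds, suf in zip(dataset, suffix):
            # Assumed convention: <root>/<model>/<model>_<dataset>_<suffix>
            path = osp.join(root, model, f'{model}_{ds}_{suf}')
            if not osp.exists(path):
                missing.append((model, ds, path))
    return missing

if __name__ == '__main__':
    for model, ds, path in report_missing('.', ['llava_v1.5_7b']):
        print(f'Missing {ds} result for {model}: expected {path}')
```

Under this assumed naming, the two new entries mean a COCO_VAL run counts as complete only once a `*_COCO_VAL_score.json` file exists, and an MMMU_DEV_VAL run only once `*_MMMU_DEV_VAL_acc.csv` exists.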