From 043fa6917553ab778fc856e17b01d63b88529c80 Mon Sep 17 00:00:00 2001 From: Pu Fanyi Date: Sat, 5 Oct 2024 20:07:20 +0800 Subject: [PATCH 1/6] Refactor code formatting in setup.py, .gitignore, __init__.py, and data_summary.ipynb --- tools/live_bench/create_dataset.py | 21 +- tools/live_bench/data_summary.ipynb | 2 +- tools/live_bench/example.ipynb | 962 +++++++-------- tools/live_bench/filter.ipynb | 658 +++++----- tools/live_bench/live_bench/__init__.py | 4 +- tools/live_bench/live_bench/api/live_bench.py | 40 +- .../live_bench/data_generator/__init__.py | 8 +- .../live_bench/data_generator/check_prompt.md | 50 +- .../data_generator/default_criteria.md | 32 +- .../example/example_output.json | 114 +- .../live_bench/data_generator/live_bench.py | 378 +++--- .../data_generator/live_bench_data.py | 278 ++--- .../live_bench/data_generator/prompt.md | 34 +- .../live_bench/data_generator/qa_generator.py | 1064 ++++++++--------- .../data_generator/question_finalizer.py | 276 ++--- .../live_bench/data_generator/response.py | 24 +- .../live_bench/data_generator/score_getter.py | 328 ++--- .../live_bench/data_generator/score_prompt.md | 40 +- .../live_bench/data_generator/utils/claude.py | 138 +-- .../utils/extract_infomation.py | 262 ++-- .../live_bench/data_generator/utils/gemini.py | 76 +- .../live_bench/data_generator/utils/gpt4v.py | 152 +-- tools/live_bench/live_bench/driver/.gitignore | 2 +- .../live_bench/live_bench/driver/__init__.py | 2 +- .../live_bench/driver/load_driver.py | 142 +-- .../live_bench/screen_shoter/__init__.py | 12 +- .../live_bench/screen_shoter/screen.py | 58 +- .../live_bench/screen_shoter/screen_shoter.py | 282 ++--- tools/live_bench/live_bench/view.ipynb | 862 ++++++------- .../live_bench/websites/__init__.py | 4 +- .../live_bench/websites/load_website.py | 70 +- .../live_bench/live_bench/websites/website.py | 127 +- .../live_bench/websites/website_list.yaml | 156 +-- tools/live_bench/pyproject.toml | 94 +- tools/live_bench/refine_all_results.py | 70 +- tools/live_bench/script/README.md | 30 +- tools/live_bench/script/modify.ipynb | 922 +++++++------- tools/live_bench/script/refractor.py | 18 + tools/live_bench/script/upload_results.py | 354 +++--- tools/live_bench/setup.py | 6 +- 40 files changed, 4088 insertions(+), 4064 deletions(-) create mode 100644 tools/live_bench/script/refractor.py diff --git a/tools/live_bench/create_dataset.py b/tools/live_bench/create_dataset.py index 227fb724..1da8edd2 100644 --- a/tools/live_bench/create_dataset.py +++ b/tools/live_bench/create_dataset.py @@ -1,11 +1,10 @@ -from live_bench import LiveBench -from live_bench.websites import load_websites, load_websites_from_file - -if __name__ == "__main__": - website = load_websites() - dataset = LiveBench() - dataset.capture(websites=website, driver_kwargs={"headless": True}, screen_shoter="single_screen", shoter_kwargs={"screen_size": (1024, 1024)}, qa_generator="gpt4v", scorer="claude", checker="gemini") - - website = load_websites_from_file("/data/pufanyi/project/lmms-eval/temp/images") - dataset.capture(websites=website, screen_shoter="human", qa_generator="gpt4v", scorer="claude", checker="gemini", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={}) - dataset.upload() +from live_bench import LiveBench +from live_bench.websites import load_websites, load_websites_from_file + +if __name__ == "__main__": + website = load_websites() + dataset = LiveBench(name="2024-9") + + website = load_websites_from_file("/data/pufanyi/project/lmms-eval/tools/temp/processed_images") + 
dataset.capture(websites=website, screen_shoter="human", qa_generator="gpt4v", scorer="claude", checker="gemini", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={}) + dataset.upload() diff --git a/tools/live_bench/data_summary.ipynb b/tools/live_bench/data_summary.ipynb index 291aeff0..fb6d2ba9 100644 --- a/tools/live_bench/data_summary.ipynb +++ b/tools/live_bench/data_summary.ipynb @@ -325,7 +325,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/tools/live_bench/example.ipynb b/tools/live_bench/example.ipynb index 92b9df73..e5ad1a04 100644 --- a/tools/live_bench/example.ipynb +++ b/tools/live_bench/example.ipynb @@ -1,481 +1,481 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/data/pufanyi/anaconda3/anacondabin/envs/live_bench/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "from random import sample\n", - "\n", - "from live_bench.websites.website import DefaultWebsite\n", - "from live_bench.websites import load_websites\n", - "\n", - "# website = load_websites()\n", - "# website = sample(website, 1)\n", - "# website[0].url\n", - "website = [DefaultWebsite(url=\"https://www.asahi.com/\")] # , DefaultWebsite(url=\"https://www.bbc.com/sport\"), DefaultWebsite(url=\"https://www.bbc.com/business\")]" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "could not detect version_main.therefore, we are assuming it is chrome 108 or higher\n" - ] - } - ], - "source": [ - "from live_bench.data_generator.utils.extract_infomation import InfomationExtractor\n", - "from live_bench.screen_shoter import get_shoter\n", - "from live_bench.driver import load_driver\n", - "\n", - "shoter = get_shoter(\"single_screen\")\n", - "driver = load_driver()\n", - "w = shoter(driver, website[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "extractor = InfomationExtractor()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "response = extractor.extract_infomation(w)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "**Here is something you can take as reference.**\n", - "\n", - "## Text Extracted in the HTML\n", - "\n", - "Below is the text extracted from the website for you to take reference:\n", - "BBC Home - Breaking News, World News, US News, Sports, Business, Innovation, Climate, Culture, Travel, Video & Audio\n", - "\n", - "What son's conviction means for President Biden\n", - "The guilty verdict is unlikely to change voters' minds, but it will be a personal blow for the US president.\n", - "1 hr ago | US & Canada\n", - "\n", - "Hunter Biden found guilty on all counts in gun trial\n", - "The US president's son is found guilty of lying about his drug use when buying a handgun in 2018.\n", - "- The struggles and 
scandals of Hunter Biden\n", - "\n", - "Blinken says fate of ceasefire plan down to Hamas\n", - "The US diplomat says Israel's prime minister \"reaffirmed his commitment\" to a Gaza ceasefire plan.\n", - "2 hrs ago | Middle East\n", - "\n", - "Ukraine 'hits missile launch sites in Russia'\n", - "The mayor of the city of Kharkiv says the situation there is \"calmer\" as Russia has been shelling less.\n", - "1 hr ago | Europe\n", - "\n", - "Four US college instructors stabbed in public park in China\n", - "The instructors were on a daytime visit to a public park when they were attacked, Cornell College says.\n", - "4 hrs ago | Asia\n", - "\n", - "Animal-rights protesters attack portrait of King\n", - "Animal-rights protesters attack a portrait of King Charles III, in a London gallery.\n", - "2 hrs ago | UK\n", - "\n", - "Warning shots from South as NK soldiers cross border\n", - "The incident at the DMZ comes at a time of heightened tensions between the two Koreas.\n", - "11 hrs ago | Asia\n", - "\n", - "## Image Features\n", - "\n", - "From the screenshot of the news website you provided, here is the analysis based on the images displayed alongside the corresponding news headings and text:\n", - "\n", - "1. **Image Associated with Hunter Biden's Conviction**:\n", - " - **Description**: The image depicts Hunter Biden escorted by, possibly, security personnel or aides. He seems to be dressed in a formal dark suit and appears to be descending stairs or possibly exiting a vehicle. He carries an air of seriousness, likely reflective of the gravity of his legal situation.\n", - " - **Relevance**: This image effectively captures the serious, personal, and public nature of the judicial proceedings against the President's son, making the situation more relatable to the audience. It directly ties to the news confirming Hunter Biden’s guilty verdict in a gun trial related to lying about drug use.\n", - "\n", - "2. **Image Accompanying the Article on Biden's Supporters**:\n", - " - **Description**: The accompanying image shows a group of enthusiastic supporters holding signs, with one prominently reading \"Say Yes to Biden,\" suggesting a political rally or campaign event. The participants display expressions of support and enthusiasm.\n", - " - **Relevance**: This image provides a visual contrast to the first, highlighting the ongoing support for the Biden family or campaign despite the legal issues faced by Hunter Biden. It serves to illustrate the political backdrop and public opinion dynamic mentioned in the news headlines.\n", - "\n", - "These images serve different purposes:\n", - "- The first image personalizes the news story, putting a face to the name in a high-stakes legal controversy. It underlines the personal and public challenges faced by the Biden family due to the conviction.\n", - "- The second image contextualizes the broader political support for the Biden family, suggesting that despite personal legal woes, there is a segment of the populace fervently supporting them.\n", - "\n", - "The clear connection between the images and the corresponding textual content on the news site helps readers visualize and better understand the unfolding events, enhancing the impact of the news storytelling.\n", - "\n", - "## Interesting Points\n", - "\n", - "The BBC news website, as demonstrated through the detailed examination of its content, offers a dynamic and visually engaging approach to news presentation. Here’s a deeper analysis of how it distinguishes itself:\n", - "\n", - "1. 
**Comprehensive and Geographically Diverse News Coverage**:\n", - " - The content spans a wide range of geographical locations including the US, Middle East, Europe, Asia, and the UK. Each news piece targets a major recent event, reflecting the website’s commitment to global news coverage. This expansive geographic focus ensures that readers have access to a broad spectrum of significant, impactful news.\n", - "\n", - "2. **Varied Content Themes**: \n", - " - The news themes are diverse, covering political, social, and cultural issues. From the legal troubles of a high-profile political figure’s son in the US to a ceasefire plan in the Middle East and violent incidents in Asia, the website covers a wide array of topics. This variety meets different readers' interests and keeps the content engaging.\n", - "\n", - "3. **Immediate Relevance**:\n", - " - The website's content is timely, as indicated by timestamps such as “1 hr ago” and “2 hrs ago.” This reflects the website’s commitment to providing the latest news, which is crucial for maintaining reader engagement and trust in a digital age where current information is highly valued.\n", - "\n", - "4. **Stylistic and Engaging Visual Design**:\n", - " - The use of compelling images alongside the news articles plays a critical role in storytelling. For instance, the image of Hunter Biden descending steps with a serious demeanor visually reinforces the gravity of the news about his conviction. \n", - " - Meanwhile, the image of supporters holding \"Say Yes to Biden\" signs juxtaposed with Hunter Biden's legal news offers a visual narrative of continued support amidst political strife, underscoring the complexity and depth of public and personal life in politics.\n", - "\n", - "5. **Interactive and Multimedia Features**:\n", - " - The use of tags such as \"OLIVE\" beside the breaking story of Hunter Biden indicates an interactive or breaking news feature that likely offers real-time updates and extensive coverage. This kind of multimedia integration enhances user interaction and engagement.\n", - "\n", - "In summary, the BBC news website sets itself apart through a combination of up-to-date, visually engaging, and comprehensively covered news items that cater to a global audience with varied interests. The effective use of images not only contextualizes the stories but also adds a layer of emotional and visual impact, making the news relatable and striking.\n" - ] - } - ], - "source": [ - "print(str(response))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "This screenshot from a news website contains several images corresponding to different news stories. Let's examine each image and extract relevant details:\n", - "\n", - "1. **Image associated with the Michael Mosley news story:**\n", - " - The image depicts a middle-aged couple, smiling warmly at each other in a sunny, natural outdoor setting. This photo likely portrays Dr. Michael Mosley with his wife, representing a human interest angle to the story about Dr. Mosley's disappearance on the Greek island of Symi. The caption, \"'We will not lose hope,' says Michael Mosley's wife,\" implies a context of hope and determination amidst difficult circumstances.\n", - "\n", - "2. 
**Image linked to the news about the hostages freed in Gaza:**\n", - " - This image features a group of soldiers with one individual in civilian clothing at the center, being lifted or celebrated, possibly right after a rescue scenario. The setting appears to be a rugged outdoor area, suggestive of a conflict or military zone, which aligns with the news story about hostages being freed in Gaza. The inclusion of armed personnel and a jubilant expression on the civilian's face highlights the relief and successful outcome of a dangerous operation.\n", - "\n", - "3. **Image for the Nova festival hostages news:**\n", - " - This image depicts a motorboat on clear water under bright skies, possibly implying the geographic setting related to Michael Mosley’s disappearance near the Greek island of Symi. The serene environment contrasts starkly with the concerning news of his disappearance during what might have been a routine outing or travel.\n", - "\n", - "These images serve as visual supplements to the written content, providing readers with a clearer, more immediate understanding of the stories. They help bridge the emotional and contextual gaps that pure text might leave, allowing readers to engage more deeply with the news events. Each image is carefully selected to evoke specific sentiments and to provide visual context to the news headlines and summaries.\n" - ] - } - ], - "source": [ - "print(response[\"features\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from live_bench import LiveBench" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "dataset = LiveBench(force_clear=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'2024-06'" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dataset.name" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "could not detect version_main.therefore, we are assuming it is chrome 108 or higher\n", - "Capturing websites: 0%| | 0/1 [00:00 5)\n", - "filtered_data" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idimageswebsitequestionanswersubtaskdata_generatorcheckerdate_timescreen_shoterscreen_sizescorereasonscorer_name
00[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/'}Look at the image provided. Which article head...\"BBC tracks down smuggler behind Channel cross...Deeper Implicationsgpt4vgemini2024-06-27 14:36:42single_screen(1024, 1024)10The answer accurately identifies the relevant ...gpt4v
11[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/'}Look at the image. What significant global eve...The image of a young girl wearing a life jacke...Contextual Analysisgpt4vgemini2024-06-27 14:36:42single_screen(1024, 1024)10The answer correctly identifies Biden and Trum...gpt4v
22[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/'}What detail in the image connected to the \"Bol...The \"Interpol Bolivia\" background visible in t...Deeper Implicationsgpt4vgemini2024-06-27 14:36:42single_screen(1024, 1024)7Authenticity (4/5): The answer is reasonable b...gpt4v
33[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/'}Analyze the visual composition of the article ...The image of the young girl, potentially a mig...Contextual Analysisgpt4vgemini2024-06-27 14:36:42single_screen(1024, 1024)8The question directly relates to assessing the...gpt4v
44[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/'}Focusing on the article \"BBC tracks down smugg...The image of a child, juxtaposed with an artic...Broader Implicationsgpt4vgemini2024-06-27 14:36:42single_screen(1024, 1024)10The answer directly correlates with the story'...gpt4v
\n", - "
" - ], - "text/plain": [ - " id images \\\n", - "0 0 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", - "1 1 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", - "2 2 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", - "3 3 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", - "4 4 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", - "\n", - " website \\\n", - "0 {'url': 'https://www.bbc.com/'} \n", - "1 {'url': 'https://www.bbc.com/'} \n", - "2 {'url': 'https://www.bbc.com/'} \n", - "3 {'url': 'https://www.bbc.com/'} \n", - "4 {'url': 'https://www.bbc.com/'} \n", - "\n", - " question \\\n", - "0 Look at the image provided. Which article head... \n", - "1 Look at the image. What significant global eve... \n", - "2 What detail in the image connected to the \"Bol... \n", - "3 Analyze the visual composition of the article ... \n", - "4 Focusing on the article \"BBC tracks down smugg... \n", - "\n", - " answer subtask \\\n", - "0 \"BBC tracks down smuggler behind Channel cross... Deeper Implications \n", - "1 The image of a young girl wearing a life jacke... Contextual Analysis \n", - "2 The \"Interpol Bolivia\" background visible in t... Deeper Implications \n", - "3 The image of the young girl, potentially a mig... Contextual Analysis \n", - "4 The image of a child, juxtaposed with an artic... Broader Implications \n", - "\n", - " data_generator checker date_time screen_shoter screen_size \\\n", - "0 gpt4v gemini 2024-06-27 14:36:42 single_screen (1024, 1024) \n", - "1 gpt4v gemini 2024-06-27 14:36:42 single_screen (1024, 1024) \n", - "2 gpt4v gemini 2024-06-27 14:36:42 single_screen (1024, 1024) \n", - "3 gpt4v gemini 2024-06-27 14:36:42 single_screen (1024, 1024) \n", - "4 gpt4v gemini 2024-06-27 14:36:42 single_screen (1024, 1024) \n", - "\n", - " score reason scorer_name \n", - "0 10 The answer accurately identifies the relevant ... gpt4v \n", - "1 10 The answer correctly identifies Biden and Trum... gpt4v \n", - "2 7 Authenticity (4/5): The answer is reasonable b... gpt4v \n", - "3 8 The question directly relates to assessing the... gpt4v \n", - "4 10 The answer directly correlates with the story'... 
gpt4v " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "filtered_data.to_pandas().head()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Map: 100%|██████████| 409/409 [00:30<00:00, 13.63 examples/s]?it/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 34.92ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:38<00:00, 38.26s/it]\n" - ] - }, - { - "data": { - "text/plain": [ - "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBench/commit/7eaf5caa899cc0b8bae7156cc534e12825a97565', commit_message='Upload dataset', commit_description='', oid='7eaf5caa899cc0b8bae7156cc534e12825a97565', pr_url=None, pr_revision=None, pr_num=None)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "filtered_data.push_to_hub(\"lmms-lab/LiveBench\", \"2024-06\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "live_bench", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/data/pufanyi/anaconda3/anacondabin/envs/live_bench/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from datasets import Dataset, load_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "data = load_dataset(\"lmms-lab/LiveBench\", \"2024-07\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " test: Dataset({\n", + " features: ['id', 'images', 'website', 'question', 'answer', 'criteria', 'subtask', 'data_generator', 'checker', 'date_time', 'screen_shoter', 'screen_size', 'score', 'reason', 'scorer_name'],\n", + " num_rows: 320\n", + " })\n", + "})" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Filter: 100%|██████████| 441/441 [00:33<00:00, 13.09 examples/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['id', 'images', 'website', 'question', 'answer', 'subtask', 'data_generator', 'checker', 'date_time', 'screen_shoter', 'screen_size', 'score', 'reason', 'scorer_name'],\n", + " num_rows: 409\n", + "})" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filtered_data = data[\"test\"].filter(lambda example: example[\"score\"] and example[\"score\"] > 5)\n", + "filtered_data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idimageswebsitequestionanswersubtaskdata_generatorcheckerdate_timescreen_shoterscreen_sizescorereasonscorer_name
00[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/'}Look at the image provided. Which article head...\"BBC tracks down smuggler behind Channel cross...Deeper Implicationsgpt4vgemini2024-06-27 14:36:42single_screen(1024, 1024)10The answer accurately identifies the relevant ...gpt4v
11[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/'}Look at the image. What significant global eve...The image of a young girl wearing a life jacke...Contextual Analysisgpt4vgemini2024-06-27 14:36:42single_screen(1024, 1024)10The answer correctly identifies Biden and Trum...gpt4v
22[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/'}What detail in the image connected to the \"Bol...The \"Interpol Bolivia\" background visible in t...Deeper Implicationsgpt4vgemini2024-06-27 14:36:42single_screen(1024, 1024)7Authenticity (4/5): The answer is reasonable b...gpt4v
33[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/'}Analyze the visual composition of the article ...The image of the young girl, potentially a mig...Contextual Analysisgpt4vgemini2024-06-27 14:36:42single_screen(1024, 1024)8The question directly relates to assessing the...gpt4v
44[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/'}Focusing on the article \"BBC tracks down smugg...The image of a child, juxtaposed with an artic...Broader Implicationsgpt4vgemini2024-06-27 14:36:42single_screen(1024, 1024)10The answer directly correlates with the story'...gpt4v
\n", + "
" + ], + "text/plain": [ + " id images \\\n", + "0 0 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", + "1 1 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", + "2 2 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", + "3 3 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", + "4 4 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", + "\n", + " website \\\n", + "0 {'url': 'https://www.bbc.com/'} \n", + "1 {'url': 'https://www.bbc.com/'} \n", + "2 {'url': 'https://www.bbc.com/'} \n", + "3 {'url': 'https://www.bbc.com/'} \n", + "4 {'url': 'https://www.bbc.com/'} \n", + "\n", + " question \\\n", + "0 Look at the image provided. Which article head... \n", + "1 Look at the image. What significant global eve... \n", + "2 What detail in the image connected to the \"Bol... \n", + "3 Analyze the visual composition of the article ... \n", + "4 Focusing on the article \"BBC tracks down smugg... \n", + "\n", + " answer subtask \\\n", + "0 \"BBC tracks down smuggler behind Channel cross... Deeper Implications \n", + "1 The image of a young girl wearing a life jacke... Contextual Analysis \n", + "2 The \"Interpol Bolivia\" background visible in t... Deeper Implications \n", + "3 The image of the young girl, potentially a mig... Contextual Analysis \n", + "4 The image of a child, juxtaposed with an artic... Broader Implications \n", + "\n", + " data_generator checker date_time screen_shoter screen_size \\\n", + "0 gpt4v gemini 2024-06-27 14:36:42 single_screen (1024, 1024) \n", + "1 gpt4v gemini 2024-06-27 14:36:42 single_screen (1024, 1024) \n", + "2 gpt4v gemini 2024-06-27 14:36:42 single_screen (1024, 1024) \n", + "3 gpt4v gemini 2024-06-27 14:36:42 single_screen (1024, 1024) \n", + "4 gpt4v gemini 2024-06-27 14:36:42 single_screen (1024, 1024) \n", + "\n", + " score reason scorer_name \n", + "0 10 The answer accurately identifies the relevant ... gpt4v \n", + "1 10 The answer correctly identifies Biden and Trum... gpt4v \n", + "2 7 Authenticity (4/5): The answer is reasonable b... gpt4v \n", + "3 8 The question directly relates to assessing the... gpt4v \n", + "4 10 The answer directly correlates with the story'... 
gpt4v " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filtered_data.to_pandas().head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 100%|██████████| 409/409 [00:30<00:00, 13.63 examples/s]?it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 34.92ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:38<00:00, 38.26s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBench/commit/7eaf5caa899cc0b8bae7156cc534e12825a97565', commit_message='Upload dataset', commit_description='', oid='7eaf5caa899cc0b8bae7156cc534e12825a97565', pr_url=None, pr_revision=None, pr_num=None)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filtered_data.push_to_hub(\"lmms-lab/LiveBench\", \"2024-06\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "live_bench", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tools/live_bench/live_bench/__init__.py b/tools/live_bench/live_bench/__init__.py index d8299fc9..90c06824 100644 --- a/tools/live_bench/live_bench/__init__.py +++ b/tools/live_bench/live_bench/__init__.py @@ -1,2 +1,2 @@ -from live_bench.api.live_bench import generate_live_bench, generate_live_bench_from_path -from live_bench.data_generator import LiveBench +from .api.live_bench import generate_live_bench, generate_live_bench_from_path +from .data_generator import LiveBench diff --git a/tools/live_bench/live_bench/api/live_bench.py b/tools/live_bench/live_bench/api/live_bench.py index f0c59dd5..12b89aca 100644 --- a/tools/live_bench/live_bench/api/live_bench.py +++ b/tools/live_bench/live_bench/api/live_bench.py @@ -1,20 +1,20 @@ -from live_bench import LiveBench -from live_bench.websites import load_websites, load_websites_from_file - - -def generate_live_bench(*, force_clear=False, screen_shoter="single_screen", qa_generator="gpt4v", scorer="gpt4v", checker="gemini", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={}): - website = load_websites() - dataset = LiveBench(force_clear=force_clear) - dataset.capture(websites=website, screen_shoter=screen_shoter, qa_generator=qa_generator, scorer=scorer, checker=checker, driver_kwargs=driver_kwargs, shoter_kwargs=shoter_kwargs, generator_kwargs=generator_kwargs) - dataset.upload() - - -def generate_live_bench_from_path(path, *, force_clear=False, qa_generator="gpt4v", scorer="gpt4v", checker="gemini", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={}): - website = load_websites_from_file(path) - dataset: LiveBench = LiveBench(force_clear=force_clear) - dataset.capture(websites=website, screen_shoter="human", qa_generator=qa_generator, scorer=scorer, checker=checker, driver_kwargs=driver_kwargs, shoter_kwargs=shoter_kwargs, generator_kwargs=generator_kwargs) - dataset.upload() - - -if __name__ == "__main__": - generate_live_bench() +from live_bench.data_generator 
import LiveBench +from live_bench.websites import load_websites, load_websites_from_file + + +def generate_live_bench(*, force_clear=False, screen_shoter="single_screen", qa_generator="gpt4v", scorer="gpt4v", checker="gemini", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={}): + website = load_websites() + dataset = LiveBench(force_clear=force_clear) + dataset.capture(websites=website, screen_shoter=screen_shoter, qa_generator=qa_generator, scorer=scorer, checker=checker, driver_kwargs=driver_kwargs, shoter_kwargs=shoter_kwargs, generator_kwargs=generator_kwargs) + dataset.upload() + + +def generate_live_bench_from_path(path, *, force_clear=False, qa_generator="gpt4v", scorer="gpt4v", checker="gemini", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={}): + website = load_websites_from_file(path) + dataset: LiveBench = LiveBench(force_clear=force_clear) + dataset.capture(websites=website, screen_shoter="human", qa_generator=qa_generator, scorer=scorer, checker=checker, driver_kwargs=driver_kwargs, shoter_kwargs=shoter_kwargs, generator_kwargs=generator_kwargs) + dataset.upload() + + +if __name__ == "__main__": + generate_live_bench() diff --git a/tools/live_bench/live_bench/data_generator/__init__.py b/tools/live_bench/live_bench/data_generator/__init__.py index cda9cd04..54129621 100644 --- a/tools/live_bench/live_bench/data_generator/__init__.py +++ b/tools/live_bench/live_bench/data_generator/__init__.py @@ -1,4 +1,4 @@ -from live_bench.data_generator.live_bench import LiveBench -from live_bench.data_generator.live_bench_data import LiveBenchData -from live_bench.data_generator.qa_generator import get_generator, get_random_generator -from live_bench.data_generator.response import Response +from live_bench.data_generator.live_bench import LiveBench +from live_bench.data_generator.live_bench_data import LiveBenchData +from live_bench.data_generator.qa_generator import get_generator, get_random_generator +from live_bench.data_generator.response import Response diff --git a/tools/live_bench/live_bench/data_generator/check_prompt.md b/tools/live_bench/live_bench/data_generator/check_prompt.md index 4532e561..85164517 100644 --- a/tools/live_bench/live_bench/data_generator/check_prompt.md +++ b/tools/live_bench/live_bench/data_generator/check_prompt.md @@ -1,25 +1,25 @@ -I would like you to act as a quizmaster who designs questions based on a provided image that would challenge adults to think critically. The image in question is a screenshot from the homepage or section of news website. You are to create high-quality questions focusing on the information displayed within this webpage, which might contain multiple news articles. Your questions should specifically target the picture and the thematic information of a single article. Your question should be answerable, and checkable. Please disregard redundant elements of the website such as headers, and focus on the events depicted in the images themselves. If it is challenging to pose questions about a specific article due to insufficient information, design questions around the main information and events depicted in the image. - -Now, you are given a screenshot of the homepage of a news website, with a already generated question and answer. Your task is to refine the question and answer, and refractor them to make the question more answerable, checkable, and challenging. If you don't think the question is good, please provide a new question and answer. 
- -Note that the subtask must be one of these five: - -- Basic Understanding -- Contextual Analysis -- Deeper Implications -- Broader Implications -- Further Insights - -If you think the question does not correspond to the subtask, you have two options: -1. Modify the question to correspond to the subtask. -2. Modify the subtask to correspond to the question. - -However, you should not change the original question's subtask unless the original subtask is not one of these five. If you feel the original question's subtask does not match the question, modify the question to match the subtask instead of rewriting the subtask. - -Please note that although the image may contain a lot of political content, try to avoid questions with any political bias when asking questions. The question should focus on understanding and thinking about the image, not on political opinions. Within your capabilities, try to make the questions more challenging. However, you also need to consider the gradability of the questions you set. It is reiterated that what you need to assess is the ability to understand the news webpage, not politics. - -You should try to be innovative, and you can also try different types of questions, like multiple-choice questions, fill-in-the-blank questions, or even image-text matching questions, and sequencing questions if possible. Within your capabilities, try to make the questions more challenging. - -If you think the question is not good, or it is not answerable, please provide a new question and answer. - -Reminder again that you cannot change the original subtask unless the original subtask is not one of the five listed above. +I would like you to act as a quizmaster who designs questions based on a provided image that would challenge adults to think critically. The image in question is a screenshot from the homepage or section of news website. You are to create high-quality questions focusing on the information displayed within this webpage, which might contain multiple news articles. Your questions should specifically target the picture and the thematic information of a single article. Your question should be answerable, and checkable. Please disregard redundant elements of the website such as headers, and focus on the events depicted in the images themselves. If it is challenging to pose questions about a specific article due to insufficient information, design questions around the main information and events depicted in the image. + +Now, you are given a screenshot of the homepage of a news website, with a already generated question and answer. Your task is to refine the question and answer, and refractor them to make the question more answerable, checkable, and challenging. If you don't think the question is good, please provide a new question and answer. + +Note that the subtask must be one of these five: + +- Basic Understanding +- Contextual Analysis +- Deeper Implications +- Broader Implications +- Further Insights + +If you think the question does not correspond to the subtask, you have two options: +1. Modify the question to correspond to the subtask. +2. Modify the subtask to correspond to the question. + +However, you should not change the original question's subtask unless the original subtask is not one of these five. If you feel the original question's subtask does not match the question, modify the question to match the subtask instead of rewriting the subtask. 
+ +Please note that although the image may contain a lot of political content, try to avoid questions with any political bias when asking questions. The question should focus on understanding and thinking about the image, not on political opinions. Within your capabilities, try to make the questions more challenging. However, you also need to consider the gradability of the questions you set. It is reiterated that what you need to assess is the ability to understand the news webpage, not politics. + +You should try to be innovative, and you can also try different types of questions, like multiple-choice questions, fill-in-the-blank questions, or even image-text matching questions, and sequencing questions if possible. Within your capabilities, try to make the questions more challenging. + +If you think the question is not good, or it is not answerable, please provide a new question and answer. + +Reminder again that you cannot change the original subtask unless the original subtask is not one of the five listed above. diff --git a/tools/live_bench/live_bench/data_generator/default_criteria.md b/tools/live_bench/live_bench/data_generator/default_criteria.md index f276b439..90d3c22d 100644 --- a/tools/live_bench/live_bench/data_generator/default_criteria.md +++ b/tools/live_bench/live_bench/data_generator/default_criteria.md @@ -1,16 +1,16 @@ -### 1. Authenticity (5 points) -- **5 Points**: The information is directly observable in the image or can be reasonably inferred with strong evidence. -- **3 Points**: The information has a plausible connection to the image but requires assumptions that are not strongly supported by the image. -- **1 Point**: The information cannot be observed or reasonably inferred from the image; it seems unrelated or speculative. - -### 2. Logical Coherence (3 points) -- **3 Points**: The answer logically follows from the question and maintains consistency with the image context. -- **2 Points**: There are minor logical gaps or inconsistencies in the answer relative to the question. -- **1 Point**: The answer is logically inconsistent or contradictory to the question or image context. - -### 3. Clarity and Precision (2 points) -- **2 Points**: The question and answer are clearly articulated and precisely address specifics of the image. -- **1 Point**: The question or answer is somewhat vague or overly general, lacking specific details related to the image. -- **0 Points**: The question or answer is unclear or too ambiguous to determine relevance to the image. - -Each Q&A pair can score a maximum of 10 points. The sum of points from these three categories determines the final score for each pair. Provide a brief explanation for each rating, focusing on how well the Q&A adheres to these criteria. +### 1. Authenticity (5 points) +- **5 Points**: The information is directly observable in the image or can be reasonably inferred with strong evidence. +- **3 Points**: The information has a plausible connection to the image but requires assumptions that are not strongly supported by the image. +- **1 Point**: The information cannot be observed or reasonably inferred from the image; it seems unrelated or speculative. + +### 2. Logical Coherence (3 points) +- **3 Points**: The answer logically follows from the question and maintains consistency with the image context. +- **2 Points**: There are minor logical gaps or inconsistencies in the answer relative to the question. +- **1 Point**: The answer is logically inconsistent or contradictory to the question or image context. 
+ +### 3. Clarity and Precision (2 points) +- **2 Points**: The question and answer are clearly articulated and precisely address specifics of the image. +- **1 Point**: The question or answer is somewhat vague or overly general, lacking specific details related to the image. +- **0 Points**: The question or answer is unclear or too ambiguous to determine relevance to the image. + +Each Q&A pair can score a maximum of 10 points. The sum of points from these three categories determines the final score for each pair. Provide a brief explanation for each rating, focusing on how well the Q&A adheres to these criteria. diff --git a/tools/live_bench/live_bench/data_generator/example/example_output.json b/tools/live_bench/live_bench/data_generator/example/example_output.json index 2526e7df..c789db3b 100644 --- a/tools/live_bench/live_bench/data_generator/example/example_output.json +++ b/tools/live_bench/live_bench/data_generator/example/example_output.json @@ -1,57 +1,57 @@ -{ - "Basic Understanding": [ - { - "Question": "Which of the following topics is NOT covered in the news articles shown in the image?\nA) Middle East politics\nB) Technological advancements\nC) Natural disasters\nD) Animal welfare", - "Answer": "C) Natural disasters", - "Criteria": "Give 10 marks if correctly selected C, otherwise 0 marks." - }, - { - "Question": "Based on the image and the headlines provided on the BBC webpage, fill in the blank with the most appropriate word or phrase:\n\"The article titled 'UN Security Council backs US Israel-Gaza ceasefire plan' is accompanied by an image of ______, which symbolizes the impact of the conflict on civilians and the urgency for a ceasefire.\"", - "Answer": "The article titled 'UN Security Council backs US Israel-Gaza ceasefire plan' is accompanied by an image of **children amidst rubble**, which symbolizes the impact of the conflict on civilians and the urgency for a ceasefire.", - "Criteria": "Award 10 marks for the correct answer 'children amidst rubble', 5 marks for partially correct synonyms or related phrases, and 0 marks for incorrect answers." - } - ], - "Contextual Analysis": [ - { - "Question": "In the image associated with the article about the US Security Council backing the Israel-Gaza ceasefire plan, what are the people in the image doing, and how do their actions and expressions relate to the content of the article?", - "Answer": "In the image associated with the article about the US Security Council backing the Israel-Gaza ceasefire plan, the people are navigating through rubble, indicating a scene of destruction. One person is climbing over debris, while another is looking directly at the camera with a serious expression. Their actions and expressions reflect the aftermath of conflict and the urgency of the situation, which aligns with the article's focus on the need for a ceasefire and the release of hostages held by Hamas.", - "Criteria": "Award up to 10 marks based on the accuracy and detail of the response: 2 marks for identifying the scene of destruction, 2 marks for mentioning a person climbing over debris, 2 marks for noting someone looking directly at the camera with a serious expression, 4 marks for correctly relating these observations to the aftermath of conflict, the urgency of the situation, and the article's focus on ceasefire and hostage release." 
- }, - { - "Question": "How might the image of children navigating through rubble relate to the themes discussed in the article about Netanyahu and Gaza ceasefire?", - "Answer": "The image powerfully underscores the humanitarian impact of the conflict, aligning with the themes discussed in the article where Netanyahu's political maneuvers are contrasted with the pressing need for a ceasefire to alleviate civilian suffering in Gaza.", - "Criteria": "Award up to 10 marks based on the accuracy and relevance of the response: 2 marks for identifying the humanitarian impact, 3 marks for linking the image to the themes of the article, 2 marks for mentioning Netanyahu's political maneuvers, 3 marks for correctly associating the need for a ceasefire with the alleviation of civilian suffering." - } - ], - "Deeper Implications": [ - { - "Question": "What broader issues are raised by the UN Security Council's backing of a ceasefire in the Israel-Gaza context?", - "Answer": "The broader issues include international involvement in regional conflicts, the effectiveness of UN resolutions in conflict resolution, and the ongoing debate over the balance between national security and humanitarian needs in conflict zones.", - "Criteria": "Award up to 10 marks based on the accuracy and completeness of the response: 3 marks for mentioning international involvement in regional conflicts, 3 marks for discussing the effectiveness of UN resolutions in conflict resolution, 4 marks for addressing the debate over the balance between national security and humanitarian needs in conflict zones." - }, - { - "Question": "How does the image of a child in distress in a green field relate symbolically to the outcomes or themes of conflict depicted in the ceasefire article?", - "Answer": "The image symbolically represents the innocent casualties of conflict, particularly how children are affected, resonating with the urgency and necessity of a ceasefire to protect the most vulnerable populations from the consequences of prolonged conflict.", - "Criteria": "Award up to 10 marks based on the accuracy and relevance of the response: 2 marks for identifying the symbolic representation of innocent casualties, 3 marks for specifically mentioning how children are affected, 3 marks for relating this to the urgency and necessity of a ceasefire, 2 marks for connecting these elements to the protection of vulnerable populations." - } - ], - "Broader Implications": [ - { - "Question": "Rank the news articles in the image in order of their potential global impact, from highest to lowest.", - "Answer": "1. **UN Security Council backs US Israel-Gaza ceasefire plan**\n2. **Netanyahu walks tightrope as US urges Gaza ceasefire deal**\n3. **Apple brings ChatGPT to iPhones in AI overhaul**\n4. **Aircraft carrying Malawi vice-president goes missing**\n5. **Fire at famous Bangkok market kills 1,000 animals**\n6. **Four US college instructors stabbed in public park in China**\n7. **Baltimore shipping channel reopens after bridge collapse**", - "Criteria": "Award up to 10 marks based on the accuracy of the ranking: 2 marks for correctly placing the UN Security Council article first, 2 marks for correctly placing the Netanyahu article second, 1 mark each for correctly placing the next three articles (Apple, Aircraft, Fire), and 1 mark each for correctly placing the last two articles (Stabbing, Bridge). Deduct 1 mark for each position an article is away from its correct placement." 
- } - ], - "Further Insights": [ - { - "Question": "Based on the image, which of the following statements best explains the potential global impact of the events described in the news articles?\nA. The US-Israel-Gaza ceasefire plan backed by the UN Security Council is likely to reduce tensions in the Middle East, potentially leading to a more stable geopolitical environment in the region.\nB. The introduction of ChatGPT to iPhones is expected to significantly disrupt the technology market, overshadowing the geopolitical events in the Middle East and Africa.\nC. The fire at the Bangkok market, which killed 1,000 animals, is likely to have a more profound impact on global environmental policies than the ceasefire plan in the Middle East.\nD. The disappearance of the aircraft carrying the Malawi vice-president is expected to lead to a major international search and rescue operation, diverting attention from other global issues.", - "Answer": "A. The US-Israel-Gaza ceasefire plan backed by the UN Security Council is likely to reduce tensions in the Middle East, potentially leading to a more stable geopolitical environment in the region.", - "Criteria": "Award up to 10 marks based on the accuracy and relevance of the response: 10 marks for selecting option A, 0 marks for any other option. Detailed justification for scoring: Option A directly addresses the reduction of tensions and potential stabilization in the Middle East, which is a significant global impact. Other options, while plausible, do not directly relate to the primary global impact as depicted in the provided image and articles." - }, - { - "Question": "Considering the current global attention on AI, how might the article about Apple bringing ChatGPT to iPhones in an AI overhaul reflect on broader technological trends and consumer expectations?", - "Answer": "This article reflects broader trends in AI integration into consumer technology, highlighting competitive dynamics in the tech industry, and growing consumer expectations for sophisticated AI features in everyday devices.", - "Criteria": "Award up to 10 marks based on the depth and accuracy of the response: 3 marks for identifying AI integration into consumer technology, 3 marks for discussing competitive dynamics in the tech industry, 4 marks for explaining the growth in consumer expectations for sophisticated AI features in everyday devices." - } - ] -} +{ + "Basic Understanding": [ + { + "Question": "Which of the following topics is NOT covered in the news articles shown in the image?\nA) Middle East politics\nB) Technological advancements\nC) Natural disasters\nD) Animal welfare", + "Answer": "C) Natural disasters", + "Criteria": "Give 10 marks if correctly selected C, otherwise 0 marks." + }, + { + "Question": "Based on the image and the headlines provided on the BBC webpage, fill in the blank with the most appropriate word or phrase:\n\"The article titled 'UN Security Council backs US Israel-Gaza ceasefire plan' is accompanied by an image of ______, which symbolizes the impact of the conflict on civilians and the urgency for a ceasefire.\"", + "Answer": "The article titled 'UN Security Council backs US Israel-Gaza ceasefire plan' is accompanied by an image of **children amidst rubble**, which symbolizes the impact of the conflict on civilians and the urgency for a ceasefire.", + "Criteria": "Award 10 marks for the correct answer 'children amidst rubble', 5 marks for partially correct synonyms or related phrases, and 0 marks for incorrect answers." 
+ } + ], + "Contextual Analysis": [ + { + "Question": "In the image associated with the article about the US Security Council backing the Israel-Gaza ceasefire plan, what are the people in the image doing, and how do their actions and expressions relate to the content of the article?", + "Answer": "In the image associated with the article about the US Security Council backing the Israel-Gaza ceasefire plan, the people are navigating through rubble, indicating a scene of destruction. One person is climbing over debris, while another is looking directly at the camera with a serious expression. Their actions and expressions reflect the aftermath of conflict and the urgency of the situation, which aligns with the article's focus on the need for a ceasefire and the release of hostages held by Hamas.", + "Criteria": "Award up to 10 marks based on the accuracy and detail of the response: 2 marks for identifying the scene of destruction, 2 marks for mentioning a person climbing over debris, 2 marks for noting someone looking directly at the camera with a serious expression, 4 marks for correctly relating these observations to the aftermath of conflict, the urgency of the situation, and the article's focus on ceasefire and hostage release." + }, + { + "Question": "How might the image of children navigating through rubble relate to the themes discussed in the article about Netanyahu and Gaza ceasefire?", + "Answer": "The image powerfully underscores the humanitarian impact of the conflict, aligning with the themes discussed in the article where Netanyahu's political maneuvers are contrasted with the pressing need for a ceasefire to alleviate civilian suffering in Gaza.", + "Criteria": "Award up to 10 marks based on the accuracy and relevance of the response: 2 marks for identifying the humanitarian impact, 3 marks for linking the image to the themes of the article, 2 marks for mentioning Netanyahu's political maneuvers, 3 marks for correctly associating the need for a ceasefire with the alleviation of civilian suffering." + } + ], + "Deeper Implications": [ + { + "Question": "What broader issues are raised by the UN Security Council's backing of a ceasefire in the Israel-Gaza context?", + "Answer": "The broader issues include international involvement in regional conflicts, the effectiveness of UN resolutions in conflict resolution, and the ongoing debate over the balance between national security and humanitarian needs in conflict zones.", + "Criteria": "Award up to 10 marks based on the accuracy and completeness of the response: 3 marks for mentioning international involvement in regional conflicts, 3 marks for discussing the effectiveness of UN resolutions in conflict resolution, 4 marks for addressing the debate over the balance between national security and humanitarian needs in conflict zones." 
+ }, + { + "Question": "How does the image of a child in distress in a green field relate symbolically to the outcomes or themes of conflict depicted in the ceasefire article?", + "Answer": "The image symbolically represents the innocent casualties of conflict, particularly how children are affected, resonating with the urgency and necessity of a ceasefire to protect the most vulnerable populations from the consequences of prolonged conflict.", + "Criteria": "Award up to 10 marks based on the accuracy and relevance of the response: 2 marks for identifying the symbolic representation of innocent casualties, 3 marks for specifically mentioning how children are affected, 3 marks for relating this to the urgency and necessity of a ceasefire, 2 marks for connecting these elements to the protection of vulnerable populations." + } + ], + "Broader Implications": [ + { + "Question": "Rank the news articles in the image in order of their potential global impact, from highest to lowest.", + "Answer": "1. **UN Security Council backs US Israel-Gaza ceasefire plan**\n2. **Netanyahu walks tightrope as US urges Gaza ceasefire deal**\n3. **Apple brings ChatGPT to iPhones in AI overhaul**\n4. **Aircraft carrying Malawi vice-president goes missing**\n5. **Fire at famous Bangkok market kills 1,000 animals**\n6. **Four US college instructors stabbed in public park in China**\n7. **Baltimore shipping channel reopens after bridge collapse**", + "Criteria": "Award up to 10 marks based on the accuracy of the ranking: 2 marks for correctly placing the UN Security Council article first, 2 marks for correctly placing the Netanyahu article second, 1 mark each for correctly placing the next three articles (Apple, Aircraft, Fire), and 1 mark each for correctly placing the last two articles (Stabbing, Bridge). Deduct 1 mark for each position an article is away from its correct placement." + } + ], + "Further Insights": [ + { + "Question": "Based on the image, which of the following statements best explains the potential global impact of the events described in the news articles?\nA. The US-Israel-Gaza ceasefire plan backed by the UN Security Council is likely to reduce tensions in the Middle East, potentially leading to a more stable geopolitical environment in the region.\nB. The introduction of ChatGPT to iPhones is expected to significantly disrupt the technology market, overshadowing the geopolitical events in the Middle East and Africa.\nC. The fire at the Bangkok market, which killed 1,000 animals, is likely to have a more profound impact on global environmental policies than the ceasefire plan in the Middle East.\nD. The disappearance of the aircraft carrying the Malawi vice-president is expected to lead to a major international search and rescue operation, diverting attention from other global issues.", + "Answer": "A. The US-Israel-Gaza ceasefire plan backed by the UN Security Council is likely to reduce tensions in the Middle East, potentially leading to a more stable geopolitical environment in the region.", + "Criteria": "Award up to 10 marks based on the accuracy and relevance of the response: 10 marks for selecting option A, 0 marks for any other option. Detailed justification for scoring: Option A directly addresses the reduction of tensions and potential stabilization in the Middle East, which is a significant global impact. Other options, while plausible, do not directly relate to the primary global impact as depicted in the provided image and articles." 
+ }, + { + "Question": "Considering the current global attention on AI, how might the article about Apple bringing ChatGPT to iPhones in an AI overhaul reflect on broader technological trends and consumer expectations?", + "Answer": "This article reflects broader trends in AI integration into consumer technology, highlighting competitive dynamics in the tech industry, and growing consumer expectations for sophisticated AI features in everyday devices.", + "Criteria": "Award up to 10 marks based on the depth and accuracy of the response: 3 marks for identifying AI integration into consumer technology, 3 marks for discussing competitive dynamics in the tech industry, 4 marks for explaining the growth in consumer expectations for sophisticated AI features in everyday devices." + } + ] +} diff --git a/tools/live_bench/live_bench/data_generator/live_bench.py b/tools/live_bench/live_bench/data_generator/live_bench.py index 9a27f408..f7ba55d7 100644 --- a/tools/live_bench/live_bench/data_generator/live_bench.py +++ b/tools/live_bench/live_bench/data_generator/live_bench.py @@ -1,187 +1,191 @@ -import json -import logging -import os -from datetime import datetime -from typing import List, Tuple - -from datasets import Dataset, load_dataset -from live_bench.data_generator import get_generator, get_random_generator -from live_bench.data_generator.live_bench_data import LiveBenchData -from live_bench.data_generator.qa_generator import QAData, QAGenerator -from live_bench.data_generator.question_finalizer import QuestionFinalizer -from live_bench.data_generator.response import Response -from live_bench.data_generator.score_getter import ( - get_random_score_getter, - get_score_getter, -) -from live_bench.data_generator.utils.extract_infomation import ( - ImageInfomation, - InfomationExtractor, -) -from live_bench.driver import load_driver -from live_bench.screen_shoter import ScreenImage, ScreenShoter, get_shoter -from live_bench.websites import Website -from tqdm import tqdm - -logger = logging.getLogger("lmms-eval") - - -def get_qa_data(images: ScreenImage, qa_generator: QAGenerator, *, infomation_getter: InfomationExtractor = None, test=False) -> Tuple[List[QAData], Response]: - if infomation_getter: - infomation = infomation_getter.extract_infomation(images) - else: - infomation = None - response = qa_generator.generate(images, test=test, infomation=infomation) - qa_data = qa_generator.format_response(response) - return qa_data, response - - -def get_live_bench_data( - driver, website: Website, screen_shoter: ScreenShoter, qa_generator: QAGenerator, checker: QAGenerator, infomation_getter: InfomationExtractor, question_finalizer: QuestionFinalizer, test=False, scorer=None, score_threshold=5 -) -> Tuple[List[LiveBenchData], Response]: - images = screen_shoter.capture(driver, website) - qa_data, logs = get_qa_data(images, qa_generator, test=test, infomation_getter=infomation_getter) - data = [] - for qa in qa_data: - # qa_data = question_finalizer.finalize_question(qa, images.images) - item = LiveBenchData(screen=images, question=qa.question, answer=qa.answer, subtask=qa.subtask, criteria=qa.criteria, data_generator=qa_generator.get_name(), checker=checker, scorer=scorer, finalizer=question_finalizer) - if score_threshold and (not item.score or item.score < score_threshold): - continue - data.append(item) - return data, logs - - -class LiveBench(object): - def __init__(self, path: str = "lmms-lab/LiveBench", *, name="auto", split="test", cache_dir=None, remote_path=None, trust_remote_code=True, 
force_clear=False, **kwargs): - self.path = path - if name == "auto": - name = datetime.now().strftime("%Y-%m") - self.name = name - self.split = split - self.cache_dir = cache_dir - self.dataset_kwargs = kwargs - if remote_path is None: - self.remote_path = path - if force_clear: - self.clear() - else: - try: - self.hf_data = load_dataset(self.path, name=self.name, split=split, cache_dir=cache_dir, trust_remote_code=trust_remote_code, **kwargs) - except Exception as e: - logger.error(f"Error loading dataset: {e}") - self.clear() - - def clear(self): - self.hf_data = Dataset.from_dict( - { - "id": [], - "images": [], - "website": [], - "question": [], - "answer": [], - "criteria": [], - "subtask": [], - "data_generator": [], - "checker": [], - "date_time": [], - "screen_shoter": [], - "screen_size": [], - "score": [], - "reason": [], - "scorer_name": [], - }, - features=LiveBenchData.features, - ) - - def add(self, data: LiveBenchData, id: int = None): - if id is None: - id = len(self.hf_data) - organized_data = data.to_hf_dict() - organized_data["id"] = id - self.hf_data = self.hf_data.add_item(organized_data) - - def capture( - self, - websites: List[Website] = None, - *, - screen_shoter="single_screen", - qa_generator=None, - checker=None, - driver=None, - scorer=None, - question_finalizer=None, - test=False, - driver_kwargs={}, - shoter_kwargs={}, - generator_kwargs={}, - question_finalizer_kwargs={}, - log_folder="./logs", - ): - can_quit_driver = False - if driver is None and screen_shoter != "human": - driver = load_driver(**driver_kwargs) - can_quit_driver = True - screen_shoter = get_shoter(screen_shoter, **shoter_kwargs) - if qa_generator is not None: - qa_generator = get_generator(qa_generator, **generator_kwargs) - else: - qa_generator = get_random_generator(**generator_kwargs) - if checker is None: - checker = get_random_generator(**generator_kwargs) - else: - checker = get_generator(checker, **generator_kwargs) - if scorer is not None and isinstance(scorer, str): - scorer = get_score_getter(scorer) - elif scorer is None: - scorer = get_random_score_getter() - if question_finalizer is None: - question_finalizer = QuestionFinalizer(**question_finalizer_kwargs) - logs = [] - infomation_getter = InfomationExtractor() - for website in tqdm(websites, desc="Capturing websites"): - try: - data, log = get_live_bench_data(driver, website, screen_shoter, qa_generator, checker, test=test, scorer=scorer, infomation_getter=infomation_getter, question_finalizer=question_finalizer) - logs.append(log.to_dict()) - for d in data: - self.add(d) - except Exception as e: - logger.error(f"Error capturing website: {e}") - logger.error(f"Website: {website.get_info()}") - logs.append( - { - "success": False, - "content": f"Error capturing website: {e}", - "full_log": { - "website": website.get_info(), - "error": str(e), - }, - } - ) - continue - if not os.path.exists(log_folder): - os.makedirs(log_folder) - date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - log_file = os.path.join(log_folder, f"{date_time}.json") - full_log = { - "info": { - "date_time": date_time, - "screen_shoter": screen_shoter.get_name(), - "qa_generator": qa_generator.get_name(), - "checker": checker.get_name(), - "scorer": scorer.get_name(), - }, - "websites": [w.get_info() for w in websites], - "logs": logs, - } - with open(log_file, "w") as f: - json.dump(full_log, f, indent=4) - logger.info(f"Logs saved to {os.path.abspath(log_file)}") - if can_quit_driver: - driver.quit() - - def upload(self, **kwargs): - 
self.hf_data.push_to_hub(self.remote_path, config_name=self.name, split=self.split, **kwargs) - - def save(self, path: str): - self.hf_data.save_to_disk(path) - logger.info(f"Data saved to {os.path.abspath(path)}") +import json +import logging +import os +from datetime import datetime +from typing import List, Tuple + +from datasets import Dataset, load_dataset +from live_bench.data_generator.live_bench_data import LiveBenchData +from live_bench.data_generator.qa_generator import ( + QAData, + QAGenerator, + get_generator, + get_random_generator, +) +from live_bench.data_generator.question_finalizer import QuestionFinalizer +from live_bench.data_generator.response import Response +from live_bench.data_generator.score_getter import ( + get_random_score_getter, + get_score_getter, +) +from live_bench.data_generator.utils.extract_infomation import ( + ImageInfomation, + InfomationExtractor, +) +from live_bench.driver import load_driver +from live_bench.screen_shoter import ScreenImage, ScreenShoter, get_shoter +from live_bench.websites import Website +from tqdm import tqdm + +logger = logging.getLogger("lmms-eval") + + +def get_qa_data(images: ScreenImage, qa_generator: QAGenerator, *, infomation_getter: InfomationExtractor = None, test=False) -> Tuple[List[QAData], Response]: + if infomation_getter: + infomation = infomation_getter.extract_infomation(images) + else: + infomation = None + response = qa_generator.generate(images, test=test, infomation=infomation) + qa_data = qa_generator.format_response(response) + return qa_data, response + + +def get_live_bench_data( + driver, website: Website, screen_shoter: ScreenShoter, qa_generator: QAGenerator, checker: QAGenerator, infomation_getter: InfomationExtractor, question_finalizer: QuestionFinalizer, test=False, scorer=None, score_threshold=5 +) -> Tuple[List[LiveBenchData], Response]: + images = screen_shoter.capture(driver, website) + qa_data, logs = get_qa_data(images, qa_generator, test=test, infomation_getter=infomation_getter) + data = [] + for qa in qa_data: + # qa_data = question_finalizer.finalize_question(qa, images.images) + item = LiveBenchData(screen=images, question=qa.question, answer=qa.answer, subtask=qa.subtask, criteria=qa.criteria, data_generator=qa_generator.get_name(), checker=checker, scorer=scorer, finalizer=question_finalizer) + if score_threshold and (not item.score or item.score < score_threshold): + continue + data.append(item) + return data, logs + + +class LiveBench(object): + def __init__(self, path: str = "lmms-lab/LiveBench", *, name="auto", split="test", cache_dir=None, remote_path=None, trust_remote_code=True, force_clear=False, **kwargs): + self.path = path + if name == "auto": + name = datetime.now().strftime("%Y-%m") + self.name = name + self.split = split + self.cache_dir = cache_dir + self.dataset_kwargs = kwargs + if remote_path is None: + self.remote_path = path + if force_clear: + self.clear() + else: + try: + self.hf_data = load_dataset(self.path, name=self.name, split=split, cache_dir=cache_dir, trust_remote_code=trust_remote_code, **kwargs) + except Exception as e: + logger.error(f"Error loading dataset: {e}") + self.clear() + + def clear(self): + self.hf_data = Dataset.from_dict( + { + "id": [], + "images": [], + "website": [], + "question": [], + "answer": [], + "criteria": [], + "subtask": [], + "data_generator": [], + "checker": [], + "date_time": [], + "screen_shoter": [], + "screen_size": [], + "score": [], + "reason": [], + "scorer_name": [], + }, + features=LiveBenchData.features, + ) + + 
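+    # Append one generated record to the in-memory HF dataset. When no id is
+    # given, the next sequential index (the current dataset length) is used,
+    # and the record is encoded via LiveBenchData.to_hf_dict() before insertion.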
def add(self, data: LiveBenchData, id: int = None): + if id is None: + id = len(self.hf_data) + organized_data = data.to_hf_dict() + organized_data["id"] = id + self.hf_data = self.hf_data.add_item(organized_data) + + def capture( + self, + websites: List[Website] = None, + *, + screen_shoter="single_screen", + qa_generator=None, + checker=None, + driver=None, + scorer=None, + question_finalizer=None, + test=False, + driver_kwargs={}, + shoter_kwargs={}, + generator_kwargs={}, + question_finalizer_kwargs={}, + log_folder="./logs", + ): + can_quit_driver = False + if driver is None and screen_shoter != "human": + driver = load_driver(**driver_kwargs) + can_quit_driver = True + screen_shoter = get_shoter(screen_shoter, **shoter_kwargs) + if qa_generator is not None: + qa_generator = get_generator(qa_generator, **generator_kwargs) + else: + qa_generator = get_random_generator(**generator_kwargs) + if checker is None: + checker = get_random_generator(**generator_kwargs) + else: + checker = get_generator(checker, **generator_kwargs) + if scorer is not None and isinstance(scorer, str): + scorer = get_score_getter(scorer) + elif scorer is None: + scorer = get_random_score_getter() + if question_finalizer is None: + question_finalizer = QuestionFinalizer(**question_finalizer_kwargs) + logs = [] + infomation_getter = InfomationExtractor() + for website in tqdm(websites, desc="Capturing websites"): + try: + data, log = get_live_bench_data(driver, website, screen_shoter, qa_generator, checker, test=test, scorer=scorer, infomation_getter=infomation_getter, question_finalizer=question_finalizer) + logs.append(log.to_dict()) + for d in data: + self.add(d) + except Exception as e: + logger.error(f"Error capturing website: {e}") + logger.error(f"Website: {website.get_info()}") + logs.append( + { + "success": False, + "content": f"Error capturing website: {e}", + "full_log": { + "website": website.get_info(), + "error": str(e), + }, + } + ) + continue + if not os.path.exists(log_folder): + os.makedirs(log_folder) + date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + log_file = os.path.join(log_folder, f"{date_time}.json") + full_log = { + "info": { + "date_time": date_time, + "screen_shoter": screen_shoter.get_name(), + "qa_generator": qa_generator.get_name(), + "checker": checker.get_name(), + "scorer": scorer.get_name(), + }, + "websites": [w.get_info() for w in websites], + "logs": logs, + } + with open(log_file, "w") as f: + json.dump(full_log, f, indent=4) + logger.info(f"Logs saved to {os.path.abspath(log_file)}") + if can_quit_driver: + driver.quit() + + def upload(self, **kwargs): + self.hf_data.push_to_hub(self.remote_path, config_name=self.name, split=self.split, **kwargs) + + def save(self, path: str): + self.hf_data.save_to_disk(path) + logger.info(f"Data saved to {os.path.abspath(path)}") diff --git a/tools/live_bench/live_bench/data_generator/live_bench_data.py b/tools/live_bench/live_bench/data_generator/live_bench_data.py index eccb106c..6619a07f 100644 --- a/tools/live_bench/live_bench/data_generator/live_bench_data.py +++ b/tools/live_bench/live_bench/data_generator/live_bench_data.py @@ -1,139 +1,139 @@ -import datasets -from live_bench.data_generator.qa_generator import QAGenerator -from live_bench.data_generator.question_finalizer import QuestionFinalizer -from live_bench.data_generator.utils.extract_infomation import ImageInfomation -from live_bench.screen_shoter.screen import ScreenImage - - -class LiveBenchData(object): - SUBTASKS = ("Basic Understanding", "Contextual 
Analysis", "Deeper Implications", "Broader Implications", "Further Insights") - - features = datasets.Features( - { - "id": datasets.Value("int32"), - "images": datasets.Sequence(datasets.Image()), - "website": datasets.Value("string"), - "question": datasets.Value("string"), - "answer": datasets.Value("string"), - "criteria": datasets.Value("string"), - "subtask": datasets.Value("string"), - "data_generator": datasets.Value("string"), - "checker": datasets.Value("string"), - "date_time": datasets.Value("string"), - "screen_shoter": datasets.Value("string"), - "screen_size": datasets.Value("string"), - "score": datasets.Value("int32"), - "reason": datasets.Value("string"), - "scorer_name": datasets.Value("string"), - } - ) - - def __init__( - self, - *, - screen: ScreenImage, - question: str, - answer: str, - criteria: str, - subtask: str, - data_generator: str, - infomation: ImageInfomation = None, - score: int = None, - reason: str = None, - checker: QAGenerator = None, - finalizer: QuestionFinalizer = None, - scorer_name=None, - scorer=None, - ): - self.screen = screen - self.question = question - self.answer = answer - self.criteria = criteria - self.subtask = subtask - self.data_generator = data_generator - self.infomation = infomation - self.checker = None - if checker: - response = checker.check(screen, question, answer, criteria, subtask, infomation=infomation) - if response.success: - formatted_response = checker.format_checked_response(response) - if formatted_response.question and formatted_response.answer and formatted_response.criteria: - self.question = formatted_response.question - self.answer = formatted_response.answer - self.criteria = formatted_response.criteria - if formatted_response.subtask: - self.subtask = formatted_response.subtask - else: - self.subtask = subtask - self.checker = checker.get_name() - if finalizer: - try: - qa = finalizer.finalize_question(self.question, self.answer, self.criteria, self.screen.images) - except Exception as e: - raise e - self.question = qa["question"] - self.answer = qa["answer"] - self.criteria = qa["criteria"] - if self.subtask: - for sub in LiveBenchData.SUBTASKS: - if sub.lower() in self.subtask.lower(): - self.subtask = sub - break - else: - self.subtask = "Further Insights" - else: - self.subtask = "Not Specified" - if score is not None: - self.score = score - self.reason = reason - self.scorer_name = scorer_name - else: - score = scorer.get_score(question, answer, screen.images) - self.score = score.score - self.reason = score.reason - self.scorer_name = scorer.get_name() - - def to_dict(self): - images = self.screen.images - website = self.screen.website.get_info() - question = self.question - answer = self.answer - subtask = self.subtask - data_generator = self.data_generator - date_time = self.screen.capture_datetime - screen_shoter = self.screen.shoter - screen_size = self.screen.screen_size - criteria = self.criteria - return { - "images": images, - "website": website, - "question": question, - "answer": answer, - "criteria": criteria, - "subtask": subtask, - "data_generator": data_generator, - "checker": self.checker, - "date_time": date_time, - "screen_shoter": screen_shoter, - "screen_size": screen_size, - "score": self.score, - "reason": self.reason, - "scorer_name": self.scorer_name, - } - - def to_hf_dict(self): - return self.features.encode_example(self.to_dict()) - - def to_output_dict(self): - return { - "screen": self.screen.to_output_dict(), - "question": self.question, - "answer": self.answer, - "criteria": 
self.criteria, - "subtask": self.subtask, - "data_generator": self.data_generator, - "checker": self.checker, - "score": self.score, - "reason": self.reason, - "scorer_name": self.scorer_name, - } +import datasets +from live_bench.data_generator.qa_generator import QAGenerator +from live_bench.data_generator.question_finalizer import QuestionFinalizer +from live_bench.data_generator.utils.extract_infomation import ImageInfomation +from live_bench.screen_shoter.screen import ScreenImage + + +class LiveBenchData(object): + SUBTASKS = ("Basic Understanding", "Contextual Analysis", "Deeper Implications", "Broader Implications", "Further Insights") + + features = datasets.Features( + { + "id": datasets.Value("int32"), + "images": datasets.Sequence(datasets.Image()), + "website": datasets.Value("string"), + "question": datasets.Value("string"), + "answer": datasets.Value("string"), + "criteria": datasets.Value("string"), + "subtask": datasets.Value("string"), + "data_generator": datasets.Value("string"), + "checker": datasets.Value("string"), + "date_time": datasets.Value("string"), + "screen_shoter": datasets.Value("string"), + "screen_size": datasets.Value("string"), + "score": datasets.Value("int32"), + "reason": datasets.Value("string"), + "scorer_name": datasets.Value("string"), + } + ) + + def __init__( + self, + *, + screen: ScreenImage, + question: str, + answer: str, + criteria: str, + subtask: str, + data_generator: str, + infomation: ImageInfomation = None, + score: int = None, + reason: str = None, + checker: QAGenerator = None, + finalizer: QuestionFinalizer = None, + scorer_name=None, + scorer=None, + ): + self.screen = screen + self.question = question + self.answer = answer + self.criteria = criteria + self.subtask = subtask + self.data_generator = data_generator + self.infomation = infomation + self.checker = None + if checker: + response = checker.check(screen, question, answer, criteria, subtask, infomation=infomation) + if response.success: + formatted_response = checker.format_checked_response(response) + if formatted_response.question and formatted_response.answer and formatted_response.criteria: + self.question = formatted_response.question + self.answer = formatted_response.answer + self.criteria = formatted_response.criteria + if formatted_response.subtask: + self.subtask = formatted_response.subtask + else: + self.subtask = subtask + self.checker = checker.get_name() + if finalizer: + try: + qa = finalizer.finalize_question(self.question, self.answer, self.criteria, self.screen.images) + except Exception as e: + raise e + self.question = qa["question"] + self.answer = qa["answer"] + self.criteria = qa["criteria"] + if self.subtask: + for sub in LiveBenchData.SUBTASKS: + if sub.lower() in self.subtask.lower(): + self.subtask = sub + break + else: + self.subtask = "Further Insights" + else: + self.subtask = "Not Specified" + if score is not None: + self.score = score + self.reason = reason + self.scorer_name = scorer_name + else: + score = scorer.get_score(question, answer, screen.images) + self.score = score.score + self.reason = score.reason + self.scorer_name = scorer.get_name() + + def to_dict(self): + images = self.screen.images + website = self.screen.website.get_info() + question = self.question + answer = self.answer + subtask = self.subtask + data_generator = self.data_generator + date_time = self.screen.capture_datetime + screen_shoter = self.screen.shoter + screen_size = self.screen.screen_size + criteria = self.criteria + return { + "images": images, + 
"website": website, + "question": question, + "answer": answer, + "criteria": criteria, + "subtask": subtask, + "data_generator": data_generator, + "checker": self.checker, + "date_time": date_time, + "screen_shoter": screen_shoter, + "screen_size": screen_size, + "score": self.score, + "reason": self.reason, + "scorer_name": self.scorer_name, + } + + def to_hf_dict(self): + return self.features.encode_example(self.to_dict()) + + def to_output_dict(self): + return { + "screen": self.screen.to_output_dict(), + "question": self.question, + "answer": self.answer, + "criteria": self.criteria, + "subtask": self.subtask, + "data_generator": self.data_generator, + "checker": self.checker, + "score": self.score, + "reason": self.reason, + "scorer_name": self.scorer_name, + } diff --git a/tools/live_bench/live_bench/data_generator/prompt.md b/tools/live_bench/live_bench/data_generator/prompt.md index faa2fa88..684e193f 100644 --- a/tools/live_bench/live_bench/data_generator/prompt.md +++ b/tools/live_bench/live_bench/data_generator/prompt.md @@ -1,17 +1,17 @@ -I would like you to act as a quizmaster who designs questions based on a provided image that would challenge adults to think critically. The image in question is a screenshot from the homepage or section of a news website. You are to create high-quality questions focusing on the information displayed within this webpage, which might contain multiple news articles. Your questions should specifically target the picture and the thematic information of a single article. Your question should be answerable, and checkable. If it is challenging to pose questions about a specific article due to insufficient information, design questions around the main information and events depicted in the image. Within your capabilities, try to make the questions more challenging. - -A well-crafted question about an event should allow respondents to gain deeper insights by observing and analyzing the event, paying attention to the following aspects: - -- Basic Understanding: Questions that require direct observation or recall of the information presented in the image. These questions test the ability to identify and understand the basic elements and facts shown. -- Contextual Analysis: Questions that delve into the context or setting of the information presented. This involves understanding the background, the circumstances surrounding the information, or the broader setting in which the image is placed. -- Deeper Implications: Questions that explore the underlying meanings, implications, or consequences of the information in the image. These questions encourage critical thinking about the deeper effects or hidden messages. -- Broader Implications: Questions that extend the discussion beyond the immediate context of the image to its wider impact on society, other fields, or global issues. -- Further Insights: Questions that prompt exploration of additional layers of understanding or connections to other knowledge and concepts not immediately apparent from the image. - -Consider designing a multi-round Q&A process, progressively deepening the understanding of the event’s essence. - -Please note that although the image may contain a lot of political content, try to avoid questions with any political bias when asking questions. Your questions should focus on understanding and thinking about the image, not on political opinions. 
- -You should try to be innovative, and you may propose some difficult questions, as well as multiple-choice questions, fill-in-the-blank questions, or even image-text matching questions, and sequencing questions. Within your capabilities, try to make the questions more challenging. - -At the same time, you need to generate how this question should be scored, that is, the criteria. Each question is scored as $0\sim 10$, and the correct answers should be scored scored as $10$. Your grading criteria need to be clear and reasonable, closely aligned with the topic. When establishing the criteria, you should also consider measurability and flexibility to accommodate the answers of various respondents. +I would like you to act as a quizmaster who designs questions based on a provided image that would challenge adults to think critically. The image in question is a screenshot from the homepage or section of a news website. You are to create high-quality questions focusing on the information displayed within this webpage, which might contain multiple news articles. Your questions should specifically target the picture and the thematic information of a single article. Your question should be answerable, and checkable. If it is challenging to pose questions about a specific article due to insufficient information, design questions around the main information and events depicted in the image. Within your capabilities, try to make the questions more challenging. + +A well-crafted question about an event should allow respondents to gain deeper insights by observing and analyzing the event, paying attention to the following aspects: + +- Basic Understanding: Questions that require direct observation or recall of the information presented in the image. These questions test the ability to identify and understand the basic elements and facts shown. +- Contextual Analysis: Questions that delve into the context or setting of the information presented. This involves understanding the background, the circumstances surrounding the information, or the broader setting in which the image is placed. +- Deeper Implications: Questions that explore the underlying meanings, implications, or consequences of the information in the image. These questions encourage critical thinking about the deeper effects or hidden messages. +- Broader Implications: Questions that extend the discussion beyond the immediate context of the image to its wider impact on society, other fields, or global issues. +- Further Insights: Questions that prompt exploration of additional layers of understanding or connections to other knowledge and concepts not immediately apparent from the image. + +Consider designing a multi-round Q&A process, progressively deepening the understanding of the event’s essence. + +Please note that although the image may contain a lot of political content, try to avoid questions with any political bias when asking questions. Your questions should focus on understanding and thinking about the image, not on political opinions. + +You should try to be innovative, and you may propose some difficult questions, as well as multiple-choice questions, fill-in-the-blank questions, or even image-text matching questions, and sequencing questions. Within your capabilities, try to make the questions more challenging. + +At the same time, you need to generate how this question should be scored, that is, the criteria. Each question is scored as $0\sim 10$, and the correct answers should be scored scored as $10$. 
Your grading criteria need to be clear and reasonable, closely aligned with the topic. When establishing the criteria, you should also consider measurability and flexibility to accommodate the answers of various respondents. diff --git a/tools/live_bench/live_bench/data_generator/qa_generator.py b/tools/live_bench/live_bench/data_generator/qa_generator.py index f44d5452..631477db 100644 --- a/tools/live_bench/live_bench/data_generator/qa_generator.py +++ b/tools/live_bench/live_bench/data_generator/qa_generator.py @@ -1,532 +1,532 @@ -import base64 -import io -import json -import logging -import os -import random -import re -from abc import ABC, abstractmethod -from time import sleep -from typing import List - -import anthropic -import google.generativeai as genai -import openai -from live_bench.data_generator.response import Response -from live_bench.data_generator.utils.claude import ( - claude_generate_response, - format_claude_images, -) -from live_bench.data_generator.utils.extract_infomation import ( - ImageInfomation, - InfomationExtractor, -) -from live_bench.data_generator.utils.gemini import gemini_generate_response -from live_bench.data_generator.utils.gpt4v import ( - format_gpt4v_images, - gpt4v_generate_response, -) -from live_bench.screen_shoter import ScreenImage -from PIL import Image - -logger = logging.getLogger("lmms-eval") - -SUBTASKS = {"Basic Understanding", "Contextual Analysis", "Deeper Implications", "Broader Implications", "Further Insights"} - - -class QAData(object): - def __init__(self, question: str = None, answer: str = None, criteria: str = None, subtask: str = None): - self.question = question - self.answer = answer - self.criteria = criteria - self.subtask = subtask - - def parse_subtask(subtask: str) -> str: - subtask = subtask.strip().lower() - for valid_subtask in SUBTASKS: - if valid_subtask.lower() in subtask.lower(): - return valid_subtask - return "Unknown Subtask" - - def set_subtask(self, subtask: str): - """ - Set the subtask for the QAData instance after parsing it. - - Args: - subtask (str): The subtask string to be set. 
- """ - self.subtask = self.parse_subtask(subtask) - - def to_dict(self): - return {"question": self.question, "answer": self.answer} - - -class QAGenerator(ABC): - def __init__(self, prompt_file: str = os.path.join(os.path.dirname(__file__), "prompt.md")): - self.prompt_file = prompt_file - self.prompt = self._load_prompt() - - def _load_prompt(self): - with open(self.prompt_file, "r") as f: - return f.read() - - def __call__(self, images: ScreenImage, *args, **kwargs): - return self.generate(images, *args, **kwargs) - - def generate(self, images: ScreenImage, *, test=False, infomation=None, **kwargs) -> Response: - if test: - return Response(success=True, content="This is a test response.", full_log={}) - return self._generate(images, infomation=infomation, test=test, **kwargs) - - def check(self, images: ScreenImage, question, answer, criteria, subtask, *, infomation=None, test=False, **kwargs) -> Response: - if test: - return Response(success=True, content="This is a test response.", full_log={}) - return self._check(images, question, answer, criteria, subtask, infomation=infomation, **kwargs) - - @abstractmethod - def _generate(self, images: ScreenImage, **kwargs) -> Response: - raise NotImplementedError("_generate not implemented") - - @abstractmethod - def _check(self, images: ScreenImage, question, answer, criteria, subtask, **kwargs) -> Response: - raise NotImplementedError("_check not implemented") - - def format_response(self, response: Response) -> QAData: - if response.success: - qa_data = self._format_response(response) - if qa_data is None: - return [] - else: - return qa_data - else: - return [] - - @abstractmethod - def _format_response(self, response: Response) -> str: - raise NotImplementedError("format_response not implemented") - - @abstractmethod - def format_checked_response(self, response: Response) -> QAData: - raise NotImplementedError("format_checked_response not implemented") - - def get_name(self) -> str: - raise NotImplementedError("get_name not implemented") - - -class GeneratorRegistry: - def __init__(self): - self.generators = {} - - def register_generator(self, name): - def decorator(cls): - self.generators[name] = cls - cls.get_name = lambda self: name - return cls - - return decorator - - def get_generator(self, name) -> QAGenerator: - return self.generators[name] - - def get_random_generator(self) -> QAGenerator: - return random.choice(list(self.generators.values())) - - -generator_registry = GeneratorRegistry() - - -def register_generator(name): - return generator_registry.register_generator(name) - - -def get_generator(name, *args, **kwargs) -> QAGenerator: - return generator_registry.get_generator(name)(*args, **kwargs) - - -def get_random_generator(*args, **kwargs) -> QAGenerator: - return generator_registry.get_random_generator()(*args, **kwargs) - - -@register_generator("gpt4v") -class GPT4Generator(QAGenerator): - def __init__( - self, - prompt_file: str = os.path.join(os.path.dirname(__file__), "prompt.md"), - model="gpt-4o", - example_path=os.path.join(os.path.dirname(__file__), "example"), - check_prompt=os.path.join(os.path.dirname(__file__), "check_prompt.md"), - ): - super().__init__(prompt_file) - API_KEY = os.getenv("OPENAI_API_KEY") - if not API_KEY: - raise ValueError("OPENAI_API_KEY environment variable not set.") - self.api_key = API_KEY - self.client = openai.OpenAI(api_key=self.api_key) - self.model = model - if os.path.exists(example_path): - self.example_path = example_path - else: - self.example_path = None - if 
os.path.exists(check_prompt): - with open(check_prompt, "r") as f: - self.check_prompt = f.read() - else: - self.check_prompt = check_prompt - - def format_messages(self, images: List[Image.Image], example_image: Image.Image, example_output: str, infomation: ImageInfomation): - example = [ - { - "type": "text", - "text": "Here are few examples about the task and the expected output format. You can take these as examples to generate your own questions.", - }, - format_gpt4v_images(example_image), - { - "type": "text", - "text": example_output, - }, - ] - content = example + [format_gpt4v_images(image) for image in images] - if infomation: - content.append({"type": "text", "text": str(infomation)}) - content.append( - { - "type": "text", - "text": "Please generate high-quality questions focusing on the information displayed within this webpage. Your response should be in the format of the examples provided above and in JSON format.", - }, - ) - messages = [ - { - "role": "system", - "content": self.prompt, - }, - { - "role": "user", - "content": content, - }, - ] - return messages - - def _generate(self, images: ScreenImage, *, max_tokens=4096, max_try_times=5, infomation=None, **kwargs): - if self.example_path: - example_image_path = os.path.join(self.example_path, "example_website.png") - example_output_path = os.path.join(self.example_path, "example_output.json") - example_image = Image.open(example_image_path) - with open(example_output_path, "r") as f: - example_output = json.load(f) - example_output = json.dumps(example_output, indent=4) - - messages = self.format_messages(images.images, example_image, example_output, infomation) - - return gpt4v_generate_response(client=self.client, model=self.model, messages=messages, max_tokens=max_tokens, max_try_times=max_try_times, json_format=True, **kwargs) - - def get_check_prompt(self, question: str, answer: str, criteria, subtask, images: List[Image.Image], infomation: ImageInfomation = None): - messages = [ - { - "role": "system", - "content": self.check_prompt, - } - ] - content = [] - for img in images: - content.append(format_gpt4v_images(img)) - content.append( - { - "type": "text", - "text": f"Question: {question}\nQuestioner's Answer: {answer}\nCriteria: {criteria}\nSubtask: {subtask}", - }, - ) - if infomation: - content.append( - { - "type": "text", - "text": str(infomation), - }, - ) - content.append( - { - "type": "text", - "text": "Please rephrase or rewrite the high-quality question focusing on the information displayed within this webpage. 
Your response should be in the format of the examples provided above and in JSON format.", - }, - ) - messages.append( - { - "role": "user", - "content": content, - } - ) - return messages - - def _check(self, images: ScreenImage, question, answer, criteria, subtask, *, max_tokens=4096, max_try_times=5, **kwargs): - messages = self.get_check_prompt(question, answer, criteria, subtask, images.images) - return gpt4v_generate_response(client=self.client, model=self.model, messages=messages, max_tokens=max_tokens, max_try_times=max_try_times, json_format=True, **kwargs) - - def format_checked_response(self, response: Response): - data = json.loads(response.content) - question = data.get("question", None) - answer = data.get("answer", None) - criteria = data.get("criteria", None) - subtask = data.get("subtask", None) - return QAData(question=question, answer=answer, criteria=criteria, subtask=subtask) - - def _format_response(self, response: Response) -> List[QAData]: - try: - qa_data = [] - content = json.loads(response.content) - for subtask, messages in content.items(): - subtask = subtask.lower() - for message in messages: - message_lower = {k.lower(): v for k, v in message.items()} - try: - question = message_lower["question"] - answer = message_lower["answer"] - criteria = message_lower["criteria"] - qa_data.append(QAData(question=question, answer=answer, criteria=criteria, subtask=subtask)) - except KeyError as e: - logger.error(f"Failed to parse response: {message}") - logger.error(f"Error: {e}") - return qa_data - except Exception as e: - logger.error(f"Failed to format response: {e}") - return [] - - -@register_generator("gemini") -class GeminiGenerator(QAGenerator): - def __init__( - self, - prompt_file: str = os.path.join(os.path.dirname(__file__), "prompt.md"), - model="gemini-1.5-pro-latest", - example_path=os.path.join(os.path.dirname(__file__), "example"), - check_prompt=os.path.join(os.path.dirname(__file__), "check_prompt.md"), - ): - super().__init__(prompt_file) - GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") - if not GOOGLE_API_KEY: - raise ValueError("GOOGLE_API_KEY environment variable not set.") - genai.configure(api_key=GOOGLE_API_KEY) - - self.api_key = GOOGLE_API_KEY - self.model = model - self.client = genai.GenerativeModel(model) - if os.path.exists(example_path): - self.example_path = example_path - else: - self.example_path = None - if os.path.exists(check_prompt): - with open(check_prompt, "r") as f: - self.check_prompt = f.read() - else: - self.check_prompt = check_prompt - - def format_messages(self, images: List[Image.Image], example_image: Image.Image, example_output: str, infomation: ImageInfomation = None): - content = [self.prompt, "\n", "Example Image:", example_image, "\n", "Example Output:", example_output] - content.extend(images) - content.append(str(infomation)) - content.append("Please generate high-quality questions focusing on the information displayed within this webpage. 
Your response should be in the format of the examples provided above and in JSON format.") - return content - - def _generate(self, images: ScreenImage, *, max_tokens=4096, max_try_times=5, infomation: ImageInfomation = None, **kwargs): - if self.example_path: - example_image_path = os.path.join(self.example_path, "example_website.png") - example_output_path = os.path.join(self.example_path, "example_output.json") - example_image = Image.open(example_image_path) - with open(example_output_path, "r") as f: - # example_output = f.read() - example_output = json.load(f) - example_output = json.dumps(example_output, indent=4) - - messages = self.format_messages(images.images, example_image, example_output, infomation) - - return gemini_generate_response(self.client, messages, max_tokens, max_try_times, **kwargs) - - def get_check_prompt(self, question: str, answer: str, criteria, subtask, images: List[Image.Image], infomation: ImageInfomation = None): - content = [self.check_prompt] + images - content.append(f"Question: {question}\nQuestioner's Answer: {answer}\nCriteria: {criteria}, Subtask: {subtask}") - content.append("Your response should be strictly in the below format:\n\nQuestion: \nAnswer: \nCriteria: \nSubtask: ") - if infomation: - content.append(str(infomation)) - return content - - def _check(self, images: ScreenImage, question, answer, criteria, subtask, *, max_tokens=4096, max_try_times=5, infomation: ImageInfomation = None, **kwargs): - messages = self.get_check_prompt(question, answer, criteria, subtask, images.images, infomation) - return gemini_generate_response(self.client, messages, max_tokens, max_try_times, **kwargs) - - def format_checked_response(self, response: Response): - # Extract the question, answer, and subtask from the normalized content - question_match = re.search(r"question:\s*(.*?)\nAnswer:", response.content, re.IGNORECASE | re.DOTALL) - answer_match = re.search(r"answer:\s*(.*?)\nCriteria", response.content, re.IGNORECASE | re.DOTALL) - criteria_match = re.search(r"criteria:\s*(.*?)\n(Subtask:|$)", response.content, re.IGNORECASE | re.DOTALL) - subtask_match = re.search(r"subtask:\s*(.*)", response.content, re.IGNORECASE) - - question = answer = subtask = None - - if question_match: - # Extract the matched groups - question = question_match.group(1).strip() - if answer_match: - answer = answer_match.group(1).strip() - if criteria_match: - criteria = criteria_match.group(1).strip() - if subtask_match: - subtask = subtask_match.group(1).strip() - - return QAData(question=question, answer=answer, criteria=criteria, subtask=subtask) - - def _format_response(self, response: Response) -> List[QAData]: - try: - qa_data = [] - content = json.loads(response.content) - for subtask, message in content.items(): - subtask = subtask.lower() - message_lower = {k.lower(): v for k, v in message.items()} - try: - question = message_lower["question"] - answer = message_lower["answer"] - qa_data.append(QAData(question=question, answer=answer, subtask=subtask)) - except KeyError as e: - logger.error(f"Failed to parse response: {message}") - logger.error(f"Error: {e}") - return qa_data - except Exception as e: - logger.error(f"Failed to format response: {e}") - return [] - - -@register_generator("claude") -class ClaudeGenerator(QAGenerator): - def __init__( - self, - prompt_file: str = os.path.join(os.path.dirname(__file__), "prompt.md"), - model="claude-3-5-sonnet-20240620", - example_path=os.path.join(os.path.dirname(__file__), "example"), - 
check_prompt=os.path.join(os.path.dirname(__file__), "check_prompt.md"), - ): - super().__init__(prompt_file) - API_KEY = os.getenv("ANTHROPIC_API_KEY") - if not API_KEY: - raise ValueError("ANTHROPIC_API_KEY environment variable not set.") - self.api_key = API_KEY - self.client = anthropic.Anthropic(api_key=self.api_key) - self.model = model - if os.path.exists(example_path): - self.example_path = example_path - else: - self.example_path = None - if os.path.exists(check_prompt): - with open(check_prompt, "r") as f: - self.check_prompt = f.read() - else: - self.check_prompt = check_prompt - - def format_messages(self, images: List[Image.Image], example_image: Image.Image, example_output: str, infomation: ImageInfomation): - example = [ - { - "type": "text", - "text": "Here are few examples about the task and the expected output format. You can take these as examples to generate your own questions.", - }, - format_claude_images(example_image), - { - "type": "text", - "text": example_output, - }, - ] - content = example + [format_claude_images(image) for image in images] - if infomation: - content.append({"type": "text", "text": str(infomation)}) - content.append( - { - "type": "text", - "text": "Please generate high-quality questions focusing on the information displayed within this webpage. Ensure your response adheres to the examples provided above and is structured in JSON format, incorporating regular expressions to validate the format.", - }, - ) - messages = [ - { - "role": "user", - "content": content, - }, - ] - return messages - - def _generate(self, images: ScreenImage, *, max_tokens=4096, max_try_times=5, infomation=None, **kwargs): - if self.example_path: - example_image_path = os.path.join(self.example_path, "example_website.png") - example_output_path = os.path.join(self.example_path, "example_output.json") - example_image = Image.open(example_image_path) - with open(example_output_path, "r") as f: - # example_output = f.read() - example_output = json.load(f) - example_output = json.dumps(example_output, indent=4) - - messages = self.format_messages(images.images, example_image, example_output, infomation) - - return claude_generate_response(client=self.client, model=self.model, messages=messages, max_tokens=max_tokens, max_try_times=max_try_times, json_format=True, system=self.prompt, **kwargs) - - def get_check_prompt(self, question: str, answer: str, criteria, subtask, images: List[Image.Image], infomation: ImageInfomation = None): - messages = [ - { - "role": "system", - "content": self.check_prompt, - } - ] - content = [] - for img in images: - content.append(format_claude_images(img)) - content.append( - { - "type": "text", - "text": f"Question: {question}\nQuestioner's Answer: {answer}\nCriteria: {criteria}\nSubtask: {subtask}", - }, - ) - if infomation: - content.append( - { - "type": "text", - "text": str(infomation), - }, - ) - content.append( - { - "type": "text", - "text": "Please rephrase or rewrite the high-quality question focusing on the information displayed within this webpage. 
Your response should be in the format of the examples provided above and in JSON format.", - }, - ) - messages.append( - { - "role": "user", - "content": content, - } - ) - return messages - - def _check(self, images: ScreenImage, question, answer, criteria, subtask, *, max_tokens=4096, max_try_times=5, **kwargs): - messages = self.get_check_prompt(question, answer, criteria, subtask, images.images) - return claude_generate_response(client=self.client, model=self.model, messages=messages, max_tokens=max_tokens, max_try_times=max_try_times, json_format=True, **kwargs) - - def format_checked_response(self, response: Response): - data = json.loads(response.content) - question = data.get("question", None) - answer = data.get("answer", None) - criteria = data.get("criteria", None) - subtask = data.get("subtask", None) - return QAData(question=question, answer=answer, criteria=criteria, subtask=subtask) - - def _format_response(self, response: Response) -> List[QAData]: - try: - qa_data = [] - content = json.loads(response.content) - for subtask, messages in content.items(): - subtask = subtask.lower() - for message in messages: - message_lower = {k.lower(): v for k, v in message.items()} - try: - question = message_lower["question"] - answer = message_lower["answer"] - criteria = message_lower["criteria"] - qa_data.append(QAData(question=question, answer=answer, criteria=criteria, subtask=subtask)) - except KeyError as e: - logger.error(f"Failed to parse response: {message}") - logger.error(f"Error: {e}") - return qa_data - except Exception as e: - logger.error(f"Failed to format response: {e}") - return [] +import base64 +import io +import json +import logging +import os +import random +import re +from abc import ABC, abstractmethod +from time import sleep +from typing import List + +import anthropic +import google.generativeai as genai +import openai +from live_bench.data_generator.response import Response +from live_bench.data_generator.utils.claude import ( + claude_generate_response, + format_claude_images, +) +from live_bench.data_generator.utils.extract_infomation import ( + ImageInfomation, + InfomationExtractor, +) +from live_bench.data_generator.utils.gemini import gemini_generate_response +from live_bench.data_generator.utils.gpt4v import ( + format_gpt4v_images, + gpt4v_generate_response, +) +from live_bench.screen_shoter import ScreenImage +from PIL import Image + +logger = logging.getLogger("lmms-eval") + +SUBTASKS = {"Basic Understanding", "Contextual Analysis", "Deeper Implications", "Broader Implications", "Further Insights"} + + +class QAData(object): + def __init__(self, question: str = None, answer: str = None, criteria: str = None, subtask: str = None): + self.question = question + self.answer = answer + self.criteria = criteria + self.subtask = subtask + + def parse_subtask(subtask: str) -> str: + subtask = subtask.strip().lower() + for valid_subtask in SUBTASKS: + if valid_subtask.lower() in subtask.lower(): + return valid_subtask + return "Unknown Subtask" + + def set_subtask(self, subtask: str): + """ + Set the subtask for the QAData instance after parsing it. + + Args: + subtask (str): The subtask string to be set. 
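+
+        Note:
+            The value is normalized through parse_subtask: the input is matched
+            case-insensitively against the names in SUBTASKS (e.g. a string
+            containing "deeper implications" maps to "Deeper Implications"),
+            and anything unmatched falls back to "Unknown Subtask".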
+ """ + self.subtask = self.parse_subtask(subtask) + + def to_dict(self): + return {"question": self.question, "answer": self.answer} + + +class QAGenerator(ABC): + def __init__(self, prompt_file: str = os.path.join(os.path.dirname(__file__), "prompt.md")): + self.prompt_file = prompt_file + self.prompt = self._load_prompt() + + def _load_prompt(self): + with open(self.prompt_file, "r") as f: + return f.read() + + def __call__(self, images: ScreenImage, *args, **kwargs): + return self.generate(images, *args, **kwargs) + + def generate(self, images: ScreenImage, *, test=False, infomation=None, **kwargs) -> Response: + if test: + return Response(success=True, content="This is a test response.", full_log={}) + return self._generate(images, infomation=infomation, test=test, **kwargs) + + def check(self, images: ScreenImage, question, answer, criteria, subtask, *, infomation=None, test=False, **kwargs) -> Response: + if test: + return Response(success=True, content="This is a test response.", full_log={}) + return self._check(images, question, answer, criteria, subtask, infomation=infomation, **kwargs) + + @abstractmethod + def _generate(self, images: ScreenImage, **kwargs) -> Response: + raise NotImplementedError("_generate not implemented") + + @abstractmethod + def _check(self, images: ScreenImage, question, answer, criteria, subtask, **kwargs) -> Response: + raise NotImplementedError("_check not implemented") + + def format_response(self, response: Response) -> QAData: + if response.success: + qa_data = self._format_response(response) + if qa_data is None: + return [] + else: + return qa_data + else: + return [] + + @abstractmethod + def _format_response(self, response: Response) -> str: + raise NotImplementedError("format_response not implemented") + + @abstractmethod + def format_checked_response(self, response: Response) -> QAData: + raise NotImplementedError("format_checked_response not implemented") + + def get_name(self) -> str: + raise NotImplementedError("get_name not implemented") + + +class GeneratorRegistry: + def __init__(self): + self.generators = {} + + def register_generator(self, name): + def decorator(cls): + self.generators[name] = cls + cls.get_name = lambda self: name + return cls + + return decorator + + def get_generator(self, name) -> QAGenerator: + return self.generators[name] + + def get_random_generator(self) -> QAGenerator: + return random.choice(list(self.generators.values())) + + +generator_registry = GeneratorRegistry() + + +def register_generator(name): + return generator_registry.register_generator(name) + + +def get_generator(name, *args, **kwargs) -> QAGenerator: + return generator_registry.get_generator(name)(*args, **kwargs) + + +def get_random_generator(*args, **kwargs) -> QAGenerator: + return generator_registry.get_random_generator()(*args, **kwargs) + + +@register_generator("gpt4v") +class GPT4Generator(QAGenerator): + def __init__( + self, + prompt_file: str = os.path.join(os.path.dirname(__file__), "prompt.md"), + model="gpt-4o", + example_path=os.path.join(os.path.dirname(__file__), "example"), + check_prompt=os.path.join(os.path.dirname(__file__), "check_prompt.md"), + ): + super().__init__(prompt_file) + API_KEY = os.getenv("OPENAI_API_KEY") + if not API_KEY: + raise ValueError("OPENAI_API_KEY environment variable not set.") + self.api_key = API_KEY + self.client = openai.OpenAI(api_key=self.api_key) + self.model = model + if os.path.exists(example_path): + self.example_path = example_path + else: + self.example_path = None + if 
os.path.exists(check_prompt): + with open(check_prompt, "r") as f: + self.check_prompt = f.read() + else: + self.check_prompt = check_prompt + + def format_messages(self, images: List[Image.Image], example_image: Image.Image, example_output: str, infomation: ImageInfomation): + example = [ + { + "type": "text", + "text": "Here are few examples about the task and the expected output format. You can take these as examples to generate your own questions.", + }, + format_gpt4v_images(example_image), + { + "type": "text", + "text": example_output, + }, + ] + content = example + [format_gpt4v_images(image) for image in images] + if infomation: + content.append({"type": "text", "text": str(infomation)}) + content.append( + { + "type": "text", + "text": "Please generate high-quality questions focusing on the information displayed within this webpage. Your response should be in the format of the examples provided above and in JSON format.", + }, + ) + messages = [ + { + "role": "system", + "content": self.prompt, + }, + { + "role": "user", + "content": content, + }, + ] + return messages + + def _generate(self, images: ScreenImage, *, max_tokens=4096, max_try_times=5, infomation=None, **kwargs): + if self.example_path: + example_image_path = os.path.join(self.example_path, "example_website.png") + example_output_path = os.path.join(self.example_path, "example_output.json") + example_image = Image.open(example_image_path) + with open(example_output_path, "r") as f: + example_output = json.load(f) + example_output = json.dumps(example_output, indent=4) + + messages = self.format_messages(images.images, example_image, example_output, infomation) + + return gpt4v_generate_response(client=self.client, model=self.model, messages=messages, max_tokens=max_tokens, max_try_times=max_try_times, json_format=True, **kwargs) + + def get_check_prompt(self, question: str, answer: str, criteria, subtask, images: List[Image.Image], infomation: ImageInfomation = None): + messages = [ + { + "role": "system", + "content": self.check_prompt, + } + ] + content = [] + for img in images: + content.append(format_gpt4v_images(img)) + content.append( + { + "type": "text", + "text": f"Question: {question}\nQuestioner's Answer: {answer}\nCriteria: {criteria}\nSubtask: {subtask}", + }, + ) + if infomation: + content.append( + { + "type": "text", + "text": str(infomation), + }, + ) + content.append( + { + "type": "text", + "text": "Please rephrase or rewrite the high-quality question focusing on the information displayed within this webpage. 
Your response should be in the format of the examples provided above and in JSON format.", + }, + ) + messages.append( + { + "role": "user", + "content": content, + } + ) + return messages + + def _check(self, images: ScreenImage, question, answer, criteria, subtask, *, max_tokens=4096, max_try_times=5, **kwargs): + messages = self.get_check_prompt(question, answer, criteria, subtask, images.images) + return gpt4v_generate_response(client=self.client, model=self.model, messages=messages, max_tokens=max_tokens, max_try_times=max_try_times, json_format=True, **kwargs) + + def format_checked_response(self, response: Response): + data = json.loads(response.content) + question = data.get("question", None) + answer = data.get("answer", None) + criteria = data.get("criteria", None) + subtask = data.get("subtask", None) + return QAData(question=question, answer=answer, criteria=criteria, subtask=subtask) + + def _format_response(self, response: Response) -> List[QAData]: + try: + qa_data = [] + content = json.loads(response.content) + for subtask, messages in content.items(): + subtask = subtask.lower() + for message in messages: + message_lower = {k.lower(): v for k, v in message.items()} + try: + question = message_lower["question"] + answer = message_lower["answer"] + criteria = message_lower["criteria"] + qa_data.append(QAData(question=question, answer=answer, criteria=criteria, subtask=subtask)) + except KeyError as e: + logger.error(f"Failed to parse response: {message}") + logger.error(f"Error: {e}") + return qa_data + except Exception as e: + logger.error(f"Failed to format response: {e}") + return [] + + +@register_generator("gemini") +class GeminiGenerator(QAGenerator): + def __init__( + self, + prompt_file: str = os.path.join(os.path.dirname(__file__), "prompt.md"), + model="gemini-1.5-pro-latest", + example_path=os.path.join(os.path.dirname(__file__), "example"), + check_prompt=os.path.join(os.path.dirname(__file__), "check_prompt.md"), + ): + super().__init__(prompt_file) + GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") + if not GOOGLE_API_KEY: + raise ValueError("GOOGLE_API_KEY environment variable not set.") + genai.configure(api_key=GOOGLE_API_KEY) + + self.api_key = GOOGLE_API_KEY + self.model = model + self.client = genai.GenerativeModel(model) + if os.path.exists(example_path): + self.example_path = example_path + else: + self.example_path = None + if os.path.exists(check_prompt): + with open(check_prompt, "r") as f: + self.check_prompt = f.read() + else: + self.check_prompt = check_prompt + + def format_messages(self, images: List[Image.Image], example_image: Image.Image, example_output: str, infomation: ImageInfomation = None): + content = [self.prompt, "\n", "Example Image:", example_image, "\n", "Example Output:", example_output] + content.extend(images) + content.append(str(infomation)) + content.append("Please generate high-quality questions focusing on the information displayed within this webpage. 
Your response should be in the format of the examples provided above and in JSON format.") + return content + + def _generate(self, images: ScreenImage, *, max_tokens=4096, max_try_times=5, infomation: ImageInfomation = None, **kwargs): + if self.example_path: + example_image_path = os.path.join(self.example_path, "example_website.png") + example_output_path = os.path.join(self.example_path, "example_output.json") + example_image = Image.open(example_image_path) + with open(example_output_path, "r") as f: + # example_output = f.read() + example_output = json.load(f) + example_output = json.dumps(example_output, indent=4) + + messages = self.format_messages(images.images, example_image, example_output, infomation) + + return gemini_generate_response(self.client, messages, max_tokens, max_try_times, **kwargs) + + def get_check_prompt(self, question: str, answer: str, criteria, subtask, images: List[Image.Image], infomation: ImageInfomation = None): + content = [self.check_prompt] + images + content.append(f"Question: {question}\nQuestioner's Answer: {answer}\nCriteria: {criteria}, Subtask: {subtask}") + content.append("Your response should be strictly in the below format:\n\nQuestion: \nAnswer: \nCriteria: \nSubtask: ") + if infomation: + content.append(str(infomation)) + return content + + def _check(self, images: ScreenImage, question, answer, criteria, subtask, *, max_tokens=4096, max_try_times=5, infomation: ImageInfomation = None, **kwargs): + messages = self.get_check_prompt(question, answer, criteria, subtask, images.images, infomation) + return gemini_generate_response(self.client, messages, max_tokens, max_try_times, **kwargs) + + def format_checked_response(self, response: Response): + # Extract the question, answer, and subtask from the normalized content + question_match = re.search(r"question:\s*(.*?)\nAnswer:", response.content, re.IGNORECASE | re.DOTALL) + answer_match = re.search(r"answer:\s*(.*?)\nCriteria", response.content, re.IGNORECASE | re.DOTALL) + criteria_match = re.search(r"criteria:\s*(.*?)\n(Subtask:|$)", response.content, re.IGNORECASE | re.DOTALL) + subtask_match = re.search(r"subtask:\s*(.*)", response.content, re.IGNORECASE) + + question = answer = subtask = None + + if question_match: + # Extract the matched groups + question = question_match.group(1).strip() + if answer_match: + answer = answer_match.group(1).strip() + if criteria_match: + criteria = criteria_match.group(1).strip() + if subtask_match: + subtask = subtask_match.group(1).strip() + + return QAData(question=question, answer=answer, criteria=criteria, subtask=subtask) + + def _format_response(self, response: Response) -> List[QAData]: + try: + qa_data = [] + content = json.loads(response.content) + for subtask, message in content.items(): + subtask = subtask.lower() + message_lower = {k.lower(): v for k, v in message.items()} + try: + question = message_lower["question"] + answer = message_lower["answer"] + qa_data.append(QAData(question=question, answer=answer, subtask=subtask)) + except KeyError as e: + logger.error(f"Failed to parse response: {message}") + logger.error(f"Error: {e}") + return qa_data + except Exception as e: + logger.error(f"Failed to format response: {e}") + return [] + + +@register_generator("claude") +class ClaudeGenerator(QAGenerator): + def __init__( + self, + prompt_file: str = os.path.join(os.path.dirname(__file__), "prompt.md"), + model="claude-3-5-sonnet-20240620", + example_path=os.path.join(os.path.dirname(__file__), "example"), + 
check_prompt=os.path.join(os.path.dirname(__file__), "check_prompt.md"), + ): + super().__init__(prompt_file) + API_KEY = os.getenv("ANTHROPIC_API_KEY") + if not API_KEY: + raise ValueError("ANTHROPIC_API_KEY environment variable not set.") + self.api_key = API_KEY + self.client = anthropic.Anthropic(api_key=self.api_key) + self.model = model + if os.path.exists(example_path): + self.example_path = example_path + else: + self.example_path = None + if os.path.exists(check_prompt): + with open(check_prompt, "r") as f: + self.check_prompt = f.read() + else: + self.check_prompt = check_prompt + + def format_messages(self, images: List[Image.Image], example_image: Image.Image, example_output: str, infomation: ImageInfomation): + example = [ + { + "type": "text", + "text": "Here are few examples about the task and the expected output format. You can take these as examples to generate your own questions.", + }, + format_claude_images(example_image), + { + "type": "text", + "text": example_output, + }, + ] + content = example + [format_claude_images(image) for image in images] + if infomation: + content.append({"type": "text", "text": str(infomation)}) + content.append( + { + "type": "text", + "text": "Please generate high-quality questions focusing on the information displayed within this webpage. Ensure your response adheres to the examples provided above and is structured in JSON format, incorporating regular expressions to validate the format.", + }, + ) + messages = [ + { + "role": "user", + "content": content, + }, + ] + return messages + + def _generate(self, images: ScreenImage, *, max_tokens=4096, max_try_times=5, infomation=None, **kwargs): + if self.example_path: + example_image_path = os.path.join(self.example_path, "example_website.png") + example_output_path = os.path.join(self.example_path, "example_output.json") + example_image = Image.open(example_image_path) + with open(example_output_path, "r") as f: + # example_output = f.read() + example_output = json.load(f) + example_output = json.dumps(example_output, indent=4) + + messages = self.format_messages(images.images, example_image, example_output, infomation) + + return claude_generate_response(client=self.client, model=self.model, messages=messages, max_tokens=max_tokens, max_try_times=max_try_times, json_format=True, system=self.prompt, **kwargs) + + def get_check_prompt(self, question: str, answer: str, criteria, subtask, images: List[Image.Image], infomation: ImageInfomation = None): + messages = [ + { + "role": "system", + "content": self.check_prompt, + } + ] + content = [] + for img in images: + content.append(format_claude_images(img)) + content.append( + { + "type": "text", + "text": f"Question: {question}\nQuestioner's Answer: {answer}\nCriteria: {criteria}\nSubtask: {subtask}", + }, + ) + if infomation: + content.append( + { + "type": "text", + "text": str(infomation), + }, + ) + content.append( + { + "type": "text", + "text": "Please rephrase or rewrite the high-quality question focusing on the information displayed within this webpage. 
Your response should be in the format of the examples provided above and in JSON format.", + }, + ) + messages.append( + { + "role": "user", + "content": content, + } + ) + return messages + + def _check(self, images: ScreenImage, question, answer, criteria, subtask, *, max_tokens=4096, max_try_times=5, **kwargs): + messages = self.get_check_prompt(question, answer, criteria, subtask, images.images) + return claude_generate_response(client=self.client, model=self.model, messages=messages, max_tokens=max_tokens, max_try_times=max_try_times, json_format=True, **kwargs) + + def format_checked_response(self, response: Response): + data = json.loads(response.content) + question = data.get("question", None) + answer = data.get("answer", None) + criteria = data.get("criteria", None) + subtask = data.get("subtask", None) + return QAData(question=question, answer=answer, criteria=criteria, subtask=subtask) + + def _format_response(self, response: Response) -> List[QAData]: + try: + qa_data = [] + content = json.loads(response.content) + for subtask, messages in content.items(): + subtask = subtask.lower() + for message in messages: + message_lower = {k.lower(): v for k, v in message.items()} + try: + question = message_lower["question"] + answer = message_lower["answer"] + criteria = message_lower["criteria"] + qa_data.append(QAData(question=question, answer=answer, criteria=criteria, subtask=subtask)) + except KeyError as e: + logger.error(f"Failed to parse response: {message}") + logger.error(f"Error: {e}") + return qa_data + except Exception as e: + logger.error(f"Failed to format response: {e}") + return [] diff --git a/tools/live_bench/live_bench/data_generator/question_finalizer.py b/tools/live_bench/live_bench/data_generator/question_finalizer.py index 5996e5cc..6aa5f69a 100644 --- a/tools/live_bench/live_bench/data_generator/question_finalizer.py +++ b/tools/live_bench/live_bench/data_generator/question_finalizer.py @@ -1,138 +1,138 @@ -import json -import os -from abc import ABC, abstractmethod -from typing import List - -import anthropic -import google.generativeai as genai -import openai -from live_bench.data_generator.qa_generator import QAData -from live_bench.data_generator.utils.claude import ( - claude_generate_response, - format_claude_images, -) -from live_bench.data_generator.utils.gemini import gemini_generate_response -from live_bench.data_generator.utils.gpt4v import ( - format_gpt4v_images, - gpt4v_generate_response, -) -from PIL import Image - -REJECT_TO_ANSWER = "Reject to answer" - - -class AnswerGetter(ABC): - @abstractmethod - def get_answer(self, question: str, images: List[Image.Image]): - raise NotImplementedError("get_answer not implemented") - - -class GPT4VAnswerGetter(AnswerGetter): - def __init__(self, model: str = "gpt-4o", api_key=None): - self.model = model - if api_key is None: - self.api_key = os.getenv("OPENAI_API_KEY", None) - else: - self.api_key = api_key - self.client = openai.OpenAI(api_key=self.api_key) - - def get_answer(self, question: str, images: List[Image.Image]): - messages = [{"role": "user", "content": format_gpt4v_images(images) + [{"type": "text", "text": question}]}] - response = gpt4v_generate_response(messages=messages, client=self.client, model=self.model) - if response.success: - return response.content - else: - return REJECT_TO_ANSWER - - -class ClaudeAnswerGetter(AnswerGetter): - def __init__(self, model: str = "claude-3-5-sonnet-20240620", api_key=None): - self.model = model - if api_key is None: - self.api_key = 
os.getenv("ANTHROPIC_API_KEY", None) - else: - self.api_key = api_key - self.client = anthropic.Anthropic(api_key=self.api_key) - - def get_answer(self, question: str, images: List[Image.Image]): - messages = [{"role": "user", "content": format_claude_images(images) + [{"type": "text", "text": question}]}] - response = claude_generate_response(self.client, self.model, messages) - if response.success: - return response.content - else: - return REJECT_TO_ANSWER - - -class GeminiAnswerGetter(AnswerGetter): - def __init__(self, model: str = "gemini-1.5-pro", api_key=None): - self.model = model - self.client = genai.GenerativeModel(model) - - def get_answer(self, question: str, images: List[Image.Image]): - response = gemini_generate_response(self.client, images + [question], max_tokens=2048) - if response.success: - return response.content - else: - return REJECT_TO_ANSWER - - -QUESTION_FINALIZER_PROMPT = """\ -You are a question setter, and your task is to modify the following question, answer, and scoring criteria to ensure: - -1. The question is clear and unambiguous. -2. The answer is correct and reasonable (although the original ground truth answer is mostly correct, it may not be perfect, and sometimes the answer maybe incorrect). -3. The scoring criteria are rational and facilitate the accurate assessment of responses. -4. The full score for the scoring criteria must be 10 points, and it must directly relate to the specific answer. -5. Except for some multiple-choice questions or other questions with only one possible answer, the scoring criteria should not be an all-or-nothing system. Partially correct answers should receive proportional points. -6. Ensure that the scoring system is flexible enough to accommodate slight variations in correct answers while still maintaining a standard for what is considered an acceptable answer. -7. Clearly define what constitutes a full score, partial score, and zero score. -8. The criteria should be as detailed as possible, even if a LLM without image understanding capabilities could still score the answer based on the criteria and ground truth answer correctly. - -Your task is to finalize these standards, thus ensuring the correctness of the answer and the rationality of the scoring criteria. - -Some tips: - -1. For some extremely hard open-ended questions where answers may vary, hitting all points perfectly may not be realistic. In such cases, you can relax the criteria slightly. For example, if there are five possible points in an answer, but answering three adequately could merit full points. An other option is to change the question to a multiple-choice / multi-select question. But remember, it only applies to extremely hard open-ended questions which are impossible to answer perfectly. -2. For some questions, changing the format might be beneficial. You can consider transforming them into different types of questions such as essay, fill-in-the-blank, multiple-choice (or multiple-select), true/false, ranking (e.g., based on time, importance, etc.), or matching questions to enhance the difficulty and rationality of the scoring criteria. 
-""" - -FINALIZER_OUTPUT_FORMAT_PROMPT = """\ -Please provide the final question, answer, and scoring criteria in the following json format: -{ - "question": "The final question", - "answer": "The final answer", - "criteria": "The final scoring criteria" -} - -One thing as a reminder is that if you want to add a new line in the json string, you should use the escape character "\\n" instead to represent the new line. -""" - - -class QuestionFinalizer(object): - def __init__(self, gpt4v_model: str = "gpt-4o", claude_model: str = "claude-3-5-sonnet-20240620", gemini_model: str = "gemini-1.5-pro"): - self.models = {"GPT4V": GPT4VAnswerGetter(gpt4v_model), "Claude": ClaudeAnswerGetter(claude_model), "Gemini": GeminiAnswerGetter(gemini_model)} - self.client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY", None)) - # self.client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY", None)) - - def finalize_question(self, question, answer, criteria, images: List[Image.Image]): - information = [f"[Original Question]\n{question}", f"[Original Answer]\n{answer}", f"[Original Criteria]\n{criteria}"] - information.append( - "Below are answers from three candidates for reference. These answers may not be correct but are reasonably credible (but mostly correct). If any candidate rejects to answer, consider whether there is an issue with the question (such as containing violent or graphic content, or having a clear political bias). If so, please make necessary modifications. For open-ended questions, also consider the reasonableness of these answers. If they are reasonable, you may need to adjust the scoring criteria or the answer itself." - ) - for model_name, model in self.models.items(): - information.append(f"[{model_name} Answer]\n{model.get_answer(question, images)}") - information.append(FINALIZER_OUTPUT_FORMAT_PROMPT) - prompt = "\n\n".join(information) - messages = [{"role": "user", "content": format_gpt4v_images(images) + [{"type": "text", "text": prompt}]}] - try: - response = gpt4v_generate_response(client=self.client, model="gpt-4o", messages=messages, system=QUESTION_FINALIZER_PROMPT) - if response.success: - data = json.loads(response.content) - return { - "question": data["question"], - "answer": data["answer"], - "criteria": data["criteria"], - } - except Exception as e: - print(f"Failed to generate response: {e}") - return None +import json +import os +from abc import ABC, abstractmethod +from typing import List + +import anthropic +import google.generativeai as genai +import openai +from live_bench.data_generator.qa_generator import QAData +from live_bench.data_generator.utils.claude import ( + claude_generate_response, + format_claude_images, +) +from live_bench.data_generator.utils.gemini import gemini_generate_response +from live_bench.data_generator.utils.gpt4v import ( + format_gpt4v_images, + gpt4v_generate_response, +) +from PIL import Image + +REJECT_TO_ANSWER = "Reject to answer" + + +class AnswerGetter(ABC): + @abstractmethod + def get_answer(self, question: str, images: List[Image.Image]): + raise NotImplementedError("get_answer not implemented") + + +class GPT4VAnswerGetter(AnswerGetter): + def __init__(self, model: str = "gpt-4o", api_key=None): + self.model = model + if api_key is None: + self.api_key = os.getenv("OPENAI_API_KEY", None) + else: + self.api_key = api_key + self.client = openai.OpenAI(api_key=self.api_key) + + def get_answer(self, question: str, images: List[Image.Image]): + messages = [{"role": "user", "content": format_gpt4v_images(images) + 
[{"type": "text", "text": question}]}] + response = gpt4v_generate_response(messages=messages, client=self.client, model=self.model) + if response.success: + return response.content + else: + return REJECT_TO_ANSWER + + +class ClaudeAnswerGetter(AnswerGetter): + def __init__(self, model: str = "claude-3-5-sonnet-20240620", api_key=None): + self.model = model + if api_key is None: + self.api_key = os.getenv("ANTHROPIC_API_KEY", None) + else: + self.api_key = api_key + self.client = anthropic.Anthropic(api_key=self.api_key) + + def get_answer(self, question: str, images: List[Image.Image]): + messages = [{"role": "user", "content": format_claude_images(images) + [{"type": "text", "text": question}]}] + response = claude_generate_response(self.client, self.model, messages) + if response.success: + return response.content + else: + return REJECT_TO_ANSWER + + +class GeminiAnswerGetter(AnswerGetter): + def __init__(self, model: str = "gemini-1.5-pro", api_key=None): + self.model = model + self.client = genai.GenerativeModel(model) + + def get_answer(self, question: str, images: List[Image.Image]): + response = gemini_generate_response(self.client, images + [question], max_tokens=2048) + if response.success: + return response.content + else: + return REJECT_TO_ANSWER + + +QUESTION_FINALIZER_PROMPT = """\ +You are a question setter, and your task is to modify the following question, answer, and scoring criteria to ensure: + +1. The question is clear and unambiguous. +2. The answer is correct and reasonable (although the original ground truth answer is mostly correct, it may not be perfect, and sometimes the answer maybe incorrect). +3. The scoring criteria are rational and facilitate the accurate assessment of responses. +4. The full score for the scoring criteria must be 10 points, and it must directly relate to the specific answer. +5. Except for some multiple-choice questions or other questions with only one possible answer, the scoring criteria should not be an all-or-nothing system. Partially correct answers should receive proportional points. +6. Ensure that the scoring system is flexible enough to accommodate slight variations in correct answers while still maintaining a standard for what is considered an acceptable answer. +7. Clearly define what constitutes a full score, partial score, and zero score. +8. The criteria should be as detailed as possible, even if a LLM without image understanding capabilities could still score the answer based on the criteria and ground truth answer correctly. + +Your task is to finalize these standards, thus ensuring the correctness of the answer and the rationality of the scoring criteria. + +Some tips: + +1. For some extremely hard open-ended questions where answers may vary, hitting all points perfectly may not be realistic. In such cases, you can relax the criteria slightly. For example, if there are five possible points in an answer, but answering three adequately could merit full points. An other option is to change the question to a multiple-choice / multi-select question. But remember, it only applies to extremely hard open-ended questions which are impossible to answer perfectly. +2. For some questions, changing the format might be beneficial. You can consider transforming them into different types of questions such as essay, fill-in-the-blank, multiple-choice (or multiple-select), true/false, ranking (e.g., based on time, importance, etc.), or matching questions to enhance the difficulty and rationality of the scoring criteria. 
+""" + +FINALIZER_OUTPUT_FORMAT_PROMPT = """\ +Please provide the final question, answer, and scoring criteria in the following json format: +{ + "question": "The final question", + "answer": "The final answer", + "criteria": "The final scoring criteria" +} + +One thing as a reminder is that if you want to add a new line in the json string, you should use the escape character "\\n" instead to represent the new line. +""" + + +class QuestionFinalizer(object): + def __init__(self, gpt4v_model: str = "gpt-4o", claude_model: str = "claude-3-5-sonnet-20240620", gemini_model: str = "gemini-1.5-pro"): + self.models = {"GPT4V": GPT4VAnswerGetter(gpt4v_model), "Claude": ClaudeAnswerGetter(claude_model), "Gemini": GeminiAnswerGetter(gemini_model)} + self.client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY", None)) + # self.client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY", None)) + + def finalize_question(self, question, answer, criteria, images: List[Image.Image]): + information = [f"[Original Question]\n{question}", f"[Original Answer]\n{answer}", f"[Original Criteria]\n{criteria}"] + information.append( + "Below are answers from three candidates for reference. These answers may not be correct but are reasonably credible (but mostly correct). If any candidate rejects to answer, consider whether there is an issue with the question (such as containing violent or graphic content, or having a clear political bias). If so, please make necessary modifications. For open-ended questions, also consider the reasonableness of these answers. If they are reasonable, you may need to adjust the scoring criteria or the answer itself." + ) + for model_name, model in self.models.items(): + information.append(f"[{model_name} Answer]\n{model.get_answer(question, images)}") + information.append(FINALIZER_OUTPUT_FORMAT_PROMPT) + prompt = "\n\n".join(information) + messages = [{"role": "user", "content": format_gpt4v_images(images) + [{"type": "text", "text": prompt}]}] + try: + response = gpt4v_generate_response(client=self.client, model="gpt-4o", messages=messages, system=QUESTION_FINALIZER_PROMPT) + if response.success: + data = json.loads(response.content) + return { + "question": data["question"], + "answer": data["answer"], + "criteria": data["criteria"], + } + except Exception as e: + print(f"Failed to generate response: {e}") + return None diff --git a/tools/live_bench/live_bench/data_generator/response.py b/tools/live_bench/live_bench/data_generator/response.py index 9eed882b..10a082d4 100644 --- a/tools/live_bench/live_bench/data_generator/response.py +++ b/tools/live_bench/live_bench/data_generator/response.py @@ -1,12 +1,12 @@ -class Response(object): - def __init__(self, success: bool, content: str, full_log: dict): - self.success = success - self.content = content - self.full_log = full_log - - def to_dict(self): - return { - "success": self.success, - "content": self.content, - "full_log": self.full_log, - } +class Response(object): + def __init__(self, success: bool, content: str, full_log: dict): + self.success = success + self.content = content + self.full_log = full_log + + def to_dict(self): + return { + "success": self.success, + "content": self.content, + "full_log": self.full_log, + } diff --git a/tools/live_bench/live_bench/data_generator/score_getter.py b/tools/live_bench/live_bench/data_generator/score_getter.py index 82950044..78ce99d0 100644 --- a/tools/live_bench/live_bench/data_generator/score_getter.py +++ b/tools/live_bench/live_bench/data_generator/score_getter.py 
@@ -1,164 +1,164 @@ -import json -import os -import random -from abc import ABC, abstractmethod -from typing import List - -import anthropic -import openai -from live_bench.data_generator.qa_generator import Response -from live_bench.data_generator.utils.claude import ( - claude_generate_response, - format_claude_images, -) -from live_bench.data_generator.utils.gpt4v import ( - format_gpt4v_images, - gpt4v_generate_response, -) -from live_bench.screen_shoter import ScreenImage -from PIL import Image - - -class Score(object): - def __init__(self, score: int, reason: str): - self.score = score - self.reason = reason - - -class ScoreGetter(ABC): - def get_name(self): - return self.name - - @abstractmethod - def get_score(self, question: str, answer: str, images: ScreenImage): - raise NotImplementedError("get_score not implemented") - - def __call__(self, question: str, answer: str, images: ScreenImage, **kwargs): - return self.get_score(question, answer, images, **kwargs) - - -class ScoreGetterRegistry: - def __init__(self): - self.score_getters = {} - - def register_score_getter(self, name): - def decorator(cls): - self.score_getters[name] = cls - cls.name = name - return cls - - return decorator - - def get_score_getter(self, name) -> ScoreGetter: - return self.score_getters[name] - - def get_random_score_getter(self) -> ScoreGetter: - return random.choice(list(self.score_getters.values())) - - -generator_registry = ScoreGetterRegistry() - - -def register_score_getter(name): - return generator_registry.register_score_getter(name) - - -def get_score_getter(name, *args, **kwargs) -> ScoreGetter: - return generator_registry.get_score_getter(name)(*args, **kwargs) - - -def get_random_score_getter(*args, **kwargs) -> ScoreGetter: - return generator_registry.get_random_score_getter()(*args, **kwargs) - - -@register_score_getter("gpt4v") -class GPT4VScoreGetter(ScoreGetter): - def __init__(self, prompt: str = os.path.join(os.path.dirname(__file__), "score_prompt.md"), model="gpt-4o", example_path=os.path.join(os.path.dirname(__file__), "example")): - super().__init__() - if os.path.exists(prompt): - with open(prompt, "r") as f: - self.prompt = f.read() - else: - self.prompt = prompt - API_KEY = os.getenv("OPENAI_API_KEY") - if not API_KEY: - raise ValueError("OPENAI_API_KEY environment variable not set.") - self.api_key = API_KEY - self.client = openai.OpenAI(api_key=self.api_key) - self.model = model - if os.path.exists(example_path) and os.path.isfile(os.path.join(example_path, "example_score_input.md")): - with open(example_path, "r") as f: - self.example = f.read() - else: - self.example = None - - def _format_prompt(self, question: str, answer: str, images: List[Image.Image]): - prompt = [{"role": "system", "content": self.prompt}] - messages = [] - for image in images: - messages.append(format_gpt4v_images(image)) - messages.append({"type": "text", "text": f"Question: {question}\nQuestioner's Answer: {answer}"}) - messages.append({"type": "text", "text": 'You should format you answer into json format like this: {"reason": "some reason", "score": 10}'}) - prompt.append({"role": "user", "content": messages}) - return prompt - - def get_score(self, question: str, answer: str, images: ScreenImage, *, max_tokens=4096, max_try_times=5, **kwargs) -> Score: - prompt = self._format_prompt(question, answer, images) - try: - response = gpt4v_generate_response(client=self.client, model=self.model, messages=prompt, max_tokens=max_tokens, max_try_times=max_try_times, json_format=True, **kwargs) - if 
response.success: - content = json.loads(response.content) - score = content.get("score", None) - reason = content.get("reason", None) - return Score(score=score, reason=reason) - else: - return Score(score=None, reason=response.content) - except Exception as e: - return Score(score=None, reason=str(e)) - - -@register_score_getter("claude") -class ClaudeScoreGetter(ScoreGetter): - def __init__(self, prompt: str = os.path.join(os.path.dirname(__file__), "score_prompt.md"), model="claude-3-5-sonnet-20240620", example_path=os.path.join(os.path.dirname(__file__), "example")): - super().__init__() - if os.path.exists(prompt): - with open(prompt, "r") as f: - self.prompt = f.read() - else: - self.prompt = prompt - API_KEY = os.getenv("ANTHROPIC_API_KEY") - if not API_KEY: - raise ValueError("ANTHROPIC_API_KEY environment variable not set.") - self.api_key = API_KEY - self.client = anthropic.Anthropic(api_key=self.api_key) - self.model = model - if os.path.exists(example_path) and os.path.isfile(os.path.join(example_path, "example_score_input.md")): - with open(example_path, "r") as f: - self.example = f.read() - else: - self.example = None - - def _format_prompt(self, question: str, answer: str, images: List[Image.Image]): - # prompt = [{"role": "system", "content": self.prompt}] - prompt = [] - messages = [] - for image in images: - messages.append(format_claude_images(image)) - messages.append({"type": "text", "text": f"Question: {question}\nQuestioner's Answer: {answer}"}) - messages.append({"type": "text", "text": 'You should format you answer into JSON format like this: { "reason": "some reason", "score": 10 }'}) - prompt.append({"role": "user", "content": messages}) - return prompt - - def get_score(self, question: str, answer: str, images: ScreenImage, *, max_tokens=4096, max_try_times=5, **kwargs) -> Score: - prompt = self._format_prompt(question, answer, images) - try: - response = claude_generate_response(client=self.client, model=self.model, messages=prompt, system=self.prompt, max_tokens=max_tokens, max_try_times=max_try_times, **kwargs) - if response.success: - content = json.loads(response.content) - score = content.get("score", None) - reason = content.get("reason", None) - return Score(score=score, reason=reason) - else: - return Score(score=None, reason=response.content) - except Exception as e: - return Score(score=None, reason=str(e)) +import json +import os +import random +from abc import ABC, abstractmethod +from typing import List + +import anthropic +import openai +from live_bench.data_generator.qa_generator import Response +from live_bench.data_generator.utils.claude import ( + claude_generate_response, + format_claude_images, +) +from live_bench.data_generator.utils.gpt4v import ( + format_gpt4v_images, + gpt4v_generate_response, +) +from live_bench.screen_shoter import ScreenImage +from PIL import Image + + +class Score(object): + def __init__(self, score: int, reason: str): + self.score = score + self.reason = reason + + +class ScoreGetter(ABC): + def get_name(self): + return self.name + + @abstractmethod + def get_score(self, question: str, answer: str, images: ScreenImage): + raise NotImplementedError("get_score not implemented") + + def __call__(self, question: str, answer: str, images: ScreenImage, **kwargs): + return self.get_score(question, answer, images, **kwargs) + + +class ScoreGetterRegistry: + def __init__(self): + self.score_getters = {} + + def register_score_getter(self, name): + def decorator(cls): + self.score_getters[name] = cls + cls.name = name + 
return cls + + return decorator + + def get_score_getter(self, name) -> ScoreGetter: + return self.score_getters[name] + + def get_random_score_getter(self) -> ScoreGetter: + return random.choice(list(self.score_getters.values())) + + +generator_registry = ScoreGetterRegistry() + + +def register_score_getter(name): + return generator_registry.register_score_getter(name) + + +def get_score_getter(name, *args, **kwargs) -> ScoreGetter: + return generator_registry.get_score_getter(name)(*args, **kwargs) + + +def get_random_score_getter(*args, **kwargs) -> ScoreGetter: + return generator_registry.get_random_score_getter()(*args, **kwargs) + + +@register_score_getter("gpt4v") +class GPT4VScoreGetter(ScoreGetter): + def __init__(self, prompt: str = os.path.join(os.path.dirname(__file__), "score_prompt.md"), model="gpt-4o", example_path=os.path.join(os.path.dirname(__file__), "example")): + super().__init__() + if os.path.exists(prompt): + with open(prompt, "r") as f: + self.prompt = f.read() + else: + self.prompt = prompt + API_KEY = os.getenv("OPENAI_API_KEY") + if not API_KEY: + raise ValueError("OPENAI_API_KEY environment variable not set.") + self.api_key = API_KEY + self.client = openai.OpenAI(api_key=self.api_key) + self.model = model + if os.path.exists(example_path) and os.path.isfile(os.path.join(example_path, "example_score_input.md")): + with open(example_path, "r") as f: + self.example = f.read() + else: + self.example = None + + def _format_prompt(self, question: str, answer: str, images: List[Image.Image]): + prompt = [{"role": "system", "content": self.prompt}] + messages = [] + for image in images: + messages.append(format_gpt4v_images(image)) + messages.append({"type": "text", "text": f"Question: {question}\nQuestioner's Answer: {answer}"}) + messages.append({"type": "text", "text": 'You should format you answer into json format like this: {"reason": "some reason", "score": 10}'}) + prompt.append({"role": "user", "content": messages}) + return prompt + + def get_score(self, question: str, answer: str, images: ScreenImage, *, max_tokens=4096, max_try_times=5, **kwargs) -> Score: + prompt = self._format_prompt(question, answer, images) + try: + response = gpt4v_generate_response(client=self.client, model=self.model, messages=prompt, max_tokens=max_tokens, max_try_times=max_try_times, json_format=True, **kwargs) + if response.success: + content = json.loads(response.content) + score = content.get("score", None) + reason = content.get("reason", None) + return Score(score=score, reason=reason) + else: + return Score(score=None, reason=response.content) + except Exception as e: + return Score(score=None, reason=str(e)) + + +@register_score_getter("claude") +class ClaudeScoreGetter(ScoreGetter): + def __init__(self, prompt: str = os.path.join(os.path.dirname(__file__), "score_prompt.md"), model="claude-3-5-sonnet-20240620", example_path=os.path.join(os.path.dirname(__file__), "example")): + super().__init__() + if os.path.exists(prompt): + with open(prompt, "r") as f: + self.prompt = f.read() + else: + self.prompt = prompt + API_KEY = os.getenv("ANTHROPIC_API_KEY") + if not API_KEY: + raise ValueError("ANTHROPIC_API_KEY environment variable not set.") + self.api_key = API_KEY + self.client = anthropic.Anthropic(api_key=self.api_key) + self.model = model + if os.path.exists(example_path) and os.path.isfile(os.path.join(example_path, "example_score_input.md")): + with open(example_path, "r") as f: + self.example = f.read() + else: + self.example = None + + def _format_prompt(self, 
question: str, answer: str, images: List[Image.Image]): + # prompt = [{"role": "system", "content": self.prompt}] + prompt = [] + messages = [] + for image in images: + messages.append(format_claude_images(image)) + messages.append({"type": "text", "text": f"Question: {question}\nQuestioner's Answer: {answer}"}) + messages.append({"type": "text", "text": 'You should format you answer into JSON format like this: { "reason": "some reason", "score": 10 }'}) + prompt.append({"role": "user", "content": messages}) + return prompt + + def get_score(self, question: str, answer: str, images: ScreenImage, *, max_tokens=4096, max_try_times=5, **kwargs) -> Score: + prompt = self._format_prompt(question, answer, images) + try: + response = claude_generate_response(client=self.client, model=self.model, messages=prompt, system=self.prompt, max_tokens=max_tokens, max_try_times=max_try_times, **kwargs) + if response.success: + content = json.loads(response.content) + score = content.get("score", None) + reason = content.get("reason", None) + return Score(score=score, reason=reason) + else: + return Score(score=None, reason=response.content) + except Exception as e: + return Score(score=None, reason=str(e)) diff --git a/tools/live_bench/live_bench/data_generator/score_prompt.md b/tools/live_bench/live_bench/data_generator/score_prompt.md index 1de806a7..a427e8e6 100644 --- a/tools/live_bench/live_bench/data_generator/score_prompt.md +++ b/tools/live_bench/live_bench/data_generator/score_prompt.md @@ -1,20 +1,20 @@ -Based on the multi-round Q&A regarding the image, please evaluate each question and answer from the multi-round Q&A based on the image for their authenticity (whether the information can be directly obtained from the image or reasonably inferred) and logical coherence. For each Q&A pair, provide a rating from 1 to 10, where 1 indicates very poor and 10 indicates excellent. Additionally, please provide a brief explanation for each rating. - -Here are the criteria for evaluating the Q&A pairs: - -### 1. Authenticity (5 points) -- **5 Points**: The information is directly observable in the image or can be reasonably inferred with strong evidence. -- **3 Points**: The information has a plausible connection to the image but requires assumptions that are not strongly supported by the image. -- **1 Point**: The information cannot be observed or reasonably inferred from the image; it seems unrelated or speculative. - -### 2. Logical Coherence (3 points) -- **3 Points**: The answer logically follows from the question and maintains consistency with the image context. -- **2 Points**: There are minor logical gaps or inconsistencies in the answer relative to the question. -- **1 Point**: The answer is logically inconsistent or contradictory to the question or image context. - -### 3. Clarity and Precision (2 points) -- **2 Points**: The question and answer are clearly articulated and precisely address specifics of the image. -- **1 Point**: The question or answer is somewhat vague or overly general, lacking specific details related to the image. -- **0 Points**: The question or answer is unclear or too ambiguous to determine relevance to the image. - -Each Q&A pair can score a maximum of 10 points. The sum of points from these three categories determines the final score for each pair. Provide a brief explanation for each rating, focusing on how well the Q&A adheres to these criteria. 
+Based on the multi-round Q&A regarding the image, please evaluate each question and answer from the multi-round Q&A based on the image for their authenticity (whether the information can be directly obtained from the image or reasonably inferred) and logical coherence. For each Q&A pair, provide a rating from 1 to 10, where 1 indicates very poor and 10 indicates excellent. Additionally, please provide a brief explanation for each rating. + +Here are the criteria for evaluating the Q&A pairs: + +### 1. Authenticity (5 points) +- **5 Points**: The information is directly observable in the image or can be reasonably inferred with strong evidence. +- **3 Points**: The information has a plausible connection to the image but requires assumptions that are not strongly supported by the image. +- **1 Point**: The information cannot be observed or reasonably inferred from the image; it seems unrelated or speculative. + +### 2. Logical Coherence (3 points) +- **3 Points**: The answer logically follows from the question and maintains consistency with the image context. +- **2 Points**: There are minor logical gaps or inconsistencies in the answer relative to the question. +- **1 Point**: The answer is logically inconsistent or contradictory to the question or image context. + +### 3. Clarity and Precision (2 points) +- **2 Points**: The question and answer are clearly articulated and precisely address specifics of the image. +- **1 Point**: The question or answer is somewhat vague or overly general, lacking specific details related to the image. +- **0 Points**: The question or answer is unclear or too ambiguous to determine relevance to the image. + +Each Q&A pair can score a maximum of 10 points. The sum of points from these three categories determines the final score for each pair. Provide a brief explanation for each rating, focusing on how well the Q&A adheres to these criteria. 
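For reference, the rubric above is what the registered score getters feed to the judging model. A minimal sketch of scoring a single Q&A pair outside the full LiveBench pipeline might look like the following; the screenshot path, question, and answer are placeholders, it assumes ANTHROPIC_API_KEY is set in the environment, and note that although get_score is annotated with ScreenImage, _format_prompt simply iterates over the argument, so a plain list of PIL images is passed here:

    from PIL import Image
    from live_bench.data_generator.score_getter import get_score_getter

    # Placeholder inputs: one screenshot and a Q&A pair produced earlier by a generator.
    images = [Image.open("example_website.png")]
    question = "What is the headline of the top story on the page?"
    answer = "Hunter Biden found guilty on all counts in gun trial"

    # "claude" and "gpt4v" are the names registered via @register_score_getter above.
    scorer = get_score_getter("claude")
    result = scorer.get_score(question, answer, images)
    print(result.score, result.reason)

The returned Score carries both the numeric rating (out of 10, per the rubric) and the judge's explanation; a failed or malformed judge response surfaces as score=None with the raw content in reason.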
diff --git a/tools/live_bench/live_bench/data_generator/utils/claude.py b/tools/live_bench/live_bench/data_generator/utils/claude.py index e1b5e6dc..47b4b867 100644 --- a/tools/live_bench/live_bench/data_generator/utils/claude.py +++ b/tools/live_bench/live_bench/data_generator/utils/claude.py @@ -1,69 +1,69 @@ -import base64 -import io -import logging -from time import sleep -from typing import List, Union - -import anthropic -from live_bench.data_generator.response import Response -from PIL import Image - -logger = logging.getLogger("lmms-eval") - - -def format_claude_images(image: Union[Image.Image, List[Image.Image]]): - if isinstance(image, list): - return [format_claude_images(img) for img in image] - buffered = io.BytesIO() - image.save(buffered, format="PNG") - img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") - return { - "type": "image", - "source": { - "type": "base64", - "media_type": "image/png", - "data": img_str, - }, - } - - -def claude_generate_response(client: anthropic.Anthropic, model, messages, max_tokens: int = 4096, max_try_times=5, system=None, json_format="auto", test=False, **kwargs): - if json_format == "auto": - json_format = False - for message in messages: - if message.get("role") == "user": - contents = message.get("content", []) - if isinstance(contents, str): - if "json" in contents: - json_format = True - break - else: - for content in contents: - if content.get("type", None) == "text" and "json" in content.get("text", ""): - json_format = True - break - - if json_format: - messages.append({"role": "assistant", "content": "{"}) - - def _generate(): - if system: - return client.messages.create(model=model, messages=messages, max_tokens=max_tokens, system=system, **kwargs) - else: - return client.messages.create(model=model, messages=messages, max_tokens=max_tokens, **kwargs) - - for times in range(max_try_times): - try: - response = _generate() - response_str = response.content[0].text - if json_format: - response_str = "{" + response_str - return Response(success=True, content=response_str, full_log={"input": messages, "output": response.to_dict()}) - except Exception as e: - logger.error(f"Failed to generate response: {e}") - if times < max_try_times - 1: - logger.info(f"Retrying... 
({times+1}/{max_try_times})") - sleep(3) - else: - logger.error("Failed to generate response after retrying.") - return Response(success=False, content=str(e), full_log={"input": messages, "output": None}) +import base64 +import io +import logging +from time import sleep +from typing import List, Union + +import anthropic +from live_bench.data_generator.response import Response +from PIL import Image + +logger = logging.getLogger("lmms-eval") + + +def format_claude_images(image: Union[Image.Image, List[Image.Image]]): + if isinstance(image, list): + return [format_claude_images(img) for img in image] + buffered = io.BytesIO() + image.save(buffered, format="PNG") + img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") + return { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": img_str, + }, + } + + +def claude_generate_response(client: anthropic.Anthropic, model, messages, max_tokens: int = 4096, max_try_times=5, system=None, json_format="auto", test=False, **kwargs): + if json_format == "auto": + json_format = False + for message in messages: + if message.get("role") == "user": + contents = message.get("content", []) + if isinstance(contents, str): + if "json" in contents: + json_format = True + break + else: + for content in contents: + if content.get("type", None) == "text" and "json" in content.get("text", ""): + json_format = True + break + + if json_format: + messages.append({"role": "assistant", "content": "{"}) + + def _generate(): + if system: + return client.messages.create(model=model, messages=messages, max_tokens=max_tokens, system=system, **kwargs) + else: + return client.messages.create(model=model, messages=messages, max_tokens=max_tokens, **kwargs) + + for times in range(max_try_times): + try: + response = _generate() + response_str = response.content[0].text + if json_format: + response_str = "{" + response_str + return Response(success=True, content=response_str, full_log={"input": messages, "output": response.to_dict()}) + except Exception as e: + logger.error(f"Failed to generate response: {e}") + if times < max_try_times - 1: + logger.info(f"Retrying... ({times+1}/{max_try_times})") + sleep(3) + else: + logger.error("Failed to generate response after retrying.") + return Response(success=False, content=str(e), full_log={"input": messages, "output": None}) diff --git a/tools/live_bench/live_bench/data_generator/utils/extract_infomation.py b/tools/live_bench/live_bench/data_generator/utils/extract_infomation.py index a079f42e..b46f3af5 100644 --- a/tools/live_bench/live_bench/data_generator/utils/extract_infomation.py +++ b/tools/live_bench/live_bench/data_generator/utils/extract_infomation.py @@ -1,131 +1,131 @@ -import logging -import os - -import anthropic -import openai -import requests -from bs4 import BeautifulSoup -from live_bench.data_generator.response import Response -from live_bench.data_generator.utils.claude import ( - claude_generate_response, - format_claude_images, -) -from live_bench.data_generator.utils.gpt4v import ( - format_gpt4v_images, - gpt4v_generate_response, -) -from live_bench.screen_shoter import ScreenImage -from live_bench.websites import Website - -logger = logging.getLogger("live-bench") - - -EXTRACT_TEXT_PROMPT: str = """\ -These are the images of the website that we have captured. Please extract the text from the website. -You should extract the text from the website as detailed as possible. -Only output the text extracted from the website, do not include any other information. 
-""" - -FIND_IMAGES_FEATURES_PROMPT: str = """\ -This is a screenshot from a news website. Your task is to identify the meaningful images in this screenshot and extract relevant information about these images, such as the environment depicted, the actions and expressions of the people, and the connection between these images and the corresponding text. You need to think deeply about these images and provide as much detailed and useful information as possible. -""" - -THINK_DIFFERENTLY_PROMPT: str = """\ -What makes this website different from other websites? What is special about its news? Since it is a news website, where is the "new" reflected? Do not give a generalized answer; you need to provide detailed answers based on the specific content of each news article and the accompanying illustrations. -""" - - -class ImageInfomation(object): - def __init__(self, text=None, image_features=None, differnt_points=None): - self.text = text - self.image_features = image_features - self.differnt_points = differnt_points - - def to_dict(self): - res = {} - if self.text: - res["Text Extracted in the HTML"] = self.text - if self.image_features: - res["Image Features"] = self.image_features - if self.differnt_points: - res["Interesting Points"] = self.differnt_points - return res - - def __str__(self): - return self.get_info() - - def get_info(self): - res_list = [f"## {key}\n\n{value}" for key, value in self.to_dict().items()] - if res_list: - return "**Here is something you can take as reference.**\n\n" + "\n\n".join(res_list) - else: - return "" - - -class InfomationExtractor(object): - def __init__(self, model="claude-3-5-sonnet-20240620", openai_api_key=None, anthropic_api_key=None): - if not anthropic_api_key: - anthropic_api_key = os.getenv("ANTHROPIC_API_KEY", None) - if not openai_api_key: - openai_api_key = os.getenv("OPENAI_API_KEY", None) - if "gpt" in model: - self.client = openai.OpenAI(api_key=openai_api_key) - self.generate_response = gpt4v_generate_response - self.format_images = format_gpt4v_images - elif "claude" in model: - self.client = anthropic.Anthropic(api_key=anthropic_api_key) - self.generate_response = claude_generate_response - self.format_images = format_claude_images - self.model = model - - def extract_text_from_html(self, url): - response = requests.get(url) - soup = BeautifulSoup(response.text, "html.parser") - text = "\n".join(soup.stripped_strings) - return text - - def extract_text_from_html_from_gpt(self, screen_image: ScreenImage, **kwargs) -> Response: - website: Website = screen_image.website - if website.url: - url = website.url - text = self.extract_text_from_html(url) - text = f"Below is the text extracted from the website {url} for you to take reference:\n{text}" - else: - text = "" - text = f"{EXTRACT_TEXT_PROMPT}\n{text}" - messages = [ - { - "role": "user", - "content": [{"type": "text", "text": text}] + self.format_images(screen_image.images), - } - ] - response = self.generate_response(messages=messages, model=self.model, client=self.client, json_format=False, **kwargs) - return response - - def extract_infomation(self, screen_image: ScreenImage, **kwargs) -> ImageInfomation: - ocrs = self.extract_text_from_html_from_gpt(screen_image) - infomation = ImageInfomation() - if ocrs.success: - ocrs = f"Below is the text extracted from the website for you to take reference:\n{ocrs.content}" - infomation.text = ocrs - else: - ocrs = "" - messages = [ - { - "role": "user", - "content": [{"type": "text", "text": f"{FIND_IMAGES_FEATURES_PROMPT}\n{ocrs}"}] + 
self.format_images(screen_image.images), - } - ] - response = self.generate_response(messages=messages, model=self.model, client=self.client, json_format=False, **kwargs) - if response.success: - infomation.image_features = response.content - messages = [ - { - "role": "user", - "content": [{"type": "text", "text": f"{THINK_DIFFERENTLY_PROMPT}\n\n{str(infomation)}"}] + self.format_images(screen_image.images), - } - ] - response = self.generate_response(messages=messages, model=self.model, client=self.client, json_format=False, **kwargs) - if response.success: - infomation.differnt_points = response.content - return infomation +import logging +import os + +import anthropic +import openai +import requests +from bs4 import BeautifulSoup +from live_bench.data_generator.response import Response +from live_bench.data_generator.utils.claude import ( + claude_generate_response, + format_claude_images, +) +from live_bench.data_generator.utils.gpt4v import ( + format_gpt4v_images, + gpt4v_generate_response, +) +from live_bench.screen_shoter import ScreenImage +from live_bench.websites import Website + +logger = logging.getLogger("live-bench") + + +EXTRACT_TEXT_PROMPT: str = """\ +These are the images of the website that we have captured. Please extract the text from the website. +You should extract the text from the website as detailed as possible. +Only output the text extracted from the website, do not include any other information. +""" + +FIND_IMAGES_FEATURES_PROMPT: str = """\ +This is a screenshot from a news website. Your task is to identify the meaningful images in this screenshot and extract relevant information about these images, such as the environment depicted, the actions and expressions of the people, and the connection between these images and the corresponding text. You need to think deeply about these images and provide as much detailed and useful information as possible. +""" + +THINK_DIFFERENTLY_PROMPT: str = """\ +What makes this website different from other websites? What is special about its news? Since it is a news website, where is the "new" reflected? Do not give a generalized answer; you need to provide detailed answers based on the specific content of each news article and the accompanying illustrations. 
+""" + + +class ImageInfomation(object): + def __init__(self, text=None, image_features=None, differnt_points=None): + self.text = text + self.image_features = image_features + self.differnt_points = differnt_points + + def to_dict(self): + res = {} + if self.text: + res["Text Extracted in the HTML"] = self.text + if self.image_features: + res["Image Features"] = self.image_features + if self.differnt_points: + res["Interesting Points"] = self.differnt_points + return res + + def __str__(self): + return self.get_info() + + def get_info(self): + res_list = [f"## {key}\n\n{value}" for key, value in self.to_dict().items()] + if res_list: + return "**Here is something you can take as reference.**\n\n" + "\n\n".join(res_list) + else: + return "" + + +class InfomationExtractor(object): + def __init__(self, model="claude-3-5-sonnet-20240620", openai_api_key=None, anthropic_api_key=None): + if not anthropic_api_key: + anthropic_api_key = os.getenv("ANTHROPIC_API_KEY", None) + if not openai_api_key: + openai_api_key = os.getenv("OPENAI_API_KEY", None) + if "gpt" in model: + self.client = openai.OpenAI(api_key=openai_api_key) + self.generate_response = gpt4v_generate_response + self.format_images = format_gpt4v_images + elif "claude" in model: + self.client = anthropic.Anthropic(api_key=anthropic_api_key) + self.generate_response = claude_generate_response + self.format_images = format_claude_images + self.model = model + + def extract_text_from_html(self, url): + response = requests.get(url) + soup = BeautifulSoup(response.text, "html.parser") + text = "\n".join(soup.stripped_strings) + return text + + def extract_text_from_html_from_gpt(self, screen_image: ScreenImage, **kwargs) -> Response: + website: Website = screen_image.website + if website.url: + url = website.url + text = self.extract_text_from_html(url) + text = f"Below is the text extracted from the website {url} for you to take reference:\n{text}" + else: + text = "" + text = f"{EXTRACT_TEXT_PROMPT}\n{text}" + messages = [ + { + "role": "user", + "content": [{"type": "text", "text": text}] + self.format_images(screen_image.images), + } + ] + response = self.generate_response(messages=messages, model=self.model, client=self.client, json_format=False, **kwargs) + return response + + def extract_infomation(self, screen_image: ScreenImage, **kwargs) -> ImageInfomation: + ocrs = self.extract_text_from_html_from_gpt(screen_image) + infomation = ImageInfomation() + if ocrs.success: + ocrs = f"Below is the text extracted from the website for you to take reference:\n{ocrs.content}" + infomation.text = ocrs + else: + ocrs = "" + messages = [ + { + "role": "user", + "content": [{"type": "text", "text": f"{FIND_IMAGES_FEATURES_PROMPT}\n{ocrs}"}] + self.format_images(screen_image.images), + } + ] + response = self.generate_response(messages=messages, model=self.model, client=self.client, json_format=False, **kwargs) + if response.success: + infomation.image_features = response.content + messages = [ + { + "role": "user", + "content": [{"type": "text", "text": f"{THINK_DIFFERENTLY_PROMPT}\n\n{str(infomation)}"}] + self.format_images(screen_image.images), + } + ] + response = self.generate_response(messages=messages, model=self.model, client=self.client, json_format=False, **kwargs) + if response.success: + infomation.differnt_points = response.content + return infomation diff --git a/tools/live_bench/live_bench/data_generator/utils/gemini.py b/tools/live_bench/live_bench/data_generator/utils/gemini.py index 26987bb8..d57495cd 100644 --- 
a/tools/live_bench/live_bench/data_generator/utils/gemini.py +++ b/tools/live_bench/live_bench/data_generator/utils/gemini.py @@ -1,38 +1,38 @@ -import logging -from time import sleep - -import google.generativeai as genai -from google.generativeai.types import HarmBlockThreshold, HarmCategory -from live_bench.data_generator.response import Response - -logger = logging.getLogger("lmms-eval") - - -def gemini_generate_response(client: genai.GenerativeModel, messages, max_tokens: int, max_try_times: int = 5, **kwargs): - generation_config = genai.GenerationConfig(max_output_tokens=max_tokens) - - def _generate(): - return client.generate_content( - messages, - generation_config=generation_config, - safety_settings={ - HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, - }, - **kwargs, - ) - - for times in range(max_try_times): - try: - response = _generate() - return Response(success=True, content=response.text, full_log={"input": messages, "output": response}) - except Exception as e: - logger.error(f"Failed to generate response: {e}") - if times < max_try_times - 1: - logger.info(f"Retrying... ({times+1}/{max_try_times})") - sleep(3) - else: - logger.error("Failed to generate response after retrying.") - return Response(success=False, content=str(e), full_log={"input": messages, "output": None}) +import logging +from time import sleep + +import google.generativeai as genai +from google.generativeai.types import HarmBlockThreshold, HarmCategory +from live_bench.data_generator.response import Response + +logger = logging.getLogger("lmms-eval") + + +def gemini_generate_response(client: genai.GenerativeModel, messages, max_tokens: int, max_try_times: int = 5, **kwargs): + generation_config = genai.GenerationConfig(max_output_tokens=max_tokens) + + def _generate(): + return client.generate_content( + messages, + generation_config=generation_config, + safety_settings={ + HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, + }, + **kwargs, + ) + + for times in range(max_try_times): + try: + response = _generate() + return Response(success=True, content=response.text, full_log={"input": messages, "output": response}) + except Exception as e: + logger.error(f"Failed to generate response: {e}") + if times < max_try_times - 1: + logger.info(f"Retrying... 
({times+1}/{max_try_times})") + sleep(3) + else: + logger.error("Failed to generate response after retrying.") + return Response(success=False, content=str(e), full_log={"input": messages, "output": None}) diff --git a/tools/live_bench/live_bench/data_generator/utils/gpt4v.py b/tools/live_bench/live_bench/data_generator/utils/gpt4v.py index 5dc1b65b..6ee5452d 100644 --- a/tools/live_bench/live_bench/data_generator/utils/gpt4v.py +++ b/tools/live_bench/live_bench/data_generator/utils/gpt4v.py @@ -1,76 +1,76 @@ -import base64 -import io -import logging -from time import sleep - -from live_bench.data_generator.response import Response -from PIL import Image - -logger = logging.getLogger("lmms-eval") - - -def format_gpt4v_images(image): - if isinstance(image, Image.Image): - buffered = io.BytesIO() - image.save(buffered, format="PNG") - img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") - return { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{img_str}", - }, - } - elif isinstance(image, list): - return [format_gpt4v_images(img) for img in image] - else: - raise ValueError(f"Unsupported image type: {type(image)}") - - -def format_printable_messages(messages): - for message in messages: - if "content" in message and isinstance(message["content"], list): - for content in message["content"]: - if "type" in content and content["type"] == "image_url": - content["image_url"]["url"] = "" - return messages - - -def gpt4v_generate_response(messages, *, client=None, model="gpt-4o", max_tokens: int = 4096, max_try_times: int = 5, json_format="auto", test=False, system=None, **kwargs) -> Response: - if system: - messages = [{"role": "system", "content": system}] + messages - - if json_format == "auto": - json_format = False - for message in messages: - if message.get("role") == "user": - contents = message.get("content", []) - if isinstance(contents, str): - if "json" in contents: - json_format = True - break - else: - for content in contents: - if content.get("type", None) == "text" and "json" in content.get("text", ""): - json_format = True - break - - if json_format: - response_format = {"type": "json_object"} - else: - response_format = None - - def _generate(): - return client.chat.completions.create(model=model, messages=messages, max_tokens=max_tokens, response_format=response_format, **kwargs) - - for times in range(max_try_times): - try: - response = _generate() - return Response(success=True, content=response.choices[0].message.content, full_log={"input": format_printable_messages(messages), "output": response.choices[0].message.content}) - except Exception as e: - logger.error(f"Failed to generate response: {e}") - if times < max_try_times - 1: - logger.info(f"Retrying... 
({times+1}/{max_try_times})") - sleep(3) - else: - logger.error("Failed to generate response after retrying.") - return Response(success=False, content=str(e), full_log={"input": format_printable_messages(messages), "output": None}) +import base64 +import io +import logging +from time import sleep + +from live_bench.data_generator.response import Response +from PIL import Image + +logger = logging.getLogger("lmms-eval") + + +def format_gpt4v_images(image): + if isinstance(image, Image.Image): + buffered = io.BytesIO() + image.save(buffered, format="PNG") + img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") + return { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{img_str}", + }, + } + elif isinstance(image, list): + return [format_gpt4v_images(img) for img in image] + else: + raise ValueError(f"Unsupported image type: {type(image)}") + + +def format_printable_messages(messages): + for message in messages: + if "content" in message and isinstance(message["content"], list): + for content in message["content"]: + if "type" in content and content["type"] == "image_url": + content["image_url"]["url"] = "" + return messages + + +def gpt4v_generate_response(messages, *, client=None, model="gpt-4o", max_tokens: int = 4096, max_try_times: int = 5, json_format="auto", test=False, system=None, **kwargs) -> Response: + if system: + messages = [{"role": "system", "content": system}] + messages + + if json_format == "auto": + json_format = False + for message in messages: + if message.get("role") == "user": + contents = message.get("content", []) + if isinstance(contents, str): + if "json" in contents: + json_format = True + break + else: + for content in contents: + if content.get("type", None) == "text" and "json" in content.get("text", ""): + json_format = True + break + + if json_format: + response_format = {"type": "json_object"} + else: + response_format = None + + def _generate(): + return client.chat.completions.create(model=model, messages=messages, max_tokens=max_tokens, response_format=response_format, **kwargs) + + for times in range(max_try_times): + try: + response = _generate() + return Response(success=True, content=response.choices[0].message.content, full_log={"input": format_printable_messages(messages), "output": response.choices[0].message.content}) + except Exception as e: + logger.error(f"Failed to generate response: {e}") + if times < max_try_times - 1: + logger.info(f"Retrying... 
({times+1}/{max_try_times})") + sleep(3) + else: + logger.error("Failed to generate response after retrying.") + return Response(success=False, content=str(e), full_log={"input": format_printable_messages(messages), "output": None}) diff --git a/tools/live_bench/live_bench/driver/.gitignore b/tools/live_bench/live_bench/driver/.gitignore index 0ef18421..0d97af22 100644 --- a/tools/live_bench/live_bench/driver/.gitignore +++ b/tools/live_bench/live_bench/driver/.gitignore @@ -1 +1 @@ -extensions/ +extensions/ diff --git a/tools/live_bench/live_bench/driver/__init__.py b/tools/live_bench/live_bench/driver/__init__.py index 5ac815d2..6853a0b5 100644 --- a/tools/live_bench/live_bench/driver/__init__.py +++ b/tools/live_bench/live_bench/driver/__init__.py @@ -1 +1 @@ -from live_bench.driver.load_driver import load_driver +from live_bench.driver.load_driver import load_driver diff --git a/tools/live_bench/live_bench/driver/load_driver.py b/tools/live_bench/live_bench/driver/load_driver.py index 544c7161..9a4fca8d 100644 --- a/tools/live_bench/live_bench/driver/load_driver.py +++ b/tools/live_bench/live_bench/driver/load_driver.py @@ -1,71 +1,71 @@ -import os -import zipfile - -import requests -import undetected_chromedriver as uc -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -from webdriver_manager.chrome import ChromeDriverManager -from webdriver_manager.core.os_manager import ChromeType - - -def load_driver( - window_size="auto", - headless=True, - driver="undetected_chromedriver", - driver_version=None, - chrome_type="CHROME", - adblock=True, - adblock_version="6.0.2-mv3", - extension_cache_dir=os.path.join(os.path.dirname(__file__), "extensions"), - *, - service=None, - additional_options=None, -): - options = Options() - if service is None: - chrome_type = chrome_type.upper() - if chrome_type == "CHROMIUM": - chrome_type = ChromeType.CHROMIUM - elif chrome_type == "CHROME": - chrome_type = ChromeType.GOOGLE - elif chrome_type == "BRAVE": - chrome_type = ChromeType.BRAVE - service = ChromeDriverManager(driver_version=driver_version, chrome_type=chrome_type).install() - if headless: - options.add_argument("--headless") - if adblock: - try: - adblock_url = f"https://code.getadblock.com/releases/adblockchrome-{adblock_version}.zip" - adblock_path = os.path.join(extension_cache_dir, f"adblockchrome-{adblock_version}") - if not os.path.isdir(adblock_path): - os.makedirs(os.path.join(adblock_path, ".."), exist_ok=True) - # Download the adblock zip file - response = requests.get(adblock_url) - with open(f"{adblock_path}.zip", "wb") as file: - file.write(response.content) - # Unzip the downloaded file - with zipfile.ZipFile(f"{adblock_path}.zip", "r") as zip_ref: - zip_ref.extractall(adblock_path) - # Remove the zip file after extraction - os.remove(f"{adblock_path}.zip") - options.add_argument(f"--load-extension={os.path.abspath(adblock_path)}") - except Exception as e: - print(f"Error loading adblock extension: {e}") - if driver == "undetected_chromedriver": - driver = uc.Chrome(headless=headless, options=options, driver_executable_path=service) - if window_size != "auto": - driver.set_window_size(*window_size) - return driver - elif driver == "chrome": - options = Options() - if additional_options is not None: - for option in additional_options: - options.add_argument(option) - service = webdriver.chrome.service.Service(service) - driver = webdriver.Chrome(service=service, options=options) - if window_size != "auto": - driver.set_window_size(*window_size) 
- return driver - else: - raise ValueError(f"Unknown driver: {driver}") +import os +import zipfile + +import requests +import undetected_chromedriver as uc +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from webdriver_manager.chrome import ChromeDriverManager +from webdriver_manager.core.os_manager import ChromeType + + +def load_driver( + window_size="auto", + headless=True, + driver="undetected_chromedriver", + driver_version=None, + chrome_type="CHROME", + adblock=True, + adblock_version="6.0.2-mv3", + extension_cache_dir=os.path.join(os.path.dirname(__file__), "extensions"), + *, + service=None, + additional_options=None, +): + options = Options() + if service is None: + chrome_type = chrome_type.upper() + if chrome_type == "CHROMIUM": + chrome_type = ChromeType.CHROMIUM + elif chrome_type == "CHROME": + chrome_type = ChromeType.GOOGLE + elif chrome_type == "BRAVE": + chrome_type = ChromeType.BRAVE + service = ChromeDriverManager(driver_version=driver_version, chrome_type=chrome_type).install() + if headless: + options.add_argument("--headless") + if adblock: + try: + adblock_url = f"https://code.getadblock.com/releases/adblockchrome-{adblock_version}.zip" + adblock_path = os.path.join(extension_cache_dir, f"adblockchrome-{adblock_version}") + if not os.path.isdir(adblock_path): + os.makedirs(os.path.join(adblock_path, ".."), exist_ok=True) + # Download the adblock zip file + response = requests.get(adblock_url) + with open(f"{adblock_path}.zip", "wb") as file: + file.write(response.content) + # Unzip the downloaded file + with zipfile.ZipFile(f"{adblock_path}.zip", "r") as zip_ref: + zip_ref.extractall(adblock_path) + # Remove the zip file after extraction + os.remove(f"{adblock_path}.zip") + options.add_argument(f"--load-extension={os.path.abspath(adblock_path)}") + except Exception as e: + print(f"Error loading adblock extension: {e}") + if driver == "undetected_chromedriver": + driver = uc.Chrome(headless=headless, options=options, driver_executable_path=service) + if window_size != "auto": + driver.set_window_size(*window_size) + return driver + elif driver == "chrome": + options = Options() + if additional_options is not None: + for option in additional_options: + options.add_argument(option) + service = webdriver.chrome.service.Service(service) + driver = webdriver.Chrome(service=service, options=options) + if window_size != "auto": + driver.set_window_size(*window_size) + return driver + else: + raise ValueError(f"Unknown driver: {driver}") diff --git a/tools/live_bench/live_bench/screen_shoter/__init__.py b/tools/live_bench/live_bench/screen_shoter/__init__.py index 757efde4..eda8e445 100644 --- a/tools/live_bench/live_bench/screen_shoter/__init__.py +++ b/tools/live_bench/live_bench/screen_shoter/__init__.py @@ -1,6 +1,6 @@ -from live_bench.screen_shoter.screen import ScreenImage -from live_bench.screen_shoter.screen_shoter import ( - ScreenShoter, - get_shoter, - register_shoter, -) +from live_bench.screen_shoter.screen import ScreenImage +from live_bench.screen_shoter.screen_shoter import ( + ScreenShoter, + get_shoter, + register_shoter, +) diff --git a/tools/live_bench/live_bench/screen_shoter/screen.py b/tools/live_bench/live_bench/screen_shoter/screen.py index a892afc5..2ecf0c48 100644 --- a/tools/live_bench/live_bench/screen_shoter/screen.py +++ b/tools/live_bench/live_bench/screen_shoter/screen.py @@ -1,29 +1,29 @@ -import base64 -import io -from typing import List, Tuple - -from live_bench.websites import Website -from PIL 
import Image - - -def image_to_base64(image: Image.Image) -> str: - buffered = io.BytesIO() - image.save(buffered, format="PNG") - return base64.b64encode(buffered.getvalue()).decode("utf-8") - - -class ScreenImage(object): - def __init__(self, images: List[Image.Image], website: Website, shoter: str, screen_size: Tuple[int, int], capture_datetime: str): - self.images = images - self.website = website - self.shoter = shoter - self.screen_size = screen_size - self.capture_datetime = capture_datetime - - def to_dict(self): - return {"images": self.images, "website": self.website.get_info(), "shoter": self.shoter, "screen_size": self.screen_size, "capture_datetime": self.capture_datetime} - - def to_output_dict(self): - output = self.to_dict() - output["images"] = [image_to_base64(image) for image in self.images] - return output +import base64 +import io +from typing import List, Tuple + +from live_bench.websites import Website +from PIL import Image + + +def image_to_base64(image: Image.Image) -> str: + buffered = io.BytesIO() + image.save(buffered, format="PNG") + return base64.b64encode(buffered.getvalue()).decode("utf-8") + + +class ScreenImage(object): + def __init__(self, images: List[Image.Image], website: Website, shoter: str, screen_size: Tuple[int, int], capture_datetime: str): + self.images = images + self.website = website + self.shoter = shoter + self.screen_size = screen_size + self.capture_datetime = capture_datetime + + def to_dict(self): + return {"images": self.images, "website": self.website.get_info(), "shoter": self.shoter, "screen_size": self.screen_size, "capture_datetime": self.capture_datetime} + + def to_output_dict(self): + output = self.to_dict() + output["images"] = [image_to_base64(image) for image in self.images] + return output diff --git a/tools/live_bench/live_bench/screen_shoter/screen_shoter.py b/tools/live_bench/live_bench/screen_shoter/screen_shoter.py index 6f94582b..a99927c9 100644 --- a/tools/live_bench/live_bench/screen_shoter/screen_shoter.py +++ b/tools/live_bench/live_bench/screen_shoter/screen_shoter.py @@ -1,141 +1,141 @@ -import io -import logging -import os -from abc import ABC, abstractmethod -from datetime import datetime -from typing import List - -from live_bench.screen_shoter.screen import ScreenImage -from live_bench.websites import Website -from PIL import Image -from selenium import webdriver - -logger = logging.getLogger("lmms-eval") - - -class ScreenShoter(ABC): - def __init__(self, screen_size=(1024, 1024)): - self.screen_size = screen_size - - def capture(self, driver: webdriver.Chrome, website: Website) -> ScreenImage: - if driver is not None: - website.visit(driver) - if self.screen_size != "auto": - driver.set_window_size(self.screen_size[0], self.screen_size[1]) - else: - driver.set_window_size(1024, 1024) - page_width = driver.execute_script("return document.body.scrollWidth") - driver.set_window_size(page_width, 1024) - # print("Screen size:", driver.get_window_size()) - images = self.get_screenshot(driver) - return ScreenImage(images, website, self.get_name(), self.screen_size, datetime.now().strftime("%Y-%m-%d %H:%M:%S")) - - def __call__(self, driver: webdriver.Chrome, website: Website) -> List[Image.Image]: - return self.capture(driver, website) - - def get_name(self) -> str: - raise NotImplementedError("get_name not implemented") - - @abstractmethod - def get_screenshot(self, driver: webdriver.Chrome) -> List[Image.Image]: - pass - - -class ScreenShoterRegistry: - def __init__(self): - self.shoters = {} - - def 
register_shoter(self, name): - def decorator(cls): - self.shoters[name] = cls - cls.get_name = lambda self: name - return cls - - return decorator - - def get_shoter(self, name) -> ScreenShoter: - return self.shoters[name] - - -shoter_registry = ScreenShoterRegistry() - - -def register_shoter(name): - return shoter_registry.register_shoter(name) - - -def get_shoter(name, *args, **kwargs) -> ScreenShoter: - return shoter_registry.get_shoter(name)(*args, **kwargs) - - -@register_shoter("human") -class HumanScreenShoter(ScreenShoter): - def __init__(self, screen_size=None): - super().__init__(screen_size) - - def capture(self, driver: webdriver.Chrome, website: Website) -> ScreenImage: - path = website.get_path() - images = [] - - def get_image(path): - try: - with open(path, "rb") as f: - image_data = f.read() - image = Image.open(io.BytesIO(image_data)) - images.append(image) - except Exception as e: - logger.error(f"Error loading image {path}: {e}") - - if os.path.isdir(path): - for root, dirs, files in os.walk(path): - for file_name in files: - get_image(os.path.join(root, file_name)) - else: - try: - get_image(path) - except Exception as e: - logger.error(f"Error loading image {path}: {e}") - if not images: - raise ValueError(f"No images found in {path}") - return ScreenImage(images, website, self.get_name(), self.screen_size, datetime.now().strftime("%Y-%m-%d %H:%M:%S")) - - def get_screenshot(self, driver: webdriver.Chrome) -> List[Image.Image]: - return [] - - -@register_shoter("single_screen") -class SingleScreenShoter(ScreenShoter): - def __init__(self, screen_size=(1024, 1024)): - super().__init__(screen_size) - - def get_screenshot(self, driver: webdriver.Chrome) -> List[Image.Image]: - screenshot = driver.get_screenshot_as_png() - return [Image.open(io.BytesIO(screenshot))] - - -@register_shoter("rolling_screen") -class RollingScreenShoter(ScreenShoter): - def __init__(self, screen_size=(1024, 1024)): - super().__init__(screen_size) - - def get_screenshot(self, driver: webdriver.Chrome) -> List[Image.Image]: - screenshots = [] - # Scroll to the top of the page before taking the first screenshot - driver.execute_script("window.scrollTo(0, 0)") - # Get the total height of the web page - total_height = driver.execute_script("return document.body.parentNode.scrollHeight") - # Get the viewport height - viewport_height = driver.execute_script("return window.innerHeight") - # Initialize the current scroll position - current_scroll_position = 0 - - # Scroll through the page and take screenshots - while current_scroll_position < total_height: - # Take screenshot and append to the list - screenshot = driver.get_screenshot_as_png() - screenshots.append(Image.open(io.BytesIO(screenshot))) - # Scroll down by the viewport height - current_scroll_position += viewport_height - driver.execute_script(f"window.scrollTo(0, {current_scroll_position})") - - return screenshots +import io +import logging +import os +from abc import ABC, abstractmethod +from datetime import datetime +from typing import List + +from live_bench.screen_shoter.screen import ScreenImage +from live_bench.websites import Website +from PIL import Image +from selenium import webdriver + +logger = logging.getLogger("lmms-eval") + + +class ScreenShoter(ABC): + def __init__(self, screen_size=(1024, 1024)): + self.screen_size = screen_size + + def capture(self, driver: webdriver.Chrome, website: Website) -> ScreenImage: + if driver is not None: + website.visit(driver) + if self.screen_size != "auto": + 
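# fixed capture size: resize the browser window to the configured (width, height) before taking the screenshot +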
driver.set_window_size(self.screen_size[0], self.screen_size[1]) + else: + driver.set_window_size(1024, 1024) + page_width = driver.execute_script("return document.body.scrollWidth") + driver.set_window_size(page_width, 1024) + # print("Screen size:", driver.get_window_size()) + images = self.get_screenshot(driver) + return ScreenImage(images, website, self.get_name(), self.screen_size, datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + + def __call__(self, driver: webdriver.Chrome, website: Website) -> List[Image.Image]: + return self.capture(driver, website) + + def get_name(self) -> str: + raise NotImplementedError("get_name not implemented") + + @abstractmethod + def get_screenshot(self, driver: webdriver.Chrome) -> List[Image.Image]: + pass + + +class ScreenShoterRegistry: + def __init__(self): + self.shoters = {} + + def register_shoter(self, name): + def decorator(cls): + self.shoters[name] = cls + cls.get_name = lambda self: name + return cls + + return decorator + + def get_shoter(self, name) -> ScreenShoter: + return self.shoters[name] + + +shoter_registry = ScreenShoterRegistry() + + +def register_shoter(name): + return shoter_registry.register_shoter(name) + + +def get_shoter(name, *args, **kwargs) -> ScreenShoter: + return shoter_registry.get_shoter(name)(*args, **kwargs) + + +@register_shoter("human") +class HumanScreenShoter(ScreenShoter): + def __init__(self, screen_size=None): + super().__init__(screen_size) + + def capture(self, driver: webdriver.Chrome, website: Website) -> ScreenImage: + path = website.get_path() + images = [] + + def get_image(path): + try: + with open(path, "rb") as f: + image_data = f.read() + image = Image.open(io.BytesIO(image_data)) + images.append(image) + except Exception as e: + logger.error(f"Error loading image {path}: {e}") + + if os.path.isdir(path): + for root, dirs, files in os.walk(path): + for file_name in files: + get_image(os.path.join(root, file_name)) + else: + try: + get_image(path) + except Exception as e: + logger.error(f"Error loading image {path}: {e}") + if not images: + raise ValueError(f"No images found in {path}") + return ScreenImage(images, website, self.get_name(), self.screen_size, datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + + def get_screenshot(self, driver: webdriver.Chrome) -> List[Image.Image]: + return [] + + +@register_shoter("single_screen") +class SingleScreenShoter(ScreenShoter): + def __init__(self, screen_size=(1024, 1024)): + super().__init__(screen_size) + + def get_screenshot(self, driver: webdriver.Chrome) -> List[Image.Image]: + screenshot = driver.get_screenshot_as_png() + return [Image.open(io.BytesIO(screenshot))] + + +@register_shoter("rolling_screen") +class RollingScreenShoter(ScreenShoter): + def __init__(self, screen_size=(1024, 1024)): + super().__init__(screen_size) + + def get_screenshot(self, driver: webdriver.Chrome) -> List[Image.Image]: + screenshots = [] + # Scroll to the top of the page before taking the first screenshot + driver.execute_script("window.scrollTo(0, 0)") + # Get the total height of the web page + total_height = driver.execute_script("return document.body.parentNode.scrollHeight") + # Get the viewport height + viewport_height = driver.execute_script("return window.innerHeight") + # Initialize the current scroll position + current_scroll_position = 0 + + # Scroll through the page and take screenshots + while current_scroll_position < total_height: + # Take screenshot and append to the list + screenshot = driver.get_screenshot_as_png() + 
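# get_screenshot_as_png() returns raw PNG bytes; wrapping them in io.BytesIO lets PIL decode them into an Image +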
screenshots.append(Image.open(io.BytesIO(screenshot))) + # Scroll down by the viewport height + current_scroll_position += viewport_height + driver.execute_script(f"window.scrollTo(0, {current_scroll_position})") + + return screenshots diff --git a/tools/live_bench/live_bench/view.ipynb b/tools/live_bench/live_bench/view.ipynb index 5a4addac..7f987bed 100644 --- a/tools/live_bench/live_bench/view.ipynb +++ b/tools/live_bench/live_bench/view.ipynb @@ -1,431 +1,431 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/data/pufanyi/anaconda3/anacondabin/envs/live_bench/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "Downloading readme: 100%|██████████| 2.45k/2.45k [00:00<00:00, 5.08MB/s]\n", - "Downloading data: 100%|██████████| 88.6M/88.6M [00:04<00:00, 19.6MB/s]\n", - "Generating test split: 100%|██████████| 320/320 [00:00<00:00, 384.23 examples/s]\n" - ] - } - ], - "source": [ - "from datasets import load_dataset\n", - "\n", - "dataset = load_dataset(\"lmms-lab/LiveBench\", \"2024-07\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "DatasetDict({\n", - " test: Dataset({\n", - " features: ['id', 'images', 'website', 'question', 'answer', 'criteria', 'subtask', 'data_generator', 'checker', 'date_time', 'screen_shoter', 'screen_size', 'score', 'reason', 'scorer_name'],\n", - " num_rows: 320\n", - " })\n", - "})" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idimageswebsitequestionanswercriteriasubtaskdata_generatorcheckerdate_timescreen_shoterscreen_sizescorereasonscorer_name
00[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/news'}Examine the top menu bar of the BBC website di...C) WeatherAward 10 points for selecting option C) Weathe...Basic Understandingclaudegemini2024-07-20 14:02:22single_screen(1024, 1024)10The answer is correct and can be directly veri...claude
11[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/news'}Based on the image accompanying the article ab...The image depicts a crowded airport setting, l...{'10 points': 'The answer correctly identifies...Deeper Implicationsclaudegemini2024-07-20 14:02:22single_screen(1024, 1024)10The image clearly shows that the news article ...claude
22[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/news'}Based on the photograph accompanying the headl...The photograph shows a crowded scene inside an...{'2 points': 'Interprets the overall scene as ...Contextual Analysisclaudegemini2024-07-20 14:02:22single_screen(1024, 1024)10The answer accurately describes the image and ...claude
33[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/news'}Considering the headline 'Global services slow...This incident underscores several critical iss...Award up to 10 marks: 1.5 points for discussin...Deeper ImplicationsclaudeNone2024-07-20 14:02:22single_screen(1024, 1024)10The answer provides an insightful analysis of ...claude
44[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/news'}Analyze the layout and image selection of the ...The layout of the BBC News homepage utilizes v...{'10 points': 'The answer provides a detailed ...Contextual Analysisclaudegemini2024-07-20 14:02:22single_screen(1024, 1024)8The ranking provided is generally accurate and...claude
................................................
315315[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'name': 'bloomberg.comeconomics.png'}Analyze the image accompanying the article 'Xi...The image accompanying the article suggests a ...Scoring Criteria (Total: 10 points)\\n\\n- Ident...Contextual Analysisgpt4vgemini2024-07-21 20:23:39humanNone6While the question is thoughtful and relevant ...claude
316316[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'name': 'wsj.comworldafrica.png'}Analyze the image accompanying the article 'Qu...The image contains several key elements that s...Scoring Criteria (Total: 10 points):\\n\\n1. Ide...Contextual Analysisgpt4vgemini2024-07-21 20:27:57humanNone6While the question is based on information pro...claude
317317[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'name': 'wsj.comworldafrica.png'}Based on the articles shown on the Wall Street...The two most closely related news stories are:...Scoring Criteria (Total 10 points):\\n1. Correc...Contextual Analysisgpt4vgemini2024-07-21 20:27:57humanNone7This question requires analysis and interpreta...claude
318318[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'name': 'wsj.comworldafrica.png'}Analyze the image and headlines related to the...The image and headlines present a complex narr...Scoring Criteria (Total: 10 points)\\n\\n1. Anal...Contextual Analysisgpt4vgemini2024-07-21 20:27:57humanNone6This question and answer are not directly supp...claude
319319[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'name': 'wsj.comworldafrica.png'}Analyze the image accompanying the article tit...The image contains several key visual elements...Scoring Criteria (Total 10 points):\\n\\n- Ident...Basic Understandinggpt4vgemini2024-07-21 20:27:57humanNone5This answer is not directly supported by the i...claude
\n", - "

320 rows × 15 columns

\n", - "
" - ], - "text/plain": [ - " id images \\\n", - "0 0 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", - "1 1 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", - "2 2 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", - "3 3 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", - "4 4 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", - ".. ... ... \n", - "315 315 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", - "316 316 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", - "317 317 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", - "318 318 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", - "319 319 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", - "\n", - " website \\\n", - "0 {'url': 'https://www.bbc.com/news'} \n", - "1 {'url': 'https://www.bbc.com/news'} \n", - "2 {'url': 'https://www.bbc.com/news'} \n", - "3 {'url': 'https://www.bbc.com/news'} \n", - "4 {'url': 'https://www.bbc.com/news'} \n", - ".. ... \n", - "315 {'name': 'bloomberg.comeconomics.png'} \n", - "316 {'name': 'wsj.comworldafrica.png'} \n", - "317 {'name': 'wsj.comworldafrica.png'} \n", - "318 {'name': 'wsj.comworldafrica.png'} \n", - "319 {'name': 'wsj.comworldafrica.png'} \n", - "\n", - " question \\\n", - "0 Examine the top menu bar of the BBC website di... \n", - "1 Based on the image accompanying the article ab... \n", - "2 Based on the photograph accompanying the headl... \n", - "3 Considering the headline 'Global services slow... \n", - "4 Analyze the layout and image selection of the ... \n", - ".. ... \n", - "315 Analyze the image accompanying the article 'Xi... \n", - "316 Analyze the image accompanying the article 'Qu... \n", - "317 Based on the articles shown on the Wall Street... \n", - "318 Analyze the image and headlines related to the... \n", - "319 Analyze the image accompanying the article tit... \n", - "\n", - " answer \\\n", - "0 C) Weather \n", - "1 The image depicts a crowded airport setting, l... \n", - "2 The photograph shows a crowded scene inside an... \n", - "3 This incident underscores several critical iss... \n", - "4 The layout of the BBC News homepage utilizes v... \n", - ".. ... \n", - "315 The image accompanying the article suggests a ... \n", - "316 The image contains several key elements that s... \n", - "317 The two most closely related news stories are:... \n", - "318 The image and headlines present a complex narr... \n", - "319 The image contains several key visual elements... \n", - "\n", - " criteria subtask \\\n", - "0 Award 10 points for selecting option C) Weathe... Basic Understanding \n", - "1 {'10 points': 'The answer correctly identifies... Deeper Implications \n", - "2 {'2 points': 'Interprets the overall scene as ... Contextual Analysis \n", - "3 Award up to 10 marks: 1.5 points for discussin... Deeper Implications \n", - "4 {'10 points': 'The answer provides a detailed ... Contextual Analysis \n", - ".. ... ... \n", - "315 Scoring Criteria (Total: 10 points)\\n\\n- Ident... Contextual Analysis \n", - "316 Scoring Criteria (Total: 10 points):\\n\\n1. Ide... Contextual Analysis \n", - "317 Scoring Criteria (Total 10 points):\\n1. Correc... Contextual Analysis \n", - "318 Scoring Criteria (Total: 10 points)\\n\\n1. Anal... Contextual Analysis \n", - "319 Scoring Criteria (Total 10 points):\\n\\n- Ident... 
Basic Understanding \n", - "\n", - " data_generator checker date_time screen_shoter screen_size \\\n", - "0 claude gemini 2024-07-20 14:02:22 single_screen (1024, 1024) \n", - "1 claude gemini 2024-07-20 14:02:22 single_screen (1024, 1024) \n", - "2 claude gemini 2024-07-20 14:02:22 single_screen (1024, 1024) \n", - "3 claude None 2024-07-20 14:02:22 single_screen (1024, 1024) \n", - "4 claude gemini 2024-07-20 14:02:22 single_screen (1024, 1024) \n", - ".. ... ... ... ... ... \n", - "315 gpt4v gemini 2024-07-21 20:23:39 human None \n", - "316 gpt4v gemini 2024-07-21 20:27:57 human None \n", - "317 gpt4v gemini 2024-07-21 20:27:57 human None \n", - "318 gpt4v gemini 2024-07-21 20:27:57 human None \n", - "319 gpt4v gemini 2024-07-21 20:27:57 human None \n", - "\n", - " score reason scorer_name \n", - "0 10 The answer is correct and can be directly veri... claude \n", - "1 10 The image clearly shows that the news article ... claude \n", - "2 10 The answer accurately describes the image and ... claude \n", - "3 10 The answer provides an insightful analysis of ... claude \n", - "4 8 The ranking provided is generally accurate and... claude \n", - ".. ... ... ... \n", - "315 6 While the question is thoughtful and relevant ... claude \n", - "316 6 While the question is based on information pro... claude \n", - "317 7 This question requires analysis and interpreta... claude \n", - "318 6 This question and answer are not directly supp... claude \n", - "319 5 This answer is not directly supported by the i... claude \n", - "\n", - "[320 rows x 15 columns]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dataset[\"test\"].to_pandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "live_bench", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/data/pufanyi/anaconda3/anacondabin/envs/live_bench/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Downloading readme: 100%|██████████| 2.45k/2.45k [00:00<00:00, 5.08MB/s]\n", + "Downloading data: 100%|██████████| 88.6M/88.6M [00:04<00:00, 19.6MB/s]\n", + "Generating test split: 100%|██████████| 320/320 [00:00<00:00, 384.23 examples/s]\n" + ] + } + ], + "source": [ + "from datasets import load_dataset\n", + "\n", + "dataset = load_dataset(\"lmms-lab/LiveBench\", \"2024-07\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " test: Dataset({\n", + " features: ['id', 'images', 'website', 'question', 'answer', 'criteria', 'subtask', 'data_generator', 'checker', 'date_time', 'screen_shoter', 'screen_size', 'score', 'reason', 'scorer_name'],\n", + " num_rows: 320\n", + " })\n", + "})" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idimageswebsitequestionanswercriteriasubtaskdata_generatorcheckerdate_timescreen_shoterscreen_sizescorereasonscorer_name
00[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/news'}Examine the top menu bar of the BBC website di...C) WeatherAward 10 points for selecting option C) Weathe...Basic Understandingclaudegemini2024-07-20 14:02:22single_screen(1024, 1024)10The answer is correct and can be directly veri...claude
11[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/news'}Based on the image accompanying the article ab...The image depicts a crowded airport setting, l...{'10 points': 'The answer correctly identifies...Deeper Implicationsclaudegemini2024-07-20 14:02:22single_screen(1024, 1024)10The image clearly shows that the news article ...claude
22[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/news'}Based on the photograph accompanying the headl...The photograph shows a crowded scene inside an...{'2 points': 'Interprets the overall scene as ...Contextual Analysisclaudegemini2024-07-20 14:02:22single_screen(1024, 1024)10The answer accurately describes the image and ...claude
33[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/news'}Considering the headline 'Global services slow...This incident underscores several critical iss...Award up to 10 marks: 1.5 points for discussin...Deeper ImplicationsclaudeNone2024-07-20 14:02:22single_screen(1024, 1024)10The answer provides an insightful analysis of ...claude
44[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'url': 'https://www.bbc.com/news'}Analyze the layout and image selection of the ...The layout of the BBC News homepage utilizes v...{'10 points': 'The answer provides a detailed ...Contextual Analysisclaudegemini2024-07-20 14:02:22single_screen(1024, 1024)8The ranking provided is generally accurate and...claude
................................................
315315[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'name': 'bloomberg.comeconomics.png'}Analyze the image accompanying the article 'Xi...The image accompanying the article suggests a ...Scoring Criteria (Total: 10 points)\\n\\n- Ident...Contextual Analysisgpt4vgemini2024-07-21 20:23:39humanNone6While the question is thoughtful and relevant ...claude
316316[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'name': 'wsj.comworldafrica.png'}Analyze the image accompanying the article 'Qu...The image contains several key elements that s...Scoring Criteria (Total: 10 points):\\n\\n1. Ide...Contextual Analysisgpt4vgemini2024-07-21 20:27:57humanNone6While the question is based on information pro...claude
317317[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'name': 'wsj.comworldafrica.png'}Based on the articles shown on the Wall Street...The two most closely related news stories are:...Scoring Criteria (Total 10 points):\\n1. Correc...Contextual Analysisgpt4vgemini2024-07-21 20:27:57humanNone7This question requires analysis and interpreta...claude
318318[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'name': 'wsj.comworldafrica.png'}Analyze the image and headlines related to the...The image and headlines present a complex narr...Scoring Criteria (Total: 10 points)\\n\\n1. Anal...Contextual Analysisgpt4vgemini2024-07-21 20:27:57humanNone6This question and answer are not directly supp...claude
319319[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...{'name': 'wsj.comworldafrica.png'}Analyze the image accompanying the article tit...The image contains several key visual elements...Scoring Criteria (Total 10 points):\\n\\n- Ident...Basic Understandinggpt4vgemini2024-07-21 20:27:57humanNone5This answer is not directly supported by the i...claude
\n", + "

320 rows × 15 columns

\n", + "
" + ], + "text/plain": [ + " id images \\\n", + "0 0 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", + "1 1 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", + "2 2 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", + "3 3 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", + "4 4 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", + ".. ... ... \n", + "315 315 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", + "316 316 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", + "317 317 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", + "318 318 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", + "319 319 [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH... \n", + "\n", + " website \\\n", + "0 {'url': 'https://www.bbc.com/news'} \n", + "1 {'url': 'https://www.bbc.com/news'} \n", + "2 {'url': 'https://www.bbc.com/news'} \n", + "3 {'url': 'https://www.bbc.com/news'} \n", + "4 {'url': 'https://www.bbc.com/news'} \n", + ".. ... \n", + "315 {'name': 'bloomberg.comeconomics.png'} \n", + "316 {'name': 'wsj.comworldafrica.png'} \n", + "317 {'name': 'wsj.comworldafrica.png'} \n", + "318 {'name': 'wsj.comworldafrica.png'} \n", + "319 {'name': 'wsj.comworldafrica.png'} \n", + "\n", + " question \\\n", + "0 Examine the top menu bar of the BBC website di... \n", + "1 Based on the image accompanying the article ab... \n", + "2 Based on the photograph accompanying the headl... \n", + "3 Considering the headline 'Global services slow... \n", + "4 Analyze the layout and image selection of the ... \n", + ".. ... \n", + "315 Analyze the image accompanying the article 'Xi... \n", + "316 Analyze the image accompanying the article 'Qu... \n", + "317 Based on the articles shown on the Wall Street... \n", + "318 Analyze the image and headlines related to the... \n", + "319 Analyze the image accompanying the article tit... \n", + "\n", + " answer \\\n", + "0 C) Weather \n", + "1 The image depicts a crowded airport setting, l... \n", + "2 The photograph shows a crowded scene inside an... \n", + "3 This incident underscores several critical iss... \n", + "4 The layout of the BBC News homepage utilizes v... \n", + ".. ... \n", + "315 The image accompanying the article suggests a ... \n", + "316 The image contains several key elements that s... \n", + "317 The two most closely related news stories are:... \n", + "318 The image and headlines present a complex narr... \n", + "319 The image contains several key visual elements... \n", + "\n", + " criteria subtask \\\n", + "0 Award 10 points for selecting option C) Weathe... Basic Understanding \n", + "1 {'10 points': 'The answer correctly identifies... Deeper Implications \n", + "2 {'2 points': 'Interprets the overall scene as ... Contextual Analysis \n", + "3 Award up to 10 marks: 1.5 points for discussin... Deeper Implications \n", + "4 {'10 points': 'The answer provides a detailed ... Contextual Analysis \n", + ".. ... ... \n", + "315 Scoring Criteria (Total: 10 points)\\n\\n- Ident... Contextual Analysis \n", + "316 Scoring Criteria (Total: 10 points):\\n\\n1. Ide... Contextual Analysis \n", + "317 Scoring Criteria (Total 10 points):\\n1. Correc... Contextual Analysis \n", + "318 Scoring Criteria (Total: 10 points)\\n\\n1. Anal... Contextual Analysis \n", + "319 Scoring Criteria (Total 10 points):\\n\\n- Ident... 
Basic Understanding \n", + "\n", + " data_generator checker date_time screen_shoter screen_size \\\n", + "0 claude gemini 2024-07-20 14:02:22 single_screen (1024, 1024) \n", + "1 claude gemini 2024-07-20 14:02:22 single_screen (1024, 1024) \n", + "2 claude gemini 2024-07-20 14:02:22 single_screen (1024, 1024) \n", + "3 claude None 2024-07-20 14:02:22 single_screen (1024, 1024) \n", + "4 claude gemini 2024-07-20 14:02:22 single_screen (1024, 1024) \n", + ".. ... ... ... ... ... \n", + "315 gpt4v gemini 2024-07-21 20:23:39 human None \n", + "316 gpt4v gemini 2024-07-21 20:27:57 human None \n", + "317 gpt4v gemini 2024-07-21 20:27:57 human None \n", + "318 gpt4v gemini 2024-07-21 20:27:57 human None \n", + "319 gpt4v gemini 2024-07-21 20:27:57 human None \n", + "\n", + " score reason scorer_name \n", + "0 10 The answer is correct and can be directly veri... claude \n", + "1 10 The image clearly shows that the news article ... claude \n", + "2 10 The answer accurately describes the image and ... claude \n", + "3 10 The answer provides an insightful analysis of ... claude \n", + "4 8 The ranking provided is generally accurate and... claude \n", + ".. ... ... ... \n", + "315 6 While the question is thoughtful and relevant ... claude \n", + "316 6 While the question is based on information pro... claude \n", + "317 7 This question requires analysis and interpreta... claude \n", + "318 6 This question and answer are not directly supp... claude \n", + "319 5 This answer is not directly supported by the i... claude \n", + "\n", + "[320 rows x 15 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset[\"test\"].to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "live_bench", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tools/live_bench/live_bench/websites/__init__.py b/tools/live_bench/live_bench/websites/__init__.py index a865d16e..6aa11dc6 100644 --- a/tools/live_bench/live_bench/websites/__init__.py +++ b/tools/live_bench/live_bench/websites/__init__.py @@ -1,2 +1,2 @@ -from live_bench.websites.load_website import load_websites, load_websites_from_file -from live_bench.websites.website import Website +from live_bench.websites.load_website import load_websites, load_websites_from_file +from live_bench.websites.website import Website diff --git a/tools/live_bench/live_bench/websites/load_website.py b/tools/live_bench/live_bench/websites/load_website.py index 531cec57..d9b41748 100644 --- a/tools/live_bench/live_bench/websites/load_website.py +++ b/tools/live_bench/live_bench/websites/load_website.py @@ -1,35 +1,35 @@ -import os -from random import sample - -import yaml -from live_bench.websites.website import DefaultWebsite, HumanScreenShotWebsite, Website - - -def get_website(website_dict): - if "website_class" not in website_dict: - website_class = DefaultWebsite - else: - website_class = website_dict["website_class"] - url = website_dict["url"] - if "args" in website_dict: - return website_class(url, **website_dict["args"]) - else: - return website_class(url) - - -def 
load_websites(num_sample: int = -1): - website_list_path = os.path.join(os.path.dirname(__file__), "website_list.yaml") - with open(website_list_path, "r") as f: - website_list = yaml.full_load(f)["websites"] - if num_sample > 0: - website_list = sample(website_list, num_sample) - return [get_website(website_dict) for website_dict in website_list] - - -def load_websites_from_file(file_path): - names = os.listdir(file_path) - websites = [] - for name in names: - path = os.path.join(file_path, name) - websites.append(HumanScreenShotWebsite(path=path, name=name)) - return websites +import os +from random import sample + +import yaml +from live_bench.websites.website import DefaultWebsite, HumanScreenShotWebsite, Website + + +def get_website(website_dict): + if "website_class" not in website_dict: + website_class = DefaultWebsite + else: + website_class = website_dict["website_class"] + url = website_dict["url"] + if "args" in website_dict: + return website_class(url, **website_dict["args"]) + else: + return website_class(url) + + +def load_websites(num_sample: int = -1): + website_list_path = os.path.join(os.path.dirname(__file__), "website_list.yaml") + with open(website_list_path, "r") as f: + website_list = yaml.full_load(f)["websites"] + if num_sample > 0: + website_list = sample(website_list, num_sample) + return [get_website(website_dict) for website_dict in website_list] + + +def load_websites_from_file(file_path): + names = os.listdir(file_path) + websites = [] + for name in names: + path = os.path.join(file_path, name) + websites.append(HumanScreenShotWebsite(path=path, name=name)) + return websites diff --git a/tools/live_bench/live_bench/websites/website.py b/tools/live_bench/live_bench/websites/website.py index 7aecde84..eeb5354c 100644 --- a/tools/live_bench/live_bench/websites/website.py +++ b/tools/live_bench/live_bench/websites/website.py @@ -1,62 +1,65 @@ -import os -import time -from abc import ABC, abstractmethod - -from webdriver_manager.core.driver import Driver - - -class Website(ABC): - def __init__(self, url=None, name=None, path=None): - self.url = url - self.name = name - self.path = path - assert self.url is not None or self.path is not None, "Either url or path must be provided" - - def get_path(self): - if self.url: - return self.url - else: - return self.path - - def visit(self, driver: Driver): - self.pre_visit(driver) - driver.get(self.url) - self.post_visit(driver) - - def get_info(self): - info = {} - if self.url: - info["url"] = self.url - if self.name: - info["name"] = self.name - return info - - @abstractmethod - def pre_visit(self, driver: Driver): - raise NotImplementedError("pre_action not implemented") - - @abstractmethod - def post_visit(self, driver: Driver): - raise NotImplementedError("post_action not implemented") - - -class DefaultWebsite(Website): - def __init__(self, url, name=None): - super().__init__(url, name) - - def pre_visit(self, driver: Driver): - pass - - def post_visit(self, driver: Driver): - time.sleep(5) # Wait for 5 seconds to allow adblock to finish - - -class HumanScreenShotWebsite(Website): - def __init__(self, name=None, path=None): - super().__init__(name=name, path=path) - - def pre_visit(self, driver: Driver): - pass - - def post_visit(self, driver: Driver): - pass +import os +import time +from abc import ABC, abstractmethod + +from webdriver_manager.core.driver import Driver + + +class Website(ABC): + def __init__(self, url=None, name=None, path=None, subject=None): + self.url = url + self.name = name + self.path = path + 
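# optional subject/topic label for this website; reported by get_info() alongside url and name when set +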
self.subject = subject + assert self.url is not None or self.path is not None, "Either url or path must be provided" + + def get_path(self): + if self.url: + return self.url + else: + return self.path + + def visit(self, driver: Driver): + self.pre_visit(driver) + driver.get(self.url) + self.post_visit(driver) + + def get_info(self): + info = {} + if self.url: + info["url"] = self.url + if self.name: + info["name"] = self.name + if self.subject: + info["subject"] = self.subject + return info + + @abstractmethod + def pre_visit(self, driver: Driver): + raise NotImplementedError("pre_action not implemented") + + @abstractmethod + def post_visit(self, driver: Driver): + raise NotImplementedError("post_action not implemented") + + +class DefaultWebsite(Website): + def __init__(self, url, name=None): + super().__init__(url, name) + + def pre_visit(self, driver: Driver): + pass + + def post_visit(self, driver: Driver): + time.sleep(5) # Wait for 5 seconds to allow adblock to finish + + +class HumanScreenShotWebsite(Website): + def __init__(self, name=None, path=None): + super().__init__(name=name, path=path) + + def pre_visit(self, driver: Driver): + pass + + def post_visit(self, driver: Driver): + pass diff --git a/tools/live_bench/live_bench/websites/website_list.yaml b/tools/live_bench/live_bench/websites/website_list.yaml index c85605d7..d8938174 100644 --- a/tools/live_bench/live_bench/websites/website_list.yaml +++ b/tools/live_bench/live_bench/websites/website_list.yaml @@ -1,78 +1,78 @@ -websites: -- url: https://www.bbc.com/ - # can add below line to specify the class to use for this website - # website_class: !constructor website.DefaultWebsite - # can add args tag to specify the arguments to pass to the class constructor - # args: - # arg1: value1 - # arg2: value2 -# - url: https://www.bbc.com/news -# - url: https://www.bbc.com/sport -# - url: https://www.bbc.com/business -# - url: https://www.bbc.com/innovation -# - url: https://www.bbc.com/culture -# - url: https://www.bbc.com/travel -# - url: https://www.bbc.com/future-planet -# - url: https://edition.cnn.com/ -# - url: https://edition.cnn.com/politics -# - url: https://edition.cnn.com/entertainment -# - url: https://edition.cnn.com/style -# - url: https://www.bloomberg.com/economics -# - url: https://www.bloomberg.com/industries -# - url: https://www.bloomberg.com/technology -# - url: https://www.bloomberg.com/politics -# - url: https://www.bloomberg.com/opinion -# - url: https://www.wsj.com/ -# - url: https://www.wsj.com/world/africa?mod=nav_top_subsection -# - url: https://www.wsj.com/world/americas?mod=nav_top_subsection -# - url: https://www.wsj.com/world/asia?mod=nav_top_subsection -# - url: https://www.wsj.com/world/china?mod=nav_top_subsection -# - url: https://www.wsj.com/world/europe?mod=nav_top_subsection -# - url: https://www.wsj.com/world/middle-east?mod=nav_top_subsection -# - url: https://www.wsj.com/world/india?mod=nav_top_subsection -# - url: https://www.wsj.com/world/oceania?mod=nav_top_subsection -# - url: https://www.wsj.com/world/russia?mod=nav_top_subsection -# - url: https://www.wsj.com/world/uk?mod=nav_top_subsection -# - url: https://www.wsj.com/science?mod=nav_top_subsection -# - url: https://www.wsj.com/science/archaeology?mod=nav_top_subsection -# - url: https://www.wsj.com/science/biology?mod=nav_top_subsection -# - url: https://www.wsj.com/science/environment?mod=nav_top_subsection -# - url: https://www.wsj.com/science/physics?mod=nav_top_subsection -# - url: 
https://www.wsj.com/science/space-astronomy?mod=nav_top_subsection -# - url: https://www.wsj.com/economy/central-banking?mod=nav_top_subsection -# - url: https://www.wsj.com/economy/consumers?mod=nav_top_subsection -# - url: https://www.wsj.com/economy/housing?mod=nav_top_subsection -# - url: https://www.wsj.com/economy/jobs?mod=nav_top_subsection -# - url: https://www.wsj.com/economy/trade?mod=nav_top_subsection -# - url: https://www.wsj.com/economy/global -# - url: https://www.wsj.com/tech/ai?mod=nav_top_subsection -# - url: https://www.wsj.com/tech/biotech -# - url: https://www.wsj.com/tech/cybersecurity?mod=nav_top_subsection -# - url: https://www.wsj.com/tech/personal-tech?mod=nav_top_subsection -# - url: https://www.reuters.com/ -# - url: https://www.reuters.com/business/aerospace-defense/ -# - url: https://www.reuters.com/business/autos-transportation/ -# - url: https://www.reuters.com/business/davos/ -# - url: https://www.reuters.com/business/energy/ -# - url: https://www.reuters.com/business/environment/ -# - url: https://www.reuters.com/business/finance/ -# - url: https://www.reuters.com/business/healthcare-pharmaceuticals/ -# - url: https://www.reuters.com/business/media-telecom/ -# - url: https://www.reuters.com/business/retail-consumer/ -# - url: https://www.reuters.com/business/future-of-health/ -# - url: https://www.reuters.com/business/future-of-money/ -# - url: https://www.reuters.com/business/take-five/ -# - url: https://www.reuters.com/business/world-at-work/ -# - url: https://www.reuters.com/breakingviews/ -# - url: https://www.reuters.com/technology/ -# - url: https://www.reuters.com/technology/cybersecurity/ -# - url: https://www.reuters.com/technology/space/ -# - url: https://www.reuters.com/technology/disrupted/ -# - url: https://www.reuters.com/technology/reuters-momentum/ -# - url: https://www.reuters.com/investigations/ -# - url: https://a16z.com/news-content/#latest -# - url: https://news.ycombinator.com/ -# - url: https://www.reddit.com/?rdt=48006 -# - url: https://news.crunchbase.com/ -# - url: https://www.cctv.com/ -# - url: https://sports.cctv.com/ +websites: +- url: https://www.bbc.com/ + # can add below line to specify the class to use for this website + # website_class: !constructor website.DefaultWebsite + # can add args tag to specify the arguments to pass to the class constructor + # args: + # arg1: value1 + # arg2: value2 +# - url: https://www.bbc.com/news +# - url: https://www.bbc.com/sport +# - url: https://www.bbc.com/business +# - url: https://www.bbc.com/innovation +# - url: https://www.bbc.com/culture +# - url: https://www.bbc.com/travel +# - url: https://www.bbc.com/future-planet +# - url: https://edition.cnn.com/ +# - url: https://edition.cnn.com/politics +# - url: https://edition.cnn.com/entertainment +# - url: https://edition.cnn.com/style +# - url: https://www.bloomberg.com/economics +# - url: https://www.bloomberg.com/industries +# - url: https://www.bloomberg.com/technology +# - url: https://www.bloomberg.com/politics +# - url: https://www.bloomberg.com/opinion +# - url: https://www.wsj.com/ +# - url: https://www.wsj.com/world/africa?mod=nav_top_subsection +# - url: https://www.wsj.com/world/americas?mod=nav_top_subsection +# - url: https://www.wsj.com/world/asia?mod=nav_top_subsection +# - url: https://www.wsj.com/world/china?mod=nav_top_subsection +# - url: https://www.wsj.com/world/europe?mod=nav_top_subsection +# - url: https://www.wsj.com/world/middle-east?mod=nav_top_subsection +# - url: 
https://www.wsj.com/world/india?mod=nav_top_subsection +# - url: https://www.wsj.com/world/oceania?mod=nav_top_subsection +# - url: https://www.wsj.com/world/russia?mod=nav_top_subsection +# - url: https://www.wsj.com/world/uk?mod=nav_top_subsection +# - url: https://www.wsj.com/science?mod=nav_top_subsection +# - url: https://www.wsj.com/science/archaeology?mod=nav_top_subsection +# - url: https://www.wsj.com/science/biology?mod=nav_top_subsection +# - url: https://www.wsj.com/science/environment?mod=nav_top_subsection +# - url: https://www.wsj.com/science/physics?mod=nav_top_subsection +# - url: https://www.wsj.com/science/space-astronomy?mod=nav_top_subsection +# - url: https://www.wsj.com/economy/central-banking?mod=nav_top_subsection +# - url: https://www.wsj.com/economy/consumers?mod=nav_top_subsection +# - url: https://www.wsj.com/economy/housing?mod=nav_top_subsection +# - url: https://www.wsj.com/economy/jobs?mod=nav_top_subsection +# - url: https://www.wsj.com/economy/trade?mod=nav_top_subsection +# - url: https://www.wsj.com/economy/global +# - url: https://www.wsj.com/tech/ai?mod=nav_top_subsection +# - url: https://www.wsj.com/tech/biotech +# - url: https://www.wsj.com/tech/cybersecurity?mod=nav_top_subsection +# - url: https://www.wsj.com/tech/personal-tech?mod=nav_top_subsection +# - url: https://www.reuters.com/ +# - url: https://www.reuters.com/business/aerospace-defense/ +# - url: https://www.reuters.com/business/autos-transportation/ +# - url: https://www.reuters.com/business/davos/ +# - url: https://www.reuters.com/business/energy/ +# - url: https://www.reuters.com/business/environment/ +# - url: https://www.reuters.com/business/finance/ +# - url: https://www.reuters.com/business/healthcare-pharmaceuticals/ +# - url: https://www.reuters.com/business/media-telecom/ +# - url: https://www.reuters.com/business/retail-consumer/ +# - url: https://www.reuters.com/business/future-of-health/ +# - url: https://www.reuters.com/business/future-of-money/ +# - url: https://www.reuters.com/business/take-five/ +# - url: https://www.reuters.com/business/world-at-work/ +# - url: https://www.reuters.com/breakingviews/ +# - url: https://www.reuters.com/technology/ +# - url: https://www.reuters.com/technology/cybersecurity/ +# - url: https://www.reuters.com/technology/space/ +# - url: https://www.reuters.com/technology/disrupted/ +# - url: https://www.reuters.com/technology/reuters-momentum/ +# - url: https://www.reuters.com/investigations/ +# - url: https://a16z.com/news-content/#latest +# - url: https://news.ycombinator.com/ +# - url: https://www.reddit.com/?rdt=48006 +# - url: https://news.crunchbase.com/ +# - url: https://www.cctv.com/ +# - url: https://sports.cctv.com/ diff --git a/tools/live_bench/pyproject.toml b/tools/live_bench/pyproject.toml index 79956c40..2e78889c 100644 --- a/tools/live_bench/pyproject.toml +++ b/tools/live_bench/pyproject.toml @@ -1,47 +1,47 @@ -[tool.black] -line-length = 240 - -[build-system] -requires = ["setuptools>=42", "wheel", "setuptools_scm[tomli]>=6.3"] -build-backend = "setuptools.build_meta" - -[project] -name = "live_bench" -version = "0.0.1" -authors = [ - { name = "LMMMs-Lab Evaluation Team", email = "lmms_eval@outlook.com" }, -] -description = "Live Bench" -readme = "README.md" -classifiers = [ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", -] -requires-python = ">=3.9" -license = { text = "MIT" } -dependencies = [ - "PyYAML >= 6.0.1", - "webdriver_manager >= 4.0.1", 
- "openai >= 1.32.0", - "google-generativeai >= 0.6.0", - "datasets >= 2.19.2", - "Pillow >= 10.3.0", - "selenium >= 4.21.0", - "undetected-chromedriver >= 3.5.5", - "anthropic >= 0.28.0", - "bs4 >= 0.0.2", -] - -[tool.setuptools.packages.find] -include = ["lmms_eval*"] - -[tool.setuptools.package-data] -lmms_eval = ["**/*.yaml", "tasks/**/*"] - -[project.scripts] -lmms-eval = "lmms_eval.__main__:cli_evaluate" - -[project.urls] -Homepage = "https://lmms-lab.github.io/" -Repository = "https://github.com/EvolvingLMMs-Lab/lmms-eval" +[tool.black] +line-length = 240 + +[build-system] +requires = ["setuptools>=42", "wheel", "setuptools_scm[tomli]>=6.3"] +build-backend = "setuptools.build_meta" + +[project] +name = "live_bench" +version = "0.0.1" +authors = [ + { name = "LMMMs-Lab Evaluation Team", email = "lmms_eval@outlook.com" }, +] +description = "Live Bench" +readme = "README.md" +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", +] +requires-python = ">=3.9" +license = { text = "MIT" } +dependencies = [ + "PyYAML >= 6.0.1", + "webdriver_manager >= 4.0.1", + "openai >= 1.32.0", + "google-generativeai >= 0.6.0", + "datasets >= 2.19.2", + "Pillow >= 10.3.0", + "selenium >= 4.21.0", + "undetected-chromedriver >= 3.5.5", + "anthropic >= 0.28.0", + "bs4 >= 0.0.2", +] + +[tool.setuptools.packages.find] +include = ["lmms_eval*"] + +[tool.setuptools.package-data] +lmms_eval = ["**/*.yaml", "tasks/**/*"] + +[project.scripts] +lmms-eval = "lmms_eval.__main__:cli_evaluate" + +[project.urls] +Homepage = "https://lmms-lab.github.io/" +Repository = "https://github.com/EvolvingLMMs-Lab/lmms-eval" diff --git a/tools/live_bench/refine_all_results.py b/tools/live_bench/refine_all_results.py index 00b72f3d..438f98fe 100644 --- a/tools/live_bench/refine_all_results.py +++ b/tools/live_bench/refine_all_results.py @@ -1,35 +1,35 @@ -from datasets import Dataset, load_dataset -from live_bench.data_generator.question_finalizer import QuestionFinalizer -from tqdm import tqdm - -if __name__ == "__main__": - hf_data = load_dataset("lmms-lab/LiveBench", "2024-07", split="test") - finalizer = QuestionFinalizer() - - def load_results(): - for item in tqdm(hf_data): - try: - res = finalizer.finalize_question(question=item["question"], answer=item["answer"], criteria=item["criteria"], images=item["images"]) - final_answer = item.copy() - final_answer["question"] = res["question"] - final_answer["answer"] = res["answer"] - final_answer["criteria"] = res["criteria"] - print(item) - print(final_answer) - except Exception as e: - print(f"Error in {item['id']}: {e}") - final_answer = item - - yield final_answer - # break - - final_data = {} - for data in load_results(): - for item, value in data.items(): - if item not in final_data: - final_data[item] = [] - final_data[item].append(value) - # final_data = Dataset.from_generator(load_results) - final_data = Dataset.from_dict(final_data, features=hf_data.features) - final_data.save_to_disk("logs/2024-07-final") - final_data.push_to_hub("lmms-lab/LiveBench", "2024-07") +from datasets import Dataset, load_dataset +from live_bench.data_generator.question_finalizer import QuestionFinalizer +from tqdm import tqdm + +if __name__ == "__main__": + hf_data = load_dataset("lmms-lab/LiveBench", "2024-07", split="test") + finalizer = QuestionFinalizer() + + def load_results(): + for item in tqdm(hf_data): + try: + res = finalizer.finalize_question(question=item["question"], answer=item["answer"], 
criteria=item["criteria"], images=item["images"]) + final_answer = item.copy() + final_answer["question"] = res["question"] + final_answer["answer"] = res["answer"] + final_answer["criteria"] = res["criteria"] + print(item) + print(final_answer) + except Exception as e: + print(f"Error in {item['id']}: {e}") + final_answer = item + + yield final_answer + # break + + final_data = {} + for data in load_results(): + for item, value in data.items(): + if item not in final_data: + final_data[item] = [] + final_data[item].append(value) + # final_data = Dataset.from_generator(load_results) + final_data = Dataset.from_dict(final_data, features=hf_data.features) + final_data.save_to_disk("logs/2024-07-final") + final_data.push_to_hub("lmms-lab/LiveBench", "2024-07") diff --git a/tools/live_bench/script/README.md b/tools/live_bench/script/README.md index 49250cfc..af241a80 100644 --- a/tools/live_bench/script/README.md +++ b/tools/live_bench/script/README.md @@ -1,15 +1,15 @@ -## Usage - -### Upload Results - -```sh -python upload_results.py -f -m [-F] -``` - -`[-F]` means the script will automatically upload the results without human checking. Otherwise, the script will print the results and ask for confirmation before uploading. - -Example: - -```sh -python upload_results.py -f logs/0706_0959_model_outputs_gpt4v_model_args_c974bc -m gpt-4o -F -``` +## Usage + +### Upload Results + +```sh +python upload_results.py -f -m [-F] +``` + +`[-F]` means the script will automatically upload the results without human checking. Otherwise, the script will print the results and ask for confirmation before uploading. + +Example: + +```sh +python upload_results.py -f logs/0706_0959_model_outputs_gpt4v_model_args_c974bc -m gpt-4o -F +``` diff --git a/tools/live_bench/script/modify.ipynb b/tools/live_bench/script/modify.ipynb index 22944f1e..2231057c 100644 --- a/tools/live_bench/script/modify.ipynb +++ b/tools/live_bench/script/modify.ipynb @@ -1,461 +1,461 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading data: 100%|██████████| 4.62k/4.62k [00:01<00:00, 4.14kB/s]\n", - "Generating test split: 100%|██████████| 8/8 [00:00<00:00, 933.60 examples/s]\n" - ] - } - ], - "source": [ - "import datasets\n", - "\n", - "data = datasets.load_dataset(\"lmms-lab/LiveBenchResults\", \"2024-07\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "df = data[\"test\"].to_pandas()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "df.to_csv(\"2024-07.csv\", index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "df = pd.read_csv(\"2024-07.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Model Name | Total | Basic Understanding | Contextual Analysis | Deeper Implications | Broader Implications | Further Insights
0 | gpt-4o-mini | 86.240000 | 89.0 | 81.0 | 89.6 | 87.600000 | 84.0
1 | gemini-1.5-flash | 80.760000 | 84.4 | 69.2 | 81.7 | 90.300000 | 78.2
2 | gpt-4o | 91.300000 | 92.2 | 83.4 | 94.1 | 94.400000 | 92.4
3 | gemini-1.5-pro | 86.280000 | 90.6 | 76.6 | 85.6 | 91.600000 | 87.0
4 | llama3-llava-next-8b | 62.650602 | 60.1 | 61.4 | 74.8 | 63.673469 | 53.3
5 | llava-1.5-7b | 41.940000 | 38.6 | 34.5 | 58.8 | 42.800000 | 35.0
6 | Idefics2-8B | 25.860000 | 18.0 | 16.3 | 43.8 | 27.000000 | 24.2
7 | InternVL2-2B | 56.840000 | 65.8 | 49.9 | 64.2 | 55.800000 | 48.5
\n", - "
" - ], - "text/plain": [ - " Model Name Total Basic Understanding Contextual Analysis \\\n", - "0 gpt-4o-mini 86.240000 89.0 81.0 \n", - "1 gemini-1.5-flash 80.760000 84.4 69.2 \n", - "2 gpt-4o 91.300000 92.2 83.4 \n", - "3 gemini-1.5-pro 86.280000 90.6 76.6 \n", - "4 llama3-llava-next-8b 62.650602 60.1 61.4 \n", - "5 llava-1.5-7b 41.940000 38.6 34.5 \n", - "6 Idefics2-8B 25.860000 18.0 16.3 \n", - "7 InternVL2-2B 56.840000 65.8 49.9 \n", - "\n", - " Deeper Implications Broader Implications Further Insights \n", - "0 89.6 87.600000 84.0 \n", - "1 81.7 90.300000 78.2 \n", - "2 94.1 94.400000 92.4 \n", - "3 85.6 91.600000 87.0 \n", - "4 74.8 63.673469 53.3 \n", - "5 58.8 42.800000 35.0 \n", - "6 43.8 27.000000 24.2 \n", - "7 64.2 55.800000 48.5 " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "data = datasets.Dataset.from_pandas(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 473.45ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.17s/it]\n" - ] - }, - { - "data": { - "text/plain": [ - "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBenchResults/commit/a29f8ecb399dbd7ab7475f0de2c48ee54affbff9', commit_message='Upload dataset', commit_description='', oid='a29f8ecb399dbd7ab7475f0de2c48ee54affbff9', pr_url=None, pr_revision=None, pr_num=None)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data.push_to_hub(\"lmms-lab/LiveBenchResults\", \"2024-07\", split=\"test\")" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "data = datasets.load_dataset(\"lmms-lab/LiveBenchDetailedResults\", \"2024-07\")" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "data[\"Idefics2_8B\"] = data[\"idefics2\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "DatasetDict({\n", - " gpt_4o_mini: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " gemini_1.5_flash: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " gpt_4o: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " gemini_1.5_pro: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " llama3_llava_next_8b: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " llava_1.5_7b: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " idefics2: Dataset({\n", - " features: ['id', 'images', 'question', 
'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " InternVL2_2B: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " Idefics2_8B: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - "})" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "new_data = {}\n", - "for k, v in data.items():\n", - " if k == \"idefics2\":\n", - " continue\n", - " new_data[k] = v\n", - "data = datasets.DatasetDict(new_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "DatasetDict({\n", - " gpt_4o_mini: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " gemini_1.5_flash: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " gpt_4o: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " gemini_1.5_pro: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " llama3_llava_next_8b: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " llava_1.5_7b: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " InternVL2_2B: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " Idefics2_8B: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - "})" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Map: 100%|██████████| 250/250 [00:00<00:00, 347.35 examples/s]it/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.58ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.63s/it]\n", - "Map: 100%|██████████| 250/250 [00:00<00:00, 363.40 examples/s]it/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.70ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.59s/it]\n", - "Map: 100%|██████████| 250/250 [00:00<00:00, 472.60 examples/s]it/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.62ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.43s/it]\n", - "Map: 100%|██████████| 250/250 
[00:00<00:00, 352.11 examples/s]it/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 11.55ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.63s/it]\n", - "Map: 100%|██████████| 250/250 [00:00<00:00, 475.90 examples/s]it/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 11.38ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.46s/it]\n", - "Map: 100%|██████████| 250/250 [00:00<00:00, 364.89 examples/s]it/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 10.94ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.59s/it]\n", - "Map: 100%|██████████| 250/250 [00:00<00:00, 529.96 examples/s]it/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 13.51ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.33s/it]\n", - "Map: 100%|██████████| 250/250 [00:00<00:00, 349.67 examples/s]it/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.74ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.57s/it]\n" - ] - }, - { - "data": { - "text/plain": [ - "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBenchDetailedResults/commit/047d6dc66759e0a8b57b4e6015db6208da1cd4da', commit_message='Upload dataset', commit_description='', oid='047d6dc66759e0a8b57b4e6015db6208da1cd4da', pr_url=None, pr_revision=None, pr_num=None)" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data.push_to_hub(\"lmms-lab/LiveBenchDetailedResults\", \"2024-07\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "live_bench", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading data: 100%|██████████| 4.62k/4.62k [00:01<00:00, 4.14kB/s]\n", + "Generating test split: 100%|██████████| 8/8 [00:00<00:00, 933.60 examples/s]\n" + ] + } + ], + "source": [ + "import datasets\n", + "\n", + "data = datasets.load_dataset(\"lmms-lab/LiveBenchResults\", \"2024-07\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "df = data[\"test\"].to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv(\"2024-07.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"2024-07.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Model Name | Total | Basic Understanding | Contextual Analysis | Deeper Implications | Broader Implications | Further Insights
0 | gpt-4o-mini | 86.240000 | 89.0 | 81.0 | 89.6 | 87.600000 | 84.0
1 | gemini-1.5-flash | 80.760000 | 84.4 | 69.2 | 81.7 | 90.300000 | 78.2
2 | gpt-4o | 91.300000 | 92.2 | 83.4 | 94.1 | 94.400000 | 92.4
3 | gemini-1.5-pro | 86.280000 | 90.6 | 76.6 | 85.6 | 91.600000 | 87.0
4 | llama3-llava-next-8b | 62.650602 | 60.1 | 61.4 | 74.8 | 63.673469 | 53.3
5 | llava-1.5-7b | 41.940000 | 38.6 | 34.5 | 58.8 | 42.800000 | 35.0
6 | Idefics2-8B | 25.860000 | 18.0 | 16.3 | 43.8 | 27.000000 | 24.2
7 | InternVL2-2B | 56.840000 | 65.8 | 49.9 | 64.2 | 55.800000 | 48.5
\n", + "
" + ], + "text/plain": [ + " Model Name Total Basic Understanding Contextual Analysis \\\n", + "0 gpt-4o-mini 86.240000 89.0 81.0 \n", + "1 gemini-1.5-flash 80.760000 84.4 69.2 \n", + "2 gpt-4o 91.300000 92.2 83.4 \n", + "3 gemini-1.5-pro 86.280000 90.6 76.6 \n", + "4 llama3-llava-next-8b 62.650602 60.1 61.4 \n", + "5 llava-1.5-7b 41.940000 38.6 34.5 \n", + "6 Idefics2-8B 25.860000 18.0 16.3 \n", + "7 InternVL2-2B 56.840000 65.8 49.9 \n", + "\n", + " Deeper Implications Broader Implications Further Insights \n", + "0 89.6 87.600000 84.0 \n", + "1 81.7 90.300000 78.2 \n", + "2 94.1 94.400000 92.4 \n", + "3 85.6 91.600000 87.0 \n", + "4 74.8 63.673469 53.3 \n", + "5 58.8 42.800000 35.0 \n", + "6 43.8 27.000000 24.2 \n", + "7 64.2 55.800000 48.5 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "data = datasets.Dataset.from_pandas(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 473.45ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.17s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBenchResults/commit/a29f8ecb399dbd7ab7475f0de2c48ee54affbff9', commit_message='Upload dataset', commit_description='', oid='a29f8ecb399dbd7ab7475f0de2c48ee54affbff9', pr_url=None, pr_revision=None, pr_num=None)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.push_to_hub(\"lmms-lab/LiveBenchResults\", \"2024-07\", split=\"test\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "data = datasets.load_dataset(\"lmms-lab/LiveBenchDetailedResults\", \"2024-07\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "data[\"Idefics2_8B\"] = data[\"idefics2\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " gpt_4o_mini: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " gemini_1.5_flash: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " gpt_4o: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " gemini_1.5_pro: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " llama3_llava_next_8b: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " llava_1.5_7b: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " idefics2: Dataset({\n", + " features: ['id', 'images', 'question', 
'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " InternVL2_2B: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " Idefics2_8B: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + "})" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "new_data = {}\n", + "for k, v in data.items():\n", + " if k == \"idefics2\":\n", + " continue\n", + " new_data[k] = v\n", + "data = datasets.DatasetDict(new_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " gpt_4o_mini: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " gemini_1.5_flash: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " gpt_4o: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " gemini_1.5_pro: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " llama3_llava_next_8b: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " llava_1.5_7b: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " InternVL2_2B: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " Idefics2_8B: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + "})" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 100%|██████████| 250/250 [00:00<00:00, 347.35 examples/s]it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.58ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.63s/it]\n", + "Map: 100%|██████████| 250/250 [00:00<00:00, 363.40 examples/s]it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.70ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.59s/it]\n", + "Map: 100%|██████████| 250/250 [00:00<00:00, 472.60 examples/s]it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.62ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.43s/it]\n", + "Map: 100%|██████████| 250/250 
[00:00<00:00, 352.11 examples/s]it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 11.55ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.63s/it]\n", + "Map: 100%|██████████| 250/250 [00:00<00:00, 475.90 examples/s]it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 11.38ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.46s/it]\n", + "Map: 100%|██████████| 250/250 [00:00<00:00, 364.89 examples/s]it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 10.94ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.59s/it]\n", + "Map: 100%|██████████| 250/250 [00:00<00:00, 529.96 examples/s]it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 13.51ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.33s/it]\n", + "Map: 100%|██████████| 250/250 [00:00<00:00, 349.67 examples/s]it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.74ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.57s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBenchDetailedResults/commit/047d6dc66759e0a8b57b4e6015db6208da1cd4da', commit_message='Upload dataset', commit_description='', oid='047d6dc66759e0a8b57b4e6015db6208da1cd4da', pr_url=None, pr_revision=None, pr_num=None)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.push_to_hub(\"lmms-lab/LiveBenchDetailedResults\", \"2024-07\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "live_bench", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tools/live_bench/script/refractor.py b/tools/live_bench/script/refractor.py new file mode 100644 index 00000000..bfd74179 --- /dev/null +++ b/tools/live_bench/script/refractor.py @@ -0,0 +1,18 @@ +import os + +if __name__ == "__main__": + path = "/data/pufanyi/project/lmms-eval/tools/temp/images" + new_path = "/data/pufanyi/project/lmms-eval/tools/temp/processed_images" + if not os.path.exists(new_path): + os.makedirs(new_path) + subjects = os.listdir(path) + for subject in subjects: + subject_folder = os.path.join(path, subject) + if not os.path.isdir(subject_folder): + continue + print(f"Processing {subject_folder}") + images = os.listdir(subject_folder) + for id, image in enumerate(images): + image_ext = image.split(".")[-1] + new_image_name = f"{subject}_{id}.{image_ext}" + os.rename(os.path.join(subject_folder, image), os.path.join(new_path, new_image_name)) diff --git a/tools/live_bench/script/upload_results.py b/tools/live_bench/script/upload_results.py index 69cc5ce8..3a10a9f2 100644 --- a/tools/live_bench/script/upload_results.py +++ b/tools/live_bench/script/upload_results.py @@ -1,177 +1,177 @@ -import argparse -import json -import os -from typing import Dict, List, Union - -import datasets -import numpy as np -import pandas as pd -from datasets import 
Dataset, load_dataset -from PIL import Image -from tqdm import tqdm - -EPS = 1e-6 - -RESULT_FEATURES = { - "id": datasets.Value("int32"), - "images": datasets.Sequence(datasets.Image()), - "question": datasets.Value("string"), - "ground_truth": datasets.Value("string"), - "criteria": datasets.Value("string"), - "subtask": datasets.Value("string"), - "response": datasets.Value("string"), - "score": datasets.Value("float32"), - "reason": datasets.Value("string"), -} - -SUBTASKS = [ - "Basic Understanding", - "Contextual Analysis", - "Deeper Implications", - "Broader Implications", - "Further Insights", -] - - -def load_images(config) -> Dict[int, List[Image.Image]]: - dataset = datasets.load_dataset(config["dataset_path"], config["dataset_name"], split=config["test_split"]) - images = {} - for data in tqdm(dataset, desc="Loading images"): - images[data["id"]] = data["images"] - return images - - -def get_hf_results(results, detailed_results): - live_bench_images = load_images(results["configs"]["live_bench"]) - - def load_results(): - for result in tqdm(detailed_results["logs"], desc="Loading results"): - doc = result["doc"] - res = {} - res["id"] = doc["id"] - res["images"] = live_bench_images[doc["id"]] - res["question"] = doc["question"] - res["ground_truth"] = doc["answer"] - res["criteria"] = doc["criteria"] - res["subtask"] = doc["subtask"] - res["response"] = result["filtered_resps"][0] - res["score"] = result["gpt4_eval_score"]["rating"] - res["reason"] = result["gpt4_eval_score"]["explanation"] - yield res - - result_dataset = Dataset.from_generator(load_results, features=datasets.Features(RESULT_FEATURES)) - - return result_dataset - - -def preview_results(results, heading: str): - HEADING = "=" * 15 + " " + heading + " " + "=" * 15 - ENDING = "=" * len(HEADING) - print(HEADING) - print(results) - print(ENDING) - - -def calculate_score(results: Dataset): - results = results.to_pandas() - - sum_score, count = 0, 0 - score = {} - for subtask in SUBTASKS: - score[subtask] = [] - for index, result in tqdm(results.iterrows(), total=len(results), desc="Calculating score"): - if result["score"] == -1: - continue - sum_score += result["score"] / 10 - count += 1 - subtask = result["subtask"] - if subtask not in SUBTASKS: - subtask = "Further Insights" - score[result["subtask"]].append(result["score"] / 10) - res = [(subtask, len(score[subtask]), np.mean(score[subtask]) * 100) for subtask in SUBTASKS] - res.append(("Total", count, sum_score / count * 100)) - res = pd.DataFrame(res, columns=["Subtask", "Count", "Score"]) - - return res - - -def get_results(folder): - detailed_file = os.path.join(folder, "live_bench.json") - results_file = os.path.join(folder, "results.json") - - with open(results_file, "r") as f: - results = json.load(f) - - assert "live_bench" in results["configs"], "No live_bench config found in results.json" - final_score = results["results"]["live_bench"]["gpt4_eval_score,none"] - model_configs = results["model_configs"] - version = results["configs"]["live_bench"]["metadata"]["version"] - - assert model_configs["limit"] is None, "Model limit is not None, please check if the model is tested on the full dataset" - - with open(detailed_file, "r") as f: - detailed_results = json.load(f) - - hf_results = get_hf_results(results, detailed_results) - preview_results(hf_results.to_pandas().iloc[0], "Detailed Results") - score = calculate_score(hf_results) - preview_results(score, "Final Score") - - assert ( - abs(score[score["Subtask"] == "Total"]["Score"] - final_score) <= EPS - 
).all(), f"Final score does not match the calculated score, the score calculated by the script is {score[score['Subtask'] == 'Total']['Score'].values[0]} and the final score in the log is {final_score}." - - return hf_results, score, version - - -def upload_results( - hf_results: Dataset, - score: pd.DataFrame, - model_name, - dataset_version, - log_folder="logs", -): - hf_results.push_to_hub( - "lmms-lab/LiveBenchDetailedResults", - config_name=dataset_version, - split=model_name.replace("-", "_"), - ) - if not os.path.exists(log_folder): - os.makedirs(log_folder) - score_path = os.path.abspath(os.path.join(log_folder, f"{dataset_version}_{model_name}.csv")) - score.to_csv(score_path, index=False) - print(f"Results saved to {score_path}") - score_dict = {item["Subtask"]: item["Score"] for index, item in score.iterrows()} - score_dict["Model Name"] = model_name - try: - hf_score = datasets.load_dataset("lmms-lab/LiveBenchResults", dataset_version, split="test") - except: - hf_score = Dataset.from_dict({subtask: [] for subtask in ["Model Name", "Total"] + SUBTASKS}) - hf_score = hf_score.add_item(score_dict) - df_score = pd.DataFrame(hf_score) - df_score = df_score.drop_duplicates(subset=["Model Name"], keep="last") - df_score = df_score[["Model Name", "Total"] + SUBTASKS] - hf_score = Dataset.from_pandas(df_score) - hf_score.push_to_hub("lmms-lab/LiveBenchResults", dataset_version, split="test") - - -if __name__ == "__main__": - argparse = argparse.ArgumentParser() - argparse.add_argument("--folder", "-f", type=str, required=True, help="Results folder") - argparse.add_argument("--name", "-m", type=str, required=True, help="Model name") - argparse.add_argument("--log_folder", "-l", type=str, default="logs", help="Log folder") - argparse.add_argument("--force", "-F", action="store_true", help="Force upload") - args = argparse.parse_args() - hf_results, score, version = get_results(args.folder) - print(f"Results will be uploaded with model name {args.name} and model version {version}") - if args.force is False: - print("Are you sure you want to upload the results? 
(y/n)", end=" ") - while True: - choice = input().lower() - if choice == "y": - break - elif choice == "n": - exit() - else: - print("Invalid choice, please enter 'y' or 'n'") - upload_results(hf_results, score, args.name, version, args.log_folder) +import argparse +import json +import os +from typing import Dict, List, Union + +import datasets +import numpy as np +import pandas as pd +from datasets import Dataset, load_dataset +from PIL import Image +from tqdm import tqdm + +EPS = 1e-6 + +RESULT_FEATURES = { + "id": datasets.Value("int32"), + "images": datasets.Sequence(datasets.Image()), + "question": datasets.Value("string"), + "ground_truth": datasets.Value("string"), + "criteria": datasets.Value("string"), + "subtask": datasets.Value("string"), + "response": datasets.Value("string"), + "score": datasets.Value("float32"), + "reason": datasets.Value("string"), +} + +SUBTASKS = [ + "Basic Understanding", + "Contextual Analysis", + "Deeper Implications", + "Broader Implications", + "Further Insights", +] + + +def load_images(config) -> Dict[int, List[Image.Image]]: + dataset = datasets.load_dataset(config["dataset_path"], config["dataset_name"], split=config["test_split"]) + images = {} + for data in tqdm(dataset, desc="Loading images"): + images[data["id"]] = data["images"] + return images + + +def get_hf_results(results, detailed_results): + live_bench_images = load_images(results["configs"]["live_bench"]) + + def load_results(): + for result in tqdm(detailed_results["logs"], desc="Loading results"): + doc = result["doc"] + res = {} + res["id"] = doc["id"] + res["images"] = live_bench_images[doc["id"]] + res["question"] = doc["question"] + res["ground_truth"] = doc["answer"] + res["criteria"] = doc["criteria"] + res["subtask"] = doc["subtask"] + res["response"] = result["filtered_resps"][0] + res["score"] = result["gpt4_eval_score"]["rating"] + res["reason"] = result["gpt4_eval_score"]["explanation"] + yield res + + result_dataset = Dataset.from_generator(load_results, features=datasets.Features(RESULT_FEATURES)) + + return result_dataset + + +def preview_results(results, heading: str): + HEADING = "=" * 15 + " " + heading + " " + "=" * 15 + ENDING = "=" * len(HEADING) + print(HEADING) + print(results) + print(ENDING) + + +def calculate_score(results: Dataset): + results = results.to_pandas() + + sum_score, count = 0, 0 + score = {} + for subtask in SUBTASKS: + score[subtask] = [] + for index, result in tqdm(results.iterrows(), total=len(results), desc="Calculating score"): + if result["score"] == -1: + continue + sum_score += result["score"] / 10 + count += 1 + subtask = result["subtask"] + if subtask not in SUBTASKS: + subtask = "Further Insights" + score[result["subtask"]].append(result["score"] / 10) + res = [(subtask, len(score[subtask]), np.mean(score[subtask]) * 100) for subtask in SUBTASKS] + res.append(("Total", count, sum_score / count * 100)) + res = pd.DataFrame(res, columns=["Subtask", "Count", "Score"]) + + return res + + +def get_results(folder): + detailed_file = os.path.join(folder, "live_bench.json") + results_file = os.path.join(folder, "results.json") + + with open(results_file, "r") as f: + results = json.load(f) + + assert "live_bench" in results["configs"], "No live_bench config found in results.json" + final_score = results["results"]["live_bench"]["gpt4_eval_score,none"] + model_configs = results["model_configs"] + version = results["configs"]["live_bench"]["metadata"]["version"] + + assert model_configs["limit"] is None, "Model limit is not None, please check 
if the model is tested on the full dataset" + + with open(detailed_file, "r") as f: + detailed_results = json.load(f) + + hf_results = get_hf_results(results, detailed_results) + preview_results(hf_results.to_pandas().iloc[0], "Detailed Results") + score = calculate_score(hf_results) + preview_results(score, "Final Score") + + assert ( + abs(score[score["Subtask"] == "Total"]["Score"] - final_score) <= EPS + ).all(), f"Final score does not match the calculated score, the score calculated by the script is {score[score['Subtask'] == 'Total']['Score'].values[0]} and the final score in the log is {final_score}." + + return hf_results, score, version + + +def upload_results( + hf_results: Dataset, + score: pd.DataFrame, + model_name, + dataset_version, + log_folder="logs", +): + hf_results.push_to_hub( + "lmms-lab/LiveBenchDetailedResults", + config_name=dataset_version, + split=model_name.replace("-", "_"), + ) + if not os.path.exists(log_folder): + os.makedirs(log_folder) + score_path = os.path.abspath(os.path.join(log_folder, f"{dataset_version}_{model_name}.csv")) + score.to_csv(score_path, index=False) + print(f"Results saved to {score_path}") + score_dict = {item["Subtask"]: item["Score"] for index, item in score.iterrows()} + score_dict["Model Name"] = model_name + try: + hf_score = datasets.load_dataset("lmms-lab/LiveBenchResults", dataset_version, split="test") + except: + hf_score = Dataset.from_dict({subtask: [] for subtask in ["Model Name", "Total"] + SUBTASKS}) + hf_score = hf_score.add_item(score_dict) + df_score = pd.DataFrame(hf_score) + df_score = df_score.drop_duplicates(subset=["Model Name"], keep="last") + df_score = df_score[["Model Name", "Total"] + SUBTASKS] + hf_score = Dataset.from_pandas(df_score) + hf_score.push_to_hub("lmms-lab/LiveBenchResults", dataset_version, split="test") + + +if __name__ == "__main__": + argparse = argparse.ArgumentParser() + argparse.add_argument("--folder", "-f", type=str, required=True, help="Results folder") + argparse.add_argument("--name", "-m", type=str, required=True, help="Model name") + argparse.add_argument("--log_folder", "-l", type=str, default="logs", help="Log folder") + argparse.add_argument("--force", "-F", action="store_true", help="Force upload") + args = argparse.parse_args() + hf_results, score, version = get_results(args.folder) + print(f"Results will be uploaded with model name {args.name} and model version {version}") + if args.force is False: + print("Are you sure you want to upload the results? 
(y/n)", end=" ") + while True: + choice = input().lower() + if choice == "y": + break + elif choice == "n": + exit() + else: + print("Invalid choice, please enter 'y' or 'n'") + upload_results(hf_results, score, args.name, version, args.log_folder) diff --git a/tools/live_bench/setup.py b/tools/live_bench/setup.py index b908cbe5..55f7451d 100644 --- a/tools/live_bench/setup.py +++ b/tools/live_bench/setup.py @@ -1,3 +1,3 @@ -import setuptools - -setuptools.setup() +import setuptools + +setuptools.setup() From eccc76cd6fb61b3cebddaa14203e6f5142d4b627 Mon Sep 17 00:00:00 2001 From: Pu Fanyi Date: Sat, 5 Oct 2024 20:11:57 +0800 Subject: [PATCH 2/6] Refactor DefaultWebsite class in website.py --- tools/live_bench/live_bench/websites/website.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/live_bench/live_bench/websites/website.py b/tools/live_bench/live_bench/websites/website.py index eeb5354c..6e55ae5f 100644 --- a/tools/live_bench/live_bench/websites/website.py +++ b/tools/live_bench/live_bench/websites/website.py @@ -56,7 +56,12 @@ def post_visit(self, driver: Driver): class HumanScreenShotWebsite(Website): def __init__(self, name=None, path=None): - super().__init__(name=name, path=path) + try: + image_name = os.path.basename(path) + subject = image_name.split("_")[0] + except: + subject = None + super().__init__(name=name, path=path, subject=subject) def pre_visit(self, driver: Driver): pass From 2cb32eb630bbc7130042da7ddde8f3a167bc0d7c Mon Sep 17 00:00:00 2001 From: Pu Fanyi Date: Sun, 6 Oct 2024 03:35:06 +0800 Subject: [PATCH 3/6] Refactor dataset name and limit processed images to 2 --- tools/live_bench/create_dataset.py | 4 ++-- .../live_bench/data_generator/qa_generator.py | 7 ++----- .../data_generator/question_finalizer.py | 10 ++++------ .../live_bench/data_generator/score_getter.py | 12 +++++++----- .../data_generator/utils/extract_infomation.py | 4 +++- .../live_bench/data_generator/utils/gpt4v.py | 16 ++++++++++++++++ tools/live_bench/live_bench/websites/website.py | 15 +++++++++++++++ 7 files changed, 49 insertions(+), 19 deletions(-) diff --git a/tools/live_bench/create_dataset.py b/tools/live_bench/create_dataset.py index 1da8edd2..03d113c1 100644 --- a/tools/live_bench/create_dataset.py +++ b/tools/live_bench/create_dataset.py @@ -3,8 +3,8 @@ if __name__ == "__main__": website = load_websites() - dataset = LiveBench(name="2024-9") + dataset = LiveBench(name="2024-09") - website = load_websites_from_file("/data/pufanyi/project/lmms-eval/tools/temp/processed_images") + website = load_websites_from_file("/data/pufanyi/project/lmms-eval/tools/temp/processed_images")[:2] dataset.capture(websites=website, screen_shoter="human", qa_generator="gpt4v", scorer="claude", checker="gemini", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={}) dataset.upload() diff --git a/tools/live_bench/live_bench/data_generator/qa_generator.py b/tools/live_bench/live_bench/data_generator/qa_generator.py index 631477db..dc603ef3 100644 --- a/tools/live_bench/live_bench/data_generator/qa_generator.py +++ b/tools/live_bench/live_bench/data_generator/qa_generator.py @@ -24,6 +24,7 @@ from live_bench.data_generator.utils.gemini import gemini_generate_response from live_bench.data_generator.utils.gpt4v import ( format_gpt4v_images, + get_openai_client, gpt4v_generate_response, ) from live_bench.screen_shoter import ScreenImage @@ -157,11 +158,7 @@ def __init__( check_prompt=os.path.join(os.path.dirname(__file__), "check_prompt.md"), ): super().__init__(prompt_file) - 
API_KEY = os.getenv("OPENAI_API_KEY") - if not API_KEY: - raise ValueError("OPENAI_API_KEY environment variable not set.") - self.api_key = API_KEY - self.client = openai.OpenAI(api_key=self.api_key) + self.client = get_openai_client() self.model = model if os.path.exists(example_path): self.example_path = example_path diff --git a/tools/live_bench/live_bench/data_generator/question_finalizer.py b/tools/live_bench/live_bench/data_generator/question_finalizer.py index 6aa5f69a..100003a8 100644 --- a/tools/live_bench/live_bench/data_generator/question_finalizer.py +++ b/tools/live_bench/live_bench/data_generator/question_finalizer.py @@ -14,6 +14,7 @@ from live_bench.data_generator.utils.gemini import gemini_generate_response from live_bench.data_generator.utils.gpt4v import ( format_gpt4v_images, + get_openai_client, gpt4v_generate_response, ) from PIL import Image @@ -30,11 +31,7 @@ def get_answer(self, question: str, images: List[Image.Image]): class GPT4VAnswerGetter(AnswerGetter): def __init__(self, model: str = "gpt-4o", api_key=None): self.model = model - if api_key is None: - self.api_key = os.getenv("OPENAI_API_KEY", None) - else: - self.api_key = api_key - self.client = openai.OpenAI(api_key=self.api_key) + self.client = get_openai_client() def get_answer(self, question: str, images: List[Image.Image]): messages = [{"role": "user", "content": format_gpt4v_images(images) + [{"type": "text", "text": question}]}] @@ -111,7 +108,8 @@ def get_answer(self, question: str, images: List[Image.Image]): class QuestionFinalizer(object): def __init__(self, gpt4v_model: str = "gpt-4o", claude_model: str = "claude-3-5-sonnet-20240620", gemini_model: str = "gemini-1.5-pro"): self.models = {"GPT4V": GPT4VAnswerGetter(gpt4v_model), "Claude": ClaudeAnswerGetter(claude_model), "Gemini": GeminiAnswerGetter(gemini_model)} - self.client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY", None)) + self.client = get_openai_client() + # self.client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY", None)) # self.client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY", None)) def finalize_question(self, question, answer, criteria, images: List[Image.Image]): diff --git a/tools/live_bench/live_bench/data_generator/score_getter.py b/tools/live_bench/live_bench/data_generator/score_getter.py index 78ce99d0..4cab632a 100644 --- a/tools/live_bench/live_bench/data_generator/score_getter.py +++ b/tools/live_bench/live_bench/data_generator/score_getter.py @@ -13,6 +13,7 @@ ) from live_bench.data_generator.utils.gpt4v import ( format_gpt4v_images, + get_openai_client, gpt4v_generate_response, ) from live_bench.screen_shoter import ScreenImage @@ -80,11 +81,12 @@ def __init__(self, prompt: str = os.path.join(os.path.dirname(__file__), "score_ self.prompt = f.read() else: self.prompt = prompt - API_KEY = os.getenv("OPENAI_API_KEY") - if not API_KEY: - raise ValueError("OPENAI_API_KEY environment variable not set.") - self.api_key = API_KEY - self.client = openai.OpenAI(api_key=self.api_key) + # API_KEY = os.getenv("OPENAI_API_KEY") + # if not API_KEY: + # raise ValueError("OPENAI_API_KEY environment variable not set.") + # self.api_key = API_KEY + # self.client = openai.OpenAI(api_key=self.api_key) + self.client = get_openai_client() self.model = model if os.path.exists(example_path) and os.path.isfile(os.path.join(example_path, "example_score_input.md")): with open(example_path, "r") as f: diff --git a/tools/live_bench/live_bench/data_generator/utils/extract_infomation.py 
b/tools/live_bench/live_bench/data_generator/utils/extract_infomation.py index b46f3af5..5bf46cdd 100644 --- a/tools/live_bench/live_bench/data_generator/utils/extract_infomation.py +++ b/tools/live_bench/live_bench/data_generator/utils/extract_infomation.py @@ -12,6 +12,7 @@ ) from live_bench.data_generator.utils.gpt4v import ( format_gpt4v_images, + get_openai_client, gpt4v_generate_response, ) from live_bench.screen_shoter import ScreenImage @@ -69,7 +70,8 @@ def __init__(self, model="claude-3-5-sonnet-20240620", openai_api_key=None, anth if not openai_api_key: openai_api_key = os.getenv("OPENAI_API_KEY", None) if "gpt" in model: - self.client = openai.OpenAI(api_key=openai_api_key) + # self.client = openai.OpenAI(api_key=openai_api_key) + self.client = get_openai_client() self.generate_response = gpt4v_generate_response self.format_images = format_gpt4v_images elif "claude" in model: diff --git a/tools/live_bench/live_bench/data_generator/utils/gpt4v.py b/tools/live_bench/live_bench/data_generator/utils/gpt4v.py index 6ee5452d..67581459 100644 --- a/tools/live_bench/live_bench/data_generator/utils/gpt4v.py +++ b/tools/live_bench/live_bench/data_generator/utils/gpt4v.py @@ -1,14 +1,30 @@ import base64 import io import logging +import os from time import sleep +import openai from live_bench.data_generator.response import Response from PIL import Image logger = logging.getLogger("lmms-eval") +def get_openai_client(api_version="2024-02-15-preview") -> openai.OpenAI: + endpoint = os.getenv("AZURE_OPENAI_ENDPOINT") + if endpoint: + key = os.getenv("AZURE_OPENAI_API_KEY") + if not key: + raise ValueError("OPENAI_API_KEY environment variable not set.") + return openai.AzureOpenAI(azure_endpoint=endpoint, api_key=key, api_version=api_version) + else: + api_key = os.getenv("OPENAI_API_KEY") + if not api_key: + raise ValueError("OPENAI_API_KEY environment variable not set.") + return openai.OpenAI(api_key=api_key) + + def format_gpt4v_images(image): if isinstance(image, Image.Image): buffered = io.BytesIO() diff --git a/tools/live_bench/live_bench/websites/website.py b/tools/live_bench/live_bench/websites/website.py index 6e55ae5f..b5210765 100644 --- a/tools/live_bench/live_bench/websites/website.py +++ b/tools/live_bench/live_bench/websites/website.py @@ -4,13 +4,28 @@ from webdriver_manager.core.driver import Driver +SUBJECT_MATCH = { + "entertainment": "Entertainment", + "artandculture": "Art & Culture", + "entertainment": "Entertainment", + "finance": "Economy & Finance", + "politics": "Politics", + "science": "Science", + "sports": "Sports", + "technology": "Technology", +} + class Website(ABC): def __init__(self, url=None, name=None, path=None, subject=None): self.url = url self.name = name self.path = path + + if subject in SUBJECT_MATCH: + self.subject = SUBJECT_MATCH[subject] self.subject = subject + assert self.url is not None or self.path is not None, "Either url or path must be provided" def get_path(self): From 293079280c406d81ce18557bcb661c1ca824f3e1 Mon Sep 17 00:00:00 2001 From: Pu Fanyi Date: Sun, 6 Oct 2024 03:40:48 +0800 Subject: [PATCH 4/6] Refactor extract_infomation.py and update prompts --- .../live_bench/data_generator/utils/extract_infomation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/live_bench/live_bench/data_generator/utils/extract_infomation.py b/tools/live_bench/live_bench/data_generator/utils/extract_infomation.py index 5bf46cdd..bffbd9f1 100644 --- a/tools/live_bench/live_bench/data_generator/utils/extract_infomation.py +++ 
b/tools/live_bench/live_bench/data_generator/utils/extract_infomation.py @@ -28,11 +28,13 @@ """ FIND_IMAGES_FEATURES_PROMPT: str = """\ -This is a screenshot from a news website. Your task is to identify the meaningful images in this screenshot and extract relevant information about these images, such as the environment depicted, the actions and expressions of the people, and the connection between these images and the corresponding text. You need to think deeply about these images and provide as much detailed and useful information as possible. +This is a screenshot from a news website. Your task is to identify the meaningful images in this screenshot and extract relevant information about these images, such as the environment depicted, the actions and expressions of the people, and the connection between these images and the corresponding text. You need to think deeply about these images and provide as much detailed and useful information as possible. Of course, it is also possible that the website is mainly text-based, and in this case, there might not be much information to extract from the images. In such instances, you can approach the task from a textual perspective, analyzing the website's content. For example, what is the theme of the website? What kind of information is contained in the text? If the website requires thoughtful analysis, feel free to engage in deeper reflection and provide your insights. """ THINK_DIFFERENTLY_PROMPT: str = """\ -What makes this website different from other websites? What is special about its news? Since it is a news website, where is the "new" reflected? Do not give a generalized answer; you need to provide detailed answers based on the specific content of each news article and the accompanying illustrations. +What makes this website different from other websites? What is special about its news? Since it is a news website, where is the 'new' aspect reflected? Do not provide a generalized answer; you need to give detailed responses based on the specific content of each news article and the accompanying illustrations. + +For example, if the news is about a software update, what conveniences will this update bring to people? How can people use these new features? Perhaps there are also some drawbacks? You need to come up with your own questions worth pondering about the website and describe in as much detail as possible your understanding of what is 'new' on the website. 
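The hunks above replace the per-module `openai.OpenAI(api_key=...)` construction in `qa_generator.py`, `question_finalizer.py`, `score_getter.py`, and `extract_infomation.py` with the shared `get_openai_client()` helper added to `utils/gpt4v.py`. A minimal usage sketch, assuming only the environment variables the helper itself reads (`AZURE_OPENAI_ENDPOINT` / `AZURE_OPENAI_API_KEY`, or `OPENAI_API_KEY`) and an illustrative model name:

```python
from live_bench.data_generator.utils.gpt4v import get_openai_client

# With AZURE_OPENAI_ENDPOINT (and AZURE_OPENAI_API_KEY) set, this returns an
# openai.AzureOpenAI client; otherwise it falls back to openai.OpenAI using
# OPENAI_API_KEY. api_version defaults to "2024-02-15-preview".
client = get_openai_client()

# "gpt-4o" is illustrative; on Azure this must be the name of a deployment.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Reply with the single word: ready"}],
    max_tokens=8,
)
print(response.choices[0].message.content)
```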
""" From 9a20b96c7db691415b1c7dab0fb57112ae929a6a Mon Sep 17 00:00:00 2001 From: Pu Fanyi Date: Sun, 6 Oct 2024 16:11:11 +0800 Subject: [PATCH 5/6] update --- tools/live_bench/create_dataset.py | 2 +- tools/live_bench/data_summary.ipynb | 2 +- .../live_bench/data_generator/check_prompt.md | 9 +- .../example/example_output.json | 43 +- .../data_generator/live_bench_data.py | 2 +- .../live_bench/data_generator/prompt.md | 31 +- .../live_bench/data_generator/qa_generator.py | 2 +- tools/live_bench/script/modify.ipynb | 922 +++++++++--------- tools/live_bench/script/upload_results.py | 8 +- 9 files changed, 509 insertions(+), 512 deletions(-) diff --git a/tools/live_bench/create_dataset.py b/tools/live_bench/create_dataset.py index 03d113c1..6c318a5e 100644 --- a/tools/live_bench/create_dataset.py +++ b/tools/live_bench/create_dataset.py @@ -3,7 +3,7 @@ if __name__ == "__main__": website = load_websites() - dataset = LiveBench(name="2024-09") + dataset = LiveBench(name="2024-09", force_clear=True) website = load_websites_from_file("/data/pufanyi/project/lmms-eval/tools/temp/processed_images")[:2] dataset.capture(websites=website, screen_shoter="human", qa_generator="gpt4v", scorer="claude", checker="gemini", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={}) diff --git a/tools/live_bench/data_summary.ipynb b/tools/live_bench/data_summary.ipynb index fb6d2ba9..01296b0a 100644 --- a/tools/live_bench/data_summary.ipynb +++ b/tools/live_bench/data_summary.ipynb @@ -311,7 +311,7 @@ ], "metadata": { "kernelspec": { - "display_name": "lmms-eval", + "display_name": "live_bench", "language": "python", "name": "python3" }, diff --git a/tools/live_bench/live_bench/data_generator/check_prompt.md b/tools/live_bench/live_bench/data_generator/check_prompt.md index 85164517..c189d321 100644 --- a/tools/live_bench/live_bench/data_generator/check_prompt.md +++ b/tools/live_bench/live_bench/data_generator/check_prompt.md @@ -2,13 +2,12 @@ I would like you to act as a quizmaster who designs questions based on a provide Now, you are given a screenshot of the homepage of a news website, with a already generated question and answer. Your task is to refine the question and answer, and refractor them to make the question more answerable, checkable, and challenging. If you don't think the question is good, please provide a new question and answer. -Note that the subtask must be one of these five: +Note that the subtask must be one of these four: - Basic Understanding -- Contextual Analysis -- Deeper Implications -- Broader Implications -- Further Insights +- Analytical Questions +- Evaluative Questions +- Divergent Thinking If you think the question does not correspond to the subtask, you have two options: 1. Modify the question to correspond to the subtask. diff --git a/tools/live_bench/live_bench/data_generator/example/example_output.json b/tools/live_bench/live_bench/data_generator/example/example_output.json index c789db3b..3d22415e 100644 --- a/tools/live_bench/live_bench/data_generator/example/example_output.json +++ b/tools/live_bench/live_bench/data_generator/example/example_output.json @@ -1,17 +1,12 @@ { "Basic Understanding": [ { - "Question": "Which of the following topics is NOT covered in the news articles shown in the image?\nA) Middle East politics\nB) Technological advancements\nC) Natural disasters\nD) Animal welfare", - "Answer": "C) Natural disasters", - "Criteria": "Give 10 marks if correctly selected C, otherwise 0 marks." 
- }, - { - "Question": "Based on the image and the headlines provided on the BBC webpage, fill in the blank with the most appropriate word or phrase:\n\"The article titled 'UN Security Council backs US Israel-Gaza ceasefire plan' is accompanied by an image of ______, which symbolizes the impact of the conflict on civilians and the urgency for a ceasefire.\"", - "Answer": "The article titled 'UN Security Council backs US Israel-Gaza ceasefire plan' is accompanied by an image of **children amidst rubble**, which symbolizes the impact of the conflict on civilians and the urgency for a ceasefire.", - "Criteria": "Award 10 marks for the correct answer 'children amidst rubble', 5 marks for partially correct synonyms or related phrases, and 0 marks for incorrect answers." + "Question": "Sort all the news headlines that appear in the graph from new to old. Provide the headlines and their respective timestamps.", + "Answer": "Here is a sorted list of the news headlines from new to old based on the timestamps:\n\n1. **Fire at famous Bangkok market kills 1,000 animals** \n - 2 hrs ago | Asia\n\n2. **Four US college instructors stabbed in public park in China** \n - 5 hrs ago | Asia\n\n3. **Baltimore shipping channel reopens after bridge collapse** \n - 5 hrs ago | US & Canada\n\n4. **Aircraft carrying Malawi vice-president goes missing** \n - 6 hrs ago | Africa\n\n5. **UN Security Council backs US Israel-Gaza ceasefire plan** \n - 7 hrs ago | Middle East\n\n6. **Apple brings ChatGPT to iPhones in AI overhaul** \n - 10 hrs ago | Technology\n\n7. **Netanyahu walks tightrope as US urges Gaza ceasefire deal** \n - 14 hrs ago | World", + "Criteria": "The total score is 10 points, with seven headlines in total. Missing one headline results in a 2-point deduction, and missing five or more headlines results in a score of 0. For each headline, providing the wrong title or timestamp will result in a 1-point deduction. If the timestamp is correct but the order is incorrect, an additional 3 points will be deducted. If the score drops below 0, it will be recorded as 0. The headlines “Four US college instructors stabbed in public park in China” and “Baltimore shipping channel reopens after bridge collapse” have the same timestamp and can be swapped." } ], - "Contextual Analysis": [ + "Analytical Questions": [ { "Question": "In the image associated with the article about the US Security Council backing the Israel-Gaza ceasefire plan, what are the people in the image doing, and how do their actions and expressions relate to the content of the article?", "Answer": "In the image associated with the article about the US Security Council backing the Israel-Gaza ceasefire plan, the people are navigating through rubble, indicating a scene of destruction. One person is climbing over debris, while another is looking directly at the camera with a serious expression. Their actions and expressions reflect the aftermath of conflict and the urgency of the situation, which aligns with the article's focus on the need for a ceasefire and the release of hostages held by Hamas.", @@ -23,35 +18,23 @@ "Criteria": "Award up to 10 marks based on the accuracy and relevance of the response: 2 marks for identifying the humanitarian impact, 3 marks for linking the image to the themes of the article, 2 marks for mentioning Netanyahu's political maneuvers, 3 marks for correctly associating the need for a ceasefire with the alleviation of civilian suffering." 
} ], - "Deeper Implications": [ - { - "Question": "What broader issues are raised by the UN Security Council's backing of a ceasefire in the Israel-Gaza context?", - "Answer": "The broader issues include international involvement in regional conflicts, the effectiveness of UN resolutions in conflict resolution, and the ongoing debate over the balance between national security and humanitarian needs in conflict zones.", - "Criteria": "Award up to 10 marks based on the accuracy and completeness of the response: 3 marks for mentioning international involvement in regional conflicts, 3 marks for discussing the effectiveness of UN resolutions in conflict resolution, 4 marks for addressing the debate over the balance between national security and humanitarian needs in conflict zones." - }, - { - "Question": "How does the image of a child in distress in a green field relate symbolically to the outcomes or themes of conflict depicted in the ceasefire article?", - "Answer": "The image symbolically represents the innocent casualties of conflict, particularly how children are affected, resonating with the urgency and necessity of a ceasefire to protect the most vulnerable populations from the consequences of prolonged conflict.", - "Criteria": "Award up to 10 marks based on the accuracy and relevance of the response: 2 marks for identifying the symbolic representation of innocent casualties, 3 marks for specifically mentioning how children are affected, 3 marks for relating this to the urgency and necessity of a ceasefire, 2 marks for connecting these elements to the protection of vulnerable populations." - } - ], - "Broader Implications": [ - { - "Question": "Rank the news articles in the image in order of their potential global impact, from highest to lowest.", - "Answer": "1. **UN Security Council backs US Israel-Gaza ceasefire plan**\n2. **Netanyahu walks tightrope as US urges Gaza ceasefire deal**\n3. **Apple brings ChatGPT to iPhones in AI overhaul**\n4. **Aircraft carrying Malawi vice-president goes missing**\n5. **Fire at famous Bangkok market kills 1,000 animals**\n6. **Four US college instructors stabbed in public park in China**\n7. **Baltimore shipping channel reopens after bridge collapse**", - "Criteria": "Award up to 10 marks based on the accuracy of the ranking: 2 marks for correctly placing the UN Security Council article first, 2 marks for correctly placing the Netanyahu article second, 1 mark each for correctly placing the next three articles (Apple, Aircraft, Fire), and 1 mark each for correctly placing the last two articles (Stabbing, Bridge). Deduct 1 mark for each position an article is away from its correct placement." - } - ], - "Further Insights": [ + "Evaluative Questions": [ { "Question": "Based on the image, which of the following statements best explains the potential global impact of the events described in the news articles?\nA. The US-Israel-Gaza ceasefire plan backed by the UN Security Council is likely to reduce tensions in the Middle East, potentially leading to a more stable geopolitical environment in the region.\nB. The introduction of ChatGPT to iPhones is expected to significantly disrupt the technology market, overshadowing the geopolitical events in the Middle East and Africa.\nC. The fire at the Bangkok market, which killed 1,000 animals, is likely to have a more profound impact on global environmental policies than the ceasefire plan in the Middle East.\nD. 
The disappearance of the aircraft carrying the Malawi vice-president is expected to lead to a major international search and rescue operation, diverting attention from other global issues.", "Answer": "A. The US-Israel-Gaza ceasefire plan backed by the UN Security Council is likely to reduce tensions in the Middle East, potentially leading to a more stable geopolitical environment in the region.", "Criteria": "Award up to 10 marks based on the accuracy and relevance of the response: 10 marks for selecting option A, 0 marks for any other option. Detailed justification for scoring: Option A directly addresses the reduction of tensions and potential stabilization in the Middle East, which is a significant global impact. Other options, while plausible, do not directly relate to the primary global impact as depicted in the provided image and articles." - }, + } + ], + "Divergent Thinking": [ { "Question": "Considering the current global attention on AI, how might the article about Apple bringing ChatGPT to iPhones in an AI overhaul reflect on broader technological trends and consumer expectations?", "Answer": "This article reflects broader trends in AI integration into consumer technology, highlighting competitive dynamics in the tech industry, and growing consumer expectations for sophisticated AI features in everyday devices.", "Criteria": "Award up to 10 marks based on the depth and accuracy of the response: 3 marks for identifying AI integration into consumer technology, 3 marks for discussing competitive dynamics in the tech industry, 4 marks for explaining the growth in consumer expectations for sophisticated AI features in everyday devices." + }, + { + "Question": "How does the image of a child in distress in a green field relate symbolically to the outcomes or themes of conflict depicted in the ceasefire article?", + "Answer": "The image symbolically represents the innocent casualties of conflict, particularly how children are affected, resonating with the urgency and necessity of a ceasefire to protect the most vulnerable populations from the consequences of prolonged conflict.", + "Criteria": "Award up to 10 marks based on the accuracy and relevance of the response: 2 marks for identifying the symbolic representation of innocent casualties, 3 marks for specifically mentioning how children are affected, 3 marks for relating this to the urgency and necessity of a ceasefire, 2 marks for connecting these elements to the protection of vulnerable populations." 
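The example file above is keyed by subtask, and the updated `check_prompt.md` restricts generated questions to the four listed subtasks. A small validation sketch (a hypothetical helper, not part of the repository) that checks an example file against that constraint:

```python
import json

# The four subtasks allowed by check_prompt.md at this point in the series.
ALLOWED_SUBTASKS = {"Basic Understanding", "Analytical Questions", "Evaluative Questions", "Divergent Thinking"}


def check_example_file(path: str) -> None:
    """Hypothetical check: every key is an allowed subtask, every entry is complete."""
    with open(path) as f:
        examples = json.load(f)
    unknown = set(examples) - ALLOWED_SUBTASKS
    if unknown:
        raise ValueError(f"unknown subtasks: {sorted(unknown)}")
    for subtask, entries in examples.items():
        for entry in entries:
            missing = {"Question", "Answer", "Criteria"} - set(entry)
            if missing:
                raise ValueError(f"{subtask}: entry missing {sorted(missing)}")


check_example_file("live_bench/data_generator/example/example_output.json")
```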
} ] } diff --git a/tools/live_bench/live_bench/data_generator/live_bench_data.py b/tools/live_bench/live_bench/data_generator/live_bench_data.py index 6619a07f..f5c44e9c 100644 --- a/tools/live_bench/live_bench/data_generator/live_bench_data.py +++ b/tools/live_bench/live_bench/data_generator/live_bench_data.py @@ -6,7 +6,7 @@ class LiveBenchData(object): - SUBTASKS = ("Basic Understanding", "Contextual Analysis", "Deeper Implications", "Broader Implications", "Further Insights") + SUBTASKS = ("Basic Understanding", "Analytical Questions", "Evaluative Questions", "Divergent Thinking") features = datasets.Features( { diff --git a/tools/live_bench/live_bench/data_generator/prompt.md b/tools/live_bench/live_bench/data_generator/prompt.md index 684e193f..1a973f13 100644 --- a/tools/live_bench/live_bench/data_generator/prompt.md +++ b/tools/live_bench/live_bench/data_generator/prompt.md @@ -2,11 +2,32 @@ I would like you to act as a quizmaster who designs questions based on a provide A well-crafted question about an event should allow respondents to gain deeper insights by observing and analyzing the event, paying attention to the following aspects: -- Basic Understanding: Questions that require direct observation or recall of the information presented in the image. These questions test the ability to identify and understand the basic elements and facts shown. -- Contextual Analysis: Questions that delve into the context or setting of the information presented. This involves understanding the background, the circumstances surrounding the information, or the broader setting in which the image is placed. -- Deeper Implications: Questions that explore the underlying meanings, implications, or consequences of the information in the image. These questions encourage critical thinking about the deeper effects or hidden messages. -- Broader Implications: Questions that extend the discussion beyond the immediate context of the image to its wider impact on society, other fields, or global issues. -- Further Insights: Questions that prompt exploration of additional layers of understanding or connections to other knowledge and concepts not immediately apparent from the image. +1. **Basic Understanding (Comprehension and Remembering)**: + - These levels involve recalling facts and explaining concepts. + - Example questions include: + - "What are the key points in this news story?" (Remembering) + - "How would you explain the main event reported here?" (Comprehension) + - Reference: Vanderbilt University Center for Teaching on Bloom’s Taxonomy. +2. **Analytical Questions (Analysis)**: + - This level involves breaking down information into components to understand relationships and meanings. + - Example questions: + - "What are the factors that led to this event?" + - "How does this event relate to other current issues?" + - Reference: ClassPoint's description of Bloom's Taxonomy levels, focusing on analytical and critical thinking skills. +3. **Evaluative Questions (Evaluation)**: + - These questions are about making judgments based on criteria and standards. + - Example questions: + - "Do you think the report's presentation was fair? Why or why not?" + - "What evidence supports the reliability of the sources?" + - Reference: ClassPoint’s detailed examples for evaluation questions. +4. **Divergent Thinking (Creation)**: + - This is the highest level where individuals generate new ideas and integrate different concepts. 
+ - Example questions: + - "How could you create a new headline that captures the essence of the event differently?" + - "If you were the reporter, how would you approach this story to provide a unique angle?" + - Reference: ThoughtCo’s overview of question types related to Bloom’s Taxonomy for fostering creativity. + +Maybe it is difficult to come up with questions about some aspects of some pictures, so you can be biased when setting questions. Consider designing a multi-round Q&A process, progressively deepening the understanding of the event’s essence. diff --git a/tools/live_bench/live_bench/data_generator/qa_generator.py b/tools/live_bench/live_bench/data_generator/qa_generator.py index dc603ef3..2582bf87 100644 --- a/tools/live_bench/live_bench/data_generator/qa_generator.py +++ b/tools/live_bench/live_bench/data_generator/qa_generator.py @@ -32,7 +32,7 @@ logger = logging.getLogger("lmms-eval") -SUBTASKS = {"Basic Understanding", "Contextual Analysis", "Deeper Implications", "Broader Implications", "Further Insights"} +SUBTASKS = {"Basic Understanding", "Analytical Questions", "Evaluative Questions", "Divergent Thinking"} class QAData(object): diff --git a/tools/live_bench/script/modify.ipynb b/tools/live_bench/script/modify.ipynb index 2231057c..307efe34 100644 --- a/tools/live_bench/script/modify.ipynb +++ b/tools/live_bench/script/modify.ipynb @@ -1,461 +1,461 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading data: 100%|██████████| 4.62k/4.62k [00:01<00:00, 4.14kB/s]\n", - "Generating test split: 100%|██████████| 8/8 [00:00<00:00, 933.60 examples/s]\n" - ] - } - ], - "source": [ - "import datasets\n", - "\n", - "data = datasets.load_dataset(\"lmms-lab/LiveBenchResults\", \"2024-07\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "df = data[\"test\"].to_pandas()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "df.to_csv(\"2024-07.csv\", index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "df = pd.read_csv(\"2024-07.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Model NameTotalBasic UnderstandingContextual AnalysisDeeper ImplicationsBroader ImplicationsFurther Insights
0gpt-4o-mini86.24000089.081.089.687.60000084.0
1gemini-1.5-flash80.76000084.469.281.790.30000078.2
2gpt-4o91.30000092.283.494.194.40000092.4
3gemini-1.5-pro86.28000090.676.685.691.60000087.0
4llama3-llava-next-8b62.65060260.161.474.863.67346953.3
5llava-1.5-7b41.94000038.634.558.842.80000035.0
6Idefics2-8B25.86000018.016.343.827.00000024.2
7InternVL2-2B56.84000065.849.964.255.80000048.5
\n", - "
" - ], - "text/plain": [ - " Model Name Total Basic Understanding Contextual Analysis \\\n", - "0 gpt-4o-mini 86.240000 89.0 81.0 \n", - "1 gemini-1.5-flash 80.760000 84.4 69.2 \n", - "2 gpt-4o 91.300000 92.2 83.4 \n", - "3 gemini-1.5-pro 86.280000 90.6 76.6 \n", - "4 llama3-llava-next-8b 62.650602 60.1 61.4 \n", - "5 llava-1.5-7b 41.940000 38.6 34.5 \n", - "6 Idefics2-8B 25.860000 18.0 16.3 \n", - "7 InternVL2-2B 56.840000 65.8 49.9 \n", - "\n", - " Deeper Implications Broader Implications Further Insights \n", - "0 89.6 87.600000 84.0 \n", - "1 81.7 90.300000 78.2 \n", - "2 94.1 94.400000 92.4 \n", - "3 85.6 91.600000 87.0 \n", - "4 74.8 63.673469 53.3 \n", - "5 58.8 42.800000 35.0 \n", - "6 43.8 27.000000 24.2 \n", - "7 64.2 55.800000 48.5 " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "data = datasets.Dataset.from_pandas(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 473.45ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.17s/it]\n" - ] - }, - { - "data": { - "text/plain": [ - "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBenchResults/commit/a29f8ecb399dbd7ab7475f0de2c48ee54affbff9', commit_message='Upload dataset', commit_description='', oid='a29f8ecb399dbd7ab7475f0de2c48ee54affbff9', pr_url=None, pr_revision=None, pr_num=None)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data.push_to_hub(\"lmms-lab/LiveBenchResults\", \"2024-07\", split=\"test\")" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "data = datasets.load_dataset(\"lmms-lab/LiveBenchDetailedResults\", \"2024-07\")" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "data[\"Idefics2_8B\"] = data[\"idefics2\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "DatasetDict({\n", - " gpt_4o_mini: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " gemini_1.5_flash: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " gpt_4o: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " gemini_1.5_pro: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " llama3_llava_next_8b: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " llava_1.5_7b: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " idefics2: Dataset({\n", - " features: ['id', 'images', 'question', 
'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " InternVL2_2B: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " Idefics2_8B: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - "})" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "new_data = {}\n", - "for k, v in data.items():\n", - " if k == \"idefics2\":\n", - " continue\n", - " new_data[k] = v\n", - "data = datasets.DatasetDict(new_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "DatasetDict({\n", - " gpt_4o_mini: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " gemini_1.5_flash: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " gpt_4o: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " gemini_1.5_pro: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " llama3_llava_next_8b: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " llava_1.5_7b: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " InternVL2_2B: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - " Idefics2_8B: Dataset({\n", - " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", - " num_rows: 250\n", - " })\n", - "})" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Map: 100%|██████████| 250/250 [00:00<00:00, 347.35 examples/s]it/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.58ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.63s/it]\n", - "Map: 100%|██████████| 250/250 [00:00<00:00, 363.40 examples/s]it/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.70ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.59s/it]\n", - "Map: 100%|██████████| 250/250 [00:00<00:00, 472.60 examples/s]it/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.62ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.43s/it]\n", - "Map: 100%|██████████| 250/250 
[00:00<00:00, 352.11 examples/s]it/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 11.55ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.63s/it]\n", - "Map: 100%|██████████| 250/250 [00:00<00:00, 475.90 examples/s]it/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 11.38ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.46s/it]\n", - "Map: 100%|██████████| 250/250 [00:00<00:00, 364.89 examples/s]it/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 10.94ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.59s/it]\n", - "Map: 100%|██████████| 250/250 [00:00<00:00, 529.96 examples/s]it/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 13.51ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.33s/it]\n", - "Map: 100%|██████████| 250/250 [00:00<00:00, 349.67 examples/s]it/s]\n", - "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.74ba/s]\n", - "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.57s/it]\n" - ] - }, - { - "data": { - "text/plain": [ - "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBenchDetailedResults/commit/047d6dc66759e0a8b57b4e6015db6208da1cd4da', commit_message='Upload dataset', commit_description='', oid='047d6dc66759e0a8b57b4e6015db6208da1cd4da', pr_url=None, pr_revision=None, pr_num=None)" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data.push_to_hub(\"lmms-lab/LiveBenchDetailedResults\", \"2024-07\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "live_bench", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading data: 100%|██████████| 4.62k/4.62k [00:01<00:00, 4.14kB/s]\n", + "Generating test split: 100%|██████████| 8/8 [00:00<00:00, 933.60 examples/s]\n" + ] + } + ], + "source": [ + "import datasets\n", + "\n", + "data = datasets.load_dataset(\"lmms-lab/LiveBenchResults\", \"2024-07\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "df = data[\"test\"].to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv(\"2024-07.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"2024-07.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Model NameTotalBasic UnderstandingContextual AnalysisDeeper ImplicationsBroader ImplicationsFurther Insights
0gpt-4o-mini86.24000089.081.089.687.60000084.0
1gemini-1.5-flash80.76000084.469.281.790.30000078.2
2gpt-4o91.30000092.283.494.194.40000092.4
3gemini-1.5-pro86.28000090.676.685.691.60000087.0
4llama3-llava-next-8b62.65060260.161.474.863.67346953.3
5llava-1.5-7b41.94000038.634.558.842.80000035.0
6Idefics2-8B25.86000018.016.343.827.00000024.2
7InternVL2-2B56.84000065.849.964.255.80000048.5
\n", + "
" + ], + "text/plain": [ + " Model Name Total Basic Understanding Contextual Analysis \\\n", + "0 gpt-4o-mini 86.240000 89.0 81.0 \n", + "1 gemini-1.5-flash 80.760000 84.4 69.2 \n", + "2 gpt-4o 91.300000 92.2 83.4 \n", + "3 gemini-1.5-pro 86.280000 90.6 76.6 \n", + "4 llama3-llava-next-8b 62.650602 60.1 61.4 \n", + "5 llava-1.5-7b 41.940000 38.6 34.5 \n", + "6 Idefics2-8B 25.860000 18.0 16.3 \n", + "7 InternVL2-2B 56.840000 65.8 49.9 \n", + "\n", + " Deeper Implications Broader Implications Further Insights \n", + "0 89.6 87.600000 84.0 \n", + "1 81.7 90.300000 78.2 \n", + "2 94.1 94.400000 92.4 \n", + "3 85.6 91.600000 87.0 \n", + "4 74.8 63.673469 53.3 \n", + "5 58.8 42.800000 35.0 \n", + "6 43.8 27.000000 24.2 \n", + "7 64.2 55.800000 48.5 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "data = datasets.Dataset.from_pandas(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 473.45ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.17s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBenchResults/commit/a29f8ecb399dbd7ab7475f0de2c48ee54affbff9', commit_message='Upload dataset', commit_description='', oid='a29f8ecb399dbd7ab7475f0de2c48ee54affbff9', pr_url=None, pr_revision=None, pr_num=None)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.push_to_hub(\"lmms-lab/LiveBenchResults\", \"2024-07\", split=\"test\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "data = datasets.load_dataset(\"lmms-lab/LiveBenchDetailedResults\", \"2024-07\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "data[\"Idefics2_8B\"] = data[\"idefics2\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " gpt_4o_mini: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " gemini_1.5_flash: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " gpt_4o: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " gemini_1.5_pro: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " llama3_llava_next_8b: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " llava_1.5_7b: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " idefics2: Dataset({\n", + " features: ['id', 'images', 'question', 
'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " InternVL2_2B: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " Idefics2_8B: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + "})" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "new_data = {}\n", + "for k, v in data.items():\n", + " if k == \"idefics2\":\n", + " continue\n", + " new_data[k] = v\n", + "data = datasets.DatasetDict(new_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " gpt_4o_mini: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " gemini_1.5_flash: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " gpt_4o: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " gemini_1.5_pro: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " llama3_llava_next_8b: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " llava_1.5_7b: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " InternVL2_2B: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + " Idefics2_8B: Dataset({\n", + " features: ['id', 'images', 'question', 'ground_truth', 'criteria', 'subtask', 'response', 'score', 'reason'],\n", + " num_rows: 250\n", + " })\n", + "})" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 100%|██████████| 250/250 [00:00<00:00, 347.35 examples/s]it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.58ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.63s/it]\n", + "Map: 100%|██████████| 250/250 [00:00<00:00, 363.40 examples/s]it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.70ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.59s/it]\n", + "Map: 100%|██████████| 250/250 [00:00<00:00, 472.60 examples/s]it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.62ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.43s/it]\n", + "Map: 100%|██████████| 250/250 
[00:00<00:00, 352.11 examples/s]it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 11.55ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.63s/it]\n", + "Map: 100%|██████████| 250/250 [00:00<00:00, 475.90 examples/s]it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 11.38ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.46s/it]\n", + "Map: 100%|██████████| 250/250 [00:00<00:00, 364.89 examples/s]it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 10.94ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.59s/it]\n", + "Map: 100%|██████████| 250/250 [00:00<00:00, 529.96 examples/s]it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 13.51ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.33s/it]\n", + "Map: 100%|██████████| 250/250 [00:00<00:00, 349.67 examples/s]it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.74ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.57s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBenchDetailedResults/commit/047d6dc66759e0a8b57b4e6015db6208da1cd4da', commit_message='Upload dataset', commit_description='', oid='047d6dc66759e0a8b57b4e6015db6208da1cd4da', pr_url=None, pr_revision=None, pr_num=None)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.push_to_hub(\"lmms-lab/LiveBenchDetailedResults\", \"2024-07\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "live_bench", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tools/live_bench/script/upload_results.py b/tools/live_bench/script/upload_results.py index 3a10a9f2..766136da 100644 --- a/tools/live_bench/script/upload_results.py +++ b/tools/live_bench/script/upload_results.py @@ -24,13 +24,7 @@ "reason": datasets.Value("string"), } -SUBTASKS = [ - "Basic Understanding", - "Contextual Analysis", - "Deeper Implications", - "Broader Implications", - "Further Insights", -] +SUBTASKS = ["Basic Understanding", "Analytical Questions", "Evaluative Questions", "Divergent Thinking"] def load_images(config) -> Dict[int, List[Image.Image]]: From 052517cfa4b763ab91b1e5852250a5bd1435f98c Mon Sep 17 00:00:00 2001 From: Pu Fanyi Date: Tue, 8 Oct 2024 01:01:02 +0800 Subject: [PATCH 6/6] Refactor live_bench_2409.yaml and live_bench.yaml --- lmms_eval/tasks/live_bench/live_bench.yaml | 2 +- .../tasks/live_bench/live_bench_2409.yaml | 3 + .../live_bench/live_bench_template_yaml_v2 | 29 + lmms_eval/tasks/live_bench/utils_v2.py | 216 ++++++ tools/live_bench/create_dataset.py | 6 +- tools/live_bench/example.ipynb | 4 +- .../live_bench/data_generator/check_prompt.md | 1 + .../example/example_output.json | 12 +- .../live_bench/data_generator/live_bench.py | 20 +- .../data_generator/live_bench_data.py | 10 +- .../live_bench/data_generator/prompt.md | 13 +- 
.../live_bench/data_generator/qa_generator.py | 66 +- .../data_generator/question_finalizer.py | 2 +- .../live_bench/data_generator/utils/claude.py | 6 +- ...t_infomation.py => extract_information.py} | 14 +- .../live_bench/data_generator/utils/gemini.py | 4 +- .../live_bench/data_generator/utils/gpt4v.py | 4 +- tools/live_bench/script/refractor.py | 4 +- tools/live_bench/script/upload_results.py | 2 +- tools/live_bench/summerize.ipynb | 719 ++++++++++++++++++ 20 files changed, 1059 insertions(+), 78 deletions(-) create mode 100644 lmms_eval/tasks/live_bench/live_bench_2409.yaml create mode 100644 lmms_eval/tasks/live_bench/live_bench_template_yaml_v2 create mode 100644 lmms_eval/tasks/live_bench/utils_v2.py rename tools/live_bench/live_bench/data_generator/utils/{extract_infomation.py => extract_information.py} (91%) create mode 100644 tools/live_bench/summerize.ipynb diff --git a/lmms_eval/tasks/live_bench/live_bench.yaml b/lmms_eval/tasks/live_bench/live_bench.yaml index 1253105e..1d7f9b9a 100755 --- a/lmms_eval/tasks/live_bench/live_bench.yaml +++ b/lmms_eval/tasks/live_bench/live_bench.yaml @@ -4,5 +4,5 @@ task: - live_bench_2407 metadata: - api_type : openai + api_type: azure eval_with_mini: false diff --git a/lmms_eval/tasks/live_bench/live_bench_2409.yaml b/lmms_eval/tasks/live_bench/live_bench_2409.yaml new file mode 100644 index 00000000..78a8aad0 --- /dev/null +++ b/lmms_eval/tasks/live_bench/live_bench_2409.yaml @@ -0,0 +1,3 @@ +task: "live_bench_2409" +dataset_name: 2024-09 +include: live_bench_template_yaml_v2 diff --git a/lmms_eval/tasks/live_bench/live_bench_template_yaml_v2 b/lmms_eval/tasks/live_bench/live_bench_template_yaml_v2 new file mode 100644 index 00000000..625901c4 --- /dev/null +++ b/lmms_eval/tasks/live_bench/live_bench_template_yaml_v2 @@ -0,0 +1,29 @@ +dataset_path: lmms-lab/LiveBench +dataset_kwargs: + token: True +test_split: test +dataset_name: 2024-07 +output_type: generate_until +doc_to_visual: !function utils_v2.livebench_doc_to_visual +doc_to_text: !function utils_v2.livebench_doc_to_text +doc_to_target: "answer" +generation_kwargs: + max_new_tokens: 1024 + temperature: 0 + top_p: 1.0 + num_beams: 1 + do_sample: false +process_results: !function utils_v2.livebench_process_results +process_results_use_image: true +metric_list: + - metric: gpt4_eval_score + aggregation: !function utils_v2.livebench_aggregate_results + higher_is_better: true + # - metric: gpt4_eval_score_mini + # aggregation: !function utils.livebench_aggregate_results + # higher_is_better: true + +lmms_eval_specific_kwargs: + default: + pre_prompt: "" + post_prompt: "" diff --git a/lmms_eval/tasks/live_bench/utils_v2.py b/lmms_eval/tasks/live_bench/utils_v2.py new file mode 100644 index 00000000..82dbe602 --- /dev/null +++ b/lmms_eval/tasks/live_bench/utils_v2.py @@ -0,0 +1,216 @@ +import base64 +import json +import logging +import os +import time +from io import BytesIO +from pathlib import Path + +import numpy as np +import openai +import pandas as pd +import requests +import yaml +from tqdm import tqdm + +eval_logger = logging.getLogger("lmms-eval") + + +with open(Path(__file__).parent / "live_bench.yaml", "r") as f: + raw_data = f.readlines() + safe_data = [] + for i, line in enumerate(raw_data): + # remove function definition since yaml load cannot handle it + if "!function" not in line: + safe_data.append(line) + + config = yaml.safe_load("".join(safe_data)) + +API_TYPE = config["metadata"]["api_type"] +EVAL_WITH_MINI = config["metadata"]["eval_with_mini"] + + +def 
get_openai_client(api_version="2024-02-15-preview") -> openai.OpenAI: + endpoint = os.getenv("AZURE_OPENAI_ENDPOINT") + if endpoint: + key = os.getenv("AZURE_OPENAI_API_KEY") + if not key: + raise ValueError("OPENAI_API_KEY environment variable not set.") + return openai.AzureOpenAI(azure_endpoint=endpoint, api_key=key, api_version=api_version) + else: + api_key = os.getenv("OPENAI_API_KEY") + if not api_key: + raise ValueError("OPENAI_API_KEY environment variable not set.") + return openai.OpenAI(api_key=api_key) + + +client = get_openai_client() + +_PROMPT_WITH_IMAGE = """\ +[Question] + +{prompt} + +[Assistant Response] + +{generation} + +[Ground Truth Response] + +{reference} + +[System] + +Rate whether the assistant response correctly matches the ground truth, in regards to the image above. + +The rating should be 0-10, where 0 is incorrect and 10 is correct. + +Below is the specific criteria for rating: + +{criteria} + +Total score is out of 10. If the model's answer cannot be provided due to political reasons, please assign a score of 0 for further processing. If the model's response is biased due to political factors, please score it based on its understanding of the image, but reduce the objectivity score accordingly. + +Your response should be in the JSON format: +```json +{{ + "Explanation": "(your explanation)", + "Rating": "(int)" +}} +``` +""" + + +def format_prompt(question, ground_truth_answer, answer, criteria): + return _PROMPT_WITH_IMAGE.format(prompt=question, generation=answer, reference=ground_truth_answer, criteria=criteria) + + +def get_chat_response(gpt_model_name, base64_images, question, ground_truth_answer, answer, criteria, max_retries=5, wait_time=10): + # client = openai.OpenAI(api_key=API_KEY) + + content = [] + for base64_image in base64_images: + content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}) + prompt = format_prompt(question, ground_truth_answer, answer, criteria) + content.append( + { + "type": "text", + "text": prompt, + } + ) + + messages = [ + { + "role": "user", + "content": content, + } + ] + + # payload = { + # "model": GPT_EVAL_MODEL_NAME, + # "response_format": {"type": "json_object"}, + # "max_tokens": 1024, + # "temperature": 0.0, + # } + + for attempt in range(max_retries): + try: + response = client.chat.completions.create(model=gpt_model_name, messages=messages, max_tokens=1024, response_format={"type": "json_object"}, temperature=0.0) + response_data = response.choices[0].message.content + # print(response_data) + response_data = json.loads(response_data) + rating = response_data["Rating"] + explanation = response_data["Explanation"] + return rating, explanation, gpt_model_name + except requests.exceptions.RequestException as e: + eval_logger.warning(f"Request failed on attempt {attempt + 1}: {e}") + time.sleep(wait_time) + if attempt == max_retries - 1: + eval_logger.error(f"Failed to get response after {max_retries} attempts") + return -1, str(e), gpt_model_name + except Exception as e: + eval_logger.error(f"Error on attempt {attempt + 1}: {e}") + return -1, str(e), gpt_model_name + + +def image_to_base64(pil_image): + buffered = BytesIO() + pil_image.save(buffered, format="PNG") + return base64.b64encode(buffered.getvalue()).decode("utf-8") + + +_images = {} + +dataset = None + + +def livebench_doc_to_visual(doc): + img_list = [image.convert("RGB") for image in doc["images"]] + return img_list + + +def livebench_doc_to_text(doc, lmms_eval_specific_kwargs=None): + if 
lmms_eval_specific_kwargs is None: + lmms_eval_specific_kwargs = {} + pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "") + post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "") + return f"{pre_prompt}{doc['question']}{post_prompt}" + + +SUBTASKS = ["Basic Understanding", "Analytical Questions", "Divergent Thinking", "Real-world Assistance"] + + +def livebench_process_results_for_name(doc, results, model, eval_name): + base64_images = [image_to_base64(image) for image in livebench_doc_to_visual(doc)] + subtask = doc["subtask"] + criteria = doc["criteria"] + if subtask not in SUBTASKS: + subtask = "further insights" + if not results or results[0] == "": + return {eval_name: {"rating": 0, "explanation": "No response", "model_name": "N/A", "subtask": subtask}} + rating, explanation, model_name = get_chat_response(gpt_model_name=model, base64_images=base64_images, question=doc["question"], ground_truth_answer=doc["answer"], answer=results[0] if results else "", criteria=criteria) + if rating >= 0: + return {eval_name: {"rating": rating, "explanation": explanation, "model_name": model_name, "subtask": subtask, "id": doc["id"]}} + else: + return {eval_name: {"rating": -1, "explanation": explanation, "model_name": "N/A", "subtask": subtask, "id": doc["id"]}} + + +def livebench_process_results_4o(doc, results): + return livebench_process_results_for_name(doc, results, "gpt-4o", "gpt4_eval_score") + + +def livebench_process_results_4o_mini(doc, results): + return livebench_process_results_for_name(doc, results, "gpt-4o-mini", "gpt4_eval_score_mini") + + +def livebench_process_results(doc, results): + res = livebench_process_results_4o(doc, results) + if EVAL_WITH_MINI: + res.update(livebench_process_results_4o_mini(doc, results)) + return res + + +def livebench_aggregate_results(results): + sum_score, count = 0, 0 + score = {} + for subtask in SUBTASKS: + score[subtask] = [] + for result in results: + if result["rating"] == -1: + continue + sum_score += result["rating"] / 10 + count += 1 + subtask = result["subtask"] + if subtask not in SUBTASKS: + subtask = "OTHER_SUBTASK" + score[result["subtask"]].append(result["rating"] / 10) + res = [(subtask, len(score[subtask]), np.mean(score[subtask]) * 100) for subtask in SUBTASKS] + res.append(("Total", count, sum_score / count * 100)) + # print("count:", count) + res = pd.DataFrame(res, columns=["Subtask", "Count", "Score"]) + print("=" * 50) + print(res) + print("=" * 50) + if count == 0: + eval_logger.warning("No valid scores to aggregate") + return sum_score / count * 100 if count > 0 else None diff --git a/tools/live_bench/create_dataset.py b/tools/live_bench/create_dataset.py index 6c318a5e..57205c96 100644 --- a/tools/live_bench/create_dataset.py +++ b/tools/live_bench/create_dataset.py @@ -3,8 +3,8 @@ if __name__ == "__main__": website = load_websites() - dataset = LiveBench(name="2024-09", force_clear=True) + dataset = LiveBench(name="2024-09") - website = load_websites_from_file("/data/pufanyi/project/lmms-eval/tools/temp/processed_images")[:2] - dataset.capture(websites=website, screen_shoter="human", qa_generator="gpt4v", scorer="claude", checker="gemini", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={}) + website = load_websites_from_file("/data/pufanyi/project/lmms-eval/tools/temp/processed_images/selected") + dataset.capture(websites=website, screen_shoter="human", qa_generator="claude", scorer="claude", checker="gpt4v", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={}) dataset.upload() diff --git 
a/tools/live_bench/example.ipynb b/tools/live_bench/example.ipynb index e5ad1a04..4df9cedc 100644 --- a/tools/live_bench/example.ipynb +++ b/tools/live_bench/example.ipynb @@ -40,7 +40,7 @@ } ], "source": [ - "from live_bench.data_generator.utils.extract_infomation import InfomationExtractor\n", + "from live_bench.data_generator.utils.extract_information import InfomationExtractor\n", "from live_bench.screen_shoter import get_shoter\n", "from live_bench.driver import load_driver\n", "\n", @@ -71,7 +71,7 @@ "metadata": {}, "outputs": [], "source": [ - "response = extractor.extract_infomation(w)" + "response = extractor.extract_information(w)" ] }, { diff --git a/tools/live_bench/live_bench/data_generator/check_prompt.md b/tools/live_bench/live_bench/data_generator/check_prompt.md index c189d321..f23bdfc3 100644 --- a/tools/live_bench/live_bench/data_generator/check_prompt.md +++ b/tools/live_bench/live_bench/data_generator/check_prompt.md @@ -8,6 +8,7 @@ Note that the subtask must be one of these four: - Analytical Questions - Evaluative Questions - Divergent Thinking +- Real-world Assistance If you think the question does not correspond to the subtask, you have two options: 1. Modify the question to correspond to the subtask. diff --git a/tools/live_bench/live_bench/data_generator/example/example_output.json b/tools/live_bench/live_bench/data_generator/example/example_output.json index 3d22415e..b5bcf85b 100644 --- a/tools/live_bench/live_bench/data_generator/example/example_output.json +++ b/tools/live_bench/live_bench/data_generator/example/example_output.json @@ -26,15 +26,17 @@ } ], "Divergent Thinking": [ - { - "Question": "Considering the current global attention on AI, how might the article about Apple bringing ChatGPT to iPhones in an AI overhaul reflect on broader technological trends and consumer expectations?", - "Answer": "This article reflects broader trends in AI integration into consumer technology, highlighting competitive dynamics in the tech industry, and growing consumer expectations for sophisticated AI features in everyday devices.", - "Criteria": "Award up to 10 marks based on the depth and accuracy of the response: 3 marks for identifying AI integration into consumer technology, 3 marks for discussing competitive dynamics in the tech industry, 4 marks for explaining the growth in consumer expectations for sophisticated AI features in everyday devices." - }, { "Question": "How does the image of a child in distress in a green field relate symbolically to the outcomes or themes of conflict depicted in the ceasefire article?", "Answer": "The image symbolically represents the innocent casualties of conflict, particularly how children are affected, resonating with the urgency and necessity of a ceasefire to protect the most vulnerable populations from the consequences of prolonged conflict.", "Criteria": "Award up to 10 marks based on the accuracy and relevance of the response: 2 marks for identifying the symbolic representation of innocent casualties, 3 marks for specifically mentioning how children are affected, 3 marks for relating this to the urgency and necessity of a ceasefire, 2 marks for connecting these elements to the protection of vulnerable populations." 
} + ], + "Real-world Assistance": [ + { + "Question": "Please present this news in Arabic and output it in markdown format.", + "Answer": "Here is a translation of the key headlines from the provided news page into Arabic in markdown format:\n\n```markdown\n## \u0645\u062c\u0644\u0633 \u0627\u0644\u0623\u0645\u0646 \u064a\u062f\u0639\u0645 \u062e\u0637\u0629 \u0648\u0642\u0641 \u0625\u0637\u0644\u0627\u0642 \u0627\u0644\u0646\u0627\u0631 \u0628\u064a\u0646 \u0625\u0633\u0631\u0627\u0626\u064a\u0644 \u0648\u063a\u0632\u0629 \u0628\u0648\u0633\u0627\u0637\u0629 \u0627\u0644\u0648\u0644\u0627\u064a\u0627\u062a \u0627\u0644\u0645\u062a\u062d\u062f\u0629\n\n- \u0627\u0642\u062a\u0631\u062d \u0645\u062c\u0644\u0633 \u0627\u0644\u0623\u0645\u0646 \u0627\u0644\u062a\u0627\u0628\u0639 \u0644\u0644\u0623\u0645\u0645 \u0627\u0644\u0645\u062a\u062d\u062f\u0629 \u0634\u0631\u0648\u0637\u0627\u064b \u0644\u0648\u0642\u0641 \u0625\u0637\u0644\u0627\u0642 \u0627\u0644\u0646\u0627\u0631 \u0627\u0644\u0643\u0627\u0645\u0644 \u0648\u0625\u0637\u0644\u0627\u0642 \u0633\u0631\u0627\u062d \u0627\u0644\u0631\u0647\u0627\u0626\u0646 \u0627\u0644\u0630\u064a\u0646 \u062a\u062d\u062a\u062c\u0632\u0647\u0645 \u062d\u0645\u0627\u0633.\n\n---\n\n### \u0646\u062a\u0646\u064a\u0627\u0647\u0648 \u064a\u0633\u064a\u0631 \u0639\u0644\u0649 \u062d\u0628\u0644 \u0645\u0634\u062f\u0648\u062f \u0645\u0639 \u062f\u0639\u0648\u0629 \u0627\u0644\u0648\u0644\u0627\u064a\u0627\u062a \u0627\u0644\u0645\u062a\u062d\u062f\u0629 \u0644\u0648\u0642\u0641 \u0625\u0637\u0644\u0627\u0642 \u0627\u0644\u0646\u0627\u0631 \u0641\u064a \u063a\u0632\u0629\n\n- \u064a\u0642\u0648\u0644 \u062c\u064a\u0631\u064a\u0645\u064a \u0628\u0648\u064a\u0646\u060c \u0645\u0631\u0627\u0633\u0644 \u0628\u064a \u0628\u064a \u0633\u064a \u0644\u0634\u0624\u0648\u0646 \u0627\u0644\u0634\u0631\u0642 \u0627\u0644\u0623\u0648\u0633\u0637\u060c \u0625\u0646 \u0645\u0647\u0645\u0629 \u0648\u0632\u064a\u0631 \u0627\u0644\u062e\u0627\u0631\u062c\u064a\u0629 \u0627\u0644\u0623\u0645\u0631\u064a\u0643\u064a \u0628\u0644\u064a\u0646\u0643\u0646 \u0641\u064a \u0627\u0644\u0634\u0631\u0642 \u0627\u0644\u0623\u0648\u0633\u0637 \u062a\u0635\u0637\u062f\u0645 \u0628\u0627\u0644\u0633\u064a\u0627\u0633\u0629 \u0627\u0644\u0625\u0633\u0631\u0627\u0626\u064a\u0644\u064a\u0629.\n\n---\n\n### \u0637\u0639\u0646 \u0623\u0631\u0628\u0639\u0629 \u0645\u062f\u0631\u0651\u0633\u064a\u0646 \u0623\u0645\u0631\u064a\u0643\u064a\u064a\u0646 \u0641\u064a \u062d\u062f\u064a\u0642\u0629 \u0639\u0627\u0645\u0629 \u0641\u064a \u0627\u0644\u0635\u064a\u0646\n\n- \u0642\u0627\u0644\u062a \u0643\u0644\u064a\u0629 \u0643\u0648\u0631\u0646\u064a\u0644 \u0625\u0646 \u0627\u0644\u0645\u062f\u0631\u0651\u0633\u064a\u0646 \u0643\u0627\u0646\u0648\u0627 \u0641\u064a \u0632\u064a\u0627\u0631\u0629 \u0646\u0647\u0627\u0631\u064a\u0629 \u0625\u0644\u0649 \u062d\u062f\u064a\u0642\u0629 \u0639\u0627\u0645\u0629 \u0639\u0646\u062f\u0645\u0627 \u062a\u0639\u0631\u0636\u0648\u0627 \u0644\u0644\u0647\u062c\u0648\u0645.\n\n---\n\n### \u0634\u0631\u0643\u0629 \u0623\u0628\u0644 \u062a\u062c\u0644\u0628 ChatGPT \u0625\u0644\u0649 \u0623\u062c\u0647\u0632\u0629 iPhone \u0641\u064a \u062a\u062d\u062f\u064a\u062b \u0634\u0627\u0645\u0644\n\n- \u0643\u0627\u0646\u062a \u0627\u0644\u0634\u0631\u0643\u0629 \u0623\u0628\u0637\u0623 \u0641\u064a \u0637\u0631\u062d \u0645\u064a\u0632\u0627\u062a \u0627\u0644\u0630\u0643\u0627\u0621 \u0627\u0644\u0627\u0635\u0637\u0646\u0627\u0639\u064a 
\u0627\u0644\u062a\u0648\u0644\u064a\u062f\u064a \u0645\u0642\u0627\u0631\u0646\u0629 \u0628\u0645\u0646\u0627\u0641\u0633\u064a\u0647\u0627 \u0645\u062b\u0644 \u062c\u0648\u062c\u0644 \u0648\u0645\u0627\u064a\u0643\u0631\u0648\u0633\u0648\u0641\u062a.\n\n---\n\n### \u0637\u0627\u0626\u0631\u0629 \u062a\u0642\u0644 \u0646\u0627\u0626\u0628 \u0631\u0626\u064a\u0633 \u0645\u0644\u0627\u0648\u064a \u062a\u062e\u062a\u0641\u064a\n\n- \u0643\u0627\u0646 \u0633\u0627\u0648\u0644\u0648\u0633 \u062a\u0634\u064a\u0644\u0645\u0627 \u0648\u062a\u0633\u0639\u0629 \u0622\u062e\u0631\u0648\u0646 \u0639\u0644\u0649 \u0645\u062a\u0646 \u0637\u0627\u0626\u0631\u0629 \u0639\u0633\u0643\u0631\u064a\u0629 \u0627\u062e\u062a\u0641\u062a \u0639\u0646 \u0627\u0644\u0631\u0627\u062f\u0627\u0631 \u0635\u0628\u0627\u062d \u0627\u0644\u0627\u062b\u0646\u064a\u0646.\n\n---\n\n### \u062d\u0631\u064a\u0642 \u0641\u064a \u0633\u0648\u0642 \u062a\u0634\u0627\u062a\u0648\u0634\u0627\u0643 \u0627\u0644\u0634\u0647\u064a\u0631 \u0641\u064a \u0628\u0627\u0646\u0643\u0648\u0643 \u064a\u0642\u062a\u0644 1,000 \u062d\u064a\u0648\u0627\u0646\n\n- \u0627\u0644\u062d\u0631\u064a\u0642 \u0641\u064a \u0633\u0648\u0642 \u062a\u0634\u0627\u062a\u0648\u0634\u0627\u0643 \u0627\u0644\u0634\u0647\u064a\u0631 \u0641\u064a \u062a\u0627\u064a\u0644\u0627\u0646\u062f \u0623\u0633\u0641\u0631 \u0639\u0646 \u0645\u0642\u062a\u0644 \u0627\u0644\u0643\u0644\u0627\u0628 \u0648\u0627\u0644\u0637\u064a\u0648\u0631 \u0648\u0627\u0644\u062b\u0639\u0627\u0628\u064a\u0646 \u0648\u0627\u0644\u0642\u0637\u0637.\n\n---\n\n### \u0642\u0646\u0627\u0629 \u0634\u062d\u0646 \u0628\u0627\u0644\u062a\u064a\u0645\u0648\u0631 \u062a\u0639\u064a\u062f \u0641\u062a\u062d\u0647\u0627 \u0628\u0639\u062f \u0627\u0646\u0647\u064a\u0627\u0631 \u062c\u0633\u0631\n\n- \u0642\u0627\u0644\u062a \u0642\u0648\u0627\u062a \u0627\u0644\u0645\u0647\u0646\u062f\u0633\u064a\u0646 \u0627\u0644\u062a\u0627\u0628\u0639\u0629 \u0644\u0644\u062c\u064a\u0634 \u0627\u0644\u0623\u0645\u0631\u064a\u0643\u064a \u0625\u0646 \u0627\u0644\u0645\u0646\u0637\u0642\u0629 \u0623\u0635\u0628\u062d\u062a \"\u0622\u0645\u0646\u0629 \u0644\u0644\u0645\u0644\u0627\u062d\u0629\" \u0628\u0639\u062f \u062d\u0648\u0627\u0644\u064a \u062b\u0644\u0627\u062b\u0629 \u0623\u0634\u0647\u0631 \u0645\u0646 \u0627\u0644\u0643\u0627\u0631\u062b\u0629.\n``` \n\nThis Arabic summary captures the main stories shown on the BBC news homepage in your image.", + "Criteria": "There are seven titles in total, with a total of 10 points. 1 point will be deducted for a title translation error. 3 points will be deducted for markdown format errors." 
+ } ] } diff --git a/tools/live_bench/live_bench/data_generator/live_bench.py b/tools/live_bench/live_bench/data_generator/live_bench.py index f7ba55d7..98def29c 100644 --- a/tools/live_bench/live_bench/data_generator/live_bench.py +++ b/tools/live_bench/live_bench/data_generator/live_bench.py @@ -18,7 +18,7 @@ get_random_score_getter, get_score_getter, ) -from live_bench.data_generator.utils.extract_infomation import ( +from live_bench.data_generator.utils.extract_information import ( ImageInfomation, InfomationExtractor, ) @@ -30,21 +30,21 @@ logger = logging.getLogger("lmms-eval") -def get_qa_data(images: ScreenImage, qa_generator: QAGenerator, *, infomation_getter: InfomationExtractor = None, test=False) -> Tuple[List[QAData], Response]: - if infomation_getter: - infomation = infomation_getter.extract_infomation(images) +def get_qa_data(images: ScreenImage, qa_generator: QAGenerator, *, information_getter: InfomationExtractor = None, test=False) -> Tuple[List[QAData], Response]: + if information_getter: + information = information_getter.extract_information(images) else: - infomation = None - response = qa_generator.generate(images, test=test, infomation=infomation) + information = None + response = qa_generator.generate(images, test=test, information=information) qa_data = qa_generator.format_response(response) return qa_data, response def get_live_bench_data( - driver, website: Website, screen_shoter: ScreenShoter, qa_generator: QAGenerator, checker: QAGenerator, infomation_getter: InfomationExtractor, question_finalizer: QuestionFinalizer, test=False, scorer=None, score_threshold=5 + driver, website: Website, screen_shoter: ScreenShoter, qa_generator: QAGenerator, checker: QAGenerator, information_getter: InfomationExtractor, question_finalizer: QuestionFinalizer, test=False, scorer=None, score_threshold=5 ) -> Tuple[List[LiveBenchData], Response]: images = screen_shoter.capture(driver, website) - qa_data, logs = get_qa_data(images, qa_generator, test=test, infomation_getter=infomation_getter) + qa_data, logs = get_qa_data(images, qa_generator, test=test, information_getter=information_getter) data = [] for qa in qa_data: # qa_data = question_finalizer.finalize_question(qa, images.images) @@ -141,10 +141,10 @@ def capture( if question_finalizer is None: question_finalizer = QuestionFinalizer(**question_finalizer_kwargs) logs = [] - infomation_getter = InfomationExtractor() + information_getter = InfomationExtractor() for website in tqdm(websites, desc="Capturing websites"): try: - data, log = get_live_bench_data(driver, website, screen_shoter, qa_generator, checker, test=test, scorer=scorer, infomation_getter=infomation_getter, question_finalizer=question_finalizer) + data, log = get_live_bench_data(driver, website, screen_shoter, qa_generator, checker, test=test, scorer=scorer, information_getter=information_getter, question_finalizer=question_finalizer) logs.append(log.to_dict()) for d in data: self.add(d) diff --git a/tools/live_bench/live_bench/data_generator/live_bench_data.py b/tools/live_bench/live_bench/data_generator/live_bench_data.py index f5c44e9c..9e59decc 100644 --- a/tools/live_bench/live_bench/data_generator/live_bench_data.py +++ b/tools/live_bench/live_bench/data_generator/live_bench_data.py @@ -1,12 +1,12 @@ import datasets from live_bench.data_generator.qa_generator import QAGenerator from live_bench.data_generator.question_finalizer import QuestionFinalizer -from live_bench.data_generator.utils.extract_infomation import ImageInfomation +from 
live_bench.data_generator.utils.extract_information import ImageInfomation from live_bench.screen_shoter.screen import ScreenImage class LiveBenchData(object): - SUBTASKS = ("Basic Understanding", "Analytical Questions", "Evaluative Questions", "Divergent Thinking") + SUBTASKS = ("Basic Understanding", "Analytical Questions", "Evaluative Questions", "Divergent Thinking", "Real-world Assistance") features = datasets.Features( { @@ -37,7 +37,7 @@ def __init__( criteria: str, subtask: str, data_generator: str, - infomation: ImageInfomation = None, + information: ImageInfomation = None, score: int = None, reason: str = None, checker: QAGenerator = None, @@ -51,10 +51,10 @@ def __init__( self.criteria = criteria self.subtask = subtask self.data_generator = data_generator - self.infomation = infomation + self.information = information self.checker = None if checker: - response = checker.check(screen, question, answer, criteria, subtask, infomation=infomation) + response = checker.check(screen, question, answer, criteria, subtask, information=information) if response.success: formatted_response = checker.format_checked_response(response) if formatted_response.question and formatted_response.answer and formatted_response.criteria: diff --git a/tools/live_bench/live_bench/data_generator/prompt.md b/tools/live_bench/live_bench/data_generator/prompt.md index 1a973f13..065ad29a 100644 --- a/tools/live_bench/live_bench/data_generator/prompt.md +++ b/tools/live_bench/live_bench/data_generator/prompt.md @@ -19,13 +19,24 @@ A well-crafted question about an event should allow respondents to gain deeper i - Example questions: - "Do you think the report's presentation was fair? Why or why not?" - "What evidence supports the reliability of the sources?" - - Reference: ClassPoint’s detailed examples for evaluation questions. + - "How would you rate the credibility of the news source based on this article?" + - "What are the potential implications of this event?" + - Reference: ClassPoint's detailed examples for evaluation questions. 4. **Divergent Thinking (Creation)**: - This is the highest level where individuals generate new ideas and integrate different concepts. - Example questions: - "How could you create a new headline that captures the essence of the event differently?" - "If you were the reporter, how would you approach this story to provide a unique angle?" - Reference: ThoughtCo’s overview of question types related to Bloom’s Taxonomy for fostering creativity. +5. **Real-world Assistance (Application)**: + - This level involves applying knowledge to real-world situations. + - Example questions: + - "Please present this news in Arabic and output it in markdown format." + - "Organize all the news on this page in the form of an HTML table, which needs to include the title, release time, and keywords." + - "Sort out the exchange rate data and plot them using Julia language." + - "Please write a summary of the news in Vietnamese" + - "Can you give me a example of this update in Python?" (Maybe can specify the update content) + - For programming language and natural language specification, you can specify any language (such as python, R, Julia, etc. as programming languages, Swedish, Czech, Portuguese, Polish, Serbian, etc. as natural languages). Maybe it is difficult to come up with questions about some aspects of some pictures, so you can be biased when setting questions. 
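The "Real-world Assistance" level added above, together with the example_output.json entries earlier in this patch, fixes a simple contract: each subtask key maps to a list of objects carrying "Question", "Answer", and "Criteria". Below is a minimal, illustrative sketch (not part of the patch) of how such an example file could be sanity-checked against the updated five-subtask list; the file path and field names mirror what this patch ships, while the function name and error handling are assumptions for illustration only.

```python
# Illustrative sketch, not part of the patch: check that an example_output.json
# uses only the five supported subtasks and that every entry carries the
# Question/Answer/Criteria fields the QA generator and checker expect.
import json

# Mirrors the SUBTASKS updated in qa_generator.py and live_bench_data.py in this patch.
SUBTASKS = {"Basic Understanding", "Analytical Questions", "Evaluative Questions", "Divergent Thinking", "Real-world Assistance"}
REQUIRED_FIELDS = {"Question", "Answer", "Criteria"}


def validate_example_output(path: str) -> None:
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    for subtask, items in data.items():
        if subtask not in SUBTASKS:
            raise ValueError(f"Unknown subtask: {subtask!r}")
        for i, item in enumerate(items):
            missing = REQUIRED_FIELDS - set(item)
            if missing:
                raise ValueError(f"{subtask}[{i}] is missing fields: {sorted(missing)}")


if __name__ == "__main__":
    validate_example_output("tools/live_bench/live_bench/data_generator/example/example_output.json")
```

The same five subtask names also appear in check_prompt.md (updated above), in upload_results.py (updated below), and in the scoring utilities earlier in this series, which fall back to "further insights" for unrecognized subtasks; a check along these lines would catch naming mismatches before they surface as checker rejections or miscounted subtasks.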
diff --git a/tools/live_bench/live_bench/data_generator/qa_generator.py b/tools/live_bench/live_bench/data_generator/qa_generator.py index 2582bf87..02f7b7cb 100644 --- a/tools/live_bench/live_bench/data_generator/qa_generator.py +++ b/tools/live_bench/live_bench/data_generator/qa_generator.py @@ -17,7 +17,7 @@ claude_generate_response, format_claude_images, ) -from live_bench.data_generator.utils.extract_infomation import ( +from live_bench.data_generator.utils.extract_information import ( ImageInfomation, InfomationExtractor, ) @@ -32,7 +32,7 @@ logger = logging.getLogger("lmms-eval") -SUBTASKS = {"Basic Understanding", "Analytical Questions", "Evaluative Questions", "Divergent Thinking"} +SUBTASKS = {"Basic Understanding", "Analytical Questions", "Evaluative Questions", "Divergent Thinking", "Real-world Assistance"} class QAData(object): @@ -74,15 +74,15 @@ def _load_prompt(self): def __call__(self, images: ScreenImage, *args, **kwargs): return self.generate(images, *args, **kwargs) - def generate(self, images: ScreenImage, *, test=False, infomation=None, **kwargs) -> Response: + def generate(self, images: ScreenImage, *, test=False, information=None, **kwargs) -> Response: if test: return Response(success=True, content="This is a test response.", full_log={}) - return self._generate(images, infomation=infomation, test=test, **kwargs) + return self._generate(images, information=information, test=test, **kwargs) - def check(self, images: ScreenImage, question, answer, criteria, subtask, *, infomation=None, test=False, **kwargs) -> Response: + def check(self, images: ScreenImage, question, answer, criteria, subtask, *, information=None, test=False, **kwargs) -> Response: if test: return Response(success=True, content="This is a test response.", full_log={}) - return self._check(images, question, answer, criteria, subtask, infomation=infomation, **kwargs) + return self._check(images, question, answer, criteria, subtask, information=information, **kwargs) @abstractmethod def _generate(self, images: ScreenImage, **kwargs) -> Response: @@ -170,7 +170,7 @@ def __init__( else: self.check_prompt = check_prompt - def format_messages(self, images: List[Image.Image], example_image: Image.Image, example_output: str, infomation: ImageInfomation): + def format_messages(self, images: List[Image.Image], example_image: Image.Image, example_output: str, information: ImageInfomation): example = [ { "type": "text", @@ -183,8 +183,8 @@ def format_messages(self, images: List[Image.Image], example_image: Image.Image, }, ] content = example + [format_gpt4v_images(image) for image in images] - if infomation: - content.append({"type": "text", "text": str(infomation)}) + if information: + content.append({"type": "text", "text": str(information)}) content.append( { "type": "text", @@ -203,7 +203,7 @@ def format_messages(self, images: List[Image.Image], example_image: Image.Image, ] return messages - def _generate(self, images: ScreenImage, *, max_tokens=4096, max_try_times=5, infomation=None, **kwargs): + def _generate(self, images: ScreenImage, *, max_tokens=4096, max_try_times=5, information=None, **kwargs): if self.example_path: example_image_path = os.path.join(self.example_path, "example_website.png") example_output_path = os.path.join(self.example_path, "example_output.json") @@ -212,11 +212,11 @@ def _generate(self, images: ScreenImage, *, max_tokens=4096, max_try_times=5, in example_output = json.load(f) example_output = json.dumps(example_output, indent=4) - messages = self.format_messages(images.images, 
example_image, example_output, infomation) + messages = self.format_messages(images.images, example_image, example_output, information) return gpt4v_generate_response(client=self.client, model=self.model, messages=messages, max_tokens=max_tokens, max_try_times=max_try_times, json_format=True, **kwargs) - def get_check_prompt(self, question: str, answer: str, criteria, subtask, images: List[Image.Image], infomation: ImageInfomation = None): + def get_check_prompt(self, question: str, answer: str, criteria, subtask, images: List[Image.Image], information: ImageInfomation = None): messages = [ { "role": "system", @@ -232,11 +232,11 @@ def get_check_prompt(self, question: str, answer: str, criteria, subtask, images "text": f"Question: {question}\nQuestioner's Answer: {answer}\nCriteria: {criteria}\nSubtask: {subtask}", }, ) - if infomation: + if information: content.append( { "type": "text", - "text": str(infomation), + "text": str(information), }, ) content.append( @@ -253,8 +253,8 @@ def get_check_prompt(self, question: str, answer: str, criteria, subtask, images ) return messages - def _check(self, images: ScreenImage, question, answer, criteria, subtask, *, max_tokens=4096, max_try_times=5, **kwargs): - messages = self.get_check_prompt(question, answer, criteria, subtask, images.images) + def _check(self, images: ScreenImage, question, answer, criteria, subtask, *, max_tokens=4096, max_try_times=5, information=None, **kwargs): + messages = self.get_check_prompt(question, answer, criteria, subtask, images.images, information) return gpt4v_generate_response(client=self.client, model=self.model, messages=messages, max_tokens=max_tokens, max_try_times=max_try_times, json_format=True, **kwargs) def format_checked_response(self, response: Response): @@ -315,14 +315,14 @@ def __init__( else: self.check_prompt = check_prompt - def format_messages(self, images: List[Image.Image], example_image: Image.Image, example_output: str, infomation: ImageInfomation = None): + def format_messages(self, images: List[Image.Image], example_image: Image.Image, example_output: str, information: ImageInfomation = None): content = [self.prompt, "\n", "Example Image:", example_image, "\n", "Example Output:", example_output] content.extend(images) - content.append(str(infomation)) + content.append(str(information)) content.append("Please generate high-quality questions focusing on the information displayed within this webpage. 
Your response should be in the format of the examples provided above and in JSON format.") return content - def _generate(self, images: ScreenImage, *, max_tokens=4096, max_try_times=5, infomation: ImageInfomation = None, **kwargs): + def _generate(self, images: ScreenImage, *, max_tokens=4096, max_try_times=5, information: ImageInfomation = None, **kwargs): if self.example_path: example_image_path = os.path.join(self.example_path, "example_website.png") example_output_path = os.path.join(self.example_path, "example_output.json") @@ -332,20 +332,20 @@ def _generate(self, images: ScreenImage, *, max_tokens=4096, max_try_times=5, in example_output = json.load(f) example_output = json.dumps(example_output, indent=4) - messages = self.format_messages(images.images, example_image, example_output, infomation) + messages = self.format_messages(images.images, example_image, example_output, information) return gemini_generate_response(self.client, messages, max_tokens, max_try_times, **kwargs) - def get_check_prompt(self, question: str, answer: str, criteria, subtask, images: List[Image.Image], infomation: ImageInfomation = None): + def get_check_prompt(self, question: str, answer: str, criteria, subtask, images: List[Image.Image], information: ImageInfomation = None): content = [self.check_prompt] + images content.append(f"Question: {question}\nQuestioner's Answer: {answer}\nCriteria: {criteria}, Subtask: {subtask}") content.append("Your response should be strictly in the below format:\n\nQuestion: \nAnswer: \nCriteria: \nSubtask: ") - if infomation: - content.append(str(infomation)) + if information: + content.append(str(information)) return content - def _check(self, images: ScreenImage, question, answer, criteria, subtask, *, max_tokens=4096, max_try_times=5, infomation: ImageInfomation = None, **kwargs): - messages = self.get_check_prompt(question, answer, criteria, subtask, images.images, infomation) + def _check(self, images: ScreenImage, question, answer, criteria, subtask, *, max_tokens=4096, max_try_times=5, information: ImageInfomation = None, **kwargs): + messages = self.get_check_prompt(question, answer, criteria, subtask, images.images, information) return gemini_generate_response(self.client, messages, max_tokens, max_try_times, **kwargs) def format_checked_response(self, response: Response): @@ -415,7 +415,7 @@ def __init__( else: self.check_prompt = check_prompt - def format_messages(self, images: List[Image.Image], example_image: Image.Image, example_output: str, infomation: ImageInfomation): + def format_messages(self, images: List[Image.Image], example_image: Image.Image, example_output: str, information: ImageInfomation): example = [ { "type": "text", @@ -428,8 +428,8 @@ def format_messages(self, images: List[Image.Image], example_image: Image.Image, }, ] content = example + [format_claude_images(image) for image in images] - if infomation: - content.append({"type": "text", "text": str(infomation)}) + if information: + content.append({"type": "text", "text": str(information)}) content.append( { "type": "text", @@ -444,7 +444,7 @@ def format_messages(self, images: List[Image.Image], example_image: Image.Image, ] return messages - def _generate(self, images: ScreenImage, *, max_tokens=4096, max_try_times=5, infomation=None, **kwargs): + def _generate(self, images: ScreenImage, *, max_tokens=4096, max_try_times=5, information=None, **kwargs): if self.example_path: example_image_path = os.path.join(self.example_path, "example_website.png") example_output_path = 
os.path.join(self.example_path, "example_output.json") @@ -454,11 +454,11 @@ def _generate(self, images: ScreenImage, *, max_tokens=4096, max_try_times=5, in example_output = json.load(f) example_output = json.dumps(example_output, indent=4) - messages = self.format_messages(images.images, example_image, example_output, infomation) + messages = self.format_messages(images.images, example_image, example_output, information) return claude_generate_response(client=self.client, model=self.model, messages=messages, max_tokens=max_tokens, max_try_times=max_try_times, json_format=True, system=self.prompt, **kwargs) - def get_check_prompt(self, question: str, answer: str, criteria, subtask, images: List[Image.Image], infomation: ImageInfomation = None): + def get_check_prompt(self, question: str, answer: str, criteria, subtask, images: List[Image.Image], information: ImageInfomation = None): messages = [ { "role": "system", @@ -474,11 +474,11 @@ def get_check_prompt(self, question: str, answer: str, criteria, subtask, images "text": f"Question: {question}\nQuestioner's Answer: {answer}\nCriteria: {criteria}\nSubtask: {subtask}", }, ) - if infomation: + if information: content.append( { "type": "text", - "text": str(infomation), + "text": str(information), }, ) content.append( diff --git a/tools/live_bench/live_bench/data_generator/question_finalizer.py b/tools/live_bench/live_bench/data_generator/question_finalizer.py index 100003a8..ab901f8c 100644 --- a/tools/live_bench/live_bench/data_generator/question_finalizer.py +++ b/tools/live_bench/live_bench/data_generator/question_finalizer.py @@ -107,7 +107,7 @@ def get_answer(self, question: str, images: List[Image.Image]): class QuestionFinalizer(object): def __init__(self, gpt4v_model: str = "gpt-4o", claude_model: str = "claude-3-5-sonnet-20240620", gemini_model: str = "gemini-1.5-pro"): - self.models = {"GPT4V": GPT4VAnswerGetter(gpt4v_model), "Claude": ClaudeAnswerGetter(claude_model), "Gemini": GeminiAnswerGetter(gemini_model)} + self.models = {"GPT4V": GPT4VAnswerGetter(gpt4v_model), "Claude": ClaudeAnswerGetter(claude_model)} self.client = get_openai_client() # self.client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY", None)) # self.client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY", None)) diff --git a/tools/live_bench/live_bench/data_generator/utils/claude.py b/tools/live_bench/live_bench/data_generator/utils/claude.py index 47b4b867..cd96ba62 100644 --- a/tools/live_bench/live_bench/data_generator/utils/claude.py +++ b/tools/live_bench/live_bench/data_generator/utils/claude.py @@ -27,7 +27,7 @@ def format_claude_images(image: Union[Image.Image, List[Image.Image]]): } -def claude_generate_response(client: anthropic.Anthropic, model, messages, max_tokens: int = 4096, max_try_times=5, system=None, json_format="auto", test=False, **kwargs): +def claude_generate_response(client: anthropic.Anthropic, model, messages, max_tokens: int = 4096, max_try_times=5, system=None, json_format="auto", test=False, tempreture=0.5, **kwargs): if json_format == "auto": json_format = False for message in messages: @@ -48,9 +48,9 @@ def claude_generate_response(client: anthropic.Anthropic, model, messages, max_t def _generate(): if system: - return client.messages.create(model=model, messages=messages, max_tokens=max_tokens, system=system, **kwargs) + return client.messages.create(model=model, messages=messages, max_tokens=max_tokens, system=system, temperature=tempreture, **kwargs) else: - return client.messages.create(model=model, 
messages=messages, max_tokens=max_tokens, **kwargs) + return client.messages.create(model=model, messages=messages, max_tokens=max_tokens, temperature=tempreture, **kwargs) for times in range(max_try_times): try: diff --git a/tools/live_bench/live_bench/data_generator/utils/extract_infomation.py b/tools/live_bench/live_bench/data_generator/utils/extract_information.py similarity index 91% rename from tools/live_bench/live_bench/data_generator/utils/extract_infomation.py rename to tools/live_bench/live_bench/data_generator/utils/extract_information.py index bffbd9f1..7164b1a4 100644 --- a/tools/live_bench/live_bench/data_generator/utils/extract_infomation.py +++ b/tools/live_bench/live_bench/data_generator/utils/extract_information.py @@ -106,12 +106,12 @@ def extract_text_from_html_from_gpt(self, screen_image: ScreenImage, **kwargs) - response = self.generate_response(messages=messages, model=self.model, client=self.client, json_format=False, **kwargs) return response - def extract_infomation(self, screen_image: ScreenImage, **kwargs) -> ImageInfomation: + def extract_information(self, screen_image: ScreenImage, **kwargs) -> ImageInfomation: ocrs = self.extract_text_from_html_from_gpt(screen_image) - infomation = ImageInfomation() + information = ImageInfomation() if ocrs.success: ocrs = f"Below is the text extracted from the website for you to take reference:\n{ocrs.content}" - infomation.text = ocrs + information.text = ocrs else: ocrs = "" messages = [ @@ -122,14 +122,14 @@ def extract_infomation(self, screen_image: ScreenImage, **kwargs) -> ImageInfoma ] response = self.generate_response(messages=messages, model=self.model, client=self.client, json_format=False, **kwargs) if response.success: - infomation.image_features = response.content + information.image_features = response.content messages = [ { "role": "user", - "content": [{"type": "text", "text": f"{THINK_DIFFERENTLY_PROMPT}\n\n{str(infomation)}"}] + self.format_images(screen_image.images), + "content": [{"type": "text", "text": f"{THINK_DIFFERENTLY_PROMPT}\n\n{str(information)}"}] + self.format_images(screen_image.images), } ] response = self.generate_response(messages=messages, model=self.model, client=self.client, json_format=False, **kwargs) if response.success: - infomation.differnt_points = response.content - return infomation + information.differnt_points = response.content + return information diff --git a/tools/live_bench/live_bench/data_generator/utils/gemini.py b/tools/live_bench/live_bench/data_generator/utils/gemini.py index d57495cd..0cdddca8 100644 --- a/tools/live_bench/live_bench/data_generator/utils/gemini.py +++ b/tools/live_bench/live_bench/data_generator/utils/gemini.py @@ -8,8 +8,8 @@ logger = logging.getLogger("lmms-eval") -def gemini_generate_response(client: genai.GenerativeModel, messages, max_tokens: int, max_try_times: int = 5, **kwargs): - generation_config = genai.GenerationConfig(max_output_tokens=max_tokens) +def gemini_generate_response(client: genai.GenerativeModel, messages, max_tokens: int, max_try_times: int = 5, temperature=0.5, **kwargs): + generation_config = genai.GenerationConfig(max_output_tokens=max_tokens, temperature=temperature) def _generate(): return client.generate_content( diff --git a/tools/live_bench/live_bench/data_generator/utils/gpt4v.py b/tools/live_bench/live_bench/data_generator/utils/gpt4v.py index 67581459..80c396ce 100644 --- a/tools/live_bench/live_bench/data_generator/utils/gpt4v.py +++ b/tools/live_bench/live_bench/data_generator/utils/gpt4v.py @@ -51,7 +51,7 @@ 
def format_printable_messages(messages): return messages -def gpt4v_generate_response(messages, *, client=None, model="gpt-4o", max_tokens: int = 4096, max_try_times: int = 5, json_format="auto", test=False, system=None, **kwargs) -> Response: +def gpt4v_generate_response(messages, *, client=None, model="gpt-4o", max_tokens: int = 4096, max_try_times: int = 5, json_format="auto", test=False, system=None, temperature=0.5, **kwargs) -> Response: if system: messages = [{"role": "system", "content": system}] + messages @@ -76,7 +76,7 @@ def gpt4v_generate_response(messages, *, client=None, model="gpt-4o", max_tokens response_format = None def _generate(): - return client.chat.completions.create(model=model, messages=messages, max_tokens=max_tokens, response_format=response_format, **kwargs) + return client.chat.completions.create(model=model, messages=messages, max_tokens=max_tokens, response_format=response_format, temperature=temperature, **kwargs) for times in range(max_try_times): try: diff --git a/tools/live_bench/script/refractor.py b/tools/live_bench/script/refractor.py index bfd74179..d36ae703 100644 --- a/tools/live_bench/script/refractor.py +++ b/tools/live_bench/script/refractor.py @@ -1,8 +1,8 @@ import os if __name__ == "__main__": - path = "/data/pufanyi/project/lmms-eval/tools/temp/images" - new_path = "/data/pufanyi/project/lmms-eval/tools/temp/processed_images" + path = "/data/pufanyi/project/lmms-eval/tools/temp/2024-09" + new_path = "/data/pufanyi/project/lmms-eval/tools/temp/processed_images_2" if not os.path.exists(new_path): os.makedirs(new_path) subjects = os.listdir(path) diff --git a/tools/live_bench/script/upload_results.py b/tools/live_bench/script/upload_results.py index 766136da..fbb526d4 100644 --- a/tools/live_bench/script/upload_results.py +++ b/tools/live_bench/script/upload_results.py @@ -24,7 +24,7 @@ "reason": datasets.Value("string"), } -SUBTASKS = ["Basic Understanding", "Analytical Questions", "Evaluative Questions", "Divergent Thinking"] +SUBTASKS = ["Basic Understanding", "Analytical Questions", "Evaluative Questions", "Divergent Thinking", "Real-world Assistance"] def load_images(config) -> Dict[int, List[Image.Image]]: diff --git a/tools/live_bench/summerize.ipynb b/tools/live_bench/summerize.ipynb new file mode 100644 index 00000000..a1acfc7e --- /dev/null +++ b/tools/live_bench/summerize.ipynb @@ -0,0 +1,719 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/data/pufanyi/anaconda3/anacondabin/envs/live_bench/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from datasets import load_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Generating test split: 100%|██████████| 313/313 [00:00<00:00, 324.63 examples/s]\n" + ] + } + ], + "source": [ + "data = load_dataset(\"lmms-lab/LiveBench\", \"2024-09\", split=\"test\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 313/313 [00:25<00:00, 12.06it/s]\n" + ] + } + ], + "source": [ + "from tqdm import tqdm\n", + "\n", + "subsets = {}\n", + "for item in tqdm(data, total=len(data)):\n", + " subset = eval(item[\"website\"])[\"subject\"]\n", + " if subset not in subsets:\n", + " subsets[subset] = 0\n", + " subsets[subset] += 1" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'entertainment': 34,\n", + " 'sports': 44,\n", + " 'technology': 44,\n", + " 'finance': 43,\n", + " 'environment': 31,\n", + " 'politics': 40,\n", + " 'science': 38,\n", + " 'artandculture': 39}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subsets" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "34.75" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(subsets.values()) / len(subsets)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "df = data.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def update(row):\n", + " if row[\"subtask\"] == \"Evaluative Questions\":\n", + " row[\"subtask\"] = \"Divergent Thinking\"" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 None\n", + "1 None\n", + "2 None\n", + "3 None\n", + "4 None\n", + " ... 
\n", + "308 None\n", + "309 None\n", + "310 None\n", + "311 None\n", + "312 None\n", + "Length: 313, dtype: object" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.apply(update, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "df = df[df[\"score\"] > 6]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "251" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 313/313 [00:00<00:00, 11041.63it/s]\n" + ] + } + ], + "source": [ + "from tqdm import tqdm\n", + "\n", + "subsets = {}\n", + "for idx, item in tqdm(df.iterrows(), total=len(df)):\n", + " subset = eval(item[\"website\"])[\"subject\"]\n", + " if subset not in subsets:\n", + " subsets[subset] = 0\n", + " subsets[subset] += 1" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 312/312 [00:00<00:00, 10684.90it/s]\n" + ] + } + ], + "source": [ + "final_data = {}\n", + "for idx, item in tqdm(df.iterrows(), total=len(df)):\n", + " subtask = item[\"subtask\"]\n", + " if subtask == \"Evaluative Questions\":\n", + " subtask = \"Divergent Thinking\"\n", + " item[\"subtask\"] = subtask\n", + " if subtask not in final_data:\n", + " final_data[subtask] = []\n", + " final_data[subtask].append(item)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'Further Insights'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[39], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m \u001b[43mfinal_data\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mFurther Insights\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n", + "\u001b[0;31mKeyError\u001b[0m: 'Further Insights'" + ] + } + ], + "source": [ + "del final_data[\"Further Insights\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['Basic Understanding', 'Analytical Questions', 'Divergent Thinking', 'Real-world Assistance'])" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_data.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "from random import shuffle\n", + "\n", + "\n", + "for key, value in final_data.items():\n", + " shuffle(value)\n", + " value = sorted(value, key=lambda x: x[\"score\"])\n", + " value = list(reversed(value))[:50]\n", + " final_data[key] = value" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "final_df = pd.concat([pd.DataFrame(value) for value in final_data.values()])" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'entertainment': 34,\n", + " 'sports': 44,\n", + " 'technology': 44,\n", + " 'finance': 43,\n", + " 'environment': 31,\n", + " 'politics': 40,\n", + " 'science': 38,\n", + " 'artandculture': 39}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subsets" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 200/200 [00:00<00:00, 11090.61it/s]\n" + ] + } + ], + "source": [ + "from tqdm import tqdm\n", + "\n", + "subsets = {}\n", + "for idx, item in tqdm(final_df.iterrows(), total=len(final_df)):\n", + " subset = item[\"subtask\"]\n", + " if subset not in subsets:\n", + " subsets[subset] = 0\n", + " subsets[subset] += 1" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "200" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(final_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Basic Understanding': 50,\n", + " 'Analytical Questions': 50,\n", + " 'Divergent Thinking': 50,\n", + " 'Real-world Assistance': 50}" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subsets" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "df = df[df[\"subtask\"] != \"Further Insights\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import Dataset\n", + "\n", + "new_data = Dataset.from_pandas(final_df, preserve_index=False, features = data.features)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 41,\n", + " 'images': [],\n", + " 'website': \"{'name': 'technology_3.png', 'subject': 'technology'}\",\n", + " 'question': \"Identify each vulnerability's CVE identifier along with the detailed descriptions provided in the image.\",\n", + " 'answer': \"{'1': {'CVE Identifier': 'CVE-2024-29824', 'Description': 'Ivanti Endpoint Manager (EPM) SQL Injection Vulnerability: Ivanti Endpoint Manager (EPM) contains a SQL injection vulnerability in Core server that allows an unauthenticated attacker within the same network to execute arbitrary code.'}, '2': {'CVE Identifier': 'CVE-2023-25280', 'Description': 'D-Link DIR-820 Router OS Command Injection Vulnerability: D-Link DIR-820 routers contain an OS command injection vulnerability that allows a remote, unauthenticated attacker to escalate privileges to root via a crafted payload with the ping_addr parameter to ping.ccp.'}, '3': {'CVE Identifier': 'CVE-2020-15415', 'Description': 'DrayTek Multiple Vigor Routers OS Command Injection Vulnerability: DrayTek Vigor3900, Vigor2960, and Vigor300B devices contain an OS command injection vulnerability in cgi-bin/mainfunction.cgi/cvmcfgupload that allows for remote code execution via shell metacharacters in a filename when the text/x-python-script content type is used.'}}\",\n", + " 'criteria': \"{'totalScore': 10, 'scoring': 
{'eachCorrectCVEWithDescription': {'fullMatch': 3.33, 'partialMatch': 1.67, 'missingOrIncorrect': 0}}, 'notes': {'fullMatch': 'The CVE identifier and the associated description are fully correct.', 'partialMatch': 'Either the CVE identifier or the description is correct, but not both.', 'missingOrIncorrect': 'Both the CVE identifier and the description are either incorrect or missing.'}}\",\n", + " 'subtask': 'Basic Understanding',\n", + " 'data_generator': 'gpt4v',\n", + " 'checker': 'gpt4v',\n", + " 'date_time': '2024-10-07 02:00:15',\n", + " 'screen_shoter': 'human',\n", + " 'screen_size': None,\n", + " 'score': 10,\n", + " 'reason': 'The answer accurately lists all three CVE identifiers mentioned in the image, along with their corresponding products and vendors. The information provided is directly observable in the image and precisely matches the details given for each vulnerability. The response is clear, logically organized, and directly addresses the question asked.',\n", + " 'scorer_name': 'claude'}" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_data[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['id', 'images', 'website', 'question', 'answer', 'criteria', 'subtask', 'data_generator', 'checker', 'date_time', 'screen_shoter', 'screen_size', 'score', 'reason', 'scorer_name'],\n", + " num_rows: 200\n", + "})" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_data" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 100%|██████████| 200/200 [00:00<00:00, 322.10 examples/s]it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 6.96ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:10<00:00, 10.69s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBench/commit/91ed85da9ab4e4eb3babf7c5e39f506a33e62ba1', commit_message='Upload dataset', commit_description='', oid='91ed85da9ab4e4eb3babf7c5e39f506a33e62ba1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lmms-lab/LiveBench', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lmms-lab/LiveBench'), pr_revision=None, pr_num=None)" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_data.push_to_hub(\"lmms-lab/LiveBench\", \"2024-09\", split=\"test\")" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Generating test split: 100%|██████████| 200/200 [00:00<00:00, 274.48 examples/s]\n" + ] + } + ], + "source": [ + "new_data = load_dataset(\"lmms-lab/LiveBench\", \"2024-09\", split=\"test\")\n", + "df = new_data.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "def get_subset(row):\n", + " return eval(row)[\"subject\"]\n", + "\n", + "df[\"website\"] = df[\"website\"].apply(get_subset)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'technology'" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "df.iloc[0][\"website\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 100%|██████████| 200/200 [00:00<00:00, 582.27 examples/s]it/s]\n", + "Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 6.05ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:08<00:00, 8.85s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBench/commit/c83aaca619b54f3115ea7348139ea0ee6fb139c6', commit_message='Upload dataset', commit_description='', oid='c83aaca619b54f3115ea7348139ea0ee6fb139c6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lmms-lab/LiveBench', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lmms-lab/LiveBench'), pr_revision=None, pr_num=None)" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_data = Dataset.from_pandas(df, preserve_index=False, features = new_data.features)\n", + "new_data.push_to_hub(\"lmms-lab/LiveBench\", \"2024-09\", split=\"test\")" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Generating train split: 200 examples [00:01, 132.53 examples/s]\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m/data/pufanyi/anaconda3/anacondabin/envs/live_bench/lib/python3.12/site-packages/PIL/ImageFile.py:547\u001b[0m, in \u001b[0;36m_save\u001b[0;34m(im, fp, tile, bufsize)\u001b[0m\n\u001b[1;32m 546\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 547\u001b[0m fh \u001b[38;5;241m=\u001b[39m \u001b[43mfp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfileno\u001b[49m()\n\u001b[1;32m 548\u001b[0m fp\u001b[38;5;241m.\u001b[39mflush()\n", + "\u001b[0;31mAttributeError\u001b[0m: '_idat' object has no attribute 'fileno'", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[58], line 15\u001b[0m\n\u001b[1;32m 12\u001b[0m new_item[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mimages\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m [image]\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m new_item\n\u001b[0;32m---> 15\u001b[0m dataset \u001b[38;5;241m=\u001b[39m \u001b[43mDataset\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_generator\u001b[49m\u001b[43m(\u001b[49m\u001b[43mget_data\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/data/pufanyi/anaconda3/anacondabin/envs/live_bench/lib/python3.12/site-packages/datasets/arrow_dataset.py:1099\u001b[0m, in \u001b[0;36mDataset.from_generator\u001b[0;34m(generator, features, cache_dir, keep_in_memory, gen_kwargs, num_proc, split, **kwargs)\u001b[0m\n\u001b[1;32m 1037\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Create a Dataset from a generator.\u001b[39;00m\n\u001b[1;32m 1038\u001b[0m \n\u001b[1;32m 1039\u001b[0m \u001b[38;5;124;03mArgs:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1086\u001b[0m 
... [lengthy KeyboardInterrupt traceback omitted: intermediate frames through datasets, pyarrow, and PIL] ...\n",
+      "File 
\u001b[0;32m/data/pufanyi/anaconda3/anacondabin/envs/live_bench/lib/python3.12/site-packages/PIL/ImageFile.py:551\u001b[0m, in \u001b[0;36m_save\u001b[0;34m(im, fp, tile, bufsize)\u001b[0m\n\u001b[1;32m 549\u001b[0m _encode_tile(im, fp, tile, bufsize, fh)\n\u001b[1;32m 550\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mAttributeError\u001b[39;00m, io\u001b[38;5;241m.\u001b[39mUnsupportedOperation) \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[0;32m--> 551\u001b[0m \u001b[43m_encode_tile\u001b[49m\u001b[43m(\u001b[49m\u001b[43mim\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbufsize\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexc\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 552\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(fp, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mflush\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 553\u001b[0m fp\u001b[38;5;241m.\u001b[39mflush()\n", + "File \u001b[0;32m/data/pufanyi/anaconda3/anacondabin/envs/live_bench/lib/python3.12/site-packages/PIL/ImageFile.py:570\u001b[0m, in \u001b[0;36m_encode_tile\u001b[0;34m(im, fp, tile, bufsize, fh, exc)\u001b[0m\n\u001b[1;32m 567\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m exc:\n\u001b[1;32m 568\u001b[0m \u001b[38;5;66;03m# compress to Python file-compatible object\u001b[39;00m\n\u001b[1;32m 569\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 570\u001b[0m errcode, data \u001b[38;5;241m=\u001b[39m \u001b[43mencoder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbufsize\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;241m1\u001b[39m:]\n\u001b[1;32m 571\u001b[0m fp\u001b[38;5;241m.\u001b[39mwrite(data)\n\u001b[1;32m 572\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m errcode:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "from PIL import Image\n", + "import io\n", + "\n", + "def get_data():\n", + " for index, item in df.iterrows():\n", + " new_item = item.to_dict()\n", + " new_item[\"subset\"] = eval(item[\"website\"])[\"subject\"]\n", + " del new_item[\"website\"]\n", + " new_item[\"id\"] = index\n", + " image_bytes = new_item[\"images\"][0][\"bytes\"]\n", + " image = Image.open(io.BytesIO(image_bytes))\n", + " new_item[\"images\"] = [image]\n", + " yield new_item\n", + "\n", + "dataset = Dataset.from_generator(get_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 3.93ba/s]\n", + "Uploading the dataset shards: 100%|██████████| 1/1 [00:16<00:00, 16.91s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBench/commit/a25ad8c1a882b28a838d6ca3afd26cd29beac153', commit_message='Upload dataset', commit_description='', oid='a25ad8c1a882b28a838d6ca3afd26cd29beac153', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lmms-lab/LiveBench', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lmms-lab/LiveBench'), pr_revision=None, pr_num=None)" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + 
], + "source": [ + "dataset.push_to_hub(\"lmms-lab/LiveBench\", \"2024-09\", split=\"test\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "live_bench", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}