From d1a9db58d3df1a2e04a54bcea0ef292a25c33484 Mon Sep 17 00:00:00 2001 From: fistyee Date: Tue, 24 Dec 2024 10:19:08 +0800 Subject: [PATCH 01/26] [Feature] Support MMLU-CF Benchmark --- configs/datasets/mmlu_cf/README.md | 614 ++++++++++++++++++ .../datasets/mmlu_cf/mmlu_cf_categories.py | 16 + configs/datasets/mmlu_cf/mmlu_cf_few_shot.py | 64 ++ configs/datasets/mmlu_cf/mmlu_cf_gen.py | 64 ++ configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py | 64 ++ 5 files changed, 822 insertions(+) create mode 100644 configs/datasets/mmlu_cf/README.md create mode 100644 configs/datasets/mmlu_cf/mmlu_cf_categories.py create mode 100644 configs/datasets/mmlu_cf/mmlu_cf_few_shot.py create mode 100644 configs/datasets/mmlu_cf/mmlu_cf_gen.py create mode 100644 configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py diff --git a/configs/datasets/mmlu_cf/README.md b/configs/datasets/mmlu_cf/README.md new file mode 100644 index 000000000..3181c5baa --- /dev/null +++ b/configs/datasets/mmlu_cf/README.md @@ -0,0 +1,614 @@ +# MMLU-CF: A Contamination-free Multi-task Language Understanding Benchmark + +
+ +![](https://img.shields.io/badge/Task-MMLU_CF-orange) +![](https://img.shields.io/badge/Data-Released-green) +![](https://img.shields.io/badge/Code_License-MIT-blue) + +
+ +

+ [📜 Paper] • + [🤗 HF Dataset] • + [🐱 GitHub] +

+ +## 📢 News and Updates +[2024.12.01] 🔥We have initialized the repository. +[2024.12.16] 🔥We have added the evaluation results of Phi-4-14B and Llama-3.3-70B-Instruct. +[2024.12.20] 🔥We have released the validation dataset of MMLU-CF. + + +## 1. The Motivation of MMLU-CF + + + +- The open-source nature of these benchmarks and the broad sources of training data for LLMs have inevitably led to benchmark contamination, resulting in unreliable evaluation results. To alleviate this issue, we propose MMLU-CF. +- (a) An instance of leakage in MMLU. When questions are used as prompt from the MMLU, certain LLMs, due to their memorization capabilities, directly provide **choices identical to the original ones**. (b) When questions are used as prompt from the MMLU-CF, LLMs only provide guessed choices. +This indicates that the MMLU test set suffers from data contamination and memorization by some LLMs, while the proposed MMLU-CF avoids such leakage. +

+ Fig1_a + Fig1_b +

+ + +## 2. How to Evaluate Your Models on the MMLU-CF Validation/Test Set + + #### (1) We perform automated testing only on Huggingface models. After following the steps outlined below and obtaining the validation set results from [OpenCompass](https://github.com/open-compass/opencompass), the test set results can then be accessed via GitHub Issues. + + **Step 1**. **Validation set evaluation**: Obtaining the validation results for your model using LLM evaluation tools, [OpenCompass](https://github.com/open-compass/opencompass). The validation dataset download from [🤗 Huggingface](https://huggingface.co/datasets/microsoft/MMLU-CF). The data directory structure in the opencompass: + +``` +data +└── mmlu_cf +    ├── dev + └── val +``` + + **Step 2**. **Test set evaluation**: With the validation results, submit a GitHub issue on the [MMLU-CF](https://github.com/) GitHub homepage to request the test set results. Please follow the format below: + +Example 1, +``` +Title: +Test set evaluation Request - add HF model [microsoft/phi-4] +Content: +The result on validation set: 68.5% +``` +Example 2, +

+ Fig6 +

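+For reference, a minimal way to launch the validation-set run with OpenCompass is sketched below. It assumes the validation data has been placed under `data/mmlu_cf/` as shown above and uses the `configs/eval_mmlu_cf.py` config provided with OpenCompass (the model configs imported there are only examples and can be replaced with your own):
+
+```
+# Minimal sketch: run the MMLU-CF validation evaluation from the OpenCompass root,
+# assuming the dataset lives under ./data/mmlu_cf/ with dev/ and val/ splits.
+python run.py configs/eval_mmlu_cf.py
+```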
+ + **Notably**: + - Ensure you use the format with square brackets `[ ]` as shown. The model name **microsoft/phi-4** corresponds to the name on HuggingFace. + - We will automatically submit your model. The time to receive the results depends on the number of models being evaluated, but it typically takes **1-2 weeks**. + + #### (2) For API models, if OpenCompass updates the model interface, you can obtain the test set results by sending a temporary key to [Email](yangyu.huang@microsoft.com) after receiving the validation set results. + + +## 3. What is the Difference between MMLU-CF and MMLU +MMLU focuses on the breadth and reasoning without considering contamination prevention. We apply three decontamination rules to mitigate unintentional data leakage while collecting data from a broader domain. Meanwhile, our MMLU-CF benchmark maintains the test set closed-source to prevent malicious data leakage. + +

+ Fig4 + + Fig5 +

+ + +## 4. Leaderboard + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Model | MMLU 5-shot | MMLU-CF 5-shot Test | MMLU-CF 5-shot Validation | 5-shot Δ | MMLU-CF 0-shot Test | MMLU-CF 0-shot Validation | 0-shot Δ |
+|---|---|---|---|---|---|---|---|
+| **API** | | | | | | | |
+| GPT-4o | 88.0 | 73.4 | 73.4 | +0.0 | 71.9 | 72.4 | -0.5 |
+| GPT-4-Turbo | 86.5 | 70.4 | 70.1 | +0.3 | 68.9 | 68.7 | +0.1 |
+| GPT-4o-mini | 81.8 | 65.5 | 65.1 | +0.4 | 66.0 | 65.3 | +0.7 |
+| Gemini-1.5-Flash | 78.7 | 64.8 | 64.9 | -0.1 | 56.7 | 56.9 | -0.2 |
+| GPT-3.5-Turbo | 71.4 | 58.2 | 59.0 | -0.8 | 57.2 | 58.1 | -0.9 |
+| **Large** | | | | | | | |
+| Qwen2.5-72B-instruct | 85.3 | 71.6 | 71.3 | +0.3 | 70.6 | 70.4 | +0.2 |
+| Llama-3-70B-instruct | 82.0 | 68.9 | 68.8 | +0.1 | 68.1 | 67.4 | +0.7 |
+| Llama-3.3-70B-instruct | 86.3 | 68.8 | 67.8 | +1.0 | 67.6 | 67.5 | +0.1 |
+| Llama-3.1-70B-instruct | 86.0 | 68.7 | 68.1 | +0.6 | 70.4 | 69.7 | +0.7 |
+| Phi-3.5-MoE-instruct | 78.9 | 64.6 | 64.5 | +0.1 | 63.1 | 62.1 | +1.0 |
+| Qwen2-72B-instruct | 82.3 | 63.7 | 64.3 | -0.6 | 62.4 | 62.5 | -0.1 |
+| Mixtral-8x22B-instruct | 76.2 | 62.8 | 62.5 | +0.3 | 65.3 | 64.8 | +0.5 |
+| Qwen1.5-72B-chat | 75.6 | 59.8 | 60.2 | -0.4 | 59.1 | 59.6 | -0.5 |
+| Llama-2-70B-chat | 68.9 | 52.2 | 51.8 | +0.4 | 51.2 | 50.9 | +0.3 |
+| **Medium** | | | | | | | |
+| Qwen2.5-32B-instruct | 83.9 | 69.7 | 68.8 | +0.9 | 68.9 | 68.8 | +0.1 |
+| Phi-4-14B | 84.8 | 67.8 | 68.5 | -0.7 | 68.5 | 69.4 | -0.9 |
+| Qwen2.5-14B-instruct | 79.9 | 66.4 | 66.1 | +0.3 | 67.0 | 66.0 | +1.0 |
+| Phi-3-medium-instruct | 77.9 | 64.2 | 64.2 | +0.0 | 62.5 | 62.7 | -0.2 |
+| Gemma2-27B | 75.2 | 63.9 | 63.5 | +0.4 | 64.2 | 64.0 | +0.2 |
+| Yi-1.5-34B-chat | 76.8 | 61.3 | 60.5 | +0.8 | 60.6 | 59.5 | +1.1 |
+| Mixtral-8x7B-instruct-v0.1 | 70.5 | 58.3 | 57.1 | -1.2 | 58.9 | 58.5 | +0.4 |
+| Deepseek-v2-lite-chat | 55.7 | 49.3 | 48.7 | +0.6 | 48.2 | 47.7 | +0.5 |
+| Baichuan-2-13B-chat | 57.3 | 48.3 | 48.6 | -0.3 | 47.1 | 48.1 | -1.0 |
+| Llama-2-13B-chat | 54.8 | 42.8 | 42.1 | +0.7 | 44.8 | 44.6 | +0.2 |
+| **Small** | | | | | | | |
+| Qwen2.5-7B-instruct | 75.4 | 61.3 | 60.4 | +0.9 | 59.3 | 58.6 | +0.7 |
+| Qwen2-7B-instruct | 70.5 | 58.1 | 57.9 | +0.2 | 58.3 | 57.4 | +0.9 |
+| Glm-4-9B-chat | 72.4 | 57.8 | 57.9 | -0.1 | 58.6 | 58.7 | -0.1 |
+| Internlm-2.5-7B-chat | 72.8 | 57.3 | 56.8 | +0.5 | 57.9 | 56.9 | +1.0 |
+| Llama-3-8B-instruct | 68.4 | 57.3 | 56.5 | +0.8 | 56.4 | 55.4 | +1.0 |
+| Llama-3.1-8B-instruct | 68.1 | 57.1 | 57.9 | -0.8 | 56.1 | 56.1 | +0.0 |
+| Gemma-2-9B | 71.3 | 53.7 | 53.3 | +0.4 | 32.1 | 31.2 | +0.9 |
+| Yi-1.5-6B-chat | 62.8 | 52.8 | 51.4 | +1.4 | 52.2 | 51.9 | +0.3 |
+| Mistral-7B-instruct-v0.3 | 60.3 | 50.7 | 50.9 | -0.2 | 51.1 | 50.9 | +0.2 |
+| Baichuan-2-7B-chat | 52.9 | 44.5 | 43.9 | +0.6 | 43.9 | 44.0 | -0.1 |
+| Llama-2-7B-chat | 45.3 | 39.4 | 38.5 | +0.9 | 41.9 | 40.9 | +1.0 |
+| **Mini** | | | | | | | |
+| Phi-3-mini-instruct (3.8B) | 70.9 | 57.9 | 58.1 | -0.2 | 58.2 | 57.5 | +0.7 |
+| Phi-3.5-mini-instruct (3.8B) | 69.1 | 57.9 | 57.4 | +0.5 | 58.3 | 57.7 | +0.6 |
+| Qwen2.5-3B-instruct | 64.4 | 55.9 | 56.4 | -0.5 | 54.3 | 53.9 | +0.4 |
+| Qwen2.5-1.5B-instruct | 50.7 | 51.2 | 51.0 | +0.2 | 50.7 | 50.4 | +0.3 |
+| Qwen2-1.5B-instruct | 52.4 | 47.1 | 47.5 | -0.4 | 45.2 | 44.5 | +0.7 |
+| Gemma-2-2B | 51.3 | 43.9 | 42.4 | +1.5 | 30.5 | 29.4 | +0.9 |
+| Qwen2.5-0.5B-instruct | 24.1 | 41.9 | 41.1 | +0.8 | 36.0 | 34.9 | +1.1 |
+| Internlm-2-chat-1.8b | 47.1 | 40.5 | 39.4 | +1.1 | 41.2 | 39.8 | +1.4 |
+| Qwen2-0.5B-instruct | 37.9 | 38.3 | 38.3 | +0.0 | 33.5 | 33.5 | +0.0 |
+ +## 5. Data Construction Pipeline +![Fig3](./Figures/Fig_3.png) +The pipeline involves (1) MCQ Collection to gather a diverse set of questions; (2) MCQ Cleaning to ensure quality; (3) Difficulty Sampling to ensure an appropriate difficulty distribution for questions; (4) LLMs checking: The LLMs, including GPT-4o, Gemini, and Claude, are reviewing the accuracy and safety of the data; and (5) Contamination-Free Processing to prevent data leakage and maintain dataset purity. Ultimately, this process results in the MMLU-CF, consisting of 10,000 questions for the closed-source test set and 10,000 for the open-source validation set. + +## 6. Contact +For any inquiries or concerns, feel free to reach out to us via Email: [Qihao Zhao](qhzhaoo@gmail.com) and [Yangyu Huang](yanghuan@microsoft.com). + +## 7. Citation +``` +@misc{zhao2024mmlucfcontaminationfreemultitasklanguage, + title={MMLU-CF: A Contamination-free Multi-task Language Understanding Benchmark}, + author={Qihao Zhao and Yangyu Huang and Tengchao Lv and Lei Cui and Qinzheng Sun and Shaoguang Mao and Xin Zhang and Ying Xin and Qiufeng Yin and Scarlett Li and Furu Wei}, + year={2024}, + eprint={2412.15194}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2412.15194}, +} +``` + +## 8. License +This repository is licensed under the [MIT](https://github.com/microsoft/PEACE/blob/main/LICENSE) License. +The validation dataset of MMLU-CF is subject to the [CDLA-2.0](https://cdla.dev/permissive-2-0/) License. diff --git a/configs/datasets/mmlu_cf/mmlu_cf_categories.py b/configs/datasets/mmlu_cf/mmlu_cf_categories.py new file mode 100644 index 000000000..ab8b198f4 --- /dev/null +++ b/configs/datasets/mmlu_cf/mmlu_cf_categories.py @@ -0,0 +1,16 @@ +categories = [ + 'Math', + 'Physics', + 'Chemistry', + 'Law', + 'Engineering', + 'Other', + 'Economics', + 'Health', + 'Psychology', + 'Business', + 'Biology', + 'Philosophy', + 'Computer_Science', + 'History', +] diff --git a/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py b/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py new file mode 100644 index 000000000..a3a5d39fb --- /dev/null +++ b/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py @@ -0,0 +1,64 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUCFDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mmlu_cf_categories import categories + +mmlu_cf_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_cf_datasets = [] +for _name in categories: + _hint = f'There is a single choice question. Answer the question by replying A, B, C or D.' + mmlu_cf_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + mmlu_cf_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + mmlu_cf_datasets.append( + dict( + abbr=f'mmlu_cf_{_name}', + type=MMLUCFDataset, + path='opencompass/mmlu_cf', + name=_name, + reader_cfg=mmlu_cf_reader_cfg, + infer_cfg=mmlu_cf_infer_cfg, + eval_cfg=mmlu_cf_eval_cfg, + )) + +del _name, _hint diff --git a/configs/datasets/mmlu_cf/mmlu_cf_gen.py b/configs/datasets/mmlu_cf/mmlu_cf_gen.py new file mode 100644 index 000000000..a0be611f1 --- /dev/null +++ b/configs/datasets/mmlu_cf/mmlu_cf_gen.py @@ -0,0 +1,64 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUCFDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mmlu_cf_categories import categories + +mmlu_cf_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_cf_datasets = [] +for _name in categories: + _hint = f'There is a single choice question. Answer the question by replying A, B, C or D.' + mmlu_cf_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + mmlu_cf_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + mmlu_cf_datasets.append( + dict( + abbr=f'mmlu_cf_{_name}', + type=MMLUCFDataset, + path='opencompass/mmlu_cf/', + name=_name, + reader_cfg=mmlu_cf_reader_cfg, + infer_cfg=mmlu_cf_infer_cfg, + eval_cfg=mmlu_cf_eval_cfg, + )) + +del _name, _hint diff --git a/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py b/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py new file mode 100644 index 000000000..5035fdb85 --- /dev/null +++ b/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py @@ -0,0 +1,64 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUCFDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mmlu_cf_categories import categories + +mmlu_cf_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_cf_datasets = [] +for _name in categories: + _hint = f'There is a single choice question. 
Answer the question by replying A, B, C or D.' + mmlu_cf_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type= ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_cf_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + mmlu_cf_datasets.append( + dict( + abbr=f'mmlu_cf_{_name}', + type=MMLUCFDataset, + path='opencompass/mmlu_cf', + name=_name, + reader_cfg=mmlu_cf_reader_cfg, + infer_cfg=mmlu_cf_infer_cfg, + eval_cfg=mmlu_cf_eval_cfg, + )) + +del _name, _hint From 0c48407696786a835478e236a4fb9b1ed0956a3c Mon Sep 17 00:00:00 2001 From: fistyee Date: Tue, 24 Dec 2024 11:42:49 +0800 Subject: [PATCH 02/26] [Feature] Support MMLU-CF Benchmark --- README.md | 1 + README_zh-CN.md | 1 + configs/dataset_collections/chat_OC15.py | 1 + configs/eval_corebench_2409_base_objective.py | 20 + configs/eval_mmlu_cf.py | 16 + configs/summarizers/chat_OC15.py | 3 + .../summarizers/chat_OC15_multi_faceted.py | 7 + configs/summarizers/example.py | 1 + configs/summarizers/groups/mmlu_cf.py | 5 + configs/summarizers/mmlu_cf.py | 25 + .../configs/dataset_collections/chat_OC15.py | 1 + .../configs/datasets/mmlu_cf/README.md | 614 ++++++++++++++++++ .../datasets/mmlu_cf/mmlu_cf_categories.py | 16 + .../configs/datasets/mmlu_cf/mmlu_cf_gen.py | 64 ++ .../datasets/mmlu_cf/mmlu_cf_val_few_shot.py | 64 ++ .../datasets/mmlu_cf/mmlu_cf_val_zero_shot.py | 64 ++ opencompass/configs/summarizers/chat_OC15.py | 5 +- .../summarizers/chat_OC15_multi_faceted.py | 9 + opencompass/configs/summarizers/example.py | 1 + .../configs/summarizers/groups/mmlu_cf.py | 5 + opencompass/configs/summarizers/mmlu_cf.py | 25 + opencompass/datasets/__init__.py | 1 + opencompass/datasets/mmlu_cf.py | 72 ++ opencompass/utils/datasets_info.py | 6 + 24 files changed, 1026 insertions(+), 1 deletion(-) create mode 100644 configs/eval_mmlu_cf.py create mode 100644 configs/summarizers/groups/mmlu_cf.py create mode 100644 configs/summarizers/mmlu_cf.py create mode 100644 opencompass/configs/datasets/mmlu_cf/README.md create mode 100644 opencompass/configs/datasets/mmlu_cf/mmlu_cf_categories.py create mode 100644 opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py create mode 100644 opencompass/configs/datasets/mmlu_cf/mmlu_cf_val_few_shot.py create mode 100644 opencompass/configs/datasets/mmlu_cf/mmlu_cf_val_zero_shot.py create mode 100644 opencompass/configs/summarizers/groups/mmlu_cf.py create mode 100644 opencompass/configs/summarizers/mmlu_cf.py create mode 100644 opencompass/datasets/mmlu_cf.py diff --git a/README.md b/README.md index 6d8cabe5f..308a16c8f 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through ## 🚀 What's New +- **\[2024.12.24\]** We now support the Microsoft's Contamination-Free Multi-task language Understanding Benchmark [MMLU-CF](https://huggingface.co/datasets/microsoft/MMLU-CF). Feel free to give it a try! 
🔥🔥🔥 - **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](configs/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it. - **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](configs/eval_musr.py) and give it a spin! 🔥🔥🔥 - **\[2024.11.14\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). Have a look at the [demo](configs/eval_babilong.py) and give it a try! 🔥🔥🔥 diff --git a/README_zh-CN.md b/README_zh-CN.md index 21c0d666e..8760b9251 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -57,6 +57,7 @@ ## 🚀 最新进展 +- **\[2024.12.24\]** 现已支持Microsoft去污染多任务语言理解数据集[MMLU-CF](https://huggingface.co/datasets/microsoft/MMLU-CF),欢迎尝试! 🔥🔥🔥 - **\[2024.12.17\]** 我们提供了12月CompassAcademic学术榜单评估脚本 [CompassAcademic](configs/eval_academic_leaderboard_202412.py),你可以通过简单地配置复现官方评测结果。 - **\[2024.10.14\]** 现已支持OpenAI多语言问答数据集[MMMLU](https://huggingface.co/datasets/openai/MMMLU),欢迎尝试! 🔥🔥🔥 - **\[2024.09.19\]** 现已支持[Qwen2.5](https://huggingface.co/Qwen)(0.5B to 72B) ,可以使用多种推理后端(huggingface/vllm/lmdeploy), 欢迎尝试! 🔥🔥🔥 diff --git a/configs/dataset_collections/chat_OC15.py b/configs/dataset_collections/chat_OC15.py index c06f519dd..62a598185 100644 --- a/configs/dataset_collections/chat_OC15.py +++ b/configs/dataset_collections/chat_OC15.py @@ -2,6 +2,7 @@ with read_base(): from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets + from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen import mmlu_cf_datasets from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets diff --git a/configs/eval_corebench_2409_base_objective.py b/configs/eval_corebench_2409_base_objective.py index d5d7a3879..2cd22eaa0 100644 --- a/configs/eval_corebench_2409_base_objective.py +++ b/configs/eval_corebench_2409_base_objective.py @@ -13,6 +13,7 @@ ## Core Set # ## Examination from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets + from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen import mmlu_cf_datasets from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \ mmlu_pro_datasets from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \ @@ -42,6 +43,7 @@ # Summarizer from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups + from opencompass.configs.summarizers.groups.mmlu_cf import mmlu_cf_summary_groups from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups @@ -75,6 +77,7 @@ 'subsets': [ ['mmlu', 'accuracy'], ['mmlu_pro', 'accuracy'], + ['mmlu_cf', 'accuracy'], ['cmmlu', 'accuracy'], ['bbh', 'naive_average'], ['hellaswag', 'accuracy'], @@ -95,6 +98,7 @@ dataset_abbrs=[ ['mmlu', 'accuracy'], ['mmlu_pro', 'accuracy'], + ['mmlu_cf', 'accuracy'], ['cmmlu', 'accuracy'], ['bbh', 'naive_average'], ['hellaswag', 'accuracy'], @@ -132,6 +136,22 @@ ['mmlu_pro_computer_science','accuracy'], ['mmlu_pro_history', 'accuracy'], '', + ['mmlu_cf', 
'accuracy'], + ['mmlu_cf_math','accuracy'], + ['mmlu_cf_physics', 'accuracy'], + ['mmlu_cf_chemistry', 'accuracy'], + ['mmlu_cf_law', 'accuracy'], + ['mmlu_cf_engineering', 'accuracy'], + ['mmlu_cf_other', 'accuracy'], + ['mmlu_cf_economics', 'accuracy'], + ['mmlu_cf_health', 'accuracy'], + ['mmlu_cf_psychology', 'accuracy'], + ['mmlu_cf_business', 'accuracy'], + ['mmlu_cf_biology', 'accuracy'], + ['mmlu_cf_philosophy', 'accuracy'], + ['mmlu_cf_computer_science','accuracy'], + ['mmlu_cf_history', 'accuracy'], + '', ['cmmlu', 'accuracy'], ['cmmlu-stem', 'accuracy'], ['cmmlu-social-science', 'accuracy'], diff --git a/configs/eval_mmlu_cf.py b/configs/eval_mmlu_cf.py new file mode 100644 index 000000000..cd8ff1a1f --- /dev/null +++ b/configs/eval_mmlu_cf.py @@ -0,0 +1,16 @@ +from mmengine.config import read_base + +with read_base(): + from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen import mmlu_cf_datasets + + from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import models as lmdeploy_qwen2_7b_instruct_model + from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model + + from opencompass.configs.summarizers.mmlu_cf import summarizer + from opencompass.configs.internal.clusters.local import infer_num_worker as infer + from opencompass.configs.internal.clusters.local import eval + +datasets = sum([v for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'], []) +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +work_dir = 'outputs/debug/mmlu_cf' diff --git a/configs/summarizers/chat_OC15.py b/configs/summarizers/chat_OC15.py index 7a02e33ed..589933770 100644 --- a/configs/summarizers/chat_OC15.py +++ b/configs/summarizers/chat_OC15.py @@ -2,6 +2,7 @@ with read_base(): from .groups.mmlu import mmlu_summary_groups + from .groups.mmlu_cf import mmlu_cf_summary_groups from .groups.cmmlu import cmmlu_summary_groups from .groups.ceval import ceval_summary_groups from .groups.bbh import bbh_summary_groups @@ -13,6 +14,7 @@ 'name': 'average', 'subsets': [ ['mmlu', 'naive_average'], + ['mmlu_cf', 'naive_average'], ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 'weighted_average'], @@ -37,6 +39,7 @@ dataset_abbrs=[ ['average', 'naive_average'], ['mmlu', 'naive_average'], + ['mmlu_cf', 'naive_average'], ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 'weighted_average'], diff --git a/configs/summarizers/chat_OC15_multi_faceted.py b/configs/summarizers/chat_OC15_multi_faceted.py index c6fc58121..521a63587 100644 --- a/configs/summarizers/chat_OC15_multi_faceted.py +++ b/configs/summarizers/chat_OC15_multi_faceted.py @@ -13,6 +13,7 @@ 'name': 'average', 'subsets': [ ['mmlu', 'naive_average'], + ['mmlu_cf', 'naive_average'], ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 'weighted_average'], @@ -36,6 +37,7 @@ overall_dataset_abbrs = [ ['average', 'naive_average'], ['mmlu', 'naive_average'], + ['mmlu_cf', 'naive_average'], ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 'weighted_average'], @@ -54,6 +56,11 @@ ['IFEval', 'Prompt-level-strict-accuracy'], ] +mmlu_cf_summary_groups_dict = {g['name']: g['subsets'] for g in mmlu_cf_summary_groups} +mmlu_cf_dataset_abbrs = [ + ['mmlu_cf', 'naive_average'], +] + mmlu_summary_groups_dict = {g['name']: g['subsets'] for g in mmlu_summary_groups} mmlu_dataset_abbrs = [ ['mmlu', 'naive_average'], diff --git a/configs/summarizers/example.py 
b/configs/summarizers/example.py index 937acfba9..30b8df552 100644 --- a/configs/summarizers/example.py +++ b/configs/summarizers/example.py @@ -3,6 +3,7 @@ with read_base(): from .groups.agieval import agieval_summary_groups from .groups.mmlu import mmlu_summary_groups + from .groups.mmlu_cf import mmlu_cf_summary_groups from .groups.cmmlu import cmmlu_summary_groups from .groups.ceval import ceval_summary_groups from .groups.bbh import bbh_summary_groups diff --git a/configs/summarizers/groups/mmlu_cf.py b/configs/summarizers/groups/mmlu_cf.py new file mode 100644 index 000000000..0ecc5f8ad --- /dev/null +++ b/configs/summarizers/groups/mmlu_cf.py @@ -0,0 +1,5 @@ +categories = ['Computer Science', 'Math', 'Physics', 'Chemistry', 'Law', 'Engineering', 'Other', 'Economics', 'Health', 'Psychology', 'Business', 'Biology', 'Philosophy', 'History'] + +mmlu_cf_summary_groups = [ + {'name': 'mmlu_cf', 'subsets': ['mmlu_cf_' + c.replace(' ', '_') for c in categories]}, +] diff --git a/configs/summarizers/mmlu_cf.py b/configs/summarizers/mmlu_cf.py new file mode 100644 index 000000000..d607c598d --- /dev/null +++ b/configs/summarizers/mmlu_cf.py @@ -0,0 +1,25 @@ +from mmengine.config import read_base + +with read_base(): + from .groups.mmlu_cf import mmlu_cf_summary_groups + +summarizer = dict( + dataset_abbrs=[ + 'mmlu_cf', + 'mmlu_cf_Biology', + 'mmlu_cf_Business', + 'mmlu_cf_Chemistry', + 'mmlu_cf_Computer_Science', + 'mmlu_cf_Economics', + 'mmlu_cf_Engineering', + 'mmlu_cf_Health', + 'mmlu_cf_History', + 'mmlu_cf_Law', + 'mmlu_cf_Math', + 'mmlu_cf_Philosophy', + 'mmlu_cf_Physics', + 'mmlu_cf_Psychology', + 'mmlu_cf_Other', + ], + summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) diff --git a/opencompass/configs/dataset_collections/chat_OC15.py b/opencompass/configs/dataset_collections/chat_OC15.py index c06f519dd..62a598185 100644 --- a/opencompass/configs/dataset_collections/chat_OC15.py +++ b/opencompass/configs/dataset_collections/chat_OC15.py @@ -2,6 +2,7 @@ with read_base(): from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets + from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen import mmlu_cf_datasets from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets diff --git a/opencompass/configs/datasets/mmlu_cf/README.md b/opencompass/configs/datasets/mmlu_cf/README.md new file mode 100644 index 000000000..3181c5baa --- /dev/null +++ b/opencompass/configs/datasets/mmlu_cf/README.md @@ -0,0 +1,614 @@ +# MMLU-CF: A Contamination-free Multi-task Language Understanding Benchmark + +
+ +![](https://img.shields.io/badge/Task-MMLU_CF-orange) +![](https://img.shields.io/badge/Data-Released-green) +![](https://img.shields.io/badge/Code_License-MIT-blue) + +
+ +

+ [📜 Paper] • + [🤗 HF Dataset] • + [🐱 GitHub] +

+ +## 📢 News and Updates +[2024.12.01] 🔥We have initialized the repository. +[2024.12.16] 🔥We have added the evaluation results of Phi-4-14B and Llama-3.3-70B-Instruct. +[2024.12.20] 🔥We have released the validation dataset of MMLU-CF. + + +## 1. The Motivation of MMLU-CF + + + +- The open-source nature of these benchmarks and the broad sources of training data for LLMs have inevitably led to benchmark contamination, resulting in unreliable evaluation results. To alleviate this issue, we propose MMLU-CF. +- (a) An instance of leakage in MMLU. When questions are used as prompt from the MMLU, certain LLMs, due to their memorization capabilities, directly provide **choices identical to the original ones**. (b) When questions are used as prompt from the MMLU-CF, LLMs only provide guessed choices. +This indicates that the MMLU test set suffers from data contamination and memorization by some LLMs, while the proposed MMLU-CF avoids such leakage. +

+ Fig1_a + Fig1_b +

+ + +## 2. How to Evaluate Your Models on the MMLU-CF Validation/Test Set + + #### (1) We perform automated testing only on Huggingface models. After following the steps outlined below and obtaining the validation set results from [OpenCompass](https://github.com/open-compass/opencompass), the test set results can then be accessed via GitHub Issues. + + **Step 1**. **Validation set evaluation**: Obtaining the validation results for your model using LLM evaluation tools, [OpenCompass](https://github.com/open-compass/opencompass). The validation dataset download from [🤗 Huggingface](https://huggingface.co/datasets/microsoft/MMLU-CF). The data directory structure in the opencompass: + +``` +data +└── mmlu_cf +    ├── dev + └── val +``` + + **Step 2**. **Test set evaluation**: With the validation results, submit a GitHub issue on the [MMLU-CF](https://github.com/) GitHub homepage to request the test set results. Please follow the format below: + +Example 1, +``` +Title: +Test set evaluation Request - add HF model [microsoft/phi-4] +Content: +The result on validation set: 68.5% +``` +Example 2, +

+ Fig6 +

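+For reference, a minimal way to launch the validation-set run with OpenCompass is sketched below. It assumes the validation data has been placed under `data/mmlu_cf/` as shown above and uses the `configs/eval_mmlu_cf.py` config provided with OpenCompass (the model configs imported there are only examples and can be replaced with your own):
+
+```
+# Minimal sketch: run the MMLU-CF validation evaluation from the OpenCompass root,
+# assuming the dataset lives under ./data/mmlu_cf/ with dev/ and val/ splits.
+python run.py configs/eval_mmlu_cf.py
+```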
+ + **Notably**: + - Ensure you use the format with square brackets `[ ]` as shown. The model name **microsoft/phi-4** corresponds to the name on HuggingFace. + - We will automatically submit your model. The time to receive the results depends on the number of models being evaluated, but it typically takes **1-2 weeks**. + + #### (2) For API models, if OpenCompass updates the model interface, you can obtain the test set results by sending a temporary key to [Email](yangyu.huang@microsoft.com) after receiving the validation set results. + + +## 3. What is the Difference between MMLU-CF and MMLU +MMLU focuses on the breadth and reasoning without considering contamination prevention. We apply three decontamination rules to mitigate unintentional data leakage while collecting data from a broader domain. Meanwhile, our MMLU-CF benchmark maintains the test set closed-source to prevent malicious data leakage. + +

+ Fig4 + + Fig5 +

+ + +## 4. Leaderboard + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Model | MMLU 5-shot | MMLU-CF 5-shot Test | MMLU-CF 5-shot Validation | 5-shot Δ | MMLU-CF 0-shot Test | MMLU-CF 0-shot Validation | 0-shot Δ |
+|---|---|---|---|---|---|---|---|
+| **API** | | | | | | | |
+| GPT-4o | 88.0 | 73.4 | 73.4 | +0.0 | 71.9 | 72.4 | -0.5 |
+| GPT-4-Turbo | 86.5 | 70.4 | 70.1 | +0.3 | 68.9 | 68.7 | +0.1 |
+| GPT-4o-mini | 81.8 | 65.5 | 65.1 | +0.4 | 66.0 | 65.3 | +0.7 |
+| Gemini-1.5-Flash | 78.7 | 64.8 | 64.9 | -0.1 | 56.7 | 56.9 | -0.2 |
+| GPT-3.5-Turbo | 71.4 | 58.2 | 59.0 | -0.8 | 57.2 | 58.1 | -0.9 |
+| **Large** | | | | | | | |
+| Qwen2.5-72B-instruct | 85.3 | 71.6 | 71.3 | +0.3 | 70.6 | 70.4 | +0.2 |
+| Llama-3-70B-instruct | 82.0 | 68.9 | 68.8 | +0.1 | 68.1 | 67.4 | +0.7 |
+| Llama-3.3-70B-instruct | 86.3 | 68.8 | 67.8 | +1.0 | 67.6 | 67.5 | +0.1 |
+| Llama-3.1-70B-instruct | 86.0 | 68.7 | 68.1 | +0.6 | 70.4 | 69.7 | +0.7 |
+| Phi-3.5-MoE-instruct | 78.9 | 64.6 | 64.5 | +0.1 | 63.1 | 62.1 | +1.0 |
+| Qwen2-72B-instruct | 82.3 | 63.7 | 64.3 | -0.6 | 62.4 | 62.5 | -0.1 |
+| Mixtral-8x22B-instruct | 76.2 | 62.8 | 62.5 | +0.3 | 65.3 | 64.8 | +0.5 |
+| Qwen1.5-72B-chat | 75.6 | 59.8 | 60.2 | -0.4 | 59.1 | 59.6 | -0.5 |
+| Llama-2-70B-chat | 68.9 | 52.2 | 51.8 | +0.4 | 51.2 | 50.9 | +0.3 |
+| **Medium** | | | | | | | |
+| Qwen2.5-32B-instruct | 83.9 | 69.7 | 68.8 | +0.9 | 68.9 | 68.8 | +0.1 |
+| Phi-4-14B | 84.8 | 67.8 | 68.5 | -0.7 | 68.5 | 69.4 | -0.9 |
+| Qwen2.5-14B-instruct | 79.9 | 66.4 | 66.1 | +0.3 | 67.0 | 66.0 | +1.0 |
+| Phi-3-medium-instruct | 77.9 | 64.2 | 64.2 | +0.0 | 62.5 | 62.7 | -0.2 |
+| Gemma2-27B | 75.2 | 63.9 | 63.5 | +0.4 | 64.2 | 64.0 | +0.2 |
+| Yi-1.5-34B-chat | 76.8 | 61.3 | 60.5 | +0.8 | 60.6 | 59.5 | +1.1 |
+| Mixtral-8x7B-instruct-v0.1 | 70.5 | 58.3 | 57.1 | -1.2 | 58.9 | 58.5 | +0.4 |
+| Deepseek-v2-lite-chat | 55.7 | 49.3 | 48.7 | +0.6 | 48.2 | 47.7 | +0.5 |
+| Baichuan-2-13B-chat | 57.3 | 48.3 | 48.6 | -0.3 | 47.1 | 48.1 | -1.0 |
+| Llama-2-13B-chat | 54.8 | 42.8 | 42.1 | +0.7 | 44.8 | 44.6 | +0.2 |
+| **Small** | | | | | | | |
+| Qwen2.5-7B-instruct | 75.4 | 61.3 | 60.4 | +0.9 | 59.3 | 58.6 | +0.7 |
+| Qwen2-7B-instruct | 70.5 | 58.1 | 57.9 | +0.2 | 58.3 | 57.4 | +0.9 |
+| Glm-4-9B-chat | 72.4 | 57.8 | 57.9 | -0.1 | 58.6 | 58.7 | -0.1 |
+| Internlm-2.5-7B-chat | 72.8 | 57.3 | 56.8 | +0.5 | 57.9 | 56.9 | +1.0 |
+| Llama-3-8B-instruct | 68.4 | 57.3 | 56.5 | +0.8 | 56.4 | 55.4 | +1.0 |
+| Llama-3.1-8B-instruct | 68.1 | 57.1 | 57.9 | -0.8 | 56.1 | 56.1 | +0.0 |
+| Gemma-2-9B | 71.3 | 53.7 | 53.3 | +0.4 | 32.1 | 31.2 | +0.9 |
+| Yi-1.5-6B-chat | 62.8 | 52.8 | 51.4 | +1.4 | 52.2 | 51.9 | +0.3 |
+| Mistral-7B-instruct-v0.3 | 60.3 | 50.7 | 50.9 | -0.2 | 51.1 | 50.9 | +0.2 |
+| Baichuan-2-7B-chat | 52.9 | 44.5 | 43.9 | +0.6 | 43.9 | 44.0 | -0.1 |
+| Llama-2-7B-chat | 45.3 | 39.4 | 38.5 | +0.9 | 41.9 | 40.9 | +1.0 |
+| **Mini** | | | | | | | |
+| Phi-3-mini-instruct (3.8B) | 70.9 | 57.9 | 58.1 | -0.2 | 58.2 | 57.5 | +0.7 |
+| Phi-3.5-mini-instruct (3.8B) | 69.1 | 57.9 | 57.4 | +0.5 | 58.3 | 57.7 | +0.6 |
+| Qwen2.5-3B-instruct | 64.4 | 55.9 | 56.4 | -0.5 | 54.3 | 53.9 | +0.4 |
+| Qwen2.5-1.5B-instruct | 50.7 | 51.2 | 51.0 | +0.2 | 50.7 | 50.4 | +0.3 |
+| Qwen2-1.5B-instruct | 52.4 | 47.1 | 47.5 | -0.4 | 45.2 | 44.5 | +0.7 |
+| Gemma-2-2B | 51.3 | 43.9 | 42.4 | +1.5 | 30.5 | 29.4 | +0.9 |
+| Qwen2.5-0.5B-instruct | 24.1 | 41.9 | 41.1 | +0.8 | 36.0 | 34.9 | +1.1 |
+| Internlm-2-chat-1.8b | 47.1 | 40.5 | 39.4 | +1.1 | 41.2 | 39.8 | +1.4 |
+| Qwen2-0.5B-instruct | 37.9 | 38.3 | 38.3 | +0.0 | 33.5 | 33.5 | +0.0 |
+ +## 5. Data Construction Pipeline +![Fig3](./Figures/Fig_3.png) +The pipeline involves (1) MCQ Collection to gather a diverse set of questions; (2) MCQ Cleaning to ensure quality; (3) Difficulty Sampling to ensure an appropriate difficulty distribution for questions; (4) LLMs checking: The LLMs, including GPT-4o, Gemini, and Claude, are reviewing the accuracy and safety of the data; and (5) Contamination-Free Processing to prevent data leakage and maintain dataset purity. Ultimately, this process results in the MMLU-CF, consisting of 10,000 questions for the closed-source test set and 10,000 for the open-source validation set. + +## 6. Contact +For any inquiries or concerns, feel free to reach out to us via Email: [Qihao Zhao](qhzhaoo@gmail.com) and [Yangyu Huang](yanghuan@microsoft.com). + +## 7. Citation +``` +@misc{zhao2024mmlucfcontaminationfreemultitasklanguage, + title={MMLU-CF: A Contamination-free Multi-task Language Understanding Benchmark}, + author={Qihao Zhao and Yangyu Huang and Tengchao Lv and Lei Cui and Qinzheng Sun and Shaoguang Mao and Xin Zhang and Ying Xin and Qiufeng Yin and Scarlett Li and Furu Wei}, + year={2024}, + eprint={2412.15194}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2412.15194}, +} +``` + +## 8. License +This repository is licensed under the [MIT](https://github.com/microsoft/PEACE/blob/main/LICENSE) License. +The validation dataset of MMLU-CF is subject to the [CDLA-2.0](https://cdla.dev/permissive-2-0/) License. diff --git a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_categories.py b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_categories.py new file mode 100644 index 000000000..ab8b198f4 --- /dev/null +++ b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_categories.py @@ -0,0 +1,16 @@ +categories = [ + 'Math', + 'Physics', + 'Chemistry', + 'Law', + 'Engineering', + 'Other', + 'Economics', + 'Health', + 'Psychology', + 'Business', + 'Biology', + 'Philosophy', + 'Computer_Science', + 'History', +] diff --git a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py new file mode 100644 index 000000000..a0be611f1 --- /dev/null +++ b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py @@ -0,0 +1,64 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUCFDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mmlu_cf_categories import categories + +mmlu_cf_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_cf_datasets = [] +for _name in categories: + _hint = f'There is a single choice question. Answer the question by replying A, B, C or D.' + mmlu_cf_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + mmlu_cf_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + mmlu_cf_datasets.append( + dict( + abbr=f'mmlu_cf_{_name}', + type=MMLUCFDataset, + path='opencompass/mmlu_cf/', + name=_name, + reader_cfg=mmlu_cf_reader_cfg, + infer_cfg=mmlu_cf_infer_cfg, + eval_cfg=mmlu_cf_eval_cfg, + )) + +del _name, _hint diff --git a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_val_few_shot.py b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_val_few_shot.py new file mode 100644 index 000000000..a811bca61 --- /dev/null +++ b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_val_few_shot.py @@ -0,0 +1,64 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUCFDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mmlu_cf_categories import categories + +mmlu_cf_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_cf_datasets = [] +for _name in categories: + _hint = f'There is a single choice question (with answers). Answer the question by replying A, B, C or D.' + mmlu_cf_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + mmlu_cf_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + mmlu_cf_datasets.append( + dict( + abbr=f'mmlu_cf_{_name}', + type=MMLUCFDataset, + path='opencompass/mmlu_cf', + name=_name, + reader_cfg=mmlu_cf_reader_cfg, + infer_cfg=mmlu_cf_infer_cfg, + eval_cfg=mmlu_cf_eval_cfg, + )) + +del _name, _hint diff --git a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_val_zero_shot.py b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_val_zero_shot.py new file mode 100644 index 000000000..9082db6d7 --- /dev/null +++ b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_val_zero_shot.py @@ -0,0 +1,64 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUCFDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mmlu_cf_categories import categories + +mmlu_cf_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_cf_datasets = [] +for _name in categories: + _hint = f'There is a single choice question (with answers). Answer the question by replying A, B, C or D.' + mmlu_cf_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_cf_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + mmlu_cf_datasets.append( + dict( + abbr=f'mmlu_cf_{_name}', + type=MMLUCFDataset, + path='opencompass/mmlu_cf', + name=_name, + reader_cfg=mmlu_cf_reader_cfg, + infer_cfg=mmlu_cf_infer_cfg, + eval_cfg=mmlu_cf_eval_cfg, + )) + +del _name, _hint diff --git a/opencompass/configs/summarizers/chat_OC15.py b/opencompass/configs/summarizers/chat_OC15.py index 7a02e33ed..a445306d8 100644 --- a/opencompass/configs/summarizers/chat_OC15.py +++ b/opencompass/configs/summarizers/chat_OC15.py @@ -2,6 +2,7 @@ with read_base(): from .groups.mmlu import mmlu_summary_groups + from .groups.mmlu_cf import mmlu_cf_summary_groups from .groups.cmmlu import cmmlu_summary_groups from .groups.ceval import ceval_summary_groups from .groups.bbh import bbh_summary_groups @@ -12,7 +13,8 @@ { 'name': 'average', 'subsets': [ - ['mmlu', 'naive_average'], + ['mmlu', 'naive_average'] + ['mmlu_cf', 'naive_average'], ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 'weighted_average'], @@ -38,6 +40,7 @@ ['average', 'naive_average'], ['mmlu', 'naive_average'], ['cmmlu', 'naive_average'], + ['mmlu_cf', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 'weighted_average'], ['triviaqa_wiki_1shot', 'score'], diff --git a/opencompass/configs/summarizers/chat_OC15_multi_faceted.py b/opencompass/configs/summarizers/chat_OC15_multi_faceted.py index c6fc58121..976491ffb 100644 --- a/opencompass/configs/summarizers/chat_OC15_multi_faceted.py +++ b/opencompass/configs/summarizers/chat_OC15_multi_faceted.py @@ -3,6 +3,7 @@ with read_base(): from .groups.mmlu import mmlu_summary_groups + from .groups.mmlu_cf import mmlu_cf_summary_groups from .groups.cmmlu import cmmlu_summary_groups from .groups.ceval import ceval_summary_groups from .groups.bbh import bbh_summary_groups @@ -13,6 +14,7 @@ 'name': 'average', 'subsets': [ ['mmlu', 'naive_average'], + ['mmlu_cf', 'naive_average'], ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 'weighted_average'], @@ -36,6 +38,7 @@ overall_dataset_abbrs = [ ['average', 'naive_average'], ['mmlu', 'naive_average'], + ['mmlu_cf', 'naive_average'], ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 'weighted_average'], @@ -54,6 +57,11 @@ ['IFEval', 'Prompt-level-strict-accuracy'], ] +mmlu_cf_summary_groups_dict = {g['name']: g['subsets'] for g in mmlu_cf_summary_groups} +mmlu_cf_dataset_abbrs = [ + ['mmlu_cf', 'naive_average'], +] + mmlu_summary_groups_dict = {g['name']: g['subsets'] for g in mmlu_summary_groups} mmlu_dataset_abbrs = [ ['mmlu', 'naive_average'], @@ -127,6 +135,7 @@ dataset_abbrs_list=[ {'name': 'overall', 'dataset_abbrs': overall_dataset_abbrs}, {'name': 'mmlu', 'dataset_abbrs': mmlu_dataset_abbrs}, + {'name': 'mmlu_cf', 'dataset_abbrs': mmlu_cf_dataset_abbrs}, {'name': 'cmmlu', 'dataset_abbrs': cmmlu_dataset_abbrs}, {'name': 'ceval', 'dataset_abbrs': ceval_dataset_abbrs}, {'name': 'bbh', 'dataset_abbrs': bbh_dataset_abbrs}, diff --git a/opencompass/configs/summarizers/example.py b/opencompass/configs/summarizers/example.py index 937acfba9..30b8df552 100644 --- a/opencompass/configs/summarizers/example.py +++ b/opencompass/configs/summarizers/example.py @@ -3,6 +3,7 @@ with read_base(): from .groups.agieval import 
agieval_summary_groups from .groups.mmlu import mmlu_summary_groups + from .groups.mmlu_cf import mmlu_cf_summary_groups from .groups.cmmlu import cmmlu_summary_groups from .groups.ceval import ceval_summary_groups from .groups.bbh import bbh_summary_groups diff --git a/opencompass/configs/summarizers/groups/mmlu_cf.py b/opencompass/configs/summarizers/groups/mmlu_cf.py new file mode 100644 index 000000000..0ecc5f8ad --- /dev/null +++ b/opencompass/configs/summarizers/groups/mmlu_cf.py @@ -0,0 +1,5 @@ +categories = ['Computer Science', 'Math', 'Physics', 'Chemistry', 'Law', 'Engineering', 'Other', 'Economics', 'Health', 'Psychology', 'Business', 'Biology', 'Philosophy', 'History'] + +mmlu_cf_summary_groups = [ + {'name': 'mmlu_cf', 'subsets': ['mmlu_cf_' + c.replace(' ', '_') for c in categories]}, +] diff --git a/opencompass/configs/summarizers/mmlu_cf.py b/opencompass/configs/summarizers/mmlu_cf.py new file mode 100644 index 000000000..d607c598d --- /dev/null +++ b/opencompass/configs/summarizers/mmlu_cf.py @@ -0,0 +1,25 @@ +from mmengine.config import read_base + +with read_base(): + from .groups.mmlu_cf import mmlu_cf_summary_groups + +summarizer = dict( + dataset_abbrs=[ + 'mmlu_cf', + 'mmlu_cf_Biology', + 'mmlu_cf_Business', + 'mmlu_cf_Chemistry', + 'mmlu_cf_Computer_Science', + 'mmlu_cf_Economics', + 'mmlu_cf_Engineering', + 'mmlu_cf_Health', + 'mmlu_cf_History', + 'mmlu_cf_Law', + 'mmlu_cf_Math', + 'mmlu_cf_Philosophy', + 'mmlu_cf_Physics', + 'mmlu_cf_Psychology', + 'mmlu_cf_Other', + ], + summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index e535ddc29..385159985 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -90,6 +90,7 @@ from .medbench import * # noqa: F401, F403 from .mgsm import * # noqa: F401, F403 from .mmlu import * # noqa: F401, F403 +from .mmlu_cf import * # noqa: F401, F403 from .mmlu_pro import * # noqa: F401, F403 from .MMLUArabic import * # noqa: F401, F403 from .mmmlu import * # noqa: F401, F403 diff --git a/opencompass/datasets/mmlu_cf.py b/opencompass/datasets/mmlu_cf.py new file mode 100644 index 000000000..471e7b4f0 --- /dev/null +++ b/opencompass/datasets/mmlu_cf.py @@ -0,0 +1,72 @@ +import csv +import json +import os.path as osp +from os import environ + +from datasets import Dataset, DatasetDict + +from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class MMLUCFDataset(BaseDataset): + + @staticmethod + def load(path: str, name: str): + path = get_data_path(path) + dataset = DatasetDict() + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + for split in ['dev', 'test']: + # 从 ModelScope 加载数据 + if split == 'test': + _split = 'val' + ms_dataset = MsDataset.load(path, + subset_name=name, + split=_split) + else: + ms_dataset = MsDataset.load(path, + subset_name=name, + split=split) + + dataset_list = [] + for i, line in ms_dataset: + if i == 0: # 跳过第一行 + continue + dataset_list.append({ + 'input': line['question'], + 'A': line['choices'][0], + 'B': line['choices'][1], + 'C': line['choices'][2], + 'D': line['choices'][3], + 'target': 'ABCD'[line['answer']], + }) + dataset[split] = Dataset.from_list(dataset_list) + else: + for split in ['dev', 'test']: + if split == 'test': + _split = 'val' + filename = osp.join(path, _split, f'{name}_{_split}.csv') + else: + 
filename = osp.join(path, split, f'{name}_{split}.csv') + raw_data = [] + with open(filename, encoding='utf-8') as f: + reader = csv.reader(f) + next(reader) + for row in reader: + + assert len(row) == 6 + raw_data.append({ + 'input': row[0], + 'A': row[1], + 'B': row[2], + 'C': row[3], + 'D': row[4], + 'target': row[5], + }) + dataset[split] = Dataset.from_list(raw_data) + return dataset + diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index aa187d36f..850c1bea3 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -181,6 +181,12 @@ "hf_id": "opencompass/mmlu", "local": "./data/mmlu/", }, + # MMLU_CF + "opencompass/mmlu_cf": { + "ms_id": "", + "hf_id": "microsoft/MMLU-CF", + "local": "./data/mmlu_cf/", + }, # MMLU_PRO "opencompass/mmlu_pro": { "ms_id": "", From 17af07e88590a769d036df7d6d37dba512f54db4 Mon Sep 17 00:00:00 2001 From: fistyee Date: Tue, 24 Dec 2024 11:51:05 +0800 Subject: [PATCH 03/26] [Feature] Support MMLU-CF Benchmark --- configs/datasets/mmlu_cf/README.md | 30 ++++++++----------- .../configs/datasets/mmlu_cf/README.md | 30 ++++++++----------- 2 files changed, 26 insertions(+), 34 deletions(-) diff --git a/configs/datasets/mmlu_cf/README.md b/configs/datasets/mmlu_cf/README.md index 3181c5baa..603923860 100644 --- a/configs/datasets/mmlu_cf/README.md +++ b/configs/datasets/mmlu_cf/README.md @@ -14,13 +14,7 @@ [🐱 GitHub]

-## 📢 News and Updates -[2024.12.01] 🔥We have initialized the repository. -[2024.12.16] 🔥We have added the evaluation results of Phi-4-14B and Llama-3.3-70B-Instruct. -[2024.12.20] 🔥We have released the validation dataset of MMLU-CF. - - -## 1. The Motivation of MMLU-CF +## The Motivation of MMLU-CF @@ -28,12 +22,12 @@ - (a) An instance of leakage in MMLU. When questions are used as prompt from the MMLU, certain LLMs, due to their memorization capabilities, directly provide **choices identical to the original ones**. (b) When questions are used as prompt from the MMLU-CF, LLMs only provide guessed choices. This indicates that the MMLU test set suffers from data contamination and memorization by some LLMs, while the proposed MMLU-CF avoids such leakage.

- Fig1_a - Fig1_b + Fig1_a + Fig1_b

-## 2. How to Evaluate Your Models on the MMLU-CF Validation/Test Set +## How to Evaluate Your Models on the MMLU-CF Validation/Test Set #### (1) We perform automated testing only on Huggingface models. After following the steps outlined below and obtaining the validation set results from [OpenCompass](https://github.com/open-compass/opencompass), the test set results can then be accessed via GitHub Issues. @@ -43,7 +37,11 @@ This indicates that the MMLU test set suffers from data contamination and memori data └── mmlu_cf    ├── dev + ├── Biology_dev.csv + ├── ... └── val + ├── Biology_val.csv + ├── ... ``` **Step 2**. **Test set evaluation**: With the validation results, submit a GitHub issue on the [MMLU-CF](https://github.com/) GitHub homepage to request the test set results. Please follow the format below: @@ -57,7 +55,7 @@ The result on validation set: 68.5% ``` Example 2,

- Fig6 + Fig6

**Notably**: @@ -67,13 +65,11 @@ Example 2, #### (2) For API models, if OpenCompass updates the model interface, you can obtain the test set results by sending a temporary key to [Email](yangyu.huang@microsoft.com) after receiving the validation set results. -## 3. What is the Difference between MMLU-CF and MMLU +## What is the Difference between MMLU-CF and MMLU MMLU focuses on the breadth and reasoning without considering contamination prevention. We apply three decontamination rules to mitigate unintentional data leakage while collecting data from a broader domain. Meanwhile, our MMLU-CF benchmark maintains the test set closed-source to prevent malicious data leakage.

- Fig4 - - Fig5 + Fig4

@@ -589,8 +585,8 @@ MMLU focuses on the breadth and reasoning without considering contamination prev -## 5. Data Construction Pipeline -![Fig3](./Figures/Fig_3.png) +## Data Construction Pipeline +![Fig3](https://github.com/microsoft/MMLU-CF/blob/main/Figures/Fig_3.png) The pipeline involves (1) MCQ Collection to gather a diverse set of questions; (2) MCQ Cleaning to ensure quality; (3) Difficulty Sampling to ensure an appropriate difficulty distribution for questions; (4) LLMs checking: The LLMs, including GPT-4o, Gemini, and Claude, are reviewing the accuracy and safety of the data; and (5) Contamination-Free Processing to prevent data leakage and maintain dataset purity. Ultimately, this process results in the MMLU-CF, consisting of 10,000 questions for the closed-source test set and 10,000 for the open-source validation set. ## 6. Contact diff --git a/opencompass/configs/datasets/mmlu_cf/README.md b/opencompass/configs/datasets/mmlu_cf/README.md index 3181c5baa..603923860 100644 --- a/opencompass/configs/datasets/mmlu_cf/README.md +++ b/opencompass/configs/datasets/mmlu_cf/README.md @@ -14,13 +14,7 @@ [🐱 GitHub]

-## 📢 News and Updates -[2024.12.01] 🔥We have initialized the repository. -[2024.12.16] 🔥We have added the evaluation results of Phi-4-14B and Llama-3.3-70B-Instruct. -[2024.12.20] 🔥We have released the validation dataset of MMLU-CF. - - -## 1. The Motivation of MMLU-CF +## The Motivation of MMLU-CF @@ -28,12 +22,12 @@ - (a) An instance of leakage in MMLU. When questions are used as prompt from the MMLU, certain LLMs, due to their memorization capabilities, directly provide **choices identical to the original ones**. (b) When questions are used as prompt from the MMLU-CF, LLMs only provide guessed choices. This indicates that the MMLU test set suffers from data contamination and memorization by some LLMs, while the proposed MMLU-CF avoids such leakage.

- Fig1_a - Fig1_b + Fig1_a + Fig1_b

-## 2. How to Evaluate Your Models on the MMLU-CF Validation/Test Set +## How to Evaluate Your Models on the MMLU-CF Validation/Test Set #### (1) We perform automated testing only on Huggingface models. After following the steps outlined below and obtaining the validation set results from [OpenCompass](https://github.com/open-compass/opencompass), the test set results can then be accessed via GitHub Issues. @@ -43,7 +37,11 @@ This indicates that the MMLU test set suffers from data contamination and memori data └── mmlu_cf    ├── dev + ├── Biology_dev.csv + ├── ... └── val + ├── Biology_val.csv + ├── ... ``` **Step 2**. **Test set evaluation**: With the validation results, submit a GitHub issue on the [MMLU-CF](https://github.com/) GitHub homepage to request the test set results. Please follow the format below: @@ -57,7 +55,7 @@ The result on validation set: 68.5% ``` Example 2,

- Fig6 + Fig6

**Notably**: @@ -67,13 +65,11 @@ Example 2, #### (2) For API models, if OpenCompass updates the model interface, you can obtain the test set results by sending a temporary key to [Email](yangyu.huang@microsoft.com) after receiving the validation set results. -## 3. What is the Difference between MMLU-CF and MMLU +## What is the Difference between MMLU-CF and MMLU MMLU focuses on the breadth and reasoning without considering contamination prevention. We apply three decontamination rules to mitigate unintentional data leakage while collecting data from a broader domain. Meanwhile, our MMLU-CF benchmark maintains the test set closed-source to prevent malicious data leakage.

- Fig4 - - Fig5 + Fig4

@@ -589,8 +585,8 @@ MMLU focuses on the breadth and reasoning without considering contamination prev -## 5. Data Construction Pipeline -![Fig3](./Figures/Fig_3.png) +## Data Construction Pipeline +![Fig3](https://github.com/microsoft/MMLU-CF/blob/main/Figures/Fig_3.png) The pipeline involves (1) MCQ Collection to gather a diverse set of questions; (2) MCQ Cleaning to ensure quality; (3) Difficulty Sampling to ensure an appropriate difficulty distribution for questions; (4) LLMs checking: The LLMs, including GPT-4o, Gemini, and Claude, are reviewing the accuracy and safety of the data; and (5) Contamination-Free Processing to prevent data leakage and maintain dataset purity. Ultimately, this process results in the MMLU-CF, consisting of 10,000 questions for the closed-source test set and 10,000 for the open-source validation set. ## 6. Contact From 772b9a74cbc89c1515ce37f8b78c6f1f407eddf4 Mon Sep 17 00:00:00 2001 From: fistyee Date: Tue, 24 Dec 2024 11:54:23 +0800 Subject: [PATCH 04/26] [Feature] Support MMLU-CF Benchmark --- configs/datasets/mmlu_cf/README.md | 8 ++++---- opencompass/configs/datasets/mmlu_cf/README.md | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/configs/datasets/mmlu_cf/README.md b/configs/datasets/mmlu_cf/README.md index 603923860..28d12b13e 100644 --- a/configs/datasets/mmlu_cf/README.md +++ b/configs/datasets/mmlu_cf/README.md @@ -14,7 +14,7 @@ [🐱 GitHub]

-## The Motivation of MMLU-CF +## 1. The Motivation of MMLU-CF @@ -27,7 +27,7 @@ This indicates that the MMLU test set suffers from data contamination and memori

-## How to Evaluate Your Models on the MMLU-CF Validation/Test Set +## 2. How to Evaluate Your Models on the MMLU-CF Validation/Test Set #### (1) We perform automated testing only on Huggingface models. After following the steps outlined below and obtaining the validation set results from [OpenCompass](https://github.com/open-compass/opencompass), the test set results can then be accessed via GitHub Issues. @@ -65,7 +65,7 @@ Example 2, #### (2) For API models, if OpenCompass updates the model interface, you can obtain the test set results by sending a temporary key to [Email](yangyu.huang@microsoft.com) after receiving the validation set results. -## What is the Difference between MMLU-CF and MMLU +## 3. What is the Difference between MMLU-CF and MMLU MMLU focuses on the breadth and reasoning without considering contamination prevention. We apply three decontamination rules to mitigate unintentional data leakage while collecting data from a broader domain. Meanwhile, our MMLU-CF benchmark maintains the test set closed-source to prevent malicious data leakage.

@@ -585,7 +585,7 @@ MMLU focuses on the breadth and reasoning without considering contamination prev -## Data Construction Pipeline +## 5. Data Construction Pipeline ![Fig3](https://github.com/microsoft/MMLU-CF/blob/main/Figures/Fig_3.png) The pipeline involves (1) MCQ Collection to gather a diverse set of questions; (2) MCQ Cleaning to ensure quality; (3) Difficulty Sampling to ensure an appropriate difficulty distribution for questions; (4) LLMs checking: The LLMs, including GPT-4o, Gemini, and Claude, are reviewing the accuracy and safety of the data; and (5) Contamination-Free Processing to prevent data leakage and maintain dataset purity. Ultimately, this process results in the MMLU-CF, consisting of 10,000 questions for the closed-source test set and 10,000 for the open-source validation set. diff --git a/opencompass/configs/datasets/mmlu_cf/README.md b/opencompass/configs/datasets/mmlu_cf/README.md index 603923860..28d12b13e 100644 --- a/opencompass/configs/datasets/mmlu_cf/README.md +++ b/opencompass/configs/datasets/mmlu_cf/README.md @@ -14,7 +14,7 @@ [🐱 GitHub]

-## The Motivation of MMLU-CF +## 1. The Motivation of MMLU-CF @@ -27,7 +27,7 @@ This indicates that the MMLU test set suffers from data contamination and memori

-## How to Evaluate Your Models on the MMLU-CF Validation/Test Set +## 2. How to Evaluate Your Models on the MMLU-CF Validation/Test Set #### (1) We perform automated testing only on Huggingface models. After following the steps outlined below and obtaining the validation set results from [OpenCompass](https://github.com/open-compass/opencompass), the test set results can then be accessed via GitHub Issues. @@ -65,7 +65,7 @@ Example 2, #### (2) For API models, if OpenCompass updates the model interface, you can obtain the test set results by sending a temporary key to [Email](yangyu.huang@microsoft.com) after receiving the validation set results. -## What is the Difference between MMLU-CF and MMLU +## 3. What is the Difference between MMLU-CF and MMLU MMLU focuses on the breadth and reasoning without considering contamination prevention. We apply three decontamination rules to mitigate unintentional data leakage while collecting data from a broader domain. Meanwhile, our MMLU-CF benchmark maintains the test set closed-source to prevent malicious data leakage.

@@ -585,7 +585,7 @@ MMLU focuses on the breadth and reasoning without considering contamination prev -## Data Construction Pipeline +## 5. Data Construction Pipeline ![Fig3](https://github.com/microsoft/MMLU-CF/blob/main/Figures/Fig_3.png) The pipeline involves (1) MCQ Collection to gather a diverse set of questions; (2) MCQ Cleaning to ensure quality; (3) Difficulty Sampling to ensure an appropriate difficulty distribution for questions; (4) LLMs checking: The LLMs, including GPT-4o, Gemini, and Claude, are reviewing the accuracy and safety of the data; and (5) Contamination-Free Processing to prevent data leakage and maintain dataset purity. Ultimately, this process results in the MMLU-CF, consisting of 10,000 questions for the closed-source test set and 10,000 for the open-source validation set. From a32a1ee615acce6ed65fd001a4628d5b53ff43d8 Mon Sep 17 00:00:00 2001 From: fistyee Date: Tue, 24 Dec 2024 13:44:38 +0800 Subject: [PATCH 05/26] [Feature] Support MMLU-CF Benchmark --- configs/summarizers/chat_OC15.py | 4 ++-- configs/summarizers/chat_OC15_multi_faceted.py | 2 ++ .../mmlu_cf/{mmlu_cf_val_few_shot.py => mmlu_cf_few_shot.py} | 0 .../{mmlu_cf_val_zero_shot.py => mmlu_cf_zero_shot.py} | 0 4 files changed, 4 insertions(+), 2 deletions(-) rename opencompass/configs/datasets/mmlu_cf/{mmlu_cf_val_few_shot.py => mmlu_cf_few_shot.py} (100%) rename opencompass/configs/datasets/mmlu_cf/{mmlu_cf_val_zero_shot.py => mmlu_cf_zero_shot.py} (100%) diff --git a/configs/summarizers/chat_OC15.py b/configs/summarizers/chat_OC15.py index 589933770..a445306d8 100644 --- a/configs/summarizers/chat_OC15.py +++ b/configs/summarizers/chat_OC15.py @@ -13,7 +13,7 @@ { 'name': 'average', 'subsets': [ - ['mmlu', 'naive_average'], + ['mmlu', 'naive_average'] ['mmlu_cf', 'naive_average'], ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], @@ -39,8 +39,8 @@ dataset_abbrs=[ ['average', 'naive_average'], ['mmlu', 'naive_average'], - ['mmlu_cf', 'naive_average'], ['cmmlu', 'naive_average'], + ['mmlu_cf', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 'weighted_average'], ['triviaqa_wiki_1shot', 'score'], diff --git a/configs/summarizers/chat_OC15_multi_faceted.py b/configs/summarizers/chat_OC15_multi_faceted.py index 521a63587..976491ffb 100644 --- a/configs/summarizers/chat_OC15_multi_faceted.py +++ b/configs/summarizers/chat_OC15_multi_faceted.py @@ -3,6 +3,7 @@ with read_base(): from .groups.mmlu import mmlu_summary_groups + from .groups.mmlu_cf import mmlu_cf_summary_groups from .groups.cmmlu import cmmlu_summary_groups from .groups.ceval import ceval_summary_groups from .groups.bbh import bbh_summary_groups @@ -134,6 +135,7 @@ dataset_abbrs_list=[ {'name': 'overall', 'dataset_abbrs': overall_dataset_abbrs}, {'name': 'mmlu', 'dataset_abbrs': mmlu_dataset_abbrs}, + {'name': 'mmlu_cf', 'dataset_abbrs': mmlu_cf_dataset_abbrs}, {'name': 'cmmlu', 'dataset_abbrs': cmmlu_dataset_abbrs}, {'name': 'ceval', 'dataset_abbrs': ceval_dataset_abbrs}, {'name': 'bbh', 'dataset_abbrs': bbh_dataset_abbrs}, diff --git a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_val_few_shot.py b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py similarity index 100% rename from opencompass/configs/datasets/mmlu_cf/mmlu_cf_val_few_shot.py rename to opencompass/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py diff --git a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_val_zero_shot.py b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py similarity index 100% rename from 
opencompass/configs/datasets/mmlu_cf/mmlu_cf_val_zero_shot.py rename to opencompass/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py From 531945d83177f3b981a504a1d146b21eed998dcb Mon Sep 17 00:00:00 2001 From: fistyee Date: Tue, 24 Dec 2024 13:48:33 +0800 Subject: [PATCH 06/26] [Feature] Support MMLU-CF Benchmark --- configs/summarizers/chat_OC15.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/summarizers/chat_OC15.py b/configs/summarizers/chat_OC15.py index a445306d8..2427fcb1e 100644 --- a/configs/summarizers/chat_OC15.py +++ b/configs/summarizers/chat_OC15.py @@ -13,7 +13,7 @@ { 'name': 'average', 'subsets': [ - ['mmlu', 'naive_average'] + ['mmlu', 'naive_average'], ['mmlu_cf', 'naive_average'], ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], From 3d769a951e27aca53dbd6b702bf9c2dbfd1f0eff Mon Sep 17 00:00:00 2001 From: fistyee Date: Tue, 24 Dec 2024 19:00:50 +0800 Subject: [PATCH 07/26] [Feature] Support MMLU-CF Benchmark --- configs/eval_corebench_2409_base_objective.py | 21 +------------------ configs/summarizers/chat_OC15.py | 5 +---- .../summarizers/chat_OC15_multi_faceted.py | 9 -------- opencompass/configs/summarizers/chat_OC15.py | 7 ++----- .../summarizers/chat_OC15_multi_faceted.py | 9 -------- opencompass/configs/summarizers/example.py | 1 - 6 files changed, 4 insertions(+), 48 deletions(-) diff --git a/configs/eval_corebench_2409_base_objective.py b/configs/eval_corebench_2409_base_objective.py index 2cd22eaa0..5f995259b 100644 --- a/configs/eval_corebench_2409_base_objective.py +++ b/configs/eval_corebench_2409_base_objective.py @@ -13,7 +13,6 @@ ## Core Set # ## Examination from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets - from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen import mmlu_cf_datasets from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \ mmlu_pro_datasets from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \ @@ -43,7 +42,6 @@ # Summarizer from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups - from opencompass.configs.summarizers.groups.mmlu_cf import mmlu_cf_summary_groups from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups @@ -77,7 +75,6 @@ 'subsets': [ ['mmlu', 'accuracy'], ['mmlu_pro', 'accuracy'], - ['mmlu_cf', 'accuracy'], ['cmmlu', 'accuracy'], ['bbh', 'naive_average'], ['hellaswag', 'accuracy'], @@ -98,7 +95,6 @@ dataset_abbrs=[ ['mmlu', 'accuracy'], ['mmlu_pro', 'accuracy'], - ['mmlu_cf', 'accuracy'], ['cmmlu', 'accuracy'], ['bbh', 'naive_average'], ['hellaswag', 'accuracy'], @@ -135,22 +131,7 @@ ['mmlu_pro_philosophy', 'accuracy'], ['mmlu_pro_computer_science','accuracy'], ['mmlu_pro_history', 'accuracy'], - '', - ['mmlu_cf', 'accuracy'], - ['mmlu_cf_math','accuracy'], - ['mmlu_cf_physics', 'accuracy'], - ['mmlu_cf_chemistry', 'accuracy'], - ['mmlu_cf_law', 'accuracy'], - ['mmlu_cf_engineering', 'accuracy'], - ['mmlu_cf_other', 'accuracy'], - ['mmlu_cf_economics', 'accuracy'], - ['mmlu_cf_health', 'accuracy'], - ['mmlu_cf_psychology', 'accuracy'], - ['mmlu_cf_business', 'accuracy'], - ['mmlu_cf_biology', 'accuracy'], - ['mmlu_cf_philosophy', 'accuracy'], - ['mmlu_cf_computer_science','accuracy'], - ['mmlu_cf_history', 'accuracy'], + '', ['cmmlu', 'accuracy'], ['cmmlu-stem', 'accuracy'], diff --git a/configs/summarizers/chat_OC15.py 
b/configs/summarizers/chat_OC15.py index 2427fcb1e..df4c68c88 100644 --- a/configs/summarizers/chat_OC15.py +++ b/configs/summarizers/chat_OC15.py @@ -2,7 +2,6 @@ with read_base(): from .groups.mmlu import mmlu_summary_groups - from .groups.mmlu_cf import mmlu_cf_summary_groups from .groups.cmmlu import cmmlu_summary_groups from .groups.ceval import ceval_summary_groups from .groups.bbh import bbh_summary_groups @@ -14,7 +13,6 @@ 'name': 'average', 'subsets': [ ['mmlu', 'naive_average'], - ['mmlu_cf', 'naive_average'], ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 'weighted_average'], @@ -39,8 +37,7 @@ dataset_abbrs=[ ['average', 'naive_average'], ['mmlu', 'naive_average'], - ['cmmlu', 'naive_average'], - ['mmlu_cf', 'naive_average'], + ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 'weighted_average'], ['triviaqa_wiki_1shot', 'score'], diff --git a/configs/summarizers/chat_OC15_multi_faceted.py b/configs/summarizers/chat_OC15_multi_faceted.py index 976491ffb..c6fc58121 100644 --- a/configs/summarizers/chat_OC15_multi_faceted.py +++ b/configs/summarizers/chat_OC15_multi_faceted.py @@ -3,7 +3,6 @@ with read_base(): from .groups.mmlu import mmlu_summary_groups - from .groups.mmlu_cf import mmlu_cf_summary_groups from .groups.cmmlu import cmmlu_summary_groups from .groups.ceval import ceval_summary_groups from .groups.bbh import bbh_summary_groups @@ -14,7 +13,6 @@ 'name': 'average', 'subsets': [ ['mmlu', 'naive_average'], - ['mmlu_cf', 'naive_average'], ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 'weighted_average'], @@ -38,7 +36,6 @@ overall_dataset_abbrs = [ ['average', 'naive_average'], ['mmlu', 'naive_average'], - ['mmlu_cf', 'naive_average'], ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 'weighted_average'], @@ -57,11 +54,6 @@ ['IFEval', 'Prompt-level-strict-accuracy'], ] -mmlu_cf_summary_groups_dict = {g['name']: g['subsets'] for g in mmlu_cf_summary_groups} -mmlu_cf_dataset_abbrs = [ - ['mmlu_cf', 'naive_average'], -] - mmlu_summary_groups_dict = {g['name']: g['subsets'] for g in mmlu_summary_groups} mmlu_dataset_abbrs = [ ['mmlu', 'naive_average'], @@ -135,7 +127,6 @@ dataset_abbrs_list=[ {'name': 'overall', 'dataset_abbrs': overall_dataset_abbrs}, {'name': 'mmlu', 'dataset_abbrs': mmlu_dataset_abbrs}, - {'name': 'mmlu_cf', 'dataset_abbrs': mmlu_cf_dataset_abbrs}, {'name': 'cmmlu', 'dataset_abbrs': cmmlu_dataset_abbrs}, {'name': 'ceval', 'dataset_abbrs': ceval_dataset_abbrs}, {'name': 'bbh', 'dataset_abbrs': bbh_dataset_abbrs}, diff --git a/opencompass/configs/summarizers/chat_OC15.py b/opencompass/configs/summarizers/chat_OC15.py index a445306d8..df4c68c88 100644 --- a/opencompass/configs/summarizers/chat_OC15.py +++ b/opencompass/configs/summarizers/chat_OC15.py @@ -2,7 +2,6 @@ with read_base(): from .groups.mmlu import mmlu_summary_groups - from .groups.mmlu_cf import mmlu_cf_summary_groups from .groups.cmmlu import cmmlu_summary_groups from .groups.ceval import ceval_summary_groups from .groups.bbh import bbh_summary_groups @@ -13,8 +12,7 @@ { 'name': 'average', 'subsets': [ - ['mmlu', 'naive_average'] - ['mmlu_cf', 'naive_average'], + ['mmlu', 'naive_average'], ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 'weighted_average'], @@ -39,8 +37,7 @@ dataset_abbrs=[ ['average', 'naive_average'], ['mmlu', 'naive_average'], - ['cmmlu', 'naive_average'], - ['mmlu_cf', 'naive_average'], + ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 
'weighted_average'], ['triviaqa_wiki_1shot', 'score'], diff --git a/opencompass/configs/summarizers/chat_OC15_multi_faceted.py b/opencompass/configs/summarizers/chat_OC15_multi_faceted.py index 976491ffb..c6fc58121 100644 --- a/opencompass/configs/summarizers/chat_OC15_multi_faceted.py +++ b/opencompass/configs/summarizers/chat_OC15_multi_faceted.py @@ -3,7 +3,6 @@ with read_base(): from .groups.mmlu import mmlu_summary_groups - from .groups.mmlu_cf import mmlu_cf_summary_groups from .groups.cmmlu import cmmlu_summary_groups from .groups.ceval import ceval_summary_groups from .groups.bbh import bbh_summary_groups @@ -14,7 +13,6 @@ 'name': 'average', 'subsets': [ ['mmlu', 'naive_average'], - ['mmlu_cf', 'naive_average'], ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 'weighted_average'], @@ -38,7 +36,6 @@ overall_dataset_abbrs = [ ['average', 'naive_average'], ['mmlu', 'naive_average'], - ['mmlu_cf', 'naive_average'], ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 'weighted_average'], @@ -57,11 +54,6 @@ ['IFEval', 'Prompt-level-strict-accuracy'], ] -mmlu_cf_summary_groups_dict = {g['name']: g['subsets'] for g in mmlu_cf_summary_groups} -mmlu_cf_dataset_abbrs = [ - ['mmlu_cf', 'naive_average'], -] - mmlu_summary_groups_dict = {g['name']: g['subsets'] for g in mmlu_summary_groups} mmlu_dataset_abbrs = [ ['mmlu', 'naive_average'], @@ -135,7 +127,6 @@ dataset_abbrs_list=[ {'name': 'overall', 'dataset_abbrs': overall_dataset_abbrs}, {'name': 'mmlu', 'dataset_abbrs': mmlu_dataset_abbrs}, - {'name': 'mmlu_cf', 'dataset_abbrs': mmlu_cf_dataset_abbrs}, {'name': 'cmmlu', 'dataset_abbrs': cmmlu_dataset_abbrs}, {'name': 'ceval', 'dataset_abbrs': ceval_dataset_abbrs}, {'name': 'bbh', 'dataset_abbrs': bbh_dataset_abbrs}, diff --git a/opencompass/configs/summarizers/example.py b/opencompass/configs/summarizers/example.py index 30b8df552..937acfba9 100644 --- a/opencompass/configs/summarizers/example.py +++ b/opencompass/configs/summarizers/example.py @@ -3,7 +3,6 @@ with read_base(): from .groups.agieval import agieval_summary_groups from .groups.mmlu import mmlu_summary_groups - from .groups.mmlu_cf import mmlu_cf_summary_groups from .groups.cmmlu import cmmlu_summary_groups from .groups.ceval import ceval_summary_groups from .groups.bbh import bbh_summary_groups From 21c8a98fe5f811ddd3035b5327a690a7e53c04d6 Mon Sep 17 00:00:00 2001 From: fistyee Date: Wed, 25 Dec 2024 15:43:31 +0800 Subject: [PATCH 08/26] [Feature] Support MMLU-CF Benchmark --- configs/dataset_collections/chat_OC15.py | 1 - configs/summarizers/chat_OC15.py | 2 +- configs/summarizers/example.py | 1 - opencompass/configs/dataset_collections/chat_OC15.py | 1 - opencompass/configs/summarizers/chat_OC15.py | 2 +- 5 files changed, 2 insertions(+), 5 deletions(-) diff --git a/configs/dataset_collections/chat_OC15.py b/configs/dataset_collections/chat_OC15.py index 62a598185..c06f519dd 100644 --- a/configs/dataset_collections/chat_OC15.py +++ b/configs/dataset_collections/chat_OC15.py @@ -2,7 +2,6 @@ with read_base(): from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets - from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen import mmlu_cf_datasets from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets diff --git a/configs/summarizers/chat_OC15.py 
b/configs/summarizers/chat_OC15.py index df4c68c88..7a02e33ed 100644 --- a/configs/summarizers/chat_OC15.py +++ b/configs/summarizers/chat_OC15.py @@ -37,7 +37,7 @@ dataset_abbrs=[ ['average', 'naive_average'], ['mmlu', 'naive_average'], - ['cmmlu', 'naive_average'], + ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 'weighted_average'], ['triviaqa_wiki_1shot', 'score'], diff --git a/configs/summarizers/example.py b/configs/summarizers/example.py index 30b8df552..937acfba9 100644 --- a/configs/summarizers/example.py +++ b/configs/summarizers/example.py @@ -3,7 +3,6 @@ with read_base(): from .groups.agieval import agieval_summary_groups from .groups.mmlu import mmlu_summary_groups - from .groups.mmlu_cf import mmlu_cf_summary_groups from .groups.cmmlu import cmmlu_summary_groups from .groups.ceval import ceval_summary_groups from .groups.bbh import bbh_summary_groups diff --git a/opencompass/configs/dataset_collections/chat_OC15.py b/opencompass/configs/dataset_collections/chat_OC15.py index 62a598185..c06f519dd 100644 --- a/opencompass/configs/dataset_collections/chat_OC15.py +++ b/opencompass/configs/dataset_collections/chat_OC15.py @@ -2,7 +2,6 @@ with read_base(): from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets - from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen import mmlu_cf_datasets from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets diff --git a/opencompass/configs/summarizers/chat_OC15.py b/opencompass/configs/summarizers/chat_OC15.py index df4c68c88..7a02e33ed 100644 --- a/opencompass/configs/summarizers/chat_OC15.py +++ b/opencompass/configs/summarizers/chat_OC15.py @@ -37,7 +37,7 @@ dataset_abbrs=[ ['average', 'naive_average'], ['mmlu', 'naive_average'], - ['cmmlu', 'naive_average'], + ['cmmlu', 'naive_average'], ['ceval', 'naive_average'], ['GaokaoBench', 'weighted_average'], ['triviaqa_wiki_1shot', 'score'], From 113b564741a2c5eb341a3dc6b3a9a962195b9802 Mon Sep 17 00:00:00 2001 From: fistyee Date: Wed, 25 Dec 2024 15:45:16 +0800 Subject: [PATCH 09/26] [Feature] Support MMLU-CF Benchmark --- configs/eval_corebench_2409_base_objective.py | 1 - 1 file changed, 1 deletion(-) diff --git a/configs/eval_corebench_2409_base_objective.py b/configs/eval_corebench_2409_base_objective.py index 5f995259b..d5d7a3879 100644 --- a/configs/eval_corebench_2409_base_objective.py +++ b/configs/eval_corebench_2409_base_objective.py @@ -131,7 +131,6 @@ ['mmlu_pro_philosophy', 'accuracy'], ['mmlu_pro_computer_science','accuracy'], ['mmlu_pro_history', 'accuracy'], - '', ['cmmlu', 'accuracy'], ['cmmlu-stem', 'accuracy'], From 50445165b23272757b9f458a2a855971cff19846 Mon Sep 17 00:00:00 2001 From: fistyee Date: Thu, 26 Dec 2024 15:29:53 +0800 Subject: [PATCH 10/26] [Feature] Support MMLU-CF Benchmark --- README.md | 2 +- README_zh-CN.md | 2 +- configs/datasets/mmlu_cf/mmlu_cf_few_shot.py | 2 +- configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 308a16c8f..e75b8fd68 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through ## 🚀 What's New -- **\[2024.12.24\]** We now support the Microsoft's Contamination-Free Multi-task language Understanding Benchmark 
[MMLU-CF](https://huggingface.co/datasets/microsoft/MMLU-CF). Feel free to give it a try! 🔥🔥🔥 +- **\[2024.12.26\]** We now support the Microsoft's Contamination-Free Multi-task language Understanding Benchmark [MMLU-CF](https://huggingface.co/datasets/microsoft/MMLU-CF). Feel free to give it a try! 🔥🔥🔥 - **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](configs/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it. - **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](configs/eval_musr.py) and give it a spin! 🔥🔥🔥 - **\[2024.11.14\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). Have a look at the [demo](configs/eval_babilong.py) and give it a try! 🔥🔥🔥 diff --git a/README_zh-CN.md b/README_zh-CN.md index 8760b9251..3e91eff0a 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -57,7 +57,7 @@ ## 🚀 最新进展 -- **\[2024.12.24\]** 现已支持Microsoft去污染多任务语言理解数据集[MMLU-CF](https://huggingface.co/datasets/microsoft/MMLU-CF),欢迎尝试! 🔥🔥🔥 +- **\[2024.12.26\]** 现已支持Microsoft去污染多任务语言理解数据集[MMLU-CF](https://huggingface.co/datasets/microsoft/MMLU-CF),欢迎尝试! 🔥🔥🔥 - **\[2024.12.17\]** 我们提供了12月CompassAcademic学术榜单评估脚本 [CompassAcademic](configs/eval_academic_leaderboard_202412.py),你可以通过简单地配置复现官方评测结果。 - **\[2024.10.14\]** 现已支持OpenAI多语言问答数据集[MMMLU](https://huggingface.co/datasets/openai/MMMLU),欢迎尝试! 🔥🔥🔥 - **\[2024.09.19\]** 现已支持[Qwen2.5](https://huggingface.co/Qwen)(0.5B to 72B) ,可以使用多种推理后端(huggingface/vllm/lmdeploy), 欢迎尝试! 🔥🔥🔥 diff --git a/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py b/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py index a3a5d39fb..a811bca61 100644 --- a/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py +++ b/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py @@ -16,7 +16,7 @@ mmlu_cf_datasets = [] for _name in categories: - _hint = f'There is a single choice question. Answer the question by replying A, B, C or D.' + _hint = f'There is a single choice question (with answers). Answer the question by replying A, B, C or D.' mmlu_cf_infer_cfg = dict( ice_template=dict( type=PromptTemplate, diff --git a/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py b/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py index 5035fdb85..9082db6d7 100644 --- a/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py +++ b/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py @@ -16,7 +16,7 @@ mmlu_cf_datasets = [] for _name in categories: - _hint = f'There is a single choice question. Answer the question by replying A, B, C or D.' + _hint = f'There is a single choice question (with answers). Answer the question by replying A, B, C or D.' 
mmlu_cf_infer_cfg = dict( ice_template=dict( type=PromptTemplate, @@ -42,7 +42,7 @@ ), ice_token='', ), - retriever=dict(type= ZeroRetriever), + retriever=dict(type=ZeroRetriever), inferencer=dict(type=GenInferencer), ) From 6a57af57bd91cac7c57a11bdd5a95b6234a93d4a Mon Sep 17 00:00:00 2001 From: fistyee Date: Fri, 27 Dec 2024 19:05:40 +0800 Subject: [PATCH 11/26] [Feature] Support MMLU-CF Benchmark --- README.md | 2 +- README_zh-CN.md | 2 +- configs/eval_mmlu_cf.py | 16 ---------------- 3 files changed, 2 insertions(+), 18 deletions(-) delete mode 100644 configs/eval_mmlu_cf.py diff --git a/README.md b/README.md index e75b8fd68..47483e360 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through ## 🚀 What's New -- **\[2024.12.26\]** We now support the Microsoft's Contamination-Free Multi-task language Understanding Benchmark [MMLU-CF](https://huggingface.co/datasets/microsoft/MMLU-CF). Feel free to give it a try! 🔥🔥🔥 +- **\[2024.12.27\]** We now support the Microsoft's Contamination-Free Multi-task language Understanding Benchmark [MMLU-CF](https://huggingface.co/datasets/microsoft/MMLU-CF). Feel free to give it a try! 🔥🔥🔥 - **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](configs/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it. - **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](configs/eval_musr.py) and give it a spin! 🔥🔥🔥 - **\[2024.11.14\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). Have a look at the [demo](configs/eval_babilong.py) and give it a try! 🔥🔥🔥 diff --git a/README_zh-CN.md b/README_zh-CN.md index 3e91eff0a..f0d6cc9af 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -57,7 +57,7 @@ ## 🚀 最新进展 -- **\[2024.12.26\]** 现已支持Microsoft去污染多任务语言理解数据集[MMLU-CF](https://huggingface.co/datasets/microsoft/MMLU-CF),欢迎尝试! 🔥🔥🔥 +- **\[2024.12.27\]** 现已支持Microsoft去污染多任务语言理解数据集[MMLU-CF](https://huggingface.co/datasets/microsoft/MMLU-CF),欢迎尝试! 🔥🔥🔥 - **\[2024.12.17\]** 我们提供了12月CompassAcademic学术榜单评估脚本 [CompassAcademic](configs/eval_academic_leaderboard_202412.py),你可以通过简单地配置复现官方评测结果。 - **\[2024.10.14\]** 现已支持OpenAI多语言问答数据集[MMMLU](https://huggingface.co/datasets/openai/MMMLU),欢迎尝试! 🔥🔥🔥 - **\[2024.09.19\]** 现已支持[Qwen2.5](https://huggingface.co/Qwen)(0.5B to 72B) ,可以使用多种推理后端(huggingface/vllm/lmdeploy), 欢迎尝试! 
🔥🔥🔥 diff --git a/configs/eval_mmlu_cf.py b/configs/eval_mmlu_cf.py deleted file mode 100644 index cd8ff1a1f..000000000 --- a/configs/eval_mmlu_cf.py +++ /dev/null @@ -1,16 +0,0 @@ -from mmengine.config import read_base - -with read_base(): - from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen import mmlu_cf_datasets - - from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import models as lmdeploy_qwen2_7b_instruct_model - from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model - - from opencompass.configs.summarizers.mmlu_cf import summarizer - from opencompass.configs.internal.clusters.local import infer_num_worker as infer - from opencompass.configs.internal.clusters.local import eval - -datasets = sum([v for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'], []) -models = sum([v for k, v in locals().items() if k.endswith('_model')], []) - -work_dir = 'outputs/debug/mmlu_cf' From 706108d590118d28666eac2b6b532607b684dbf4 Mon Sep 17 00:00:00 2001 From: fistyee Date: Fri, 27 Dec 2024 20:24:57 +0800 Subject: [PATCH 12/26] [Feature] Support MMLU-CF Benchmark --- configs/datasets/mmlu_cf/README.md | 37 +++---------------- .../configs/datasets/mmlu_cf/README.md | 37 +++---------------- 2 files changed, 12 insertions(+), 62 deletions(-) diff --git a/configs/datasets/mmlu_cf/README.md b/configs/datasets/mmlu_cf/README.md index 28d12b13e..c91b0fd2d 100644 --- a/configs/datasets/mmlu_cf/README.md +++ b/configs/datasets/mmlu_cf/README.md @@ -1,33 +1,12 @@ # MMLU-CF: A Contamination-free Multi-task Language Understanding Benchmark -

- -![](https://img.shields.io/badge/Task-MMLU_CF-orange) -![](https://img.shields.io/badge/Data-Released-green) -![](https://img.shields.io/badge/Code_License-MIT-blue) - -
-

[📜 Paper][🤗 HF Dataset][🐱 GitHub]

-## 1. The Motivation of MMLU-CF - - - -- The open-source nature of these benchmarks and the broad sources of training data for LLMs have inevitably led to benchmark contamination, resulting in unreliable evaluation results. To alleviate this issue, we propose MMLU-CF. -- (a) An instance of leakage in MMLU. When questions are used as prompt from the MMLU, certain LLMs, due to their memorization capabilities, directly provide **choices identical to the original ones**. (b) When questions are used as prompt from the MMLU-CF, LLMs only provide guessed choices. -This indicates that the MMLU test set suffers from data contamination and memorization by some LLMs, while the proposed MMLU-CF avoids such leakage. -

- Fig1_a - Fig1_b -

- - -## 2. How to Evaluate Your Models on the MMLU-CF Validation/Test Set +## How to Evaluate Your Models on the MMLU-CF Validation/Test Set #### (1) We perform automated testing only on Huggingface models. After following the steps outlined below and obtaining the validation set results from [OpenCompass](https://github.com/open-compass/opencompass), the test set results can then be accessed via GitHub Issues. @@ -65,7 +44,7 @@ Example 2, #### (2) For API models, if OpenCompass updates the model interface, you can obtain the test set results by sending a temporary key to [Email](yangyu.huang@microsoft.com) after receiving the validation set results. -## 3. What is the Difference between MMLU-CF and MMLU +## What is the Difference between MMLU-CF and MMLU MMLU focuses on the breadth and reasoning without considering contamination prevention. We apply three decontamination rules to mitigate unintentional data leakage while collecting data from a broader domain. Meanwhile, our MMLU-CF benchmark maintains the test set closed-source to prevent malicious data leakage.

@@ -73,7 +52,7 @@ MMLU focuses on the breadth and reasoning without considering contamination prev

-## 4. Leaderboard +## Leaderboard @@ -585,14 +564,10 @@ MMLU focuses on the breadth and reasoning without considering contamination prev
-## 5. Data Construction Pipeline -![Fig3](https://github.com/microsoft/MMLU-CF/blob/main/Figures/Fig_3.png) -The pipeline involves (1) MCQ Collection to gather a diverse set of questions; (2) MCQ Cleaning to ensure quality; (3) Difficulty Sampling to ensure an appropriate difficulty distribution for questions; (4) LLMs checking: The LLMs, including GPT-4o, Gemini, and Claude, are reviewing the accuracy and safety of the data; and (5) Contamination-Free Processing to prevent data leakage and maintain dataset purity. Ultimately, this process results in the MMLU-CF, consisting of 10,000 questions for the closed-source test set and 10,000 for the open-source validation set. - -## 6. Contact +## Contact For any inquiries or concerns, feel free to reach out to us via Email: [Qihao Zhao](qhzhaoo@gmail.com) and [Yangyu Huang](yanghuan@microsoft.com). -## 7. Citation +## Citation ``` @misc{zhao2024mmlucfcontaminationfreemultitasklanguage, title={MMLU-CF: A Contamination-free Multi-task Language Understanding Benchmark}, @@ -605,6 +580,6 @@ For any inquiries or concerns, feel free to reach out to us via Email: [Qihao Zh } ``` -## 8. License +## License This repository is licensed under the [MIT](https://github.com/microsoft/PEACE/blob/main/LICENSE) License. The validation dataset of MMLU-CF is subject to the [CDLA-2.0](https://cdla.dev/permissive-2-0/) License. diff --git a/opencompass/configs/datasets/mmlu_cf/README.md b/opencompass/configs/datasets/mmlu_cf/README.md index 28d12b13e..c91b0fd2d 100644 --- a/opencompass/configs/datasets/mmlu_cf/README.md +++ b/opencompass/configs/datasets/mmlu_cf/README.md @@ -1,33 +1,12 @@ # MMLU-CF: A Contamination-free Multi-task Language Understanding Benchmark -
- -![](https://img.shields.io/badge/Task-MMLU_CF-orange) -![](https://img.shields.io/badge/Data-Released-green) -![](https://img.shields.io/badge/Code_License-MIT-blue) - -
-

[📜 Paper][🤗 HF Dataset][🐱 GitHub]

-## 1. The Motivation of MMLU-CF - - - -- The open-source nature of these benchmarks and the broad sources of training data for LLMs have inevitably led to benchmark contamination, resulting in unreliable evaluation results. To alleviate this issue, we propose MMLU-CF. -- (a) An instance of leakage in MMLU. When questions are used as prompt from the MMLU, certain LLMs, due to their memorization capabilities, directly provide **choices identical to the original ones**. (b) When questions are used as prompt from the MMLU-CF, LLMs only provide guessed choices. -This indicates that the MMLU test set suffers from data contamination and memorization by some LLMs, while the proposed MMLU-CF avoids such leakage. -

- Fig1_a - Fig1_b -

- - -## 2. How to Evaluate Your Models on the MMLU-CF Validation/Test Set +## How to Evaluate Your Models on the MMLU-CF Validation/Test Set #### (1) We perform automated testing only on Huggingface models. After following the steps outlined below and obtaining the validation set results from [OpenCompass](https://github.com/open-compass/opencompass), the test set results can then be accessed via GitHub Issues. @@ -65,7 +44,7 @@ Example 2, #### (2) For API models, if OpenCompass updates the model interface, you can obtain the test set results by sending a temporary key to [Email](yangyu.huang@microsoft.com) after receiving the validation set results. -## 3. What is the Difference between MMLU-CF and MMLU +## What is the Difference between MMLU-CF and MMLU MMLU focuses on the breadth and reasoning without considering contamination prevention. We apply three decontamination rules to mitigate unintentional data leakage while collecting data from a broader domain. Meanwhile, our MMLU-CF benchmark maintains the test set closed-source to prevent malicious data leakage.

@@ -73,7 +52,7 @@ MMLU focuses on the breadth and reasoning without considering contamination prev

-## 4. Leaderboard +## Leaderboard @@ -585,14 +564,10 @@ MMLU focuses on the breadth and reasoning without considering contamination prev
-## 5. Data Construction Pipeline -![Fig3](https://github.com/microsoft/MMLU-CF/blob/main/Figures/Fig_3.png) -The pipeline involves (1) MCQ Collection to gather a diverse set of questions; (2) MCQ Cleaning to ensure quality; (3) Difficulty Sampling to ensure an appropriate difficulty distribution for questions; (4) LLMs checking: The LLMs, including GPT-4o, Gemini, and Claude, are reviewing the accuracy and safety of the data; and (5) Contamination-Free Processing to prevent data leakage and maintain dataset purity. Ultimately, this process results in the MMLU-CF, consisting of 10,000 questions for the closed-source test set and 10,000 for the open-source validation set. - -## 6. Contact +## Contact For any inquiries or concerns, feel free to reach out to us via Email: [Qihao Zhao](qhzhaoo@gmail.com) and [Yangyu Huang](yanghuan@microsoft.com). -## 7. Citation +## Citation ``` @misc{zhao2024mmlucfcontaminationfreemultitasklanguage, title={MMLU-CF: A Contamination-free Multi-task Language Understanding Benchmark}, @@ -605,6 +580,6 @@ For any inquiries or concerns, feel free to reach out to us via Email: [Qihao Zh } ``` -## 8. License +## License This repository is licensed under the [MIT](https://github.com/microsoft/PEACE/blob/main/LICENSE) License. The validation dataset of MMLU-CF is subject to the [CDLA-2.0](https://cdla.dev/permissive-2-0/) License. From 5ab6362d41214ff24b97b9f4094b985a1f3af10d Mon Sep 17 00:00:00 2001 From: fistyee Date: Mon, 30 Dec 2024 13:29:05 +0800 Subject: [PATCH 13/26] [Feature] Support MMLU-CF Benchmark --- configs/datasets/mmlu_cf/README.md | 585 ------------------ .../configs/datasets/mmlu_cf/README.md | 585 ------------------ tools/compare_configs.py | 2 +- 3 files changed, 1 insertion(+), 1171 deletions(-) delete mode 100644 configs/datasets/mmlu_cf/README.md delete mode 100644 opencompass/configs/datasets/mmlu_cf/README.md diff --git a/configs/datasets/mmlu_cf/README.md b/configs/datasets/mmlu_cf/README.md deleted file mode 100644 index c91b0fd2d..000000000 --- a/configs/datasets/mmlu_cf/README.md +++ /dev/null @@ -1,585 +0,0 @@ -# MMLU-CF: A Contamination-free Multi-task Language Understanding Benchmark - -

- [📜 Paper] • - [🤗 HF Dataset] • - [🐱 GitHub] -

- -## How to Evaluate Your Models on the MMLU-CF Validation/Test Set - - #### (1) We perform automated testing only on Huggingface models. After following the steps outlined below and obtaining the validation set results from [OpenCompass](https://github.com/open-compass/opencompass), the test set results can then be accessed via GitHub Issues. - - **Step 1**. **Validation set evaluation**: Obtaining the validation results for your model using LLM evaluation tools, [OpenCompass](https://github.com/open-compass/opencompass). The validation dataset download from [🤗 Huggingface](https://huggingface.co/datasets/microsoft/MMLU-CF). The data directory structure in the opencompass: - -``` -data -└── mmlu_cf -    ├── dev - ├── Biology_dev.csv - ├── ... - └── val - ├── Biology_val.csv - ├── ... -``` - - **Step 2**. **Test set evaluation**: With the validation results, submit a GitHub issue on the [MMLU-CF](https://github.com/) GitHub homepage to request the test set results. Please follow the format below: - -Example 1, -``` -Title: -Test set evaluation Request - add HF model [microsoft/phi-4] -Content: -The result on validation set: 68.5% -``` -Example 2, -

- Fig6 -

- - **Notably**: - - Ensure you use the format with square brackets `[ ]` as shown. The model name **microsoft/phi-4** corresponds to the name on HuggingFace. - - We will automatically submit your model. The time to receive the results depends on the number of models being evaluated, but it typically takes **1-2 weeks**. - - #### (2) For API models, if OpenCompass updates the model interface, you can obtain the test set results by sending a temporary key to [Email](yangyu.huang@microsoft.com) after receiving the validation set results. - - -## What is the Difference between MMLU-CF and MMLU -MMLU focuses on the breadth and reasoning without considering contamination prevention. We apply three decontamination rules to mitigate unintentional data leakage while collecting data from a broader domain. Meanwhile, our MMLU-CF benchmark maintains the test set closed-source to prevent malicious data leakage. - -

- Fig4 -

- - -## Leaderboard - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelMMLU MMLU-CF
5-shot 5-shot Test 5-shot Validation 5-shot Δ 0-shot Test 0-shot Validation 0-shot Δ
API
GPT-4o88.073.473.4+0.071.972.4-0.5
GPT-4-Turbo86.570.470.1+0.368.968.7+0.1
GPT-4o-mini81.865.565.1+0.466.065.3+0.7
Gemini-1.5-Flash78.764.864.9-0.156.756.9-0.2
GPT-3.5-Turbo71.458.259.0-0.857.258.1-0.9
Large
Qwen2.5-72B-instruct85.371.671.3+0.370.670.4+0.2
Llama-3-70B-instruct82.068.968.8+0.168.167.4+0.7
Llama-3.3-70B-instruct86.368.867.8+1.067.667.5+0.1
Llama-3.1-70B-instruct86.068.768.1+0.670.469.7+0.7
Phi-3.5-MoE-instruct78.964.664.5+0.163.162.1+1.0
Qwen2-72B-instruct82.363.764.3-0.662.462.5-0.1
Mixtral-8x22B-instruct76.262.862.5+0.365.364.8+0.5
Qwen1.5-72B-chat75.659.860.2-0.459.159.6-0.5
Llama-2-70B-chat68.952.251.8+0.451.250.9+0.3
Medium
Qwen2.5-32B-instruct83.969.768.8+0.968.968.8+0.1
Phi-4-14B84.867.868.5-0.768.569.4-0.9
Qwen2.5-14B-instruct79.966.466.1+0.367.066.0+1.0
Phi-3-medium-instruct77.964.264.2+0.062.562.7-0.2
Gemma2-27B75.263.963.5+0.464.264.0+0.2
Yi-1.5-34B-chat76.861.360.5+0.860.659.5+1.1
Mixtral-8x7B-instruct-v0.170.558.357.1-1.258.958.5+0.4
Deepseek-v2-lite-chat55.749.348.7+0.648.247.7+0.5
Baichuan-2-13B-chat57.348.348.6-0.347.148.1-1.0
Llama-2-13B-chat54.842.842.1+0.744.844.6+0.2
Small
Qwen2.5-7B-instruct75.461.360.4+0.959.358.6+0.7
Qwen2-7B-instruct70.558.157.9+0.258.357.4+0.9
Glm-4-9B-chat72.457.857.9-0.158.658.7-0.1
Internlm-2.5-7B-chat72.857.356.8+0.557.956.9+1.0
Llama-3-8B-instruct68.457.356.5+0.856.455.4+1.0
Llama-3.1-8B-instruct68.157.157.9-0.856.156.1+0.0
Gemma-2-9B71.353.753.3+0.432.131.2+0.9
Yi-1.5-6B-chat62.852.851.4+1.452.251.9+0.3
Mistral-7B-instruct-v0.360.350.750.9-0.251.150.9+0.2
Baichuan-2-7B-chat52.944.543.9+0.643.944.0-0.1
Llama-2-7B-chat45.339.438.5+0.941.940.9+1.0
Mini
Phi-3-mini-instruct (3.8B)70.957.958.1-0.258.257.5+0.7
Phi-3.5-mini-instruct (3.8B)69.157.957.4+0.558.357.7+0.6
Qwen2.5-3B-instruct64.455.956.4-0.554.353.9+0.4
Qwen2.5-1.5B-instruct50.751.251.0+0.250.750.4+0.3
Qwen2-1.5B-instruct52.447.147.5-0.445.244.5+0.7
Gemma-2-2B51.343.942.4+1.530.529.4+0.9
Qwen2.5-0.5B-instruct24.141.941.1+0.836.034.9+1.1
Internlm-2-chat-1.8b47.140.539.4+1.141.239.8+1.4
Qwen2-0.5B-instruct37.938.338.3+0.033.533.5+0.0
- -## Contact -For any inquiries or concerns, feel free to reach out to us via Email: [Qihao Zhao](qhzhaoo@gmail.com) and [Yangyu Huang](yanghuan@microsoft.com). - -## Citation -``` -@misc{zhao2024mmlucfcontaminationfreemultitasklanguage, - title={MMLU-CF: A Contamination-free Multi-task Language Understanding Benchmark}, - author={Qihao Zhao and Yangyu Huang and Tengchao Lv and Lei Cui and Qinzheng Sun and Shaoguang Mao and Xin Zhang and Ying Xin and Qiufeng Yin and Scarlett Li and Furu Wei}, - year={2024}, - eprint={2412.15194}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.15194}, -} -``` - -## License -This repository is licensed under the [MIT](https://github.com/microsoft/PEACE/blob/main/LICENSE) License. -The validation dataset of MMLU-CF is subject to the [CDLA-2.0](https://cdla.dev/permissive-2-0/) License. diff --git a/opencompass/configs/datasets/mmlu_cf/README.md b/opencompass/configs/datasets/mmlu_cf/README.md deleted file mode 100644 index c91b0fd2d..000000000 --- a/opencompass/configs/datasets/mmlu_cf/README.md +++ /dev/null @@ -1,585 +0,0 @@ -# MMLU-CF: A Contamination-free Multi-task Language Understanding Benchmark - -

- [📜 Paper] • - [🤗 HF Dataset] • - [🐱 GitHub] -

- -## How to Evaluate Your Models on the MMLU-CF Validation/Test Set - - #### (1) We perform automated testing only on Huggingface models. After following the steps outlined below and obtaining the validation set results from [OpenCompass](https://github.com/open-compass/opencompass), the test set results can then be accessed via GitHub Issues. - - **Step 1**. **Validation set evaluation**: Obtaining the validation results for your model using LLM evaluation tools, [OpenCompass](https://github.com/open-compass/opencompass). The validation dataset download from [🤗 Huggingface](https://huggingface.co/datasets/microsoft/MMLU-CF). The data directory structure in the opencompass: - -``` -data -└── mmlu_cf -    ├── dev - ├── Biology_dev.csv - ├── ... - └── val - ├── Biology_val.csv - ├── ... -``` - - **Step 2**. **Test set evaluation**: With the validation results, submit a GitHub issue on the [MMLU-CF](https://github.com/) GitHub homepage to request the test set results. Please follow the format below: - -Example 1, -``` -Title: -Test set evaluation Request - add HF model [microsoft/phi-4] -Content: -The result on validation set: 68.5% -``` -Example 2, -

- Fig6 -

- - **Notably**: - - Ensure you use the format with square brackets `[ ]` as shown. The model name **microsoft/phi-4** corresponds to the name on HuggingFace. - - We will automatically submit your model. The time to receive the results depends on the number of models being evaluated, but it typically takes **1-2 weeks**. - - #### (2) For API models, if OpenCompass updates the model interface, you can obtain the test set results by sending a temporary key to [Email](yangyu.huang@microsoft.com) after receiving the validation set results. - - -## What is the Difference between MMLU-CF and MMLU -MMLU focuses on the breadth and reasoning without considering contamination prevention. We apply three decontamination rules to mitigate unintentional data leakage while collecting data from a broader domain. Meanwhile, our MMLU-CF benchmark maintains the test set closed-source to prevent malicious data leakage. - -

- Fig4 -

- - -## Leaderboard - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelMMLU MMLU-CF
5-shot 5-shot Test 5-shot Validation 5-shot Δ 0-shot Test 0-shot Validation 0-shot Δ
API
GPT-4o88.073.473.4+0.071.972.4-0.5
GPT-4-Turbo86.570.470.1+0.368.968.7+0.1
GPT-4o-mini81.865.565.1+0.466.065.3+0.7
Gemini-1.5-Flash78.764.864.9-0.156.756.9-0.2
GPT-3.5-Turbo71.458.259.0-0.857.258.1-0.9
Large
Qwen2.5-72B-instruct85.371.671.3+0.370.670.4+0.2
Llama-3-70B-instruct82.068.968.8+0.168.167.4+0.7
Llama-3.3-70B-instruct86.368.867.8+1.067.667.5+0.1
Llama-3.1-70B-instruct86.068.768.1+0.670.469.7+0.7
Phi-3.5-MoE-instruct78.964.664.5+0.163.162.1+1.0
Qwen2-72B-instruct82.363.764.3-0.662.462.5-0.1
Mixtral-8x22B-instruct76.262.862.5+0.365.364.8+0.5
Qwen1.5-72B-chat75.659.860.2-0.459.159.6-0.5
Llama-2-70B-chat68.952.251.8+0.451.250.9+0.3
Medium
Qwen2.5-32B-instruct83.969.768.8+0.968.968.8+0.1
Phi-4-14B84.867.868.5-0.768.569.4-0.9
Qwen2.5-14B-instruct79.966.466.1+0.367.066.0+1.0
Phi-3-medium-instruct77.964.264.2+0.062.562.7-0.2
Gemma2-27B75.263.963.5+0.464.264.0+0.2
Yi-1.5-34B-chat76.861.360.5+0.860.659.5+1.1
Mixtral-8x7B-instruct-v0.170.558.357.1-1.258.958.5+0.4
Deepseek-v2-lite-chat55.749.348.7+0.648.247.7+0.5
Baichuan-2-13B-chat57.348.348.6-0.347.148.1-1.0
Llama-2-13B-chat54.842.842.1+0.744.844.6+0.2
Small
Qwen2.5-7B-instruct75.461.360.4+0.959.358.6+0.7
Qwen2-7B-instruct70.558.157.9+0.258.357.4+0.9
Glm-4-9B-chat72.457.857.9-0.158.658.7-0.1
Internlm-2.5-7B-chat72.857.356.8+0.557.956.9+1.0
Llama-3-8B-instruct68.457.356.5+0.856.455.4+1.0
Llama-3.1-8B-instruct68.157.157.9-0.856.156.1+0.0
Gemma-2-9B71.353.753.3+0.432.131.2+0.9
Yi-1.5-6B-chat62.852.851.4+1.452.251.9+0.3
Mistral-7B-instruct-v0.360.350.750.9-0.251.150.9+0.2
Baichuan-2-7B-chat52.944.543.9+0.643.944.0-0.1
Llama-2-7B-chat45.339.438.5+0.941.940.9+1.0
Mini
Phi-3-mini-instruct (3.8B)70.957.958.1-0.258.257.5+0.7
Phi-3.5-mini-instruct (3.8B)69.157.957.4+0.558.357.7+0.6
Qwen2.5-3B-instruct64.455.956.4-0.554.353.9+0.4
Qwen2.5-1.5B-instruct50.751.251.0+0.250.750.4+0.3
Qwen2-1.5B-instruct52.447.147.5-0.445.244.5+0.7
Gemma-2-2B51.343.942.4+1.530.529.4+0.9
Qwen2.5-0.5B-instruct24.141.941.1+0.836.034.9+1.1
Internlm-2-chat-1.8b47.140.539.4+1.141.239.8+1.4
Qwen2-0.5B-instruct37.938.338.3+0.033.533.5+0.0
- -## Contact -For any inquiries or concerns, feel free to reach out to us via Email: [Qihao Zhao](qhzhaoo@gmail.com) and [Yangyu Huang](yanghuan@microsoft.com). - -## Citation -``` -@misc{zhao2024mmlucfcontaminationfreemultitasklanguage, - title={MMLU-CF: A Contamination-free Multi-task Language Understanding Benchmark}, - author={Qihao Zhao and Yangyu Huang and Tengchao Lv and Lei Cui and Qinzheng Sun and Shaoguang Mao and Xin Zhang and Ying Xin and Qiufeng Yin and Scarlett Li and Furu Wei}, - year={2024}, - eprint={2412.15194}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2412.15194}, -} -``` - -## License -This repository is licensed under the [MIT](https://github.com/microsoft/PEACE/blob/main/LICENSE) License. -The validation dataset of MMLU-CF is subject to the [CDLA-2.0](https://cdla.dev/permissive-2-0/) License. diff --git a/tools/compare_configs.py b/tools/compare_configs.py index 480e0307e..b0d3ae10b 100755 --- a/tools/compare_configs.py +++ b/tools/compare_configs.py @@ -58,7 +58,7 @@ def compare_folders(folder1, folder2, extensions, ignore_folder): raise ValueError(f'Files differ: {file1} and {file2}') else: pass - # logger.info(f"Files are the same: {file1} and {file2}") + logger.info(f"Files are the same: {file1} and {file2}") def main(): From ddd558362b55409767b0f3fdc04ccf5d659ce80b Mon Sep 17 00:00:00 2001 From: fistyee Date: Mon, 30 Dec 2024 13:46:17 +0800 Subject: [PATCH 14/26] [Feature] Support MMLU-CF Benchmark --- tools/compare_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/compare_configs.py b/tools/compare_configs.py index b0d3ae10b..480e0307e 100755 --- a/tools/compare_configs.py +++ b/tools/compare_configs.py @@ -58,7 +58,7 @@ def compare_folders(folder1, folder2, extensions, ignore_folder): raise ValueError(f'Files differ: {file1} and {file2}') else: pass - logger.info(f"Files are the same: {file1} and {file2}") + # logger.info(f"Files are the same: {file1} and {file2}") def main(): From 956fe456a915601bc833aac05ffd081159bcb6b0 Mon Sep 17 00:00:00 2001 From: fistyee Date: Mon, 30 Dec 2024 14:28:28 +0800 Subject: [PATCH 15/26] [Feature] Support MMLU-CF Benchmark --- README.md | 1 - README_zh-CN.md | 1 - 2 files changed, 2 deletions(-) diff --git a/README.md b/README.md index 47483e360..6d8cabe5f 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,6 @@ Just like a compass guides us on our journey, OpenCompass will guide you through ## 🚀 What's New -- **\[2024.12.27\]** We now support the Microsoft's Contamination-Free Multi-task language Understanding Benchmark [MMLU-CF](https://huggingface.co/datasets/microsoft/MMLU-CF). Feel free to give it a try! 🔥🔥🔥 - **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](configs/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it. - **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](configs/eval_musr.py) and give it a spin! 🔥🔥🔥 - **\[2024.11.14\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). Have a look at the [demo](configs/eval_babilong.py) and give it a try! 
🔥🔥🔥 diff --git a/README_zh-CN.md b/README_zh-CN.md index f0d6cc9af..21c0d666e 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -57,7 +57,6 @@ ## 🚀 最新进展 -- **\[2024.12.27\]** 现已支持Microsoft去污染多任务语言理解数据集[MMLU-CF](https://huggingface.co/datasets/microsoft/MMLU-CF),欢迎尝试! 🔥🔥🔥 - **\[2024.12.17\]** 我们提供了12月CompassAcademic学术榜单评估脚本 [CompassAcademic](configs/eval_academic_leaderboard_202412.py),你可以通过简单地配置复现官方评测结果。 - **\[2024.10.14\]** 现已支持OpenAI多语言问答数据集[MMMLU](https://huggingface.co/datasets/openai/MMMLU),欢迎尝试! 🔥🔥🔥 - **\[2024.09.19\]** 现已支持[Qwen2.5](https://huggingface.co/Qwen)(0.5B to 72B) ,可以使用多种推理后端(huggingface/vllm/lmdeploy), 欢迎尝试! 🔥🔥🔥 From a222713fc7e3fd011fbd40306136954c7dc57ec4 Mon Sep 17 00:00:00 2001 From: fistyee Date: Wed, 8 Jan 2025 18:05:11 +0800 Subject: [PATCH 16/26] [Feature] Support MMLU-CF Benchmark --- configs/eval_mmlu_cf.py | 16 +++ configs/summarizers/groups/mmlu_cf.py | 2 +- configs/summarizers/mmlu_cf.py | 2 +- opencompass/cli/main.py | 3 +- .../configs/summarizers/groups/mmlu_cf.py | 2 +- opencompass/configs/summarizers/mmlu_cf.py | 2 +- opencompass/datasets/mmlu_cf.py | 99 +++++++------------ opencompass/utils/datasets_info.py | 6 -- opencompass/utils/run.py | 5 +- 9 files changed, 58 insertions(+), 79 deletions(-) create mode 100644 configs/eval_mmlu_cf.py diff --git a/configs/eval_mmlu_cf.py b/configs/eval_mmlu_cf.py new file mode 100644 index 000000000..2f4a47aea --- /dev/null +++ b/configs/eval_mmlu_cf.py @@ -0,0 +1,16 @@ +from mmengine.config import read_base +import pdb +with read_base(): + from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen import mmlu_cf_datasets + + from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import models as lmdeploy_qwen2_7b_instruct_model + from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model + + from opencompass.configs.summarizers.mmlu_cf import summarizer + from opencompass.configs.internal.clusters.local import infer_num_worker as infer + from opencompass.configs.internal.clusters.local import eval + +datasets = sum([v for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'], []) +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +work_dir = 'outputs/debug/mmlu_cf' diff --git a/configs/summarizers/groups/mmlu_cf.py b/configs/summarizers/groups/mmlu_cf.py index 0ecc5f8ad..3e0b8b25b 100644 --- a/configs/summarizers/groups/mmlu_cf.py +++ b/configs/summarizers/groups/mmlu_cf.py @@ -1,4 +1,4 @@ -categories = ['Computer Science', 'Math', 'Physics', 'Chemistry', 'Law', 'Engineering', 'Other', 'Economics', 'Health', 'Psychology', 'Business', 'Biology', 'Philosophy', 'History'] +categories = ['Math', 'Physics', 'Chemistry', 'Law', 'Engineering', 'Other', 'Economics', 'Health', 'Psychology', 'Business', 'Biology', 'Philosophy', 'Computer_Science', 'History'] mmlu_cf_summary_groups = [ {'name': 'mmlu_cf', 'subsets': ['mmlu_cf_' + c.replace(' ', '_') for c in categories]}, diff --git a/configs/summarizers/mmlu_cf.py b/configs/summarizers/mmlu_cf.py index d607c598d..f5d3e7a98 100644 --- a/configs/summarizers/mmlu_cf.py +++ b/configs/summarizers/mmlu_cf.py @@ -5,7 +5,6 @@ summarizer = dict( dataset_abbrs=[ - 'mmlu_cf', 'mmlu_cf_Biology', 'mmlu_cf_Business', 'mmlu_cf_Chemistry', @@ -20,6 +19,7 @@ 'mmlu_cf_Physics', 'mmlu_cf_Psychology', 'mmlu_cf_Other', + 'mmlu_cf', ], summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), ) diff --git a/opencompass/cli/main.py 
b/opencompass/cli/main.py index 63377371c..dd3067580 100644 --- a/opencompass/cli/main.py +++ b/opencompass/cli/main.py @@ -223,7 +223,6 @@ def main(): args.debug = True # initialize logger logger = get_logger(log_level='DEBUG' if args.debug else 'INFO') - cfg = get_config_from_arg(args) if args.work_dir is not None: cfg['work_dir'] = args.work_dir @@ -253,7 +252,7 @@ def main(): logger.info(f'Current exp folder: {current_workdir}') os.makedirs(osp.join(cfg.work_dir, 'configs'), exist_ok=True) - + # dump config output_config_path = osp.join(cfg.work_dir, 'configs', f'{cfg_time_str}_{os.getpid()}.py') diff --git a/opencompass/configs/summarizers/groups/mmlu_cf.py b/opencompass/configs/summarizers/groups/mmlu_cf.py index 0ecc5f8ad..3e0b8b25b 100644 --- a/opencompass/configs/summarizers/groups/mmlu_cf.py +++ b/opencompass/configs/summarizers/groups/mmlu_cf.py @@ -1,4 +1,4 @@ -categories = ['Computer Science', 'Math', 'Physics', 'Chemistry', 'Law', 'Engineering', 'Other', 'Economics', 'Health', 'Psychology', 'Business', 'Biology', 'Philosophy', 'History'] +categories = ['Math', 'Physics', 'Chemistry', 'Law', 'Engineering', 'Other', 'Economics', 'Health', 'Psychology', 'Business', 'Biology', 'Philosophy', 'Computer_Science', 'History'] mmlu_cf_summary_groups = [ {'name': 'mmlu_cf', 'subsets': ['mmlu_cf_' + c.replace(' ', '_') for c in categories]}, diff --git a/opencompass/configs/summarizers/mmlu_cf.py b/opencompass/configs/summarizers/mmlu_cf.py index d607c598d..f5d3e7a98 100644 --- a/opencompass/configs/summarizers/mmlu_cf.py +++ b/opencompass/configs/summarizers/mmlu_cf.py @@ -5,7 +5,6 @@ summarizer = dict( dataset_abbrs=[ - 'mmlu_cf', 'mmlu_cf_Biology', 'mmlu_cf_Business', 'mmlu_cf_Chemistry', @@ -20,6 +19,7 @@ 'mmlu_cf_Physics', 'mmlu_cf_Psychology', 'mmlu_cf_Other', + 'mmlu_cf', ], summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), ) diff --git a/opencompass/datasets/mmlu_cf.py b/opencompass/datasets/mmlu_cf.py index 471e7b4f0..a677862e9 100644 --- a/opencompass/datasets/mmlu_cf.py +++ b/opencompass/datasets/mmlu_cf.py @@ -1,72 +1,43 @@ -import csv -import json -import os.path as osp -from os import environ - -from datasets import Dataset, DatasetDict - +from datasets import load_dataset, DatasetDict from opencompass.registry import LOAD_DATASET -from opencompass.utils import get_data_path - from .base import BaseDataset - @LOAD_DATASET.register_module() class MMLUCFDataset(BaseDataset): @staticmethod def load(path: str, name: str): - path = get_data_path(path) - dataset = DatasetDict() - if environ.get('DATASET_SOURCE') == 'ModelScope': - from modelscope import MsDataset - for split in ['dev', 'test']: - # 从 ModelScope 加载数据 - if split == 'test': - _split = 'val' - ms_dataset = MsDataset.load(path, - subset_name=name, - split=_split) - else: - ms_dataset = MsDataset.load(path, - subset_name=name, - split=split) - - dataset_list = [] - for i, line in ms_dataset: - if i == 0: # 跳过第一行 - continue - dataset_list.append({ - 'input': line['question'], - 'A': line['choices'][0], - 'B': line['choices'][1], - 'C': line['choices'][2], - 'D': line['choices'][3], - 'target': 'ABCD'[line['answer']], - }) - dataset[split] = Dataset.from_list(dataset_list) - else: - for split in ['dev', 'test']: - if split == 'test': - _split = 'val' - filename = osp.join(path, _split, f'{name}_{_split}.csv') - else: - filename = osp.join(path, split, f'{name}_{split}.csv') - raw_data = [] - with open(filename, encoding='utf-8') as f: - reader = csv.reader(f) - next(reader) - for 
row in reader: - - assert len(row) == 6 - raw_data.append({ - 'input': row[0], - 'A': row[1], - 'B': row[2], - 'C': row[3], - 'D': row[4], - 'target': row[5], - }) - dataset[split] = Dataset.from_list(raw_data) - return dataset - + """ + Loading HuggingFace datasets + """ + try: + # Use HuggingFace's load_dataset method to load the dataset + hf_dataset = load_dataset("microsoft/MMLU-CF") + columns_to_keep = ["Question", "A", "B", "C", "D", "Answer"] + hf_dataset = hf_dataset.map(lambda x: {key: x[key] for key in columns_to_keep}) + splits = ['dev', 'val'] + + for split in splits: + sub_set = f'{name}_{split}' + + # Rename fields here if they don't match the expected names + hf_dataset[sub_set] = hf_dataset[sub_set].map(lambda example: { + "input": example["Question"], + "A": example["A"], + "B": example["B"], + "C": example["C"], + "D": example["D"], + "target": example["Answer"] + }) + + # Create a DatasetDict and return it + dataset = DatasetDict({ + "dev": hf_dataset[f'{name}_{splits[0]}'], + "test": hf_dataset[f'{name}_{splits[1]}'] # Use 'val' as 'test' + }) + return dataset + + except Exception as e: + print(f"Failed to load the dataset from HuggingFace: {e}") + print("Please check if the dataset exists and if the network connection is stable.") + raise \ No newline at end of file diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 850c1bea3..aa187d36f 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -181,12 +181,6 @@ "hf_id": "opencompass/mmlu", "local": "./data/mmlu/", }, - # MMLU_CF - "opencompass/mmlu_cf": { - "ms_id": "", - "hf_id": "microsoft/MMLU-CF", - "local": "./data/mmlu_cf/", - }, # MMLU_PRO "opencompass/mmlu_pro": { "ms_id": "", diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py index accd3468f..ed73985be 100644 --- a/opencompass/utils/run.py +++ b/opencompass/utils/run.py @@ -92,7 +92,6 @@ def get_config_from_arg(args) -> Config: 2. args.models and args.datasets 3. 
Huggingface parameter groups and args.datasets """ - if args.config: config = Config.fromfile(args.config, format_python_code=False) config = try_fill_in_custom_cfgs(config) @@ -116,6 +115,7 @@ def get_config_from_arg(args) -> Config: raise ValueError('You must specify "--datasets" or "--custom-dataset-path" if you do not specify a config file path.') datasets = [] if args.datasets: + script_dir = os.path.dirname(os.path.abspath(__file__)) parent_dir = os.path.dirname(script_dir) default_configs_dir = os.path.join(parent_dir, 'configs') @@ -124,7 +124,6 @@ def get_config_from_arg(args) -> Config: os.path.join(args.config_dir, 'dataset_collections'), os.path.join(default_configs_dir, './datasets'), os.path.join(default_configs_dir, './dataset_collections') - ] for dataset_arg in args.datasets: if '/' in dataset_arg: @@ -133,7 +132,6 @@ def get_config_from_arg(args) -> Config: else: dataset_name = dataset_arg dataset_key_suffix = '_datasets' - for dataset in match_cfg_file(datasets_dir, [dataset_name]): logger.info(f'Loading {dataset[0]}: {dataset[1]}') cfg = Config.fromfile(dataset[1]) @@ -196,6 +194,7 @@ def get_config_from_arg(args) -> Config: # set infer accelerator if needed if args.accelerator in ['vllm', 'lmdeploy']: models = change_accelerator(models, args.accelerator) + # parse summarizer args summarizer_arg = args.summarizer if args.summarizer is not None else 'example' script_dir = os.path.dirname(os.path.abspath(__file__)) From d5f756e356ac3db2dcc2fad73a052844bd0d22ec Mon Sep 17 00:00:00 2001 From: fistyee Date: Wed, 8 Jan 2025 18:09:24 +0800 Subject: [PATCH 17/26] [Feature] Support MMLU-CF Benchmark --- opencompass/utils/run.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py index ed73985be..accd3468f 100644 --- a/opencompass/utils/run.py +++ b/opencompass/utils/run.py @@ -92,6 +92,7 @@ def get_config_from_arg(args) -> Config: 2. args.models and args.datasets 3. 
Huggingface parameter groups and args.datasets """ + if args.config: config = Config.fromfile(args.config, format_python_code=False) config = try_fill_in_custom_cfgs(config) @@ -115,7 +116,6 @@ def get_config_from_arg(args) -> Config: raise ValueError('You must specify "--datasets" or "--custom-dataset-path" if you do not specify a config file path.') datasets = [] if args.datasets: - script_dir = os.path.dirname(os.path.abspath(__file__)) parent_dir = os.path.dirname(script_dir) default_configs_dir = os.path.join(parent_dir, 'configs') @@ -124,6 +124,7 @@ def get_config_from_arg(args) -> Config: os.path.join(args.config_dir, 'dataset_collections'), os.path.join(default_configs_dir, './datasets'), os.path.join(default_configs_dir, './dataset_collections') + ] for dataset_arg in args.datasets: if '/' in dataset_arg: @@ -132,6 +133,7 @@ def get_config_from_arg(args) -> Config: else: dataset_name = dataset_arg dataset_key_suffix = '_datasets' + for dataset in match_cfg_file(datasets_dir, [dataset_name]): logger.info(f'Loading {dataset[0]}: {dataset[1]}') cfg = Config.fromfile(dataset[1]) @@ -194,7 +196,6 @@ def get_config_from_arg(args) -> Config: # set infer accelerator if needed if args.accelerator in ['vllm', 'lmdeploy']: models = change_accelerator(models, args.accelerator) - # parse summarizer args summarizer_arg = args.summarizer if args.summarizer is not None else 'example' script_dir = os.path.dirname(os.path.abspath(__file__)) From 2329a5f41894349715d4b0e56bf9ca1bd80eca20 Mon Sep 17 00:00:00 2001 From: fistyee Date: Wed, 8 Jan 2025 18:10:41 +0800 Subject: [PATCH 18/26] [Feature] Support MMLU-CF Benchmark --- opencompass/cli/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/opencompass/cli/main.py b/opencompass/cli/main.py index dd3067580..63377371c 100644 --- a/opencompass/cli/main.py +++ b/opencompass/cli/main.py @@ -223,6 +223,7 @@ def main(): args.debug = True # initialize logger logger = get_logger(log_level='DEBUG' if args.debug else 'INFO') + cfg = get_config_from_arg(args) if args.work_dir is not None: cfg['work_dir'] = args.work_dir @@ -252,7 +253,7 @@ def main(): logger.info(f'Current exp folder: {current_workdir}') os.makedirs(osp.join(cfg.work_dir, 'configs'), exist_ok=True) - + # dump config output_config_path = osp.join(cfg.work_dir, 'configs', f'{cfg_time_str}_{os.getpid()}.py') From 93c44118048d977f7d0c6156084c6c8dedd3b323 Mon Sep 17 00:00:00 2001 From: fistyee Date: Wed, 8 Jan 2025 18:20:39 +0800 Subject: [PATCH 19/26] [Feature] Support MMLU-CF Benchmark --- opencompass/datasets/mmlu_cf.py | 50 +++++++++++++++------------------ 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/opencompass/datasets/mmlu_cf.py b/opencompass/datasets/mmlu_cf.py index a677862e9..ea639c7aa 100644 --- a/opencompass/datasets/mmlu_cf.py +++ b/opencompass/datasets/mmlu_cf.py @@ -10,34 +10,28 @@ def load(path: str, name: str): """ Loading HuggingFace datasets """ - try: - # Use HuggingFace's load_dataset method to load the dataset - hf_dataset = load_dataset("microsoft/MMLU-CF") - columns_to_keep = ["Question", "A", "B", "C", "D", "Answer"] - hf_dataset = hf_dataset.map(lambda x: {key: x[key] for key in columns_to_keep}) - splits = ['dev', 'val'] + # Use HuggingFace's load_dataset method to load the dataset + hf_dataset = load_dataset("microsoft/MMLU-CF") + columns_to_keep = ["Question", "A", "B", "C", "D", "Answer"] + hf_dataset = hf_dataset.map(lambda x: {key: x[key] for key in columns_to_keep}) + splits = ['dev', 'val'] - for split in splits: - sub_set 
= f'{name}_{split}' - - # Rename fields here if they don't match the expected names - hf_dataset[sub_set] = hf_dataset[sub_set].map(lambda example: { - "input": example["Question"], - "A": example["A"], - "B": example["B"], - "C": example["C"], - "D": example["D"], - "target": example["Answer"] - }) - - # Create a DatasetDict and return it - dataset = DatasetDict({ - "dev": hf_dataset[f'{name}_{splits[0]}'], - "test": hf_dataset[f'{name}_{splits[1]}'] # Use 'val' as 'test' + for split in splits: + sub_set = f'{name}_{split}' + + # Rename fields here if they don't match the expected names + hf_dataset[sub_set] = hf_dataset[sub_set].map(lambda example: { + "input": example["Question"], + "A": example["A"], + "B": example["B"], + "C": example["C"], + "D": example["D"], + "target": example["Answer"] }) - return dataset - except Exception as e: - print(f"Failed to load the dataset from HuggingFace: {e}") - print("Please check if the dataset exists and if the network connection is stable.") - raise \ No newline at end of file + # Create a DatasetDict and return it + dataset = DatasetDict({ + "dev": hf_dataset[f'{name}_{splits[0]}'], + "test": hf_dataset[f'{name}_{splits[1]}'] # Use 'val' as 'test' + }) + return dataset \ No newline at end of file From 77df499f36e35055fe296ae4c074817c1f54b982 Mon Sep 17 00:00:00 2001 From: liushz Date: Wed, 8 Jan 2025 12:15:29 +0000 Subject: [PATCH 20/26] Update mmlu-cf --- configs/datasets/mmlu_cf/mmlu_cf_gen.py | 62 +----------------- .../datasets/mmlu_cf/mmlu_cf_gen_040615.py | 64 +++++++++++++++++++ configs/eval_mmlu_cf.py | 32 ++++++++-- .../configs/datasets/mmlu_cf/mmlu_cf_gen.py | 62 +----------------- .../datasets/mmlu_cf/mmlu_cf_gen_040615.py | 64 +++++++++++++++++++ opencompass/datasets/mmlu_cf.py | 42 ++++++------ 6 files changed, 180 insertions(+), 146 deletions(-) create mode 100644 configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py create mode 100644 opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py diff --git a/configs/datasets/mmlu_cf/mmlu_cf_gen.py b/configs/datasets/mmlu_cf/mmlu_cf_gen.py index a0be611f1..01611aba3 100644 --- a/configs/datasets/mmlu_cf/mmlu_cf_gen.py +++ b/configs/datasets/mmlu_cf/mmlu_cf_gen.py @@ -1,64 +1,4 @@ from mmengine.config import read_base -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import FixKRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator -from opencompass.datasets import MMLUCFDataset -from opencompass.utils.text_postprocessors import first_option_postprocess with read_base(): - from .mmlu_cf_categories import categories - -mmlu_cf_reader_cfg = dict( - input_columns=['input', 'A', 'B', 'C', 'D'], - output_column='target', - train_split='dev') - -mmlu_cf_datasets = [] -for _name in categories: - _hint = f'There is a single choice question. Answer the question by replying A, B, C or D.' - mmlu_cf_infer_cfg = dict( - ice_template=dict( - type=PromptTemplate, - template=dict(round=[ - dict( - role='HUMAN', - prompt= - f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' - ), - dict(role='BOT', prompt='{target}\n') - ]), - ), - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin='', - round=[ - dict( - role='HUMAN', - prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\nAnswer: ' - ), - ], - ), - ice_token='', - ), - retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), - inferencer=dict(type=GenInferencer), - ) - - mmlu_cf_eval_cfg = dict( - evaluator=dict(type=AccwithDetailsEvaluator), - pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) - - mmlu_cf_datasets.append( - dict( - abbr=f'mmlu_cf_{_name}', - type=MMLUCFDataset, - path='opencompass/mmlu_cf/', - name=_name, - reader_cfg=mmlu_cf_reader_cfg, - infer_cfg=mmlu_cf_infer_cfg, - eval_cfg=mmlu_cf_eval_cfg, - )) - -del _name, _hint + from .mmlu_cf_gen_dadasd import mmlu_cf_datasets # noqa: F401, F403 diff --git a/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py b/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py new file mode 100644 index 000000000..851fec913 --- /dev/null +++ b/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py @@ -0,0 +1,64 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUCFDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mmlu_cf_categories import categories + +mmlu_cf_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_cf_datasets = [] +for _name in categories: + _hint = f'There is a single choice question. Answer the question by replying A, B, C or D.' + mmlu_cf_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + mmlu_cf_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + mmlu_cf_datasets.append( + dict( + abbr=f'mmlu_cf_{_name}', + type=MMLUCFDataset, + path='microsoft/MMLU-CF', + name=_name, + reader_cfg=mmlu_cf_reader_cfg, + infer_cfg=mmlu_cf_infer_cfg, + eval_cfg=mmlu_cf_eval_cfg, + )) + +del _name, _hint diff --git a/configs/eval_mmlu_cf.py b/configs/eval_mmlu_cf.py index 2f4a47aea..adb445ae5 100644 --- a/configs/eval_mmlu_cf.py +++ b/configs/eval_mmlu_cf.py @@ -1,16 +1,38 @@ from mmengine.config import read_base -import pdb + with read_base(): - from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen import mmlu_cf_datasets + from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen_040615 import mmlu_cf_datasets - from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import models as lmdeploy_qwen2_7b_instruct_model + from opencompass.configs.models.qwen2_5.hf_qwen2_5_7b_instruct import models as hf_qwen2_5_7b_instruct_model from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model from opencompass.configs.summarizers.mmlu_cf import summarizer - from opencompass.configs.internal.clusters.local import infer_num_worker as infer - from opencompass.configs.internal.clusters.local import eval + datasets = sum([v for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'], []) models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +from opencompass.runners import LocalRunner +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + +infer = dict( + partitioner=dict(type=NumWorkerPartitioner, num_worker=8), + runner=dict( + type=LocalRunner, + max_num_workers=8, + task=dict(type=OpenICLInferTask) + ), +) + +eval = dict( + partitioner=dict(type=NaivePartitioner, n=10), + runner=dict( + type=LocalRunner, + max_num_workers=256, + task=dict(type=OpenICLEvalTask) + ), +) + work_dir = 'outputs/debug/mmlu_cf' diff --git a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py index a0be611f1..01611aba3 100644 --- a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py +++ b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py @@ -1,64 +1,4 @@ from mmengine.config import read_base -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import FixKRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator -from opencompass.datasets import MMLUCFDataset -from opencompass.utils.text_postprocessors import first_option_postprocess with read_base(): - from .mmlu_cf_categories import categories - -mmlu_cf_reader_cfg = dict( - input_columns=['input', 'A', 'B', 'C', 'D'], - output_column='target', - train_split='dev') - -mmlu_cf_datasets = [] -for _name in categories: - _hint = f'There is a single choice question. Answer the question by replying A, B, C or D.' - mmlu_cf_infer_cfg = dict( - ice_template=dict( - type=PromptTemplate, - template=dict(round=[ - dict( - role='HUMAN', - prompt= - f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\nAnswer: ' - ), - dict(role='BOT', prompt='{target}\n') - ]), - ), - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin='', - round=[ - dict( - role='HUMAN', - prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' - ), - ], - ), - ice_token='', - ), - retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), - inferencer=dict(type=GenInferencer), - ) - - mmlu_cf_eval_cfg = dict( - evaluator=dict(type=AccwithDetailsEvaluator), - pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) - - mmlu_cf_datasets.append( - dict( - abbr=f'mmlu_cf_{_name}', - type=MMLUCFDataset, - path='opencompass/mmlu_cf/', - name=_name, - reader_cfg=mmlu_cf_reader_cfg, - infer_cfg=mmlu_cf_infer_cfg, - eval_cfg=mmlu_cf_eval_cfg, - )) - -del _name, _hint + from .mmlu_cf_gen_dadasd import mmlu_cf_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py new file mode 100644 index 000000000..851fec913 --- /dev/null +++ b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py @@ -0,0 +1,64 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import MMLUCFDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mmlu_cf_categories import categories + +mmlu_cf_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_cf_datasets = [] +for _name in categories: + _hint = f'There is a single choice question. Answer the question by replying A, B, C or D.' + mmlu_cf_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + mmlu_cf_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + mmlu_cf_datasets.append( + dict( + abbr=f'mmlu_cf_{_name}', + type=MMLUCFDataset, + path='microsoft/MMLU-CF', + name=_name, + reader_cfg=mmlu_cf_reader_cfg, + infer_cfg=mmlu_cf_infer_cfg, + eval_cfg=mmlu_cf_eval_cfg, + )) + +del _name, _hint diff --git a/opencompass/datasets/mmlu_cf.py b/opencompass/datasets/mmlu_cf.py index ea639c7aa..f3ee8685b 100644 --- a/opencompass/datasets/mmlu_cf.py +++ b/opencompass/datasets/mmlu_cf.py @@ -1,37 +1,41 @@ -from datasets import load_dataset, DatasetDict +from datasets import DatasetDict, load_dataset + from opencompass.registry import LOAD_DATASET + from .base import BaseDataset + @LOAD_DATASET.register_module() class MMLUCFDataset(BaseDataset): @staticmethod def load(path: str, name: str): - """ - Loading HuggingFace datasets - """ + """Loading HuggingFace datasets.""" # Use HuggingFace's load_dataset method to load the dataset - hf_dataset = load_dataset("microsoft/MMLU-CF") - columns_to_keep = ["Question", "A", "B", "C", "D", "Answer"] - hf_dataset = hf_dataset.map(lambda x: {key: x[key] for key in columns_to_keep}) + hf_dataset = load_dataset(path) + columns_to_keep = ['Question', 'A', 'B', 'C', 'D', 'Answer'] + hf_dataset = hf_dataset.map( + lambda x: {key: x[key] + for key in columns_to_keep}) splits = ['dev', 'val'] for split in splits: sub_set = f'{name}_{split}' - + # Rename fields here if they don't match the expected names - hf_dataset[sub_set] = hf_dataset[sub_set].map(lambda example: { - "input": example["Question"], - "A": example["A"], - "B": example["B"], - "C": example["C"], - "D": example["D"], - "target": example["Answer"] - }) + hf_dataset[sub_set] = hf_dataset[sub_set].map( + lambda example: { + 'input': example['Question'], + 'A': example['A'], + 'B': example['B'], + 'C': example['C'], + 'D': example['D'], + 'target': example['Answer'] + }) # Create a DatasetDict and return it dataset = DatasetDict({ - "dev": hf_dataset[f'{name}_{splits[0]}'], - "test": hf_dataset[f'{name}_{splits[1]}'] # Use 'val' as 'test' + 'dev': hf_dataset[f'{name}_{splits[0]}'], + 'test': hf_dataset[f'{name}_{splits[1]}'] # Use 'val' as 'test' }) - return dataset \ No newline at end of file + return dataset From e428a7e760ec64e99fa9bf1fdfbf528c3868c7c5 Mon Sep 17 00:00:00 2001 From: liushz Date: Wed, 8 Jan 2025 12:16:39 +0000 Subject: [PATCH 21/26] Update mmlu-cf --- opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py index 01611aba3..5fbee8d9e 100644 --- a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py +++ b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .mmlu_cf_gen_dadasd import mmlu_cf_datasets # noqa: F401, F403 + from .mmlu_cf_gen_040615 import mmlu_cf_datasets # noqa: F401, F403 From ce3ee2db0af2bbadfc1fe56e5d945eaf0eba78d3 Mon Sep 17 00:00:00 2001 From: liushz Date: Wed, 8 Jan 2025 12:28:41 +0000 Subject: [PATCH 22/26] Update mmlu-cf --- configs/datasets/mmlu_cf/mmlu_cf_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/datasets/mmlu_cf/mmlu_cf_gen.py 
b/configs/datasets/mmlu_cf/mmlu_cf_gen.py index 01611aba3..5fbee8d9e 100644 --- a/configs/datasets/mmlu_cf/mmlu_cf_gen.py +++ b/configs/datasets/mmlu_cf/mmlu_cf_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .mmlu_cf_gen_dadasd import mmlu_cf_datasets # noqa: F401, F403 + from .mmlu_cf_gen_040615 import mmlu_cf_datasets # noqa: F401, F403 From d061100fcefa5cdff825b0019415b89ff30e5231 Mon Sep 17 00:00:00 2001 From: fistyee Date: Thu, 9 Jan 2025 00:34:35 +0800 Subject: [PATCH 23/26] [Feature] Support MMLU-CF Benchmark --- configs/datasets/mmlu_cf/mmlu_cf_few_shot.py | 2 +- configs/datasets/mmlu_cf/mmlu_cf_gen.py | 2 +- configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py | 2 +- opencompass/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py | 2 +- opencompass/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py b/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py index a811bca61..6500cee7f 100644 --- a/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py +++ b/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py @@ -54,7 +54,7 @@ dict( abbr=f'mmlu_cf_{_name}', type=MMLUCFDataset, - path='opencompass/mmlu_cf', + path='microsoft/MMLU-CF', name=_name, reader_cfg=mmlu_cf_reader_cfg, infer_cfg=mmlu_cf_infer_cfg, diff --git a/configs/datasets/mmlu_cf/mmlu_cf_gen.py b/configs/datasets/mmlu_cf/mmlu_cf_gen.py index 5fbee8d9e..158d10772 100644 --- a/configs/datasets/mmlu_cf/mmlu_cf_gen.py +++ b/configs/datasets/mmlu_cf/mmlu_cf_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .mmlu_cf_gen_040615 import mmlu_cf_datasets # noqa: F401, F403 + from .mmlu_cf_gen_040615 import mmlu_cf_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py b/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py index 9082db6d7..d084f4f0f 100644 --- a/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py +++ b/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py @@ -54,7 +54,7 @@ dict( abbr=f'mmlu_cf_{_name}', type=MMLUCFDataset, - path='opencompass/mmlu_cf', + path='microsoft/MMLU-CF', name=_name, reader_cfg=mmlu_cf_reader_cfg, infer_cfg=mmlu_cf_infer_cfg, diff --git a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py index a811bca61..6500cee7f 100644 --- a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py +++ b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py @@ -54,7 +54,7 @@ dict( abbr=f'mmlu_cf_{_name}', type=MMLUCFDataset, - path='opencompass/mmlu_cf', + path='microsoft/MMLU-CF', name=_name, reader_cfg=mmlu_cf_reader_cfg, infer_cfg=mmlu_cf_infer_cfg, diff --git a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py index 9082db6d7..d084f4f0f 100644 --- a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py +++ b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py @@ -54,7 +54,7 @@ dict( abbr=f'mmlu_cf_{_name}', type=MMLUCFDataset, - path='opencompass/mmlu_cf', + path='microsoft/MMLU-CF', name=_name, reader_cfg=mmlu_cf_reader_cfg, infer_cfg=mmlu_cf_infer_cfg, From c5722b9f0419caf2317f5bc4838c8598eec70990 Mon Sep 17 00:00:00 2001 From: fistyee Date: Thu, 9 Jan 2025 00:38:55 +0800 Subject: [PATCH 24/26] [Feature] Support MMLU-CF Benchmark --- opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py index 5fbee8d9e..158d10772 100644 --- a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py +++ b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .mmlu_cf_gen_040615 import mmlu_cf_datasets # noqa: F401, F403 + from .mmlu_cf_gen_040615 import mmlu_cf_datasets # noqa: F401, F403 \ No newline at end of file From 84392457441640aad955d4a0480b0e8ae43fb3eb Mon Sep 17 00:00:00 2001 From: fistyee Date: Thu, 9 Jan 2025 13:17:42 +0800 Subject: [PATCH 25/26] [Feature] Support MMLU-CF Benchmark --- configs/datasets/mmlu_cf/mmlu_cf_gen.py | 2 +- opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/datasets/mmlu_cf/mmlu_cf_gen.py b/configs/datasets/mmlu_cf/mmlu_cf_gen.py index 158d10772..5fbee8d9e 100644 --- a/configs/datasets/mmlu_cf/mmlu_cf_gen.py +++ b/configs/datasets/mmlu_cf/mmlu_cf_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .mmlu_cf_gen_040615 import mmlu_cf_datasets # noqa: F401, F403 \ No newline at end of file + from .mmlu_cf_gen_040615 import mmlu_cf_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py index 158d10772..5fbee8d9e 100644 --- a/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py +++ b/opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .mmlu_cf_gen_040615 import mmlu_cf_datasets # noqa: F401, F403 \ No newline at end of file + from .mmlu_cf_gen_040615 import mmlu_cf_datasets # noqa: F401, F403 From 89929df6221c740c71bc57dd308bba2b6c5a9202 Mon Sep 17 00:00:00 2001 From: liushz Date: Thu, 9 Jan 2025 05:19:16 +0000 Subject: [PATCH 26/26] Remove outside configs --- .../datasets/mmlu_cf/mmlu_cf_categories.py | 16 ----- configs/datasets/mmlu_cf/mmlu_cf_few_shot.py | 64 ------------------- configs/datasets/mmlu_cf/mmlu_cf_gen.py | 4 -- .../datasets/mmlu_cf/mmlu_cf_gen_040615.py | 64 ------------------- configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py | 64 ------------------- 5 files changed, 212 deletions(-) delete mode 100644 configs/datasets/mmlu_cf/mmlu_cf_categories.py delete mode 100644 configs/datasets/mmlu_cf/mmlu_cf_few_shot.py delete mode 100644 configs/datasets/mmlu_cf/mmlu_cf_gen.py delete mode 100644 configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py delete mode 100644 configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py diff --git a/configs/datasets/mmlu_cf/mmlu_cf_categories.py b/configs/datasets/mmlu_cf/mmlu_cf_categories.py deleted file mode 100644 index ab8b198f4..000000000 --- a/configs/datasets/mmlu_cf/mmlu_cf_categories.py +++ /dev/null @@ -1,16 +0,0 @@ -categories = [ - 'Math', - 'Physics', - 'Chemistry', - 'Law', - 'Engineering', - 'Other', - 'Economics', - 'Health', - 'Psychology', - 'Business', - 'Biology', - 'Philosophy', - 'Computer_Science', - 'History', -] diff --git a/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py b/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py deleted file mode 100644 index 6500cee7f..000000000 --- a/configs/datasets/mmlu_cf/mmlu_cf_few_shot.py +++ /dev/null @@ -1,64 +0,0 @@ -from mmengine.config import read_base -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import FixKRetriever -from opencompass.openicl.icl_inferencer 
import GenInferencer -from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator -from opencompass.datasets import MMLUCFDataset -from opencompass.utils.text_postprocessors import first_option_postprocess - -with read_base(): - from .mmlu_cf_categories import categories - -mmlu_cf_reader_cfg = dict( - input_columns=['input', 'A', 'B', 'C', 'D'], - output_column='target', - train_split='dev') - -mmlu_cf_datasets = [] -for _name in categories: - _hint = f'There is a single choice question (with answers). Answer the question by replying A, B, C or D.' - mmlu_cf_infer_cfg = dict( - ice_template=dict( - type=PromptTemplate, - template=dict(round=[ - dict( - role='HUMAN', - prompt= - f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' - ), - dict(role='BOT', prompt='{target}\n') - ]), - ), - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin='', - round=[ - dict( - role='HUMAN', - prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' - ), - ], - ), - ice_token='', - ), - retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), - inferencer=dict(type=GenInferencer), - ) - - mmlu_cf_eval_cfg = dict( - evaluator=dict(type=AccwithDetailsEvaluator), - pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) - - mmlu_cf_datasets.append( - dict( - abbr=f'mmlu_cf_{_name}', - type=MMLUCFDataset, - path='microsoft/MMLU-CF', - name=_name, - reader_cfg=mmlu_cf_reader_cfg, - infer_cfg=mmlu_cf_infer_cfg, - eval_cfg=mmlu_cf_eval_cfg, - )) - -del _name, _hint diff --git a/configs/datasets/mmlu_cf/mmlu_cf_gen.py b/configs/datasets/mmlu_cf/mmlu_cf_gen.py deleted file mode 100644 index 158d10772..000000000 --- a/configs/datasets/mmlu_cf/mmlu_cf_gen.py +++ /dev/null @@ -1,4 +0,0 @@ -from mmengine.config import read_base - -with read_base(): - from .mmlu_cf_gen_040615 import mmlu_cf_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py b/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py deleted file mode 100644 index 851fec913..000000000 --- a/configs/datasets/mmlu_cf/mmlu_cf_gen_040615.py +++ /dev/null @@ -1,64 +0,0 @@ -from mmengine.config import read_base -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import FixKRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator -from opencompass.datasets import MMLUCFDataset -from opencompass.utils.text_postprocessors import first_option_postprocess - -with read_base(): - from .mmlu_cf_categories import categories - -mmlu_cf_reader_cfg = dict( - input_columns=['input', 'A', 'B', 'C', 'D'], - output_column='target', - train_split='dev') - -mmlu_cf_datasets = [] -for _name in categories: - _hint = f'There is a single choice question. Answer the question by replying A, B, C or D.' - mmlu_cf_infer_cfg = dict( - ice_template=dict( - type=PromptTemplate, - template=dict(round=[ - dict( - role='HUMAN', - prompt= - f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' - ), - dict(role='BOT', prompt='{target}\n') - ]), - ), - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin='', - round=[ - dict( - role='HUMAN', - prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\nAnswer: ' - ), - ], - ), - ice_token='', - ), - retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), - inferencer=dict(type=GenInferencer), - ) - - mmlu_cf_eval_cfg = dict( - evaluator=dict(type=AccwithDetailsEvaluator), - pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) - - mmlu_cf_datasets.append( - dict( - abbr=f'mmlu_cf_{_name}', - type=MMLUCFDataset, - path='microsoft/MMLU-CF', - name=_name, - reader_cfg=mmlu_cf_reader_cfg, - infer_cfg=mmlu_cf_infer_cfg, - eval_cfg=mmlu_cf_eval_cfg, - )) - -del _name, _hint diff --git a/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py b/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py deleted file mode 100644 index d084f4f0f..000000000 --- a/configs/datasets/mmlu_cf/mmlu_cf_zero_shot.py +++ /dev/null @@ -1,64 +0,0 @@ -from mmengine.config import read_base -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator -from opencompass.datasets import MMLUCFDataset -from opencompass.utils.text_postprocessors import first_option_postprocess - -with read_base(): - from .mmlu_cf_categories import categories - -mmlu_cf_reader_cfg = dict( - input_columns=['input', 'A', 'B', 'C', 'D'], - output_column='target', - train_split='dev') - -mmlu_cf_datasets = [] -for _name in categories: - _hint = f'There is a single choice question (with answers). Answer the question by replying A, B, C or D.' - mmlu_cf_infer_cfg = dict( - ice_template=dict( - type=PromptTemplate, - template=dict(round=[ - dict( - role='HUMAN', - prompt= - f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' - ), - dict(role='BOT', prompt='{target}\n') - ]), - ), - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin='', - round=[ - dict( - role='HUMAN', - prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' - ), - ], - ), - ice_token='', - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer), - ) - - mmlu_cf_eval_cfg = dict( - evaluator=dict(type=AccwithDetailsEvaluator), - pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) - - mmlu_cf_datasets.append( - dict( - abbr=f'mmlu_cf_{_name}', - type=MMLUCFDataset, - path='microsoft/MMLU-CF', - name=_name, - reader_cfg=mmlu_cf_reader_cfg, - infer_cfg=mmlu_cf_infer_cfg, - eval_cfg=mmlu_cf_eval_cfg, - )) - -del _name, _hint
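
For reference, the final form of `MMLUCFDataset.load` adopted at the end of this series can be exercised on its own before wiring it into a config. The sketch below is illustrative only: it assumes network access to the Hugging Face Hub and the `microsoft/MMLU-CF` subset layout the loader expects (`<Category>_dev` / `<Category>_val`), and it uses the `Math` category purely as an example.

```python
# Minimal sketch: call the loader added in this patch series directly.
# Assumes the Hub dataset exposes '<Category>_dev' and '<Category>_val'
# subsets, as the loader's f'{name}_{split}' lookups require.
from opencompass.datasets.mmlu_cf import MMLUCFDataset

ds = MMLUCFDataset.load(path='microsoft/MMLU-CF', name='Math')

print(ds)            # DatasetDict with 'dev' (few-shot pool) and 'test' (the released val split)
print(ds['dev'][0])  # rows carry the 'input', 'A'-'D' and 'target' fields the reader_cfg consumes
```

The per-category dataset entries in `mmlu_cf_gen_040615.py`, `mmlu_cf_few_shot.py` and `mmlu_cf_zero_shot.py` all route through this loader with `path='microsoft/MMLU-CF'`, so the `dev` split feeds the few-shot retrievers while the public `val` split is scored as `test`.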