diff --git a/opencompass/configs/datasets/omni_math/README.md b/opencompass/configs/datasets/omni_math/README.md new file mode 100644 index 000000000..12f54c1b1 --- /dev/null +++ b/opencompass/configs/datasets/omni_math/README.md @@ -0,0 +1,43 @@ +# Omni-Math + +[Omni-Math](https://huggingface.co/datasets/KbsdJames/Omni-MATH) contains 4428 competition-level problems. These problems are meticulously categorized into 33 (and potentially more) sub-domains and span 10 distinct difficulty levels, enabling a nuanced analysis of model performance across various mathematical disciplines and levels of complexity. + +* Project Page: https://omni-math.github.io/ +* GitHub Repo: https://github.com/KbsdJames/Omni-MATH +* Omni-Judge (open-source evaluator for this dataset): https://huggingface.co/KbsdJames/Omni-Judge + +## Omni-Judge + +> Omni-Judge is an open-source mathematical evaluation model designed to assess whether a solution generated by a model is correct given a problem and a standard answer. 
+ +Deploy the Omni-Judge server, for example: +```bash +set -x + +lmdeploy serve api_server KbsdJames/Omni-Judge --server-port 8000 \ + --tp 1 \ + --cache-max-entry-count 0.9 \ + --log-level INFO +``` + +Then set the server URL(s) in the OpenCompass config file: + +```python +from mmengine.config import read_base + +with read_base(): + from opencompass.configs.datasets.omni_math.omni_math_gen import omni_math_datasets + + +omni_math_dataset = omni_math_datasets[0] +omni_math_dataset['eval_cfg']['evaluator'].update( + url=['http://172.30.8.45:8000', + 'http://172.30.16.113:8000'], +) +``` + +## Performance + +| llama-3_1-8b-instruct | qwen-2_5-7b-instruct | InternLM3-8b-Instruct | +| -- | -- | -- | +| 15.18 | 29.97 | 32.75 | \ No newline at end of file diff --git a/opencompass/datasets/omni_math.py b/opencompass/datasets/omni_math.py index 8a2dd96d4..18987b477 100644 --- a/opencompass/datasets/omni_math.py +++ b/opencompass/datasets/omni_math.py @@ -5,7 +5,6 @@ from datasets import load_dataset from transformers import AutoTokenizer -from opencompass.models import OpenAISDK from opencompass.models.turbomind_api import TurboMindAPIModel from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET, MODELS @@ -47,7 +46,7 @@ def __init__(self, url): self.tokenizer = AutoTokenizer.from_pretrained('KbsdJames/Omni-Judge', trust_remote_code=True) - def batch_infer(self, models: List[OpenAISDK], + def batch_infer(self, models: List[TurboMindAPIModel], inputs: List[str]) -> List[str]: batch_num = len(models) batch_size = (len(inputs) + batch_num - 1) // batch_num