Add dataset collection #253

Open
wants to merge 17 commits into base: main
3 changes: 2 additions & 1 deletion docs/zh/user_guides/backend/rageval_backend/mteb.md
@@ -102,7 +102,8 @@ one_stage_task_cfg = {


### Two-Stage Evaluation
An example configuration file is shown below; retrieval is performed first, then reranking:
Evaluating a reranker requires a retrieval dataset: first retrieve the top-k candidates with an embedding model, then rerank them. An example configuration file is shown below:

```python
two_stage_task_cfg = {
"eval_backend": "RAGEval",
Expand Down
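For orientation, here is a minimal sketch of what a full two-stage config might look like. Only `"eval_backend": "RAGEval"` is taken from the snippet above; the nested keys (`eval_config`, `tool`, `model`, `eval`) and all values are assumptions modelled on the one-stage example and may not match the actual schema:

```python
# Hypothetical sketch of a two-stage (retrieval -> reranking) config.
# Field names below are assumptions; consult the one-stage example in the
# documentation for the authoritative schema.
two_stage_task_cfg = {
    "eval_backend": "RAGEval",      # from the snippet above
    "eval_config": {                # assumed wrapper, as in the one-stage case
        "tool": "MTEB",
        "model": [
            {   # stage 1: embedding model that retrieves the top-k candidates
                "model_name_or_path": "<embedding-model-id>",   # placeholder
                "is_cross_encoder": False,
            },
            {   # stage 2: cross-encoder reranker that re-orders the candidates
                "model_name_or_path": "<reranker-model-id>",    # placeholder
                "is_cross_encoder": True,
            },
        ],
        "eval": {
            "tasks": ["<retrieval-task-name>"],  # reranking is scored on a retrieval dataset
        },
    },
}
```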
13 changes: 10 additions & 3 deletions evalscope/arguments.py
@@ -1,6 +1,8 @@
import argparse
import json

from evalscope.constants import EvalBackend, EvalStage, EvalType


class ParseStrArgsAction(argparse.Action):

@@ -47,10 +49,13 @@ def add_argument(parser: argparse.ArgumentParser):
    parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.') # noqa: E501

    # Evaluation-related arguments
    parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
    parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.')
    parser.add_argument('--eval-type', type=str, help='The type for evaluating.',
                        choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
    parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
                        choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL]) # noqa: E501
    parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
    parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.')
    parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
                        choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.EVAL])
    parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')

    # Cache and working directory arguments
@@ -62,6 +67,8 @@ def add_argument(parser: argparse.ArgumentParser):
    parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.') # noqa: E501
    parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
    parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
    parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
    parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
    # yapf: enable


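To make the effect of the new `choices` constraints concrete, here is a small standalone sketch using plain argparse. The literal option values ('checkpoint', 'custom', 'service', 'all', 'infer', 'eval') are assumed from how the constants are compared elsewhere in this PR; treat them as illustrative:

```python
# Minimal sketch: with `choices`, an unsupported value now fails at parse
# time instead of surfacing later in the evaluation pipeline.
import argparse

parser = argparse.ArgumentParser()
# Mirrors the constrained arguments added above; the literal strings are
# assumed values of EvalType / EvalStage.
parser.add_argument('--eval-type', type=str, choices=['checkpoint', 'custom', 'service'])
parser.add_argument('--stage', type=str, default='all', choices=['all', 'infer', 'eval'])

args = parser.parse_args(['--eval-type', 'service', '--stage', 'infer'])
print(args.eval_type, args.stage)  # -> service infer

# parser.parse_args(['--eval-type', 'foo'])
# would exit immediately with an "invalid choice: 'foo'" error from argparse.
```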
21 changes: 20 additions & 1 deletion evalscope/benchmarks/__init__.py
@@ -1,4 +1,23 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import glob
import importlib
import os

from evalscope.benchmarks.benchmark import Benchmark
from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta
from evalscope.benchmarks.data_adapter import DataAdapter
from evalscope.utils import get_logger

logger = get_logger()

# Using glob to find all files matching the pattern
pattern = os.path.join(os.path.dirname(__file__), '*', '*_adapter.py')
files = glob.glob(pattern, recursive=False)

for file_path in files:
    if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):
        # Convert file path to a module path
        relative_path = os.path.relpath(file_path, os.path.dirname(__file__))
        module_path = relative_path[:-3].replace(os.path.sep, '.') # strip '.py' and convert to module path
        full_path = f'evalscope.benchmarks.{module_path}'
        importlib.import_module(full_path)
        # print(f'Importing {full_path}')
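The glob/importlib loop above means any `*_adapter.py` file in a benchmark subpackage is imported automatically when `evalscope.benchmarks` is imported, so a `Benchmark.register(...)` decorator in it runs without any manual edits to this `__init__.py`. A hypothetical sketch of such a file, modelled on the ARC adapter further down in this PR (the benchmark name and dataset id are placeholders, and the DataAdapter method bodies are omitted):

```python
# evalscope/benchmarks/my_bench/my_bench_adapter.py  (hypothetical)
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import WeightedAverageAccuracy
from evalscope.models import MultiChoiceModelAdapter


@Benchmark.register(
    name='my_bench',                    # placeholder benchmark name
    dataset_id='modelscope/my_bench',   # placeholder dataset id
    model_adapter=MultiChoiceModelAdapter,
    subset_list=['default'],
    metric_list=[WeightedAverageAccuracy],
    few_shot_num=0,
    train_split='train',
    eval_split='test',
    prompt_template='',
)
class MyBenchAdapter(DataAdapter):
    # Implement the DataAdapter hooks shown in the ARC adapter below
    # (load_from_disk, get_gold_answer, parse_pred_result, match, ...).
    ...
```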
5 changes: 0 additions & 5 deletions evalscope/benchmarks/arc/__init__.py
@@ -1,6 +1 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from evalscope.benchmarks.arc.arc_adapter import DATASET_ID, SUBSET_LIST
from evalscope.benchmarks.arc.arc_adapter import ARCAdapter
from evalscope.benchmarks.arc.arc_adapter import ARCAdapter as DataAdapterClass
from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa
122 changes: 23 additions & 99 deletions evalscope/benchmarks/arc/arc_adapter.py
@@ -3,40 +3,35 @@
import json
import os

from evalscope.benchmarks.data_adapter import DataAdapter
from evalscope.metrics.metrics import exact_match, weighted_mean
from evalscope.utils import ResponseParser, normalize_score
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType
from evalscope.metrics import WeightedAverageAccuracy, exact_match
from evalscope.models import MultiChoiceModelAdapter
from evalscope.utils import ResponseParser
from evalscope.utils.logger import get_logger

# flake8: noqa

logger = get_logger()

DATASET_ID = 'modelscope/ai2_arc'

# task_list = ['ARC-Easy', 'ARC-Challenge']
SUBSET_LIST = ['ARC-Challenge']


@Benchmark.register(
    name='arc',
    dataset_id='modelscope/ai2_arc',
    model_adapter=MultiChoiceModelAdapter,
    subset_list=['ARC-Easy', 'ARC-Challenge'],
    metric_list=[WeightedAverageAccuracy],
    few_shot_num=0,
    train_split='train',
    eval_split='test',
    prompt_template='',
)
class ARCAdapter(DataAdapter):

    choices = ['A', 'B', 'C', 'D']

    def __init__(self,
                 subset_list: list = None,
                 metric_list: list = None,
                 few_shot_num: int = None,
                 train_split: str = 'train',
                 eval_split: str = 'test',
                 prompt_template: str = '',
                 **kwargs):

        if subset_list is None:
            subset_list = SUBSET_LIST

        if metric_list is None:
            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]

    def __init__(self, **kwargs):
        few_shot_num = kwargs.get('few_shot_num', None)
        if few_shot_num is None:
            # Use 0-shot by default
            logger.info(f'Set 0-shot examples by system for ARC.')
@@ -45,14 +40,7 @@ def __init__(self,
        if few_shot_num != 0:
            logger.warning(f'few_shot_num is recommended to set 0 for ARC, got {few_shot_num}.')

        super().__init__(
            subset_list=subset_list,
            metric_list=metric_list,
            few_shot_num=few_shot_num,
            train_split=train_split,
            eval_split=eval_split,
            prompt_template=prompt_template,
            **kwargs)
        super().__init__(**kwargs)

    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
        """
@@ -132,7 +120,7 @@ def get_gold_answer(self, input_d: dict) -> str:
        # Get the gold choice
        return input_d.get('answerKey', '')

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the model output to get the answer. Could be the best choice index.

@@ -144,12 +132,12 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st
        Returns:
            The parsed answer. Depending on the dataset. Usually a string for chat.
        """
        if eval_type == 'checkpoint':
        if eval_type == EvalType.CHECKPOINT:
            return result
        elif eval_type == 'service':
        elif eval_type == EvalType.SERVICE:
            return ResponseParser.parse_first_option_with_choices(
                text=result, options=self.choices) # TODO: to be checked !
        elif eval_type == 'custom':
        elif eval_type == EvalType.CUSTOM:
            return ResponseParser.parse_first_option_with_choices(
                text=result, options=self.choices) # TODO: to be checked !
        else:
@@ -158,70 +146,6 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st
    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)

    def compute_metric(self, review_res_list: list) -> float:
        """
        Compute evaluation result by specific metric.

        Args:
            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]

        Returns:
            The metric score.
        """
        items = [(score, 1.0) for score in review_res_list]
        return weighted_mean(items)

    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
        """
        Generate the report for the model output.

        Args:
            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
            report_name: The user-defined report name.

        Returns: A dict of metric calculation results. The format is like:
        {
            "name":"ARC",
            "metric":"WeightedAverageAccuracy",
            "score":0.3389,
            "category":[
                {
                    "name":"DEFAULT",
                    "score":0.4128,
                    "subset":[
                        {
                            "name":"ARC-Easy",
                            "score":0.5632
                        },
                        {
                            "name":"ARC-Challenge",
                            "score":0.3157
                        }
                    ]
                }
            ],
            "total_num":7800
        }
        """
        total_num: int = sum([num for _, num in subset_score_map.values()])
        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
        cate_avg_list = [{
            'name': subset_name,
            'score': normalize_score(score=score)
        } for subset_name, (score, _) in subset_score_map.items()]

        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)

        res_map = dict(
            name=report_name or 'arc',
            metric=self.metric_list[0]['name'],
            score=weighted_avg_acc,
            category=[category_d],
            total_num=total_num)

        return res_map

    @classmethod
    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:

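The removed `compute_metric` and `gen_report` boilerplate amounted to a weighted average of per-sample scores, first within a subset and then across subsets weighted by sample count; presumably the registered `WeightedAverageAccuracy` metric now covers this centrally. A small self-contained illustration of that aggregation (the numbers are made up):

```python
# Stand-in for evalscope.metrics.metrics.weighted_mean as used by the
# removed code: items is a list of (value, weight) pairs.
def weighted_mean(items):
    total_weight = sum(w for _, w in items)
    return sum(v * w for v, w in items) / total_weight

# Per-sample review scores for one subset -> subset accuracy (old compute_metric).
review_res_list = [0, 1, 1, 0, 1]
subset_score = weighted_mean([(score, 1.0) for score in review_res_list])   # 0.6

# Subset scores weighted by sample counts -> overall score (old gen_report).
subset_score_map = {'ARC-Easy': (0.60, 500), 'ARC-Challenge': (0.40, 250)}  # made-up numbers
total_num = sum(num for _, num in subset_score_map.values())
overall = sum(score * num for score, num in subset_score_map.values()) / total_num
print(round(overall, 4))  # 0.5333
```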
4 changes: 0 additions & 4 deletions evalscope/benchmarks/bbh/__init__.py
@@ -1,5 +1 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from evalscope.benchmarks.bbh.bbh_adapter import DATASET_ID, SUBSET_LIST
from evalscope.benchmarks.bbh.bbh_adapter import BBHAdapter as DataAdapterClass
from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa