diff --git a/docs/zh/user_guides/backend/rageval_backend/mteb.md b/docs/zh/user_guides/backend/rageval_backend/mteb.md index 1a864a55..0e0937d5 100644 --- a/docs/zh/user_guides/backend/rageval_backend/mteb.md +++ b/docs/zh/user_guides/backend/rageval_backend/mteb.md @@ -102,7 +102,8 @@ one_stage_task_cfg = { ### 两阶段评测 -配置文件示例如下,先进行检索,再进行reranking: +评测reranker需要用retrieval数据集,先用embedding模型检索topk,再进行排序。配置文件示例如下: + ```python two_stage_task_cfg = { "eval_backend": "RAGEval", diff --git a/evalscope/arguments.py b/evalscope/arguments.py index 12ea7703..a8c4d262 100644 --- a/evalscope/arguments.py +++ b/evalscope/arguments.py @@ -1,6 +1,8 @@ import argparse import json +from evalscope.constants import EvalBackend, EvalStage, EvalType + class ParseStrArgsAction(argparse.Action): @@ -47,10 +49,13 @@ def add_argument(parser: argparse.ArgumentParser): parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.') # noqa: E501 # Evaluation-related arguments - parser.add_argument('--eval-type', type=str, help='The type for evaluating.') - parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.') + parser.add_argument('--eval-type', type=str, help='The type for evaluating.', + choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE]) + parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.', + choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL]) # noqa: E501 parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501 - parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.') + parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.', + choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.EVAL]) parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.') # Cache and working directory arguments @@ -62,6 +67,8 @@ def add_argument(parser: argparse.ArgumentParser): parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.') # noqa: E501 parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.') parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.') + parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.') + parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.') # yapf: enable diff --git a/evalscope/benchmarks/__init__.py b/evalscope/benchmarks/__init__.py index b863b5ab..984f4f00 100644 --- a/evalscope/benchmarks/__init__.py +++ b/evalscope/benchmarks/__init__.py @@ -1,4 +1,23 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
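The `choices=` constraints added to `--eval-type`, `--eval-backend` and `--stage` above make argparse reject unknown values at parse time instead of letting them fail deeper in the pipeline. A minimal, self-contained sketch of that behaviour, using stand-in string constants rather than the real `EvalType` members:

```python
import argparse

# Stand-in values; the real ones come from evalscope.constants.EvalType.
EVAL_TYPES = ['checkpoint', 'custom', 'service']

parser = argparse.ArgumentParser()
parser.add_argument('--eval-type', type=str, choices=EVAL_TYPES, help='The type for evaluating.')

print(parser.parse_args(['--eval-type', 'service']))   # Namespace(eval_type='service')
# parser.parse_args(['--eval-type', 'serve'])          # error: argument --eval-type: invalid choice: 'serve'
```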
+import glob +import importlib +import os -from evalscope.benchmarks.benchmark import Benchmark +from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta from evalscope.benchmarks.data_adapter import DataAdapter +from evalscope.utils import get_logger + +logger = get_logger() + +# Using glob to find all files matching the pattern +pattern = os.path.join(os.path.dirname(__file__), '*', '*_adapter.py') +files = glob.glob(pattern, recursive=False) + +for file_path in files: + if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'): + # Convert file path to a module path + relative_path = os.path.relpath(file_path, os.path.dirname(__file__)) + module_path = relative_path[:-3].replace(os.path.sep, '.') # strip '.py' and convert to module path + full_path = f'evalscope.benchmarks.{module_path}' + importlib.import_module(full_path) + # print(f'Importing {full_path}') diff --git a/evalscope/benchmarks/arc/__init__.py b/evalscope/benchmarks/arc/__init__.py index 8b7d5dc4..b937315b 100644 --- a/evalscope/benchmarks/arc/__init__.py +++ b/evalscope/benchmarks/arc/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.arc.arc_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.arc.arc_adapter import ARCAdapter -from evalscope.benchmarks.arc.arc_adapter import ARCAdapter as DataAdapterClass -from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/arc/arc_adapter.py b/evalscope/benchmarks/arc/arc_adapter.py index 46b1f6a5..eb470c9a 100644 --- a/evalscope/benchmarks/arc/arc_adapter.py +++ b/evalscope/benchmarks/arc/arc_adapter.py @@ -3,40 +3,35 @@ import json import os -from evalscope.benchmarks.data_adapter import DataAdapter -from evalscope.metrics.metrics import exact_match, weighted_mean -from evalscope.utils import ResponseParser, normalize_score +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.constants import EvalType +from evalscope.metrics import WeightedAverageAccuracy, exact_match +from evalscope.models import MultiChoiceModelAdapter +from evalscope.utils import ResponseParser from evalscope.utils.logger import get_logger # flake8: noqa logger = get_logger() -DATASET_ID = 'modelscope/ai2_arc' - -# task_list = ['ARC-Easy', 'ARC-Challenge'] -SUBSET_LIST = ['ARC-Challenge'] - +@Benchmark.register( + name='arc', + dataset_id='modelscope/ai2_arc', + model_adapter=MultiChoiceModelAdapter, + subset_list=['ARC-Easy', 'ARC-Challenge'], + metric_list=[WeightedAverageAccuracy], + few_shot_num=0, + train_split='train', + eval_split='test', + prompt_template='', +) class ARCAdapter(DataAdapter): choices = ['A', 'B', 'C', 'D'] - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = 'train', - eval_split: str = 'test', - prompt_template: str = '', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] - + def __init__(self, **kwargs): + few_shot_num = kwargs.get('few_shot_num', None) if few_shot_num is None: # Use 0-shot by default logger.info(f'Set 0-shot examples by system for ARC.') @@ -45,14 +40,7 @@ def __init__(self, if few_shot_num != 0: logger.warning(f'few_shot_num is recommended to set 0 for ARC, got {few_shot_num}.') - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, 
- train_split=train_split, - eval_split=eval_split, - prompt_template=prompt_template, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: """ @@ -132,7 +120,7 @@ def get_gold_answer(self, input_d: dict) -> str: # Get the gold choice return input_d.get('answerKey', '') - def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str: + def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str: """ Parse the model output to get the answer. Could be the best choice index. @@ -144,12 +132,12 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st Returns: The parsed answer. Depending on the dataset. Usually a string for chat. """ - if eval_type == 'checkpoint': + if eval_type == EvalType.CHECKPOINT: return result - elif eval_type == 'service': + elif eval_type == EvalType.SERVICE: return ResponseParser.parse_first_option_with_choices( text=result, options=self.choices) # TODO: to be checked ! - elif eval_type == 'custom': + elif eval_type == EvalType.CUSTOM: return ResponseParser.parse_first_option_with_choices( text=result, options=self.choices) # TODO: to be checked ! else: @@ -158,70 +146,6 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st def match(self, gold: str, pred: str) -> float: return exact_match(gold=gold, pred=pred) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: - """ - Generate the report for the model output. - - Args: - subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...} - report_name: The user-defined report name. - - Returns: A dict of metric calculation results. The format is like: - { - "name":"ARC", - "metric":"WeightedAverageAccuracy", - "score":0.3389, - "category":[ - { - "name":"DEFAULT", - "score":0.4128, - "subset":[ - { - "name":"ARC-Easy", - "score":0.5632 - }, - { - "name":"ARC-Challenge", - "score":0.3157 - } - ] - } - ], - "total_num":7800 - } - """ - total_num: int = sum([num for _, num in subset_score_map.values()]) - weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num - weighted_avg_acc = normalize_score(score=weighted_avg_acc) - cate_avg_list = [{ - 'name': subset_name, - 'score': normalize_score(score=score) - } for subset_name, (score, _) in subset_score_map.items()] - - category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list) - - res_map = dict( - name=report_name or 'arc', - metric=self.metric_list[0]['name'], - score=weighted_avg_acc, - category=[category_d], - total_num=total_num) - - return res_map - @classmethod def _generate_prompt(cls, input_d: dict, include_answer=True) -> str: diff --git a/evalscope/benchmarks/bbh/__init__.py b/evalscope/benchmarks/bbh/__init__.py index 7387c94c..b937315b 100644 --- a/evalscope/benchmarks/bbh/__init__.py +++ b/evalscope/benchmarks/bbh/__init__.py @@ -1,5 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
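The auto-discovery loop added to `evalscope/benchmarks/__init__.py` above is what makes each `@Benchmark.register(...)` decorator execute: every `*_adapter.py` module under the package is imported eagerly, which is why the per-benchmark `__init__.py` re-exports are being deleted throughout this diff. A small sketch of the path-to-module conversion the loop relies on, demonstrated on a made-up file path:

```python
import os


def to_module_path(file_path: str, package_dir: str, package: str) -> str:
    """Mirror of the conversion used above: strip '.py' and turn path separators into dots."""
    relative_path = os.path.relpath(file_path, package_dir)
    return f"{package}.{relative_path[:-3].replace(os.path.sep, '.')}"


# Hypothetical adapter file -> dotted path handed to importlib.import_module.
print(to_module_path('/site-packages/evalscope/benchmarks/arc/arc_adapter.py',
                     '/site-packages/evalscope/benchmarks',
                     'evalscope.benchmarks'))
# evalscope.benchmarks.arc.arc_adapter
```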
- -from evalscope.benchmarks.bbh.bbh_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.bbh.bbh_adapter import BBHAdapter as DataAdapterClass -from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/bbh/bbh_adapter.py b/evalscope/benchmarks/bbh/bbh_adapter.py index 356639a6..5f049a35 100644 --- a/evalscope/benchmarks/bbh/bbh_adapter.py +++ b/evalscope/benchmarks/bbh/bbh_adapter.py @@ -5,18 +5,17 @@ import random import re -from evalscope.benchmarks.data_adapter import DataAdapter +from evalscope.benchmarks import Benchmark, DataAdapter from evalscope.constants import AnswerKeys -from evalscope.metrics.metrics import exact_match, weighted_mean -from evalscope.utils import ResponseParser, normalize_score +from evalscope.metrics import WeightedAverageAccuracy, exact_match +from evalscope.models.chat_adapter import ChatGenerationModelAdapter +from evalscope.utils import ResponseParser from evalscope.utils.logger import get_logger # flake8: noqa logger = get_logger() -DATASET_ID = 'modelscope/bbh' - # BBH multiple choice subset list MULTIPLE_CHOICE = 'multiple_choice' MULTIPLE_CHOICE_LIST = [ @@ -59,41 +58,32 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST +@Benchmark.register( + name='bbh', + dataset_id='modelscope/bbh', + model_adapter=ChatGenerationModelAdapter, + subset_list=SUBSET_LIST, + metric_list=[WeightedAverageAccuracy], + few_shot_num=3, + train_split=None, + eval_split='test', + prompt_template='', +) class BBHAdapter(DataAdapter): """ Adapter for BBH free-form and multiple-choices sub-tasks. """ - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = None, - eval_split: str = 'test', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST + def __init__(self, **kwargs): - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] - - if few_shot_num is None: - logger.info(f'Set 3-shot examples by system for BBH.') - few_shot_num = 3 + few_shot_num = kwargs.get('few_shot_num', 3) if few_shot_num != 3 and few_shot_num != 0: logger.error(f'BBH uses 3-shot examples with CoT or 0-shot by system, but got {few_shot_num}. ' f'Use 3-shot by default.') - few_shot_num = 3 + kwargs['few_shot_num'] = 3 - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -217,66 +207,6 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st def match(self, gold: str, pred: str) -> float: return exact_match(gold=gold, pred=pred) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: - """ - Generate the report for the model output. - - Args: - subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...} - report_name: The user-defined report name. - - Returns: A dict of metric calculation results. 
The format is like: - { - "name":"BBH", - "metric":"WeightedAverageAccuracy", - "score":0.3389, - "category":[ - { - "name":"DEFAULT", - "score":0.3389, - "subset":[ - { - "name":"BBH", - "score":0.3389 - }, - ] - } - ], - "total_num":100 - } - """ - total_num: int = sum([num for _, num in subset_score_map.values()]) - weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num - weighted_avg_acc = normalize_score(score=weighted_avg_acc) - cate_avg_list = [{ - 'name': subset_name, - 'score': normalize_score(score=score) - } for subset_name, (score, _) in subset_score_map.items()] - - category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list) - - res_map = dict( - name=report_name or 'bbh', - metric=self.metric_list[0]['name'], - score=weighted_avg_acc, - category=[category_d], - total_num=total_num) - - return res_map - @classmethod def _extract_mc_answer(cls, ans: str) -> str: """ diff --git a/evalscope/benchmarks/benchmark.py b/evalscope/benchmarks/benchmark.py index aafc9868..12f00e99 100644 --- a/evalscope/benchmarks/benchmark.py +++ b/evalscope/benchmarks/benchmark.py @@ -1,65 +1,76 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +import copy +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Dict, List, Optional -import os.path -from modelscope.msdatasets import MsDataset -from typing import Optional +if TYPE_CHECKING: + from evalscope.benchmarks import DataAdapter -from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, HubType +from evalscope.models import BaseModelAdapter +BENCHMARK_MAPPINGS = {} -class Benchmark(object): - """ - Wrapper for loading datasets from ModelScope or HuggingFace. - """ + +@dataclass +class BenchmarkMeta: + name: str + dataset_id: str + data_adapter: 'DataAdapter' + model_adapter: BaseModelAdapter + subset_list: List[str] = field(default_factory=list) + metric_list: List[dict] = field(default_factory=list) + few_shot_num: int = 0 + few_shot_random: bool = False + train_split: Optional[str] = None + eval_split: Optional[str] = None + prompt_template: str = '' + + def _update(self, args: dict): + if args.get('local_path'): + self.dataset_id = args['local_path'] + del args['local_path'] + self.__dict__.update(args) + + def to_dict(self) -> dict: + return self.__dict__ + + def to_string_dict(self) -> dict: + cur_dict = copy.deepcopy(self.__dict__) + # cur_dict['data_adapter'] = self.data_adapter.__name__ + # cur_dict['model_adapter'] = self.model_adapter.__name__ + # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list] + del cur_dict['data_adapter'] + del cur_dict['model_adapter'] + del cur_dict['metric_list'] + return cur_dict + + def get_data_adapter(self, config: dict = {}) -> 'DataAdapter': + if config: + self._update(config) + + data_adapter = self.data_adapter(**self.to_dict()) + return data_adapter + + +class Benchmark: def __init__(self): - ... - - @staticmethod - def load(dataset_name: str, - subset: str = None, - split: str = None, - token: str = None, - hub: str = 'ModelScope', - work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR, - **kwargs): - """ - Load a dataset from ModelScope or HuggingFace. - - Args: - dataset_name (str): The dataset id or path. - If it is dataset id, should be in the format of `organization/name` for ModelScope and HuggingFace hub. - If it is dataset path, should be the path on local disk. 
- subset (str): - split: - token: sdk token for ModelScope, optional, default None - hub: `ModelScope` or `HuggingFace` - work_dir: the work directory for caching, optional - - Returns: - A dict. - """ - - dataset = MsDataset.load( - dataset_name=dataset_name, - subset_name=subset, - split=split, - token=token, - cache_dir=work_dir, - hub=hub, - **kwargs) - - dataset.dataset_name = dataset_name.split('/')[-1] - dataset.subset_name = subset - # dataset.split = split - return dataset - - -if __name__ == '__main__': - - ds = Benchmark.load(dataset_name='mmlu', subset='management', split=None) - - n = 1 - for i in ds: - print('>', n, ': ', i) - n += 1 + pass + + @classmethod + def get(cls, name: str) -> 'BenchmarkMeta': + if name not in BENCHMARK_MAPPINGS: + raise Exception(f'Unknown benchmark: {name}. Available tasks: {BENCHMARK_MAPPINGS.keys()}') + benchmark = BENCHMARK_MAPPINGS[name] + return benchmark + + @classmethod + def register(cls, name: str, dataset_id: str, model_adapter: BaseModelAdapter, **kwargs): + + def register_wrapper(data_adapter): + if name in BENCHMARK_MAPPINGS: + raise Exception(f'Benchmark {name} already registered') + BENCHMARK_MAPPINGS[name] = BenchmarkMeta( + name=name, data_adapter=data_adapter, model_adapter=model_adapter, dataset_id=dataset_id, **kwargs) + return data_adapter + + return register_wrapper diff --git a/evalscope/benchmarks/ceval/__init__.py b/evalscope/benchmarks/ceval/__init__.py index b7532a3d..b937315b 100644 --- a/evalscope/benchmarks/ceval/__init__.py +++ b/evalscope/benchmarks/ceval/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.ceval.ceval_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST -from evalscope.benchmarks.ceval.ceval_adapter import CEVALAdapter -from evalscope.benchmarks.ceval.ceval_adapter import CEVALAdapter as DataAdapterClass -from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/ceval/ceval_adapter.py b/evalscope/benchmarks/ceval/ceval_adapter.py index 543b6204..ee1b64ac 100644 --- a/evalscope/benchmarks/ceval/ceval_adapter.py +++ b/evalscope/benchmarks/ceval/ceval_adapter.py @@ -2,8 +2,11 @@ import csv import os -from evalscope.benchmarks.data_adapter import DataAdapter +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.constants import EvalType +from evalscope.metrics import WeightedAverageAccuracy from evalscope.metrics.metrics import exact_match, weighted_mean +from evalscope.models import MultiChoiceModelAdapter from evalscope.utils import ResponseParser, normalize_score from evalscope.utils.logger import get_logger @@ -11,8 +14,6 @@ logger = get_logger() -DATASET_ID = 'modelscope/ceval-exam' - SUBSET_LIST = [ 'computer_network', 'operating_system', @@ -124,40 +125,28 @@ } +@Benchmark.register( + name='ceval', + dataset_id='modelscope/ceval-exam', + model_adapter=MultiChoiceModelAdapter, + subset_list=SUBSET_LIST, + metric_list=[WeightedAverageAccuracy], + few_shot_num=0, + train_split='dev', + eval_split='val', +) class CEVALAdapter(DataAdapter): choices = ['A', 'B', 'C', 'D'] - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = 'dev', - eval_split: str = 'val', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] - - if few_shot_num is None: - # Use 5-shot by default - 
logger.info(f'Set 0-shot examples by default for C-Eval.') - few_shot_num = 0 + def __init__(self, **kwargs): + few_shot_num = kwargs.get('few_shot_num', 0) if few_shot_num > 5: logger.warning(f'few_shot_num <= 5 for C-Eval, but got {few_shot_num}. Use 5-shot by default.') - few_shot_num = 5 + kwargs['few_shot_num'] = 5 - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -223,7 +212,7 @@ def get_gold_answer(self, input_d: dict) -> str: # Get the gold choice return input_d.get('answer', '') - def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str: + def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str: """ Parse the model output to get the answer. Could be the best choice index. @@ -235,11 +224,11 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st Returns: The parsed answer. Depending on the dataset. Usually a string for chat. """ - if eval_type == 'checkpoint': + if eval_type == EvalType.CHECKPOINT: return result - elif eval_type == 'service': + elif eval_type == EvalType.SERVICE: return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked ! - elif eval_type == 'custom': + elif eval_type == EvalType.CUSTOM: return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked ! else: raise ValueError(f'Invalid eval_type: {eval_type}') @@ -247,19 +236,6 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st def match(self, gold: str, pred: str) -> float: return exact_match(gold=gold, pred=pred) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: """ Generate report for the evaluation. diff --git a/evalscope/benchmarks/cmmlu/__init__.py b/evalscope/benchmarks/cmmlu/__init__.py index 864f8469..b937315b 100644 --- a/evalscope/benchmarks/cmmlu/__init__.py +++ b/evalscope/benchmarks/cmmlu/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
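With the rewritten `benchmark.py` above, registration is a decorator plus a `BENCHMARK_MAPPINGS` lookup, so a new benchmark no longer needs its own `__init__.py` exports. A hedged sketch of what a custom adapter could look like under this scheme; it assumes an evalscope install exposing the names used in this diff (`Benchmark`, `DataAdapter`, `WeightedAverageAccuracy`, `ChatGenerationModelAdapter`), and the benchmark name, dataset id and logic are invented:

```python
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import WeightedAverageAccuracy
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='my_qa',                         # hypothetical benchmark name
    dataset_id='my-org/my-qa-dataset',    # hypothetical dataset id
    model_adapter=ChatGenerationModelAdapter,
    subset_list=['default'],
    metric_list=[WeightedAverageAccuracy],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
)
class MyQAAdapter(DataAdapter):
    # The four methods the new DataAdapter docstring asks subclasses to implement.

    def gen_prompt(self, input_d, few_shot_list, **kwargs):
        return {'data': [input_d['question']]}

    def get_gold_answer(self, input_d):
        return input_d['answer']

    def parse_pred_result(self, result, raw_input_d=None, eval_type='checkpoint'):
        return result.strip()

    def match(self, gold, pred):
        return float(gold == pred)


# Lookup mirrors Benchmark.get() + BenchmarkMeta.get_data_adapter() from the diff.
adapter = Benchmark.get('my_qa').get_data_adapter(config={'few_shot_num': 0})
```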
- -from evalscope.benchmarks.cmmlu.cmmlu_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST -from evalscope.benchmarks.cmmlu.cmmlu_adapter import CMMLUAdapter -from evalscope.benchmarks.cmmlu.cmmlu_adapter import CMMLUAdapter as DataAdapterClass -from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/cmmlu/cmmlu_adapter.py b/evalscope/benchmarks/cmmlu/cmmlu_adapter.py index 7e358f81..8fc41dd4 100644 --- a/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +++ b/evalscope/benchmarks/cmmlu/cmmlu_adapter.py @@ -3,8 +3,10 @@ import csv import os -from evalscope.benchmarks.data_adapter import DataAdapter -from evalscope.metrics.metrics import exact_match, weighted_mean +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.constants import EvalType +from evalscope.metrics import WeightedAverageAccuracy, exact_match +from evalscope.models import MultiChoiceModelAdapter from evalscope.utils import ResponseParser, normalize_score from evalscope.utils.logger import get_logger @@ -12,8 +14,6 @@ logger = get_logger() -DATASET_ID = 'modelscope/cmmlu' - SUBSET_LIST = [ 'agronomy', 'anatomy', 'ancient_chinese', 'arts', 'astronomy', 'business_ethics', 'chinese_civil_service_exam', 'chinese_driving_rule', 'chinese_food_culture', 'chinese_foreign_policy', 'chinese_history', 'chinese_literature', @@ -101,31 +101,23 @@ } +@Benchmark.register( + name='cmmlu', + dataset_id='modelscope/cmmlu', + model_adapter=MultiChoiceModelAdapter, + subset_list=SUBSET_LIST, + metric_list=[WeightedAverageAccuracy], + few_shot_num=5, + train_split='dev', + eval_split='test', +) class CMMLUAdapter(DataAdapter): choices = ['A', 'B', 'C', 'D'] - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = 5, - train_split: str = 'dev', - eval_split: str = 'test', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] + def __init__(self, **kwargs): - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -187,7 +179,7 @@ def get_gold_answer(self, input_d: dict) -> str: # Get the gold choice return input_d.get('Answer', '') - def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str: + def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str: """ Parse the model output to get the answer. Could be the best choice index. @@ -199,11 +191,11 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st Returns: The parsed answer. Depending on the dataset. Usually a string for chat. """ - if eval_type == 'checkpoint': + if eval_type == EvalType.CHECKPOINT: return result - elif eval_type == 'service': + elif eval_type == EvalType.SERVICE: return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked ! - elif eval_type == 'custom': + elif eval_type == EvalType.CUSTOM: return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked ! 
else: raise ValueError(f'Invalid eval_type: {eval_type}') @@ -211,19 +203,6 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st def match(self, gold: str, pred: str) -> float: return exact_match(gold=gold, pred=pred) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: """ Generate report for the evaluation. diff --git a/evalscope/benchmarks/competition_math/__init__.py b/evalscope/benchmarks/competition_math/__init__.py index 85efbf4f..b937315b 100644 --- a/evalscope/benchmarks/competition_math/__init__.py +++ b/evalscope/benchmarks/competition_math/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.competition_math.competition_math_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.competition_math.competition_math_adapter import CompetitionMathAdapter -from evalscope.benchmarks.competition_math.competition_math_adapter import CompetitionMathAdapter as DataAdapterClass -from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/competition_math/competition_math_adapter.py b/evalscope/benchmarks/competition_math/competition_math_adapter.py index 5daed130..9f2af0c2 100644 --- a/evalscope/benchmarks/competition_math/competition_math_adapter.py +++ b/evalscope/benchmarks/competition_math/competition_math_adapter.py @@ -4,53 +4,39 @@ import json import os -from evalscope.benchmarks import DataAdapter -from evalscope.metrics.metrics import weighted_mean -from evalscope.utils import normalize_score +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.metrics import WeightedAverageAccuracy +from evalscope.models import ChatGenerationModelAdapter from evalscope.utils.logger import get_logger # flake8: noqa logger = get_logger() -DATASET_ID = 'modelscope/competition_math' -SUBSET_LIST = ['default'] - +@Benchmark.register( + name='competition_math', + dataset_id='modelscope/competition_math', + model_adapter=ChatGenerationModelAdapter, + subset_list=['default'], + metric_list=[WeightedAverageAccuracy], + few_shot_num=4, + train_split='train', + eval_split='test', + prompt_template='', +) class CompetitionMathAdapter(DataAdapter): - """ TODO: To be tested for all models. """ - - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = 'train', - eval_split: str = 'test', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] + """ To be tested for all models. """ - if few_shot_num is None: - # Use 4-shot by default - logger.info(f'Set 4-shot examples by system for MATH.') - few_shot_num = 4 + def __init__(self, **kwargs): + few_shot_num = kwargs.get('few_shot_num', 4) if few_shot_num != 4 and few_shot_num != 0: logger.error(f'The MATH benchmark ONLY supports 4-shot by system or 0-shot settings, ' - f'but got {self.few_shot_num}. Use 4-shot by default.') - few_shot_num = 4 + f'but got {few_shot_num}. 
Use 4-shot by default.') + kwargs['few_shot_num'] = 4 - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict: dict = {} @@ -119,66 +105,6 @@ def match(self, gold: str, pred: str) -> float: return res - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: - """ - Generate the report for the model output. - - Args: - subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...} - report_name: The user-defined report name. - - Returns: A dict of metric calculation results. The format is like: - { - "name":"CompetitionMath", - "metric":"WeightedAverageAccuracy", - "score":0.5632, - "category":[ - { - "name":"DEFAULT", - "score":0.5632, - "subset":[ - { - "name":"main", - "score":0.5632 - }, - ] - } - ], - "total_num":100 - } - """ - total_num: int = sum([num for _, num in subset_score_map.values()]) - weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num - weighted_avg_acc = normalize_score(score=weighted_avg_acc) - cate_avg_list = [{ - 'name': subset_name, - 'score': normalize_score(score=score) - } for subset_name, (score, _) in subset_score_map.items()] - - category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list) - - res_map = dict( - name=report_name or 'competition_math', - metric=self.metric_list[0]['name'], - score=weighted_avg_acc, - category=[category_d], - total_num=total_num) - - return res_map - @classmethod def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str: problem: str = input_d['problem'] diff --git a/evalscope/benchmarks/data_adapter.py b/evalscope/benchmarks/data_adapter.py index 58f09d95..0fd1e787 100644 --- a/evalscope/benchmarks/data_adapter.py +++ b/evalscope/benchmarks/data_adapter.py @@ -2,10 +2,11 @@ import os.path import random from abc import ABC, abstractmethod +from modelscope.msdatasets import MsDataset from typing import Any, Optional -from evalscope.benchmarks import Benchmark -from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, HubType +from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType +from evalscope.utils import normalize_score from evalscope.utils.logger import get_logger logger = get_logger() @@ -22,6 +23,11 @@ def __init__(self, prompt_template: str = '', **kwargs): """ + Data Adapter for the benchmark. You need to implement the following methods: + - gen_prompt + - get_gold_answer + - parse_pred_result + - match Args: subset_list: list of subset names for the dataset. metric_list: list, the metric list to evaluate the model on specific benchmark. 
@@ -55,33 +61,34 @@ def load(self, """ dataset_name_or_path = os.path.expanduser(dataset_name_or_path) + subset_list = subset_list or self.subset_list # Try to load dataset from local disk if os.path.exists(dataset_name_or_path): - logger.info( - f'Loading dataset from local disk: > dataset_name: {dataset_name_or_path} > work_dir: {work_dir}') + logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \ + subsets: {subset_list}') data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs) if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0: raise ValueError(f'Local dataset is empty: {dataset_name_or_path}') else: # Load dataset from remote - logger.info(f'Loading dataset from {datasets_hub} hub: >dataset_name: {dataset_name_or_path}') + logger.info( + f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path} > subsets: {subset_list}') data_dict = {} split_list = [split for split in [self.train_split, self.eval_split] if split is not None] if len(split_list) == 0: logger.error(f'Got empty split list: {split_list}') - subset_list = subset_list if subset_list is not None else self.subset_list for sub_name in subset_list: data_dict[sub_name] = {} # e.g. train: few-shot, test: target dataset to evaluate for split in split_list: - dataset = Benchmark.load( + dataset = MsDataset.load( dataset_name=dataset_name_or_path, - subset=sub_name, + subset_name=sub_name, split=split, + cache_dir=work_dir, hub=datasets_hub, - work_dir=work_dir, **kwargs) data_dict[sub_name].update({split: dataset}) @@ -132,13 +139,93 @@ def gen_prompts(self, data_dict: dict) -> dict: prompt_d[AnswerKeys.RAW_INPUT] = sample_d res_dict[sub_name].append(prompt_d) - rnd = random.Random() - rnd.seed(42) - for k, v in res_dict.items(): - rnd.shuffle(v) - return res_dict + def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: + """ + Generate report for the evaluation results for all subsets. + + Args: + subset_score_map: The subset-score map. + e.g. {subset_name: (score, num)} + + report_name: str, the user-defined report name. Default: None + + Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils. 
+ + Here is a format example for ARC-Challenge: + { + "name":"ARC-Challenge", + "metric":"WeightedAverageAccuracy", + "score": 0.3389, + "category":[ + { + "name":"DEFAULT", + "score": 0.3389, + "subset":[ + { + "name":"ARC-Challenge", + "score": 0.3389, + "num": 100 + }, + ] + } + ], + "total_num":100 + } + """ + total_num: int = sum([num for _, num in subset_score_map.values()]) + weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num + weighted_avg_acc = normalize_score(score=weighted_avg_acc) + cate_avg_list = [{ + 'name': subset_name, + 'score': normalize_score(score=score), + 'num': num + } for subset_name, (score, num) in subset_score_map.items()] + + category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list) + + res_map = dict( + name=report_name or 'DEFAULT', + metric=self.metric_list[0]['name'], + score=weighted_avg_acc, + category=[category_d], + total_num=total_num) + + return res_map + + def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True): + + if k > len(data_list): + k = len(data_list) + if few_shot_random: + return random.sample(data_list, k) + else: + return data_list[:k] + + def compute_metric(self, review_res_list: list) -> Any: + """ + Compute evaluation result by specific metrics. + + Args: + review_res_list: list, the review result list, each item of which is match result for gold and pred. + + Attributes: + DataAdapter.metric_func_map: metric_name -> metric_func mapping, + e.g. {'WeightedAverageAccuracy': weighted_average_acc} + + Returns: + Metric results. + """ + if len(self.metric_list) == 0: + raise ValueError('No metric list found for the benchmark.') + elif len(self.metric_list) == 1: + # review_res_list: review score list, e.g. [0, 1, 1, 0, ...] + items = [(score, 1.0) for score in review_res_list] + return self.metric_list[0]['object'](items) + else: + raise ValueError('Please implement the compute_metric method for multiple metrics.') + @abstractmethod def gen_prompt(self, *args, **kwargs) -> Any: """ @@ -172,7 +259,7 @@ def get_gold_answer(self, input_d: Any) -> Any: raise NotImplementedError @abstractmethod - def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> Any: + def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any: """ Parse the predicted result and extract proper answer. @@ -193,71 +280,11 @@ def match(self, gold: Any, pred: Any) -> Any: Args: gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions. - e.g. 'A' + e.g. 'A', extracted from get_gold_answer method. pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions. - e.g. 'B' + e.g. 'B', extracted from parse_pred_result method. Returns: The match result. Usually a score (float) for chat/multiple-choice-questions. """ raise NotImplementedError - - @abstractmethod - def compute_metric(self, review_res_list: list) -> Any: - """ - Compute evaluation result by specific metrics. - - Args: - review_res_list: list, the review result list, each item of which is match result for gold and pred. - - Attributes: - DataAdapter.metric_func_map: metric_name -> metric_func mapping, - e.g. {'WeightedAverageAccuracy': weighted_average_acc} - - Returns: - Metric results. - """ - raise NotImplementedError - - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: - """ - Generate report for the evaluation results for all subsets. 
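The `gen_report` and single-metric `compute_metric` implementations consolidated into the base `DataAdapter` above are what make the near-identical per-benchmark copies (deleted throughout this diff) redundant. The report score is a plain sample-weighted average over subsets; a stand-alone illustration with invented numbers:

```python
# Invented scores in the {subset_name: (score, num)} shape that gen_report consumes.
subset_score_map = {'ARC-Easy': (0.56, 500), 'ARC-Challenge': (0.32, 300)}

total_num = sum(num for _, num in subset_score_map.values())                             # 800
weighted_avg = sum(score * num for score, num in subset_score_map.values()) / total_num
print(round(weighted_avg, 4))  # 0.47 = (0.56 * 500 + 0.32 * 300) / 800
```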
- - Args: - subset_score_map: The subset-score map. - e.g. {subset_name: (score, num)} - - report_name: str, the user-defined report name. Default: None - - Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils. - - Here is a format example for ARC-Challenge: - { - "name":"ARC-Challenge", - "metric":"WeightedAverageAccuracy", - "score": 0.3389, - "category":[ - { - "name":"DEFAULT", - "score": 0.3389, - "subset":[ - { - "name":"ARC-Challenge", - "score": 0.3389 - }, - ] - } - ], - "total_num":100 - } - """ - raise NotImplementedError - - def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True): - - if k > len(data_list): - k = len(data_list) - if few_shot_random: - return random.sample(data_list, k) - else: - return data_list[:k] diff --git a/evalscope/benchmarks/general_qa/__init__.py b/evalscope/benchmarks/general_qa/__init__.py index 2e732005..b937315b 100644 --- a/evalscope/benchmarks/general_qa/__init__.py +++ b/evalscope/benchmarks/general_qa/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.general_qa.general_qa_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter -from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter as DataAdapterClass -from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass diff --git a/evalscope/benchmarks/general_qa/general_qa_adapter.py b/evalscope/benchmarks/general_qa/general_qa_adapter.py index c0178a96..e2941687 100644 --- a/evalscope/benchmarks/general_qa/general_qa_adapter.py +++ b/evalscope/benchmarks/general_qa/general_qa_adapter.py @@ -5,35 +5,32 @@ from collections import defaultdict from typing import Any, Optional -from evalscope.benchmarks.data_adapter import DataAdapter -from evalscope.metrics.metrics import bleu_ngram_one_sample, weighted_mean -from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.metrics import (WeightedAverageBLEU, bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, + weighted_mean) +from evalscope.models import ChatGenerationModelAdapter from evalscope.utils.io_utils import jsonl_to_list from evalscope.utils.logger import get_logger logger = get_logger() -DATASET_ID = 'general_qa' -SUBSET_LIST = ['default'] - +@Benchmark.register( + name='general_qa', + dataset_id='general_qa', + model_adapter=ChatGenerationModelAdapter, + subset_list=['default'], + metric_list=[WeightedAverageBLEU], + few_shot_num=0, + train_split=None, + eval_split='test', +) class GeneralQAAdapter(DataAdapter): # TODO: set few_shot_num - def __init__(self, - subset_list: list = None, - metric_list: list = None, - train_split: str = None, - eval_split: str = 'test', - **kwargs): - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageBLEU', 'object': weighted_mean}] + def __init__(self, **kwargs): - super().__init__( - subset_list=subset_list, metric_list=metric_list, train_split=train_split, eval_split=eval_split, **kwargs) + super().__init__(**kwargs) def load(self, dataset_name_or_path: str, subset_list: list = None, **kwargs) -> dict: diff --git a/evalscope/benchmarks/gsm8k/__init__.py b/evalscope/benchmarks/gsm8k/__init__.py index 968a91dd..b937315b 100644 --- a/evalscope/benchmarks/gsm8k/__init__.py +++ 
b/evalscope/benchmarks/gsm8k/__init__.py @@ -1,5 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.gsm8k.gsm8k_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter as DataAdapterClass -from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/gsm8k/gsm8k_adapter.py b/evalscope/benchmarks/gsm8k/gsm8k_adapter.py index e33d8ed0..23541f07 100644 --- a/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +++ b/evalscope/benchmarks/gsm8k/gsm8k_adapter.py @@ -1,35 +1,33 @@ # Copyright (c) Alibaba, Inc. and its affiliates. # Copyright (c) EleutherAI, Inc. and its affiliates. +# flake8: noqa import math import os import re -from evalscope.benchmarks import DataAdapter -from evalscope.metrics.metrics import exact_match, weighted_mean -from evalscope.utils import normalize_score +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.metrics import WeightedAverageAccuracy +from evalscope.models import ChatGenerationModelAdapter from evalscope.utils.io_utils import jsonl_to_list from evalscope.utils.logger import get_logger -# flake8: noqa - logger = get_logger() -DATASET_ID = 'modelscope/gsm8k' -SUBSET_LIST = ['main'] -ANS_RE = re.compile(r'#### (\-?[0-9\.\,]+)') -INVALID_ANS = '[invalid]' - +@Benchmark.register( + name='gsm8k', + dataset_id='modelscope/gsm8k', + model_adapter=ChatGenerationModelAdapter, + subset_list=['main'], + metric_list=[WeightedAverageAccuracy], + few_shot_num=4, + train_split='train', + eval_split='test', + prompt_template='', +) class GSM8KAdapter(DataAdapter): - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = 'train', - eval_split: str = 'test', - prompt_template: str = '', - **kwargs): + def __init__(self, **kwargs): """ Data adapter for GSM8K dataset. @@ -41,30 +39,13 @@ def __init__(self, eval_split (str): The target eval split name. Default: 'test' **kwargs: ... """ - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] - - if few_shot_num is None: - logger.info(f'Set 4-shot examples by system for GSM8K.') - few_shot_num = 4 - + few_shot_num = kwargs.get('few_shot_num', 4) if few_shot_num != 4 and few_shot_num != 0: logger.error(f'GSM8K uses 4-shot examples with CoT or 0-shot by system, but got {few_shot_num}. ' f'Use 4-shot by default.') - few_shot_num = 4 + kwargs['few_shot_num'] = 4 - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - prompt_template=prompt_template, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -142,66 +123,6 @@ def number_equal(gold_ans, pred_ans): return number_equal(gold_ans=gold, pred_ans=pred) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: - """ - Generate the report for the model output. - - Args: - subset_score_map: The subset-score mapping. e.g. 
{subset_name: (score, num), ...} - report_name: The user-defined report name. Default: None - - Returns: A dict of metric calculation results. The format is like: - { - "name":"GSM8K", - "metric":"WeightedAverageAccuracy", - "score":0.5632, - "category":[ - { - "name":"DEFAULT", - "score":0.5632, - "subset":[ - { - "name":"main", - "score":0.5632 - }, - ] - } - ], - "total_num":100 - } - """ - total_num: int = sum([num for _, num in subset_score_map.values()]) - weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num - weighted_avg_acc = normalize_score(score=weighted_avg_acc) - cate_avg_list = [{ - 'name': subset_name, - 'score': normalize_score(score=score) - } for subset_name, (score, _) in subset_score_map.items()] - - category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list) - - res_map = dict( - name=report_name or 'gsm8k', - metric=self.metric_list[0]['name'], - score=weighted_avg_acc, - category=[category_d], - total_num=total_num) - - return res_map - @classmethod def _generate_prompt(cls, input_d: dict, few_shot_list: list, use_fewshot: bool = True) -> str: if use_fewshot: diff --git a/evalscope/benchmarks/hellaswag/__init__.py b/evalscope/benchmarks/hellaswag/__init__.py index 5899f3de..b937315b 100644 --- a/evalscope/benchmarks/hellaswag/__init__.py +++ b/evalscope/benchmarks/hellaswag/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.hellaswag.hellaswag_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter -from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter as DataAdapterClass -from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/hellaswag/hellaswag_adapter.py b/evalscope/benchmarks/hellaswag/hellaswag_adapter.py index 4d5f7ef0..5e580237 100644 --- a/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +++ b/evalscope/benchmarks/hellaswag/hellaswag_adapter.py @@ -3,9 +3,10 @@ import os import re -from evalscope.benchmarks.data_adapter import DataAdapter -from evalscope.metrics.metrics import exact_match, weighted_mean -from evalscope.utils import normalize_score +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.constants import EvalType +from evalscope.metrics import WeightedAverageAccuracy, exact_match +from evalscope.models import ContinuationLogitsModelAdapter from evalscope.utils.io_utils import jsonl_to_list from evalscope.utils.logger import get_logger @@ -13,44 +14,30 @@ logger = get_logger() -DATASET_ID = 'modelscope/hellaswag' -SUBSET_LIST = ['default'] - +@Benchmark.register( + name='hellaswag', + dataset_id='modelscope/hellaswag', + model_adapter=ContinuationLogitsModelAdapter, + subset_list=['default'], + metric_list=[WeightedAverageAccuracy], + few_shot_num=0, + train_split='train', + eval_split='validation', + prompt_template='', +) class HellaSwagAdapter(DataAdapter): choices = ['0', '1', '2', '3'] - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = 'train', - eval_split: str = 'validation', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] - - if few_shot_num is None: - # Use 0-shot by default - logger.info(f'Set 0-shot examples by system for 
HellaSwag.') - few_shot_num = 0 + def __init__(self, **kwargs): + few_shot_num = kwargs.get('few_shot_num', 0) if few_shot_num != 0: logger.warning(f'few_shot_num should be 0 for HellaSwag, but got {few_shot_num}. Use 0-shot by default.') - few_shot_num = 0 + kwargs['few_shot_num'] = 0 - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -106,7 +93,7 @@ def get_gold_answer(self, input_d: dict) -> str: # Get the gold choice return input_d['label'] - def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str: + def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str: """ Parse the model output to get the answer. Could be the best choice index. @@ -118,7 +105,7 @@ def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: s Returns: The parsed answer. Depending on the dataset. Usually a string for chat. """ - if eval_type == 'checkpoint': + if eval_type == EvalType.CHECKPOINT: # answer: in the form of [-2.3, -4.5, ...], len of self.choices result = np.array(result) endings: list = [self._preprocess(ending) for ending in raw_input_d['endings']] @@ -126,9 +113,9 @@ def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: s best_choice_idx = np.argmax(result / completion_len) return str(best_choice_idx) - elif eval_type == 'service': + elif eval_type == EvalType.SERVICE: return result # TODO: to be supported ! - elif eval_type == 'custom': + elif eval_type == EvalType.CUSTOM: return result # TODO: to be supported ! else: raise ValueError(f'Invalid eval_type: {eval_type}') @@ -136,66 +123,6 @@ def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: s def match(self, gold: str, pred: str) -> float: return exact_match(gold=str(gold), pred=str(pred)) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: - """ - Generate the report for the model output. - - Args: - subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...} - report_name: The user-defined report name. - - Returns: A dict of metric calculation results. 
The format is like: - { - "name":"HellaSwag", - "metric":"WeightedAverageAccuracy", - "score":0.3389, - "category":[ - { - "name":"DEFAULT", - "score":0.4128, - "subset":[ - { - "name":"default", - "score":0.5632 - }, - ] - } - ], - "total_num":7800 - } - """ - total_num: int = sum([num for _, num in subset_score_map.values()]) - weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num - weighted_avg_acc = normalize_score(score=weighted_avg_acc) - cate_avg_list = [{ - 'name': subset_name, - 'score': normalize_score(score=score) - } for subset_name, (score, _) in subset_score_map.items()] - - category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list) - - res_map = dict( - name=report_name or 'hellaswag', - metric=self.metric_list[0]['name'], - score=weighted_avg_acc, - category=[category_d], - total_num=total_num) - - return res_map - @classmethod def _preprocess(cls, text): text = text.strip() diff --git a/evalscope/benchmarks/humaneval/__init__.py b/evalscope/benchmarks/humaneval/__init__.py index 176dd8f6..b937315b 100644 --- a/evalscope/benchmarks/humaneval/__init__.py +++ b/evalscope/benchmarks/humaneval/__init__.py @@ -1,5 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.humaneval.humaneval_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.humaneval.humaneval_adapter import HumanevalAdapter as DataAdapterClass -from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/humaneval/humaneval_adapter.py b/evalscope/benchmarks/humaneval/humaneval_adapter.py index 8dcfe6e7..39d80976 100644 --- a/evalscope/benchmarks/humaneval/humaneval_adapter.py +++ b/evalscope/benchmarks/humaneval/humaneval_adapter.py @@ -1,38 +1,35 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
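Looking back at the HellaSwag `parse_pred_result` kept a few hunks above: in checkpoint mode the model adapter hands back one log-likelihood per candidate ending, and the adapter picks `argmax(result / completion_len)`, i.e. the ending with the best per-character score rather than the best raw score. A toy illustration with invented numbers:

```python
import numpy as np

loglikelihoods = np.array([-12.0, -9.0, -15.0])        # invented per-ending scores
completion_len = np.array([40.0, 25.0, 60.0])          # character count of each ending

print(int(np.argmax(loglikelihoods)))                   # 1: best raw score
print(int(np.argmax(loglikelihoods / completion_len)))  # 2: normalisation favours the long ending
```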
-import json -import os import re -from tqdm import tqdm from typing import List -from evalscope.benchmarks.data_adapter import DataAdapter -from evalscope.metrics.metrics import weighted_mean -from evalscope.tools.combine_reports import gen_table -from evalscope.utils import normalize_score +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.metrics import Pass1 +from evalscope.models import ChatGenerationModelAdapter from evalscope.utils.logger import get_logger logger = get_logger() -DATASET_ID = 'modelscope/humaneval' -SUBSET_LIST = ['openai_humaneval'] - # Example: # {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa +@Benchmark.register( + name='humaneval', + dataset_id='modelscope/humaneval', + model_adapter=ChatGenerationModelAdapter, + subset_list=['openai_humaneval'], + metric_list=[Pass1], + few_shot_num=0, + train_split=None, + eval_split='test', + prompt_template='Complete the following python code:\n', +) class HumanevalAdapter(DataAdapter): """ A placeholder for humaneval adapter, see HumanevalEvaluator for implementation. 
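The HumanEval registration above swaps the generic accuracy metric for `Pass1`, and the adapter keeps `self.k = [1]` for OpenAI's `check_correctness` harness. For background, the standard unbiased pass@k estimator from the HumanEval paper is sketched below; this is context only, not necessarily the exact `Pass1` implementation in evalscope:

```python
import math


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator: 1 - C(n - c, k) / C(n, k),
    where n samples were generated and c of them pass the tests."""
    if n - c < k:
        return 1.0
    return 1.0 - math.comb(n - c, k) / math.comb(n, k)


print(pass_at_k(n=10, c=3, k=1))  # 0.3
```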
""" - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = None, - eval_split: str = 'test', - prompt_template: str = 'Complete the following python code:\n', - **kwargs): + def __init__(self, **kwargs): try: from human_eval.data import stream_jsonl, write_jsonl from human_eval.evaluation import check_correctness @@ -41,29 +38,15 @@ def __init__(self, 'https://github.com/openai/human-eval/tree/master#installation , ' 'Note that you need to enable the execution code in the human_eval/execution.py first.') - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] - self.k = [1] self.num_workers = 4 self.timeout = 4.0 - self.outputs = kwargs.get('outputs', None) self.read_problems_func = stream_jsonl self.write_jsonl_func = write_jsonl self.eval_func = check_correctness - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - prompt_template=prompt_template, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -87,77 +70,6 @@ def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict: return {'data': [full_prompt]} - def get_answers(self, infer_cfg: dict) -> List[dict]: - ans_list: list = [] - system_prompt: str = '' - for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'): - prompt: str = system_prompt + data_d['prompt'] - inputs: dict = {'data': [prompt]} - - pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg) - - pred_ans: str = pred_res['choices'][0]['message']['content'] - pred_ans = self._postprocess(pred_ans) - - ans_list.append({'task_id': task_id, 'completion': pred_ans}) - - return ans_list - - def eval(self, infer_cfg: dict, **kwargs): - - # predict - ans_list: list = self.get_answers(infer_cfg) - ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl') - - self.write_jsonl_func(filename=ans_out_file, data=ans_list) - # logger.info(f'** Dump predictions to {ans_out_file} successfully.') - logger.info('** Dump predictions successfully.') - - # evaluate results: e.g. 
{'pass@1': 0.333, 'pass@10': 0.111} - results = self.eval_func( - sample_file=ans_out_file, - k=self.k, - n_workers=self.num_workers, - timeout=self.timeout, - problem_file=self.problem_file) - - # output: report - report_map: dict = self.gen_report(results=results) - report_dir: str = self.outputs_structure.reports_dir - report_file: str = os.path.join(report_dir, 'human_eval_report.json') - - with open(report_file, 'w') as f: - f.write(json.dumps(report_map, ensure_ascii=False, indent=4)) - # logger.info(f'** Dump report to {report_file} \n') - logger.info('** Dump report \n') - - try: - # Make table - report_table: str = gen_table([report_dir]) - logger.info(f'** Report table: \n {report_table} \n') - except Exception: - logger.error('Failed to generate report table.') - - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: - total_num: int = sum([num for _, num in subset_score_map.values()]) - weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num - weighted_avg_acc = normalize_score(score=weighted_avg_acc) - cate_avg_list = [{ - 'name': subset_name, - 'score': normalize_score(score=score) - } for subset_name, (score, _) in subset_score_map.items()] - - category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list) - - res_map = dict( - name=report_name or 'HumanEval', - metric='pass@1', - score=weighted_avg_acc, - category=[category_d], - total_num=total_num) - - return res_map - @classmethod def _postprocess(cls, text: str) -> str: if '```' in text: @@ -182,19 +94,6 @@ def _postprocess(cls, text: str) -> str: text = '\n'.join([' ' + line for line in text.split('\n')]) return text - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str: return self._postprocess(result) diff --git a/evalscope/benchmarks/mmlu/__init__.py b/evalscope/benchmarks/mmlu/__init__.py index c112533f..b937315b 100644 --- a/evalscope/benchmarks/mmlu/__init__.py +++ b/evalscope/benchmarks/mmlu/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
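The adapter changes above (and the MMLU, RACE, TriviaQA and TruthfulQA ones that follow) all move per-dataset defaults out of hand-written `__init__` signatures and into a `@Benchmark.register(...)` decorator, which the pipeline later resolves by name via `Benchmark.get(...)`. The sketch below is only a minimal illustration of that registry pattern under assumed internals: `_registry`, the `BenchmarkMeta` fields shown here, and `DummyAdapter` are simplified stand-ins, not evalscope's actual implementation.

```python
# Minimal illustration of a name -> metadata registry. All internals here
# (_registry, the BenchmarkMeta fields, DummyAdapter) are simplified
# stand-ins, not evalscope's real classes.
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Type


@dataclass
class BenchmarkMeta:
    name: str
    dataset_id: str
    data_adapter: Optional[Type] = None      # set by the decorator
    model_adapter: Optional[Type] = None
    subset_list: List[str] = field(default_factory=list)
    metric_list: List[dict] = field(default_factory=list)
    default_kwargs: dict = field(default_factory=dict)  # few_shot_num, splits, ...


class Benchmark:
    _registry: Dict[str, BenchmarkMeta] = {}

    @classmethod
    def register(cls, name: str, dataset_id: str, **defaults):
        def decorator(adapter_cls):
            cls._registry[name] = BenchmarkMeta(
                name=name,
                dataset_id=dataset_id,
                data_adapter=adapter_cls,
                model_adapter=defaults.pop('model_adapter', None),
                subset_list=defaults.pop('subset_list', []),
                metric_list=defaults.pop('metric_list', []),
                default_kwargs=defaults,      # whatever is left: few_shot_num, ...
            )
            return adapter_cls
        return decorator

    @classmethod
    def get(cls, name: str) -> BenchmarkMeta:
        return cls._registry[name]


@Benchmark.register(name='dummy', dataset_id='org/dummy', subset_list=['default'], few_shot_num=0)
class DummyAdapter:
    def __init__(self, **kwargs):
        self.cfg = kwargs


meta = Benchmark.get('dummy')
adapter = meta.data_adapter(**meta.default_kwargs)
print(meta.dataset_id, adapter.cfg)           # org/dummy {'few_shot_num': 0}
```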
- -from evalscope.benchmarks.mmlu.mmlu_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST -from evalscope.benchmarks.mmlu.mmlu_adapter import MMLUAdapter -from evalscope.benchmarks.mmlu.mmlu_adapter import MMLUAdapter as DataAdapterClass -from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/mmlu/mmlu_adapter.py b/evalscope/benchmarks/mmlu/mmlu_adapter.py index ecd6f5d2..d77839c9 100644 --- a/evalscope/benchmarks/mmlu/mmlu_adapter.py +++ b/evalscope/benchmarks/mmlu/mmlu_adapter.py @@ -2,8 +2,10 @@ import csv import os -from evalscope.benchmarks.data_adapter import DataAdapter -from evalscope.metrics.metrics import exact_match, weighted_mean +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.constants import EvalType +from evalscope.metrics import WeightedAverageAccuracy, exact_match +from evalscope.models import MultiChoiceModelAdapter from evalscope.utils import ResponseParser, normalize_score from evalscope.utils.logger import get_logger @@ -134,40 +136,29 @@ } +@Benchmark.register( + name='mmlu', + dataset_id='modelscope/mmlu', + model_adapter=MultiChoiceModelAdapter, + subset_list=SUBSET_LIST, + metric_list=[WeightedAverageAccuracy], + few_shot_num=5, + train_split='train', + eval_split='test', + prompt_template='', +) class MMLUAdapter(DataAdapter): choices = ['A', 'B', 'C', 'D'] - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = 'train', - eval_split: str = 'test', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] - - if few_shot_num is None: - # Use 5-shot by default - logger.info(f'Set 5-shot examples by system for MMLU.') - few_shot_num = 5 + def __init__(self, **kwargs): + few_shot_num = kwargs.get('few_shot_num', 5) if few_shot_num > 5: logger.warning(f'few_shot_num <= 5 for MMLU, but got {few_shot_num}. Use 5-shot by default.') - few_shot_num = 5 + kwargs['few_shot_num'] = 5 - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -244,7 +235,7 @@ def get_gold_answer(self, input_d: dict) -> str: # Get the gold choice return input_d.get('target', '') - def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str: + def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str: """ Parse the model output to get the answer. Could be the best choice index. @@ -256,11 +247,11 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st Returns: The parsed answer. Depending on the dataset. Usually a string for chat. """ - if eval_type == 'checkpoint': + if eval_type == EvalType.CHECKPOINT: return result - elif eval_type == 'service': + elif eval_type == EvalType.SERVICE: return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked ! - elif eval_type == 'custom': + elif eval_type == EvalType.CUSTOM: return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked ! 
else: raise ValueError(f'Invalid eval_type: {eval_type}') @@ -268,19 +259,6 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st def match(self, gold: str, pred: str) -> float: return exact_match(gold=gold, pred=pred) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: """ Generate report for the evaluation. diff --git a/evalscope/benchmarks/race/__init__.py b/evalscope/benchmarks/race/__init__.py index f4290c4f..b937315b 100644 --- a/evalscope/benchmarks/race/__init__.py +++ b/evalscope/benchmarks/race/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.race.race_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST -from evalscope.benchmarks.race.race_adapter import RACEAdapter -from evalscope.benchmarks.race.race_adapter import RACEAdapter as DataAdapterClass -from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/race/race_adapter.py b/evalscope/benchmarks/race/race_adapter.py index 3496db9e..bf73882a 100644 --- a/evalscope/benchmarks/race/race_adapter.py +++ b/evalscope/benchmarks/race/race_adapter.py @@ -1,11 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import json import os -from evalscope.benchmarks.data_adapter import DataAdapter -from evalscope.metrics.metrics import exact_match, weighted_mean -from evalscope.utils import normalize_score +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.constants import EvalType +from evalscope.metrics import WeightedAverageAccuracy, exact_match +from evalscope.models import MultiChoiceModelAdapter +from evalscope.utils import ResponseParser, normalize_score from evalscope.utils.io_utils import jsonl_to_list from evalscope.utils.logger import get_logger @@ -13,46 +14,30 @@ logger = get_logger() -DATASET_ID = 'modelscope/race' - -SUBSET_LIST = ['high', 'middle'] - SUBJECT_MAPPING = {'high': 'High', 'middle': 'Middle'} +@Benchmark.register( + name='race', + dataset_id='modelscope/race', + model_adapter=MultiChoiceModelAdapter, + subset_list=['high', 'middle'], + metric_list=[WeightedAverageAccuracy], + few_shot_num=3, + train_split='train', + eval_split='test', +) class RACEAdapter(DataAdapter): choices = ['A', 'B', 'C', 'D'] - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = 'train', - eval_split: str = 'test', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] - - if few_shot_num is None: - logger.info(f'Set 3-shot examples by system for RACE.') - few_shot_num = 3 - + def __init__(self, **kwargs): + few_shot_num = kwargs.get('few_shot_num', 3) if few_shot_num > 3: logger.warning(f'few_shot_num <= 3 for RACE, but got {few_shot_num}. 
Use 3-shot by default.') - few_shot_num = 3 + kwargs['few_shot_num'] = 3 - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -105,7 +90,7 @@ def get_gold_answer(self, input_d: dict) -> str: # Get the gold choice return input_d.get('answer', '') - def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str: + def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str: """ Parse the model output to get the answer. Could be the best choice index. @@ -117,31 +102,18 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st Returns: The parsed answer. Depending on the dataset. Usually a string for chat. """ - if eval_type == 'checkpoint': - return result - elif eval_type == 'service': # TODO: to be implemented - return result - elif eval_type == 'custom': # TODO: to be implemented + if eval_type == EvalType.CHECKPOINT: return result + elif eval_type == EvalType.SERVICE: + return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked ! + elif eval_type == EvalType.CUSTOM: + return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked ! else: raise ValueError(f'Unknown eval_type: {eval_type}') def match(self, gold: str, pred: str) -> float: return exact_match(gold=gold, pred=pred) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: """ Generate report for the evaluation. diff --git a/evalscope/benchmarks/trivia_qa/__init__.py b/evalscope/benchmarks/trivia_qa/__init__.py index 50875493..b937315b 100644 --- a/evalscope/benchmarks/trivia_qa/__init__.py +++ b/evalscope/benchmarks/trivia_qa/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
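In the MMLU and RACE `parse_pred_result` methods above, responses from `service` and `custom` evaluations are no longer returned verbatim but passed through `ResponseParser.parse_first_option_with_choices(result, self.choices)` to pull a single choice letter out of free-form text. The helper below is a rough, self-contained sketch of what that kind of extraction typically looks like; the regex patterns and fallbacks are assumptions for illustration, not the actual evalscope parser.

```python
import re
from typing import List, Optional


def parse_first_option_with_choices(text: str, choices: List[str]) -> Optional[str]:
    """Illustrative sketch only: return the first allowed choice letter found
    in a free-form model response, or None if nothing matches."""
    letters = ''.join(re.escape(choice) for choice in choices)   # e.g. 'ABCD'
    patterns = [
        rf'answer is \(?([{letters}])\)?',   # "The answer is (C)" / "answer is C"
        rf'\b([{letters}])[\).:]',           # "C) ..." / "C. ..." / "C: ..."
        rf'\b([{letters}])\b',               # bare letter as a last resort
    ]
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1)
    return None


assert parse_first_option_with_choices('The answer is (B).', ['A', 'B', 'C', 'D']) == 'B'
assert parse_first_option_with_choices('C. Paris', ['A', 'B', 'C', 'D']) == 'C'
assert parse_first_option_with_choices('no option given', ['A', 'B', 'C', 'D']) is None
```

The `# TODO: to be checked !` comments in the diff suggest this parsing is still to be validated against real service outputs.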
- -from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter -from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter as DataAdapterClass -from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py b/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py index 1923b819..c604128f 100644 --- a/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +++ b/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py @@ -5,45 +5,35 @@ import os from typing import List +from evalscope.benchmarks import Benchmark from evalscope.benchmarks.data_adapter import DataAdapter -from evalscope.metrics.metrics import exact_match, weighted_mean -from evalscope.utils.logger import get_logger +from evalscope.constants import EvalType +from evalscope.metrics import WeightedAverageAccuracy +from evalscope.metrics.metrics import exact_match +from evalscope.models import ChatGenerationModelAdapter +from evalscope.utils import get_logger +from evalscope.utils.utils import ResponseParser # flake8: noqa logger = get_logger() -DATASET_ID = 'modelscope/trivia_qa' -SUBSET_LIST = ['default'] - +@Benchmark.register( + name='trivia_qa', + dataset_id='modelscope/trivia_qa', + model_adapter=ChatGenerationModelAdapter, + subset_list=['default'], + metric_list=[WeightedAverageAccuracy], + few_shot_num=5, + train_split='dev', + eval_split='test', +) class TriviaQaAdapter(DataAdapter): - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = 'dev', - eval_split: str = 'test', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] + def __init__(self, **kwargs): - if few_shot_num is None: - logger.info(f'few_shot_num is not specified for TriviaQA, use default value: 5') - few_shot_num = 5 - - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -122,7 +112,7 @@ def get_gold_answer(self, input_d: dict) -> list: ans: list = input_d.get('ideal', []) return ans - def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str: + def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str: """ Parse the model output to get the answer. @@ -134,74 +124,11 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st Returns: The predicted answer. """ - if eval_type == 'checkpoint': - return result - elif eval_type == 'service': # TODO: to be implemented - return result - elif eval_type == 'custom': # TODO: to be implemented - return result - else: - raise ValueError(f'Unknown eval_type: {eval_type}') + return ResponseParser.parse_first_option(result) def match(self, gold: list, pred: str) -> float: return max([exact_match(gold=ref, pred=pred) for ref in gold]) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. 
- """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: - """ - Generate the report for the model output. - - Args: - subset_score_map: {subset_name: (score, num), ...} - report_name: The user-defined report name. - - Returns: - { - "name":"TriviaQA", - "metric":"WeightedAverageAccuracy", - "score":0.3389, - "category":[ - { - "name":"DEFAULT", - "score":0.3389, - "subset":[ - { - "name":"default", - "score":0.3389 - } - ] - } - ], - "total_num":100 - } - """ - total_num: int = sum([num for _, num in subset_score_map.values()]) - weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num - cate_avg_list = [{'name': subset_name, 'score': score} for subset_name, (score, _) in subset_score_map.items()] - - category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list) - - res_map = dict( - name=report_name or 'trivia_qa', - metric=self.metric_list[0]['name'], - score=weighted_avg_acc, - category=[category_d], - total_num=total_num) - - return res_map - @classmethod def _generate_prompt(cls, input_d: dict, include_answer=True) -> str: diff --git a/evalscope/benchmarks/truthful_qa/__init__.py b/evalscope/benchmarks/truthful_qa/__init__.py index 1fbe8879..b937315b 100644 --- a/evalscope/benchmarks/truthful_qa/__init__.py +++ b/evalscope/benchmarks/truthful_qa/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter -from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter as DataAdapterClass -from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py b/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py index ca6128d2..4a83c5e9 100644 --- a/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +++ b/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py @@ -6,20 +6,31 @@ import os from typing import List +from evalscope.benchmarks import Benchmark from evalscope.benchmarks.data_adapter import DataAdapter +from evalscope.constants import EvalType +from evalscope.metrics import WeightedAverageAccuracy from evalscope.metrics.metrics import weighted_mean -from evalscope.utils import normalize_score -from evalscope.utils.logger import get_logger +from evalscope.models import ContinuationLogitsModelAdapter +from evalscope.utils import get_logger, normalize_score # flake8: noqa logger = get_logger() -DATASET_ID = 'modelscope/truthful_qa' # task_list = ['generation', 'multiple_choice'] -SUBSET_LIST = ['multiple_choice'] +@Benchmark.register( + name='truthful_qa', + dataset_id='modelscope/truthful_qa', + model_adapter=ContinuationLogitsModelAdapter, + subset_list=['multiple_choice'], + metric_list=[WeightedAverageAccuracy], + few_shot_num=0, + train_split=None, + eval_split='validation', +) class TruthfulQaAdapter(DataAdapter): """ Adapter for TruthfulQA benchmark. 
@@ -40,35 +51,14 @@ class TruthfulQaAdapter(DataAdapter): 'Q: Where were the 1992 Olympics held?\n' 'A: The 1992 Olympics were held in Barcelona, Spain.') - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = None, - eval_split: str = 'validation', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] - - if few_shot_num is None: - logger.info(f'Set 0-shot examples by system for TruthfulQA.') - few_shot_num = 0 + def __init__(self, **kwargs): + few_shot_num = kwargs.get('few_shot_num', 0) if few_shot_num != 0: logger.warning(f'few_shot_num should be 0 for TruthfulQA, but got {few_shot_num}. Use 0-shot by default.') - few_shot_num = 0 + kwargs['few_shot_num'] = 0 - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -215,7 +205,7 @@ def get_gold_answer(self, input_d: dict) -> dict: # TODO: generation sub-task to be added return {'mc1_labels': input_d['mc1_targets']['labels'], 'mc2_labels': input_d['mc2_targets']['labels']} - def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> list: + def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> list: """ Parse the model output to get the answer. @@ -227,11 +217,11 @@ def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: s Returns: The predicted answer. """ - if eval_type == 'checkpoint': + if eval_type == EvalType.CHECKPOINT: return result - elif eval_type == 'service': # TODO: to be supported ! + elif eval_type == EvalType.SERVICE: # TODO: to be supported ! return result - elif eval_type == 'custom': # TODO: to be supported ! + elif eval_type == EvalType.CUSTOM: # TODO: to be supported ! 
return result else: raise ValueError(f'Invalid eval_type: {eval_type}') diff --git a/evalscope/collections/__init__.py b/evalscope/collections/__init__.py new file mode 100644 index 00000000..c87f613f --- /dev/null +++ b/evalscope/collections/__init__.py @@ -0,0 +1,3 @@ +from evalscope.collections.data_generator import WeightedSampler +from evalscope.collections.evaluator import EvaluatorCollection +from evalscope.collections.schema import CollectionSchema diff --git a/evalscope/collections/data_generator.py b/evalscope/collections/data_generator.py new file mode 100644 index 00000000..499abac6 --- /dev/null +++ b/evalscope/collections/data_generator.py @@ -0,0 +1,83 @@ +import json +import random +from abc import ABC, abstractmethod +from dataclasses import asdict, dataclass, field +from tqdm import tqdm +from typing import List, Optional + +from evalscope.collections.schema import CollectionSchema +from evalscope.utils.io_utils import dump_jsonl_data + + +# Define an abstract base class for Samplers +class Sampler(ABC): + + def __init__(self, schema: CollectionSchema, count: Optional[int] = None): + self.schema = schema + self.count = count + + @abstractmethod + def sample(self) -> List[dict]: + pass + + +@dataclass +class DatasetEntry: + index: int = 0 + prompt: dict = field(default_factory=dict) + tags: List[str] = field(default_factory=list) + task: str = '' + weight: float = 0.0 + dataset_name: str = '' + subset_name: str = '' + + +class WeightedSampler(Sampler): + + def sample(self) -> List[dict]: + all_data: List[DatasetEntry] = [] + + dataset_info_list = self.schema.flatten() + total_weight = sum(dataset.weight for dataset in dataset_info_list) + + remaining_count = self.count + + for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')): + data_dict = dataset.get_data() + + dataset_data = [] + for subset_name, subset_data in data_dict.items(): + for prompt in subset_data: + dataset_data.append( + DatasetEntry( + prompt=prompt, + tags=dataset.tags, + task=dataset.task_type, + weight=dataset.weight, + dataset_name=dataset.name, + subset_name=subset_name, + )) + + # For the last dataset, use the remaining count + if i == len(dataset_info_list) - 1: + dataset_sample_count = remaining_count + else: + dataset_sample_count = int((dataset.weight / total_weight) * self.count) + remaining_count -= dataset_sample_count + + sampled_data = random.choices(dataset_data, k=dataset_sample_count) + all_data.extend(sampled_data) + + # update index + result = [] + for i, entry in enumerate(all_data): + entry.index = i + result.append(asdict(entry)) + return result + + +if __name__ == '__main__': + schema = CollectionSchema.from_dict(json.load(open('outputs/schema.json', 'r'))) + print(schema.to_dict()) + mixed_data = WeightedSampler(schema, 10).sample() + dump_jsonl_data(mixed_data, 'outputs/mixed_data.jsonl') diff --git a/evalscope/collections/evaluator.py b/evalscope/collections/evaluator.py new file mode 100644 index 00000000..2dfe1c9e --- /dev/null +++ b/evalscope/collections/evaluator.py @@ -0,0 +1,164 @@ +import json +import os +import pandas as pd +from collections import defaultdict +from datetime import datetime +from tqdm import tqdm + +from evalscope.benchmarks import Benchmark +from evalscope.collections.data_generator import DatasetEntry +from evalscope.config import TaskConfig +from evalscope.constants import AnswerKeys, DumpMode, EvalType, ReviewKeys +from evalscope.evaluator import Evaluator +from evalscope.models import get_local_model, initialize_model_adapter 
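The `WeightedSampler` introduced above splits the requested `count` across datasets in proportion to their weights, truncating each share to an integer and handing whatever remains to the last dataset so the totals always add up, then draws that many entries per dataset with `random.choices` (sampling with replacement). Below is a standalone sketch of just the allocation step, with `allocate_counts` as a hypothetical helper name:

```python
from typing import Dict


def allocate_counts(weights: Dict[str, float], total_count: int) -> Dict[str, int]:
    """Hypothetical helper mirroring the count allocation in WeightedSampler.sample:
    proportional integer shares, with the remainder going to the last dataset."""
    total_weight = sum(weights.values())
    names = list(weights)
    counts, remaining = {}, total_count
    for i, name in enumerate(names):
        if i == len(names) - 1:
            counts[name] = remaining             # last dataset takes what is left
        else:
            share = int(weights[name] / total_weight * total_count)
            counts[name] = share
            remaining -= share
    return counts


# Weights 1:2:1 over 10 samples -> {'gsm8k': 2, 'competition_math': 5, 'arc': 3}
print(allocate_counts({'gsm8k': 1, 'competition_math': 2, 'arc': 1}, 10))
```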
+from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list +from evalscope.utils.logger import get_logger + +logger = get_logger() + + +class SimpleEvaluator(Evaluator): + + def __init__(self, dataset_name, data_adapter, model_adapter, task_cfg, outputs): + super().__init__( + dataset_name_or_path=dataset_name, + data_adapter=data_adapter, + model_adapter=model_adapter, + task_cfg=task_cfg, + outputs=outputs) + + def get_answer(self, input_prompt, subset_name, infer_cfg) -> dict: + answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg) + answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg) + processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id) + return processed_answer + + def get_review(self, answer_d) -> dict: + review_id, reviewer_spec = self._generate_review_id(answer_d) + review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec) + return review_d + + +class EvaluatorCollection: + + def __init__(self, task_cfg: TaskConfig, outputs: OutputsStructure): + self.task_cfg = task_cfg + self.outputs = outputs + self.model = get_local_model(task_cfg) + self.dataset = self.load() + self.dataset_name_map, self.dataset_id_map = self._parse_dataset() + self.evaluators = self._initialize_evaluators() + + def load(self) -> list[DatasetEntry]: + raw_dataset = jsonl_to_list(self.task_cfg.dataset_args['data_collection']['local_path']) + datasets = [] + for sample in raw_dataset: + datasets.append(DatasetEntry(**sample)) + return datasets + + def _parse_dataset(self): + dataset_name_map = defaultdict(lambda: defaultdict(list)) + dataset_id_map = {} + for sample in self.dataset: + dataset_name, subset_name = sample.dataset_name, sample.subset_name + dataset_name_map[dataset_name][subset_name].append(sample.index) + dataset_id_map[sample.index] = sample + return dataset_name_map, dataset_id_map + + def _initialize_evaluators(self): + evaluators = {} + for dataset_name in self.dataset_name_map.keys(): + benchmark = Benchmark.get(dataset_name) + data_adapter = benchmark.get_data_adapter() + model_adapter = initialize_model_adapter(self.task_cfg, benchmark.model_adapter, self.model) + evaluators[dataset_name] = SimpleEvaluator(dataset_name, data_adapter, model_adapter, self.task_cfg, + self.outputs) + return evaluators + + def get_report(self, reviews): + data = [] + for dataset_name, data_map in self.dataset_name_map.items(): + for subset_name, ids in data_map.items(): + for _id in ids: + review_d = reviews[_id] + row_data: DatasetEntry = self.dataset_id_map[_id] + score = self.get_pred_score(review_d) + data.append({ + 'task_type': row_data.task, + 'dataset_name': dataset_name, + 'subset_name': subset_name, + 'tags': row_data.tags, + 'score': score + }) + + df = pd.DataFrame(data) + + # Multi-level aggregation + subset_report_df = df.groupby(['task_type', 'dataset_name', 'subset_name']).agg( + average_score=('score', 'mean'), count=('score', 'size')).reset_index() + + dataset_report_df = df.groupby(['task_type', 'dataset_name']).agg( + average_score=('score', 'mean'), count=('score', 'size')).reset_index() + + task_report_df = df.groupby(['task_type']).agg( + average_score=('score', 'mean'), count=('score', 'size')).reset_index() + + # Combine all reports into a single dictionary + report = { + 'subset_level': subset_report_df.to_dict(orient='records'), + 'dataset_level': dataset_report_df.to_dict(orient='records'), + 'task_level': 
task_report_df.to_dict(orient='records') + } + + # Log the report + logger.info(f"Report:\n{pd.DataFrame(report['subset_level']).to_markdown(index=False)}") + + # Save the report to a JSON file + report_file_path = os.path.join(self.outputs.reports_dir, 'data_collection.json') + with open(report_file_path, 'w', encoding='utf-8') as f: + json.dump(report, f, ensure_ascii=False, indent=4) + + def get_answers(self): + pred_file_path = os.path.join(self.outputs.predictions_dir, 'data_collection.jsonl') + answers = defaultdict(dict) + for sample in tqdm(self.dataset, desc='Getting answers'): + evaluator = self.evaluators[sample.dataset_name] + answer_d = evaluator.get_answer(sample.prompt, sample.subset_name, self.task_cfg.generation_config) + answers[sample.index] = answer_d + dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND) + return answers + + def get_reviews(self, answers): + review_file_path = os.path.join(self.outputs.reviews_dir, 'data_collection.jsonl') + reviews = defaultdict(dict) + for sample in tqdm(self.dataset, desc='Getting reviews'): + evaluator = self.evaluators[sample.dataset_name] + review_d = evaluator.get_review(answers[sample.index]) + reviews[sample.index] = review_d + dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND) + return reviews + + @staticmethod + def get_pred_score(review_d) -> float: + return review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT] + + def eval(self, **kwargs): + answers = self.get_answers() + reviews = self.get_reviews(answers) + self.get_report(reviews) + + +if __name__ == '__main__': + task_cfg = TaskConfig( + model='qwen2.5', + api_url='http://127.0.0.1:8801/v1/chat/completions', + api_key='EMPTY', + eval_type=EvalType.SERVICE, + datasets=['data_collection'], + dataset_args={'data_collection': { + 'local_path': 'outputs/mixed_data.jsonl' + }}, + ) + + evaluator_collection = EvaluatorCollection(task_cfg) + evaluator_collection.eval() diff --git a/evalscope/collections/schema.py b/evalscope/collections/schema.py new file mode 100644 index 00000000..74662e68 --- /dev/null +++ b/evalscope/collections/schema.py @@ -0,0 +1,104 @@ +import json +from dataclasses import asdict, dataclass, field +from typing import List, Union + +from evalscope.benchmarks.benchmark import Benchmark + + +@dataclass +class DatasetInfo: + name: str + weight: int = 1 # sample weight in each collection + task_type: str = '' + tags: List[str] = field(default_factory=list) + args: dict = field(default_factory=dict) + + def get_data(self) -> dict: + benchmark_meta = Benchmark.get(self.name) + + data_adapter = benchmark_meta.get_data_adapter(config=self.args) + data_dict = data_adapter.load( + dataset_name_or_path=benchmark_meta.dataset_id, subset_list=benchmark_meta.subset_list) + prompts = data_adapter.gen_prompts(data_dict) + return prompts + + +@dataclass +class CollectionSchema: + name: str + datasets: List[Union[DatasetInfo, 'CollectionSchema']] = field(default_factory=list) + + def __post_init__(self): + # uniform the weight of datasets in each collection + total_weight = sum(dataset.weight for dataset in self.datasets if isinstance(dataset, DatasetInfo)) + for dataset in self.datasets: + if isinstance(dataset, DatasetInfo): + dataset.weight = dataset.weight / total_weight + + def add_dataset(self, name, weight=1, task_type='', tags=[]): + self.datasets.append(DatasetInfo(name, weight, task_type, tags)) + + def add_collection(self, collection: 'CollectionSchema'): + self.datasets.append(collection) + + def 
get_datasets(self): + return self.datasets + + def to_dict(self): + return { + 'name': + self.name, + 'datasets': + [asdict(dataset) if isinstance(dataset, DatasetInfo) else dataset.to_dict() for dataset in self.datasets] + } + + @classmethod + def from_dict(cls, data): + instance = cls(name=data.get('name', '')) + for dataset in data.get('datasets', []): + if 'datasets' in dataset: + instance.datasets.append(CollectionSchema.from_dict(dataset)) + else: + instance.datasets.append(DatasetInfo(**dataset)) + return instance + + def flatten(self) -> List[DatasetInfo]: + flat_datasets = [] + + for dataset in self.datasets: + if isinstance(dataset, CollectionSchema): + nested_datasets = dataset.flatten() + flat_datasets.extend(nested_datasets) + else: + flat_datasets.append(dataset) + return flat_datasets + + def dump_json(self, file_path): + d = self.to_dict() + with open(file_path, 'w') as f: + json.dump(d, f, ensure_ascii=False, indent=4) + + +if __name__ == '__main__': + schema = CollectionSchema( + name='math&reasoning', + datasets=[ + CollectionSchema( + name='math', + datasets=[ + DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']), + DatasetInfo(name='competition_math', weight=2, task_type='math', tags=['en', 'math']), + ]), + CollectionSchema( + name='reasoning', + datasets=[ + DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']), + ]), + ]) + print(schema.to_dict()) + print(schema.flatten()) + schema.dump_json('outputs/schema.json') + + schema = CollectionSchema.from_dict(json.load(open('outputs/schema.json', 'r'))) + print(schema.to_dict()) + print(schema.flatten()) diff --git a/evalscope/config.py b/evalscope/config.py index f6964274..7f3041a8 100644 --- a/evalscope/config.py +++ b/evalscope/config.py @@ -31,7 +31,7 @@ @dataclass class TaskConfig: # Model-related arguments - model: Union[str, CustomModel, None] = None + model: Union[str, 'CustomModel', None] = None model_id: Optional[str] = None model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {}) @@ -40,8 +40,8 @@ class TaskConfig: chat_template: Optional[str] = None # Dataset-related arguments - datasets: Optional[List[str]] = None - dataset_args: Optional[Dict] = field(default_factory=dict) + datasets: List[str] = field(default_factory=list) + dataset_args: Dict = field(default_factory=dict) dataset_dir: str = DEFAULT_DATASET_CACHE_DIR dataset_hub: str = HubType.MODELSCOPE @@ -64,7 +64,9 @@ class TaskConfig: # Debug and runtime mode arguments debug: bool = False dry_run: bool = False - seed: int = 42 + seed: Optional[int] = 42 + api_url: Optional[str] = None # Only used for server model + api_key: Optional[str] = 'EMPTY' # Only used for server model def __post_init__(self): if (not self.model_id) and self.model: @@ -74,7 +76,6 @@ def __post_init__(self): self.model_id = os.path.basename(self.model).rstrip(os.sep) def to_dict(self): - # Note: to avoid serialization error for some model instance return self.__dict__ def __str__(self): @@ -130,6 +131,7 @@ def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']: continue task.model = custom_model + task.model_args = custom_model.config task.model_id = type(custom_model).__name__ res_list.append(task) diff --git a/evalscope/constants.py b/evalscope/constants.py index be8d00ed..d2409579 100644 --- a/evalscope/constants.py +++ b/evalscope/constants.py @@ -135,34 +135,13 @@ class EvalStage: class EvalType: CUSTOM = 'custom' - CHECKPOINT = 'checkpoint' + CHECKPOINT = 'checkpoint' # 
native model checkpoint + SERVICE = 'service' # model service class EvalBackend: - - class _Backend: - # compatible with old version, set 'value' - - def __init__(self, value): - self._value = value - - @property - def value(self): - return self._value - - def __str__(self): - return self._value - - def __repr__(self): - return f"'{self._value}'" - - def __eq__(self, other): - if isinstance(other, str): - return self._value == other - return NotImplemented - - NATIVE = _Backend('Native') - OPEN_COMPASS = _Backend('OpenCompass') - VLM_EVAL_KIT = _Backend('VLMEvalKit') - RAG_EVAL = _Backend('RAGEval') - THIRD_PARTY = _Backend('ThirdParty') + NATIVE = 'Native' + OPEN_COMPASS = 'OpenCompass' + VLM_EVAL_KIT = 'VLMEvalKit' + RAG_EVAL = 'RAGEval' + THIRD_PARTY = 'ThirdParty' diff --git a/evalscope/evaluator/evaluator.py b/evalscope/evaluator/evaluator.py index f894411d..bf65d51e 100644 --- a/evalscope/evaluator/evaluator.py +++ b/evalscope/evaluator/evaluator.py @@ -10,9 +10,8 @@ from evalscope.benchmarks import DataAdapter from evalscope.config import TaskConfig -from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, AnswerKeys, DumpMode, EvalStage, EvalType, HubType, - ReviewKeys) -from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter +from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys +from evalscope.models import BaseModelAdapter, CustomModelAdapter from evalscope.tools.combine_reports import gen_table from evalscope.utils import dict_torch_dtype_to_str, gen_hash from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list @@ -30,73 +29,63 @@ class Evaluator(object): if the dataset is a local path, e.g. /path/to/your_dataset_name, then the task name will be the basename of the path, which is `your_dataset_name`. data_adapter: DataAdapter, the data adapter for the dataset. - subset_list: list, the subset list for the dataset. model_adapter: BaseModelAdapter, the model adapter for the model. - use_cache: str, path to local cache. Default: None - outputs_dir: OutputsStructure, the outputs dir. Default: None - datasets_dir: str, the datasets dir. Default: DEFAULT_ROOT_CACHE_DIR - datasets_hub: str, the datasets hub. `Local`, `ModelScope` or `HuggingFace`. Default: 'ModelScope' - stage: str, the stage of evaluation. `all` or `infer` or `review`. Default: 'all' - eval_type: str, the evaluation type. `checkpoint` or `service` or `custom`. Default: 'checkpoint' - overall_task_cfg: dict, the overall task config. Default: None + outputs: OutputsStructure, the outputs dir. Default: None + task_cfg: TaskConfig, the overall task config. Default: None **kwargs: kwargs. 
""" def __init__(self, dataset_name_or_path: str, data_adapter: DataAdapter, - subset_list: Optional[list] = None, - model_adapter: Optional[BaseModelAdapter] = None, - use_cache: Optional[str] = None, - outputs: Optional[OutputsStructure] = None, - datasets_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR, - datasets_hub: Optional[str] = HubType.MODELSCOPE, - stage: Optional[str] = EvalStage.ALL, - eval_type: Optional[str] = EvalType.CHECKPOINT, - overall_task_cfg: Optional[TaskConfig] = None, + model_adapter: BaseModelAdapter, + outputs: OutputsStructure = None, + task_cfg: TaskConfig = None, **kwargs): self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path) self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep)).split('.')[0] - self.model_name = overall_task_cfg.model_id + self.model_name = task_cfg.model_id self.custom_task_name = f'{self.model_name}_{self.dataset_name}' - self.datasets_dir = os.path.expanduser(datasets_dir) - self.kwargs = kwargs self.data_adapter = data_adapter self.model_adapter = model_adapter - self.eval_type = eval_type - self.stage = stage - self.use_cache = use_cache - self.overall_task_cfg = overall_task_cfg - if isinstance(self.model_adapter, CustomModelAdapter): - self.overall_task_cfg.model_args = self.model_adapter.custom_model.config - - self.model_cfg = self.model_adapter.model_cfg - + self.model_cfg = model_adapter.model_cfg + self.eval_type = task_cfg.eval_type + self.dataset_hub = task_cfg.dataset_hub + self.stage = task_cfg.stage + self.use_cache = task_cfg.use_cache + self.task_cfg = task_cfg # Deal with the output paths self.outputs_structure = outputs - # Load dataset - self.dataset = self.data_adapter.load( - dataset_name_or_path=dataset_name_or_path, - subset_list=subset_list, - work_dir=self.datasets_dir, - datasets_hub=datasets_hub, - **kwargs) - - # Get prompts from dataset - # TODO: support sampler - self.prompts = self.data_adapter.gen_prompts(data_dict=self.dataset) - del self.dataset - - def _pred_answer(self, input_d: dict, infer_cfg: dict, subset_name: str, answer_id: str = None) -> dict: + self.kwargs = kwargs - ans: dict = self.model_adapter.predict(inputs=input_d, infer_cfg=infer_cfg) - ans[AnswerKeys.ANSWER_ID] = answer_id - ans[AnswerKeys.SUBSET_NAME] = subset_name + def load_dataset(self): + dataset = self.data_adapter.load( + dataset_name_or_path=self.dataset_name_or_path, + subset_list=self.data_adapter.subset_list, + work_dir=os.path.expanduser(self.task_cfg.dataset_dir), + datasets_hub=self.dataset_hub, + **self.kwargs) - return ans + # Get prompts from dataset + prompts = self.data_adapter.gen_prompts(data_dict=dataset) + return prompts + + def _generate_answer_id(self, model_cfg, input_d, infer_cfg): + model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False) + input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_d).items())), ensure_ascii=False) + infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False) + return 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str) + + def _process_answer(self, answer_d, input_d, subset_name, answer_id): + answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg + answer_d[AnswerKeys.ANSWER_ID] = answer_id + answer_d[AnswerKeys.SUBSET_NAME] = subset_name + answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT] + answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d + return answer_d def 
get_answers(self, subset_name: str, @@ -147,57 +136,24 @@ def get_answers(self, resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict( inputs=prompts_list, infer_cfg=infer_cfg) - assert len(prompts_list) == len(resp_answers_list), \ - f'Length of prompts_list({len(prompts_list)}) != Length of resp_answers_list({len(resp_answers_list)})' - - for in_d, resp_d in zip(prompts_list, resp_answers_list): - - # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg) - model_cfg_str = json.dumps( - OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())), - ensure_ascii=False) - input_prompt_str = json.dumps( - OrderedDict(sorted(dict_torch_dtype_to_str(in_d).items())), ensure_ascii=False) - infer_cfg_str = json.dumps( - OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False) - answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str) - - resp_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg - resp_d[AnswerKeys.ANSWER_ID] = answer_id - resp_d[AnswerKeys.SUBSET_NAME] = subset_name - resp_d[AnswerKeys.RAW_INPUT] = in_d[AnswerKeys.RAW_INPUT] - resp_d[AnswerKeys.ORIGIN_PROMPT] = in_d - - answers_list.append(resp_d) - dump_jsonl_data(resp_d, pred_file_path, dump_mode=DumpMode.APPEND) + for input_prompt, answer_d in zip(prompts_list, resp_answers_list): + answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg) + processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id) + answers_list.append(processed_answer) + dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND) else: for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '): - - # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg) - model_cfg_str = json.dumps( - OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())), - ensure_ascii=False) - input_prompt_str = json.dumps( - OrderedDict(sorted(dict_torch_dtype_to_str(input_prompt).items())), ensure_ascii=False) - infer_cfg_str = json.dumps( - OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False) - answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str) - - # Get answers - answer_d: dict = self._pred_answer( - input_d=input_prompt, infer_cfg=infer_cfg, subset_name=subset_name, answer_id=answer_id) - - answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg - answer_d[AnswerKeys.RAW_INPUT] = input_prompt[AnswerKeys.RAW_INPUT] - answer_d[AnswerKeys.ORIGIN_PROMPT] = input_prompt + answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg) + answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg) + processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id) if debug: logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n') - logger.info(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n') + logger.info(f'**predicted ans: {json.dumps(processed_answer, ensure_ascii=False)} \n') - answers_list.append(answer_d) - dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND) + answers_list.append(processed_answer) + dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND) logger.info(f'Dump predictions to {pred_file_path}.') return answers_list @@ -241,6 +197,19 @@ def _get_review(self, answer_d: dict, review_id: str = None, 
reviewer_spec: dict return review_res + def _generate_review_id(self, answer_d): + # Gen review_id (concat: answer_id + reviewer_spec) + answer_id = answer_d[AnswerKeys.ANSWER_ID] + reviewer_spec = { + 'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list], + 'reviewer': ['Evaluator'], + 'revision': ['default'] + } + reviewer_spec_str = json.dumps( + OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False) + review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str) + return review_id, reviewer_spec + def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list: """ Get reviews from answers. @@ -264,19 +233,7 @@ def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = logger.warning(f'Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...') for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '): - - # Gen review_id (concat: answer_id + reviewer_spec) - answer_id = answer_d[AnswerKeys.ANSWER_ID] - - reviewer_spec: dict = { - 'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list], - 'reviewer': ['Evaluator'], - 'revision': ['default'] - } - reviewer_spec_str = json.dumps( - OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False) - review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str) - + review_id, reviewer_spec = self._generate_review_id(answer_d) # Get review review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec) @@ -284,7 +241,6 @@ def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = logger.info(review_d) reviews_list.append(review_d) - # Dump reviews dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND) @@ -380,7 +336,8 @@ def eval(self, infer_cfg: dict = None, debug: bool = False, **kwargs) -> dict: stage_answers_dict = {} stage_reviews_dict = {} - for subset_name, prompts_list in self.prompts.items(): + prompts = self.load_dataset() + for subset_name, prompts_list in prompts.items(): limit = kwargs.get('limit', len(prompts_list)) prompts_list = prompts_list[:limit] diff --git a/evalscope/evaluator/reviewer/auto_reviewer.py b/evalscope/evaluator/reviewer/auto_reviewer.py index 01902f45..bd0e3873 100644 --- a/evalscope/evaluator/reviewer/auto_reviewer.py +++ b/evalscope/evaluator/reviewer/auto_reviewer.py @@ -8,10 +8,10 @@ import time from abc import ABC, abstractmethod from functools import partial -from typing import Any, List +from typing import Any, List, Tuple from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation -from evalscope.models.openai_model import OpenAIModel +from evalscope.models.model import OpenAIModel from evalscope.utils import completion_parsers, random_seeded_choice from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list @@ -240,7 +240,15 @@ def get_review_single(self, row: List[dict], dry_run: bool = False, **kwargs): review_text=review_text) return review_result - def _get_review_pair(self, model_a, model_b, question, category, ans1, ans2, dry_run=False, **kwargs) -> (str, Any): + def _get_review_pair(self, + model_a, + model_b, + question, + category, + ans1, + ans2, + dry_run=False, + **kwargs) -> Tuple[str, Any]: input_msg = dict(ques=question, category=category, 
ans1=ans1, ans2=ans2) if self.reference_list: @@ -263,7 +271,7 @@ def _get_review_pair(self, model_a, model_b, question, category, ans1, ans2, dry result = (result, None) return review_text, *result - def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) -> (str, Any): + def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) -> Tuple[str, Any]: input_msg = dict(ques=question, category=category, ans1=answer) if self.reference_list: diff --git a/evalscope/metrics/__init__.py b/evalscope/metrics/__init__.py index b937315b..1714b5e2 100644 --- a/evalscope/metrics/__init__.py +++ b/evalscope/metrics/__init__.py @@ -1 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +from evalscope.metrics.metrics import bleu_ngram_one_sample, exact_match, weighted_mean +from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh + +WeightedAverageAccuracy = {'name': 'WeightedAverageAccuracy', 'object': weighted_mean} +WeightedAverageBLEU = {'name': 'WeightedAverageBLEU', 'object': weighted_mean} +Pass1 = {'name': 'Pass@1', 'object': weighted_mean} diff --git a/evalscope/models/__init__.py b/evalscope/models/__init__.py index e619a4e8..09dee522 100644 --- a/evalscope/models/__init__.py +++ b/evalscope/models/__init__.py @@ -1,3 +1,16 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from evalscope.models.model import BaseModel, ChatBaseModel +from evalscope.models.base_adapter import BaseModelAdapter, initialize_model_adapter +from evalscope.models.chat_adapter import ChatGenerationModelAdapter +from evalscope.models.choice_adapter import ContinuationLogitsModelAdapter, MultiChoiceModelAdapter +from evalscope.models.custom import CustomModel +from evalscope.models.custom_adapter import CustomModelAdapter +from evalscope.models.local_model import LocalModel, get_local_model +from evalscope.models.model import BaseModel, ChatBaseModel, OpenAIModel +from evalscope.models.server_adapter import ServerModelAdapter + +__all__ = [ + 'CustomModel', 'BaseModel', 'ChatBaseModel', 'OpenAIModel', 'BaseModelAdapter', 'ChatGenerationModelAdapter', + 'MultiChoiceModelAdapter', 'ContinuationLogitsModelAdapter', 'CustomModelAdapter', 'ServerModelAdapter', + 'LocalModel', 'get_local_model', 'initialize_model_adapter' +] diff --git a/evalscope/models/api/__init__.py b/evalscope/models/api/__init__.py deleted file mode 100644 index a19bf86e..00000000 --- a/evalscope/models/api/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
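The new `evalscope.metrics` exports above are plain name/function pairs, and `WeightedAverageAccuracy`, `WeightedAverageBLEU` and `Pass@1` all point at the same `weighted_mean` reduction; the per-adapter `compute_metric` methods deleted earlier in this diff fed it `(score, 1.0)` pairs, one per reviewed sample. A minimal standalone sketch of that arithmetic (the function body here is an assumed stand-in for `evalscope.metrics.metrics.weighted_mean`):

```python
from typing import Iterable, Tuple


def weighted_mean(items: Iterable[Tuple[float, float]]) -> float:
    """Assumed stand-in for evalscope.metrics.metrics.weighted_mean:
    sum(score * weight) / sum(weight)."""
    items = list(items)
    total_weight = sum(weight for _, weight in items)
    return sum(score * weight for score, weight in items) / total_weight


# Per-sample review results (1 = correct/passed, 0 = not), each with weight 1.0,
# matching the (score, 1.0) pairing used by the deleted compute_metric methods.
review_res_list = [1, 0, 1, 1, 0]
items = [(score, 1.0) for score in review_res_list]

print(weighted_mean(items))  # 0.6 -- with unit weights this is plain accuracy
```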
- -from evalscope.models.api.openai_api import OpenaiApi diff --git a/evalscope/models/base_adapter.py b/evalscope/models/base_adapter.py new file mode 100644 index 00000000..8eff3a0e --- /dev/null +++ b/evalscope/models/base_adapter.py @@ -0,0 +1,52 @@ +import torch +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any, Optional, Union + +from evalscope.constants import EvalType +from evalscope.models.custom import CustomModel +from evalscope.models.local_model import LocalModel + +if TYPE_CHECKING: + from evalscope.config import TaskConfig + + +class BaseModelAdapter(ABC): + + def __init__(self, model: Optional[Union[LocalModel, CustomModel]], **kwargs): + if model is None: + self.model_cfg = kwargs.get('model_cfg', None) + elif isinstance(model, LocalModel): + self.model = model.model + self.model_id = model.model_id + self.model_revision = model.model_revision + self.device = model.device + self.tokenizer = model.tokenizer + self.model_cfg = model.model_cfg + elif isinstance(model, CustomModel): + self.model_cfg = model.config + else: + raise ValueError(f'Unsupported model type: {type(model)}') + + @abstractmethod + @torch.no_grad() + def predict(self, *args, **kwargs) -> Any: + raise NotImplementedError + + +def initialize_model_adapter(task_cfg: 'TaskConfig', model_adapter_cls: 'BaseModelAdapter', base_model: 'LocalModel'): + """Initialize the model adapter based on the task configuration.""" + if task_cfg.dry_run: + from evalscope.models.model import DummyChatModel + return DummyChatModel(model_cfg=dict()) + elif task_cfg.eval_type == EvalType.CUSTOM: + if not isinstance(task_cfg.model, CustomModel): + raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.') + from evalscope.models import CustomModelAdapter + return CustomModelAdapter(custom_model=task_cfg.model) + elif task_cfg.eval_type == EvalType.SERVICE: + from evalscope.models import ServerModelAdapter + return ServerModelAdapter( + api_url=task_cfg.api_url, model_id=task_cfg.model, api_key=task_cfg.api_key, seed=task_cfg.seed) + else: + return model_adapter_cls( + model=base_model, generation_config=task_cfg.generation_config, chat_template=task_cfg.chat_template) diff --git a/evalscope/models/chat_adapter.py b/evalscope/models/chat_adapter.py new file mode 100644 index 00000000..033ee7f3 --- /dev/null +++ b/evalscope/models/chat_adapter.py @@ -0,0 +1,108 @@ +import os +import time +import torch +from modelscope import GenerationConfig +from typing import Union + +from evalscope.models.base_adapter import BaseModelAdapter +from evalscope.models.local_model import LocalModel +from evalscope.utils.chat_service import ChatCompletionResponse, ChatMessage +from evalscope.utils.logger import get_logger +from evalscope.utils.model_utils import fix_do_sample_warning + +logger = get_logger() + + +class ChatGenerationModelAdapter(BaseModelAdapter): + """ + Chat generation model adapter. 
+ """ + + def __init__(self, model: LocalModel, **kwargs): + super().__init__(model) + + self.generation_config = self._parse_generation_config(self.tokenizer, self.model) + + custom_generation_config = kwargs.pop('generation_config', None) + custom_chat_template = kwargs.pop('chat_template', None) + + if custom_generation_config: + logger.info('Updating generation config ...') + self.generation_config.update(**custom_generation_config) + + if custom_chat_template: + self.tokenizer.chat_template = custom_chat_template + logger.info(f'Using custom chat template: {custom_chat_template}') + + def _parse_generation_config(self, tokenizer, model): + generation_config = getattr(model, 'generation_config', GenerationConfig(do_sample=False)) + + try: + remote_config = GenerationConfig.from_pretrained( + self.model_id, revision=self.model_revision, trust_remote_code=True) + generation_config.update(**remote_config.to_dict()) + except Exception: + logger.warning(f'Failed to get generation config of {self.model_id} from model hub, use default.') + + if isinstance(self.model_id, str) and os.path.exists(self.model_id): + logger.warning(f'Got local model dir: {self.model_id}') + + if tokenizer.eos_token_id is not None: + generation_config.eos_token_id = tokenizer.eos_token_id + if tokenizer.pad_token_id is not None: + generation_config.pad_token_id = tokenizer.pad_token_id + if generation_config.max_new_tokens is None: + generation_config.max_new_tokens = 2048 + + return generation_config + + def _model_generate(self, query: str, infer_cfg: dict) -> str: + messages = [ChatMessage(role='user', content=query)] + formatted_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device) + input_ids = inputs['input_ids'] + + # Process infer_cfg + if isinstance(infer_cfg.get('num_return_sequences'), int) and infer_cfg['num_return_sequences'] > 1: + infer_cfg['do_sample'] = True + + # stop settings + stop = infer_cfg.get('stop', None) + eos_token_id = self.tokenizer.encode(stop, add_special_tokens=False)[0] \ + if stop else self.tokenizer.eos_token_id + + if eos_token_id is not None: + infer_cfg['eos_token_id'] = eos_token_id + infer_cfg['pad_token_id'] = eos_token_id # setting eos_token_id as pad token + + self.generation_config.update(**infer_cfg) + fix_do_sample_warning(self.generation_config) + + # Run inference + output_ids = self.model.generate(**inputs, generation_config=self.generation_config) + + response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], skip_special_tokens=True) + return response + + @torch.no_grad() + def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = {}) -> dict: + + # Process inputs + if isinstance(inputs, str): + query = inputs + elif isinstance(inputs, dict): + query = inputs['data'][0] + elif isinstance(inputs, list): + query = '\n'.join(inputs) + else: + raise TypeError(f'Unsupported inputs type: {type(inputs)}') + + response = self._model_generate(query, infer_cfg) + + choices_list = [{'index': 0, 'message': {'content': response, 'role': 'assistant'}}] + + res_d = ChatCompletionResponse( + model=self.model_id, choices=choices_list, object='chat.completion', created=int(time.time()), + usage=None).model_dump(exclude_unset=True) + + return res_d diff --git a/evalscope/models/choice_adapter.py b/evalscope/models/choice_adapter.py new file mode 100644 index 00000000..b2d403e3 --- /dev/null +++ 
b/evalscope/models/choice_adapter.py @@ -0,0 +1,214 @@ +import numpy as np +import time +import torch +from typing import List + +from evalscope.models.base_adapter import BaseModelAdapter +from evalscope.models.local_model import LocalModel +from evalscope.utils.chat_service import ChatCompletionResponse + + +class MultiChoiceModelAdapter(BaseModelAdapter): + """ The multi-choice model adapter. """ + + _DEFAULT_MAX_LENGTH = 2048 + + def __init__(self, model: LocalModel, **kwargs): + super().__init__(model) + + self._max_length = kwargs.get('max_length') + + @property + def max_length(self): + if self._max_length: + return self._max_length + seqlen_config_attrs = ('n_positions', 'max_position_embeddings', 'n_ctx') + for attr in seqlen_config_attrs: + if hasattr(self.model.config, attr): + return getattr(self.model.config, attr) + if hasattr(self.tokenizer, 'model_max_length'): + if self.tokenizer.model_max_length == 1000000000000000019884624838656: + return self._DEFAULT_MAX_LENGTH + return self.tokenizer.model_max_length + return self._DEFAULT_MAX_LENGTH + + @torch.no_grad() + def predict(self, inputs: dict, infer_cfg: dict = None) -> dict: + """ + Multi-choice model prediction func. + + Args: + inputs (dict): The inputs for a doc. Format: + {'data': [full_prompt], 'multi_choices': ['A', 'B', 'C', 'D']} + + infer_cfg (dict): inference configuration. + + Returns: + res (dict): The model prediction results. Format: + { + 'choices': [ + { + 'index': 0, + 'message': { + 'content': [-14.9609, -13.6015, ...], # loglikelihood values for inputs context-continuation pairs. + 'role': 'assistant' + } + } + ], + 'created': 1677664795, + # For models on the ModelScope or HuggingFace, concat model_id and revision with "-". + 'model': 'gpt-3.5-turbo-0613', + 'object': 'chat.completion', + 'usage': { + 'completion_tokens': 17, + 'prompt_tokens': 57, + 'total_tokens': 74 + } + } + """ + infer_cfg = infer_cfg or {} + self.model.generation_config.update(**infer_cfg) + + input_data = inputs['data'] + multi_choices = inputs['multi_choices'] + + output, input_info = self._get_logits(self.tokenizer, self.model, input_data) + assert output.shape[0] == 1 + logits = output.flatten() + + choice_logits = [logits[self.tokenizer(ch)['input_ids'][-1:]] for ch in multi_choices] + softval = torch.nn.functional.softmax(torch.tensor(choice_logits).float(), dim=0) + + if softval.dtype in {torch.bfloat16, torch.float16}: + softval = softval.to(dtype=torch.float32) + probs = softval.detach().cpu().numpy() + pred: str = multi_choices[int(np.argmax(probs))] # Format: A or B or C or D + + res_d = ChatCompletionResponse( + model=self.model_id, + choices=[{ + 'index': 0, + 'message': { + 'content': pred, + 'role': 'assistant' + } + }], + object='chat.completion', + created=int(time.time()), + usage=None).model_dump(exclude_unset=True) + + return res_d + + @staticmethod + def _get_logits(tokenizer, model, inputs: List[str]): + input_ids = tokenizer(inputs, padding=False)['input_ids'] + input_ids = torch.tensor(input_ids, device=model.device) + tokens = {'input_ids': input_ids} + + outputs = model(input_ids)['logits'] + logits = outputs[:, -1, :] + log_probs = torch.nn.functional.softmax(logits, dim=-1) + return log_probs, {'tokens': tokens} + + +class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter): + """ + Continuation-logits model adapter. 
+ """ + + def __init__(self, model: LocalModel, **kwargs): + super().__init__(model, **kwargs) + + @torch.no_grad() + def predict(self, inputs: dict, infer_cfg: dict = None) -> dict: + """ + Multi-choice model prediction func. + Args: + inputs (dict): The inputs for a doc. Format: + {'data': [(context, continuation), ...]} + infer_cfg (dict): inference configuration. + Returns: + res (dict): The model prediction results. Format: + { + 'choices': [ + { + 'index': 0, + 'message': { + 'content': [-14.9609, -13.6015, ...], # loglikelihood values for inputs context-continuation pairs. + 'role': 'assistant' + } + } + ], + 'created': 1677664795, + # For models on the ModelScope or HuggingFace, concat model_id and revision with "-". + 'model': 'gpt-3.5-turbo-0613', + 'object': 'chat.completion', + 'usage': { + 'completion_tokens': 17, + 'prompt_tokens': 57, + 'total_tokens': 74 + } + } + """ + infer_cfg = infer_cfg or {} + + pred_list: list = self.loglikelihood(inputs=inputs['data'], infer_cfg=infer_cfg) + + res_d = ChatCompletionResponse( + model=self.model_id, + choices=[{ + 'index': 0, + 'message': { + 'content': pred_list, + 'role': 'assistant' + } + }], + object='chat.completion', + created=int(time.time()), + usage=None).model_dump(exclude_unset=True) + + return res_d + + def loglikelihood(self, inputs: list, infer_cfg: dict = None) -> list: + self.model.generation_config.update(**infer_cfg) + # To predict one doc + doc_ele_pred = [] + for ctx, continuation in inputs: + + # ctx_enc shape: [context_tok_len] cont_enc shape: [continuation_tok_len] + ctx_enc, cont_enc = self._encode_pair(ctx, continuation) + + inputs_tokens = torch.tensor( + (ctx_enc.tolist() + cont_enc.tolist())[-(self.max_length + 1):][:-1], + dtype=torch.long, + device=self.model.device).unsqueeze(0) + + logits = self.model(inputs_tokens)[0] + logits = torch.nn.functional.log_softmax(logits.float(), dim=-1) + + logits = logits[:, -len(cont_enc):, :] + cont_enc = cont_enc.unsqueeze(0).unsqueeze(-1) + logits = torch.gather(logits.cpu(), 2, cont_enc.cpu()).squeeze(-1) + + choice_score = float(logits.sum()) + doc_ele_pred.append(choice_score) + + # e.g. [-2.3, -9.2, -12.9, 1.1], length=len(choices) + return doc_ele_pred + + def _encode_pair(self, context, continuation): + n_spaces = len(context) - len(context.rstrip()) + if n_spaces > 0: + continuation = context[-n_spaces:] + continuation + context = context[:-n_spaces] + + whole_enc = self.tokenizer(context + continuation, padding=False)['input_ids'] + whole_enc = torch.tensor(whole_enc, device=self.device) + + context_enc = self.tokenizer(context, padding=False)['input_ids'] + context_enc = torch.tensor(context_enc, device=self.device) + + context_enc_len = len(context_enc) + continuation_enc = whole_enc[context_enc_len:] + + return context_enc, continuation_enc diff --git a/evalscope/models/custom_adapter.py b/evalscope/models/custom_adapter.py new file mode 100644 index 00000000..fb279feb --- /dev/null +++ b/evalscope/models/custom_adapter.py @@ -0,0 +1,67 @@ +from typing import Any, Dict, List, Union + +from evalscope.models.base_adapter import BaseModelAdapter +from evalscope.models.custom import CustomModel + + +class CustomModelAdapter(BaseModelAdapter): + + def __init__(self, custom_model: CustomModel, **kwargs): + """ + Custom model adapter. + + Args: + custom_model: The custom model instance. + **kwargs: Other args. 
+ """ + self.custom_model = custom_model + super(CustomModelAdapter, self).__init__(model=custom_model) + + def predict(self, inputs: Union[str, dict, list], **kwargs) -> List[Dict[str, Any]]: + """ + Model prediction func. + + Args: + inputs (Union[str, dict, list]): The input data. Depending on the specific model. + str: 'xxx' + dict: {'data': [full_prompt]} + list: ['xxx', 'yyy', 'zzz'] + **kwargs: kwargs + + Returns: + res (dict): The model prediction results. Format: + { + 'choices': [ + { + 'index': 0, + 'message': { + 'content': 'xxx', + 'role': 'assistant' + } + } + ], + 'created': 1677664795, + 'model': 'gpt-3.5-turbo-0613', # should be model_id + 'object': 'chat.completion', + 'usage': { + 'completion_tokens': 17, + 'prompt_tokens': 57, + 'total_tokens': 74 + } + } + """ + in_prompts = [] + + # Note: here we assume the inputs are all prompts for the benchmark. + for input_prompt in inputs: + if isinstance(input_prompt, str): + in_prompts.append(input_prompt) + elif isinstance(input_prompt, dict): + # TODO: to be supported for continuation list like truthful_qa + in_prompts.append(input_prompt['data'][0]) + elif isinstance(input_prompt, list): + in_prompts.append('\n'.join(input_prompt)) + else: + raise TypeError(f'Unsupported inputs type: {type(input_prompt)}') + + return self.custom_model.predict(prompts=in_prompts, **kwargs) diff --git a/evalscope/models/dummy_chat_model.py b/evalscope/models/dummy_chat_model.py deleted file mode 100644 index 578b5f59..00000000 --- a/evalscope/models/dummy_chat_model.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -import random -import time - -from evalscope.models import ChatBaseModel -from evalscope.utils.logger import get_logger - -logger = get_logger() - - -class DummyChatModel(ChatBaseModel): - - MODEL_ID = 'dummy_chat_model_0801' - REVISION = 'v1.0.0' - - def __init__(self, model_cfg: dict, **kwargs): - model_cfg['model_id'] = self.MODEL_ID - model_cfg['revision'] = self.REVISION - super(DummyChatModel, self).__init__(model_cfg=model_cfg) - - def predict(self, inputs: dict, **kwargs) -> dict: - - debug: bool = False - if debug: - messages = inputs['messages'] - history = inputs['history'] - - logger.info(f'** messages: {messages}') - logger.info(f'** history: {history}') - - choice = random.choice(['A', 'B', 'C', 'D']) - - # Build response - res = { - 'choices': [{ - 'index': 0, - 'message': { - 'content': choice, - 'role': 'assistant' - } - }], - 'created': time.time(), - 'model': self.MODEL_ID + '-' + self.REVISION, - 'object': 'chat.completion', - 'usage': {} - } - - return res diff --git a/evalscope/models/local_model.py b/evalscope/models/local_model.py new file mode 100644 index 00000000..502e0643 --- /dev/null +++ b/evalscope/models/local_model.py @@ -0,0 +1,74 @@ +import torch +from modelscope import AutoModelForCausalLM, AutoTokenizer +from torch import dtype +from typing import TYPE_CHECKING, Optional + +from evalscope.constants import DEFAULT_MODEL_CACHE_DIR, DEFAULT_MODEL_REVISION, EvalType +from evalscope.utils.logger import get_logger + +if TYPE_CHECKING: + from evalscope.config import TaskConfig + +logger = get_logger() + + +class LocalModel: + + def __init__(self, + model_id: str, + model_revision: str = DEFAULT_MODEL_REVISION, + device_map: str = 'auto', + torch_dtype: dtype = torch.bfloat16, + cache_dir: str = None, + **kwargs): + model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR + + self.model_id = model_id + self.model_revision = model_revision + self.device = 
torch.device('cuda' if torch.cuda.is_available() else 'cpu') + logger.info(f'Device: {self.device}') + + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, + revision=model_revision, + trust_remote_code=True, + cache_dir=model_cache_dir, + ) + + self.model = AutoModelForCausalLM.from_pretrained( + self.model_id, + revision=model_revision, + device_map=device_map, + trust_remote_code=True, + torch_dtype=torch_dtype, + cache_dir=model_cache_dir, + ) + + self.model_cfg = { + 'model_id': model_id, + 'device_map': device_map, + 'torch_dtype': str(torch_dtype), + } + + +def get_local_model(task_cfg: 'TaskConfig') -> Optional[LocalModel]: + """Get the base local model for the task. If the task is not checkpoint-based, return None. + Avoids loading model multiple times for different datasets. + """ + if task_cfg.eval_type != EvalType.CHECKPOINT: + return None + else: + device_map = task_cfg.model_args.get('device_map', 'auto') if torch.cuda.is_available() else None + cache_dir = task_cfg.model_args.get('cache_dir', None) + model_precision = task_cfg.model_args.get('precision', torch.float16) + model_revision = task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION) + if isinstance(model_precision, str) and model_precision != 'auto': + model_precision = eval(model_precision) + + base_model = LocalModel( + model_id=task_cfg.model, + model_revision=model_revision, + device_map=device_map, + torch_dtype=model_precision, + cache_dir=cache_dir) + return base_model diff --git a/evalscope/models/model.py b/evalscope/models/model.py index 826fb879..7f32f7b9 100644 --- a/evalscope/models/model.py +++ b/evalscope/models/model.py @@ -1,7 +1,15 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import openai +import os +import random +import time from abc import ABC, abstractmethod from typing import Any +from evalscope.utils.logger import get_logger + +logger = get_logger() + class BaseModel(ABC): @@ -86,3 +94,136 @@ def predict(self, inputs: dict, **kwargs) -> dict: } """ raise NotImplementedError + + +class OpenAIModel(ChatBaseModel): + """ + APIs of OpenAI models. 
+ Available models: gpt-3.5-turbo, gpt-4 + """ + + MAX_RETRIES = 3 + + def __init__(self, model_cfg: dict, **kwargs): + super(OpenAIModel, self).__init__(model_cfg=model_cfg, **kwargs) + + openai_api_key = os.environ.get('OPENAI_API_KEY', None) + self.api_key = self.model_cfg.get('api_key', openai_api_key) + + if not self.api_key: + logger.error('OpenAI API key is not provided, please set it in environment variable OPENAI_API_KEY') + # raise ValueError( + # 'OpenAI API key is not provided, ' + # 'please set it in environment variable OPENAI_API_KEY') + + def predict(self, model_id: str, inputs: dict, **kwargs) -> dict: + + sys_prompt: str = inputs.get('sys_prompt', '') + user_prompt: str = inputs.get('user_prompt', '') + + # model_id: str = kwargs.get('model_id', '') + temperature: float = kwargs.pop('temperature', 0.2) + max_tokens: int = kwargs.pop('max_tokens', 1024) + mode: str = kwargs.pop('mode', 'chat.completion') + + logger.info(f'Using OpenAI model_id: {model_id}') + + res = self._predict( + model_id=model_id, + sys_prompt=sys_prompt, + user_prompt=user_prompt, + temperature=temperature, + max_tokens=max_tokens, + mode=mode) + + return res + + def _predict( + self, + model_id, + sys_prompt, + user_prompt, + temperature, + max_tokens, + mode: str = 'chat.completion', + ) -> dict: + + res = {} + openai.api_key = self.api_key + + for i in range(self.MAX_RETRIES): + try: + if mode == 'chat.completion': + resp = openai.ChatCompletion.create( + model=model_id, + messages=[{ + 'role': 'system', + 'content': sys_prompt + }, { + 'role': 'user', + 'content': user_prompt + }], + temperature=temperature, + max_tokens=max_tokens) + + if resp: + ans_text = resp['choices'][0]['message']['content'] + model_id = resp['model'] + else: + logger.warning(f'OpenAI GPT API call failed: got empty response ' + f'for input {sys_prompt} {user_prompt}') + ans_text = '' + model_id = '' + + res['ans_text'] = ans_text + res['model_id'] = model_id + else: + raise ValueError(f'Invalid mode: {mode}') + + return res + + except Exception as e: + logger.warning(f'OpenAI API call failed: {e}') + time.sleep(3) + logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries') + return res + + +class DummyChatModel(ChatBaseModel): + + MODEL_ID = 'dummy_chat_model_0801' + REVISION = 'v1.0.0' + + def __init__(self, model_cfg: dict, **kwargs): + model_cfg['model_id'] = self.MODEL_ID + model_cfg['revision'] = self.REVISION + super(DummyChatModel, self).__init__(model_cfg=model_cfg) + + def predict(self, inputs: dict, **kwargs) -> dict: + + debug: bool = False + if debug: + messages = inputs['messages'] + history = inputs['history'] + + logger.info(f'** messages: {messages}') + logger.info(f'** history: {history}') + + choice = random.choice(['A', 'B', 'C', 'D']) + + # Build response + res = { + 'choices': [{ + 'index': 0, + 'message': { + 'content': choice, + 'role': 'assistant' + } + }], + 'created': time.time(), + 'model': self.MODEL_ID + '-' + self.REVISION, + 'object': 'chat.completion', + 'usage': {} + } + + return res diff --git a/evalscope/models/openai_model.py b/evalscope/models/openai_model.py deleted file mode 100644 index 3caa9c4b..00000000 --- a/evalscope/models/openai_model.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -import openai -import os -import time - -from evalscope.models import ChatBaseModel -from evalscope.utils.logger import get_logger - -logger = get_logger() - - -class OpenAIModel(ChatBaseModel): - """ - APIs of OpenAI models. 
- Available models: gpt-3.5-turbo, gpt-4 - """ - - MAX_RETRIES = 3 - - def __init__(self, model_cfg: dict, **kwargs): - super(OpenAIModel, self).__init__(model_cfg=model_cfg, **kwargs) - - openai_api_key = os.environ.get('OPENAI_API_KEY', None) - self.api_key = self.model_cfg.get('api_key', openai_api_key) - - if not self.api_key: - logger.error('OpenAI API key is not provided, please set it in environment variable OPENAI_API_KEY') - # raise ValueError( - # 'OpenAI API key is not provided, ' - # 'please set it in environment variable OPENAI_API_KEY') - - def predict(self, model_id: str, inputs: dict, **kwargs) -> dict: - - sys_prompt: str = inputs.get('sys_prompt', '') - user_prompt: str = inputs.get('user_prompt', '') - - # model_id: str = kwargs.get('model_id', '') - temperature: float = kwargs.pop('temperature', 0.2) - max_tokens: int = kwargs.pop('max_tokens', 1024) - mode: str = kwargs.pop('mode', 'chat.completion') - - logger.info(f'Using OpenAI model_id: {model_id}') - - res = self._predict( - model_id=model_id, - sys_prompt=sys_prompt, - user_prompt=user_prompt, - temperature=temperature, - max_tokens=max_tokens, - mode=mode) - - return res - - def _predict( - self, - model_id, - sys_prompt, - user_prompt, - temperature, - max_tokens, - mode: str = 'chat.completion', - ) -> dict: - - res = {} - openai.api_key = self.api_key - - for i in range(self.MAX_RETRIES): - try: - if mode == 'chat.completion': - resp = openai.ChatCompletion.create( - model=model_id, - messages=[{ - 'role': 'system', - 'content': sys_prompt - }, { - 'role': 'user', - 'content': user_prompt - }], - temperature=temperature, - max_tokens=max_tokens) - - if resp: - ans_text = resp['choices'][0]['message']['content'] - model_id = resp['model'] - else: - logger.warning(f'OpenAI GPT API call failed: got empty response ' - f'for input {sys_prompt} {user_prompt}') - ans_text = '' - model_id = '' - - res['ans_text'] = ans_text - res['model_id'] = model_id - else: - raise ValueError(f'Invalid mode: {mode}') - - return res - - except Exception as e: - logger.warning(f'OpenAI API call failed: {e}') - time.sleep(3) - logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries') - return res diff --git a/evalscope/models/server_adapter.py b/evalscope/models/server_adapter.py new file mode 100644 index 00000000..8f93caa8 --- /dev/null +++ b/evalscope/models/server_adapter.py @@ -0,0 +1,90 @@ +import requests +import time +from typing import Union + +from evalscope.models.base_adapter import BaseModelAdapter +from evalscope.utils.logger import get_logger + +logger = get_logger() + + +class ServerModelAdapter(BaseModelAdapter): + """ + Server model adapter to request remote API model and generate results. + """ + + def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs): + """ + Args: + api_url: The URL of the remote API model. + model_id: The ID of the remote API model. + api_key: The API key of the remote API model. + """ + self.api_url = api_url + self.model_id = model_id + self.api_key = api_key + self.seed = kwargs.get('seed', None) + self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key} + super().__init__(model=None, model_cfg=self.model_cfg, **kwargs) + + def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = None) -> dict: + """ + Model prediction func. + + Args: + inputs (Union[str, dict, list]): The input data. + infer_cfg (dict): Inference configuration. + + Returns: + res (dict): The model prediction results. 
+ """ + infer_cfg = infer_cfg or {} + + # Process inputs + if isinstance(inputs, str): + query = inputs + elif isinstance(inputs, dict): + # TODO: to be supported for continuation list like truthful_qa + query = inputs['data'][0] + elif isinstance(inputs, list): + query = '\n'.join(inputs) + else: + raise TypeError(f'Unsupported inputs type: {type(inputs)}') + + request_json = self.make_request(query, infer_cfg) + return self.send_request(request_json) + + def make_request(self, query: str, infer_cfg: dict) -> dict: + """Make request to remote API.""" + # Format request JSON according to OpenAI API format + # do not sample by default + request_json = { + 'model': self.model_id, + 'messages': [{ + 'role': 'user', + 'content': query + }], + 'max_tokens': infer_cfg.get('max_tokens', 2048), + 'temperature': infer_cfg.get('temperature', 0.0), + 'top_p': infer_cfg.get('top_p', 1.0), + 'n': infer_cfg.get('num_return_sequences', 1), + 'stop': infer_cfg.get('stop', None) + } + if self.seed is not None: + request_json['seed'] = self.seed + logger.debug(f'Request to remote API: {request_json}') + return request_json + + def send_request(self, request_json: dict, max_retries: int = 3) -> dict: + for attempt in range(max_retries): + response = requests.post( + self.api_url, json=request_json, headers={'Authorization': f'Bearer {self.api_key}'}) + if response.status_code == 200: + response_data = response.json() + return response_data + logger.warning(f'Failed to request to remote API: {response.status_code} {response.text}') + if attempt < max_retries - 1: + time.sleep(5) # Sleep for 5 seconds before retrying + else: + raise RuntimeError(f'Failed to request to remote API after {max_retries} attempts: ' + f'{response.status_code} {response.text}') diff --git a/evalscope/run.py b/evalscope/run.py index a1841545..998ecc99 100644 --- a/evalscope/run.py +++ b/evalscope/run.py @@ -2,27 +2,23 @@ """ Run evaluation for LLMs. """ -import logging import os.path -import torch from argparse import Namespace from datetime import datetime from typing import List, Optional, Union from evalscope.arguments import parse_args +from evalscope.benchmarks import Benchmark, BenchmarkMeta from evalscope.config import TaskConfig, parse_task_config -from evalscope.constants import DEFAULT_MODEL_REVISION, DEFAULT_WORK_DIR, EvalBackend, EvalType +from evalscope.constants import DEFAULT_WORK_DIR, EvalBackend from evalscope.evaluator import Evaluator -from evalscope.models.custom import CustomModel -from evalscope.utils import import_module_util, seed_everything +from evalscope.models import LocalModel, get_local_model, initialize_model_adapter +from evalscope.utils import seed_everything from evalscope.utils.io_utils import OutputsStructure, are_paths_same from evalscope.utils.logger import configure_logging, get_logger logger = get_logger() -BENCHMARK_PATH_PREFIX = 'evalscope.benchmarks.' 
-MEMBERS_TO_IMPORT = ['DATASET_ID', 'SUBSET_LIST', 'DataAdapterClass', 'ModelAdapterClass'] - def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]) -> Union[dict, List[dict]]: """Run evaluation task(s) based on the provided configuration.""" @@ -38,15 +34,13 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace] def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict: """Run a single evaluation task.""" - seed_everything(task_cfg.seed) + if task_cfg.seed is not None: + seed_everything(task_cfg.seed) outputs = setup_work_directory(task_cfg, run_time) configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log')) - task_cfg.dump_yaml(outputs.configs_dir) - logger.info(task_cfg) - if task_cfg.eval_backend != EvalBackend.NATIVE: - return run_non_native_backend(task_cfg) + return run_non_native_backend(task_cfg, outputs) else: return evaluate_model(task_cfg, outputs) @@ -68,7 +62,7 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str): return outputs -def run_non_native_backend(task_cfg: TaskConfig) -> dict: +def run_non_native_backend(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict: """Run evaluation using a non-native backend.""" eval_backend = task_cfg.eval_backend eval_config = task_cfg.eval_config @@ -78,6 +72,10 @@ def run_non_native_backend(task_cfg: TaskConfig) -> dict: backend_manager_class = get_backend_manager_class(eval_backend) backend_manager = backend_manager_class(config=eval_config) + + task_cfg.dump_yaml(outputs.configs_dir) + logger.info(task_cfg) + backend_manager.run() return dict() @@ -102,75 +100,48 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict: """Evaluate the model based on the provided task configuration.""" # Initialize evaluator eval_results = {} - + base_model = get_local_model(task_cfg) + evaluators = [] for dataset_name in task_cfg.datasets: - evaluator = create_evaluator(task_cfg, dataset_name, outputs) + evaluator = create_evaluator(task_cfg, dataset_name, outputs, base_model) + evaluators.append(evaluator) + + # dump task_cfg to outputs.configs_dir after creating evaluators + task_cfg.dump_yaml(outputs.configs_dir) + logger.info(task_cfg) + + for evaluator in evaluators: res_dict = evaluator.eval(infer_cfg=task_cfg.generation_config, debug=task_cfg.debug, limit=task_cfg.limit) eval_results[dataset_name] = res_dict return eval_results -def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure): +def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: LocalModel): """Create an evaluator object for the specified dataset.""" - imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, MEMBERS_TO_IMPORT) - model_adapter = initialize_model_adapter(task_cfg, dataset_name, imported_modules) - - dataset_config = task_cfg.dataset_args.get(dataset_name, {}) - dataset_name_or_path = dataset_config.get('local_path') or imported_modules['DATASET_ID'] - in_prompt_template = dataset_config.get('prompt_template', '') - few_shot_num = dataset_config.get('few_shot_num', None) - few_shot_random = dataset_config.get('few_shot_random', True) - - data_adapter = imported_modules['DataAdapterClass']( - few_shot_num=few_shot_num, - few_shot_random=few_shot_random, - prompt_template=in_prompt_template, - outputs=outputs, - ) - in_subset_list = dataset_config.get('subset_list', imported_modules['SUBSET_LIST']) - logger.info(f'Evaluating on subsets for 
{dataset_name}: {in_subset_list}\n') + if dataset_name == 'data_collection': + # EvaluatorCollection is a collection of evaluators + from evalscope.collections import EvaluatorCollection + return EvaluatorCollection(task_cfg, outputs) + + benchmark: BenchmarkMeta = Benchmark.get(dataset_name) + + data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {})) + model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model) + + # update task_cfg.dataset_args + task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict() return Evaluator( - dataset_name_or_path=dataset_name_or_path, - subset_list=in_subset_list, + dataset_name_or_path=benchmark.dataset_id, data_adapter=data_adapter, model_adapter=model_adapter, - use_cache=task_cfg.use_cache, outputs=outputs, - datasets_dir=task_cfg.dataset_dir, - datasets_hub=task_cfg.dataset_hub, - stage=task_cfg.stage, - eval_type=task_cfg.eval_type, - overall_task_cfg=task_cfg, + task_cfg=task_cfg, ) -def initialize_model_adapter(task_cfg: TaskConfig, dataset_name: str, imported_modules): - """Initialize the model adapter based on the task configuration.""" - if task_cfg.dry_run: - from evalscope.models.dummy_chat_model import DummyChatModel - return DummyChatModel(model_cfg=dict()) - elif task_cfg.eval_type == EvalType.CUSTOM: - if not isinstance(task_cfg.model, CustomModel): - raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.') - from evalscope.models.model_adapter import CustomModelAdapter - return CustomModelAdapter(custom_model=task_cfg.model) - else: - device_map = task_cfg.model_args.get('device_map', 'auto') if torch.cuda.is_available() else None - model_precision = task_cfg.model_args.get('precision', torch.float16) - if isinstance(model_precision, str) and model_precision != 'auto': - model_precision = eval(model_precision) - return imported_modules['ModelAdapterClass']( - model_id=task_cfg.model, - model_revision=task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION), - device_map=device_map, - torch_dtype=model_precision, - generation_config=task_cfg.generation_config, - chat_template=task_cfg.chat_template) - - def main(): args = parse_args() run_task(args) diff --git a/evalscope/models/api/openai_api.py b/evalscope/third_party/longbench_write/tools/openai_api.py similarity index 100% rename from evalscope/models/api/openai_api.py rename to evalscope/third_party/longbench_write/tools/openai_api.py diff --git a/evalscope/utils/__init__.py b/evalscope/utils/__init__.py index b3cf1c35..56e6a260 100644 --- a/evalscope/utils/__init__.py +++ b/evalscope/utils/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from evalscope.constants import * +from evalscope.utils.model_utils import EvalBackend from evalscope.utils.utils import * diff --git a/evalscope/utils/chat_service.py b/evalscope/utils/chat_service.py index 6e4a4a77..6df4fd96 100644 --- a/evalscope/utils/chat_service.py +++ b/evalscope/utils/chat_service.py @@ -7,7 +7,7 @@ from pydantic import BaseModel, Field from threading import Thread from transformers import TextIteratorStreamer -from typing import List, Literal, Optional, Union +from typing import Any, List, Literal, Optional, Union class Usage(BaseModel): @@ -66,7 +66,7 @@ class ChatCompletionResponseStreamChoice(BaseModel): class ChatCompletionResponse(BaseModel): model: str object: Literal['chat.completion', 'chat.completion.chunk'] - choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]] + choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, Any]] created: Optional[int] = Field(default_factory=lambda: int(time.time())) usage: Optional[Usage] diff --git a/evalscope/utils/io_utils.py b/evalscope/utils/io_utils.py index daecd2b4..c4fc2e7a 100644 --- a/evalscope/utils/io_utils.py +++ b/evalscope/utils/io_utils.py @@ -160,3 +160,11 @@ def are_paths_same(path1, path2): real_path2 = os.path.realpath(os.path.abspath(os.path.expanduser(path2))) return real_path1 == real_path2 + + +def dict_to_json(d: dict, json_file: str): + """ + Dump dict to json file. + """ + with open(json_file, 'w') as f: + json.dump(d, f, indent=4, ensure_ascii=False) diff --git a/evalscope/utils/model_utils.py b/evalscope/utils/model_utils.py index 0bdbec87..3fc895d4 100644 --- a/evalscope/utils/model_utils.py +++ b/evalscope/utils/model_utils.py @@ -1,6 +1,15 @@ +from enum import Enum from transformers import GenerationConfig +class EvalBackend(Enum): + NATIVE = 'Native' + OPEN_COMPASS = 'OpenCompass' + VLM_EVAL_KIT = 'VLMEvalKit' + RAG_EVAL = 'RAGEval' + THIRD_PARTY = 'ThirdParty' + + def fix_do_sample_warning(generation_config: GenerationConfig) -> None: # Use the default values of temperature/top_p/top_k in generation_config. if generation_config.temperature == 0: diff --git a/evalscope/utils/utils.py b/evalscope/utils/utils.py index df11c743..ed3e4d2a 100644 --- a/evalscope/utils/utils.py +++ b/evalscope/utils/utils.py @@ -199,27 +199,6 @@ def parse_last_option(text: str, options: str) -> str: -def import_module_util(import_path_prefix: str, module_name: str, members_to_import: list) -> dict: - """ - Import module utility function. - - Args: - import_path_prefix: e.g. 'evalscope.benchmarks.' - module_name: The module name to import. e.g. 'mmlu' - members_to_import: The members to import. - e.g. ['DATASET_ID', 'SUBJECT_MAPPING', 'SUBSET_LIST', 'DataAdapterClass'] - - Returns: - dict: imported modules map. e.g. {'DATASET_ID': 'mmlu', 'SUBJECT_MAPPING': {...}, ...} - """ - imported_modules = {} - module = importlib.import_module(import_path_prefix + module_name) - for member_name in members_to_import: - imported_modules[member_name] = getattr(module, member_name) - - return imported_modules - - def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]: """ Normalize score. 
diff --git a/examples/tasks/eval_vlm_swift.yaml b/examples/tasks/eval_vlm_swift.yaml index f3e76f71..d55b2673 100644 --- a/examples/tasks/eval_vlm_swift.yaml +++ b/examples/tasks/eval_vlm_swift.yaml @@ -4,7 +4,7 @@ eval_config: model: - type: internvl2-8b # model id of the model name: CustomAPIModel # Don't change, must be CustomAPIModel for deploy evaluation - api_base: http://localhost:8000/v1/chat/completions # deployed model api + api_base: http://localhost:8801/v1/chat/completions # deployed model api key: EMPTY temperature: 0.0 img_size: 224 @@ -21,12 +21,14 @@ eval_config: # - AI2D_TEST # - POPE # - RealWorldQA - - SEEDBench2_Plus + # - SEEDBench2_Plus + - MME mode: all - limit: 10 + limit: 2 reuse: true nproc: 1 + judge: exact_matching # judge model server config - OPENAI_API_KEY: EMPTY - OPENAI_API_BASE: http://localhost:11434/v1/chat/completions # judge model api - LOCAL_LLM: llama3.1:latest # judge model type + # OPENAI_API_KEY: EMPTY + # OPENAI_API_BASE: http://localhost:11434/v1/chat/completions # judge model api + # LOCAL_LLM: llama3.1:latest # judge model type diff --git a/tests/cli/test_collection.py b/tests/cli/test_collection.py new file mode 100644 index 00000000..72ca3a55 --- /dev/null +++ b/tests/cli/test_collection.py @@ -0,0 +1,54 @@ +import json +import unittest + +from evalscope.collections.data_generator import WeightedSampler +from evalscope.collections.schema import CollectionSchema, DatasetInfo +from evalscope.constants import EvalType +from evalscope.run import run_task +from evalscope.utils.io_utils import dump_jsonl_data +from evalscope.utils.utils import test_level_list + + +class TestCollection(unittest.TestCase): + @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level') + def test_create_collection(self): + schema = CollectionSchema(name='math&reasoning', datasets=[ + CollectionSchema(name='math', datasets=[ + DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']), + DatasetInfo(name='competition_math', weight=1, task_type='math', tags=['en', 'math']), + DatasetInfo(name='cmmlu', weight=2, task_type='math', tags=['zh', 'math'], args={'subset_list': ['college_mathematics', 'high_school_mathematics']}), + DatasetInfo(name='ceval', weight=3, task_type='math', tags=['zh', 'math'], args={'subset_list': ['advanced_mathematics', 'high_school_mathematics', 'discrete_mathematics', 'middle_school_mathematics']}), + ]), + CollectionSchema(name='reasoning', datasets=[ + DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']), + DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh', 'reasoning'], args={'subset_list': ['logic']}), + DatasetInfo(name='race', weight=1, task_type='reasoning', tags=['en', 'reasoning']), + ]), + ]) + print(schema.to_dict()) + print(schema.flatten()) + schema.dump_json('outputs/schema_test.json') + + + @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level') + def test_generate_data(self): + schema = CollectionSchema.from_dict(json.load(open('outputs/schema_test.json', 'r'))) + print(schema.to_dict()) + mixed_data = WeightedSampler(schema, 100).sample() + dump_jsonl_data(mixed_data, 'outputs/mixed_data_test.jsonl') + + @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level') + def test_evaluate_collection(self): + from evalscope.config import TaskConfig + + task_cfg = TaskConfig( + model='qwen2.5', + api_url='http://127.0.0.1:8801/v1/chat/completions', + api_key='EMPTY', + eval_type=EvalType.SERVICE, + 
datasets=['data_collection'], + dataset_args={'data_collection': { + 'local_path': 'outputs/mixed_data_test.jsonl' + }}, + ) + run_task(task_cfg=task_cfg) diff --git a/tests/cli/test_run.py b/tests/cli/test_run.py index fb01245f..7b8fddec 100644 --- a/tests/cli/test_run.py +++ b/tests/cli/test_run.py @@ -4,6 +4,7 @@ import torch import unittest +from evalscope.constants import EvalType from evalscope.run import run_task from evalscope.utils import is_module_installed, test_level_list from evalscope.utils.logger import get_logger @@ -70,7 +71,18 @@ def test_run_eval_with_args(self): @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level') def test_run_task(self): - task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['bbh', 'gsm8k', 'arc'], 'limit': 2, 'debug': False} + task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', + 'datasets': [ + # 'bbh', + # 'hellaswag', + # 'gsm8k', + # 'arc' + 'race', + 'truthful_qa', + 'trivia_qa', + ], + 'limit': 2, + 'debug': True} run_task(task_cfg=task_cfg) @@ -110,5 +122,34 @@ def test_run_humaneval(self): run_task(task_cfg=task_cfg) + @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level') + def test_run_server_model(self): + from evalscope.config import TaskConfig + + task_cfg = TaskConfig( + model='qwen2.5', + api_url='http://127.0.0.1:8801/v1/chat/completions', + api_key='EMPTY', + eval_type=EvalType.SERVICE, + datasets=[ + # 'mmlu', + # 'race', + 'trivia_qa', + # 'cmmlu', + # 'humaneval', + # 'competition_math', + # 'gsm8k', + # 'arc', + # 'ceval', + # 'bbh', + # 'hellaswag', + ], + limit=20, + debug=True + ) + + run_task(task_cfg=task_cfg) + + if __name__ == '__main__': unittest.main() diff --git a/tests/rag/test_mteb.py b/tests/rag/test_mteb.py index f80e0023..66d494ad 100644 --- a/tests/rag/test_mteb.py +++ b/tests/rag/test_mteb.py @@ -79,7 +79,7 @@ def test_run_two_stage_mteb(self): }, }, { - 'model_name_or_path': 'OpenBMB/MiniCPM-Reranker', + 'model_name_or_path': 'BAAI/bge-reranker-v2-m3', 'is_cross_encoder': True, 'max_seq_length': 512, 'prompt': '为这个问题生成一个检索用的表示', @@ -94,7 +94,8 @@ def test_run_two_stage_mteb(self): 'verbosity': 2, 'output_folder': 'outputs', 'overwrite_results': True, - 'limits': 10, + # 'limits': 10, + 'top_k': 10, }, }, }
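
The `ServerModelAdapter` introduced above is what `initialize_model_adapter` selects for `EvalType.SERVICE`: it formats each prompt as an OpenAI-style `chat/completions` request and posts it to the configured endpoint. A minimal sketch of driving it directly, assuming a locally deployed OpenAI-compatible server at a hypothetical `http://127.0.0.1:8801` serving a model named `qwen2.5`:

```python
from evalscope.models import ServerModelAdapter

# Hypothetical endpoint and model name; any OpenAI-compatible chat/completions server should work.
adapter = ServerModelAdapter(
    api_url='http://127.0.0.1:8801/v1/chat/completions',
    model_id='qwen2.5',
    api_key='EMPTY',
)

# predict() accepts a plain prompt string, a {'data': [prompt]} dict, or a list of strings;
# it builds the request via make_request() and returns the server's JSON response as-is.
res = adapter.predict('Which number is larger, 9.9 or 9.11?',
                      infer_cfg={'max_tokens': 64, 'temperature': 0.0})
print(res['choices'][0]['message']['content'])
```

The last line assumes the server returns a standard `chat.completion` payload; `send_request` itself only checks for HTTP 200 and retries up to three times before raising.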
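
For `EvalType.CUSTOM`, the new `CustomModelAdapter` flattens benchmark inputs into a list of prompt strings and forwards them to the user's model via `predict(prompts=...)`. A toy sketch, assuming `CustomModel` accepts a `config` dict and that `predict` should return one OpenAI-style result per prompt, as the adapter code above implies:

```python
from typing import Any, Dict, List

from evalscope.models import CustomModelAdapter
from evalscope.models.custom import CustomModel


class EchoModel(CustomModel):
    """Toy model that echoes each prompt back; for illustration only."""

    def __init__(self):
        # Assumption: CustomModel keeps a config dict, which BaseModelAdapter exposes as model_cfg.
        super().__init__(config={'model_id': 'echo-model'})

    def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]:
        # One chat.completion-style dict per prompt.
        return [{
            'choices': [{'index': 0, 'message': {'content': p, 'role': 'assistant'}}],
            'model': 'echo-model',
            'object': 'chat.completion',
            'usage': {},
        } for p in prompts]


adapter = CustomModelAdapter(custom_model=EchoModel())
# Inputs may mix strings, {'data': [...]} dicts, and lists of strings.
print(adapter.predict(['Hello', {'data': ['Hi there']}]))
```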
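
For checkpoint evaluation, `get_local_model` loads the model once and the resulting `LocalModel` is shared by every benchmark's adapter through `initialize_model_adapter`, which is the wiring `evaluate_model` and `create_evaluator` now perform. A rough sketch of that flow, assuming the default `eval_type` is checkpoint and that the `arc` benchmark is present in the registry:

```python
from evalscope.benchmarks import Benchmark
from evalscope.config import TaskConfig
from evalscope.models import get_local_model, initialize_model_adapter

task_cfg = TaskConfig(model='qwen/Qwen2-0.5B-Instruct', datasets=['arc'], limit=2)

base_model = get_local_model(task_cfg)    # loads the checkpoint; returns None for service/custom eval types
benchmark = Benchmark.get('arc')          # BenchmarkMeta with the dataset_id and adapter classes
model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)
# model_adapter is the benchmark's registered adapter class bound to the shared LocalModel.
```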