Add dataset collection #253

Open
wants to merge 17 commits into base: main
3 changes: 2 additions & 1 deletion docs/zh/user_guides/backend/rageval_backend/mteb.md
@@ -102,7 +102,8 @@ one_stage_task_cfg = {


### Two-Stage Evaluation
An example configuration file is shown below; retrieval is performed first, then reranking:
Evaluating a reranker requires a retrieval dataset: first retrieve the top-k candidates with an embedding model, then rerank them. An example configuration file is shown below:

```python
two_stage_task_cfg = {
"eval_backend": "RAGEval",
Expand Down
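For orientation, here is a minimal sketch of what a full two-stage config might look like. Only `"eval_backend": "RAGEval"` is taken from the snippet above; the nested keys (`eval_config`, `tool`, `model`, `eval`) and all values are assumptions modelled on the one-stage example and may not match the actual schema:

```python
# Hypothetical sketch of a two-stage (retrieval -> reranking) config.
# Field names below are assumptions; consult the one-stage example in the
# documentation for the authoritative schema.
two_stage_task_cfg = {
    "eval_backend": "RAGEval",      # from the snippet above
    "eval_config": {                # assumed wrapper, as in the one-stage case
        "tool": "MTEB",
        "model": [
            {   # stage 1: embedding model that retrieves the top-k candidates
                "model_name_or_path": "<embedding-model-id>",   # placeholder
                "is_cross_encoder": False,
            },
            {   # stage 2: cross-encoder reranker that re-orders the candidates
                "model_name_or_path": "<reranker-model-id>",    # placeholder
                "is_cross_encoder": True,
            },
        ],
        "eval": {
            "tasks": ["<retrieval-task-name>"],  # reranking is scored on a retrieval dataset
        },
    },
}
```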
13 changes: 10 additions & 3 deletions evalscope/arguments.py
@@ -1,6 +1,8 @@
import argparse
import json

from evalscope.constants import EvalBackend, EvalStage, EvalType


class ParseStrArgsAction(argparse.Action):

@@ -47,10 +49,13 @@ def add_argument(parser: argparse.ArgumentParser):
    parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.') # noqa: E501

    # Evaluation-related arguments
    parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
    parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.')
    parser.add_argument('--eval-type', type=str, help='The type for evaluating.',
                        choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
    parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
                        choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL]) # noqa: E501
    parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
    parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.')
    parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
                        choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.EVAL])
    parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')

    # Cache and working directory arguments
@@ -62,6 +67,8 @@ def add_argument(parser: argparse.ArgumentParser):
    parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.') # noqa: E501
    parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
    parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
    parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
    parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
    # yapf: enable


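To make the effect of the new `choices` constraints concrete, here is a small standalone sketch using plain argparse. The literal option values ('checkpoint', 'custom', 'service', 'all', 'infer', 'eval') are assumed from how the constants are compared elsewhere in this PR; treat them as illustrative:

```python
# Minimal sketch: with `choices`, an unsupported value now fails at parse
# time instead of surfacing later in the evaluation pipeline.
import argparse

parser = argparse.ArgumentParser()
# Mirrors the constrained arguments added above; the literal strings are
# assumed values of EvalType / EvalStage.
parser.add_argument('--eval-type', type=str, choices=['checkpoint', 'custom', 'service'])
parser.add_argument('--stage', type=str, default='all', choices=['all', 'infer', 'eval'])

args = parser.parse_args(['--eval-type', 'service', '--stage', 'infer'])
print(args.eval_type, args.stage)  # -> service infer

# parser.parse_args(['--eval-type', 'foo'])
# would exit immediately with an "invalid choice: 'foo'" error from argparse.
```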
21 changes: 20 additions & 1 deletion evalscope/benchmarks/__init__.py
@@ -1,4 +1,23 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import glob
import importlib
import os

from evalscope.benchmarks.benchmark import Benchmark
from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta
from evalscope.benchmarks.data_adapter import DataAdapter
from evalscope.utils import get_logger

logger = get_logger()

# Using glob to find all files matching the pattern
pattern = os.path.join(os.path.dirname(__file__), '*', '*_adapter.py')
files = glob.glob(pattern, recursive=False)

for file_path in files:
    if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):
        # Convert file path to a module path
        relative_path = os.path.relpath(file_path, os.path.dirname(__file__))
        module_path = relative_path[:-3].replace(os.path.sep, '.') # strip '.py' and convert to module path
        full_path = f'evalscope.benchmarks.{module_path}'
        importlib.import_module(full_path)
        # print(f'Importing {full_path}')
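The glob/importlib loop above means any `*_adapter.py` file in a benchmark subpackage is imported automatically when `evalscope.benchmarks` is imported, so a `Benchmark.register(...)` decorator in it runs without any manual edits to this `__init__.py`. A hypothetical sketch of such a file, modelled on the ARC adapter further down in this PR (the benchmark name and dataset id are placeholders, and the DataAdapter method bodies are omitted):

```python
# evalscope/benchmarks/my_bench/my_bench_adapter.py  (hypothetical)
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import WeightedAverageAccuracy
from evalscope.models import MultiChoiceModelAdapter


@Benchmark.register(
    name='my_bench',                    # placeholder benchmark name
    dataset_id='modelscope/my_bench',   # placeholder dataset id
    model_adapter=MultiChoiceModelAdapter,
    subset_list=['default'],
    metric_list=[WeightedAverageAccuracy],
    few_shot_num=0,
    train_split='train',
    eval_split='test',
    prompt_template='',
)
class MyBenchAdapter(DataAdapter):
    # Implement the DataAdapter hooks shown in the ARC adapter below
    # (load_from_disk, get_gold_answer, parse_pred_result, match, ...).
    ...
```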
5 changes: 0 additions & 5 deletions evalscope/benchmarks/arc/__init__.py
@@ -1,6 +1 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from evalscope.benchmarks.arc.arc_adapter import DATASET_ID, SUBSET_LIST
from evalscope.benchmarks.arc.arc_adapter import ARCAdapter
from evalscope.benchmarks.arc.arc_adapter import ARCAdapter as DataAdapterClass
from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa
122 changes: 23 additions & 99 deletions evalscope/benchmarks/arc/arc_adapter.py
@@ -3,40 +3,35 @@
import json
import os

from evalscope.benchmarks.data_adapter import DataAdapter
from evalscope.metrics.metrics import exact_match, weighted_mean
from evalscope.utils import ResponseParser, normalize_score
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType
from evalscope.metrics import WeightedAverageAccuracy, exact_match
from evalscope.models import MultiChoiceModelAdapter
from evalscope.utils import ResponseParser
from evalscope.utils.logger import get_logger

# flake8: noqa

logger = get_logger()

DATASET_ID = 'modelscope/ai2_arc'

# task_list = ['ARC-Easy', 'ARC-Challenge']
SUBSET_LIST = ['ARC-Challenge']


@Benchmark.register(
    name='arc',
    dataset_id='modelscope/ai2_arc',
    model_adapter=MultiChoiceModelAdapter,
    subset_list=['ARC-Easy', 'ARC-Challenge'],
    metric_list=[WeightedAverageAccuracy],
    few_shot_num=0,
    train_split='train',
    eval_split='test',
    prompt_template='',
)
class ARCAdapter(DataAdapter):

    choices = ['A', 'B', 'C', 'D']

    def __init__(self,
                 subset_list: list = None,
                 metric_list: list = None,
                 few_shot_num: int = None,
                 train_split: str = 'train',
                 eval_split: str = 'test',
                 prompt_template: str = '',
                 **kwargs):

        if subset_list is None:
            subset_list = SUBSET_LIST

        if metric_list is None:
            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]

    def __init__(self, **kwargs):
        few_shot_num = kwargs.get('few_shot_num', None)
        if few_shot_num is None:
            # Use 0-shot by default
            logger.info(f'Set 0-shot examples by system for ARC.')
@@ -45,14 +40,7 @@ def __init__(self,
        if few_shot_num != 0:
            logger.warning(f'few_shot_num is recommended to set 0 for ARC, got {few_shot_num}.')

        super().__init__(
            subset_list=subset_list,
            metric_list=metric_list,
            few_shot_num=few_shot_num,
            train_split=train_split,
            eval_split=eval_split,
            prompt_template=prompt_template,
            **kwargs)
        super().__init__(**kwargs)

    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
        """
@@ -132,7 +120,7 @@ def get_gold_answer(self, input_d: dict) -> str:
        # Get the gold choice
        return input_d.get('answerKey', '')

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the model output to get the answer. Could be the best choice index.

@@ -144,12 +132,12 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st
        Returns:
            The parsed answer. Depending on the dataset. Usually a string for chat.
        """
        if eval_type == 'checkpoint':
        if eval_type == EvalType.CHECKPOINT:
            return result
        elif eval_type == 'service':
        elif eval_type == EvalType.SERVICE:
            return ResponseParser.parse_first_option_with_choices(
                text=result, options=self.choices) # TODO: to be checked !
        elif eval_type == 'custom':
        elif eval_type == EvalType.CUSTOM:
            return ResponseParser.parse_first_option_with_choices(
                text=result, options=self.choices) # TODO: to be checked !
        else:
@@ -158,70 +146,6 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st
    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)

    def compute_metric(self, review_res_list: list) -> float:
        """
        Compute evaluation result by specific metric.

        Args:
            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]

        Returns:
            The metric score.
        """
        items = [(score, 1.0) for score in review_res_list]
        return weighted_mean(items)

    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
        """
        Generate the report for the model output.

        Args:
            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
            report_name: The user-defined report name.

        Returns: A dict of metric calculation results. The format is like:
        {
            "name":"ARC",
            "metric":"WeightedAverageAccuracy",
            "score":0.3389,
            "category":[
                {
                    "name":"DEFAULT",
                    "score":0.4128,
                    "subset":[
                        {
                            "name":"ARC-Easy",
                            "score":0.5632
                        },
                        {
                            "name":"ARC-Challenge",
                            "score":0.3157
                        }
                    ]
                }
            ],
            "total_num":7800
        }
        """
        total_num: int = sum([num for _, num in subset_score_map.values()])
        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
        cate_avg_list = [{
            'name': subset_name,
            'score': normalize_score(score=score)
        } for subset_name, (score, _) in subset_score_map.items()]

        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)

        res_map = dict(
            name=report_name or 'arc',
            metric=self.metric_list[0]['name'],
            score=weighted_avg_acc,
            category=[category_d],
            total_num=total_num)

        return res_map

    @classmethod
    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:

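The removed `compute_metric` and `gen_report` boilerplate amounted to a weighted average of per-sample scores, first within a subset and then across subsets weighted by sample count; presumably the registered `WeightedAverageAccuracy` metric now covers this centrally. A small self-contained illustration of that aggregation (the numbers are made up):

```python
# Stand-in for evalscope.metrics.metrics.weighted_mean as used by the
# removed code: items is a list of (value, weight) pairs.
def weighted_mean(items):
    total_weight = sum(w for _, w in items)
    return sum(v * w for v, w in items) / total_weight

# Per-sample review scores for one subset -> subset accuracy (old compute_metric).
review_res_list = [0, 1, 1, 0, 1]
subset_score = weighted_mean([(score, 1.0) for score in review_res_list])   # 0.6

# Subset scores weighted by sample counts -> overall score (old gen_report).
subset_score_map = {'ARC-Easy': (0.60, 500), 'ARC-Challenge': (0.40, 250)}  # made-up numbers
total_num = sum(num for _, num in subset_score_map.values())
overall = sum(score * num for score, num in subset_score_map.values()) / total_num
print(round(overall, 4))  # 0.5333
```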
4 changes: 0 additions & 4 deletions evalscope/benchmarks/bbh/__init__.py
@@ -1,5 +1 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from evalscope.benchmarks.bbh.bbh_adapter import DATASET_ID, SUBSET_LIST
from evalscope.benchmarks.bbh.bbh_adapter import BBHAdapter as DataAdapterClass
from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa