From f3c09da73594b23e920ae464c4e00730ed8c8a95 Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Wed, 18 Dec 2024 18:03:45 +0800 Subject: [PATCH 01/15] add dataset register --- evalscope/benchmarks/__init__.py | 21 +++- evalscope/benchmarks/benchmark.py | 107 +++++++++--------- evalscope/benchmarks/data_adapter.py | 14 ++- evalscope/benchmarks/gsm8k/__init__.py | 4 +- evalscope/benchmarks/gsm8k/gsm8k_adapter.py | 56 ++++----- .../benchmarks/humaneval/humaneval_adapter.py | 65 +---------- evalscope/benchmarks/mmlu/mmlu_adapter.py | 5 +- evalscope/config.py | 3 +- evalscope/models/__init__.py | 1 + evalscope/run.py | 36 ++---- evalscope/utils/utils.py | 21 ---- tests/cli/test_run.py | 2 +- 12 files changed, 122 insertions(+), 213 deletions(-) diff --git a/evalscope/benchmarks/__init__.py b/evalscope/benchmarks/__init__.py index b863b5ab..ab96a29e 100644 --- a/evalscope/benchmarks/__init__.py +++ b/evalscope/benchmarks/__init__.py @@ -1,4 +1,23 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import glob +import importlib +import os -from evalscope.benchmarks.benchmark import Benchmark +from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta from evalscope.benchmarks.data_adapter import DataAdapter +from evalscope.utils import get_logger + +logger = get_logger() + +# Using glob to find all files matching the pattern +pattern = os.path.join(os.path.dirname(__file__), '*', '*_adapter.py') +files = glob.glob(pattern, recursive=False) + +for file_path in files: + if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'): + # Convert file path to a module path + relative_path = os.path.relpath(file_path, os.path.dirname(__file__)) + module_path = relative_path[:-3].replace(os.path.sep, '.') # strip '.py' and convert to module path + full_path = f"evalscope.benchmarks.{module_path}" + importlib.import_module(full_path) + print(f"Importing {full_path}") diff --git a/evalscope/benchmarks/benchmark.py b/evalscope/benchmarks/benchmark.py index aafc9868..bffd4f87 100644 --- a/evalscope/benchmarks/benchmark.py +++ b/evalscope/benchmarks/benchmark.py @@ -1,65 +1,60 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +import copy +from dataclasses import dataclass, field +from typing import Dict, List, Optional -import os.path -from modelscope.msdatasets import MsDataset -from typing import Optional +from evalscope.benchmarks import DataAdapter +from evalscope.models import BaseModelAdapter -from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, HubType +BENCHMARK_MAPPINGS = {} -class Benchmark(object): - """ - Wrapper for loading datasets from ModelScope or HuggingFace. - """ +@dataclass +class BenchmarkMeta: + name: str + dataset_id: str + data_adapter: DataAdapter + model_adapter: BaseModelAdapter + subset_list: List[str] = field(default_factory=list) + metric_list: List[Dict] = field(default_factory=list) + few_shot_num: int = 0 + few_shot_random: bool = False + train_split: Optional[str] = None + eval_split: Optional[str] = None + prompt_template: str = '' - def __init__(self): - ... - - @staticmethod - def load(dataset_name: str, - subset: str = None, - split: str = None, - token: str = None, - hub: str = 'ModelScope', - work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR, - **kwargs): - """ - Load a dataset from ModelScope or HuggingFace. - - Args: - dataset_name (str): The dataset id or path. - If it is dataset id, should be in the format of `organization/name` for ModelScope and HuggingFace hub. - If it is dataset path, should be the path on local disk. 
- subset (str): - split: - token: sdk token for ModelScope, optional, default None - hub: `ModelScope` or `HuggingFace` - work_dir: the work directory for caching, optional - - Returns: - A dict. - """ + def update(self, args: dict): + if args.get('local_path'): + self.dataset_id = args['local_path'] + del args['local_path'] + self.__dict__.update(args) - dataset = MsDataset.load( - dataset_name=dataset_name, - subset_name=subset, - split=split, - token=token, - cache_dir=work_dir, - hub=hub, - **kwargs) + def to_dict(self): + cur_dict = copy.deepcopy(self.__dict__) + del cur_dict['data_adapter'] + del cur_dict['model_adapter'] + return cur_dict - dataset.dataset_name = dataset_name.split('/')[-1] - dataset.subset_name = subset - # dataset.split = split - return dataset +class Benchmark: -if __name__ == '__main__': - - ds = Benchmark.load(dataset_name='mmlu', subset='management', split=None) - - n = 1 - for i in ds: - print('>', n, ': ', i) - n += 1 + def __init__(self): + pass + + @classmethod + def get(cls, name: str) -> 'BenchmarkMeta': + if name not in BENCHMARK_MAPPINGS: + raise Exception(f"Unknown benchmark: {name}. Available tasks: {BENCHMARK_MAPPINGS.keys()}") + benchmark = BENCHMARK_MAPPINGS[name] + return benchmark + + @classmethod + def register(cls, name: str, dataset_id: str, model_adapter: BaseModelAdapter, **kwargs): + + def register_wrapper(data_adapter): + if name in BENCHMARK_MAPPINGS: + raise Exception(f"Benchmark {name} already registered") + BENCHMARK_MAPPINGS[name] = BenchmarkMeta( + name=name, data_adapter=data_adapter, model_adapter=model_adapter, dataset_id=dataset_id, **kwargs) + return data_adapter + + return register_wrapper diff --git a/evalscope/benchmarks/data_adapter.py b/evalscope/benchmarks/data_adapter.py index 58f09d95..fc1e6b40 100644 --- a/evalscope/benchmarks/data_adapter.py +++ b/evalscope/benchmarks/data_adapter.py @@ -2,9 +2,9 @@ import os.path import random from abc import ABC, abstractmethod +from modelscope.msdatasets import MsDataset from typing import Any, Optional -from evalscope.benchmarks import Benchmark from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, HubType from evalscope.utils.logger import get_logger @@ -55,6 +55,9 @@ def load(self, """ dataset_name_or_path = os.path.expanduser(dataset_name_or_path) + subset_list = subset_list or self.subset_list + + logger.info(f'Evaluating on subsets for {dataset_name_or_path}: {subset_list}') # Try to load dataset from local disk if os.path.exists(dataset_name_or_path): @@ -65,23 +68,22 @@ def load(self, raise ValueError(f'Local dataset is empty: {dataset_name_or_path}') else: # Load dataset from remote - logger.info(f'Loading dataset from {datasets_hub} hub: >dataset_name: {dataset_name_or_path}') + logger.info(f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path}') data_dict = {} split_list = [split for split in [self.train_split, self.eval_split] if split is not None] if len(split_list) == 0: logger.error(f'Got empty split list: {split_list}') - subset_list = subset_list if subset_list is not None else self.subset_list for sub_name in subset_list: data_dict[sub_name] = {} # e.g. 
train: few-shot, test: target dataset to evaluate for split in split_list: - dataset = Benchmark.load( + dataset = MsDataset.load( dataset_name=dataset_name_or_path, - subset=sub_name, + subset_name=sub_name, split=split, + cache_dir=work_dir, hub=datasets_hub, - work_dir=work_dir, **kwargs) data_dict[sub_name].update({split: dataset}) diff --git a/evalscope/benchmarks/gsm8k/__init__.py b/evalscope/benchmarks/gsm8k/__init__.py index 968a91dd..bf63ba4c 100644 --- a/evalscope/benchmarks/gsm8k/__init__.py +++ b/evalscope/benchmarks/gsm8k/__init__.py @@ -1,5 +1,3 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from evalscope.benchmarks.gsm8k.gsm8k_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter as DataAdapterClass -from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa +from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter diff --git a/evalscope/benchmarks/gsm8k/gsm8k_adapter.py b/evalscope/benchmarks/gsm8k/gsm8k_adapter.py index e33d8ed0..d0d830b3 100644 --- a/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +++ b/evalscope/benchmarks/gsm8k/gsm8k_adapter.py @@ -1,35 +1,36 @@ # Copyright (c) Alibaba, Inc. and its affiliates. # Copyright (c) EleutherAI, Inc. and its affiliates. +# flake8: noqa import math import os import re -from evalscope.benchmarks import DataAdapter -from evalscope.metrics.metrics import exact_match, weighted_mean +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.metrics.metrics import weighted_mean +from evalscope.models import ChatGenerationModelAdapter from evalscope.utils import normalize_score from evalscope.utils.io_utils import jsonl_to_list from evalscope.utils.logger import get_logger -# flake8: noqa - logger = get_logger() -DATASET_ID = 'modelscope/gsm8k' -SUBSET_LIST = ['main'] -ANS_RE = re.compile(r'#### (\-?[0-9\.\,]+)') -INVALID_ANS = '[invalid]' - +@Benchmark.register( + name='gsm8k', + dataset_id='modelscope/gsm8k', + subset_list=['main'], + metric_list=[{ + 'name': 'WeightedAverageAccuracy', + 'object': weighted_mean + }], + few_shot_num=4, + train_split='train', + eval_split='test', + prompt_template='', + model_adapter=ChatGenerationModelAdapter) class GSM8KAdapter(DataAdapter): - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = 'train', - eval_split: str = 'test', - prompt_template: str = '', - **kwargs): + def __init__(self, **kwargs): """ Data adapter for GSM8K dataset. @@ -41,30 +42,13 @@ def __init__(self, eval_split (str): The target eval split name. Default: 'test' **kwargs: ... """ - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] - - if few_shot_num is None: - logger.info(f'Set 4-shot examples by system for GSM8K.') - few_shot_num = 4 - + few_shot_num = kwargs.get('few_shot_num', 4) if few_shot_num != 4 and few_shot_num != 0: logger.error(f'GSM8K uses 4-shot examples with CoT or 0-shot by system, but got {few_shot_num}. 
' f'Use 4-shot by default.') few_shot_num = 4 - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - prompt_template=prompt_template, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} diff --git a/evalscope/benchmarks/humaneval/humaneval_adapter.py b/evalscope/benchmarks/humaneval/humaneval_adapter.py index 8dcfe6e7..501a0e32 100644 --- a/evalscope/benchmarks/humaneval/humaneval_adapter.py +++ b/evalscope/benchmarks/humaneval/humaneval_adapter.py @@ -1,13 +1,9 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import json -import os import re -from tqdm import tqdm from typing import List from evalscope.benchmarks.data_adapter import DataAdapter from evalscope.metrics.metrics import weighted_mean -from evalscope.tools.combine_reports import gen_table from evalscope.utils import normalize_score from evalscope.utils.logger import get_logger @@ -31,7 +27,7 @@ def __init__(self, few_shot_num: int = None, train_split: str = None, eval_split: str = 'test', - prompt_template: str = 'Complete the following python code:\n', + prompt_template: str = None, **kwargs): try: from human_eval.data import stream_jsonl, write_jsonl @@ -45,12 +41,14 @@ def __init__(self, subset_list = SUBSET_LIST if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] + metric_list = [{'name': 'pass@1', 'object': weighted_mean}] + + if prompt_template is None: + prompt_template = 'Complete the following python code:\n' self.k = [1] self.num_workers = 4 self.timeout = 4.0 - self.outputs = kwargs.get('outputs', None) self.read_problems_func = stream_jsonl self.write_jsonl_func = write_jsonl @@ -87,57 +85,6 @@ def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict: return {'data': [full_prompt]} - def get_answers(self, infer_cfg: dict) -> List[dict]: - ans_list: list = [] - system_prompt: str = '' - for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'): - prompt: str = system_prompt + data_d['prompt'] - inputs: dict = {'data': [prompt]} - - pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg) - - pred_ans: str = pred_res['choices'][0]['message']['content'] - pred_ans = self._postprocess(pred_ans) - - ans_list.append({'task_id': task_id, 'completion': pred_ans}) - - return ans_list - - def eval(self, infer_cfg: dict, **kwargs): - - # predict - ans_list: list = self.get_answers(infer_cfg) - ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl') - - self.write_jsonl_func(filename=ans_out_file, data=ans_list) - # logger.info(f'** Dump predictions to {ans_out_file} successfully.') - logger.info('** Dump predictions successfully.') - - # evaluate results: e.g. 
{'pass@1': 0.333, 'pass@10': 0.111} - results = self.eval_func( - sample_file=ans_out_file, - k=self.k, - n_workers=self.num_workers, - timeout=self.timeout, - problem_file=self.problem_file) - - # output: report - report_map: dict = self.gen_report(results=results) - report_dir: str = self.outputs_structure.reports_dir - report_file: str = os.path.join(report_dir, 'human_eval_report.json') - - with open(report_file, 'w') as f: - f.write(json.dumps(report_map, ensure_ascii=False, indent=4)) - # logger.info(f'** Dump report to {report_file} \n') - logger.info('** Dump report \n') - - try: - # Make table - report_table: str = gen_table([report_dir]) - logger.info(f'** Report table: \n {report_table} \n') - except Exception: - logger.error('Failed to generate report table.') - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: total_num: int = sum([num for _, num in subset_score_map.values()]) weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num @@ -151,7 +98,7 @@ def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: res_map = dict( name=report_name or 'HumanEval', - metric='pass@1', + metric=self.metric_list[0]['name'], score=weighted_avg_acc, category=[category_d], total_num=total_num) diff --git a/evalscope/benchmarks/mmlu/mmlu_adapter.py b/evalscope/benchmarks/mmlu/mmlu_adapter.py index ecd6f5d2..ed6769c7 100644 --- a/evalscope/benchmarks/mmlu/mmlu_adapter.py +++ b/evalscope/benchmarks/mmlu/mmlu_adapter.py @@ -139,16 +139,13 @@ class MMLUAdapter(DataAdapter): choices = ['A', 'B', 'C', 'D'] def __init__(self, - subset_list: list = None, + subset_list: list = SUBSET_LIST, metric_list: list = None, few_shot_num: int = None, train_split: str = 'train', eval_split: str = 'test', **kwargs): - if subset_list is None: - subset_list = SUBSET_LIST - if metric_list is None: metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] diff --git a/evalscope/config.py b/evalscope/config.py index 749193e4..3e8652ce 100644 --- a/evalscope/config.py +++ b/evalscope/config.py @@ -114,7 +114,8 @@ def from_json(json_file: str): def from_args(args: Namespace): # Convert Namespace to a dictionary and filter out None values args_dict = {k: v for k, v in vars(args).items() if v is not None} - del args_dict['func'] # Note: compat CLI arguments + if args_dict.get('func', None): + del args_dict['func'] # Note: compat CLI arguments return TaskConfig.from_dict(args_dict) diff --git a/evalscope/models/__init__.py b/evalscope/models/__init__.py index e619a4e8..9afbad48 100644 --- a/evalscope/models/__init__.py +++ b/evalscope/models/__init__.py @@ -1,3 +1,4 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from evalscope.models.model import BaseModel, ChatBaseModel +from evalscope.models.model_adapter import * diff --git a/evalscope/run.py b/evalscope/run.py index a1841545..bec9e905 100644 --- a/evalscope/run.py +++ b/evalscope/run.py @@ -2,7 +2,6 @@ """ Run evaluation for LLMs. 
""" -import logging import os.path import torch from argparse import Namespace @@ -10,19 +9,17 @@ from typing import List, Optional, Union from evalscope.arguments import parse_args +from evalscope.benchmarks import Benchmark, BenchmarkMeta from evalscope.config import TaskConfig, parse_task_config from evalscope.constants import DEFAULT_MODEL_REVISION, DEFAULT_WORK_DIR, EvalBackend, EvalType from evalscope.evaluator import Evaluator from evalscope.models.custom import CustomModel -from evalscope.utils import import_module_util, seed_everything +from evalscope.utils import seed_everything from evalscope.utils.io_utils import OutputsStructure, are_paths_same from evalscope.utils.logger import configure_logging, get_logger logger = get_logger() -BENCHMARK_PATH_PREFIX = 'evalscope.benchmarks.' -MEMBERS_TO_IMPORT = ['DATASET_ID', 'SUBSET_LIST', 'DataAdapterClass', 'ModelAdapterClass'] - def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]) -> Union[dict, List[dict]]: """Run evaluation task(s) based on the provided configuration.""" @@ -113,29 +110,18 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict: def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure): """Create an evaluator object for the specified dataset.""" - imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, MEMBERS_TO_IMPORT) - model_adapter = initialize_model_adapter(task_cfg, dataset_name, imported_modules) - + # imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, MEMBERS_TO_IMPORT) + benchmark: BenchmarkMeta = Benchmark.get(dataset_name) dataset_config = task_cfg.dataset_args.get(dataset_name, {}) - dataset_name_or_path = dataset_config.get('local_path') or imported_modules['DATASET_ID'] - in_prompt_template = dataset_config.get('prompt_template', '') - few_shot_num = dataset_config.get('few_shot_num', None) - few_shot_random = dataset_config.get('few_shot_random', True) - - data_adapter = imported_modules['DataAdapterClass']( - few_shot_num=few_shot_num, - few_shot_random=few_shot_random, - prompt_template=in_prompt_template, - outputs=outputs, - ) - in_subset_list = dataset_config.get('subset_list', imported_modules['SUBSET_LIST']) + benchmark.update(dataset_config) - logger.info(f'Evaluating on subsets for {dataset_name}: {in_subset_list}\n') + data_adapter = benchmark.data_adapter(**benchmark.to_dict()) + model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter) return Evaluator( - dataset_name_or_path=dataset_name_or_path, - subset_list=in_subset_list, + dataset_name_or_path=benchmark.dataset_id, data_adapter=data_adapter, + subset_list=benchmark.subset_list, model_adapter=model_adapter, use_cache=task_cfg.use_cache, outputs=outputs, @@ -147,7 +133,7 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt ) -def initialize_model_adapter(task_cfg: TaskConfig, dataset_name: str, imported_modules): +def initialize_model_adapter(task_cfg: TaskConfig, model_adapter): """Initialize the model adapter based on the task configuration.""" if task_cfg.dry_run: from evalscope.models.dummy_chat_model import DummyChatModel @@ -162,7 +148,7 @@ def initialize_model_adapter(task_cfg: TaskConfig, dataset_name: str, imported_m model_precision = task_cfg.model_args.get('precision', torch.float16) if isinstance(model_precision, str) and model_precision != 'auto': model_precision = eval(model_precision) - return imported_modules['ModelAdapterClass']( + return 
model_adapter( model_id=task_cfg.model, model_revision=task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION), device_map=device_map, diff --git a/evalscope/utils/utils.py b/evalscope/utils/utils.py index df11c743..ed3e4d2a 100644 --- a/evalscope/utils/utils.py +++ b/evalscope/utils/utils.py @@ -199,27 +199,6 @@ def parse_last_option(text: str, options: str) -> str: -def import_module_util(import_path_prefix: str, module_name: str, members_to_import: list) -> dict: - """ - Import module utility function. - - Args: - import_path_prefix: e.g. 'evalscope.benchmarks.' - module_name: The module name to import. e.g. 'mmlu' - members_to_import: The members to import. - e.g. ['DATASET_ID', 'SUBJECT_MAPPING', 'SUBSET_LIST', 'DataAdapterClass'] - - Returns: - dict: imported modules map. e.g. {'DATASET_ID': 'mmlu', 'SUBJECT_MAPPING': {...}, ...} - """ - imported_modules = {} - module = importlib.import_module(import_path_prefix + module_name) - for member_name in members_to_import: - imported_modules[member_name] = getattr(module, member_name) - - return imported_modules - - def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]: """ Normalize score. diff --git a/tests/cli/test_run.py b/tests/cli/test_run.py index fb01245f..caaac0b2 100644 --- a/tests/cli/test_run.py +++ b/tests/cli/test_run.py @@ -70,7 +70,7 @@ def test_run_eval_with_args(self): @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level') def test_run_task(self): - task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['bbh', 'gsm8k', 'arc'], 'limit': 2, 'debug': False} + task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['gsm8k', 'arc'], 'limit': 2, 'debug': True} run_task(task_cfg=task_cfg) From db7f37c0459ea2afb26f998f8b19e2834f0d8fc7 Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Wed, 18 Dec 2024 19:31:16 +0800 Subject: [PATCH 02/15] fix circular import --- evalscope/benchmarks/benchmark.py | 14 +++++++++++--- evalscope/run.py | 10 ++++------ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/evalscope/benchmarks/benchmark.py b/evalscope/benchmarks/benchmark.py index bffd4f87..b206e2f0 100644 --- a/evalscope/benchmarks/benchmark.py +++ b/evalscope/benchmarks/benchmark.py @@ -1,8 +1,10 @@ import copy from dataclasses import dataclass, field -from typing import Dict, List, Optional +from typing import TYPE_CHECKING, Dict, List, Optional + +if TYPE_CHECKING: + from evalscope.benchmarks import DataAdapter -from evalscope.benchmarks import DataAdapter from evalscope.models import BaseModelAdapter BENCHMARK_MAPPINGS = {} @@ -12,7 +14,7 @@ class BenchmarkMeta: name: str dataset_id: str - data_adapter: DataAdapter + data_adapter: 'DataAdapter' model_adapter: BaseModelAdapter subset_list: List[str] = field(default_factory=list) metric_list: List[Dict] = field(default_factory=list) @@ -34,6 +36,12 @@ def to_dict(self): del cur_dict['model_adapter'] return cur_dict + def get_data_adapter(self, config: dict = None) -> 'DataAdapter': + if config: + dataset_config = config.get(self.name, {}) + self.update(dataset_config) + return self.data_adapter(**self.to_dict()) + class Benchmark: diff --git a/evalscope/run.py b/evalscope/run.py index bec9e905..3b531732 100644 --- a/evalscope/run.py +++ b/evalscope/run.py @@ -112,11 +112,9 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt """Create an evaluator object for the specified dataset.""" # imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, 
MEMBERS_TO_IMPORT) benchmark: BenchmarkMeta = Benchmark.get(dataset_name) - dataset_config = task_cfg.dataset_args.get(dataset_name, {}) - benchmark.update(dataset_config) - data_adapter = benchmark.data_adapter(**benchmark.to_dict()) - model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter) + data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args) + model_adapter = initialize_model_adapter(task_cfg, model_adapter_cls=benchmark.model_adapter) return Evaluator( dataset_name_or_path=benchmark.dataset_id, @@ -133,7 +131,7 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt ) -def initialize_model_adapter(task_cfg: TaskConfig, model_adapter): +def initialize_model_adapter(task_cfg: TaskConfig, model_adapter_cls): """Initialize the model adapter based on the task configuration.""" if task_cfg.dry_run: from evalscope.models.dummy_chat_model import DummyChatModel @@ -148,7 +146,7 @@ def initialize_model_adapter(task_cfg: TaskConfig, model_adapter): model_precision = task_cfg.model_args.get('precision', torch.float16) if isinstance(model_precision, str) and model_precision != 'auto': model_precision = eval(model_precision) - return model_adapter( + return model_adapter_cls( model_id=task_cfg.model, model_revision=task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION), device_map=device_map, From 11687964f1e853bf344f74c1b0158c0e2217e1b9 Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Wed, 18 Dec 2024 19:43:32 +0800 Subject: [PATCH 03/15] fix lint --- evalscope/benchmarks/__init__.py | 4 ++-- evalscope/benchmarks/benchmark.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/evalscope/benchmarks/__init__.py b/evalscope/benchmarks/__init__.py index ab96a29e..444d5e79 100644 --- a/evalscope/benchmarks/__init__.py +++ b/evalscope/benchmarks/__init__.py @@ -18,6 +18,6 @@ # Convert file path to a module path relative_path = os.path.relpath(file_path, os.path.dirname(__file__)) module_path = relative_path[:-3].replace(os.path.sep, '.') # strip '.py' and convert to module path - full_path = f"evalscope.benchmarks.{module_path}" + full_path = f'evalscope.benchmarks.{module_path}' importlib.import_module(full_path) - print(f"Importing {full_path}") + print(f'Importing {full_path}') diff --git a/evalscope/benchmarks/benchmark.py b/evalscope/benchmarks/benchmark.py index b206e2f0..79855366 100644 --- a/evalscope/benchmarks/benchmark.py +++ b/evalscope/benchmarks/benchmark.py @@ -30,7 +30,7 @@ def update(self, args: dict): del args['local_path'] self.__dict__.update(args) - def to_dict(self): + def to_dict(self) -> dict: cur_dict = copy.deepcopy(self.__dict__) del cur_dict['data_adapter'] del cur_dict['model_adapter'] @@ -51,7 +51,7 @@ def __init__(self): @classmethod def get(cls, name: str) -> 'BenchmarkMeta': if name not in BENCHMARK_MAPPINGS: - raise Exception(f"Unknown benchmark: {name}. Available tasks: {BENCHMARK_MAPPINGS.keys()}") + raise Exception(f'Unknown benchmark: {name}. 
Available tasks: {BENCHMARK_MAPPINGS.keys()}') benchmark = BENCHMARK_MAPPINGS[name] return benchmark @@ -60,7 +60,7 @@ def register(cls, name: str, dataset_id: str, model_adapter: BaseModelAdapter, * def register_wrapper(data_adapter): if name in BENCHMARK_MAPPINGS: - raise Exception(f"Benchmark {name} already registered") + raise Exception(f'Benchmark {name} already registered') BENCHMARK_MAPPINGS[name] = BenchmarkMeta( name=name, data_adapter=data_adapter, model_adapter=model_adapter, dataset_id=dataset_id, **kwargs) return data_adapter From 655d49cc44db23d70a542df8640f9593fd61e580 Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Thu, 19 Dec 2024 14:06:46 +0800 Subject: [PATCH 04/15] update data adapter --- evalscope/benchmarks/arc/__init__.py | 5 - evalscope/benchmarks/arc/arc_adapter.py | 113 +++------------ evalscope/benchmarks/benchmark.py | 2 +- evalscope/benchmarks/data_adapter.py | 151 ++++++++++++-------- evalscope/benchmarks/gsm8k/__init__.py | 2 - evalscope/benchmarks/gsm8k/gsm8k_adapter.py | 68 +-------- evalscope/metrics/__init__.py | 3 + 7 files changed, 115 insertions(+), 229 deletions(-) diff --git a/evalscope/benchmarks/arc/__init__.py b/evalscope/benchmarks/arc/__init__.py index 8b7d5dc4..b937315b 100644 --- a/evalscope/benchmarks/arc/__init__.py +++ b/evalscope/benchmarks/arc/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.arc.arc_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.arc.arc_adapter import ARCAdapter -from evalscope.benchmarks.arc.arc_adapter import ARCAdapter as DataAdapterClass -from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/arc/arc_adapter.py b/evalscope/benchmarks/arc/arc_adapter.py index 46b1f6a5..e00cf784 100644 --- a/evalscope/benchmarks/arc/arc_adapter.py +++ b/evalscope/benchmarks/arc/arc_adapter.py @@ -3,40 +3,34 @@ import json import os -from evalscope.benchmarks.data_adapter import DataAdapter -from evalscope.metrics.metrics import exact_match, weighted_mean -from evalscope.utils import ResponseParser, normalize_score +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.metrics import WeightedAverageAccuracy, exact_match +from evalscope.models import MultiChoiceModelAdapter +from evalscope.utils import ResponseParser from evalscope.utils.logger import get_logger # flake8: noqa logger = get_logger() -DATASET_ID = 'modelscope/ai2_arc' - -# task_list = ['ARC-Easy', 'ARC-Challenge'] -SUBSET_LIST = ['ARC-Challenge'] - +@Benchmark.register( + name='arc', + dataset_id='modelscope/ai2_arc', + model_adapter=MultiChoiceModelAdapter, + subset_list=['ARC-Easy', 'ARC-Challenge'], + metric_list=[WeightedAverageAccuracy], + few_shot_num=0, + train_split='train', + eval_split='test', + prompt_template='', +) class ARCAdapter(DataAdapter): choices = ['A', 'B', 'C', 'D'] - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = 'train', - eval_split: str = 'test', - prompt_template: str = '', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] - + def __init__(self, **kwargs): + few_shot_num = kwargs.get('few_shot_num', None) if few_shot_num is None: # Use 0-shot by default logger.info(f'Set 0-shot examples by system for ARC.') @@ -45,14 +39,7 @@ def __init__(self, if few_shot_num != 0: 
logger.warning(f'few_shot_num is recommended to set 0 for ARC, got {few_shot_num}.') - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - prompt_template=prompt_template, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: """ @@ -158,70 +145,6 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st def match(self, gold: str, pred: str) -> float: return exact_match(gold=gold, pred=pred) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: - """ - Generate the report for the model output. - - Args: - subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...} - report_name: The user-defined report name. - - Returns: A dict of metric calculation results. The format is like: - { - "name":"ARC", - "metric":"WeightedAverageAccuracy", - "score":0.3389, - "category":[ - { - "name":"DEFAULT", - "score":0.4128, - "subset":[ - { - "name":"ARC-Easy", - "score":0.5632 - }, - { - "name":"ARC-Challenge", - "score":0.3157 - } - ] - } - ], - "total_num":7800 - } - """ - total_num: int = sum([num for _, num in subset_score_map.values()]) - weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num - weighted_avg_acc = normalize_score(score=weighted_avg_acc) - cate_avg_list = [{ - 'name': subset_name, - 'score': normalize_score(score=score) - } for subset_name, (score, _) in subset_score_map.items()] - - category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list) - - res_map = dict( - name=report_name or 'arc', - metric=self.metric_list[0]['name'], - score=weighted_avg_acc, - category=[category_d], - total_num=total_num) - - return res_map - @classmethod def _generate_prompt(cls, input_d: dict, include_answer=True) -> str: diff --git a/evalscope/benchmarks/benchmark.py b/evalscope/benchmarks/benchmark.py index 79855366..30113928 100644 --- a/evalscope/benchmarks/benchmark.py +++ b/evalscope/benchmarks/benchmark.py @@ -17,7 +17,7 @@ class BenchmarkMeta: data_adapter: 'DataAdapter' model_adapter: BaseModelAdapter subset_list: List[str] = field(default_factory=list) - metric_list: List[Dict] = field(default_factory=list) + metric_list: List[dict] = field(default_factory=list) few_shot_num: int = 0 few_shot_random: bool = False train_split: Optional[str] = None diff --git a/evalscope/benchmarks/data_adapter.py b/evalscope/benchmarks/data_adapter.py index fc1e6b40..da3a72e2 100644 --- a/evalscope/benchmarks/data_adapter.py +++ b/evalscope/benchmarks/data_adapter.py @@ -6,6 +6,7 @@ from typing import Any, Optional from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, HubType +from evalscope.utils import normalize_score from evalscope.utils.logger import get_logger logger = get_logger() @@ -22,6 +23,11 @@ def __init__(self, prompt_template: str = '', **kwargs): """ + Data Adapter for the benchmark. You need to implement the following methods: + - gen_prompt + - get_gold_answer + - parse_pred_result + - match Args: subset_list: list of subset names for the dataset. 
metric_list: list, the metric list to evaluate the model on specific benchmark. @@ -141,6 +147,91 @@ def gen_prompts(self, data_dict: dict) -> dict: return res_dict + def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: + """ + Generate report for the evaluation results for all subsets. + + Args: + subset_score_map: The subset-score map. + e.g. {subset_name: (score, num)} + + report_name: str, the user-defined report name. Default: None + + Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils. + + Here is a format example for ARC-Challenge: + { + "name":"ARC-Challenge", + "metric":"WeightedAverageAccuracy", + "score": 0.3389, + "category":[ + { + "name":"DEFAULT", + "score": 0.3389, + "subset":[ + { + "name":"ARC-Challenge", + "score": 0.3389, + "num": 100 + }, + ] + } + ], + "total_num":100 + } + """ + total_num: int = sum([num for _, num in subset_score_map.values()]) + weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num + weighted_avg_acc = normalize_score(score=weighted_avg_acc) + cate_avg_list = [{ + 'name': subset_name, + 'score': normalize_score(score=score), + 'num': num + } for subset_name, (score, num) in subset_score_map.items()] + + category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list) + + res_map = dict( + name=report_name or 'DEFAULT', + metric=self.metric_list[0]['name'], + score=weighted_avg_acc, + category=[category_d], + total_num=total_num) + + return res_map + + def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True): + + if k > len(data_list): + k = len(data_list) + if few_shot_random: + return random.sample(data_list, k) + else: + return data_list[:k] + + def compute_metric(self, review_res_list: list) -> Any: + """ + Compute evaluation result by specific metrics. + + Args: + review_res_list: list, the review result list, each item of which is match result for gold and pred. + + Attributes: + DataAdapter.metric_func_map: metric_name -> metric_func mapping, + e.g. {'WeightedAverageAccuracy': weighted_average_acc} + + Returns: + Metric results. + """ + if len(self.metric_list) == 0: + raise ValueError('No metric list found for the benchmark.') + elif len(self.metric_list) == 1: + # review_res_list: review score list, e.g. [0, 1, 1, 0, ...] + items = [(score, 1.0) for score in review_res_list] + return self.metric_list[0]['object'](items) + else: + raise ValueError('Please implement the compute_metric method for multiple metrics.') + @abstractmethod def gen_prompt(self, *args, **kwargs) -> Any: """ @@ -203,63 +294,3 @@ def match(self, gold: Any, pred: Any) -> Any: The match result. Usually a score (float) for chat/multiple-choice-questions. """ raise NotImplementedError - - @abstractmethod - def compute_metric(self, review_res_list: list) -> Any: - """ - Compute evaluation result by specific metrics. - - Args: - review_res_list: list, the review result list, each item of which is match result for gold and pred. - - Attributes: - DataAdapter.metric_func_map: metric_name -> metric_func mapping, - e.g. {'WeightedAverageAccuracy': weighted_average_acc} - - Returns: - Metric results. - """ - raise NotImplementedError - - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: - """ - Generate report for the evaluation results for all subsets. - - Args: - subset_score_map: The subset-score map. - e.g. {subset_name: (score, num)} - - report_name: str, the user-defined report name. 
Default: None - - Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils. - - Here is a format example for ARC-Challenge: - { - "name":"ARC-Challenge", - "metric":"WeightedAverageAccuracy", - "score": 0.3389, - "category":[ - { - "name":"DEFAULT", - "score": 0.3389, - "subset":[ - { - "name":"ARC-Challenge", - "score": 0.3389 - }, - ] - } - ], - "total_num":100 - } - """ - raise NotImplementedError - - def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True): - - if k > len(data_list): - k = len(data_list) - if few_shot_random: - return random.sample(data_list, k) - else: - return data_list[:k] diff --git a/evalscope/benchmarks/gsm8k/__init__.py b/evalscope/benchmarks/gsm8k/__init__.py index bf63ba4c..b937315b 100644 --- a/evalscope/benchmarks/gsm8k/__init__.py +++ b/evalscope/benchmarks/gsm8k/__init__.py @@ -1,3 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter diff --git a/evalscope/benchmarks/gsm8k/gsm8k_adapter.py b/evalscope/benchmarks/gsm8k/gsm8k_adapter.py index d0d830b3..5aa67e97 100644 --- a/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +++ b/evalscope/benchmarks/gsm8k/gsm8k_adapter.py @@ -6,9 +6,8 @@ import re from evalscope.benchmarks import Benchmark, DataAdapter -from evalscope.metrics.metrics import weighted_mean +from evalscope.metrics import WeightedAverageAccuracy from evalscope.models import ChatGenerationModelAdapter -from evalscope.utils import normalize_score from evalscope.utils.io_utils import jsonl_to_list from evalscope.utils.logger import get_logger @@ -19,10 +18,7 @@ name='gsm8k', dataset_id='modelscope/gsm8k', subset_list=['main'], - metric_list=[{ - 'name': 'WeightedAverageAccuracy', - 'object': weighted_mean - }], + metric_list=[WeightedAverageAccuracy], few_shot_num=4, train_split='train', eval_split='test', @@ -126,66 +122,6 @@ def number_equal(gold_ans, pred_ans): return number_equal(gold_ans=gold, pred_ans=pred) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: - """ - Generate the report for the model output. - - Args: - subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...} - report_name: The user-defined report name. Default: None - - Returns: A dict of metric calculation results. 
The format is like: - { - "name":"GSM8K", - "metric":"WeightedAverageAccuracy", - "score":0.5632, - "category":[ - { - "name":"DEFAULT", - "score":0.5632, - "subset":[ - { - "name":"main", - "score":0.5632 - }, - ] - } - ], - "total_num":100 - } - """ - total_num: int = sum([num for _, num in subset_score_map.values()]) - weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num - weighted_avg_acc = normalize_score(score=weighted_avg_acc) - cate_avg_list = [{ - 'name': subset_name, - 'score': normalize_score(score=score) - } for subset_name, (score, _) in subset_score_map.items()] - - category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list) - - res_map = dict( - name=report_name or 'gsm8k', - metric=self.metric_list[0]['name'], - score=weighted_avg_acc, - category=[category_d], - total_num=total_num) - - return res_map - @classmethod def _generate_prompt(cls, input_d: dict, few_shot_list: list, use_fewshot: bool = True) -> str: if use_fewshot: diff --git a/evalscope/metrics/__init__.py b/evalscope/metrics/__init__.py index b937315b..7c7ff37a 100644 --- a/evalscope/metrics/__init__.py +++ b/evalscope/metrics/__init__.py @@ -1 +1,4 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +from evalscope.metrics.metrics import exact_match, weighted_mean + +WeightedAverageAccuracy = {'name': 'WeightedAverageAccuracy', 'object': weighted_mean} From 2f941a66ed7d07a09e458d0bb0af855c4251aabb Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Thu, 19 Dec 2024 20:46:39 +0800 Subject: [PATCH 05/15] update model adapter --- .../backend/rageval_backend/mteb.md | 3 +- evalscope/benchmarks/data_adapter.py | 10 +- evalscope/benchmarks/gsm8k/gsm8k_adapter.py | 3 +- evalscope/benchmarks/hellaswag/__init__.py | 5 - .../benchmarks/hellaswag/hellaswag_adapter.py | 108 ++-------- evalscope/evaluator/reviewer/auto_reviewer.py | 2 +- evalscope/models/__init__.py | 1 + evalscope/models/api/__init__.py | 3 - evalscope/models/dummy_chat_model.py | 49 ----- evalscope/models/model.py | 141 ++++++++++++ evalscope/models/model_adapter.py | 204 +++++++----------- evalscope/models/openai_model.py | 103 --------- evalscope/run.py | 45 ++-- .../longbench_write/tools}/openai_api.py | 0 tests/cli/test_run.py | 2 +- 15 files changed, 272 insertions(+), 407 deletions(-) delete mode 100644 evalscope/models/api/__init__.py delete mode 100644 evalscope/models/dummy_chat_model.py delete mode 100644 evalscope/models/openai_model.py rename evalscope/{models/api => third_party/longbench_write/tools}/openai_api.py (100%) diff --git a/docs/zh/user_guides/backend/rageval_backend/mteb.md b/docs/zh/user_guides/backend/rageval_backend/mteb.md index 1a864a55..0e0937d5 100644 --- a/docs/zh/user_guides/backend/rageval_backend/mteb.md +++ b/docs/zh/user_guides/backend/rageval_backend/mteb.md @@ -102,7 +102,8 @@ one_stage_task_cfg = { ### 两阶段评测 -配置文件示例如下,先进行检索,再进行reranking: +评测reranker需要用retrieval数据集,先用embedding模型检索topk,再进行排序。配置文件示例如下: + ```python two_stage_task_cfg = { "eval_backend": "RAGEval", diff --git a/evalscope/benchmarks/data_adapter.py b/evalscope/benchmarks/data_adapter.py index da3a72e2..18f823ed 100644 --- a/evalscope/benchmarks/data_adapter.py +++ b/evalscope/benchmarks/data_adapter.py @@ -139,11 +139,11 @@ def gen_prompts(self, data_dict: dict) -> dict: prompt_d = self.gen_prompt(input_d=sample_d, subset_name=sub_name, few_shot_list=few_shot_data) prompt_d[AnswerKeys.RAW_INPUT] = sample_d res_dict[sub_name].append(prompt_d) - - rnd = random.Random() - rnd.seed(42) - for k, v 
in res_dict.items(): - rnd.shuffle(v) + # Note: for multiprocess + # rnd = random.Random() + # rnd.seed(42) + # for k, v in res_dict.items(): + # rnd.shuffle(v) return res_dict diff --git a/evalscope/benchmarks/gsm8k/gsm8k_adapter.py b/evalscope/benchmarks/gsm8k/gsm8k_adapter.py index 5aa67e97..450df31d 100644 --- a/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +++ b/evalscope/benchmarks/gsm8k/gsm8k_adapter.py @@ -17,13 +17,14 @@ @Benchmark.register( name='gsm8k', dataset_id='modelscope/gsm8k', + model_adapter=ChatGenerationModelAdapter, subset_list=['main'], metric_list=[WeightedAverageAccuracy], few_shot_num=4, train_split='train', eval_split='test', prompt_template='', - model_adapter=ChatGenerationModelAdapter) +) class GSM8KAdapter(DataAdapter): def __init__(self, **kwargs): diff --git a/evalscope/benchmarks/hellaswag/__init__.py b/evalscope/benchmarks/hellaswag/__init__.py index 5899f3de..b937315b 100644 --- a/evalscope/benchmarks/hellaswag/__init__.py +++ b/evalscope/benchmarks/hellaswag/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.hellaswag.hellaswag_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter -from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter as DataAdapterClass -from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/hellaswag/hellaswag_adapter.py b/evalscope/benchmarks/hellaswag/hellaswag_adapter.py index 4d5f7ef0..afae5570 100644 --- a/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +++ b/evalscope/benchmarks/hellaswag/hellaswag_adapter.py @@ -3,9 +3,9 @@ import os import re -from evalscope.benchmarks.data_adapter import DataAdapter -from evalscope.metrics.metrics import exact_match, weighted_mean -from evalscope.utils import normalize_score +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.metrics import WeightedAverageAccuracy, exact_match +from evalscope.models import ContinuationLogitsModelAdapter from evalscope.utils.io_utils import jsonl_to_list from evalscope.utils.logger import get_logger @@ -13,44 +13,30 @@ logger = get_logger() -DATASET_ID = 'modelscope/hellaswag' -SUBSET_LIST = ['default'] - +@Benchmark.register( + name='hellaswag', + dataset_id='modelscope/hellaswag', + model_adapter=ContinuationLogitsModelAdapter, + subset_list=['default'], + metric_list=[WeightedAverageAccuracy], + few_shot_num=0, + train_split='train', + eval_split='validation', + prompt_template='', +) class HellaSwagAdapter(DataAdapter): choices = ['0', '1', '2', '3'] - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = 'train', - eval_split: str = 'validation', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] - - if few_shot_num is None: - # Use 0-shot by default - logger.info(f'Set 0-shot examples by system for HellaSwag.') - few_shot_num = 0 + def __init__(self, **kwargs): + few_shot_num = kwargs.get('few_shot_num', None) if few_shot_num != 0: logger.warning(f'few_shot_num should be 0 for HellaSwag, but got {few_shot_num}. 
Use 0-shot by default.') few_shot_num = 0 - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -136,66 +122,6 @@ def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: s def match(self, gold: str, pred: str) -> float: return exact_match(gold=str(gold), pred=str(pred)) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: - """ - Generate the report for the model output. - - Args: - subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...} - report_name: The user-defined report name. - - Returns: A dict of metric calculation results. The format is like: - { - "name":"HellaSwag", - "metric":"WeightedAverageAccuracy", - "score":0.3389, - "category":[ - { - "name":"DEFAULT", - "score":0.4128, - "subset":[ - { - "name":"default", - "score":0.5632 - }, - ] - } - ], - "total_num":7800 - } - """ - total_num: int = sum([num for _, num in subset_score_map.values()]) - weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num - weighted_avg_acc = normalize_score(score=weighted_avg_acc) - cate_avg_list = [{ - 'name': subset_name, - 'score': normalize_score(score=score) - } for subset_name, (score, _) in subset_score_map.items()] - - category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list) - - res_map = dict( - name=report_name or 'hellaswag', - metric=self.metric_list[0]['name'], - score=weighted_avg_acc, - category=[category_d], - total_num=total_num) - - return res_map - @classmethod def _preprocess(cls, text): text = text.strip() diff --git a/evalscope/evaluator/reviewer/auto_reviewer.py b/evalscope/evaluator/reviewer/auto_reviewer.py index 01902f45..4144f111 100644 --- a/evalscope/evaluator/reviewer/auto_reviewer.py +++ b/evalscope/evaluator/reviewer/auto_reviewer.py @@ -11,7 +11,7 @@ from typing import Any, List from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation -from evalscope.models.openai_model import OpenAIModel +from evalscope.models.model import OpenAIModel from evalscope.utils import completion_parsers, random_seeded_choice from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list diff --git a/evalscope/models/__init__.py b/evalscope/models/__init__.py index 9afbad48..8fc22ebf 100644 --- a/evalscope/models/__init__.py +++ b/evalscope/models/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +from evalscope.models.custom import * from evalscope.models.model import BaseModel, ChatBaseModel from evalscope.models.model_adapter import * diff --git a/evalscope/models/api/__init__.py b/evalscope/models/api/__init__.py deleted file mode 100644 index a19bf86e..00000000 --- a/evalscope/models/api/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
- -from evalscope.models.api.openai_api import OpenaiApi diff --git a/evalscope/models/dummy_chat_model.py b/evalscope/models/dummy_chat_model.py deleted file mode 100644 index 578b5f59..00000000 --- a/evalscope/models/dummy_chat_model.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -import random -import time - -from evalscope.models import ChatBaseModel -from evalscope.utils.logger import get_logger - -logger = get_logger() - - -class DummyChatModel(ChatBaseModel): - - MODEL_ID = 'dummy_chat_model_0801' - REVISION = 'v1.0.0' - - def __init__(self, model_cfg: dict, **kwargs): - model_cfg['model_id'] = self.MODEL_ID - model_cfg['revision'] = self.REVISION - super(DummyChatModel, self).__init__(model_cfg=model_cfg) - - def predict(self, inputs: dict, **kwargs) -> dict: - - debug: bool = False - if debug: - messages = inputs['messages'] - history = inputs['history'] - - logger.info(f'** messages: {messages}') - logger.info(f'** history: {history}') - - choice = random.choice(['A', 'B', 'C', 'D']) - - # Build response - res = { - 'choices': [{ - 'index': 0, - 'message': { - 'content': choice, - 'role': 'assistant' - } - }], - 'created': time.time(), - 'model': self.MODEL_ID + '-' + self.REVISION, - 'object': 'chat.completion', - 'usage': {} - } - - return res diff --git a/evalscope/models/model.py b/evalscope/models/model.py index 826fb879..7f32f7b9 100644 --- a/evalscope/models/model.py +++ b/evalscope/models/model.py @@ -1,7 +1,15 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import openai +import os +import random +import time from abc import ABC, abstractmethod from typing import Any +from evalscope.utils.logger import get_logger + +logger = get_logger() + class BaseModel(ABC): @@ -86,3 +94,136 @@ def predict(self, inputs: dict, **kwargs) -> dict: } """ raise NotImplementedError + + +class OpenAIModel(ChatBaseModel): + """ + APIs of OpenAI models. 
+ Available models: gpt-3.5-turbo, gpt-4 + """ + + MAX_RETRIES = 3 + + def __init__(self, model_cfg: dict, **kwargs): + super(OpenAIModel, self).__init__(model_cfg=model_cfg, **kwargs) + + openai_api_key = os.environ.get('OPENAI_API_KEY', None) + self.api_key = self.model_cfg.get('api_key', openai_api_key) + + if not self.api_key: + logger.error('OpenAI API key is not provided, please set it in environment variable OPENAI_API_KEY') + # raise ValueError( + # 'OpenAI API key is not provided, ' + # 'please set it in environment variable OPENAI_API_KEY') + + def predict(self, model_id: str, inputs: dict, **kwargs) -> dict: + + sys_prompt: str = inputs.get('sys_prompt', '') + user_prompt: str = inputs.get('user_prompt', '') + + # model_id: str = kwargs.get('model_id', '') + temperature: float = kwargs.pop('temperature', 0.2) + max_tokens: int = kwargs.pop('max_tokens', 1024) + mode: str = kwargs.pop('mode', 'chat.completion') + + logger.info(f'Using OpenAI model_id: {model_id}') + + res = self._predict( + model_id=model_id, + sys_prompt=sys_prompt, + user_prompt=user_prompt, + temperature=temperature, + max_tokens=max_tokens, + mode=mode) + + return res + + def _predict( + self, + model_id, + sys_prompt, + user_prompt, + temperature, + max_tokens, + mode: str = 'chat.completion', + ) -> dict: + + res = {} + openai.api_key = self.api_key + + for i in range(self.MAX_RETRIES): + try: + if mode == 'chat.completion': + resp = openai.ChatCompletion.create( + model=model_id, + messages=[{ + 'role': 'system', + 'content': sys_prompt + }, { + 'role': 'user', + 'content': user_prompt + }], + temperature=temperature, + max_tokens=max_tokens) + + if resp: + ans_text = resp['choices'][0]['message']['content'] + model_id = resp['model'] + else: + logger.warning(f'OpenAI GPT API call failed: got empty response ' + f'for input {sys_prompt} {user_prompt}') + ans_text = '' + model_id = '' + + res['ans_text'] = ans_text + res['model_id'] = model_id + else: + raise ValueError(f'Invalid mode: {mode}') + + return res + + except Exception as e: + logger.warning(f'OpenAI API call failed: {e}') + time.sleep(3) + logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries') + return res + + +class DummyChatModel(ChatBaseModel): + + MODEL_ID = 'dummy_chat_model_0801' + REVISION = 'v1.0.0' + + def __init__(self, model_cfg: dict, **kwargs): + model_cfg['model_id'] = self.MODEL_ID + model_cfg['revision'] = self.REVISION + super(DummyChatModel, self).__init__(model_cfg=model_cfg) + + def predict(self, inputs: dict, **kwargs) -> dict: + + debug: bool = False + if debug: + messages = inputs['messages'] + history = inputs['history'] + + logger.info(f'** messages: {messages}') + logger.info(f'** history: {history}') + + choice = random.choice(['A', 'B', 'C', 'D']) + + # Build response + res = { + 'choices': [{ + 'index': 0, + 'message': { + 'content': choice, + 'role': 'assistant' + } + }], + 'created': time.time(), + 'model': self.MODEL_ID + '-' + self.REVISION, + 'object': 'chat.completion', + 'usage': {} + } + + return res diff --git a/evalscope/models/model_adapter.py b/evalscope/models/model_adapter.py index d52bdf72..a3d56500 100644 --- a/evalscope/models/model_adapter.py +++ b/evalscope/models/model_adapter.py @@ -3,11 +3,9 @@ # flake8: noqa import numpy as np import os -import sys import time import torch from abc import ABC, abstractmethod -from copy import deepcopy from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig from torch import dtype from typing import Any, Dict, List, Union @@ 
-21,77 +19,39 @@ logger = get_logger() -class BaseModelAdapter(ABC): - """ - Base class for model adapter. - """ - - def __init__(self, model, tokenizer, model_cfg: dict): - """ - Args: - model: The model instance which is compatible with - AutoModel/AutoModelForCausalLM/AutoModelForSeq2SeqLM of transformers. - tokenizer: The tokenizer instance which is compatible with AutoTokenizer of transformers. - model_cfg: - Attributes: model_id, model_revision, device_map, torch_dtype - """ - self.model = model - self.tokenizer = tokenizer - self.model_cfg = model_cfg - - @abstractmethod - @torch.no_grad() - def predict(self, *args, **kwargs) -> Any: - """ - Model prediction func. - """ - raise NotImplementedError - - -class MultiChoiceModelAdapter(BaseModelAdapter): - """ The multi-choice model adapter. """ - - _DEFAULT_MAX_LENGTH = 2048 +class LocalModel: def __init__(self, model_id: str, + model_revision: str = 'master', device_map: str = 'auto', torch_dtype: dtype = torch.bfloat16, - model_revision: str = None, - max_length: int = None, cache_dir: str = None, **kwargs): """ Args: - model_id: The model id on ModelScope, or local model_dir. TODO: torch.nn.module to be supported. + model_id: The model id on ModelScope, or local model_dir. + model_revision: The model revision on ModelScope. device_map: The device map for model inference. - torch_dtype: The torch dtype for model inference. Default: torch.bfloat16. - model_revision: The model revision on ModelScope. Default: None. - max_length: The max length of input sequence. Default: None. - **kwargs: Other args. + torch_dtype: The torch dtype for model inference. + cache_dir: Directory to cache the models. """ model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR - self.model_id: str = model_id + self.model_id = model_id + self.model_revision = model_revision self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - logger.warning(f'Device: {self.device}') - - torch_dtype = torch_dtype if torch_dtype is not None else 'auto' - - model_cfg: dict = dict() - model_cfg['model_id'] = model_id - model_cfg['device_map'] = device_map - model_cfg['torch_dtype'] = str(torch_dtype) + logger.info(f'Device: {self.device}') - tokenizer = AutoTokenizer.from_pretrained( - self.model_id, # self.model_id + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, revision=model_revision, trust_remote_code=True, cache_dir=model_cache_dir, ) - model = AutoModelForCausalLM.from_pretrained( - self.model_id, # self.model_id + self.model = AutoModelForCausalLM.from_pretrained( + self.model_id, revision=model_revision, device_map=device_map, trust_remote_code=True, @@ -99,9 +59,54 @@ def __init__(self, cache_dir=model_cache_dir, ) - super().__init__(model=model, tokenizer=tokenizer, model_cfg=model_cfg) + self.model_cfg = { + 'model_id': model_id, + 'device_map': device_map, + 'torch_dtype': str(torch_dtype), + } + + +class BaseModelAdapter(ABC): + """ + Base class for model adapter. + """ + + def __init__(self, model: Union[LocalModel, CustomModel], **kwargs): + """ + Args: + model: The model instance which is compatible with + AutoModel/AutoModelForCausalLM/AutoModelForSeq2SeqLM of transformers. 
+ """ + if isinstance(model, LocalModel): + self.model = model.model + self.model_id = model.model_id + self.model_revision = model.model_revision + self.device = model.device + self.tokenizer = model.tokenizer + self.model_cfg = model.model_cfg + elif isinstance(model, CustomModel): + pass + else: + raise ValueError(f'Unsupported model type: {type(model)}') + + @abstractmethod + @torch.no_grad() + def predict(self, *args, **kwargs) -> Any: + """ + Model prediction func. + """ + raise NotImplementedError + - self._max_length = max_length +class MultiChoiceModelAdapter(BaseModelAdapter): + """ The multi-choice model adapter. """ + + _DEFAULT_MAX_LENGTH = 2048 + + def __init__(self, model: LocalModel, **kwargs): + super().__init__(model) + + self._max_length = kwargs.get('max_length') @property def max_length(self): @@ -198,32 +203,12 @@ def _get_logits(tokenizer, model, inputs: List[str]): class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter): + """ + Continuation-logits model adapter. + """ - def __init__(self, - model_id: str, - device_map: str = 'auto', - torch_dtype: dtype = torch.bfloat16, - model_revision: str = None, - cache_dir: str = None, - **kwargs): - """ - Continuation-logits model adapter. - - Args: - model_id: The model id on ModelScope, or local model_dir. - device_map: The device map for model inference. - torch_dtype: The torch dtype for model inference. Default: torch.bfloat16. - model_revision: The model revision on ModelScope. Default: None. - **kwargs: Other args. - """ - - super().__init__( - model_id=model_id, - device_map=device_map, - torch_dtype=torch_dtype, - model_revision=model_revision, - cache_dir=cache_dir, - **kwargs) + def __init__(self, model: LocalModel, **kwargs): + super().__init__(model, **kwargs) @torch.no_grad() def predict(self, inputs: dict, infer_cfg: dict = None) -> dict: @@ -321,69 +306,26 @@ def _encode_pair(self, context, continuation): class ChatGenerationModelAdapter(BaseModelAdapter): + """ + Chat generation model adapter. + """ - def __init__(self, - model_id: str, - model_revision: str = 'master', - device_map: str = 'auto', - torch_dtype: dtype = 'auto', - cache_dir: str = None, - **kwargs): - """ - Chat completion model adapter. Tasks of chat and generation are supported. + def __init__(self, model: LocalModel, **kwargs): + super().__init__(model) - Args: - model_id: The model id on ModelScope, or local model_dir. - model_revision: The model revision on ModelScope. Default: None. - device_map: The device map for model inference. - torch_dtype: The torch dtype for model inference. Default: 'auto'. - **kwargs: Other args. 
- """ + self.generation_config = self._parse_generation_config(self.tokenizer, self.model) custom_generation_config = kwargs.pop('generation_config', None) custom_chat_template = kwargs.pop('chat_template', None) - model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR - - self.model_id: str = model_id - self.model_revision: str = model_revision - self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - logger.warning(f'Device: {self.device}') - - torch_dtype = torch_dtype if torch_dtype is not None else 'auto' - - model_cfg: dict = dict() - model_cfg['model_id'] = model_id - model_cfg['device_map'] = device_map - model_cfg['torch_dtype'] = str(torch_dtype) - - tokenizer = AutoTokenizer.from_pretrained( - self.model_id, - revision=model_revision, - trust_remote_code=True, - cache_dir=model_cache_dir, - ) - - model = AutoModelForCausalLM.from_pretrained( - self.model_id, - revision=model_revision, - device_map=device_map, - trust_remote_code=True, - torch_dtype=torch_dtype, - cache_dir=model_cache_dir, - ) - - self.generation_config = self._parse_generation_config(tokenizer, model) if custom_generation_config: logger.info('Updating generation config ...') self.generation_config.update(**custom_generation_config) if custom_chat_template: - tokenizer.chat_template = custom_chat_template + self.tokenizer.chat_template = custom_chat_template logger.info(f'Using custom chat template: {custom_chat_template}') - super().__init__(model=model, tokenizer=tokenizer, model_cfg=model_cfg) - def _parse_generation_config(self, tokenizer, model): generation_config = getattr(model, 'generation_config', GenerationConfig(do_sample=False)) @@ -473,7 +415,7 @@ def __init__(self, custom_model: CustomModel, **kwargs): **kwargs: Other args. """ self.custom_model = custom_model - super(CustomModelAdapter, self).__init__(model=None, tokenizer=None, model_cfg=custom_model.config) + super(CustomModelAdapter, self).__init__(model=custom_model) def predict(self, inputs: Union[str, dict, list], **kwargs) -> List[Dict[str, Any]]: """ diff --git a/evalscope/models/openai_model.py b/evalscope/models/openai_model.py deleted file mode 100644 index 3caa9c4b..00000000 --- a/evalscope/models/openai_model.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -import openai -import os -import time - -from evalscope.models import ChatBaseModel -from evalscope.utils.logger import get_logger - -logger = get_logger() - - -class OpenAIModel(ChatBaseModel): - """ - APIs of OpenAI models. 
- Available models: gpt-3.5-turbo, gpt-4 - """ - - MAX_RETRIES = 3 - - def __init__(self, model_cfg: dict, **kwargs): - super(OpenAIModel, self).__init__(model_cfg=model_cfg, **kwargs) - - openai_api_key = os.environ.get('OPENAI_API_KEY', None) - self.api_key = self.model_cfg.get('api_key', openai_api_key) - - if not self.api_key: - logger.error('OpenAI API key is not provided, please set it in environment variable OPENAI_API_KEY') - # raise ValueError( - # 'OpenAI API key is not provided, ' - # 'please set it in environment variable OPENAI_API_KEY') - - def predict(self, model_id: str, inputs: dict, **kwargs) -> dict: - - sys_prompt: str = inputs.get('sys_prompt', '') - user_prompt: str = inputs.get('user_prompt', '') - - # model_id: str = kwargs.get('model_id', '') - temperature: float = kwargs.pop('temperature', 0.2) - max_tokens: int = kwargs.pop('max_tokens', 1024) - mode: str = kwargs.pop('mode', 'chat.completion') - - logger.info(f'Using OpenAI model_id: {model_id}') - - res = self._predict( - model_id=model_id, - sys_prompt=sys_prompt, - user_prompt=user_prompt, - temperature=temperature, - max_tokens=max_tokens, - mode=mode) - - return res - - def _predict( - self, - model_id, - sys_prompt, - user_prompt, - temperature, - max_tokens, - mode: str = 'chat.completion', - ) -> dict: - - res = {} - openai.api_key = self.api_key - - for i in range(self.MAX_RETRIES): - try: - if mode == 'chat.completion': - resp = openai.ChatCompletion.create( - model=model_id, - messages=[{ - 'role': 'system', - 'content': sys_prompt - }, { - 'role': 'user', - 'content': user_prompt - }], - temperature=temperature, - max_tokens=max_tokens) - - if resp: - ans_text = resp['choices'][0]['message']['content'] - model_id = resp['model'] - else: - logger.warning(f'OpenAI GPT API call failed: got empty response ' - f'for input {sys_prompt} {user_prompt}') - ans_text = '' - model_id = '' - - res['ans_text'] = ans_text - res['model_id'] = model_id - else: - raise ValueError(f'Invalid mode: {mode}') - - return res - - except Exception as e: - logger.warning(f'OpenAI API call failed: {e}') - time.sleep(3) - logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries') - return res diff --git a/evalscope/run.py b/evalscope/run.py index 3b531732..e7473293 100644 --- a/evalscope/run.py +++ b/evalscope/run.py @@ -13,7 +13,7 @@ from evalscope.config import TaskConfig, parse_task_config from evalscope.constants import DEFAULT_MODEL_REVISION, DEFAULT_WORK_DIR, EvalBackend, EvalType from evalscope.evaluator import Evaluator -from evalscope.models.custom import CustomModel +from evalscope.models import CustomModel, LocalModel from evalscope.utils import seed_everything from evalscope.utils.io_utils import OutputsStructure, are_paths_same from evalscope.utils.logger import configure_logging, get_logger @@ -99,22 +99,21 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict: """Evaluate the model based on the provided task configuration.""" # Initialize evaluator eval_results = {} - + base_model = get_base_model(task_cfg) for dataset_name in task_cfg.datasets: - evaluator = create_evaluator(task_cfg, dataset_name, outputs) + evaluator = create_evaluator(task_cfg, dataset_name, outputs, base_model) res_dict = evaluator.eval(infer_cfg=task_cfg.generation_config, debug=task_cfg.debug, limit=task_cfg.limit) eval_results[dataset_name] = res_dict return eval_results -def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure): +def create_evaluator(task_cfg: 
TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: LocalModel): """Create an evaluator object for the specified dataset.""" - # imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, MEMBERS_TO_IMPORT) benchmark: BenchmarkMeta = Benchmark.get(dataset_name) data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args) - model_adapter = initialize_model_adapter(task_cfg, model_adapter_cls=benchmark.model_adapter) + model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model) return Evaluator( dataset_name_or_path=benchmark.dataset_id, @@ -131,10 +130,31 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt ) -def initialize_model_adapter(task_cfg: TaskConfig, model_adapter_cls): +def get_base_model(task_cfg: TaskConfig) -> Optional[LocalModel]: + """Get the base model for the task.""" + if task_cfg.eval_type != EvalType.CHECKPOINT: + return None + else: + device_map = task_cfg.model_args.get('device_map', 'auto') if torch.cuda.is_available() else None + cache_dir = task_cfg.model_args.get('cache_dir', None) + model_precision = task_cfg.model_args.get('precision', torch.float16) + model_revision = task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION) + if isinstance(model_precision, str) and model_precision != 'auto': + model_precision = eval(model_precision) + + base_model = LocalModel( + model_id=task_cfg.model, + model_revision=model_revision, + device_map=device_map, + torch_dtype=model_precision, + cache_dir=cache_dir) + return base_model + + +def initialize_model_adapter(task_cfg: TaskConfig, model_adapter_cls, base_model: LocalModel): """Initialize the model adapter based on the task configuration.""" if task_cfg.dry_run: - from evalscope.models.dummy_chat_model import DummyChatModel + from evalscope.models.model import DummyChatModel return DummyChatModel(model_cfg=dict()) elif task_cfg.eval_type == EvalType.CUSTOM: if not isinstance(task_cfg.model, CustomModel): @@ -142,15 +162,8 @@ def initialize_model_adapter(task_cfg: TaskConfig, model_adapter_cls): from evalscope.models.model_adapter import CustomModelAdapter return CustomModelAdapter(custom_model=task_cfg.model) else: - device_map = task_cfg.model_args.get('device_map', 'auto') if torch.cuda.is_available() else None - model_precision = task_cfg.model_args.get('precision', torch.float16) - if isinstance(model_precision, str) and model_precision != 'auto': - model_precision = eval(model_precision) return model_adapter_cls( - model_id=task_cfg.model, - model_revision=task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION), - device_map=device_map, - torch_dtype=model_precision, + model=base_model or get_base_model(task_cfg), generation_config=task_cfg.generation_config, chat_template=task_cfg.chat_template) diff --git a/evalscope/models/api/openai_api.py b/evalscope/third_party/longbench_write/tools/openai_api.py similarity index 100% rename from evalscope/models/api/openai_api.py rename to evalscope/third_party/longbench_write/tools/openai_api.py diff --git a/tests/cli/test_run.py b/tests/cli/test_run.py index caaac0b2..9370e2bb 100644 --- a/tests/cli/test_run.py +++ b/tests/cli/test_run.py @@ -70,7 +70,7 @@ def test_run_eval_with_args(self): @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level') def test_run_task(self): - task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['gsm8k', 'arc'], 'limit': 2, 'debug': True} + task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': 
['hellaswag', 'gsm8k', 'arc'], 'limit': 2, 'debug': True} run_task(task_cfg=task_cfg) From a3b9b9f396bef9d416e8152f6a7977d2f80a3666 Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Fri, 20 Dec 2024 13:19:12 +0800 Subject: [PATCH 06/15] split model adapter --- evalscope/benchmarks/arc/arc_adapter.py | 9 +- evalscope/benchmarks/bbh/__init__.py | 4 - evalscope/benchmarks/ceval/__init__.py | 5 - evalscope/benchmarks/cmmlu/__init__.py | 5 - .../benchmarks/competition_math/__init__.py | 5 - evalscope/benchmarks/data_adapter.py | 8 +- evalscope/benchmarks/general_qa/__init__.py | 5 - .../benchmarks/hellaswag/hellaswag_adapter.py | 9 +- evalscope/benchmarks/humaneval/__init__.py | 4 - evalscope/benchmarks/mmlu/__init__.py | 5 - evalscope/benchmarks/race/__init__.py | 5 - evalscope/benchmarks/trivia_qa/__init__.py | 5 - evalscope/benchmarks/truthful_qa/__init__.py | 5 - evalscope/config.py | 4 +- evalscope/constants.py | 3 +- evalscope/evaluator/evaluator.py | 39 +- evalscope/evaluator/reviewer/auto_reviewer.py | 14 +- evalscope/models/__init__.py | 17 +- evalscope/models/base_adapter.py | 27 + evalscope/models/chat_adapter.py | 108 ++++ evalscope/models/choice_adapter.py | 214 ++++++++ evalscope/models/custom_adapter.py | 67 +++ evalscope/models/local_model.py | 47 ++ evalscope/models/model_adapter.py | 467 ------------------ evalscope/models/server_adapter.py | 80 +++ evalscope/run.py | 16 +- evalscope/utils/chat_service.py | 4 +- 27 files changed, 612 insertions(+), 569 deletions(-) create mode 100644 evalscope/models/base_adapter.py create mode 100644 evalscope/models/chat_adapter.py create mode 100644 evalscope/models/choice_adapter.py create mode 100644 evalscope/models/custom_adapter.py create mode 100644 evalscope/models/local_model.py delete mode 100644 evalscope/models/model_adapter.py create mode 100644 evalscope/models/server_adapter.py diff --git a/evalscope/benchmarks/arc/arc_adapter.py b/evalscope/benchmarks/arc/arc_adapter.py index e00cf784..eb470c9a 100644 --- a/evalscope/benchmarks/arc/arc_adapter.py +++ b/evalscope/benchmarks/arc/arc_adapter.py @@ -4,6 +4,7 @@ import os from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.constants import EvalType from evalscope.metrics import WeightedAverageAccuracy, exact_match from evalscope.models import MultiChoiceModelAdapter from evalscope.utils import ResponseParser @@ -119,7 +120,7 @@ def get_gold_answer(self, input_d: dict) -> str: # Get the gold choice return input_d.get('answerKey', '') - def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str: + def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str: """ Parse the model output to get the answer. Could be the best choice index. @@ -131,12 +132,12 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st Returns: The parsed answer. Depending on the dataset. Usually a string for chat. """ - if eval_type == 'checkpoint': + if eval_type == EvalType.CHECKPOINT: return result - elif eval_type == 'service': + elif eval_type == EvalType.SERVICE: return ResponseParser.parse_first_option_with_choices( text=result, options=self.choices) # TODO: to be checked ! - elif eval_type == 'custom': + elif eval_type == EvalType.CUSTOM: return ResponseParser.parse_first_option_with_choices( text=result, options=self.choices) # TODO: to be checked ! 
else: diff --git a/evalscope/benchmarks/bbh/__init__.py b/evalscope/benchmarks/bbh/__init__.py index 7387c94c..b937315b 100644 --- a/evalscope/benchmarks/bbh/__init__.py +++ b/evalscope/benchmarks/bbh/__init__.py @@ -1,5 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.bbh.bbh_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.bbh.bbh_adapter import BBHAdapter as DataAdapterClass -from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/ceval/__init__.py b/evalscope/benchmarks/ceval/__init__.py index b7532a3d..b937315b 100644 --- a/evalscope/benchmarks/ceval/__init__.py +++ b/evalscope/benchmarks/ceval/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.ceval.ceval_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST -from evalscope.benchmarks.ceval.ceval_adapter import CEVALAdapter -from evalscope.benchmarks.ceval.ceval_adapter import CEVALAdapter as DataAdapterClass -from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/cmmlu/__init__.py b/evalscope/benchmarks/cmmlu/__init__.py index 864f8469..b937315b 100644 --- a/evalscope/benchmarks/cmmlu/__init__.py +++ b/evalscope/benchmarks/cmmlu/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.cmmlu.cmmlu_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST -from evalscope.benchmarks.cmmlu.cmmlu_adapter import CMMLUAdapter -from evalscope.benchmarks.cmmlu.cmmlu_adapter import CMMLUAdapter as DataAdapterClass -from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/competition_math/__init__.py b/evalscope/benchmarks/competition_math/__init__.py index 85efbf4f..b937315b 100644 --- a/evalscope/benchmarks/competition_math/__init__.py +++ b/evalscope/benchmarks/competition_math/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.competition_math.competition_math_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.competition_math.competition_math_adapter import CompetitionMathAdapter -from evalscope.benchmarks.competition_math.competition_math_adapter import CompetitionMathAdapter as DataAdapterClass -from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/data_adapter.py b/evalscope/benchmarks/data_adapter.py index 18f823ed..468b47a4 100644 --- a/evalscope/benchmarks/data_adapter.py +++ b/evalscope/benchmarks/data_adapter.py @@ -5,7 +5,7 @@ from modelscope.msdatasets import MsDataset from typing import Any, Optional -from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, HubType +from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType from evalscope.utils import normalize_score from evalscope.utils.logger import get_logger @@ -265,7 +265,7 @@ def get_gold_answer(self, input_d: Any) -> Any: raise NotImplementedError @abstractmethod - def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> Any: + def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any: """ Parse the predicted result and extract proper answer. 
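# Rough per-sample flow for an adapter that follows this contract (the
# ARC-style fields, the raw prediction string and the resulting score are
# illustrative assumptions):
#
#   gold = adapter.get_gold_answer({'answerKey': 'A'})                      # -> 'A'
#   pred = adapter.parse_pred_result('The answer is A', eval_type=EvalType.SERVICE)
#   score = adapter.match(gold, pred)                                       # -> 1.0 for an exact match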
@@ -286,9 +286,9 @@ def match(self, gold: Any, pred: Any) -> Any: Args: gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions. - e.g. 'A' + e.g. 'A', extracted from get_gold_answer method. pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions. - e.g. 'B' + e.g. 'B', extracted from parse_pred_result method. Returns: The match result. Usually a score (float) for chat/multiple-choice-questions. diff --git a/evalscope/benchmarks/general_qa/__init__.py b/evalscope/benchmarks/general_qa/__init__.py index 2e732005..b937315b 100644 --- a/evalscope/benchmarks/general_qa/__init__.py +++ b/evalscope/benchmarks/general_qa/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.general_qa.general_qa_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter -from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter as DataAdapterClass -from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass diff --git a/evalscope/benchmarks/hellaswag/hellaswag_adapter.py b/evalscope/benchmarks/hellaswag/hellaswag_adapter.py index afae5570..faafc96b 100644 --- a/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +++ b/evalscope/benchmarks/hellaswag/hellaswag_adapter.py @@ -4,6 +4,7 @@ import re from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.constants import EvalType from evalscope.metrics import WeightedAverageAccuracy, exact_match from evalscope.models import ContinuationLogitsModelAdapter from evalscope.utils.io_utils import jsonl_to_list @@ -92,7 +93,7 @@ def get_gold_answer(self, input_d: dict) -> str: # Get the gold choice return input_d['label'] - def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str: + def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str: """ Parse the model output to get the answer. Could be the best choice index. @@ -104,7 +105,7 @@ def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: s Returns: The parsed answer. Depending on the dataset. Usually a string for chat. """ - if eval_type == 'checkpoint': + if eval_type == EvalType.CHECKPOINT: # answer: in the form of [-2.3, -4.5, ...], len of self.choices result = np.array(result) endings: list = [self._preprocess(ending) for ending in raw_input_d['endings']] @@ -112,9 +113,9 @@ def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: s best_choice_idx = np.argmax(result / completion_len) return str(best_choice_idx) - elif eval_type == 'service': + elif eval_type == EvalType.SERVICE: return result # TODO: to be supported ! - elif eval_type == 'custom': + elif eval_type == EvalType.CUSTOM: return result # TODO: to be supported ! else: raise ValueError(f'Invalid eval_type: {eval_type}') diff --git a/evalscope/benchmarks/humaneval/__init__.py b/evalscope/benchmarks/humaneval/__init__.py index 176dd8f6..b937315b 100644 --- a/evalscope/benchmarks/humaneval/__init__.py +++ b/evalscope/benchmarks/humaneval/__init__.py @@ -1,5 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
- -from evalscope.benchmarks.humaneval.humaneval_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.humaneval.humaneval_adapter import HumanevalAdapter as DataAdapterClass -from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/mmlu/__init__.py b/evalscope/benchmarks/mmlu/__init__.py index c112533f..b937315b 100644 --- a/evalscope/benchmarks/mmlu/__init__.py +++ b/evalscope/benchmarks/mmlu/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.mmlu.mmlu_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST -from evalscope.benchmarks.mmlu.mmlu_adapter import MMLUAdapter -from evalscope.benchmarks.mmlu.mmlu_adapter import MMLUAdapter as DataAdapterClass -from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/race/__init__.py b/evalscope/benchmarks/race/__init__.py index f4290c4f..b937315b 100644 --- a/evalscope/benchmarks/race/__init__.py +++ b/evalscope/benchmarks/race/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.race.race_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST -from evalscope.benchmarks.race.race_adapter import RACEAdapter -from evalscope.benchmarks.race.race_adapter import RACEAdapter as DataAdapterClass -from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/trivia_qa/__init__.py b/evalscope/benchmarks/trivia_qa/__init__.py index 50875493..b937315b 100644 --- a/evalscope/benchmarks/trivia_qa/__init__.py +++ b/evalscope/benchmarks/trivia_qa/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter -from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter as DataAdapterClass -from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/truthful_qa/__init__.py b/evalscope/benchmarks/truthful_qa/__init__.py index 1fbe8879..b937315b 100644 --- a/evalscope/benchmarks/truthful_qa/__init__.py +++ b/evalscope/benchmarks/truthful_qa/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
- -from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter -from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter as DataAdapterClass -from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/config.py b/evalscope/config.py index 3e8652ce..a768bc20 100644 --- a/evalscope/config.py +++ b/evalscope/config.py @@ -72,9 +72,10 @@ def __post_init__(self): self.model_id = type(self.model).__name__ else: self.model_id = os.path.basename(self.model).rstrip(os.sep) + # Convert Enum to string + self.eval_backend = str(self.eval_backend) def to_dict(self): - # Note: to avoid serialization error for some model instance return self.__dict__ def __str__(self): @@ -129,6 +130,7 @@ def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']: continue task.model = custom_model + task.model_args = custom_model.config task.model_id = type(custom_model).__name__ res_list.append(task) diff --git a/evalscope/constants.py b/evalscope/constants.py index be8d00ed..f6152ac0 100644 --- a/evalscope/constants.py +++ b/evalscope/constants.py @@ -135,7 +135,8 @@ class EvalStage: class EvalType: CUSTOM = 'custom' - CHECKPOINT = 'checkpoint' + CHECKPOINT = 'checkpoint' # native model checkpoint + SERVICE = 'service' # model service class EvalBackend: diff --git a/evalscope/evaluator/evaluator.py b/evalscope/evaluator/evaluator.py index f894411d..88c18946 100644 --- a/evalscope/evaluator/evaluator.py +++ b/evalscope/evaluator/evaluator.py @@ -10,9 +10,8 @@ from evalscope.benchmarks import DataAdapter from evalscope.config import TaskConfig -from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, AnswerKeys, DumpMode, EvalStage, EvalType, HubType, - ReviewKeys) -from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter +from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys +from evalscope.models import BaseModelAdapter, CustomModelAdapter from evalscope.tools.combine_reports import gen_table from evalscope.utils import dict_torch_dtype_to_str, gen_hash from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list @@ -45,44 +44,36 @@ class Evaluator(object): def __init__(self, dataset_name_or_path: str, data_adapter: DataAdapter, - subset_list: Optional[list] = None, - model_adapter: Optional[BaseModelAdapter] = None, - use_cache: Optional[str] = None, - outputs: Optional[OutputsStructure] = None, - datasets_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR, - datasets_hub: Optional[str] = HubType.MODELSCOPE, - stage: Optional[str] = EvalStage.ALL, - eval_type: Optional[str] = EvalType.CHECKPOINT, - overall_task_cfg: Optional[TaskConfig] = None, + model_adapter: BaseModelAdapter, + subset_list: list = None, + outputs: OutputsStructure = None, + task_cfg: TaskConfig = None, **kwargs): self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path) self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep)).split('.')[0] - self.model_name = overall_task_cfg.model_id + self.model_name = task_cfg.model_id self.custom_task_name = f'{self.model_name}_{self.dataset_name}' - self.datasets_dir = os.path.expanduser(datasets_dir) + self.datasets_dir = os.path.expanduser(task_cfg.dataset_dir) self.kwargs = kwargs self.data_adapter = data_adapter self.model_adapter = model_adapter - self.eval_type = eval_type - self.stage 
= stage - self.use_cache = use_cache - self.overall_task_cfg = overall_task_cfg - if isinstance(self.model_adapter, CustomModelAdapter): - self.overall_task_cfg.model_args = self.model_adapter.custom_model.config - - self.model_cfg = self.model_adapter.model_cfg + self.eval_type = task_cfg.eval_type + self.stage = task_cfg.stage + self.use_cache = task_cfg.use_cache + self.task_cfg = task_cfg + self.model_cfg = model_adapter.model_cfg # Deal with the output paths self.outputs_structure = outputs # Load dataset self.dataset = self.data_adapter.load( - dataset_name_or_path=dataset_name_or_path, + dataset_name_or_path=self.dataset_name_or_path, subset_list=subset_list, work_dir=self.datasets_dir, - datasets_hub=datasets_hub, + datasets_hub=task_cfg.dataset_hub, **kwargs) # Get prompts from dataset diff --git a/evalscope/evaluator/reviewer/auto_reviewer.py b/evalscope/evaluator/reviewer/auto_reviewer.py index 4144f111..bd0e3873 100644 --- a/evalscope/evaluator/reviewer/auto_reviewer.py +++ b/evalscope/evaluator/reviewer/auto_reviewer.py @@ -8,7 +8,7 @@ import time from abc import ABC, abstractmethod from functools import partial -from typing import Any, List +from typing import Any, List, Tuple from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation from evalscope.models.model import OpenAIModel @@ -240,7 +240,15 @@ def get_review_single(self, row: List[dict], dry_run: bool = False, **kwargs): review_text=review_text) return review_result - def _get_review_pair(self, model_a, model_b, question, category, ans1, ans2, dry_run=False, **kwargs) -> (str, Any): + def _get_review_pair(self, + model_a, + model_b, + question, + category, + ans1, + ans2, + dry_run=False, + **kwargs) -> Tuple[str, Any]: input_msg = dict(ques=question, category=category, ans1=ans1, ans2=ans2) if self.reference_list: @@ -263,7 +271,7 @@ def _get_review_pair(self, model_a, model_b, question, category, ans1, ans2, dry result = (result, None) return review_text, *result - def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) -> (str, Any): + def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) -> Tuple[str, Any]: input_msg = dict(ques=question, category=category, ans1=answer) if self.reference_list: diff --git a/evalscope/models/__init__.py b/evalscope/models/__init__.py index 8fc22ebf..90f126ee 100644 --- a/evalscope/models/__init__.py +++ b/evalscope/models/__init__.py @@ -1,5 +1,16 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from evalscope.models.custom import * -from evalscope.models.model import BaseModel, ChatBaseModel -from evalscope.models.model_adapter import * +from evalscope.models.base_adapter import BaseModelAdapter +from evalscope.models.chat_adapter import ChatGenerationModelAdapter +from evalscope.models.choice_adapter import ContinuationLogitsModelAdapter, MultiChoiceModelAdapter +from evalscope.models.custom import CustomModel +from evalscope.models.custom_adapter import CustomModelAdapter +from evalscope.models.local_model import LocalModel +from evalscope.models.model import BaseModel, ChatBaseModel, OpenAIModel +from evalscope.models.server_adapter import ServerModelAdapter + +__all__ = [ + 'CustomModel', 'BaseModel', 'ChatBaseModel', 'OpenAIModel', 'BaseModelAdapter', 'ChatGenerationModelAdapter', + 'MultiChoiceModelAdapter', 'ContinuationLogitsModelAdapter', 'CustomModelAdapter', 'ServerModelAdapter', + 'LocalModel' +] diff --git a/evalscope/models/base_adapter.py b/evalscope/models/base_adapter.py new file mode 100644 index 00000000..ea00c7c1 --- /dev/null +++ b/evalscope/models/base_adapter.py @@ -0,0 +1,27 @@ +import torch +from abc import ABC, abstractmethod +from typing import Any, Union + +from evalscope.models.custom import CustomModel +from evalscope.models.local_model import LocalModel + + +class BaseModelAdapter(ABC): + + def __init__(self, model: Union[LocalModel, CustomModel], **kwargs): + if isinstance(model, LocalModel): + self.model = model.model + self.model_id = model.model_id + self.model_revision = model.model_revision + self.device = model.device + self.tokenizer = model.tokenizer + self.model_cfg = model.model_cfg + elif isinstance(model, CustomModel): + self.model_cfg = model.config + else: + raise ValueError(f'Unsupported model type: {type(model)}') + + @abstractmethod + @torch.no_grad() + def predict(self, *args, **kwargs) -> Any: + raise NotImplementedError diff --git a/evalscope/models/chat_adapter.py b/evalscope/models/chat_adapter.py new file mode 100644 index 00000000..033ee7f3 --- /dev/null +++ b/evalscope/models/chat_adapter.py @@ -0,0 +1,108 @@ +import os +import time +import torch +from modelscope import GenerationConfig +from typing import Union + +from evalscope.models.base_adapter import BaseModelAdapter +from evalscope.models.local_model import LocalModel +from evalscope.utils.chat_service import ChatCompletionResponse, ChatMessage +from evalscope.utils.logger import get_logger +from evalscope.utils.model_utils import fix_do_sample_warning + +logger = get_logger() + + +class ChatGenerationModelAdapter(BaseModelAdapter): + """ + Chat generation model adapter. 
+ """ + + def __init__(self, model: LocalModel, **kwargs): + super().__init__(model) + + self.generation_config = self._parse_generation_config(self.tokenizer, self.model) + + custom_generation_config = kwargs.pop('generation_config', None) + custom_chat_template = kwargs.pop('chat_template', None) + + if custom_generation_config: + logger.info('Updating generation config ...') + self.generation_config.update(**custom_generation_config) + + if custom_chat_template: + self.tokenizer.chat_template = custom_chat_template + logger.info(f'Using custom chat template: {custom_chat_template}') + + def _parse_generation_config(self, tokenizer, model): + generation_config = getattr(model, 'generation_config', GenerationConfig(do_sample=False)) + + try: + remote_config = GenerationConfig.from_pretrained( + self.model_id, revision=self.model_revision, trust_remote_code=True) + generation_config.update(**remote_config.to_dict()) + except Exception: + logger.warning(f'Failed to get generation config of {self.model_id} from model hub, use default.') + + if isinstance(self.model_id, str) and os.path.exists(self.model_id): + logger.warning(f'Got local model dir: {self.model_id}') + + if tokenizer.eos_token_id is not None: + generation_config.eos_token_id = tokenizer.eos_token_id + if tokenizer.pad_token_id is not None: + generation_config.pad_token_id = tokenizer.pad_token_id + if generation_config.max_new_tokens is None: + generation_config.max_new_tokens = 2048 + + return generation_config + + def _model_generate(self, query: str, infer_cfg: dict) -> str: + messages = [ChatMessage(role='user', content=query)] + formatted_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device) + input_ids = inputs['input_ids'] + + # Process infer_cfg + if isinstance(infer_cfg.get('num_return_sequences'), int) and infer_cfg['num_return_sequences'] > 1: + infer_cfg['do_sample'] = True + + # stop settings + stop = infer_cfg.get('stop', None) + eos_token_id = self.tokenizer.encode(stop, add_special_tokens=False)[0] \ + if stop else self.tokenizer.eos_token_id + + if eos_token_id is not None: + infer_cfg['eos_token_id'] = eos_token_id + infer_cfg['pad_token_id'] = eos_token_id # setting eos_token_id as pad token + + self.generation_config.update(**infer_cfg) + fix_do_sample_warning(self.generation_config) + + # Run inference + output_ids = self.model.generate(**inputs, generation_config=self.generation_config) + + response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], skip_special_tokens=True) + return response + + @torch.no_grad() + def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = {}) -> dict: + + # Process inputs + if isinstance(inputs, str): + query = inputs + elif isinstance(inputs, dict): + query = inputs['data'][0] + elif isinstance(inputs, list): + query = '\n'.join(inputs) + else: + raise TypeError(f'Unsupported inputs type: {type(inputs)}') + + response = self._model_generate(query, infer_cfg) + + choices_list = [{'index': 0, 'message': {'content': response, 'role': 'assistant'}}] + + res_d = ChatCompletionResponse( + model=self.model_id, choices=choices_list, object='chat.completion', created=int(time.time()), + usage=None).model_dump(exclude_unset=True) + + return res_d diff --git a/evalscope/models/choice_adapter.py b/evalscope/models/choice_adapter.py new file mode 100644 index 00000000..b2d403e3 --- /dev/null +++ 
b/evalscope/models/choice_adapter.py @@ -0,0 +1,214 @@ +import numpy as np +import time +import torch +from typing import List + +from evalscope.models.base_adapter import BaseModelAdapter +from evalscope.models.local_model import LocalModel +from evalscope.utils.chat_service import ChatCompletionResponse + + +class MultiChoiceModelAdapter(BaseModelAdapter): + """ The multi-choice model adapter. """ + + _DEFAULT_MAX_LENGTH = 2048 + + def __init__(self, model: LocalModel, **kwargs): + super().__init__(model) + + self._max_length = kwargs.get('max_length') + + @property + def max_length(self): + if self._max_length: + return self._max_length + seqlen_config_attrs = ('n_positions', 'max_position_embeddings', 'n_ctx') + for attr in seqlen_config_attrs: + if hasattr(self.model.config, attr): + return getattr(self.model.config, attr) + if hasattr(self.tokenizer, 'model_max_length'): + if self.tokenizer.model_max_length == 1000000000000000019884624838656: + return self._DEFAULT_MAX_LENGTH + return self.tokenizer.model_max_length + return self._DEFAULT_MAX_LENGTH + + @torch.no_grad() + def predict(self, inputs: dict, infer_cfg: dict = None) -> dict: + """ + Multi-choice model prediction func. + + Args: + inputs (dict): The inputs for a doc. Format: + {'data': [full_prompt], 'multi_choices': ['A', 'B', 'C', 'D']} + + infer_cfg (dict): inference configuration. + + Returns: + res (dict): The model prediction results. Format: + { + 'choices': [ + { + 'index': 0, + 'message': { + 'content': [-14.9609, -13.6015, ...], # loglikelihood values for inputs context-continuation pairs. + 'role': 'assistant' + } + } + ], + 'created': 1677664795, + # For models on the ModelScope or HuggingFace, concat model_id and revision with "-". + 'model': 'gpt-3.5-turbo-0613', + 'object': 'chat.completion', + 'usage': { + 'completion_tokens': 17, + 'prompt_tokens': 57, + 'total_tokens': 74 + } + } + """ + infer_cfg = infer_cfg or {} + self.model.generation_config.update(**infer_cfg) + + input_data = inputs['data'] + multi_choices = inputs['multi_choices'] + + output, input_info = self._get_logits(self.tokenizer, self.model, input_data) + assert output.shape[0] == 1 + logits = output.flatten() + + choice_logits = [logits[self.tokenizer(ch)['input_ids'][-1:]] for ch in multi_choices] + softval = torch.nn.functional.softmax(torch.tensor(choice_logits).float(), dim=0) + + if softval.dtype in {torch.bfloat16, torch.float16}: + softval = softval.to(dtype=torch.float32) + probs = softval.detach().cpu().numpy() + pred: str = multi_choices[int(np.argmax(probs))] # Format: A or B or C or D + + res_d = ChatCompletionResponse( + model=self.model_id, + choices=[{ + 'index': 0, + 'message': { + 'content': pred, + 'role': 'assistant' + } + }], + object='chat.completion', + created=int(time.time()), + usage=None).model_dump(exclude_unset=True) + + return res_d + + @staticmethod + def _get_logits(tokenizer, model, inputs: List[str]): + input_ids = tokenizer(inputs, padding=False)['input_ids'] + input_ids = torch.tensor(input_ids, device=model.device) + tokens = {'input_ids': input_ids} + + outputs = model(input_ids)['logits'] + logits = outputs[:, -1, :] + log_probs = torch.nn.functional.softmax(logits, dim=-1) + return log_probs, {'tokens': tokens} + + +class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter): + """ + Continuation-logits model adapter. 
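# Input shapes consumed by the multi-choice adapter above and the
# continuation-logits adapter that follows, per their docstrings (prompts are
# shortened placeholders):
multi_choice_inputs = {
    'data': ['Question: ...\nA. ...\nB. ...\nC. ...\nD. ...\nAnswer:'],
    'multi_choices': ['A', 'B', 'C', 'D'],
}
continuation_inputs = {
    'data': [('The capital of France is', ' Paris'), ('The capital of France is', ' Berlin')],
}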
+ """ + + def __init__(self, model: LocalModel, **kwargs): + super().__init__(model, **kwargs) + + @torch.no_grad() + def predict(self, inputs: dict, infer_cfg: dict = None) -> dict: + """ + Multi-choice model prediction func. + Args: + inputs (dict): The inputs for a doc. Format: + {'data': [(context, continuation), ...]} + infer_cfg (dict): inference configuration. + Returns: + res (dict): The model prediction results. Format: + { + 'choices': [ + { + 'index': 0, + 'message': { + 'content': [-14.9609, -13.6015, ...], # loglikelihood values for inputs context-continuation pairs. + 'role': 'assistant' + } + } + ], + 'created': 1677664795, + # For models on the ModelScope or HuggingFace, concat model_id and revision with "-". + 'model': 'gpt-3.5-turbo-0613', + 'object': 'chat.completion', + 'usage': { + 'completion_tokens': 17, + 'prompt_tokens': 57, + 'total_tokens': 74 + } + } + """ + infer_cfg = infer_cfg or {} + + pred_list: list = self.loglikelihood(inputs=inputs['data'], infer_cfg=infer_cfg) + + res_d = ChatCompletionResponse( + model=self.model_id, + choices=[{ + 'index': 0, + 'message': { + 'content': pred_list, + 'role': 'assistant' + } + }], + object='chat.completion', + created=int(time.time()), + usage=None).model_dump(exclude_unset=True) + + return res_d + + def loglikelihood(self, inputs: list, infer_cfg: dict = None) -> list: + self.model.generation_config.update(**infer_cfg) + # To predict one doc + doc_ele_pred = [] + for ctx, continuation in inputs: + + # ctx_enc shape: [context_tok_len] cont_enc shape: [continuation_tok_len] + ctx_enc, cont_enc = self._encode_pair(ctx, continuation) + + inputs_tokens = torch.tensor( + (ctx_enc.tolist() + cont_enc.tolist())[-(self.max_length + 1):][:-1], + dtype=torch.long, + device=self.model.device).unsqueeze(0) + + logits = self.model(inputs_tokens)[0] + logits = torch.nn.functional.log_softmax(logits.float(), dim=-1) + + logits = logits[:, -len(cont_enc):, :] + cont_enc = cont_enc.unsqueeze(0).unsqueeze(-1) + logits = torch.gather(logits.cpu(), 2, cont_enc.cpu()).squeeze(-1) + + choice_score = float(logits.sum()) + doc_ele_pred.append(choice_score) + + # e.g. [-2.3, -9.2, -12.9, 1.1], length=len(choices) + return doc_ele_pred + + def _encode_pair(self, context, continuation): + n_spaces = len(context) - len(context.rstrip()) + if n_spaces > 0: + continuation = context[-n_spaces:] + continuation + context = context[:-n_spaces] + + whole_enc = self.tokenizer(context + continuation, padding=False)['input_ids'] + whole_enc = torch.tensor(whole_enc, device=self.device) + + context_enc = self.tokenizer(context, padding=False)['input_ids'] + context_enc = torch.tensor(context_enc, device=self.device) + + context_enc_len = len(context_enc) + continuation_enc = whole_enc[context_enc_len:] + + return context_enc, continuation_enc diff --git a/evalscope/models/custom_adapter.py b/evalscope/models/custom_adapter.py new file mode 100644 index 00000000..fb279feb --- /dev/null +++ b/evalscope/models/custom_adapter.py @@ -0,0 +1,67 @@ +from typing import Any, Dict, List, Union + +from evalscope.models.base_adapter import BaseModelAdapter +from evalscope.models.custom import CustomModel + + +class CustomModelAdapter(BaseModelAdapter): + + def __init__(self, custom_model: CustomModel, **kwargs): + """ + Custom model adapter. + + Args: + custom_model: The custom model instance. + **kwargs: Other args. 
+ """ + self.custom_model = custom_model + super(CustomModelAdapter, self).__init__(model=custom_model) + + def predict(self, inputs: Union[str, dict, list], **kwargs) -> List[Dict[str, Any]]: + """ + Model prediction func. + + Args: + inputs (Union[str, dict, list]): The input data. Depending on the specific model. + str: 'xxx' + dict: {'data': [full_prompt]} + list: ['xxx', 'yyy', 'zzz'] + **kwargs: kwargs + + Returns: + res (dict): The model prediction results. Format: + { + 'choices': [ + { + 'index': 0, + 'message': { + 'content': 'xxx', + 'role': 'assistant' + } + } + ], + 'created': 1677664795, + 'model': 'gpt-3.5-turbo-0613', # should be model_id + 'object': 'chat.completion', + 'usage': { + 'completion_tokens': 17, + 'prompt_tokens': 57, + 'total_tokens': 74 + } + } + """ + in_prompts = [] + + # Note: here we assume the inputs are all prompts for the benchmark. + for input_prompt in inputs: + if isinstance(input_prompt, str): + in_prompts.append(input_prompt) + elif isinstance(input_prompt, dict): + # TODO: to be supported for continuation list like truthful_qa + in_prompts.append(input_prompt['data'][0]) + elif isinstance(input_prompt, list): + in_prompts.append('\n'.join(input_prompt)) + else: + raise TypeError(f'Unsupported inputs type: {type(input_prompt)}') + + return self.custom_model.predict(prompts=in_prompts, **kwargs) diff --git a/evalscope/models/local_model.py b/evalscope/models/local_model.py new file mode 100644 index 00000000..3702781f --- /dev/null +++ b/evalscope/models/local_model.py @@ -0,0 +1,47 @@ +import torch +from modelscope import AutoModelForCausalLM, AutoTokenizer +from torch import dtype + +from evalscope.constants import DEFAULT_MODEL_CACHE_DIR +from evalscope.utils.logger import get_logger + +logger = get_logger() + + +class LocalModel: + + def __init__(self, + model_id: str, + model_revision: str = 'master', + device_map: str = 'auto', + torch_dtype: dtype = torch.bfloat16, + cache_dir: str = None, + **kwargs): + model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR + + self.model_id = model_id + self.model_revision = model_revision + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + logger.info(f'Device: {self.device}') + + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, + revision=model_revision, + trust_remote_code=True, + cache_dir=model_cache_dir, + ) + + self.model = AutoModelForCausalLM.from_pretrained( + self.model_id, + revision=model_revision, + device_map=device_map, + trust_remote_code=True, + torch_dtype=torch_dtype, + cache_dir=model_cache_dir, + ) + + self.model_cfg = { + 'model_id': model_id, + 'device_map': device_map, + 'torch_dtype': str(torch_dtype), + } diff --git a/evalscope/models/model_adapter.py b/evalscope/models/model_adapter.py deleted file mode 100644 index a3d56500..00000000 --- a/evalscope/models/model_adapter.py +++ /dev/null @@ -1,467 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -# Copyright (c) EleutherAI, Inc. and its affiliates. 
-# flake8: noqa -import numpy as np -import os -import time -import torch -from abc import ABC, abstractmethod -from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig -from torch import dtype -from typing import Any, Dict, List, Union - -from evalscope.constants import DEFAULT_MODEL_CACHE_DIR -from evalscope.models.custom import CustomModel -from evalscope.utils.chat_service import ChatMessage -from evalscope.utils.logger import get_logger -from evalscope.utils.model_utils import fix_do_sample_warning - -logger = get_logger() - - -class LocalModel: - - def __init__(self, - model_id: str, - model_revision: str = 'master', - device_map: str = 'auto', - torch_dtype: dtype = torch.bfloat16, - cache_dir: str = None, - **kwargs): - """ - Args: - model_id: The model id on ModelScope, or local model_dir. - model_revision: The model revision on ModelScope. - device_map: The device map for model inference. - torch_dtype: The torch dtype for model inference. - cache_dir: Directory to cache the models. - """ - model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR - - self.model_id = model_id - self.model_revision = model_revision - self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - logger.info(f'Device: {self.device}') - - self.tokenizer = AutoTokenizer.from_pretrained( - self.model_id, - revision=model_revision, - trust_remote_code=True, - cache_dir=model_cache_dir, - ) - - self.model = AutoModelForCausalLM.from_pretrained( - self.model_id, - revision=model_revision, - device_map=device_map, - trust_remote_code=True, - torch_dtype=torch_dtype, - cache_dir=model_cache_dir, - ) - - self.model_cfg = { - 'model_id': model_id, - 'device_map': device_map, - 'torch_dtype': str(torch_dtype), - } - - -class BaseModelAdapter(ABC): - """ - Base class for model adapter. - """ - - def __init__(self, model: Union[LocalModel, CustomModel], **kwargs): - """ - Args: - model: The model instance which is compatible with - AutoModel/AutoModelForCausalLM/AutoModelForSeq2SeqLM of transformers. - """ - if isinstance(model, LocalModel): - self.model = model.model - self.model_id = model.model_id - self.model_revision = model.model_revision - self.device = model.device - self.tokenizer = model.tokenizer - self.model_cfg = model.model_cfg - elif isinstance(model, CustomModel): - pass - else: - raise ValueError(f'Unsupported model type: {type(model)}') - - @abstractmethod - @torch.no_grad() - def predict(self, *args, **kwargs) -> Any: - """ - Model prediction func. - """ - raise NotImplementedError - - -class MultiChoiceModelAdapter(BaseModelAdapter): - """ The multi-choice model adapter. """ - - _DEFAULT_MAX_LENGTH = 2048 - - def __init__(self, model: LocalModel, **kwargs): - super().__init__(model) - - self._max_length = kwargs.get('max_length') - - @property - def max_length(self): - if self._max_length: - return self._max_length - seqlen_config_attrs = ('n_positions', 'max_position_embeddings', 'n_ctx') - for attr in seqlen_config_attrs: - if hasattr(self.model.config, attr): - return getattr(self.model.config, attr) - if hasattr(self.tokenizer, 'model_max_length'): - if self.tokenizer.model_max_length == 1000000000000000019884624838656: - return self._DEFAULT_MAX_LENGTH - return self.tokenizer.model_max_length - return self._DEFAULT_MAX_LENGTH - - @torch.no_grad() - def predict(self, inputs: dict, infer_cfg: dict = None) -> dict: - """ - Multi-choice model prediction func. - - Args: - inputs (dict): The inputs for a doc. 
Format: - {'data': [full_prompt], 'multi_choices': ['A', 'B', 'C', 'D']} - - infer_cfg (dict): inference configuration. - - Returns: - res (dict): The model prediction results. Format: - { - 'choices': [ - { - 'index': 0, - 'message': { - 'content': [-14.9609, -13.6015, ...], # loglikelihood values for inputs context-continuation pairs. - 'role': 'assistant' - } - } - ], - 'created': 1677664795, - # For models on the ModelScope or HuggingFace, concat model_id and revision with "-". - 'model': 'gpt-3.5-turbo-0613', - 'object': 'chat.completion', - 'usage': { - 'completion_tokens': 17, - 'prompt_tokens': 57, - 'total_tokens': 74 - } - } - """ - infer_cfg = infer_cfg or {} - self.model.generation_config.update(**infer_cfg) - - input_data = inputs['data'] - multi_choices = inputs['multi_choices'] - - output, input_info = self._get_logits(self.tokenizer, self.model, input_data) - assert output.shape[0] == 1 - logits = output.flatten() - - choice_logits = [logits[self.tokenizer(ch)['input_ids'][-1:]] for ch in multi_choices] - softval = torch.nn.functional.softmax(torch.tensor(choice_logits).float(), dim=0) - - if softval.dtype in {torch.bfloat16, torch.float16}: - softval = softval.to(dtype=torch.float32) - probs = softval.detach().cpu().numpy() - pred: str = multi_choices[int(np.argmax(probs))] # Format: A or B or C or D - - res_d = { - 'choices': [{ - 'index': 0, - 'message': { - 'content': pred, - 'role': 'assistant' - } - }], - 'created': time.time(), - 'model': self.model_id, - 'object': 'chat.completion', - 'usage': {} - } - - return res_d - - @staticmethod - def _get_logits(tokenizer, model, inputs: List[str]): - input_ids = tokenizer(inputs, padding=False)['input_ids'] - input_ids = torch.tensor(input_ids, device=model.device) - tokens = {'input_ids': input_ids} - - outputs = model(input_ids)['logits'] - logits = outputs[:, -1, :] - log_probs = torch.nn.functional.softmax(logits, dim=-1) - return log_probs, {'tokens': tokens} - - -class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter): - """ - Continuation-logits model adapter. - """ - - def __init__(self, model: LocalModel, **kwargs): - super().__init__(model, **kwargs) - - @torch.no_grad() - def predict(self, inputs: dict, infer_cfg: dict = None) -> dict: - """ - Multi-choice model prediction func. - Args: - inputs (dict): The inputs for a doc. Format: - {'data': [(context, continuation), ...]} - infer_cfg (dict): inference configuration. - Returns: - res (dict): The model prediction results. Format: - { - 'choices': [ - { - 'index': 0, - 'message': { - 'content': [-14.9609, -13.6015, ...], # loglikelihood values for inputs context-continuation pairs. - 'role': 'assistant' - } - } - ], - 'created': 1677664795, - # For models on the ModelScope or HuggingFace, concat model_id and revision with "-". 
-                'model': 'gpt-3.5-turbo-0613',
-                'object': 'chat.completion',
-                'usage': {
-                    'completion_tokens': 17,
-                    'prompt_tokens': 57,
-                    'total_tokens': 74
-                }
-            }
-        """
-        infer_cfg = infer_cfg or {}
-
-        pred_list: list = self.loglikelihood(inputs=inputs['data'], infer_cfg=infer_cfg)
-
-        res_d = {
-            'choices': [{
-                'index': 0,
-                'message': {
-                    'content': pred_list,
-                    'role': 'assistant'
-                }
-            }],
-            'created': time.time(),
-            'model': self.model_id,
-            'object': 'chat.completion',
-            'usage': {}
-        }
-        return res_d
-
-    def loglikelihood(self, inputs: list, infer_cfg: dict = None) -> list:
-        self.model.generation_config.update(**infer_cfg)
-        # To predict one doc
-        doc_ele_pred = []
-        for ctx, continuation in inputs:
-
-            # ctx_enc shape: [context_tok_len] cont_enc shape: [continuation_tok_len]
-            ctx_enc, cont_enc = self._encode_pair(ctx, continuation)
-
-            inputs_tokens = torch.tensor(
-                (ctx_enc.tolist() + cont_enc.tolist())[-(self.max_length + 1):][:-1],
-                dtype=torch.long,
-                device=self.model.device).unsqueeze(0)
-
-            logits = self.model(inputs_tokens)[0]
-            logits = torch.nn.functional.log_softmax(logits.float(), dim=-1)
-
-            logits = logits[:, -len(cont_enc):, :]
-            cont_enc = cont_enc.unsqueeze(0).unsqueeze(-1)
-            logits = torch.gather(logits.cpu(), 2, cont_enc.cpu()).squeeze(-1)
-
-            choice_score = float(logits.sum())
-            doc_ele_pred.append(choice_score)
-
-        # e.g. [-2.3, -9.2, -12.9, 1.1], length=len(choices)
-        return doc_ele_pred
-
-    def _encode_pair(self, context, continuation):
-        n_spaces = len(context) - len(context.rstrip())
-        if n_spaces > 0:
-            continuation = context[-n_spaces:] + continuation
-            context = context[:-n_spaces]
-
-        whole_enc = self.tokenizer(context + continuation, padding=False)['input_ids']
-        whole_enc = torch.tensor(whole_enc, device=self.device)
-
-        context_enc = self.tokenizer(context, padding=False)['input_ids']
-        context_enc = torch.tensor(context_enc, device=self.device)
-
-        context_enc_len = len(context_enc)
-        continuation_enc = whole_enc[context_enc_len:]
-
-        return context_enc, continuation_enc
-
-
-class ChatGenerationModelAdapter(BaseModelAdapter):
-    """
-    Chat generation model adapter.
-    """
-
-    def __init__(self, model: LocalModel, **kwargs):
-        super().__init__(model)
-
-        self.generation_config = self._parse_generation_config(self.tokenizer, self.model)
-
-        custom_generation_config = kwargs.pop('generation_config', None)
-        custom_chat_template = kwargs.pop('chat_template', None)
-
-        if custom_generation_config:
-            logger.info('Updating generation config ...')
-            self.generation_config.update(**custom_generation_config)
-
-        if custom_chat_template:
-            self.tokenizer.chat_template = custom_chat_template
-            logger.info(f'Using custom chat template: {custom_chat_template}')
-
-    def _parse_generation_config(self, tokenizer, model):
-        generation_config = getattr(model, 'generation_config', GenerationConfig(do_sample=False))
-
-        try:
-            remote_config = GenerationConfig.from_pretrained(
-                self.model_id, revision=self.model_revision, trust_remote_code=True)
-            generation_config.update(**remote_config.to_dict())
-        except:
-            logger.warning(f'Failed to get generation config of {self.model_id} from model hub, use default.')
-
-        if isinstance(self.model_id, str) and os.path.exists(self.model_id):
-            logger.warning(f'Got local model dir: {self.model_id}')
-
-        if tokenizer.eos_token_id is not None:
-            generation_config.eos_token_id = tokenizer.eos_token_id
-        if tokenizer.pad_token_id is not None:
-            generation_config.pad_token_id = tokenizer.pad_token_id
-        if generation_config.max_new_tokens is None:
-            generation_config.max_new_tokens = 2048
-
-        return generation_config
-
-    def _model_generate(self, query: str, infer_cfg: dict) -> str:
-        messages = [ChatMessage(role='user', content=query)]
-        formatted_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
-        input_ids = inputs['input_ids']
-
-        # Process infer_cfg
-        if isinstance(infer_cfg.get('num_return_sequences'), int) and infer_cfg['num_return_sequences'] > 1:
-            infer_cfg['do_sample'] = True
-
-        # stop settings
-        stop = infer_cfg.get('stop', None)
-        eos_token_id = self.tokenizer.encode(stop, add_special_tokens=False)[0] \
-            if stop else self.tokenizer.eos_token_id
-
-        if eos_token_id is not None:
-            infer_cfg['eos_token_id'] = eos_token_id
-            infer_cfg['pad_token_id'] = eos_token_id # setting eos_token_id as pad token
-
-        self.generation_config.update(**infer_cfg)
-        fix_do_sample_warning(self.generation_config)
-
-        # Run inference
-        output_ids = self.model.generate(**inputs, generation_config=self.generation_config)
-
-        response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], skip_special_tokens=True)
-        return response
-
-    @torch.no_grad()
-    def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = {}) -> dict:
-
-        # Process inputs
-        if isinstance(inputs, str):
-            query = inputs
-        elif isinstance(inputs, dict):
-            query = inputs['data'][0]
-        elif isinstance(inputs, list):
-            query = '\n'.join(inputs)
-        else:
-            raise TypeError(f'Unsupported inputs type: {type(inputs)}')
-
-        response = self._model_generate(query, infer_cfg)
-
-        choices_list = [{'index': 0, 'message': {'content': response, 'role': 'assistant'}}]
-
-        res_d = {
-            'choices': choices_list,
-            'created': time.time(),
-            'model': self.model_id,
-            'object': 'chat.completion',
-            'usage': {}
-        }
-
-        return res_d
-
-
-class CustomModelAdapter(BaseModelAdapter):
-
-    def __init__(self, custom_model: CustomModel, **kwargs):
-        """
-        Custom model adapter.
-
-        Args:
-            custom_model: The custom model instance.
-            **kwargs: Other args.
-        """
-        self.custom_model = custom_model
-        super(CustomModelAdapter, self).__init__(model=custom_model)
-
-    def predict(self, inputs: Union[str, dict, list], **kwargs) -> List[Dict[str, Any]]:
-        """
-        Model prediction func.
-
-        Args:
-            inputs (Union[str, dict, list]): The input data. Depending on the specific model.
-                str: 'xxx'
-                dict: {'data': [full_prompt]}
-                list: ['xxx', 'yyy', 'zzz']
-            **kwargs: kwargs
-
-        Returns:
-            res (dict): The model prediction results. Format:
-            {
-                'choices': [
-                    {
-                        'index': 0,
-                        'message': {
-                            'content': 'xxx',
-                            'role': 'assistant'
-                        }
-                    }
-                ],
-                'created': 1677664795,
-                'model': 'gpt-3.5-turbo-0613', # should be model_id
-                'object': 'chat.completion',
-                'usage': {
-                    'completion_tokens': 17,
-                    'prompt_tokens': 57,
-                    'total_tokens': 74
-                }
-            }
-        """
-        in_prompts = []
-
-        # Note: here we assume the inputs are all prompts for the benchmark.
-        for input_prompt in inputs:
-            if isinstance(input_prompt, str):
-                in_prompts.append(input_prompt)
-            elif isinstance(input_prompt, dict):
-                # TODO: to be supported for continuation list like truthful_qa
-                in_prompts.append(input_prompt['data'][0])
-            elif isinstance(input_prompt, list):
-                in_prompts.append('\n'.join(input_prompt))
-            else:
-                raise TypeError(f'Unsupported inputs type: {type(input_prompt)}')
-
-        return self.custom_model.predict(prompts=in_prompts, **kwargs)
diff --git a/evalscope/models/server_adapter.py b/evalscope/models/server_adapter.py
new file mode 100644
index 00000000..4c753939
--- /dev/null
+++ b/evalscope/models/server_adapter.py
@@ -0,0 +1,80 @@
+import requests
+import time
+from typing import Union
+
+from evalscope.models.base_adapter import BaseModelAdapter
+from evalscope.models.custom import CustomModel
+from evalscope.models.local_model import LocalModel
+from evalscope.utils.chat_service import ChatCompletionResponse
+
+
+class ServerModelAdapter(BaseModelAdapter):
+    """
+    Server model adapter to request remote API model and generate results.
+    """
+
+    def __init__(self, model: Union[LocalModel, CustomModel], api_url: str, **kwargs):
+        """
+        Args:
+            model: The model instance.
+            api_url: The URL of the remote API model.
+            **kwargs: Other args.
+        """
+        super().__init__(model, **kwargs)
+        self.api_url = api_url
+
+    def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = None) -> dict:
+        """
+        Model prediction func.
+
+        Args:
+            inputs (Union[str, dict, list]): The input data.
+            infer_cfg (dict): Inference configuration.
+
+        Returns:
+            res (dict): The model prediction results.
+        """
+        infer_cfg = infer_cfg or {}
+
+        # Process inputs
+        if isinstance(inputs, str):
+            query = inputs
+        elif isinstance(inputs, dict):
+            # TODO: to be supported for continuation list like truthful_qa
+            query = inputs['data'][0]
+        elif isinstance(inputs, list):
+            query = '\n'.join(inputs)
+        else:
+            raise TypeError(f'Unsupported inputs type: {type(inputs)}')
+
+        # Format request JSON according to OpenAI API format
+        request_json = {
+            'model': self.model_id,
+            'prompt': query,
+            'max_tokens': infer_cfg.get('max_tokens', 2048),
+            'temperature': infer_cfg.get('temperature', 1.0),
+            'top_p': infer_cfg.get('top_p', 1.0),
+            'n': infer_cfg.get('num_return_sequences', 1),
+            'stop': infer_cfg.get('stop', None)
+        }
+
+        # Request to remote API
+        response = requests.post(self.api_url, json=request_json)
+        response_data = response.json()
+
+        choices_list = [{
+            'index': i,
+            'message': {
+                'content': choice['text'],
+                'role': 'assistant'
+            }
+        } for i, choice in enumerate(response_data['choices'])]
+
+        res_d = ChatCompletionResponse(
+            model=self.model_id,
+            choices=choices_list,
+            object='chat.completion',
+            created=int(time.time()),
+            usage=response_data.get('usage', None)).model_dump(exclude_unset=True)
+
+        return res_d
diff --git a/evalscope/run.py b/evalscope/run.py
index e7473293..069795dc 100644
--- a/evalscope/run.py
+++ b/evalscope/run.py
@@ -120,18 +120,15 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
         data_adapter=data_adapter,
         subset_list=benchmark.subset_list,
         model_adapter=model_adapter,
-        use_cache=task_cfg.use_cache,
         outputs=outputs,
-        datasets_dir=task_cfg.dataset_dir,
-        datasets_hub=task_cfg.dataset_hub,
-        stage=task_cfg.stage,
-        eval_type=task_cfg.eval_type,
-        overall_task_cfg=task_cfg,
+        task_cfg=task_cfg,
     )


 def get_base_model(task_cfg: TaskConfig) -> Optional[LocalModel]:
-    """Get the base model for the task."""
+    """Get the base local model for the task. If the task is not checkpoint-based, return None.
+    Avoids loading model multiple times for different datasets.
+ """ if task_cfg.eval_type != EvalType.CHECKPOINT: return None else: @@ -159,8 +156,11 @@ def initialize_model_adapter(task_cfg: TaskConfig, model_adapter_cls, base_model elif task_cfg.eval_type == EvalType.CUSTOM: if not isinstance(task_cfg.model, CustomModel): raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.') - from evalscope.models.model_adapter import CustomModelAdapter + from evalscope.models import CustomModelAdapter return CustomModelAdapter(custom_model=task_cfg.model) + elif task_cfg.eval_type == EvalType.SERVICE: + from evalscope.models import ServerModelAdapter + return ServerModelAdapter(url=task_cfg.model, model_id=task_cfg.model_id) else: return model_adapter_cls( model=base_model or get_base_model(task_cfg), diff --git a/evalscope/utils/chat_service.py b/evalscope/utils/chat_service.py index 6e4a4a77..6df4fd96 100644 --- a/evalscope/utils/chat_service.py +++ b/evalscope/utils/chat_service.py @@ -7,7 +7,7 @@ from pydantic import BaseModel, Field from threading import Thread from transformers import TextIteratorStreamer -from typing import List, Literal, Optional, Union +from typing import Any, List, Literal, Optional, Union class Usage(BaseModel): @@ -66,7 +66,7 @@ class ChatCompletionResponseStreamChoice(BaseModel): class ChatCompletionResponse(BaseModel): model: str object: Literal['chat.completion', 'chat.completion.chunk'] - choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]] + choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, Any]] created: Optional[int] = Field(default_factory=lambda: int(time.time())) usage: Optional[Usage] From 85b6577d02b7cfbe8f446c8935206bacb895a50b Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Fri, 20 Dec 2024 15:28:20 +0800 Subject: [PATCH 07/15] add server --- evalscope/arguments.py | 13 ++- evalscope/benchmarks/bbh/bbh_adapter.py | 103 +++++------------------- evalscope/config.py | 2 + evalscope/constants.py | 32 ++------ evalscope/models/base_adapter.py | 8 +- evalscope/models/server_adapter.py | 59 +++++++------- evalscope/run.py | 2 +- evalscope/utils/__init__.py | 2 +- evalscope/utils/model_utils.py | 9 +++ tests/cli/test_run.py | 18 +++++ 10 files changed, 99 insertions(+), 149 deletions(-) diff --git a/evalscope/arguments.py b/evalscope/arguments.py index 12ea7703..a8c4d262 100644 --- a/evalscope/arguments.py +++ b/evalscope/arguments.py @@ -1,6 +1,8 @@ import argparse import json +from evalscope.constants import EvalBackend, EvalStage, EvalType + class ParseStrArgsAction(argparse.Action): @@ -47,10 +49,13 @@ def add_argument(parser: argparse.ArgumentParser): parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.') # noqa: E501 # Evaluation-related arguments - parser.add_argument('--eval-type', type=str, help='The type for evaluating.') - parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.') + parser.add_argument('--eval-type', type=str, help='The type for evaluating.', + choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE]) + parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.', + choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL]) # noqa: E501 parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501 - parser.add_argument('--stage', 
type=str, default='all', help='The stage of evaluation pipeline.') + parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.', + choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.EVAL]) parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.') # Cache and working directory arguments @@ -62,6 +67,8 @@ def add_argument(parser: argparse.ArgumentParser): parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.') # noqa: E501 parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.') parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.') + parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.') + parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.') # yapf: enable diff --git a/evalscope/benchmarks/bbh/bbh_adapter.py b/evalscope/benchmarks/bbh/bbh_adapter.py index 356639a6..3b075e86 100644 --- a/evalscope/benchmarks/bbh/bbh_adapter.py +++ b/evalscope/benchmarks/bbh/bbh_adapter.py @@ -5,18 +5,17 @@ import random import re -from evalscope.benchmarks.data_adapter import DataAdapter +from evalscope.benchmarks import Benchmark, DataAdapter from evalscope.constants import AnswerKeys -from evalscope.metrics.metrics import exact_match, weighted_mean -from evalscope.utils import ResponseParser, normalize_score +from evalscope.metrics import WeightedAverageAccuracy, exact_match +from evalscope.models.chat_adapter import ChatGenerationModelAdapter +from evalscope.utils import ResponseParser from evalscope.utils.logger import get_logger # flake8: noqa logger = get_logger() -DATASET_ID = 'modelscope/bbh' - # BBH multiple choice subset list MULTIPLE_CHOICE = 'multiple_choice' MULTIPLE_CHOICE_LIST = [ @@ -59,25 +58,25 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST +@Benchmark.register( + name='bbh', + dataset_id='modelscope/bbh', + model_adapter=ChatGenerationModelAdapter, + subset_list=SUBSET_LIST, + metric_list=[WeightedAverageAccuracy], + few_shot_num=0, + train_split=None, + eval_split='test', + prompt_template='', +) class BBHAdapter(DataAdapter): """ Adapter for BBH free-form and multiple-choices sub-tasks. 
""" - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = None, - eval_split: str = 'test', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] + def __init__(self, **kwargs): + few_shot_num = kwargs.get('few_shot_num', None) if few_shot_num is None: logger.info(f'Set 3-shot examples by system for BBH.') few_shot_num = 3 @@ -87,13 +86,7 @@ def __init__(self, f'Use 3-shot by default.') few_shot_num = 3 - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -217,66 +210,6 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st def match(self, gold: str, pred: str) -> float: return exact_match(gold=gold, pred=pred) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: - """ - Generate the report for the model output. - - Args: - subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...} - report_name: The user-defined report name. - - Returns: A dict of metric calculation results. The format is like: - { - "name":"BBH", - "metric":"WeightedAverageAccuracy", - "score":0.3389, - "category":[ - { - "name":"DEFAULT", - "score":0.3389, - "subset":[ - { - "name":"BBH", - "score":0.3389 - }, - ] - } - ], - "total_num":100 - } - """ - total_num: int = sum([num for _, num in subset_score_map.values()]) - weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num - weighted_avg_acc = normalize_score(score=weighted_avg_acc) - cate_avg_list = [{ - 'name': subset_name, - 'score': normalize_score(score=score) - } for subset_name, (score, _) in subset_score_map.items()] - - category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list) - - res_map = dict( - name=report_name or 'bbh', - metric=self.metric_list[0]['name'], - score=weighted_avg_acc, - category=[category_d], - total_num=total_num) - - return res_map - @classmethod def _extract_mc_answer(cls, ans: str) -> str: """ diff --git a/evalscope/config.py b/evalscope/config.py index a768bc20..2ded2758 100644 --- a/evalscope/config.py +++ b/evalscope/config.py @@ -65,6 +65,8 @@ class TaskConfig: debug: bool = False dry_run: bool = False seed: int = 42 + api_url: Optional[str] = None # Only used for server model + api_key: Optional[str] = 'EMPTY' # Only used for server model def __post_init__(self): if (not self.model_id) and self.model: diff --git a/evalscope/constants.py b/evalscope/constants.py index f6152ac0..d2409579 100644 --- a/evalscope/constants.py +++ b/evalscope/constants.py @@ -140,30 +140,8 @@ class EvalType: class EvalBackend: - - class _Backend: - # compatible with old version, set 'value' - - def __init__(self, value): - self._value = value - - @property - def value(self): - return self._value - - def __str__(self): - return self._value - - def __repr__(self): - 
return f"'{self._value}'" - - def __eq__(self, other): - if isinstance(other, str): - return self._value == other - return NotImplemented - - NATIVE = _Backend('Native') - OPEN_COMPASS = _Backend('OpenCompass') - VLM_EVAL_KIT = _Backend('VLMEvalKit') - RAG_EVAL = _Backend('RAGEval') - THIRD_PARTY = _Backend('ThirdParty') + NATIVE = 'Native' + OPEN_COMPASS = 'OpenCompass' + VLM_EVAL_KIT = 'VLMEvalKit' + RAG_EVAL = 'RAGEval' + THIRD_PARTY = 'ThirdParty' diff --git a/evalscope/models/base_adapter.py b/evalscope/models/base_adapter.py index ea00c7c1..32ec490c 100644 --- a/evalscope/models/base_adapter.py +++ b/evalscope/models/base_adapter.py @@ -1,6 +1,6 @@ import torch from abc import ABC, abstractmethod -from typing import Any, Union +from typing import Any, Optional, Union from evalscope.models.custom import CustomModel from evalscope.models.local_model import LocalModel @@ -8,8 +8,10 @@ class BaseModelAdapter(ABC): - def __init__(self, model: Union[LocalModel, CustomModel], **kwargs): - if isinstance(model, LocalModel): + def __init__(self, model: Optional[Union[LocalModel, CustomModel]], **kwargs): + if model is None: + self.model_cfg = kwargs.get('model_cfg', None) + elif isinstance(model, LocalModel): self.model = model.model self.model_id = model.model_id self.model_revision = model.model_revision diff --git a/evalscope/models/server_adapter.py b/evalscope/models/server_adapter.py index 4c753939..eb8338ab 100644 --- a/evalscope/models/server_adapter.py +++ b/evalscope/models/server_adapter.py @@ -3,9 +3,9 @@ from typing import Union from evalscope.models.base_adapter import BaseModelAdapter -from evalscope.models.custom import CustomModel -from evalscope.models.local_model import LocalModel -from evalscope.utils.chat_service import ChatCompletionResponse +from evalscope.utils.logger import get_logger + +logger = get_logger() class ServerModelAdapter(BaseModelAdapter): @@ -13,15 +13,18 @@ class ServerModelAdapter(BaseModelAdapter): Server model adapter to request remote API model and generate results. """ - def __init__(self, model: Union[LocalModel, CustomModel], api_url: str, **kwargs): + def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs): """ Args: - model: The model instance. api_url: The URL of the remote API model. - **kwargs: Other args. + model_id: The ID of the remote API model. + api_key: The API key of the remote API model. 
""" - super().__init__(model, **kwargs) self.api_url = api_url + self.model_id = model_id + self.api_key = api_key + self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key} + super().__init__(model=None, model_cfg=self.model_cfg, **kwargs) def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = None) -> dict: """ @@ -48,33 +51,31 @@ def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = None) -> dic raise TypeError(f'Unsupported inputs type: {type(inputs)}') # Format request JSON according to OpenAI API format + # do not sample by default request_json = { 'model': self.model_id, - 'prompt': query, + 'messages': [{ + 'role': 'user', + 'content': query + }], 'max_tokens': infer_cfg.get('max_tokens', 2048), - 'temperature': infer_cfg.get('temperature', 1.0), + 'temperature': infer_cfg.get('temperature', 0.0), 'top_p': infer_cfg.get('top_p', 1.0), 'n': infer_cfg.get('num_return_sequences', 1), 'stop': infer_cfg.get('stop', None) } - # Request to remote API - response = requests.post(self.api_url, json=request_json) - response_data = response.json() - - choices_list = [{ - 'index': i, - 'message': { - 'content': choice['text'], - 'role': 'assistant' - } - } for i, choice in enumerate(response_data['choices'])] - - res_d = ChatCompletionResponse( - model=self.model_id, - choices=choices_list, - object='chat.completion', - created=int(time.time()), - usage=response_data.get('usage', None)).model_dump(exclude_unset=True) - - return res_d + # Request to remote API with retry mechanism + max_retries = 3 + for attempt in range(max_retries): + response = requests.post( + self.api_url, json=request_json, headers={'Authorization': f'Bearer {self.api_key}'}) + if response.status_code == 200: + response_data = response.json() + return response_data + logger.warning(f'Failed to request to remote API: {response.status_code} {response.text}') + if attempt < max_retries - 1: + time.sleep(5) # Sleep for 5 seconds before retrying + else: + raise RuntimeError(f'Failed to request to remote API after {max_retries} attempts: ' + f'{response.status_code} {response.text}') diff --git a/evalscope/run.py b/evalscope/run.py index 069795dc..76c70444 100644 --- a/evalscope/run.py +++ b/evalscope/run.py @@ -160,7 +160,7 @@ def initialize_model_adapter(task_cfg: TaskConfig, model_adapter_cls, base_model return CustomModelAdapter(custom_model=task_cfg.model) elif task_cfg.eval_type == EvalType.SERVICE: from evalscope.models import ServerModelAdapter - return ServerModelAdapter(url=task_cfg.model, model_id=task_cfg.model_id) + return ServerModelAdapter(api_url=task_cfg.api_url, model_id=task_cfg.model, api_key=task_cfg.api_key) else: return model_adapter_cls( model=base_model or get_base_model(task_cfg), diff --git a/evalscope/utils/__init__.py b/evalscope/utils/__init__.py index b3cf1c35..56e6a260 100644 --- a/evalscope/utils/__init__.py +++ b/evalscope/utils/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from evalscope.constants import * +from evalscope.utils.model_utils import EvalBackend from evalscope.utils.utils import * diff --git a/evalscope/utils/model_utils.py b/evalscope/utils/model_utils.py index 0bdbec87..3fc895d4 100644 --- a/evalscope/utils/model_utils.py +++ b/evalscope/utils/model_utils.py @@ -1,6 +1,15 @@ +from enum import Enum from transformers import GenerationConfig +class EvalBackend(Enum): + NATIVE = 'Native' + OPEN_COMPASS = 'OpenCompass' + VLM_EVAL_KIT = 'VLMEvalKit' + RAG_EVAL = 'RAGEval' + THIRD_PARTY = 'ThirdParty' + + def fix_do_sample_warning(generation_config: GenerationConfig) -> None: # Use the default values of temperature/top_p/top_k in generation_config. if generation_config.temperature == 0: diff --git a/tests/cli/test_run.py b/tests/cli/test_run.py index 9370e2bb..929405d4 100644 --- a/tests/cli/test_run.py +++ b/tests/cli/test_run.py @@ -4,6 +4,7 @@ import torch import unittest +from evalscope.constants import EvalType from evalscope.run import run_task from evalscope.utils import is_module_installed, test_level_list from evalscope.utils.logger import get_logger @@ -110,5 +111,22 @@ def test_run_humaneval(self): run_task(task_cfg=task_cfg) + @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level') + def test_run_server_model(self): + from evalscope.config import TaskConfig + + task_cfg = TaskConfig( + model='qwen2.5', + api_url='http://127.0.0.1:8801/v1/chat/completions', + api_key='EMPTY', + eval_type=EvalType.SERVICE, + datasets=['gsm8k', 'bbh'], + limit=2, + debug=True + ) + + run_task(task_cfg=task_cfg) + + if __name__ == '__main__': unittest.main() From 78ce4429f46afbfaddff29b6cba2a3089bac1376 Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Fri, 20 Dec 2024 17:20:09 +0800 Subject: [PATCH 08/15] update seed and ceval --- evalscope/benchmarks/bbh/bbh_adapter.py | 9 +-- evalscope/benchmarks/benchmark.py | 16 +++-- evalscope/benchmarks/ceval/ceval_adapter.py | 68 ++++++------------- evalscope/benchmarks/data_adapter.py | 5 -- evalscope/benchmarks/gsm8k/gsm8k_adapter.py | 2 +- .../benchmarks/hellaswag/hellaswag_adapter.py | 4 +- evalscope/config.py | 2 +- evalscope/evaluator/evaluator.py | 22 +++--- evalscope/models/server_adapter.py | 13 +++- evalscope/run.py | 40 +++++++---- evalscope/utils/io_utils.py | 8 +++ tests/cli/test_run.py | 10 ++- 12 files changed, 107 insertions(+), 92 deletions(-) diff --git a/evalscope/benchmarks/bbh/bbh_adapter.py b/evalscope/benchmarks/bbh/bbh_adapter.py index 3b075e86..5f049a35 100644 --- a/evalscope/benchmarks/bbh/bbh_adapter.py +++ b/evalscope/benchmarks/bbh/bbh_adapter.py @@ -64,7 +64,7 @@ model_adapter=ChatGenerationModelAdapter, subset_list=SUBSET_LIST, metric_list=[WeightedAverageAccuracy], - few_shot_num=0, + few_shot_num=3, train_split=None, eval_split='test', prompt_template='', @@ -76,15 +76,12 @@ class BBHAdapter(DataAdapter): def __init__(self, **kwargs): - few_shot_num = kwargs.get('few_shot_num', None) - if few_shot_num is None: - logger.info(f'Set 3-shot examples by system for BBH.') - few_shot_num = 3 + few_shot_num = kwargs.get('few_shot_num', 3) if few_shot_num != 3 and few_shot_num != 0: logger.error(f'BBH uses 3-shot examples with CoT or 0-shot by system, but got {few_shot_num}. 
' f'Use 3-shot by default.') - few_shot_num = 3 + kwargs['few_shot_num'] = 3 super().__init__(**kwargs) diff --git a/evalscope/benchmarks/benchmark.py b/evalscope/benchmarks/benchmark.py index 30113928..aa2820ef 100644 --- a/evalscope/benchmarks/benchmark.py +++ b/evalscope/benchmarks/benchmark.py @@ -31,16 +31,24 @@ def update(self, args: dict): self.__dict__.update(args) def to_dict(self) -> dict: + return self.__dict__ + + def to_string_dict(self) -> dict: cur_dict = copy.deepcopy(self.__dict__) + # cur_dict['data_adapter'] = self.data_adapter.__name__ + # cur_dict['model_adapter'] = self.model_adapter.__name__ + # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list] del cur_dict['data_adapter'] del cur_dict['model_adapter'] + del cur_dict['metric_list'] return cur_dict - def get_data_adapter(self, config: dict = None) -> 'DataAdapter': + def get_data_adapter(self, config: dict = {}) -> 'DataAdapter': if config: - dataset_config = config.get(self.name, {}) - self.update(dataset_config) - return self.data_adapter(**self.to_dict()) + self.update(config.get(self.name, {})) + + data_adapter = self.data_adapter(**self.to_dict()) + return data_adapter class Benchmark: diff --git a/evalscope/benchmarks/ceval/ceval_adapter.py b/evalscope/benchmarks/ceval/ceval_adapter.py index 543b6204..ee1b64ac 100644 --- a/evalscope/benchmarks/ceval/ceval_adapter.py +++ b/evalscope/benchmarks/ceval/ceval_adapter.py @@ -2,8 +2,11 @@ import csv import os -from evalscope.benchmarks.data_adapter import DataAdapter +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.constants import EvalType +from evalscope.metrics import WeightedAverageAccuracy from evalscope.metrics.metrics import exact_match, weighted_mean +from evalscope.models import MultiChoiceModelAdapter from evalscope.utils import ResponseParser, normalize_score from evalscope.utils.logger import get_logger @@ -11,8 +14,6 @@ logger = get_logger() -DATASET_ID = 'modelscope/ceval-exam' - SUBSET_LIST = [ 'computer_network', 'operating_system', @@ -124,40 +125,28 @@ } +@Benchmark.register( + name='ceval', + dataset_id='modelscope/ceval-exam', + model_adapter=MultiChoiceModelAdapter, + subset_list=SUBSET_LIST, + metric_list=[WeightedAverageAccuracy], + few_shot_num=0, + train_split='dev', + eval_split='val', +) class CEVALAdapter(DataAdapter): choices = ['A', 'B', 'C', 'D'] - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = 'dev', - eval_split: str = 'val', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] - - if few_shot_num is None: - # Use 5-shot by default - logger.info(f'Set 0-shot examples by default for C-Eval.') - few_shot_num = 0 + def __init__(self, **kwargs): + few_shot_num = kwargs.get('few_shot_num', 0) if few_shot_num > 5: logger.warning(f'few_shot_num <= 5 for C-Eval, but got {few_shot_num}. 
Use 5-shot by default.') - few_shot_num = 5 + kwargs['few_shot_num'] = 5 - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -223,7 +212,7 @@ def get_gold_answer(self, input_d: dict) -> str: # Get the gold choice return input_d.get('answer', '') - def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str: + def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str: """ Parse the model output to get the answer. Could be the best choice index. @@ -235,11 +224,11 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st Returns: The parsed answer. Depending on the dataset. Usually a string for chat. """ - if eval_type == 'checkpoint': + if eval_type == EvalType.CHECKPOINT: return result - elif eval_type == 'service': + elif eval_type == EvalType.SERVICE: return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked ! - elif eval_type == 'custom': + elif eval_type == EvalType.CUSTOM: return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked ! else: raise ValueError(f'Invalid eval_type: {eval_type}') @@ -247,19 +236,6 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st def match(self, gold: str, pred: str) -> float: return exact_match(gold=gold, pred=pred) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: """ Generate report for the evaluation. diff --git a/evalscope/benchmarks/data_adapter.py b/evalscope/benchmarks/data_adapter.py index 468b47a4..34e49acc 100644 --- a/evalscope/benchmarks/data_adapter.py +++ b/evalscope/benchmarks/data_adapter.py @@ -139,11 +139,6 @@ def gen_prompts(self, data_dict: dict) -> dict: prompt_d = self.gen_prompt(input_d=sample_d, subset_name=sub_name, few_shot_list=few_shot_data) prompt_d[AnswerKeys.RAW_INPUT] = sample_d res_dict[sub_name].append(prompt_d) - # Note: for multiprocess - # rnd = random.Random() - # rnd.seed(42) - # for k, v in res_dict.items(): - # rnd.shuffle(v) return res_dict diff --git a/evalscope/benchmarks/gsm8k/gsm8k_adapter.py b/evalscope/benchmarks/gsm8k/gsm8k_adapter.py index 450df31d..23541f07 100644 --- a/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +++ b/evalscope/benchmarks/gsm8k/gsm8k_adapter.py @@ -43,7 +43,7 @@ def __init__(self, **kwargs): if few_shot_num != 4 and few_shot_num != 0: logger.error(f'GSM8K uses 4-shot examples with CoT or 0-shot by system, but got {few_shot_num}. 
' f'Use 4-shot by default.') - few_shot_num = 4 + kwargs['few_shot_num'] = 4 super().__init__(**kwargs) diff --git a/evalscope/benchmarks/hellaswag/hellaswag_adapter.py b/evalscope/benchmarks/hellaswag/hellaswag_adapter.py index faafc96b..5e580237 100644 --- a/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +++ b/evalscope/benchmarks/hellaswag/hellaswag_adapter.py @@ -32,10 +32,10 @@ class HellaSwagAdapter(DataAdapter): def __init__(self, **kwargs): - few_shot_num = kwargs.get('few_shot_num', None) + few_shot_num = kwargs.get('few_shot_num', 0) if few_shot_num != 0: logger.warning(f'few_shot_num should be 0 for HellaSwag, but got {few_shot_num}. Use 0-shot by default.') - few_shot_num = 0 + kwargs['few_shot_num'] = 0 super().__init__(**kwargs) diff --git a/evalscope/config.py b/evalscope/config.py index 2ded2758..0566bc13 100644 --- a/evalscope/config.py +++ b/evalscope/config.py @@ -64,7 +64,7 @@ class TaskConfig: # Debug and runtime mode arguments debug: bool = False dry_run: bool = False - seed: int = 42 + seed: Optional[int] = 42 api_url: Optional[str] = None # Only used for server model api_key: Optional[str] = 'EMPTY' # Only used for server model diff --git a/evalscope/evaluator/evaluator.py b/evalscope/evaluator/evaluator.py index 88c18946..2f406a29 100644 --- a/evalscope/evaluator/evaluator.py +++ b/evalscope/evaluator/evaluator.py @@ -60,26 +60,29 @@ def __init__(self, self.data_adapter = data_adapter self.model_adapter = model_adapter self.eval_type = task_cfg.eval_type + self.subset_list = subset_list + self.dataset_hub = task_cfg.dataset_hub self.stage = task_cfg.stage self.use_cache = task_cfg.use_cache self.task_cfg = task_cfg self.model_cfg = model_adapter.model_cfg - # Deal with the output paths self.outputs_structure = outputs - # Load dataset - self.dataset = self.data_adapter.load( + self.kwargs = kwargs + + def load_dataset(self): + dataset = self.data_adapter.load( dataset_name_or_path=self.dataset_name_or_path, - subset_list=subset_list, + subset_list=self.subset_list, work_dir=self.datasets_dir, - datasets_hub=task_cfg.dataset_hub, - **kwargs) + datasets_hub=self.dataset_hub, + **self.kwargs) # Get prompts from dataset # TODO: support sampler - self.prompts = self.data_adapter.gen_prompts(data_dict=self.dataset) - del self.dataset + prompts = self.data_adapter.gen_prompts(data_dict=dataset) + return prompts def _pred_answer(self, input_d: dict, infer_cfg: dict, subset_name: str, answer_id: str = None) -> dict: @@ -371,7 +374,8 @@ def eval(self, infer_cfg: dict = None, debug: bool = False, **kwargs) -> dict: stage_answers_dict = {} stage_reviews_dict = {} - for subset_name, prompts_list in self.prompts.items(): + prompts = self.load_dataset() + for subset_name, prompts_list in prompts.items(): limit = kwargs.get('limit', len(prompts_list)) prompts_list = prompts_list[:limit] diff --git a/evalscope/models/server_adapter.py b/evalscope/models/server_adapter.py index eb8338ab..8f93caa8 100644 --- a/evalscope/models/server_adapter.py +++ b/evalscope/models/server_adapter.py @@ -23,6 +23,7 @@ def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs self.api_url = api_url self.model_id = model_id self.api_key = api_key + self.seed = kwargs.get('seed', None) self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key} super().__init__(model=None, model_cfg=self.model_cfg, **kwargs) @@ -50,6 +51,11 @@ def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = None) -> dic else: raise TypeError(f'Unsupported inputs type: 
{type(inputs)}') + request_json = self.make_request(query, infer_cfg) + return self.send_request(request_json) + + def make_request(self, query: str, infer_cfg: dict) -> dict: + """Make request to remote API.""" # Format request JSON according to OpenAI API format # do not sample by default request_json = { @@ -64,9 +70,12 @@ def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = None) -> dic 'n': infer_cfg.get('num_return_sequences', 1), 'stop': infer_cfg.get('stop', None) } + if self.seed is not None: + request_json['seed'] = self.seed + logger.debug(f'Request to remote API: {request_json}') + return request_json - # Request to remote API with retry mechanism - max_retries = 3 + def send_request(self, request_json: dict, max_retries: int = 3) -> dict: for attempt in range(max_retries): response = requests.post( self.api_url, json=request_json, headers={'Authorization': f'Bearer {self.api_key}'}) diff --git a/evalscope/run.py b/evalscope/run.py index 76c70444..8fd41575 100644 --- a/evalscope/run.py +++ b/evalscope/run.py @@ -13,7 +13,7 @@ from evalscope.config import TaskConfig, parse_task_config from evalscope.constants import DEFAULT_MODEL_REVISION, DEFAULT_WORK_DIR, EvalBackend, EvalType from evalscope.evaluator import Evaluator -from evalscope.models import CustomModel, LocalModel +from evalscope.models import BaseModelAdapter, CustomModel, LocalModel from evalscope.utils import seed_everything from evalscope.utils.io_utils import OutputsStructure, are_paths_same from evalscope.utils.logger import configure_logging, get_logger @@ -35,15 +35,13 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace] def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict: """Run a single evaluation task.""" - seed_everything(task_cfg.seed) + if task_cfg.seed is not None: + seed_everything(task_cfg.seed) outputs = setup_work_directory(task_cfg, run_time) configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log')) - task_cfg.dump_yaml(outputs.configs_dir) - logger.info(task_cfg) - if task_cfg.eval_backend != EvalBackend.NATIVE: - return run_non_native_backend(task_cfg) + return run_non_native_backend(task_cfg, outputs) else: return evaluate_model(task_cfg, outputs) @@ -65,7 +63,7 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str): return outputs -def run_non_native_backend(task_cfg: TaskConfig) -> dict: +def run_non_native_backend(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict: """Run evaluation using a non-native backend.""" eval_backend = task_cfg.eval_backend eval_config = task_cfg.eval_config @@ -75,6 +73,10 @@ def run_non_native_backend(task_cfg: TaskConfig) -> dict: backend_manager_class = get_backend_manager_class(eval_backend) backend_manager = backend_manager_class(config=eval_config) + + task_cfg.dump_yaml(outputs.configs_dir) + logger.info(task_cfg) + backend_manager.run() return dict() @@ -99,9 +101,17 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict: """Evaluate the model based on the provided task configuration.""" # Initialize evaluator eval_results = {} - base_model = get_base_model(task_cfg) + base_model = get_local_model(task_cfg) + evaluators = [] for dataset_name in task_cfg.datasets: evaluator = create_evaluator(task_cfg, dataset_name, outputs, base_model) + evaluators.append(evaluator) + + # dump task_cfg to outputs.configs_dir after creating evaluators + task_cfg.dump_yaml(outputs.configs_dir) + logger.info(task_cfg) + + for evaluator in evaluators: res_dict 
= evaluator.eval(infer_cfg=task_cfg.generation_config, debug=task_cfg.debug, limit=task_cfg.limit) eval_results[dataset_name] = res_dict @@ -115,6 +125,9 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args) model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model) + # update task_cfg.dataset_args + task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict() + return Evaluator( dataset_name_or_path=benchmark.dataset_id, data_adapter=data_adapter, @@ -125,7 +138,7 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt ) -def get_base_model(task_cfg: TaskConfig) -> Optional[LocalModel]: +def get_local_model(task_cfg: TaskConfig) -> Optional[LocalModel]: """Get the base local model for the task. If the task is not checkpoint-based, return None. Avoids loading model multiple times for different datasets. """ @@ -148,7 +161,7 @@ def get_base_model(task_cfg: TaskConfig) -> Optional[LocalModel]: return base_model -def initialize_model_adapter(task_cfg: TaskConfig, model_adapter_cls, base_model: LocalModel): +def initialize_model_adapter(task_cfg: TaskConfig, model_adapter_cls: BaseModelAdapter, base_model: LocalModel): """Initialize the model adapter based on the task configuration.""" if task_cfg.dry_run: from evalscope.models.model import DummyChatModel @@ -160,12 +173,11 @@ def initialize_model_adapter(task_cfg: TaskConfig, model_adapter_cls, base_model return CustomModelAdapter(custom_model=task_cfg.model) elif task_cfg.eval_type == EvalType.SERVICE: from evalscope.models import ServerModelAdapter - return ServerModelAdapter(api_url=task_cfg.api_url, model_id=task_cfg.model, api_key=task_cfg.api_key) + return ServerModelAdapter( + api_url=task_cfg.api_url, model_id=task_cfg.model, api_key=task_cfg.api_key, seed=task_cfg.seed) else: return model_adapter_cls( - model=base_model or get_base_model(task_cfg), - generation_config=task_cfg.generation_config, - chat_template=task_cfg.chat_template) + model=base_model, generation_config=task_cfg.generation_config, chat_template=task_cfg.chat_template) def main(): diff --git a/evalscope/utils/io_utils.py b/evalscope/utils/io_utils.py index daecd2b4..c4fc2e7a 100644 --- a/evalscope/utils/io_utils.py +++ b/evalscope/utils/io_utils.py @@ -160,3 +160,11 @@ def are_paths_same(path1, path2): real_path2 = os.path.realpath(os.path.abspath(os.path.expanduser(path2))) return real_path1 == real_path2 + + +def dict_to_json(d: dict, json_file: str): + """ + Dump dict to json file. 
+ """ + with open(json_file, 'w') as f: + json.dump(d, f, indent=4, ensure_ascii=False) diff --git a/tests/cli/test_run.py b/tests/cli/test_run.py index 929405d4..16109997 100644 --- a/tests/cli/test_run.py +++ b/tests/cli/test_run.py @@ -71,7 +71,7 @@ def test_run_eval_with_args(self): @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level') def test_run_task(self): - task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['hellaswag', 'gsm8k', 'arc'], 'limit': 2, 'debug': True} + task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['bbh', 'hellaswag', 'gsm8k', 'arc'], 'limit': 2, 'debug': True} run_task(task_cfg=task_cfg) @@ -120,7 +120,13 @@ def test_run_server_model(self): api_url='http://127.0.0.1:8801/v1/chat/completions', api_key='EMPTY', eval_type=EvalType.SERVICE, - datasets=['gsm8k', 'bbh'], + datasets=[ + 'gsm8k', + 'arc', + 'ceval', + 'bbh', + # 'hellaswag', + ], limit=2, debug=True ) From 4b07449d43cfb04dd74aa59b0d0c58a5c9537bca Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Fri, 20 Dec 2024 18:09:01 +0800 Subject: [PATCH 09/15] init collection --- evalscope/collections/__init__.py | 0 evalscope/collections/collection_schema.py | 17 +++++++ evalscope/collections/data_generator.py | 26 ++++++++++ evalscope/collections/data_sampler.py | 47 ++++++++++++++++++ evalscope/collections/evaluators.py | 15 ++++++ evalscope/collections/run.py | 58 ++++++++++++++++++++++ examples/tasks/eval_vlm_swift.yaml | 14 +++--- 7 files changed, 171 insertions(+), 6 deletions(-) create mode 100644 evalscope/collections/__init__.py create mode 100644 evalscope/collections/collection_schema.py create mode 100644 evalscope/collections/data_generator.py create mode 100644 evalscope/collections/data_sampler.py create mode 100644 evalscope/collections/evaluators.py create mode 100644 evalscope/collections/run.py diff --git a/evalscope/collections/__init__.py b/evalscope/collections/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/evalscope/collections/collection_schema.py b/evalscope/collections/collection_schema.py new file mode 100644 index 00000000..53089cde --- /dev/null +++ b/evalscope/collections/collection_schema.py @@ -0,0 +1,17 @@ +class CollectionSchema: + + def __init__(self): + self.datasets = [] + + def register_dataset(self, name, evaluator, weight=1, task_type='', tags=''): + dataset_info = {'name': name, 'evaluator': evaluator, 'weight': weight, 'task_type': task_type, 'tags': tags} + self.datasets.append(dataset_info) + + def get_evaluator(self, name): + for dataset in self.datasets: + if dataset['name'] == name: + return dataset['evaluator'] + return None + + def get_datasets(self): + return self.datasets diff --git a/evalscope/collections/data_generator.py b/evalscope/collections/data_generator.py new file mode 100644 index 00000000..be2c5369 --- /dev/null +++ b/evalscope/collections/data_generator.py @@ -0,0 +1,26 @@ +import json + + +def generate_mixed_dataset(evaluator_collection, samples): + mixed_data = [] + for sample in samples: + dataset_name = sample['source'] + evaluation_result = evaluator_collection.evaluate(dataset_name, sample) + mixed_data.append({ + 'id': sample['id'], + 'row_data': { + 'prompt': sample['row_data']['prompt'], + 'answer': sample['row_data']['answer'] + }, + 'tags': sample['tags'], + 'task': sample['task_type'], + 'source': dataset_name, + 'evaluation': evaluation_result + }) + return mixed_data + + +def save_to_jsonl(data, file_path): + with open(file_path, 'w') as f: + for entry in data: + 
f.write(json.dumps(entry, ensure_ascii=False) + '\n') diff --git a/evalscope/collections/data_sampler.py b/evalscope/collections/data_sampler.py new file mode 100644 index 00000000..aa904ab5 --- /dev/null +++ b/evalscope/collections/data_sampler.py @@ -0,0 +1,47 @@ +import random + + +class DatasetSampler: + + def __init__(self, collection_schema): + self.collection_schema = collection_schema + self.datasets = collection_schema['datasets'] + self.total_weight = sum(dataset['weight'] for dataset in self.datasets) + + def sample_dataset(self): + rand_value = random.uniform(0, self.total_weight) + cumulative_weight = 0 + for dataset in self.datasets: + cumulative_weight += dataset['weight'] + if rand_value <= cumulative_weight: + return dataset['name'] + return None + + +# 示例使用 +collection_schema = { + 'collection_name': + 'math', + 'datasets': [ + { + 'name': 'gsm8k', + 'weight': 1, + 'task_type': 'math', + 'tags': 'en,math' + }, + { + 'name': 'competition_math', + 'weight': 2, + 'task_type': 'math', + 'tags': 'en,math' + }, + # 可以继续添加其他数据集 + ] +} + +sampler = DatasetSampler(collection_schema) + +# 采样数据集 +for _ in range(10): + sampled_dataset = sampler.sample_dataset() + print(f"Sampled dataset: {sampled_dataset}") diff --git a/evalscope/collections/evaluators.py b/evalscope/collections/evaluators.py new file mode 100644 index 00000000..70c0ed0d --- /dev/null +++ b/evalscope/collections/evaluators.py @@ -0,0 +1,15 @@ +class EvaluatorCollection: + + def __init__(self, schema): + self.schema = schema + self.evaluators = {} + + def add_evaluator(self, dataset_name): + evaluator = self.schema.get_evaluator(dataset_name) + if evaluator: + self.evaluators[dataset_name] = evaluator + + def evaluate(self, dataset_name, sample): + evaluator = self.evaluators.get(dataset_name) + if evaluator: + return evaluator.evaluate(sample) diff --git a/evalscope/collections/run.py b/evalscope/collections/run.py new file mode 100644 index 00000000..94b32c89 --- /dev/null +++ b/evalscope/collections/run.py @@ -0,0 +1,58 @@ +# 导入必要的模块 +from evalscope.collections.collection_schema import CollectionSchema +from evalscope.collections.data_generator import generate_mixed_dataset, save_to_jsonl +from evalscope.collections.data_sampler import DatasetSampler +from evalscope.collections.evaluators import EvaluatorCollection + + +# 假设 Gsm8kEvaluator 和 CompetitionMathEvaluator 已定义 +class Gsm8kEvaluator: + + def evaluate(self, sample): + # 实现评估逻辑 + return {'score': 1.0} + + +class CompetitionMathEvaluator: + + def evaluate(self, sample): + # 实现评估逻辑 + return {'score': 2.0} + + +# 创建集合架构 +schema = CollectionSchema() +schema.register_dataset('gsm8k', Gsm8kEvaluator(), weight=1, task_type='math', tags='en,math') +schema.register_dataset('competition_math', CompetitionMathEvaluator(), weight=2, task_type='math', tags='en,math') + +# 创建评估器集合 +evaluator_collection = EvaluatorCollection(schema) +evaluator_collection.add_evaluator('gsm8k') +evaluator_collection.add_evaluator('competition_math') + +# 示例数据 +samples = [{ + 'id': 1, + 'row_data': { + 'prompt': '一艘大船运了6次货,一艘小船运了9次货,大船每次运30吨,小船每次运12吨,大船和小船一共运了多少吨货?', + 'answer': ['288.0'] + }, + 'tags': 'zh_cn,math', + 'task_type': 'math23k', + 'source': 'gsm8k' +}, { + 'id': 2, + 'row_data': { + 'prompt': '0.054-(-0.045)=', + 'answer': ['0.0990'] + }, + 'tags': 'en,math', + 'task_type': 'math401', + 'source': 'competition_math' +}] + +# 生成混合数据集 +mixed_data = generate_mixed_dataset(evaluator_collection, samples) + +# 保存为JSONL文件 +save_to_jsonl(mixed_data, 'mixed_dataset.jsonl') diff --git 
a/examples/tasks/eval_vlm_swift.yaml b/examples/tasks/eval_vlm_swift.yaml index f3e76f71..d55b2673 100644 --- a/examples/tasks/eval_vlm_swift.yaml +++ b/examples/tasks/eval_vlm_swift.yaml @@ -4,7 +4,7 @@ eval_config: model: - type: internvl2-8b # model id of the model name: CustomAPIModel # Don't change, must be CustomAPIModel for deploy evaluation - api_base: http://localhost:8000/v1/chat/completions # deployed model api + api_base: http://localhost:8801/v1/chat/completions # deployed model api key: EMPTY temperature: 0.0 img_size: 224 @@ -21,12 +21,14 @@ eval_config: # - AI2D_TEST # - POPE # - RealWorldQA - - SEEDBench2_Plus + # - SEEDBench2_Plus + - MME mode: all - limit: 10 + limit: 2 reuse: true nproc: 1 + judge: exact_matching # judge model server config - OPENAI_API_KEY: EMPTY - OPENAI_API_BASE: http://localhost:11434/v1/chat/completions # judge model api - LOCAL_LLM: llama3.1:latest # judge model type + # OPENAI_API_KEY: EMPTY + # OPENAI_API_BASE: http://localhost:11434/v1/chat/completions # judge model api + # LOCAL_LLM: llama3.1:latest # judge model type From f67322a6a9a2fbe68d8f1e36c352e3f45d2c45f9 Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Mon, 23 Dec 2024 17:21:36 +0800 Subject: [PATCH 10/15] add collection and sampler --- evalscope/benchmarks/benchmark.py | 4 +- .../competition_math_adapter.py | 114 +++--------------- evalscope/benchmarks/data_adapter.py | 9 +- evalscope/collections/collection_schema.py | 107 ++++++++++++++-- evalscope/collections/data_generator.py | 79 +++++++++--- evalscope/collections/data_sampler.py | 47 -------- evalscope/collections/run.py | 58 --------- evalscope/config.py | 4 +- evalscope/evaluator/evaluator.py | 9 +- evalscope/run.py | 2 +- mixed_data.jsonl | 10 ++ schema.json | 45 +++++++ tests/cli/test_run.py | 9 +- 13 files changed, 249 insertions(+), 248 deletions(-) delete mode 100644 evalscope/collections/data_sampler.py create mode 100644 mixed_data.jsonl create mode 100644 schema.json diff --git a/evalscope/benchmarks/benchmark.py b/evalscope/benchmarks/benchmark.py index aa2820ef..12f00e99 100644 --- a/evalscope/benchmarks/benchmark.py +++ b/evalscope/benchmarks/benchmark.py @@ -24,7 +24,7 @@ class BenchmarkMeta: eval_split: Optional[str] = None prompt_template: str = '' - def update(self, args: dict): + def _update(self, args: dict): if args.get('local_path'): self.dataset_id = args['local_path'] del args['local_path'] @@ -45,7 +45,7 @@ def to_string_dict(self) -> dict: def get_data_adapter(self, config: dict = {}) -> 'DataAdapter': if config: - self.update(config.get(self.name, {})) + self._update(config) data_adapter = self.data_adapter(**self.to_dict()) return data_adapter diff --git a/evalscope/benchmarks/competition_math/competition_math_adapter.py b/evalscope/benchmarks/competition_math/competition_math_adapter.py index 5daed130..9f2af0c2 100644 --- a/evalscope/benchmarks/competition_math/competition_math_adapter.py +++ b/evalscope/benchmarks/competition_math/competition_math_adapter.py @@ -4,53 +4,39 @@ import json import os -from evalscope.benchmarks import DataAdapter -from evalscope.metrics.metrics import weighted_mean -from evalscope.utils import normalize_score +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.metrics import WeightedAverageAccuracy +from evalscope.models import ChatGenerationModelAdapter from evalscope.utils.logger import get_logger # flake8: noqa logger = get_logger() -DATASET_ID = 'modelscope/competition_math' -SUBSET_LIST = ['default'] - +@Benchmark.register( + 
name='competition_math', + dataset_id='modelscope/competition_math', + model_adapter=ChatGenerationModelAdapter, + subset_list=['default'], + metric_list=[WeightedAverageAccuracy], + few_shot_num=4, + train_split='train', + eval_split='test', + prompt_template='', +) class CompetitionMathAdapter(DataAdapter): - """ TODO: To be tested for all models. """ - - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = 'train', - eval_split: str = 'test', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] + """ To be tested for all models. """ - if few_shot_num is None: - # Use 4-shot by default - logger.info(f'Set 4-shot examples by system for MATH.') - few_shot_num = 4 + def __init__(self, **kwargs): + few_shot_num = kwargs.get('few_shot_num', 4) if few_shot_num != 4 and few_shot_num != 0: logger.error(f'The MATH benchmark ONLY supports 4-shot by system or 0-shot settings, ' - f'but got {self.few_shot_num}. Use 4-shot by default.') - few_shot_num = 4 + f'but got {few_shot_num}. Use 4-shot by default.') + kwargs['few_shot_num'] = 4 - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict: dict = {} @@ -119,66 +105,6 @@ def match(self, gold: str, pred: str) -> float: return res - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: - """ - Generate the report for the model output. - - Args: - subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...} - report_name: The user-defined report name. - - Returns: A dict of metric calculation results. 
The format is like: - { - "name":"CompetitionMath", - "metric":"WeightedAverageAccuracy", - "score":0.5632, - "category":[ - { - "name":"DEFAULT", - "score":0.5632, - "subset":[ - { - "name":"main", - "score":0.5632 - }, - ] - } - ], - "total_num":100 - } - """ - total_num: int = sum([num for _, num in subset_score_map.values()]) - weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num - weighted_avg_acc = normalize_score(score=weighted_avg_acc) - cate_avg_list = [{ - 'name': subset_name, - 'score': normalize_score(score=score) - } for subset_name, (score, _) in subset_score_map.items()] - - category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list) - - res_map = dict( - name=report_name or 'competition_math', - metric=self.metric_list[0]['name'], - score=weighted_avg_acc, - category=[category_d], - total_num=total_num) - - return res_map - @classmethod def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str: problem: str = input_d['problem'] diff --git a/evalscope/benchmarks/data_adapter.py b/evalscope/benchmarks/data_adapter.py index 34e49acc..0fd1e787 100644 --- a/evalscope/benchmarks/data_adapter.py +++ b/evalscope/benchmarks/data_adapter.py @@ -63,18 +63,17 @@ def load(self, dataset_name_or_path = os.path.expanduser(dataset_name_or_path) subset_list = subset_list or self.subset_list - logger.info(f'Evaluating on subsets for {dataset_name_or_path}: {subset_list}') - # Try to load dataset from local disk if os.path.exists(dataset_name_or_path): - logger.info( - f'Loading dataset from local disk: > dataset_name: {dataset_name_or_path} > work_dir: {work_dir}') + logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \ + subsets: {subset_list}') data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs) if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0: raise ValueError(f'Local dataset is empty: {dataset_name_or_path}') else: # Load dataset from remote - logger.info(f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path}') + logger.info( + f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path} > subsets: {subset_list}') data_dict = {} split_list = [split for split in [self.train_split, self.eval_split] if split is not None] if len(split_list) == 0: diff --git a/evalscope/collections/collection_schema.py b/evalscope/collections/collection_schema.py index 53089cde..0109f65b 100644 --- a/evalscope/collections/collection_schema.py +++ b/evalscope/collections/collection_schema.py @@ -1,17 +1,104 @@ -class CollectionSchema: +import json +from dataclasses import asdict, dataclass, field +from typing import List, Union + +from evalscope.benchmarks.benchmark import Benchmark + + +@dataclass +class DatasetInfo: + name: str + weight: int = 1 # sample weight in each collection + task_type: str = '' + tags: List[str] = field(default_factory=list) + args: dict = field(default_factory=dict) + + def get_data(self) -> dict: + benchmark_meta = Benchmark.get(self.name) - def __init__(self): - self.datasets = [] + data_adapter = benchmark_meta.get_data_adapter(config=self.args) + data_dict = data_adapter.load( + dataset_name_or_path=benchmark_meta.dataset_id, subset_list=benchmark_meta.subset_list) + prompts = data_adapter.gen_prompts(data_dict) + return prompts - def register_dataset(self, name, evaluator, weight=1, task_type='', tags=''): - dataset_info = {'name': name, 'evaluator': evaluator, 
'weight': weight, 'task_type': task_type, 'tags': tags} - self.datasets.append(dataset_info) - def get_evaluator(self, name): +@dataclass +class CollectionSchema: + name: str + datasets: List[Union[DatasetInfo, 'CollectionSchema']] = field(default_factory=list) + + def __post_init__(self): + # uniform the weight of datasets in each collection + total_weight = sum(dataset.weight for dataset in self.datasets if isinstance(dataset, DatasetInfo)) for dataset in self.datasets: - if dataset['name'] == name: - return dataset['evaluator'] - return None + if isinstance(dataset, DatasetInfo): + dataset.weight = dataset.weight / total_weight + + def add_dataset(self, name, weight=1, task_type='', tags=[]): + self.datasets.append(DatasetInfo(name, weight, task_type, tags)) + + def add_collection(self, collection: 'CollectionSchema'): + self.datasets.append(collection) def get_datasets(self): return self.datasets + + def to_dict(self): + return { + 'name': + self.name, + 'datasets': + [asdict(dataset) if isinstance(dataset, DatasetInfo) else dataset.to_dict() for dataset in self.datasets] + } + + @classmethod + def from_dict(cls, data): + instance = cls(name=data.get('name', '')) + for dataset in data.get('datasets', []): + if 'datasets' in dataset: + instance.datasets.append(CollectionSchema.from_dict(dataset)) + else: + instance.datasets.append(DatasetInfo(**dataset)) + return instance + + def flatten(self) -> List[DatasetInfo]: + flat_datasets = [] + + for dataset in self.datasets: + if isinstance(dataset, CollectionSchema): + nested_datasets = dataset.flatten() + flat_datasets.extend(nested_datasets) + else: + flat_datasets.append(dataset) + return flat_datasets + + def dump_json(self, file_path): + d = self.to_dict() + with open(file_path, 'w') as f: + json.dump(d, f, ensure_ascii=False, indent=4) + + +if __name__ == '__main__': + schema = CollectionSchema( + name='math&reasoning', + datasets=[ + CollectionSchema( + name='math', + datasets=[ + DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']), + DatasetInfo(name='competition_math', weight=2, task_type='math', tags=['en', 'math']), + ]), + CollectionSchema( + name='reasoning', + datasets=[ + DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']), + ]), + ]) + print(schema.to_dict()) + print(schema.flatten()) + schema.dump_json('schema.json') + + schema = CollectionSchema.from_dict(json.load(open('schema.json', 'r'))) + print(schema.to_dict()) + print(schema.flatten()) diff --git a/evalscope/collections/data_generator.py b/evalscope/collections/data_generator.py index be2c5369..2b4a6f8a 100644 --- a/evalscope/collections/data_generator.py +++ b/evalscope/collections/data_generator.py @@ -1,26 +1,69 @@ import json +import random +from abc import ABC, abstractmethod +from collections import defaultdict +from typing import List, Optional +from evalscope.collections.collection_schema import CollectionSchema, DatasetInfo -def generate_mixed_dataset(evaluator_collection, samples): - mixed_data = [] - for sample in samples: - dataset_name = sample['source'] - evaluation_result = evaluator_collection.evaluate(dataset_name, sample) - mixed_data.append({ - 'id': sample['id'], - 'row_data': { - 'prompt': sample['row_data']['prompt'], - 'answer': sample['row_data']['answer'] - }, - 'tags': sample['tags'], - 'task': sample['task_type'], - 'source': dataset_name, - 'evaluation': evaluation_result - }) - return mixed_data + +# Define an abstract base class for Samplers +class Sampler(ABC): + + def __init__(self, 
schema: CollectionSchema, count: Optional[int] = None): + self.schema = schema + self.count = count + + @abstractmethod + def sample(self) -> List[dict]: + pass + + +class WeightedSampler(Sampler): + + def sample(self) -> List[dict]: + all_data = [] + + dataset_info_list = self.schema.flatten() + total_weight = sum(dataset.weight for dataset in dataset_info_list) + + remaining_count = self.count + + for i, dataset in enumerate(dataset_info_list): + data_dict = dataset.get_data() + + dataset_data = [] + for subset_name, subset_data in data_dict.items(): + for prompt in subset_data: + dataset_data.append({ + 'prompt': prompt, + 'tags': dataset.tags, + 'task': dataset.task_type, + 'source': f'{dataset.name}/{subset_name}', + }) + + # For the last dataset, use the remaining count + if i == len(dataset_info_list) - 1: + dataset_sample_count = remaining_count + else: + dataset_sample_count = int((dataset.weight / total_weight) * self.count) + remaining_count -= dataset_sample_count + + sampled_data = random.choices(dataset_data, k=dataset_sample_count) + all_data.extend(sampled_data) + + return all_data def save_to_jsonl(data, file_path): with open(file_path, 'w') as f: - for entry in data: + for i, entry in enumerate(data): + entry['id'] = i f.write(json.dumps(entry, ensure_ascii=False) + '\n') + + +if __name__ == '__main__': + schema = CollectionSchema.from_dict(json.load(open('schema.json', 'r'))) + print(schema.to_dict()) + mixed_data = WeightedSampler(schema, 10).sample() + save_to_jsonl(mixed_data, 'mixed_data.jsonl') diff --git a/evalscope/collections/data_sampler.py b/evalscope/collections/data_sampler.py deleted file mode 100644 index aa904ab5..00000000 --- a/evalscope/collections/data_sampler.py +++ /dev/null @@ -1,47 +0,0 @@ -import random - - -class DatasetSampler: - - def __init__(self, collection_schema): - self.collection_schema = collection_schema - self.datasets = collection_schema['datasets'] - self.total_weight = sum(dataset['weight'] for dataset in self.datasets) - - def sample_dataset(self): - rand_value = random.uniform(0, self.total_weight) - cumulative_weight = 0 - for dataset in self.datasets: - cumulative_weight += dataset['weight'] - if rand_value <= cumulative_weight: - return dataset['name'] - return None - - -# 示例使用 -collection_schema = { - 'collection_name': - 'math', - 'datasets': [ - { - 'name': 'gsm8k', - 'weight': 1, - 'task_type': 'math', - 'tags': 'en,math' - }, - { - 'name': 'competition_math', - 'weight': 2, - 'task_type': 'math', - 'tags': 'en,math' - }, - # 可以继续添加其他数据集 - ] -} - -sampler = DatasetSampler(collection_schema) - -# 采样数据集 -for _ in range(10): - sampled_dataset = sampler.sample_dataset() - print(f"Sampled dataset: {sampled_dataset}") diff --git a/evalscope/collections/run.py b/evalscope/collections/run.py index 94b32c89..e69de29b 100644 --- a/evalscope/collections/run.py +++ b/evalscope/collections/run.py @@ -1,58 +0,0 @@ -# 导入必要的模块 -from evalscope.collections.collection_schema import CollectionSchema -from evalscope.collections.data_generator import generate_mixed_dataset, save_to_jsonl -from evalscope.collections.data_sampler import DatasetSampler -from evalscope.collections.evaluators import EvaluatorCollection - - -# 假设 Gsm8kEvaluator 和 CompetitionMathEvaluator 已定义 -class Gsm8kEvaluator: - - def evaluate(self, sample): - # 实现评估逻辑 - return {'score': 1.0} - - -class CompetitionMathEvaluator: - - def evaluate(self, sample): - # 实现评估逻辑 - return {'score': 2.0} - - -# 创建集合架构 -schema = CollectionSchema() -schema.register_dataset('gsm8k', 
Gsm8kEvaluator(), weight=1, task_type='math', tags='en,math') -schema.register_dataset('competition_math', CompetitionMathEvaluator(), weight=2, task_type='math', tags='en,math') - -# 创建评估器集合 -evaluator_collection = EvaluatorCollection(schema) -evaluator_collection.add_evaluator('gsm8k') -evaluator_collection.add_evaluator('competition_math') - -# 示例数据 -samples = [{ - 'id': 1, - 'row_data': { - 'prompt': '一艘大船运了6次货,一艘小船运了9次货,大船每次运30吨,小船每次运12吨,大船和小船一共运了多少吨货?', - 'answer': ['288.0'] - }, - 'tags': 'zh_cn,math', - 'task_type': 'math23k', - 'source': 'gsm8k' -}, { - 'id': 2, - 'row_data': { - 'prompt': '0.054-(-0.045)=', - 'answer': ['0.0990'] - }, - 'tags': 'en,math', - 'task_type': 'math401', - 'source': 'competition_math' -}] - -# 生成混合数据集 -mixed_data = generate_mixed_dataset(evaluator_collection, samples) - -# 保存为JSONL文件 -save_to_jsonl(mixed_data, 'mixed_dataset.jsonl') diff --git a/evalscope/config.py b/evalscope/config.py index 0566bc13..52a78a74 100644 --- a/evalscope/config.py +++ b/evalscope/config.py @@ -40,8 +40,8 @@ class TaskConfig: chat_template: Optional[str] = None # Dataset-related arguments - datasets: Optional[List[str]] = None - dataset_args: Optional[Dict] = field(default_factory=dict) + datasets: List[str] = field(default_factory=list) + dataset_args: Dict = field(default_factory=dict) dataset_dir: str = DEFAULT_DATASET_CACHE_DIR dataset_hub: str = HubType.MODELSCOPE diff --git a/evalscope/evaluator/evaluator.py b/evalscope/evaluator/evaluator.py index 2f406a29..3d76b704 100644 --- a/evalscope/evaluator/evaluator.py +++ b/evalscope/evaluator/evaluator.py @@ -31,13 +31,8 @@ class Evaluator(object): data_adapter: DataAdapter, the data adapter for the dataset. subset_list: list, the subset list for the dataset. model_adapter: BaseModelAdapter, the model adapter for the model. - use_cache: str, path to local cache. Default: None - outputs_dir: OutputsStructure, the outputs dir. Default: None - datasets_dir: str, the datasets dir. Default: DEFAULT_ROOT_CACHE_DIR - datasets_hub: str, the datasets hub. `Local`, `ModelScope` or `HuggingFace`. Default: 'ModelScope' - stage: str, the stage of evaluation. `all` or `infer` or `review`. Default: 'all' - eval_type: str, the evaluation type. `checkpoint` or `service` or `custom`. Default: 'checkpoint' - overall_task_cfg: dict, the overall task config. Default: None + outputs: OutputsStructure, the outputs dir. Default: None + task_cfg: TaskConfig, the overall task config. Default: None **kwargs: kwargs. """ diff --git a/evalscope/run.py b/evalscope/run.py index 8fd41575..fae14ff0 100644 --- a/evalscope/run.py +++ b/evalscope/run.py @@ -122,7 +122,7 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt """Create an evaluator object for the specified dataset.""" benchmark: BenchmarkMeta = Benchmark.get(dataset_name) - data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args) + data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {})) model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model) # update task_cfg.dataset_args diff --git a/mixed_data.jsonl b/mixed_data.jsonl new file mode 100644 index 00000000..d6d054dd --- /dev/null +++ b/mixed_data.jsonl @@ -0,0 +1,10 @@ +{"prompt": {"data": ["Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. 
They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAngelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n\nQuestion: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nLet's think step by step\nMark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\nHis team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\nThey scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\nAll together his team scored 50+24+10= 84 points\nMark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\nHis opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\nThey also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\nAll together Mark's opponents scored 100+12+5=117 points\nThe total score for the game is both team's scores added together, so it is 84+117=201 points\nThe answer is 201\n\nQuestion: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. 
If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nLet's think step by step\nWhen Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n\nQuestion: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nLet's think step by step\nFor the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n\nQuestion: In one hour, Ezra read twice as many books as Ahmed. Ezra has read 300 books this hour and decided to read 150 more. How many books have they read altogether?\nLet's think step by step\nAnswer:"], "raw_input": {"question": "In one hour, Ezra read twice as many books as Ahmed. Ezra has read 300 books this hour and decided to read 150 more. How many books have they read altogether?", "answer": "If Ezra has read 300 books this hour and decided to read 150 more in the next hour, he has read a total of 300+150=<<300+150=450>>450\nSince Ezra reads twice as many books as Ahmed, Ahmed has read 450/2=<<450/2=225>>225 books.\nTogether, Ahmed and Ezra has read 225+450=<<225+450=675>>675 books\n#### 675"}}, "tags": ["en", "math"], "task": "math", "source": "gsm8k/main", "id": 0} +{"prompt": {"data": ["Problem:\nFind the domain of the expression $\\frac{{\\sqrt{{x-2}}}}{{\\sqrt{{5-x}}}}$.}}\nSolution:\nThe expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\nProblem:\nIf $\\det \\mathbf{{A}} = 2$ and $\\det \\mathbf{{B}} = 12,$ then find $\\det (\\mathbf{{A}} \\mathbf{{B}}).$\nSolution:\nWe have that $\\det (\\mathbf{{A}} \\mathbf{{B}}) = (\\det \\mathbf{{A}})(\\det \\mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\nProblem:\nTerrell usually lifts two 20-pound weights 12 times. 
If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:\nIf Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \\Rightarrow\\qquad n&=480/30=\\boxed{{16}} \\end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\nProblem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \\end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:\nIf we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\nProblem:\nThe graph of the parabola defined by the equation $y=-(x+1)^2+1$ is shifted 1 unit to the right, then shifted 5 units down, then rotated 180 degrees about its vertex. The resulting parabola has zeros at $x=a$ and $x=b$, where $b\\ge a$. What is $b-a$?\nSolution:\n"], "raw_input": {"problem": "The graph of the parabola defined by the equation $y=-(x+1)^2+1$ is shifted 1 unit to the right, then shifted 5 units down, then rotated 180 degrees about its vertex. The resulting parabola has zeros at $x=a$ and $x=b$, where $b\\ge a$. What is $b-a$?", "level": "Level 5", "type": "Algebra", "solution": "The graph of the original parabola ($A$) and its final image ($A'$) after rotation and translation is shown below:\n\n[asy]\n\nLabel f;\n\nf.p=fontsize(4);\n\nxaxis(-4,4,Ticks(f, 2.0));\n\nyaxis(-6,5,Ticks(f, 2.0));\n\nreal f(real x)\n\n{\n\nreturn x^2-4;\n\n}\n\ndraw(\"$A'$\", graph(f,-3,3), linewidth(1));\n\nreal g(real x)\n\n{\n\nreturn -(x+1)^2+1;\n\n}\n\ndraw(\"$A$\", graph(g,-3.5,1.5), linewidth(1));\n\n[/asy]\n\nShifting the original parabola 1 unit to the right changes its equation to $y=-x^2+1$. Shifting this last parabola 5 units down changes its equation to $y=-x^2-4$. Rotating it by 180 degrees changes its equation to $y=x^2-4$. So the equation of $A'$ is $y=x^2-4$. To find the zeros of this parabola, we set $y=0$ to get $0=x^2-4$. Factoring the right hand side, we get $0=(x-2)(x+2)$, so either $x-2=0\\Rightarrow x=2$ or $x+2=0 \\Rightarrow x=-2$. Thus, $a=-2$ and $b=2$, so $b-a=\\boxed{4}$."}}, "tags": ["en", "math"], "task": "math", "source": "competition_math/default", "id": 1} +{"prompt": {"data": ["Problem:\nFind the domain of the expression $\\frac{{\\sqrt{{x-2}}}}{{\\sqrt{{5-x}}}}$.}}\nSolution:\nThe expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\nProblem:\nIf $\\det \\mathbf{{A}} = 2$ and $\\det \\mathbf{{B}} = 12,$ then find $\\det (\\mathbf{{A}} \\mathbf{{B}}).$\nSolution:\nWe have that $\\det (\\mathbf{{A}} \\mathbf{{B}}) = (\\det \\mathbf{{A}})(\\det \\mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. 
I hope it is correct.\nProblem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:\nIf Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \\Rightarrow\\qquad n&=480/30=\\boxed{{16}} \\end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\nProblem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \\end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:\nIf we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\nProblem:\nA juice company sells its product in either a 48-ounce size or a 32-ounce size. It charges $\\$3.90$ for the 48-ounce size. How much should it charge for the smaller size if it wants the price per ounce to be $25\\%$ more than the price per ounce of the larger size?\nSolution:\n"], "raw_input": {"problem": "A juice company sells its product in either a 48-ounce size or a 32-ounce size. It charges $\\$3.90$ for the 48-ounce size. How much should it charge for the smaller size if it wants the price per ounce to be $25\\%$ more than the price per ounce of the larger size?", "level": "Level 5", "type": "Prealgebra", "solution": "We could solve this problem by figuring out the per-ounce cost of the 48-ounce package, increasing it by $25\\%$, and then multiplying that by 32 for the smaller package. However, if we simply increase the price by $25\\%$, and then scale the package size down to 32 ounces from 48 ounces, these are the same calculations, but in a different order that makes it easier to calculate. Thus: $3.90 \\times 1.25 \\times \\frac{32}{48} = \\boxed{3.25\\text{ dollars}}$"}}, "tags": ["en", "math"], "task": "math", "source": "competition_math/default", "id": 2} +{"prompt": {"data": ["Problem:\nFind the domain of the expression $\\frac{{\\sqrt{{x-2}}}}{{\\sqrt{{5-x}}}}$.}}\nSolution:\nThe expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\nProblem:\nIf $\\det \\mathbf{{A}} = 2$ and $\\det \\mathbf{{B}} = 12,$ then find $\\det (\\mathbf{{A}} \\mathbf{{B}}).$\nSolution:\nWe have that $\\det (\\mathbf{{A}} \\mathbf{{B}}) = (\\det \\mathbf{{A}})(\\det \\mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\nProblem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:\nIf Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. 
If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \\Rightarrow\\qquad n&=480/30=\\boxed{{16}} \\end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\nProblem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \\end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:\nIf we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\nProblem:\nSuppose that $x,$ $y,$ and $z$ satisfy the equations\n\\begin{align*}\nxyz &= 4, \\\\\nx^3 + y^3 + z^3 &= 4, \\\\\nxy^2 + x^2 y + xz^2 + x^2 z + yz^2 + y^2 z &= 12.\n\\end{align*}Calculate the value of $xy + yz + zx.$\nSolution:\n"], "raw_input": {"problem": "Suppose that $x,$ $y,$ and $z$ satisfy the equations\n\\begin{align*}\nxyz &= 4, \\\\\nx^3 + y^3 + z^3 &= 4, \\\\\nxy^2 + x^2 y + xz^2 + x^2 z + yz^2 + y^2 z &= 12.\n\\end{align*}Calculate the value of $xy + yz + zx.$", "level": "Level 4", "type": "Intermediate Algebra", "solution": "Let $s_1 = x + y + z$ and $s_2 = xy + xz + yz.$ Then\n\\begin{align*}\ns_1 s_2 &= (x + y + z)(xy + xz + yz) \\\\\n&= x^2 y + xy^2 + x^2 z + xz^2 + y^2 z + yz^2 + 3xyz \\\\\n&= 12 + 3 \\cdot 4 = 24.\n\\end{align*}Also,\n\\begin{align*}\ns_1^3 &= (x + y + z)^3 \\\\\n&= (x^3 + y^3 + z^3) + 3(x^2 y + xy^2 + x^2 z + xz^2 + y^2 z + yz^2) + 6xyz \\\\\n&= 4 + 3 \\cdot 12 + 6 \\cdot 4 = 64,\n\\end{align*}so $s_1 = 4.$ Hence, $s_2 = \\frac{24}{s_1} = \\boxed{6}.$"}}, "tags": ["en", "math"], "task": "math", "source": "competition_math/default", "id": 3} +{"prompt": {"data": ["The average car uses about 25% of the available energy in each gallon of gasoline. The energy not used to run the car is released as heat and sound. Which statement best describes this occurrence?\nA. Energy conversion reduces efficiency.\nB. Energy is destroyed at high temperatures.\nC. Energy loss increases the order in a system.\nD. Energy levels increase as conversions take place.\nAnswer:"], "multi_choices": ["A", "B", "C", "D"], "raw_input": {"id": "Mercury_7145950", "question": "The average car uses about 25% of the available energy in each gallon of gasoline. The energy not used to run the car is released as heat and sound. Which statement best describes this occurrence?", "choices": {"text": ["Energy conversion reduces efficiency.", "Energy is destroyed at high temperatures.", "Energy loss increases the order in a system.", "Energy levels increase as conversions take place."], "label": ["A", "B", "C", "D"]}, "answerKey": "A"}}, "tags": ["en", "reasoning"], "task": "reasoning", "source": "arc/ARC-Easy", "id": 4} +{"prompt": {"data": ["Which process is the best example of a sudden change to Earth's surface?\nA. landslides moving loose rocks downhill\nB. sediments depositing on the seafloor\nC. deltas forming at the mouth of rivers\nD. 
mountains building up\nAnswer:"], "multi_choices": ["A", "B", "C", "D"], "raw_input": {"id": "MEA_2010_8_15", "question": "Which process is the best example of a sudden change to Earth's surface?", "choices": {"text": ["landslides moving loose rocks downhill", "sediments depositing on the seafloor", "deltas forming at the mouth of rivers", "mountains building up"], "label": ["A", "B", "C", "D"]}, "answerKey": "A"}}, "tags": ["en", "reasoning"], "task": "reasoning", "source": "arc/ARC-Challenge", "id": 5} +{"prompt": {"data": ["On January 15th, there were 10 hours and 24 minutes of daylight in Jacksonville, Florida. On the same day, there were only 9 hours and 37 minutes of daylight in New York City. Why did New York have fewer hours of daylight than Florida?\nA. because Earth rotates\nB. because Earth tilts on its axis\nC. because gravity pulls Earth toward the Sun\nD. because the Moon reflects sunlight onto Earth\nAnswer:"], "multi_choices": ["A", "B", "C", "D"], "raw_input": {"id": "Mercury_SC_408336", "question": "On January 15th, there were 10 hours and 24 minutes of daylight in Jacksonville, Florida. On the same day, there were only 9 hours and 37 minutes of daylight in New York City. Why did New York have fewer hours of daylight than Florida?", "choices": {"text": ["because Earth rotates", "because Earth tilts on its axis", "because gravity pulls Earth toward the Sun", "because the Moon reflects sunlight onto Earth"], "label": ["A", "B", "C", "D"]}, "answerKey": "B"}}, "tags": ["en", "reasoning"], "task": "reasoning", "source": "arc/ARC-Easy", "id": 6} +{"prompt": {"data": ["Insulin is a chemical that is released by the endocrine system to increase the absorption of glucose by the body's cells. After which activity would insulin levels in a healthy body increase?\nA. drinking water\nB. taking a nap\nC. swimming\nD. eating\nAnswer:"], "multi_choices": ["A", "B", "C", "D"], "raw_input": {"id": "Mercury_7126613", "question": "Insulin is a chemical that is released by the endocrine system to increase the absorption of glucose by the body's cells. After which activity would insulin levels in a healthy body increase?", "choices": {"text": ["drinking water", "taking a nap", "swimming", "eating"], "label": ["A", "B", "C", "D"]}, "answerKey": "D"}}, "tags": ["en", "reasoning"], "task": "reasoning", "source": "arc/ARC-Easy", "id": 7} +{"prompt": {"data": ["Which statement describes the transfer of energy when an ice cube is placed in a cup of hot tea?\nA. Heat flows from the tea to the ice.\nB. Cold flows from the ice to the tea.\nC. Cold flows from the ice, and heat flows from the tea.\nD. Heat flows simultaneously between the ice and the tea.\nAnswer:"], "multi_choices": ["A", "B", "C", "D"], "raw_input": {"id": "Mercury_7029785", "question": "Which statement describes the transfer of energy when an ice cube is placed in a cup of hot tea?", "choices": {"text": ["Heat flows from the tea to the ice.", "Cold flows from the ice to the tea.", "Cold flows from the ice, and heat flows from the tea.", "Heat flows simultaneously between the ice and the tea."], "label": ["A", "B", "C", "D"]}, "answerKey": "A"}}, "tags": ["en", "reasoning"], "task": "reasoning", "source": "arc/ARC-Easy", "id": 8} +{"prompt": {"data": ["A student walks to school one morning and notices the grass is wet but the streets are dry. Which of these processes most likely caused the grass to be wet?\nA. condensation\nB. erosion\nC. evaporation\nD. 
precipitation\nAnswer:"], "multi_choices": ["A", "B", "C", "D"], "raw_input": {"id": "MSA_2012_5_2", "question": "A student walks to school one morning and notices the grass is wet but the streets are dry. Which of these processes most likely caused the grass to be wet?", "choices": {"text": ["condensation", "erosion", "evaporation", "precipitation"], "label": ["A", "B", "C", "D"]}, "answerKey": "A"}}, "tags": ["en", "reasoning"], "task": "reasoning", "source": "arc/ARC-Challenge", "id": 9} diff --git a/schema.json b/schema.json new file mode 100644 index 00000000..924f0732 --- /dev/null +++ b/schema.json @@ -0,0 +1,45 @@ +{ + "name": "math&reasoning", + "datasets": [ + { + "name": "math", + "datasets": [ + { + "name": "gsm8k", + "weight": 0.3333333333333333, + "task_type": "math", + "tags": [ + "en", + "math" + ], + "args": {} + }, + { + "name": "competition_math", + "weight": 0.6666666666666666, + "task_type": "math", + "tags": [ + "en", + "math" + ], + "args": {} + } + ] + }, + { + "name": "reasoning", + "datasets": [ + { + "name": "arc", + "weight": 1.0, + "task_type": "reasoning", + "tags": [ + "en", + "reasoning" + ], + "args": {} + } + ] + } + ] +} diff --git a/tests/cli/test_run.py b/tests/cli/test_run.py index 16109997..05ffa53c 100644 --- a/tests/cli/test_run.py +++ b/tests/cli/test_run.py @@ -121,10 +121,11 @@ def test_run_server_model(self): api_key='EMPTY', eval_type=EvalType.SERVICE, datasets=[ - 'gsm8k', - 'arc', - 'ceval', - 'bbh', + 'competition_math', + # 'gsm8k', + # 'arc', + # 'ceval', + # 'bbh', # 'hellaswag', ], limit=2, From ebcc800670546a9ea17891ab9a5087f5cba4c158 Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Mon, 23 Dec 2024 17:27:32 +0800 Subject: [PATCH 11/15] remove output --- mixed_data.jsonl | 10 ---------- schema.json | 45 --------------------------------------------- 2 files changed, 55 deletions(-) delete mode 100644 mixed_data.jsonl delete mode 100644 schema.json diff --git a/mixed_data.jsonl b/mixed_data.jsonl deleted file mode 100644 index d6d054dd..00000000 --- a/mixed_data.jsonl +++ /dev/null @@ -1,10 +0,0 @@ -{"prompt": {"data": ["Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAngelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. 
Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n\nQuestion: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nLet's think step by step\nMark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\nHis team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\nThey scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\nAll together his team scored 50+24+10= 84 points\nMark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\nHis opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\nThey also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\nAll together Mark's opponents scored 100+12+5=117 points\nThe total score for the game is both team's scores added together, so it is 84+117=201 points\nThe answer is 201\n\nQuestion: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nLet's think step by step\nWhen Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n\nQuestion: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. 
How many fruits are there?\nLet's think step by step\nFor the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n\nQuestion: In one hour, Ezra read twice as many books as Ahmed. Ezra has read 300 books this hour and decided to read 150 more. How many books have they read altogether?\nLet's think step by step\nAnswer:"], "raw_input": {"question": "In one hour, Ezra read twice as many books as Ahmed. Ezra has read 300 books this hour and decided to read 150 more. How many books have they read altogether?", "answer": "If Ezra has read 300 books this hour and decided to read 150 more in the next hour, he has read a total of 300+150=<<300+150=450>>450\nSince Ezra reads twice as many books as Ahmed, Ahmed has read 450/2=<<450/2=225>>225 books.\nTogether, Ahmed and Ezra has read 225+450=<<225+450=675>>675 books\n#### 675"}}, "tags": ["en", "math"], "task": "math", "source": "gsm8k/main", "id": 0} -{"prompt": {"data": ["Problem:\nFind the domain of the expression $\\frac{{\\sqrt{{x-2}}}}{{\\sqrt{{5-x}}}}$.}}\nSolution:\nThe expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\nProblem:\nIf $\\det \\mathbf{{A}} = 2$ and $\\det \\mathbf{{B}} = 12,$ then find $\\det (\\mathbf{{A}} \\mathbf{{B}}).$\nSolution:\nWe have that $\\det (\\mathbf{{A}} \\mathbf{{B}}) = (\\det \\mathbf{{A}})(\\det \\mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\nProblem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:\nIf Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \\Rightarrow\\qquad n&=480/30=\\boxed{{16}} \\end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\nProblem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \\end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:\nIf we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. 
I hope it is correct.\nProblem:\nThe graph of the parabola defined by the equation $y=-(x+1)^2+1$ is shifted 1 unit to the right, then shifted 5 units down, then rotated 180 degrees about its vertex. The resulting parabola has zeros at $x=a$ and $x=b$, where $b\\ge a$. What is $b-a$?\nSolution:\n"], "raw_input": {"problem": "The graph of the parabola defined by the equation $y=-(x+1)^2+1$ is shifted 1 unit to the right, then shifted 5 units down, then rotated 180 degrees about its vertex. The resulting parabola has zeros at $x=a$ and $x=b$, where $b\\ge a$. What is $b-a$?", "level": "Level 5", "type": "Algebra", "solution": "The graph of the original parabola ($A$) and its final image ($A'$) after rotation and translation is shown below:\n\n[asy]\n\nLabel f;\n\nf.p=fontsize(4);\n\nxaxis(-4,4,Ticks(f, 2.0));\n\nyaxis(-6,5,Ticks(f, 2.0));\n\nreal f(real x)\n\n{\n\nreturn x^2-4;\n\n}\n\ndraw(\"$A'$\", graph(f,-3,3), linewidth(1));\n\nreal g(real x)\n\n{\n\nreturn -(x+1)^2+1;\n\n}\n\ndraw(\"$A$\", graph(g,-3.5,1.5), linewidth(1));\n\n[/asy]\n\nShifting the original parabola 1 unit to the right changes its equation to $y=-x^2+1$. Shifting this last parabola 5 units down changes its equation to $y=-x^2-4$. Rotating it by 180 degrees changes its equation to $y=x^2-4$. So the equation of $A'$ is $y=x^2-4$. To find the zeros of this parabola, we set $y=0$ to get $0=x^2-4$. Factoring the right hand side, we get $0=(x-2)(x+2)$, so either $x-2=0\\Rightarrow x=2$ or $x+2=0 \\Rightarrow x=-2$. Thus, $a=-2$ and $b=2$, so $b-a=\\boxed{4}$."}}, "tags": ["en", "math"], "task": "math", "source": "competition_math/default", "id": 1} -{"prompt": {"data": ["Problem:\nFind the domain of the expression $\\frac{{\\sqrt{{x-2}}}}{{\\sqrt{{5-x}}}}$.}}\nSolution:\nThe expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\nProblem:\nIf $\\det \\mathbf{{A}} = 2$ and $\\det \\mathbf{{B}} = 12,$ then find $\\det (\\mathbf{{A}} \\mathbf{{B}}).$\nSolution:\nWe have that $\\det (\\mathbf{{A}} \\mathbf{{B}}) = (\\det \\mathbf{{A}})(\\det \\mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\nProblem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:\nIf Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \\Rightarrow\\qquad n&=480/30=\\boxed{{16}} \\end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\nProblem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. 
\\end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:\nIf we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\nProblem:\nA juice company sells its product in either a 48-ounce size or a 32-ounce size. It charges $\\$3.90$ for the 48-ounce size. How much should it charge for the smaller size if it wants the price per ounce to be $25\\%$ more than the price per ounce of the larger size?\nSolution:\n"], "raw_input": {"problem": "A juice company sells its product in either a 48-ounce size or a 32-ounce size. It charges $\\$3.90$ for the 48-ounce size. How much should it charge for the smaller size if it wants the price per ounce to be $25\\%$ more than the price per ounce of the larger size?", "level": "Level 5", "type": "Prealgebra", "solution": "We could solve this problem by figuring out the per-ounce cost of the 48-ounce package, increasing it by $25\\%$, and then multiplying that by 32 for the smaller package. However, if we simply increase the price by $25\\%$, and then scale the package size down to 32 ounces from 48 ounces, these are the same calculations, but in a different order that makes it easier to calculate. Thus: $3.90 \\times 1.25 \\times \\frac{32}{48} = \\boxed{3.25\\text{ dollars}}$"}}, "tags": ["en", "math"], "task": "math", "source": "competition_math/default", "id": 2} -{"prompt": {"data": ["Problem:\nFind the domain of the expression $\\frac{{\\sqrt{{x-2}}}}{{\\sqrt{{5-x}}}}$.}}\nSolution:\nThe expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\nProblem:\nIf $\\det \\mathbf{{A}} = 2$ and $\\det \\mathbf{{B}} = 12,$ then find $\\det (\\mathbf{{A}} \\mathbf{{B}}).$\nSolution:\nWe have that $\\det (\\mathbf{{A}} \\mathbf{{B}}) = (\\det \\mathbf{{A}})(\\det \\mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\nProblem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:\nIf Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \\Rightarrow\\qquad n&=480/30=\\boxed{{16}} \\end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\nProblem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \\end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:\nIf we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. 
I hope it is correct.\nProblem:\nSuppose that $x,$ $y,$ and $z$ satisfy the equations\n\\begin{align*}\nxyz &= 4, \\\\\nx^3 + y^3 + z^3 &= 4, \\\\\nxy^2 + x^2 y + xz^2 + x^2 z + yz^2 + y^2 z &= 12.\n\\end{align*}Calculate the value of $xy + yz + zx.$\nSolution:\n"], "raw_input": {"problem": "Suppose that $x,$ $y,$ and $z$ satisfy the equations\n\\begin{align*}\nxyz &= 4, \\\\\nx^3 + y^3 + z^3 &= 4, \\\\\nxy^2 + x^2 y + xz^2 + x^2 z + yz^2 + y^2 z &= 12.\n\\end{align*}Calculate the value of $xy + yz + zx.$", "level": "Level 4", "type": "Intermediate Algebra", "solution": "Let $s_1 = x + y + z$ and $s_2 = xy + xz + yz.$ Then\n\\begin{align*}\ns_1 s_2 &= (x + y + z)(xy + xz + yz) \\\\\n&= x^2 y + xy^2 + x^2 z + xz^2 + y^2 z + yz^2 + 3xyz \\\\\n&= 12 + 3 \\cdot 4 = 24.\n\\end{align*}Also,\n\\begin{align*}\ns_1^3 &= (x + y + z)^3 \\\\\n&= (x^3 + y^3 + z^3) + 3(x^2 y + xy^2 + x^2 z + xz^2 + y^2 z + yz^2) + 6xyz \\\\\n&= 4 + 3 \\cdot 12 + 6 \\cdot 4 = 64,\n\\end{align*}so $s_1 = 4.$ Hence, $s_2 = \\frac{24}{s_1} = \\boxed{6}.$"}}, "tags": ["en", "math"], "task": "math", "source": "competition_math/default", "id": 3} -{"prompt": {"data": ["The average car uses about 25% of the available energy in each gallon of gasoline. The energy not used to run the car is released as heat and sound. Which statement best describes this occurrence?\nA. Energy conversion reduces efficiency.\nB. Energy is destroyed at high temperatures.\nC. Energy loss increases the order in a system.\nD. Energy levels increase as conversions take place.\nAnswer:"], "multi_choices": ["A", "B", "C", "D"], "raw_input": {"id": "Mercury_7145950", "question": "The average car uses about 25% of the available energy in each gallon of gasoline. The energy not used to run the car is released as heat and sound. Which statement best describes this occurrence?", "choices": {"text": ["Energy conversion reduces efficiency.", "Energy is destroyed at high temperatures.", "Energy loss increases the order in a system.", "Energy levels increase as conversions take place."], "label": ["A", "B", "C", "D"]}, "answerKey": "A"}}, "tags": ["en", "reasoning"], "task": "reasoning", "source": "arc/ARC-Easy", "id": 4} -{"prompt": {"data": ["Which process is the best example of a sudden change to Earth's surface?\nA. landslides moving loose rocks downhill\nB. sediments depositing on the seafloor\nC. deltas forming at the mouth of rivers\nD. mountains building up\nAnswer:"], "multi_choices": ["A", "B", "C", "D"], "raw_input": {"id": "MEA_2010_8_15", "question": "Which process is the best example of a sudden change to Earth's surface?", "choices": {"text": ["landslides moving loose rocks downhill", "sediments depositing on the seafloor", "deltas forming at the mouth of rivers", "mountains building up"], "label": ["A", "B", "C", "D"]}, "answerKey": "A"}}, "tags": ["en", "reasoning"], "task": "reasoning", "source": "arc/ARC-Challenge", "id": 5} -{"prompt": {"data": ["On January 15th, there were 10 hours and 24 minutes of daylight in Jacksonville, Florida. On the same day, there were only 9 hours and 37 minutes of daylight in New York City. Why did New York have fewer hours of daylight than Florida?\nA. because Earth rotates\nB. because Earth tilts on its axis\nC. because gravity pulls Earth toward the Sun\nD. because the Moon reflects sunlight onto Earth\nAnswer:"], "multi_choices": ["A", "B", "C", "D"], "raw_input": {"id": "Mercury_SC_408336", "question": "On January 15th, there were 10 hours and 24 minutes of daylight in Jacksonville, Florida. 
On the same day, there were only 9 hours and 37 minutes of daylight in New York City. Why did New York have fewer hours of daylight than Florida?", "choices": {"text": ["because Earth rotates", "because Earth tilts on its axis", "because gravity pulls Earth toward the Sun", "because the Moon reflects sunlight onto Earth"], "label": ["A", "B", "C", "D"]}, "answerKey": "B"}}, "tags": ["en", "reasoning"], "task": "reasoning", "source": "arc/ARC-Easy", "id": 6} -{"prompt": {"data": ["Insulin is a chemical that is released by the endocrine system to increase the absorption of glucose by the body's cells. After which activity would insulin levels in a healthy body increase?\nA. drinking water\nB. taking a nap\nC. swimming\nD. eating\nAnswer:"], "multi_choices": ["A", "B", "C", "D"], "raw_input": {"id": "Mercury_7126613", "question": "Insulin is a chemical that is released by the endocrine system to increase the absorption of glucose by the body's cells. After which activity would insulin levels in a healthy body increase?", "choices": {"text": ["drinking water", "taking a nap", "swimming", "eating"], "label": ["A", "B", "C", "D"]}, "answerKey": "D"}}, "tags": ["en", "reasoning"], "task": "reasoning", "source": "arc/ARC-Easy", "id": 7} -{"prompt": {"data": ["Which statement describes the transfer of energy when an ice cube is placed in a cup of hot tea?\nA. Heat flows from the tea to the ice.\nB. Cold flows from the ice to the tea.\nC. Cold flows from the ice, and heat flows from the tea.\nD. Heat flows simultaneously between the ice and the tea.\nAnswer:"], "multi_choices": ["A", "B", "C", "D"], "raw_input": {"id": "Mercury_7029785", "question": "Which statement describes the transfer of energy when an ice cube is placed in a cup of hot tea?", "choices": {"text": ["Heat flows from the tea to the ice.", "Cold flows from the ice to the tea.", "Cold flows from the ice, and heat flows from the tea.", "Heat flows simultaneously between the ice and the tea."], "label": ["A", "B", "C", "D"]}, "answerKey": "A"}}, "tags": ["en", "reasoning"], "task": "reasoning", "source": "arc/ARC-Easy", "id": 8} -{"prompt": {"data": ["A student walks to school one morning and notices the grass is wet but the streets are dry. Which of these processes most likely caused the grass to be wet?\nA. condensation\nB. erosion\nC. evaporation\nD. precipitation\nAnswer:"], "multi_choices": ["A", "B", "C", "D"], "raw_input": {"id": "MSA_2012_5_2", "question": "A student walks to school one morning and notices the grass is wet but the streets are dry. 
Which of these processes most likely caused the grass to be wet?", "choices": {"text": ["condensation", "erosion", "evaporation", "precipitation"], "label": ["A", "B", "C", "D"]}, "answerKey": "A"}}, "tags": ["en", "reasoning"], "task": "reasoning", "source": "arc/ARC-Challenge", "id": 9} diff --git a/schema.json b/schema.json deleted file mode 100644 index 924f0732..00000000 --- a/schema.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "name": "math&reasoning", - "datasets": [ - { - "name": "math", - "datasets": [ - { - "name": "gsm8k", - "weight": 0.3333333333333333, - "task_type": "math", - "tags": [ - "en", - "math" - ], - "args": {} - }, - { - "name": "competition_math", - "weight": 0.6666666666666666, - "task_type": "math", - "tags": [ - "en", - "math" - ], - "args": {} - } - ] - }, - { - "name": "reasoning", - "datasets": [ - { - "name": "arc", - "weight": 1.0, - "task_type": "reasoning", - "tags": [ - "en", - "reasoning" - ], - "args": {} - } - ] - } - ] -} From 0c4e87d59cb48dedcd3ffaad082a3d53c23cf8a5 Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Mon, 23 Dec 2024 20:52:09 +0800 Subject: [PATCH 12/15] add mix evaluator --- evalscope/collections/collection_schema.py | 4 +- evalscope/collections/data_generator.py | 5 +- evalscope/collections/evaluators.py | 106 ++++++++++++++++++--- evalscope/config.py | 4 +- evalscope/evaluator/evaluator.py | 12 +-- evalscope/models/__init__.py | 6 +- evalscope/models/base_adapter.py | 25 ++++- evalscope/models/local_model.py | 31 +++++- evalscope/run.py | 48 +--------- tests/cli/test_run.py | 10 +- 10 files changed, 170 insertions(+), 81 deletions(-) diff --git a/evalscope/collections/collection_schema.py b/evalscope/collections/collection_schema.py index 0109f65b..74662e68 100644 --- a/evalscope/collections/collection_schema.py +++ b/evalscope/collections/collection_schema.py @@ -97,8 +97,8 @@ def dump_json(self, file_path): ]) print(schema.to_dict()) print(schema.flatten()) - schema.dump_json('schema.json') + schema.dump_json('outputs/schema.json') - schema = CollectionSchema.from_dict(json.load(open('schema.json', 'r'))) + schema = CollectionSchema.from_dict(json.load(open('outputs/schema.json', 'r'))) print(schema.to_dict()) print(schema.flatten()) diff --git a/evalscope/collections/data_generator.py b/evalscope/collections/data_generator.py index 2b4a6f8a..f3bde5ee 100644 --- a/evalscope/collections/data_generator.py +++ b/evalscope/collections/data_generator.py @@ -1,10 +1,9 @@ import json import random from abc import ABC, abstractmethod -from collections import defaultdict from typing import List, Optional -from evalscope.collections.collection_schema import CollectionSchema, DatasetInfo +from evalscope.collections.collection_schema import CollectionSchema # Define an abstract base class for Samplers @@ -66,4 +65,4 @@ def save_to_jsonl(data, file_path): schema = CollectionSchema.from_dict(json.load(open('schema.json', 'r'))) print(schema.to_dict()) mixed_data = WeightedSampler(schema, 10).sample() - save_to_jsonl(mixed_data, 'mixed_data.jsonl') + save_to_jsonl(mixed_data, 'outputs/mixed_data.jsonl') diff --git a/evalscope/collections/evaluators.py b/evalscope/collections/evaluators.py index 70c0ed0d..2a978abd 100644 --- a/evalscope/collections/evaluators.py +++ b/evalscope/collections/evaluators.py @@ -1,15 +1,99 @@ +import os +from collections import defaultdict +from datetime import datetime + +from evalscope.benchmarks import Benchmark +from evalscope.config import TaskConfig +from evalscope.constants import EvalType +from evalscope.evaluator 
import Evaluator
+from evalscope.models import get_local_model, initialize_model_adapter
+from evalscope.utils import get_logger
+from evalscope.utils.io_utils import OutputsStructure, jsonl_to_list
+
+logger = get_logger()
+
+
+class MixEvaluator(Evaluator):
+    """Run a registered benchmark's data/model adapters over pre-sampled prompts from a mixed dataset."""
+
+    def __init__(self, data_adapter, model_adapter, task_cfg, outputs):
+        super().__init__(
+            dataset_name_or_path='mixed_data',
+            data_adapter=data_adapter,
+            model_adapter=model_adapter,
+            task_cfg=task_cfg,
+            outputs=outputs)
+
+    def evaluate(self, samples: dict, infer_cfg: dict, debug: bool):
+        logger.info(f'**** Start evaluating on dataset {self.dataset_name_or_path} ****')
+
+        reviews_score_all = {}  # {subset_name: (score, num)}
+        stage_answers_dict = {}
+        stage_reviews_dict = {}
+
+        for subset_name, prompts_list in samples.items():
+
+            answers_list: list = self.get_answers(
+                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=infer_cfg, debug=debug)
+
+            stage_answers_dict[subset_name] = answers_list
+
+            reviews_list: list = self.get_reviews(subset_name=subset_name, answers_list=answers_list, debug=debug)
+
+            metric_res = self.compute_metrics(reviews_list=reviews_list)
+            reviews_score_all[subset_name] = (metric_res, len(reviews_list))
+            stage_reviews_dict[subset_name] = reviews_list
+
+        # Generate report
+        report_map = self.dump_report(reviews_score_all)
+
+        logger.info(f'**** Evaluation finished on {self.dataset_name_or_path} ****\n')
+
+        return report_map
+
+
 class EvaluatorCollection:
+    """Build one MixEvaluator per source benchmark and run them all over a mixed dataset."""
 
-    def __init__(self, schema):
-        self.schema = schema
-        self.evaluators = {}
+    def __init__(self, task_cfg: TaskConfig, dataset):
+        self.task_cfg = task_cfg
+        self.dataset = dataset
+        self.model = get_local_model(task_cfg)
+        self.outputs = OutputsStructure(
+            outputs_dir=os.path.join(self.task_cfg.work_dir,
+                                     datetime.now().strftime('%Y%m%d%H%M%S')))
+        self.dataset_dict = self.parse_dataset()
+        self.evaluators = self.add_evaluator()
+
+    def parse_dataset(self):
+        # Group samples by dataset and subset, e.g. source 'gsm8k/main' -> dataset_dict['gsm8k']['main']
+        dataset_dict = defaultdict(lambda: defaultdict(list))
+        for sample in self.dataset:
+            source = sample['source']
+            dataset_name, subset_name = source.split('/')
+            dataset_dict[dataset_name][subset_name].append(sample['prompt'])
+        return dataset_dict
+
+    def add_evaluator(self):
+        evaluators = {}
+        for dataset_name in self.dataset_dict.keys():
+            benchmark = Benchmark.get(dataset_name)
+            data_adapter = benchmark.get_data_adapter()
+            model_adapter = initialize_model_adapter(self.task_cfg, benchmark.model_adapter, self.model)
+            evaluators[dataset_name] = MixEvaluator(data_adapter, model_adapter, self.task_cfg, self.outputs)
+        return evaluators
+
+    def evaluate(self):
+        for dataset_name, evaluator in self.evaluators.items():
+            evaluator.evaluate(
+                samples=self.dataset_dict[dataset_name],
+                infer_cfg=self.task_cfg.generation_config,
+                debug=self.task_cfg.debug)
+
 
-    def add_evaluator(self, dataset_name):
-        evaluator = self.schema.get_evaluator(dataset_name)
-        if evaluator:
-            self.evaluators[dataset_name] = evaluator
+if __name__ == '__main__':
+    dataset = jsonl_to_list('outputs/mixed_data.jsonl')
+    task_cfg = TaskConfig(
+        model='qwen2.5',
+        api_url='http://127.0.0.1:8801/v1/chat/completions',
+        api_key='EMPTY',
+        eval_type=EvalType.SERVICE,
+    )
 
-    def evaluate(self, dataset_name, sample):
-        evaluator = self.evaluators.get(dataset_name)
-        if evaluator:
-            return evaluator.evaluate(sample)
+    evaluator_collection = EvaluatorCollection(task_cfg, dataset)
+    evaluator_collection.evaluate()
diff --git a/evalscope/config.py b/evalscope/config.py
index 52a78a74..81371bc4 100644
--- a/evalscope/config.py
+++ 
b/evalscope/config.py @@ -31,7 +31,7 @@ @dataclass class TaskConfig: # Model-related arguments - model: Union[str, CustomModel, None] = None + model: Union[str, 'CustomModel', None] = None model_id: Optional[str] = None model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {}) @@ -74,8 +74,6 @@ def __post_init__(self): self.model_id = type(self.model).__name__ else: self.model_id = os.path.basename(self.model).rstrip(os.sep) - # Convert Enum to string - self.eval_backend = str(self.eval_backend) def to_dict(self): return self.__dict__ diff --git a/evalscope/evaluator/evaluator.py b/evalscope/evaluator/evaluator.py index 3d76b704..e5306bf6 100644 --- a/evalscope/evaluator/evaluator.py +++ b/evalscope/evaluator/evaluator.py @@ -29,7 +29,6 @@ class Evaluator(object): if the dataset is a local path, e.g. /path/to/your_dataset_name, then the task name will be the basename of the path, which is `your_dataset_name`. data_adapter: DataAdapter, the data adapter for the dataset. - subset_list: list, the subset list for the dataset. model_adapter: BaseModelAdapter, the model adapter for the model. outputs: OutputsStructure, the outputs dir. Default: None task_cfg: TaskConfig, the overall task config. Default: None @@ -40,7 +39,6 @@ def __init__(self, dataset_name_or_path: str, data_adapter: DataAdapter, model_adapter: BaseModelAdapter, - subset_list: list = None, outputs: OutputsStructure = None, task_cfg: TaskConfig = None, **kwargs): @@ -50,17 +48,14 @@ def __init__(self, self.model_name = task_cfg.model_id self.custom_task_name = f'{self.model_name}_{self.dataset_name}' - self.datasets_dir = os.path.expanduser(task_cfg.dataset_dir) - self.kwargs = kwargs self.data_adapter = data_adapter self.model_adapter = model_adapter + self.model_cfg = model_adapter.model_cfg self.eval_type = task_cfg.eval_type - self.subset_list = subset_list self.dataset_hub = task_cfg.dataset_hub self.stage = task_cfg.stage self.use_cache = task_cfg.use_cache self.task_cfg = task_cfg - self.model_cfg = model_adapter.model_cfg # Deal with the output paths self.outputs_structure = outputs @@ -69,13 +64,12 @@ def __init__(self, def load_dataset(self): dataset = self.data_adapter.load( dataset_name_or_path=self.dataset_name_or_path, - subset_list=self.subset_list, - work_dir=self.datasets_dir, + subset_list=self.data_adapter.subset_list, + work_dir=os.path.expanduser(self.task_cfg.dataset_dir), datasets_hub=self.dataset_hub, **self.kwargs) # Get prompts from dataset - # TODO: support sampler prompts = self.data_adapter.gen_prompts(data_dict=dataset) return prompts diff --git a/evalscope/models/__init__.py b/evalscope/models/__init__.py index 90f126ee..09dee522 100644 --- a/evalscope/models/__init__.py +++ b/evalscope/models/__init__.py @@ -1,16 +1,16 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from evalscope.models.base_adapter import BaseModelAdapter +from evalscope.models.base_adapter import BaseModelAdapter, initialize_model_adapter from evalscope.models.chat_adapter import ChatGenerationModelAdapter from evalscope.models.choice_adapter import ContinuationLogitsModelAdapter, MultiChoiceModelAdapter from evalscope.models.custom import CustomModel from evalscope.models.custom_adapter import CustomModelAdapter -from evalscope.models.local_model import LocalModel +from evalscope.models.local_model import LocalModel, get_local_model from evalscope.models.model import BaseModel, ChatBaseModel, OpenAIModel from evalscope.models.server_adapter import ServerModelAdapter __all__ = [ 'CustomModel', 'BaseModel', 'ChatBaseModel', 'OpenAIModel', 'BaseModelAdapter', 'ChatGenerationModelAdapter', 'MultiChoiceModelAdapter', 'ContinuationLogitsModelAdapter', 'CustomModelAdapter', 'ServerModelAdapter', - 'LocalModel' + 'LocalModel', 'get_local_model', 'initialize_model_adapter' ] diff --git a/evalscope/models/base_adapter.py b/evalscope/models/base_adapter.py index 32ec490c..8eff3a0e 100644 --- a/evalscope/models/base_adapter.py +++ b/evalscope/models/base_adapter.py @@ -1,10 +1,14 @@ import torch from abc import ABC, abstractmethod -from typing import Any, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union +from evalscope.constants import EvalType from evalscope.models.custom import CustomModel from evalscope.models.local_model import LocalModel +if TYPE_CHECKING: + from evalscope.config import TaskConfig + class BaseModelAdapter(ABC): @@ -27,3 +31,22 @@ def __init__(self, model: Optional[Union[LocalModel, CustomModel]], **kwargs): @torch.no_grad() def predict(self, *args, **kwargs) -> Any: raise NotImplementedError + + +def initialize_model_adapter(task_cfg: 'TaskConfig', model_adapter_cls: 'BaseModelAdapter', base_model: 'LocalModel'): + """Initialize the model adapter based on the task configuration.""" + if task_cfg.dry_run: + from evalscope.models.model import DummyChatModel + return DummyChatModel(model_cfg=dict()) + elif task_cfg.eval_type == EvalType.CUSTOM: + if not isinstance(task_cfg.model, CustomModel): + raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.') + from evalscope.models import CustomModelAdapter + return CustomModelAdapter(custom_model=task_cfg.model) + elif task_cfg.eval_type == EvalType.SERVICE: + from evalscope.models import ServerModelAdapter + return ServerModelAdapter( + api_url=task_cfg.api_url, model_id=task_cfg.model, api_key=task_cfg.api_key, seed=task_cfg.seed) + else: + return model_adapter_cls( + model=base_model, generation_config=task_cfg.generation_config, chat_template=task_cfg.chat_template) diff --git a/evalscope/models/local_model.py b/evalscope/models/local_model.py index 3702781f..502e0643 100644 --- a/evalscope/models/local_model.py +++ b/evalscope/models/local_model.py @@ -1,10 +1,14 @@ import torch from modelscope import AutoModelForCausalLM, AutoTokenizer from torch import dtype +from typing import TYPE_CHECKING, Optional -from evalscope.constants import DEFAULT_MODEL_CACHE_DIR +from evalscope.constants import DEFAULT_MODEL_CACHE_DIR, DEFAULT_MODEL_REVISION, EvalType from evalscope.utils.logger import get_logger +if TYPE_CHECKING: + from evalscope.config import TaskConfig + logger = get_logger() @@ -12,7 +16,7 @@ class LocalModel: def __init__(self, model_id: str, - model_revision: str = 'master', + model_revision: str = DEFAULT_MODEL_REVISION, device_map: str = 'auto', 
torch_dtype: dtype = torch.bfloat16, cache_dir: str = None, @@ -45,3 +49,26 @@ def __init__(self, 'device_map': device_map, 'torch_dtype': str(torch_dtype), } + + +def get_local_model(task_cfg: 'TaskConfig') -> Optional[LocalModel]: + """Get the base local model for the task. If the task is not checkpoint-based, return None. + Avoids loading model multiple times for different datasets. + """ + if task_cfg.eval_type != EvalType.CHECKPOINT: + return None + else: + device_map = task_cfg.model_args.get('device_map', 'auto') if torch.cuda.is_available() else None + cache_dir = task_cfg.model_args.get('cache_dir', None) + model_precision = task_cfg.model_args.get('precision', torch.float16) + model_revision = task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION) + if isinstance(model_precision, str) and model_precision != 'auto': + model_precision = eval(model_precision) + + base_model = LocalModel( + model_id=task_cfg.model, + model_revision=model_revision, + device_map=device_map, + torch_dtype=model_precision, + cache_dir=cache_dir) + return base_model diff --git a/evalscope/run.py b/evalscope/run.py index fae14ff0..e968b6d2 100644 --- a/evalscope/run.py +++ b/evalscope/run.py @@ -3,7 +3,6 @@ Run evaluation for LLMs. """ import os.path -import torch from argparse import Namespace from datetime import datetime from typing import List, Optional, Union @@ -11,9 +10,9 @@ from evalscope.arguments import parse_args from evalscope.benchmarks import Benchmark, BenchmarkMeta from evalscope.config import TaskConfig, parse_task_config -from evalscope.constants import DEFAULT_MODEL_REVISION, DEFAULT_WORK_DIR, EvalBackend, EvalType +from evalscope.constants import DEFAULT_WORK_DIR, EvalBackend from evalscope.evaluator import Evaluator -from evalscope.models import BaseModelAdapter, CustomModel, LocalModel +from evalscope.models import LocalModel, get_local_model, initialize_model_adapter from evalscope.utils import seed_everything from evalscope.utils.io_utils import OutputsStructure, are_paths_same from evalscope.utils.logger import configure_logging, get_logger @@ -131,55 +130,12 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt return Evaluator( dataset_name_or_path=benchmark.dataset_id, data_adapter=data_adapter, - subset_list=benchmark.subset_list, model_adapter=model_adapter, outputs=outputs, task_cfg=task_cfg, ) -def get_local_model(task_cfg: TaskConfig) -> Optional[LocalModel]: - """Get the base local model for the task. If the task is not checkpoint-based, return None. - Avoids loading model multiple times for different datasets. 
- """ - if task_cfg.eval_type != EvalType.CHECKPOINT: - return None - else: - device_map = task_cfg.model_args.get('device_map', 'auto') if torch.cuda.is_available() else None - cache_dir = task_cfg.model_args.get('cache_dir', None) - model_precision = task_cfg.model_args.get('precision', torch.float16) - model_revision = task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION) - if isinstance(model_precision, str) and model_precision != 'auto': - model_precision = eval(model_precision) - - base_model = LocalModel( - model_id=task_cfg.model, - model_revision=model_revision, - device_map=device_map, - torch_dtype=model_precision, - cache_dir=cache_dir) - return base_model - - -def initialize_model_adapter(task_cfg: TaskConfig, model_adapter_cls: BaseModelAdapter, base_model: LocalModel): - """Initialize the model adapter based on the task configuration.""" - if task_cfg.dry_run: - from evalscope.models.model import DummyChatModel - return DummyChatModel(model_cfg=dict()) - elif task_cfg.eval_type == EvalType.CUSTOM: - if not isinstance(task_cfg.model, CustomModel): - raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.') - from evalscope.models import CustomModelAdapter - return CustomModelAdapter(custom_model=task_cfg.model) - elif task_cfg.eval_type == EvalType.SERVICE: - from evalscope.models import ServerModelAdapter - return ServerModelAdapter( - api_url=task_cfg.api_url, model_id=task_cfg.model, api_key=task_cfg.api_key, seed=task_cfg.seed) - else: - return model_adapter_cls( - model=base_model, generation_config=task_cfg.generation_config, chat_template=task_cfg.chat_template) - - def main(): args = parse_args() run_task(args) diff --git a/tests/cli/test_run.py b/tests/cli/test_run.py index 05ffa53c..5353273b 100644 --- a/tests/cli/test_run.py +++ b/tests/cli/test_run.py @@ -71,7 +71,15 @@ def test_run_eval_with_args(self): @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level') def test_run_task(self): - task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['bbh', 'hellaswag', 'gsm8k', 'arc'], 'limit': 2, 'debug': True} + task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', + 'datasets': [ + # 'bbh', + # 'hellaswag', + # 'gsm8k', + 'arc' + ], + 'limit': 2, + 'debug': True} run_task(task_cfg=task_cfg) From b957f83e8fe7c7982ccda1f14edfc9f212dc04da Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Tue, 24 Dec 2024 14:50:44 +0800 Subject: [PATCH 13/15] add evaluator --- evalscope/collections/data_generator.py | 11 +- evalscope/collections/evaluator.py | 158 ++++++++++++++++++ evalscope/collections/evaluators.py | 99 ----------- .../{collection_schema.py => schema.py} | 0 evalscope/evaluator/evaluator.py | 103 +++++------- 5 files changed, 203 insertions(+), 168 deletions(-) create mode 100644 evalscope/collections/evaluator.py delete mode 100644 evalscope/collections/evaluators.py rename evalscope/collections/{collection_schema.py => schema.py} (100%) diff --git a/evalscope/collections/data_generator.py b/evalscope/collections/data_generator.py index f3bde5ee..6fc1297a 100644 --- a/evalscope/collections/data_generator.py +++ b/evalscope/collections/data_generator.py @@ -1,9 +1,10 @@ import json import random from abc import ABC, abstractmethod +from tqdm import tqdm from typing import List, Optional -from evalscope.collections.collection_schema import CollectionSchema +from evalscope.collections.schema import CollectionSchema # Define an abstract base class for Samplers @@ -28,7 +29,7 @@ def sample(self) -> 
List[dict]: remaining_count = self.count - for i, dataset in enumerate(dataset_info_list): + for i, dataset in enumerate(tqdm(dataset_info_list)): data_dict = dataset.get_data() dataset_data = [] @@ -38,7 +39,9 @@ def sample(self) -> List[dict]: 'prompt': prompt, 'tags': dataset.tags, 'task': dataset.task_type, - 'source': f'{dataset.name}/{subset_name}', + 'weight': dataset.weight, + 'dataset_name': dataset.name, + 'subset_name': subset_name, }) # For the last dataset, use the remaining count @@ -62,7 +65,7 @@ def save_to_jsonl(data, file_path): if __name__ == '__main__': - schema = CollectionSchema.from_dict(json.load(open('schema.json', 'r'))) + schema = CollectionSchema.from_dict(json.load(open('outputs/schema.json', 'r'))) print(schema.to_dict()) mixed_data = WeightedSampler(schema, 10).sample() save_to_jsonl(mixed_data, 'outputs/mixed_data.jsonl') diff --git a/evalscope/collections/evaluator.py b/evalscope/collections/evaluator.py new file mode 100644 index 00000000..fa4e3a26 --- /dev/null +++ b/evalscope/collections/evaluator.py @@ -0,0 +1,158 @@ +import json +import os +import pandas as pd +from collections import defaultdict +from datetime import datetime +from tqdm import tqdm + +from evalscope.benchmarks import Benchmark +from evalscope.config import TaskConfig +from evalscope.constants import AnswerKeys, DumpMode, EvalType, ReviewKeys +from evalscope.evaluator import Evaluator +from evalscope.models import get_local_model, initialize_model_adapter +from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list +from evalscope.utils.logger import get_logger + +logger = get_logger() + + +class SimpleEvaluator(Evaluator): + + def __init__(self, dataset_name, data_adapter, model_adapter, task_cfg, outputs): + super().__init__( + dataset_name_or_path=dataset_name, + data_adapter=data_adapter, + model_adapter=model_adapter, + task_cfg=task_cfg, + outputs=outputs) + + def get_answer(self, input_prompt, subset_name, infer_cfg) -> dict: + answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg) + answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg) + processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id) + return processed_answer + + def get_review(self, answer_d) -> dict: + review_id, reviewer_spec = self._generate_review_id(answer_d) + review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec) + return review_d + + +class EvaluatorCollection: + + def __init__(self, task_cfg: TaskConfig): + self.task_cfg = task_cfg + self.model = get_local_model(task_cfg) + self.outputs = OutputsStructure( + outputs_dir=os.path.join(self.task_cfg.work_dir, + datetime.now().strftime('%Y%m%d%H%M%S'))) + self.raw_dataset = jsonl_to_list(self.task_cfg.dataset_args['data_collection']['local_path']) + self.dataset_name_map, self.dataset_id_map = self._parse_dataset() + self.evaluators = self._initialize_evaluators() + + def _parse_dataset(self): + dataset_name_map = defaultdict(lambda: defaultdict(list)) + dataset_id_map = {} + for sample in self.raw_dataset: + dataset_name, subset_name = sample['dataset_name'], sample['subset_name'] + dataset_name_map[dataset_name][subset_name].append(sample['id']) + dataset_id_map[sample['id']] = sample + return dataset_name_map, dataset_id_map + + def _initialize_evaluators(self): + evaluators = {} + for dataset_name in self.dataset_name_map.keys(): + benchmark = Benchmark.get(dataset_name) + data_adapter = 
benchmark.get_data_adapter() + model_adapter = initialize_model_adapter(self.task_cfg, benchmark.model_adapter, self.model) + evaluators[dataset_name] = SimpleEvaluator(dataset_name, data_adapter, model_adapter, self.task_cfg, + self.outputs) + return evaluators + + def get_report(self, reviews): + data = [] + for dataset_name, data_map in self.dataset_name_map.items(): + for subset_name, ids in data_map.items(): + for _id in ids: + review_d = reviews[_id] + row_data = self.dataset_id_map[_id] + score = self.get_pred_score(review_d) + data.append({ + 'task_type': row_data['task'], + 'dataset_name': dataset_name, + 'subset_name': subset_name, + 'tags': row_data['tags'], + 'score': score + }) + + df = pd.DataFrame(data) + + # Multi-level aggregation + subset_report_df = df.groupby(['task_type', 'dataset_name', 'subset_name']).agg( + average_score=('score', 'mean'), count=('score', 'size')).reset_index() + + dataset_report_df = df.groupby(['task_type', 'dataset_name']).agg( + average_score=('score', 'mean'), count=('score', 'size')).reset_index() + + task_report_df = df.groupby(['task_type']).agg( + average_score=('score', 'mean'), count=('score', 'size')).reset_index() + + # Combine all reports into a single dictionary + report = { + 'subset_level': subset_report_df.to_dict(orient='records'), + 'dataset_level': dataset_report_df.to_dict(orient='records'), + 'task_level': task_report_df.to_dict(orient='records') + } + + # Log the report + logger.info(f"Report:\n{pd.DataFrame(report['subset_level']).to_markdown(index=False)}") + + # Save the report to a JSON file + report_file_path = os.path.join(self.outputs.reports_dir, 'data_collection.json') + with open(report_file_path, 'w', encoding='utf-8') as f: + json.dump(report, f, ensure_ascii=False, indent=4) + + def get_answers(self): + pred_file_path = os.path.join(self.outputs.predictions_dir, 'data_collection.jsonl') + answers = defaultdict(dict) + for sample in tqdm(self.raw_dataset, desc='Getting answers'): + evaluator = self.evaluators[sample['dataset_name']] + answer_d = evaluator.get_answer(sample['prompt'], sample['subset_name'], self.task_cfg.generation_config) + answers[sample['id']] = answer_d + dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND) + return answers + + def get_reviews(self, answers): + review_file_path = os.path.join(self.outputs.reviews_dir, 'data_collection.jsonl') + reviews = defaultdict(dict) + for sample in tqdm(self.raw_dataset, desc='Getting reviews'): + evaluator = self.evaluators[sample['dataset_name']] + review_d = evaluator.get_review(answers[sample['id']]) + reviews[sample['id']] = review_d + dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND) + return reviews + + @staticmethod + def get_pred_score(review_d) -> float: + return review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT] + + def evaluate(self): + answers = self.get_answers() + reviews = self.get_reviews(answers) + self.get_report(reviews) + + +if __name__ == '__main__': + task_cfg = TaskConfig( + model='qwen2.5', + api_url='http://127.0.0.1:8801/v1/chat/completions', + api_key='EMPTY', + eval_type=EvalType.SERVICE, + datasets=['data_collection'], + dataset_args={'data_collection': { + 'local_path': 'outputs/mixed_data.jsonl' + }}, + ) + + evaluator_collection = EvaluatorCollection(task_cfg) + evaluator_collection.evaluate() diff --git a/evalscope/collections/evaluators.py b/evalscope/collections/evaluators.py deleted file mode 100644 index 2a978abd..00000000 --- 
a/evalscope/collections/evaluators.py +++ /dev/null @@ -1,99 +0,0 @@ -import os -from collections import defaultdict -from datetime import datetime - -from evalscope.benchmarks import Benchmark -from evalscope.config import TaskConfig -from evalscope.constants import EvalType -from evalscope.evaluator import Evaluator -from evalscope.models import get_local_model, initialize_model_adapter -from evalscope.utils import logger -from evalscope.utils.io_utils import OutputsStructure, jsonl_to_list - - -class MixEvaluator(Evaluator): - - def __init__(self, data_adapter, model_adapter, task_cfg, outputs): - super().__init__( - dataset_name_or_path='mixed_data', - data_adapter=data_adapter, - model_adapter=model_adapter, - task_cfg=task_cfg, - outputs=outputs) - - def evaluate(self, samples: dict, infer_cfg: dict, debug: bool): - logger.info(f'**** Start evaluating on dataset {self.dataset_name_or_path} ****') - - reviews_score_all = {} # {subset_name: (score, num)} - stage_answers_dict = {} - stage_reviews_dict = {} - - for subset_name, prompts_list in samples.items(): - - answers_list: list = self.get_answers( - subset_name=subset_name, prompts_list=prompts_list, infer_cfg=infer_cfg, debug=debug) - - stage_answers_dict[subset_name] = answers_list - - reviews_list: list = self.get_reviews(subset_name=subset_name, answers_list=answers_list, debug=debug) - - metric_res = self.compute_metrics(reviews_list=reviews_list) - reviews_score_all[subset_name] = (metric_res, len(reviews_list)) - stage_reviews_dict[subset_name] = reviews_list - - # Generate report - report_map = self.dump_report(reviews_score_all) - - logger.info(f'**** Evaluation finished on {self.dataset_name_or_path} ****\n') - - return report_map - - -class EvaluatorCollection: - - def __init__(self, task_cfg: TaskConfig, dataset): - self.task_cfg = task_cfg - self.dataset = dataset - self.model = get_local_model(task_cfg) - self.outputs = OutputsStructure( - outputs_dir=os.path.join(self.task_cfg.work_dir, - datetime.now().strftime('%Y%m%d%H%M%S'))) - self.dataset_dict = self.parse_dataset() - self.evaluators = self.add_evaluator() - - def parse_dataset(self): - dataset_dict = defaultdict(lambda: defaultdict(list)) - for sample in self.dataset: - source = sample['source'] - dataset_name, subset_name = source.split('/') - dataset_dict[dataset_name][subset_name].append(sample['prompt']) - return dataset_dict - - def add_evaluator(self): - evaluators = {} - for dataset_name in self.dataset_dict.keys(): - benchmark = Benchmark.get(dataset_name) - data_adapter = benchmark.get_data_adapter() - model_adapter = initialize_model_adapter(self.task_cfg, benchmark.model_adapter, self.model) - evaluators[dataset_name] = MixEvaluator(data_adapter, model_adapter, self.task_cfg, self.outputs) - return evaluators - - def evaluate(self): - for dataset_name, evaluator in self.evaluators.items(): - evaluator.evaluate( - samples=self.dataset_dict[dataset_name], - infer_cfg=self.task_cfg.generation_config, - debug=self.task_cfg.debug) - - -if __name__ == '__main__': - dataset = jsonl_to_list('outputs/mixed_data.jsonl') - task_cfg = TaskConfig( - model='qwen2.5', - api_url='http://127.0.0.1:8801/v1/chat/completions', - api_key='EMPTY', - eval_type=EvalType.SERVICE, - ) - - evaluator_collection = EvaluatorCollection(task_cfg, dataset) - evaluator_collection.evaluate() diff --git a/evalscope/collections/collection_schema.py b/evalscope/collections/schema.py similarity index 100% rename from evalscope/collections/collection_schema.py rename to 
evalscope/collections/schema.py diff --git a/evalscope/evaluator/evaluator.py b/evalscope/evaluator/evaluator.py index e5306bf6..bf65d51e 100644 --- a/evalscope/evaluator/evaluator.py +++ b/evalscope/evaluator/evaluator.py @@ -73,13 +73,19 @@ def load_dataset(self): prompts = self.data_adapter.gen_prompts(data_dict=dataset) return prompts - def _pred_answer(self, input_d: dict, infer_cfg: dict, subset_name: str, answer_id: str = None) -> dict: - - ans: dict = self.model_adapter.predict(inputs=input_d, infer_cfg=infer_cfg) - ans[AnswerKeys.ANSWER_ID] = answer_id - ans[AnswerKeys.SUBSET_NAME] = subset_name - - return ans + def _generate_answer_id(self, model_cfg, input_d, infer_cfg): + model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False) + input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_d).items())), ensure_ascii=False) + infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False) + return 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str) + + def _process_answer(self, answer_d, input_d, subset_name, answer_id): + answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg + answer_d[AnswerKeys.ANSWER_ID] = answer_id + answer_d[AnswerKeys.SUBSET_NAME] = subset_name + answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT] + answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d + return answer_d def get_answers(self, subset_name: str, @@ -130,57 +136,24 @@ def get_answers(self, resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict( inputs=prompts_list, infer_cfg=infer_cfg) - assert len(prompts_list) == len(resp_answers_list), \ - f'Length of prompts_list({len(prompts_list)}) != Length of resp_answers_list({len(resp_answers_list)})' - - for in_d, resp_d in zip(prompts_list, resp_answers_list): - - # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg) - model_cfg_str = json.dumps( - OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())), - ensure_ascii=False) - input_prompt_str = json.dumps( - OrderedDict(sorted(dict_torch_dtype_to_str(in_d).items())), ensure_ascii=False) - infer_cfg_str = json.dumps( - OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False) - answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str) - - resp_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg - resp_d[AnswerKeys.ANSWER_ID] = answer_id - resp_d[AnswerKeys.SUBSET_NAME] = subset_name - resp_d[AnswerKeys.RAW_INPUT] = in_d[AnswerKeys.RAW_INPUT] - resp_d[AnswerKeys.ORIGIN_PROMPT] = in_d - - answers_list.append(resp_d) - dump_jsonl_data(resp_d, pred_file_path, dump_mode=DumpMode.APPEND) + for input_prompt, answer_d in zip(prompts_list, resp_answers_list): + answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg) + processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id) + answers_list.append(processed_answer) + dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND) else: for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '): - - # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg) - model_cfg_str = json.dumps( - OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())), - ensure_ascii=False) - input_prompt_str = json.dumps( - 
OrderedDict(sorted(dict_torch_dtype_to_str(input_prompt).items())), ensure_ascii=False) - infer_cfg_str = json.dumps( - OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False) - answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str) - - # Get answers - answer_d: dict = self._pred_answer( - input_d=input_prompt, infer_cfg=infer_cfg, subset_name=subset_name, answer_id=answer_id) - - answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg - answer_d[AnswerKeys.RAW_INPUT] = input_prompt[AnswerKeys.RAW_INPUT] - answer_d[AnswerKeys.ORIGIN_PROMPT] = input_prompt + answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg) + answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg) + processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id) if debug: logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n') - logger.info(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n') + logger.info(f'**predicted ans: {json.dumps(processed_answer, ensure_ascii=False)} \n') - answers_list.append(answer_d) - dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND) + answers_list.append(processed_answer) + dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND) logger.info(f'Dump predictions to {pred_file_path}.') return answers_list @@ -224,6 +197,19 @@ def _get_review(self, answer_d: dict, review_id: str = None, reviewer_spec: dict return review_res + def _generate_review_id(self, answer_d): + # Gen review_id (concat: answer_id + reviewer_spec) + answer_id = answer_d[AnswerKeys.ANSWER_ID] + reviewer_spec = { + 'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list], + 'reviewer': ['Evaluator'], + 'revision': ['default'] + } + reviewer_spec_str = json.dumps( + OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False) + review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str) + return review_id, reviewer_spec + def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list: """ Get reviews from answers. 
@@ -247,19 +233,7 @@ def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = logger.warning(f'Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...') for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '): - - # Gen review_id (concat: answer_id + reviewer_spec) - answer_id = answer_d[AnswerKeys.ANSWER_ID] - - reviewer_spec: dict = { - 'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list], - 'reviewer': ['Evaluator'], - 'revision': ['default'] - } - reviewer_spec_str = json.dumps( - OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False) - review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str) - + review_id, reviewer_spec = self._generate_review_id(answer_d) # Get review review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec) @@ -267,7 +241,6 @@ def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = logger.info(review_d) reviews_list.append(review_d) - # Dump reviews dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND) From 95aa74103f9cf8e9a65443b291af59e1d83b70ff Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Tue, 24 Dec 2024 16:33:19 +0800 Subject: [PATCH 14/15] register all data --- evalscope/benchmarks/__init__.py | 2 +- evalscope/benchmarks/cmmlu/cmmlu_adapter.py | 61 +++------- .../general_qa/general_qa_adapter.py | 35 +++--- .../benchmarks/humaneval/humaneval_adapter.py | 80 +++--------- evalscope/benchmarks/mmlu/mmlu_adapter.py | 65 ++++------ evalscope/benchmarks/race/race_adapter.py | 78 ++++-------- .../benchmarks/trivia_qa/trivia_qa_adapter.py | 115 ++++-------------- .../truthful_qa/truthful_qa_adapter.py | 56 ++++----- evalscope/collections/__init__.py | 3 + evalscope/collections/data_generator.py | 50 +++++--- evalscope/collections/evaluator.py | 50 ++++---- evalscope/collections/run.py | 0 evalscope/metrics/__init__.py | 5 +- evalscope/run.py | 6 + tests/cli/test_collection.py | 56 +++++++++ tests/cli/test_run.py | 14 ++- 16 files changed, 284 insertions(+), 392 deletions(-) delete mode 100644 evalscope/collections/run.py create mode 100644 tests/cli/test_collection.py diff --git a/evalscope/benchmarks/__init__.py b/evalscope/benchmarks/__init__.py index 444d5e79..984f4f00 100644 --- a/evalscope/benchmarks/__init__.py +++ b/evalscope/benchmarks/__init__.py @@ -20,4 +20,4 @@ module_path = relative_path[:-3].replace(os.path.sep, '.') # strip '.py' and convert to module path full_path = f'evalscope.benchmarks.{module_path}' importlib.import_module(full_path) - print(f'Importing {full_path}') + # print(f'Importing {full_path}') diff --git a/evalscope/benchmarks/cmmlu/cmmlu_adapter.py b/evalscope/benchmarks/cmmlu/cmmlu_adapter.py index 7e358f81..8fc41dd4 100644 --- a/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +++ b/evalscope/benchmarks/cmmlu/cmmlu_adapter.py @@ -3,8 +3,10 @@ import csv import os -from evalscope.benchmarks.data_adapter import DataAdapter -from evalscope.metrics.metrics import exact_match, weighted_mean +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.constants import EvalType +from evalscope.metrics import WeightedAverageAccuracy, exact_match +from evalscope.models import MultiChoiceModelAdapter from evalscope.utils import ResponseParser, normalize_score from evalscope.utils.logger import get_logger @@ -12,8 +14,6 @@ logger = get_logger() -DATASET_ID = 'modelscope/cmmlu' - SUBSET_LIST = [ 'agronomy', 
'anatomy', 'ancient_chinese', 'arts', 'astronomy', 'business_ethics', 'chinese_civil_service_exam', 'chinese_driving_rule', 'chinese_food_culture', 'chinese_foreign_policy', 'chinese_history', 'chinese_literature', @@ -101,31 +101,23 @@ } +@Benchmark.register( + name='cmmlu', + dataset_id='modelscope/cmmlu', + model_adapter=MultiChoiceModelAdapter, + subset_list=SUBSET_LIST, + metric_list=[WeightedAverageAccuracy], + few_shot_num=5, + train_split='dev', + eval_split='test', +) class CMMLUAdapter(DataAdapter): choices = ['A', 'B', 'C', 'D'] - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = 5, - train_split: str = 'dev', - eval_split: str = 'test', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] + def __init__(self, **kwargs): - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -187,7 +179,7 @@ def get_gold_answer(self, input_d: dict) -> str: # Get the gold choice return input_d.get('Answer', '') - def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str: + def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str: """ Parse the model output to get the answer. Could be the best choice index. @@ -199,11 +191,11 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st Returns: The parsed answer. Depending on the dataset. Usually a string for chat. """ - if eval_type == 'checkpoint': + if eval_type == EvalType.CHECKPOINT: return result - elif eval_type == 'service': + elif eval_type == EvalType.SERVICE: return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked ! - elif eval_type == 'custom': + elif eval_type == EvalType.CUSTOM: return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked ! else: raise ValueError(f'Invalid eval_type: {eval_type}') @@ -211,19 +203,6 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st def match(self, gold: str, pred: str) -> float: return exact_match(gold=gold, pred=pred) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: """ Generate report for the evaluation. 
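
The cmmlu change above shows the registration pattern that the remaining adapter diffs in this patch repeat: the per-dataset constants and long __init__ signatures are replaced by a single @Benchmark.register(...) decorator plus a **kwargs constructor. A minimal sketch of that pattern for a hypothetical adapter (the names my_qa, my_org/my_qa and MyQAAdapter are illustrative only, not part of this patch):

    from evalscope.benchmarks import Benchmark, DataAdapter
    from evalscope.metrics import WeightedAverageAccuracy, exact_match
    from evalscope.models import ChatGenerationModelAdapter


    @Benchmark.register(
        name='my_qa',                      # key referenced by TaskConfig.datasets
        dataset_id='my_org/my_qa',         # hypothetical ModelScope id (or a local path)
        model_adapter=ChatGenerationModelAdapter,
        subset_list=['default'],
        metric_list=[WeightedAverageAccuracy],
        few_shot_num=0,
        train_split=None,
        eval_split='test',
    )
    class MyQAAdapter(DataAdapter):

        def __init__(self, **kwargs):
            super().__init__(**kwargs)

        def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
            # Build the prompt dict consumed by the model adapter.
            return {'data': [input_d['question']]}

        def get_gold_answer(self, input_d: dict) -> str:
            return input_d['answer']

        def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
            return result

        def match(self, gold: str, pred: str) -> float:
            return exact_match(gold=gold, pred=pred)

Keeping the defaults next to the adapter this way lets Benchmark.get(name) hand a fully configured BenchmarkMeta to the evaluator without per-dataset arguments being hard-coded at the call site.
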
diff --git a/evalscope/benchmarks/general_qa/general_qa_adapter.py b/evalscope/benchmarks/general_qa/general_qa_adapter.py index c0178a96..e2941687 100644 --- a/evalscope/benchmarks/general_qa/general_qa_adapter.py +++ b/evalscope/benchmarks/general_qa/general_qa_adapter.py @@ -5,35 +5,32 @@ from collections import defaultdict from typing import Any, Optional -from evalscope.benchmarks.data_adapter import DataAdapter -from evalscope.metrics.metrics import bleu_ngram_one_sample, weighted_mean -from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.metrics import (WeightedAverageBLEU, bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, + weighted_mean) +from evalscope.models import ChatGenerationModelAdapter from evalscope.utils.io_utils import jsonl_to_list from evalscope.utils.logger import get_logger logger = get_logger() -DATASET_ID = 'general_qa' -SUBSET_LIST = ['default'] - +@Benchmark.register( + name='general_qa', + dataset_id='general_qa', + model_adapter=ChatGenerationModelAdapter, + subset_list=['default'], + metric_list=[WeightedAverageBLEU], + few_shot_num=0, + train_split=None, + eval_split='test', +) class GeneralQAAdapter(DataAdapter): # TODO: set few_shot_num - def __init__(self, - subset_list: list = None, - metric_list: list = None, - train_split: str = None, - eval_split: str = 'test', - **kwargs): - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageBLEU', 'object': weighted_mean}] + def __init__(self, **kwargs): - super().__init__( - subset_list=subset_list, metric_list=metric_list, train_split=train_split, eval_split=eval_split, **kwargs) + super().__init__(**kwargs) def load(self, dataset_name_or_path: str, subset_list: list = None, **kwargs) -> dict: diff --git a/evalscope/benchmarks/humaneval/humaneval_adapter.py b/evalscope/benchmarks/humaneval/humaneval_adapter.py index 501a0e32..39d80976 100644 --- a/evalscope/benchmarks/humaneval/humaneval_adapter.py +++ b/evalscope/benchmarks/humaneval/humaneval_adapter.py @@ -2,33 +2,34 @@ import re from typing import List -from evalscope.benchmarks.data_adapter import DataAdapter -from evalscope.metrics.metrics import weighted_mean -from evalscope.utils import normalize_score +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.metrics import Pass1 +from evalscope.models import ChatGenerationModelAdapter from evalscope.utils.logger import get_logger logger = get_logger() -DATASET_ID = 'modelscope/humaneval' -SUBSET_LIST = ['openai_humaneval'] - # Example: # {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 
4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa +@Benchmark.register( + name='humaneval', + dataset_id='modelscope/humaneval', + model_adapter=ChatGenerationModelAdapter, + subset_list=['openai_humaneval'], + metric_list=[Pass1], + few_shot_num=0, + train_split=None, + eval_split='test', + prompt_template='Complete the following python code:\n', +) class HumanevalAdapter(DataAdapter): """ A placeholder for humaneval adapter, see HumanevalEvaluator for implementation. """ - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = None, - eval_split: str = 'test', - prompt_template: str = None, - **kwargs): + def __init__(self, **kwargs): try: from human_eval.data import stream_jsonl, write_jsonl from human_eval.evaluation import check_correctness @@ -37,15 +38,6 @@ def __init__(self, 'https://github.com/openai/human-eval/tree/master#installation , ' 'Note that you need to enable the execution code in the human_eval/execution.py first.') - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'pass@1', 'object': weighted_mean}] - - if prompt_template is None: - prompt_template = 'Complete the following python code:\n' - self.k = [1] self.num_workers = 4 self.timeout = 4.0 @@ -54,14 +46,7 @@ def __init__(self, self.write_jsonl_func = write_jsonl self.eval_func = check_correctness - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - prompt_template=prompt_template, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -85,26 +70,6 @@ def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict: return {'data': [full_prompt]} - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: - total_num: int = sum([num for _, num in subset_score_map.values()]) - weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num - weighted_avg_acc = normalize_score(score=weighted_avg_acc) - cate_avg_list = [{ - 'name': subset_name, - 'score': normalize_score(score=score) - } for subset_name, (score, _) in subset_score_map.items()] - - category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list) - - res_map = dict( - name=report_name or 'HumanEval', - metric=self.metric_list[0]['name'], - score=weighted_avg_acc, - category=[category_d], - total_num=total_num) - - return res_map - @classmethod def _postprocess(cls, text: str) -> str: if '```' in text: @@ -129,19 +94,6 @@ def _postprocess(cls, text: str) -> str: text = '\n'.join([' ' + line for line in text.split('\n')]) return text - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. 
- """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str: return self._postprocess(result) diff --git a/evalscope/benchmarks/mmlu/mmlu_adapter.py b/evalscope/benchmarks/mmlu/mmlu_adapter.py index ed6769c7..d77839c9 100644 --- a/evalscope/benchmarks/mmlu/mmlu_adapter.py +++ b/evalscope/benchmarks/mmlu/mmlu_adapter.py @@ -2,8 +2,10 @@ import csv import os -from evalscope.benchmarks.data_adapter import DataAdapter -from evalscope.metrics.metrics import exact_match, weighted_mean +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.constants import EvalType +from evalscope.metrics import WeightedAverageAccuracy, exact_match +from evalscope.models import MultiChoiceModelAdapter from evalscope.utils import ResponseParser, normalize_score from evalscope.utils.logger import get_logger @@ -134,37 +136,29 @@ } +@Benchmark.register( + name='mmlu', + dataset_id='modelscope/mmlu', + model_adapter=MultiChoiceModelAdapter, + subset_list=SUBSET_LIST, + metric_list=[WeightedAverageAccuracy], + few_shot_num=5, + train_split='train', + eval_split='test', + prompt_template='', +) class MMLUAdapter(DataAdapter): choices = ['A', 'B', 'C', 'D'] - def __init__(self, - subset_list: list = SUBSET_LIST, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = 'train', - eval_split: str = 'test', - **kwargs): - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] - - if few_shot_num is None: - # Use 5-shot by default - logger.info(f'Set 5-shot examples by system for MMLU.') - few_shot_num = 5 + def __init__(self, **kwargs): + few_shot_num = kwargs.get('few_shot_num', 5) if few_shot_num > 5: logger.warning(f'few_shot_num <= 5 for MMLU, but got {few_shot_num}. Use 5-shot by default.') - few_shot_num = 5 + kwargs['few_shot_num'] = 5 - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -241,7 +235,7 @@ def get_gold_answer(self, input_d: dict) -> str: # Get the gold choice return input_d.get('target', '') - def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str: + def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str: """ Parse the model output to get the answer. Could be the best choice index. @@ -253,11 +247,11 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st Returns: The parsed answer. Depending on the dataset. Usually a string for chat. """ - if eval_type == 'checkpoint': + if eval_type == EvalType.CHECKPOINT: return result - elif eval_type == 'service': + elif eval_type == EvalType.SERVICE: return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked ! - elif eval_type == 'custom': + elif eval_type == EvalType.CUSTOM: return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked ! 
else: raise ValueError(f'Invalid eval_type: {eval_type}') @@ -265,19 +259,6 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st def match(self, gold: str, pred: str) -> float: return exact_match(gold=gold, pred=pred) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: """ Generate report for the evaluation. diff --git a/evalscope/benchmarks/race/race_adapter.py b/evalscope/benchmarks/race/race_adapter.py index 3496db9e..bf73882a 100644 --- a/evalscope/benchmarks/race/race_adapter.py +++ b/evalscope/benchmarks/race/race_adapter.py @@ -1,11 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import json import os -from evalscope.benchmarks.data_adapter import DataAdapter -from evalscope.metrics.metrics import exact_match, weighted_mean -from evalscope.utils import normalize_score +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.constants import EvalType +from evalscope.metrics import WeightedAverageAccuracy, exact_match +from evalscope.models import MultiChoiceModelAdapter +from evalscope.utils import ResponseParser, normalize_score from evalscope.utils.io_utils import jsonl_to_list from evalscope.utils.logger import get_logger @@ -13,46 +14,30 @@ logger = get_logger() -DATASET_ID = 'modelscope/race' - -SUBSET_LIST = ['high', 'middle'] - SUBJECT_MAPPING = {'high': 'High', 'middle': 'Middle'} +@Benchmark.register( + name='race', + dataset_id='modelscope/race', + model_adapter=MultiChoiceModelAdapter, + subset_list=['high', 'middle'], + metric_list=[WeightedAverageAccuracy], + few_shot_num=3, + train_split='train', + eval_split='test', +) class RACEAdapter(DataAdapter): choices = ['A', 'B', 'C', 'D'] - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = 'train', - eval_split: str = 'test', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] - - if few_shot_num is None: - logger.info(f'Set 3-shot examples by system for RACE.') - few_shot_num = 3 - + def __init__(self, **kwargs): + few_shot_num = kwargs.get('few_shot_num', 3) if few_shot_num > 3: logger.warning(f'few_shot_num <= 3 for RACE, but got {few_shot_num}. Use 3-shot by default.') - few_shot_num = 3 + kwargs['few_shot_num'] = 3 - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -105,7 +90,7 @@ def get_gold_answer(self, input_d: dict) -> str: # Get the gold choice return input_d.get('answer', '') - def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str: + def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str: """ Parse the model output to get the answer. Could be the best choice index. 
@@ -117,31 +102,18 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st Returns: The parsed answer. Depending on the dataset. Usually a string for chat. """ - if eval_type == 'checkpoint': - return result - elif eval_type == 'service': # TODO: to be implemented - return result - elif eval_type == 'custom': # TODO: to be implemented + if eval_type == EvalType.CHECKPOINT: return result + elif eval_type == EvalType.SERVICE: + return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked ! + elif eval_type == EvalType.CUSTOM: + return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked ! else: raise ValueError(f'Unknown eval_type: {eval_type}') def match(self, gold: str, pred: str) -> float: return exact_match(gold=gold, pred=pred) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: """ Generate report for the evaluation. diff --git a/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py b/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py index 1923b819..c604128f 100644 --- a/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +++ b/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py @@ -5,45 +5,35 @@ import os from typing import List +from evalscope.benchmarks import Benchmark from evalscope.benchmarks.data_adapter import DataAdapter -from evalscope.metrics.metrics import exact_match, weighted_mean -from evalscope.utils.logger import get_logger +from evalscope.constants import EvalType +from evalscope.metrics import WeightedAverageAccuracy +from evalscope.metrics.metrics import exact_match +from evalscope.models import ChatGenerationModelAdapter +from evalscope.utils import get_logger +from evalscope.utils.utils import ResponseParser # flake8: noqa logger = get_logger() -DATASET_ID = 'modelscope/trivia_qa' -SUBSET_LIST = ['default'] - +@Benchmark.register( + name='trivia_qa', + dataset_id='modelscope/trivia_qa', + model_adapter=ChatGenerationModelAdapter, + subset_list=['default'], + metric_list=[WeightedAverageAccuracy], + few_shot_num=5, + train_split='dev', + eval_split='test', +) class TriviaQaAdapter(DataAdapter): - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = 'dev', - eval_split: str = 'test', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] + def __init__(self, **kwargs): - if few_shot_num is None: - logger.info(f'few_shot_num is not specified for TriviaQA, use default value: 5') - few_shot_num = 5 - - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -122,7 +112,7 @@ def get_gold_answer(self, input_d: dict) -> list: ans: list = input_d.get('ideal', []) return ans - def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str: + def parse_pred_result(self, 
result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str: """ Parse the model output to get the answer. @@ -134,74 +124,11 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st Returns: The predicted answer. """ - if eval_type == 'checkpoint': - return result - elif eval_type == 'service': # TODO: to be implemented - return result - elif eval_type == 'custom': # TODO: to be implemented - return result - else: - raise ValueError(f'Unknown eval_type: {eval_type}') + return ResponseParser.parse_first_option(result) def match(self, gold: list, pred: str) -> float: return max([exact_match(gold=ref, pred=pred) for ref in gold]) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: - """ - Generate the report for the model output. - - Args: - subset_score_map: {subset_name: (score, num), ...} - report_name: The user-defined report name. - - Returns: - { - "name":"TriviaQA", - "metric":"WeightedAverageAccuracy", - "score":0.3389, - "category":[ - { - "name":"DEFAULT", - "score":0.3389, - "subset":[ - { - "name":"default", - "score":0.3389 - } - ] - } - ], - "total_num":100 - } - """ - total_num: int = sum([num for _, num in subset_score_map.values()]) - weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num - cate_avg_list = [{'name': subset_name, 'score': score} for subset_name, (score, _) in subset_score_map.items()] - - category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list) - - res_map = dict( - name=report_name or 'trivia_qa', - metric=self.metric_list[0]['name'], - score=weighted_avg_acc, - category=[category_d], - total_num=total_num) - - return res_map - @classmethod def _generate_prompt(cls, input_d: dict, include_answer=True) -> str: diff --git a/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py b/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py index ca6128d2..4a83c5e9 100644 --- a/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +++ b/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py @@ -6,20 +6,31 @@ import os from typing import List +from evalscope.benchmarks import Benchmark from evalscope.benchmarks.data_adapter import DataAdapter +from evalscope.constants import EvalType +from evalscope.metrics import WeightedAverageAccuracy from evalscope.metrics.metrics import weighted_mean -from evalscope.utils import normalize_score -from evalscope.utils.logger import get_logger +from evalscope.models import ContinuationLogitsModelAdapter +from evalscope.utils import get_logger, normalize_score # flake8: noqa logger = get_logger() -DATASET_ID = 'modelscope/truthful_qa' # task_list = ['generation', 'multiple_choice'] -SUBSET_LIST = ['multiple_choice'] +@Benchmark.register( + name='truthful_qa', + dataset_id='modelscope/truthful_qa', + model_adapter=ContinuationLogitsModelAdapter, + subset_list=['multiple_choice'], + metric_list=[WeightedAverageAccuracy], + few_shot_num=0, + train_split=None, + eval_split='validation', +) class TruthfulQaAdapter(DataAdapter): """ Adapter for TruthfulQA benchmark. 
@@ -40,35 +51,14 @@ class TruthfulQaAdapter(DataAdapter): 'Q: Where were the 1992 Olympics held?\n' 'A: The 1992 Olympics were held in Barcelona, Spain.') - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = None, - eval_split: str = 'validation', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] - - if few_shot_num is None: - logger.info(f'Set 0-shot examples by system for TruthfulQA.') - few_shot_num = 0 + def __init__(self, **kwargs): + few_shot_num = kwargs.get('few_shot_num', 0) if few_shot_num != 0: logger.warning(f'few_shot_num should be 0 for TruthfulQA, but got {few_shot_num}. Use 0-shot by default.') - few_shot_num = 0 + kwargs['few_shot_num'] = 0 - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -215,7 +205,7 @@ def get_gold_answer(self, input_d: dict) -> dict: # TODO: generation sub-task to be added return {'mc1_labels': input_d['mc1_targets']['labels'], 'mc2_labels': input_d['mc2_targets']['labels']} - def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> list: + def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> list: """ Parse the model output to get the answer. @@ -227,11 +217,11 @@ def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: s Returns: The predicted answer. """ - if eval_type == 'checkpoint': + if eval_type == EvalType.CHECKPOINT: return result - elif eval_type == 'service': # TODO: to be supported ! + elif eval_type == EvalType.SERVICE: # TODO: to be supported ! return result - elif eval_type == 'custom': # TODO: to be supported ! + elif eval_type == EvalType.CUSTOM: # TODO: to be supported ! 
return result else: raise ValueError(f'Invalid eval_type: {eval_type}') diff --git a/evalscope/collections/__init__.py b/evalscope/collections/__init__.py index e69de29b..c87f613f 100644 --- a/evalscope/collections/__init__.py +++ b/evalscope/collections/__init__.py @@ -0,0 +1,3 @@ +from evalscope.collections.data_generator import WeightedSampler +from evalscope.collections.evaluator import EvaluatorCollection +from evalscope.collections.schema import CollectionSchema diff --git a/evalscope/collections/data_generator.py b/evalscope/collections/data_generator.py index 6fc1297a..499abac6 100644 --- a/evalscope/collections/data_generator.py +++ b/evalscope/collections/data_generator.py @@ -1,10 +1,12 @@ import json import random from abc import ABC, abstractmethod +from dataclasses import asdict, dataclass, field from tqdm import tqdm from typing import List, Optional from evalscope.collections.schema import CollectionSchema +from evalscope.utils.io_utils import dump_jsonl_data # Define an abstract base class for Samplers @@ -19,30 +21,42 @@ def sample(self) -> List[dict]: pass +@dataclass +class DatasetEntry: + index: int = 0 + prompt: dict = field(default_factory=dict) + tags: List[str] = field(default_factory=list) + task: str = '' + weight: float = 0.0 + dataset_name: str = '' + subset_name: str = '' + + class WeightedSampler(Sampler): def sample(self) -> List[dict]: - all_data = [] + all_data: List[DatasetEntry] = [] dataset_info_list = self.schema.flatten() total_weight = sum(dataset.weight for dataset in dataset_info_list) remaining_count = self.count - for i, dataset in enumerate(tqdm(dataset_info_list)): + for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')): data_dict = dataset.get_data() dataset_data = [] for subset_name, subset_data in data_dict.items(): for prompt in subset_data: - dataset_data.append({ - 'prompt': prompt, - 'tags': dataset.tags, - 'task': dataset.task_type, - 'weight': dataset.weight, - 'dataset_name': dataset.name, - 'subset_name': subset_name, - }) + dataset_data.append( + DatasetEntry( + prompt=prompt, + tags=dataset.tags, + task=dataset.task_type, + weight=dataset.weight, + dataset_name=dataset.name, + subset_name=subset_name, + )) # For the last dataset, use the remaining count if i == len(dataset_info_list) - 1: @@ -54,18 +68,16 @@ def sample(self) -> List[dict]: sampled_data = random.choices(dataset_data, k=dataset_sample_count) all_data.extend(sampled_data) - return all_data - - -def save_to_jsonl(data, file_path): - with open(file_path, 'w') as f: - for i, entry in enumerate(data): - entry['id'] = i - f.write(json.dumps(entry, ensure_ascii=False) + '\n') + # update index + result = [] + for i, entry in enumerate(all_data): + entry.index = i + result.append(asdict(entry)) + return result if __name__ == '__main__': schema = CollectionSchema.from_dict(json.load(open('outputs/schema.json', 'r'))) print(schema.to_dict()) mixed_data = WeightedSampler(schema, 10).sample() - save_to_jsonl(mixed_data, 'outputs/mixed_data.jsonl') + dump_jsonl_data(mixed_data, 'outputs/mixed_data.jsonl') diff --git a/evalscope/collections/evaluator.py b/evalscope/collections/evaluator.py index fa4e3a26..2dfe1c9e 100644 --- a/evalscope/collections/evaluator.py +++ b/evalscope/collections/evaluator.py @@ -6,6 +6,7 @@ from tqdm import tqdm from evalscope.benchmarks import Benchmark +from evalscope.collections.data_generator import DatasetEntry from evalscope.config import TaskConfig from evalscope.constants import AnswerKeys, DumpMode, EvalType, ReviewKeys from 
evalscope.evaluator import Evaluator @@ -40,23 +41,28 @@ def get_review(self, answer_d) -> dict: class EvaluatorCollection: - def __init__(self, task_cfg: TaskConfig): + def __init__(self, task_cfg: TaskConfig, outputs: OutputsStructure): self.task_cfg = task_cfg + self.outputs = outputs self.model = get_local_model(task_cfg) - self.outputs = OutputsStructure( - outputs_dir=os.path.join(self.task_cfg.work_dir, - datetime.now().strftime('%Y%m%d%H%M%S'))) - self.raw_dataset = jsonl_to_list(self.task_cfg.dataset_args['data_collection']['local_path']) + self.dataset = self.load() self.dataset_name_map, self.dataset_id_map = self._parse_dataset() self.evaluators = self._initialize_evaluators() + def load(self) -> list[DatasetEntry]: + raw_dataset = jsonl_to_list(self.task_cfg.dataset_args['data_collection']['local_path']) + datasets = [] + for sample in raw_dataset: + datasets.append(DatasetEntry(**sample)) + return datasets + def _parse_dataset(self): dataset_name_map = defaultdict(lambda: defaultdict(list)) dataset_id_map = {} - for sample in self.raw_dataset: - dataset_name, subset_name = sample['dataset_name'], sample['subset_name'] - dataset_name_map[dataset_name][subset_name].append(sample['id']) - dataset_id_map[sample['id']] = sample + for sample in self.dataset: + dataset_name, subset_name = sample.dataset_name, sample.subset_name + dataset_name_map[dataset_name][subset_name].append(sample.index) + dataset_id_map[sample.index] = sample return dataset_name_map, dataset_id_map def _initialize_evaluators(self): @@ -75,13 +81,13 @@ def get_report(self, reviews): for subset_name, ids in data_map.items(): for _id in ids: review_d = reviews[_id] - row_data = self.dataset_id_map[_id] + row_data: DatasetEntry = self.dataset_id_map[_id] score = self.get_pred_score(review_d) data.append({ - 'task_type': row_data['task'], + 'task_type': row_data.task, 'dataset_name': dataset_name, 'subset_name': subset_name, - 'tags': row_data['tags'], + 'tags': row_data.tags, 'score': score }) @@ -115,20 +121,20 @@ def get_report(self, reviews): def get_answers(self): pred_file_path = os.path.join(self.outputs.predictions_dir, 'data_collection.jsonl') answers = defaultdict(dict) - for sample in tqdm(self.raw_dataset, desc='Getting answers'): - evaluator = self.evaluators[sample['dataset_name']] - answer_d = evaluator.get_answer(sample['prompt'], sample['subset_name'], self.task_cfg.generation_config) - answers[sample['id']] = answer_d + for sample in tqdm(self.dataset, desc='Getting answers'): + evaluator = self.evaluators[sample.dataset_name] + answer_d = evaluator.get_answer(sample.prompt, sample.subset_name, self.task_cfg.generation_config) + answers[sample.index] = answer_d dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND) return answers def get_reviews(self, answers): review_file_path = os.path.join(self.outputs.reviews_dir, 'data_collection.jsonl') reviews = defaultdict(dict) - for sample in tqdm(self.raw_dataset, desc='Getting reviews'): - evaluator = self.evaluators[sample['dataset_name']] - review_d = evaluator.get_review(answers[sample['id']]) - reviews[sample['id']] = review_d + for sample in tqdm(self.dataset, desc='Getting reviews'): + evaluator = self.evaluators[sample.dataset_name] + review_d = evaluator.get_review(answers[sample.index]) + reviews[sample.index] = review_d dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND) return reviews @@ -136,7 +142,7 @@ def get_reviews(self, answers): def get_pred_score(review_d) -> float: return 
review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT] - def evaluate(self): + def eval(self, **kwargs): answers = self.get_answers() reviews = self.get_reviews(answers) self.get_report(reviews) @@ -155,4 +161,4 @@ def evaluate(self): ) evaluator_collection = EvaluatorCollection(task_cfg) - evaluator_collection.evaluate() + evaluator_collection.eval() diff --git a/evalscope/collections/run.py b/evalscope/collections/run.py deleted file mode 100644 index e69de29b..00000000 diff --git a/evalscope/metrics/__init__.py b/evalscope/metrics/__init__.py index 7c7ff37a..1714b5e2 100644 --- a/evalscope/metrics/__init__.py +++ b/evalscope/metrics/__init__.py @@ -1,4 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from evalscope.metrics.metrics import exact_match, weighted_mean +from evalscope.metrics.metrics import bleu_ngram_one_sample, exact_match, weighted_mean +from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh WeightedAverageAccuracy = {'name': 'WeightedAverageAccuracy', 'object': weighted_mean} +WeightedAverageBLEU = {'name': 'WeightedAverageBLEU', 'object': weighted_mean} +Pass1 = {'name': 'Pass@1', 'object': weighted_mean} diff --git a/evalscope/run.py b/evalscope/run.py index e968b6d2..998ecc99 100644 --- a/evalscope/run.py +++ b/evalscope/run.py @@ -119,6 +119,12 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict: def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: LocalModel): """Create an evaluator object for the specified dataset.""" + + if dataset_name == 'data_collection': + # EvaluatorCollection is a collection of evaluators + from evalscope.collections import EvaluatorCollection + return EvaluatorCollection(task_cfg, outputs) + benchmark: BenchmarkMeta = Benchmark.get(dataset_name) data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {})) diff --git a/tests/cli/test_collection.py b/tests/cli/test_collection.py new file mode 100644 index 00000000..6a290e7d --- /dev/null +++ b/tests/cli/test_collection.py @@ -0,0 +1,56 @@ +import json +import unittest + +from evalscope.collections.data_generator import WeightedSampler +from evalscope.collections.schema import CollectionSchema, DatasetInfo +from evalscope.constants import EvalType +from evalscope.run import run_task +from evalscope.utils.io_utils import dump_jsonl_data +from evalscope.utils.utils import test_level_list + + +class TestCollection(unittest.TestCase): + @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level') + def test_create_collection(self): + schema = CollectionSchema( + name='math&reasoning', + datasets=[ + CollectionSchema( + name='math', + datasets=[ + DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']), + DatasetInfo(name='competition_math', weight=2, task_type='math', tags=['en', 'math']), + ]), + CollectionSchema( + name='reasoning', + datasets=[ + DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']), + ]), + ]) + print(schema.to_dict()) + print(schema.flatten()) + schema.dump_json('outputs/schema_test.json') + + + @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level') + def test_generate_data(self): + schema = CollectionSchema.from_dict(json.load(open('outputs/schema_test.json', 'r'))) + print(schema.to_dict()) + mixed_data = WeightedSampler(schema, 10).sample() + dump_jsonl_data(mixed_data, 'outputs/mixed_data_test.jsonl') + + @unittest.skipUnless(0 
in test_level_list(), 'skip test in current test level') + def test_evaluate_collection(self): + from evalscope.config import TaskConfig + + task_cfg = TaskConfig( + model='qwen2.5', + api_url='http://127.0.0.1:8801/v1/chat/completions', + api_key='EMPTY', + eval_type=EvalType.SERVICE, + datasets=['data_collection'], + dataset_args={'data_collection': { + 'local_path': 'outputs/mixed_data_test.jsonl' + }}, + ) + run_task(task_cfg=task_cfg) diff --git a/tests/cli/test_run.py b/tests/cli/test_run.py index 5353273b..7b8fddec 100644 --- a/tests/cli/test_run.py +++ b/tests/cli/test_run.py @@ -76,7 +76,10 @@ def test_run_task(self): # 'bbh', # 'hellaswag', # 'gsm8k', - 'arc' + # 'arc' + 'race', + 'truthful_qa', + 'trivia_qa', ], 'limit': 2, 'debug': True} @@ -129,14 +132,19 @@ def test_run_server_model(self): api_key='EMPTY', eval_type=EvalType.SERVICE, datasets=[ - 'competition_math', + # 'mmlu', + # 'race', + 'trivia_qa', + # 'cmmlu', + # 'humaneval', + # 'competition_math', # 'gsm8k', # 'arc', # 'ceval', # 'bbh', # 'hellaswag', ], - limit=2, + limit=20, debug=True ) From 1ea478c476ebb9c46129baa7813810d4a566bd53 Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Tue, 24 Dec 2024 18:43:44 +0800 Subject: [PATCH 15/15] update test --- tests/cli/test_collection.py | 26 ++++++++++++-------------- tests/rag/test_mteb.py | 5 +++-- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/tests/cli/test_collection.py b/tests/cli/test_collection.py index 6a290e7d..72ca3a55 100644 --- a/tests/cli/test_collection.py +++ b/tests/cli/test_collection.py @@ -12,21 +12,19 @@ class TestCollection(unittest.TestCase): @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level') def test_create_collection(self): - schema = CollectionSchema( - name='math&reasoning', - datasets=[ - CollectionSchema( - name='math', - datasets=[ - DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']), - DatasetInfo(name='competition_math', weight=2, task_type='math', tags=['en', 'math']), + schema = CollectionSchema(name='math&reasoning', datasets=[ + CollectionSchema(name='math', datasets=[ + DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']), + DatasetInfo(name='competition_math', weight=1, task_type='math', tags=['en', 'math']), + DatasetInfo(name='cmmlu', weight=2, task_type='math', tags=['zh', 'math'], args={'subset_list': ['college_mathematics', 'high_school_mathematics']}), + DatasetInfo(name='ceval', weight=3, task_type='math', tags=['zh', 'math'], args={'subset_list': ['advanced_mathematics', 'high_school_mathematics', 'discrete_mathematics', 'middle_school_mathematics']}), ]), - CollectionSchema( - name='reasoning', - datasets=[ - DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']), + CollectionSchema(name='reasoning', datasets=[ + DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']), + DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh', 'reasoning'], args={'subset_list': ['logic']}), + DatasetInfo(name='race', weight=1, task_type='reasoning', tags=['en', 'reasoning']), ]), - ]) + ]) print(schema.to_dict()) print(schema.flatten()) schema.dump_json('outputs/schema_test.json') @@ -36,7 +34,7 @@ def test_create_collection(self): def test_generate_data(self): schema = CollectionSchema.from_dict(json.load(open('outputs/schema_test.json', 'r'))) print(schema.to_dict()) - mixed_data = WeightedSampler(schema, 10).sample() + mixed_data = WeightedSampler(schema, 100).sample() 
dump_jsonl_data(mixed_data, 'outputs/mixed_data_test.jsonl') @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level') diff --git a/tests/rag/test_mteb.py b/tests/rag/test_mteb.py index f80e0023..66d494ad 100644 --- a/tests/rag/test_mteb.py +++ b/tests/rag/test_mteb.py @@ -79,7 +79,7 @@ def test_run_two_stage_mteb(self): }, }, { - 'model_name_or_path': 'OpenBMB/MiniCPM-Reranker', + 'model_name_or_path': 'BAAI/bge-reranker-v2-m3', 'is_cross_encoder': True, 'max_seq_length': 512, 'prompt': '为这个问题生成一个检索用的表示', @@ -94,7 +94,8 @@ def test_run_two_stage_mteb(self): 'verbosity': 2, 'output_folder': 'outputs', 'overwrite_results': True, - 'limits': 10, + # 'limits': 10, + 'top_k': 10, }, }, }
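
For reference, the pieces added in this series compose into the following end-to-end flow, distilled from tests/cli/test_collection.py: build a nested CollectionSchema, sample a weighted mixture with WeightedSampler, dump it to JSONL, then evaluate it through the 'data_collection' dataset entry, which create_evaluator() routes to EvaluatorCollection. This is a minimal sketch, not a prescribed recipe; the model name, API endpoint, and output paths are placeholders to replace with your own.

    from evalscope.collections import CollectionSchema, WeightedSampler
    from evalscope.collections.schema import DatasetInfo
    from evalscope.config import TaskConfig
    from evalscope.constants import EvalType
    from evalscope.run import run_task
    from evalscope.utils.io_utils import dump_jsonl_data

    # 1. Define a nested schema; leaf DatasetInfo entries carry a sampling weight.
    schema = CollectionSchema(
        name='math&reasoning',
        datasets=[
            CollectionSchema(name='math', datasets=[
                DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']),
            ]),
            CollectionSchema(name='reasoning', datasets=[
                DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
            ]),
        ])
    schema.dump_json('outputs/schema.json')  # placeholder output path

    # 2. Draw a weighted mixture of prompts and dump it as JSONL.
    mixed_data = WeightedSampler(schema, 100).sample()
    dump_jsonl_data(mixed_data, 'outputs/mixed_data.jsonl')

    # 3. Evaluate the mixture via the 'data_collection' entry point.
    task_cfg = TaskConfig(
        model='qwen2.5',                                      # placeholder model name
        api_url='http://127.0.0.1:8801/v1/chat/completions',  # placeholder endpoint
        api_key='EMPTY',
        eval_type=EvalType.SERVICE,
        datasets=['data_collection'],
        dataset_args={'data_collection': {'local_path': 'outputs/mixed_data.jsonl'}},
    )
    run_task(task_cfg=task_cfg)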
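
The WeightedSampler hunk above keeps the bookkeeping visible (total_weight, remaining_count, and the last-dataset-takes-the-remainder branch), but the per-dataset count computation itself falls outside the hunk. A plausible reading of that allocation, written as a standalone sketch with invented weights rather than the actual evalscope code:

    # Standalone illustration of weight-proportional allocation where the last
    # dataset absorbs the remainder. Names and numbers are illustrative only.
    def allocate(count, weights):
        total = sum(weights)
        remaining = count
        counts = []
        for i, w in enumerate(weights):
            if i == len(weights) - 1:
                n = remaining                 # last dataset takes whatever is left
            else:
                n = int(count * w / total)    # proportional share, rounded down
                remaining -= n
            counts.append(n)
        return counts

    weights = [1, 1, 2, 3]                    # e.g. gsm8k, competition_math, cmmlu, ceval
    print(allocate(100, weights))             # -> [14, 14, 28, 44]

The hunk then draws each dataset's samples with random.choices(dataset_data, k=dataset_sample_count), i.e. sampling with replacement, before re-indexing the combined list and returning it as plain dicts via asdict(DatasetEntry).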
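
On the evaluation side, EvaluatorCollection.load() rebuilds DatasetEntry objects from the mixed JSONL and _parse_dataset() groups their indices by dataset and subset. A compact sketch of that grouping with invented records; the prompt payload shape depends on the source data adapter, so the one shown here is made up:

    from collections import defaultdict

    from evalscope.collections.data_generator import DatasetEntry

    raw = [
        {'index': 0, 'prompt': {'question': '1+1=?'}, 'tags': ['en', 'math'],
         'task': 'math', 'weight': 1.0, 'dataset_name': 'gsm8k', 'subset_name': 'main'},
        {'index': 1, 'prompt': {'question': 'Pick A or B'}, 'tags': ['en', 'reasoning'],
         'task': 'reasoning', 'weight': 1.0, 'dataset_name': 'arc', 'subset_name': 'ARC-Easy'},
    ]
    dataset = [DatasetEntry(**d) for d in raw]

    # Group sample indices per dataset and subset, and index samples by position,
    # mirroring _parse_dataset() in this patch.
    dataset_name_map = defaultdict(lambda: defaultdict(list))
    dataset_id_map = {}
    for sample in dataset:
        dataset_name_map[sample.dataset_name][sample.subset_name].append(sample.index)
        dataset_id_map[sample.index] = sample

    print({k: dict(v) for k, v in dataset_name_map.items()})
    # {'gsm8k': {'main': [0]}, 'arc': {'ARC-Easy': [1]}}

get_answers() and get_reviews() then iterate these entries, dispatch each one to the per-dataset evaluator, and key the results by DatasetEntry.index, which is what get_report() uses to look rows back up.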