
register all data
Yunnglin committed Dec 24, 2024
1 parent b957f83 commit 95aa741
Showing 16 changed files with 284 additions and 392 deletions.
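
This commit moves each benchmark's configuration (dataset id, model adapter, subsets, metrics, splits) out of the adapter constructors and into a declarative `@Benchmark.register(...)` decorator. The registry implementation itself is not part of the hunks shown below, so the following is only a minimal sketch of how such a decorator-based registry can look; `BenchmarkMeta`, its field names, and `Benchmark.get` are illustrative assumptions, not the actual evalscope API.

```python
# Minimal sketch of a decorator-based benchmark registry (illustrative only;
# the real evalscope implementation is not shown in this diff).
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Type


@dataclass
class BenchmarkMeta:  # hypothetical container for the registered defaults
    name: str
    dataset_id: str
    data_adapter: Type
    model_adapter: Type
    subset_list: List[str] = field(default_factory=lambda: ['default'])
    metric_list: List[Any] = field(default_factory=list)
    few_shot_num: int = 0
    train_split: Optional[str] = None
    eval_split: str = 'test'
    prompt_template: Optional[str] = None


class Benchmark:
    registry: Dict[str, BenchmarkMeta] = {}  # benchmark name -> registered metadata

    @classmethod
    def register(cls, name: str, dataset_id: str, **defaults):
        """Return a class decorator that records the adapter class and its defaults."""
        def decorator(adapter_cls):
            cls.registry[name] = BenchmarkMeta(
                name=name, dataset_id=dataset_id, data_adapter=adapter_cls, **defaults)
            return adapter_cls  # the adapter class itself is returned unchanged
        return decorator

    @classmethod
    def get(cls, name: str) -> BenchmarkMeta:
        return cls.registry[name]
```

With a registry along these lines, each adapter module only needs to be imported for its benchmark to become discoverable by name, which is what the auto-import in `evalscope/benchmarks/__init__.py` below takes care of.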
evalscope/benchmarks/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -20,4 +20,4 @@
module_path = relative_path[:-3].replace(os.path.sep, '.') # strip '.py' and convert to module path
full_path = f'evalscope.benchmarks.{module_path}'
importlib.import_module(full_path)
print(f'Importing {full_path}')
# print(f'Importing {full_path}')
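
The hunk above shows only the innermost lines of the benchmark auto-import; the enclosing loop sits outside the diff context. The sketch below is a plausible reconstruction of that kind of discovery loop, purely for illustration (names beyond those visible above are assumptions): it walks the `evalscope/benchmarks` package and imports every module so that each `@Benchmark.register(...)` decorator runs at import time.

```python
# Illustrative sketch of package auto-discovery, meant to live in
# evalscope/benchmarks/__init__.py (the surrounding loop is not part of the
# hunk above; treat the extra names here as assumptions, not evalscope's code).
import importlib
import os

package_dir = os.path.dirname(__file__)  # .../evalscope/benchmarks

for root, _, files in os.walk(package_dir):
    for file in files:
        if file.endswith('.py') and not file.startswith('_'):
            relative_path = os.path.relpath(os.path.join(root, file), package_dir)
            module_path = relative_path[:-3].replace(os.path.sep, '.')  # strip '.py' and convert to module path
            full_path = f'evalscope.benchmarks.{module_path}'
            importlib.import_module(full_path)  # executes @Benchmark.register decorators
```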
evalscope/benchmarks/cmmlu/cmmlu_adapter.py (61 changes: 20 additions & 41 deletions)
@@ -3,17 +3,17 @@
import csv
import os

from evalscope.benchmarks.data_adapter import DataAdapter
from evalscope.metrics.metrics import exact_match, weighted_mean
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType
from evalscope.metrics import WeightedAverageAccuracy, exact_match
from evalscope.models import MultiChoiceModelAdapter
from evalscope.utils import ResponseParser, normalize_score
from evalscope.utils.logger import get_logger

# flake8: noqa

logger = get_logger()

DATASET_ID = 'modelscope/cmmlu'

SUBSET_LIST = [
'agronomy', 'anatomy', 'ancient_chinese', 'arts', 'astronomy', 'business_ethics', 'chinese_civil_service_exam',
'chinese_driving_rule', 'chinese_food_culture', 'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
@@ -101,31 +101,23 @@
}


@Benchmark.register(
name='cmmlu',
dataset_id='modelscope/cmmlu',
model_adapter=MultiChoiceModelAdapter,
subset_list=SUBSET_LIST,
metric_list=[WeightedAverageAccuracy],
few_shot_num=5,
train_split='dev',
eval_split='test',
)
class CMMLUAdapter(DataAdapter):

choices = ['A', 'B', 'C', 'D']

def __init__(self,
subset_list: list = None,
metric_list: list = None,
few_shot_num: int = 5,
train_split: str = 'dev',
eval_split: str = 'test',
**kwargs):

if subset_list is None:
subset_list = SUBSET_LIST

if metric_list is None:
metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
def __init__(self, **kwargs):

super().__init__(
subset_list=subset_list,
metric_list=metric_list,
few_shot_num=few_shot_num,
train_split=train_split,
eval_split=eval_split,
**kwargs)
super().__init__(**kwargs)

def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
data_dict = {}
@@ -187,7 +179,7 @@ def get_gold_answer(self, input_d: dict) -> str:
# Get the gold choice
return input_d.get('Answer', '')

def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
"""
Parse the model output to get the answer. Could be the best choice index.
@@ -199,31 +191,18 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st
Returns:
The parsed answer. Depending on the dataset. Usually a string for chat.
"""
if eval_type == 'checkpoint':
if eval_type == EvalType.CHECKPOINT:
return result
elif eval_type == 'service':
elif eval_type == EvalType.SERVICE:
return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
elif eval_type == 'custom':
elif eval_type == EvalType.CUSTOM:
return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
else:
raise ValueError(f'Invalid eval_type: {eval_type}')

def match(self, gold: str, pred: str) -> float:
return exact_match(gold=gold, pred=pred)

def compute_metric(self, review_res_list: list) -> float:
"""
Compute evaluation result by specific metric.
Args:
review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
Returns:
The metric score.
"""
items = [(score, 1.0) for score in review_res_list]
return weighted_mean(items)

def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
"""
Generate report for the evaluation.
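
The removed `compute_metric` above wrapped each review score into a `(score, weight)` pair with unit weight and delegated to `weighted_mean`; under the new registration, the `WeightedAverageAccuracy` metric takes over that role. The `weighted_mean` implementation itself is not shown in this diff, so the sketch below only reproduces the computation the removed code implied.

```python
# Sketch of the weighted-mean computation implied by the removed compute_metric
# (the actual evalscope.metrics implementation is not shown in this diff).
from typing import List, Tuple


def weighted_mean(items: List[Tuple[float, float]]) -> float:
    """items: list of (score, weight) pairs, e.g. [(1.0, 1.0), (0.0, 1.0), ...]."""
    total_weight = sum(weight for _, weight in items)
    if total_weight == 0:
        return 0.0
    return sum(score * weight for score, weight in items) / total_weight


# With unit weights this reduces to plain accuracy over the review scores:
review_res_list = [0, 1, 1, 0, 1]
print(weighted_mean([(score, 1.0) for score in review_res_list]))  # 0.6
```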
evalscope/benchmarks/general_qa/general_qa_adapter.py (35 changes: 16 additions & 19 deletions)
@@ -5,35 +5,32 @@
from collections import defaultdict
from typing import Any, Optional

from evalscope.benchmarks.data_adapter import DataAdapter
from evalscope.metrics.metrics import bleu_ngram_one_sample, weighted_mean
from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import (WeightedAverageBLEU, bleu_ngram_one_sample, compute_rouge_score_one_sample_zh,
weighted_mean)
from evalscope.models import ChatGenerationModelAdapter
from evalscope.utils.io_utils import jsonl_to_list
from evalscope.utils.logger import get_logger

logger = get_logger()

DATASET_ID = 'general_qa'
SUBSET_LIST = ['default']


@Benchmark.register(
name='general_qa',
dataset_id='general_qa',
model_adapter=ChatGenerationModelAdapter,
subset_list=['default'],
metric_list=[WeightedAverageBLEU],
few_shot_num=0,
train_split=None,
eval_split='test',
)
class GeneralQAAdapter(DataAdapter):
# TODO: set few_shot_num

def __init__(self,
subset_list: list = None,
metric_list: list = None,
train_split: str = None,
eval_split: str = 'test',
**kwargs):
if subset_list is None:
subset_list = SUBSET_LIST

if metric_list is None:
metric_list = [{'name': 'WeightedAverageBLEU', 'object': weighted_mean}]
def __init__(self, **kwargs):

super().__init__(
subset_list=subset_list, metric_list=metric_list, train_split=train_split, eval_split=eval_split, **kwargs)
super().__init__(**kwargs)

def load(self, dataset_name_or_path: str, subset_list: list = None, **kwargs) -> dict:

evalscope/benchmarks/humaneval/humaneval_adapter.py (80 changes: 16 additions & 64 deletions)
@@ -2,33 +2,34 @@
import re
from typing import List

from evalscope.benchmarks.data_adapter import DataAdapter
from evalscope.metrics.metrics import weighted_mean
from evalscope.utils import normalize_score
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import Pass1
from evalscope.models import ChatGenerationModelAdapter
from evalscope.utils.logger import get_logger

logger = get_logger()

DATASET_ID = 'modelscope/humaneval'
SUBSET_LIST = ['openai_humaneval']

# Example:
# {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa


@Benchmark.register(
name='humaneval',
dataset_id='modelscope/humaneval',
model_adapter=ChatGenerationModelAdapter,
subset_list=['openai_humaneval'],
metric_list=[Pass1],
few_shot_num=0,
train_split=None,
eval_split='test',
prompt_template='Complete the following python code:\n',
)
class HumanevalAdapter(DataAdapter):
"""
A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
"""

def __init__(self,
subset_list: list = None,
metric_list: list = None,
few_shot_num: int = None,
train_split: str = None,
eval_split: str = 'test',
prompt_template: str = None,
**kwargs):
def __init__(self, **kwargs):
try:
from human_eval.data import stream_jsonl, write_jsonl
from human_eval.evaluation import check_correctness
@@ -37,15 +38,6 @@ def __init__(self,
'https://github.com/openai/human-eval/tree/master#installation , '
'Note that you need to enable the execution code in the human_eval/execution.py first.')

if subset_list is None:
subset_list = SUBSET_LIST

if metric_list is None:
metric_list = [{'name': 'pass@1', 'object': weighted_mean}]

if prompt_template is None:
prompt_template = 'Complete the following python code:\n'

self.k = [1]
self.num_workers = 4
self.timeout = 4.0
@@ -54,14 +46,7 @@ def __init__(self,
self.write_jsonl_func = write_jsonl
self.eval_func = check_correctness

super().__init__(
subset_list=subset_list,
metric_list=metric_list,
few_shot_num=few_shot_num,
train_split=train_split,
eval_split=eval_split,
prompt_template=prompt_template,
**kwargs)
super().__init__(**kwargs)

def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
data_dict = {}
@@ -85,26 +70,6 @@ def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:

return {'data': [full_prompt]}

def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
total_num: int = sum([num for _, num in subset_score_map.values()])
weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
weighted_avg_acc = normalize_score(score=weighted_avg_acc)
cate_avg_list = [{
'name': subset_name,
'score': normalize_score(score=score)
} for subset_name, (score, _) in subset_score_map.items()]

category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)

res_map = dict(
name=report_name or 'HumanEval',
metric=self.metric_list[0]['name'],
score=weighted_avg_acc,
category=[category_d],
total_num=total_num)

return res_map

@classmethod
def _postprocess(cls, text: str) -> str:
if '```' in text:
@@ -129,19 +94,6 @@ def _postprocess(cls, text: str) -> str:
text = '\n'.join([' ' + line for line in text.split('\n')])
return text

def compute_metric(self, review_res_list: list) -> float:
"""
Compute evaluation result by specific metric.
Args:
review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
Returns:
The metric score.
"""
items = [(score, 1.0) for score in review_res_list]
return weighted_mean(items)

def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
return self._postprocess(result)

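
The humaneval adapter now registers `Pass1` as its metric in place of the hand-rolled weighted mean and report generation removed above. `Pass1` presumably corresponds to pass@k with k = 1; for reference, the sketch below shows the standard unbiased pass@k estimator from the HumanEval paper, not code taken from evalscope.

```python
# Unbiased pass@k estimator from the HumanEval paper (illustrative; evalscope's
# Pass1 metric is assumed to be the k=1 special case of this).
from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    """n: total samples per task, c: correct samples, k: sampling budget."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


# With a single sample per task (n=1, k=1) this is 1.0 if the sample passed and
# 0.0 otherwise, i.e. the plain pass rate when averaged over tasks.
print(pass_at_k(n=10, c=3, k=1))  # 0.3
```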
(12 more changed files not shown)
