Skip to content

Commit

Permalink
Merge pull request #1 from zyearw1024/main
Browse files Browse the repository at this point in the history
Main
  • Loading branch information
zyearw1024 authored Jul 17, 2024
2 parents b5b2b8b + 7b24674 commit 892bd05
Show file tree
Hide file tree
Showing 495 changed files with 33,426 additions and 19,402 deletions.
147 changes: 95 additions & 52 deletions .github/resources/opencompass-hf-results.json
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
{
"meta-llama/Llama-2-7b-chat": {
"ceval": "28.3",
"mmlu": "35.49",
"ceval": "30.44",
"mmlu": "33.38",
"wic": "0",
"wsc": "0",
"triviaqa": "42.83",
"gsm8k": "26.38",
"race-middle": "41.57",
"race-high": "38.77",
"crows_pairs": "23.21"
"triviaqa": "56.11",
"gsm8k": "28.58",
"race-middle": "58.57",
"race-high": "51.74",
"crows_pairs": "17.97"
},
"Qwen/Qwen-7B-Chat": {
"ceval": "55.41",
Expand All @@ -22,69 +22,112 @@
"crows_pairs": "55.37"
},
"internlm/internlm-chat-7b": {
"ceval": "53.40",
"mmlu": "50.86",
"wic": "57.21",
"wsc": "41.35",
"triviaqa": "28.19",
"gsm8k": "33.43",
"race-middle": "80.99",
"race-high": "77.62",
"crows_pairs": "43.04"
"ceval": "54.10",
"mmlu": "52.84",
"wic": "52.04",
"wsc": "60.58",
"triviaqa": "37.77",
"gsm8k": "35.18",
"race-middle": "83.64",
"race-high": "78.85",
"crows_pairs": "45.49"
},
"baichuan-inc/Baichuan2-7B-Chat": {
"ceval": "53.92",
"mmlu": "50.13",
"ceval": "53.85",
"mmlu": "52.49",
"GaokaoBench": "37.54",
"winogrande": "52.80",
"hellaswag":"51.67",
"math":"2.90",
"wic": "0.16",
"wsc": "2.88",
"triviaqa": "37.66",
"gsm8k": "32.37",
"race-middle": "72.01",
"race-high": "67.44",
"crows_pairs": "8.09"
"triviaqa": "46.80",
"gsm8k": "32.15",
"bbh":"35.92",
"race-middle": "71.94",
"race-high": "67.58",
"crows_pairs": "8.29"
},
"internlm/internlm2-chat-7b": {
"ceval": "61.46",
"mmlu": "63.68",
"wic": "63.01",
"wsc": "41.35",
"triviaqa": "49.41",
"gsm8k": "71.57",
"race-middle": "89.21",
"race-high": "84.82",
"crows_pairs": "13.46"
"ceval": "61.25",
"mmlu": "63.05",
"GaokaoBench": "54.50",
"winogrande": "73.48",
"hellaswag":"84.80",
"math":"28.14",
"wic": "60.34",
"wsc": "65.38",
"gsm8k": "69.90",
"bbh":"57.83",
"race-middle": "88.72",
"race-high": "84.51",
"crows_pairs": "29.64"
},
"internlm/internlm2-chat-20b": {
"ceval": "-",
"ceval": "63.56",
"mmlu": "66.50",
"GaokaoBench": "57.95",
"hellaswag":"88.48",
"math":"34.68",
"wic": "-",
"wsc": "-",
"triviaqa": "-",
"gsm8k": "79.53",
"gsm8k": "75.21",
"bbh":"68.24",
"race-middle": "-",
"race-high": "-",
"crows_pairs": "-"
},
"Qwen/Qwen1.5-7B-Chat": {
"ceval": "-",
"mmlu": "61.44",
"wic": "-",
"wsc": "-",
"triviaqa": "-",
"gsm8k": "55.65",
"race-middle": "-",
"race-high": "-",
"crows_pairs": "-"
"ceval": "71.12",
"mmlu": "61.82",
"GaokaoBench": "71.00",
"winogrande": "65.19",
"hellaswag":"71.61",
"math":"22.64",
"wic": "47.96",
"wsc": "41.35",
"gsm8k": "56.25",
"bbh":"38.56",
"race-middle": "88.16",
"race-high": "83.33",
"crows_pairs": "34.02"
},
"mistralai/Mistral-7B-Instruct-v0.1": {
"ceval": "-",
"mmlu": "52.64",
"wic": "-",
"wsc": "-",
"triviaqa": "-",
"gsm8k": "41.93",
"race-middle": "-",
"race-high": "-",
"crows_pairs": "-"
"ceval": "39.94",
"mmlu": "52.79",
"wic": "50.78",
"wsc": "64.42",
"triviaqa": "49.93",
"gsm8k": "42.23",
"race-middle": "74.72",
"race-high": "69.75",
"crows_pairs": "7.82"
},
"mistralai/Mixtral-8x7B-Instruct-v0.1": {
"ceval": "51.29",
"mmlu": "69.67",
"wic": "55.33",
"wsc": "46.15",
"triviaqa": "71.06",
"gsm8k": "65.66",
"race-middle": "87.26",
"race-high": "81.85",
"crows_pairs": "23.34"
},
"meta-llama/Meta-Llama-3-8B-Instruct": {
"ceval": "52.32",
"mmlu": "68.37",
"GaokaoBench": "45.44",
"winogrande": "66.22",
"hellaswag":"74.39",
"math":"27.52",
"wic": "36.99",
"wsc": "32.69",
"gsm8k": "79.53",
"bbh":"52.83",
"race-middle": "88.63",
"race-high": "81.22",
"crows_pairs": "86.07"
}
}
95 changes: 86 additions & 9 deletions .github/scripts/action_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from typing import List

import fire
import pandas as pd
from mmengine.config import Config


Expand Down Expand Up @@ -74,6 +75,7 @@ def add_summary(csv_path: str):
for line in lines[1:]:
line = '|' + line.strip().replace(',', '|') + '|'
_append_summary(line)
_append_summary('\n')


def _load_hf_results(test_results: dict, model_name: str):
Expand All @@ -98,7 +100,7 @@ def _load_hf_results(test_results: dict, model_name: str):
return out


def evaluate(models: List[str], workspace: str):
def evaluate(models: List[str], datasets: List[str], workspace: str):
"""Evaluate models from lmdeploy using opencompass.
Args:
Expand Down Expand Up @@ -141,18 +143,27 @@ def evaluate(models: List[str], workspace: str):
logging.error(
f'Model {target_model} not found in configuration file')
continue
model_cfg = cfg[target_model]
hf_model_path = model_cfg['path']
if not os.path.exists(hf_model_path):
logging.error(f'Model path not exists: {hf_model_path}')
continue
logging.info(f'Start evaluating {target_model} ...\\nn{model_cfg}\n\n')
if engine_type != 'hf':
model_cfg = cfg[target_model]
hf_model_path = model_cfg['path']
if not os.path.exists(hf_model_path):
logging.error(f'Model path not exists: {hf_model_path}')
continue
logging.info(
f'Start evaluating {target_model} ...\\nn{model_cfg}\n\n')
else:
hf_model_path = target_model

with open(config_path_new, 'a') as f:
f.write(f'\nmodels = [ {target_model} ]\n')
f.write(f'\ndatasets = {datasets}\n')
if engine_type == 'hf':
f.write(f'\nmodels = [ *{target_model} ]\n')
else:
f.write(f'\nmodels = [ {target_model} ]\n')

work_dir = os.path.join(workspace, target_model)
cmd_eval = [
f'python3 {opencompass_dir}/run.py {config_path_new} -w {work_dir}'
f'python3 {opencompass_dir}/run.py {config_path_new} -w {work_dir} --reuse --max-num-workers 8' # noqa: E501
]
eval_log = os.path.join(workspace, f'eval.{ori_model}.txt')
ret = run_cmd(cmd_eval, log_path=eval_log, cwd=lmdeploy_dir)
Expand Down Expand Up @@ -235,5 +246,71 @@ def create_model_links(src_dir: str, dst_dir: str):
logging.warning(f'Model_path exists: {dst}')


def _list_subdirs(path: str) -> List[str]:
    """Return the paths of the immediate sub-directories of ``path``."""
    return [entry.path for entry in os.scandir(path) if entry.is_dir()]


def _summarize_backend_results(backend_subfolder: str,
                               is_generation: bool) -> pd.DataFrame:
    """Merge the result CSVs of one backend folder and publish a summary.

    Every ``*.csv`` in ``backend_subfolder`` (except the ``summary.csv`` and
    ``average.csv`` files this function itself produces) is concatenated and
    sorted by its first column.

    * Non-generation benchmarks: rows are grouped by the first column, the
      per-group mean (rounded to 3 decimals) is written to ``average.csv``,
      appended to the merged table and added to the summary.
    * Generation benchmarks: the merged table itself is added to the summary.

    In both cases the merged table is written to ``summary.csv``.

    Args:
        backend_subfolder: Folder holding the per-run CSV files.
        is_generation: Whether the folder belongs to a generation benchmark.

    Returns:
        The merged table, or an empty ``DataFrame`` when the folder contains
        no input CSV files (nothing is written in that case).
    """
    merged_csv_path = os.path.join(backend_subfolder, 'summary.csv')
    average_csv_path = os.path.join(backend_subfolder, 'average.csv')
    # Outputs of a previous run must not be fed back in as inputs.
    generated = (merged_csv_path, average_csv_path)
    csv_files = [
        path for path in glob.glob(os.path.join(backend_subfolder, '*.csv'))
        if path not in generated
    ]
    if not csv_files:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside a loop.
    merged_df = pd.concat([pd.read_csv(path) for path in csv_files],
                          ignore_index=True)
    key_column = merged_df.columns[0]
    merged_df = merged_df.sort_values(by=key_column)

    if not is_generation:
        average_values = merged_df.groupby(key_column).mean().round(
            decimals=3)
        average_values.to_csv(average_csv_path, index=True)
        # Read the file back so the group key becomes a regular column again
        # before the averages are appended to the merged table.
        avg_df = pd.read_csv(average_csv_path)
        merged_df = pd.concat([merged_df, avg_df], ignore_index=True)
        add_summary(average_csv_path)
    merged_df.to_csv(merged_csv_path, index=False)
    if is_generation:
        add_summary(merged_csv_path)
    return merged_df


def generate_benchmark_report(report_path: str):
    """Aggregate benchmark CSVs under ``report_path`` into the job summary.

    The directory tree is walked four levels deep:
    ``report_path/<dir>/<sub-dir>/<benchmark>/<backend>``. The first two
    levels together form the model label; the last two form the benchmark
    label. For each backend folder the CSV files are merged, written to
    ``summary.csv`` (plus ``average.csv`` for non-generation benchmarks) and
    appended to the GitHub Actions summary.

    NOTE(review): a benchmark counts as "generation" when the substring
    ``generation`` occurs anywhere in the benchmark folder's full path, not
    only in its own name -- confirm this is intended.

    Args:
        report_path: Root directory that contains the benchmark results.
    """
    # write to github action summary
    _append_summary('## Evaluation Results Start')
    for dir_path in _list_subdirs(report_path):
        for sec_dir_path in _list_subdirs(dir_path):
            # relpath (rather than str.replace on a '/'-joined prefix) keeps
            # the label correct when report_path ends with a separator.
            model = os.path.relpath(sec_dir_path, report_path)
            print('-' * 25, model, '-' * 25)
            _append_summary('-' * 25 + model + '-' * 25 + '\n')

            for benchmark_subfolder in _list_subdirs(sec_dir_path):
                is_generation = 'generation' in benchmark_subfolder
                for backend_subfolder in _list_subdirs(benchmark_subfolder):
                    benchmark_type = os.path.relpath(backend_subfolder,
                                                     sec_dir_path)
                    print('*' * 10, benchmark_type, '*' * 10)
                    _append_summary('-' * 10 + benchmark_type + '-' * 10 +
                                    '\n')
                    merged_df = _summarize_backend_results(
                        backend_subfolder, is_generation)
                    print(merged_df)
    _append_summary('## Evaluation Results End')


# Script entry point: hand the command line over to python-fire.
# NOTE(review): Fire() is called without a component, which presumably exposes
# the module-level functions above as sub-commands -- confirm against the
# workflow files that invoke this script.
if __name__ == '__main__':
    fire.Fire()
Loading

0 comments on commit 892bd05

Please sign in to comment.