Commit 4063b26

Merge pull request #2 from zyearw1024/main

Main

zyearw1024 authored Sep 15, 2024
2 parents 48cc41f + e2aa4bd commit 4063b26
Showing 506 changed files with 37,472 additions and 23,989 deletions.
4 changes: 4 additions & 0 deletions .github/resources/opencompass-hf-results.json
@@ -129,5 +129,9 @@
     "race-middle": "88.63",
     "race-high": "81.22",
     "crows_pairs": "86.07"
   },
+  "internlm/internlm2_5-7b-chat": {
+    "mmlu": "72.8",
+    "gsm8k": "86.0"
+  }
 }
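For context, the evaluate() helper in .github/scripts/action_tools.py (next file in this diff) reads this JSON as the HF baseline and falls back to '-' for datasets a model has no score for. A minimal sketch of that lookup, assuming the repo-relative path and the layout shown above:

import json

# Load the HF baseline results: keyed by model path, then by dataset name.
with open('.github/resources/opencompass-hf-results.json') as f:
    hf_results = json.load(f)

baseline = hf_results.get('internlm/internlm2_5-7b-chat', {})
dataset_names = ['mmlu', 'gsm8k', 'race-high']
# Mirrors the `hf_res[d] if d in hf_res else '-'` expression in action_tools.py.
metrics = [baseline.get(d, '-') for d in dataset_names]
print(metrics)  # ['72.8', '86.0', '-']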
96 changes: 47 additions & 49 deletions .github/scripts/action_tools.py
@@ -5,6 +5,7 @@
 import os
 import shutil
 import subprocess
+import time
 from collections import OrderedDict
 from typing import List

@@ -166,7 +167,11 @@ def evaluate(models: List[str], datasets: List[str], workspace: str):
             f'python3 {opencompass_dir}/run.py {config_path_new} -w {work_dir} --reuse --max-num-workers 8'  # noqa: E501
         ]
         eval_log = os.path.join(workspace, f'eval.{ori_model}.txt')
+        start_time = time.time()
         ret = run_cmd(cmd_eval, log_path=eval_log, cwd=lmdeploy_dir)
+        end_time = time.time()
+        task_duration_seconds = round(end_time - start_time, 2)
+        logging.info(f'task_duration_seconds: {task_duration_seconds}\n')
         if ret != 0:
             continue
         csv_files = glob.glob(f'{work_dir}/*/summary/summary_*.csv')
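The timing change is self-contained: it brackets the evaluation subprocess with time.time() and logs the rounded duration. A standalone sketch of the same pattern (subprocess.run stands in for the repo's run_cmd helper):

import logging
import subprocess
import time

logging.basicConfig(level=logging.INFO)

start_time = time.time()
# Stand-in for run_cmd(cmd_eval, ...): any long-running command works here.
ret = subprocess.run(['sleep', '1']).returncode
end_time = time.time()

# Same rounding as the diff: wall-clock seconds, two decimal places.
task_duration_seconds = round(end_time - start_time, 2)
logging.info(f'task_duration_seconds: {task_duration_seconds}')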
@@ -204,6 +209,7 @@ def evaluate(models: List[str], datasets: List[str], workspace: str):
         prec = precision if do_lite else '-'
 
         row = ','.join([model, engine_type, prec] +
+                       [str(task_duration_seconds)] +
                        [model_results[_] for _ in dataset_names])
         hf_res_row = None
         if hf_model_path not in test_model_names:
@@ -213,11 +219,11 @@ def evaluate(models: List[str], datasets: List[str], workspace: str):
             hf_metrics = [
                 hf_res[d] if d in hf_res else '-' for d in dataset_names
             ]
-            hf_res_row = ','.join([model, 'hf', '-'] + hf_metrics)
+            hf_res_row = ','.join([model, 'hf', '-', '-'] + hf_metrics)
         if not os.path.exists(output_csv):
             with open(output_csv, 'w') as f:
                 header = ','.join(['Model', 'Engine', 'Precision'] +
-                                  dataset_names)
+                                  ['task_duration_secs'] + dataset_names)
                 f.write(header + '\n')
         if hf_res_row:
             f.write(hf_res_row + '\n')
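Taken together, these two hunks add a duration column right after Precision and pad the HF baseline row with '-' since no evaluation is run for it. A sketch of the resulting CSV layout, with illustrative metric and timing values:

# Illustrative only: reproduces the column layout written by the modified
# evaluate(); the numbers are made up.
dataset_names = ['mmlu', 'gsm8k']
header = ','.join(['Model', 'Engine', 'Precision'] +
                  ['task_duration_secs'] + dataset_names)
hf_res_row = ','.join(['internlm2_5-7b-chat', 'hf', '-', '-'] +
                      ['72.8', '86.0'])   # baseline rows carry no duration
row = ','.join(['internlm2_5-7b-chat', 'turbomind', '-'] +
               ['1234.56'] +              # task_duration_seconds of this run
               ['72.9', '86.1'])
print(header, hf_res_row, row, sep='\n')
# Model,Engine,Precision,task_duration_secs,mmlu,gsm8k
# internlm2_5-7b-chat,hf,-,-,72.8,86.0
# internlm2_5-7b-chat,turbomind,-,1234.56,72.9,86.1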
@@ -264,53 +270,45 @@ def generate_benchmark_report(report_path: str):
         benchmark_subfolders = [
             f.path for f in os.scandir(sec_dir_path) if f.is_dir()
         ]
-        for benchmark_subfolder in benchmark_subfolders:
-            backend_subfolders = [
-                f.path for f in os.scandir(benchmark_subfolder)
-                if f.is_dir()
-            ]
-            for backend_subfolder in backend_subfolders:
-                benchmark_type = backend_subfolder.replace(
-                    sec_dir_path + '/', '')
-                print('*' * 10, benchmark_type, '*' * 10)
-                _append_summary('-' * 10 + benchmark_type + '-' * 10 +
-                                '\n')
-                merged_csv_path = os.path.join(backend_subfolder,
-                                               'summary.csv')
-                csv_files = glob.glob(
-                    os.path.join(backend_subfolder, '*.csv'))
-                average_csv_path = os.path.join(backend_subfolder,
-                                                'average.csv')
-                if merged_csv_path in csv_files:
-                    csv_files.remove(merged_csv_path)
-                if average_csv_path in csv_files:
-                    csv_files.remove(average_csv_path)
-                merged_df = pd.DataFrame()
-
-                if len(csv_files) > 0:
-                    for f in csv_files:
-                        df = pd.read_csv(f)
-                        merged_df = pd.concat([merged_df, df],
-                                              ignore_index=True)
-
-                    merged_df = merged_df.sort_values(
-                        by=merged_df.columns[0])
-
-                    grouped_df = merged_df.groupby(merged_df.columns[0])
-                    if 'generation' not in benchmark_subfolder:
-                        average_values = grouped_df.pipe(
-                            (lambda group: {
-                                'mean': group.mean().round(decimals=3)
-                            }))['mean']
-                        average_values.to_csv(average_csv_path, index=True)
-                        avg_df = pd.read_csv(average_csv_path)
-                        merged_df = pd.concat([merged_df, avg_df],
-                                              ignore_index=True)
-                        add_summary(average_csv_path)
-                    merged_df.to_csv(merged_csv_path, index=False)
-                    if 'generation' in benchmark_subfolder:
-                        add_summary(merged_csv_path)
-                    print(merged_df)
+        for backend_subfolder in benchmark_subfolders:
+            benchmark_type = backend_subfolder.replace(
+                sec_dir_path + '/', '')
+            print('*' * 10, benchmark_type, '*' * 10)
+            _append_summary('-' * 10 + benchmark_type + '-' * 10 + '\n')
+            merged_csv_path = os.path.join(backend_subfolder,
+                                           'summary.csv')
+            csv_files = glob.glob(os.path.join(backend_subfolder, '*.csv'))
+            average_csv_path = os.path.join(backend_subfolder,
+                                            'average.csv')
+            if merged_csv_path in csv_files:
+                csv_files.remove(merged_csv_path)
+            if average_csv_path in csv_files:
+                csv_files.remove(average_csv_path)
+            merged_df = pd.DataFrame()
+
+            if len(csv_files) > 0:
+                for f in csv_files:
+                    df = pd.read_csv(f)
+                    merged_df = pd.concat([merged_df, df],
+                                          ignore_index=True)
+
+                merged_df = merged_df.sort_values(by=merged_df.columns[0])
+
+                grouped_df = merged_df.groupby(merged_df.columns[0])
+                if 'generation' not in backend_subfolder:
+                    average_values = grouped_df.pipe(
+                        (lambda group: {
+                            'mean': group.mean().round(decimals=3)
+                        }))['mean']
+                    average_values.to_csv(average_csv_path, index=True)
+                    avg_df = pd.read_csv(average_csv_path)
+                    merged_df = pd.concat([merged_df, avg_df],
+                                          ignore_index=True)
+                    add_summary(average_csv_path)
+                merged_df.to_csv(merged_csv_path, index=False)
+                if 'generation' in backend_subfolder:
+                    add_summary(merged_csv_path)
 
     _append_summary('## Benchmark Results End')
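The rewrite removes one level of directory nesting: benchmark CSV folders are now expected directly under each section directory, so the inner os.scandir loop (and the per-benchmark print(merged_df)) disappear while the merging and averaging logic is otherwise unchanged. A minimal sketch of the new traversal shape, assuming a layout like report/<section>/<benchmark_type>/*.csv:

import os

report_path = 'report'  # hypothetical root, for illustration only
for section in os.scandir(report_path):
    if not section.is_dir():
        continue
    # Single scandir level: every subfolder of a section is treated as one
    # benchmark/backend folder, with no second nested loop inside it.
    benchmark_subfolders = [
        f.path for f in os.scandir(section.path) if f.is_dir()
    ]
    for backend_subfolder in benchmark_subfolders:
        benchmark_type = backend_subfolder.replace(section.path + '/', '')
        print(benchmark_type)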
76 changes: 68 additions & 8 deletions .github/scripts/eval_opencompass_config.py
@@ -150,12 +150,10 @@
 MAX_NEW_TOKENS = 1024
 
 tb_engine_config_template_max_bs_128 = dict(session_len=MAX_SESSION_LEN,
-                                            max_batch_size=128,
-                                            rope_scaling_factor=1.0)
+                                            max_batch_size=128)
 tb_engine_config_template_max_bs_128_tp2 = dict(session_len=MAX_SESSION_LEN,
                                                 max_batch_size=128,
-                                                tp=2,
-                                                rope_scaling_factor=1.0)
+                                                tp=2)
 
 pt_engine_config_template_max_bs_16 = dict(session_len=MAX_SESSION_LEN,
                                            max_batch_size=16)
@@ -192,12 +190,10 @@
                                         tp=2)
 tb_awq_engine_config_template_max_bs_8 = dict(session_len=MAX_SESSION_LEN,
                                               max_batch_size=8,
-                                              model_format='awq',
-                                              rope_scaling_factor=1.0)
+                                              model_format='awq')
 tb_awq_engine_config_template_max_bs_32 = dict(session_len=MAX_SESSION_LEN,
                                                max_batch_size=32,
-                                               model_format='awq',
-                                               rope_scaling_factor=1.0)
+                                               model_format='awq')
 
 gen_config_template = dict(top_k=1,
                            top_p=0.8,
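Both hunks in this file make the same edit: rope_scaling_factor=1.0 is dropped from every TurboMind engine-config template, leaving RoPE scaling to the engine's defaults. After the change the templates reduce to plain dicts, e.g. (a sketch; the MAX_SESSION_LEN value is an assumption, the real constant is defined earlier in the file):

MAX_SESSION_LEN = 32768  # assumed value for the sketch

tb_engine_config_template_max_bs_128 = dict(session_len=MAX_SESSION_LEN,
                                            max_batch_size=128)
tb_awq_engine_config_template_max_bs_32 = dict(session_len=MAX_SESSION_LEN,
                                               max_batch_size=32,
                                               model_format='awq')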
@@ -315,6 +311,38 @@
     run_cfg=run_cfg_tp1_template,
     end_str='<|im_end|>')
 
+# ===== Configs for internlm/internlm2_5_20b_chat =====
+tb_internlm2_5_20b_chat = dict(
+    type=TurboMindModelwithChatTemplate,
+    abbr='tb_internlm2_5_20b_chat',
+    path='internlm/internlm2_5-20b-chat',
+    engine_config=engine_config_template_max_bs_128_tp2,
+    gen_config=gen_config_template,
+    max_seq_len=MAX_SESSION_LEN,
+    max_out_len=MAX_NEW_TOKENS,
+    batch_size=128,
+    run_cfg=dict(num_gpus=2),
+    stop_words=['</s>', '<|im_end|>'],
+)
+
+tb_internlm2_5_20b_chat_4bits = deepcopy(tb_internlm2_5_20b_chat)
+tb_internlm2_5_20b_chat_kvint4 = deepcopy(tb_internlm2_5_20b_chat)
+tb_internlm2_5_20b_chat_kvint8 = deepcopy(tb_internlm2_5_20b_chat)
+
+pt_internlm2_5_20b_chat = dict(
+    type=LmdeployPytorchModel,
+    abbr='pt_internlm2_5_20b_chat',
+    path='internlm/internlm2_5-20b-chat',
+    engine_config=pt_engine_config_template_max_bs_64_tp2,
+    gen_config=gen_config_template,
+    max_out_len=MAX_NEW_TOKENS,
+    max_seq_len=MAX_SESSION_LEN,
+    batch_size=64,
+    concurrency=64,
+    meta_template=internlm2_meta_template,
+    run_cfg=run_cfg_tp2_template,
+    end_str='<|im_end|>')
+
 # ===== Configs for internlm/internlm2_chat_20b =====
 tb_internlm2_chat_20b = dict(
     type=TurboMindModelwithChatTemplate,
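The _4bits/_kvint4/_kvint8 variants start as deepcopies of the base dict and are presumably specialized further down the file (that part of the diff is collapsed here). A hedged sketch of the pattern; the model_format and quant_policy keys are assumptions based on the template names in this file, not lines from this diff:

from copy import deepcopy

tb_internlm2_5_20b_chat = dict(
    abbr='tb_internlm2_5_20b_chat',
    path='internlm/internlm2_5-20b-chat',
    engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
)

# deepcopy matters: engine_config is a nested dict, so a shallow copy would
# share it by reference and the edits below would leak into the base config.
tb_internlm2_5_20b_chat_4bits = deepcopy(tb_internlm2_5_20b_chat)
tb_internlm2_5_20b_chat_4bits['abbr'] += '_4bits'
tb_internlm2_5_20b_chat_4bits['engine_config']['model_format'] = 'awq'

tb_internlm2_5_20b_chat_kvint8 = deepcopy(tb_internlm2_5_20b_chat)
tb_internlm2_5_20b_chat_kvint8['engine_config']['quant_policy'] = 8  # assumed key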
@@ -473,6 +501,38 @@
 tb_llama_3_8b_instruct_kvint4 = deepcopy(tb_llama_3_8b_instruct)
 tb_llama_3_8b_instruct_kvint8 = deepcopy(tb_llama_3_8b_instruct)
 
+# ===== Configs for meta-llama/Meta-Llama-3.1-8B-Instruct =====
+tb_llama_3d1_8b_instruct = dict(
+    type=TurboMindModelwithChatTemplate,
+    abbr='tb_llama_3d1_8b_instruct',
+    path='meta-llama/Meta-Llama-3-1-8B-Instruct',
+    engine_config=engine_config_template_max_bs_128,
+    gen_config=gen_config_template,
+    max_seq_len=MAX_SESSION_LEN,
+    max_out_len=MAX_NEW_TOKENS,
+    batch_size=128,
+    run_cfg=dict(num_gpus=1),
+    stop_words=['<|eot_id|>', '<|end_of_text|>'],
+)
+
+pt_llama_3d1_8b_instruct = dict(
+    type=LmdeployPytorchModel,
+    abbr='pt_llama_3d1_8b_instruct',
+    path='meta-llama/Meta-Llama-3-1-8B-Instruct',
+    engine_config=pt_engine_config_template_max_bs_128,
+    gen_config=gen_config_template,
+    max_out_len=MAX_NEW_TOKENS,
+    max_seq_len=MAX_SESSION_LEN,
+    batch_size=128,
+    concurrency=128,
+    meta_template=llama3_meta_template,
+    run_cfg=run_cfg_tp1_template,
+    end_str='[INST]')
+
+tb_llama_3d1_8b_instruct_4bits = deepcopy(tb_llama_3d1_8b_instruct)
+tb_llama_3d1_8b_instruct_kvint4 = deepcopy(tb_llama_3d1_8b_instruct)
+tb_llama_3d1_8b_instruct_kvint8 = deepcopy(tb_llama_3d1_8b_instruct)
+
 # ===== Configs for Qwen/Qwen2-7B-Instruct =====
 tb_qwen2_7b_instruct = dict(
     type=TurboMindModelwithChatTemplate,
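As with the 20B configs above, the tb_*/pt_* dicts here are plain data; an OpenCompass run config ultimately collects some subset of them into a flat models list. A hedged sketch of that selection step (the lookup-by-name mechanism is an assumption for illustration, not code shown in this diff):

# Minimal stand-ins for the config dicts added in the hunk above.
tb_llama_3d1_8b_instruct = dict(abbr='tb_llama_3d1_8b_instruct')
pt_llama_3d1_8b_instruct = dict(abbr='pt_llama_3d1_8b_instruct')

# Assumed selection step: pick module-level config dicts by name.
selected = ['tb_llama_3d1_8b_instruct', 'pt_llama_3d1_8b_instruct']
models = [globals()[name] for name in selected if name in globals()]
print([m['abbr'] for m in models])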
42 changes: 0 additions & 42 deletions .github/scripts/set_benchmark_param.sh

This file was deleted.
