Merge pull request #1 from zyearw1024/main

Main
zyearw1024 · Jul 17, 2024 · 892bd05 · 892bd05
2 parents b5b2b8b + 7b24674
commit 892bd05
Show file tree

Hide file tree

Showing 495 changed files with 33,426 additions and 19,402 deletions.
diff --git a/.github/resources/opencompass-hf-results.json b/.github/resources/opencompass-hf-results.json
@@ -1,14 +1,14 @@
 {
     "meta-llama/Llama-2-7b-chat": {
-        "ceval": "28.3",
-        "mmlu": "35.49",
+        "ceval": "30.44",
+        "mmlu": "33.38",
         "wic": "0",
         "wsc": "0",
-        "triviaqa": "42.83",
-        "gsm8k": "26.38",
-        "race-middle": "41.57",
-        "race-high": "38.77",
-        "crows_pairs": "23.21"
+        "triviaqa": "56.11",
+        "gsm8k": "28.58",
+        "race-middle": "58.57",
+        "race-high": "51.74",
+        "crows_pairs": "17.97"
     },
     "Qwen/Qwen-7B-Chat": {
         "ceval": "55.41",
@@ -22,69 +22,112 @@
         "crows_pairs": "55.37"
     },
     "internlm/internlm-chat-7b": {
-        "ceval": "53.40",
-        "mmlu": "50.86",
-        "wic": "57.21",
-        "wsc": "41.35",
-        "triviaqa": "28.19",
-        "gsm8k": "33.43",
-        "race-middle": "80.99",
-        "race-high": "77.62",
-        "crows_pairs": "43.04"
+        "ceval": "54.10",
+        "mmlu": "52.84",
+        "wic": "52.04",
+        "wsc": "60.58",
+        "triviaqa": "37.77",
+        "gsm8k": "35.18",
+        "race-middle": "83.64",
+        "race-high": "78.85",
+        "crows_pairs": "45.49"
     },
     "baichuan-inc/Baichuan2-7B-Chat": {
-        "ceval": "53.92",
-        "mmlu": "50.13",
+        "ceval": "53.85",
+        "mmlu": "52.49",
+        "GaokaoBench": "37.54",
+        "winogrande": "52.80",
+        "hellaswag":"51.67",
+        "math":"2.90",
         "wic": "0.16",
         "wsc": "2.88",
-        "triviaqa": "37.66",
-        "gsm8k": "32.37",
-        "race-middle": "72.01",
-        "race-high": "67.44",
-        "crows_pairs": "8.09"
+        "triviaqa": "46.80",
+        "gsm8k": "32.15",
+        "bbh":"35.92",
+        "race-middle": "71.94",
+        "race-high": "67.58",
+        "crows_pairs": "8.29"
     },
     "internlm/internlm2-chat-7b": {
-        "ceval": "61.46",
-        "mmlu": "63.68",
-        "wic": "63.01",
-        "wsc": "41.35",
-        "triviaqa": "49.41",
-        "gsm8k": "71.57",
-        "race-middle": "89.21",
-        "race-high": "84.82",
-        "crows_pairs": "13.46"
+        "ceval": "61.25",
+        "mmlu": "63.05",
+        "GaokaoBench": "54.50",
+        "winogrande": "73.48",
+        "hellaswag":"84.80",
+        "math":"28.14",
+        "wic": "60.34",
+        "wsc": "65.38",
+        "gsm8k": "69.90",
+        "bbh":"57.83",
+        "race-middle": "88.72",
+        "race-high": "84.51",
+        "crows_pairs": "29.64"
     },
     "internlm/internlm2-chat-20b": {
-        "ceval": "-",
+        "ceval": "63.56",
         "mmlu": "66.50",
+        "GaokaoBench": "57.95",
+        "hellaswag":"88.48",
+        "math":"34.68",
         "wic": "-",
         "wsc": "-",
         "triviaqa": "-",
-        "gsm8k": "79.53",
+        "gsm8k": "75.21",
+        "bbh":"68.24",
         "race-middle": "-",
         "race-high": "-",
         "crows_pairs": "-"
     },
     "Qwen/Qwen1.5-7B-Chat": {
-        "ceval": "-",
-        "mmlu": "61.44",
-        "wic": "-",
-        "wsc": "-",
-        "triviaqa": "-",
-        "gsm8k": "55.65",
-        "race-middle": "-",
-        "race-high": "-",
-        "crows_pairs": "-"
+        "ceval": "71.12",
+        "mmlu": "61.82",
+        "GaokaoBench": "71.00",
+        "winogrande": "65.19",
+        "hellaswag":"71.61",
+        "math":"22.64",
+        "wic": "47.96",
+        "wsc": "41.35",
+        "gsm8k": "56.25",
+        "bbh":"38.56",
+        "race-middle": "88.16",
+        "race-high": "83.33",
+        "crows_pairs": "34.02"
     },
     "mistralai/Mistral-7B-Instruct-v0.1": {
-        "ceval": "-",
-        "mmlu": "52.64",
-        "wic": "-",
-        "wsc": "-",
-        "triviaqa": "-",
-        "gsm8k": "41.93",
-        "race-middle": "-",
-        "race-high": "-",
-        "crows_pairs": "-"
+        "ceval": "39.94",
+        "mmlu": "52.79",
+        "wic": "50.78",
+        "wsc": "64.42",
+        "triviaqa": "49.93",
+        "gsm8k": "42.23",
+        "race-middle": "74.72",
+        "race-high": "69.75",
+        "crows_pairs": "7.82"
+    },
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": {
+        "ceval": "51.29",
+        "mmlu": "69.67",
+        "wic": "55.33",
+        "wsc": "46.15",
+        "triviaqa": "71.06",
+        "gsm8k": "65.66",
+        "race-middle": "87.26",
+        "race-high": "81.85",
+        "crows_pairs": "23.34"
+    },
+    "meta-llama/Meta-Llama-3-8B-Instruct": {
+        "ceval": "52.32",
+        "mmlu": "68.37",
+        "GaokaoBench": "45.44",
+        "winogrande": "66.22",
+        "hellaswag":"74.39",
+        "math":"27.52",
+        "wic": "36.99",
+        "wsc": "32.69",
+        "gsm8k": "79.53",
+        "bbh":"52.83",
+        "race-middle": "88.63",
+        "race-high": "81.22",
+        "crows_pairs": "86.07"
     }
 }
diff --git a/.github/scripts/action_tools.py b/.github/scripts/action_tools.py
@@ -9,6 +9,7 @@
 from typing import List
 
 import fire
+import pandas as pd
 from mmengine.config import Config
 
 
@@ -74,6 +75,7 @@ def add_summary(csv_path: str):
         for line in lines[1:]:
             line = '|' + line.strip().replace(',', '|') + '|'
             _append_summary(line)
+        _append_summary('\n')
 
 
 def _load_hf_results(test_results: dict, model_name: str):
@@ -98,7 +100,7 @@ def _load_hf_results(test_results: dict, model_name: str):
     return out
 
 
-def evaluate(models: List[str], workspace: str):
+def evaluate(models: List[str], datasets: List[str], workspace: str):
     """Evaluate models from lmdeploy using opencompass.
 
     Args:
@@ -141,18 +143,27 @@ def evaluate(models: List[str], workspace: str):
             logging.error(
                 f'Model {target_model} not found in configuration file')
             continue
-        model_cfg = cfg[target_model]
-        hf_model_path = model_cfg['path']
-        if not os.path.exists(hf_model_path):
-            logging.error(f'Model path not exists: {hf_model_path}')
-            continue
-        logging.info(f'Start evaluating {target_model} ...\\nn{model_cfg}\n\n')
+        if engine_type != 'hf':
+            model_cfg = cfg[target_model]
+            hf_model_path = model_cfg['path']
+            if not os.path.exists(hf_model_path):
+                logging.error(f'Model path not exists: {hf_model_path}')
+                continue
+            logging.info(
+                f'Start evaluating {target_model} ...\\nn{model_cfg}\n\n')
+        else:
+            hf_model_path = target_model
+
         with open(config_path_new, 'a') as f:
-            f.write(f'\nmodels = [ {target_model} ]\n')
+            f.write(f'\ndatasets = {datasets}\n')
+            if engine_type == 'hf':
+                f.write(f'\nmodels = [ *{target_model} ]\n')
+            else:
+                f.write(f'\nmodels = [ {target_model} ]\n')
 
         work_dir = os.path.join(workspace, target_model)
         cmd_eval = [
-            f'python3 {opencompass_dir}/run.py {config_path_new} -w {work_dir}'
+            f'python3 {opencompass_dir}/run.py {config_path_new} -w {work_dir} --reuse --max-num-workers 8'  # noqa: E501
         ]
         eval_log = os.path.join(workspace, f'eval.{ori_model}.txt')
         ret = run_cmd(cmd_eval, log_path=eval_log, cwd=lmdeploy_dir)
@@ -235,5 +246,71 @@ def create_model_links(src_dir: str, dst_dir: str):
             logging.warning(f'Model_path exists: {dst}')
 
 
+def generate_benchmark_report(report_path: str):
+    """Merge benchmark csv files and write them to the action summary.
+
+    Walks four directory levels below ``report_path``; in each innermost
+    folder the ``*.csv`` files are merged into ``summary.csv``. Unless the
+    third-level path contains 'generation', the means grouped by the first
+    column are also written to ``average.csv`` and appended to the merge.
+
+    Args:
+        report_path (str): root directory of the benchmark results.
+    """
+
+    def _subdirs(path):
+        # paths of the direct sub-directories of ``path``
+        return [f.path for f in os.scandir(path) if f.is_dir()]
+
+    # write to github action summary
+    _append_summary('## Evaluation Results Start')
+    for dir_path in _subdirs(report_path):
+        for sec_dir_path in _subdirs(dir_path):
+            # relpath, unlike str.replace, copes with a trailing '/'
+            model = os.path.relpath(sec_dir_path, report_path)
+            print('-' * 25, model, '-' * 25)
+            _append_summary('-' * 25 + model + '-' * 25 + '\n')
+
+            for benchmark_subfolder in _subdirs(sec_dir_path):
+                is_generation = 'generation' in benchmark_subfolder
+                for backend_subfolder in _subdirs(benchmark_subfolder):
+                    benchmark_type = os.path.relpath(backend_subfolder,
+                                                     sec_dir_path)
+                    print('*' * 10, benchmark_type, '*' * 10)
+                    _append_summary('-' * 10 + benchmark_type + '-' * 10 +
+                                    '\n')
+                    merged_csv_path = os.path.join(backend_subfolder,
+                                                   'summary.csv')
+                    average_csv_path = os.path.join(backend_subfolder,
+                                                    'average.csv')
+                    generated = (merged_csv_path, average_csv_path)
+                    csv_files = [
+                        f for f in glob.glob(
+                            os.path.join(backend_subfolder, '*.csv'))
+                        if f not in generated
+                    ]
+                    if not csv_files:
+                        continue
+                    # concat once; seeding the merge with an empty
+                    # DataFrame is deprecated in recent pandas
+                    merged_df = pd.concat(
+                        [pd.read_csv(f) for f in csv_files],
+                        ignore_index=True)
+                    key = merged_df.columns[0]
+                    merged_df = merged_df.sort_values(by=key)
+                    if not is_generation:
+                        average_values = merged_df.groupby(key).mean().round(3)
+                        average_values.to_csv(average_csv_path, index=True)
+                        avg_df = pd.read_csv(average_csv_path)
+                        merged_df = pd.concat([merged_df, avg_df],
+                                              ignore_index=True)
+                        add_summary(average_csv_path)
+                    merged_df.to_csv(merged_csv_path, index=False)
+                    if is_generation:
+                        add_summary(merged_csv_path)
+                    print(merged_df)
+    _append_summary('## Evaluation Results End')
+
+
 if __name__ == '__main__':
     fire.Fire()