tatsu-lab · YannDubs · Sep 30, 2023 · Sep 30, 2023 · Sep 30, 2023 · Sep 30, 2023
diff --git a/docs/alpaca_eval_gpt4_leaderboard.csv b/docs/alpaca_eval_gpt4_leaderboard.csv
@@ -2,7 +2,9 @@ name,win_rate,avg_length,link,samples,filter
 XwinLM 70b V0.1,95.56803995006244,1775.0,https://github.com/Xwin-LM/Xwin-LM,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/xwinlm-70b-v0.1/model_outputs.json,community
 GPT-4,95.27950310559004,1365.0,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4/model_outputs.json,minimal
 LLaMA2 Chat 70B,92.66169154228857,1790.0,https://ai.meta.com/llama/,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/llama-2-70b-chat-hf/model_outputs.json,minimal
+UltraLM 13B V2.0 (best-of-16),92.29813664596274,1720.0,https://huggingface.co/openbmb/UltraRM-13b,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/ultralm-13b-v2.0-best-of-16/model_outputs.json,community
 XwinLM 13b V0.1,91.76029962546816,1894.0,https://github.com/Xwin-LM/Xwin-LM,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/xwinlm-13b-v0.1/model_outputs.json,community
+UltraLM 13B (best-of-16),91.54228855721394,1980.0,https://huggingface.co/openbmb/UltraRM-13b,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/ultralm-13b-best-of-16/model_outputs.json,community
 Claude 2,91.35572139303484,1069.0,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/claude-2/model_outputs.json,minimal
 OpenChat V3.1 13B,89.49004975124379,1484.0,https://github.com/imoneoi/openchat,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/openchat-v3.1-13b/model_outputs.json,community
 ChatGPT,89.36567164179104,827.0,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/chatgpt/model_outputs.json,minimal
@@ -17,6 +19,7 @@ OpenBuddy-LLaMA-65B-v8,86.53366583541147,1162.0,https://huggingface.co/OpenBuddy
 WizardLM 13B V1.1,86.31840796019901,1525.0,https://huggingface.co/WizardLM/WizardLM-13B-V1.1,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/wizardlm-13b-v1.1/model_outputs.json,community
 OpenChat V2 13B,84.96894409937889,1564.0,https://github.com/imoneoi/openchat,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/openchat-v2-13b/model_outputs.json,community
 Humpback LLaMa 65B,83.70646766169155,1269.0,https://arxiv.org/abs/2308.06259,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/humpback-llama-65b/model_outputs.json,community
+UltraLM 13B V2.0,83.60248447204968,1399.0,https://github.com/thunlp/UltraChat,,community
 Vicuna 13B v1.3,82.11180124223603,1132.0,https://huggingface.co/lmsys/vicuna-13b-v1.3,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/vicuna-13b-v1.3/model_outputs.json,verified
 GPT-3.5,81.7103620474407,1018.0,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt35_turbo_instruct/model_outputs.json,community
 OpenBuddy-LLaMA-30B-v7.1,81.54613466334165,968.0,https://huggingface.co/OpenBuddy/openbuddy-llama-30b-v7.1-bf16,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/openbuddy-llama-30b-v7.1/model_outputs.json,community

diff --git a/docs/format_export_leaderboards.py b/docs/format_export_leaderboards.py
@@ -1,3 +1,4 @@
+import logging
 from pathlib import Path
 
 from alpaca_eval.constants import MODELS_CONFIG_DIR, PRECOMPUTED_LEADERBOARDS
@@ -15,9 +16,18 @@
     df = df.reset_index(names="name")
     for idx in range(len(df)):
         informal_name = df.loc[idx, "name"]
-        model_config = load_configs(df.loc[idx, "name"], relative_to=MODELS_CONFIG_DIR)[informal_name]
+        try:
+            model_config = load_configs(df.loc[idx, "name"], relative_to=MODELS_CONFIG_DIR)[informal_name]
+        except KeyError as e:
+            logging.exception(
+                f"Could not find model config for {informal_name}. This is likely because the name of "
+                f"the annotator does not match the name of the model's directory."
+            )
+            raise e
+
         if "pretty_name" in model_config:
             df.loc[idx, "name"] = model_config["pretty_name"]
+
         if "link" in model_config:
             df.loc[idx, "link"] = model_config["link"]
 

diff --git a/results/alpaca-7b/annotations.json → ...lpaca-7b/annotation_alpaca_eval_gpt4.json b/results/alpaca-7b/annotations.json → ...lpaca-7b/annotation_alpaca_eval_gpt4.json
diff --git a/...ts/alpaca-farm-ppo-human/annotations.json → ...po-human/annotation_alpaca_eval_gpt4.json b/...ts/alpaca-farm-ppo-human/annotations.json → ...po-human/annotation_alpaca_eval_gpt4.json
diff --git a/results/chatgpt/annotation_alpaca_eval_gpt4.json b/results/chatgpt/annotation_alpaca_eval_gpt4.json
diff --git a/results/claude-2/annotation_alpaca_eval_gpt4.json b/results/claude-2/annotation_alpaca_eval_gpt4.json
diff --git a/results/claude/annotation_alpaca_eval_gpt4.json b/results/claude/annotation_alpaca_eval_gpt4.json
diff --git a/results/claude/annotations.json b/results/claude/annotations.json
diff --git a/results/falcon-40b-instruct/annotations.json → ...instruct/annotation_alpaca_eval_gpt4.json b/results/falcon-40b-instruct/annotations.json → ...instruct/annotation_alpaca_eval_gpt4.json
diff --git a/...ca-farm-ppo-sim-gpt4-20k/annotations.json → ...instruct/annotation_alpaca_eval_gpt4.json b/...ca-farm-ppo-sim-gpt4-20k/annotations.json → ...instruct/annotation_alpaca_eval_gpt4.json
diff --git a/results/falcon-7b-instruct/annotations.json b/results/falcon-7b-instruct/annotations.json
diff --git a/...lts/gpt35_turbo_instruct/annotations.json → ...instruct/annotation_alpaca_eval_gpt4.json b/...lts/gpt35_turbo_instruct/annotations.json → ...instruct/annotation_alpaca_eval_gpt4.json
diff --git a/results/gpt4/annotations.json → ...lts/gpt4/annotation_alpaca_eval_gpt4.json b/results/gpt4/annotations.json → ...lts/gpt4/annotation_alpaca_eval_gpt4.json
diff --git a/results/guanaco-13b/annotations.json b/results/guanaco-13b/annotations.json
diff --git a/results/guanaco-33b/annotations.json b/results/guanaco-33b/annotations.json
diff --git a/results/guanaco-65b/annotations.json → ...naco-65b/annotation_alpaca_eval_gpt4.json b/results/guanaco-65b/annotations.json → ...naco-65b/annotation_alpaca_eval_gpt4.json
diff --git a/results/guanaco-7b/annotations.json b/results/guanaco-7b/annotations.json
diff --git a/results/llama-2-13b-chat-hf/annotation_alpaca_eval_gpt4.json b/results/llama-2-13b-chat-hf/annotation_alpaca_eval_gpt4.json
diff --git a/results/llama-2-70b-chat-hf/annotation_alpaca_eval_gpt4.json b/results/llama-2-70b-chat-hf/annotation_alpaca_eval_gpt4.json
diff --git a/results/llama-2-7b-chat-hf/annotation_alpaca_eval_gpt4.json b/results/llama-2-7b-chat-hf/annotation_alpaca_eval_gpt4.json
diff --git a/results/nous-hermes-13b/annotations.json b/results/nous-hermes-13b/annotations.json
diff --git a/...lts/oasst-rlhf-llama-33b/annotations.json → ...lama-33b/annotation_alpaca_eval_gpt4.json b/...lts/oasst-rlhf-llama-33b/annotations.json → ...lama-33b/annotation_alpaca_eval_gpt4.json
diff --git a/results/oasst-sft-llama-33b/annotations.json b/results/oasst-sft-llama-33b/annotations.json
diff --git a/results/oasst-sft-pythia-12b/annotations.json b/results/oasst-sft-pythia-12b/annotations.json
diff --git a/results/pythia-12b-mix-sft/annotations.json b/results/pythia-12b-mix-sft/annotations.json
diff --git a/results/text_davinci_001/annotations.json → ...inci_001/annotation_alpaca_eval_gpt4.json b/results/text_davinci_001/annotations.json → ...inci_001/annotation_alpaca_eval_gpt4.json
diff --git a/results/text_davinci_003/annotations.json b/results/text_davinci_003/annotations.json
diff --git a/results/vicuna-13b/annotations.json → ...cuna-13b/annotation_alpaca_eval_gpt4.json b/results/vicuna-13b/annotations.json → ...cuna-13b/annotation_alpaca_eval_gpt4.json
diff --git a/results/vicuna-7b/annotations.json b/results/vicuna-7b/annotations.json
diff --git a/results/wizardlm-13b/annotations.json → ...rdlm-13b/annotation_alpaca_eval_gpt4.json b/results/wizardlm-13b/annotations.json → ...rdlm-13b/annotation_alpaca_eval_gpt4.json
diff --git a/src/alpaca_eval/main.py b/src/alpaca_eval/main.py
@@ -116,6 +116,7 @@ def evaluate(
     )
     annotations = None
 
+    arg_model_outputs = model_outputs
     if model_outputs is not None:
         model_outputs = utils.load_or_convert_to_dataframe(model_outputs)
         reference_outputs = utils.load_or_convert_to_dataframe(reference_outputs)
@@ -142,7 +143,7 @@ def evaluate(
         else:
             logging.info(f"Skipping evaluation of {name} as it is already in the precomputed leaderboard.")
 
-    output_path = utils.get_output_path(output_path, model_outputs, name)
+    output_path = utils.get_output_path(output_path, arg_model_outputs, name)
 
     df_leaderboard = pd.DataFrame.from_dict(leaderboard, orient="index").sort_values(by=sort_by, ascending=False)
     df_leaderboard = df_leaderboard[
@@ -153,9 +154,12 @@ def evaluate(
         logging.info(f"Saving all results to {output_path}")
         df_leaderboard.to_csv(output_path / "leaderboard.csv")
         if annotations is not None:
-            utils.convert_to_dataframe(annotations).to_json(
-                output_path / "annotations.json", orient="records", indent=2
-            )
+            if isinstance(annotators_config, str) and "/" not in annotators_config:
+                annotations_name = f"annotation_{annotators_config}.json"
+            else:
+                annotations_name = "annotations.json"
+
+            utils.convert_to_dataframe(annotations).to_json(output_path / annotations_name, orient="records", indent=2)
 
     if is_cache_leaderboard is None:
         is_cache_leaderboard = max_instances is None

diff --git a/src/alpaca_eval/models_configs/ultralm-13b-best-of-16/configs.yaml b/src/alpaca_eval/models_configs/ultralm-13b-best-of-16/configs.yaml
@@ -1,9 +1,8 @@
-ultralm-13b:
+ultralm-13b-best-of-16:
   prompt_template: "ultralm-13b-best-of-16/prompt.txt"
   pretty_name: "UltraLM 13B (best-of-16)"
-  link: 
-    - "https://github.com/thunlp/UltraChat"
-    - "https://github.com/thunlp/UltraFeedback"
-    - "https://huggingface.co/openbmb/UltraRM-13b"
+  link: "https://huggingface.co/openbmb/UltraRM-13b"
+#    - "https://github.com/thunlp/UltraChat"
+#    - "https://github.com/thunlp/UltraFeedback"
   # Results cannot be directly reproduced with alpaca_eval official `fn_completions` because they require best-of-n sampling.
   # The reproduction requires generating 16 completions using vllm at inference time and then using a reward model, UltraRM, to select the one with the highest reward.
diff --git a/src/alpaca_eval/models_configs/ultralm-13b-v2.0-best-of-16/configs.yaml b/src/alpaca_eval/models_configs/ultralm-13b-v2.0-best-of-16/configs.yaml
@@ -1,9 +1,8 @@
-ultralm-13b:
+ultralm-13b-v2.0-best-of-16:
   prompt_template: "ultralm-13b-v2.0-best-of-16/prompt.txt"
   pretty_name: "UltraLM 13B V2.0 (best-of-16)"
-  link: 
-    - "https://github.com/thunlp/UltraChat"
-    - "https://github.com/thunlp/UltraFeedback"
-    - "https://huggingface.co/openbmb/UltraRM-13b"
+  link: "https://huggingface.co/openbmb/UltraRM-13b"
+#    - "https://github.com/thunlp/UltraChat"
+#    - "https://github.com/thunlp/UltraFeedback"
   # Results cannot be directly reproduced with alpaca_eval official `fn_completions` because they require best-of-n sampling.
   # The reproduction requires generating 16 completions using vllm at inference time and then using a reward model, UltraRM, to select the one with the highest reward.
diff --git a/src/alpaca_eval/models_configs/ultralm-13b-v2.0/configs.yaml b/src/alpaca_eval/models_configs/ultralm-13b-v2.0/configs.yaml
@@ -1,4 +1,4 @@
-ultralm-13b:
+ultralm-13b-v2.0:
   prompt_template: "ultralm-13b-v2.0/prompt.txt"
   fn_completions: "huggingface_local_completions"
   completions_kwargs: