tatsu-lab · YannDubs · Feb 12, 2024 · Feb 12, 2024 · Feb 12, 2024
diff --git a/notebooks/length_correction.ipynb b/notebooks/length_correction.ipynb
diff --git a/results/gpt-3.5-turbo-0613/model_outputs.json b/results/gpt-3.5-turbo-0613/model_outputs.json
diff --git a/results/gpt-3.5-turbo-0613/weighted_alpaca_eval_gpt4_turbo/annotations.json b/results/gpt-3.5-turbo-0613/weighted_alpaca_eval_gpt4_turbo/annotations.json
diff --git a/results/gpt-3.5-turbo-1106/alpaca_eval_gpt4/annotations.json b/results/gpt-3.5-turbo-1106/alpaca_eval_gpt4/annotations.json
diff --git a/results/gpt-3.5-turbo-1106/model_outputs.json b/results/gpt-3.5-turbo-1106/model_outputs.json
diff --git a/results/gpt-3.5-turbo-1106/weighted_alpaca_eval_gpt4_turbo/annotations.json b/results/gpt-3.5-turbo-1106/weighted_alpaca_eval_gpt4_turbo/annotations.json
diff --git a/results/gpt4_0314/alpaca_eval_gpt4/annotations.json b/results/gpt4_0314/alpaca_eval_gpt4/annotations.json
diff --git a/results/gpt4_0314/model_outputs.json b/results/gpt4_0314/model_outputs.json
diff --git a/results/gpt4_0314/weighted_alpaca_eval_gpt4_turbo/annotations.json b/results/gpt4_0314/weighted_alpaca_eval_gpt4_turbo/annotations.json
diff --git a/src/alpaca_eval/constants.py b/src/alpaca_eval/constants.py
@@ -188,32 +188,43 @@ def ALPACAFARM_GOLD_ANNOTATIONS():
     "alpaca-7b",
 ]
 
+# Feb 2, 2024
 # maps models to Arena Elo rating
 CHATBOT_ARENA_LEADERBOARD = {
-    "gpt4_1106_preview": 1249,
-    "gpt4_0314": 1189,
-    "gpt4_0613": 1161,
-    "claude": 1150,
-    "claude-2": 1131,
+    "gpt4_1106_preview": 1252,
+    "gpt4_0314": 1190,
+    "gpt4_0613": 1162,
+    "mistral-medium": 1150,
+    "claude": 1149,
+    "claude-2": 1132,
     "claude-2.1": 1119,
-    "tulu-2-dpo-70b": 1104,
+    "Mixtral-8x7B-Instruct-v0.1": 1118,
+    "gpt-3.5-turbo-0613": 1118,
     "Yi-34B-Chat": 1115,
-    "llama-2-70b-chat-hf": 1077,
-    "gpt-3.5-turbo-1106": 1072,
-    "gemini-pro": 1111,
-    "Mixtral-8x7B-Instruct-v0.1": 1121,
-    "Mistral-7B-Instruct-v0.2": 1023,
-    "vicuna-33b-v1.3": 1095,
-    "Starling-LM-7B-alpha": 1089,
-    "dolphin-2.2.1-mistral-7b": 1064,
-    "zephyr-7b-alpha": 1037,
-    "oasst-sft-pythia-12b": 896,
-    "wizardlm-13b-v1.2": 1058,
-    "guanaco-33b": 1031,
-    "deepseek-llm-67b-chat": 1082,
+    "gemini-pro": 1114,
+    "claude-instant-1.2": 1109,
+    "gpt-3.5-turbo-0301": 1105,
     "wizardlm-70b": 1105,
+    "tulu-2-dpo-70b": 1104,
+    "vicuna-33b-v1.3": 1093,
+    "Starling-LM-7B-alpha": 1090,
+    "deepseek-llm-67b-chat": 1082,
+    "llama-2-70b-chat-hf": 1082,
+    "OpenHermes-2.5-Mistral-7B": 1078,
+    "gpt-3.5-turbo-1106": 1071,
+    "dolphin-2.2.1-mistral-7b": 1065,
+    "wizardlm-13b-v1.2": 1058,
+    "zephyr-7b-beta": 1051,
+    "llama-2-13b-chat-hf": 1043,
+    "vicuna-13b-v1.5": 1040,
+    "zephyr-7b-alpha": 1037,
     "Qwen-14B-Chat": 1034,
-    "claude-instant-1.2": 1109,
+    "guanaco-33b": 1032,
+    "llama-2-7b-chat-hf": 1024,
+    "vicuna-7b-v1.5": 1005,
+    # older models
+    "chatglm2-6b": 930,
+    "oasst-sft-pythia-12b": 897,
 }
 
 EVALUATORS_LEADERBOARD_COLS_TO_PRINT = EVALUATORS_LEADERBOARD_COLS_TO_PRIORITIZE[:8]

diff --git a/...lpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv b/...lpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv
@@ -31,6 +31,7 @@ Mistral-7B-Instruct-v0.2,14.722772657714286,1.0785266446729775,113,691,1,805,14.
 wizardlm-70b,14.38389608705848,1.0395048912956086,106,697,2,805,13.291925465838508,community,1545
 Starling-LM-7B-alpha,14.245923521762474,1.0685460609391275,102,702,1,805,12.732919254658384,community,1895
 gpt-3.5-turbo-16k-0613,14.132390707727575,1.0275794002684782,96,704,5,805,12.236024844720497,verified,1328
+gpt-3.5-turbo-0613,14.095798573846427,1.0371186214996104,99,700,6,805,12.670807453416149,community,1331
 llama-2-70b-chat-hf,13.871009062248447,1.078780255916568,104,701,0,805,12.919254658385093,verified,1790
 ultralm-13b-v2.0-best-of-16,13.853373471264224,1.0493447060435432,98,705,2,805,12.298136645962732,community,1720
 pairrm-tulu-2-13b,13.831901016808686,1.0835284665179354,110,694,1,805,13.72670807453416,community,1454

diff --git a/src/alpaca_eval/models_configs/gpt-3.5-turbo-0613/configs.yaml b/src/alpaca_eval/models_configs/gpt-3.5-turbo-0613/configs.yaml
@@ -0,0 +1,7 @@
+gpt-3.5-turbo-0613:  # top-level key matches completions_kwargs.model_name below
+  prompt_template: "gpt4/chatml_prompt.txt"  # reuses the chatml template stored under gpt4/
+  fn_completions: "openai_completions"  # name of the completion function to use (resolved elsewhere; not shown here)
+  completions_kwargs:  # presumably forwarded as keyword arguments to fn_completions -- TODO confirm
+    model_name: "gpt-3.5-turbo-0613"
+    max_tokens: 3072  # NOTE(review): presumably the per-completion token cap; confirm it leaves room for the prompt
+  pretty_name: "GPT 3.5 Turbo 0613"  # human-readable display name