forked from tatsu-lab/alpaca_eval
-
Notifications
You must be signed in to change notification settings - Fork 0
/
benchmarks.csv
We can make this file beautiful and searchable if this error is corrected: It looks like row 2 should actually have 24 columns, instead of 4 in line 1.
75 lines (75 loc) · 6.21 KB
/
benchmarks.csv
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
Model,"Arena Elo
[Feb 2, 2024]","Arena Elo
[April 18, 2024]",LC AlpacaEval 2.0,AlpacaEval 2.0,AlpacaEval 1.0,"MT-bench
(multi-turn)",WildBench,"Open LLM
(average)","ARC-C
(25-shot)","HellaSwag
(10-shot)","MMLU
(5-shot)","TruthfulQA
(0-shot)","WinoGrande
(5-shot)","GSM-8K
(5-shot)","GPT4All
(average)","AGI Eval
(en)","HELM
Lite","BBH, cot
(3-shot)","HumanEval
(pass @1)","LLMonitor
(01-10)","OpenComp.
(en, avg)","MBPP
(pass @1)",Output Length
gpt-4-turbo-2024-04-09,,1260,55.0
gpt4_1106_preview,,1254,50.0
gpt4_0125_preview,1249.0,,,,,,940.6,,,,,,,,,,,,,,,,
claude-3-opus-20240229,1247.0,1255,40.5,29.04176413403727,,,852.6,,,,86.8,,,,,,,,,,,,1388.0
claude-3-sonnet-20240229,1190.0,1203,34.9,25.556325292273296,,,835.8,,,,79.0,,,,,,,,,,,,1420.0
gpt4_0314,1185.0,1189,35.3,22.073258928708075,94.78260869565216,8.96,,,96.3,95.3,86.4,59.0,,,,57.0,,86.7,88.4,93.0,73.3,,1371.0
gpt4_0613,1160.0,1164,30.2,15.75503808763975,93.78109452736318,9.18,,,,,,,,,,57.0,0.962,86.7,88.4,89.0,73.3,,1140.0
mistral-large-2402,1155.0,1158,32.7,21.43877598137888,,,824.2,,,,81.2,,,,,,,,,,,,1362.0
Qwen1.5-72B-Chat,1146.0,1154,36.571754111987296,26.49828339562733,,8.61,,,,,77.5,,,,,,,,,,,,1549.0
claude,1145.0,1150,27.289504443727107,16.98534361236025,91.5527950310559,7.9,,,,,77.0,,,,,49.7,0.724,67.3,56.0,66.0,46.3,,1082.0
mistral-medium,1145.0,1148,28.6,21.855772543652176,96.83229813664596,8.61,,,89.9,88.0,75.3,,88.0,66.7,,,,,,,,62.3,1500.0
claude-2,1126.0,1131,28.155196141629148,17.188240356708075,91.35572139303484,8.06,,,,,78.5,,,,,,0.679,,71.2,68.0,,,1069.0
Mistral-Next,1123.0,,,,,,,,,,,,,,,,,,,,,,
Gemini Pro (Dev API),1118.0,,,,,,,,,,71.8,,,,,,,,,,,,
claude-2.1,1115.0,1119,25.251943886133027,15.733506736409938,87.0807453416149,8.18,,,,,,,,,,,0.593,,,,,,1096.0
Mixtral-8x7B-Instruct-v0.1,1114.0,1114,23.68848260134481,18.25531762637268,94.78260869565216,8.3,765.7,72.62,70.22,87.63,70.6,64.58,81.37,60.73,76.41,45.3,0.728,67.0,54.9,,56.8,60.7,1465.0
gpt-3.5-turbo-0613,1113.0,1119,22.35251298054288,14.09579857390062,,8.39,,,,,,,,,,,0.507,71.0,72.6,81.0,,,1331.0
gemini-pro,1110.0,1115,24.38177610802152,18.177644540571432,79.66417910447761,,788.0,,,,71.8,,,,,,,65.6,63.4,,,72.9,1456.0
GPT-3.5-Turbo-0314,1104.0,,,,,7.94,,,85.5,70.6,70.0,,85.2,57.1,,43.2,,,73.2,79.0,63.5,81.6,
claude-instant-1.2,1104.0,1108,25.61225902543337,16.12739962159006,,7.85,,,,,73.4,,,,,,,,52.8,60.0,,,1112.0
wizardlm-70b,1102.0,1108,17.575060737493747,14.383896086782608,,7.71,,61.25,65.44,84.41,63.7,54.81,80.82,17.97,,,,,,,,,1545.0
Yi-34B-Chat,1099.0,1107,27.19054787762733,29.65994671879504,94.08468244084682,,743.9,65.32,65.44,84.16,73.5,55.37,80.11,31.92,72.13,50.8,0.772,71.7,,,63.3,,2123.0
tulu-2-dpo-70b,1097.0,1103,21.238610038371124,15.982854374136648,95.03105590062113,7.89,685.9,73.77,72.1,88.99,69.84,65.78,83.27,62.62,,,,66.0,,,,,1418.0
dbrx-instruct,,1101,25.4
GPT-3.5-Turbo-0125,1096.0,,,,,,736.4,,,,,,,,,,,,,,,,
vicuna-33b-v1.3,1089.0,,17.574575310874923,12.705947921540371,88.99253731,7.12,,58.5,,,59.2,,,,,37.3,,52.0,,,53.0,,1479.0
Starling-LM-7B-alpha,1084.0,,14.690471079424972,14.24592352162733,,8.09,,67.13,63.82,84.9,63.9,46.39,80.58,62.4,72.72,40.1,,,,,,,1895.0
llama-2-70b-chat-hf,1082.0,,14.689648588392544,13.88825834374378,92.66169154228857,6.86,697.4,62.4,64.59,85.88,63.0,52.8,80.51,26.69,,45.0,,60.8,,60.0,58.6,,1790.0
OpenHermes-2.5-Mistral-7B,1079.0,,16.248577696674843,10.340415705751552,,,,61.52,64.93,84.18,63.8,52.24,78.06,26.08,73.12,43.0,,,48.2,,,,1107.0
NV-Llama2-70B-SteerLM-Chat,1076.0,,,,,7.54,,,,,68.5,,,,,,,,,,,,
Mistral-7B-Instruct-v0.2,1073.0,,17.111251846021165,14.722772657714286,92.77708592777088,7.6,,,,,,,,,,,,,,,,,1676.0
deepseek-llm-67b-chat,1073.0,,17.843384089909343,12.093422264919258,,,,71.79,67.75,86.82,72.42,55.85,84.21,63.68,,,,,,,,,1151.0
OpenChat-3.5,1071.0,,,,,7.81,,61.24,63.91,84.79,64.3,46.38,80.58,26.84,72.92,42.7,,,55.5,,,,
pplx-70b-online,1068.0,,,,,,,,,,,,,,,,,,,,,,
SOLAR-10.7B-Instruct-v1.0,1065.0,,,,,7.58,,74.2,71.08,88.16,66.2,71.43,83.58,64.75,75.11,47.6,,,,,,42.9,
dolphin-2.2.1-mistral-7b,1058.0,,13.121477650433736,9.039799728223604,,,,64.93,63.31,83.76,63.2,53.11,78.14,48.07,72.24,39.2,,59.8,,,58.0,,1130.0
wizardlm-13b-v1.2,1054.0,,14.462590694316631,12.027480342770186,89.16562889,7.2,,54.76,59.04,82.21,52.7,47.27,71.9,13.5,,,,,,,,,1635.0
zephyr-7b-beta,1046.0,,13.203198493136666,10.992885755354038,90.5977584059776,7.34,662.3,61.95,62.03,84.36,61.4,57.45,77.74,29.04,71.83,40.6,,,30.0,,,41.1,1444.0
llama-2-13b-chat-hf,1043.0,,8.436014548885215,7.702309957875775,81.09452736318407,6.65,678.2,54.91,59.04,81.94,53.6,44.12,74.51,15.24,,33.6,0.348,58.2,,50.0,50.3,,1513.0
MPT-30B-chat,1042.0,,,,,6.39,,55.38,58.7,82.54,50.4,52.42,75.3,12.13,,,,,,40.0,,,
CodeLlama-34B-instruct,1040.0,,,,,,,57.29,54.27,76.92,53.7,44.44,74.59,37.98,,,,,51.8,34.0,,,
vicuna-13b-v1.5,1037.0,,10.484438298504218,6.722122014857143,,6.57,593.2,55.4,57.08,81.24,55.8,51.51,74.66,11.3,63.1,36.8,,51.5,17.1,50.0,52.1,,1061.0
pplx-7b-online,1035.0,,,,,,,,,,,,,,,,,,,,,,
zephyr-7b-alpha,1033.0,,10.289760888704258,8.352663968198758,85.7587064676617,6.88,,59.5,61.01,84.04,61.4,57.9,78.61,14.03,72.24,38.0,,,,,,,1302.0
Qwen-14B-Chat,1032.0,,12.378741790737235,7.502333484720497,,6.96,,,,,66.5,,,59.7,,39.6,,53.7,43.9,,,,1013.0
guanaco-33b,1031.0,,5.690019090866207,5.002493724956522,65.96273292,6.53,,,,,57.6,,,,,,,,,43.0,,,1311.0
gemma-7b-it,1029.0,,10.425760403690134,6.937294379677018,,,676.5,,,,64.3,,,,,,,,,,,,1115.0
llama-2-7b-chat-hf,1027.0,,5.354821279508294,4.961339547167702,71.36645962732919,6.27,651.9,50.74,52.9,78.55,45.8,45.57,71.74,7.35,,29.6,0.217,35.6,,50.0,,,1479.0
falcon-180b-chat,1026.0,,,,,,,67.85,69.45,88.86,68.0,45.47,86.9,45.94,,,,,,67.0,,,
Mistral-7B-Instruct-v0.1,1002.0,,,,,6.84,545.9,54.96,54.52,75.63,55.4,56.28,73.72,14.25,67.95,33.5,0.438,56.7,28.7,57.0,53.6,,
vicuna-7b-v1.5,1001.0,,7.616892731870527,4.797493939167703,,6.17,,50.1,53.24,77.39,49.8,50.34,72.14,8.19,61.0,31.4,,43.4,11.6,41.0,,,1083.0
gemma-2b-it,985.0,,5.437453620377121,3.4019714381366457,,,,,,,42.3,,,,,,,,,,,,1041.0
chatglm2-6b,925.0,,4.35928292679035,2.7621847964596284,47.12858926,4.96,,,,,45.5,,,,,,,,,,,,1027.0
oasst-sft-pythia-12b,893.0,,3.270102114456748,1.790114083180124,25.96273292,4.32,,40.77,46.42,70.0,26.19,39.19,62.19,0.61,,,,,,,,,726.0
Yi-34Bx2-MoE-60B,,,,,,,,76.72,71.08,85.23,77.5,66.19,84.85,75.51,,,,,,,,,
,