Commit f756e77

Update HF space leaderboard (#1832)

Parent commit: 3cbe2a5

4 files changed: 192 additions, 19 deletions

fastchat/llm_judge/qa_browser.py

Lines changed: 0 additions & 1 deletion

```diff
@@ -58,7 +58,6 @@ def display_answer(
     judgment_dict_turn_2 = resolve_default_judgment_dict(
         q, model_judgments_normal, model_judgments_math, multi_turn=True
     )
-
     explanation_turn_2 = (
         "##### Model Judgment (second turn)\n"
        + get_model_judge_explanation(gamekey, judgment_dict_turn_2)
```

fastchat/serve/inference.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -313,6 +313,7 @@ def chat_loop(
         context_len = model.config.max_position_embeddings
     else:
         context_len = 2048
+    # TODO: Establish a standard that can be uniformly written in the config.
     if is_longchat:
         context_len = 16384
```

fastchat/serve/model_worker.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -112,7 +112,8 @@ def __init__(
             self.context_len = self.model.config.max_position_embeddings
         else:
             self.context_len = 2048
-        # TODO: Can we establish a standard that can be uniformly written in the Config?
+
+        # TODO: Establish a standard that can be uniformly written in the config.
         if is_longchat:
             self.context_len = 16384
```
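Both diffs above carry the same TODO about a uniform convention for reading a model's maximum context length from its config. As a rough illustration of what such a convention could look like, here is a minimal sketch; the helper name `get_context_length` and the list of config keys are assumptions for illustration, not part of this commit or of FastChat's API at this point:

```python
# Hypothetical sketch of the TODO above: resolve the context length from
# whichever field a Hugging Face config happens to use, with one fallback.
# The key list is an illustrative assumption, not an established standard.
SEQUENCE_LENGTH_KEYS = [
    "max_position_embeddings",  # most LLaMA-style configs
    "max_sequence_length",
    "seq_length",
]


def get_context_length(config, default: int = 2048) -> int:
    """Return the declared context length, or `default` if none is set."""
    for key in SEQUENCE_LENGTH_KEYS:
        value = getattr(config, key, None)
        if value is not None and int(value) > 0:
            return int(value)
    return default
```

With a helper like this, both call sites above could drop their hard-coded 2048 fallback, and the `is_longchat` special case would disappear if longchat checkpoints declared their 16384-token window in the config.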

Lines changed: 189 additions & 17 deletions
```diff
@@ -1,47 +1,215 @@
 """A gradio app that renders a static leaderboard. This is used for Hugging Face Space."""
+import ast
 import argparse
 import pickle
 
 import gradio as gr
+import numpy as np
 
 
-notebook_url = "https://colab.research.google.com/drive/17L9uCiAivzWfzOxo2Tb9RMauT7vS6nVU?usp=sharing"
+notebook_url = "https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing"
+
+
+basic_component_values = [None] * 6
+leader_component_values = [None] * 5
 
 
 def make_leaderboard_md(elo_results):
     leaderboard_md = f"""
 # Leaderboard
-| [Vote](https://arena.lmsys.org/) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
+| [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
+
+🏆 This leaderboard is based on the following three benchmarks.
+- [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) - a crowdsourced, randomized battle platform. We use 40K+ user votes to compute Elo ratings.
+- [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
+- [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
+
+💻 We use [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) to compute MT-bench scores (single-answer grading on a scale of 10) and win rates (against gpt-3.5). The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MMLU scores are computed by [InstructEval](https://github.com/declare-lab/instruct-eval) and [Chain-of-Thought Hub](https://github.com/FranxYao/chain-of-thought-hub). Higher values are better for all benchmarks. Empty cells mean not available.
+"""
+    return leaderboard_md
 
-We use the Elo rating system to calculate the relative performance of the models. You can view the voting data, basic analyses, and calculation procedure in this [notebook]({notebook_url}). We will periodically release new leaderboards. If you want to see more models, please help us [add them](https://github.com/lm-sys/FastChat/blob/main/docs/arena.md#how-to-add-a-new-model).
+
+def make_leaderboard_md_live(elo_results):
+    leaderboard_md = f"""
+# Leaderboard
 Last updated: {elo_results["last_updated_datetime"]}
 {elo_results["leaderboard_table"]}
 """
     return leaderboard_md
 
 
-def build_leaderboard_tab(elo_results_file):
-    if elo_results_file is not None:
+def update_elo_components(max_num_files, elo_results_file):
+    log_files = get_log_files(max_num_files)
+
+    # Leaderboard
+    if elo_results_file is None:  # Do live update
+        battles = clean_battle_data(log_files)
+        elo_results = report_elo_analysis_results(battles)
+
+        leader_component_values[0] = make_leaderboard_md_live(elo_results)
+        leader_component_values[1] = elo_results["win_fraction_heatmap"]
+        leader_component_values[2] = elo_results["battle_count_heatmap"]
+        leader_component_values[3] = elo_results["bootstrap_elo_rating"]
+        leader_component_values[4] = elo_results["average_win_rate_bar"]
+
+    # Basic stats
+    basic_stats = report_basic_stats(log_files)
+    md0 = f"Last updated: {basic_stats['last_updated_datetime']}"
+
+    md1 = "### Action Histogram\n"
+    md1 += basic_stats["action_hist_md"] + "\n"
+
+    md2 = "### Anony. Vote Histogram\n"
+    md2 += basic_stats["anony_vote_hist_md"] + "\n"
+
+    md3 = "### Model Call Histogram\n"
+    md3 += basic_stats["model_hist_md"] + "\n"
+
+    md4 = "### Model Call (Last 24 Hours)\n"
+    md4 += basic_stats["num_chats_last_24_hours"] + "\n"
+
+    basic_component_values[0] = md0
+    basic_component_values[1] = basic_stats["chat_dates_bar"]
+    basic_component_values[2] = md1
+    basic_component_values[3] = md2
+    basic_component_values[4] = md3
+    basic_component_values[5] = md4
+
+
+def update_worker(max_num_files, interval, elo_results_file):
+    while True:
+        tic = time.time()
+        update_elo_components(max_num_files, elo_results_file)
+        duration = time.time() - tic
+        print(f"update duration: {duration:.2f} s")
+        time.sleep(max(interval - duration, 0))
+
+
+def load_demo(url_params, request: gr.Request):
+    logger.info(f"load_demo. ip: {request.client.host}. params: {url_params}")
+    return basic_component_values + leader_component_values
+
+
+def model_hyperlink(model_name, link):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+def load_leaderboard_table_csv(filename, add_hyperlink=True):
+    lines = open(filename).readlines()
+    heads = [v.strip() for v in lines[0].split(",")]
+    rows = []
+    for i in range(1, len(lines)):
+        row = [v.strip() for v in lines[i].split(",")]
+        item = {}
+        for h, v in zip(heads, row):
+            if h == "Arena Elo rating":
+                if v != "-":
+                    v = int(ast.literal_eval(v))
+                else:
+                    v = np.nan
+            elif h == "MMLU":
+                if v != "-":
+                    v = round(ast.literal_eval(v) * 100, 1)
+                else:
+                    v = np.nan
+            elif h == "MT-bench (win rate %)":
+                if v != "-":
+                    v = round(ast.literal_eval(v[:-1]), 1)
+                else:
+                    v = np.nan
+            elif h == "MT-bench (score)":
+                if v != "-":
+                    v = round(ast.literal_eval(v), 2)
+                else:
+                    v = np.nan
+            item[h] = v
+        if add_hyperlink:
+            item["Model"] = model_hyperlink(item["Model"], item["Link"])
+        rows.append(item)
+
+    return rows
+
+
+def build_basic_stats_tab():
+    empty = "Loading ..."
+    basic_component_values[:] = [empty, None, empty, empty, empty, empty]
+
+    md0 = gr.Markdown(empty)
+    gr.Markdown("#### Figure 1: Number of model calls and votes")
+    plot_1 = gr.Plot(show_label=False)
+    with gr.Row():
+        with gr.Column():
+            md1 = gr.Markdown(empty)
+        with gr.Column():
+            md2 = gr.Markdown(empty)
+    with gr.Row():
+        with gr.Column():
+            md3 = gr.Markdown(empty)
+        with gr.Column():
+            md4 = gr.Markdown(empty)
+    return [md0, plot_1, md1, md2, md3, md4]
+
+
+def build_leaderboard_tab(elo_results_file, leaderboard_table_file):
+    if elo_results_file is None:  # Do live update
+        md = "Loading ..."
+        p1 = p2 = p3 = p4 = None
+    else:
         with open(elo_results_file, "rb") as fin:
             elo_results = pickle.load(fin)
 
         md = make_leaderboard_md(elo_results)
         p1 = elo_results["win_fraction_heatmap"]
         p2 = elo_results["battle_count_heatmap"]
-        p3 = elo_results["average_win_rate_bar"]
-        p4 = elo_results["bootstrap_elo_rating"]
+        p3 = elo_results["bootstrap_elo_rating"]
+        p4 = elo_results["average_win_rate_bar"]
+
+    md_1 = gr.Markdown(md, elem_id="leaderboard_markdown")
+
+    if leaderboard_table_file:
+        data = load_leaderboard_table_csv(leaderboard_table_file)
+        headers = [
+            "Model",
+            "Arena Elo rating",
+            "MT-bench (score)",
+            "MT-bench (win rate %)",
+            "MMLU",
+            "License",
+        ]
+        values = []
+        for item in data:
+            row = []
+            for key in headers:
+                value = item[key]
+                row.append(value)
+            values.append(row)
+        values.sort(key=lambda x: -x[1] if not np.isnan(x[1]) else 1e9)
+
+        headers[1] = "⭐ " + headers[1]
+        headers[2] = "📈 " + headers[2]
+
+        gr.Dataframe(
+            headers=headers,
+            datatype=["markdown", "number", "number", "number", "number", "str"],
+            value=values,
+            elem_id="leaderboard_dataframe",
+        )
+        gr.Markdown(
+            "If you want to see more models, please help us [add them](https://github.com/lm-sys/FastChat/blob/main/docs/arena.md#how-to-add-a-new-model)."
+        )
     else:
-        md = "Loading ..."
-        p1 = p2 = p3 = p4 = None
+        pass
 
-    md_1 = gr.Markdown(md)
     gr.Markdown(
-        f"""## More Statistics\n
+        f"""## More Statistics for Chatbot Arena\n
 We added some additional figures to show more statistics. The code for generating them is also included in this [notebook]({notebook_url}).
 Please note that you may see different orders from different ranking methods. This is expected for models that perform similarly, as demonstrated by the confidence interval in the bootstrap figure. Going forward, we prefer the classical Elo calculation because of its scalability and interpretability. You can find more discussions in this blog [post](https://lmsys.org/blog/2023-05-03-arena/).
 """
     )
 
+    leader_component_values[:] = [md, p1, p2, p3, p4]
+
     with gr.Row():
         with gr.Column():
             gr.Markdown(
```
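For reference, the `load_leaderboard_table_csv` function added in the hunk above assumes a flat CSV whose header row contains at least the columns that `build_leaderboard_tab` later selects, plus a `Link` column for the hyperlink. Here is a hedged usage sketch; the file name and the row values are placeholders for illustration, not real leaderboard data:

```python
# Illustrative only: the header matches the columns the parser special-cases;
# the numbers below are placeholders, not real leaderboard results.
csv_text = (
    "Model,Link,Arena Elo rating,MT-bench (score),MT-bench (win rate %),MMLU,License\n"
    "example-model,https://example.com,1000,5.00,50.0%,0.500,Apache-2.0\n"
)

with open("leaderboard_table_example.csv", "w") as fout:
    fout.write(csv_text)

rows = load_leaderboard_table_csv("leaderboard_table_example.csv")
print(rows[0]["MMLU"])   # 50.0, since MMLU is rescaled from [0, 1] to [0, 100]
print(rows[0]["Model"])  # an <a href=...> hyperlink, because add_hyperlink=True
```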
```diff
@@ -56,23 +224,27 @@ def build_leaderboard_tab(elo_results_file):
     with gr.Row():
         with gr.Column():
             gr.Markdown(
-                "#### Figure 3: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
+                "#### Figure 3: Bootstrap of Elo Estimates (1000 Rounds of Random Sampling)"
             )
             plot_3 = gr.Plot(p3, show_label=False)
         with gr.Column():
             gr.Markdown(
-                "#### Figure 4: Bootstrap of Elo Estimates (1000 Rounds of Random Sampling)"
+                "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
             )
             plot_4 = gr.Plot(p4, show_label=False)
     return [md_1, plot_1, plot_2, plot_3, plot_4]
 
 
-def build_demo(elo_results_file):
+def build_demo(elo_results_file, leaderboard_table_file):
+    text_size = gr.themes.sizes.text_lg
+
     with gr.Blocks(
         title="Chatbot Arena Leaderboard",
-        theme=gr.themes.Base(),
+        theme=gr.themes.Base(text_size=text_size),
     ) as demo:
-        leader_components = build_leaderboard_tab(elo_results_file)
+        leader_components = build_leaderboard_tab(
+            elo_results_file, leaderboard_table_file
+        )
 
     return demo
 
@@ -82,5 +254,5 @@ def build_demo(elo_results_file):
     parser.add_argument("--share", action="store_true")
     args = parser.parse_args()
 
-    demo = build_demo("elo_results_20230619.pkl")
+    demo = build_demo("elo_results_20230619.pkl", "leaderboard_table_20230619.csv")
     demo.launch(share=args.share)
```
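The leaderboard text in this diff states that the Arena Elo ratings are computed in the linked Colab notebook rather than in this file. For the gist without opening the notebook, here is a minimal, hedged sketch of online Elo over pairwise battle records; the record format and the constants (K=4, base 10, scale 400, initial rating 1000) are common Elo conventions assumed for illustration, not verbatim notebook code:

```python
from collections import defaultdict


def compute_elo(battles, k=4, base=10, scale=400, init_rating=1000):
    """Online Elo over (model_a, model_b, winner) records.

    `winner` is "model_a", "model_b", or "tie"; a tie counts as half
    a win for each side.
    """
    rating = defaultdict(lambda: init_rating)
    for model_a, model_b, winner in battles:
        ra, rb = rating[model_a], rating[model_b]
        expected_a = 1 / (1 + base ** ((rb - ra) / scale))
        if winner == "model_a":
            score_a = 1.0
        elif winner == "model_b":
            score_a = 0.0
        else:  # tie
            score_a = 0.5
        rating[model_a] += k * (score_a - expected_a)
        rating[model_b] += k * ((1 - score_a) - (1 - expected_a))
    return dict(rating)
```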

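This commit also swaps Figures 3 and 4, so Figure 3 now shows "Bootstrap of Elo Estimates (1000 Rounds of Random Sampling)". A hedged sketch of that resampling idea, reusing the hypothetical `compute_elo` above (the round count mirrors the figure caption; everything else is an assumption):

```python
import random


def bootstrap_elo(battles, num_rounds=1000, seed=0):
    """Recompute Elo on bootstrap resamples to estimate rating uncertainty."""
    rng = random.Random(seed)
    rounds = []
    for _ in range(num_rounds):
        # Sample battles with replacement, same size as the original set.
        resample = [battles[rng.randrange(len(battles))] for _ in battles]
        rounds.append(compute_elo(resample))
    return rounds  # summarize per-model spread, e.g. 2.5/97.5 percentiles
```

The spread of ratings across rounds is what produces the confidence intervals referenced in the "More Statistics for Chatbot Arena" text above.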