WIP: Leaderboard UI improvements #1320

Merged
12 commits merged on Oct 26, 2024
mteb/benchmarks/benchmarks.py (5 changes: 2 additions & 3 deletions)
@@ -7,9 +7,7 @@
 from pydantic import AnyUrl, BeforeValidator, TypeAdapter
 
 from mteb.abstasks.AbsTask import AbsTask
-from mteb.load_results.benchmark_results import (
-    BenchmarkResults,
-)
+from mteb.load_results.benchmark_results import BenchmarkResults
 from mteb.load_results.load_results import load_results
 from mteb.overview import get_tasks
 
@@ -63,6 +61,7 @@ def load_results(
             base_results = load_results()
         return base_results.select_tasks(self.tasks)
 
+
 MTEB_MAIN_EN = Benchmark(
     name="MTEB(eng)",
     tasks=get_tasks(
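Side note on the simplified import: `Benchmark.load_results` narrows an already loaded result set to the benchmark's own tasks via `select_tasks`, which is how app.py avoids reloading results on every benchmark switch. A minimal usage sketch, assuming the mteb version from this PR and a populated results cache (variable names here are illustrative):

```python
import mteb
from mteb.load_results.load_results import load_results

# Load the full result cache once, then narrow it to a single benchmark,
# mirroring default_benchmark / default_results in app.py below.
all_results = load_results()
benchmark = mteb.get_benchmark("MTEB(Multilingual)")

# With base_results given, this reduces to base_results.select_tasks(benchmark.tasks),
# so the expensive load only happens once.
benchmark_results = benchmark.load_results(base_results=all_results)
long_scores = benchmark_results.get_scores(format="long")
print(f"{len(long_scores)} score records")
```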
mteb/leaderboard/app.py (104 changes: 57 additions & 47 deletions)
@@ -7,7 +7,7 @@
 from gradio_rangeslider import RangeSlider
 
 import mteb
-from mteb.leaderboard.table import scores_to_table
+from mteb.leaderboard.table import scores_to_tables
 
 
 def load_results():
@@ -27,7 +27,7 @@ def load_results():
 
 benchmarks = mteb.get_benchmarks()
 
-default_benchmark = mteb.get_benchmark("MTEB(multilingual)")
+default_benchmark = mteb.get_benchmark("MTEB(Multilingual)")
 default_results = default_benchmark.load_results(base_results=all_results)
 
 benchmark_select = gr.Dropdown(
@@ -60,6 +60,7 @@ def load_results():
 task_select = gr.Dropdown(
     default_results.task_names,
     value=default_results.task_names,
+    allow_custom_value=True,
     multiselect=True,
     label="Task",
     info="Select specific tasks to include",
@@ -79,44 +80,46 @@
                 """,
             )
             with gr.Group():
-                availability = gr.Radio(
-                    [
-                        ("Only Open", True),
-                        ("Only Proprietary", False),
-                        ("Both", None),
-                    ],
-                    value=None,
-                    label="Availability",
-                    interactive=True,
-                )
-                compatibility = gr.CheckboxGroup(
-                    [
-                        (
-                            "Should be sentence-transformers compatible",
-                            "sbert_compatible",
+                with gr.Row(elem_classes="overflow-y-scroll max-h-80"):
+                    with gr.Column():
+                        availability = gr.Radio(
+                            [
+                                ("Only Open", True),
+                                ("Only Proprietary", False),
+                                ("Both", None),
+                            ],
+                            value=None,
+                            label="Availability",
+                            interactive=True,
+                        )
+                        compatibility = gr.CheckboxGroup(
+                            [
+                                (
+                                    "Should be sentence-transformers compatible",
+                                    "sbert_compatible",
+                                )
+                            ],
+                            value=[],
+                            label="Compatibility",
+                            interactive=True,
+                        )
+                        instructions = gr.Radio(
+                            [
+                                ("Only Instruction-tuned", True),
+                                ("Only non-instruction", False),
+                                ("Both", None),
+                            ],
+                            value=None,
+                            label="Instructions",
+                            interactive=True,
+                        )
+                        model_size = RangeSlider(
+                            minimum=0,
+                            maximum=8000,
+                            value=(0, 8000),
+                            label="Model Size (#M Parameters)",
+                            interactive=True,
+                        )
-                        )
-                    ],
-                    value=[],
-                    label="Compatibility",
-                    interactive=True,
-                )
-                instructions = gr.Radio(
-                    [
-                        ("Only Instruction-tuned", True),
-                        ("Only non-instruction", False),
-                        ("Both", None),
-                    ],
-                    value=None,
-                    label="Instructions",
-                    interactive=True,
-                )
-                model_size = RangeSlider(
-                    minimum=0,
-                    maximum=8000,
-                    value=(0, 8000),
-                    label="Model Size (#M Parameters)",
-                    interactive=True,
-                )
         with gr.Column(scale=2):
             gr.Markdown(
                 """
@@ -126,7 +129,7 @@ def load_results():
                 """
             )
             with gr.Group():
-                with gr.Row(elem_classes="overflow-y-scroll h-80"):
+                with gr.Row(elem_classes="overflow-y-scroll max-h-80"):
                     with gr.Column():
                         benchmark_select.render()
                         with gr.Accordion("Select Languages", open=False):
@@ -135,13 +138,20 @@
                             type_select.render()
                         with gr.Accordion("Select Domains", open=False):
                             domain_select.render()
-                        # with gr.Accordion("Add and remove tasks:", open=False):
-                        task_select.render()
-    scores = gr.State(default_results.get_scores(format="long"))
-    dataframe = gr.DataFrame(
-        scores_to_table,
-        inputs=[scores],
-    )
+                        with gr.Accordion("Add and remove tasks:", open=False):
+                            task_select.render()
+    default_scores = default_results.get_scores(format="long")
+    scores = gr.State(default_scores)
+    summary, per_task = scores_to_tables(default_scores)
+    with gr.Tab("Summary"):
+        summary_table = gr.DataFrame(summary)
+    with gr.Tab("Performance per task"):
+        per_task_table = gr.DataFrame(per_task)
+
+    @gr.on(inputs=[scores], outputs=[summary_table, per_task_table])
+    def update_tables(scores):
+        summary, per_task = scores_to_tables(scores)
+        return summary, per_task
 
     @gr.on(
         inputs=[benchmark_select],
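The core UI change in app.py is that the single results table becomes two tabbed tables fed from one `gr.State` holding long-format scores, with a `@gr.on` listener (no explicit triggers, so it fires on changes to its inputs) re-rendering both. A self-contained sketch of that wiring, with a hypothetical `make_tables` helper standing in for `scores_to_tables` and made-up model/task names:

```python
import gradio as gr
import pandas as pd


def make_tables(records: list[dict]) -> tuple[pd.DataFrame, pd.DataFrame]:
    # Stand-in for mteb.leaderboard.table.scores_to_tables: build a per-task
    # pivot and a small summary from long-format score records.
    data = pd.DataFrame.from_records(records)
    per_task = data.pivot(index="model_name", columns="task_name", values="score")
    summary = per_task.mean(axis=1).rename("mean").reset_index()
    return summary, per_task.reset_index()


initial = [
    {"model_name": "model-a", "task_name": "TaskA", "score": 0.71},
    {"model_name": "model-a", "task_name": "TaskB", "score": 0.65},
    {"model_name": "model-b", "task_name": "TaskA", "score": 0.68},
    {"model_name": "model-b", "task_name": "TaskB", "score": 0.70},
]

with gr.Blocks() as demo:
    scores = gr.State(initial)  # shared long-format scores
    summary_df, per_task_df = make_tables(initial)
    with gr.Tab("Summary"):
        summary_table = gr.DataFrame(summary_df)
    with gr.Tab("Performance per task"):
        per_task_table = gr.DataFrame(per_task_df)
    add_btn = gr.Button("Add a synthetic record")

    # Updating the state triggers the gr.on listener below, refreshing both tabs.
    @add_btn.click(inputs=[scores], outputs=[scores])
    def add_record(current):
        return current + [{"model_name": "model-c", "task_name": "TaskA", "score": 0.50}]

    @gr.on(inputs=[scores], outputs=[summary_table, per_task_table])
    def update_tables(scores):
        return make_tables(scores)

if __name__ == "__main__":
    demo.launch()
```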
mteb/leaderboard/table.py (50 changes: 47 additions & 3 deletions)
@@ -1,12 +1,19 @@
 from __future__ import annotations
 
+import gradio as gr
 import numpy as np
 import pandas as pd
 
 from mteb.overview import get_task
 
 
-def scores_to_table(scores_long: list[dict]):
+def format_scores(score: float) -> float:
+    return score * 100
+
+
+def scores_to_tables(scores_long: list[dict]):
+    if not scores_long:
+        return gr.DataFrame(), gr.DataFrame()
     data = pd.DataFrame.from_records(scores_long)
     data["task_type"] = data["task_name"].map(
         lambda task_name: get_task(task_name).metadata.type
@@ -27,19 +34,56 @@ def scores_to_table(scores_long: list[dict]):
     per_task = data.pivot(
         index=["model_name", "model_revision"], columns="task_name", values="score"
     )
+    to_remove = per_task.isna().any(axis="columns")
     overall_mean = (
         data.groupby(["model_name", "model_revision"])[["score"]]
         .agg(np.nanmean)
         .rename(columns={"score": "mean"})
     )
-    joint_table = overall_mean.join([typed_mean, mean_per_type, per_task]).reset_index()
+    per_task = per_task[~to_remove]
+    mean_per_type = mean_per_type[~to_remove]
+    overall_mean = overall_mean[~to_remove]
+    mean_rank = per_task.rank(ascending=False, numeric_only=True).mean(
+        axis=1, skipna=True
+    )
+    joint_table = overall_mean.join([typed_mean, mean_per_type])
+    joint_table.insert(0, "mean_rank", mean_rank)
+    joint_table = joint_table.reset_index()
     joint_table = joint_table.sort_values("mean", ascending=False)
+    joint_table["model_name"] = joint_table["model_name"].map(
+        lambda name: name.split("/")[-1]
+    )
     joint_table = joint_table.rename(
         columns={
             "model_name": "Model",
             "mean_by_task_type": "Mean by Task Type",
             "mean": "Mean",
+            "mean_rank": "Mean Rank",
         }
     )
     joint_table = joint_table.drop(columns=["model_revision"])
-    return joint_table
+    joint_table.insert(
+        0, "Rank", joint_table["Mean"].rank(ascending=False).map(int).map(str)
+    )
+    per_task = per_task.rename(
+        columns={
+            "model_name": "Model",
+        }
+    )
+    per_task = per_task.reset_index().drop(columns=["model_revision"])
+    numerics = joint_table.select_dtypes("number").columns
+    to_format = ["Mean", "Mean by Task Type", *mean_per_type.columns]
+    joint_table[to_format] = joint_table[to_format].map(format_scores)
+    joint_table = joint_table.style.highlight_max(
+        subset=to_format,
+        props="font-weight: bold",
+    ).format("{:.2f}", subset=numerics)
+    joint_table = joint_table.highlight_min(
+        subset=["Mean Rank"], props="font-weight: bold"
+    )
+    numerics = per_task.select_dtypes("number").columns
+    per_task[numerics] = per_task[numerics].map(format_scores)
+    per_task = per_task.style.highlight_max(
+        subset=numerics, props="font-weight: bold"
+    ).format("{:.2f}", subset=numerics)
+    return joint_table, per_task
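For reference, a sketch of calling the new `scores_to_tables` directly. The keys below (`model_name`, `model_revision`, `task_name`, `score`) are the columns the pivot and groupby above rely on; real mteb score records may carry extra fields, the model names here are made up, and the task names must resolve through `get_task`, so two real MTEB tasks are used. Both returned values are styled tables, which app.py hands straight to `gr.DataFrame`:

```python
from mteb.leaderboard.table import scores_to_tables

# Minimal long-format records; scores are fractions, format_scores scales them to 0-100.
scores_long = [
    {"model_name": "org/model-a", "model_revision": "main",
     "task_name": "Banking77Classification", "score": 0.81},
    {"model_name": "org/model-a", "model_revision": "main",
     "task_name": "STS12", "score": 0.74},
    {"model_name": "org/model-b", "model_revision": "main",
     "task_name": "Banking77Classification", "score": 0.78},
    {"model_name": "org/model-b", "model_revision": "main",
     "task_name": "STS12", "score": 0.79},
]

summary, per_task = scores_to_tables(scores_long)  # styled summary + per-task tables
```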