From 4949acadcece5532ddb32ee0e2dd0e38cd16ecc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Wed, 23 Oct 2024 16:00:44 +0200 Subject: [PATCH 01/11] Fixed typos in task_results --- mteb/load_results/task_results.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mteb/load_results/task_results.py b/mteb/load_results/task_results.py index aa9bf58359..c6e827d833 100644 --- a/mteb/load_results/task_results.py +++ b/mteb/load_results/task_results.py @@ -291,12 +291,8 @@ def from_disk(cls, path: Path, load_historic_data: bool = True) -> TaskResult: ) pre_1_11_load = ( - ( - "mteb_version" in data - and Version(data["mteb_version"]) < Version("1.11.0") - ) - or "mteb_version" not in data - ) # assume it is before 1.11.0 if the version is not present + "mteb_version" in data and Version(data["mteb_version"]) < Version("1.11.0") + ) or "mteb_version" not in data # assume it is before 1.11.0 if the version is not present try: obj = cls.model_validate(data) except Exception as e: @@ -492,12 +488,12 @@ def validate_and_filter_scores(self, task: AbsTask | None = None) -> AbsTask: hf_subsets = {"default"} new_scores = {} seen_splits = set() - for split in task_result.scores: + for split in self.scores: if split not in splits: continue new_scores[split] = [] seen_subsets = set() - for _scores in task_result.scores[split]: + for _scores in self.scores[split]: if _scores["hf_subset"] not in hf_subsets: continue new_scores[split].append(_scores) From 692cdc999d2e3ccd919e84442b00575393fc3e2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Wed, 23 Oct 2024 16:01:46 +0200 Subject: [PATCH 02/11] Fixed typos in task_results --- mteb/load_results/task_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/load_results/task_results.py b/mteb/load_results/task_results.py index c6e827d833..0e8a4b4778 100644 --- a/mteb/load_results/task_results.py +++ b/mteb/load_results/task_results.py @@ -505,6 +505,6 @@ def validate_and_filter_scores(self, task: AbsTask | None = None) -> AbsTask: seen_splits.add(split) if seen_splits != set(splits): raise ValueError(f"Missing splits {set(splits) - seen_splits}") - new_res = {**task_result.to_dict(), "scores": new_scores} + new_res = {**self.to_dict(), "scores": new_scores} new_res = TaskResult.from_dict(new_res) return new_res From fc7c1a0bd14a27ac59bf52e007a2cc447f779ee6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Wed, 23 Oct 2024 18:01:00 +0200 Subject: [PATCH 03/11] Added Tailwind, reorganized layout and fixed scrolling --- mteb/leaderboard/app.py | 74 ++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 2ee7acbfa0..770445b286 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -65,25 +65,26 @@ def load_results(): info="Select specific tasks to include", ) -css = """ -.scrollable { - overflow-y: scroll; - max-height: 400px -} +head = """ + """ -with gr.Blocks(fill_width=True, theme=gr.themes.Base(), css=css) as demo: - gr.Markdown( - """ - ### Model Selection - Select models to rank based on an assortment of criteria. - """ - ) - with gr.Group(): - with gr.Row(): - with gr.Column(): +with gr.Blocks(fill_width=True, theme=gr.themes.Base(), head=head) as demo: + with gr.Row(): + with gr.Column(scale=1): + gr.Markdown( + """ + ### Model Selection + Select models to rank based on an assortment of criteria. + """, + ) + with gr.Group(): availability = gr.Radio( - [("Only Open", True), ("Only Proprietary", False), ("Both", None)], + [ + ("Only Open", True), + ("Only Proprietary", False), + ("Both", None), + ], value=None, label="Availability", interactive=True, @@ -99,7 +100,6 @@ def load_results(): label="Compatibility", interactive=True, ) - with gr.Column(): instructions = gr.Radio( [ ("Only Instruction-tuned", True), @@ -117,26 +117,26 @@ def load_results(): label="Model Size (#M Parameters)", interactive=True, ) - - gr.Markdown( - """ - ### Benchmarks - Select one of the hand-curated benchmarks from our publication. - Or create one from scratch based on your use case. - """ - ) - with gr.Group(elem_classes="scrollable"): - with gr.Row(): - with gr.Column(): - benchmark_select.render() - with gr.Row(): - lang_select.render() - type_select.render() - with gr.Row(): - domain_select.render() - with gr.Column(): - # with gr.Accordion("Add and remove tasks:", open=False): - task_select.render() + with gr.Column(scale=2): + gr.Markdown( + """ + ### Benchmarks + Select one of the hand-curated benchmarks from our publication. + Or create one from scratch based on your use case. + """ + ) + with gr.Group(): + with gr.Row(elem_classes="overflow-y-scroll h-80"): + with gr.Column(): + benchmark_select.render() + with gr.Accordion("Select Languages", open=False): + lang_select.render() + with gr.Accordion("Select Task Types", open=False): + type_select.render() + with gr.Accordion("Select Domains", open=False): + domain_select.render() + # with gr.Accordion("Add and remove tasks:", open=False): + task_select.render() scores = gr.State(default_results.get_scores(format="long")) dataframe = gr.DataFrame( scores_to_table, From 9b8a58f4627337ddc4a7f4c1db038d5454d75bda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Thu, 24 Oct 2024 13:09:05 +0200 Subject: [PATCH 04/11] Ran linting --- mteb/load_results/task_results.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mteb/load_results/task_results.py b/mteb/load_results/task_results.py index 0e8a4b4778..b6da0ba304 100644 --- a/mteb/load_results/task_results.py +++ b/mteb/load_results/task_results.py @@ -291,8 +291,12 @@ def from_disk(cls, path: Path, load_historic_data: bool = True) -> TaskResult: ) pre_1_11_load = ( - "mteb_version" in data and Version(data["mteb_version"]) < Version("1.11.0") - ) or "mteb_version" not in data # assume it is before 1.11.0 if the version is not present + ( + "mteb_version" in data + and Version(data["mteb_version"]) < Version("1.11.0") + ) + or "mteb_version" not in data + ) # assume it is before 1.11.0 if the version is not present try: obj = cls.model_validate(data) except Exception as e: From 3339614ad57e8d3fc3eaf3d56303c9913f626f22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Thu, 24 Oct 2024 14:21:24 +0200 Subject: [PATCH 05/11] Removed faux benchmark --- mteb/benchmarks/benchmarks.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index ee8a184449..9c24c525ac 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -7,9 +7,7 @@ from pydantic import AnyUrl, BeforeValidator, TypeAdapter from mteb.abstasks.AbsTask import AbsTask -from mteb.load_results.benchmark_results import ( - BenchmarkResults, -) +from mteb.load_results.benchmark_results import BenchmarkResults from mteb.load_results.load_results import load_results from mteb.overview import get_tasks @@ -64,8 +62,6 @@ def load_results( return base_results.select_tasks(self.tasks) -MTEB_MAIN_MULTILINGUAL = Benchmark(name="MTEB(multilingual)", tasks=get_tasks()) - MTEB_MAIN_EN = Benchmark( name="MTEB(eng)", tasks=get_tasks( From 26f05b9dc192eec4ee17f88120a17f4bad1703ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Thu, 24 Oct 2024 14:22:01 +0200 Subject: [PATCH 06/11] Updated layout --- mteb/leaderboard/app.py | 104 +++++++++++++++++++++----------------- mteb/leaderboard/table.py | 21 ++++++-- 2 files changed, 75 insertions(+), 50 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 770445b286..7d49d009d1 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -7,7 +7,7 @@ from gradio_rangeslider import RangeSlider import mteb -from mteb.leaderboard.table import scores_to_table +from mteb.leaderboard.table import scores_to_tables def load_results(): @@ -27,7 +27,7 @@ def load_results(): benchmarks = mteb.get_benchmarks() -default_benchmark = mteb.get_benchmark("MTEB(multilingual)") +default_benchmark = mteb.get_benchmark("MTEB(Multilingual)") default_results = default_benchmark.load_results(base_results=all_results) benchmark_select = gr.Dropdown( @@ -60,6 +60,7 @@ def load_results(): task_select = gr.Dropdown( default_results.task_names, value=default_results.task_names, + allow_custom_value=True, multiselect=True, label="Task", info="Select specific tasks to include", @@ -79,44 +80,46 @@ def load_results(): """, ) with gr.Group(): - availability = gr.Radio( - [ - ("Only Open", True), - ("Only Proprietary", False), - ("Both", None), - ], - value=None, - label="Availability", - interactive=True, - ) - compatibility = gr.CheckboxGroup( - [ - ( - "Should be sentence-transformers compatible", - "sbert_compatible", + with gr.Row(elem_classes="overflow-y-scroll max-h-80"): + with gr.Column(): + availability = gr.Radio( + [ + ("Only Open", True), + ("Only Proprietary", False), + ("Both", None), + ], + value=None, + label="Availability", + interactive=True, + ) + compatibility = gr.CheckboxGroup( + [ + ( + "Should be sentence-transformers compatible", + "sbert_compatible", + ) + ], + value=[], + label="Compatibility", + interactive=True, + ) + instructions = gr.Radio( + [ + ("Only Instruction-tuned", True), + ("Only non-instruction", False), + ("Both", None), + ], + value=None, + label="Instructions", + interactive=True, + ) + model_size = RangeSlider( + minimum=0, + maximum=8000, + value=(0, 8000), + label="Model Size (#M Parameters)", + interactive=True, ) - ], - value=[], - label="Compatibility", - interactive=True, - ) - instructions = gr.Radio( - [ - ("Only Instruction-tuned", True), - ("Only non-instruction", False), - ("Both", None), - ], - value=None, - label="Instructions", - interactive=True, - ) - model_size = RangeSlider( - minimum=0, - maximum=8000, - value=(0, 8000), - label="Model Size (#M Parameters)", - interactive=True, - ) with gr.Column(scale=2): gr.Markdown( """ @@ -126,7 +129,7 @@ def load_results(): """ ) with gr.Group(): - with gr.Row(elem_classes="overflow-y-scroll h-80"): + with gr.Row(elem_classes="overflow-y-scroll max-h-80"): with gr.Column(): benchmark_select.render() with gr.Accordion("Select Languages", open=False): @@ -135,13 +138,20 @@ def load_results(): type_select.render() with gr.Accordion("Select Domains", open=False): domain_select.render() - # with gr.Accordion("Add and remove tasks:", open=False): - task_select.render() - scores = gr.State(default_results.get_scores(format="long")) - dataframe = gr.DataFrame( - scores_to_table, - inputs=[scores], - ) + with gr.Accordion("Add and remove tasks:", open=False): + task_select.render() + default_scores = default_results.get_scores(format="long") + scores = gr.State(default_scores) + summary, per_task = scores_to_tables(default_scores) + with gr.Tab("Summary"): + summary_table = gr.DataFrame(summary) + with gr.Tab("Performance per task"): + per_task_table = gr.DataFrame(per_task) + + @gr.on(inputs=[scores], outputs=[summary_table, per_task_table]) + def update_tables(scores): + summary, per_task = scores_to_tables(scores) + return summary, per_task @gr.on( inputs=[benchmark_select], diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index b2ec0384f8..54b1bb1499 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -1,12 +1,15 @@ from __future__ import annotations +import gradio as gr import numpy as np import pandas as pd from mteb.overview import get_task -def scores_to_table(scores_long: list[dict]): +def scores_to_tables(scores_long: list[dict]): + if not scores_long: + return gr.DataFrame(), gr.DataFrame() data = pd.DataFrame.from_records(scores_long) data["task_type"] = data["task_name"].map( lambda task_name: get_task(task_name).metadata.type @@ -27,12 +30,16 @@ def scores_to_table(scores_long: list[dict]): per_task = data.pivot( index=["model_name", "model_revision"], columns="task_name", values="score" ) + to_remove = per_task.isna().any(axis="columns") overall_mean = ( data.groupby(["model_name", "model_revision"])[["score"]] .agg(np.nanmean) .rename(columns={"score": "mean"}) ) - joint_table = overall_mean.join([typed_mean, mean_per_type, per_task]).reset_index() + per_task = per_task[~to_remove] + mean_per_type = mean_per_type[~to_remove] + overall_mean = overall_mean[~to_remove] + joint_table = overall_mean.join([typed_mean, mean_per_type]).reset_index() joint_table = joint_table.sort_values("mean", ascending=False) joint_table = joint_table.rename( columns={ @@ -42,4 +49,12 @@ def scores_to_table(scores_long: list[dict]): } ) joint_table = joint_table.drop(columns=["model_revision"]) - return joint_table + per_task = per_task.rename( + columns={ + "model_name": "Model", + "mean_by_task_type": "Mean by Task Type", + "mean": "Mean", + } + ) + per_task = per_task.reset_index().drop(columns=["model_revision"]) + return joint_table, per_task From 2417f5195cd0c9769e393c0978397e1f520a9bdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Thu, 24 Oct 2024 14:37:00 +0200 Subject: [PATCH 07/11] Changed table number format --- mteb/leaderboard/table.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index 54b1bb1499..772eb10984 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -7,6 +7,10 @@ from mteb.overview import get_task +def format_scores(score: float) -> str: + return f"{score*100:.2f}" + + def scores_to_tables(scores_long: list[dict]): if not scores_long: return gr.DataFrame(), gr.DataFrame() @@ -57,4 +61,8 @@ def scores_to_tables(scores_long: list[dict]): } ) per_task = per_task.reset_index().drop(columns=["model_revision"]) + numerics = joint_table.select_dtypes("number").columns + joint_table[numerics] = joint_table[numerics].map(format_scores) + numerics = per_task.select_dtypes("number").columns + per_task[numerics] = per_task[numerics].map(format_scores) return joint_table, per_task From 467287e70e3c1c6a3622921e68d85d3b581817e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Thu, 24 Oct 2024 14:48:02 +0200 Subject: [PATCH 08/11] Table highlights highest values by making them bold --- mteb/leaderboard/table.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index 772eb10984..e4e51a2f1b 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -63,6 +63,10 @@ def scores_to_tables(scores_long: list[dict]): per_task = per_task.reset_index().drop(columns=["model_revision"]) numerics = joint_table.select_dtypes("number").columns joint_table[numerics] = joint_table[numerics].map(format_scores) + joint_table = joint_table.style.highlight_max( + subset=numerics, props="font-weight: bold" + ) numerics = per_task.select_dtypes("number").columns per_task[numerics] = per_task[numerics].map(format_scores) + per_task = per_task.style.highlight_max(subset=numerics, props="font-weight: bold") return joint_table, per_task From 7c14b2bfdc3cd720762c0aa8ae95876db830721e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Thu, 24 Oct 2024 15:51:24 +0200 Subject: [PATCH 09/11] Added rank to table, removed organization from model_name --- mteb/leaderboard/table.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index e4e51a2f1b..3fcaa34721 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -45,6 +45,9 @@ def scores_to_tables(scores_long: list[dict]): overall_mean = overall_mean[~to_remove] joint_table = overall_mean.join([typed_mean, mean_per_type]).reset_index() joint_table = joint_table.sort_values("mean", ascending=False) + joint_table["model_name"] = joint_table["model_name"].map( + lambda name: name.split("/")[-1] + ) joint_table = joint_table.rename( columns={ "model_name": "Model", @@ -53,11 +56,12 @@ def scores_to_tables(scores_long: list[dict]): } ) joint_table = joint_table.drop(columns=["model_revision"]) + joint_table.insert( + 0, "Rank", joint_table["Mean"].rank(ascending=False).map(int).map(str) + ) per_task = per_task.rename( columns={ "model_name": "Model", - "mean_by_task_type": "Mean by Task Type", - "mean": "Mean", } ) per_task = per_task.reset_index().drop(columns=["model_revision"]) From 6a2b64d990f53ae06ec98e8f5b4e373013ca4434 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Thu, 24 Oct 2024 16:17:20 +0200 Subject: [PATCH 10/11] Added mean rank to table --- mteb/leaderboard/table.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index 3fcaa34721..570d5bc6dd 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -7,8 +7,8 @@ from mteb.overview import get_task -def format_scores(score: float) -> str: - return f"{score*100:.2f}" +def format_scores(score: float) -> float: + return score * 100 def scores_to_tables(scores_long: list[dict]): @@ -43,7 +43,12 @@ def scores_to_tables(scores_long: list[dict]): per_task = per_task[~to_remove] mean_per_type = mean_per_type[~to_remove] overall_mean = overall_mean[~to_remove] - joint_table = overall_mean.join([typed_mean, mean_per_type]).reset_index() + mean_rank = per_task.rank(ascending=False, numeric_only=True).mean( + axis=1, skipna=True + ) + joint_table = overall_mean.join([typed_mean, mean_per_type]) + joint_table.insert(0, "mean_rank", mean_rank) + joint_table = joint_table.reset_index() joint_table = joint_table.sort_values("mean", ascending=False) joint_table["model_name"] = joint_table["model_name"].map( lambda name: name.split("/")[-1] @@ -53,6 +58,7 @@ def scores_to_tables(scores_long: list[dict]): "model_name": "Model", "mean_by_task_type": "Mean by Task Type", "mean": "Mean", + "mean_rank": "Mean Rank", } ) joint_table = joint_table.drop(columns=["model_revision"]) @@ -66,11 +72,18 @@ def scores_to_tables(scores_long: list[dict]): ) per_task = per_task.reset_index().drop(columns=["model_revision"]) numerics = joint_table.select_dtypes("number").columns - joint_table[numerics] = joint_table[numerics].map(format_scores) + to_format = ["Mean", "Mean by Task Type", *mean_per_type.columns] + joint_table[to_format] = joint_table[to_format].map(format_scores) joint_table = joint_table.style.highlight_max( - subset=numerics, props="font-weight: bold" + subset=to_format, + props="font-weight: bold", + ).format("{:.2f}", subset=numerics) + joint_table = joint_table.highlight_min( + subset=["Mean Rank"], props="font-weight: bold" ) numerics = per_task.select_dtypes("number").columns per_task[numerics] = per_task[numerics].map(format_scores) - per_task = per_task.style.highlight_max(subset=numerics, props="font-weight: bold") + per_task = per_task.style.highlight_max( + subset=numerics, props="font-weight: bold" + ).format("{:.2f}", subset=numerics) return joint_table, per_task From 544ebb70e4d55bccb0ce9f6ba267ca3d2bcbdb32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Thu, 24 Oct 2024 16:19:40 +0200 Subject: [PATCH 11/11] Ran linting --- mteb/benchmarks/benchmarks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 7eaa3b2986..9c24c525ac 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -61,6 +61,7 @@ def load_results( base_results = load_results() return base_results.select_tasks(self.tasks) + MTEB_MAIN_EN = Benchmark( name="MTEB(eng)", tasks=get_tasks(