docs: Create benchmarks overview table (#1245)

* fix get_benchmarks method * add create benchmark script * make lint
embeddings-benchmark · Sep 27, 2024 · fda9be1 · fda9be1
1 parent 3c06694
commit fda9be1
Show file tree

Hide file tree

Showing 5 changed files with 83 additions and 2 deletions.
diff --git a/docs/__init__.py b/docs/__init__.py
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
@@ -0,0 +1,22 @@
+## Available benchmarks
+The following table gives you an overview of the benchmarks in MTEB.
+
+<details>
+
+<!-- This allows the table to be autogenerated in the future: -->
+<!-- BENCHMARKS TABLE START -->
+| Name | # Tasks | Task Types | Domains | Languages |
+|------|---------|------------|---------|-----------|
+| [CoIR](https://github.com/CoIR-team/coir) | 10 | {'Retrieval': 10} | [Written, Programming] | python,c++,sql,go,eng,php,javascript,ruby,java |
+| [MINERSBitextMining](https://arxiv.org/pdf/2406.07424) | 7 | {'BitextMining': 7} | [Written, Social, Reviews] | sun,kaz,tzl,ido,abs,arq,yue,tam,nij,glg,slk,hsb,ber,xho,cbk,pol,uzb,ina,kab,swh,amh,fao,kzj,lfn,uig,sqi,deu,ang,ind,bug,pms,ibo,cym,eus,spa,ceb,tgl,ron,isl,ita,csb,cha,fin,est,pes,jpn,tel,tha,oci,cmn,min,fry,bbc,epo,lit,rus,bos,hrv,war,ara,bjn,mkd,srp,ast,nno,urd,pam,aze,eng,ace,bew,kor,dan,awa,mui,hye,ban,cor,ben,gle,swe,mad,bul,lat,cat,nob,fra,pcm,ell,mar,vie,tat,ukr,gsw,kat,arz,dsb,lvs,nld,tur,bel,max,nds,afr,khm,dtp,yor,ces,gla,zsm,mak,ile,nov,orv,bre,swg,rej,mhr,mon,mal,jav,heb,slv,bhp,kur,wuu,tuk,por,hun,hin,hau,yid |
+| [MTEB(Retrieval w/Instructions)](https://arxiv.org/abs/2403.15246) | 3 | {'InstructionRetrieval': 3} | [Written, News] | eng |
+| [MTEB(Scandinavian)](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/) | 28 | {'BitextMining': 2, 'Classification': 13, 'Retrieval': 7, 'Clustering': 6} | [Encyclopaedic, Spoken, Non-fiction, Government, News, Fiction, Social, Blog, Reviews, Written, Web, Legal] | nob,fao,swe,isl,dan,nno |
+| MTEB(code) | 12 | {'Retrieval': 12} | [Written, Programming] | python,c++,sql,c,go,eng,shell,typescript,php,scala,rust,swift,javascript,ruby,java |
+| [MTEB(deu)](https://arxiv.org/html/2401.02709v1) | 19 | {'Classification': 6, 'Clustering': 4, 'PairClassification': 2, 'Reranking': 1, 'Retrieval': 4, 'STS': 2} | [Encyclopaedic, Spoken, News, Reviews, Written, Web] | eng,deu,pol,fra |
+| MTEB(eng) | 67 | {'Classification': 12, 'Retrieval': 26, 'Clustering': 11, 'Reranking': 4, 'STS': 10, 'PairClassification': 3, 'Summarization': 1} | [Encyclopaedic, Spoken, Non-fiction, Blog, News, Medical, Social, Programming, Written, Reviews, Web, Academic] | tur,fra,eng,cmn,pol,ita,nld,spa,deu,ara |
+| [MTEB(fra)](https://arxiv.org/abs/2405.20468) | 26 | {'Classification': 6, 'Clustering': 7, 'PairClassification': 2, 'Reranking': 2, 'Retrieval': 5, 'STS': 3, 'Summarization': 1} | [Encyclopaedic, Spoken, Non-fiction, News, Social, Reviews, Written, Web, Legal, Academic] | eng,deu,pol,fra |
+| MTEB(kor) | 6 | {'Classification': 1, 'Reranking': 1, 'Retrieval': 2, 'STS': 2} | [Encyclopaedic, Spoken, News, Reviews, Written, Web] | kor |
+| [MTEB(law)](https://aclanthology.org/2023.eacl-main.148/) | 8 | {'Retrieval': 8} | [Written, Legal] | eng,deu,zho |
+| [MTEB(pol)](https://arxiv.org/abs/2405.10138) | 18 | {'Classification': 7, 'Clustering': 3, 'PairClassification': 4, 'STS': 4} | [Spoken, Non-fiction, News, Fiction, Social, Written, Web, Legal, Academic] | pol,deu,eng,fra |
+| [MTEB(rus)](https://aclanthology.org/2023.eacl-main.148/) | 23 | {'Classification': 9, 'Clustering': 3, 'MultilabelClassification': 2, 'PairClassification': 1, 'Reranking': 2, 'Retrieval': 3, 'STS': 3} | [Encyclopaedic, Spoken, Blog, News, Social, Reviews, Written, Web, Academic] | rus |
+<!-- BENCHMARKS TABLE END -->
+
+</details>
diff --git a/docs/create_benchmarks_table.py b/docs/create_benchmarks_table.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+from collections import Counter
+from pathlib import Path
+
+import mteb
+from docs.create_tasks_table import insert_tables
+
+
+def benchmark_to_markdown_row(b: mteb.Benchmark) -> str:
+    name = b.name
+    name_w_reference = f"[{name}]({b.reference})" if b.reference else name
+    num_tasks = len(b.tasks)
+    n_tasks = f"{num_tasks}"
+
+    agg_domains = set()
+    agg_langs = set()
+    for t in b.tasks:
+        if t.metadata.domains:
+            agg_domains.update(t.metadata.domains)
+        if t.metadata.languages:
+            agg_langs.update(t.languages)
+
+    langs = ",".join(list(agg_langs))
+    domains = "[" + ", ".join(agg_domains) + "]" if agg_domains else ""
+
+    task_types = dict(Counter([t.metadata.type for t in b.tasks]))
+
+    return f"| {name_w_reference} | {n_tasks} | {task_types} | {domains} | {langs} |"
+
+
+def create_benchmarks_table(benchmarks: list[mteb.Benchmark]) -> str:
+    table = """
+| Name | # Tasks | Task Types | Domains | Languages |
+|------|---------|------------|---------|-----------|
+"""
+    for benchmark in benchmarks:
+        table += benchmark_to_markdown_row(benchmark) + "\n"
+    return table
+
+
+def main():
+    benchmarks = mteb.get_benchmarks()
+    benchmarks = sorted(benchmarks, key=lambda x: x.name)
+
+    benchmarks_table = create_benchmarks_table(benchmarks)
+
+    file_path = Path(__file__).parent / "benchmarks.md"
+
+    insert_tables(
+        file_path,
+        tables=[benchmarks_table],
+        tags=["BENCHMARKS TABLE"],
+    )
+
+
+# Run the table generation only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/mteb/__init__.py b/mteb/__init__.py
@@ -15,7 +15,7 @@
 from mteb.overview import TASKS_REGISTRY, get_task, get_tasks
 
 from .benchmarks.benchmarks import Benchmark
-from .benchmarks.get_benchmark import get_benchmark
+from .benchmarks.get_benchmark import get_benchmark, get_benchmarks
 
 __version__ = version("mteb")  # fetch version from install metadata
 
@@ -34,4 +34,5 @@
     "load_results",
     "Benchmark",
     "get_benchmark",
+    "get_benchmarks",
 ]
diff --git a/mteb/benchmarks/get_benchmark.py b/mteb/benchmarks/get_benchmark.py
@@ -28,7 +28,7 @@ def get_benchmark(
 
 
 def get_benchmarks(
-    names: list[str] | None,
+    names: list[str] | None = None,
 ) -> list[Benchmark]:
     if names is None:
         names = list(BENCHMARK_REGISTRY.keys())