Skip to content

Commit

Permalink
docs: Create benchmarks overview table (#1245)
Browse files Browse the repository at this point in the history
* fix get_benchmarks method

* add create benchmark script

* make lint
  • Loading branch information
isaac-chung authored Sep 27, 2024
1 parent 3c06694 commit fda9be1
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 2 deletions.
Empty file added docs/__init__.py
Empty file.
22 changes: 22 additions & 0 deletions docs/benchmarks.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
## Available benchmarks
The following table gives you an overview of the benchmarks available in MTEB.

<details>

<!-- This allows the table to be autogenerated in the future: -->
<!-- BENCHMARKS TABLE START -->
| Name | # Tasks | Task Types | Domains | Languages |
|------|---------|------------|---------|-----------|
| [CoIR](https://github.com/CoIR-team/coir) | 10 | {'Retrieval': 10} | [Written, Programming] | python,c++,sql,go,eng,php,javascript,ruby,java |
| [MINERSBitextMining](https://arxiv.org/pdf/2406.07424) | 7 | {'BitextMining': 7} | [Written, Social, Reviews] | sun,kaz,tzl,ido,abs,arq,yue,tam,nij,glg,slk,hsb,ber,xho,cbk,pol,uzb,ina,kab,swh,amh,fao,kzj,lfn,uig,sqi,deu,ang,ind,bug,pms,ibo,cym,eus,spa,ceb,tgl,ron,isl,ita,csb,cha,fin,est,pes,jpn,tel,tha,oci,cmn,min,fry,bbc,epo,lit,rus,bos,hrv,war,ara,bjn,mkd,srp,ast,nno,urd,pam,aze,eng,ace,bew,kor,dan,awa,mui,hye,ban,cor,ben,gle,swe,mad,bul,lat,cat,nob,fra,pcm,ell,mar,vie,tat,ukr,gsw,kat,arz,dsb,lvs,nld,tur,bel,max,nds,afr,khm,dtp,yor,ces,gla,zsm,mak,ile,nov,orv,bre,swg,rej,mhr,mon,mal,jav,heb,slv,bhp,kur,wuu,tuk,por,hun,hin,hau,yid |
| [MTEB(Retrieval w/Instructions)](https://arxiv.org/abs/2403.15246) | 3 | {'InstructionRetrieval': 3} | [Written, News] | eng |
| [MTEB(Scandinavian)](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/) | 28 | {'BitextMining': 2, 'Classification': 13, 'Retrieval': 7, 'Clustering': 6} | [Encyclopaedic, Spoken, Non-fiction, Government, News, Fiction, Social, Blog, Reviews, Written, Web, Legal] | nob,fao,swe,isl,dan,nno |
| MTEB(code) | 12 | {'Retrieval': 12} | [Written, Programming] | python,c++,sql,c,go,eng,shell,typescript,php,scala,rust,swift,javascript,ruby,java |
| [MTEB(deu)](https://arxiv.org/html/2401.02709v1) | 19 | {'Classification': 6, 'Clustering': 4, 'PairClassification': 2, 'Reranking': 1, 'Retrieval': 4, 'STS': 2} | [Encyclopaedic, Spoken, News, Reviews, Written, Web] | eng,deu,pol,fra |
| MTEB(eng) | 67 | {'Classification': 12, 'Retrieval': 26, 'Clustering': 11, 'Reranking': 4, 'STS': 10, 'PairClassification': 3, 'Summarization': 1} | [Encyclopaedic, Spoken, Non-fiction, Blog, News, Medical, Social, Programming, Written, Reviews, Web, Academic] | tur,fra,eng,cmn,pol,ita,nld,spa,deu,ara |
| [MTEB(fra)](https://arxiv.org/abs/2405.20468) | 26 | {'Classification': 6, 'Clustering': 7, 'PairClassification': 2, 'Reranking': 2, 'Retrieval': 5, 'STS': 3, 'Summarization': 1} | [Encyclopaedic, Spoken, Non-fiction, News, Social, Reviews, Written, Web, Legal, Academic] | eng,deu,pol,fra |
| MTEB(kor) | 6 | {'Classification': 1, 'Reranking': 1, 'Retrieval': 2, 'STS': 2} | [Encyclopaedic, Spoken, News, Reviews, Written, Web] | kor |
| [MTEB(law)](https://aclanthology.org/2023.eacl-main.148/) | 8 | {'Retrieval': 8} | [Written, Legal] | eng,deu,zho |
| [MTEB(pol)](https://arxiv.org/abs/2405.10138) | 18 | {'Classification': 7, 'Clustering': 3, 'PairClassification': 4, 'STS': 4} | [Spoken, Non-fiction, News, Fiction, Social, Written, Web, Legal, Academic] | pol,deu,eng,fra |
| [MTEB(rus)](https://aclanthology.org/2023.eacl-main.148/) | 23 | {'Classification': 9, 'Clustering': 3, 'MultilabelClassification': 2, 'PairClassification': 1, 'Reranking': 2, 'Retrieval': 3, 'STS': 3} | [Encyclopaedic, Spoken, Blog, News, Social, Reviews, Written, Web, Academic] | rus |
<!-- BENCHMARKS TABLE END -->

</details>
58 changes: 58 additions & 0 deletions docs/create_benchmarks_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from __future__ import annotations

from collections import Counter
from pathlib import Path

import mteb
from docs.create_tasks_table import insert_tables


def benchmark_to_markdown_row(b: mteb.Benchmark) -> str:
    """Render one benchmark as a single row of the benchmarks markdown table.

    The row has five cells: the benchmark name (rendered as a markdown link
    when ``b.reference`` is truthy), the number of tasks, a ``dict`` of
    task-type counts, the aggregated domains and the aggregated languages.

    Domains and languages are collected into sets and then sorted before
    being joined. Set iteration order for strings varies between interpreter
    runs (hash randomization), so without sorting the generated table would
    change on every regeneration even when the benchmarks did not.

    Args:
        b: The benchmark to describe. Reads ``name``, ``reference`` and
            ``tasks``; for each task reads ``metadata.domains``,
            ``metadata.languages``, ``metadata.type`` and ``languages``.

    Returns:
        A markdown table row without a trailing newline.
    """
    name = b.name
    name_w_reference = f"[{name}]({b.reference})" if b.reference else name
    n_tasks = f"{len(b.tasks)}"

    agg_domains = set()
    agg_langs = set()
    for t in b.tasks:
        if t.metadata.domains:
            agg_domains.update(t.metadata.domains)
        # NOTE(review): the guard checks ``metadata.languages`` but the values
        # are taken from ``t.languages``; kept as in the original -- confirm
        # the two are meant to differ.
        if t.metadata.languages:
            agg_langs.update(t.languages)

    # Sort for a stable, reproducible rendering of the set contents.
    langs = ",".join(sorted(agg_langs))
    domains = "[" + ", ".join(sorted(agg_domains)) + "]" if agg_domains else ""

    # Counter keeps first-seen order, so this follows the order of b.tasks.
    task_types = dict(Counter(t.metadata.type for t in b.tasks))

    return f"| {name_w_reference} | {n_tasks} | {task_types} | {domains} | {langs} |"


def create_benchmarks_table(benchmarks: list[mteb.Benchmark]) -> str:
    """Build the markdown table listing the given benchmarks.

    The result starts with a newline, followed by the header row, the
    separator row, and then one newline-terminated row per benchmark in the
    order given.
    """
    header = (
        "\n"
        "| Name | # Tasks | Task Types | Domains | Languages |\n"
        "|------|---------|------------|---------|-----------|\n"
    )
    rows = [benchmark_to_markdown_row(entry) + "\n" for entry in benchmarks]
    return header + "".join(rows)


def main():
    """Regenerate the benchmarks table inside ``benchmarks.md``.

    Fetches all benchmarks via ``mteb.get_benchmarks()``, orders them by
    name, renders the markdown table and hands it to ``insert_tables`` for
    the "BENCHMARKS TABLE" tag in the ``benchmarks.md`` file located next to
    this script.
    """
    ordered = sorted(mteb.get_benchmarks(), key=lambda bench: bench.name)
    rendered = create_benchmarks_table(ordered)

    target = Path(__file__).parent / "benchmarks.md"
    insert_tables(target, tables=[rendered], tags=["BENCHMARKS TABLE"])


# Only regenerate the docs when executed as a script, not on import.
if __name__ == "__main__":
    main()
3 changes: 2 additions & 1 deletion mteb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from mteb.overview import TASKS_REGISTRY, get_task, get_tasks

from .benchmarks.benchmarks import Benchmark
from .benchmarks.get_benchmark import get_benchmark
from .benchmarks.get_benchmark import get_benchmark, get_benchmarks

__version__ = version("mteb") # fetch version from install metadata

Expand All @@ -34,4 +34,5 @@
"load_results",
"Benchmark",
"get_benchmark",
"get_benchmarks",
]
2 changes: 1 addition & 1 deletion mteb/benchmarks/get_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def get_benchmark(


def get_benchmarks(
names: list[str] | None,
names: list[str] | None = None,
) -> list[Benchmark]:
if names is None:
names = list(BENCHMARK_REGISTRY.keys())
Expand Down

0 comments on commit fda9be1

Please sign in to comment.