diff --git a/docs/__init__.py b/docs/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/benchmarks.md b/docs/benchmarks.md new file mode 100644 index 0000000000..9eb471d187 --- /dev/null +++ b/docs/benchmarks.md @@ -0,0 +1,22 @@ +## Available benchmarks +The following tables give you an overview of the benchmarks in MTEB. + +
+ + + +| Name | # Tasks | Task Types | Domains | Languages | +|------|---------|------------|---------|-----------| +| [CoIR](https://github.com/CoIR-team/coir) | 10 | {'Retrieval': 10} | [Written, Programming] | python,c++,sql,go,eng,php,javascript,ruby,java | +| [MINERSBitextMining](https://arxiv.org/pdf/2406.07424) | 7 | {'BitextMining': 7} | [Written, Social, Reviews] | sun,kaz,tzl,ido,abs,arq,yue,tam,nij,glg,slk,hsb,ber,xho,cbk,pol,uzb,ina,kab,swh,amh,fao,kzj,lfn,uig,sqi,deu,ang,ind,bug,pms,ibo,cym,eus,spa,ceb,tgl,ron,isl,ita,csb,cha,fin,est,pes,jpn,tel,tha,oci,cmn,min,fry,bbc,epo,lit,rus,bos,hrv,war,ara,bjn,mkd,srp,ast,nno,urd,pam,aze,eng,ace,bew,kor,dan,awa,mui,hye,ban,cor,ben,gle,swe,mad,bul,lat,cat,nob,fra,pcm,ell,mar,vie,tat,ukr,gsw,kat,arz,dsb,lvs,nld,tur,bel,max,nds,afr,khm,dtp,yor,ces,gla,zsm,mak,ile,nov,orv,bre,swg,rej,mhr,mon,mal,jav,heb,slv,bhp,kur,wuu,tuk,por,hun,hin,hau,yid | +| [MTEB(Retrieval w/Instructions)](https://arxiv.org/abs/2403.15246) | 3 | {'InstructionRetrieval': 3} | [Written, News] | eng | +| [MTEB(Scandinavian)](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/) | 28 | {'BitextMining': 2, 'Classification': 13, 'Retrieval': 7, 'Clustering': 6} | [Encyclopaedic, Spoken, Non-fiction, Government, News, Fiction, Social, Blog, Reviews, Written, Web, Legal] | nob,fao,swe,isl,dan,nno | +| MTEB(code) | 12 | {'Retrieval': 12} | [Written, Programming] | python,c++,sql,c,go,eng,shell,typescript,php,scala,rust,swift,javascript,ruby,java | +| [MTEB(deu)](https://arxiv.org/html/2401.02709v1) | 19 | {'Classification': 6, 'Clustering': 4, 'PairClassification': 2, 'Reranking': 1, 'Retrieval': 4, 'STS': 2} | [Encyclopaedic, Spoken, News, Reviews, Written, Web] | eng,deu,pol,fra | +| MTEB(eng) | 67 | {'Classification': 12, 'Retrieval': 26, 'Clustering': 11, 'Reranking': 4, 'STS': 10, 'PairClassification': 3, 'Summarization': 1} | [Encyclopaedic, Spoken, Non-fiction, Blog, News, Medical, Social, Programming, Written, Reviews, Web, 
def benchmark_to_markdown_row(b: mteb.Benchmark) -> str:
    """Render one benchmark as a single row of the benchmarks Markdown table.

    Args:
        b: The benchmark to describe. Only ``name``, ``reference`` and
            ``tasks`` are read; for every task ``metadata.domains``,
            ``metadata.languages``, ``metadata.type`` and ``languages`` are
            read.

    Returns:
        A Markdown table row (without a trailing newline) holding the name
        (linked when a reference is set), the number of tasks, the count of
        tasks per task type, the aggregated domains and the aggregated
        languages.
    """
    name = b.name
    name_w_reference = f"[{name}]({b.reference})" if b.reference else name
    n_tasks = f"{len(b.tasks)}"

    agg_domains = set()
    agg_langs = set()
    for t in b.tasks:
        if t.metadata.domains:
            agg_domains.update(t.metadata.domains)
        if t.metadata.languages:
            # NOTE(review): the guard reads `metadata.languages` while the
            # values come from `t.languages` -- presumably intentional, confirm.
            agg_langs.update(t.languages)

    # Sets of strings iterate in a hash-dependent order that can change from
    # one interpreter run to the next; sort so that the generated table is
    # reproducible and does not produce spurious diffs.
    langs = ",".join(sorted(agg_langs))
    domains = "[" + ", ".join(sorted(agg_domains)) + "]" if agg_domains else ""

    # Counter keeps first-seen order, so task types follow the task order.
    task_types = dict(Counter(t.metadata.type for t in b.tasks))

    return f"| {name_w_reference} | {n_tasks} | {task_types} | {domains} | {langs} |"


def create_benchmarks_table(benchmarks: list[mteb.Benchmark]) -> str:
    """Build the complete Markdown table for the given benchmarks.

    Args:
        benchmarks: Benchmarks to list, emitted in the order given.

    Returns:
        The table as a string: a leading blank line, the header row, the
        separator row, then one newline-terminated row per benchmark.
    """
    header = """
| Name | # Tasks | Task Types | Domains | Languages |
|------|---------|------------|---------|-----------|
"""
    rows = [benchmark_to_markdown_row(benchmark) + "\n" for benchmark in benchmarks]
    return header + "".join(rows)


def main():
    """Regenerate the benchmarks table inside ``benchmarks.md`` next to this script."""
    benchmarks = mteb.get_benchmarks()
    # Sort by name so the rows appear in a stable order.
    benchmarks = sorted(benchmarks, key=lambda x: x.name)

    benchmarks_table = create_benchmarks_table(benchmarks)

    file_path = Path(__file__).parent / "benchmarks.md"

    insert_tables(
        file_path,
        tables=[benchmarks_table],
        tags=["BENCHMARKS TABLE"],
    )


if __name__ == "__main__":
    main()