From bf086c5c3c918cb59496487568f1c20feebff281 Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen
Date: Mon, 9 Sep 2024 14:35:34 +0200
Subject: [PATCH 1/6] Normalize benchmarks to only include tasks

- Force benchmarks to only include tasks. This fixes a few bugs where benchmarks could reference a task which is not implemented
- Implements `mteb.get_benchmark`, which makes it easier to fetch benchmarks
- Added tests + updated docs

A few outstanding issues:

I would like `mteb.MTEB(benchmark)` to always reproduce the benchmark. Currently this is not possible, as MTEB(eng) requires the split to be specified. A solution is to allow "eval_splits" to be specified when initializing a task and then pass it on to `load_data()`. This way we can write the following: `mteb.get_tasks(tasks=[...], eval_splits=["test"], ...)`

I would also love the aggregation to be a part of the benchmark (such that it is clear how it should be aggregated). This is especially relevant for MTEB(eng), as it averages the CQAD datasets before creating the global average. This way we can also create a result object for the benchmark itself. A complementary solution for this is to allow nested benchmarks.
---
 README.md                              |  20 +-
 mteb/__init__.py                       |   6 +-
 mteb/benchmarks/__init__.py            |   3 +
 mteb/{ => benchmarks}/benchmarks.py    | 323 ++++++++++++++----------
 mteb/benchmarks/get_benchmark.py       |  27 +++
 tests/test_benchmark/test_benchmark.py |  21 +-
 6 files changed, 249 insertions(+), 151 deletions(-)
 create mode 100644 mteb/benchmarks/__init__.py
 rename mteb/{ => benchmarks}/benchmarks.py (63%)
 create mode 100644 mteb/benchmarks/get_benchmark.py

diff --git a/README.md b/README.md
index e08545aec6..0d50acbebc 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ pip install mteb

 ## Usage

-* Using a python script (see [scripts/run_mteb_english.py](https://github.com/embeddings-benchmark/mteb/blob/main/scripts/run_mteb_english.py) and [mteb/mtebscripts](https://github.com/embeddings-benchmark/mtebscripts) for more):
+* Using a python script:

 ```python
 import mteb
...
@@ -77,11 +77,11 @@
 Click on each section below to see the details.

 <details>
- Dataset selection + Task selection -### Dataset selection +### Task selection -Datasets can be selected by providing the list of datasets, but also +Tasks can be selected by providing the list of datasets, but also * by their task (e.g. "Clustering" or "Classification") @@ -121,11 +121,17 @@ evaluation = mteb.MTEB(tasks=[ # for an example of a HF subset see "Subset" in the dataset viewer at: https://huggingface.co/datasets/mteb/bucc-bitext-mining ``` -There are also presets available for certain task collections, e.g. to select the 56 English datasets that form the "Overall MTEB English leaderboard": +
+ +
+ Running a benchmark + +`mteb` comes with a set of predefined benchmarks. These can be fetched using `get_benchmark` and run in a similar fashion to other sets of tasks. +For instance to select the 56 English datasets that form the "Overall MTEB English leaderboard": ```python -from mteb import MTEB_MAIN_EN -evaluation = mteb.MTEB(tasks=MTEB_MAIN_EN, task_langs=["en"]) +mteb_eng = mteb.get_benchmark("MTEB(eng)") +evaluation = mteb.MTEB(tasks=mteb_eng, eval_splits=["test"]) ```
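The snippet above only fetches the benchmark; end to end, a fetched benchmark plugs into the usual evaluation flow. Below is a minimal sketch of that flow, assuming the API introduced in this patch; the model name is purely an illustrative choice and not something the patch prescribes:

```python
import mteb
from sentence_transformers import SentenceTransformer

# Any encoder supported by MTEB works here; this model name is just an example.
model = SentenceTransformer("sentence-transformers/average_word_embeddings_komninos")

# Fetch the predefined benchmark and evaluate the model on its tasks.
benchmark = mteb.get_benchmark("MTEB(eng)")
evaluation = mteb.MTEB(tasks=benchmark)
results = evaluation.run(model, output_folder="results")
```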
diff --git a/mteb/__init__.py b/mteb/__init__.py index be5edd97ed..2b98827014 100644 --- a/mteb/__init__.py +++ b/mteb/__init__.py @@ -2,7 +2,7 @@ from importlib.metadata import version -from mteb.benchmarks import ( +from mteb.benchmarks.benchmarks import ( MTEB_MAIN_EN, MTEB_MAIN_RU, MTEB_RETRIEVAL_LAW, @@ -14,7 +14,8 @@ from mteb.models import get_model, get_model_meta from mteb.overview import TASKS_REGISTRY, get_task, get_tasks -from .benchmarks import Benchmark +from .benchmarks.benchmarks import Benchmark +from .benchmarks.get_benchmark import get_benchmark __version__ = version("mteb") # fetch version from install metadata @@ -32,4 +33,5 @@ "get_model_meta", "load_results", "Benchmark", + "get_benchmark", ] diff --git a/mteb/benchmarks/__init__.py b/mteb/benchmarks/__init__.py new file mode 100644 index 0000000000..fb1d12a293 --- /dev/null +++ b/mteb/benchmarks/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from mteb.benchmarks.benchmarks import * diff --git a/mteb/benchmarks.py b/mteb/benchmarks/benchmarks.py similarity index 63% rename from mteb/benchmarks.py rename to mteb/benchmarks/benchmarks.py index 9485230a62..e94ba1c8f1 100644 --- a/mteb/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -3,16 +3,34 @@ from dataclasses import dataclass from typing import Sequence +from pydantic import AnyUrl, BeforeValidator, TypeAdapter +from typing_extensions import Annotated + from mteb.abstasks.AbsTask import AbsTask from mteb.overview import get_tasks +http_url_adapter = TypeAdapter(AnyUrl) +STR_URL = Annotated[ + str, BeforeValidator(lambda value: str(http_url_adapter.validate_python(value))) +] # Allows the type to be a string, but ensures that the string is a URL + @dataclass class Benchmark: + """A benchmark object intended to run certain a full benchmark within MTEB. + + Args: + name: The name of the benchmark + tasks: The tasks within the benchmark. + description: A description of the benchmark, should include its intended goal and potentially a description of its construction + reference: A link reference, to a source containing additional information typically to a paper, leaderboard or github. 
+ citation: A bibtex citation + """ + name: str - tasks: Sequence[str] | Sequence[AbsTask] + tasks: Sequence[AbsTask] description: str | None = None - reference: str | None = None + reference: STR_URL | None = None citation: str | None = None def __iter__(self): @@ -25,77 +43,91 @@ def __getitem__(self, index): return self.tasks[index] +def create_benchmark_list() -> list[type[Benchmark]]: + benchmark_categories_cls = list(Benchmark.__subclasses__()) + benchmarks = [ + cls + for cat_cls in benchmark_categories_cls + for cls in cat_cls.__subclasses__() + if cat_cls.__name__.startswith("Benchmark") + ] + return benchmarks + + MTEB_MAIN_EN = Benchmark( name="MTEB(eng)", - tasks=[ - "AmazonCounterfactualClassification", - "AmazonPolarityClassification", - "AmazonReviewsClassification", - "ArguAna", - "ArxivClusteringP2P", - "ArxivClusteringS2S", - "AskUbuntuDupQuestions", - "BIOSSES", - "Banking77Classification", - "BiorxivClusteringP2P", - "BiorxivClusteringS2S", - "CQADupstackAndroidRetrieval", - "CQADupstackEnglishRetrieval", - "CQADupstackGamingRetrieval", - "CQADupstackGisRetrieval", - "CQADupstackMathematicaRetrieval", - "CQADupstackPhysicsRetrieval", - "CQADupstackProgrammersRetrieval", - "CQADupstackStatsRetrieval", - "CQADupstackTexRetrieval", - "CQADupstackUnixRetrieval", - "CQADupstackWebmastersRetrieval", - "CQADupstackWordpressRetrieval", - "ClimateFEVER", - "DBPedia", - "EmotionClassification", - "FEVER", - "FiQA2018", - "HotpotQA", - "ImdbClassification", - "MSMARCO", - "MTOPDomainClassification", - "MTOPIntentClassification", - "MassiveIntentClassification", - "MassiveScenarioClassification", - "MedrxivClusteringP2P", - "MedrxivClusteringS2S", - "MindSmallReranking", - "NFCorpus", - "NQ", - "QuoraRetrieval", - "RedditClustering", - "RedditClusteringP2P", - "SCIDOCS", - "SICK-R", - "STS12", - "STS13", - "STS14", - "STS15", - "STS16", - "STS17", - "STS22", - "STSBenchmark", - "SciDocsRR", - "SciFact", - "SprintDuplicateQuestions", - "StackExchangeClustering", - "StackExchangeClusteringP2P", - "StackOverflowDupQuestions", - "SummEval", - "TRECCOVID", - "Touche2020", - "ToxicConversationsClassification", - "TweetSentimentExtractionClassification", - "TwentyNewsgroupsClustering", - "TwitterSemEval2015", - "TwitterURLCorpus", - ], + tasks=get_tasks( + tasks=[ + "AmazonCounterfactualClassification", + "AmazonPolarityClassification", + "AmazonReviewsClassification", + "ArguAna", + "ArxivClusteringP2P", + "ArxivClusteringS2S", + "AskUbuntuDupQuestions", + "BIOSSES", + "Banking77Classification", + "BiorxivClusteringP2P", + "BiorxivClusteringS2S", + "CQADupstackAndroidRetrieval", + "CQADupstackEnglishRetrieval", + "CQADupstackGamingRetrieval", + "CQADupstackGisRetrieval", + "CQADupstackMathematicaRetrieval", + "CQADupstackPhysicsRetrieval", + "CQADupstackProgrammersRetrieval", + "CQADupstackStatsRetrieval", + "CQADupstackTexRetrieval", + "CQADupstackUnixRetrieval", + "CQADupstackWebmastersRetrieval", + "CQADupstackWordpressRetrieval", + "ClimateFEVER", + "DBPedia", + "EmotionClassification", + "FEVER", + "FiQA2018", + "HotpotQA", + "ImdbClassification", + "MSMARCO", + "MTOPDomainClassification", + "MTOPIntentClassification", + "MassiveIntentClassification", + "MassiveScenarioClassification", + "MedrxivClusteringP2P", + "MedrxivClusteringS2S", + "MindSmallReranking", + "NFCorpus", + "NQ", + "QuoraRetrieval", + "RedditClustering", + "RedditClusteringP2P", + "SCIDOCS", + "SICK-R", + "STS12", + "STS13", + "STS14", + "STS15", + "STS16", + "STS17", + "STS22", + "STSBenchmark", + "SciDocsRR", + 
"SciFact", + "SprintDuplicateQuestions", + "StackExchangeClustering", + "StackExchangeClusteringP2P", + "StackOverflowDupQuestions", + "SummEval", + "TRECCOVID", + "Touche2020", + "ToxicConversationsClassification", + "TweetSentimentExtractionClassification", + "TwentyNewsgroupsClustering", + "TwitterSemEval2015", + "TwitterURLCorpus", + ], + languages=["eng"], + ), description="Main English benchmarks from MTEB", citation="""@inproceedings{muennighoff-etal-2023-mteb, title = "{MTEB}: Massive Text Embedding Benchmark", @@ -170,11 +202,13 @@ def __getitem__(self, index): MTEB_RETRIEVAL_WITH_INSTRUCTIONS = Benchmark( name="MTEB(Retrieval w/Instructions)", - tasks=[ - "Robust04InstructionRetrieval", - "News21InstructionRetrieval", - "Core17InstructionRetrieval", - ], + tasks=get_tasks( + tasks=[ + "Robust04InstructionRetrieval", + "News21InstructionRetrieval", + "Core17InstructionRetrieval", + ] + ), description="Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions.", reference="https://arxiv.org/abs/2403.15246", citation="""@misc{weller2024followir, @@ -188,33 +222,37 @@ def __getitem__(self, index): ) MTEB_RETRIEVAL_LAW = Benchmark( - name="MTEB(law)", - tasks=[ - "LegalSummarization", - "LegalBenchConsumerContractsQA", - "LegalBenchCorporateLobbying", - "AILACasedocs", - "AILAStatutes", - "LeCaRDv2", - "LegalQuAD", - "GerDaLIRSmall", - ], - description="Legal benchmarks from MTEB", + name="MTEB(law)", # This benchmark is likely in the need of an update + tasks=get_tasks( + tasks=[ + "LegalSummarization", + "LegalBenchConsumerContractsQA", + "LegalBenchCorporateLobbying", + "AILACasedocs", + "AILAStatutes", + "LeCaRDv2", + "LegalQuAD", + "GerDaLIRSmall", + ] + ), + description="Legal benchmarks from MTEB.", reference="https://aclanthology.org/2023.eacl-main.148/", citation=None, ) MTEB_MINERS_BITEXT_MINING = Benchmark( name="MINERSBitextMining", - tasks=[ - "BUCCBitextMining", - "LinceMTBitextMining", - "NollySentiBitextMining", - "NusaXBitextMining", - "NusaTranslationBitextMining", - "PhincBitextMining", - "TatoebaBitextMining", - ], + tasks=get_tasks( + tasks=[ + "BUCC", + "LinceMTBitextMining", + "NollySentiBitextMining", + "NusaXBitextMining", + "NusaTranslationBitextMining", + "PhincBitextMining", + "Tatoeba", + ] + ), description="BitextMining benchmark from MINERS", reference="https://arxiv.org/pdf/2406.07424", citation=""" @@ -228,37 +266,40 @@ def __getitem__(self, index): ) SEB = Benchmark( name="MTEB(Scandinavian)", - tasks=[ - "BornholmBitextMining", - "NorwegianCourtsBitextMining", - "AngryTweetsClassification", - "DanishPoliticalCommentsClassification", - "DKHateClassification", - "LccSentimentClassification", - "MassiveIntentClassification", - "MassiveScenarioClassification", - "NordicLangClassification", - "ScalaClassification", - "NoRecClassification", - "NorwegianParliamentClassification", - "DalajClassification", - "SwedishSentimentClassification", - "SweRecClassification", - "DanFEVER", - "TV2Nordretrieval", - "TwitterHjerneRetrieval", - "NorQuadRetrieval", - "SNLRetrieval", - "SwednRetrieval", - "SweFaqRetrieval", - "WikiClusteringP2P.v2", - "SNLHierarchicalClusteringP2P", - "SNLHierarchicalClusteringS2S", - "VGHierarchicalClusteringP2P", - "VGHierarchicalClusteringS2S", - "SwednClusteringP2P", - "SwednClusteringS2S", - ], + tasks=get_tasks( + tasks=[ + "BornholmBitextMining", + "NorwegianCourtsBitextMining", + "AngryTweetsClassification", + "DanishPoliticalCommentsClassification", + "DKHateClassification", + 
"LccSentimentClassification", + "MassiveIntentClassification", + "MassiveScenarioClassification", + "NordicLangClassification", + "ScalaClassification", + "NoRecClassification", + "NorwegianParliamentClassification", + "DalajClassification", + "SwedishSentimentClassification", + "SweRecClassification", + "DanFEVER", + "TV2Nordretrieval", + "TwitterHjerneRetrieval", + "NorQuadRetrieval", + "SNLRetrieval", + "SwednRetrieval", + "SweFaqRetrieval", + "WikiClusteringP2P.v2", + "SNLHierarchicalClusteringP2P", + "SNLHierarchicalClusteringS2S", + "VGHierarchicalClusteringP2P", + "VGHierarchicalClusteringS2S", + "SwednClusteringP2P", + "SwednClusteringS2S", + ], + languages=["dan", "swe", "nno", "nob"], + ), description="A curated selection of tasks coverering the Scandinavian languages; Danish, Swedish and Norwegian, including Bokmål and Nynorsk.", reference="https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/", citation="""@misc{enevoldsen2024scandinavian, @@ -273,18 +314,20 @@ def __getitem__(self, index): CoIR = Benchmark( name="CoIR", - tasks=[ - "AppsRetrieval", - "CosQA", - "SyntheticText2SQL", - "COIRCodeSearchNetRetrieval", - "CodeSearchNetCCRetrieval", - "CodeTransOceanDL", - "CodeTransOceanContest", - "StackOverflowQA", - "CodeFeedbackMT", - "CodeFeedbackST", - ], + tasks=get_tasks( + tasks=[ + "AppsRetrieval", + "CosQA", + "SyntheticText2SQL", + "COIRCodeSearchNetRetrieval", + "CodeSearchNetCCRetrieval", + "CodeTransOceanDL", + "CodeTransOceanContest", + "StackOverflowQA", + "CodeFeedbackMT", + "CodeFeedbackST", + ] + ), description="CoIR: A Comprehensive Benchmark for Code Information Retrieval Models", reference="https://github.com/CoIR-team/coir", citation="""@misc{li2024coircomprehensivebenchmarkcode, diff --git a/mteb/benchmarks/get_benchmark.py b/mteb/benchmarks/get_benchmark.py new file mode 100644 index 0000000000..6d52a51aa7 --- /dev/null +++ b/mteb/benchmarks/get_benchmark.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +import difflib + +import mteb.benchmarks.benchmarks as benchmark_module +from mteb.benchmarks.benchmarks import Benchmark + +BENCHMARK_REGISTRY = { + inst.name: inst + for nam, inst in benchmark_module.__dict__.items() + if isinstance(inst, Benchmark) +} + + +def get_benchmark( + benchmark_name: str, +) -> Benchmark: + if benchmark_name not in BENCHMARK_REGISTRY: + close_matches = difflib.get_close_matches( + benchmark_name, BENCHMARK_REGISTRY.keys() + ) + if close_matches: + suggestion = f"KeyError: '{benchmark_name}' not found. Did you mean: {close_matches[0]}?" + else: + suggestion = f"KeyError: '{benchmark_name}' not found and no similar keys were found." 
+ raise KeyError(suggestion) + return BENCHMARK_REGISTRY[benchmark_name] diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index 3d32d923bc..772dd6754c 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -10,7 +10,7 @@ from sentence_transformers import SentenceTransformer import mteb -from mteb.benchmarks import Benchmark +from mteb.benchmarks.benchmarks import Benchmark from mteb.create_meta import generate_readme from .mock_models import ( @@ -127,9 +127,26 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): @pytest.mark.parametrize("model", [MockNumpyEncoder()]) def test_run_using_benchmark(model: mteb.Encoder): """Test that a benchmark object can be run using the MTEB class.""" - bench = Benchmark(name="test_bench", tasks=["STS12", "SummEval"]) + bench = Benchmark(name="test_bench", tasks=mteb.get_tasks(["STS12", "SummEval"])) eval = mteb.MTEB(tasks=bench) eval.run( model, output_folder="tests/results", overwrite_results=True ) # we just want to test that it runs + + +def test_benchmark_names_must_be_unique(): + import mteb.benchmarks.benchmarks as benchmark_module + + names = [ + inst.name + for nam, inst in benchmark_module.__dict__.items() + if isinstance(inst, Benchmark) + ] + assert len(names) == len(set(names)) + + +@pytest.mark.parametrize("name", ["MTEB(eng)", "MTEB(rus)", "MTEB(Scandinavian)"]) +def test_get_benchmarks(name): + benchmark = mteb.get_benchmark(benchmark_name=name) + assert isinstance(benchmark, mteb.Benchmark) From 83dc6626c238bfb76c99b6fef1c27c6ca0b085f7 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Mon, 9 Sep 2024 14:45:45 +0200 Subject: [PATCH 2/6] fix error in tests --- tests/test_benchmark/test_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index 772dd6754c..67a9df52ad 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -127,7 +127,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): @pytest.mark.parametrize("model", [MockNumpyEncoder()]) def test_run_using_benchmark(model: mteb.Encoder): """Test that a benchmark object can be run using the MTEB class.""" - bench = Benchmark(name="test_bench", tasks=mteb.get_tasks(["STS12", "SummEval"])) + bench = Benchmark(name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"])) eval = mteb.MTEB(tasks=bench) eval.run( From 4b11e8c256322ed87e426740f7c955d0789b843b Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Mon, 9 Sep 2024 14:47:21 +0200 Subject: [PATCH 3/6] format --- tests/test_benchmark/test_benchmark.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index 67a9df52ad..742c7930e9 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -127,7 +127,9 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): @pytest.mark.parametrize("model", [MockNumpyEncoder()]) def test_run_using_benchmark(model: mteb.Encoder): """Test that a benchmark object can be run using the MTEB class.""" - bench = Benchmark(name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"])) + bench = Benchmark( + name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"]) + ) eval = mteb.MTEB(tasks=bench) eval.run( From 
7ced8b49ed19746470c49777e2aed6909f1c84ac Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Mon, 9 Sep 2024 21:02:06 +0200 Subject: [PATCH 4/6] Added corrections based on review --- README.md | 1 + mteb/benchmarks/benchmarks.py | 76 ++++++++++++++++------------------- 2 files changed, 36 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index 0d50acbebc..e2f7a523a5 100644 --- a/README.md +++ b/README.md @@ -130,6 +130,7 @@ evaluation = mteb.MTEB(tasks=[ For instance to select the 56 English datasets that form the "Overall MTEB English leaderboard": ```python +import mteb mteb_eng = mteb.get_benchmark("MTEB(eng)") evaluation = mteb.MTEB(tasks=mteb_eng, eval_splits=["test"]) ``` diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index e94ba1c8f1..729103f995 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -10,14 +10,14 @@ from mteb.overview import get_tasks http_url_adapter = TypeAdapter(AnyUrl) -STR_URL = Annotated[ +UrlString = Annotated[ str, BeforeValidator(lambda value: str(http_url_adapter.validate_python(value))) ] # Allows the type to be a string, but ensures that the string is a URL @dataclass class Benchmark: - """A benchmark object intended to run certain a full benchmark within MTEB. + """A benchmark object intended to run a certain benchmark within MTEB. Args: name: The name of the benchmark @@ -30,7 +30,7 @@ class Benchmark: name: str tasks: Sequence[AbsTask] description: str | None = None - reference: STR_URL | None = None + reference: UrlString | None = None citation: str | None = None def __iter__(self): @@ -43,15 +43,6 @@ def __getitem__(self, index): return self.tasks[index] -def create_benchmark_list() -> list[type[Benchmark]]: - benchmark_categories_cls = list(Benchmark.__subclasses__()) - benchmarks = [ - cls - for cat_cls in benchmark_categories_cls - for cls in cat_cls.__subclasses__() - if cat_cls.__name__.startswith("Benchmark") - ] - return benchmarks MTEB_MAIN_EN = Benchmark( @@ -225,14 +216,14 @@ def create_benchmark_list() -> list[type[Benchmark]]: name="MTEB(law)", # This benchmark is likely in the need of an update tasks=get_tasks( tasks=[ - "LegalSummarization", - "LegalBenchConsumerContractsQA", - "LegalBenchCorporateLobbying", "AILACasedocs", "AILAStatutes", + "LegalSummarization", + "GerDaLIRSmall", "LeCaRDv2", + "LegalBenchConsumerContractsQA", + "LegalBenchCorporateLobbying", "LegalQuAD", - "GerDaLIRSmall", ] ), description="Legal benchmarks from MTEB.", @@ -268,35 +259,38 @@ def create_benchmark_list() -> list[type[Benchmark]]: name="MTEB(Scandinavian)", tasks=get_tasks( tasks=[ + # Bitext "BornholmBitextMining", "NorwegianCourtsBitextMining", + # Classification "AngryTweetsClassification", "DanishPoliticalCommentsClassification", + "DalajClassification", "DKHateClassification", "LccSentimentClassification", "MassiveIntentClassification", "MassiveScenarioClassification", "NordicLangClassification", - "ScalaClassification", "NoRecClassification", "NorwegianParliamentClassification", - "DalajClassification", + "ScalaClassification", "SwedishSentimentClassification", "SweRecClassification", + # Retrieval "DanFEVER", - "TV2Nordretrieval", - "TwitterHjerneRetrieval", "NorQuadRetrieval", "SNLRetrieval", "SwednRetrieval", "SweFaqRetrieval", - "WikiClusteringP2P.v2", - "SNLHierarchicalClusteringP2P", + "TV2Nordretrieval", + "TwitterHjerneRetrieval", + # Clustering "SNLHierarchicalClusteringS2S", - "VGHierarchicalClusteringP2P", - "VGHierarchicalClusteringS2S", + 
"SNLHierarchicalClusteringP2P", "SwednClusteringP2P", "SwednClusteringS2S", + "VGHierarchicalClusteringS2S", + "VGHierarchicalClusteringP2P", ], languages=["dan", "swe", "nno", "nob"], ), @@ -317,15 +311,15 @@ def create_benchmark_list() -> list[type[Benchmark]]: tasks=get_tasks( tasks=[ "AppsRetrieval", - "CosQA", - "SyntheticText2SQL", - "COIRCodeSearchNetRetrieval", + "CodeFeedbackMT", + "CodeFeedbackST", "CodeSearchNetCCRetrieval", - "CodeTransOceanDL", "CodeTransOceanContest", + "CodeTransOceanDL", + "CosQA", + "COIRCodeSearchNetRetrieval", "StackOverflowQA", - "CodeFeedbackMT", - "CodeFeedbackST", + "SyntheticText2SQL", ] ), description="CoIR: A Comprehensive Benchmark for Code Information Retrieval Models", @@ -365,19 +359,19 @@ def create_benchmark_list() -> list[type[Benchmark]]: "OpusparcusPC", "PawsXPairClassification", # Reranking - "SyntecReranking", "AlloprofReranking", + "SyntecReranking", # Retrieval "AlloprofRetrieval", "BSARDRetrieval", + "MintakaRetrieval", "SyntecRetrieval", "XPQARetrieval", - "MintakaRetrieval", # STS - "SummEvalFr", - "STSBenchmarkMultilingualSTS", - "STS22", "SICKFr", + "STS22", + "STSBenchmarkMultilingualSTS", + "SummEvalFr", ], ), description="Main French benchmarks from MTEB", @@ -469,27 +463,27 @@ def create_benchmark_list() -> list[type[Benchmark]]: languages=["pol"], tasks=[ # Classification + "AllegroReviews", "CBD", + "MassiveIntentClassification", + "MassiveScenarioClassification", "PolEmo2.0-IN", "PolEmo2.0-OUT", - "AllegroReviews", "PAC", - "MassiveIntentClassification", - "MassiveScenarioClassification", # Clustering "EightTagsClustering", "PlscClusteringS2S", "PlscClusteringP2P", # Pair Classification - "SICK-E-PL", - "PpcPC", "CDSC-E", + "PpcPC", "PSC", + "SICK-E-PL", # STS - "SICK-R-PL", "CDSC-R", "STS22", "STSBenchmarkMultilingualSTS", + "SICK-R-PL", ], ), description="Main Polish benchmarks from MTEB", From 25550b89e133a0373c4d59971d409a290946165b Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Tue, 10 Sep 2024 09:24:51 +0200 Subject: [PATCH 5/6] added example and formatted --- mteb/benchmarks/benchmarks.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 729103f995..048c74d75a 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -25,6 +25,16 @@ class Benchmark: description: A description of the benchmark, should include its intended goal and potentially a description of its construction reference: A link reference, to a source containing additional information typically to a paper, leaderboard or github. citation: A bibtex citation + + Example: + >>> Benchmark( + ... name="MTEB(custom)", + ... tasks=mteb.get_tasks( + ... tasks=["AmazonCounterfactualClassification", "AmazonPolarityClassification"], + ... languages=["eng"], + ... ), + ... description="A custom benchmark" + ... 
) """ name: str @@ -43,8 +53,6 @@ def __getitem__(self, index): return self.tasks[index] - - MTEB_MAIN_EN = Benchmark( name="MTEB(eng)", tasks=get_tasks( From f8ff240f5afdaf6790e47a0012c62e285f6c11f8 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Tue, 10 Sep 2024 14:29:01 +0200 Subject: [PATCH 6/6] update benchmark reference --- mteb/benchmarks/get_benchmark.py | 2 +- .../Classification/fil/FilipinoHateSpeechClassification.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mteb/benchmarks/get_benchmark.py b/mteb/benchmarks/get_benchmark.py index 6d52a51aa7..169e3bcd50 100644 --- a/mteb/benchmarks/get_benchmark.py +++ b/mteb/benchmarks/get_benchmark.py @@ -3,7 +3,7 @@ import difflib import mteb.benchmarks.benchmarks as benchmark_module -from mteb.benchmarks.benchmarks import Benchmark +from mteb.benchmarks import Benchmark BENCHMARK_REGISTRY = { inst.name: inst diff --git a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py index a9cf4cea25..a01bda1d80 100644 --- a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py +++ b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py @@ -12,7 +12,7 @@ class FilipinoHateSpeechClassification(AbsTaskClassification): description="Filipino Twitter dataset for sentiment classification.", reference="https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019", dataset={ - "path": "hate-speech-filipino/hate_speech_filipino", + "path": "legacy-datasets/hate_speech_filipino", "revision": "1994e9bb7f3ec07518e3f0d9e870cb293e234686", "trust_remote_code": True, },
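
Taken together, the changes above turn benchmark lookup into a plain registry access with fuzzy suggestions on a miss. A small usage sketch of the resulting API follows; the misspelled name and the printed output are purely illustrative:

```python
import mteb

# Fetch a benchmark by its registered name; the result is a Benchmark dataclass.
bench = mteb.get_benchmark("MTEB(Scandinavian)")
print(bench.name, len(bench.tasks))

# Benchmark implements __iter__ and __getitem__, so its tasks can be iterated directly.
for task in bench:
    print(task.metadata.name)

# An unknown name raises a KeyError suggesting the closest registered benchmark.
try:
    mteb.get_benchmark("MTEB(scandinavian)")
except KeyError as err:
    print(err)
```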