Leaderboard #1235

Merged
merged 40 commits on Oct 21, 2024

Changes from all commits (40 commits):
a363594
Add leaderboard dev
KennethEnevoldsen Sep 14, 2024
aa50296
Renamed MTEBResults to TaskResult
x-tabdeveloping Sep 23, 2024
17b17f7
Moved model and model meta loading utilities into overview.py
x-tabdeveloping Sep 23, 2024
2ae9392
Added get_model_metas to retrieve filtered metadata for models
x-tabdeveloping Sep 23, 2024
5eb66f1
Restructured results object and made it into a class instead of a dict
x-tabdeveloping Sep 24, 2024
9f75bf5
Added utilities for filtering models on BenchmarkResults objects
x-tabdeveloping Sep 24, 2024
f3103c1
Added to_table utility function to BenchmarkResults
x-tabdeveloping Sep 24, 2024
942b1a7
Merge branch 'main' into leaderboard_dev
x-tabdeveloping Sep 24, 2024
bb1e364
Added serialization utilities to BenchmarkResults
x-tabdeveloping Sep 24, 2024
24e0e3e
Attempted fixing tests
x-tabdeveloping Sep 24, 2024
bc1941e
Added get_model_metas to __init__
x-tabdeveloping Sep 24, 2024
37e0e25
Added get_benchmarks to __init__ and made it return all benchmarks by…
x-tabdeveloping Sep 24, 2024
f0fb326
Added get_benchmarks to __init__
x-tabdeveloping Sep 24, 2024
691380c
Made tasks hashable
x-tabdeveloping Sep 24, 2024
111cfd5
Added task filtering based on task objects on BenchmarkResults
x-tabdeveloping Sep 24, 2024
a84764c
Added BenchmarkResults to __init__
x-tabdeveloping Sep 24, 2024
50062e3
Added additional arguments to get_scores on two classes
x-tabdeveloping Oct 2, 2024
5a8fa73
Made get_scores smarter on BenchmarkResult
x-tabdeveloping Oct 14, 2024
31ac648
Added basic multilingual benchmark
x-tabdeveloping Oct 14, 2024
4332612
Modified benchmark to be able to easily access results
x-tabdeveloping Oct 14, 2024
3e17e4c
Added useful properties and filtering functions to BenchmarkResults
x-tabdeveloping Oct 14, 2024
e7ca3f8
Added minimal functioning example
x-tabdeveloping Oct 14, 2024
0d1d450
Added smarter table, task-list updating and tried fixing dropdown scr…
x-tabdeveloping Oct 16, 2024
e8cca9d
Merge branch 'main' into leaderboard_dev
x-tabdeveloping Oct 16, 2024
266394c
Made restrict_results into a private function
x-tabdeveloping Oct 17, 2024
327c8d6
Removed old leaderboard scripts
x-tabdeveloping Oct 17, 2024
9ec49fb
Hardcoded max and min model size
x-tabdeveloping Oct 17, 2024
ce2569d
Removed redundant utils file
x-tabdeveloping Oct 17, 2024
228e7d3
Ran linting
x-tabdeveloping Oct 17, 2024
bee9e41
added leaderboard dependencies as optional
x-tabdeveloping Oct 17, 2024
5e6a42e
Fixed union type error on Python 3.9
x-tabdeveloping Oct 17, 2024
781ee95
Removed references to Dict in task aggregation
x-tabdeveloping Oct 17, 2024
ae5afb7
Fixed name errors in _restrict_task_results
x-tabdeveloping Oct 18, 2024
ca5014c
Fixed _restrict_task_results
x-tabdeveloping Oct 18, 2024
cb11921
Made hf_subsets={'default'} when the task is monolingual in _restric_…
x-tabdeveloping Oct 18, 2024
9fac012
Task dropdown now gets filtered based on the other criteria
x-tabdeveloping Oct 18, 2024
006b845
Ran linting again
x-tabdeveloping Oct 21, 2024
dcca04d
Introduced hotfix for reranking test
x-tabdeveloping Oct 21, 2024
0bf3746
Added BenchmarkResults to __all__ in __init__
x-tabdeveloping Oct 21, 2024
607c998
Fixed validate_and_filter_scores method, and replaced _restric_task_r…
x-tabdeveloping Oct 21, 2024
6 changes: 4 additions & 2 deletions mteb/__init__.py
@@ -10,8 +10,8 @@
CoIR,
)
from mteb.evaluation import *
from mteb.load_results import load_results
from mteb.models import get_model, get_model_meta
from mteb.load_results import BenchmarkResults, load_results
from mteb.models import get_model, get_model_meta, get_model_metas
from mteb.overview import TASKS_REGISTRY, get_task, get_tasks

from .benchmarks.benchmarks import Benchmark
@@ -31,8 +31,10 @@
"get_task",
"get_model",
"get_model_meta",
"get_model_metas",
"load_results",
"Benchmark",
"get_benchmark",
"get_benchmarks",
"BenchmarkResults",
]
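With the exports above, the new leaderboard data model is reachable from the top-level package. A minimal sketch of the added API surface; the argument-free calls are assumed to use sensible defaults and are not taken from this diff:

import mteb

# get_model_metas (added in this PR) returns model metadata entries, optionally filtered.
model_metas = mteb.get_model_metas()

# load_results now yields a BenchmarkResults object rather than a plain dict of scores.
results = mteb.load_results()
print(type(results).__name__)  # "BenchmarkResults"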
3 changes: 3 additions & 0 deletions mteb/abstasks/AbsTask.py
@@ -310,3 +310,6 @@ def __repr__(self) -> str:
return (
f"{self.__class__.__name__}(name='{self.metadata.name}', languages={langs})"
)

def __hash__(self) -> int:
return hash(self.metadata)
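Defining __hash__ on AbsTask (commit 691380c, "Made tasks hashable") lets task objects go into sets and serve as dict keys. A small sketch; the task names are only examples:

import mteb

tasks = mteb.get_tasks(tasks=["STS12", "Banking77Classification"])
# hash(task) now delegates to hash(task.metadata), so tasks can live in sets or key dicts.
unique_tasks = set(tasks)
scores_by_task = {task: None for task in tasks}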
2 changes: 1 addition & 1 deletion mteb/abstasks/AbsTaskBitextMining.py
@@ -8,7 +8,7 @@
from mteb.encoder_interface import Encoder

from ..evaluation.evaluators import BitextMiningEvaluator
from ..load_results.mteb_results import HFSubset, ScoresDict
from ..load_results.task_results import HFSubset, ScoresDict
from .AbsTask import AbsTask, DescriptiveStatistics

logger = logging.getLogger(__name__)
2 changes: 1 addition & 1 deletion mteb/abstasks/AbsTaskClassification.py
@@ -14,7 +14,7 @@
kNNClassificationEvaluatorPytorch,
logRegClassificationEvaluator,
)
from ..load_results.mteb_results import HFSubset, ScoresDict
from ..load_results.task_results import HFSubset, ScoresDict
from .AbsTask import AbsTask, DescriptiveStatistics

logger = logging.getLogger(__name__)
2 changes: 1 addition & 1 deletion mteb/abstasks/AbsTaskClustering.py
@@ -9,7 +9,7 @@
from datasets import Dataset

from mteb.encoder_interface import Encoder, EncoderWithQueryCorpusEncode
from mteb.load_results.mteb_results import ScoresDict
from mteb.load_results.task_results import ScoresDict

from ..evaluation.evaluators import ClusteringEvaluator
from .AbsTask import AbsTask, DescriptiveStatistics
2 changes: 1 addition & 1 deletion mteb/abstasks/AbsTaskClusteringFast.py
@@ -15,7 +15,7 @@
from mteb.encoder_interface import Encoder

from ..evaluation.evaluators.model_encode import model_encode
from ..load_results.mteb_results import HFSubset
from ..load_results.task_results import HFSubset
from .AbsTask import AbsTask, DescriptiveStatistics

logger = logging.getLogger(__name__)
2 changes: 1 addition & 1 deletion mteb/abstasks/AbsTaskMultilabelClassification.py
@@ -15,7 +15,7 @@
from mteb.encoder_interface import Encoder

from ..evaluation.evaluators.model_encode import model_encode
from ..load_results.mteb_results import HFSubset, ScoresDict
from ..load_results.task_results import HFSubset, ScoresDict
from .AbsTask import AbsTask, DescriptiveStatistics

logger = logging.getLogger(__name__)
2 changes: 1 addition & 1 deletion mteb/abstasks/AbsTaskPairClassification.py
@@ -7,7 +7,7 @@

from ..encoder_interface import Encoder, EncoderWithQueryCorpusEncode
from ..evaluation.evaluators import PairClassificationEvaluator
from ..load_results.mteb_results import ScoresDict
from ..load_results.task_results import ScoresDict
from .AbsTask import AbsTask, DescriptiveStatistics

logger = logging.getLogger(__name__)
2 changes: 1 addition & 1 deletion mteb/abstasks/AbsTaskReranking.py
@@ -5,7 +5,7 @@
from datasets import Dataset

from mteb.encoder_interface import Encoder, EncoderWithQueryCorpusEncode
from mteb.load_results.mteb_results import ScoresDict
from mteb.load_results.task_results import ScoresDict

from ..evaluation.evaluators import RerankingEvaluator
from .AbsTask import AbsTask, DescriptiveStatistics
2 changes: 1 addition & 1 deletion mteb/abstasks/AbsTaskRetrieval.py
@@ -13,7 +13,7 @@
from mteb.abstasks.TaskMetadata import HFSubset

from ..evaluation.evaluators import RetrievalEvaluator
from ..load_results.mteb_results import ScoresDict
from ..load_results.task_results import ScoresDict
from .AbsTask import AbsTask, DescriptiveStatistics

logger = logging.getLogger(__name__)
2 changes: 1 addition & 1 deletion mteb/abstasks/AbsTaskSTS.py
@@ -4,7 +4,7 @@
from typing import Any

from ..evaluation.evaluators import STSEvaluator
from ..load_results.mteb_results import ScoresDict
from ..load_results.task_results import ScoresDict
from .AbsTask import AbsTask, DescriptiveStatistics

logger = logging.getLogger(__name__)
2 changes: 1 addition & 1 deletion mteb/abstasks/AbsTaskSpeedTask.py
@@ -8,7 +8,7 @@
import numpy as np

from mteb.encoder_interface import Encoder, EncoderWithQueryCorpusEncode
from mteb.load_results.mteb_results import ScoresDict
from mteb.load_results.task_results import ScoresDict

from .AbsTask import AbsTask

2 changes: 1 addition & 1 deletion mteb/abstasks/AbsTaskSummarization.py
@@ -6,7 +6,7 @@
import numpy as np

from mteb.encoder_interface import Encoder
from mteb.load_results.mteb_results import ScoresDict
from mteb.load_results.task_results import ScoresDict

from ..evaluation.evaluators import SummarizationEvaluator
from .AbsTask import AbsTask, DescriptiveStatistics
5 changes: 4 additions & 1 deletion mteb/abstasks/TaskMetadata.py
@@ -6,7 +6,7 @@
from typing import Annotated, Any, Union

from pydantic import AnyUrl, BaseModel, BeforeValidator, TypeAdapter, field_validator
from typing_extensions import Literal
from typing_extensions import Annotated, Literal

from ..languages import (
ISO_LANGUAGE_SCRIPT,
@@ -352,3 +352,6 @@ def intext_citation(self, include_cite: bool = True) -> str:
)
return f"\\cite{{{cite}}}"
return cite

def __hash__(self) -> int:
return hash(self.model_dump_json())
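Because TaskMetadata.__hash__ is computed from model_dump_json(), two tasks whose metadata serialize identically hash the same, which is what keeps AbsTask.__hash__ above stable across instances. A tiny check, using an example task name:

import mteb

a = mteb.get_task("STS12")
b = mteb.get_task("STS12")
# Identical metadata serializes to identical JSON, so the hashes agree.
assert hash(a.metadata) == hash(b.metadata)
assert hash(a) == hash(b)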
1 change: 1 addition & 0 deletions mteb/benchmarks/__init__.py
@@ -1,3 +1,4 @@
from __future__ import annotations

from mteb.benchmarks.benchmarks import *
from mteb.benchmarks.get_benchmark import *
15 changes: 15 additions & 0 deletions mteb/benchmarks/benchmarks.py
@@ -7,6 +7,12 @@
from pydantic import AnyUrl, BeforeValidator, TypeAdapter

from mteb.abstasks.AbsTask import AbsTask
from mteb.load_results.benchmark_results import (
BenchmarkResults,
ModelResult,
TaskResult,
)
from mteb.load_results.load_results import load_results
from mteb.overview import get_tasks

http_url_adapter = TypeAdapter(AnyUrl)
@@ -52,6 +58,15 @@ def __len__(self) -> int:
def __getitem__(self, index):
return self.tasks[index]

def load_results(
self, base_results: None | BenchmarkResults = None
) -> BenchmarkResults:
if base_results is None:
base_results = load_results()
return base_results.select_tasks(self.tasks)


MTEB_MAIN_MULTILINGUAL = Benchmark(name="MTEB(multilingual)", tasks=get_tasks())

MTEB_MAIN_EN = Benchmark(
name="MTEB(eng)",
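The load_results method added to Benchmark ties the pieces together: it loads the full results repository (unless a preloaded BenchmarkResults is passed) and restricts it to the benchmark's tasks. A usage sketch; to_table comes from commit f3103c1 and its exact signature and output format are assumed, not shown in this diff:

import mteb

benchmark = mteb.get_benchmark("MTEB(eng)")
# Loads all known results, then keeps only this benchmark's tasks.
eng_results = benchmark.load_results()
# to_table is assumed here to take no required arguments.
table = eng_results.to_table()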
2 changes: 1 addition & 1 deletion mteb/benchmarks/get_benchmark.py
@@ -3,7 +3,7 @@
import difflib

import mteb.benchmarks.benchmarks as benchmark_module
from mteb.benchmarks import Benchmark
from mteb.benchmarks.benchmarks import Benchmark

BENCHMARK_REGISTRY = {
inst.name: inst
16 changes: 8 additions & 8 deletions mteb/create_meta.py
@@ -7,8 +7,8 @@
import yaml

import mteb
from mteb import MTEBResults
from mteb.load_results.mteb_results import CQADupstackRetrievalDummy
from mteb import TaskResult
from mteb.load_results.task_results import CQADupstackRetrievalDummy


def generate_readme(results_folder: Path, from_existing: Path | None = None) -> str:
@@ -45,7 +45,7 @@ def load_model_name(results_folder: Path) -> str:
return "PLACEHOLDER"


def process_task_result(task_result: MTEBResults) -> list[dict[str, Any]]:
def process_task_result(task_result: TaskResult) -> list[dict[str, Any]]:
# CQADupstackRetrieval is a combined dataset (special case atm.)
task = (
CQADupstackRetrievalDummy()
@@ -84,13 +84,13 @@ def process_task_result(task_result: MTEBResults) -> list[dict[str, Any]]:
return yaml_results


def get_task_results(results_folder: Path) -> list[MTEBResults]:
def get_task_results(results_folder: Path) -> list[TaskResult]:
json_files = [
r
for r in results_folder.glob("*.json")
if r.is_file() and r.name != "model_meta.json"
]
task_results = [MTEBResults.from_disk(path) for path in json_files]
task_results = [TaskResult.from_disk(path) for path in json_files]
task_results = [
results
for results in task_results
@@ -102,8 +102,8 @@


def potentially_add_cqadupstack_to_results(
results: list[MTEBResults],
) -> list[MTEBResults]:
results: list[TaskResult],
) -> list[TaskResult]:
task_list_cqa = {
"CQADupstackAndroidRetrieval",
"CQADupstackEnglishRetrieval",
@@ -128,7 +128,7 @@
main_scores = [r.get_score(splits=["test"]) for r in cqa_results]
main_score = float(sum(main_scores) / len(main_scores))

combined_result = MTEBResults(
combined_result = TaskResult(
task_name="CQADupstackRetrieval",
dataset_revision="CQADupstackRetrieval_is_a_combined_dataset",
mteb_version="NA",
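Outside of create_meta.py, the renamed TaskResult keeps the same from_disk/get_score interface used above, so individual result files can still be inspected directly. A minimal sketch with a placeholder path:

from pathlib import Path

from mteb import TaskResult

# The path is a placeholder; point it at any per-task JSON written by an MTEB run.
result = TaskResult.from_disk(Path("results/my-model/STS12.json"))
print(result.task_name, result.get_score(splits=["test"]))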
10 changes: 5 additions & 5 deletions mteb/evaluation/MTEB.py
@@ -21,7 +21,7 @@

from ..abstasks import *
from ..abstasks import AbsTask
from ..load_results.mteb_results import MTEBResults
from ..load_results.task_results import TaskResult
from ..tasks import *
from . import LangMapping

@@ -317,7 +317,7 @@ def run(
co2_tracker: bool = False,
encode_kwargs: dict[str, Any] = {},
**kwargs,
) -> list[MTEBResults]:
) -> list[TaskResult]:
"""Run the evaluation pipeline on the selected tasks.

Args:
@@ -336,7 +336,7 @@
kwargs: Additional arguments to be passed to `_run_eval` method and task.load_data.

Returns:
A list of MTEBResults objects, one for each task evaluated.
A list of TaskResult objects, one for each task evaluated.
"""
if "batch_size" in kwargs:
logger.warning(
@@ -376,7 +376,7 @@
logger.info(
f"{task.metadata.name} results already exists. Loading results from disk. Set overwrite_results=True to overwrite."
)
mteb_results = MTEBResults.from_disk(save_path)
mteb_results = TaskResult.from_disk(save_path)
evaluation_results.append(mteb_results)
del self.tasks[0] # empty memory
continue
@@ -437,7 +437,7 @@
if verbosity >= 1:
logger.info(f"Scores: {results}")

mteb_task_result = MTEBResults.from_task_results(
mteb_task_result = TaskResult.from_task_results(
task,
task_results,
evaluation_time=evaluation_time,
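Taken together, a full run now produces TaskResult objects end to end. A sketch assuming the model name below is registered with get_model and that the default output folder layout is acceptable:

import mteb

model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")
tasks = mteb.get_tasks(tasks=["Banking77Classification"])
evaluation = mteb.MTEB(tasks=tasks)
# run() returns list[TaskResult]; each entry mirrors the per-task JSON written to disk.
results = evaluation.run(model, output_folder="results")
for task_result in results:
    print(task_result.task_name, task_result.get_score(splits=["test"]))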
1 change: 1 addition & 0 deletions mteb/leaderboard/__init__.py
@@ -0,0 +1 @@
from mteb.leaderboard.app import demo
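The new leaderboard package exposes the app as a single demo object. Assuming demo is the Gradio app added by this PR and that the optional leaderboard dependencies are installed (the extra's exact name is an assumption), it can be served locally like this:

# Assumes something like `pip install "mteb[leaderboard]"` has pulled in the optional deps.
from mteb.leaderboard import demo

# demo is assumed to be a Gradio Blocks/Interface object; launch() starts a local server.
demo.launch()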