diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index 817194f30e..f9b1f94fce 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -3,19 +3,16 @@ import json import logging from collections.abc import Mapping -from datetime import date from pathlib import Path -from typing import Annotated, Any, Union +from typing import Any, Union from pydantic import ( - AnyUrl, BaseModel, - BeforeValidator, - TypeAdapter, field_validator, ) from typing_extensions import Literal, TypedDict +from ..custom_validators import LICENSES, MODALITIES, STR_DATE, STR_URL from ..encoder_interface import PromptType from ..languages import ( ISO_LANGUAGE_SCRIPT, @@ -24,7 +21,6 @@ path_to_lang_codes, path_to_lang_scripts, ) -from ..modalities import MODALITIES TASK_SUBTYPE = Literal[ "Article retrieval", @@ -158,16 +154,6 @@ "LM-generated and reviewed", # reviewed by humans ] -http_url_adapter = TypeAdapter(AnyUrl) -STR_URL = Annotated[ - str, BeforeValidator(lambda value: str(http_url_adapter.validate_python(value))) -] # Allows the type to be a string, but ensures that the string is a URL - -pastdate_adapter = TypeAdapter(date) -STR_DATE = Annotated[ - str, BeforeValidator(lambda value: str(pastdate_adapter.validate_python(value))) -] # Allows the type to be a string, but ensures that the string is a valid date - SPLIT_NAME = str HFSubset = str LANGUAGES = Union[ @@ -191,36 +177,6 @@ "sql", ] -LICENSES = ( # this list can be extended as needed - Literal[ # we use lowercase for the licenses similar to the huggingface datasets - "not specified", # or none found - "mit", - "cc-by-2.0", - "cc-by-3.0", - "cc-by-4.0", - "cc-by-sa-3.0", - "cc-by-sa-4.0", - "cc-by-nc-4.0", - "cc-by-nc-sa-3.0", - "cc-by-nc-sa-4.0", - "cc-by-nc-nd-4.0", - "cc-by-nd-4.0", - "openrail", - "openrail++", - "odc-by", - "afl-3.0", - "apache-2.0", - "cc-by-nd-2.1-jp", - "cc0-1.0", - "bsd-3-clause", - "gpl-3.0", - "cdla-sharing-1.0", - "mpl-2.0", - "msr-la-nc", - "multiple", - ] -) - METRIC_NAME = str METRIC_VALUE = Union[int, float, dict[str, Any]] diff --git a/mteb/abstasks/aggregate_task_metadata.py b/mteb/abstasks/aggregate_task_metadata.py index 445f07ff6d..d119ea01f4 100644 --- a/mteb/abstasks/aggregate_task_metadata.py +++ b/mteb/abstasks/aggregate_task_metadata.py @@ -10,16 +10,15 @@ from mteb.abstasks.TaskMetadata import ( ANNOTATOR_TYPE, LANGUAGES, - LICENSES, MODALITIES, SAMPLE_CREATION_METHOD, - STR_DATE, TASK_DOMAIN, TASK_SUBTYPE, TASK_TYPE, HFSubset, TaskMetadata, ) +from mteb.custom_validators import LICENSES, STR_DATE from mteb.languages import ISO_LANGUAGE_SCRIPT logger = logging.getLogger(__name__) diff --git a/mteb/custom_validators.py b/mteb/custom_validators.py new file mode 100644 index 0000000000..1f2f3b4054 --- /dev/null +++ b/mteb/custom_validators.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from datetime import date +from typing import Annotated + +from pydantic import AnyUrl, BeforeValidator, TypeAdapter +from typing_extensions import Literal + +MODALITIES = Literal[ + "text", + "image", +] + +http_url_adapter = TypeAdapter(AnyUrl) +STR_URL = Annotated[ + str, BeforeValidator(lambda value: str(http_url_adapter.validate_python(value))) +] # Allows the type to be a string, but ensures that the string is a URL + +LICENSES = ( # this list can be extended as needed + Literal[ # we use lowercase for the licenses similar to the huggingface datasets + "not specified", # or none found + "mit", + "cc-by-2.0", + "cc-by-3.0", + "cc-by-4.0", + "cc-by-sa-3.0", + "cc-by-sa-4.0", + "cc-by-nc-4.0", + "cc-by-nc-sa-3.0", + "cc-by-nc-sa-4.0", + "cc-by-nc-nd-4.0", + "cc-by-nd-4.0", + "openrail", + "openrail++", + "odc-by", + "afl-3.0", + "apache-2.0", + "cc-by-nd-2.1-jp", + "cc0-1.0", + "bsd-3-clause", + "gpl-3.0", + "lgpl", + "cdla-sharing-1.0", + "mpl-2.0", + "msr-la-nc", + "multiple", + ] +) + +pastdate_adapter = TypeAdapter(date) +STR_DATE = Annotated[ + str, BeforeValidator(lambda value: str(pastdate_adapter.validate_python(value))) +] # Allows the type to be a string, but ensures that the string is a valid date diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index c3c1c73a16..996f550e27 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -17,12 +17,11 @@ import mteb from mteb.abstasks.TaskMetadata import TASK_DOMAIN, TASK_TYPE from mteb.benchmarks.benchmarks import MTEB_multilingual +from mteb.custom_validators import MODALITIES from mteb.languages import ISO_TO_LANGUAGE from mteb.leaderboard.figures import performance_size_plot, radar_chart from mteb.leaderboard.table import scores_to_tables -from ..modalities import MODALITIES - logger = logging.getLogger(__name__) acknowledgment_md = """ diff --git a/mteb/load_results/benchmark_results.py b/mteb/load_results/benchmark_results.py index d67501f3c7..ba5743b1f8 100644 --- a/mteb/load_results/benchmark_results.py +++ b/mteb/load_results/benchmark_results.py @@ -15,10 +15,10 @@ from mteb.abstasks.AbsTask import AbsTask, ScoresDict from mteb.abstasks.TaskMetadata import ( ISO_LANGUAGE_SCRIPT, - MODALITIES, TASK_DOMAIN, TASK_TYPE, ) +from mteb.custom_validators import MODALITIES from mteb.languages import ISO_LANGUAGE from mteb.load_results.task_results import TaskResult from mteb.models.overview import get_model_metas diff --git a/mteb/modalities.py b/mteb/modalities.py deleted file mode 100644 index ff83f963af..0000000000 --- a/mteb/modalities.py +++ /dev/null @@ -1,8 +0,0 @@ -from __future__ import annotations - -from typing_extensions import Literal - -MODALITIES = Literal[ - "text", - "image", -] diff --git a/mteb/model_meta.py b/mteb/model_meta.py index d71f6b792a..09ed32bb69 100644 --- a/mteb/model_meta.py +++ b/mteb/model_meta.py @@ -14,11 +14,10 @@ from pydantic import BaseModel, ConfigDict from mteb.abstasks.AbsTask import AbsTask -from mteb.abstasks.TaskMetadata import STR_DATE, STR_URL from mteb.encoder_interface import Encoder +from .custom_validators import LICENSES, MODALITIES, STR_DATE, STR_URL from .languages import ISO_LANGUAGE_SCRIPT -from .modalities import MODALITIES if TYPE_CHECKING: from .models.sentence_transformer_wrapper import SentenceTransformerWrapper @@ -104,7 +103,7 @@ class ModelMeta(BaseModel): memory_usage_mb: float | None max_tokens: float | None embed_dim: int | None - license: str | None + license: LICENSES | STR_URL | None open_weights: bool | None public_training_code: str | None public_training_data: str | bool | None diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index 9b900c98e9..d5c42afb25 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -736,7 +736,7 @@ n_parameters=9.24 * 1e9, memory_usage_mb=35254, embed_dim=3584, # from old C-MTEB leaderboard - license="gemma", + license="https://ai.google.dev/gemma/terms", max_tokens=8192, # from old C-MTEB leaderboard reference="https://huggingface.co/BAAI/bge-multilingual-gemma2", similarity_fn_name="cosine", @@ -768,7 +768,7 @@ n_parameters=7.11 * 1e9, memory_usage_mb=27125, embed_dim=4096, - license="apache-2", + license="apache-2.0", max_tokens=32768, reference="https://huggingface.co/BAAI/bge-en-icl", similarity_fn_name="cosine", diff --git a/mteb/models/gte_models.py b/mteb/models/gte_models.py index a9d50738b5..95ae0d0587 100644 --- a/mteb/models/gte_models.py +++ b/mteb/models/gte_models.py @@ -304,7 +304,7 @@ def instruction_template( n_parameters=int(305 * 1e6), memory_usage_mb=582, embed_dim=1024, - license="apache-2", + license="apache-2.0", max_tokens=8192, reference="https://huggingface.co/Alibaba-NLP/gte-multilingual-base", similarity_fn_name="cosine", @@ -329,7 +329,7 @@ def instruction_template( n_parameters=int(149 * 1e6), memory_usage_mb=284, embed_dim=768, - license="apache-2", + license="apache-2.0", max_tokens=8192, reference="https://huggingface.co/Alibaba-NLP/gte-modernbert-base", similarity_fn_name="cosine", diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 7188614e1f..fd24538296 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -1220,7 +1220,7 @@ memory_usage_mb=None, max_tokens=None, embed_dim=None, - license="bigscience-bloom-rail-1.0", + license="https://huggingface.co/spaces/bigscience/license", open_weights=True, public_training_code=None, public_training_data=None, @@ -1242,7 +1242,7 @@ memory_usage_mb=None, max_tokens=None, embed_dim=None, - license="bigscience-bloom-rail-1.0", + license="https://huggingface.co/spaces/bigscience/license", open_weights=True, public_training_code=None, public_training_data=None, @@ -1264,7 +1264,7 @@ memory_usage_mb=None, max_tokens=None, embed_dim=None, - license="bigscience-bloom-rail-1.0", + license="https://huggingface.co/spaces/bigscience/license", open_weights=True, public_training_code=None, public_training_data=None, @@ -1286,7 +1286,7 @@ memory_usage_mb=None, max_tokens=None, embed_dim=None, - license="bigscience-bloom-rail-1.0", + license="https://huggingface.co/spaces/bigscience/license", open_weights=True, public_training_code=None, public_training_data=None, @@ -1793,7 +1793,7 @@ memory_usage_mb=None, # Not visible on repo max_tokens=512, embed_dim=128, - license="apache-2", + license="apache-2.0", open_weights=True, public_training_code=None, public_training_data=None, @@ -1819,7 +1819,7 @@ memory_usage_mb=283, max_tokens=1024, embed_dim=768, - license="apache-2", + license="apache-2.0", open_weights=True, public_training_code=None, public_training_data=None, @@ -1947,7 +1947,7 @@ memory_usage_mb=None, max_tokens=None, embed_dim=None, - license="proprietary", + license="https://aws.amazon.com/service-terms/", open_weights=False, public_training_code=None, public_training_data=None, diff --git a/mteb/models/moka_models.py b/mteb/models/moka_models.py index 2c62dcea9b..2958cce4b1 100644 --- a/mteb/models/moka_models.py +++ b/mteb/models/moka_models.py @@ -91,7 +91,7 @@ memory_usage_mb=390, embed_dim=768, # They don't give a specific license but commercial use is not allowed - license="unspecified-noncommercial", + license="https://huggingface.co/moka-ai/m3e-base#%F0%9F%93%9C-license", max_tokens=512, reference="https://huggingface.co/moka-ai/m3e-base", similarity_fn_name="cosine", @@ -114,7 +114,7 @@ memory_usage_mb=None, # Can't be seen on HF page embed_dim=512, # They don't give a specific license but commercial use is not allowed - license="unspecified-noncommercial", + license="https://huggingface.co/moka-ai/m3e-base#%F0%9F%93%9C-license", max_tokens=512, reference="https://huggingface.co/moka-ai/m3e-small", similarity_fn_name="cosine", @@ -137,7 +137,7 @@ memory_usage_mb=None, # Can't be seen on HF page embed_dim=768, # They don't give a specific license but commercial use is not allowed - license="unspecified-noncommercial", + license="https://huggingface.co/moka-ai/m3e-base#%F0%9F%93%9C-license", max_tokens=512, reference="https://huggingface.co/moka-ai/m3e-large", similarity_fn_name="cosine", diff --git a/mteb/models/qodo_models.py b/mteb/models/qodo_models.py index fb87612335..f5125c7638 100644 --- a/mteb/models/qodo_models.py +++ b/mteb/models/qodo_models.py @@ -21,7 +21,7 @@ n_parameters=1_780_000_000, memory_usage_mb=6776, embed_dim=1536, - license="QodoAI-Open-RAIL-M", + license="https://huggingface.co/Qodo/Qodo-Embed-1-1.5B/blob/main/LICENSE", max_tokens=32768, reference="https://huggingface.co/Qodo/Qodo-Embed-1-1.5B", similarity_fn_name="cosine", @@ -52,7 +52,7 @@ n_parameters=7_613_000_000, memory_usage_mb=29040, embed_dim=3584, - license="Qodo-Model", + license="https://huggingface.co/Qodo/Qodo-Embed-1-1.5B/blob/main/LICENSE", max_tokens=32768, reference="https://huggingface.co/Qodo/Qodo-Embed-1-7B", similarity_fn_name="cosine", diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index 29df9c477e..905fb6550a 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -92,7 +92,7 @@ n_parameters=427_000_000, memory_usage_mb=1629, embed_dim=1024, - license="Not specified", + license="not specified", max_tokens=512, # best guess reference="https://huggingface.co/ai-forever/sbert_large_mt_nlu_ru", similarity_fn_name="cosine", @@ -248,7 +248,7 @@ n_parameters=1280_000_000, memory_usage_mb=4883, embed_dim=768, - license="Not specified", + license="not specified", max_tokens=512, reference="https://huggingface.co/DeepPavlov/rubert-base-cased", similarity_fn_name="cosine", @@ -273,7 +273,7 @@ n_parameters=107_000_000, memory_usage_mb=408, embed_dim=768, - license="Not specified", + license="not specified", max_tokens=512, reference="https://huggingface.co/DeepPavlov/distilrubert-small-cased-conversational", similarity_fn_name="cosine", @@ -296,7 +296,7 @@ n_parameters=107_000_000, memory_usage_mb=408, embed_dim=768, - license="Not specified", + license="not specified", max_tokens=512, reference="https://huggingface.co/DeepPavlov/rubert-base-cased-sentence", similarity_fn_name="cosine", @@ -319,7 +319,7 @@ n_parameters=129_000_000, memory_usage_mb=492, embed_dim=768, - license="Not specified", + license="not specified", max_tokens=512, reference="https://huggingface.co/cointegrated/LaBSE-en-ru", similarity_fn_name="cosine", diff --git a/mteb/overview.py b/mteb/overview.py index dcd716897f..03ad0e67ba 100644 --- a/mteb/overview.py +++ b/mteb/overview.py @@ -9,7 +9,8 @@ import pandas as pd from mteb.abstasks.AbsTask import AbsTask -from mteb.abstasks.TaskMetadata import MODALITIES, TASK_CATEGORY, TASK_DOMAIN, TASK_TYPE +from mteb.abstasks.TaskMetadata import TASK_CATEGORY, TASK_DOMAIN, TASK_TYPE +from mteb.custom_validators import MODALITIES from mteb.languages import ( ISO_TO_LANGUAGE, ISO_TO_SCRIPT, diff --git a/tests/test_overview.py b/tests/test_overview.py index 11b3c7378e..4486bc1136 100644 --- a/tests/test_overview.py +++ b/tests/test_overview.py @@ -5,7 +5,8 @@ import mteb from mteb import get_task, get_tasks from mteb.abstasks.AbsTask import AbsTask -from mteb.abstasks.TaskMetadata import MODALITIES, TASK_DOMAIN, TASK_TYPE +from mteb.abstasks.TaskMetadata import TASK_DOMAIN, TASK_TYPE +from mteb.custom_validators import MODALITIES from mteb.overview import MTEBTasks