fix: Add ModelMeta license & custom validations #2293

Merged · 5 commits · Mar 9, 2025

Changes from 1 commit
1 change: 1 addition & 0 deletions mteb/abstasks/TaskMetadata.py
@@ -214,6 +214,7 @@
"cc0-1.0",
"bsd-3-clause",
"gpl-3.0",
"lgpl",
"cdla-sharing-1.0",
"mpl-2.0",
"msr-la-nc",
4 changes: 2 additions & 2 deletions mteb/model_meta.py
@@ -14,7 +14,7 @@
from pydantic import BaseModel, ConfigDict

from mteb.abstasks.AbsTask import AbsTask
-from mteb.abstasks.TaskMetadata import STR_DATE, STR_URL
+from mteb.abstasks.TaskMetadata import LICENSES, STR_DATE, STR_URL
from mteb.encoder_interface import Encoder

from .languages import ISO_LANGUAGE_SCRIPT
@@ -104,7 +104,7 @@ class ModelMeta(BaseModel):
memory_usage_mb: float | None
max_tokens: float | None
embed_dim: int | None
-license: str | None
+license: LICENSES | STR_URL | None
Collaborator:
Nice!

open_weights: bool | None
public_training_code: str | None
public_training_data: str | bool | None
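The practical effect of the annotation change: `license` no longer accepts arbitrary strings; a value must be `None`, one of the identifiers in `LICENSES`, or (via `STR_URL`) a URL pointing at the license text. A small sketch of checking values against that same union in isolation with a `TypeAdapter` — not code from this PR, and it assumes `STR_URL` rejects non-URL strings:

```python
from pydantic import TypeAdapter, ValidationError

from mteb.abstasks.TaskMetadata import LICENSES, STR_URL

# The same union the ModelMeta.license field now uses.
license_type = TypeAdapter(LICENSES | STR_URL | None)

license_type.validate_python("apache-2.0")                          # known identifier
license_type.validate_python("https://ai.google.dev/gemma/terms")   # license URL
license_type.validate_python(None)                                  # license unknown

try:
    license_type.validate_python("apache-2")  # neither an identifier nor a URL
except ValidationError as err:
    print(err)
```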
4 changes: 2 additions & 2 deletions mteb/models/bge_models.py
@@ -736,7 +736,7 @@
n_parameters=9.24 * 1e9,
memory_usage_mb=35254,
embed_dim=3584, # from old C-MTEB leaderboard
license="gemma",
license="https://ai.google.dev/gemma/terms",
max_tokens=8192, # from old C-MTEB leaderboard
reference="https://huggingface.co/BAAI/bge-multilingual-gemma2",
similarity_fn_name="cosine",
@@ -768,7 +768,7 @@
n_parameters=7.11 * 1e9,
memory_usage_mb=27125,
embed_dim=4096,
license="apache-2",
license="apache-2.0",
max_tokens=32768,
reference="https://huggingface.co/BAAI/bge-en-icl",
similarity_fn_name="cosine",
4 changes: 2 additions & 2 deletions mteb/models/gte_models.py
@@ -304,7 +304,7 @@ def instruction_template(
n_parameters=int(305 * 1e6),
memory_usage_mb=582,
embed_dim=1024,
license="apache-2",
license="apache-2.0",
max_tokens=8192,
reference="https://huggingface.co/Alibaba-NLP/gte-multilingual-base",
similarity_fn_name="cosine",
@@ -329,7 +329,7 @@ def instruction_template(
n_parameters=int(149 * 1e6),
memory_usage_mb=284,
embed_dim=768,
license="apache-2",
license="apache-2.0",
max_tokens=8192,
reference="https://huggingface.co/Alibaba-NLP/gte-modernbert-base",
similarity_fn_name="cosine",
14 changes: 7 additions & 7 deletions mteb/models/misc_models.py
@@ -1220,7 +1220,7 @@
memory_usage_mb=None,
max_tokens=None,
embed_dim=None,
license="bigscience-bloom-rail-1.0",
license="https://huggingface.co/spaces/bigscience/license",
open_weights=True,
public_training_code=None,
public_training_data=None,
@@ -1242,7 +1242,7 @@
memory_usage_mb=None,
max_tokens=None,
embed_dim=None,
license="bigscience-bloom-rail-1.0",
license="https://huggingface.co/spaces/bigscience/license",
open_weights=True,
public_training_code=None,
public_training_data=None,
@@ -1264,7 +1264,7 @@
memory_usage_mb=None,
max_tokens=None,
embed_dim=None,
license="bigscience-bloom-rail-1.0",
license="https://huggingface.co/spaces/bigscience/license",
open_weights=True,
public_training_code=None,
public_training_data=None,
@@ -1286,7 +1286,7 @@
memory_usage_mb=None,
max_tokens=None,
embed_dim=None,
license="bigscience-bloom-rail-1.0",
license="https://huggingface.co/spaces/bigscience/license",
open_weights=True,
public_training_code=None,
public_training_data=None,
@@ -1793,7 +1793,7 @@
memory_usage_mb=None, # Not visible on repo
max_tokens=512,
embed_dim=128,
license="apache-2",
license="apache-2.0",
open_weights=True,
public_training_code=None,
public_training_data=None,
@@ -1819,7 +1819,7 @@
memory_usage_mb=283,
max_tokens=1024,
embed_dim=768,
license="apache-2",
license="apache-2.0",
open_weights=True,
public_training_code=None,
public_training_data=None,
@@ -1947,7 +1947,7 @@
memory_usage_mb=None,
max_tokens=None,
embed_dim=None,
license="proprietary",
license="https://aws.amazon.com/service-terms/",
open_weights=False,
public_training_code=None,
public_training_data=None,
6 changes: 3 additions & 3 deletions mteb/models/moka_models.py
@@ -91,7 +91,7 @@
memory_usage_mb=390,
embed_dim=768,
# They don't give a specific license but commercial use is not allowed
license="unspecified-noncommercial",
license="https://huggingface.co/moka-ai/m3e-base#%F0%9F%93%9C-license",
max_tokens=512,
reference="https://huggingface.co/moka-ai/m3e-base",
similarity_fn_name="cosine",
@@ -114,7 +114,7 @@
memory_usage_mb=None, # Can't be seen on HF page
embed_dim=512,
# They don't give a specific license but commercial use is not allowed
license="unspecified-noncommercial",
license="https://huggingface.co/moka-ai/m3e-base#%F0%9F%93%9C-license",
max_tokens=512,
reference="https://huggingface.co/moka-ai/m3e-small",
similarity_fn_name="cosine",
@@ -137,7 +137,7 @@
memory_usage_mb=None, # Can't be seen on HF page
embed_dim=768,
# They don't give a specific license but commercial use is not allowed
license="unspecified-noncommercial",
license="https://huggingface.co/moka-ai/m3e-base#%F0%9F%93%9C-license",
max_tokens=512,
reference="https://huggingface.co/moka-ai/m3e-large",
similarity_fn_name="cosine",
4 changes: 2 additions & 2 deletions mteb/models/qodo_models.py
@@ -21,7 +21,7 @@
n_parameters=1_780_000_000,
memory_usage_mb=6776,
embed_dim=1536,
license="QodoAI-Open-RAIL-M",
license="https://huggingface.co/Qodo/Qodo-Embed-1-1.5B/blob/main/LICENSE",
max_tokens=32768,
reference="https://huggingface.co/Qodo/Qodo-Embed-1-1.5B",
similarity_fn_name="cosine",
@@ -52,7 +52,7 @@
n_parameters=7_613_000_000,
memory_usage_mb=29040,
embed_dim=3584,
license="Qodo-Model",
license="https://huggingface.co/Qodo/Qodo-Embed-1-1.5B/blob/main/LICENSE",
max_tokens=32768,
reference="https://huggingface.co/Qodo/Qodo-Embed-1-7B",
similarity_fn_name="cosine",
10 changes: 5 additions & 5 deletions mteb/models/ru_sentence_models.py
@@ -92,7 +92,7 @@
n_parameters=427_000_000,
memory_usage_mb=1629,
embed_dim=1024,
license="Not specified",
license="not specified",
max_tokens=512, # best guess
reference="https://huggingface.co/ai-forever/sbert_large_mt_nlu_ru",
similarity_fn_name="cosine",
@@ -248,7 +248,7 @@
n_parameters=1280_000_000,
memory_usage_mb=4883,
embed_dim=768,
license="Not specified",
license="not specified",
max_tokens=512,
reference="https://huggingface.co/DeepPavlov/rubert-base-cased",
similarity_fn_name="cosine",
@@ -273,7 +273,7 @@
n_parameters=107_000_000,
memory_usage_mb=408,
embed_dim=768,
license="Not specified",
license="not specified",
max_tokens=512,
reference="https://huggingface.co/DeepPavlov/distilrubert-small-cased-conversational",
similarity_fn_name="cosine",
@@ -296,7 +296,7 @@
n_parameters=107_000_000,
memory_usage_mb=408,
embed_dim=768,
license="Not specified",
license="not specified",
max_tokens=512,
reference="https://huggingface.co/DeepPavlov/rubert-base-cased-sentence",
similarity_fn_name="cosine",
@@ -319,7 +319,7 @@
n_parameters=129_000_000,
memory_usage_mb=492,
embed_dim=768,
license="Not specified",
license="not specified",
max_tokens=512,
reference="https://huggingface.co/cointegrated/LaBSE-en-ru",
similarity_fn_name="cosine",