Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

docs: add annotation models for stella zh #2277

Merged
merged 4 commits into from
Mar 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions mteb/models/bge_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
"HotpotQA-NL": ["train"], # translation not trained on
"HotpotQAHardNegatives": ["train"],
"T2Retrieval": ["train"],
"DuReader": ["train"],
"DuRetrieval": ["train"],
"MMarcoReranking": ["train"],
"CodeSearchNet": ["train"],
# not in mteb
Expand Down Expand Up @@ -70,11 +70,11 @@
"validation",
"test",
], # assumed from mlqa (question, context)
"DuRetrieval": ["train"],
# not in mteb
# Dataset Pairs
# wudao (title, passage)
# cmrc2018 (query, context)
# dureader (query, context)
# simclue (sentence_a, sentence_b)
# csl (title, abstract)
# amazon_reviews_multi (title, body)
Expand All @@ -91,7 +91,7 @@
bge_chinese_training_data = {
# source: https://arxiv.org/pdf/2309.07597
"T2Retrieval": ["train"],
"DuReader": ["train"],
"DuRetrieval": ["train"],
"MMarcoReranking": ["train"],
"CMedQAv2-reranking": ["train"],
"Cmnli": ["train"],
Expand Down Expand Up @@ -121,7 +121,7 @@
# Dataset Pairs
# wudao (title, passage)
# cmrc2018 (query, context)
# dureader (query, context)
# dureader (query, context) - DuRetrieval
# simclue (sentence_a, sentence_b)
# csl (title, abstract)
# amazon_reviews_multi (title, body)
Expand Down
2 changes: 1 addition & 1 deletion mteb/models/gte_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ def instruction_template(
# Source: https://arxiv.org/pdf/2407.19669
gte_multi_training_data = {
"T2Retrieval": ["train"],
"DuReader": ["train"],
"DuRetrieval": ["train"],
"MMarcoReranking": ["train"],
"CMedQAv2-reranking": ["train"],
"NQ-NL": ["train"], # translation not trained on
Expand Down
3 changes: 2 additions & 1 deletion mteb/models/moka_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
"LCQMC": ["train"],
"MIRACLReranking": ["train"],
"PAWSX": ["train"],
"DuRetrieval": [],
# not in MTEB:
# - cmrc2018
# - belle_2m
Expand All @@ -67,7 +68,7 @@
# - wiki_atomic_edit
# - chatmed_consult
# - webqa
# - dureader_robust
# - dureader_robust - DuRetrieval
# - csl
# - lawzhidao
# - CINLID
Expand Down
2 changes: 1 addition & 1 deletion mteb/models/overview.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from mteb.model_meta import ModelMeta
from mteb.models import (
align_models,
ara_models,
arctic_models,
bedrock_models,
bge_models,
Expand Down Expand Up @@ -74,7 +75,6 @@
vlm2vec_models,
voyage_models,
voyage_v,
ara_models,
)

logger = logging.getLogger(__name__)
Expand Down
39 changes: 38 additions & 1 deletion mteb/models/stella_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,41 @@
from mteb.models.instruct_wrapper import instruct_wrapper
from mteb.models.nvidia_models import nvidia_training_datasets

stella_zh_datasets = {
"BQ": [],
"LCQMC": [],
"PAWSX": [],
"STS-B": [],
"DuRetrieval": [],
"AFQMC": [],
"Cmnli": [],
"Ocnli": [],
}

# Derived from conversation:

# The model information in Chinese is as follows:
# infgrad/stella-base-zh:based on piccolo-base-zh, using supervised data to train, the data is wudao_base_200GB[1]、m3e[2] and simclue[3]
# infgrad/stella-large-zh:based on piccolo-large-zh, using supervised data to train, the data is wudao_base_200GB[1]、m3e[2] and simclue[3]
# infgrad/stella-base-zh-v2:based on infgrad/stella-base-zh, using supervised data to train, the data is wudao_base_200GB[1]、m3e[2] and simclue[3]
# infgrad/stella-large-zh-v2:based on infgrad/stella-large-zh, using supervised data to train, the data is wudao_base_200GB[1]、m3e[2] and simclue[3]
# For infgrad/stella-mrl-large-zh-v3.5-1792d, infgrad/stella-base-zh-v3-1792d, and other models, I forgot their details; what I remember is that they are distilled models trained using skypile[4] and matrix[5].
# Finally, m3e[2] and simclue[3] have an overlap with C-MTEB, specifically:
# BQ
# lcqmc
# paws-x
# dureader_robust
# AFQMC
# STSB
# CMNLI
# OCNLI
# In total, 8 of the training datasets are also part of the C-MTEB test set.
# https://www.scidb.cn/en/detail?dataSetId=c6a3fe684227415a9db8e21bac4a15ab
# https://github.com/wangyuxinwhy/uniem
# https://github.com/CLUEbenchmark/SimCLUE
# https://huggingface.co/datasets/Skywork/SkyPile-150B
# https://huggingface.co/datasets/m-a-p/Matrix

stella_en_400M = ModelMeta(
# https://huggingface.co/dunzhang/stella_en_400M_v5/discussions/21#671a6205ac1e2416090f2bf4
loader=partial( # type: ignore
Expand Down Expand Up @@ -83,6 +118,7 @@
public_training_code=None,
public_training_data=None,
training_datasets={
**stella_zh_datasets
# Not in MTEB:
# - infgrad/dialogue_rewrite_llm
# - infgrad/retrieval_data_llm
Expand All @@ -109,6 +145,7 @@
public_training_code=None,
public_training_data=None,
training_datasets={
**stella_zh_datasets
# Not in MTEB:
# - infgrad/dialogue_rewrite_llm
# - infgrad/retrieval_data_llm
Expand All @@ -135,7 +172,7 @@
adapted_from="dunzhang/stella-large-zh-v3-1792d",
public_training_code=None,
public_training_data=None,
training_datasets=None, # Not specified
training_datasets=stella_zh_datasets,
)

zpoint_large_embedding_zh = ModelMeta(
Expand Down