Skip to content

Commit 034da4d

Browse files
fix: add annotation models for stella zh (#2277)
* fix: add annotation models for stella zh. Additionally fixed a few annotation errors. * format * Update mteb/models/stella_models.py Co-authored-by: Isaac Chung <chungisaac1217@gmail.com> --------- Co-authored-by: Isaac Chung <chungisaac1217@gmail.com>
1 parent 8b14281 commit 034da4d

File tree

4 files changed

+45
-7
lines changed

4 files changed

+45
-7
lines changed

mteb/models/bge_models.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
"HotpotQA-NL": ["train"], # translation not trained on
3434
"HotpotQAHardNegatives": ["train"],
3535
"T2Retrieval": ["train"],
36-
"DuReader": ["train"],
36+
"DuRetrieval": ["train"],
3737
"MMarcoReranking": ["train"],
3838
"CodeSearchNet": ["train"],
3939
# not in mteb
@@ -70,11 +70,11 @@
7070
"validation",
7171
"test",
7272
], # assumed from mlqa (question, context)
73+
"DuRetrieval": ["train"],
7374
# not in mteb
7475
# Dataset Pairs
7576
# wudao (title, passage)
7677
# cmrc2018 (query, context)
77-
# dureader (query, context)
7878
# simclue (sentence_a, sentence_b)
7979
# csl (title, abstract)
8080
# amazon_reviews_multi (title, body)
@@ -91,7 +91,7 @@
9191
bge_chinese_training_data = {
9292
# source: https://arxiv.org/pdf/2309.07597
9393
"T2Retrieval": ["train"],
94-
"DuReader": ["train"],
94+
"DuRetrieval": ["train"],
9595
"MMarcoReranking": ["train"],
9696
"CMedQAv2-reranking": ["train"],
9797
"Cmnli": ["train"],
@@ -121,7 +121,7 @@
121121
# Dataset Pairs
122122
# wudao (title, passage)
123123
# cmrc2018 (query, context)
124-
# dureader (query, context)
124+
# dureader (query, context) - DuRetrieval
125125
# simclue (sentence_a, sentence_b)
126126
# csl (title, abstract)
127127
# amazon_reviews_multi (title, body)

mteb/models/gte_models.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ def instruction_template(
265265
# Source: https://arxiv.org/pdf/2407.19669
266266
gte_multi_training_data = {
267267
"T2Retrieval": ["train"],
268-
"DuReader": ["train"],
268+
"DuRetrieval": ["train"],
269269
"MMarcoReranking": ["train"],
270270
"CMedQAv2-reranking": ["train"],
271271
"NQ-NL": ["train"], # translation not trained on

mteb/models/moka_models.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
"LCQMC": ["train"],
5555
"MIRACLReranking": ["train"],
5656
"PAWSX": ["train"],
57+
"DuRetrieval": [],
5758
# not in MTEB:
5859
# - cmrc2018
5960
# - belle_2m
@@ -67,7 +68,7 @@
6768
# - wiki_atomic_edit
6869
# - chatmed_consult
6970
# - webqa
70-
# - dureader_robust
71+
# - dureader_robust - DuRetrieval
7172
# - csl
7273
# - lawzhidao
7374
# - CINLID

mteb/models/stella_models.py

+38-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,41 @@
66
from mteb.models.instruct_wrapper import instruct_wrapper
77
from mteb.models.nvidia_models import nvidia_training_datasets
88

9+
stella_zh_datasets = {
10+
"BQ": [],
11+
"LCQMC": [],
12+
"PAWSX": [],
13+
"STS-B": [],
14+
"DuRetrieval": [],
15+
"AFQMC": [],
16+
"Cmnli": [],
17+
"Ocnli": [],
18+
}
19+
20+
# Derived from conversation:
21+
22+
# The model information in Chinese is as follows:
23+
# infgrad/stella-base-zh:based on piccolo-base-zh, using supervised data to train, the data is wudao_base_200GB[1]、m3e[2] and simclue[3]
24+
# infgrad/stella-large-zh:based on piccolo-large-zh, using supervised data to train, the data is wudao_base_200GB[1]、m3e[2] and simclue[3]
25+
# infgrad/stella-base-zh-v2:based on infgrad/stella-base-zh, using supervised data to train, the data is wudao_base_200GB[1]、m3e[2] and simclue[3]
26+
# infgrad/stella-large-zh-v2:based on infgrad/stella-large-zh, using supervised data to train, the data is wudao_base_200GB[1]、m3e[2] and simclue[3]
27+
# For infgrad/stella-mrl-large-zh-v3.5-1792d, infgrad/stella-base-zh-v3-1792d, or other models, I forgot their details, what I remember is that they are distilled models, and using skypile[4] and matrix[5].
28+
# Finally, m3e[2] and simclue[3] has a overlap with C-MTEB, specifically:
29+
# BQ
30+
# lcqmc
31+
# paws-x
32+
# dureader_robust
33+
# AFQMC
34+
# STSB
35+
# CMNLI
36+
# OCNLI
37+
# Totally 8 training datasets are also CMTEB testset.
38+
# https://www.scidb.cn/en/detail?dataSetId=c6a3fe684227415a9db8e21bac4a15ab
39+
# https://github.com/wangyuxinwhy/uniem
40+
# https://github.com/CLUEbenchmark/SimCLUE
41+
# https://huggingface.co/datasets/Skywork/SkyPile-150B
42+
# https://huggingface.co/datasets/m-a-p/Matrix
43+
944
stella_en_400M = ModelMeta(
1045
# https://huggingface.co/dunzhang/stella_en_400M_v5/discussions/21#671a6205ac1e2416090f2bf4
1146
loader=partial( # type: ignore
@@ -83,6 +118,7 @@
83118
public_training_code=None,
84119
public_training_data=None,
85120
training_datasets={
121+
**stella_zh_datasets
86122
# Not in MTEB:
87123
# - infgrad/dialogue_rewrite_llm
88124
# - infgrad/retrieval_data_llm
@@ -109,6 +145,7 @@
109145
public_training_code=None,
110146
public_training_data=None,
111147
training_datasets={
148+
**stella_zh_datasets
112149
# Not in MTEB:
113150
# - infgrad/dialogue_rewrite_llm
114151
# - infgrad/retrieval_data_llm
@@ -135,7 +172,7 @@
135172
adapted_from="dunzhang/stella-large-zh-v3-1792d",
136173
public_training_code=None,
137174
public_training_data=None,
138-
training_datasets=None, # Not specified
175+
training_datasets=stella_zh_datasets,
139176
)
140177

141178
zpoint_large_embedding_zh = ModelMeta(

0 commit comments

Comments
 (0)