Skip to content

Commit 034da4d

Browse files
fix: add annotation models for stella zh (#2277)
* fix: add annotation models for stella zh. Additionally fixed a few annotation errors. * format * Update mteb/models/stella_models.py Co-authored-by: Isaac Chung <chungisaac1217@gmail.com> --------- Co-authored-by: Isaac Chung <chungisaac1217@gmail.com>
1 parent 8b14281 commit 034da4d

File tree

4 files changed

+45
-7
lines changed

4 files changed

+45
-7
lines changed

mteb/models/bge_models.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
"HotpotQA-NL": ["train"], # translation not trained on
3434
"HotpotQAHardNegatives": ["train"],
3535
"T2Retrieval": ["train"],
36-
"DuReader": ["train"],
36+
"DuRetrieval": ["train"],
3737
"MMarcoReranking": ["train"],
3838
"CodeSearchNet": ["train"],
3939
# not in mteb
@@ -70,11 +70,11 @@
7070
"validation",
7171
"test",
7272
], # assumed from mlqa (question, context)
73+
"DuRetrieval": ["train"],
7374
# not in mteb
7475
# Dataset Pairs
7576
# wudao (title, passage)
7677
# cmrc2018 (query, context)
77-
# dureader (query, context)
7878
# simclue (sentence_a, sentence_b)
7979
# csl (title, abstract)
8080
# amazon_reviews_multi (title, body)
@@ -91,7 +91,7 @@
9191
bge_chinese_training_data = {
9292
# source: https://arxiv.org/pdf/2309.07597
9393
"T2Retrieval": ["train"],
94-
"DuReader": ["train"],
94+
"DuRetrieval": ["train"],
9595
"MMarcoReranking": ["train"],
9696
"CMedQAv2-reranking": ["train"],
9797
"Cmnli": ["train"],
@@ -121,7 +121,7 @@
121121
# Dataset Pairs
122122
# wudao (title, passage)
123123
# cmrc2018 (query, context)
124-
# dureader (query, context)
124+
# dureader (query, context) - DuRetrieval
125125
# simclue (sentence_a, sentence_b)
126126
# csl (title, abstract)
127127
# amazon_reviews_multi (title, body)

mteb/models/gte_models.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ def instruction_template(
265265
# Source: https://arxiv.org/pdf/2407.19669
266266
gte_multi_training_data = {
267267
"T2Retrieval": ["train"],
268-
"DuReader": ["train"],
268+
"DuRetrieval": ["train"],
269269
"MMarcoReranking": ["train"],
270270
"CMedQAv2-reranking": ["train"],
271271
"NQ-NL": ["train"], # translation not trained on

mteb/models/moka_models.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
"LCQMC": ["train"],
5555
"MIRACLReranking": ["train"],
5656
"PAWSX": ["train"],
57+
"DuRetrieval": [],
5758
# not in MTEB:
5859
# - cmrc2018
5960
# - belle_2m
@@ -67,7 +68,7 @@
6768
# - wiki_atomic_edit
6869
# - chatmed_consult
6970
# - webqa
70-
# - dureader_robust
71+
# - dureader_robust - DuRetrieval
7172
# - csl
7273
# - lawzhidao
7374
# - CINLID

mteb/models/stella_models.py

+38-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,41 @@
66
from mteb.models.instruct_wrapper import instruct_wrapper
77
from mteb.models.nvidia_models import nvidia_training_datasets
88

9+
stella_zh_datasets = {
10+
"BQ": [],
11+
"LCQMC": [],
12+
"PAWSX": [],
13+
"STS-B": [],
14+
"DuRetrieval": [],
15+
"AFQMC": [],
16+
"Cmnli": [],
17+
"Ocnli": [],
18+
}
19+
20+
# Derived from conversation:
21+
22+
# The model information in Chinese is as follows:
23+
# infgrad/stella-base-zh:based on piccolo-base-zh, using supervised data to train, the data is wudao_base_200GB[1]、m3e[2] and simclue[3]
24+
# infgrad/stella-large-zh:based on piccolo-large-zh, using supervised data to train, the data is wudao_base_200GB[1]、m3e[2] and simclue[3]
25+
# infgrad/stella-base-zh-v2:based on infgrad/stella-base-zh, using supervised data to train, the data is wudao_base_200GB[1]、m3e[2] and simclue[3]
26+
# infgrad/stella-large-zh-v2:based on infgrad/stella-large-zh, using supervised data to train, the data is wudao_base_200GB[1]、m3e[2] and simclue[3]
27+
# For infgrad/stella-mrl-large-zh-v3.5-1792d, infgrad/stella-base-zh-v3-1792d, or other models, I forgot their details, what I remember is that they are distilled models, and using skypile[4] and matrix[5].
28+
# Finally, m3e[2] and simclue[3] has a overlap with C-MTEB, specifically:
29+
# BQ
30+
# lcqmc
31+
# paws-x
32+
# dureader_robust
33+
# AFQMC
34+
# STSB
35+
# CMNLI
36+
# OCNLI
37+
# Totally 8 training datasets are also CMTEB testset.
38+
# https://www.scidb.cn/en/detail?dataSetId=c6a3fe684227415a9db8e21bac4a15ab
39+
# https://github.com/wangyuxinwhy/uniem
40+
# https://github.com/CLUEbenchmark/SimCLUE
41+
# https://huggingface.co/datasets/Skywork/SkyPile-150B
42+
# https://huggingface.co/datasets/m-a-p/Matrix
43+
944
stella_en_400M = ModelMeta(
1045
# https://huggingface.co/dunzhang/stella_en_400M_v5/discussions/21#671a6205ac1e2416090f2bf4
1146
loader=partial( # type: ignore
@@ -83,6 +118,7 @@
83118
public_training_code=None,
84119
public_training_data=None,
85120
training_datasets={
121+
**stella_zh_datasets
86122
# Not in MTEB:
87123
# - infgrad/dialogue_rewrite_llm
88124
# - infgrad/retrieval_data_llm
@@ -109,6 +145,7 @@
109145
public_training_code=None,
110146
public_training_data=None,
111147
training_datasets={
148+
**stella_zh_datasets
112149
# Not in MTEB:
113150
# - infgrad/dialogue_rewrite_llm
114151
# - infgrad/retrieval_data_llm
@@ -135,7 +172,7 @@
135172
adapted_from="dunzhang/stella-large-zh-v3-1792d",
136173
public_training_code=None,
137174
public_training_data=None,
138-
training_datasets=None, # Not specified
175+
training_datasets=stella_zh_datasets,
139176
)
140177

141178
zpoint_large_embedding_zh = ModelMeta(

0 commit comments

Comments
 (0)