|
6 | 6 | from mteb.models.instruct_wrapper import instruct_wrapper
|
7 | 7 | from mteb.models.nvidia_models import nvidia_training_datasets
|
8 | 8 |
|
# Training datasets of the Chinese Stella models that are also C-MTEB test
# sets. Per the model author's note, m3e and SimCLUE overlap with C-MTEB on
# these eight tasks.
# NOTE(review): every value is an empty list here; presumably the list would
# name the affected splits -- confirm against the ModelMeta definition.
stella_zh_datasets = {
    "BQ": [],
    "LCQMC": [],
    "PAWSX": [],
    "STS-B": [],
    "DuRetrieval": [],
    "AFQMC": [],
    "Cmnli": [],
    "Ocnli": [],
}
| 19 | + |
| 20 | +# Derived from conversation: |
| 21 | + |
| 22 | +# The model information in Chinese is as follows: |
| 23 | +# infgrad/stella-base-zh: based on piccolo-base-zh, trained on supervised data; the data is wudao_base_200GB[1], m3e[2] and simclue[3] |
| 24 | +# infgrad/stella-large-zh: based on piccolo-large-zh, trained on supervised data; the data is wudao_base_200GB[1], m3e[2] and simclue[3] |
| 25 | +# infgrad/stella-base-zh-v2: based on infgrad/stella-base-zh, trained on supervised data; the data is wudao_base_200GB[1], m3e[2] and simclue[3] |
| 26 | +# infgrad/stella-large-zh-v2: based on infgrad/stella-large-zh, trained on supervised data; the data is wudao_base_200GB[1], m3e[2] and simclue[3] |
| 27 | +# For infgrad/stella-mrl-large-zh-v3.5-1792d, infgrad/stella-base-zh-v3-1792d, and the other models, I have forgotten the details; what I remember is that they are distilled models that use skypile[4] and matrix[5]. |
| 28 | +# Finally, m3e[2] and simclue[3] overlap with C-MTEB, specifically: |
| 29 | +# BQ |
| 30 | +# lcqmc |
| 31 | +# paws-x |
| 32 | +# dureader_robust |
| 33 | +# AFQMC |
| 34 | +# STSB |
| 35 | +# CMNLI |
| 36 | +# OCNLI |
| 37 | +# In total, 8 training datasets are also C-MTEB test sets. |
| 38 | +# [1] https://www.scidb.cn/en/detail?dataSetId=c6a3fe684227415a9db8e21bac4a15ab |
| 39 | +# [2] https://github.com/wangyuxinwhy/uniem |
| 40 | +# [3] https://github.com/CLUEbenchmark/SimCLUE |
| 41 | +# [4] https://huggingface.co/datasets/Skywork/SkyPile-150B |
| 42 | +# [5] https://huggingface.co/datasets/m-a-p/Matrix |
| 43 | + |
9 | 44 | stella_en_400M = ModelMeta(
|
10 | 45 | # https://huggingface.co/dunzhang/stella_en_400M_v5/discussions/21#671a6205ac1e2416090f2bf4
|
11 | 46 | loader=partial( # type: ignore
|
|
83 | 118 | public_training_code=None,
|
84 | 119 | public_training_data=None,
|
85 | 120 | training_datasets={
|
| 121 | + **stella_zh_datasets |
86 | 122 | # Not in MTEB:
|
87 | 123 | # - infgrad/dialogue_rewrite_llm
|
88 | 124 | # - infgrad/retrieval_data_llm
|
|
109 | 145 | public_training_code=None,
|
110 | 146 | public_training_data=None,
|
111 | 147 | training_datasets={
|
| 148 | + **stella_zh_datasets |
112 | 149 | # Not in MTEB:
|
113 | 150 | # - infgrad/dialogue_rewrite_llm
|
114 | 151 | # - infgrad/retrieval_data_llm
|
|
135 | 172 | adapted_from="dunzhang/stella-large-zh-v3-1792d",
|
136 | 173 | public_training_code=None,
|
137 | 174 | public_training_data=None,
|
138 |
| - training_datasets=None, # Not specified |
| 175 | + training_datasets=stella_zh_datasets, |
139 | 176 | )
|
140 | 177 |
|
141 | 178 | zpoint_large_embedding_zh = ModelMeta(
|
|
0 commit comments