From f913ee5a5b92323266cfccf0691292d541bc58ac Mon Sep 17 00:00:00 2001 From: "shengzhe.li" Date: Mon, 9 Sep 2024 23:23:20 +0900 Subject: [PATCH 1/7] Add ruri-large/base/small to leaderboard --- docs/results/cl-nagoya/ruri-base/summary.json | 62 +++++++++++++++++++ .../results/cl-nagoya/ruri-large/summary.json | 62 +++++++++++++++++++ .../results/cl-nagoya/ruri-small/summary.json | 62 +++++++++++++++++++ leaderboard.md | 35 ++++++++--- 4 files changed, 214 insertions(+), 7 deletions(-) create mode 100644 docs/results/cl-nagoya/ruri-base/summary.json create mode 100644 docs/results/cl-nagoya/ruri-large/summary.json create mode 100644 docs/results/cl-nagoya/ruri-small/summary.json diff --git a/docs/results/cl-nagoya/ruri-base/summary.json b/docs/results/cl-nagoya/ruri-base/summary.json new file mode 100644 index 0000000..a7c7b05 --- /dev/null +++ b/docs/results/cl-nagoya/ruri-base/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7665550732749669 + }, + "amazon_review_classification": { + "macro_f1": 0.5575876111411316 + }, + "massive_intent_classification": { + "macro_f1": 0.8141210121425055 + }, + "massive_scenario_classification": { + "macro_f1": 0.8848812917656395 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9290942178703699 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.7455660589538348 + }, + "jaqket": { + "ndcg@10": 0.5012253145754781 + }, + "mrtydi": { + "ndcg@10": 0.3545113073009125 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.8689204088388403 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.9656989703684407 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.7531306059721564 + } + }, + "STS": { + "jsick": { + "spearman": 0.8231772134744029 + }, + "jsts": { + "spearman": 0.8342848039994751 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5427223607801758 + }, + "mewsc16": { + "v_measure_score": 0.5404099864321413 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6237623762376238 + } + } +} \ No newline at end of file diff --git a/docs/results/cl-nagoya/ruri-large/summary.json b/docs/results/cl-nagoya/ruri-large/summary.json new file mode 100644 index 0000000..e86c46b --- /dev/null +++ b/docs/results/cl-nagoya/ruri-large/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.8080806321853091 + }, + "amazon_review_classification": { + "macro_f1": 0.5680171450057119 + }, + "massive_intent_classification": { + "macro_f1": 0.8255898596881264 + }, + "massive_scenario_classification": { + "macro_f1": 0.8956410349938264 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9298524733536755 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.7667506664925435 + }, + "jaqket": { + "ndcg@10": 0.6173871224245404 + }, + "mrtydi": { + "ndcg@10": 0.3803302462897418 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.8712459719069233 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.9657898747088243 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.779665053945222 + } + }, + "STS": { + "jsick": { + "spearman": 0.8199959693684533 + }, + "jsts": { + "spearman": 0.8426164139167538 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5139491572866559 + }, + "mewsc16": { + "v_measure_score": 0.5225025331595674 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6228813559322034 + } + } +} \ No newline at end of file diff --git a/docs/results/cl-nagoya/ruri-small/summary.json 
b/docs/results/cl-nagoya/ruri-small/summary.json new file mode 100644 index 0000000..cb591ea --- /dev/null +++ b/docs/results/cl-nagoya/ruri-small/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7991935990685706 + }, + "amazon_review_classification": { + "macro_f1": 0.556129066893332 + }, + "massive_intent_classification": { + "macro_f1": 0.8148895285345188 + }, + "massive_scenario_classification": { + "macro_f1": 0.8787774569382543 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9300177985352138 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.736494039429321 + }, + "jaqket": { + "ndcg@10": 0.484437639428696 + }, + "mrtydi": { + "ndcg@10": 0.3342716158897666 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.8768878489670099 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.9716879343439146 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.7608660955794895 + } + }, + "STS": { + "jsick": { + "spearman": 0.8343927017558587 + }, + "jsts": { + "spearman": 0.8213297790184827 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5096442244018489 + }, + "mewsc16": { + "v_measure_score": 0.5141045788711239 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6211267605633802 + } + } +} \ No newline at end of file diff --git a/leaderboard.md b/leaderboard.md index b07dbca..93d3988 100644 --- a/leaderboard.md +++ b/leaderboard.md @@ -7,7 +7,10 @@ The summary shows the average scores within each task. | Model | Avg. | Retrieval | STS | Classification | Reranking | Clustering | PairClassification | |:----------------------------------------------|:----------|:------------|:----------|:-----------------|:------------|:-------------|:---------------------| -| OpenAI/text-embedding-3-large | **73.97** | **74.48** | 82.52 | **77.58** | **93.58** | **53.32** | 62.35 | +| OpenAI/text-embedding-3-large | **73.97** | **74.48** | 82.52 | **77.58** | **93.58** | 53.32 | 62.35 | +| cl-nagoya/ruri-large | 73.45 | 73.02 | 83.13 | 77.43 | 92.99 | 51.82 | 62.29 | +| cl-nagoya/ruri-base | 72.95 | 69.82 | 82.87 | 75.58 | 92.91 | **54.16** | 62.38 | +| cl-nagoya/ruri-small | 72.45 | 69.41 | 82.79 | 76.22 | 93.00 | 51.19 | 62.11 | | intfloat/multilingual-e5-large | 71.65 | 70.98 | 79.70 | 72.89 | 92.96 | 51.24 | 62.15 | | OpenAI/text-embedding-3-small | 70.86 | 66.39 | 79.46 | 73.06 | 92.92 | 51.06 | 62.27 | | pkshatech/GLuCoSE-base-ja | 70.44 | 59.02 | 78.71 | 76.82 | 91.90 | 49.78 | **66.39** | @@ -33,8 +36,11 @@ The summary shows the average scores within each task. ## Retrieval | Model | Avg. | jagovfaqs_22k
(ndcg@10) | jaqket<br>(ndcg@10) | mrtydi<br>(ndcg@10) | nlp_journal_abs_intro<br>(ndcg@10) | nlp_journal_title_abs<br>(ndcg@10) | nlp_journal_title_intro<br>
(ndcg@10) | |:----------------------------------------------|:----------|:-----------------------------|:----------------------|:----------------------|:-------------------------------------|:-------------------------------------|:---------------------------------------| -| OpenAI/text-embedding-3-large | **74.48** | **72.41** | 48.21 | 34.88 | **99.33** | **96.55** | **95.47** | -| intfloat/multilingual-e5-large | 70.98 | 70.30 | **58.78** | **43.63** | 86.00 | 94.70 | 72.48 | +| OpenAI/text-embedding-3-large | **74.48** | 72.41 | 48.21 | 34.88 | **99.33** | 96.55 | **95.47** | +| cl-nagoya/ruri-large | 73.02 | **76.68** | **61.74** | 38.03 | 87.12 | 96.58 | 77.97 | +| intfloat/multilingual-e5-large | 70.98 | 70.30 | 58.78 | **43.63** | 86.00 | 94.70 | 72.48 | +| cl-nagoya/ruri-base | 69.82 | 74.56 | 50.12 | 35.45 | 86.89 | 96.57 | 75.31 | +| cl-nagoya/ruri-small | 69.41 | 73.65 | 48.44 | 33.43 | 87.69 | **97.17** | 76.09 | | intfloat/multilingual-e5-base | 68.21 | 65.34 | 50.67 | 38.38 | 87.10 | 94.73 | 73.05 | | intfloat/multilingual-e5-small | 67.27 | 64.11 | 49.97 | 36.05 | 85.21 | 95.26 | 72.99 | | OpenAI/text-embedding-3-small | 66.39 | 64.02 | 33.94 | 20.03 | 98.47 | 91.70 | 90.17 | @@ -60,7 +66,10 @@ The summary shows the average scores within each task. | Model | Avg. | jsick
(spearman) | jsts
(spearman) | |:----------------------------------------------|:----------|:----------------------|:---------------------| | cl-nagoya/sup-simcse-ja-large | **83.18** | **83.80** | 82.57 | -| OpenAI/text-embedding-3-large | 82.52 | 81.27 | **83.77** | +| cl-nagoya/ruri-large | 83.13 | 82.00 | **84.26** | +| cl-nagoya/ruri-base | 82.87 | 82.32 | 83.43 | +| cl-nagoya/ruri-small | 82.79 | 83.44 | 82.13 | +| OpenAI/text-embedding-3-large | 82.52 | 81.27 | 83.77 | | cl-nagoya/sup-simcse-ja-base | 82.05 | 82.83 | 81.27 | | cl-nagoya/unsup-simcse-ja-large | 80.56 | 80.15 | 80.98 | | intfloat/multilingual-e5-small | 80.07 | 81.50 | 78.65 | @@ -85,9 +94,12 @@ The summary shows the average scores within each task. ## Classification | Model | Avg. | amazon_counterfactual
(macro_f1) | amazon_review<br>(macro_f1) | massive_intent<br>(macro_f1) | massive_scenario<br>
(macro_f1) | |:----------------------------------------------|:----------|:--------------------------------------|:------------------------------|:-------------------------------|:---------------------------------| -| OpenAI/text-embedding-3-large | **77.58** | 77.90 | **60.44** | **80.91** | **91.08** | +| OpenAI/text-embedding-3-large | **77.58** | 77.90 | **60.44** | 80.91 | **91.08** | +| cl-nagoya/ruri-large | 77.43 | 80.81 | 56.80 | **82.56** | 89.56 | | pkshatech/GLuCoSE-base-ja | 76.82 | **82.44** | 58.07 | 78.85 | 87.94 | | oshizo/sbert-jsnli-luke-japanese-base-lite | 76.61 | 79.95 | 57.48 | 80.26 | 88.75 | +| cl-nagoya/ruri-small | 76.22 | 79.92 | 55.61 | 81.49 | 87.88 | +| cl-nagoya/ruri-base | 75.58 | 76.66 | 55.76 | 81.41 | 88.49 | | cl-nagoya/unsup-simcse-ja-large | 74.66 | 76.79 | 55.37 | 79.13 | 87.36 | | MU-Kindai/Japanese-DiffCSE-BERT-base | 73.77 | 78.10 | 51.56 | 78.79 | 86.63 | | cl-nagoya/sup-simcse-ja-large | 73.73 | 73.21 | 54.76 | 79.23 | 87.72 | @@ -114,8 +126,11 @@ The summary shows the average scores within each task. | OpenAI/text-embedding-3-large | **93.58** | **93.58** | | OpenAI/text-embedding-ada-002 | 93.04 | 93.04 | | intfloat/multilingual-e5-small | 93.03 | 93.03 | +| cl-nagoya/ruri-small | 93.00 | 93.00 | +| cl-nagoya/ruri-large | 92.99 | 92.99 | | intfloat/multilingual-e5-large | 92.96 | 92.96 | | OpenAI/text-embedding-3-small | 92.92 | 92.92 | +| cl-nagoya/ruri-base | 92.91 | 92.91 | | intfloat/multilingual-e5-base | 92.85 | 92.85 | | pkshatech/GLuCoSE-base-ja | 91.90 | 91.90 | | cl-nagoya/sup-simcse-ja-base | 91.83 | 91.83 | @@ -137,12 +152,15 @@ The summary shows the average scores within each task. ## Clustering | Model | Avg. | livedoor_news
(v_measure_score) | mewsc16
(v_measure_score) | |:----------------------------------------------|:----------|:-------------------------------------|:-------------------------------| -| OpenAI/text-embedding-3-large | **53.32** | 57.09 | 49.55 | +| cl-nagoya/ruri-base | **54.16** | 54.27 | **54.04** | +| OpenAI/text-embedding-3-large | 53.32 | 57.09 | 49.55 | +| cl-nagoya/ruri-large | 51.82 | 51.39 | 52.25 | | cl-nagoya/sup-simcse-ja-base | 51.79 | 52.67 | 50.91 | | intfloat/multilingual-e5-large | 51.24 | **57.13** | 45.34 | +| cl-nagoya/ruri-small | 51.19 | 50.96 | 51.41 | | OpenAI/text-embedding-3-small | 51.06 | 54.57 | 47.55 | | cl-nagoya/sup-simcse-ja-large | 50.56 | 50.75 | 50.38 | -| oshizo/sbert-jsnli-luke-japanese-base-lite | 50.33 | 46.77 | **53.89** | +| oshizo/sbert-jsnli-luke-japanese-base-lite | 50.33 | 46.77 | 53.89 | | pkshatech/GLuCoSE-base-ja | 49.78 | 49.89 | 49.68 | | cl-nagoya/unsup-simcse-ja-large | 48.41 | 50.90 | 45.92 | | OpenAI/text-embedding-ada-002 | 48.30 | 49.67 | 46.92 | @@ -171,6 +189,7 @@ The summary shows the average scores within each task. | pkshatech/simcse-ja-bert-base-clcmlp | 62.40 | 62.40 | | OpenAI/text-embedding-ada-002 | 62.40 | 62.40 | | MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 62.38 | 62.38 | +| cl-nagoya/ruri-base | 62.38 | 62.38 | | oshizo/sbert-jsnli-luke-japanese-base-lite | 62.38 | 62.38 | | MU-Kindai/Japanese-DiffCSE-BERT-base | 62.38 | 62.38 | | MU-Kindai/Japanese-SimCSE-BERT-base-sup | 62.37 | 62.37 | @@ -179,10 +198,12 @@ The summary shows the average scores within each task. | MU-Kindai/Japanese-MixCSE-BERT-base | 62.33 | 62.33 | | sentence-transformers/LaBSE | 62.33 | 62.33 | | colorfulscoop/sbert-base-ja | 62.31 | 62.31 | +| cl-nagoya/ruri-large | 62.29 | 62.29 | | OpenAI/text-embedding-3-small | 62.27 | 62.27 | | MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 62.27 | 62.27 | | intfloat/multilingual-e5-base | 62.26 | 62.26 | | sentence-transformers/stsb-xlm-r-multilingual | 62.20 | 62.20 | | intfloat/multilingual-e5-small | 62.19 | 62.19 | | intfloat/multilingual-e5-large | 62.15 | 62.15 | +| cl-nagoya/ruri-small | 62.11 | 62.11 | From 73a304dcc36b5a1aa13ec9ca7778dc83d6e5acb6 Mon Sep 17 00:00:00 2001 From: "shengzhe.li" Date: Tue, 10 Sep 2024 21:24:22 +0900 Subject: [PATCH 2/7] Add pkshatech/RoSEtta-base-ja and pkshatech/GLuCoSE-base-ja-v2 to leaderboard --- .../pkshatech/GLuCoSE-base-ja-v2/summary.json | 62 +++++++++++++++++++ .../pkshatech/RoSEtta-base-ja/summary.json | 62 +++++++++++++++++++ leaderboard.md | 20 +++++- 3 files changed, 141 insertions(+), 3 deletions(-) create mode 100644 docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json create mode 100644 docs/results/pkshatech/RoSEtta-base-ja/summary.json diff --git a/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json b/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json new file mode 100644 index 0000000..60223bc --- /dev/null +++ b/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7528271196943096 + }, + "amazon_review_classification": { + "macro_f1": 0.5561679575066396 + }, + "massive_intent_classification": { + "macro_f1": 0.8058990735631814 + }, + "massive_scenario_classification": { + "macro_f1": 0.8729457394926279 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9289703513027785 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.6842208748694516 + }, + "jaqket": { + "ndcg@10": 0.666162910609933 + }, + "mrtydi": { + "ndcg@10": 0.3679312414893066 + }, + 
"nlp_journal_abs_intro": { + "ndcg@10": 0.8961561684616985 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.9465973412523236 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.7514787290834406 + } + }, + "STS": { + "jsick": { + "spearman": 0.8499279029619572 + }, + "jsts": { + "spearman": 0.8150603412605322 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5165568486237136 + }, + "mewsc16": { + "v_measure_score": 0.4970285237567235 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6239830208701804 + } + } +} \ No newline at end of file diff --git a/docs/results/pkshatech/RoSEtta-base-ja/summary.json b/docs/results/pkshatech/RoSEtta-base-ja/summary.json new file mode 100644 index 0000000..5025c4d --- /dev/null +++ b/docs/results/pkshatech/RoSEtta-base-ja/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7006688790331752 + }, + "amazon_review_classification": { + "macro_f1": 0.5299983831023539 + }, + "massive_intent_classification": { + "macro_f1": 0.7952268533717546 + }, + "massive_scenario_classification": { + "macro_f1": 0.869707847800633 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9267539503767978 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.6379929234552755 + }, + "jaqket": { + "ndcg@10": 0.6533570255483011 + }, + "mrtydi": { + "ndcg@10": 0.3407337609040446 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.9577227924391506 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.9282272189004226 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.7938878816204916 + } + }, + "STS": { + "jsick": { + "spearman": 0.8302539464008364 + }, + "jsts": { + "spearman": 0.7961383132420531 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5503116157834466 + }, + "mewsc16": { + "v_measure_score": 0.389105324755125 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6218727662616155 + } + } +} \ No newline at end of file diff --git a/leaderboard.md b/leaderboard.md index 93d3988..b41c49c 100644 --- a/leaderboard.md +++ b/leaderboard.md @@ -10,8 +10,10 @@ The summary shows the average scores within each task. | OpenAI/text-embedding-3-large | **73.97** | **74.48** | 82.52 | **77.58** | **93.58** | 53.32 | 62.35 | | cl-nagoya/ruri-large | 73.45 | 73.02 | 83.13 | 77.43 | 92.99 | 51.82 | 62.29 | | cl-nagoya/ruri-base | 72.95 | 69.82 | 82.87 | 75.58 | 92.91 | **54.16** | 62.38 | +| pkshatech/GLuCoSE-base-ja-v2 | 72.63 | 71.88 | **83.25** | 74.70 | 92.90 | 50.68 | 62.40 | | cl-nagoya/ruri-small | 72.45 | 69.41 | 82.79 | 76.22 | 93.00 | 51.19 | 62.11 | | intfloat/multilingual-e5-large | 71.65 | 70.98 | 79.70 | 72.89 | 92.96 | 51.24 | 62.15 | +| pkshatech/RoSEtta-base-ja | 71.23 | 71.87 | 81.32 | 72.39 | 92.68 | 46.97 | 62.19 | | OpenAI/text-embedding-3-small | 70.86 | 66.39 | 79.46 | 73.06 | 92.92 | 51.06 | 62.27 | | pkshatech/GLuCoSE-base-ja | 70.44 | 59.02 | 78.71 | 76.82 | 91.90 | 49.78 | **66.39** | | intfloat/multilingual-e5-base | 70.12 | 68.21 | 79.84 | 69.30 | 92.85 | 48.26 | 62.26 | @@ -20,7 +22,7 @@ The summary shows the average scores within each task. 
| cl-nagoya/sup-simcse-ja-base | 68.56 | 49.64 | 82.05 | 73.47 | 91.83 | 51.79 | 62.57 | | MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 66.89 | 47.38 | 78.99 | 73.13 | 91.30 | 48.25 | 62.27 | | oshizo/sbert-jsnli-luke-japanese-base-lite | 66.75 | 43.00 | 76.60 | 76.61 | 91.56 | 50.33 | 62.38 | -| cl-nagoya/sup-simcse-ja-large | 66.51 | 37.62 | **83.18** | 73.73 | 91.48 | 50.56 | 62.51 | +| cl-nagoya/sup-simcse-ja-large | 66.51 | 37.62 | 83.18 | 73.73 | 91.48 | 50.56 | 62.51 | | cl-nagoya/unsup-simcse-ja-large | 66.27 | 40.53 | 80.56 | 74.66 | 90.95 | 48.41 | 62.49 | | MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 66.23 | 46.36 | 77.49 | 73.30 | 91.16 | 46.68 | 62.38 | | MU-Kindai/Japanese-SimCSE-BERT-large-sup | 65.28 | 40.82 | 78.28 | 73.47 | 90.95 | 45.81 | 62.35 | @@ -37,7 +39,9 @@ The summary shows the average scores within each task. | Model | Avg. | jagovfaqs_22k
(ndcg@10) | jaqket<br>(ndcg@10) | mrtydi<br>(ndcg@10) | nlp_journal_abs_intro<br>(ndcg@10) | nlp_journal_title_abs<br>(ndcg@10) | nlp_journal_title_intro<br>
(ndcg@10) | |:----------------------------------------------|:----------|:-----------------------------|:----------------------|:----------------------|:-------------------------------------|:-------------------------------------|:---------------------------------------| | OpenAI/text-embedding-3-large | **74.48** | 72.41 | 48.21 | 34.88 | **99.33** | 96.55 | **95.47** | -| cl-nagoya/ruri-large | 73.02 | **76.68** | **61.74** | 38.03 | 87.12 | 96.58 | 77.97 | +| cl-nagoya/ruri-large | 73.02 | **76.68** | 61.74 | 38.03 | 87.12 | 96.58 | 77.97 | +| pkshatech/GLuCoSE-base-ja-v2 | 71.88 | 68.42 | **66.62** | 36.79 | 89.62 | 94.66 | 75.15 | +| pkshatech/RoSEtta-base-ja | 71.87 | 63.80 | 65.34 | 34.07 | 95.77 | 92.82 | 79.39 | | intfloat/multilingual-e5-large | 70.98 | 70.30 | 58.78 | **43.63** | 86.00 | 94.70 | 72.48 | | cl-nagoya/ruri-base | 69.82 | 74.56 | 50.12 | 35.45 | 86.89 | 96.57 | 75.31 | | cl-nagoya/ruri-small | 69.41 | 73.65 | 48.44 | 33.43 | 87.69 | **97.17** | 76.09 | @@ -65,12 +69,14 @@ The summary shows the average scores within each task. ## STS | Model | Avg. | jsick
(spearman) | jsts
(spearman) | |:----------------------------------------------|:----------|:----------------------|:---------------------| -| cl-nagoya/sup-simcse-ja-large | **83.18** | **83.80** | 82.57 | +| pkshatech/GLuCoSE-base-ja-v2 | **83.25** | **84.99** | 81.51 | +| cl-nagoya/sup-simcse-ja-large | 83.18 | 83.80 | 82.57 | | cl-nagoya/ruri-large | 83.13 | 82.00 | **84.26** | | cl-nagoya/ruri-base | 82.87 | 82.32 | 83.43 | | cl-nagoya/ruri-small | 82.79 | 83.44 | 82.13 | | OpenAI/text-embedding-3-large | 82.52 | 81.27 | 83.77 | | cl-nagoya/sup-simcse-ja-base | 82.05 | 82.83 | 81.27 | +| pkshatech/RoSEtta-base-ja | 81.32 | 83.03 | 79.61 | | cl-nagoya/unsup-simcse-ja-large | 80.56 | 80.15 | 80.98 | | intfloat/multilingual-e5-small | 80.07 | 81.50 | 78.65 | | intfloat/multilingual-e5-base | 79.84 | 81.28 | 78.39 | @@ -100,6 +106,7 @@ The summary shows the average scores within each task. | oshizo/sbert-jsnli-luke-japanese-base-lite | 76.61 | 79.95 | 57.48 | 80.26 | 88.75 | | cl-nagoya/ruri-small | 76.22 | 79.92 | 55.61 | 81.49 | 87.88 | | cl-nagoya/ruri-base | 75.58 | 76.66 | 55.76 | 81.41 | 88.49 | +| pkshatech/GLuCoSE-base-ja-v2 | 74.70 | 75.28 | 55.62 | 80.59 | 87.29 | | cl-nagoya/unsup-simcse-ja-large | 74.66 | 76.79 | 55.37 | 79.13 | 87.36 | | MU-Kindai/Japanese-DiffCSE-BERT-base | 73.77 | 78.10 | 51.56 | 78.79 | 86.63 | | cl-nagoya/sup-simcse-ja-large | 73.73 | 73.21 | 54.76 | 79.23 | 87.72 | @@ -113,6 +120,7 @@ The summary shows the average scores within each task. | intfloat/multilingual-e5-large | 72.89 | 70.66 | 56.54 | 75.78 | 88.59 | | MU-Kindai/Japanese-SimCSE-BERT-base-sup | 72.76 | 76.20 | 52.06 | 77.89 | 84.90 | | sentence-transformers/LaBSE | 72.66 | 73.61 | 51.70 | 76.99 | 88.35 | +| pkshatech/RoSEtta-base-ja | 72.39 | 70.07 | 53.00 | 79.52 | 86.97 | | sentence-transformers/stsb-xlm-r-multilingual | 71.84 | 75.65 | 51.32 | 74.28 | 86.10 | | pkshatech/simcse-ja-bert-base-clcmlp | 71.30 | 67.49 | 50.85 | 79.67 | 87.20 | | OpenAI/text-embedding-ada-002 | 69.75 | 64.42 | 53.13 | 74.57 | 86.89 | @@ -131,7 +139,9 @@ The summary shows the average scores within each task. | intfloat/multilingual-e5-large | 92.96 | 92.96 | | OpenAI/text-embedding-3-small | 92.92 | 92.92 | | cl-nagoya/ruri-base | 92.91 | 92.91 | +| pkshatech/GLuCoSE-base-ja-v2 | 92.90 | 92.90 | | intfloat/multilingual-e5-base | 92.85 | 92.85 | +| pkshatech/RoSEtta-base-ja | 92.68 | 92.68 | | pkshatech/GLuCoSE-base-ja | 91.90 | 91.90 | | cl-nagoya/sup-simcse-ja-base | 91.83 | 91.83 | | sentence-transformers/LaBSE | 91.63 | 91.63 | @@ -159,6 +169,7 @@ The summary shows the average scores within each task. | intfloat/multilingual-e5-large | 51.24 | **57.13** | 45.34 | | cl-nagoya/ruri-small | 51.19 | 50.96 | 51.41 | | OpenAI/text-embedding-3-small | 51.06 | 54.57 | 47.55 | +| pkshatech/GLuCoSE-base-ja-v2 | 50.68 | 51.66 | 49.70 | | cl-nagoya/sup-simcse-ja-large | 50.56 | 50.75 | 50.38 | | oshizo/sbert-jsnli-luke-japanese-base-lite | 50.33 | 46.77 | 53.89 | | pkshatech/GLuCoSE-base-ja | 49.78 | 49.89 | 49.68 | @@ -167,6 +178,7 @@ The summary shows the average scores within each task. 
| intfloat/multilingual-e5-base | 48.26 | 55.03 | 41.49 | | MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 48.25 | 53.20 | 43.31 | | pkshatech/simcse-ja-bert-base-clcmlp | 47.53 | 44.77 | 50.30 | +| pkshatech/RoSEtta-base-ja | 46.97 | 55.03 | 38.91 | | intfloat/multilingual-e5-small | 46.91 | 54.70 | 39.12 | | MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 46.68 | 53.02 | 40.35 | | MU-Kindai/Japanese-SimCSE-BERT-large-sup | 45.81 | 48.45 | 43.17 | @@ -188,6 +200,7 @@ The summary shows the average scores within each task. | cl-nagoya/unsup-simcse-ja-base | 62.44 | 62.44 | | pkshatech/simcse-ja-bert-base-clcmlp | 62.40 | 62.40 | | OpenAI/text-embedding-ada-002 | 62.40 | 62.40 | +| pkshatech/GLuCoSE-base-ja-v2 | 62.40 | 62.40 | | MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 62.38 | 62.38 | | cl-nagoya/ruri-base | 62.38 | 62.38 | | oshizo/sbert-jsnli-luke-japanese-base-lite | 62.38 | 62.38 | @@ -204,6 +217,7 @@ The summary shows the average scores within each task. | intfloat/multilingual-e5-base | 62.26 | 62.26 | | sentence-transformers/stsb-xlm-r-multilingual | 62.20 | 62.20 | | intfloat/multilingual-e5-small | 62.19 | 62.19 | +| pkshatech/RoSEtta-base-ja | 62.19 | 62.19 | | intfloat/multilingual-e5-large | 62.15 | 62.15 | | cl-nagoya/ruri-small | 62.11 | 62.11 | From 9df586a2677472c1286727a9ce15cc5d2c2b4673 Mon Sep 17 00:00:00 2001 From: "shengzhe.li" Date: Wed, 11 Sep 2024 19:55:46 +0900 Subject: [PATCH 3/7] Fix leaderboard summary average score to micro-average --- leaderboard.md | 56 ++++++++++++++++++++++----------------------- make_leaderboard.py | 14 ++++++++++-- 2 files changed, 40 insertions(+), 30 deletions(-) diff --git a/leaderboard.md b/leaderboard.md index b41c49c..45be95b 100644 --- a/leaderboard.md +++ b/leaderboard.md @@ -3,37 +3,37 @@ This leaderboard shows the results stored under `docs/results`. The scores are a ## Summary -The summary shows the average scores within each task. +The summary shows the average scores within each task. The average score is the average of scores by dataset. | Model | Avg. 
| Retrieval | STS | Classification | Reranking | Clustering | PairClassification | |:----------------------------------------------|:----------|:------------|:----------|:-----------------|:------------|:-------------|:---------------------| -| OpenAI/text-embedding-3-large | **73.97** | **74.48** | 82.52 | **77.58** | **93.58** | 53.32 | 62.35 | -| cl-nagoya/ruri-large | 73.45 | 73.02 | 83.13 | 77.43 | 92.99 | 51.82 | 62.29 | -| cl-nagoya/ruri-base | 72.95 | 69.82 | 82.87 | 75.58 | 92.91 | **54.16** | 62.38 | -| pkshatech/GLuCoSE-base-ja-v2 | 72.63 | 71.88 | **83.25** | 74.70 | 92.90 | 50.68 | 62.40 | -| cl-nagoya/ruri-small | 72.45 | 69.41 | 82.79 | 76.22 | 93.00 | 51.19 | 62.11 | -| intfloat/multilingual-e5-large | 71.65 | 70.98 | 79.70 | 72.89 | 92.96 | 51.24 | 62.15 | -| pkshatech/RoSEtta-base-ja | 71.23 | 71.87 | 81.32 | 72.39 | 92.68 | 46.97 | 62.19 | -| OpenAI/text-embedding-3-small | 70.86 | 66.39 | 79.46 | 73.06 | 92.92 | 51.06 | 62.27 | -| pkshatech/GLuCoSE-base-ja | 70.44 | 59.02 | 78.71 | 76.82 | 91.90 | 49.78 | **66.39** | -| intfloat/multilingual-e5-base | 70.12 | 68.21 | 79.84 | 69.30 | 92.85 | 48.26 | 62.26 | -| intfloat/multilingual-e5-small | 69.52 | 67.27 | 80.07 | 67.62 | 93.03 | 46.91 | 62.19 | -| OpenAI/text-embedding-ada-002 | 69.48 | 64.38 | 79.02 | 69.75 | 93.04 | 48.30 | 62.40 | -| cl-nagoya/sup-simcse-ja-base | 68.56 | 49.64 | 82.05 | 73.47 | 91.83 | 51.79 | 62.57 | -| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 66.89 | 47.38 | 78.99 | 73.13 | 91.30 | 48.25 | 62.27 | -| oshizo/sbert-jsnli-luke-japanese-base-lite | 66.75 | 43.00 | 76.60 | 76.61 | 91.56 | 50.33 | 62.38 | -| cl-nagoya/sup-simcse-ja-large | 66.51 | 37.62 | 83.18 | 73.73 | 91.48 | 50.56 | 62.51 | -| cl-nagoya/unsup-simcse-ja-large | 66.27 | 40.53 | 80.56 | 74.66 | 90.95 | 48.41 | 62.49 | -| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 66.23 | 46.36 | 77.49 | 73.30 | 91.16 | 46.68 | 62.38 | -| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 65.28 | 40.82 | 78.28 | 73.47 | 90.95 | 45.81 | 62.35 | -| MU-Kindai/Japanese-MixCSE-BERT-base | 65.14 | 42.59 | 77.05 | 72.90 | 91.01 | 44.95 | 62.33 | -| cl-nagoya/unsup-simcse-ja-base | 65.07 | 40.23 | 78.72 | 73.07 | 91.16 | 44.77 | 62.44 | -| MU-Kindai/Japanese-DiffCSE-BERT-base | 64.77 | 41.79 | 75.50 | 73.77 | 90.95 | 44.22 | 62.38 | -| sentence-transformers/LaBSE | 64.70 | 40.12 | 76.56 | 72.66 | 91.63 | 44.88 | 62.33 | -| pkshatech/simcse-ja-bert-base-clcmlp | 64.42 | 37.00 | 76.80 | 71.30 | 91.49 | 47.53 | 62.40 | -| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 64.15 | 41.32 | 74.66 | 72.76 | 90.66 | 43.11 | 62.37 | -| colorfulscoop/sbert-base-ja | 58.85 | 16.52 | 70.42 | 69.07 | 89.97 | 44.81 | 62.31 | -| sentence-transformers/stsb-xlm-r-multilingual | 58.01 | 21.00 | 75.40 | 71.84 | 90.20 | 27.46 | 62.20 | +| OpenAI/text-embedding-3-large | **74.05** | **74.48** | 82.52 | **77.58** | **93.58** | 53.32 | 62.35 | +| cl-nagoya/ruri-large | 73.31 | 73.02 | 83.13 | 77.43 | 92.99 | 51.82 | 62.29 | +| pkshatech/GLuCoSE-base-ja-v2 | 72.07 | 71.88 | **83.25** | 74.70 | 92.90 | 50.68 | 62.40 | +| cl-nagoya/ruri-base | 71.91 | 69.82 | 82.87 | 75.58 | 92.91 | **54.16** | 62.38 | +| cl-nagoya/ruri-small | 71.53 | 69.41 | 82.79 | 76.22 | 93.00 | 51.19 | 62.11 | +| intfloat/multilingual-e5-large | 70.90 | 70.98 | 79.70 | 72.89 | 92.96 | 51.24 | 62.15 | +| pkshatech/RoSEtta-base-ja | 70.76 | 71.87 | 81.32 | 72.39 | 92.68 | 46.97 | 62.19 | +| OpenAI/text-embedding-3-small | 69.18 | 66.39 | 79.46 | 73.06 | 92.92 | 51.06 | 62.27 | +| intfloat/multilingual-e5-base | 68.61 | 68.21 
| 79.84 | 69.30 | 92.85 | 48.26 | 62.26 | +| intfloat/multilingual-e5-small | 67.71 | 67.27 | 80.07 | 67.62 | 93.03 | 46.91 | 62.19 | +| pkshatech/GLuCoSE-base-ja | 67.29 | 59.02 | 78.71 | 76.82 | 91.90 | 49.78 | **66.39** | +| OpenAI/text-embedding-ada-002 | 67.21 | 64.38 | 79.02 | 69.75 | 93.04 | 48.30 | 62.40 | +| cl-nagoya/sup-simcse-ja-base | 63.36 | 49.64 | 82.05 | 73.47 | 91.83 | 51.79 | 62.57 | +| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 61.55 | 47.38 | 78.99 | 73.13 | 91.30 | 48.25 | 62.27 | +| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 60.83 | 46.36 | 77.49 | 73.30 | 91.16 | 46.68 | 62.38 | +| oshizo/sbert-jsnli-luke-japanese-base-lite | 60.77 | 43.00 | 76.60 | 76.61 | 91.56 | 50.33 | 62.38 | +| cl-nagoya/unsup-simcse-ja-large | 59.58 | 40.53 | 80.56 | 74.66 | 90.95 | 48.41 | 62.49 | +| MU-Kindai/Japanese-MixCSE-BERT-base | 59.03 | 42.59 | 77.05 | 72.90 | 91.01 | 44.95 | 62.33 | +| cl-nagoya/sup-simcse-ja-large | 58.88 | 37.62 | 83.18 | 73.73 | 91.48 | 50.56 | 62.51 | +| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 58.77 | 40.82 | 78.28 | 73.47 | 90.95 | 45.81 | 62.35 | +| MU-Kindai/Japanese-DiffCSE-BERT-base | 58.66 | 41.79 | 75.50 | 73.77 | 90.95 | 44.22 | 62.38 | +| cl-nagoya/unsup-simcse-ja-base | 58.39 | 40.23 | 78.72 | 73.07 | 91.16 | 44.77 | 62.44 | +| sentence-transformers/LaBSE | 58.01 | 40.12 | 76.56 | 72.66 | 91.63 | 44.88 | 62.33 | +| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 57.97 | 41.32 | 74.66 | 72.76 | 90.66 | 43.11 | 62.37 | +| pkshatech/simcse-ja-bert-base-clcmlp | 56.86 | 37.00 | 76.80 | 71.30 | 91.49 | 47.53 | 62.40 | +| sentence-transformers/stsb-xlm-r-multilingual | 48.21 | 21.00 | 75.40 | 71.84 | 90.20 | 27.46 | 62.20 | +| colorfulscoop/sbert-base-ja | 47.38 | 16.52 | 70.42 | 69.07 | 89.97 | 44.81 | 62.31 | ## Retrieval | Model | Avg. | jagovfaqs_22k
(ndcg@10) | jaqket<br>(ndcg@10) | mrtydi<br>(ndcg@10) | nlp_journal_abs_intro<br>(ndcg@10) | nlp_journal_title_abs<br>(ndcg@10) | nlp_journal_title_intro<br>
(ndcg@10) | diff --git a/make_leaderboard.py b/make_leaderboard.py index ff3a330..0e43ccf 100644 --- a/make_leaderboard.py +++ b/make_leaderboard.py @@ -62,7 +62,14 @@ def format_score(score: float) -> str: table_list: list[list[str | float]] = [] for model_signature, dataset_scores in task_results.items(): model_scores = [dataset_scores[k] for k in dataset_keys] - average_score = sum(model_scores) / len(model_scores) + if task_name == SUMMARY_KEY: + scores_by_dataset = [] + for _task_name, _task_results in all_results.items(): + if _task_name != SUMMARY_KEY: + scores_by_dataset.extend(list(_task_results[model_signature].values())) + average_score = sum(scores_by_dataset) / len(scores_by_dataset) + else: + average_score = sum(model_scores) / len(model_scores) table_list.append([model_signature, average_score, *model_scores]) # sort by the average score @@ -97,7 +104,10 @@ def format_score(score: float) -> str: f.write(f"## {task_name}\n") if task_name == SUMMARY_KEY: - f.write("\nThe summary shows the average scores within each task.\n\n") + f.write( + "\nThe summary shows the average scores within each task. " + "The average score is the average of scores by dataset.\n\n" + ) f.write(markdown_table) f.write("\n\n") From 5242d370be95bfd753c60a821538e79d90bfc84e Mon Sep 17 00:00:00 2001 From: lsz05 Date: Thu, 12 Sep 2024 11:13:56 +0900 Subject: [PATCH 4/7] Update `pkshatech/RoSEtta-base-ja` scores https://github.com/sbintuitions/JMTEB/issues/71#issuecomment-2343044173 --- .../pkshatech/RoSEtta-base-ja/summary.json | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/results/pkshatech/RoSEtta-base-ja/summary.json b/docs/results/pkshatech/RoSEtta-base-ja/summary.json index 5025c4d..d82af4b 100644 --- a/docs/results/pkshatech/RoSEtta-base-ja/summary.json +++ b/docs/results/pkshatech/RoSEtta-base-ja/summary.json @@ -1,62 +1,62 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.7006688790331752 + "macro_f1": 0.7005147244958231 }, "amazon_review_classification": { - "macro_f1": 0.5299983831023539 + "macro_f1": 0.5263680453119501 }, "massive_intent_classification": { - "macro_f1": 0.7952268533717546 + "macro_f1": 0.7983787583297884 }, "massive_scenario_classification": { - "macro_f1": 0.869707847800633 + "macro_f1": 0.8709593192703351 } }, "Reranking": { "esci": { - "ndcg@10": 0.9267539503767978 + "ndcg@10": 0.9268625513429571 } }, "Retrieval": { "jagovfaqs_22k": { - "ndcg@10": 0.6379929234552755 + "ndcg@10": 0.6595934642903105 }, "jaqket": { - "ndcg@10": 0.6533570255483011 + "ndcg@10": 0.6533452086105761 }, "mrtydi": { - "ndcg@10": 0.3407337609040446 + "ndcg@10": 0.36731170141136216 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.9577227924391506 + "ndcg@10": 0.9553567926226499 }, "nlp_journal_title_abs": { - "ndcg@10": 0.9282272189004226 + "ndcg@10": 0.940828991756893 }, "nlp_journal_title_intro": { - "ndcg@10": 0.7938878816204916 + "ndcg@10": 0.8163161967769845 } }, "STS": { "jsick": { - "spearman": 0.8302539464008364 + "spearman": 0.8383455453168481 }, "jsts": { - "spearman": 0.7961383132420531 + "spearman": 0.7895388048564987 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.5503116157834466 + "v_measure_score": 0.5861760622672214 }, "mewsc16": { - "v_measure_score": 0.389105324755125 + "v_measure_score": 0.4784844036038961 } }, "PairClassification": { "paws_x_ja": { - "binary_f1": 0.6218727662616155 + "binary_f1": 0.6173974540311173 } } -} \ No newline at end of file +} From 
76fd77a346ac9af565463464750619961180b901 Mon Sep 17 00:00:00 2001 From: lsz05 Date: Thu, 12 Sep 2024 11:26:33 +0900 Subject: [PATCH 5/7] Update `pkshatech/GLuCoSE-base-ja-v2` scores https://github.com/sbintuitions/JMTEB/issues/72#issuecomment-2343043103 --- .../pkshatech/GLuCoSE-base-ja-v2/summary.json | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json b/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json index 60223bc..7318aab 100644 --- a/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json +++ b/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json @@ -1,62 +1,62 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.7528271196943096 + "macro_f1": 0.7492232749031491 }, "amazon_review_classification": { - "macro_f1": 0.5561679575066396 + "macro_f1": 0.5530707609927811 }, "massive_intent_classification": { - "macro_f1": 0.8058990735631814 + "macro_f1": 0.7979144461303402 }, "massive_scenario_classification": { - "macro_f1": 0.8729457394926279 + "macro_f1": 0.8683641924034757 } }, "Reranking": { "esci": { - "ndcg@10": 0.9289703513027785 + "ndcg@10": 0.9301469431250418 } }, "Retrieval": { "jagovfaqs_22k": { - "ndcg@10": 0.6842208748694516 + "ndcg@10": 0.6979374757372254 }, "jaqket": { - "ndcg@10": 0.666162910609933 + "ndcg@10": 0.6729417850207029 }, "mrtydi": { - "ndcg@10": 0.3679312414893066 + "ndcg@10": 0.41858579533990486 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.8961561684616985 + "ndcg@10": 0.9029337913460675 }, "nlp_journal_title_abs": { - "ndcg@10": 0.9465973412523236 + "ndcg@10": 0.9511153967130517 }, "nlp_journal_title_intro": { - "ndcg@10": 0.7514787290834406 + "ndcg@10": 0.7580448576047344 } }, "STS": { "jsick": { - "spearman": 0.8499279029619572 + "spearman": 0.849637366944316 }, "jsts": { - "spearman": 0.8150603412605322 + "spearman": 0.8095684318108997 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.5165568486237136 + "v_measure_score": 0.5151536908540161 }, "mewsc16": { - "v_measure_score": 0.4970285237567235 + "v_measure_score": 0.45782610528001805 } }, "PairClassification": { "paws_x_ja": { - "binary_f1": 0.6239830208701804 + "binary_f1": 0.623716814159292 } } -} \ No newline at end of file +} From 4dbe7a3da8ddefcaacdb3fbb4fbbc3e46c4ab5da Mon Sep 17 00:00:00 2001 From: "shengzhe.li" Date: Thu, 12 Sep 2024 11:28:56 +0900 Subject: [PATCH 6/7] Update leaderboard --- leaderboard.md | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/leaderboard.md b/leaderboard.md index 45be95b..4b05e46 100644 --- a/leaderboard.md +++ b/leaderboard.md @@ -9,11 +9,11 @@ The summary shows the average scores within each task. 
The average score is the |:----------------------------------------------|:----------|:------------|:----------|:-----------------|:------------|:-------------|:---------------------| | OpenAI/text-embedding-3-large | **74.05** | **74.48** | 82.52 | **77.58** | **93.58** | 53.32 | 62.35 | | cl-nagoya/ruri-large | 73.31 | 73.02 | 83.13 | 77.43 | 92.99 | 51.82 | 62.29 | -| pkshatech/GLuCoSE-base-ja-v2 | 72.07 | 71.88 | **83.25** | 74.70 | 92.90 | 50.68 | 62.40 | +| pkshatech/GLuCoSE-base-ja-v2 | 72.23 | 73.36 | 82.96 | 74.21 | 93.01 | 48.65 | 62.37 | +| pkshatech/RoSEtta-base-ja | 72.04 | 73.21 | 81.39 | 72.41 | 92.69 | 53.23 | 61.74 | | cl-nagoya/ruri-base | 71.91 | 69.82 | 82.87 | 75.58 | 92.91 | **54.16** | 62.38 | | cl-nagoya/ruri-small | 71.53 | 69.41 | 82.79 | 76.22 | 93.00 | 51.19 | 62.11 | | intfloat/multilingual-e5-large | 70.90 | 70.98 | 79.70 | 72.89 | 92.96 | 51.24 | 62.15 | -| pkshatech/RoSEtta-base-ja | 70.76 | 71.87 | 81.32 | 72.39 | 92.68 | 46.97 | 62.19 | | OpenAI/text-embedding-3-small | 69.18 | 66.39 | 79.46 | 73.06 | 92.92 | 51.06 | 62.27 | | intfloat/multilingual-e5-base | 68.61 | 68.21 | 79.84 | 69.30 | 92.85 | 48.26 | 62.26 | | intfloat/multilingual-e5-small | 67.71 | 67.27 | 80.07 | 67.62 | 93.03 | 46.91 | 62.19 | @@ -25,7 +25,7 @@ The summary shows the average scores within each task. The average score is the | oshizo/sbert-jsnli-luke-japanese-base-lite | 60.77 | 43.00 | 76.60 | 76.61 | 91.56 | 50.33 | 62.38 | | cl-nagoya/unsup-simcse-ja-large | 59.58 | 40.53 | 80.56 | 74.66 | 90.95 | 48.41 | 62.49 | | MU-Kindai/Japanese-MixCSE-BERT-base | 59.03 | 42.59 | 77.05 | 72.90 | 91.01 | 44.95 | 62.33 | -| cl-nagoya/sup-simcse-ja-large | 58.88 | 37.62 | 83.18 | 73.73 | 91.48 | 50.56 | 62.51 | +| cl-nagoya/sup-simcse-ja-large | 58.88 | 37.62 | **83.18** | 73.73 | 91.48 | 50.56 | 62.51 | | MU-Kindai/Japanese-SimCSE-BERT-large-sup | 58.77 | 40.82 | 78.28 | 73.47 | 90.95 | 45.81 | 62.35 | | MU-Kindai/Japanese-DiffCSE-BERT-base | 58.66 | 41.79 | 75.50 | 73.77 | 90.95 | 44.22 | 62.38 | | cl-nagoya/unsup-simcse-ja-base | 58.39 | 40.23 | 78.72 | 73.07 | 91.16 | 44.77 | 62.44 | @@ -39,9 +39,9 @@ The summary shows the average scores within each task. The average score is the | Model | Avg. | jagovfaqs_22k
(ndcg@10) | jaqket<br>(ndcg@10) | mrtydi<br>(ndcg@10) | nlp_journal_abs_intro<br>(ndcg@10) | nlp_journal_title_abs<br>(ndcg@10) | nlp_journal_title_intro<br>
(ndcg@10) | |:----------------------------------------------|:----------|:-----------------------------|:----------------------|:----------------------|:-------------------------------------|:-------------------------------------|:---------------------------------------| | OpenAI/text-embedding-3-large | **74.48** | 72.41 | 48.21 | 34.88 | **99.33** | 96.55 | **95.47** | +| pkshatech/GLuCoSE-base-ja-v2 | 73.36 | 69.79 | **67.29** | 41.86 | 90.29 | 95.11 | 75.80 | +| pkshatech/RoSEtta-base-ja | 73.21 | 65.96 | 65.33 | 36.73 | 95.54 | 94.08 | 81.63 | | cl-nagoya/ruri-large | 73.02 | **76.68** | 61.74 | 38.03 | 87.12 | 96.58 | 77.97 | -| pkshatech/GLuCoSE-base-ja-v2 | 71.88 | 68.42 | **66.62** | 36.79 | 89.62 | 94.66 | 75.15 | -| pkshatech/RoSEtta-base-ja | 71.87 | 63.80 | 65.34 | 34.07 | 95.77 | 92.82 | 79.39 | | intfloat/multilingual-e5-large | 70.98 | 70.30 | 58.78 | **43.63** | 86.00 | 94.70 | 72.48 | | cl-nagoya/ruri-base | 69.82 | 74.56 | 50.12 | 35.45 | 86.89 | 96.57 | 75.31 | | cl-nagoya/ruri-small | 69.41 | 73.65 | 48.44 | 33.43 | 87.69 | **97.17** | 76.09 | @@ -69,14 +69,14 @@ The summary shows the average scores within each task. The average score is the ## STS | Model | Avg. | jsick
(spearman) | jsts
(spearman) | |:----------------------------------------------|:----------|:----------------------|:---------------------| -| pkshatech/GLuCoSE-base-ja-v2 | **83.25** | **84.99** | 81.51 | -| cl-nagoya/sup-simcse-ja-large | 83.18 | 83.80 | 82.57 | +| cl-nagoya/sup-simcse-ja-large | **83.18** | 83.80 | 82.57 | | cl-nagoya/ruri-large | 83.13 | 82.00 | **84.26** | +| pkshatech/GLuCoSE-base-ja-v2 | 82.96 | **84.96** | 80.96 | | cl-nagoya/ruri-base | 82.87 | 82.32 | 83.43 | | cl-nagoya/ruri-small | 82.79 | 83.44 | 82.13 | | OpenAI/text-embedding-3-large | 82.52 | 81.27 | 83.77 | | cl-nagoya/sup-simcse-ja-base | 82.05 | 82.83 | 81.27 | -| pkshatech/RoSEtta-base-ja | 81.32 | 83.03 | 79.61 | +| pkshatech/RoSEtta-base-ja | 81.39 | 83.83 | 78.95 | | cl-nagoya/unsup-simcse-ja-large | 80.56 | 80.15 | 80.98 | | intfloat/multilingual-e5-small | 80.07 | 81.50 | 78.65 | | intfloat/multilingual-e5-base | 79.84 | 81.28 | 78.39 | @@ -106,8 +106,8 @@ The summary shows the average scores within each task. The average score is the | oshizo/sbert-jsnli-luke-japanese-base-lite | 76.61 | 79.95 | 57.48 | 80.26 | 88.75 | | cl-nagoya/ruri-small | 76.22 | 79.92 | 55.61 | 81.49 | 87.88 | | cl-nagoya/ruri-base | 75.58 | 76.66 | 55.76 | 81.41 | 88.49 | -| pkshatech/GLuCoSE-base-ja-v2 | 74.70 | 75.28 | 55.62 | 80.59 | 87.29 | | cl-nagoya/unsup-simcse-ja-large | 74.66 | 76.79 | 55.37 | 79.13 | 87.36 | +| pkshatech/GLuCoSE-base-ja-v2 | 74.21 | 74.92 | 55.31 | 79.79 | 86.84 | | MU-Kindai/Japanese-DiffCSE-BERT-base | 73.77 | 78.10 | 51.56 | 78.79 | 86.63 | | cl-nagoya/sup-simcse-ja-large | 73.73 | 73.21 | 54.76 | 79.23 | 87.72 | | MU-Kindai/Japanese-SimCSE-BERT-large-sup | 73.47 | 77.25 | 53.42 | 76.83 | 86.39 | @@ -120,7 +120,7 @@ The summary shows the average scores within each task. The average score is the | intfloat/multilingual-e5-large | 72.89 | 70.66 | 56.54 | 75.78 | 88.59 | | MU-Kindai/Japanese-SimCSE-BERT-base-sup | 72.76 | 76.20 | 52.06 | 77.89 | 84.90 | | sentence-transformers/LaBSE | 72.66 | 73.61 | 51.70 | 76.99 | 88.35 | -| pkshatech/RoSEtta-base-ja | 72.39 | 70.07 | 53.00 | 79.52 | 86.97 | +| pkshatech/RoSEtta-base-ja | 72.41 | 70.05 | 52.64 | 79.84 | 87.10 | | sentence-transformers/stsb-xlm-r-multilingual | 71.84 | 75.65 | 51.32 | 74.28 | 86.10 | | pkshatech/simcse-ja-bert-base-clcmlp | 71.30 | 67.49 | 50.85 | 79.67 | 87.20 | | OpenAI/text-embedding-ada-002 | 69.75 | 64.42 | 53.13 | 74.57 | 86.89 | @@ -134,14 +134,14 @@ The summary shows the average scores within each task. The average score is the | OpenAI/text-embedding-3-large | **93.58** | **93.58** | | OpenAI/text-embedding-ada-002 | 93.04 | 93.04 | | intfloat/multilingual-e5-small | 93.03 | 93.03 | +| pkshatech/GLuCoSE-base-ja-v2 | 93.01 | 93.01 | | cl-nagoya/ruri-small | 93.00 | 93.00 | | cl-nagoya/ruri-large | 92.99 | 92.99 | | intfloat/multilingual-e5-large | 92.96 | 92.96 | | OpenAI/text-embedding-3-small | 92.92 | 92.92 | | cl-nagoya/ruri-base | 92.91 | 92.91 | -| pkshatech/GLuCoSE-base-ja-v2 | 92.90 | 92.90 | | intfloat/multilingual-e5-base | 92.85 | 92.85 | -| pkshatech/RoSEtta-base-ja | 92.68 | 92.68 | +| pkshatech/RoSEtta-base-ja | 92.69 | 92.69 | | pkshatech/GLuCoSE-base-ja | 91.90 | 91.90 | | cl-nagoya/sup-simcse-ja-base | 91.83 | 91.83 | | sentence-transformers/LaBSE | 91.63 | 91.63 | @@ -164,21 +164,21 @@ The summary shows the average scores within each task. 
The average score is the |:----------------------------------------------|:----------|:-------------------------------------|:-------------------------------| | cl-nagoya/ruri-base | **54.16** | 54.27 | **54.04** | | OpenAI/text-embedding-3-large | 53.32 | 57.09 | 49.55 | +| pkshatech/RoSEtta-base-ja | 53.23 | **58.62** | 47.85 | | cl-nagoya/ruri-large | 51.82 | 51.39 | 52.25 | | cl-nagoya/sup-simcse-ja-base | 51.79 | 52.67 | 50.91 | -| intfloat/multilingual-e5-large | 51.24 | **57.13** | 45.34 | +| intfloat/multilingual-e5-large | 51.24 | 57.13 | 45.34 | | cl-nagoya/ruri-small | 51.19 | 50.96 | 51.41 | | OpenAI/text-embedding-3-small | 51.06 | 54.57 | 47.55 | -| pkshatech/GLuCoSE-base-ja-v2 | 50.68 | 51.66 | 49.70 | | cl-nagoya/sup-simcse-ja-large | 50.56 | 50.75 | 50.38 | | oshizo/sbert-jsnli-luke-japanese-base-lite | 50.33 | 46.77 | 53.89 | | pkshatech/GLuCoSE-base-ja | 49.78 | 49.89 | 49.68 | +| pkshatech/GLuCoSE-base-ja-v2 | 48.65 | 51.52 | 45.78 | | cl-nagoya/unsup-simcse-ja-large | 48.41 | 50.90 | 45.92 | | OpenAI/text-embedding-ada-002 | 48.30 | 49.67 | 46.92 | | intfloat/multilingual-e5-base | 48.26 | 55.03 | 41.49 | | MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 48.25 | 53.20 | 43.31 | | pkshatech/simcse-ja-bert-base-clcmlp | 47.53 | 44.77 | 50.30 | -| pkshatech/RoSEtta-base-ja | 46.97 | 55.03 | 38.91 | | intfloat/multilingual-e5-small | 46.91 | 54.70 | 39.12 | | MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 46.68 | 53.02 | 40.35 | | MU-Kindai/Japanese-SimCSE-BERT-large-sup | 45.81 | 48.45 | 43.17 | @@ -200,11 +200,11 @@ The summary shows the average scores within each task. The average score is the | cl-nagoya/unsup-simcse-ja-base | 62.44 | 62.44 | | pkshatech/simcse-ja-bert-base-clcmlp | 62.40 | 62.40 | | OpenAI/text-embedding-ada-002 | 62.40 | 62.40 | -| pkshatech/GLuCoSE-base-ja-v2 | 62.40 | 62.40 | | MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 62.38 | 62.38 | | cl-nagoya/ruri-base | 62.38 | 62.38 | | oshizo/sbert-jsnli-luke-japanese-base-lite | 62.38 | 62.38 | | MU-Kindai/Japanese-DiffCSE-BERT-base | 62.38 | 62.38 | +| pkshatech/GLuCoSE-base-ja-v2 | 62.37 | 62.37 | | MU-Kindai/Japanese-SimCSE-BERT-base-sup | 62.37 | 62.37 | | MU-Kindai/Japanese-SimCSE-BERT-large-sup | 62.35 | 62.35 | | OpenAI/text-embedding-3-large | 62.35 | 62.35 | @@ -217,7 +217,7 @@ The summary shows the average scores within each task. The average score is the | intfloat/multilingual-e5-base | 62.26 | 62.26 | | sentence-transformers/stsb-xlm-r-multilingual | 62.20 | 62.20 | | intfloat/multilingual-e5-small | 62.19 | 62.19 | -| pkshatech/RoSEtta-base-ja | 62.19 | 62.19 | | intfloat/multilingual-e5-large | 62.15 | 62.15 | | cl-nagoya/ruri-small | 62.11 | 62.11 | +| pkshatech/RoSEtta-base-ja | 61.74 | 61.74 | From d3f81ec96476798545418a0366676d07145b179b Mon Sep 17 00:00:00 2001 From: "shengzhe.li" Date: Tue, 17 Sep 2024 14:48:12 +0900 Subject: [PATCH 7/7] version 1.3.2 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 57e4f74..743b6bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ description = "The evaluation scripts for JMTEB (Japanese Massive Text Embedding name = "JMTEB" packages = [{from = "src", include = "jmteb"}] readme = "README.md" -version = "1.3.1" +version = "1.3.2" [tool.poetry.dependencies] python = ">=3.10,<4.0"
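
Note on the averaging change in PATCH 3: the summary `Avg.` column moves from a macro-average (the mean of the six per-task means) to a micro-average (the mean over all 16 dataset scores), which is why the top-line numbers shift (e.g. OpenAI/text-embedding-3-large goes from 73.97 to 74.05) even though no per-task score changes. Below is a minimal sketch of the two computations, recomputed from the rounded table values for OpenAI/text-embedding-3-large; the actual script works on the raw fractions stored in `summary.json` and formats them with `format_score`, and the helper names here are illustrative.

```python
# Task -> per-dataset scores for OpenAI/text-embedding-3-large,
# copied from the per-task leaderboard tables above.
task_scores: dict[str, list[float]] = {
    "Retrieval": [72.41, 48.21, 34.88, 99.33, 96.55, 95.47],
    "STS": [81.27, 83.77],
    "Classification": [77.90, 60.44, 80.91, 91.08],
    "Reranking": [93.58],
    "Clustering": [57.09, 49.55],
    "PairClassification": [62.35],
}

def mean(xs: list[float]) -> float:
    return sum(xs) / len(xs)

# Before PATCH 3: macro-average, i.e. the mean of the six task averages.
macro = mean([mean(scores) for scores in task_scores.values()])

# After PATCH 3: micro-average, i.e. the mean over all 16 dataset scores,
# matching the new SUMMARY_KEY branch added to make_leaderboard.py.
micro = mean([s for scores in task_scores.values() for s in scores])

print(f"macro: {macro:.2f}")  # 73.97 (summary Avg. before PATCH 3)
print(f"micro: {micro:.2f}")  # 74.05 (summary Avg. after PATCH 3)
```

The micro-average weights every dataset equally, so tasks with many datasets (Retrieval, Classification) pull harder on the summary score than single-dataset tasks such as Reranking or PairClassification.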