diff --git a/docs/results/MU-Kindai/Japanese-DiffCSE-BERT-base/summary.json b/docs/results/MU-Kindai/Japanese-DiffCSE-BERT-base/summary.json new file mode 100644 index 0000000..1b99a44 --- /dev/null +++ b/docs/results/MU-Kindai/Japanese-DiffCSE-BERT-base/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7809527709426081 + }, + "amazon_review_classification": { + "macro_f1": 0.5155899232320224 + }, + "massive_intent_classification": { + "macro_f1": 0.7879373479249787 + }, + "massive_scenario_classification": { + "macro_f1": 0.8662625888023707 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9095168116460639 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.42314124780036416 + }, + "jaqket": { + "ndcg@10": 0.36199154051747723 + }, + "mrtydi": { + "ndcg@10": 0.07810683176415421 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.6077212544951452 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.6433890489201118 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.39317174536190913 + } + }, + "STS": { + "jsick": { + "spearman": 0.754165277432144 + }, + "jsts": { + "spearman": 0.7558202366183716 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.4966545453348478 + }, + "mewsc16": { + "v_measure_score": 0.3877356318022785 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6237623762376237 + } + } +} \ No newline at end of file diff --git a/docs/results/MU-Kindai/Japanese-MixCSE-BERT-base/summary.json b/docs/results/MU-Kindai/Japanese-MixCSE-BERT-base/summary.json new file mode 100644 index 0000000..ea227c2 --- /dev/null +++ b/docs/results/MU-Kindai/Japanese-MixCSE-BERT-base/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.776174162517931 + }, + "amazon_review_classification": { + "macro_f1": 0.5085781180553806 + }, + "massive_intent_classification": { + "macro_f1": 0.7718541530739129 + }, + 
"massive_scenario_classification": { + "macro_f1": 0.8592571786794985 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9100551950168166 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.42368135774043536 + }, + "jaqket": { + "ndcg@10": 0.37721850397542034 + }, + "mrtydi": { + "ndcg@10": 0.07878085186566607 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.636999375405723 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.6413498649875696 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.397250919496823 + } + }, + "STS": { + "jsick": { + "spearman": 0.7756925231422259 + }, + "jsts": { + "spearman": 0.7652968548841591 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5262387436934941 + }, + "mewsc16": { + "v_measure_score": 0.37277574537292835 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.623321554770318 + } + } +} \ No newline at end of file diff --git a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-sup/summary.json b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-sup/summary.json new file mode 100644 index 0000000..dbed068 --- /dev/null +++ b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-sup/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7619809437515043 + }, + "amazon_review_classification": { + "macro_f1": 0.5205592432502059 + }, + "massive_intent_classification": { + "macro_f1": 0.7789367871593064 + }, + "massive_scenario_classification": { + "macro_f1": 0.8490320705866646 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9065584234991577 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.4411487123884245 + }, + "jaqket": { + "ndcg@10": 0.39613283459361814 + }, + "mrtydi": { + "ndcg@10": 0.08154879873415645 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.6276035246534508 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.5838785018803183 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.3489329387182086 + } + }, + 
"STS": { + "jsick": { + "spearman": 0.7463567093877269 + }, + "jsts": { + "spearman": 0.7468283806971927 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.41041888940251137 + }, + "mewsc16": { + "v_measure_score": 0.45175891401665724 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6236711552090717 + } + } +} \ No newline at end of file diff --git a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-unsup/summary.json b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-unsup/summary.json new file mode 100644 index 0000000..9528312 --- /dev/null +++ b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-unsup/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7619809437515043 + }, + "amazon_review_classification": { + "macro_f1": 0.5152108946679324 + }, + "massive_intent_classification": { + "macro_f1": 0.7895128475562229 + }, + "massive_scenario_classification": { + "macro_f1": 0.865430249169577 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9115815294581953 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.47387768939865055 + }, + "jaqket": { + "ndcg@10": 0.3956683977353904 + }, + "mrtydi": { + "ndcg@10": 0.1144234568266308 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.6416096544574569 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.7023477497744102 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.4536720868647063 + } + }, + "STS": { + "jsick": { + "spearman": 0.781770693640686 + }, + "jsts": { + "spearman": 0.7680617109850311 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5301620892693397 + }, + "mewsc16": { + "v_measure_score": 0.4034776723308173 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6238078417520311 + } + } +} \ No newline at end of file diff --git a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-sup/summary.json b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-sup/summary.json new 
file mode 100644 index 0000000..b36686c --- /dev/null +++ b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-sup/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7725250131648236 + }, + "amazon_review_classification": { + "macro_f1": 0.5341627023771393 + }, + "massive_intent_classification": { + "macro_f1": 0.7682863192709365 + }, + "massive_scenario_classification": { + "macro_f1": 0.8639396658321546 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9094717381883379 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.47038430326303626 + }, + "jaqket": { + "ndcg@10": 0.44101304795602897 + }, + "mrtydi": { + "ndcg@10": 0.11429128335865787 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.43434267808785576 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.6240651697600803 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.3651687833824759 + } + }, + "STS": { + "jsick": { + "spearman": 0.787528927058734 + }, + "jsts": { + "spearman": 0.7781413957931619 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.48448646364489634 + }, + "mewsc16": { + "v_measure_score": 0.43168522818790694 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6235418875927891 + } + } +} \ No newline at end of file diff --git a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-unsup/summary.json b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-unsup/summary.json new file mode 100644 index 0000000..f620d50 --- /dev/null +++ b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-unsup/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7635642561809131 + }, + "amazon_review_classification": { + "macro_f1": 0.5275222511867922 + }, + "massive_intent_classification": { + "macro_f1": 0.7688060073049678 + }, + "massive_scenario_classification": { + "macro_f1": 0.8651446837233107 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 
0.9129851570116734 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.5014367709991477 + }, + "jaqket": { + "ndcg@10": 0.4583812630740073 + }, + "mrtydi": { + "ndcg@10": 0.13003320802922363 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.5508587506679636 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.7497069192695408 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.4524300499843447 + } + }, + "STS": { + "jsick": { + "spearman": 0.7984403024596518 + }, + "jsts": { + "spearman": 0.7813685476201204 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5319881995988209 + }, + "mewsc16": { + "v_measure_score": 0.4330807170988368 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6226614895870103 + } + } +} \ No newline at end of file diff --git a/docs/results/OpenAI/text-embedding-3-large/summary.json b/docs/results/OpenAI/text-embedding-3-large/summary.json new file mode 100644 index 0000000..46af0c5 --- /dev/null +++ b/docs/results/OpenAI/text-embedding-3-large/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7789727938896414 + }, + "amazon_review_classification": { + "macro_f1": 0.6043632319384946 + }, + "massive_intent_classification": { + "macro_f1": 0.8090871295952566 + }, + "massive_scenario_classification": { + "macro_f1": 0.9108443051510002 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9358042266852659 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.7240937077183436 + }, + "jaqket": { + "ndcg@10": 0.48208863565793814 + }, + "mrtydi": { + "ndcg@10": 0.3488438390945784 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.9932811349540317 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.9655113335080678 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.9547126796600445 + } + }, + "STS": { + "jsick": { + "spearman": 0.8126909906411093 + }, + "jsts": { + "spearman": 0.8376863979620452 + } + }, + "Clustering": { + "livedoor_news": { + 
"v_measure_score": 0.05018478985401151 + }, + "mewsc16": { + "v_measure_score": 0.4955424351458981 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6234502302515055 + } + } +} \ No newline at end of file diff --git a/docs/results/OpenAI/text-embedding-3-small/summary.json b/docs/results/OpenAI/text-embedding-3-small/summary.json new file mode 100644 index 0000000..74cee2e --- /dev/null +++ b/docs/results/OpenAI/text-embedding-3-small/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7000818608185178 + }, + "amazon_review_classification": { + "macro_f1": 0.5592259673654241 + }, + "massive_intent_classification": { + "macro_f1": 0.7766119663088307 + }, + "massive_scenario_classification": { + "macro_f1": 0.8866536867311439 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9291728102678644 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.640150048193537 + }, + "jaqket": { + "ndcg@10": 0.3394304922804131 + }, + "mrtydi": { + "ndcg@10": 0.2002984123046011 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.9846617848570168 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.9170440283351765 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.9017272741306225 + } + }, + "STS": { + "jsick": { + "spearman": 0.8083062989093882 + }, + "jsts": { + "spearman": 0.7808357024283473 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.051323988942160705 + }, + "mewsc16": { + "v_measure_score": 0.4755374215259236 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6227417640807651 + } + } +} \ No newline at end of file diff --git a/docs/results/OpenAI/text-embedding-ada-002/summary.json b/docs/results/OpenAI/text-embedding-ada-002/summary.json new file mode 100644 index 0000000..8c7a548 --- /dev/null +++ b/docs/results/OpenAI/text-embedding-ada-002/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + 
"macro_f1": 0.6441904761904762 + }, + "amazon_review_classification": { + "macro_f1": 0.5312953134953877 + }, + "massive_intent_classification": { + "macro_f1": 0.7457150118928685 + }, + "massive_scenario_classification": { + "macro_f1": 0.8689044829586676 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9303611831749345 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.6102270226904314 + }, + "jaqket": { + "ndcg@10": 0.4256467956806472 + }, + "mrtydi": { + "ndcg@10": 0.1450739420851161 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.9499224324391132 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.9123300358752942 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.8197798210453923 + } + }, + "STS": { + "jsick": { + "spearman": 0.7909435250482901 + }, + "jsts": { + "spearman": 0.7894052744557472 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.060252212362740365 + }, + "mewsc16": { + "v_measure_score": 0.4691938182964486 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6239830208701805 + } + } +} \ No newline at end of file diff --git a/docs/results/cl-nagoya/sup-simcse-ja-base/summary.json b/docs/results/cl-nagoya/sup-simcse-ja-base/summary.json new file mode 100644 index 0000000..42cc5ff --- /dev/null +++ b/docs/results/cl-nagoya/sup-simcse-ja-base/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7234436301724776 + }, + "amazon_review_classification": { + "macro_f1": 0.5441445333270086 + }, + "massive_intent_classification": { + "macro_f1": 0.7951973953020242 + }, + "massive_scenario_classification": { + "macro_f1": 0.8760200177186923 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9183455876236017 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.5161990612242935 + }, + "jaqket": { + "ndcg@10": 0.5024513438428565 + }, + "mrtydi": { + "ndcg@10": 0.13976323269046823 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 
0.6807886421530585 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.6570889175649209 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.48219159577174137 + } + }, + "STS": { + "jsick": { + "spearman": 0.8282816229512862 + }, + "jsts": { + "spearman": 0.8127259236647225 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5266774168531417 + }, + "mewsc16": { + "v_measure_score": 0.5091016872016825 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6256665481692143 + } + } +} \ No newline at end of file diff --git a/docs/results/cl-nagoya/sup-simcse-ja-large/summary.json b/docs/results/cl-nagoya/sup-simcse-ja-large/summary.json new file mode 100644 index 0000000..a2d8924 --- /dev/null +++ b/docs/results/cl-nagoya/sup-simcse-ja-large/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7321444865928852 + }, + "amazon_review_classification": { + "macro_f1": 0.5475800661400465 + }, + "massive_intent_classification": { + "macro_f1": 0.7922802742146243 + }, + "massive_scenario_classification": { + "macro_f1": 0.8772172454209797 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9148471751378899 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.4683673504170269 + }, + "jaqket": { + "ndcg@10": 0.39878189118804513 + }, + "mrtydi": { + "ndcg@10": 0.11834919561027905 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.634254459552888 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.37927566884615427 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.25787534957423713 + } + }, + "STS": { + "jsick": { + "spearman": 0.837959537101532 + }, + "jsts": { + "spearman": 0.825691902117111 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5074967876488787 + }, + "mewsc16": { + "v_measure_score": 0.503782014677764 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6250885896527285 + } + } +} \ No newline at end of file diff --git 
a/docs/results/cl-nagoya/unsup-simcse-ja-base/summary.json b/docs/results/cl-nagoya/unsup-simcse-ja-base/summary.json new file mode 100644 index 0000000..3863c9e --- /dev/null +++ b/docs/results/cl-nagoya/unsup-simcse-ja-base/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7330185800774036 + }, + "amazon_review_classification": { + "macro_f1": 0.5392887528271114 + }, + "massive_intent_classification": { + "macro_f1": 0.7907120296283751 + }, + "massive_scenario_classification": { + "macro_f1": 0.8597097942715117 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9115668272308735 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.46003459081522513 + }, + "jaqket": { + "ndcg@10": 0.3945725593125862 + }, + "mrtydi": { + "ndcg@10": 0.055507775092798486 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.6025847751308843 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.5562839869857912 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.3449181162324482 + } + }, + "STS": { + "jsick": { + "spearman": 0.7849379492955117 + }, + "jsts": { + "spearman": 0.7894946592483818 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5223347838445698 + }, + "mewsc16": { + "v_measure_score": 0.37310458219601117 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.624424778761062 + } + } +} \ No newline at end of file diff --git a/docs/results/cl-nagoya/unsup-simcse-ja-large/summary.json b/docs/results/cl-nagoya/unsup-simcse-ja-large/summary.json new file mode 100644 index 0000000..d37618a --- /dev/null +++ b/docs/results/cl-nagoya/unsup-simcse-ja-large/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.767905114979583 + }, + "amazon_review_classification": { + "macro_f1": 0.5537089641846143 + }, + "massive_intent_classification": { + "macro_f1": 0.7912698845073401 + }, + "massive_scenario_classification": { + 
"macro_f1": 0.8736185210672394 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9095494729022622 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.4509073581555124 + }, + "jaqket": { + "ndcg@10": 0.34595043675331943 + }, + "mrtydi": { + "ndcg@10": 0.05750859876901772 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.550742021417855 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.6307172007359215 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.39612451822677164 + } + }, + "STS": { + "jsick": { + "spearman": 0.8014979086154339 + }, + "jsts": { + "spearman": 0.8097685749017456 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5090447587797094 + }, + "mewsc16": { + "v_measure_score": 0.4591920015613856 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6248671625929861 + } + } +} \ No newline at end of file diff --git a/docs/results/colorfulscoop/sbert-base-ja/summary.json b/docs/results/colorfulscoop/sbert-base-ja/summary.json new file mode 100644 index 0000000..2a08044 --- /dev/null +++ b/docs/results/colorfulscoop/sbert-base-ja/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7221023294352484 + }, + "amazon_review_classification": { + "macro_f1": 0.47952384496155054 + }, + "massive_intent_classification": { + "macro_f1": 0.725195343788811 + }, + "massive_scenario_classification": { + "macro_f1": 0.836177960542408 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.8997301146575819 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.21501915127957166 + }, + "jaqket": { + "ndcg@10": 0.13161989528541293 + }, + "mrtydi": { + "ndcg@10": 0.00436010196904899 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.2878020264605714 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.22397059858982324 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.12815871897103842 + } + }, + "STS": { + "jsick": { + "spearman": 0.6659298300713198 + }, + "jsts": { + 
"spearman": 0.7423952309826243 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.4298579019834722 + }, + "mewsc16": { + "v_measure_score": 0.46641671645082333 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6231013776050865 + } + } +} \ No newline at end of file diff --git a/docs/results/intfloat/multilingual-e5-base/summary.json b/docs/results/intfloat/multilingual-e5-base/summary.json new file mode 100644 index 0000000..96f9640 --- /dev/null +++ b/docs/results/intfloat/multilingual-e5-base/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.6367079139150691 + }, + "amazon_review_classification": { + "macro_f1": 0.5424265794470897 + }, + "massive_intent_classification": { + "macro_f1": 0.7277503514873049 + }, + "massive_scenario_classification": { + "macro_f1": 0.8652828949015864 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9285060467194839 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.6534478396845428 + }, + "jaqket": { + "ndcg@10": 0.5067444792013236 + }, + "mrtydi": { + "ndcg@10": 0.3837652120001251 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.8709767034225332 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.9473129303429082 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.7304538728893641 + } + }, + "STS": { + "jsick": { + "spearman": 0.8128058660848744 + }, + "jsts": { + "spearman": 0.7839196475937381 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5502694126615243 + }, + "mewsc16": { + "v_measure_score": 0.41494514000218946 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6226482073127441 + } + } +} \ No newline at end of file diff --git a/docs/results/intfloat/multilingual-e5-large/summary.json b/docs/results/intfloat/multilingual-e5-large/summary.json new file mode 100644 index 0000000..a28c470 --- /dev/null +++ b/docs/results/intfloat/multilingual-e5-large/summary.json @@ -0,0 
+1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.706580687830688 + }, + "amazon_review_classification": { + "macro_f1": 0.5653992303516462 + }, + "massive_intent_classification": { + "macro_f1": 0.7577710251429624 + }, + "massive_scenario_classification": { + "macro_f1": 0.8859090262583831 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9296254722183955 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.7030214336558751 + }, + "jaqket": { + "ndcg@10": 0.5878065301444064 + }, + "mrtydi": { + "ndcg@10": 0.4363167873386172 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.8600225120389309 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.9469712765040588 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.7248023877969718 + } + }, + "STS": { + "jsick": { + "spearman": 0.7840335060728089 + }, + "jsts": { + "spearman": 0.8098724997856234 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5713023706914878 + }, + "mewsc16": { + "v_measure_score": 0.4534484706354193 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.621496984746364 + } + } +} \ No newline at end of file diff --git a/docs/results/intfloat/multilingual-e5-small/summary.json b/docs/results/intfloat/multilingual-e5-small/summary.json new file mode 100644 index 0000000..99a4423 --- /dev/null +++ b/docs/results/intfloat/multilingual-e5-small/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.6214130966524566 + }, + "amazon_review_classification": { + "macro_f1": 0.5127428912860463 + }, + "massive_intent_classification": { + "macro_f1": 0.7085230519111091 + }, + "massive_scenario_classification": { + "macro_f1": 0.8622036829599259 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9303349187158247 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.6411252958220891 + }, + "jaqket": { + "ndcg@10": 0.49966509556428645 + }, + "mrtydi": { + 
"ndcg@10": 0.36054822913647616 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.8520749151982298 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.9526123412781002 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.729906931983999 + } + }, + "STS": { + "jsick": { + "spearman": 0.8150271836013705 + }, + "jsts": { + "spearman": 0.786450077409501 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5470075389200084 + }, + "mewsc16": { + "v_measure_score": 0.391226933590049 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6219382321618744 + } + } +} \ No newline at end of file diff --git a/docs/results/oshizo/sbert-jsnli-luke-japanese-base-lite/summary.json b/docs/results/oshizo/sbert-jsnli-luke-japanese-base-lite/summary.json new file mode 100644 index 0000000..6b7309a --- /dev/null +++ b/docs/results/oshizo/sbert-jsnli-luke-japanese-base-lite/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7994675369288904 + }, + "amazon_review_classification": { + "macro_f1": 0.5748206591211895 + }, + "massive_intent_classification": { + "macro_f1": 0.8025949222725076 + }, + "massive_scenario_classification": { + "macro_f1": 0.8875250742566655 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9156331205981866 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.519938655947725 + }, + "jaqket": { + "ndcg@10": 0.4206746951743811 + }, + "mrtydi": { + "ndcg@10": 0.10116108109776817 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.4930421996747514 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.719369187830078 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.3258568875005778 + } + }, + "STS": { + "jsick": { + "spearman": 0.7211422898060521 + }, + "jsts": { + "spearman": 0.8109305772255819 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.4677177349822789 + }, + "mewsc16": { + "v_measure_score": 0.5389209739242912 + } + }, + "PairClassification": { + 
"paws_x_ja": { + "binary_f1": 0.6237623762376237 + } + } +} \ No newline at end of file diff --git a/docs/results/pkshatech/GLuCoSE-base-ja/summary.json b/docs/results/pkshatech/GLuCoSE-base-ja/summary.json new file mode 100644 index 0000000..9048691 --- /dev/null +++ b/docs/results/pkshatech/GLuCoSE-base-ja/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.8243606275521169 + }, + "amazon_review_classification": { + "macro_f1": 0.580654308041878 + }, + "massive_intent_classification": { + "macro_f1": 0.7885427536904928 + }, + "massive_scenario_classification": { + "macro_f1": 0.8794225134482166 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9190289767663239 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.6387979415478197 + }, + "jaqket": { + "ndcg@10": 0.3981609655991592 + }, + "mrtydi": { + "ndcg@10": 0.30281316435910444 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.7825765249971093 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.8206371528870603 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.5982476164344701 + } + }, + "STS": { + "jsick": { + "spearman": 0.7496711324072552 + }, + "jsts": { + "spearman": 0.824592262812859 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.49890886040948096 + }, + "mewsc16": { + "v_measure_score": 0.49676862904881375 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.663883089770355 + } + } +} \ No newline at end of file diff --git a/docs/results/pkshatech/simcse-ja-bert-base-clcmlp/summary.json b/docs/results/pkshatech/simcse-ja-bert-base-clcmlp/summary.json new file mode 100644 index 0000000..cc9f179 --- /dev/null +++ b/docs/results/pkshatech/simcse-ja-bert-base-clcmlp/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.6748573563374541 + }, + "amazon_review_classification": { + "macro_f1": 0.5084883283463678 + }, + 
"massive_intent_classification": { + "macro_f1": 0.7967050091211104 + }, + "massive_scenario_classification": { + "macro_f1": 0.871999260591497 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.914930352019688 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.41496851385134836 + }, + "jaqket": { + "ndcg@10": 0.46003031782136106 + }, + "mrtydi": { + "ndcg@10": 0.1019130492122431 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.4014036990267884 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.5962532652358485 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.2452584471710635 + } + }, + "STS": { + "jsick": { + "spearman": 0.7307715649457595 + }, + "jsts": { + "spearman": 0.8052279921326252 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.4476707933600858 + }, + "mewsc16": { + "v_measure_score": 0.5029508725037098 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6239830208701805 + } + } +} \ No newline at end of file diff --git a/docs/results/sentence-transformers/LaBSE/summary.json b/docs/results/sentence-transformers/LaBSE/summary.json new file mode 100644 index 0000000..de8fd21 --- /dev/null +++ b/docs/results/sentence-transformers/LaBSE/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7361214773958769 + }, + "amazon_review_classification": { + "macro_f1": 0.516957890685124 + }, + "massive_intent_classification": { + "macro_f1": 0.7698802987251081 + }, + "massive_scenario_classification": { + "macro_f1": 0.8835366493433755 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9162507647227857 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.4310160105414995 + }, + "jaqket": { + "ndcg@10": 0.34245849139132745 + }, + "mrtydi": { + "ndcg@10": 0.04238747941951049 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.48918127058907085 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.7513086500303519 + }, + "nlp_journal_title_intro": { + 
"ndcg@10": 0.35089108319096984 + } + }, + "STS": { + "jsick": { + "spearman": 0.7698905918950973 + }, + "jsts": { + "spearman": 0.7612337568248777 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.4829337123233023 + }, + "mewsc16": { + "v_measure_score": 0.41471299546625956 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.623321554770318 + } + } +} \ No newline at end of file diff --git a/docs/results/sentence-transformers/stsb-xlm-r-multilingual/summary.json b/docs/results/sentence-transformers/stsb-xlm-r-multilingual/summary.json new file mode 100644 index 0000000..12f71a2 --- /dev/null +++ b/docs/results/sentence-transformers/stsb-xlm-r-multilingual/summary.json @@ -0,0 +1,62 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7565022696601644 + }, + "amazon_review_classification": { + "macro_f1": 0.5131771609073525 + }, + "massive_intent_classification": { + "macro_f1": 0.7427818411370812 + }, + "massive_scenario_classification": { + "macro_f1": 0.8609512679368835 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.901984958764163 + } + }, + "Retrieval": { + "jagovfaqs_22k": { + "ndcg@10": 0.2511106863952595 + }, + "jaqket": { + "ndcg@10": 0.21606007987072834 + }, + "mrtydi": { + "ndcg@10": 0.027590779174942116 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.2848558252647936 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.3646520309406354 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.11545016260271045 + } + }, + "STS": { + "jsick": { + "spearman": 0.7236409557069434 + }, + "jsts": { + "spearman": 0.7843597058304203 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.24487129939212224 + }, + "mewsc16": { + "v_measure_score": 0.304278393205056 + } + }, + "PairClassification": { + "paws_x_ja": { + "binary_f1": 0.6219686162624821 + } + } +} \ No newline at end of file diff --git a/leaderboard.md b/leaderboard.md new file mode 100644 index 0000000..107f2e9 --- 
/dev/null +++ b/leaderboard.md @@ -0,0 +1,188 @@ +# Leaderboard +This leaderboard shows the results stored under `docs/results`. The scores are all multiplied by 100. + +## Summary + +The summary shows the average scores within each task. + +| Model | Avg. | Retrieval | STS | Classification | Reranking | Clustering | PairClassification | +|:----------------------------------------------|:----------|:------------|:----------|:-----------------|:------------|:-------------|:---------------------| +| intfloat/multilingual-e5-large | **71.65** | 70.98 | 79.70 | 72.89 | 92.96 | 51.24 | 62.15 | +| pkshatech/GLuCoSE-base-ja | 70.44 | 59.02 | 78.71 | 76.82 | 91.90 | 49.78 | **66.39** | +| intfloat/multilingual-e5-base | 70.12 | 68.21 | 79.84 | 69.30 | 92.85 | 48.26 | 62.26 | +| OpenAI/text-embedding-3-large | 69.63 | **74.48** | 82.52 | **77.58** | **93.58** | 27.29 | 62.35 | +| intfloat/multilingual-e5-small | 69.52 | 67.27 | 80.07 | 67.62 | 93.03 | 46.91 | 62.19 | +| cl-nagoya/sup-simcse-ja-base | 68.56 | 49.64 | 82.05 | 73.47 | 91.83 | **51.79** | 62.57 | +| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 66.89 | 47.38 | 78.99 | 73.13 | 91.30 | 48.25 | 62.27 | +| oshizo/sbert-jsnli-luke-japanese-base-lite | 66.75 | 43.00 | 76.60 | 76.61 | 91.56 | 50.33 | 62.38 | +| OpenAI/text-embedding-3-small | 66.74 | 66.39 | 79.46 | 73.06 | 92.92 | 26.34 | 62.27 | +| cl-nagoya/sup-simcse-ja-large | 66.51 | 37.62 | **83.18** | 73.73 | 91.48 | 50.56 | 62.51 | +| cl-nagoya/unsup-simcse-ja-large | 66.27 | 40.53 | 80.56 | 74.66 | 90.95 | 48.41 | 62.49 | +| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 66.23 | 46.36 | 77.49 | 73.30 | 91.16 | 46.68 | 62.38 | +| OpenAI/text-embedding-ada-002 | 65.84 | 64.38 | 79.02 | 69.75 | 93.04 | 26.47 | 62.40 | +| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 65.28 | 40.82 | 78.28 | 73.47 | 90.95 | 45.81 | 62.35 | +| MU-Kindai/Japanese-MixCSE-BERT-base | 65.14 | 42.59 | 77.05 | 72.90 | 91.01 | 44.95 | 62.33 | +| cl-nagoya/unsup-simcse-ja-base | 65.07 | 40.23 | 
78.72 | 73.07 | 91.16 | 44.77 | 62.44 | +| MU-Kindai/Japanese-DiffCSE-BERT-base | 64.77 | 41.79 | 75.50 | 73.77 | 90.95 | 44.22 | 62.38 | +| sentence-transformers/LaBSE | 64.70 | 40.12 | 76.56 | 72.66 | 91.63 | 44.88 | 62.33 | +| pkshatech/simcse-ja-bert-base-clcmlp | 64.42 | 37.00 | 76.80 | 71.30 | 91.49 | 47.53 | 62.40 | +| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 64.15 | 41.32 | 74.66 | 72.76 | 90.66 | 43.11 | 62.37 | +| colorfulscoop/sbert-base-ja | 58.85 | 16.52 | 70.42 | 69.07 | 89.97 | 44.81 | 62.31 | +| sentence-transformers/stsb-xlm-r-multilingual | 58.01 | 21.00 | 75.40 | 71.84 | 90.20 | 27.46 | 62.20 | + +## Retrieval +| Model | Avg. | jagovfaqs_22k
(ndcg@10) | jaqket
(ndcg@10) | mrtydi
(ndcg@10) | nlp_journal_abs_intro
(ndcg@10) | nlp_journal_title_abs
(ndcg@10) | nlp_journal_title_intro
(ndcg@10) | +|:----------------------------------------------|:----------|:-----------------------------|:----------------------|:----------------------|:-------------------------------------|:-------------------------------------|:---------------------------------------| +| OpenAI/text-embedding-3-large | **74.48** | **72.41** | 48.21 | 34.88 | **99.33** | **96.55** | **95.47** | +| intfloat/multilingual-e5-large | 70.98 | 70.30 | **58.78** | **43.63** | 86.00 | 94.70 | 72.48 | +| intfloat/multilingual-e5-base | 68.21 | 65.34 | 50.67 | 38.38 | 87.10 | 94.73 | 73.05 | +| intfloat/multilingual-e5-small | 67.27 | 64.11 | 49.97 | 36.05 | 85.21 | 95.26 | 72.99 | +| OpenAI/text-embedding-3-small | 66.39 | 64.02 | 33.94 | 20.03 | 98.47 | 91.70 | 90.17 | +| OpenAI/text-embedding-ada-002 | 64.38 | 61.02 | 42.56 | 14.51 | 94.99 | 91.23 | 81.98 | +| pkshatech/GLuCoSE-base-ja | 59.02 | 63.88 | 39.82 | 30.28 | 78.26 | 82.06 | 59.82 | +| cl-nagoya/sup-simcse-ja-base | 49.64 | 51.62 | 50.25 | 13.98 | 68.08 | 65.71 | 48.22 | +| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 47.38 | 50.14 | 45.84 | 13.00 | 55.09 | 74.97 | 45.24 | +| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 46.36 | 47.39 | 39.57 | 11.44 | 64.16 | 70.23 | 45.37 | +| oshizo/sbert-jsnli-luke-japanese-base-lite | 43.00 | 51.99 | 42.07 | 10.12 | 49.30 | 71.94 | 32.59 | +| MU-Kindai/Japanese-MixCSE-BERT-base | 42.59 | 42.37 | 37.72 | 7.88 | 63.70 | 64.13 | 39.73 | +| MU-Kindai/Japanese-DiffCSE-BERT-base | 41.79 | 42.31 | 36.20 | 7.81 | 60.77 | 64.34 | 39.32 | +| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 41.32 | 44.11 | 39.61 | 8.15 | 62.76 | 58.39 | 34.89 | +| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 40.82 | 47.04 | 44.10 | 11.43 | 43.43 | 62.41 | 36.52 | +| cl-nagoya/unsup-simcse-ja-large | 40.53 | 45.09 | 34.60 | 5.75 | 55.07 | 63.07 | 39.61 | +| cl-nagoya/unsup-simcse-ja-base | 40.23 | 46.00 | 39.46 | 5.55 | 60.26 | 55.63 | 34.49 | +| sentence-transformers/LaBSE | 40.12 | 43.10 | 34.25 | 4.24 | 48.92 | 75.13 | 
35.09 | +| cl-nagoya/sup-simcse-ja-large | 37.62 | 46.84 | 39.88 | 11.83 | 63.43 | 37.93 | 25.79 | +| pkshatech/simcse-ja-bert-base-clcmlp | 37.00 | 41.50 | 46.00 | 10.19 | 40.14 | 59.63 | 24.53 | +| sentence-transformers/stsb-xlm-r-multilingual | 21.00 | 25.11 | 21.61 | 2.76 | 28.49 | 36.47 | 11.55 | +| colorfulscoop/sbert-base-ja | 16.52 | 21.50 | 13.16 | 0.44 | 28.78 | 22.40 | 12.82 | + +## STS +| Model | Avg. | jsick
(spearman) | jsts
(spearman) | +|:----------------------------------------------|:----------|:----------------------|:---------------------| +| cl-nagoya/sup-simcse-ja-large | **83.18** | **83.80** | 82.57 | +| OpenAI/text-embedding-3-large | 82.52 | 81.27 | **83.77** | +| cl-nagoya/sup-simcse-ja-base | 82.05 | 82.83 | 81.27 | +| cl-nagoya/unsup-simcse-ja-large | 80.56 | 80.15 | 80.98 | +| intfloat/multilingual-e5-small | 80.07 | 81.50 | 78.65 | +| intfloat/multilingual-e5-base | 79.84 | 81.28 | 78.39 | +| intfloat/multilingual-e5-large | 79.70 | 78.40 | 80.99 | +| OpenAI/text-embedding-3-small | 79.46 | 80.83 | 78.08 | +| OpenAI/text-embedding-ada-002 | 79.02 | 79.09 | 78.94 | +| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 78.99 | 79.84 | 78.14 | +| cl-nagoya/unsup-simcse-ja-base | 78.72 | 78.49 | 78.95 | +| pkshatech/GLuCoSE-base-ja | 78.71 | 74.97 | 82.46 | +| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 78.28 | 78.75 | 77.81 | +| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 77.49 | 78.18 | 76.81 | +| MU-Kindai/Japanese-MixCSE-BERT-base | 77.05 | 77.57 | 76.53 | +| pkshatech/simcse-ja-bert-base-clcmlp | 76.80 | 73.08 | 80.52 | +| oshizo/sbert-jsnli-luke-japanese-base-lite | 76.60 | 72.11 | 81.09 | +| sentence-transformers/LaBSE | 76.56 | 76.99 | 76.12 | +| MU-Kindai/Japanese-DiffCSE-BERT-base | 75.50 | 75.42 | 75.58 | +| sentence-transformers/stsb-xlm-r-multilingual | 75.40 | 72.36 | 78.44 | +| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 74.66 | 74.64 | 74.68 | +| colorfulscoop/sbert-base-ja | 70.42 | 66.59 | 74.24 | + +## Classification +| Model | Avg. | amazon_counterfactual
(macro_f1) | amazon_review
(macro_f1) | massive_intent
(macro_f1) | massive_scenario
(macro_f1) | +|:----------------------------------------------|:----------|:--------------------------------------|:------------------------------|:-------------------------------|:---------------------------------| +| OpenAI/text-embedding-3-large | **77.58** | 77.90 | **60.44** | **80.91** | **91.08** | +| pkshatech/GLuCoSE-base-ja | 76.82 | **82.44** | 58.07 | 78.85 | 87.94 | +| oshizo/sbert-jsnli-luke-japanese-base-lite | 76.61 | 79.95 | 57.48 | 80.26 | 88.75 | +| cl-nagoya/unsup-simcse-ja-large | 74.66 | 76.79 | 55.37 | 79.13 | 87.36 | +| MU-Kindai/Japanese-DiffCSE-BERT-base | 73.77 | 78.10 | 51.56 | 78.79 | 86.63 | +| cl-nagoya/sup-simcse-ja-large | 73.73 | 73.21 | 54.76 | 79.23 | 87.72 | +| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 73.47 | 77.25 | 53.42 | 76.83 | 86.39 | +| cl-nagoya/sup-simcse-ja-base | 73.47 | 72.34 | 54.41 | 79.52 | 87.60 | +| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 73.30 | 76.20 | 51.52 | 78.95 | 86.54 | +| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 73.13 | 76.36 | 52.75 | 76.88 | 86.51 | +| cl-nagoya/unsup-simcse-ja-base | 73.07 | 73.30 | 53.93 | 79.07 | 85.97 | +| OpenAI/text-embedding-3-small | 73.06 | 70.01 | 55.92 | 77.66 | 88.67 | +| MU-Kindai/Japanese-MixCSE-BERT-base | 72.90 | 77.62 | 50.86 | 77.19 | 85.93 | +| intfloat/multilingual-e5-large | 72.89 | 70.66 | 56.54 | 75.78 | 88.59 | +| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 72.76 | 76.20 | 52.06 | 77.89 | 84.90 | +| sentence-transformers/LaBSE | 72.66 | 73.61 | 51.70 | 76.99 | 88.35 | +| sentence-transformers/stsb-xlm-r-multilingual | 71.84 | 75.65 | 51.32 | 74.28 | 86.10 | +| pkshatech/simcse-ja-bert-base-clcmlp | 71.30 | 67.49 | 50.85 | 79.67 | 87.20 | +| OpenAI/text-embedding-ada-002 | 69.75 | 64.42 | 53.13 | 74.57 | 86.89 | +| intfloat/multilingual-e5-base | 69.30 | 63.67 | 54.24 | 72.78 | 86.53 | +| colorfulscoop/sbert-base-ja | 69.07 | 72.21 | 47.95 | 72.52 | 83.62 | +| intfloat/multilingual-e5-small | 67.62 | 62.14 | 51.27 | 70.85 | 86.22 | + +## Reranking +| 
Model | Avg. | esci
(ndcg@10) | +|:----------------------------------------------|:----------|:--------------------| +| OpenAI/text-embedding-3-large | **93.58** | **93.58** | +| OpenAI/text-embedding-ada-002 | 93.04 | 93.04 | +| intfloat/multilingual-e5-small | 93.03 | 93.03 | +| intfloat/multilingual-e5-large | 92.96 | 92.96 | +| OpenAI/text-embedding-3-small | 92.92 | 92.92 | +| intfloat/multilingual-e5-base | 92.85 | 92.85 | +| pkshatech/GLuCoSE-base-ja | 91.90 | 91.90 | +| cl-nagoya/sup-simcse-ja-base | 91.83 | 91.83 | +| sentence-transformers/LaBSE | 91.63 | 91.63 | +| oshizo/sbert-jsnli-luke-japanese-base-lite | 91.56 | 91.56 | +| pkshatech/simcse-ja-bert-base-clcmlp | 91.49 | 91.49 | +| cl-nagoya/sup-simcse-ja-large | 91.48 | 91.48 | +| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 91.30 | 91.30 | +| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 91.16 | 91.16 | +| cl-nagoya/unsup-simcse-ja-base | 91.16 | 91.16 | +| MU-Kindai/Japanese-MixCSE-BERT-base | 91.01 | 91.01 | +| cl-nagoya/unsup-simcse-ja-large | 90.95 | 90.95 | +| MU-Kindai/Japanese-DiffCSE-BERT-base | 90.95 | 90.95 | +| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 90.95 | 90.95 | +| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 90.66 | 90.66 | +| sentence-transformers/stsb-xlm-r-multilingual | 90.20 | 90.20 | +| colorfulscoop/sbert-base-ja | 89.97 | 89.97 | + +## Clustering +| Model | Avg. | livedoor_news
(v_measure_score) | mewsc16
(v_measure_score) | +|:----------------------------------------------|:----------|:-------------------------------------|:-------------------------------| +| cl-nagoya/sup-simcse-ja-base | **51.79** | 52.67 | 50.91 | +| intfloat/multilingual-e5-large | 51.24 | **57.13** | 45.34 | +| cl-nagoya/sup-simcse-ja-large | 50.56 | 50.75 | 50.38 | +| oshizo/sbert-jsnli-luke-japanese-base-lite | 50.33 | 46.77 | **53.89** | +| pkshatech/GLuCoSE-base-ja | 49.78 | 49.89 | 49.68 | +| cl-nagoya/unsup-simcse-ja-large | 48.41 | 50.90 | 45.92 | +| intfloat/multilingual-e5-base | 48.26 | 55.03 | 41.49 | +| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 48.25 | 53.20 | 43.31 | +| pkshatech/simcse-ja-bert-base-clcmlp | 47.53 | 44.77 | 50.30 | +| intfloat/multilingual-e5-small | 46.91 | 54.70 | 39.12 | +| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 46.68 | 53.02 | 40.35 | +| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 45.81 | 48.45 | 43.17 | +| MU-Kindai/Japanese-MixCSE-BERT-base | 44.95 | 52.62 | 37.28 | +| sentence-transformers/LaBSE | 44.88 | 48.29 | 41.47 | +| colorfulscoop/sbert-base-ja | 44.81 | 42.99 | 46.64 | +| cl-nagoya/unsup-simcse-ja-base | 44.77 | 52.23 | 37.31 | +| MU-Kindai/Japanese-DiffCSE-BERT-base | 44.22 | 49.67 | 38.77 | +| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 43.11 | 41.04 | 45.18 | +| sentence-transformers/stsb-xlm-r-multilingual | 27.46 | 24.49 | 30.43 | +| OpenAI/text-embedding-3-large | 27.29 | 5.02 | 49.55 | +| OpenAI/text-embedding-ada-002 | 26.47 | 6.03 | 46.92 | +| OpenAI/text-embedding-3-small | 26.34 | 5.13 | 47.55 | + +## PairClassification +| Model | Avg. | paws_x_ja
(binary_f1) | +|:----------------------------------------------|:----------|:---------------------------| +| pkshatech/GLuCoSE-base-ja | **66.39** | **66.39** | +| cl-nagoya/sup-simcse-ja-base | 62.57 | 62.57 | +| cl-nagoya/sup-simcse-ja-large | 62.51 | 62.51 | +| cl-nagoya/unsup-simcse-ja-large | 62.49 | 62.49 | +| cl-nagoya/unsup-simcse-ja-base | 62.44 | 62.44 | +| pkshatech/simcse-ja-bert-base-clcmlp | 62.40 | 62.40 | +| OpenAI/text-embedding-ada-002 | 62.40 | 62.40 | +| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 62.38 | 62.38 | +| oshizo/sbert-jsnli-luke-japanese-base-lite | 62.38 | 62.38 | +| MU-Kindai/Japanese-DiffCSE-BERT-base | 62.38 | 62.38 | +| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 62.37 | 62.37 | +| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 62.35 | 62.35 | +| OpenAI/text-embedding-3-large | 62.35 | 62.35 | +| MU-Kindai/Japanese-MixCSE-BERT-base | 62.33 | 62.33 | +| sentence-transformers/LaBSE | 62.33 | 62.33 | +| colorfulscoop/sbert-base-ja | 62.31 | 62.31 | +| OpenAI/text-embedding-3-small | 62.27 | 62.27 | +| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 62.27 | 62.27 | +| intfloat/multilingual-e5-base | 62.26 | 62.26 | +| sentence-transformers/stsb-xlm-r-multilingual | 62.20 | 62.20 | +| intfloat/multilingual-e5-small | 62.19 | 62.19 | +| intfloat/multilingual-e5-large | 62.15 | 62.15 | + diff --git a/make_leaderboard.py b/make_leaderboard.py new file mode 100644 index 0000000..ff3a330 --- /dev/null +++ b/make_leaderboard.py @@ -0,0 +1,106 @@
+"""Generate `leaderboard.md` from the `summary.json` files under `docs/results`.
+
+Every `docs/results/<org>/<model>/summary.json` is read as
+`{task: {dataset: {metric: score}}}`. One markdown table is written per task,
+preceded by a summary table holding the per-task averages of each model.
+"""
+
+import json
+from collections import defaultdict
+from pathlib import Path
+
+from tabulate import tabulate
+
+dataset_name_aliases = {
+    "amazon_counterfactual_classification": "amazon_counterfactual",
+    "amazon_review_classification": "amazon_review",
+    "massive_intent_classification": "massive_intent",
+    "massive_scenario_classification": "massive_scenario",
+}
+
+TASK_ORDER = ["Retrieval", "STS", "Classification", "Reranking", "Clustering", "PairClassification"]
+SUMMARY_KEY = "Summary"
+AVG_COLUMN_NAME = "Avg."
+
+
+def format_score(score: float) -> str:
+    """Render a raw score as a percentage string with two decimals, e.g. 0.5 becomes 50.00."""
+    return f"{score * 100:.2f}"
+
+
+# Collect the results from the results folder.
+# {task_name: {model_signature: {"<dataset_name><br>(<metric_name>)": score}}}
+# all_results[SUMMARY_KEY] maps each model to {task_name: mean score over the datasets of that task}.
+all_results: dict[str, dict[str, dict[str, float]]] = defaultdict(lambda: defaultdict(dict))
+# rglob yields paths in file-system order; sorting keeps the column order and the order of
+# models with equal averages reproducible across machines.
+for summary_file in sorted(Path("docs/results").rglob("summary.json")):
+    with open(summary_file, encoding="utf-8") as f:
+        summary = json.load(f)
+
+    org_name = summary_file.parent.parent.name
+    model_name = summary_file.parent.name
+    model_signature = f"{org_name}/{model_name}"
+
+    for task_name, task_results in summary.items():
+        task_results_formatted: dict[str, float] = {}
+        task_scores: list[float] = []
+        for dataset_name, metric_dict in task_results.items():
+            # Only the first metric stored for a dataset is reported.
+            metric_name, score = next(iter(metric_dict.items()))
+            dataset_name = dataset_name_aliases.get(dataset_name, dataset_name)
+            # `<br>` puts the metric name on its own line inside the table header cell.
+            task_results_formatted[f"{dataset_name}<br>({metric_name})"] = score
+            task_scores.append(score)
+        if not task_scores:
+            # A task without datasets has no average; skip it instead of dividing by zero.
+            continue
+        all_results[task_name][model_signature] = task_results_formatted
+        all_results[SUMMARY_KEY][model_signature][task_name] = sum(task_scores) / len(task_scores)
+
+# Create a markdown table for each task.
+markdown_tables: dict[str, str] = {}
+for task_name, task_results in all_results.items():
+    if task_name == SUMMARY_KEY:
+        dataset_keys = TASK_ORDER
+    else:
+        # The columns follow the datasets of the first model collected for this task.
+        dataset_keys = list(next(iter(task_results.values())))
+
+    header = ["Model", AVG_COLUMN_NAME, *dataset_keys]
+    table_list: list[list[str | float]] = []
+    for model_signature, dataset_scores in task_results.items():
+        missing = [k for k in dataset_keys if k not in dataset_scores]
+        if missing:
+            raise KeyError(f"{model_signature} has no {task_name} score for: {', '.join(missing)}")
+        model_scores = [dataset_scores[k] for k in dataset_keys]
+        average_score = sum(model_scores) / len(model_scores)
+        table_list.append([model_signature, average_score, *model_scores])
+
+    # Sort by the average score, best model first.
+    avg_idx = header.index(AVG_COLUMN_NAME)
+    table_list.sort(key=lambda row: row[avg_idx], reverse=True)
+
+    # Format every score column and make the highest score of each column bold.
+    for column_idx in range(avg_idx, len(header)):
+        max_score = max(row[column_idx] for row in table_list)
+        for row in table_list:
+            formatted = format_score(row[column_idx])
+            row[column_idx] = f"**{formatted}**" if row[column_idx] == max_score else formatted
+
+    markdown_tables[task_name] = tabulate([header, *table_list], headers="firstrow", tablefmt="pipe")
+
+# Dump the markdown tables to a file.
+with open("leaderboard.md", "w", encoding="utf-8") as f:
+    f.write("# Leaderboard\n")
+    f.write(
+        "This leaderboard shows the results stored under `docs/results`. The scores are all multiplied by 100.\n\n"
+    )
+    for task_name in [SUMMARY_KEY, *TASK_ORDER]:
+        f.write(f"## {task_name}\n")
+
+        if task_name == SUMMARY_KEY:
+            f.write("\nThe summary shows the average scores within each task.\n\n")
+
+        f.write(markdown_tables[task_name])
+        f.write("\n\n")
diff --git a/poetry.lock b/poetry.lock index 71b1f79..40fbe9f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1454,14 +1454,13 @@ files = [ [[package]] name = "nvidia-nvjitlink-cu12" -version = "12.5.40" +version = "12.5.82" description = "Nvidia JIT LTO Library" optional = false python-versions = ">=3" files = [ - {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-manylinux2014_aarch64.whl", hash = "sha256:004186d5ea6a57758fd6d57052a123c73a4815adf365eb8dd6a85c9eaa7535ff"}, - {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-manylinux2014_x86_64.whl", hash = "sha256:d9714f27c1d0f0895cd8915c07a87a1d0029a0aa36acaf9156952ec2a8a12189"}, - {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-win_amd64.whl", hash = "sha256:c3401dc8543b52d3a8158007a0c1ab4e9c768fcbd24153a48c86972102197ddd"}, + {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"}, + {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"}, ] [[package]] @@ -2621,6 +2620,20 @@ files = [ [package.dependencies] mpmath = ">=1.1.0,<1.4.0" +[[package]] +name = "tabulate" +version = "0.9.0" +description = "Pretty-print tabular data" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}, + {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}, +] + +[package.extras] +widechars = ["wcwidth"] + [[package]] name = "tbb" version =
"2021.13.0" @@ -3389,4 +3402,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = ">=3.10,<4.0" -content-hash = "bfff8c9db1f28df560b71b1c09802902ee89f5c39baa84ffda906a9238d38df8" +content-hash = "a2c9ed2cef63429fda1482752acb674fe3b39b94498bbe2c177d0b8ac9558c44" diff --git a/pyproject.toml b/pyproject.toml index c4775df..8c27ffb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,15 +32,17 @@ pytest-mock = "^3.14.0" tiktoken = "^0.6.0" numpy = "^1.26" accelerate = "^0.31.0" +tabulate = "^0.9.0" [tool.poetry.group.dev.dependencies] black = "^23.11.0" isort = "^5.12.0" mypy = "^1.7.1" flake8 = "^7.0.0" +tabulate = "^0.9.0" [tool.black] line-length = 119 [tool.isort] -profile = "black" +profile = "black" \ No newline at end of file