Skip to content

Commit

Permalink
docs: Update mteb(eng) calculation (#1258)
Browse files Browse the repository at this point in the history
* Update mteb(eng) calculation

* Fixed citations

* Update MTEB(eng) + MTEB(multilingual)
  • Loading branch information
KennethEnevoldsen authored Oct 3, 2024
1 parent 647c295 commit 11518ed
Show file tree
Hide file tree
Showing 7 changed files with 1,692 additions and 674 deletions.
4 changes: 2 additions & 2 deletions mteb/tasks/Clustering/eng/ArXivHierarchicalClustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class ArXivHierarchicalClusteringP2P(AbsTaskClusteringFast):
annotations_creators="derived",
dialect=["Thematic clustering"],
sample_creation="found",
bibtex_citation="@misc{arXiv.org e-Print archive, url={https://arxiv.org/} }",
bibtex_citation="",
descriptive_stats={
"n_samples": {"test": N_SAMPLES},
"test": {
Expand Down Expand Up @@ -217,7 +217,7 @@ class ArXivHierarchicalClusteringS2S(AbsTaskClusteringFast):
annotations_creators="derived",
dialect=[],
sample_creation="found",
bibtex_citation="@misc{arXiv.org e-Print archive, url={https://arxiv.org/} }",
bibtex_citation="",
descriptive_stats={
"n_samples": {"test": N_SAMPLES},
"avg_character_length": {"test": 1009.98},
Expand Down
26 changes: 13 additions & 13 deletions scripts/task_selection/mteb_lite_results.csv
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
,model,revision,mean,mean (STS),mean (Classification),mean (Reranking),mean (Retrieval),mean (Clustering),mean (PairClassification),mean (weighted by task type),borda_count,Total Evaluation time (hours)
11,intfloat/e5-mistral-7b-instruct,07163b72af1488142a360786df853f237b1a3ca1,0.691,0.84,0.674,0.498,0.573,0.518,0.884,0.665,275.0,1.453
2,GritLM/GritLM-7B,13f00a0e36500c80ce12870ea513846a066004af,0.686,0.828,0.695,0.496,0.572,0.5,0.873,0.661,265.0,1.707
7,intfloat/multilingual-e5-large-instruct,baa7be480a7de1539afce709c8f13f833a510e0a,0.678,0.846,0.642,0.487,0.547,0.499,0.862,0.647,256.0,1.127
3,intfloat/multilingual-e5-large,4dc6d853a804b9c8886ede6dda8a073b7dc08a81,0.643,0.815,0.658,0.447,0.493,0.427,0.847,0.615,190.0,1.211
9,intfloat/multilingual-e5-base,d13f1b27baf31030b7fd040960d60d909913633f,0.627,0.799,0.639,0.443,0.459,0.427,0.836,0.6,147.0,0.72
6,sentence-transformers/all-mpnet-base-v2,84f2bcc00d77236f9e89c8a360a00fb1139bf47d,0.595,0.724,0.513,0.484,0.469,0.458,0.83,0.58,143.0,0.741
4,sentence-transformers/paraphrase-multilingual-mpnet-base-v2,79f2382ceacceacdf38563d7c5d16b9ff8d725d6,0.605,0.801,0.637,0.452,0.342,0.423,0.817,0.579,130.0,0.694
8,sentence-transformers/all-MiniLM-L12-v2,a05860a77cef7b37e0048a7864658139bc18a854,0.581,0.711,0.523,0.475,0.433,0.438,0.825,0.568,122.0,0.571
10,sentence-transformers/all-MiniLM-L6-v2,8b3219a92973c328a8e22fadcfa821b5dc75636a,0.579,0.708,0.515,0.471,0.43,0.446,0.824,0.566,106.0,0.521
0,intfloat/multilingual-e5-small,e4ce9877abf3edfe10b0d82785e83bdcb973e22e,0.611,0.785,0.62,0.432,0.43,0.413,0.827,0.584,102.0,0.565
5,sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,bf3bf13ab40c3157080a7ab344c831b9ad18b5eb,0.584,0.778,0.59,0.454,0.328,0.411,0.8,0.56,76.0,0.618
1,sentence-transformers/LaBSE,e34fab64a3011d2176c99545a93d5cbddc9a91b7,0.533,0.712,0.637,0.413,0.184,0.37,0.789,0.518,36.0,0.675
,model,revision,mean,mean (Clustering),mean (STS),mean (Classification),mean (Reranking),mean (Retrieval),mean (PairClassification),mean (weighted by task type),borda_count,Total Evaluation time (hours),Total CO2-eq emissions (kg)
11,intfloat/e5-mistral-7b-instruct,07163b72af1488142a360786df853f237b1a3ca1,0.67,0.514,0.836,0.752,0.498,0.548,0.884,0.672,393.0,2.502,2.971
2,GritLM/GritLM-7B,13f00a0e36500c80ce12870ea513846a066004af,0.664,0.508,0.825,0.77,0.496,0.532,0.873,0.667,384.0,3.111,3.409
7,intfloat/multilingual-e5-large-instruct,baa7be480a7de1539afce709c8f13f833a510e0a,0.652,0.499,0.843,0.732,0.487,0.51,0.862,0.656,357.0,2.033,1.418
3,intfloat/multilingual-e5-large,4dc6d853a804b9c8886ede6dda8a073b7dc08a81,0.621,0.428,0.806,0.728,0.447,0.49,0.847,0.624,270.0,2.549,1.563
6,sentence-transformers/all-mpnet-base-v2,84f2bcc00d77236f9e89c8a360a00fb1139bf47d,0.56,0.466,0.722,0.566,0.484,0.419,0.83,0.581,211.0,1.19,0.688
9,intfloat/multilingual-e5-base,d13f1b27baf31030b7fd040960d60d909913633f,0.602,0.422,0.791,0.7,0.443,0.461,0.836,0.609,211.0,1.17,0.691
4,sentence-transformers/paraphrase-multilingual-mpnet-base-v2,79f2382ceacceacdf38563d7c5d16b9ff8d725d6,0.573,0.435,0.798,0.686,0.452,0.341,0.817,0.588,188.0,1.017,0.563
8,sentence-transformers/all-MiniLM-L12-v2,a05860a77cef7b37e0048a7864658139bc18a854,0.547,0.446,0.707,0.558,0.475,0.407,0.825,0.57,172.0,0.814,0.442
10,sentence-transformers/all-MiniLM-L6-v2,8b3219a92973c328a8e22fadcfa821b5dc75636a,0.544,0.449,0.704,0.554,0.471,0.398,0.824,0.567,149.0,0.733,0.391
0,intfloat/multilingual-e5-small,e4ce9877abf3edfe10b0d82785e83bdcb973e22e,0.584,0.408,0.776,0.677,0.432,0.437,0.827,0.593,147.0,0.833,0.459
5,sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,bf3bf13ab40c3157080a7ab344c831b9ad18b5eb,0.551,0.417,0.775,0.644,0.454,0.328,0.8,0.57,109.0,0.879,0.469
1,sentence-transformers/LaBSE,e34fab64a3011d2176c99545a93d5cbddc9a91b7,0.486,0.361,0.702,0.668,0.413,0.168,0.789,0.517,49.0,1.02,0.582
66 changes: 39 additions & 27 deletions scripts/task_selection/mteb_lite_tasks.csv
Original file line number Diff line number Diff line change
@@ -1,29 +1,41 @@
,name,type,languages,domains,license
0,AmazonCounterfactualClassification,Classification,"['deu', 'eng', 'jpn']","['Reviews', 'Written']",CC BY 4.0
0,AmazonCounterfactualClassification,Classification,"['deu', 'eng', 'jpn']","['Reviews', 'Written']",cc-by-4.0
1,ArguAna,Retrieval,['eng'],"['Medical', 'Written']",cc-by-sa-4.0
2,ArXivHierarchicalClusteringP2P,Clustering,['eng'],"['Academic', 'Written']",CC0
3,AskUbuntuDupQuestions,Reranking,['eng'],,
4,BIOSSES,STS,['eng'],,
5,BiorxivClusteringP2P.v2,Clustering,['eng'],"['Academic', 'Written']",https://www.biorxiv.org/content/about-biorxiv
6,CQADupstackGamingRetrieval,Retrieval,['eng'],,
7,FiQA2018,Retrieval,['eng'],,
8,MassiveIntentClassification,Classification,"['afr', 'amh', 'ara', 'aze', 'ben', 'cmo', 'cym', 'dan', 'deu', 'ell', 'eng', 'fas', 'fin', 'fra', 'heb', 'hin', 'hun', 'hye', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kan', 'kat', 'khm', 'kor', 'lav', 'mal', 'mon', 'msa', 'mya', 'nld', 'nob', 'pol', 'por', 'ron', 'rus', 'slv', 'spa', 'sqi', 'swa', 'swe', 'tam', 'tel', 'tgl', 'tha', 'tur', 'urd', 'vie']",['Spoken'],Apache 2.0
9,MedrxivClusteringP2P.v2,Clustering,['eng'],"['Academic', 'Medical', 'Written']",https://www.medrxiv.org/content/about-medrxiv
10,MindSmallReranking,Reranking,['eng'],"['News', 'Written']",https://github.com/msnews/MIND/blob/master/MSR%20License_Data.pdf
11,SCIDOCS,Retrieval,['eng'],"['Academic', 'Written', 'Non-fiction']",cc-by-sa-4.0
12,SICK-R,STS,['eng'],,
13,STS12,STS,['eng'],"['Encyclopaedic', 'News', 'Written']",Not specified
14,STS13,STS,['eng'],"['Web', 'News', 'Non-fiction', 'Written']",Not specified
15,STS15,STS,['eng'],"['Blog', 'News', 'Web', 'Written', 'Spoken']",Not specified
16,STS16,STS,['eng'],"['Blog', 'Web', 'Spoken']",Not specified
17,STS17,STS,"['ara', 'deu', 'eng', 'fra', 'ita', 'kor', 'nld', 'spa', 'tur']","['News', 'Web', 'Written']",Not specified
18,STS22.v2,STS,"['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'pol', 'rus', 'spa', 'tur']","['News', 'Written']",Not specified
19,STSBenchmark,STS,['eng'],,
20,SprintDuplicateQuestions,PairClassification,['eng'],"['Programming', 'Written']",Not specified
21,StackExchangeClustering.v2,Clustering,['eng'],"['Web', 'Written']",Not specified
22,StackExchangeClusteringP2P.v2,Clustering,['eng'],"['Web', 'Written']",Not specified
23,TRECCOVID,Retrieval,['eng'],,
24,ToxicConversationsClassification,Classification,['eng'],"['Social', 'Written']",CC BY 4.0
25,TweetSentimentExtractionClassification,Classification,['eng'],"['Social', 'Written']",Not specified
26,TwitterSemEval2015,PairClassification,['eng'],,
27,TwitterURLCorpus,PairClassification,['eng'],,
2,ArXivHierarchicalClusteringP2P,Clustering,['eng'],"['Academic', 'Written']",cc0-1.0
3,ArXivHierarchicalClusteringS2S,Clustering,['eng'],"['Academic', 'Written']",cc0-1.0
4,AskUbuntuDupQuestions,Reranking,['eng'],,
5,BIOSSES,STS,['eng'],,
6,Banking77Classification,Classification,['eng'],['Written'],mit
7,BiorxivClusteringP2P.v2,Clustering,['eng'],"['Academic', 'Written']",https://www.biorxiv.org/content/about-biorxiv
8,CQADupstackGamingRetrieval,Retrieval,['eng'],,
9,CQADupstackUnixRetrieval,Retrieval,['eng'],,
10,ClimateFEVERHardNegatives,Retrieval,['eng'],,
11,FEVERHardNegatives,Retrieval,['eng'],,
12,FiQA2018,Retrieval,['eng'],,
13,HotpotQAHardNegatives,Retrieval,['eng'],"['Web', 'Written']",cc-by-sa-4.0
14,ImdbClassification,Classification,['eng'],"['Reviews', 'Written']",not specified
15,MTOPDomainClassification,Classification,"['deu', 'eng', 'fra', 'hin', 'spa', 'tha']","['Spoken', 'Spoken']",not specified
16,MassiveIntentClassification,Classification,"['afr', 'amh', 'ara', 'aze', 'ben', 'cmo', 'cym', 'dan', 'deu', 'ell', 'eng', 'fas', 'fin', 'fra', 'heb', 'hin', 'hun', 'hye', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kan', 'kat', 'khm', 'kor', 'lav', 'mal', 'mon', 'msa', 'mya', 'nld', 'nob', 'pol', 'por', 'ron', 'rus', 'slv', 'spa', 'sqi', 'swa', 'swe', 'tam', 'tel', 'tgl', 'tha', 'tur', 'urd', 'vie']",['Spoken'],apache-2.0
17,MassiveScenarioClassification,Classification,"['afr', 'amh', 'ara', 'aze', 'ben', 'cmo', 'cym', 'dan', 'deu', 'ell', 'eng', 'fas', 'fin', 'fra', 'heb', 'hin', 'hun', 'hye', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kan', 'kat', 'khm', 'kor', 'lav', 'mal', 'mon', 'msa', 'mya', 'nld', 'nob', 'pol', 'por', 'ron', 'rus', 'slv', 'spa', 'sqi', 'swa', 'swe', 'tam', 'tel', 'tgl', 'tha', 'tur', 'urd', 'vie']",['Spoken'],apache-2.0
18,MedrxivClusteringP2P.v2,Clustering,['eng'],"['Academic', 'Medical', 'Written']",https://www.medrxiv.org/content/about-medrxiv
19,MedrxivClusteringS2S.v2,Clustering,['eng'],"['Academic', 'Medical', 'Written']",https://www.medrxiv.org/content/about-medrxiv
20,MindSmallReranking,Reranking,['eng'],"['News', 'Written']",https://github.com/msnews/MIND/blob/master/MSR%20License_Data.pdf
21,SCIDOCS,Retrieval,['eng'],"['Academic', 'Written', 'Non-fiction']",cc-by-sa-4.0
22,SICK-R,STS,['eng'],,
23,STS12,STS,['eng'],"['Encyclopaedic', 'News', 'Written']",not specified
24,STS13,STS,['eng'],"['Web', 'News', 'Non-fiction', 'Written']",not specified
25,STS14,STS,['eng'],"['Blog', 'Web', 'Spoken']",not specified
26,STS15,STS,['eng'],"['Blog', 'News', 'Web', 'Written', 'Spoken']",not specified
27,STS17,STS,"['ara', 'deu', 'eng', 'fra', 'ita', 'kor', 'nld', 'spa', 'tur']","['News', 'Web', 'Written']",not specified
28,STS22.v2,STS,"['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'pol', 'rus', 'spa', 'tur']","['News', 'Written']",not specified
29,STSBenchmark,STS,['eng'],,
30,SprintDuplicateQuestions,PairClassification,['eng'],"['Programming', 'Written']",not specified
31,StackExchangeClustering.v2,Clustering,['eng'],"['Web', 'Written']",not specified
32,StackExchangeClusteringP2P.v2,Clustering,['eng'],"['Web', 'Written']",not specified
33,TRECCOVID,Retrieval,['eng'],,
34,Touche2020,Retrieval,['eng'],,
35,ToxicConversationsClassification,Classification,['eng'],"['Social', 'Written']",cc-by-4.0
36,TweetSentimentExtractionClassification,Classification,['eng'],"['Social', 'Written']",not specified
37,TwentyNewsgroupsClustering.v2,Clustering,['eng'],"['News', 'Written']",not specified
38,TwitterSemEval2015,PairClassification,['eng'],,
39,TwitterURLCorpus,PairClassification,['eng'],,
26 changes: 13 additions & 13 deletions scripts/task_selection/mult_results.csv
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
,model,revision,mean,mean (BitextMining),mean (PairClassification),mean (Classification),mean (STS),mean (Retrieval),mean (MultilabelClassification),mean (Clustering),mean (Reranking),mean (InstructionRetrieval),mean (wieghted by task type),borda_count,Total Evaluation time (hours)
7,intfloat/multilingual-e5-large-instruct,baa7be480a7de1539afce709c8f13f833a510e0a,0.634,0.801,0.811,0.65,0.767,0.58,0.222,0.515,0.625,-0.004,0.552,1237.0,6.396
2,GritLM/GritLM-7B,13f00a0e36500c80ce12870ea513846a066004af,0.61,0.705,0.802,0.619,0.732,0.595,0.212,0.504,0.628,0.035,0.537,1114.0,8.63
11,intfloat/e5-mistral-7b-instruct,07163b72af1488142a360786df853f237b1a3ca1,0.601,0.706,0.813,0.603,0.739,0.553,0.2,0.514,0.631,-0.006,0.528,1087.0,7.516
3,intfloat/multilingual-e5-large,4dc6d853a804b9c8886ede6dda8a073b7dc08a81,0.587,0.717,0.793,0.599,0.734,0.543,0.213,0.431,0.626,-0.031,0.514,972.0,7.339
9,intfloat/multilingual-e5-base,d13f1b27baf31030b7fd040960d60d909913633f,0.571,0.694,0.776,0.582,0.712,0.53,0.202,0.428,0.599,-0.027,0.5,802.0,3.738
4,sentence-transformers/paraphrase-multilingual-mpnet-base-v2,79f2382ceacceacdf38563d7c5d16b9ff8d725d6,0.522,0.521,0.816,0.551,0.695,0.4,0.164,0.412,0.532,-0.011,0.453,693.0,15.838
0,intfloat/multilingual-e5-small,e4ce9877abf3edfe10b0d82785e83bdcb973e22e,0.555,0.675,0.768,0.565,0.699,0.496,0.191,0.418,0.602,-0.024,0.488,645.0,2.686
1,sentence-transformers/LaBSE,e34fab64a3011d2176c99545a93d5cbddc9a91b7,0.524,0.763,0.761,0.546,0.652,0.338,0.201,0.394,0.504,-0.03,0.459,586.0,3.382
5,sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,bf3bf13ab40c3157080a7ab344c831b9ad18b5eb,0.49,0.445,0.794,0.517,0.664,0.37,0.149,0.396,0.51,-0.013,0.426,471.0,2.626
6,sentence-transformers/all-mpnet-base-v2,84f2bcc00d77236f9e89c8a360a00fb1139bf47d,0.427,0.212,0.71,0.47,0.571,0.342,0.163,0.411,0.421,-0.031,0.363,397.5,3.967
8,sentence-transformers/all-MiniLM-L12-v2,a05860a77cef7b37e0048a7864658139bc18a854,0.423,0.229,0.719,0.468,0.566,0.336,0.146,0.368,0.443,-0.008,0.363,353.0,2.56
10,sentence-transformers/all-MiniLM-L6-v2,8b3219a92973c328a8e22fadcfa821b5dc75636a,0.417,0.201,0.713,0.463,0.556,0.345,0.151,0.383,0.4,-0.028,0.354,288.5,2.316
,model,revision,mean,mean (BitextMining),mean (PairClassification),mean (Classification),mean (STS),mean (Retrieval),mean (MultilabelClassification),mean (Clustering),mean (Reranking),mean (InstructionRetrieval),mean (weighted by task type),borda_count,Total Evaluation time (hours)
7,intfloat/multilingual-e5-large-instruct,baa7be480a7de1539afce709c8f13f833a510e0a,0.634,0.801,0.812,0.65,0.767,0.58,0.229,0.515,0.63,-0.004,0.553,1244.0,6.884
2,GritLM/GritLM-7B,13f00a0e36500c80ce12870ea513846a066004af,0.609,0.705,0.802,0.619,0.732,0.591,0.212,0.504,0.628,0.035,0.536,1119.0,10.675
11,intfloat/e5-mistral-7b-instruct,07163b72af1488142a360786df853f237b1a3ca1,0.602,0.706,0.814,0.603,0.739,0.554,0.222,0.514,0.634,-0.006,0.531,1100.0,9.969
3,intfloat/multilingual-e5-large,4dc6d853a804b9c8886ede6dda8a073b7dc08a81,0.587,0.717,0.793,0.599,0.734,0.55,0.213,0.431,0.626,-0.031,0.515,980.0,9.206
9,intfloat/multilingual-e5-base,d13f1b27baf31030b7fd040960d60d909913633f,0.571,0.694,0.776,0.582,0.712,0.536,0.202,0.428,0.599,-0.027,0.5,811.0,4.261
4,sentence-transformers/paraphrase-multilingual-mpnet-base-v2,79f2382ceacceacdf38563d7c5d16b9ff8d725d6,0.52,0.521,0.816,0.551,0.695,0.393,0.164,0.412,0.532,-0.011,0.452,698.0,16.15
0,intfloat/multilingual-e5-small,e4ce9877abf3edfe10b0d82785e83bdcb973e22e,0.556,0.675,0.768,0.565,0.699,0.502,0.191,0.418,0.602,-0.024,0.488,654.0,2.893
1,sentence-transformers/LaBSE,e34fab64a3011d2176c99545a93d5cbddc9a91b7,0.521,0.763,0.761,0.546,0.652,0.329,0.201,0.394,0.504,-0.03,0.458,589.0,3.818
5,sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,bf3bf13ab40c3157080a7ab344c831b9ad18b5eb,0.488,0.445,0.794,0.517,0.664,0.362,0.149,0.396,0.51,-0.013,0.425,475.0,2.759
6,sentence-transformers/all-mpnet-base-v2,84f2bcc00d77236f9e89c8a360a00fb1139bf47d,0.424,0.212,0.71,0.47,0.571,0.328,0.163,0.411,0.421,-0.031,0.362,397.5,4.772
8,sentence-transformers/all-MiniLM-L12-v2,a05860a77cef7b37e0048a7864658139bc18a854,0.421,0.229,0.719,0.468,0.566,0.324,0.146,0.368,0.443,-0.008,0.362,355.0,2.691
10,sentence-transformers/all-MiniLM-L6-v2,8b3219a92973c328a8e22fadcfa821b5dc75636a,0.415,0.201,0.713,0.463,0.556,0.331,0.151,0.383,0.4,-0.028,0.352,289.5,2.43
Loading

0 comments on commit 11518ed

Please sign in to comment.