Downsample large retrieval datasets #1236

Merged · 12 commits · Sep 28, 2024
3 changes: 3 additions & 0 deletions docs/mmteb/points/1236.jsonl
@@ -0,0 +1,3 @@
{"GitHub": "orionw", "Coordination": 25}
{"GitHub": "KennethEnevoldsen", "Review PR": 2, "Bug fixes": 2}
{"GitHub": "vaibhavad", "Coordination": 25}
45 changes: 45 additions & 0 deletions mteb/tasks/Retrieval/eng/ClimateFEVERRetrieval.py
@@ -48,3 +48,48 @@ class ClimateFEVER(AbsTaskRetrieval):
},
},
)


class ClimateFEVERHardNegatives(AbsTaskRetrieval):
metadata = TaskMetadata(
name="ClimateFEVERHardNegatives",
description="CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims regarding climate-change. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct.",
reference="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html",
dataset={
"path": "mteb/ClimateFEVER_test_top_250_only_w_correct-v2",
"revision": "3a309e201f3c2c4b13bd4a367a8f37eee2ec1d21",
},
type="Retrieval",
category="s2p",
modalities=["text"],
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=None,
domains=None,
task_subtypes=None,
license=None,
annotations_creators=None,
dialect=None,
sample_creation=None,
bibtex_citation="""@misc{diggelmann2021climatefever,
title={CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims},
author={Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold},
year={2021},
eprint={2012.00614},
archivePrefix={arXiv},
primaryClass={cs.CL}
}""",
descriptive_stats={
"n_samples": {"test": 1000},
"avg_character_length": {
"test": {
"average_document_length": 1245.4236333727013,
"average_query_length": 121.879,
"num_documents": 47416,
"num_queries": 1000,
"average_relevant_docs_per_query": 3.048,
}
},
},
)
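
The task description above is the only account in this diff of how the hard-negative corpus was built. As a rough, hypothetical sketch of the pooling it describes (not the script actually used to produce mteb/ClimateFEVER_test_top_250_only_w_correct-v2), the snippet below takes each retriever's top-250 documents per query and keeps all judged-relevant documents; the run and qrels structures are assumptions.

```python
# Hypothetical sketch of the pooling described in the task metadata above; not taken from this PR.
# Assumes each retriever's run is {query_id: {doc_id: score}} and qrels are {query_id: {doc_id: relevance}}.
def pool_hard_negative_corpus(
    runs: list[dict[str, dict[str, float]]],
    qrels: dict[str, dict[str, int]],
    top_k: int = 250,
) -> dict[str, set[str]]:
    """Per query, union of each retriever's top-k documents plus all judged-relevant documents."""
    pooled: dict[str, set[str]] = {}
    for run in runs:
        for query_id, doc_scores in run.items():
            top_docs = sorted(doc_scores, key=doc_scores.get, reverse=True)[:top_k]
            pooled.setdefault(query_id, set()).update(top_docs)
    # Keep the relevant documents so the qrels remain answerable after downsampling the corpus.
    for query_id, rel_docs in qrels.items():
        relevant = {doc_id for doc_id, rel in rel_docs.items() if rel > 0}
        pooled.setdefault(query_id, set()).update(relevant)
    return pooled
```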
47 changes: 47 additions & 0 deletions mteb/tasks/Retrieval/eng/DBPediaRetrieval.py
@@ -50,3 +50,50 @@ class DBPedia(AbsTaskRetrieval):
},
},
)


class DBPediaHardNegatives(AbsTaskRetrieval):
metadata = TaskMetadata(
name="DBPediaHardNegatives",
description="DBpedia-Entity is a standard test collection for entity search over the DBpedia knowledge base. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct.",
reference="https://github.com/iai-group/DBpedia-Entity/",
dataset={
"path": "mteb/DBPedia_test_top_250_only_w_correct-v2",
"revision": "943ec7fdfef3728b2ad1966c5b6479ff9ffd26c9",
},
type="Retrieval",
category="s2p",
modalities=["text"],
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=("2017-01-01", "2017-01-01"), # best guess: based on publication date
domains=["Written", "Encyclopaedic"],
task_subtypes=[],
license="mit",
annotations_creators="derived",
dialect=[],
sample_creation="found",
bibtex_citation="""@inproceedings{Hasibi:2017:DVT,
author = {Hasibi, Faegheh and Nikolaev, Fedor and Xiong, Chenyan and Balog, Krisztian and Bratsberg, Svein Erik and Kotov, Alexander and Callan, Jamie},
title = {DBpedia-Entity V2: A Test Collection for Entity Search},
booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval},
series = {SIGIR '17},
year = {2017},
pages = {1265--1268},
doi = {10.1145/3077136.3080751},
publisher = {ACM}
}""",
descriptive_stats={
"n_samples": {"test": 400},
"avg_character_length": {
"test": {
"average_document_length": 338.58561119129564,
"average_query_length": 34.085,
"num_documents": 90070,
"num_queries": 400,
"average_relevant_docs_per_query": 38.215,
}
},
},
)
62 changes: 62 additions & 0 deletions mteb/tasks/Retrieval/eng/FEVERRetrieval.py
@@ -79,3 +79,65 @@ class FEVER(AbsTaskRetrieval):
},
},
)


class FEVERHardNegatives(AbsTaskRetrieval):
ignore_identical_ids = True

metadata = TaskMetadata(
name="FEVERHardNegatives",
dataset={
"path": "mteb/FEVER_test_top_250_only_w_correct-v2",
"revision": "080c9ed6267b65029207906e815d44a9240bafca",
},
description=(
"FEVER (Fact Extraction and VERification) consists of 185,445 claims generated by altering sentences"
+ " extracted from Wikipedia and subsequently verified without knowledge of the sentence they were"
+ " derived from. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct."
),
reference="https://fever.ai/",
type="Retrieval",
category="s2p",
modalities=["text"],
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=None,
domains=None,
task_subtypes=None,
license=None,
annotations_creators=None,
dialect=None,
sample_creation=None,
bibtex_citation="""@inproceedings{thorne-etal-2018-fever,
title = "{FEVER}: a Large-scale Dataset for Fact Extraction and {VER}ification",
author = "Thorne, James and
Vlachos, Andreas and
Christodoulopoulos, Christos and
Mittal, Arpit",
editor = "Walker, Marilyn and
Ji, Heng and
Stent, Amanda",
booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)",
month = jun,
year = "2018",
address = "New Orleans, Louisiana",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/N18-1074",
doi = "10.18653/v1/N18-1074",
pages = "809--819",
abstract = "In this paper we introduce a new publicly available dataset for verification against textual sources, FEVER: Fact Extraction and VERification. It consists of 185,445 claims generated by altering sentences extracted from Wikipedia and subsequently verified without knowledge of the sentence they were derived from. The claims are classified as Supported, Refuted or NotEnoughInfo by annotators achieving 0.6841 in Fleiss kappa. For the first two classes, the annotators also recorded the sentence(s) forming the necessary evidence for their judgment. To characterize the challenge of the dataset presented, we develop a pipeline approach and compare it to suitably designed oracles. The best accuracy we achieve on labeling a claim accompanied by the correct evidence is 31.87{\%}, while if we ignore the evidence we achieve 50.91{\%}. Thus we believe that FEVER is a challenging testbed that will help stimulate progress on claim verification against textual sources.",
}""",
descriptive_stats={
"n_samples": {"test": 1000},
"avg_character_length": {
"test": {
"average_document_length": 695.4370242764114,
"average_query_length": 49.62,
"num_documents": 163698,
"num_queries": 1000,
"average_relevant_docs_per_query": 1.171,
}
},
},
)
63 changes: 63 additions & 0 deletions mteb/tasks/Retrieval/eng/HotpotQARetrieval.py
@@ -80,3 +80,66 @@ class HotpotQA(AbsTaskRetrieval):
},
},
)


class HotpotQAHardNegatives(AbsTaskRetrieval):
metadata = TaskMetadata(
name="HotpotQAHardNegatives",
dataset={
"path": "mteb/HotpotQA_test_top_250_only_w_correct-v2",
"revision": "617612fa63afcb60e3b134bed8b7216a99707c37",
},
description=(
"HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong"
+ " supervision for supporting facts to enable more explainable question answering systems. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct."
),
reference="https://hotpotqa.github.io/",
type="Retrieval",
category="s2p",
modalities=["text"],
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=("2018-01-01", "2018-12-31"), # best guess: based on publication date
domains=["Web", "Written"],
task_subtypes=["Question answering"],
license="cc-by-sa-4.0",
annotations_creators="human-annotated",
dialect=[],
sample_creation="found",
bibtex_citation="""@inproceedings{yang-etal-2018-hotpotqa,
title = "{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering",
author = "Yang, Zhilin and
Qi, Peng and
Zhang, Saizheng and
Bengio, Yoshua and
Cohen, William and
Salakhutdinov, Ruslan and
Manning, Christopher D.",
editor = "Riloff, Ellen and
Chiang, David and
Hockenmaier, Julia and
Tsujii, Jun{'}ichi",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
month = oct # "-" # nov,
year = "2018",
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D18-1259",
doi = "10.18653/v1/D18-1259",
pages = "2369--2380",
abstract = "Existing question answering (QA) datasets fail to train QA systems to perform complex reasoning and provide explanations for answers. We introduce HotpotQA, a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems{'} ability to extract relevant facts and perform necessary comparison. We show that HotpotQA is challenging for the latest QA systems, and the supporting facts enable models to improve performance and make explainable predictions.",
}""",
descriptive_stats={
"n_samples": {"test": 1000},
"avg_character_length": {
"test": {
"average_document_length": 373.558822095461,
"average_query_length": 92.584,
"num_documents": 225621,
"num_queries": 1000,
"average_relevant_docs_per_query": 2.0,
}
},
},
)
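
The descriptive_stats above (and for the other large tasks in this PR) report num_queries = 1000, which together with the PR title suggests the test queries were also capped at 1,000 per task. A minimal sketch of such a cap, assuming a plain list of query ids and a fixed seed; the actual downsampling procedure is not part of this diff.

```python
import random

# Hypothetical sketch; the PR's actual downsampling code is not shown in this diff.
def downsample_queries(query_ids: list[str], max_queries: int = 1000, seed: int = 42) -> list[str]:
    """Reproducibly keep at most max_queries query ids."""
    if len(query_ids) <= max_queries:
        return list(query_ids)
    return random.Random(seed).sample(query_ids, max_queries)
```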
59 changes: 59 additions & 0 deletions mteb/tasks/Retrieval/eng/MSMARCORetrieval.py
@@ -76,3 +76,62 @@ class MSMARCO(AbsTaskRetrieval):
},
},
)


class MSMARCOHardNegatives(AbsTaskRetrieval):
ignore_identical_ids = True

metadata = TaskMetadata(
name="MSMARCOHardNegatives",
dataset={
"path": "mteb/MSMARCO_test_top_250_only_w_correct-v2",
"revision": "67c0b4f7f15946e0b15cf6cf3b8993d04cb3efc6",
},
description="MS MARCO is a collection of datasets focused on deep learning in search. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct.",
reference="https://microsoft.github.io/msmarco/",
type="Retrieval",
category="s2p",
modalities=["text"],
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=None,
domains=None,
task_subtypes=None,
license=None,
annotations_creators=None,
dialect=None,
sample_creation=None,
bibtex_citation="""@article{DBLP:journals/corr/NguyenRSGTMD16,
author = {Tri Nguyen and
Mir Rosenberg and
Xia Song and
Jianfeng Gao and
Saurabh Tiwary and
Rangan Majumder and
Li Deng},
title = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset},
journal = {CoRR},
volume = {abs/1611.09268},
year = {2016},
url = {http://arxiv.org/abs/1611.09268},
archivePrefix = {arXiv},
eprint = {1611.09268},
timestamp = {Mon, 13 Aug 2018 16:49:03 +0200},
biburl = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}""",
descriptive_stats={
"n_samples": {"test": 43},
"avg_character_length": {
"test": {
"average_document_length": 355.2909668633681,
"average_query_length": 32.74418604651163,
"num_documents": 8812,
"num_queries": 43,
"average_relevant_docs_per_query": 95.3953488372093,
}
},
},
)
43 changes: 43 additions & 0 deletions mteb/tasks/Retrieval/eng/NQRetrieval.py
@@ -46,3 +46,46 @@ class NQ(AbsTaskRetrieval):
},
},
)


class NQHardNegatives(AbsTaskRetrieval):
metadata = TaskMetadata(
name="NQHardNegatives",
dataset={
"path": "mteb/NQ_test_top_250_only_w_correct-v2",
"revision": "d700fe4f167a5db8e6c9b03e8c26e7eaf66faf97",
},
description="NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct.",
reference="https://ai.google.com/research/NaturalQuestions/",
type="Retrieval",
category="s2p",
modalities=["text"],
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=None,
domains=None,
task_subtypes=None,
license=None,
annotations_creators=None,
dialect=None,
sample_creation=None,
bibtex_citation="""@article{47761,title = {Natural Questions: a Benchmark for Question Answering Research},
author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh
and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee
and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le
and Slav Petrov},year = {2019},journal = {Transactions of the Association of Computational
Linguistics}}""",
descriptive_stats={
"n_samples": {"test": 1000},
"avg_character_length": {
"test": {
"average_document_length": 602.7903551179953,
"average_query_length": 47.878,
"num_documents": 198779,
"num_queries": 1000,
"average_relevant_docs_per_query": 1.213,
}
},
},
)
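
Once this PR is merged, the new tasks should run like any other MTEB retrieval task. A minimal usage sketch, assuming a Sentence Transformers checkpoint; the exact API surface may differ across mteb versions.

```python
from mteb import MTEB
from sentence_transformers import SentenceTransformer

# Any embedding model works here; this checkpoint is only an example.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
evaluation = MTEB(tasks=["ClimateFEVERHardNegatives"])
evaluation.run(model, output_folder="results/ClimateFEVERHardNegatives")
```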