Downsample large retrieval datasets #1236

Merged · 12 commits · Sep 28, 2024
3 changes: 3 additions & 0 deletions docs/mmteb/points/1236.jsonl
@@ -0,0 +1,3 @@
{"GitHub": "orionw", "Coordination": 25}
{"GitHub": "KennethEnevoldsen", "Review PR": 2, "Bug fixes": 2}
{"GitHub": "vaibhavad", "Coordination": 25}
45 changes: 45 additions & 0 deletions mteb/tasks/Retrieval/eng/ClimateFEVERRetrieval.py
@@ -48,3 +48,48 @@ class ClimateFEVER(AbsTaskRetrieval):
},
},
)


class ClimateFEVERHardNegatives(AbsTaskRetrieval):
metadata = TaskMetadata(
name="ClimateFEVERHardNegatives",
description="CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims regarding climate-change. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct.",
reference="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html",
dataset={
"path": "mteb/ClimateFEVER_test_top_250_only_w_correct-v2",
"revision": "3a309e201f3c2c4b13bd4a367a8f37eee2ec1d21",
},
type="Retrieval",
category="s2p",
modalities=["text"],
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=None,
domains=None,
task_subtypes=None,
license=None,
annotations_creators=None,
dialect=None,
sample_creation=None,
bibtex_citation="""@misc{diggelmann2021climatefever,
title={CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims},
author={Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold},
year={2021},
eprint={2012.00614},
archivePrefix={arXiv},
primaryClass={cs.CL}
}""",
descriptive_stats={
"n_samples": {"test": 1000},
"avg_character_length": {
"test": {
"average_document_length": 1245.4236333727013,
"average_query_length": 121.879,
"num_documents": 47416,
"num_queries": 1000,
"average_relevant_docs_per_query": 3.048,
}
},
},
)
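
The task description above is the only account in this diff of how the hard-negative corpus was built. As a rough, hypothetical sketch of the pooling it describes (not the script actually used to produce mteb/ClimateFEVER_test_top_250_only_w_correct-v2), the snippet below takes each retriever's top-250 documents per query and keeps all judged-relevant documents; the run and qrels structures are assumptions.

```python
# Hypothetical sketch of the pooling described in the task metadata above; not taken from this PR.
# Assumes each retriever's run is {query_id: {doc_id: score}} and qrels are {query_id: {doc_id: relevance}}.
def pool_hard_negative_corpus(
    runs: list[dict[str, dict[str, float]]],
    qrels: dict[str, dict[str, int]],
    top_k: int = 250,
) -> dict[str, set[str]]:
    """Per query, union of each retriever's top-k documents plus all judged-relevant documents."""
    pooled: dict[str, set[str]] = {}
    for run in runs:
        for query_id, doc_scores in run.items():
            top_docs = sorted(doc_scores, key=doc_scores.get, reverse=True)[:top_k]
            pooled.setdefault(query_id, set()).update(top_docs)
    # Keep the relevant documents so the qrels remain answerable after downsampling the corpus.
    for query_id, rel_docs in qrels.items():
        relevant = {doc_id for doc_id, rel in rel_docs.items() if rel > 0}
        pooled.setdefault(query_id, set()).update(relevant)
    return pooled
```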
47 changes: 47 additions & 0 deletions mteb/tasks/Retrieval/eng/DBPediaRetrieval.py
@@ -50,3 +50,50 @@ class DBPedia(AbsTaskRetrieval):
},
},
)


class DBPediaHardNegatives(AbsTaskRetrieval):
metadata = TaskMetadata(
name="DBPediaHardNegatives",
description="DBpedia-Entity is a standard test collection for entity search over the DBpedia knowledge base. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct.",
reference="https://github.com/iai-group/DBpedia-Entity/",
dataset={
"path": "mteb/DBPedia_test_top_250_only_w_correct-v2",
"revision": "943ec7fdfef3728b2ad1966c5b6479ff9ffd26c9",
},
type="Retrieval",
category="s2p",
modalities=["text"],
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=("2017-01-01", "2017-01-01"), # best guess: based on publication date
domains=["Written", "Encyclopaedic"],
task_subtypes=[],
license="mit",
annotations_creators="derived",
dialect=[],
sample_creation="found",
bibtex_citation="""@inproceedings{Hasibi:2017:DVT,
author = {Hasibi, Faegheh and Nikolaev, Fedor and Xiong, Chenyan and Balog, Krisztian and Bratsberg, Svein Erik and Kotov, Alexander and Callan, Jamie},
title = {DBpedia-Entity V2: A Test Collection for Entity Search},
booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval},
series = {SIGIR '17},
year = {2017},
pages = {1265--1268},
doi = {10.1145/3077136.3080751},
publisher = {ACM}
}""",
descriptive_stats={
"n_samples": {"test": 400},
"avg_character_length": {
"test": {
"average_document_length": 338.58561119129564,
"average_query_length": 34.085,
"num_documents": 90070,
"num_queries": 400,
"average_relevant_docs_per_query": 38.215,
}
},
},
)
62 changes: 62 additions & 0 deletions mteb/tasks/Retrieval/eng/FEVERRetrieval.py
@@ -79,3 +79,65 @@ class FEVER(AbsTaskRetrieval):
},
},
)


class FEVERHardNegatives(AbsTaskRetrieval):
ignore_identical_ids = True

metadata = TaskMetadata(
name="FEVERHardNegatives",
dataset={
"path": "mteb/FEVER_test_top_250_only_w_correct-v2",
"revision": "080c9ed6267b65029207906e815d44a9240bafca",
},
description=(
"FEVER (Fact Extraction and VERification) consists of 185,445 claims generated by altering sentences"
+ " extracted from Wikipedia and subsequently verified without knowledge of the sentence they were"
+ " derived from. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct."
),
reference="https://fever.ai/",
type="Retrieval",
category="s2p",
modalities=["text"],
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=None,
domains=None,
task_subtypes=None,
license=None,
annotations_creators=None,
dialect=None,
sample_creation=None,
bibtex_citation="""@inproceedings{thorne-etal-2018-fever,
title = "{FEVER}: a Large-scale Dataset for Fact Extraction and {VER}ification",
author = "Thorne, James and
Vlachos, Andreas and
Christodoulopoulos, Christos and
Mittal, Arpit",
editor = "Walker, Marilyn and
Ji, Heng and
Stent, Amanda",
booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)",
month = jun,
year = "2018",
address = "New Orleans, Louisiana",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/N18-1074",
doi = "10.18653/v1/N18-1074",
pages = "809--819",
abstract = "In this paper we introduce a new publicly available dataset for verification against textual sources, FEVER: Fact Extraction and VERification. It consists of 185,445 claims generated by altering sentences extracted from Wikipedia and subsequently verified without knowledge of the sentence they were derived from. The claims are classified as Supported, Refuted or NotEnoughInfo by annotators achieving 0.6841 in Fleiss kappa. For the first two classes, the annotators also recorded the sentence(s) forming the necessary evidence for their judgment. To characterize the challenge of the dataset presented, we develop a pipeline approach and compare it to suitably designed oracles. The best accuracy we achieve on labeling a claim accompanied by the correct evidence is 31.87{\%}, while if we ignore the evidence we achieve 50.91{\%}. Thus we believe that FEVER is a challenging testbed that will help stimulate progress on claim verification against textual sources.",
}""",
descriptive_stats={
"n_samples": {"test": 1000},
"avg_character_length": {
"test": {
"average_document_length": 695.4370242764114,
"average_query_length": 49.62,
"num_documents": 163698,
"num_queries": 1000,
"average_relevant_docs_per_query": 1.171,
}
},
},
)
63 changes: 63 additions & 0 deletions mteb/tasks/Retrieval/eng/HotpotQARetrieval.py
@@ -80,3 +80,66 @@ class HotpotQA(AbsTaskRetrieval):
},
},
)


class HotpotQAHardNegatives(AbsTaskRetrieval):
metadata = TaskMetadata(
name="HotpotQAHardNegatives",
dataset={
"path": "mteb/HotpotQA_test_top_250_only_w_correct-v2",
"revision": "617612fa63afcb60e3b134bed8b7216a99707c37",
},
description=(
"HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong"
+ " supervision for supporting facts to enable more explainable question answering systems. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct."
),
reference="https://hotpotqa.github.io/",
type="Retrieval",
category="s2p",
modalities=["text"],
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=("2018-01-01", "2018-12-31"), # best guess: based on publication date
domains=["Web", "Written"],
task_subtypes=["Question answering"],
license="cc-by-sa-4.0",
annotations_creators="human-annotated",
dialect=[],
sample_creation="found",
bibtex_citation="""@inproceedings{yang-etal-2018-hotpotqa,
title = "{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering",
author = "Yang, Zhilin and
Qi, Peng and
Zhang, Saizheng and
Bengio, Yoshua and
Cohen, William and
Salakhutdinov, Ruslan and
Manning, Christopher D.",
editor = "Riloff, Ellen and
Chiang, David and
Hockenmaier, Julia and
Tsujii, Jun{'}ichi",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
month = oct # "-" # nov,
year = "2018",
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D18-1259",
doi = "10.18653/v1/D18-1259",
pages = "2369--2380",
abstract = "Existing question answering (QA) datasets fail to train QA systems to perform complex reasoning and provide explanations for answers. We introduce HotpotQA, a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems{'} ability to extract relevant facts and perform necessary comparison. We show that HotpotQA is challenging for the latest QA systems, and the supporting facts enable models to improve performance and make explainable predictions.",
}""",
descriptive_stats={
"n_samples": {"test": 1000},
"avg_character_length": {
"test": {
"average_document_length": 373.558822095461,
"average_query_length": 92.584,
"num_documents": 225621,
"num_queries": 1000,
"average_relevant_docs_per_query": 2.0,
}
},
},
)
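
The descriptive_stats above (and for the other large tasks in this PR) report num_queries = 1000, which together with the PR title suggests the test queries were also capped at 1,000 per task. A minimal sketch of such a cap, assuming a plain list of query ids and a fixed seed; the actual downsampling procedure is not part of this diff.

```python
import random

# Hypothetical sketch; the PR's actual downsampling code is not shown in this diff.
def downsample_queries(query_ids: list[str], max_queries: int = 1000, seed: int = 42) -> list[str]:
    """Reproducibly keep at most max_queries query ids."""
    if len(query_ids) <= max_queries:
        return list(query_ids)
    return random.Random(seed).sample(query_ids, max_queries)
```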
59 changes: 59 additions & 0 deletions mteb/tasks/Retrieval/eng/MSMARCORetrieval.py
@@ -76,3 +76,62 @@ class MSMARCO(AbsTaskRetrieval):
},
},
)


class MSMARCOHardNegatives(AbsTaskRetrieval):
ignore_identical_ids = True

metadata = TaskMetadata(
name="MSMARCOHardNegatives",
dataset={
"path": "mteb/MSMARCO_test_top_250_only_w_correct-v2",
"revision": "67c0b4f7f15946e0b15cf6cf3b8993d04cb3efc6",
},
description="MS MARCO is a collection of datasets focused on deep learning in search. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct.",
reference="https://microsoft.github.io/msmarco/",
type="Retrieval",
category="s2p",
modalities=["text"],
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=None,
domains=None,
task_subtypes=None,
license=None,
annotations_creators=None,
dialect=None,
sample_creation=None,
bibtex_citation="""@article{DBLP:journals/corr/NguyenRSGTMD16,
author = {Tri Nguyen and
Mir Rosenberg and
Xia Song and
Jianfeng Gao and
Saurabh Tiwary and
Rangan Majumder and
Li Deng},
title = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset},
journal = {CoRR},
volume = {abs/1611.09268},
year = {2016},
url = {http://arxiv.org/abs/1611.09268},
archivePrefix = {arXiv},
eprint = {1611.09268},
timestamp = {Mon, 13 Aug 2018 16:49:03 +0200},
biburl = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}""",
descriptive_stats={
"n_samples": {"test": 43},
"avg_character_length": {
"test": {
"average_document_length": 355.2909668633681,
"average_query_length": 32.74418604651163,
"num_documents": 8812,
"num_queries": 43,
"average_relevant_docs_per_query": 95.3953488372093,
}
},
},
)
43 changes: 43 additions & 0 deletions mteb/tasks/Retrieval/eng/NQRetrieval.py
@@ -46,3 +46,46 @@ class NQ(AbsTaskRetrieval):
},
},
)


class NQHardNegatives(AbsTaskRetrieval):
metadata = TaskMetadata(
name="NQHardNegatives",
dataset={
"path": "mteb/NQ_test_top_250_only_w_correct-v2",
"revision": "d700fe4f167a5db8e6c9b03e8c26e7eaf66faf97",
},
description="NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct.",
reference="https://ai.google.com/research/NaturalQuestions/",
type="Retrieval",
category="s2p",
modalities=["text"],
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=None,
domains=None,
task_subtypes=None,
license=None,
annotations_creators=None,
dialect=None,
sample_creation=None,
bibtex_citation="""@article{47761,title = {Natural Questions: a Benchmark for Question Answering Research},
author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh
and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee
and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le
and Slav Petrov},year = {2019},journal = {Transactions of the Association of Computational
Linguistics}}""",
descriptive_stats={
"n_samples": {"test": 1000},
"avg_character_length": {
"test": {
"average_document_length": 602.7903551179953,
"average_query_length": 47.878,
"num_documents": 198779,
"num_queries": 1000,
"average_relevant_docs_per_query": 1.213,
}
},
},
)
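
Once this PR is merged, the new tasks should run like any other MTEB retrieval task. A minimal usage sketch, assuming a Sentence Transformers checkpoint; the exact API surface may differ across mteb versions.

```python
from mteb import MTEB
from sentence_transformers import SentenceTransformer

# Any embedding model works here; this checkpoint is only an example.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
evaluation = MTEB(tasks=["ClimateFEVERHardNegatives"])
evaluation.run(model, output_folder="results/ClimateFEVERHardNegatives")
```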