diff --git a/src/main/java/io/anserini/search/topicreader/Topics.java b/src/main/java/io/anserini/search/topicreader/Topics.java index 3d5131cef4..d2a4f09db2 100755 --- a/src/main/java/io/anserini/search/topicreader/Topics.java +++ b/src/main/java/io/anserini/search/topicreader/Topics.java @@ -50,17 +50,26 @@ public enum Topics { TREC2020_DL(TsvIntTopicReader.class,"topics-and-qrels/topics.dl20.txt"), TREC2021_DL(TsvIntTopicReader.class,"topics-and-qrels/topics.dl21.txt"), MSMARCO_DOC_DEV(TsvIntTopicReader.class,"topics-and-qrels/topics.msmarco-doc.dev.txt"), + MSMARCO_DOC_DEV_UNICOIL(TsvIntTopicReader.class,"topics-and-qrels/topics.msmarco-doc.dev.unicoil.tsv.gz"), MSMARCO_DOC_TEST(TsvIntTopicReader.class,"topics-and-qrels/topics.msmarco-doc.test.txt"), MSMARCO_PASSAGE_DEV_SUBSET(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-passage.dev-subset.txt"), - MSMARCO_PASSAGE_TEST_SUBSET(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-passage.test-subset.txt"), MSMARCO_PASSAGE_DEV_SUBSET_DEEPIMPACT(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-passage.dev-subset.deepimpact.tsv.gz"), MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_D2Q(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-passage.dev-subset.unicoil.tsv.gz"), MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_TILDE(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-passage.dev-subset.unicoil-tilde-expansion.tsv.gz"), MSMARCO_PASSAGE_DEV_SUBSET_DISTILL_SPLADE_MAX(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-passage.dev-subset.distill-splade-max.tsv.gz"), + MSMARCO_PASSAGE_TEST_SUBSET(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-passage.test-subset.txt"), MSMARCO_V2_DOC_DEV(TsvIntTopicReader.class,"topics-and-qrels/topics.msmarco-v2-doc.dev.txt"), + MSMARCO_V2_DOC_DEV_UNICOIL(TsvIntTopicReader.class,"topics-and-qrels/topics.msmarco-v2-doc.dev.unicoil.0shot.tsv.gz"), + MSMARCO_V2_DOC_DEV_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics-and-qrels/topics.msmarco-v2-doc.dev.unicoil-noexp.0shot.tsv.gz"), MSMARCO_V2_DOC_DEV2(TsvIntTopicReader.class,"topics-and-qrels/topics.msmarco-v2-doc.dev2.txt"), + MSMARCO_V2_DOC_DEV2_UNICOIL(TsvIntTopicReader.class,"topics-and-qrels/topics.msmarco-v2-doc.dev2.unicoil.0shot.tsv.gz"), + MSMARCO_V2_DOC_DEV2_UNICOIL_NOEXP(TsvIntTopicReader.class,"topics-and-qrels/topics.msmarco-v2-doc.dev2.unicoil-noexp.0shot.tsv.gz"), MSMARCO_V2_PASSAGE_DEV(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-v2-passage.dev.txt"), + MSMARCO_V2_PASSAGE_DEV_UNICOIL(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-v2-passage.dev.unicoil.0shot.tsv.gz"), + MSMARCO_V2_PASSAGE_DEV_UNICOIL_NOEXP(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-v2-passage.dev.unicoil-noexp.0shot.tsv.gz"), MSMARCO_V2_PASSAGE_DEV2(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-v2-passage.dev2.txt"), + MSMARCO_V2_PASSAGE_DEV2_UNICOIL(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-v2-passage.dev2.unicoil.0shot.tsv.gz"), + MSMARCO_V2_PASSAGE_DEV2_UNICOIL_NOEXP(TsvIntTopicReader.class, "topics-and-qrels/topics.msmarco-v2-passage.dev2.unicoil-noexp.0shot.tsv.gz"), NTCIR8_ZH(TsvStringTopicReader.class, "topics-and-qrels/topics.ntcir8zh.eval.txt"), CLEF2006_FR(TsvStringTopicReader.class, "topics-and-qrels/topics.clef06fr.mono.fr.txt"), TREC2002_AR(TrecTopicReader.class, "topics-and-qrels/topics.trec02ar-ar.txt"), diff --git a/src/test/java/io/anserini/search/topicreader/TopicReaderTest.java b/src/test/java/io/anserini/search/topicreader/TopicReaderTest.java index 72e1636284..880d376d86 100755 --- a/src/test/java/io/anserini/search/topicreader/TopicReaderTest.java +++ b/src/test/java/io/anserini/search/topicreader/TopicReaderTest.java @@ -37,7 +37,7 @@ public void testIterateThroughAllEnums() { String[] pathParts = topic.path.split("/"); assertEquals(topic.readerClass, TopicReader.getTopicReaderClassByFile(pathParts[1])); } - assertEquals(104, cnt); + assertEquals(113, cnt); } @Test @@ -493,104 +493,6 @@ public void testCAR_TopicIdsAsStrings() { assertEquals("Yellowstone National Park/Recreation", topics.get("enwiki:Yellowstone%20National%20Park/Recreation").get("title")); } - @Test - public void testMSMARCO() { - SortedMap> topics; - - topics = TopicReader.getTopics(Topics.MSMARCO_DOC_DEV); - assertNotNull(topics); - assertEquals(5193, topics.size()); - assertEquals(2, (int) topics.firstKey()); - assertEquals("androgen receptor define", topics.get(topics.firstKey()).get("title")); - assertEquals(1102400, (int) topics.lastKey()); - assertEquals("why do bears hibernate", topics.get(topics.lastKey()).get("title")); - - topics = TopicReader.getTopics(Topics.MSMARCO_DOC_TEST); - assertNotNull(topics); - assertEquals(5793, topics.size()); - assertEquals(57, (int) topics.firstKey()); - assertEquals("term service agreement definition", topics.get(topics.firstKey()).get("title")); - assertEquals(1136966, (int) topics.lastKey()); - assertEquals("#ffffff color code", topics.get(topics.lastKey()).get("title")); - - topics = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET); - assertNotNull(topics); - assertEquals(6980, topics.size()); - assertEquals(2, (int) topics.firstKey()); - assertEquals("Androgen receptor define", topics.get(topics.firstKey()).get("title")); - assertEquals(1102400, (int) topics.lastKey()); - assertEquals("why do bears hibernate", topics.get(topics.lastKey()).get("title")); - - topics = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET_DEEPIMPACT); - assertNotNull(topics); - assertEquals(6980, topics.size()); - assertEquals(2, (int) topics.firstKey()); - assertEquals("receptor androgen define", topics.get(topics.firstKey()).get("title")); - assertEquals(1102400, (int) topics.lastKey()); - assertEquals("why hibernate bears", topics.get(topics.lastKey()).get("title")); - - topics = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_D2Q); - assertNotNull(topics); - assertEquals(6980, topics.size()); - assertEquals(619, topics.get(topics.firstKey()).get("title").split(" ").length); - assertEquals(1102400, (int) topics.lastKey()); - assertEquals(686, topics.get(topics.lastKey()).get("title").split(" ").length); - - topics = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_TILDE); - assertNotNull(topics); - assertEquals(6980, topics.size()); - assertEquals(584, topics.get(topics.firstKey()).get("title").split(" ").length); - assertEquals(1102400, (int) topics.lastKey()); - assertEquals(610, topics.get(topics.lastKey()).get("title").split(" ").length); - - topics = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET_DISTILL_SPLADE_MAX); - assertNotNull(topics); - assertEquals(6980, topics.size()); - assertEquals(1991, topics.get(topics.firstKey()).get("title").split(" ").length); - assertEquals(1102400, (int) topics.lastKey()); - assertEquals(2409, topics.get(topics.lastKey()).get("title").split(" ").length); - - topics = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_TEST_SUBSET); - assertNotNull(topics); - assertEquals(6837, topics.size()); - assertEquals(57, (int) topics.firstKey()); - assertEquals("term service agreement definition", topics.get(topics.firstKey()).get("title")); - assertEquals(1136966, (int) topics.lastKey()); - assertEquals("#ffffff color code", topics.get(topics.lastKey()).get("title")); - - topics = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV); - assertNotNull(topics); - assertEquals(4552, topics.size()); - assertEquals(2, (int) topics.firstKey()); - assertEquals("Androgen receptor define", topics.get(topics.firstKey()).get("title")); - assertEquals(1102390, (int) topics.lastKey()); - assertEquals("why do children get aggressive", topics.get(topics.lastKey()).get("title")); - - topics = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV2); - assertNotNull(topics); - assertEquals(5000, topics.size()); - assertEquals(361, (int) topics.firstKey()); - assertEquals(". irritability medical definition", topics.get(topics.firstKey()).get("title")); - assertEquals(1102413, (int) topics.lastKey()); - assertEquals("why do a ferritin level", topics.get(topics.lastKey()).get("title")); - - topics = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV); - assertNotNull(topics); - assertEquals(3903, topics.size()); - assertEquals(2, (int) topics.firstKey()); - assertEquals("Androgen receptor define", topics.get(topics.firstKey()).get("title")); - assertEquals(1102390, (int) topics.lastKey()); - assertEquals("why do children get aggressive", topics.get(topics.lastKey()).get("title")); - - topics = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV2); - assertNotNull(topics); - assertEquals(4281, topics.size()); - assertEquals(1325, (int) topics.firstKey()); - assertEquals("323 area code zip code", topics.get(topics.firstKey()).get("title")); - assertEquals(1102413, (int) topics.lastKey()); - assertEquals("why do a ferritin level", topics.get(topics.lastKey()).get("title")); - } - @Test public void testDprNq() { SortedMap> topics; @@ -766,6 +668,176 @@ public void testTREC21DL() { assertEquals("who killed nicholas ii of russia", topics.get(1043135).get("title")); } + @Test + public void testMSMARCO() { + SortedMap> topics; + + topics = TopicReader.getTopics(Topics.MSMARCO_DOC_DEV); + assertNotNull(topics); + assertEquals(5193, topics.size()); + assertEquals(2, (int) topics.firstKey()); + assertEquals("androgen receptor define", topics.get(topics.firstKey()).get("title")); + assertEquals(1102400, (int) topics.lastKey()); + assertEquals("why do bears hibernate", topics.get(topics.lastKey()).get("title")); + + topics = TopicReader.getTopics(Topics.MSMARCO_DOC_DEV_UNICOIL); + assertNotNull(topics); + assertEquals(5193, topics.size()); + assertEquals(2, (int) topics.firstKey()); + assertEquals(617, topics.get(topics.firstKey()).get("title").split(" ").length); + assertEquals(1102400, (int) topics.lastKey()); + assertEquals(682, topics.get(topics.lastKey()).get("title").split(" ").length); + + topics = TopicReader.getTopics(Topics.MSMARCO_DOC_TEST); + assertNotNull(topics); + assertEquals(5793, topics.size()); + assertEquals(57, (int) topics.firstKey()); + assertEquals("term service agreement definition", topics.get(topics.firstKey()).get("title")); + assertEquals(1136966, (int) topics.lastKey()); + assertEquals("#ffffff color code", topics.get(topics.lastKey()).get("title")); + + topics = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET); + assertNotNull(topics); + assertEquals(6980, topics.size()); + assertEquals(2, (int) topics.firstKey()); + assertEquals("Androgen receptor define", topics.get(topics.firstKey()).get("title")); + assertEquals(1102400, (int) topics.lastKey()); + assertEquals("why do bears hibernate", topics.get(topics.lastKey()).get("title")); + + topics = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET_DEEPIMPACT); + assertNotNull(topics); + assertEquals(6980, topics.size()); + assertEquals(2, (int) topics.firstKey()); + assertEquals("receptor androgen define", topics.get(topics.firstKey()).get("title")); + assertEquals(1102400, (int) topics.lastKey()); + assertEquals("why hibernate bears", topics.get(topics.lastKey()).get("title")); + + topics = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_D2Q); + assertNotNull(topics); + assertEquals(6980, topics.size()); + assertEquals(619, topics.get(topics.firstKey()).get("title").split(" ").length); + assertEquals(1102400, (int) topics.lastKey()); + assertEquals(686, topics.get(topics.lastKey()).get("title").split(" ").length); + + topics = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_TILDE); + assertNotNull(topics); + assertEquals(6980, topics.size()); + assertEquals(584, topics.get(topics.firstKey()).get("title").split(" ").length); + assertEquals(1102400, (int) topics.lastKey()); + assertEquals(610, topics.get(topics.lastKey()).get("title").split(" ").length); + + topics = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_DEV_SUBSET_DISTILL_SPLADE_MAX); + assertNotNull(topics); + assertEquals(6980, topics.size()); + assertEquals(1991, topics.get(topics.firstKey()).get("title").split(" ").length); + assertEquals(1102400, (int) topics.lastKey()); + assertEquals(2409, topics.get(topics.lastKey()).get("title").split(" ").length); + + topics = TopicReader.getTopics(Topics.MSMARCO_PASSAGE_TEST_SUBSET); + assertNotNull(topics); + assertEquals(6837, topics.size()); + assertEquals(57, (int) topics.firstKey()); + assertEquals("term service agreement definition", topics.get(topics.firstKey()).get("title")); + assertEquals(1136966, (int) topics.lastKey()); + assertEquals("#ffffff color code", topics.get(topics.lastKey()).get("title")); + + topics = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV); + assertNotNull(topics); + assertEquals(4552, topics.size()); + assertEquals(2, (int) topics.firstKey()); + assertEquals("Androgen receptor define", topics.get(topics.firstKey()).get("title")); + assertEquals(1102390, (int) topics.lastKey()); + assertEquals("why do children get aggressive", topics.get(topics.lastKey()).get("title")); + + topics = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV_UNICOIL); + assertNotNull(topics); + assertEquals(4552, topics.size()); + assertEquals(2, (int) topics.firstKey()); + assertEquals(617, topics.get(topics.firstKey()).get("title").split(" ").length); + assertEquals(1102390, (int) topics.lastKey()); + assertEquals(608, topics.get(topics.lastKey()).get("title").split(" ").length); + + topics = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV_UNICOIL_NOEXP); + assertNotNull(topics); + assertEquals(4552, topics.size()); + assertEquals(2, (int) topics.firstKey()); + assertEquals(609, topics.get(topics.firstKey()).get("title").split(" ").length); + assertEquals(1102390, (int) topics.lastKey()); + assertEquals(533, topics.get(topics.lastKey()).get("title").split(" ").length); + + topics = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV2); + assertNotNull(topics); + assertEquals(5000, topics.size()); + assertEquals(361, (int) topics.firstKey()); + assertEquals(". irritability medical definition", topics.get(topics.firstKey()).get("title")); + assertEquals(1102413, (int) topics.lastKey()); + assertEquals("why do a ferritin level", topics.get(topics.lastKey()).get("title")); + + topics = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV2_UNICOIL); + assertNotNull(topics); + assertEquals(5000, topics.size()); + assertEquals(361, (int) topics.firstKey()); + assertEquals(714, topics.get(topics.firstKey()).get("title").split(" ").length); + assertEquals(1102413, (int) topics.lastKey()); + assertEquals(664, topics.get(topics.lastKey()).get("title").split(" ").length); + + topics = TopicReader.getTopics(Topics.MSMARCO_V2_DOC_DEV2_UNICOIL_NOEXP); + assertNotNull(topics); + assertEquals(5000, topics.size()); + assertEquals(361, (int) topics.firstKey()); + assertEquals(690, topics.get(topics.firstKey()).get("title").split(" ").length); + assertEquals(1102413, (int) topics.lastKey()); + assertEquals(537, topics.get(topics.lastKey()).get("title").split(" ").length); + + topics = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV); + assertNotNull(topics); + assertEquals(3903, topics.size()); + assertEquals(2, (int) topics.firstKey()); + assertEquals("Androgen receptor define", topics.get(topics.firstKey()).get("title")); + assertEquals(1102390, (int) topics.lastKey()); + assertEquals("why do children get aggressive", topics.get(topics.lastKey()).get("title")); + + topics = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV_UNICOIL); + assertNotNull(topics); + assertEquals(3903, topics.size()); + assertEquals(2, (int) topics.firstKey()); + assertEquals(617, topics.get(topics.firstKey()).get("title").split(" ").length); + assertEquals(1102390, (int) topics.lastKey()); + assertEquals(608, topics.get(topics.lastKey()).get("title").split(" ").length); + + topics = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV_UNICOIL_NOEXP); + assertNotNull(topics); + assertEquals(3903, topics.size()); + assertEquals(2, (int) topics.firstKey()); + assertEquals(609, topics.get(topics.firstKey()).get("title").split(" ").length); + assertEquals(1102390, (int) topics.lastKey()); + assertEquals(533, topics.get(topics.lastKey()).get("title").split(" ").length); + + topics = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV2); + assertNotNull(topics); + assertEquals(4281, topics.size()); + assertEquals(1325, (int) topics.firstKey()); + assertEquals("323 area code zip code", topics.get(topics.firstKey()).get("title")); + assertEquals(1102413, (int) topics.lastKey()); + assertEquals("why do a ferritin level", topics.get(topics.lastKey()).get("title")); + + topics = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV2_UNICOIL); + assertNotNull(topics); + assertEquals(4281, topics.size()); + assertEquals(1325, (int) topics.firstKey()); + assertEquals(671, topics.get(topics.firstKey()).get("title").split(" ").length); + assertEquals(1102413, (int) topics.lastKey()); + assertEquals(664, topics.get(topics.lastKey()).get("title").split(" ").length); + + topics = TopicReader.getTopics(Topics.MSMARCO_V2_PASSAGE_DEV2_UNICOIL_NOEXP); + assertNotNull(topics); + assertEquals(4281, topics.size()); + assertEquals(1325, (int) topics.firstKey()); + assertEquals(649, topics.get(topics.firstKey()).get("title").split(" ").length); + assertEquals(1102413, (int) topics.lastKey()); + assertEquals(537, topics.get(topics.lastKey()).get("title").split(" ").length); + } + @Test public void testMSMARO_TopicIdsAsStrings() { Map> topics;