Skip to content

Commit

Permalink
Fix batch_search for RM3, make BM25prf thread-safe (#1992)
Browse files Browse the repository at this point in the history
Also fixes a bug in bglinking regressions.
  • Loading branch information
lintool authored Oct 17, 2022
1 parent fa4af9f commit 7b244ee
Show file tree
Hide file tree
Showing 11 changed files with 55 additions and 32 deletions.
4 changes: 2 additions & 2 deletions docs/regressions-backgroundlinking18.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ With the above commands, you should be able to reproduce the following results:

| **MAP** | **BM25** | **+RM3** | **+RM3+DF**|
|:-------------------------------------------------------------------------------------------------------------|-----------|-----------|-----------|
| [TREC 2018 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking18.txt) | 0.2490 | 0.2661 | 0.2685 |
| [TREC 2018 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking18.txt) | 0.2490 | 0.2661 | 0.2679 |
| **nDCG@5** | **BM25** | **+RM3** | **+RM3+DF**|
| [TREC 2018 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking18.txt) | 0.3293 | 0.3447 | 0.4060 |
| [TREC 2018 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking18.txt) | 0.3293 | 0.3436 | 0.4027 |

4 changes: 2 additions & 2 deletions docs/regressions-backgroundlinking19.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ With the above commands, you should be able to reproduce the following results:

| **MAP** | **BM25** | **+RM3** | **+RM3+DF**|
|:-------------------------------------------------------------------------------------------------------------|-----------|-----------|-----------|
| [TREC 2019 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking19.txt) | 0.3029 | 0.3756 | 0.3133 |
| [TREC 2019 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking19.txt) | 0.3029 | 0.3787 | 0.3160 |
| **nDCG@5** | **BM25** | **+RM3** | **+RM3+DF**|
| [TREC 2019 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking19.txt) | 0.4785 | 0.5124 | 0.4967 |
| [TREC 2019 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking19.txt) | 0.4785 | 0.5200 | 0.5018 |

4 changes: 2 additions & 2 deletions docs/regressions-backgroundlinking20.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ With the above commands, you should be able to reproduce the following results:

| **MAP** | **BM25** | **+RM3** | **+RM3+DF**|
|:-------------------------------------------------------------------------------------------------------------|-----------|-----------|-----------|
| [TREC 2020 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking20.txt) | 0.3286 | 0.4529 | 0.3443 |
| [TREC 2020 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking20.txt) | 0.3286 | 0.4528 | 0.3438 |
| **nDCG@5** | **BM25** | **+RM3** | **+RM3+DF**|
| [TREC 2020 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking20.txt) | 0.5231 | 0.5684 | 0.5316 |
| [TREC 2020 Topics](../src/main/resources/topics-and-qrels/topics.backgroundlinking20.txt) | 0.5231 | 0.5696 | 0.5304 |

16 changes: 8 additions & 8 deletions src/main/java/io/anserini/rerank/lib/BM25PrfReranker.java
Original file line number Diff line number Diff line change
Expand Up @@ -89,16 +89,17 @@ public BM25PrfReranker(Analyzer analyzer, String field, int fbTerms, int fbDocs,

@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
IndexSearcher existingSearcher = context.getIndexSearcher();
IndexReader reader = existingSearcher.getIndexReader();

// set similarity to BM25PRF
IndexSearcher searcher = context.getIndexSearcher();
BM25Similarity originalSimilarity = (BM25Similarity) searcher.getSimilarity();
// Score with BM25Prf using a fresh searcher over the same reader, rather than changing the similarity of the
// searcher held by the context. Mutating that existing searcher makes the reranker not thread-safe: with interleaved
// execution, one thread can change or restore the similarity while another thread is still searching with it.
IndexSearcher searcher = new IndexSearcher(reader);
searcher.setSimilarity(new BM25PrfSimilarity(k1, b));
IndexReader reader = searcher.getIndexReader();
List<String> originalQueryTerms = AnalyzerUtils.analyze(analyzer, context.getQueryText());

boolean useRf = (context.getSearchArgs().rf_qrels != null);
PrfFeatures fv = expandQuery(originalQueryTerms, docs, reader, useRf);
PrfFeatures fv = expandQuery(context.getQueryTokens(), docs, reader, useRf);
Query newQuery = fv.toQuery();

if (this.outputQuery) {
Expand All @@ -123,8 +124,7 @@ public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
e.printStackTrace();
return docs;
}
// set similarity back
searcher.setSimilarity(originalSimilarity);

return ScoredDocuments.fromTopDocs(rs, searcher);
}

Expand Down
2 changes: 1 addition & 1 deletion src/main/java/io/anserini/rerank/lib/Rm3Reranker.java
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
IndexSearcher searcher = context.getIndexSearcher();
IndexReader reader = searcher.getIndexReader();

FeatureVector qfv = FeatureVector.fromTerms(AnalyzerUtils.analyze(analyzer, context.getQueryText())).scaleToUnitL1Norm();
FeatureVector qfv = FeatureVector.fromTerms(context.getQueryTokens()).scaleToUnitL1Norm();

boolean useRf = (context.getSearchArgs().rf_qrels != null);
FeatureVector rm = estimateRelevanceModel(docs, reader, context.getSearchArgs().searchtweets, useRf);
Expand Down
4 changes: 0 additions & 4 deletions src/main/java/io/anserini/search/SearchCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -191,10 +191,6 @@ public void run() {

// This is the number of threads that we're going to devote to running the queries in parallel.
int parallelism = args.parallelism;
// BM25 PRF is not thread safe, so we can't run in parallel.
if (args.bm25prf) {
parallelism = 1;
}

// ThreadPool for parallelizing the execution of individual queries:
ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(parallelism);
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/io/anserini/search/SimpleSearcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -582,8 +582,9 @@ public Result[] search(Query query, int k) throws IOException {
*/
public Result[] search(QueryGenerator generator, String q, int k) throws IOException {
Query query = generator.buildQuery(IndexArgs.CONTENTS, analyzer, q);
List<String> queryTokens = AnalyzerUtils.analyze(analyzer, q);

return _search(query, null, null, k);
return _search(query, queryTokens, q, k);
}

// internal implementation
Expand Down
6 changes: 3 additions & 3 deletions src/main/resources/regression/backgroundlinking18.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,12 @@ models:
MAP:
- 0.2661
nDCG@5:
- 0.3447
- 0.3436
- name: bm25+rm3+df
display: +RM3+DF
params: -backgroundlinking -backgroundlinking.datefilter -backgroundlinking.k 100 -bm25 -rm3 -hits 100
results:
MAP:
- 0.2685
- 0.2679
nDCG@5:
- 0.4060
- 0.4027
8 changes: 4 additions & 4 deletions src/main/resources/regression/backgroundlinking19.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,14 @@ models:
params: -backgroundlinking -backgroundlinking.k 100 -bm25 -rm3 -hits 100
results:
MAP:
- 0.3756
- 0.3787
nDCG@5:
- 0.5124
- 0.5200
- name: bm25+rm3+df
display: +RM3+DF
params: -backgroundlinking -backgroundlinking.datefilter -backgroundlinking.k 100 -bm25 -rm3 -hits 100
results:
MAP:
- 0.3133
- 0.3160
nDCG@5:
- 0.4967
- 0.5018
8 changes: 4 additions & 4 deletions src/main/resources/regression/backgroundlinking20.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,14 @@ models:
params: -backgroundlinking -backgroundlinking.k 100 -bm25 -rm3 -hits 100
results:
MAP:
- 0.4529
- 0.4528
nDCG@5:
- 0.5684
- 0.5696
- name: bm25+rm3+df
display: +RM3+DF
params: -backgroundlinking -backgroundlinking.datefilter -backgroundlinking.k 100 -bm25 -rm3 -hits 100
results:
MAP:
- 0.3443
- 0.3438
nDCG@5:
- 0.5316
- 0.5304
28 changes: 27 additions & 1 deletion src/test/java/io/anserini/search/SimpleSearcherTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ public void testSearchCustomQuery() throws Exception {
}

@Test
public void testBatchSearch() throws Exception {
public void testBatchSearch1() throws Exception {
SimpleSearcher searcher = new SimpleSearcher(super.tempDir1.toString());

List<String> queries = new ArrayList<>();
Expand All @@ -358,6 +358,32 @@ public void testBatchSearch() throws Exception {
searcher.close();
}

/**
 * Exercises {@code batch_search} with RM3 enabled on the searcher.
 *
 * <p>Two queries ("test" and "more") are submitted under the ids {@code query_test} and
 * {@code query_more}; the test checks that one result set comes back per id and that each set
 * contains the expected docids in the expected order.
 *
 * <p>The searcher is closed in a {@code finally} block so that a failing assertion (or an
 * exception thrown by {@code batch_search}) does not leave it open for the rest of the test run.
 *
 * @throws Exception if the searcher cannot be opened, searched, or closed
 */
@Test
public void testBatchSearch2() throws Exception {
  SimpleSearcher searcher = new SimpleSearcher(super.tempDir1.toString());
  try {
    searcher.set_rm3();

    List<String> queries = new ArrayList<>();
    queries.add("test");
    queries.add("more");

    // qids are positionally paired with queries and become the keys of the result map.
    List<String> qids = new ArrayList<>();
    qids.add("query_test");
    qids.add("query_more");

    // NOTE(review): the trailing arguments are presumably (hits per query = 10, threads = 2) —
    // confirm against the SimpleSearcher.batch_search signature.
    Map<String, SimpleSearcher.Result[]> hits = searcher.batch_search(queries, qids, 10, 2);
    assertEquals(2, hits.size());

    assertEquals(1, hits.get("query_test").length);
    assertEquals("doc3", hits.get("query_test")[0].docid);

    assertEquals(2, hits.get("query_more").length);
    assertEquals("doc2", hits.get("query_more")[0].docid);
    assertEquals("doc1", hits.get("query_more")[1].docid);
  } finally {
    // Always release the searcher, including when an assertion above fails.
    searcher.close();
  }
}

@Test
public void testFieldedSearch() throws Exception {
SimpleSearcher searcher = new SimpleSearcher(super.tempDir1.toString());
Expand Down

0 comments on commit 7b244ee

Please sign in to comment.