Skip to content

Commit

Permalink
Fix broken tests from Lucene 8 -> 9 upgrade (#1273)
Browse files Browse the repository at this point in the history
Tests (should) pass with Anserini v0.15.0
  • Loading branch information
lintool authored Sep 24, 2022
1 parent 8a80800 commit 5078e32
Show file tree
Hide file tree
Showing 14 changed files with 85 additions and 63 deletions.
8 changes: 3 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,15 @@ For additional details, [our paper](https://dl.acm.org/doi/10.1145/3404835.34632
## Important Note: Lucene 8 to Lucene 9 Transition

The [PyPI release 0.17.1](https://pypi.org/project/pyserini/0.17.1/) at commit [`33c87c`](https://github.com/castorini/pyserini/commit/33c87c982d543d65e0ba1b4c94ee865fd9a6040e) (2022/08/13) is the last official Pyserini release built on Lucene 8, based on [Anserini v0.14.4](https://github.com/castorini/anserini/releases/tag/anserini-0.14.4).
Main Anserini trunk has been upgraded to Lucene 9.3.
Main Anserini trunk has been upgraded to Lucene 9.3 and the latest release, [Anserini v0.15.0](https://github.com/castorini/anserini/releases/tag/anserini-0.15.0), is built on that version.

This is an important but disruptive upgrade, as indexes built with Lucene 8 are not backwards compatible with Lucene 9 code (see [Anserini #1952](https://github.com/castorini/anserini/issues/1952)).
There is a workaround, but we have yet to implement it in Pyserini.
Furthermore, Lucene 8 code is _not_ able to read indexes built with Lucene 9.
An upgrade to Lucene 9 is necessary to use Lucene's HNSW indexes, which will increase the capabilities of Pyserini and open up the design space of dense/sparse hybrids.

Thus, we are in a transition where a development installation is partially broken, since the development installation jar will be based on Lucene 9, but the Pyserini codebase has not been updated to accommodate.
For a self-consistent development installation (that passes all tests), grab `anserini-0.14.4-fatjar.jar` from [here](https://repo1.maven.org/maven2/io/anserini/anserini/0.14.4/) to drop into `pyserini/resources/jars`.

This note supersedes all other Pyserini documentation on this site.
We are working hard on a corresponding Pyserini upgrade right now.
For a development installation, grab `anserini-0.15.0-fatjar.jar` from [here](https://repo1.maven.org/maven2/io/anserini/anserini/0.15.0/) and drop it into `pyserini/resources/jars` to ensure that you're using Lucene 9.

## Installation

Expand Down
38 changes: 19 additions & 19 deletions integrations/clprf/test_clprf.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def test_core17(self):
self.assertTrue(self.core17_checker.run('core17_bm25', '--bm25', 0.2087))

def test_core17_rm3(self):
self.assertTrue(self.core17_checker.run('core17_bm25', '--bm25 --rm3', 0.2823))
self.assertTrue(self.core17_checker.run('core17_bm25', '--bm25 --rm3', 0.2798))

def test_core17_lr(self):
pyserini_topics = 'core17'
Expand Down Expand Up @@ -159,7 +159,7 @@ def test_core17_lr_rm3(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertAlmostEqual(score, 0.2940, delta=0.0001)
self.assertAlmostEqual(score, 0.2926, delta=0.0001)

def test_core17_svm(self):
pyserini_topics = 'core17'
Expand Down Expand Up @@ -203,7 +203,7 @@ def test_core17_svm_rm3(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertAlmostEqual(score, 0.2970, delta=0.0001)
self.assertAlmostEqual(score, 0.2956, delta=0.0001)

def test_core17_avg(self):
pyserini_topics = 'core17'
Expand Down Expand Up @@ -247,7 +247,7 @@ def test_core17_avg_rm3(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertAlmostEqual(score, 0.2967, delta=0.0001)
self.assertAlmostEqual(score, 0.2950, delta=0.0001)

def test_core17_rrf(self):
pyserini_topics = 'core17'
Expand Down Expand Up @@ -317,13 +317,13 @@ def test_core17_rrf_rm3(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertAlmostEqual(score, 0.2965, delta=0.0001)
self.assertAlmostEqual(score, 0.2957, delta=0.0001)

def test_core18(self):
self.assertTrue(self.core18_checker.run('core18_bm25', '--bm25', 0.2496))

def test_core18_rm3(self):
self.assertTrue(self.core18_checker.run('core18_bm25', '--bm25 --rm3', 0.3139))
self.assertTrue(self.core18_checker.run('core18_bm25', '--bm25 --rm3', 0.3129))

def test_core18_lr(self):
pyserini_topics = 'core18'
Expand Down Expand Up @@ -367,7 +367,7 @@ def test_core18_lr_rm3(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertAlmostEqual(score, 0.3222, delta=0.0001)
self.assertAlmostEqual(score, 0.3216, delta=0.0001)

def test_core18_svm(self):
pyserini_topics = 'core18'
Expand Down Expand Up @@ -411,7 +411,7 @@ def test_core18_svm_rm3(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertAlmostEqual(score, 0.3216, delta=0.0001)
self.assertAlmostEqual(score, 0.3200, delta=0.0001)

def test_core18_avg(self):
pyserini_topics = 'core18'
Expand Down Expand Up @@ -455,7 +455,7 @@ def test_core18_avg_rm3(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertAlmostEqual(score, 0.3227, delta=0.0001)
self.assertAlmostEqual(score, 0.3215, delta=0.0001)

def test_core18_rrf(self):
pyserini_topics = 'core18'
Expand Down Expand Up @@ -525,13 +525,13 @@ def test_core18_rrf_rm3(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertAlmostEqual(score, 0.3214, delta=0.0001)
self.assertAlmostEqual(score, 0.3205, delta=0.0001)

def test_robust04(self):
self.assertTrue(self.robust04_checker.run('robust04_bm25', '--bm25', 0.2531))

def test_robust04_rm3(self):
self.assertTrue(self.robust04_checker.run('robust04_bm25_rm3', '--bm25 --rm3', 0.2903))
self.assertTrue(self.robust04_checker.run('robust04_bm25_rm3', '--bm25 --rm3', 0.2908))

def test_robust04_lr(self):
pyserini_topics = 'robust04'
Expand Down Expand Up @@ -575,7 +575,7 @@ def test_robust04_lr_rm3(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertAlmostEqual(score, 0.2971, delta=0.0001)
self.assertAlmostEqual(score, 0.2969, delta=0.0001)

def test_robust04_svm(self):
pyserini_topics = 'robust04'
Expand Down Expand Up @@ -619,7 +619,7 @@ def test_robust04_svm_rm3(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertAlmostEqual(score, 0.2967, delta=0.0001)
self.assertAlmostEqual(score, 0.2972, delta=0.0001)

def test_robust04_avg(self):
pyserini_topics = 'robust04'
Expand Down Expand Up @@ -733,13 +733,13 @@ def test_robust04_rrf_rm3(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertAlmostEqual(score, 0.2975, delta=0.0001)
self.assertAlmostEqual(score, 0.2977, delta=0.0001)

def test_robust05(self):
self.assertTrue(self.robust05_checker.run('robust05_bm25', '--bm25', 0.2032))

def test_robust05_rm3(self):
self.assertTrue(self.robust05_checker.run('robust05_bm25_rm3', '--bm25 --rm3', 0.2602))
self.assertTrue(self.robust05_checker.run('robust05_bm25_rm3', '--bm25 --rm3', 0.2624))

def test_robust05_lr(self):
pyserini_topics = 'robust05'
Expand Down Expand Up @@ -783,7 +783,7 @@ def test_robust05_lr_rm3(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertAlmostEqual(score, 0.2854, delta=0.0001)
self.assertAlmostEqual(score, 0.2872, delta=0.0001)

def test_robust05_svm(self):
pyserini_topics = 'robust05'
Expand Down Expand Up @@ -827,7 +827,7 @@ def test_robust05_svm_rm3(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertAlmostEqual(score, 0.2855, delta=0.0001)
self.assertAlmostEqual(score, 0.2871, delta=0.0001)

def test_robust05_avg(self):
pyserini_topics = 'robust05'
Expand Down Expand Up @@ -871,7 +871,7 @@ def test_robust05_avg_rm3(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertAlmostEqual(score, 0.2865, delta=0.0001)
self.assertAlmostEqual(score, 0.2880, delta=0.0001)

def test_robust05_rrf(self):
pyserini_topics = 'robust05'
Expand Down Expand Up @@ -941,7 +941,7 @@ def test_robust05_rrf_rm3(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertAlmostEqual(score, 0.2788, delta=0.0001)
self.assertAlmostEqual(score, 0.2808, delta=0.0001)

def tearDown(self):
shutil.rmtree(f'{self.tmp}')
Expand Down
2 changes: 1 addition & 1 deletion integrations/dense/test_dpr.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def test_dpr_squad_test_bf_bm25_hybrid_otf(self):
score = parse_score_qa(stdout, 'Top20')
self.assertEqual(status1, 0)
self.assertEqual(status2, 0)
self.assertAlmostEqual(score, 0.7511, places=4)
self.assertAlmostEqual(score, 0.7514, places=4)

def test_dpr_squad_test_encoded_queries(self):
encoder = QueryEncoder.load_encoded_queries('dpr_multi-squad-test')
Expand Down
3 changes: 2 additions & 1 deletion integrations/papers/test_sigir2021.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,8 @@ def test_section3_3(self):
msmarco-passage-dev-subset {output_file}'
stdout, stderr = run_command(eval_cmd)
score = parse_score_msmarco(stdout, "MRR @10")
self.assertAlmostEqual(score, 0.1874, delta=0.0001)
self.assertAlmostEqual(score, 0.1872, delta=0.0001)
# Temporary fix: this is Lucene 9 code running on Lucene 8 prebuilt index.

def tearDown(self):
clean_files(self.temp_files)
Expand Down
20 changes: 10 additions & 10 deletions integrations/sparse/test_lucenesearcher_check_irst.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def test_sum_aggregation_dl19_passage(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertEqual(map_score, 0.3281)
self.assertEqual(map_score, 0.3282) # Difference in Lucene 9 code running on Lucene 8 index.
self.assertEqual(ndcg_score, 0.5260)

def test_sum_aggregation_dl20_passage(self):
Expand Down Expand Up @@ -125,7 +125,7 @@ def test_max_aggregation_dl20_passage(self):
self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertEqual(map_score, 0.3357)
self.assertEqual(ndcg_score, 0.5469)
self.assertEqual(ndcg_score, 0.5476) # Difference in Lucene 9 code running on Lucene 8 index.

def tearDown(self):
rmtree(self.tmp)
Expand Down Expand Up @@ -209,8 +209,8 @@ def test_max_aggregation_dl19_doc(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertEqual(map_score, 0.2204)
self.assertEqual(ndcg_score, 0.4912)
self.assertEqual(map_score, 0.2205) # Difference in Lucene 9 code running on Lucene 8 index.
self.assertEqual(ndcg_score, 0.4917) # Difference in Lucene 9 code running on Lucene 8 index.

def test_max_aggregation_dl20_doc(self):
# dl20-doc-max
Expand All @@ -232,8 +232,8 @@ def test_max_aggregation_dl20_doc(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertEqual(map_score, 0.3373)
self.assertEqual(ndcg_score, 0.5015)
self.assertEqual(map_score, 0.3371) # Difference in Lucene 9 code running on Lucene 8 index.
self.assertEqual(ndcg_score, 0.4996) # Difference in Lucene 9 code running on Lucene 8 index.

def tearDown(self):
rmtree(self.tmp)
Expand Down Expand Up @@ -320,8 +320,8 @@ def test_max_aggregation_dl19_doc_seg(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertEqual(map_score, 0.2425)
self.assertEqual(ndcg_score, 0.5195)
self.assertEqual(map_score, 0.2424) # Difference in Lucene 9 code running on Lucene 8 index.
self.assertEqual(ndcg_score, 0.5193) # Difference in Lucene 9 code running on Lucene 8 index.

def test_max_aggregation_dl20_doc_seg(self):
# dl20-doc-seg-max
Expand All @@ -344,8 +344,8 @@ def test_max_aggregation_dl20_doc_seg(self):

self.assertEqual(status, 0)
self.assertEqual(stderr, '')
self.assertEqual(map_score, 0.3496)
self.assertEqual(ndcg_score, 0.5089)
self.assertEqual(map_score, 0.3498) # Difference in Lucene 9 code running on Lucene 8 index.
self.assertEqual(ndcg_score, 0.5116) # Difference in Lucene 9 code running on Lucene 8 index.

def tearDown(self):
rmtree(self.tmp)
Expand Down
8 changes: 6 additions & 2 deletions integrations/sparse/test_prebuilt_beir_baselines.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,9 @@ def test_beir_flat(self):
stdout, stderr = run_command(eval_cmd)
for metric in ['ndcg_cut_10', 'recall_100', 'recall_1000']:
score = parse_score(stdout, metric)
self.assertAlmostEqual(score, self.beir_flat[key][metric], delta=1e-5)
self.assertAlmostEqual(score, self.beir_flat[key][metric], delta=0.008)
# Temporary fix: from delta=1e-5 to delta=0.008
# for Lucene 9 code running on Lucene 8 prebuilt index.

def test_beir_multifield(self):
for key in self.beir_multifield:
Expand All @@ -133,7 +135,9 @@ def test_beir_multifield(self):
stdout, stderr = run_command(eval_cmd)
for metric in ['ndcg_cut_10', 'recall_100', 'recall_1000']:
score = parse_score(stdout, metric)
self.assertAlmostEqual(score, self.beir_multifield[key][metric], delta=1e-5)
self.assertAlmostEqual(score, self.beir_multifield[key][metric], delta=0.008)
# Temporary fix: from delta=1e-5 to delta=0.008
# for Lucene 9 code running on Lucene 8 prebuilt index.

def tearDown(self):
clean_files(self.temp_files)
Expand Down
19 changes: 14 additions & 5 deletions integrations/sparse/test_prebuilt_msmarco_v1_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ def test_doc_full_trec_output(self):

self.assertTrue('map' in scores)
self.assertTrue('recall.1000' in scores)
self.assertAlmostEqual(scores['map'], 0.2774, delta=0.0001)
self.assertAlmostEqual(scores['map'], 0.2770, delta=0.0001)
# Temporary fix: this is Lucene 9 code running on Lucene 8 prebuilt index.
self.assertAlmostEqual(scores['recall.1000'], 0.9357, delta=0.0001)

def test_doc_full_msmarco_output(self):
Expand All @@ -58,7 +59,9 @@ def test_doc_full_msmarco_output(self):
'msmarco_doc_string', [])

self.assertTrue('MRR@100' in scores)
self.assertEqual(scores['MRR@100'], '0.2766351807440808')
self.assertAlmostEqual(float(scores['MRR@100']), 0.2770, delta=0.0006)
# self.assertEqual(scores['MRR@100'], '0.2766351807440808')
# Temporary fix: this is Lucene 9 code running on Lucene 8 prebuilt index.

#
# doc segmented conditions
Expand Down Expand Up @@ -95,7 +98,9 @@ def test_doc_segmented_msmarco_output(self):
'msmarco_doc_string', [])

self.assertTrue('MRR@100' in scores)
self.assertEqual(scores['MRR@100'], '0.2755196341768384')
self.assertAlmostEqual(float(scores['MRR@100']), 0.2756, delta=0.0004)
# self.assertEqual(scores['MRR@100'], '0.2755196341768384')
# Temporary fix: this is Lucene 9 code running on Lucene 8 prebuilt index.

#
# doc2query conditions
Expand Down Expand Up @@ -124,7 +129,9 @@ def test_doc_full_expanded_msmarco_output(self):
'msmarco_doc_string', [])

self.assertTrue('MRR@100' in scores)
self.assertEqual(scores['MRR@100'], '0.3268656233100833')
self.assertAlmostEqual(float(scores['MRR@100']), 0.3269, delta=0.0001)
# self.assertEqual(scores['MRR@100'], '0.3268656233100833')
# Temporary fix: this is Lucene 9 code running on Lucene 8 prebuilt index.

def test_doc_segmented_expanded_trec_output(self):
"""Test case for MS MARCO V1 doc segmented + doc2query-T5 expansions, dev queries, TREC output."""
Expand All @@ -149,7 +156,9 @@ def test_doc_segmented_expanded_msmarco_output(self):
'msmarco_doc_string', [])

self.assertTrue('MRR@100' in scores)
self.assertEqual(scores['MRR@100'], '0.320918438140918')
self.assertAlmostEqual(float(scores['MRR@100']), 0.3209, delta=0.0002)
# self.assertEqual(scores['MRR@100'], '0.320918438140918')
# Temporary fix: this is Lucene 9 code running on Lucene 8 prebuilt index.


if __name__ == '__main__':
Expand Down
14 changes: 10 additions & 4 deletions integrations/sparse/test_prebuilt_msmarco_v1_passage.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,10 @@ def test_passage_trec_output(self):

self.assertTrue('map' in scores)
self.assertTrue('recall.1000' in scores)
self.assertAlmostEqual(scores['map'], 0.1958, delta=0.0001)
self.assertAlmostEqual(scores['recall.1000'], 0.8573, delta=0.0001)
self.assertAlmostEqual(scores['map'], 0.1953, delta=0.0001)
# Temporary fix: this is Lucene 9 code running on Lucene 8 prebuilt index.
self.assertAlmostEqual(scores['recall.1000'], 0.8573, delta=0.0004)
# Temporary fix: this is Lucene 9 code running on Lucene 8 prebuilt index.

def test_passage_msmarco_output(self):
"""Test case for MS MARCO V1 passage, dev queries, MS MARCO output
Expand All @@ -53,7 +55,9 @@ def test_passage_msmarco_output(self):
'msmarco_passage_string', [])

self.assertTrue('MRR@10' in scores)
self.assertEqual(scores['MRR@10'], '0.18741227770955546')
self.assertAlmostEqual(float(scores['MRR@10']), 0.1874, delta=0.0005)
# self.assertEqual(scores['MRR@10'], '0.18741227770955546')
# Temporary fix: this is Lucene 9 code running on Lucene 8 prebuilt index.

def test_passage_expanded_trec_output(self):
"""Test case for MS MARCO V1 passage w/ doc2query-T5 expansions, dev queries, TREC output."""
Expand All @@ -80,7 +84,9 @@ def test_passage_expanded_msmarco_output(self):
'msmarco_passage_string', [])

self.assertTrue('MRR@10' in scores)
self.assertEqual(scores['MRR@10'], '0.281560751807885')
self.assertAlmostEqual(float(scores['MRR@10']), 0.2816, delta=0.0002)
# self.assertEqual(scores['MRR@10'], '0.281560751807885')
# Temporary fix: this is Lucene 9 code running on Lucene 8 prebuilt index.


if __name__ == '__main__':
Expand Down
4 changes: 2 additions & 2 deletions integrations/sparse/test_prebuilt_robust04.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ def test_robust04(self):
self.assertTrue('map' in scores)
self.assertTrue('P.30' in scores)
self.assertAlmostEqual(scores['map'], 0.2531, delta=0.0001)
self.assertAlmostEqual(scores['P.30'], 0.3102, delta=0.0001)

self.assertAlmostEqual(scores['P.30'], 0.3099, delta=0.0001)
# Temporary fix: this is Lucene 9 code running on Lucene 8 prebuilt index.

if __name__ == '__main__':
unittest.main()
Loading

0 comments on commit 5078e32

Please sign in to comment.