From 883539b119e3c4f236f4fda3dd748a3b2d4ff1a1 Mon Sep 17 00:00:00 2001 From: Jimmy Lin Date: Tue, 19 Dec 2023 13:56:11 -0500 Subject: [PATCH] Upgrade to Lucene 9.9.1 (#2302) Also added HNSW int8 regressions: works for cosDPR-distill, issues remain with OpenAI Ada2 --- README.md | 9 +- docs/regressions.md | 24 +- ...9-passage-cos-dpr-distil-hnsw-int8-onnx.md | 123 ++++++++++ ...s-dl19-passage-cos-dpr-distil-hnsw-int8.md | 121 ++++++++++ ...s-dl19-passage-cos-dpr-distil-hnsw-onnx.md | 10 +- ...ssions-dl19-passage-cos-dpr-distil-hnsw.md | 6 +- ...gressions-dl19-passage-openai-ada2-int8.md | 123 ++++++++++ .../regressions-dl19-passage-openai-ada2.md | 6 +- ...0-passage-cos-dpr-distil-hnsw-int8-onnx.md | 123 ++++++++++ ...s-dl20-passage-cos-dpr-distil-hnsw-int8.md | 121 ++++++++++ ...s-dl20-passage-cos-dpr-distil-hnsw-onnx.md | 10 +- ...ssions-dl20-passage-cos-dpr-distil-hnsw.md | 6 +- ...gressions-dl20-passage-openai-ada2-int8.md | 123 ++++++++++ .../regressions-dl20-passage-openai-ada2.md | 6 +- ...o-passage-cos-dpr-distil-hnsw-int8-onnx.md | 115 ++++++++++ ...smarco-passage-cos-dpr-distil-hnsw-int8.md | 115 ++++++++++ ...smarco-passage-cos-dpr-distil-hnsw-onnx.md | 10 +- ...ons-msmarco-passage-cos-dpr-distil-hnsw.md | 6 +- ...ssions-msmarco-passage-openai-ada2-int8.md | 116 ++++++++++ ...regressions-msmarco-passage-openai-ada2.md | 6 +- pom.xml | 2 +- .../io/anserini/index/IndexCollection.java | 30 +-- .../anserini/index/IndexHnswDenseVectors.java | 99 ++++++-- .../index/IndexInvertedDenseVectors.java | 13 +- src/main/python/regressions-batch03.txt | 213 +++++++++--------- ...age-cos-dpr-distil-hnsw-int8-onnx.template | 101 +++++++++ ...-passage-cos-dpr-distil-hnsw-int8.template | 99 ++++++++ ...-passage-cos-dpr-distil-hnsw-onnx.template | 8 +- .../dl19-passage-cos-dpr-distil-hnsw.template | 4 +- .../dl19-passage-openai-ada2-int8.template | 101 +++++++++ .../dl19-passage-openai-ada2.template | 4 +- ...age-cos-dpr-distil-hnsw-int8-onnx.template | 101 +++++++++ ...-passage-cos-dpr-distil-hnsw-int8.template | 99 ++++++++ ...-passage-cos-dpr-distil-hnsw-onnx.template | 8 +- .../dl20-passage-cos-dpr-distil-hnsw.template | 4 +- .../dl20-passage-openai-ada2-int8.template | 101 +++++++++ .../dl20-passage-openai-ada2.template | 4 +- ...age-cos-dpr-distil-hnsw-int8-onnx.template | 93 ++++++++ ...-passage-cos-dpr-distil-hnsw-int8.template | 93 ++++++++ ...-passage-cos-dpr-distil-hnsw-onnx.template | 8 +- ...marco-passage-cos-dpr-distil-hnsw.template | 4 +- .../msmarco-passage-openai-ada2-int8.template | 94 ++++++++ .../msmarco-passage-openai-ada2.template | 4 +- ...passage-cos-dpr-distil-hnsw-int8-onnx.yaml | 65 ++++++ ...dl19-passage-cos-dpr-distil-hnsw-int8.yaml | 65 ++++++ ...dl19-passage-cos-dpr-distil-hnsw-onnx.yaml | 2 +- .../dl19-passage-cos-dpr-distil-hnsw.yaml | 2 +- .../dl19-passage-openai-ada2-int8.yaml | 65 ++++++ .../regression/dl19-passage-openai-ada2.yaml | 2 +- ...passage-cos-dpr-distil-hnsw-int8-onnx.yaml | 65 ++++++ ...dl20-passage-cos-dpr-distil-hnsw-int8.yaml | 65 ++++++ ...dl20-passage-cos-dpr-distil-hnsw-onnx.yaml | 2 +- .../dl20-passage-cos-dpr-distil-hnsw.yaml | 2 +- .../dl20-passage-openai-ada2-int8.yaml | 65 ++++++ .../regression/dl20-passage-openai-ada2.yaml | 2 +- ...passage-cos-dpr-distil-hnsw-int8-onnx.yaml | 65 ++++++ ...arco-passage-cos-dpr-distil-hnsw-int8.yaml | 65 ++++++ ...arco-passage-cos-dpr-distil-hnsw-onnx.yaml | 2 +- .../msmarco-passage-cos-dpr-distil-hnsw.yaml | 2 +- .../msmarco-passage-openai-ada2-int8.yaml | 65 ++++++ .../msmarco-passage-openai-ada2.yaml | 2 
+- .../index/IndexHnswDenseVectorsTest.java | 22 ++ .../search/SearchHnswDenseVectorsTest.java | 12 +- 63 files changed, 2897 insertions(+), 206 deletions(-) create mode 100644 docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8-onnx.md create mode 100644 docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8.md create mode 100644 docs/regressions/regressions-dl19-passage-openai-ada2-int8.md create mode 100644 docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8-onnx.md create mode 100644 docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8.md create mode 100644 docs/regressions/regressions-dl20-passage-openai-ada2-int8.md create mode 100644 docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.md create mode 100644 docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8.md create mode 100644 docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md create mode 100644 src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.template create mode 100644 src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8.template create mode 100644 src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template create mode 100644 src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.template create mode 100644 src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8.template create mode 100644 src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template create mode 100644 src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.template create mode 100644 src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8.template create mode 100644 src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template create mode 100644 src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.yaml create mode 100644 src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8.yaml create mode 100644 src/main/resources/regression/dl19-passage-openai-ada2-int8.yaml create mode 100644 src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.yaml create mode 100644 src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8.yaml create mode 100644 src/main/resources/regression/dl20-passage-openai-ada2-int8.yaml create mode 100644 src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.yaml create mode 100644 src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml create mode 100644 src/main/resources/regression/msmarco-passage-openai-ada2-int8.yaml diff --git a/README.md b/README.md index 93c1b1a925..ca9361b895 100644 --- a/README.md +++ b/README.md @@ -89,9 +89,12 @@ See individual pages for details! 
| SPLADE++ CoCondenser-SelfDistil | [✓](docs/regressions/regressions-msmarco-passage-splade-pp-sd.md) | [✓](docs/regressions/regressions-dl19-passage-splade-pp-sd.md) | [✓](docs/regressions/regressions-dl20-passage-splade-pp-sd.md) | | SPLADE++ CoCondenser-SelfDistil (ONNX) | [✓](docs/regressions/regressions-msmarco-passage-splade-pp-sd-onnx.md) | [✓](docs/regressions/regressions-dl19-passage-splade-pp-sd-onnx.md) | [✓](docs/regressions/regressions-dl20-passage-splade-pp-sd-onnx.md) | | **Learned Dense** (HNSW) | | | | -| cosDPR-distil w/ HNSW | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md) | -| cosDPR-distil w/ HSNW (ONNX) | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md) | -| OpenAI-ada2 w/ HNSW | [✓](docs/regressions/regressions-msmarco-passage-openai-ada2.md) | [✓](docs/regressions/regressions-dl19-passage-openai-ada2.md) | [✓](docs/regressions/regressions-dl20-passage-openai-ada2.md) | +| cosDPR-distil w/ HNSW fp32 | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md) | +| cosDPR-distil w/ HNSW int8 | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8.md) | +| cosDPR-distil w/ HNSW fp32 (ONNX) | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md) | +| cosDPR-distil w/ HNSW int8 (ONNX) | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8-onnx.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8-onnx.md) | +| OpenAI Ada2 w/ HNSW fp32 | [✓](docs/regressions/regressions-msmarco-passage-openai-ada2.md) | [✓](docs/regressions/regressions-dl19-passage-openai-ada2.md) | [✓](docs/regressions/regressions-dl20-passage-openai-ada2.md) | +| OpenAI Ada2 w/ HNSW int8 | [✓](docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md) | [✓](docs/regressions/regressions-dl19-passage-openai-ada2-int8.md) | [✓](docs/regressions/regressions-dl20-passage-openai-ada2-int8.md) | | **Learned Dense** (Inverted; experimental) | | | | | cosDPR-distil w/ "fake words" | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-fw.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-fw.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-fw.md) | | cosDPR-distil w/ "LexLSH" | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-lexlsh.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-lexlsh.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-lexlsh.md) | diff --git a/docs/regressions.md b/docs/regressions.md index dec59541f6..36f7eb59f6 100644 --- a/docs/regressions.md +++ b/docs/regressions.md @@ -51,13 +51,15 @@ nohup python
src/main/python/run_regression.py --index --verify --search --regre nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-ed >& logs/log.msmarco-passage-splade-pp-ed & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-sd >& logs/log.msmarco-passage-splade-pp-sd & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw >& logs/log.msmarco-passage-cos-dpr-distil-hnsw & +nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8 >& logs/log.msmarco-passage-cos-dpr-distil-hnsw-int8 & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-fw >& logs/log.msmarco-passage-cos-dpr-distil-fw & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-lexlsh >& logs/log.msmarco-passage-cos-dpr-distil-lexlsh & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-openai-ada2 >& logs/log.msmarco-passage-openai-ada2 & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression msmarco-passage-splade-pp-ed-onnx >& logs/log.msmarco-passage-splade-pp-ed-onnx & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression msmarco-passage-splade-pp-sd-onnx >& logs/log.msmarco-passage-splade-pp-sd-onnx & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-onnx >& logs/log.msmarco-passage-cos-dpr-distil-hnsw-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-ed-onnx >& logs/log.msmarco-passage-splade-pp-ed-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-sd-onnx >& logs/log.msmarco-passage-splade-pp-sd-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-onnx >& logs/log.msmarco-passage-cos-dpr-distil-hnsw-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8-onnx >& logs/log.msmarco-passage-cos-dpr-distil-hnsw-int8-onnx & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc >& logs/log.msmarco-doc & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc-wp >& logs/log.msmarco-doc-wp & @@ -83,13 +85,15 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-splade-pp-ed >& logs/log.dl19-passage-splade-pp-ed & nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-splade-pp-sd >& logs/log.dl19-passage-splade-pp-sd & nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw >& logs/log.dl19-passage-cos-dpr-distil-hnsw & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8 >& logs/log.dl19-passage-cos-dpr-distil-hnsw-int8 & nohup python 
src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-fw >& logs/log.dl19-passage-cos-dpr-distil-fw & nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-lexlsh >& logs/log.dl19-passage-cos-dpr-distil-lexlsh & nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-openai-ada2 >& logs/log.dl19-passage-openai-ada2 & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl19-passage-splade-pp-ed-onnx >& logs/log.dl19-passage-splade-pp-ed-onnx & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl19-passage-splade-pp-sd-onnx >& logs/log.dl19-passage-splade-pp-sd-onnx & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-onnx >& logs/log.dl19-passage-cos-dpr-distil-hnsw-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-splade-pp-ed-onnx >& logs/log.dl19-passage-splade-pp-ed-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-splade-pp-sd-onnx >& logs/log.dl19-passage-splade-pp-sd-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-onnx >& logs/log.dl19-passage-cos-dpr-distil-hnsw-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8-onnx >& logs/log.dl19-passage-cos-dpr-distil-hnsw-int8-onnx & nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-doc >& logs/log.dl19-doc & nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-doc-wp >& logs/log.dl19-doc-wp & @@ -115,13 +119,15 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-splade-pp-ed >& logs/log.dl20-passage-splade-pp-ed & nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-splade-pp-sd >& logs/log.dl20-passage-splade-pp-sd & nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw >& logs/log.dl20-passage-cos-dpr-distil-hnsw & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8 >& logs/log.dl20-passage-cos-dpr-distil-hnsw-int8 & nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-fw >& logs/log.dl20-passage-cos-dpr-distil-fw & nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-lexlsh >& logs/log.dl20-passage-cos-dpr-distil-lexlsh & nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-openai-ada2 >& logs/log.dl20-passage-openai-ada2 & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl20-passage-splade-pp-ed-onnx >& logs/log.dl20-passage-splade-pp-ed-onnx & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl20-passage-splade-pp-sd-onnx >& logs/log.dl20-passage-splade-pp-sd-onnx & -nohup 
python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-onnx >& logs/log.dl20-passage-cos-dpr-distil-hnsw-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-splade-pp-ed-onnx >& logs/log.dl20-passage-splade-pp-ed-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-splade-pp-sd-onnx >& logs/log.dl20-passage-splade-pp-sd-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-onnx >& logs/log.dl20-passage-cos-dpr-distil-hnsw-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8-onnx >& logs/log.dl20-passage-cos-dpr-distil-hnsw-int8-onnx & nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-doc >& logs/log.dl20-doc & nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-doc-wp >& logs/log.dl20-doc-wp & diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8-onnx.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8-onnx.md new file mode 100644 index 0000000000..24f529ef99 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8-onnx.md @@ -0,0 +1,123 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8-onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. 
+ + From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8-onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8-onnx \ + --corpus-path collections/msmarco-passage-cos-dpr-distil +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +target/appassembler/bin/IndexHnswDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil \ + -generator HnswDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt \ + -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil & +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
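As a rough illustration of what on-the-fly encoding involves, the sketch below calls the ONNX Runtime Java API directly; it is a minimal example under stated assumptions (the model path, input names, output shape, and [CLS] pooling are illustrative), not a copy of Anserini's `CosDprDistil` encoder, which also handles tokenization.

```java
import ai.onnxruntime.OnnxTensor;
import ai.onnxruntime.OrtEnvironment;
import ai.onnxruntime.OrtSession;

import java.util.Map;

public class OnnxQueryEncoderSketch {
  // Encodes one already-tokenized query into a dense query vector; tokenization is omitted here.
  public static float[] encode(long[] inputIds, long[] attentionMask) throws Exception {
    OrtEnvironment env = OrtEnvironment.getEnvironment();
    try (OrtSession session = env.createSession("cos-dpr-distil.onnx", new OrtSession.SessionOptions());  // hypothetical model path
         OnnxTensor ids = OnnxTensor.createTensor(env, new long[][] {inputIds});
         OnnxTensor mask = OnnxTensor.createTensor(env, new long[][] {attentionMask});
         // Some BERT-style exports also expect a token_type_ids input.
         OrtSession.Result result = session.run(Map.of("input_ids", ids, "attention_mask", mask))) {
      // Assume the first output is [batch, seq_len, hidden]; use the [CLS] token as the query embedding.
      float[][][] lastHiddenState = (float[][][]) result.get(0).getValue();
      return lastHiddenState[0][0];
    }
  }
}
```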
+ +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.458 | +| **nDCG@10** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.717 | +| **R@100** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.605 | +| **R@1000** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.805 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.yaml). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8.md new file mode 100644 index 0000000000..bbad60de68 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8.md @@ -0,0 +1,121 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using pre-encoded queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. 
[Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8 +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8 +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8 \ + --corpus-path collections/msmarco-passage-cos-dpr-distil +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +target/appassembler/bin/IndexHnswDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil \ + -generator HnswDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. 
+This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.cos-dpr-distil.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt \ + -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.458 | +| **nDCG@10** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.717 | +| **R@100** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.605 | +| **R@1000** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.805 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8.yaml). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md index b08012b307..5335899358 100644 --- a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md +++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md @@ -6,7 +6,7 @@ This page describes regression experiments, integrated into Anserini's regressio > Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. -In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). +In these experiments, we are performing query inference "on-the-fly" with ONNX. Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). @@ -57,14 +57,16 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ - -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. @@ -82,6 +84,8 @@ target/appassembler/bin/SearchHnswDenseVectors \ -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil & ``` +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. 
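At the Lucene level, this retrieval step amounts to a k-nearest-neighbor query against the HNSW graph. The following is only an illustrative sketch, assuming a vector field named `vector` and treating the query's `k` as playing the role of the `-efSearch`/`-hits` options above; Anserini's actual `SearchHnswDenseVectors` wiring may differ.

```java
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.KnnFloatVectorQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class HnswSearchSketch {
  public static void main(String[] args) throws Exception {
    float[] queryVector = new float[768];  // a cosDPR-distil query embedding, assumed already computed
    try (DirectoryReader reader = DirectoryReader.open(
        FSDirectory.open(Paths.get("indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil")))) {
      IndexSearcher searcher = new IndexSearcher(reader);
      // Retrieve the top 1000 approximate nearest neighbors from the HNSW graph.
      TopDocs hits = searcher.search(new KnnFloatVectorQuery("vector", queryVector, 1000), 1000);
      for (ScoreDoc sd : hits.scoreDocs) {
        System.out.println(sd.doc + " " + sd.score);
      }
    }
  }
}
```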
+ Evaluation can be performed using `trec_eval`: ```bash diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md index dc625e14d0..9b533ff518 100644 --- a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md +++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md @@ -57,14 +57,16 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ - -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl19-passage-openai-ada2-int8.md b/docs/regressions/regressions-dl19-passage-openai-ada2-int8.md new file mode 100644 index 0000000000..e71b7c03b8 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage-openai-ada2-int8.md @@ -0,0 +1,123 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors. + +**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage-openai-ada2-int8.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+ + From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-openai-ada2-int8 +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage-openai-ada2-int8 +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.tar` is 109 GB and has MD5 checksum `a4d843d522ff3a3af7edbee789a63402`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-openai-ada2-int8 \ + --corpus-path collections/msmarco-passage-openai-ada2 +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +target/appassembler/bin/IndexHnswDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-openai-ada2 \ + -generator HnswDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \ + >& logs/log.msmarco-passage-openai-ada2 & +``` + +The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html).
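Before moving on to retrieval, note that the `-noMerge` and `-quantize.int8` options in the indexing command above roughly correspond to the Lucene 9.9 configuration sketched below. This is a hedged illustration only; Anserini's actual writer setup lives in `IndexHnswDenseVectors.java` and may differ in its details.

```java
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.NoMergePolicy;

public class HnswIndexConfigSketch {
  // maxConn and beamWidth correspond to the -M and -efC options above.
  public static IndexWriterConfig hnswInt8Config(int maxConn, int beamWidth) {
    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
    config.setMergePolicy(NoMergePolicy.INSTANCE);  // what -noMerge does: suppress segment merging
    config.setRAMBufferSizeMB(65536);               // analogous to -memoryBuffer 65536
    config.setCodec(new Lucene99Codec() {
      @Override
      public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
        // Lucene 9.9's int8 scalar-quantized HNSW format, i.e., what -quantize.int8 enables.
        return new Lucene99HnswScalarQuantizedVectorsFormat(maxConn, beamWidth);
      }
    });
    return config;
  }
}
```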
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.openai-ada2.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt \ + -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.479 | +| **nDCG@10** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.704 | +| **R@100** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.624 | +| **R@1000** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.857 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage-openai-ada2-int8.yaml). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl19-passage-openai-ada2.md b/docs/regressions/regressions-dl19-passage-openai-ada2.md index c5382dc3c5..57f5ab8932 100644 --- a/docs/regressions/regressions-dl19-passage-openai-ada2.md +++ b/docs/regressions/regressions-dl19-passage-openai-ada2.md @@ -57,14 +57,16 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-openai-ada2 \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-openai-ada2/ \ - -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \ >& logs/log.msmarco-passage-openai-ada2 & ``` The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8-onnx.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8-onnx.md new file mode 100644 index 0000000000..21c2f8cd12 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8-onnx.md @@ -0,0 +1,123 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+ + From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8-onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8-onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8-onnx \ + --corpus-path collections/msmarco-passage-cos-dpr-distil +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +target/appassembler/bin/IndexHnswDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil \ + -generator HnswDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html).
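As a side note on the indexing step above: each pre-encoded vector in the `JsonDenseVectorCollection` ultimately becomes a Lucene KNN vector field. The sketch below is purely illustrative (the field names and similarity function are assumptions), not a verbatim copy of Anserini's `HnswDenseVectorDocumentGenerator`.

```java
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.KnnFloatVectorField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.VectorSimilarityFunction;

public class DenseVectorDocumentSketch {
  // Adds one pre-encoded passage (docid plus its 768-dimensional cosDPR-distil embedding) to the index.
  public static void addPassage(IndexWriter writer, String docid, float[] vector) throws IOException {
    Document doc = new Document();
    doc.add(new StoredField("id", docid));
    // The similarity function is an assumption; normalized embeddings are typically scored by inner product.
    doc.add(new KnnFloatVectorField("vector", vector, VectorSimilarityFunction.DOT_PRODUCT));
    writer.addDocument(doc);
  }
}
```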
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \ + -topics tools/topics-and-qrels/topics.dl20.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt \ + -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil & +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.482 | +| **nDCG@10** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.701 | +| **R@100** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.712 | +| **R@1000** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.843 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.yaml). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8.md new file mode 100644 index 0000000000..cc9c2f14b6 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8.md @@ -0,0 +1,121 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using pre-encoded queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8 +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8 +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`. 
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8 \
+  --corpus-path collections/msmarco-passage-cos-dpr-distil
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+target/appassembler/bin/IndexHnswDenseVectors \
+  -collection JsonDenseVectorCollection \
+  -input /path/to/msmarco-passage-cos-dpr-distil \
+  -generator HnswDenseVectorDocumentGenerator \
+  -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+  -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \
+  >& logs/log.msmarco-passage-cos-dpr-distil &
+```
+
+The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2020.html).
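
Before moving on to retrieval, a brief aside on how the `-noMerge` and `-quantize.int8` options above map onto Lucene 9.9 APIs: the condensed sketch below mirrors what this patch does in `IndexHnswDenseVectors`. The real code additionally wraps the format in a `DelegatingKnnVectorsFormat` to raise the maximum vector dimension to 4096, which is omitted here; treat this as an illustration, not a drop-in replacement.

```java
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.NoMergePolicy;

public class HnswInt8IndexingConfigSketch {
  // Values mirror the indexing command above: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8
  public static IndexWriterConfig newConfig(final int M, final int efC) {
    IndexWriterConfig config = new IndexWriterConfig().setCodec(
        new Lucene99Codec() {
          @Override
          public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
            // -quantize.int8: build HNSW graphs over scalar-quantized (int8) vectors.
            return new Lucene99HnswScalarQuantizedVectorsFormat(M, efC);
          }
        });
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setRAMBufferSizeMB(65536);               // -memoryBuffer 65536
    config.setUseCompoundFile(false);
    config.setMergePolicy(NoMergePolicy.INSTANCE);  // -noMerge: never merge segments
    return config;
  }
}
```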
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \ + -topics tools/topics-and-qrels/topics.dl20.cos-dpr-distil.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt \ + -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.482 | +| **nDCG@10** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.701 | +| **R@100** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.712 | +| **R@1000** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.843 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8.yaml). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8.template) and run `bin/build.sh` to rebuild the documentation. 
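
The reference values live in the YAML configuration linked above; given the run-to-run variation just described, verification conceptually reduces to a tolerance check like the one sketched below. This is only an illustration of the idea, not the actual logic of `run_regression.py`; the 0.005 tolerance and the 0.701 reference value are taken from the text and table above.

```java
public class ToleranceCheckSketch {
  /** Returns true if an observed metric is acceptably close to the reference value. */
  static boolean withinTolerance(double observed, double reference, double tolerance) {
    return Math.abs(observed - reference) <= tolerance;
  }

  public static void main(String[] args) {
    // Example: an observed nDCG@10 of 0.7032 against the reference value 0.701 from this page.
    System.out.println(withinTolerance(0.7032, 0.701, 0.005));  // prints "true"
  }
}
```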
diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md index f040a9ce41..a802d3370e 100644 --- a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md +++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md @@ -6,7 +6,7 @@ This page describes regression experiments, integrated into Anserini's regressio > Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. -In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). +In these experiments, we are performing query inference "on-the-fly" with ONNX. Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). @@ -57,14 +57,16 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ - -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. @@ -82,6 +84,8 @@ target/appassembler/bin/SearchHnswDenseVectors \ -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil & ``` +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + Evaluation can be performed using `trec_eval`: ```bash diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md index c2f46b422c..d67487d1e4 100644 --- a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md +++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md @@ -57,14 +57,16 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ - -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. 
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
 ## Retrieval
 
 Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
diff --git a/docs/regressions/regressions-dl20-passage-openai-ada2-int8.md b/docs/regressions/regressions-dl20-passage-openai-ada2-int8.md
new file mode 100644
index 0000000000..beb86feb50
--- /dev/null
+++ b/docs/regressions/regressions-dl20-passage-openai-ada2-int8.md
@@ -0,0 +1,123 @@
+# Anserini Regressions: TREC 2020 Deep Learning Track (Passage)
+
+**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors.
+
+**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper:
+
+> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023.
+
+In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+
+Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with the MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage-openai-ada2-int8.yaml).
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-openai-ada2-int8
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage-openai-ada2-int8
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+ +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.tar` is 109 GB and has MD5 checksum `a4d843d522ff3a3af7edbee789a63402`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-openai-ada2-int8 \ + --corpus-path collections/msmarco-passage-openai-ada2 +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +target/appassembler/bin/IndexHnswDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-openai-ada2 \ + -generator HnswDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \ + >& logs/log.msmarco-passage-openai-ada2 & +``` + +The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). 
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ \ + -topics tools/topics-and-qrels/topics.dl20-passage.openai-ada2.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt \ + -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.477 | +| **nDCG@10** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.676 | +| **R@100** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.723 | +| **R@1000** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.867 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage-openai-ada2-int8.yaml). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template) and run `bin/build.sh` to rebuild the documentation. 
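
As a concrete illustration of the `-l 2` note above, the toy example below scores the same ranked list twice: once treating grade 1 as non-relevant (as AP does under `-l 2`) and once using the graded judgments directly (as nDCG@10 does, shown here with the linear-gain formulation). The document IDs and grades are made up, and this is only meant to show the arithmetic, not to replace `trec_eval`.

```java
import java.util.List;
import java.util.Map;

public class GradedVsBinaryJudgmentsToy {
  public static void main(String[] args) {
    // Hypothetical single-query example: a ranked list of five documents and graded qrels
    // (0 = not relevant, 1 = related, 2 = highly relevant, 3 = perfectly relevant).
    List<String> ranking = List.of("d1", "d2", "d3", "d4", "d5");
    Map<String, Integer> qrels = Map.of("d1", 3, "d2", 1, "d3", 0, "d4", 2, "d5", 1);

    // AP with -l 2: only grades >= 2 count as relevant, so d2 and d5 contribute nothing.
    double sumPrecision = 0.0;
    int relevantSeen = 0;
    long totalRelevant = qrels.values().stream().filter(g -> g >= 2).count();
    for (int i = 0; i < ranking.size(); i++) {
      if (qrels.getOrDefault(ranking.get(i), 0) >= 2) {
        relevantSeen++;
        sumPrecision += (double) relevantSeen / (i + 1);
      }
    }
    double ap = sumPrecision / totalRelevant;

    // nDCG@10 keeps all graded judgments (linear gain, log2 rank discount).
    double dcg = 0.0;
    for (int i = 0; i < ranking.size(); i++) {
      dcg += qrels.getOrDefault(ranking.get(i), 0) / (Math.log(i + 2) / Math.log(2));
    }
    int[] idealGains = qrels.values().stream()
        .mapToInt(Integer::intValue).map(g -> -g).sorted().map(g -> -g).toArray();
    double idcg = 0.0;
    for (int i = 0; i < idealGains.length; i++) {
      idcg += idealGains[i] / (Math.log(i + 2) / Math.log(2));
    }
    System.out.printf("AP = %.3f, nDCG@10 = %.3f%n", ap, dcg / idcg);
  }
}
```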
diff --git a/docs/regressions/regressions-dl20-passage-openai-ada2.md b/docs/regressions/regressions-dl20-passage-openai-ada2.md index f3e93c63ef..ed3f4be6d7 100644 --- a/docs/regressions/regressions-dl20-passage-openai-ada2.md +++ b/docs/regressions/regressions-dl20-passage-openai-ada2.md @@ -57,14 +57,16 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-openai-ada2 \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-openai-ada2/ \ - -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \ >& logs/log.msmarco-passage-openai-ada2 & ``` The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.md new file mode 100644 index 0000000000..f130af6512 --- /dev/null +++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.md @@ -0,0 +1,115 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8-onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8-onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8-onnx \ + --corpus-path collections/msmarco-passage-cos-dpr-distil +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +target/appassembler/bin/IndexHnswDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil \ + -generator HnswDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt \ + -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil & +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. 
+ +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.393 | +| **RR@10** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.388 | +| **R@100** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.903 | +| **R@1000** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.974 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.yaml). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8.md new file mode 100644 index 0000000000..e97fb49c17 --- /dev/null +++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8.md @@ -0,0 +1,115 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with HNSW indexes (using pre-encoded queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). 
+ +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8 +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8 +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8 \ + --corpus-path collections/msmarco-passage-cos-dpr-distil +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +target/appassembler/bin/IndexHnswDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil \ + -generator HnswDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt \ + -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.393 | +| **RR@10** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.388 | +| **R@100** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.903 | +| **R@1000** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.974 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8.template) and run `bin/build.sh` to rebuild the documentation. 
+ ++ Results reproduced by [@yilinjz](https://github.com/yilinjz) on 2023-09-01 (commit [`4ae518b`](https://github.com/castorini/anserini/commit/4ae518bb284ebcba0b273a473bc8774735cb7d19)) \ No newline at end of file diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md index 6df98970f1..a8f41de6ee 100644 --- a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md +++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md @@ -6,7 +6,7 @@ This page describes regression experiments, integrated into Anserini's regressio > Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. -In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). +In these experiments, we are performing query inference "on-the-fly" with ONNX. The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml). Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. @@ -54,14 +54,16 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ - -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. @@ -78,6 +80,8 @@ target/appassembler/bin/SearchHnswDenseVectors \ -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil & ``` +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. 
+ Evaluation can be performed using `trec_eval`: ```bash diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md index cf41e9645a..0d98114e80 100644 --- a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md +++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md @@ -54,14 +54,16 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ - -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md b/docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md new file mode 100644 index 0000000000..7b2053dec9 --- /dev/null +++ b/docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md @@ -0,0 +1,116 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors. + +**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-openai-ada2-int8.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-openai-ada2-int8 +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-openai-ada2-int8 +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.tar` is 109 GB and has MD5 checksum `a4d843d522ff3a3af7edbee789a63402`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-openai-ada2-int8 \ + --corpus-path collections/msmarco-passage-openai-ada2 +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +target/appassembler/bin/IndexHnswDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-openai-ada2 \ + -generator HnswDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \ + >& logs/log.msmarco-passage-openai-ada2 & +``` + +The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. 
+ +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.openai-ada2.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt \ + -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.350 | +| **RR@10** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.343 | +| **R@100** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.898 | +| **R@1000** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.985 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-passage-openai-ada2-int8.yaml). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template) and run `bin/build.sh` to rebuild the documentation. 
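
As a concrete illustration of what `-M 10 -m recip_rank` reports for the dev set, the sketch below computes reciprocal rank at cutoff 10 from a run file and qrels in the standard TREC formats. It is a simplified stand-in for `trec_eval`, assuming the usual six-column run format with ranks starting at 1; the file paths are taken from the commands above.

```java
import java.io.BufferedReader;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class RecipRankAt10Sketch {
  public static void main(String[] args) throws Exception {
    // Qrels format: qid 0 docid relevance
    Map<String, Set<String>> relevant = new HashMap<>();
    try (BufferedReader reader = Files.newBufferedReader(
        Paths.get("tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt"))) {
      String line;
      while ((line = reader.readLine()) != null) {
        String[] cols = line.trim().split("\\s+");
        if (Integer.parseInt(cols[3]) > 0) {
          relevant.computeIfAbsent(cols[0], k -> new HashSet<>()).add(cols[2]);
        }
      }
    }
    // Run format: qid Q0 docid rank score tag; keep the best relevant hit within the top 10.
    Map<String, Double> reciprocalRank = new HashMap<>();
    try (BufferedReader reader = Files.newBufferedReader(Paths.get(
        "runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt"))) {
      String line;
      while ((line = reader.readLine()) != null) {
        String[] cols = line.trim().split("\\s+");
        String qid = cols[0], docid = cols[2];
        int rank = Integer.parseInt(cols[3]);
        if (rank <= 10 && relevant.getOrDefault(qid, Set.of()).contains(docid)) {
          reciprocalRank.merge(qid, 1.0 / rank, Math::max);
        }
      }
    }
    double sum = 0.0;
    for (String qid : relevant.keySet()) {
      sum += reciprocalRank.getOrDefault(qid, 0.0);
    }
    System.out.printf("RR@10 = %.4f over %d judged queries%n", sum / relevant.size(), relevant.size());
  }
}
```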
+ diff --git a/docs/regressions/regressions-msmarco-passage-openai-ada2.md b/docs/regressions/regressions-msmarco-passage-openai-ada2.md index c6ca60e2bc..8f58da3521 100644 --- a/docs/regressions/regressions-msmarco-passage-openai-ada2.md +++ b/docs/regressions/regressions-msmarco-passage-openai-ada2.md @@ -54,14 +54,16 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-openai-ada2 \ -generator HnswDenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-passage-openai-ada2/ \ - -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \ + -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \ >& logs/log.msmarco-passage-openai-ada2 & ``` The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/pom.xml b/pom.xml index bec797baa4..9b43d57aa3 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ - 9.8.0 + 9.9.1 UTF-8 diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java index f8775d92f0..2242837027 100644 --- a/src/main/java/io/anserini/index/IndexCollection.java +++ b/src/main/java/io/anserini/index/IndexCollection.java @@ -31,6 +31,7 @@ import org.apache.logging.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.codecs.lucene99.Lucene99Codec; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -161,20 +162,6 @@ public static class Args extends AbstractIndexer.Args { public IndexCollection(Args args) throws Exception { super(args); - LOG.info("IndexCollection settings:"); - LOG.info(" + Generator: " + args.generatorClass); - LOG.info(" + Language: " + args.language); - LOG.info(" + Stemmer: " + args.stemmer); - LOG.info(" + Keep stopwords? " + args.keepStopwords); - LOG.info(" + Stopwords: " + args.stopwords); - LOG.info(" + Store positions? " + args.storePositions); - LOG.info(" + Store docvectors? " + args.storeDocvectors); - LOG.info(" + Store document \"contents\" field? " + args.storeContents); - LOG.info(" + Store document \"raw\" field? " + args.storeRaw); - LOG.info(" + Additional fields to index: " + Arrays.toString(args.fields)); - LOG.info(" + Whitelist: " + args.whitelist); - LOG.info(" + Pretokenized?: " + args.pretokenized); - try { super.generatorClass = (Class>) Class.forName("io.anserini.index.generator." + args.generatorClass); @@ -206,6 +193,21 @@ public IndexCollection(Args args) throws Exception { config.setMergeScheduler(new ConcurrentMergeScheduler()); super.writer = new IndexWriter(dir, config); + + LOG.info("IndexCollection settings:"); + LOG.info(" + Generator: " + args.generatorClass); + LOG.info(" + Language: " + args.language); + LOG.info(" + Stemmer: " + args.stemmer); + LOG.info(" + Keep stopwords? " + args.keepStopwords); + LOG.info(" + Stopwords: " + args.stopwords); + LOG.info(" + Store positions? 
" + args.storePositions); + LOG.info(" + Store docvectors? " + args.storeDocvectors); + LOG.info(" + Store document \"contents\" field? " + args.storeContents); + LOG.info(" + Store document \"raw\" field? " + args.storeRaw); + LOG.info(" + Additional fields to index: " + Arrays.toString(args.fields)); + LOG.info(" + Whitelist: " + args.whitelist); + LOG.info(" + Pretokenized?: " + args.pretokenized); + LOG.info(" + Codec: " + this.writer.getConfig().getCodec()); } private Analyzer getAnalyzer() { diff --git a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java index 10e7c15640..161fbc4fac 100644 --- a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java @@ -23,11 +23,13 @@ import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; -import org.apache.lucene.codecs.lucene95.Lucene95Codec; -import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99Codec; +import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.TieredMergePolicy; @@ -56,20 +58,34 @@ public static final class Args extends AbstractIndexer.Args { @Option(name = "-efC", metaVar = "[num]", usage = "HNSW parameters ef Construction") public int efC = 100; + @Option(name = "-quantize.int8", usage = "Quantize vectors into int8.") + public boolean quantizeInt8 = false; + @Option(name = "-storeVectors", usage = "Boolean switch to store raw raw vectors.") public boolean storeVectors = false; + + @Option(name = "-noMerge", usage = "Do not merge segments (fast indexing, slow retrieval).") + public boolean noMerge = false; + + @Option(name = "-maxThreadMemoryBeforeFlush", metaVar = "[num]", usage = "Maximum memory consumption per thread before triggering a forced flush (in MB); must be smaller than 2048.") + public int maxThreadMemoryBeforeFlush = 2047; + // This is the most aggressive possible setting; default is 1945. + // If the setting is too aggressive, may result in GCLocker issues. + + @Option(name = "-maxMergedSegmentSize", metaVar = "[num]", usage = "Maximum sized segment to produce during normal merging (in MB).") + public int maxMergedSegmentSize = 1024 * 16; + + @Option(name = "-segmentsPerTier", metaVar = "[num]", usage = "Allowed number of segments per tier.") + public int segmentsPerTier = 10; + + @Option(name = "-maxMergeAtOnce", metaVar = "[num]", usage = "Maximum number of segments to be merged at a time during \"normal\" merging.") + public int maxMergeAtOnce = 10; } @SuppressWarnings("unchecked") public IndexHnswDenseVectors(Args args) throws Exception { super(args); - LOG.info("HnswIndexer settings:"); - LOG.info(" + Generator: " + args.generatorClass); - LOG.info(" + M: " + args.M); - LOG.info(" + efC: " + args.efC); - LOG.info(" + Store document vectors? " + args.storeVectors); - try { super.generatorClass = (Class>) Class.forName("io.anserini.index.generator." 
+ args.generatorClass); @@ -79,26 +95,49 @@ public IndexHnswDenseVectors(Args args) throws Exception { try { final Directory dir = FSDirectory.open(Paths.get(args.index)); - final IndexWriterConfig config = new IndexWriterConfig().setCodec( - new Lucene95Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new DelegatingKnnVectorsFormat( - new Lucene95HnswVectorsFormat(args.M, args.efC), 4096); - } - }); + final IndexWriterConfig config; + + if (args.quantizeInt8) { + config = new IndexWriterConfig().setCodec( + new Lucene99Codec() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return new DelegatingKnnVectorsFormat( + new Lucene99HnswScalarQuantizedVectorsFormat(args.M, args.efC), 4096); + } + }); + } else { + config = new IndexWriterConfig().setCodec( + new Lucene99Codec() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return new DelegatingKnnVectorsFormat( + new Lucene99HnswVectorsFormat(args.M, args.efC), 4096); + } + }); + } config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); config.setRAMBufferSizeMB(args.memoryBuffer); + config.setRAMPerThreadHardLimitMB(args.maxThreadMemoryBeforeFlush); config.setUseCompoundFile(false); config.setMergeScheduler(new ConcurrentMergeScheduler()); - if (args.optimize) { - // If we're going to merge down into a single segment at the end, skip intermediate merges, - // since they are a waste of time. + if (args.noMerge) { + config.setMergePolicy(NoMergePolicy.INSTANCE); + } else { TieredMergePolicy mergePolicy = new TieredMergePolicy(); - mergePolicy.setMaxMergeAtOnce(256); - mergePolicy.setSegmentsPerTier(256); + if (args.optimize) { + // If we're going to merge down into a single segment at the end, skip intermediate merges, + // since they are a waste of time. + mergePolicy.setMaxMergeAtOnce(256); + mergePolicy.setSegmentsPerTier(256); + } else { + mergePolicy.setFloorSegmentMB(1024); + mergePolicy.setMaxMergedSegmentMB(args.maxMergedSegmentSize); + mergePolicy.setSegmentsPerTier(args.segmentsPerTier); + mergePolicy.setMaxMergeAtOnce(args.maxMergeAtOnce); + } config.setMergePolicy(mergePolicy); } @@ -106,6 +145,24 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { } catch (Exception e) { throw new IllegalArgumentException(String.format("Unable to create IndexWriter: %s.", e.getMessage())); } + + LOG.info("HnswIndexer settings:"); + LOG.info(" + Generator: " + args.generatorClass); + LOG.info(" + M: " + args.M); + LOG.info(" + efC: " + args.efC); + LOG.info(" + Store document vectors? 
" + args.storeVectors); + LOG.info(" + Codec: " + this.writer.getConfig().getCodec()); + LOG.info(" + MemoryBuffer: " + args.memoryBuffer); + LOG.info(" + MaxThreadMemoryBeforeFlush: " + args.maxThreadMemoryBeforeFlush); + + if (args.noMerge) { + LOG.info(" + MergePolicy: NoMerge"); + } else { + LOG.info(" + MergePolicy: TieredMergePolicy"); + LOG.info(" + MaxMergedSegmentSize: " + args.maxMergedSegmentSize); + LOG.info(" + SegmentsPerTier: " + args.segmentsPerTier); + LOG.info(" + MaxMergeAtOnce: " + args.maxMergeAtOnce); + } } // Solution provided by Solr, see https://www.mail-archive.com/java-user@lucene.apache.org/msg52149.html diff --git a/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java b/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java index e3a0c8e810..249c626a97 100644 --- a/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java @@ -25,7 +25,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.codecs.lucene95.Lucene95Codec; +import org.apache.lucene.codecs.lucene99.Lucene99Codec; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -78,10 +78,6 @@ public static final class Args extends AbstractIndexer.Args { public IndexInvertedDenseVectors(Args args) { super(args); - LOG.info("InvertedDenseIndexer settings:"); - LOG.info(" + Generator: " + args.generatorClass); - LOG.info(" + Encoding: " + args.encoding); - try { super.generatorClass = (Class>) Class.forName("io.anserini.index.generator." 
+ args.generatorClass); @@ -104,7 +100,7 @@ public IndexInvertedDenseVectors(Args args) { try { final Directory dir = FSDirectory.open(Paths.get(args.index)); - final IndexWriterConfig config = new IndexWriterConfig(analyzer).setCodec(new Lucene95Codec()); + final IndexWriterConfig config = new IndexWriterConfig(analyzer); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); config.setRAMBufferSizeMB(args.memoryBuffer); config.setUseCompoundFile(false); @@ -113,6 +109,11 @@ public IndexInvertedDenseVectors(Args args) { } catch (Exception e) { throw new IllegalArgumentException(String.format("Unable to create IndexWriter: %s.", e.getMessage())); } + + LOG.info("InvertedDenseIndexer settings:"); + LOG.info(" + Generator: " + args.generatorClass); + LOG.info(" + Encoding: " + args.encoding); + LOG.info(" + Codec: " + this.writer.getConfig().getCodec()); } public static void main(String[] args) throws Exception { diff --git a/src/main/python/regressions-batch03.txt b/src/main/python/regressions-batch03.txt index baab7fc298..4a38ff5dfc 100644 --- a/src/main/python/regressions-batch03.txt +++ b/src/main/python/regressions-batch03.txt @@ -1,8 +1,10 @@ +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw > logs/log.msmarco-passage-cos-dpr-distil-hnsw 2>&1 +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8 > logs/log.msmarco-passage-cos-dpr-distil-hnsw-int8 2>&1 +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-openai-ada2 > logs/log.msmarco-passage-openai-ada2 2>&1 + # MS MARCO V1 passage python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-ed > logs/log.msmarco-passage-splade-pp-ed 2>&1 python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-sd > logs/log.msmarco-passage-splade-pp-sd 2>&1 -python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-openai-ada2 > logs/log.msmarco-passage-openai-ada2 2>&1 -python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw > logs/log.msmarco-passage-cos-dpr-distil-hnsw 2>&1 python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-fw > logs/log.msmarco-passage-cos-dpr-distil-fw 2>&1 python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-lexlsh > logs/log.msmarco-passage-cos-dpr-distil-lexlsh 2>&1 @@ -20,6 +22,10 @@ python src/main/python/run_regression.py --index --verify --search --regression python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-distill-splade-max > logs/log.msmarco-passage-distill-splade-max 2>&1 python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-distil-cocodenser-medium > logs/log.msmarco-passage-splade-distil-cocodenser-medium 2>&1 +# HNSW search-only +python src/main/python/run_regression.py --search --regression msmarco-passage-cos-dpr-distil-hnsw-onnx > logs/log.msmarco-passage-cos-dpr-distil-hnsw-onnx 2>&1 +python src/main/python/run_regression.py --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8-onnx > logs/log.msmarco-passage-cos-dpr-distil-hnsw-int8-onnx 2>&1 + # MS MARCO V1 doc python src/main/python/run_regression.py --index --verify --search 
--regression msmarco-doc > logs/log.msmarco-doc 2>&1 python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc-wp > logs/log.msmarco-doc-wp 2>&1 @@ -34,9 +40,8 @@ python src/main/python/run_regression.py --index --verify --search --regression python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc-segmented-unicoil-noexp > logs/log.msmarco-doc-segmented-unicoil-noexp 2>&1 # MS MARCO V1 passage ONNX runs - uses same index, so need to make sure previous runs finish -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression msmarco-passage-splade-pp-ed-onnx > logs/log.msmarco-passage-splade-pp-ed-onnx 2>&1 -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression msmarco-passage-splade-pp-sd-onnx > logs/log.msmarco-passage-splade-pp-sd-onnx 2>&1 -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-onnx > logs/log.msmarco-passage-cos-dpr-distil-hnsw-onnx 2>&1 +python src/main/python/run_regression.py --search --regression msmarco-passage-splade-pp-ed-onnx > logs/log.msmarco-passage-splade-pp-ed-onnx 2>&1 +python src/main/python/run_regression.py --search --regression msmarco-passage-splade-pp-sd-onnx > logs/log.msmarco-passage-splade-pp-sd-onnx 2>&1 # MIRACL python src/main/python/run_regression.py --index --verify --search --regression miracl-v1.0-ar > logs/log.miracl-v1.0-ar 2>&1 @@ -121,107 +126,107 @@ python src/main/python/run_regression.py --index --verify --search --regression python src/main/python/run_regression.py --index --verify --search --regression mrtydi-v1.1-te-aca > logs/log.mrtydi-v1.1-te-aca 2>&1 python src/main/python/run_regression.py --index --verify --search --regression mrtydi-v1.1-th-aca > logs/log.mrtydi-v1.1-th-aca 2>&1 -# DL19 - ONNX -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl19-passage-splade-pp-ed-onnx > logs/log.dl19-passage-splade-pp-ed-onnx 2>&1 -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl19-passage-splade-pp-sd-onnx > logs/log.dl19-passage-splade-pp-sd-onnx 2>&1 -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-onnx > logs/log.dl19-passage-cos-dpr-distil-hnsw-onnx 2>&1 - -# Other DL19 -python src/main/python/run_regression.py --verify --search --regression dl19-passage > logs/log.dl19-passage 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-bm25-b8 > logs/log.dl19-passage-bm25-b8 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-ca > logs/log.dl19-passage-ca 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-wp > logs/log.dl19-passage-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-hgf-wp > logs/log.dl19-passage-hgf-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-docTTTTTquery > logs/log.dl19-passage-docTTTTTquery 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl19-passage-unicoil > logs/log.dl19-passage-unicoil 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-unicoil-noexp > logs/log.dl19-passage-unicoil-noexp 2>&1 -python src/main/python/run_regression.py --verify --search --regression 
dl19-passage-splade-distil-cocodenser-medium > logs/log.dl19-passage-splade-distil-cocodenser-medium 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-splade-pp-ed > logs/log.dl19-passage-splade-pp-ed 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-splade-pp-sd > logs/log.dl19-passage-splade-pp-sd 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-cos-dpr-distil-hnsw > logs/log.dl19-passage-cos-dpr-distil-hnsw 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-cos-dpr-distil-fw > logs/log.dl19-passage-cos-dpr-distil-fw 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-cos-dpr-distil-lexlsh > logs/log.dl19-passage-cos-dpr-distil-lexlsh 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-openai-ada2 > logs/log.dl19-passage-openai-ada2 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl19-doc > logs/log.dl19-doc 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-doc-ca > logs/log.dl19-doc-ca 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-doc-wp > logs/log.dl19-doc-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-doc-hgf-wp > logs/log.dl19-doc-hgf-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-doc-segmented-wp > logs/log.dl19-doc-segmented-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-doc-docTTTTTquery > logs/log.dl19-doc-docTTTTTquery 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl19-doc-segmented > logs/log.dl19-doc-segmented 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-doc-segmented-ca > logs/log.dl19-doc-segmented-ca 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-doc-segmented-docTTTTTquery > logs/log.dl19-doc-segmented-docTTTTTquery 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-doc-segmented-unicoil > logs/log.dl19-doc-segmented-unicoil 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-doc-segmented-unicoil-noexp > logs/log.dl19-doc-segmented-unicoil-noexp 2>&1 - -# DL20 - ONNX -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl20-passage-splade-pp-ed-onnx > logs/log.dl20-passage-splade-pp-ed-onnx 2>&1 -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl20-passage-splade-pp-sd-onnx > logs/log.dl20-passage-splade-pp-sd-onnx 2>&1 -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-onnx > logs/log.dl20-passage-cos-dpr-distil-hnsw-onnx 2>&1 - -# Other DL20 -python src/main/python/run_regression.py --verify --search --regression dl20-passage > logs/log.dl20-passage 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-bm25-b8 > logs/log.dl20-passage-bm25-b8 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-ca > logs/log.dl20-passage-ca 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-wp > logs/log.dl20-passage-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression 
dl20-passage-hgf-wp > logs/log.dl20-passage-hgf-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-docTTTTTquery > logs/log.dl20-passage-docTTTTTquery 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl20-passage-unicoil > logs/log.dl20-passage-unicoil 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-unicoil-noexp > logs/log.dl20-passage-unicoil-noexp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-splade-distil-cocodenser-medium > logs/log.dl20-passage-splade-distil-cocodenser-medium 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-splade-pp-ed > logs/log.dl20-passage-splade-pp-ed 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-splade-pp-sd > logs/log.dl20-passage-splade-pp-sd 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-cos-dpr-distil-hnsw > logs/log.dl20-passage-cos-dpr-distil-hnsw 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-cos-dpr-distil-fw > logs/log.dl20-passage-cos-dpr-distil-fw 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-cos-dpr-distil-lexlsh > logs/log.dl20-passage-cos-dpr-distil-lexlsh 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-openai-ada2 > logs/log.dl20-passage-openai-ada2 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl20-doc > logs/log.dl20-doc 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-ca > logs/log.dl20-doc-ca 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-wp > logs/log.dl20-doc-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-hgf-wp > logs/log.dl20-doc-hgf-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-docTTTTTquery > logs/log.dl20-doc-docTTTTTquery 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented > logs/log.dl20-doc-segmented 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-ca > logs/log.dl20-doc-segmented-ca 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-wp > logs/log.dl20-doc-segmented-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-docTTTTTquery > logs/log.dl20-doc-segmented-docTTTTTquery 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-unicoil > logs/log.dl20-doc-segmented-unicoil 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-unicoil-noexp > logs/log.dl20-doc-segmented-unicoil-noexp 2>&1 +# DL19 +python src/main/python/run_regression.py --search --regression dl19-passage > logs/log.dl19-passage 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-bm25-b8 > logs/log.dl19-passage-bm25-b8 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-ca > logs/log.dl19-passage-ca 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-wp > logs/log.dl19-passage-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-hgf-wp > logs/log.dl19-passage-hgf-wp 
2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-docTTTTTquery > logs/log.dl19-passage-docTTTTTquery 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-splade-pp-ed-onnx > logs/log.dl19-passage-splade-pp-ed-onnx 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-splade-pp-sd-onnx > logs/log.dl19-passage-splade-pp-sd-onnx 2>&1 + +python src/main/python/run_regression.py --search --regression dl19-passage-unicoil > logs/log.dl19-passage-unicoil 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-unicoil-noexp > logs/log.dl19-passage-unicoil-noexp 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-splade-distil-cocodenser-medium > logs/log.dl19-passage-splade-distil-cocodenser-medium 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-splade-pp-ed > logs/log.dl19-passage-splade-pp-ed 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-splade-pp-sd > logs/log.dl19-passage-splade-pp-sd 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-cos-dpr-distil-hnsw > logs/log.dl19-passage-cos-dpr-distil-hnsw 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-cos-dpr-distil-hnsw-int8 > logs/log.dl19-passage-cos-dpr-distil-hnsw-int8 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-cos-dpr-distil-hnsw-onnx > logs/log.dl19-passage-cos-dpr-distil-hnsw-onnx 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-cos-dpr-distil-hnsw-int8-onnx > logs/log.dl19-passage-cos-dpr-distil-hnsw-int8-onnx 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-cos-dpr-distil-fw > logs/log.dl19-passage-cos-dpr-distil-fw 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-cos-dpr-distil-lexlsh > logs/log.dl19-passage-cos-dpr-distil-lexlsh 2>&1 +python src/main/python/run_regression.py --search --regression dl19-passage-openai-ada2 > logs/log.dl19-passage-openai-ada2 2>&1 + +python src/main/python/run_regression.py --search --regression dl19-doc > logs/log.dl19-doc 2>&1 +python src/main/python/run_regression.py --search --regression dl19-doc-ca > logs/log.dl19-doc-ca 2>&1 +python src/main/python/run_regression.py --search --regression dl19-doc-wp > logs/log.dl19-doc-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl19-doc-hgf-wp > logs/log.dl19-doc-hgf-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl19-doc-segmented-wp > logs/log.dl19-doc-segmented-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl19-doc-docTTTTTquery > logs/log.dl19-doc-docTTTTTquery 2>&1 + +python src/main/python/run_regression.py --search --regression dl19-doc-segmented > logs/log.dl19-doc-segmented 2>&1 +python src/main/python/run_regression.py --search --regression dl19-doc-segmented-ca > logs/log.dl19-doc-segmented-ca 2>&1 +python src/main/python/run_regression.py --search --regression dl19-doc-segmented-docTTTTTquery > logs/log.dl19-doc-segmented-docTTTTTquery 2>&1 +python src/main/python/run_regression.py --search --regression dl19-doc-segmented-unicoil > logs/log.dl19-doc-segmented-unicoil 2>&1 +python src/main/python/run_regression.py --search --regression dl19-doc-segmented-unicoil-noexp > logs/log.dl19-doc-segmented-unicoil-noexp 2>&1 + +# DL20 +python src/main/python/run_regression.py 
--search --regression dl20-passage > logs/log.dl20-passage 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-bm25-b8 > logs/log.dl20-passage-bm25-b8 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-ca > logs/log.dl20-passage-ca 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-wp > logs/log.dl20-passage-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-hgf-wp > logs/log.dl20-passage-hgf-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-docTTTTTquery > logs/log.dl20-passage-docTTTTTquery 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-splade-pp-ed-onnx > logs/log.dl20-passage-splade-pp-ed-onnx 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-splade-pp-sd-onnx > logs/log.dl20-passage-splade-pp-sd-onnx 2>&1 + +python src/main/python/run_regression.py --search --regression dl20-passage-unicoil > logs/log.dl20-passage-unicoil 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-unicoil-noexp > logs/log.dl20-passage-unicoil-noexp 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-splade-distil-cocodenser-medium > logs/log.dl20-passage-splade-distil-cocodenser-medium 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-splade-pp-ed > logs/log.dl20-passage-splade-pp-ed 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-splade-pp-sd > logs/log.dl20-passage-splade-pp-sd 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-cos-dpr-distil-hnsw > logs/log.dl20-passage-cos-dpr-distil-hnsw 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-cos-dpr-distil-hnsw-int8 > logs/log.dl20-passage-cos-dpr-distil-hnsw-int8 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-cos-dpr-distil-hnsw-onnx > logs/log.dl20-passage-cos-dpr-distil-hnsw-onnx 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-cos-dpr-distil-hnsw-int8-onnx > logs/log.dl20-passage-cos-dpr-distil-hnsw-int8-onnx 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-cos-dpr-distil-fw > logs/log.dl20-passage-cos-dpr-distil-fw 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-cos-dpr-distil-lexlsh > logs/log.dl20-passage-cos-dpr-distil-lexlsh 2>&1 +python src/main/python/run_regression.py --search --regression dl20-passage-openai-ada2 > logs/log.dl20-passage-openai-ada2 2>&1 + +python src/main/python/run_regression.py --search --regression dl20-doc > logs/log.dl20-doc 2>&1 +python src/main/python/run_regression.py --search --regression dl20-doc-ca > logs/log.dl20-doc-ca 2>&1 +python src/main/python/run_regression.py --search --regression dl20-doc-wp > logs/log.dl20-doc-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl20-doc-hgf-wp > logs/log.dl20-doc-hgf-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl20-doc-docTTTTTquery > logs/log.dl20-doc-docTTTTTquery 2>&1 + +python src/main/python/run_regression.py --search --regression dl20-doc-segmented > logs/log.dl20-doc-segmented 2>&1 +python src/main/python/run_regression.py --search --regression dl20-doc-segmented-ca > logs/log.dl20-doc-segmented-ca 2>&1 +python src/main/python/run_regression.py --search --regression 
dl20-doc-segmented-wp > logs/log.dl20-doc-segmented-wp 2>&1 +python src/main/python/run_regression.py --search --regression dl20-doc-segmented-docTTTTTquery > logs/log.dl20-doc-segmented-docTTTTTquery 2>&1 +python src/main/python/run_regression.py --search --regression dl20-doc-segmented-unicoil > logs/log.dl20-doc-segmented-unicoil 2>&1 +python src/main/python/run_regression.py --search --regression dl20-doc-segmented-unicoil-noexp > logs/log.dl20-doc-segmented-unicoil-noexp 2>&1 # DL21/22 -python src/main/python/run_regression.py --verify --search --regression dl21-passage > logs/log.dl21-passage 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-passage-d2q-t5 > logs/log.dl21-passage-d2q-t5 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-passage-augmented > logs/log.dl21-passage-augmented 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-passage-augmented-d2q-t5 > logs/log.dl21-passage-augmented-d2q-t5 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-passage-unicoil-noexp-0shot > logs/log.dl21-passage-unicoil-noexp-0shot 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-passage-unicoil-0shot > logs/log.dl21-passage-unicoil-0shot 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl21-passage-splade-pp-ed > logs/log.dl21-passage-splade-pp-ed 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-passage-splade-pp-sd > logs/log.dl21-passage-splade-pp-sd 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl21-doc > logs/log.dl21-doc 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-doc-d2q-t5 > logs/log.dl21-doc-d2q-t5 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented > logs/log.dl21-doc-segmented 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented-d2q-t5 > logs/log.dl21-doc-segmented-d2q-t5 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented-unicoil-noexp-0shot > logs/log.dl21-doc-segmented-unicoil-noexp-0shot 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented-unicoil-noexp-0shot-v2 > logs/log.dl21-doc-segmented-unicoil-noexp-0shot-v2 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented-unicoil-0shot > logs/log.dl21-doc-segmented-unicoil-0shot 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented-unicoil-0shot-v2 > logs/log.dl21-doc-segmented-unicoil-0shot-v2 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl22-passage > logs/log.dl22-passage 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl22-passage-d2q-t5 > logs/log.dl22-passage-d2q-t5 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl22-passage-augmented > logs/log.dl22-passage-augmented 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl22-passage-augmented-d2q-t5 > logs/log.dl22-passage-augmented-d2q-t5 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl22-passage-unicoil-noexp-0shot > logs/log.dl22-passage-unicoil-noexp-0shot 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl22-passage-unicoil-0shot > 
logs/log.dl22-passage-unicoil-0shot 2>&1 - -python src/main/python/run_regression.py --verify --search --regression dl22-passage-splade-pp-ed > logs/log.dl22-passage-splade-pp-ed 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl22-passage-splade-pp-sd > logs/log.dl22-passage-splade-pp-sd 2>&1 +python src/main/python/run_regression.py --search --regression dl21-passage > logs/log.dl21-passage 2>&1 +python src/main/python/run_regression.py --search --regression dl21-passage-d2q-t5 > logs/log.dl21-passage-d2q-t5 2>&1 +python src/main/python/run_regression.py --search --regression dl21-passage-augmented > logs/log.dl21-passage-augmented 2>&1 +python src/main/python/run_regression.py --search --regression dl21-passage-augmented-d2q-t5 > logs/log.dl21-passage-augmented-d2q-t5 2>&1 +python src/main/python/run_regression.py --search --regression dl21-passage-unicoil-noexp-0shot > logs/log.dl21-passage-unicoil-noexp-0shot 2>&1 +python src/main/python/run_regression.py --search --regression dl21-passage-unicoil-0shot > logs/log.dl21-passage-unicoil-0shot 2>&1 + +python src/main/python/run_regression.py --search --regression dl21-passage-splade-pp-ed > logs/log.dl21-passage-splade-pp-ed 2>&1 +python src/main/python/run_regression.py --search --regression dl21-passage-splade-pp-sd > logs/log.dl21-passage-splade-pp-sd 2>&1 + +python src/main/python/run_regression.py --search --regression dl21-doc > logs/log.dl21-doc 2>&1 +python src/main/python/run_regression.py --search --regression dl21-doc-d2q-t5 > logs/log.dl21-doc-d2q-t5 2>&1 +python src/main/python/run_regression.py --search --regression dl21-doc-segmented > logs/log.dl21-doc-segmented 2>&1 +python src/main/python/run_regression.py --search --regression dl21-doc-segmented-d2q-t5 > logs/log.dl21-doc-segmented-d2q-t5 2>&1 +python src/main/python/run_regression.py --search --regression dl21-doc-segmented-unicoil-noexp-0shot > logs/log.dl21-doc-segmented-unicoil-noexp-0shot 2>&1 +python src/main/python/run_regression.py --search --regression dl21-doc-segmented-unicoil-noexp-0shot-v2 > logs/log.dl21-doc-segmented-unicoil-noexp-0shot-v2 2>&1 +python src/main/python/run_regression.py --search --regression dl21-doc-segmented-unicoil-0shot > logs/log.dl21-doc-segmented-unicoil-0shot 2>&1 +python src/main/python/run_regression.py --search --regression dl21-doc-segmented-unicoil-0shot-v2 > logs/log.dl21-doc-segmented-unicoil-0shot-v2 2>&1 + +python src/main/python/run_regression.py --search --regression dl22-passage > logs/log.dl22-passage 2>&1 +python src/main/python/run_regression.py --search --regression dl22-passage-d2q-t5 > logs/log.dl22-passage-d2q-t5 2>&1 +python src/main/python/run_regression.py --search --regression dl22-passage-augmented > logs/log.dl22-passage-augmented 2>&1 +python src/main/python/run_regression.py --search --regression dl22-passage-augmented-d2q-t5 > logs/log.dl22-passage-augmented-d2q-t5 2>&1 +python src/main/python/run_regression.py --search --regression dl22-passage-unicoil-noexp-0shot > logs/log.dl22-passage-unicoil-noexp-0shot 2>&1 +python src/main/python/run_regression.py --search --regression dl22-passage-unicoil-0shot > logs/log.dl22-passage-unicoil-0shot 2>&1 + +python src/main/python/run_regression.py --search --regression dl22-passage-splade-pp-ed > logs/log.dl22-passage-splade-pp-ed 2>&1 +python src/main/python/run_regression.py --search --regression dl22-passage-splade-pp-sd > logs/log.dl22-passage-splade-pp-sd 2>&1 # CIRAL python src/main/python/run_regression.py --index --verify 
--search --regression ciral-v1.0-ha > logs/log.ciral-v1.0-ha 2>&1 diff --git a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.template b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.template new file mode 100644 index 0000000000..fd1fa8fa91 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.template @@ -0,0 +1,101 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. 
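Under the hood, the int8 variant swaps in the scalar-quantized HNSW vectors format that ships with Lucene 9.9; the `IndexHnswDenseVectors` changes earlier in this patch wire it up (and wrap it in a `DelegatingKnnVectorsFormat` to raise the maximum vector dimension to 4096). The following is a minimal editorial sketch of that writer configuration, not part of the patch; the `M`, `efC`, and index-path values are placeholders (the real values come from the regression YAML):

```java
import java.nio.file.Paths;

import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.store.FSDirectory;

public class Int8HnswIndexSketch {
  public static void main(String[] args) throws Exception {
    final int M = 16;     // placeholder HNSW max connections
    final int efC = 100;  // placeholder HNSW beam width at construction time

    IndexWriterConfig config = new IndexWriterConfig().setCodec(
        new Lucene99Codec() {
          @Override
          public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
            // int8 scalar quantization layered on top of the HNSW graph
            return new Lucene99HnswScalarQuantizedVectorsFormat(M, efC);
          }
        });
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    // Suppress segment merging entirely, as discussed in the notes below.
    config.setMergePolicy(NoMergePolicy.INSTANCE);

    try (IndexWriter writer =
             new IndexWriter(FSDirectory.open(Paths.get("indexes/sample-hnsw-int8")), config)) {
      // Documents with a KnnFloatVectorField would be added here.
    }
  }
}
```

The non-quantized regressions differ only in substituting `Lucene99HnswVectorsFormat` for the scalar-quantized format, as shown in the `IndexHnswDenseVectors` diff above.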
+ +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8.template b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8.template new file mode 100644 index 0000000000..0900647c40 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8.template @@ -0,0 +1,99 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using pre-encoded queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. 
+ +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). 
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-onnx.template b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-onnx.template index 355d54348f..07322676fc 100644 --- a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-onnx.template +++ b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-onnx.template @@ -6,7 +6,7 @@ This page describes regression experiments, integrated into Anserini's regressio > Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. -In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). +In these experiments, we are performing query inference "on-the-fly" with ONNX. Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). @@ -56,9 +56,11 @@ ${index_cmds} ``` The path `/path/to/${corpus}/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. @@ -71,6 +73,8 @@ After indexing has completed, you should be able to perform retrieval as follows ${ranking_cmds} ``` +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. 
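Whether the query vector is produced on the fly by ONNX or read from a cache of pre-encoded queries, retrieval itself reduces to a Lucene k-NN query against the dense vector field. A rough editorial sketch under assumptions, not part of the patch: the field name `vector`, the 768-dimensional query vector, and the index path below are illustrative, and Anserini's own search driver handles query encoding, batching, and TREC-format output.

```java
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.KnnFloatVectorQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class HnswSearchSketch {
  public static void main(String[] args) throws Exception {
    // In the ONNX regressions this vector would come from on-the-fly query inference;
    // in the pre-encoded regressions it would be read from the cached topic file.
    float[] queryVector = new float[768];

    try (DirectoryReader reader =
             DirectoryReader.open(FSDirectory.open(Paths.get("indexes/sample-hnsw-int8")))) {
      IndexSearcher searcher = new IndexSearcher(reader);
      // "vector" is an assumed dense-vector field name; retrieve to depth 1000.
      TopDocs hits = searcher.search(new KnnFloatVectorQuery("vector", queryVector, 1000), 1000);
      for (ScoreDoc sd : hits.scoreDocs) {
        System.out.println(sd.doc + " " + sd.score);
      }
    }
  }
}
```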
+ Evaluation can be performed using `trec_eval`: ```bash diff --git a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw.template b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw.template index 81c68f7ec2..a1839cf6df 100644 --- a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw.template +++ b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw.template @@ -56,9 +56,11 @@ ${index_cmds} ``` The path `/path/to/${corpus}/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template b/src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template new file mode 100644 index 0000000000..99454b9d31 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template @@ -0,0 +1,101 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors. + +**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 109 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/src/main/resources/docgen/templates/dl19-passage-openai-ada2.template b/src/main/resources/docgen/templates/dl19-passage-openai-ada2.template index 15fe5a605a..f84f7d2d1d 100644 --- a/src/main/resources/docgen/templates/dl19-passage-openai-ada2.template +++ b/src/main/resources/docgen/templates/dl19-passage-openai-ada2.template @@ -56,9 +56,11 @@ ${index_cmds} ``` The path `/path/to/${corpus}/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.template b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.template new file mode 100644 index 0000000000..9179919a51 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.template @@ -0,0 +1,101 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). 
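As a concrete illustration of the convention above, with hypothetical qrels and run file names standing in for whatever `${eval_cmds}` expands to: AP is computed with `-l 2` so that grade-1 passages do not count as relevant, while nDCG@10 uses all relevance grades.

```bash
# Hypothetical file names, for illustration only.
trec_eval -c -l 2 -m map qrels.dl20-passage.txt run.dl20-passage.cos-dpr-distil.txt
trec_eval -c -m ndcg_cut.10 qrels.dl20-passage.txt run.dl20-passage.cos-dpr-distil.txt
```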
+ +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8.template b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8.template new file mode 100644 index 0000000000..e9e46d5fd7 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8.template @@ -0,0 +1,99 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using pre-encoded queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. 
+Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-onnx.template b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-onnx.template index 36fee4a8b0..e5b80bf511 100644 --- a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-onnx.template +++ b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-onnx.template @@ -6,7 +6,7 @@ This page describes regression experiments, integrated into Anserini's regressio > Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + -In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). +In these experiments, we are performing query inference "on-the-fly" with ONNX.
Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). @@ -56,9 +56,11 @@ ${index_cmds} ``` The path `/path/to/${corpus}/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. @@ -71,6 +73,8 @@ After indexing has completed, you should be able to perform retrieval as follows ${ranking_cmds} ``` +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + Evaluation can be performed using `trec_eval`: ```bash diff --git a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw.template b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw.template index e3f6969a9f..2ec64f9a41 100644 --- a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw.template +++ b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw.template @@ -56,9 +56,11 @@ ${index_cmds} ``` The path `/path/to/${corpus}/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template b/src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template new file mode 100644 index 0000000000..eea224f03e --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template @@ -0,0 +1,101 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors. + +**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with the MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 109 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html).
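Since the topics and qrels come from the anserini-tools submodule, make sure it has been fetched before running retrieval; the pre-encoded OpenAI-ada2 queries for DL20 are distributed as gzipped JSONL. A quick sanity check, assuming the submodule is mounted at `tools/` (the exact paths may differ in your checkout):

```bash
# Fetch the anserini-tools submodule (topics and qrels) if it is not already present.
git submodule update --init --recursive

# Peek at one pre-encoded query vector; the path under tools/ is an assumption.
gzip -cd tools/topics-and-qrels/topics.dl20-passage.openai-ada2.jsonl.gz | head -n 1
```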
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage-openai-ada2.template b/src/main/resources/docgen/templates/dl20-passage-openai-ada2.template index 4cd598d31b..069c3d41fe 100644 --- a/src/main/resources/docgen/templates/dl20-passage-openai-ada2.template +++ b/src/main/resources/docgen/templates/dl20-passage-openai-ada2.template @@ -56,9 +56,11 @@ ${index_cmds} ``` The path `/path/to/${corpus}/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.template b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.template new file mode 100644 index 0000000000..a7e21103f1 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.template @@ -0,0 +1,93 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). 
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}).
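As a rough illustration of that tolerance, the sketch below compares a freshly computed RR@10 against the 0.388 reference recorded in the YAML configuration for this regression, allowing the usual 0.005 drift; the run file name is hypothetical and the qrels path assumes the standard `tools/topics-and-qrels/` layout:

```bash
# Hedged sketch: check that RR@10 falls within 0.388 +/- 0.005.
# Run file name and qrels path are illustrative assumptions.
score=$(tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank \
  tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt \
  runs/run.msmarco-passage.cos-dpr-distil-hnsw-int8-onnx.txt | cut -f3)
awk -v s="$score" 'BEGIN { exit !(s >= 0.383 && s <= 0.393) }' \
  && echo "RR@10 $score within tolerance" \
  || echo "RR@10 $score outside tolerance"
```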
+ +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8.template b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8.template new file mode 100644 index 0000000000..ae04c97479 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8.template @@ -0,0 +1,93 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with HNSW indexes (using pre-encoded queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. 
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. + ++ Results reproduced by [@yilinjz](https://github.com/yilinjz) on 2023-09-01 (commit [`4ae518b`](https://github.com/castorini/anserini/commit/4ae518bb284ebcba0b273a473bc8774735cb7d19)) \ No newline at end of file diff --git a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template index bbc6f5a298..cad852f311 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template +++ b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template @@ -6,7 +6,7 @@ This page describes regression experiments, integrated into Anserini's regressio > Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. -In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). +In these experiments, we are performing query inference "on-the-fly" with ONNX. The exact configurations for these regressions are stored in [this YAML file](${yaml}). Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. @@ -53,9 +53,11 @@ ${index_cmds} ``` The path `/path/to/${corpus}/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. @@ -67,6 +69,8 @@ After indexing has completed, you should be able to perform retrieval as follows ${ranking_cmds} ``` +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + Evaluation can be performed using `trec_eval`: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw.template b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw.template index 98b1ed5b42..f5d7400267 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw.template +++ b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw.template @@ -53,9 +53,11 @@ ${index_cmds} ``` The path `/path/to/${corpus}/` should point to the corpus downloaded above. - Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template b/src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template new file mode 100644 index 0000000000..b9e3a3c5e5 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template @@ -0,0 +1,94 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors. + +**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 109 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. +Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. +See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. + diff --git a/src/main/resources/docgen/templates/msmarco-passage-openai-ada2.template b/src/main/resources/docgen/templates/msmarco-passage-openai-ada2.template index c1b658c309..7bf567dda7 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-openai-ada2.template +++ b/src/main/resources/docgen/templates/msmarco-passage-openai-ada2.template @@ -53,9 +53,11 @@ ${index_cmds} ``` The path `/path/to/${corpus}/` should point to the corpus downloaded above.
- Upon completion, we should have an index with 8,841,823 documents. +Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments. +This is because merging index segments is a costly operation and not worthwhile given our query set. + ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.yaml b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.yaml new file mode 100644 index 0000000000..cae9fc745e --- /dev/null +++ b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-cos-dpr-distil +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: e20ffbc8b5e7f760af31298aefeaebbd + +index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ +index_type: hnsw +collection_class: JsonDenseVectorCollection +generator_class: HnswDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.txt + qrel: qrels.dl19-passage.txt + +models: + - name: cos-dpr-distil-hnsw + display: cosDPR-distil + type: hnsw + params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil + results: + AP@1000: + - 0.458 + nDCG@10: + - 0.717 + R@100: + - 0.605 + R@1000: + - 0.805 diff --git a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8.yaml b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8.yaml new file mode 100644 index 0000000000..596ff276ed --- /dev/null +++ b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-cos-dpr-distil +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: e20ffbc8b5e7f760af31298aefeaebbd + +index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ +index_type: hnsw +collection_class: JsonDenseVectorCollection +generator_class: HnswDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + 
can_combine: false + - metric: nDCG@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.cos-dpr-distil.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: cos-dpr-distil-hnsw + display: cosDPR-distil + type: hnsw + params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 + results: + AP@1000: + - 0.458 + nDCG@10: + - 0.717 + R@100: + - 0.605 + R@1000: + - 0.805 diff --git a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml index 31966a5457..8422854271 100644 --- a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml +++ b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 65536 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge metrics: - metric: AP@1000 diff --git a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml index 8b38c5c1b2..fa55081708 100644 --- a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml +++ b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 65536 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge metrics: - metric: AP@1000 diff --git a/src/main/resources/regression/dl19-passage-openai-ada2-int8.yaml b/src/main/resources/regression/dl19-passage-openai-ada2-int8.yaml new file mode 100644 index 0000000000..f05d1f2290 --- /dev/null +++ b/src/main/resources/regression/dl19-passage-openai-ada2-int8.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-openai-ada2 +corpus_path: collections/msmarco/msmarco-passage-openai-ada2/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar +download_checksum: a4d843d522ff3a3af7edbee789a63402 + +index_path: indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ +index_type: hnsw +collection_class: JsonDenseVectorCollection +generator_class: HnswDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.100 -c 
-l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.openai-ada2.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: openai-ada2 + display: OpenAI-ada2 + type: hnsw + params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 + results: + AP@1000: + - 0.479 + nDCG@10: + - 0.704 + R@100: + - 0.624 + R@1000: + - 0.857 diff --git a/src/main/resources/regression/dl19-passage-openai-ada2.yaml b/src/main/resources/regression/dl19-passage-openai-ada2.yaml index 9667d0907c..2c4b0796f7 100644 --- a/src/main/resources/regression/dl19-passage-openai-ada2.yaml +++ b/src/main/resources/regression/dl19-passage-openai-ada2.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 65536 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge metrics: - metric: AP@1000 diff --git a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.yaml b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.yaml new file mode 100644 index 0000000000..4e64494b59 --- /dev/null +++ b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-cos-dpr-distil +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: e20ffbc8b5e7f760af31298aefeaebbd + +index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ +index_type: hnsw +collection_class: JsonDenseVectorCollection +generator_class: HnswDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.txt + qrel: qrels.dl20-passage.txt + +models: + - name: cos-dpr-distil-hnsw + display: cosDPR-distil + type: hnsw + params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil + results: + AP@1000: + - 0.482 + nDCG@10: + - 0.701 + R@100: + - 0.712 + R@1000: + - 0.843 diff --git a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8.yaml b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8.yaml new file mode 100644 index 
0000000000..98968a983f --- /dev/null +++ b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-cos-dpr-distil +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: e20ffbc8b5e7f760af31298aefeaebbd + +index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ +index_type: hnsw +collection_class: JsonDenseVectorCollection +generator_class: HnswDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.cos-dpr-distil.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: cos-dpr-distil-hnsw + display: cosDPR-distil + type: hnsw + params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 + results: + AP@1000: + - 0.482 + nDCG@10: + - 0.701 + R@100: + - 0.712 + R@1000: + - 0.843 diff --git a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml index d2e0f89991..055abefa4b 100644 --- a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml +++ b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 65536 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge metrics: - metric: AP@1000 diff --git a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml index f120eadd61..f149679351 100644 --- a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml +++ b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 65536 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge metrics: - metric: AP@1000 diff --git a/src/main/resources/regression/dl20-passage-openai-ada2-int8.yaml b/src/main/resources/regression/dl20-passage-openai-ada2-int8.yaml new file mode 100644 index 0000000000..6f26a14fe7 --- /dev/null +++ b/src/main/resources/regression/dl20-passage-openai-ada2-int8.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-openai-ada2 +corpus_path: collections/msmarco/msmarco-passage-openai-ada2/ + +download_url: 
https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar +download_checksum: a4d843d522ff3a3af7edbee789a63402 + +index_path: indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ +index_type: hnsw +collection_class: JsonDenseVectorCollection +generator_class: HnswDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20-passage.openai-ada2.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: openai-ada2 + display: OpenAI-ada2 + type: hnsw + params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 + results: + AP@1000: + - 0.477 + nDCG@10: + - 0.676 + R@100: + - 0.723 + R@1000: + - 0.867 diff --git a/src/main/resources/regression/dl20-passage-openai-ada2.yaml b/src/main/resources/regression/dl20-passage-openai-ada2.yaml index 152d18765c..ff7d16aa64 100644 --- a/src/main/resources/regression/dl20-passage-openai-ada2.yaml +++ b/src/main/resources/regression/dl20-passage-openai-ada2.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 65536 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge metrics: - metric: AP@1000 diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.yaml new file mode 100644 index 0000000000..1691c197c8 --- /dev/null +++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-cos-dpr-distil +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: e20ffbc8b5e7f760af31298aefeaebbd + +index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ +index_type: hnsw +collection_class: JsonDenseVectorCollection +generator_class: HnswDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + 
can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.txt + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: cos-dpr-distil-hnsw + display: cosDPR-distil + type: hnsw + params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil + results: + AP@1000: + - 0.393 + RR@10: + - 0.388 + R@100: + - 0.903 + R@1000: + - 0.974 diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml new file mode 100644 index 0000000000..6f93e3b017 --- /dev/null +++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-cos-dpr-distil +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: e20ffbc8b5e7f760af31298aefeaebbd + +index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ +index_type: hnsw +collection_class: JsonDenseVectorCollection +generator_class: HnswDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: cos-dpr-distil-hnsw + display: cosDPR-distil + type: hnsw + params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 + results: + AP@1000: + - 0.393 + RR@10: + - 0.388 + R@100: + - 0.903 + R@1000: + - 0.974 diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml index 4746f389f3..372d40a67b 100644 --- a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml +++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 65536 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge metrics: - metric: AP@1000 diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml 
b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml index c63c3b7972..a235e7823e 100644 --- a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml +++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 65536 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge metrics: - metric: AP@1000 diff --git a/src/main/resources/regression/msmarco-passage-openai-ada2-int8.yaml b/src/main/resources/regression/msmarco-passage-openai-ada2-int8.yaml new file mode 100644 index 0000000000..9332504916 --- /dev/null +++ b/src/main/resources/regression/msmarco-passage-openai-ada2-int8.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-openai-ada2 +corpus_path: collections/msmarco/msmarco-passage-openai-ada2/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar +download_checksum: a4d843d522ff3a3af7edbee789a63402 + +index_path: indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ +index_type: hnsw +collection_class: JsonDenseVectorCollection +generator_class: HnswDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.openai-ada2.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: openai-ada2 + display: OpenAI-ada2 + type: hnsw + params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 + results: + AP@1000: + - 0.350 + RR@10: + - 0.343 + R@100: + - 0.898 + R@1000: + - 0.985 diff --git a/src/main/resources/regression/msmarco-passage-openai-ada2.yaml b/src/main/resources/regression/msmarco-passage-openai-ada2.yaml index 08289ce2c0..5bd13b6d28 100644 --- a/src/main/resources/regression/msmarco-passage-openai-ada2.yaml +++ b/src/main/resources/regression/msmarco-passage-openai-ada2.yaml @@ -10,7 +10,7 @@ index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: HnswDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -memoryBuffer 65536 +index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge metrics: - metric: AP@1000 diff --git a/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java b/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java index bade9e0366..535475aee5 100644 --- a/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java +++ b/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java @@ -152,4 +152,26 @@ public void test1() throws 
Exception { assertNotNull(results); assertEquals(100, results.get("documents")); } + + @Test + public void testQuantizedInt8() throws Exception { + String indexPath = "target/idx-sample-hnsw" + System.currentTimeMillis(); + String[] indexArgs = new String[] { + "-collection", "JsonDenseVectorCollection", + "-input", "src/test/resources/sample_docs/openai_ada2/json_vector", + "-index", indexPath, + "-generator", "HnswDenseVectorDocumentGenerator", + "-threads", "1", + "-M", "16", "-efC", "100", "-quantize.int8" + }; + + IndexHnswDenseVectors.main(indexArgs); + + IndexReader reader = IndexReaderUtils.getReader(indexPath); + assertNotNull(reader); + + Map results = IndexReaderUtils.getIndexStats(reader, Constants.VECTOR); + assertNotNull(results); + assertEquals(100, results.get("documents")); + } } \ No newline at end of file diff --git a/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java b/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java index 683af4c779..2a9e2a9fab 100644 --- a/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java +++ b/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java @@ -305,13 +305,13 @@ public void testBasicAda2() throws Exception { "-hits", "5"}; SearchHnswDenseVectors.main(searchArgs); - TestUtils.checkFile(runfile, new String[] { + TestUtils.checkRunFileApproximate(runfile, new String[] { "160885 Q0 45 1 0.863064 Anserini", "160885 Q0 44 2 0.861596 Anserini", "160885 Q0 40 3 0.858651 Anserini", "160885 Q0 48 4 0.858514 Anserini", - "160885 Q0 41 5 0.856264 Anserini", - "867490 Q0 10 1 0.850332 Anserini", + "160885 Q0 41 5 0.856265 Anserini", + "867490 Q0 10 1 0.850331 Anserini", "867490 Q0 45 2 0.846281 Anserini", "867490 Q0 44 3 0.845236 Anserini", "867490 Q0 95 4 0.845013 Anserini", @@ -393,7 +393,7 @@ public void testBasicWithOnnx() throws Exception { SearchHnswDenseVectors.main(searchArgs); // Note output is slightly different from pre-encoded query vectors. - TestUtils.checkFile(runfile, new String[] { + TestUtils.checkRunFileApproximate(runfile, new String[] { "2 Q0 208 1 0.578723 Anserini", "2 Q0 224 2 0.578716 Anserini", "2 Q0 384 3 0.573913 Anserini", @@ -437,7 +437,7 @@ public void testRemoveQuery() throws Exception { "-removeQuery"}; SearchHnswDenseVectors.main(searchArgs); - TestUtils.checkFile(runfile, new String[] { + TestUtils.checkRunFileApproximate(runfile, new String[] { "10 Q0 45 1 0.846281 Anserini", "10 Q0 44 2 0.845236 Anserini", "10 Q0 95 3 0.845013 Anserini", @@ -480,7 +480,7 @@ public void testPassage() throws Exception { "-hits", "10"}; SearchHnswDenseVectors.main(searchArgs); - TestUtils.checkFile(runfile, new String[] { + TestUtils.checkRunFileApproximate(runfile, new String[] { "160885 Q0 44 1 0.863064 Anserini", "160885 Q0 40 2 0.858651 Anserini", "160885 Q0 48 3 0.858514 Anserini",