castorini · lintool · Feb 12, 2024 · Jan 18, 2024 · Jan 19, 2024 · Jan 19, 2024
diff --git a/src/main/python/run_regression.py b/src/main/python/run_regression.py
@@ -204,8 +204,10 @@ def evaluate_and_verify(yaml_data, dry_run):
                 expected = round(model['results'][metric['metric']][i], metric['metric_precision'])
                 actual = round(float(eval_out), metric['metric_precision'])
 
+                using_hnsw = True if 'VectorQueryGenerator' in model['params'] or '-encoder' in model['params'] else False
+
-                # For HNSW, we only print to third digit
+                # For HNSW and ONNX ('-encoder') runs, we only print to the third digit
-                if 'VectorQueryGenerator' in model['params']:
+                if using_hnsw:
                     result_str = 'expected: {0:.3f} actual: {1:.3f} - metric: {2:<8} model: {3} topics: {4}'.format(
                         expected, actual, metric['metric'], model['name'], topic_set['id'])
                 else:
@@ -216,8 +218,8 @@ def evaluate_and_verify(yaml_data, dry_run):
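-                # For HNSW, be more tolerant, but as long as the actual score is higher than the expected score,
-                # let the test pass.
+                # For HNSW and ONNX ('-encoder') runs, be more tolerant: accept scores within 0.007 of the expected
+                # value, and also let the test pass whenever the actual score is higher than the expected score.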
                 if is_close(expected, actual) or \
-                        ('VectorQueryGenerator' in model['params'] and is_close(expected, actual, abs_tol=0.007)) or \
-                        ('VectorQueryGenerator' in model['params'] and actual > expected):
+                        (using_hnsw and is_close(expected, actual, abs_tol=0.007)) or \
+                        (using_hnsw and actual > expected):
                     logger.info(ok_str + result_str)
                 # For ONNX runs, increase tolerance a bit because we observe some minor differences across OSes.
                 elif '-encoder' in model['params'] and is_close(expected, actual, abs_tol=0.001):

diff --git a/src/main/resources/regression/beir-v1.0.0-arguana-splade-pp-ed-onnx.yaml b/src/main/resources/regression/beir-v1.0.0-arguana-splade-pp-ed-onnx.yaml
@@ -0,0 +1,55 @@
+---
+corpus: beir-v1.0.0-arguana-splade-pp-ed
+corpus_path: collections/beir-v1.0.0/splade-pp-ed/arguana
+
+index_path: indexes/lucene-index.beir-v1.0.0-arguana-splade-pp-ed/
+collection_class: JsonVectorCollection
+generator_class: DefaultLuceneDocumentGenerator
+index_threads: 16
+index_options: -impact -pretokenized
+index_stats:
+  documents: 8674
+  documents (non-empty): 8674
+  total terms: 71992355
+
+metrics:  # each `metric` name is the key run_regression.py looks up under a model's `results`
+  - metric: nDCG@10
+    command: target/appassembler/bin/trec_eval
+    params: -c -m ndcg_cut.10
+    separator: "\t"
+    parse_index: 2  # presumably the field holding the score after splitting on `separator` - TODO confirm
+    metric_precision: 4  # decimals that expected and actual scores are rounded to before comparison
+    can_combine: false
+  - metric: R@100
+    command: target/appassembler/bin/trec_eval
+    params: -c -m recall.100
+    separator: "\t"
+    parse_index: 2
+    metric_precision: 4
+    can_combine: false
+  - metric: R@1000
+    command: target/appassembler/bin/trec_eval
+    params: -c -m recall.1000
+    separator: "\t"
+    parse_index: 2
+    metric_precision: 4
+    can_combine: false
+
+topic_reader: TsvString
+topics:
+  - name: "BEIR (v1.0.0): ArguAna"
+    id: test  # shown as "topics: <id>" in each regression result line
+    path: topics.beir-v1.0.0-arguana.test.tsv.gz
+    qrel: qrels.beir-v1.0.0-arguana.test.txt
+
+models:  # params containing '-encoder' get the relaxed score check in run_regression.py
+  - name: splade-pp-ed
+    display: SPLADE++ (CoCondenser-EnsembleDistil)
+    params: -impact -pretokenized -removeQuery -hits 1000 -encoder SpladePlusPlusEnsembleDistil
+    results:  # expected scores; the evaluated score is rounded and checked against these
+      nDCG@10:
+        - 0.5203
+      R@100:
+        - 0.9744
+      R@1000:
+        - 0.9950
diff --git a/src/main/resources/regression/beir-v1.0.0-bioasq-splade-pp-ed-onnx.yaml b/src/main/resources/regression/beir-v1.0.0-bioasq-splade-pp-ed-onnx.yaml
@@ -0,0 +1,55 @@
+---
+corpus: beir-v1.0.0-bioasq-splade-pp-ed
+corpus_path: collections/beir-v1.0.0/splade-pp-ed/bioasq
+
+index_path: indexes/lucene-index.beir-v1.0.0-bioasq-splade-pp-ed/
+collection_class: JsonVectorCollection
+generator_class: DefaultLuceneDocumentGenerator
+index_threads: 16
+index_options: -impact -pretokenized
+index_stats:
+  documents: 14914603
+  documents (non-empty): 14914603
+  total terms: 127381306317
+
+metrics:  # each `metric` name is the key run_regression.py looks up under a model's `results`
+  - metric: nDCG@10
+    command: target/appassembler/bin/trec_eval
+    params: -c -m ndcg_cut.10
+    separator: "\t"
+    parse_index: 2  # presumably the field holding the score after splitting on `separator` - TODO confirm
+    metric_precision: 4  # decimals that expected and actual scores are rounded to before comparison
+    can_combine: false
+  - metric: R@100
+    command: target/appassembler/bin/trec_eval
+    params: -c -m recall.100
+    separator: "\t"
+    parse_index: 2
+    metric_precision: 4
+    can_combine: false
+  - metric: R@1000
+    command: target/appassembler/bin/trec_eval
+    params: -c -m recall.1000
+    separator: "\t"
+    parse_index: 2
+    metric_precision: 4
+    can_combine: false
+
+topic_reader: TsvString
+topics:
+  - name: "BEIR (v1.0.0): BioASQ"
+    id: test  # shown as "topics: <id>" in each regression result line
+    path: topics.beir-v1.0.0-bioasq.test.tsv.gz
+    qrel: qrels.beir-v1.0.0-bioasq.test.txt
+
+models:  # params containing '-encoder' get the relaxed score check in run_regression.py
+  - name: splade-pp-ed
+    display: SPLADE++ (CoCondenser-EnsembleDistil)
+    params: -impact -pretokenized -removeQuery -hits 1000 -encoder SpladePlusPlusEnsembleDistil
+    results:  # expected scores; the evaluated score is rounded and checked against these
+      nDCG@10:
+        - 0.4980
+      R@100:
+        - 0.7385
+      R@1000:
+        - 0.8757
diff --git a/src/main/resources/regression/beir-v1.0.0-climate-fever-splade-pp-ed-onnx.yaml b/src/main/resources/regression/beir-v1.0.0-climate-fever-splade-pp-ed-onnx.yaml
@@ -0,0 +1,55 @@
+---
+corpus: beir-v1.0.0-climate-fever-splade-pp-ed
+corpus_path: collections/beir-v1.0.0/splade-pp-ed/climate-fever
+
+index_path: indexes/lucene-index.beir-v1.0.0-climate-fever-splade-pp-ed/
+collection_class: JsonVectorCollection
+generator_class: DefaultLuceneDocumentGenerator
+index_threads: 16
+index_options: -impact -pretokenized
+index_stats:
+  documents: 5416593
+  documents (non-empty): 5416593
+  total terms: 28498465299
+
+metrics:  # each `metric` name is the key run_regression.py looks up under a model's `results`
+  - metric: nDCG@10
+    command: target/appassembler/bin/trec_eval
+    params: -c -m ndcg_cut.10
+    separator: "\t"
+    parse_index: 2  # presumably the field holding the score after splitting on `separator` - TODO confirm
+    metric_precision: 4  # decimals that expected and actual scores are rounded to before comparison
+    can_combine: false
+  - metric: R@100
+    command: target/appassembler/bin/trec_eval
+    params: -c -m recall.100
+    separator: "\t"
+    parse_index: 2
+    metric_precision: 4
+    can_combine: false
+  - metric: R@1000
+    command: target/appassembler/bin/trec_eval
+    params: -c -m recall.1000
+    separator: "\t"
+    parse_index: 2
+    metric_precision: 4
+    can_combine: false
+
+topic_reader: TsvString
+topics:
+  - name: "BEIR (v1.0.0): Climate-FEVER"
+    id: test  # shown as "topics: <id>" in each regression result line
+    path: topics.beir-v1.0.0-climate-fever.test.tsv.gz
+    qrel: qrels.beir-v1.0.0-climate-fever.test.txt
+
+models:  # params containing '-encoder' get the relaxed score check in run_regression.py
+  - name: splade-pp-ed
+    display: SPLADE++ (CoCondenser-EnsembleDistil)
+    params: -impact -pretokenized -removeQuery -hits 1000 -encoder SpladePlusPlusEnsembleDistil
+    results:  # expected scores; the evaluated score is rounded and checked against these
+      nDCG@10:
+        - 0.2297
+      R@100:
+        - 0.5211
+      R@1000:
+        - 0.7183
diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-android-splade-pp-ed-onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-android-splade-pp-ed-onnx.yaml
@@ -0,0 +1,55 @@
+---
+corpus: beir-v1.0.0-cqadupstack-android-splade-pp-ed
+corpus_path: collections/beir-v1.0.0/splade-pp-ed/cqadupstack-android
+
+index_path: indexes/lucene-index.beir-v1.0.0-cqadupstack-android-splade-pp-ed/
+collection_class: JsonVectorCollection
+generator_class: DefaultLuceneDocumentGenerator
+index_threads: 16
+index_options: -impact -pretokenized
+index_stats:
+  documents: 22998
+  documents (non-empty): 22998
+  total terms: 108476959
+
+metrics:  # each `metric` name is the key run_regression.py looks up under a model's `results`
+  - metric: nDCG@10
+    command: target/appassembler/bin/trec_eval
+    params: -c -m ndcg_cut.10
+    separator: "\t"
+    parse_index: 2  # presumably the field holding the score after splitting on `separator` - TODO confirm
+    metric_precision: 4  # decimals that expected and actual scores are rounded to before comparison
+    can_combine: false
+  - metric: R@100
+    command: target/appassembler/bin/trec_eval
+    params: -c -m recall.100
+    separator: "\t"
+    parse_index: 2
+    metric_precision: 4
+    can_combine: false
+  - metric: R@1000
+    command: target/appassembler/bin/trec_eval
+    params: -c -m recall.1000
+    separator: "\t"
+    parse_index: 2
+    metric_precision: 4
+    can_combine: false
+
+topic_reader: TsvString
+topics:
+  - name: "BEIR (v1.0.0): CQADupStack-android"
+    id: test  # shown as "topics: <id>" in each regression result line
+    path: topics.beir-v1.0.0-cqadupstack-android.test.tsv.gz
+    qrel: qrels.beir-v1.0.0-cqadupstack-android.test.txt
+
+models:  # params containing '-encoder' get the relaxed score check in run_regression.py
+  - name: splade-pp-ed
+    display: SPLADE++ (CoCondenser-EnsembleDistil)
+    params: -impact -pretokenized -removeQuery -hits 1000 -encoder SpladePlusPlusEnsembleDistil
+    results:  # expected scores; the evaluated score is rounded and checked against these
+      nDCG@10:
+        - 0.3904
+      R@100:
+        - 0.7404
+      R@1000:
+        - 0.9064
diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-english-splade-pp-ed-onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-english-splade-pp-ed-onnx.yaml
@@ -0,0 +1,55 @@
+---
+corpus: beir-v1.0.0-cqadupstack-english-splade-pp-ed
+corpus_path: collections/beir-v1.0.0/splade-pp-ed/cqadupstack-english
+
+index_path: indexes/lucene-index.beir-v1.0.0-cqadupstack-english-splade-pp-ed/
+collection_class: JsonVectorCollection
+generator_class: DefaultLuceneDocumentGenerator
+index_threads: 16
+index_options: -impact -pretokenized
+index_stats:
+  documents: 40221
+  documents (non-empty): 40221
+  total terms: 158861979
+
+metrics:  # each `metric` name is the key run_regression.py looks up under a model's `results`
+  - metric: nDCG@10
+    command: target/appassembler/bin/trec_eval
+    params: -c -m ndcg_cut.10
+    separator: "\t"
+    parse_index: 2  # presumably the field holding the score after splitting on `separator` - TODO confirm
+    metric_precision: 4  # decimals that expected and actual scores are rounded to before comparison
+    can_combine: false
+  - metric: R@100
+    command: target/appassembler/bin/trec_eval
+    params: -c -m recall.100
+    separator: "\t"
+    parse_index: 2
+    metric_precision: 4
+    can_combine: false
+  - metric: R@1000
+    command: target/appassembler/bin/trec_eval
+    params: -c -m recall.1000
+    separator: "\t"
+    parse_index: 2
+    metric_precision: 4
+    can_combine: false
+
+topic_reader: TsvString
+topics:
+  - name: "BEIR (v1.0.0): CQADupStack-english"
+    id: test  # shown as "topics: <id>" in each regression result line
+    path: topics.beir-v1.0.0-cqadupstack-english.test.tsv.gz
+    qrel: qrels.beir-v1.0.0-cqadupstack-english.test.txt
+
+models:  # params containing '-encoder' get the relaxed score check in run_regression.py
+  - name: splade-pp-ed
+    display: SPLADE++ (CoCondenser-EnsembleDistil)
+    params: -impact -pretokenized -removeQuery -hits 1000 -encoder SpladePlusPlusEnsembleDistil
+    results:  # expected scores; the evaluated score is rounded and checked against these
+      nDCG@10:
+        - 0.4079
+      R@100:
+        - 0.6946
+      R@1000:
+        - 0.8454
diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming-splade-pp-ed-onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming-splade-pp-ed-onnx.yaml
@@ -0,0 +1,55 @@
+---
+corpus: beir-v1.0.0-cqadupstack-gaming-splade-pp-ed
+corpus_path: collections/beir-v1.0.0/splade-pp-ed/cqadupstack-gaming
+
+index_path: indexes/lucene-index.beir-v1.0.0-cqadupstack-gaming-splade-pp-ed/
+collection_class: JsonVectorCollection
+generator_class: DefaultLuceneDocumentGenerator
+index_threads: 16
+index_options: -impact -pretokenized
+index_stats:
+  documents: 45301
+  documents (non-empty): 45301
+  total terms: 197713644
+
+metrics:  # each `metric` name is the key run_regression.py looks up under a model's `results`
+  - metric: nDCG@10
+    command: target/appassembler/bin/trec_eval
+    params: -c -m ndcg_cut.10
+    separator: "\t"
+    parse_index: 2  # presumably the field holding the score after splitting on `separator` - TODO confirm
+    metric_precision: 4  # decimals that expected and actual scores are rounded to before comparison
+    can_combine: false
+  - metric: R@100
+    command: target/appassembler/bin/trec_eval
+    params: -c -m recall.100
+    separator: "\t"
+    parse_index: 2
+    metric_precision: 4
+    can_combine: false
+  - metric: R@1000
+    command: target/appassembler/bin/trec_eval
+    params: -c -m recall.1000
+    separator: "\t"
+    parse_index: 2
+    metric_precision: 4
+    can_combine: false
+
+topic_reader: TsvString
+topics:
+  - name: "BEIR (v1.0.0): CQADupStack-gaming"
+    id: test  # shown as "topics: <id>" in each regression result line
+    path: topics.beir-v1.0.0-cqadupstack-gaming.test.tsv.gz
+    qrel: qrels.beir-v1.0.0-cqadupstack-gaming.test.txt
+
+models:  # params containing '-encoder' get the relaxed score check in run_regression.py
+  - name: splade-pp-ed
+    display: SPLADE++ (CoCondenser-EnsembleDistil)
+    params: -impact -pretokenized -removeQuery -hits 1000 -encoder SpladePlusPlusEnsembleDistil
+    results:  # expected scores; the evaluated score is rounded and checked against these
+      nDCG@10:
+        - 0.4957
+      R@100:
+        - 0.8131
+      R@1000:
+        - 0.9221