Skip to content

Commit

Permalink
Add initial ONNX regressions for MS MARCO (#2094)
Browse files Browse the repository at this point in the history
There appears to be a concurrency bug associated with regressions, which we address for now by running
`python src/main/python/run_regression.py --search-pool 1 ...`; we should circle back and
look into it later.
  • Loading branch information
lintool authored Apr 5, 2023
1 parent cef91f3 commit a7df7fc
Show file tree
Hide file tree
Showing 6 changed files with 510 additions and 0 deletions.
93 changes: 93 additions & 0 deletions src/main/resources/regression/dl19-passage-splade-pp-ed-onnx.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
---
# Regression configuration: DL19 passage topics over the SPLADE++
# CoCondenser-EnsembleDistil corpus, with queries encoded by the encoder named
# in each model's `params` (-encoder SpladePlusPlusEnsembleDistil).
corpus: msmarco-passage-splade-pp-ed
corpus_path: collections/msmarco/msmarco-passage-splade-pp-ed

# Corpus tarball and its checksum (32 hex digits, presumably MD5 — confirm).
download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-ed.tar
download_checksum: e489133bdc54ee1e7c62a32aa582bc77

# Indexing settings and the index statistics recorded for this corpus.
index_path: indexes/lucene-index.msmarco-passage-splade-pp-ed/
collection_class: JsonVectorCollection
generator_class: DefaultLuceneDocumentGenerator
index_threads: 16
index_options: -impact -pretokenized -storeDocvectors
index_stats:
  documents: 8841823
  documents (non-empty): 8841823
  total terms: 52376261130

# Every metric is computed with trec_eval. AP and recall pass `-l 2`
# (trec_eval's minimum relevance level); nDCG@10 does not.
# NOTE(review): `separator`/`parse_index` presumably select the score field
# from trec_eval's tab-separated output — confirm against run_regression.py.
metrics:
  - metric: AP@1000
    command: tools/eval/trec_eval.9.0.4/trec_eval
    params: -m map -c -l 2
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: nDCG@10
    command: tools/eval/trec_eval.9.0.4/trec_eval
    params: -m ndcg_cut.10 -c
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@100
    command: tools/eval/trec_eval.9.0.4/trec_eval
    params: -m recall.100 -c -l 2
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@1000
    command: tools/eval/trec_eval.9.0.4/trec_eval
    params: -m recall.1000 -c -l 2
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false

# Topics and qrels; `path` and `qrel` are file names under the roots below.
topic_reader: TsvInt
topic_root: src/main/resources/topics-and-qrels/
qrels_root: src/main/resources/topics-and-qrels/
topics:
  # Display name is a Markdown link to the TREC 2019 Deep Learning Track page.
  - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2019.html)"
    id: dl19
    path: topics.dl19-passage.txt
    qrel: qrels.dl19-passage.txt

# Retrieval configurations and their reference scores. Keys under `results`
# match the metric names declared in `metrics`; each list holds one score per
# entry in `topics`.
models:
  - name: splade-pp-ed
    display: SPLADE++ CoCondenser-EnsembleDistil
    params: -impact -pretokenized -encoder SpladePlusPlusEnsembleDistil
    results:
      AP@1000:
        - 0.5050
      nDCG@10:
        - 0.7308
      R@100:
        - 0.6390
      R@1000:
        - 0.8728
  - name: rm3
    display: +RM3
    params: -impact -pretokenized -encoder SpladePlusPlusEnsembleDistil -rm3
    results:
      AP@1000:
        - 0.4995
      nDCG@10:
        - 0.6849
      R@100:
        - 0.6427
      R@1000:
        - 0.8684
  - name: rocchio
    display: +Rocchio
    params: -impact -pretokenized -encoder SpladePlusPlusEnsembleDistil -rocchio
    results:
      AP@1000:
        - 0.5140
      nDCG@10:
        - 0.7119
      R@100:
        - 0.6394
      R@1000:
        - 0.8799
93 changes: 93 additions & 0 deletions src/main/resources/regression/dl19-passage-splade-pp-sd-onnx.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
---
# Regression configuration: DL19 passage topics over the SPLADE++
# CoCondenser-SelfDistil corpus, with queries encoded by the encoder named in
# each model's `params` (-encoder SpladePlusPlusSelfDistil).
corpus: msmarco-passage-splade-pp-sd
corpus_path: collections/msmarco/msmarco-passage-splade-pp-sd

# Corpus tarball and its checksum (32 hex digits, presumably MD5 — confirm).
download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-sd.tar
download_checksum: cb7e264222f2bf2221dd2c9d28190be1

# Indexing settings and the index statistics recorded for this corpus.
index_path: indexes/lucene-index.msmarco-passage-splade-pp-sd/
collection_class: JsonVectorCollection
generator_class: DefaultLuceneDocumentGenerator
index_threads: 16
index_options: -impact -pretokenized -storeDocvectors
index_stats:
  documents: 8841823
  documents (non-empty): 8841823
  total terms: 55456660129

# Every metric is computed with trec_eval. AP and recall pass `-l 2`
# (trec_eval's minimum relevance level); nDCG@10 does not.
# NOTE(review): `separator`/`parse_index` presumably select the score field
# from trec_eval's tab-separated output — confirm against run_regression.py.
metrics:
  - metric: AP@1000
    command: tools/eval/trec_eval.9.0.4/trec_eval
    params: -m map -c -l 2
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: nDCG@10
    command: tools/eval/trec_eval.9.0.4/trec_eval
    params: -m ndcg_cut.10 -c
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@100
    command: tools/eval/trec_eval.9.0.4/trec_eval
    params: -m recall.100 -c -l 2
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@1000
    command: tools/eval/trec_eval.9.0.4/trec_eval
    params: -m recall.1000 -c -l 2
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false

# Topics and qrels; `path` and `qrel` are file names under the roots below.
topic_reader: TsvInt
topic_root: src/main/resources/topics-and-qrels/
qrels_root: src/main/resources/topics-and-qrels/
topics:
  # Display name is a Markdown link to the TREC 2019 Deep Learning Track page.
  - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2019.html)"
    id: dl19
    path: topics.dl19-passage.txt
    qrel: qrels.dl19-passage.txt

# Retrieval configurations and their reference scores. Keys under `results`
# match the metric names declared in `metrics`; each list holds one score per
# entry in `topics`.
models:
  - name: splade-pp-sd
    display: SPLADE++ CoCondenser-SelfDistil
    params: -impact -pretokenized -encoder SpladePlusPlusSelfDistil
    results:
      AP@1000:
        - 0.4998
      nDCG@10:
        - 0.7358
      R@100:
        - 0.6370
      R@1000:
        - 0.8761
  - name: rm3
    display: +RM3
    params: -impact -pretokenized -rm3 -encoder SpladePlusPlusSelfDistil
    results:
      AP@1000:
        - 0.4914
      nDCG@10:
        - 0.6989
      R@100:
        - 0.6456
      R@1000:
        - 0.8793
  - name: rocchio
    display: +Rocchio
    params: -impact -pretokenized -rocchio -encoder SpladePlusPlusSelfDistil
    results:
      AP@1000:
        - 0.5072
      nDCG@10:
        - 0.7156
      R@100:
        - 0.6570
      R@1000:
        - 0.8918
93 changes: 93 additions & 0 deletions src/main/resources/regression/dl20-passage-splade-pp-ed-onnx.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
---
# Regression configuration: DL20 passage topics over the SPLADE++
# CoCondenser-EnsembleDistil corpus, with queries encoded by the encoder named
# in each model's `params` (-encoder SpladePlusPlusEnsembleDistil).
corpus: msmarco-passage-splade-pp-ed
corpus_path: collections/msmarco/msmarco-passage-splade-pp-ed

# Corpus tarball and its checksum (32 hex digits, presumably MD5 — confirm).
download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-ed.tar
download_checksum: e489133bdc54ee1e7c62a32aa582bc77

# Indexing settings and the index statistics recorded for this corpus.
index_path: indexes/lucene-index.msmarco-passage-splade-pp-ed/
collection_class: JsonVectorCollection
generator_class: DefaultLuceneDocumentGenerator
index_threads: 16
index_options: -impact -pretokenized -storeDocvectors
index_stats:
  documents: 8841823
  documents (non-empty): 8841823
  total terms: 52376261130

# Every metric is computed with trec_eval. AP and recall pass `-l 2`
# (trec_eval's minimum relevance level); nDCG@10 does not.
# NOTE(review): `separator`/`parse_index` presumably select the score field
# from trec_eval's tab-separated output — confirm against run_regression.py.
metrics:
  - metric: AP@1000
    command: tools/eval/trec_eval.9.0.4/trec_eval
    params: -m map -c -l 2
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: nDCG@10
    command: tools/eval/trec_eval.9.0.4/trec_eval
    params: -m ndcg_cut.10 -c
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@100
    command: tools/eval/trec_eval.9.0.4/trec_eval
    params: -m recall.100 -c -l 2
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@1000
    command: tools/eval/trec_eval.9.0.4/trec_eval
    params: -m recall.1000 -c -l 2
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false

# Topics and qrels; `path` and `qrel` are file names under the roots below.
topic_reader: TsvInt
topic_root: src/main/resources/topics-and-qrels/
qrels_root: src/main/resources/topics-and-qrels/
topics:
  # Display name is a Markdown link to the TREC 2020 Deep Learning Track page.
  - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)"
    id: dl20
    path: topics.dl20.txt
    qrel: qrels.dl20-passage.txt

# Retrieval configurations and their reference scores. Keys under `results`
# match the metric names declared in `metrics`; each list holds one score per
# entry in `topics`.
models:
  - name: splade-pp-ed
    display: SPLADE++ CoCondenser-EnsembleDistil
    params: -impact -pretokenized -encoder SpladePlusPlusEnsembleDistil
    results:
      AP@1000:
        - 0.4999
      nDCG@10:
        - 0.7197
      R@100:
        - 0.7653
      R@1000:
        - 0.8998
  - name: rm3
    display: +RM3
    params: -impact -pretokenized -rm3 -encoder SpladePlusPlusEnsembleDistil
    results:
      AP@1000:
        - 0.5098
      nDCG@10:
        - 0.7145
      R@100:
        - 0.7555
      R@1000:
        - 0.9046
  - name: rocchio
    display: +Rocchio
    params: -impact -pretokenized -rocchio -encoder SpladePlusPlusEnsembleDistil
    results:
      AP@1000:
        - 0.5084
      nDCG@10:
        - 0.7280
      R@100:
        - 0.7704
      R@1000:
        - 0.9069
93 changes: 93 additions & 0 deletions src/main/resources/regression/dl20-passage-splade-pp-sd-onnx.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
---
# Regression configuration: DL20 passage topics over the SPLADE++
# CoCondenser-SelfDistil corpus, with queries encoded by the encoder named in
# each model's `params` (-encoder SpladePlusPlusSelfDistil).
corpus: msmarco-passage-splade-pp-sd
corpus_path: collections/msmarco/msmarco-passage-splade-pp-sd

# Corpus tarball and its checksum (32 hex digits, presumably MD5 — confirm).
download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-sd.tar
download_checksum: cb7e264222f2bf2221dd2c9d28190be1

# Indexing settings and the index statistics recorded for this corpus.
index_path: indexes/lucene-index.msmarco-passage-splade-pp-sd/
collection_class: JsonVectorCollection
generator_class: DefaultLuceneDocumentGenerator
index_threads: 16
index_options: -impact -pretokenized -storeDocvectors
index_stats:
  documents: 8841823
  documents (non-empty): 8841823
  total terms: 55456660129

# Every metric is computed with trec_eval. AP and recall pass `-l 2`
# (trec_eval's minimum relevance level); nDCG@10 does not.
# NOTE(review): `separator`/`parse_index` presumably select the score field
# from trec_eval's tab-separated output — confirm against run_regression.py.
metrics:
  - metric: AP@1000
    command: tools/eval/trec_eval.9.0.4/trec_eval
    params: -m map -c -l 2
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: nDCG@10
    command: tools/eval/trec_eval.9.0.4/trec_eval
    params: -m ndcg_cut.10 -c
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@100
    command: tools/eval/trec_eval.9.0.4/trec_eval
    params: -m recall.100 -c -l 2
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@1000
    command: tools/eval/trec_eval.9.0.4/trec_eval
    params: -m recall.1000 -c -l 2
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false

# Topics and qrels; `path` and `qrel` are file names under the roots below.
topic_reader: TsvInt
topic_root: src/main/resources/topics-and-qrels/
qrels_root: src/main/resources/topics-and-qrels/
topics:
  # Display name is a Markdown link to the TREC 2020 Deep Learning Track page.
  - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)"
    id: dl20
    path: topics.dl20.txt
    qrel: qrels.dl20-passage.txt

# Retrieval configurations and their reference scores. Keys under `results`
# match the metric names declared in `metrics`; each list holds one score per
# entry in `topics`.
models:
  - name: splade-pp-sd
    display: SPLADE++ CoCondenser-SelfDistil
    params: -impact -pretokenized -encoder SpladePlusPlusSelfDistil
    results:
      AP@1000:
        - 0.5139
      nDCG@10:
        - 0.7282
      R@100:
        - 0.7512
      R@1000:
        - 0.9024
  - name: rm3
    display: +RM3
    params: -impact -pretokenized -rm3 -encoder SpladePlusPlusSelfDistil
    results:
      AP@1000:
        - 0.5266
      nDCG@10:
        - 0.7227
      R@100:
        - 0.7648
      R@1000:
        - 0.9174
  - name: rocchio
    display: +Rocchio
    params: -impact -pretokenized -rocchio -encoder SpladePlusPlusSelfDistil
    results:
      AP@1000:
        - 0.5335
      nDCG@10:
        - 0.7388
      R@100:
        - 0.7656
      R@1000:
        - 0.9120
Loading

0 comments on commit a7df7fc

Please sign in to comment.