Support KILT for Pyserini's h/d/search (#405)

castorini · Apr 29, 2021 · ecfed61 · ecfed61
1 parent 737fc8b
commit ecfed61
Show file tree

Hide file tree

Showing 33 changed files with 1,092 additions and 203 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,7 @@ pyserini/resources/jars/*.jar
 collections/*
 indexes/*
 .vscode/
+venv/
 # build directories from `python3 setup.py sdist bdist_wheel`
 build/
 dist/

diff --git a/docs/experiments-ance.md b/docs/experiments-ance.md
@@ -19,7 +19,7 @@ $ python -m pyserini.dsearch --topics msmarco-passage-dev-subset \
                              --batch-size 36 \
                              --threads 12 \
                              --output runs/run.msmarco-passage.ance.bf.tsv \
-                             --msmarco
+                             --output-format msmarco
 ```
 
 The option `--encoded-queries` specifies the use of encoded queries (i.e., queries that have already been converted into dense vectors and cached).
@@ -56,7 +56,7 @@ $ python -m pyserini.dsearch --topics msmarco-doc-dev \
                              --hits 1000 \
                              --max-passage \
                              --max-passage-hits 100 \
-                             --msmarco \
+                             --output-format msmarco \
                              --batch-size 36 \
                              --threads 12
 ```

diff --git a/docs/experiments-distilbert_kd.md b/docs/experiments-distilbert_kd.md
@@ -19,7 +19,7 @@ $ python -m pyserini.dsearch --topics msmarco-passage-dev-subset \
                              --batch-size 36 \
                              --threads 12 \
                              --output runs/run.msmarco-passage.distilbert-dot-margin_mse-T2.bf.tsv \
-                             --msmarco
+                             --output-format msmarco
 ```
 
 Replace `--encoded-queries` with `--encoder sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco` for on-the-fly query encoding.

diff --git a/docs/experiments-elastic.md b/docs/experiments-elastic.md
@@ -57,7 +57,7 @@ attention to: the official metric is MRR@100, so we want to only return the top
 format.
 
 ```bash
-python -m pyserini.search --msmarco --hits 100 \
+python -m pyserini.search --output-format msmarco --hits 100 \
   --topics msmarco-doc-dev \
   --index indexes/msmarco-doc/lucene-index-msmarco/ \
   --output runs/run.msmarco-doc.leaderboard-dev.elastic.txt \

diff --git a/docs/experiments-msmarco-doc.md b/docs/experiments-msmarco-doc.md
@@ -64,11 +64,11 @@ We can now perform retrieval using these queries:
 python -m pyserini.search --topics msmarco-doc-dev \
  --index indexes/lucene-index-msmarco-doc \
  --output runs/run.msmarco-doc.bm25tuned.txt \
- --bm25 --msmarco --hits 100 --k1 4.46 --b 0.82
+ --bm25 --output-format msmarco --hits 100 --k1 4.46 --b 0.82
 ```
 
 Here, we set the BM25 parameters to `k1=4.46`, `b=0.82` (tuned by grid search).
-The option `--msmarco` says to generate output in the MS MARCO output format.
+The option `--output-format msmarco` says to generate output in the MS MARCO output format.
 The option `--hits` specifies the number of documents to return per query.
 Note that for the [MS MARCO Document Ranking Leaderboard](https://microsoft.github.io/MSMARCO-Document-Ranking-Submissions/leaderboard/), the official metric is MRR@100, so submissions should only return 100 hits per query. 
 

diff --git a/docs/experiments-msmarco-passage.md b/docs/experiments-msmarco-passage.md
@@ -75,11 +75,11 @@ We can now perform retrieval using these queries:
 python -m pyserini.search --topics msmarco-passage-dev-subset \
  --index indexes/lucene-index-msmarco-passage \
  --output runs/run.msmarco-passage.bm25tuned.txt \
- --bm25 --msmarco --hits 1000 --k1 0.82 --b 0.68
+ --bm25 --output-format msmarco --hits 1000 --k1 0.82 --b 0.68
 ```
 
 Here, we set the BM25 parameters to `k1=0.82`, `b=0.68` (tuned by grid search).
-The option `--msmarco` says to generate output in the MS MARCO output format.
+The option `--output-format msmarco` says to generate output in the MS MARCO output format.
 The option `--hits` specifies the number of documents to return per query.
 Thus, the output file should have approximately 6980 × 1000 = 6.9M lines.
 

diff --git a/docs/experiments-sbert.md b/docs/experiments-sbert.md
@@ -11,7 +11,7 @@ $ python -m pyserini.dsearch --topics msmarco-passage-dev-subset \
                              --batch-size 36 \
                              --threads 12 \
                              --output runs/run.msmarco-passage.sbert.bf.tsv \
-                             --msmarco
+                             --output-format msmarco
 ```
 
 Replace `--encoded-queries` by `--encoder sentence-transformers/msmarco-distilbert-base-v3` for on-the-fly query encoding.
@@ -48,7 +48,7 @@ $ python -m pyserini.hsearch dense  --index msmarco-passage-sbert-bf \
                              run    --topics msmarco-passage-dev-subset \
                                     --output runs/run.msmarco-passage.sbert.bf.bm25.tsv \
                                     --batch-size 36 --threads 12 \
-                                    --msmarco
+                                    --output-format msmarco
 ```
 
 Replace `--encoded-queries` by `--encoder sentence-transformers/msmarco-distilbert-base-v3` for on-the-fly query encoding.

diff --git a/docs/experiments-tct_colbert.md b/docs/experiments-tct_colbert.md
@@ -31,7 +31,7 @@ $ python -m pyserini.dsearch --topics msmarco-passage-dev-subset \
                              --batch-size 36 \
                              --threads 12 \
                              --output runs/run.msmarco-passage.tct_colbert.bf.tsv \
-                             --msmarco
+                             --output-format msmarco
 ```
 
 Note that to ensure maximum reproducibility, by default Pyserini uses pre-computed query representations that are automatically downloaded.
@@ -67,7 +67,7 @@ Dense retrieval with TCT-ColBERT, HNSW index:
 $ python -m pyserini.dsearch --topics msmarco-passage-dev-subset \
                              --index msmarco-passage-tct_colbert-hnsw \
                              --output runs/run.msmarco-passage.tct_colbert.hnsw.tsv \
-                             --msmarco 
+                             --output-format msmarco 
 ```
 
 To evaluate:
@@ -102,7 +102,7 @@ $ python -m pyserini.hsearch dense  --index msmarco-passage-tct_colbert-bf \
                              run    --topics msmarco-passage-dev-subset \
                                     --output runs/run.msmarco-passage.tct_colbert.bf.bm25.tsv \
                                     --batch-size 36 --threads 12 \
-                                    --msmarco
+                                    --output-format msmarco
 ```
 
 To evaluate:
@@ -135,7 +135,7 @@ $ python -m pyserini.hsearch dense  --index msmarco-passage-tct_colbert-bf \
                              run    --topics msmarco-passage-dev-subset \
                                     --output runs/run.msmarco-passage.tct_colbert.bf.doc2queryT5.tsv \
                                     --batch-size 36 --threads 12 \
-                                    --msmarco
+                                    --output-format msmarco
 ```
 
 To evaluate:
@@ -180,7 +180,7 @@ $ python -m pyserini.dsearch --topics msmarco-doc-dev \
                              --hits 1000 \
                              --max-passage \
                              --max-passage-hits 100 \
-                             --msmarco \
+                             --output-format msmarco \
                              --batch-size 36 \
                              --threads 12
 ```
@@ -218,7 +218,7 @@ $ python -m pyserini.hsearch dense  --index msmarco-doc-tct_colbert-bf \
                                     --output runs/run.msmarco-doc.tct_colbert.bf.bm25.tsv \
                                     --hits 1000 --max-passage --max-passage-hits 100 \
                                     --batch-size 36 --threads 12 \
-                                    --msmarco
+                                    --output-format msmarco
 ```
 
 Replace `--encoded-queries` by `--encoder castorini/tct_colbert-msmarco` for on-the-fly query encoding.
@@ -251,7 +251,7 @@ $ python -m pyserini.hsearch dense  --index msmarco-doc-tct_colbert-bf \
                                     --output runs/run.msmarco-doc.tct_colbert.bf.doc2queryT5.tsv \
                                     --hits 1000 --max-passage --max-passage-hits 100 \
                                     --batch-size 36 --threads 12 \
-                                    --msmarco
+                                    --output-format msmarco
 ```
 
 Replace `--encoded-queries` by `--encoder castorini/tct_colbert-msmarco` for on-the-fly query encoding.

diff --git a/docs/pypi-reproduction.md b/docs/pypi-reproduction.md
@@ -24,7 +24,7 @@ P_30                  	all	0.3102
 MS MARCO passage ranking task, BM25 baseline:
 
 ```bash
-$ python -m pyserini.search --topics msmarco-passage-dev-subset --index msmarco-passage --output run.msmarco-passage.txt --bm25 --msmarco
+$ python -m pyserini.search --topics msmarco-passage-dev-subset --index msmarco-passage --output run.msmarco-passage.txt --bm25 --output-format msmarco
 ```
 
 Evaluation command:
@@ -40,7 +40,7 @@ QueriesRanked: 6980
 MS MARCO passage ranking task, BM25 baseline with [docTTTTTquery expansions](http://doc2query.ai/):
 
 ```bash
-$ python -m pyserini.search --topics msmarco-passage-dev-subset --index msmarco-passage-expanded --output run.msmarco-passage.expanded.txt --bm25 --msmarco
+$ python -m pyserini.search --topics msmarco-passage-dev-subset --index msmarco-passage-expanded --output run.msmarco-passage.expanded.txt --bm25 --output-format msmarco
 ```
 
 Evaluation command:
@@ -58,7 +58,7 @@ QueriesRanked: 6980
 MS MARCO document ranking task, BM25 baseline:
 
 ```bash
-$ python -m pyserini.search --topics msmarco-doc-dev --index msmarco-doc --output run.msmarco-doc.doc.txt --bm25 --hits 100 --msmarco
+$ python -m pyserini.search --topics msmarco-doc-dev --index msmarco-doc --output run.msmarco-doc.doc.txt --bm25 --hits 100 --output-format msmarco
 ```
 
 Evaluation command:
@@ -74,7 +74,7 @@ QueriesRanked: 5193
 MS MARCO document ranking task, BM25 baseline with [docTTTTTquery expansions](http://doc2query.ai/) (per-document):
 
 ```bash
-$ python -m pyserini.search --topics msmarco-doc-dev --index msmarco-doc-expanded-per-doc --output run.msmarco-doc.doc-expanded.txt --bm25 --hits 100 --msmarco
+$ python -m pyserini.search --topics msmarco-doc-dev --index msmarco-doc-expanded-per-doc --output run.msmarco-doc.doc-expanded.txt --bm25 --hits 100 --output-format msmarco
 ```
 
 Evaluation command:
@@ -90,7 +90,7 @@ QueriesRanked: 5193
 MS MARCO document ranking task, BM25 baseline, but with documents segmented into passages and selecting the best-scoring passage per document:
 
 ```bash
-$ python -m pyserini.search --topics msmarco-doc-dev --index msmarco-doc-per-passage --output run.msmarco-doc.passage.txt --bm25 --hits 1000 --max-passage --max-passage-hits 100 --msmarco
+$ python -m pyserini.search --topics msmarco-doc-dev --index msmarco-doc-per-passage --output run.msmarco-doc.passage.txt --bm25 --hits 1000 --max-passage --max-passage-hits 100 --output-format msmarco
 ```
 
 Evaluation command:
@@ -106,7 +106,7 @@ QueriesRanked: 5193
 MS MARCO document ranking task, BM25 baseline with [docTTTTTquery expansions](http://doc2query.ai/) (per-passage):
 
 ```bash
-$ python -m pyserini.search --topics msmarco-doc-dev --index msmarco-doc-expanded-per-passage --output run.msmarco-doc.passage-expanded.txt --bm25 --hits 1000 --max-passage --max-passage-hits 100 --msmarco
+$ python -m pyserini.search --topics msmarco-doc-dev --index msmarco-doc-expanded-per-passage --output run.msmarco-doc.passage-expanded.txt --bm25 --hits 1000 --max-passage --max-passage-hits 100 --output-format msmarco
 ```
 
 Evaluation command:

diff --git a/integrations/test_ance.py b/integrations/test_ance.py
@@ -43,7 +43,7 @@ def test_msmarco_passage_ance_bf_otf(self):
                              --batch-size {self.batch_size} \
                              --threads {self.threads} \
                              --output {output_file} \
-                             --msmarco'
+                             --output-format msmarco'
         cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
         status = os.system(cmd1)
         stdout, stderr = run_command(cmd2)
@@ -67,7 +67,7 @@ def test_msmarco_doc_ance_bf_otf(self):
                              --hits 1000 \
                              --max-passage \
                              --max-passage-hits 100 \
-                             --msmarco \
+                             --output-format msmarco \
                              --batch-size {self.batch_size} \
                              --threads {self.threads}'
         cmd2 = f'python -m pyserini.eval.msmarco_doc_eval --judgments msmarco-doc-dev --run {output_file}'

diff --git a/integrations/test_distilbert_kd.py b/integrations/test_distilbert_kd.py
@@ -44,7 +44,7 @@ def test_msmarco_passage_distilbert_kd_bf_otf(self):
                              --batch-size {self.batch_size} \
                              --threads {self.threads} \
                              --output {output_file} \
-                             --msmarco'
+                             --output-format msmarco'
         cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
         status = os.system(cmd1)
         stdout, stderr = run_command(cmd2)

diff --git a/integrations/test_kilt.py b/integrations/test_kilt.py
@@ -0,0 +1,70 @@
+#
+# Pyserini: Python interface to the Anserini IR toolkit built on Lucene
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Integration tests for KILT retrieval and evaluation."""
+
+import os
+import socket
+import unittest
+import re
+from integrations.utils import clean_files, run_command
+from pyserini.search import get_topics
+from pyserini.dsearch import QueryEncoder
+
+
+def parse_kilt_score(output, metric, digits=4):
+    """Extract the last reported value of ``metric`` from ``output``.
+
+    Lines are scanned from last to first. The first decimal number (a leading
+    ``0`` or ``1``, a dot, then digits) found on a line that mentions
+    ``metric`` is returned.
+
+    Args:
+        output: Text to scan, e.g. the captured stdout of an evaluation script.
+        metric: Substring identifying the line of interest (e.g. ``"Rprec"``).
+        digits: Number of decimal places the score is rounded to.
+
+    Returns:
+        The rounded score as a float, or ``None`` when no line mentioning
+        ``metric`` contains a matching number.
+    """
+    pattern = re.compile(r"[0-1]\.[0-9]*")
+    # Walk backwards so that the last report of the metric takes precedence.
+    for line in reversed(output.split('\n')):
+        if metric not in line:
+            continue
+        match = pattern.search(line)
+        # A line can mention the metric without carrying a score (a header,
+        # a log message); skip it instead of failing on ``None.group``.
+        if match is not None:
+            return round(float(match.group(0)), digits)
+    return None
+
+
+class TestSearchIntegration(unittest.TestCase):
+    """End-to-end check of KILT-format search followed by KILT evaluation."""
+
+    def setUp(self):
+        """Reset the temp-file list and pick thread/batch sizes for this host."""
+        self.temp_files = []
+        self.threads = 12
+        self.batch_size = 36
+
+        # Hard-code larger values for internal servers
+        if socket.gethostname().startswith(('damiano', 'orca')):
+            self.threads = 36
+            self.batch_size = 144
+
+    def test_kilt_search(self):
+        """Search fever-dev-kilt on wikipedia-kilt-doc and verify Rprec."""
+        run_file = 'test_run.fever-dev-kilt.jsonl'
+        # Registered before the search runs so tearDown sees it even on failure.
+        self.temp_files.append(run_file)
+        cmd1 = f'python -m pyserini.search --topics fever-dev-kilt \
+                             --topics-format kilt \
+                             --index wikipedia-kilt-doc \
+                             --output {run_file} \
+                             --output-format kilt \
+                             --threads {self.threads} \
+                             --batch-size {self.batch_size}'
+        status = os.system(cmd1)
+        self.assertEqual(status, 0)
+        cmd2 = f'python -m pyserini.eval.evaluate_kilt_retrieval {run_file} fever-dev-kilt --ks 1,100'
+        stdout, stderr = run_command(cmd2)
+        score = parse_kilt_score(stdout, "Rprec")
+        # Fail with a readable message when the evaluation printed no score,
+        # rather than letting assertAlmostEqual raise TypeError on None.
+        self.assertIsNotNone(score, 'Rprec score not found in evaluation output')
+        self.assertAlmostEqual(score, 0.3821, delta=0.0001)
+
+    def tearDown(self):
+        """Hand the recorded temp files to clean_files (presumably deletes them)."""
+        clean_files(self.temp_files)
+
+
+# Run the test suite when this file is executed directly as a script.
+if __name__ == '__main__':
+    unittest.main()
diff --git a/integrations/test_sbert.py b/integrations/test_sbert.py
@@ -44,7 +44,7 @@ def test_msmarco_passage_sbert_bf_otf(self):
                              --batch-size {self.batch_size} \
                              --threads {self.threads} \
                              --output {output_file} \
-                             --msmarco'
+                             --output-format msmarco'
         cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
         status = os.system(cmd1)
         stdout, stderr = run_command(cmd2)

diff --git a/integrations/test_simplesearcher_multithread.py b/integrations/test_simplesearcher_multithread.py
@@ -52,42 +52,42 @@ def test_msmarco_passage(self):
             index='msmarco-passage',
             topics='msmarco-passage-dev-subset')
         self.assertTrue(self.check_equal(checker,
-                                         'msmarco_passage', extras='--msmarco'))
+                                         'msmarco_passage', extras='--output-format msmarco'))
 
     def test_msmarco_passage_docTTTTTquery(self):
         checker = RunSimpleSearcher(
             index='msmarco-passage-expanded',
             topics='msmarco-passage-dev-subset')
         self.assertTrue(self.check_equal(checker,
-                                         'msmarco_passage_docTTTTTquery', extras='--msmarco'))
+                                         'msmarco_passage_docTTTTTquery', extras='--output-format msmarco'))
 
     def test_msmarco_doc(self):
         checker = RunSimpleSearcher(
             index='msmarco-doc',
             topics='msmarco-doc-dev')
         self.assertTrue(self.check_equal(checker, 'msmarco_doc',
-                                         extras='--hits 100 --msmarco'))
+                                         extras='--hits 100 --output-format msmarco'))
 
     def test_msmarco_doc_docTTTTTquery(self):
         checker = RunSimpleSearcher(
             index='msmarco-doc-expanded-per-doc',
             topics='msmarco-doc-dev')
         self.assertTrue(self.check_equal(checker, 'msmarco_doc_docTTTTTquery',
-                                         extras='--hits 100 --msmarco'))
+                                         extras='--hits 100 --output-format msmarco'))
 
     def test_msmarco_doc_per_passage(self):
         checker = RunSimpleSearcher(
             index='msmarco-doc-per-passage',
             topics='msmarco-doc-dev')
         self.assertTrue(self.check_equal(checker, 'msmarco_doc_per_passage',
-                                         extras='--hits 1000 --max-passage --max-passage-hits 100 --msmarco'))
+                                         extras='--hits 1000 --max-passage --max-passage-hits 100 --output-format msmarco'))
 
     def test_msmarco_doc_docTTTTTquery_passage(self):
         checker = RunSimpleSearcher(
             index='msmarco-doc-expanded-per-passage',
             topics='msmarco-doc-dev')
         self.assertTrue(self.check_equal(checker, 'msmarco_doc_docTTTTTquery_passage',
-                                         extras='--hits 1000 --max-passage --max-passage-hits 100 --msmarco'))
+                                         extras='--hits 1000 --max-passage --max-passage-hits 100 --output-format msmarco'))
 
     def tearDown(self):
         pass