From 104e70e7c61b38d3d5a3d9d6c82f81f0c8aa193c Mon Sep 17 00:00:00 2001 From: Jimmy Lin Date: Sun, 24 Oct 2021 10:28:42 -0400 Subject: [PATCH] Add script to verify (download) prebuilt indexes (#832) + Add a few mirror locations of indexes in CS Vault + Refactoring: BM25_INDEX_INFO -> TF_INDEX_INFO --- pyserini/index/_base.py | 8 +++--- pyserini/prebuilt_index_info.py | 7 ++++- pyserini/util.py | 14 +++++----- scripts/verify_prebuilt_indexes.py | 41 ++++++++++++++++++++++++++++++ 4 files changed, 58 insertions(+), 12 deletions(-) create mode 100644 scripts/verify_prebuilt_indexes.py diff --git a/pyserini/index/_base.py b/pyserini/index/_base.py index 5c95756193..49614c91f2 100644 --- a/pyserini/index/_base.py +++ b/pyserini/index/_base.py @@ -28,7 +28,7 @@ from ..pyclass import autoclass, JString from ..search import Document from pyserini.util import download_prebuilt_index, get_sparse_indexes_info -from pyserini.prebuilt_index_info import BM25_INDEX_INFO +from pyserini.prebuilt_index_info import TF_INDEX_INFO logger = logging.getLogger(__name__) @@ -182,13 +182,13 @@ def validate_prebuilt_index(cls, prebuilt_index_name: str): reader = cls.from_prebuilt_index(prebuilt_index_name) stats = reader.stats() - if stats['documents'] != BM25_INDEX_INFO[prebuilt_index_name]['documents']: + if stats['documents'] != TF_INDEX_INFO[prebuilt_index_name]['documents']: raise ValueError('"documents" does not match!') - if stats['unique_terms'] != BM25_INDEX_INFO[prebuilt_index_name]['unique_terms']: + if stats['unique_terms'] != TF_INDEX_INFO[prebuilt_index_name]['unique_terms']: raise ValueError('"unique_terms" does not match!') - if stats['total_terms'] != BM25_INDEX_INFO[prebuilt_index_name]['total_terms']: + if stats['total_terms'] != TF_INDEX_INFO[prebuilt_index_name]['total_terms']: raise ValueError('"total_terms" does not match!') print(reader.stats()) diff --git a/pyserini/prebuilt_index_info.py b/pyserini/prebuilt_index_info.py index 1de2717a33..8305051095 100644 --- 
a/pyserini/prebuilt_index_info.py +++ b/pyserini/prebuilt_index_info.py @@ -14,7 +14,7 @@ # limitations under the License. # -BM25_INDEX_INFO = { +TF_INDEX_INFO = { "cacm": { "description": "CACM corpus", "filename": "lucene-index.cacm.tar.gz", @@ -496,6 +496,7 @@ "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/lucene-index.msmarco-passage.deepimpact.20211012.58d286.readme.txt", "urls": [ "https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/lucene-index.msmarco-passage.deepimpact.20211012.58d286.tar.gz", + "https://vault.cs.uwaterloo.ca/s/FfwF6nB9M5sjTYk/download", ], "md5": "9938f5529fee5cdb405b8587746c9e93", "size compressed (bytes)": 1295216704, @@ -510,6 +511,7 @@ "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/lucene-index.msmarco-passage.unicoil-d2q.20211012.58d286.readme.txt", "urls": [ "https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/lucene-index.msmarco-passage.unicoil-d2q.20211012.58d286.tar.gz", + "https://vault.cs.uwaterloo.ca/s/LGoAAXM7ZEbyQ7y/download" ], "md5": "4a8cb3b86a0d9085a0860c7f7bb7fe99", "size compressed (bytes)": 1205104390, @@ -524,6 +526,7 @@ "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/lucene-index.msmarco-passage.unicoil-tilde.20211012.58d286.readme.txt", "urls": [ "https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/lucene-index.msmarco-passage.unicoil-tilde.20211012.58d286.tar.gz", + "https://vault.cs.uwaterloo.ca/s/KdoNXqwAtTmTeNo/download" ], "md5": "cc19cfe241053f5a303f7f05a7ac40a5", "size compressed (bytes)": 1935108302, @@ -538,6 +541,7 @@ "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage.unicoil-noexp-0shot.20211012.58d286.readme.txt", "urls": [ 
"https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/lucene-index.msmarco-v2-passage.unicoil-noexp-0shot.20211012.58d286.tar.gz", + "https://vault.cs.uwaterloo.ca/s/eXA2BHF8WQjdY8R/download" ], "md5": "8886a8d9599838bc6d8d61464da61086", "size compressed (bytes)": 14801476783, @@ -552,6 +556,7 @@ "readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage.unicoil-noexp-0shot.20211012.58d286.readme.txt", "urls": [ "https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/lucene-index.msmarco-v2-passage.unicoil-tilde.20211012.58d286.tar.gz", + "https://vault.cs.uwaterloo.ca/s/oGQ8tWifozPaHLK/download" ], "md5": "562f9534eefe04ab8c07beb304074d41", "size compressed (bytes)": 31168302160, diff --git a/pyserini/util.py b/pyserini/util.py index a7fce67e93..c3494241d1 100644 --- a/pyserini/util.py +++ b/pyserini/util.py @@ -27,7 +27,7 @@ from pyserini.encoded_query_info import QUERY_INFO from pyserini.evaluate_script_info import EVALUATION_INFO -from pyserini.prebuilt_index_info import BM25_INDEX_INFO, FAISS_INDEX_INFO, IMPACT_INDEX_INFO +from pyserini.prebuilt_index_info import TF_INDEX_INFO, FAISS_INDEX_INFO, IMPACT_INDEX_INFO # https://gist.github.com/leimao/37ff6e990b3226c2c9670a2cd1e4a6f5 @@ -158,8 +158,8 @@ def download_and_unpack_index(url, index_directory='indexes', local_filename=Fal def check_downloaded(index_name): - if index_name in BM25_INDEX_INFO: - target_index = BM25_INDEX_INFO[index_name] + if index_name in TF_INDEX_INFO: + target_index = TF_INDEX_INFO[index_name] elif index_name in IMPACT_INDEX_INFO: target_index = IMPACT_INDEX_INFO[index_name] else: @@ -175,7 +175,7 @@ def check_downloaded(index_name): def get_sparse_indexes_info(): - df = pd.DataFrame.from_dict(BM25_INDEX_INFO) + df = pd.DataFrame.from_dict(TF_INDEX_INFO) for index in df.keys(): df[index]['downloaded'] = check_downloaded(index) @@ -205,10 +205,10 @@ def get_dense_indexes_info(): def 
download_prebuilt_index(index_name, force=False, verbose=True, mirror=None): - if index_name not in BM25_INDEX_INFO and index_name not in FAISS_INDEX_INFO and index_name not in IMPACT_INDEX_INFO: + if index_name not in TF_INDEX_INFO and index_name not in FAISS_INDEX_INFO and index_name not in IMPACT_INDEX_INFO: raise ValueError(f'Unrecognized index name {index_name}') - if index_name in BM25_INDEX_INFO: - target_index = BM25_INDEX_INFO[index_name] + if index_name in TF_INDEX_INFO: + target_index = TF_INDEX_INFO[index_name] elif index_name in IMPACT_INDEX_INFO: target_index = IMPACT_INDEX_INFO[index_name] else: diff --git a/scripts/verify_prebuilt_indexes.py b/scripts/verify_prebuilt_indexes.py new file mode 100644 index 0000000000..541302d58d --- /dev/null +++ b/scripts/verify_prebuilt_indexes.py @@ -0,0 +1,41 @@ +# +# Pyserini: Reproducible IR research with sparse and dense representations +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
+import os
+import sys
+
+# Use Pyserini in this repo (as opposed to pip install)
+sys.path.insert(0, './')
+
+from pyserini.util import download_url
+from pyserini.prebuilt_index_info import TF_INDEX_INFO, IMPACT_INDEX_INFO, FAISS_INDEX_INFO
+
+
+def check(index):
+    """Fetch each URL of each entry in `index` (handing its md5 to download_url), then delete the file."""
+    for name, info in index.items():
+        print(f'# Checking "{name}"...')
+        expected_md5 = info['md5']
+        for mirror in info['urls']:
+            local_path = download_url(mirror, '.', md5=expected_md5)
+            print(f'Finished downloading to {local_path}, cleaning up.')
+            os.remove(local_path)
+        print('\n')
+
+
+if __name__ == '__main__':
+    for index_info in (TF_INDEX_INFO, IMPACT_INDEX_INFO, FAISS_INDEX_INFO):
+        check(index_info)