Skip to content

Commit

Permalink
Add script to verify (download) prebuilt indexes (castorini#832)
Browse files Browse the repository at this point in the history
+ Add a few mirror locations of indexes in CS Vault
+ Refactoring: BM25_INDEX_INFO -> TF_INDEX_INFO
  • Loading branch information
lintool authored Oct 24, 2021
1 parent 5652718 commit 104e70e
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 12 deletions.
8 changes: 4 additions & 4 deletions pyserini/index/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from ..pyclass import autoclass, JString
from ..search import Document
from pyserini.util import download_prebuilt_index, get_sparse_indexes_info
from pyserini.prebuilt_index_info import BM25_INDEX_INFO
from pyserini.prebuilt_index_info import TF_INDEX_INFO

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -182,13 +182,13 @@ def validate_prebuilt_index(cls, prebuilt_index_name: str):
reader = cls.from_prebuilt_index(prebuilt_index_name)
stats = reader.stats()

if stats['documents'] != BM25_INDEX_INFO[prebuilt_index_name]['documents']:
if stats['documents'] != TF_INDEX_INFO[prebuilt_index_name]['documents']:
raise ValueError('"documents" does not match!')

if stats['unique_terms'] != BM25_INDEX_INFO[prebuilt_index_name]['unique_terms']:
if stats['unique_terms'] != TF_INDEX_INFO[prebuilt_index_name]['unique_terms']:
raise ValueError('"unique_terms" does not match!')

if stats['total_terms'] != BM25_INDEX_INFO[prebuilt_index_name]['total_terms']:
if stats['total_terms'] != TF_INDEX_INFO[prebuilt_index_name]['total_terms']:
raise ValueError('"total_terms" does not match!')

print(reader.stats())
Expand Down
7 changes: 6 additions & 1 deletion pyserini/prebuilt_index_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# limitations under the License.
#

BM25_INDEX_INFO = {
TF_INDEX_INFO = {
"cacm": {
"description": "CACM corpus",
"filename": "lucene-index.cacm.tar.gz",
Expand Down Expand Up @@ -496,6 +496,7 @@
"readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/lucene-index.msmarco-passage.deepimpact.20211012.58d286.readme.txt",
"urls": [
"https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/lucene-index.msmarco-passage.deepimpact.20211012.58d286.tar.gz",
"https://vault.cs.uwaterloo.ca/s/FfwF6nB9M5sjTYk/download",
],
"md5": "9938f5529fee5cdb405b8587746c9e93",
"size compressed (bytes)": 1295216704,
Expand All @@ -510,6 +511,7 @@
"readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/lucene-index.msmarco-passage.unicoil-d2q.20211012.58d286.readme.txt",
"urls": [
"https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/lucene-index.msmarco-passage.unicoil-d2q.20211012.58d286.tar.gz",
"https://vault.cs.uwaterloo.ca/s/LGoAAXM7ZEbyQ7y/download"
],
"md5": "4a8cb3b86a0d9085a0860c7f7bb7fe99",
"size compressed (bytes)": 1205104390,
Expand All @@ -524,6 +526,7 @@
"readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/lucene-index.msmarco-passage.unicoil-tilde.20211012.58d286.readme.txt",
"urls": [
"https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/lucene-index.msmarco-passage.unicoil-tilde.20211012.58d286.tar.gz",
"https://vault.cs.uwaterloo.ca/s/KdoNXqwAtTmTeNo/download"
],
"md5": "cc19cfe241053f5a303f7f05a7ac40a5",
"size compressed (bytes)": 1935108302,
Expand All @@ -538,6 +541,7 @@
"readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage.unicoil-noexp-0shot.20211012.58d286.readme.txt",
"urls": [
"https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/lucene-index.msmarco-v2-passage.unicoil-noexp-0shot.20211012.58d286.tar.gz",
"https://vault.cs.uwaterloo.ca/s/eXA2BHF8WQjdY8R/download"
],
"md5": "8886a8d9599838bc6d8d61464da61086",
"size compressed (bytes)": 14801476783,
Expand All @@ -552,6 +556,7 @@
"readme": "https://github.com/castorini/pyserini/blob/master/pyserini/resources/index-metadata/lucene-index.msmarco-v2-passage.unicoil-noexp-0shot.20211012.58d286.readme.txt",
"urls": [
"https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/lucene-index.msmarco-v2-passage.unicoil-tilde.20211012.58d286.tar.gz",
"https://vault.cs.uwaterloo.ca/s/oGQ8tWifozPaHLK/download"
],
"md5": "562f9534eefe04ab8c07beb304074d41",
"size compressed (bytes)": 31168302160,
Expand Down
14 changes: 7 additions & 7 deletions pyserini/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

from pyserini.encoded_query_info import QUERY_INFO
from pyserini.evaluate_script_info import EVALUATION_INFO
from pyserini.prebuilt_index_info import BM25_INDEX_INFO, FAISS_INDEX_INFO, IMPACT_INDEX_INFO
from pyserini.prebuilt_index_info import TF_INDEX_INFO, FAISS_INDEX_INFO, IMPACT_INDEX_INFO


# https://gist.github.com/leimao/37ff6e990b3226c2c9670a2cd1e4a6f5
Expand Down Expand Up @@ -158,8 +158,8 @@ def download_and_unpack_index(url, index_directory='indexes', local_filename=Fal


def check_downloaded(index_name):
if index_name in BM25_INDEX_INFO:
target_index = BM25_INDEX_INFO[index_name]
if index_name in TF_INDEX_INFO:
target_index = TF_INDEX_INFO[index_name]
elif index_name in IMPACT_INDEX_INFO:
target_index = IMPACT_INDEX_INFO[index_name]
else:
Expand All @@ -175,7 +175,7 @@ def check_downloaded(index_name):


def get_sparse_indexes_info():
df = pd.DataFrame.from_dict(BM25_INDEX_INFO)
df = pd.DataFrame.from_dict(TF_INDEX_INFO)
for index in df.keys():
df[index]['downloaded'] = check_downloaded(index)

Expand Down Expand Up @@ -205,10 +205,10 @@ def get_dense_indexes_info():


def download_prebuilt_index(index_name, force=False, verbose=True, mirror=None):
if index_name not in BM25_INDEX_INFO and index_name not in FAISS_INDEX_INFO and index_name not in IMPACT_INDEX_INFO:
if index_name not in TF_INDEX_INFO and index_name not in FAISS_INDEX_INFO and index_name not in IMPACT_INDEX_INFO:
raise ValueError(f'Unrecognized index name {index_name}')
if index_name in BM25_INDEX_INFO:
target_index = BM25_INDEX_INFO[index_name]
if index_name in TF_INDEX_INFO:
target_index = TF_INDEX_INFO[index_name]
elif index_name in IMPACT_INDEX_INFO:
target_index = IMPACT_INDEX_INFO[index_name]
else:
Expand Down
41 changes: 41 additions & 0 deletions scripts/verify_prebuilt_indexes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import sys

# Use Pyserini in this repo (as opposed to pip install)
sys.path.insert(0, './')

from pyserini.util import download_url
from pyserini.prebuilt_index_info import TF_INDEX_INFO, IMPACT_INDEX_INFO, FAISS_INDEX_INFO


def check(index):
    """Download every mirror of every entry in ``index``, then clean up.

    For each entry, every URL listed under ``'urls'`` is handed to
    ``download_url`` together with the entry's ``'md5'`` value, saving into the
    current directory. The path returned by ``download_url`` is deleted
    immediately afterwards.

    Args:
        index: Mapping from prebuilt-index name to a metadata dict that
            provides at least the keys ``'md5'`` and ``'urls'`` (an iterable
            of download URLs).
    """
    for entry, info in index.items():
        print(f'# Checking "{entry}"...')
        md5sum = info['md5']
        for url in info['urls']:
            # NOTE(review): the checksum comparison presumably happens inside
            # download_url (it receives md5=...); confirm it raises on mismatch.
            destination = download_url(url, '.', md5=md5sum)
            print(f'Finished downloading to {destination}, cleaning up.')
            # Delete before fetching the next mirror so that only one
            # downloaded file is kept on disk at a time.
            os.remove(destination)
        # Blank separator between entries.
        print('\n')


if __name__ == '__main__':
    # Run the download check over each of the three prebuilt-index tables
    # imported from pyserini.prebuilt_index_info, one after another.
    check(TF_INDEX_INFO)
    check(IMPACT_INDEX_INFO)
    check(FAISS_INDEX_INFO)

0 comments on commit 104e70e

Please sign in to comment.