Convert kilt dpr corpus
yuxuan-ji committed Mar 21, 2021
1 parent 5414bca commit 135e95d
Showing 4 changed files with 33 additions and 3 deletions.
2 changes: 1 addition & 1 deletion scripts/kilt/anserini_retriever.py
@@ -63,7 +63,7 @@ def _get_predictions_thread(arguments):
doc_scores = []

if use_bigrams:
- tokens = filter(lambda word: word not in STOPWORDS, word_tokenize(query))
+ tokens = filter(lambda word: word.lower() not in STOPWORDS, word_tokenize(query))
if stem_bigrams:
tokens = map(stemmer.stem, tokens)
bigram_query = bigrams(tokens)
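This one-line change, repeated in convert_kilt_to_document_jsonl.py and convert_kilt_to_passage_jsonl.py below, lowercases each token before the stopword check, so capitalized stopwords (e.g. "The" at the start of a query) are also dropped before bigrams are formed. A minimal standalone sketch of that pipeline, assuming NLTK's stopword list, tokenizer, Porter stemmer, and bigrams helper (the actual scripts define STOPWORDS and stemmer elsewhere):

# Sketch of the bigram pipeline with NLTK; requires nltk.download('punkt') and nltk.download('stopwords').
from nltk import word_tokenize, bigrams
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

STOPWORDS = set(stopwords.words('english'))
stemmer = PorterStemmer()

query = "The quick brown fox jumps over the lazy dog"

# Lowercase before the stopword check, otherwise capitalized forms like "The" slip through.
tokens = filter(lambda word: word.lower() not in STOPWORDS, word_tokenize(query))
tokens = map(stemmer.stem, tokens)  # optional stemming, as when stem_bigrams is set
bigram_query = [' '.join(pair) for pair in bigrams(tokens)]
print(bigram_query)  # e.g. ['quick brown', 'brown fox', 'fox jump', 'jump lazi', 'lazi dog']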
30 changes: 30 additions & 0 deletions scripts/kilt/convert_kilt_dpr_to_pyserini_format.py
@@ -0,0 +1,30 @@
import argparse
import pickle
import csv
from tqdm import tqdm


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Convert KILT-dpr corpus into the docids file read by pyserini-dpr')
    parser.add_argument('--input', required=True, help='Path to the kilt_w100_title.tsv file')
    parser.add_argument('--mapping', required=True, help='Path to the mapping_KILT_title.p file')
    parser.add_argument('--output', required=True, help='Name and path of the output file')

    args = parser.parse_args()

    # Pickled dict mapping a passage title to its KILT document id.
    KILT_mapping = pickle.load(open(args.mapping, "rb"))

    not_found = set()
    with open(args.input, 'r') as f, open(args.output, 'w') as outp:
        tsv = csv.reader(f, delimiter='\t')
        next(tsv)  # skip headers
        for row in tqdm(tsv):
            title = row[2]  # third TSV column holds the passage title
            if title not in KILT_mapping:
                not_found.add(title)
                _ = outp.write('N/A\n')
            else:
                _ = outp.write(f'{KILT_mapping[title]}\n')  # newline so each passage gets its own docid line

    print('Done!')
    print(f'Not found: {not_found}')
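The script reads kilt_w100_title.tsv row by row, looks up each passage's title (the third column) in the pickled title-to-docid mapping, and writes one KILT docid per line for pyserini-dpr, emitting N/A for titles missing from the mapping. Below is a toy sketch of that lookup, with made-up titles and docids standing in for the real kilt_w100_title.tsv and mapping_KILT_title.p:

# Toy illustration of the title -> KILT docid lookup; titles and ids here are invented.
import csv
import io

# Stand-in for pickle.load(open('mapping_KILT_title.p', 'rb')): title -> KILT docid.
KILT_mapping = {'Barack Obama': '76972', 'Anarchism': '12'}

# Stand-in for kilt_w100_title.tsv: id, passage text, title.
tsv_data = ("id\ttext\ttitle\n"
            "1\tObama served as the 44th president ...\tBarack Obama\n"
            "2\tSome passage from an unmapped page ...\tNot In Mapping\n")

not_found = set()
rows = csv.reader(io.StringIO(tsv_data), delimiter='\t')
next(rows)  # skip the header row
for row in rows:
    title = row[2]
    if title in KILT_mapping:
        print(KILT_mapping[title])  # the docid line pyserini-dpr will read
    else:
        not_found.add(title)
        print('N/A')                # placeholder so every input passage still gets an output line

print(f'Not found: {not_found}')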
2 changes: 1 addition & 1 deletion scripts/kilt/convert_kilt_to_document_jsonl.py
@@ -26,7 +26,7 @@
doc["id"] = raw["_id"]
doc["contents"] = "".join(raw["text"])
if args.bigrams:
- tokens = filter(lambda word: word not in STOPWORDS, word_tokenize(doc["contents"]))
+ tokens = filter(lambda word: word.lower() not in STOPWORDS, word_tokenize(doc["contents"]))
if args.stem:
tokens = map(stemmer.stem, tokens)
bigram_doc = bigrams(tokens)
2 changes: 1 addition & 1 deletion scripts/kilt/convert_kilt_to_passage_jsonl.py
@@ -42,7 +42,7 @@
doc["id"] = f"{raw['_id']}-{i}"
p = texts[i]
if args.bigrams:
- tokens = filter(lambda word: word not in STOPWORDS, word_tokenize(p))
+ tokens = filter(lambda word: word.lower() not in STOPWORDS, word_tokenize(p))
if args.stem:
tokens = map(stemmer.stem, tokens)
bigram_doc = bigrams(tokens)
