-
Notifications
You must be signed in to change notification settings - Fork 395
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Pyserini repro doc for MS MARCO v2 (fix) (#716)
* fix MAP and MRR score for ms-passage-v2 reranking top1k * add script to convert passage run file to document run file
- Loading branch information
1 parent
c75006e
commit 2b96b99
Showing
3 changed files
with
151 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# | ||
# Pyserini: Reproducible IR research with sparse and dense representations | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
# This file generates the .tsv files which map each passage to its original docid | ||
# from the passage collection | ||
# | ||
# Usage: | ||
# python scripts/msmarco_v2/build_passage_to_doc_id_map.py \ | ||
# --input collections/msmarco_v2_passage \ | ||
# --output /path/to/idmap_dir | ||
import os | ||
import math | ||
import json | ||
import gzip | ||
import argparse | ||
from multiprocessing import Pool | ||
|
||
|
||
def write_mapping(psg_fn, outp_fn):
    """Write one ``pid<TAB>docid`` line for every passage record in ``psg_fn``.

    Args:
        psg_fn: Path to a JSON-lines passage file. It is opened with ``gzip``
            when the name ends in ``.gz`` and as a plain file otherwise.
        outp_fn: Path of the TSV file to create (overwritten if it exists).

    Raises:
        KeyError: If a record has no ``pid`` or ``docid`` field.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    open_handle = gzip.open if psg_fn.endswith(".gz") else open
    # Text mode with an explicit encoding so that plain and gzipped inputs are
    # decoded identically, independent of the platform's default locale.
    with open_handle(psg_fn, "rt", encoding="utf-8") as fin, \
            open(outp_fn, "w", encoding="utf-8") as fout:
        for line in fin:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            record = json.loads(line)
            pid, docid = record["pid"], record["docid"]
            fout.write(f"{pid}\t{docid}\n")
|
||
|
||
def main(args):
    """Build one ``<name>.idmap.tsv`` file per passage file, in parallel.

    Every entry of the input directory is handed to ``write_mapping`` through
    a ``multiprocessing.Pool``.

    Args:
        args: Parsed command-line namespace providing ``input`` (directory of
            passage files), ``output`` (directory that receives the TSV
            files; created if missing) and ``threads`` (size of the pool).
    """
    input_dir, output_dir = args.input, args.output
    threads = args.threads
    # ``exist_ok=True`` already covers the "directory exists" case.
    os.makedirs(output_dir, exist_ok=True)

    gz_suffix = ".gz"
    inp_outp_fn_pairs = []
    for psg_fn in os.listdir(input_dir):
        # Remove only a literal ".gz" suffix. ``str.rstrip('.gz')`` strips any
        # trailing run of the characters '.', 'g' and 'z', which would turn
        # e.g. "passages_seg.gz" into "passages_se".
        if psg_fn.endswith(gz_suffix):
            base_name = psg_fn[:-len(gz_suffix)]
        else:
            base_name = psg_fn
        inp_outp_fn_pairs.append((
            os.path.join(input_dir, psg_fn),
            os.path.join(output_dir, f"{base_name}.idmap.tsv"),
        ))

    with Pool(threads) as p:
        p.starmap(write_mapping, inp_outp_fn_pairs)
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, run main().
    cli = argparse.ArgumentParser(
        description="Build mapping from passage id to document id (MS MARCO v2)"
    )
    cli.add_argument(
        "--input",
        type=str,
        required=True,
        help="path to msmarco passage.",
    )
    cli.add_argument(
        "--output",
        type=str,
        required=True,
        help="output directory to store the mapping tsv files.",
    )
    cli.add_argument(
        "--threads",
        type=int,
        default=5,
        help="Number of threads to use.",
    )

    args = cli.parse_args()
    main(args)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
# | ||
# Pyserini: Reproducible IR research with sparse and dense representations | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
# This file converts the runfile containing the passage id from MS MARCO Passage v2 to the | ||
# runfile containing the docid from MS MARCO Doc v2. For each document, the maximum score | ||
# among its passages is used as the document score. | ||
# | ||
# Note that this file can only be used after running `build_passage_to_doc_id_map.py` under the | ||
# same folder, to prepare for the idmap.tsv files. | ||
# | ||
# Usage: | ||
# python scripts/msmarco_v2/convert_passage_run_to_doc_run.py \ | ||
# --input runs/run.mspsg.dev.txt \ | ||
# --id-map /path/to/id_map_dir \ | ||
# --output runs/run.msdoc-converted-from-psg.dev.txt | ||
# (the id map directory is generated by build_passage_to_doc_id_map.py) | ||
import os | ||
import argparse | ||
from collections import defaultdict | ||
|
||
|
||
def load_id_map_from_file(id_map_fn):
    """Load a passage-id -> document-id mapping from a single TSV file.

    Each non-blank line must contain exactly two tab-separated fields: the
    passage id followed by the document id. Blank lines are ignored.

    Args:
        id_map_fn: Path to the TSV file.

    Returns:
        A dict mapping each passage id to its document id. If a passage id
        occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
    """
    psgid2docid = {}
    with open(id_map_fn, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A stray empty line should not abort the whole load.
                continue
            psgid, docid = line.split("\t")
            psgid2docid[psgid] = docid
    return psgid2docid
|
||
|
||
def load_id_map_from_dir(id_map_dir):
    """Load every ``*.idmap.tsv`` file found directly inside ``id_map_dir``.

    Args:
        id_map_dir: Directory holding the ``.idmap.tsv`` files.

    Returns:
        A dict keyed by file name without the ``.idmap.tsv`` suffix; each
        value is the passage-id -> document-id dict loaded from that file.
    """
    suffix = ".idmap.tsv"
    return {
        # Slice the suffix off the end only; ``str.replace`` would also remove
        # an occurrence of the marker in the middle of a file name.
        fn[:-len(suffix)]: load_id_map_from_file(os.path.join(id_map_dir, fn))
        for fn in os.listdir(id_map_dir)
        # Skip unrelated entries (hidden files, notes, sub-directories) rather
        # than trying to parse them as id maps.
        if fn.endswith(suffix)
    }
|
||
|
||
def main(args):
    """Turn a passage-level run file into a document-level run file.

    Each input line is expected to have six whitespace-separated fields
    (``qid _ psgid rank score tag``). Every passage id is mapped to its
    document id, and a document's score for a query is the highest score
    among its passages. Queries are written in ascending integer order of
    their id; within a query, documents are written by descending score with
    ranks counted from 0.

    Args:
        args: Parsed command-line namespace providing ``input`` (passage run
            file), ``id_map`` (directory of ``.idmap.tsv`` files) and
            ``output`` (document run file to write).
    """
    segment_maps = load_id_map_from_dir(args.id_map)

    best_by_query = defaultdict(dict)
    with open(args.input) as run_in:
        for raw in run_in:
            qid, _, psgid, _, score_text, _ = raw.split()
            score = float(score_text)

            # The id-map key is the passage id without its last "_" component.
            segment = psgid.rpartition("_")[0]
            docid = segment_maps[segment][psgid]

            doc_scores = best_by_query[qid]
            doc_scores[docid] = max(score, doc_scores.get(docid, score))

    with open(args.output, "w") as run_out:
        for qid in sorted(best_by_query, key=lambda q: int(q)):
            ranked = sorted(
                best_by_query[qid].items(),
                key=lambda kv: kv[1],
                reverse=True,
            )
            for rank, (docid, score) in enumerate(ranked):
                run_out.write(f"{qid} Q0 {docid} {rank} {score} convert-from-passage-v2\n")

    print("finished")
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, run main().
    cli = argparse.ArgumentParser(
        description="Conver runfile that contain psg id into runfile that contain doc id."
    )
    cli.add_argument(
        "--input",
        type=str,
        required=True,
        help="path to msmarco passage runfile.",
    )
    cli.add_argument(
        "--id-map",
        type=str,
        required=True,
        help="directory that contains msmarco passage-doc id mapping .tsv files. Generated by `build_passage_to_doc_id_map.py` under the same directory.",
    )
    cli.add_argument(
        "--output",
        type=str,
        required=True,
        help="output path to store document id runfile.",
    )

    args = cli.parse_args()
    main(args)