From 2383fb737db918f4cca2da745e6a626c055a7c4a Mon Sep 17 00:00:00 2001 From: ronakice Date: Fri, 22 May 2020 01:05:13 +0530 Subject: [PATCH 01/11] create docs, indexes, logs, models, runs and add MS Marco official eval --- docs/experiments-msmarco-passage.md | 45 +++++++ evaluate/msmarco/msmarco_eval.py | 183 ++++++++++++++++++++++++++++ indexes/.gitkeep | 1 + logs/.gitkeep | 1 + models/.gitkeep | 1 + runs/.gitkeep | 1 + 6 files changed, 232 insertions(+) create mode 100644 docs/experiments-msmarco-passage.md create mode 100644 evaluate/msmarco/msmarco_eval.py create mode 100644 indexes/.gitkeep create mode 100644 logs/.gitkeep create mode 100644 models/.gitkeep create mode 100644 runs/.gitkeep diff --git a/docs/experiments-msmarco-passage.md b/docs/experiments-msmarco-passage.md new file mode 100644 index 00000000..8c35c913 --- /dev/null +++ b/docs/experiments-msmarco-passage.md @@ -0,0 +1,45 @@ +# PyGaggle: Neural Baselines on [MS MARCO Passage Retrieval](https://github.com/microsoft/MSMARCO-Passage-Ranking) + +This page contains instructions for running various neural reranking baselines on the MS MARCO *passage* ranking task. +Note that there is also a separate [MS MARCO *document* ranking task](experiments-msmarco-doc.md). + +Prior to running this, we suggest looking at our first-stage [BM25 ranking instructions](https://github.com/castorini/anserini/blob/master/docs/experiments-msmarco-passage.md). +We rerank the BM25 run files that contain ~1000 passages per query. + +Keeping computational resources in mind, our instructions primarily focus on a 1005 query subset of MS MARCO dev set. +Running the instructions with the entire MS MARCO dev set should give about the same results as that in the corresponding paper. + +*Note: Run the following instructions at root of this repo.* + +## Data Prep + +We're first going to download the files corresponding to the 1005 query subset. The run file is generated by following the BM25 ranking instructions. We'll store the files related to this run in the `runs` directory. + +``` +wget https://www.dropbox.com/s/wz89rag7brcgt8v/msmarco_ans_medium.zip -P data +``` + +To confirm, `msmarco_ans_medium.zip` should have MD5 checksum of `1119afbdee29eb0a9a56bc6701127a84`. + +Next, we extract the contents of the zip file into runs. + +``` +unzip msmarco_ans_medium.zip -d data +``` + +We can evaluate the first-stage retrieved documents using the official MS MARCO evaluation script. + +``` +python3 evaluate/msmarco/msmarco_eval.py data/msmarco_ans_medium/qrels.dev.small.tsv data/msmarco_ans_medium/run.dev.small.tsv +``` + +And the output should be: + +``` +##################### +MRR @10: 0.1905808260285872 +QueriesRanked: 1005 +##################### +``` + + diff --git a/evaluate/msmarco/msmarco_eval.py b/evaluate/msmarco/msmarco_eval.py new file mode 100644 index 00000000..28d8ccbb --- /dev/null +++ b/evaluate/msmarco/msmarco_eval.py @@ -0,0 +1,183 @@ +""" +This module computes evaluation metrics for MSMARCO dataset on the ranking task. +Command line: +python msmarco_eval_ranking.py + +Creation Date : 06/12/2018 +Last Modified : 1/21/2019 +Authors : Daniel Campos , Rutger van Haasteren +""" +import sys +import statistics + +from collections import Counter + +MaxMRRRank = 10 + +def load_reference_from_stream(f): + """Load Reference reference relevant passages + Args:f (stream): stream to load. + Returns:qids_to_relevant_passageids (dict): dictionary mapping from query_id (int) to relevant passages (list of ints). 
+ """ + qids_to_relevant_passageids = {} + for l in f: + try: + l = l.strip().split('\t') + qid = int(l[0]) + if qid in qids_to_relevant_passageids: + pass + else: + qids_to_relevant_passageids[qid] = [] + qids_to_relevant_passageids[qid].append(int(l[2])) + except: + raise IOError('\"%s\" is not valid format' % l) + return qids_to_relevant_passageids + +def load_reference(path_to_reference): + """Load Reference reference relevant passages + Args:path_to_reference (str): path to a file to load. + Returns:qids_to_relevant_passageids (dict): dictionary mapping from query_id (int) to relevant passages (list of ints). + """ + with open(path_to_reference,'r') as f: + qids_to_relevant_passageids = load_reference_from_stream(f) + return qids_to_relevant_passageids + +def load_candidate_from_stream(f): + """Load candidate data from a stream. + Args:f (stream): stream to load. + Returns:qid_to_ranked_candidate_passages (dict): dictionary mapping from query_id (int) to a list of 1000 passage ids(int) ranked by relevance and importance + """ + qid_to_ranked_candidate_passages = {} + for l in f: + try: + l = l.strip().split('\t') + qid = int(l[0]) + pid = int(l[1]) + rank = int(l[2]) + if qid in qid_to_ranked_candidate_passages: + pass + else: + # By default, all PIDs in the list of 1000 are 0. Only override those that are given + tmp = [0] * 1000 + qid_to_ranked_candidate_passages[qid] = tmp + qid_to_ranked_candidate_passages[qid][rank-1]=pid + except: + raise IOError('\"%s\" is not valid format' % l) + return qid_to_ranked_candidate_passages + +def load_candidate(path_to_candidate): + """Load candidate data from a file. + Args:path_to_candidate (str): path to file to load. + Returns:qid_to_ranked_candidate_passages (dict): dictionary mapping from query_id (int) to a list of 1000 passage ids(int) ranked by relevance and importance + """ + + with open(path_to_candidate,'r') as f: + qid_to_ranked_candidate_passages = load_candidate_from_stream(f) + return qid_to_ranked_candidate_passages + +def quality_checks_qids(qids_to_relevant_passageids, qids_to_ranked_candidate_passages): + """Perform quality checks on the dictionaries + + Args: + p_qids_to_relevant_passageids (dict): dictionary of query-passage mapping + Dict as read in with load_reference or load_reference_from_stream + p_qids_to_ranked_candidate_passages (dict): dictionary of query-passage candidates + Returns: + bool,str: Boolean whether allowed, message to be shown in case of a problem + """ + message = '' + allowed = True + + # Create sets of the QIDs for the submitted and reference queries + candidate_set = set(qids_to_ranked_candidate_passages.keys()) + ref_set = set(qids_to_relevant_passageids.keys()) + + # Check that we do not have multiple passages per query + for qid in qids_to_ranked_candidate_passages: + # Remove all zeros from the candidates + duplicate_pids = set([item for item, count in Counter(qids_to_ranked_candidate_passages[qid]).items() if count > 1]) + + if len(duplicate_pids-set([0])) > 0: + message = "Cannot rank a passage multiple times for a single query. 
QID={qid}, PID={pid}".format( + qid=qid, pid=list(duplicate_pids)[0]) + allowed = False + + return allowed, message + +def compute_metrics(qids_to_relevant_passageids, qids_to_ranked_candidate_passages): + """Compute MRR metric + Args: + p_qids_to_relevant_passageids (dict): dictionary of query-passage mapping + Dict as read in with load_reference or load_reference_from_stream + p_qids_to_ranked_candidate_passages (dict): dictionary of query-passage candidates + Returns: + dict: dictionary of metrics {'MRR': } + """ + all_scores = {} + MRR = 0 + qids_with_relevant_passages = 0 + ranking = [] + for qid in qids_to_ranked_candidate_passages: + if qid in qids_to_relevant_passageids: + ranking.append(0) + target_pid = qids_to_relevant_passageids[qid] + candidate_pid = qids_to_ranked_candidate_passages[qid] + for i in range(0,MaxMRRRank): + if candidate_pid[i] in target_pid: + MRR += 1/(i + 1) + ranking.pop() + ranking.append(i+1) + break + if len(ranking) == 0: + raise IOError("No matching QIDs found. Are you sure you are scoring the evaluation set?") + + MRR = MRR/len(qids_to_relevant_passageids) + all_scores['MRR @10'] = MRR + all_scores['QueriesRanked'] = len(qids_to_ranked_candidate_passages) + return all_scores + +def compute_metrics_from_files(path_to_reference, path_to_candidate, perform_checks=True): + """Compute MRR metric + Args: + p_path_to_reference_file (str): path to reference file. + Reference file should contain lines in the following format: + QUERYID\tPASSAGEID + Where PASSAGEID is a relevant passage for a query. Note QUERYID can repeat on different lines with different PASSAGEIDs + p_path_to_candidate_file (str): path to candidate file. + Candidate file sould contain lines in the following format: + QUERYID\tPASSAGEID1\tRank + If a user wishes to use the TREC format please run the script with a -t flag at the end. If this flag is used the expected format is + QUERYID\tITER\tDOCNO\tRANK\tSIM\tRUNID + Where the values are separated by tabs and ranked in order of relevance + Returns: + dict: dictionary of metrics {'MRR': } + """ + + qids_to_relevant_passageids = load_reference(path_to_reference) + qids_to_ranked_candidate_passages = load_candidate(path_to_candidate) + if perform_checks: + allowed, message = quality_checks_qids(qids_to_relevant_passageids, qids_to_ranked_candidate_passages) + if message != '': print(message) + + return compute_metrics(qids_to_relevant_passageids, qids_to_ranked_candidate_passages) + +def main(): + """Command line: + python msmarco_eval_ranking.py + """ + + if len(sys.argv) == 3: + path_to_reference = sys.argv[1] + path_to_candidate = sys.argv[2] + metrics = compute_metrics_from_files(path_to_reference, path_to_candidate) + print('#####################') + for metric in sorted(metrics): + print('{}: {}'.format(metric, metrics[metric])) + print('#####################') + + else: + print('Usage: msmarco_eval_ranking.py ') + exit() + +if __name__ == '__main__': + main() diff --git a/indexes/.gitkeep b/indexes/.gitkeep new file mode 100644 index 00000000..6a43dd68 --- /dev/null +++ b/indexes/.gitkeep @@ -0,0 +1 @@ +# This is the default directory for indexes. Placeholder so that directory is kept in git. \ No newline at end of file diff --git a/logs/.gitkeep b/logs/.gitkeep new file mode 100644 index 00000000..bb0a0cfc --- /dev/null +++ b/logs/.gitkeep @@ -0,0 +1 @@ +# This is the default directory for logs. Placeholder so that directory is kept in git. 
\ No newline at end of file diff --git a/models/.gitkeep b/models/.gitkeep new file mode 100644 index 00000000..b4509c2f --- /dev/null +++ b/models/.gitkeep @@ -0,0 +1 @@ +# This is the default directory for models. Placeholder so that directory is kept in git. \ No newline at end of file diff --git a/runs/.gitkeep b/runs/.gitkeep new file mode 100644 index 00000000..1284ed79 --- /dev/null +++ b/runs/.gitkeep @@ -0,0 +1 @@ +# This is the default directory for runs. Placeholder so that directory is kept in git. \ No newline at end of file From 1f37ef0e81fb4524ba637dc6e292b054a14bca8a Mon Sep 17 00:00:00 2001 From: ronakice Date: Fri, 22 May 2020 01:21:20 +0530 Subject: [PATCH 02/11] remove index-dir from settings for added clarity --- pygaggle/run/evaluate_passage_ranker.py | 8 ++++---- pygaggle/settings.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pygaggle/run/evaluate_passage_ranker.py b/pygaggle/run/evaluate_passage_ranker.py index 34537756..e3062e9d 100644 --- a/pygaggle/run/evaluate_passage_ranker.py +++ b/pygaggle/run/evaluate_passage_ranker.py @@ -46,7 +46,7 @@ class PassageRankingEvaluationOptions(BaseModel): metrics: List[str] model_type: Optional[str] tokenizer_name: Optional[str] - index_dir: Optional[Path] + index_dir: Path @validator('dataset') def dataset_exists(cls, v: str): @@ -59,8 +59,7 @@ def datadir_exists(cls, v: str): @validator('index_dir') def index_dir_exists(cls, v: str): - if v is None: - return SETTINGS.msmarco_index_path + assert v.exists(), 'index directory must exist' return v @validator('model_name_or_path') @@ -140,7 +139,8 @@ def main(): apb.add_opts(opt('--dataset', type=str, default='msmarco'), - opt('--data-dir', type=Path, default='/content/data/msmarco'), + opt('--data-dir', type=Path, required=True), + opt('--index-dir', type=Path, required=True), opt('--method', required=True, type=str, diff --git a/pygaggle/settings.py b/pygaggle/settings.py index 780980fb..65aeb3da 100644 --- a/pygaggle/settings.py +++ b/pygaggle/settings.py @@ -12,7 +12,7 @@ class Settings(BaseSettings): class MsMarcoSettings(Settings): - msmarco_index_path: str = 'data/index-msmarco-passage-20191117-0ed488' + pass class Cord19Settings(Settings): From 492cae6109da54ddcd9882f505b9a1dee635771b Mon Sep 17 00:00:00 2001 From: ronakice Date: Fri, 22 May 2020 01:24:03 +0530 Subject: [PATCH 03/11] index-dir bf2 --- pygaggle/run/evaluate_passage_ranker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pygaggle/run/evaluate_passage_ranker.py b/pygaggle/run/evaluate_passage_ranker.py index e3062e9d..2b84a592 100644 --- a/pygaggle/run/evaluate_passage_ranker.py +++ b/pygaggle/run/evaluate_passage_ranker.py @@ -140,7 +140,6 @@ def main(): type=str, default='msmarco'), opt('--data-dir', type=Path, required=True), - opt('--index-dir', type=Path, required=True), opt('--method', required=True, type=str, From 7fad43a414bbe042bc5494d13f263a54b2260b24 Mon Sep 17 00:00:00 2001 From: ronakice Date: Fri, 22 May 2020 01:26:51 +0530 Subject: [PATCH 04/11] index-dir bf3 --- pygaggle/run/evaluate_passage_ranker.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pygaggle/run/evaluate_passage_ranker.py b/pygaggle/run/evaluate_passage_ranker.py index 2b84a592..21dd1068 100644 --- a/pygaggle/run/evaluate_passage_ranker.py +++ b/pygaggle/run/evaluate_passage_ranker.py @@ -37,6 +37,7 @@ class PassageRankingEvaluationOptions(BaseModel): dataset: str data_dir: Path + index_dir: Path method: str model_name_or_path: str split: str @@ -46,7 +47,6 @@ class 
PassageRankingEvaluationOptions(BaseModel): metrics: List[str] model_type: Optional[str] tokenizer_name: Optional[str] - index_dir: Path @validator('dataset') def dataset_exists(cls, v: str): @@ -140,6 +140,7 @@ def main(): type=str, default='msmarco'), opt('--data-dir', type=Path, required=True), + opt('--index-dir', type=Path, required=True), opt('--method', required=True, type=str, @@ -160,13 +161,12 @@ def main(): default=metric_names(), choices=metric_names()), opt('--model-type', type=str, default='bert-base'), - opt('--tokenizer-name', type=str), - opt('--index-dir', type=Path)) + opt('--tokenizer-name', type=str)) args = apb.parser.parse_args() options = PassageRankingEvaluationOptions(**vars(args)) ds = MsMarcoDataset.from_folder(str(options.data_dir), split=options.split, is_duo=options.is_duo) - examples = ds.to_relevance_examples(SETTINGS.msmarco_index_path, + examples = ds.to_relevance_examples(options.index_dir, is_duo=options.is_duo) construct_map = dict(transformer=construct_transformer, bm25=construct_bm25, From a06e70ccd9294054318055047e7fab84982d11e6 Mon Sep 17 00:00:00 2001 From: ronakice Date: Fri, 22 May 2020 01:33:38 +0530 Subject: [PATCH 05/11] index-dir bf4 --- pygaggle/run/evaluate_passage_ranker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pygaggle/run/evaluate_passage_ranker.py b/pygaggle/run/evaluate_passage_ranker.py index 21dd1068..454dbf1d 100644 --- a/pygaggle/run/evaluate_passage_ranker.py +++ b/pygaggle/run/evaluate_passage_ranker.py @@ -131,7 +131,7 @@ def construct_seq_class_transformer(options: PassageRankingEvaluationOptions def construct_bm25(options: PassageRankingEvaluationOptions) -> Reranker: - return Bm25Reranker(index_path=options.msmarco_index_path) + return Bm25Reranker(index_path=options.index_dir) def main(): @@ -166,7 +166,7 @@ def main(): options = PassageRankingEvaluationOptions(**vars(args)) ds = MsMarcoDataset.from_folder(str(options.data_dir), split=options.split, is_duo=options.is_duo) - examples = ds.to_relevance_examples(options.index_dir, + examples = ds.to_relevance_examples(str(options.index_dir), is_duo=options.is_duo) construct_map = dict(transformer=construct_transformer, bm25=construct_bm25, From e37bce5aaf75222924ac7ad33721580bf0dc9c1e Mon Sep 17 00:00:00 2001 From: ronakice Date: Fri, 22 May 2020 03:39:13 +0530 Subject: [PATCH 06/11] done doc --- docs/experiments-msmarco-passage.md | 132 +++++++++++++++++++++++++--- 1 file changed, 122 insertions(+), 10 deletions(-) diff --git a/docs/experiments-msmarco-passage.md b/docs/experiments-msmarco-passage.md index 8c35c913..c8367f88 100644 --- a/docs/experiments-msmarco-passage.md +++ b/docs/experiments-msmarco-passage.md @@ -2,44 +2,156 @@ This page contains instructions for running various neural reranking baselines on the MS MARCO *passage* ranking task. Note that there is also a separate [MS MARCO *document* ranking task](experiments-msmarco-doc.md). +Make sure that you have access to a GPU before running this. Prior to running this, we suggest looking at our first-stage [BM25 ranking instructions](https://github.com/castorini/anserini/blob/master/docs/experiments-msmarco-passage.md). -We rerank the BM25 run files that contain ~1000 passages per query. +We rerank the BM25 run files that contain ~ 1000 passages per query using both monoBERT and monoT5. -Keeping computational resources in mind, our instructions primarily focus on a 1005 query subset of MS MARCO dev set. 
+Keeping computational resources in mind, our instructions primarily focus on a 105 query subset of MS MARCO dev set. Running the instructions with the entire MS MARCO dev set should give about the same results as that in the corresponding paper. -*Note: Run the following instructions at root of this repo.* +*Note: Run the following instructions at root of this repo. Installation must have be done from source.* + +## Models + ++ monoBERT-Large: Passage Re-ranking with BERT [(Rodrigo et al., 2019)](https://arxiv.org/pdf/1901.04085.pdf) ++ monoT5-base: Document Ranking with a Pretrained Sequence-to-Sequence Model [(Nogueira et al., 2020)](https://arxiv.org/pdf/2003.06713.pdf) ## Data Prep -We're first going to download the files corresponding to the 1005 query subset. The run file is generated by following the BM25 ranking instructions. We'll store the files related to this run in the `runs` directory. +We're first going to download the queries, qrels and run files corresponding to the MS MARCO subset considered. The run file is generated by following the BM25 ranking instructions. We'll store the files in the `data` directory. ``` -wget https://www.dropbox.com/s/wz89rag7brcgt8v/msmarco_ans_medium.zip -P data +wget https://www.dropbox.com/s/5xa5vjbjle0c8jv/msmarco_ans_small.zip -P data ``` -To confirm, `msmarco_ans_medium.zip` should have MD5 checksum of `1119afbdee29eb0a9a56bc6701127a84`. +To confirm, `msmarco_ans_small.zip` should have MD5 checksum of `65d8007bfb2c72b5fc384738e5572f74`. Next, we extract the contents of the zip file into runs. ``` -unzip msmarco_ans_medium.zip -d data +unzip msmarco_ans_small.zip -d data ``` We can evaluate the first-stage retrieved documents using the official MS MARCO evaluation script. ``` -python3 evaluate/msmarco/msmarco_eval.py data/msmarco_ans_medium/qrels.dev.small.tsv data/msmarco_ans_medium/run.dev.small.tsv +python evaluate/msmarco/msmarco_eval.py data/msmarco_ans_small/qrels.dev.small.tsv data/msmarco_ans_small/run.dev.small.tsv ``` And the output should be: ``` ##################### -MRR @10: 0.1905808260285872 -QueriesRanked: 1005 +MRR @10: 0.15906651549508694 +QueriesRanked: 105 ##################### ``` +Let's download and extract the pre-built MS MARCO index: + +``` +wget https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-passage-20191117-0ed488.tar.gz -P indexes +tar xvfz indexes/index-msmarco-passage-20191117-0ed488.tar.gz -C indexes +``` + +## Model Prep + +Let's download and extract monoBERT into the `models` directory + +``` +wget https://www.dropbox.com/s/jr0hpksboh7pa48/monobert_msmarco_large.zip -P models +unzip models/monobert_msmarco_large.zip -d models +``` + +While running the re-ranking script with the monoT5 model, it is automatically downloaded from Google Cloud Storage. + +Now, we can begin with re-ranking the set! + +## Re-Ranking with monoBERT + +First, lets evaluate using monoBERT! + +``` +python -um pygaggle.run.evaluate_passage_ranker --split dev \ + --method seq_class_transformer \ + --model-name-or-path models/monobert_msmarco_large \ + --data-dir data/msmarco_ans_small/ \ + --index-dir indexes/index-msmarco-passage-20191117-0ed488 \ + --dataset msmarco \ + --output-file runs/run.monobert.ans_small.dev.tsv +``` + +It takes about ~ 52 minutes to re-rank this subset on MS MARCO using a P100. +The type of GPU will directly influence your inference time. +It is possible that the default batch results in a GPU OOM error. 
+In this case, perhaps manually assigning a batch size (using option `--batch-size`) which is smaller than the default of 96, should help! + +Upon completion, the re-ranked run file `run.monobert.ans_small.dev.tsv` will be available in the `runs` directory. +The following output will also be visible: + +``` +precision@1 0.2761904761904762 +recall@3 0.42698412698412697 +recall@50 0.8174603174603176 +recall@1000 0.8476190476190476 +mrr 0.41089693612003686 +mrr@10 0.4026795162509449 +``` + +Great, we can also verify that the MRR@10 is indeed right using the official MS MARCO evaluation script: + +``` +python evaluate/msmarco/msmarco_eval.py data/msmarco_ans_small/qrels.dev.small.tsv runs/run.monobert.ans_small.dev.tsv +``` + +You should see the same result. Great, let's move on to monoT5! + +## Re-Ranking with monoT5 + +We use the monoT5-base variant as it is the easiest to run without access to larger GPUs/TPUs. Let us now re-rank the set! + +``` +python -um pygaggle.run.evaluate_passage_ranker --split dev \ + --method t5 \ + --model-name-or-path gs://neuralresearcher_data/doc2query/experiments/367 \ + --data-dir data/msmarco_ans_small \ + --model-type t5-base \ + --dataset msmarco \ + --index-dir indexes/index-msmarco-passage-20191117-0ed488 \ + --batch-size 32 \ + --output-file runs/run.monot5.ans_small.dev.tsv +``` + +It takes about ~ 13 minutes to re-rank this subset on MS MARCO using a P100. +It is worth noting again that you might need to modify the batch size to best fit the GPU at hand. + +Upon completion, the re-ranked run file `run.monot5.ans_small.dev.tsv` will be available in the `runs` directory. +The following output will also be visible: + +``` +precision@1 0.26666666666666666 +recall@3 0.4603174603174603 +recall@50 0.8063492063492063 +recall@1000 0.8476190476190476 +mrr 0.3973368360121561 +mrr@10 0.39044217687074834 +``` + +Awesome, we can verify that this MRR@10 is indeed right using the official MS MARCO evaluation script: + +``` +python evaluate/msmarco/msmarco_eval.py data/msmarco_ans_small/qrels.dev.small.tsv runs/run.monot5.ans_small.dev.tsv +``` + +You should see the same result. + +If you were able to replicate any of these results, please submit a PR adding to the replication log! + + +## Replication Log + +### monoBERT + +### monoT5 From 1761a79bfdbfca4f0c8b5c365b82f652f2f5a156 Mon Sep 17 00:00:00 2001 From: ronakice Date: Fri, 22 May 2020 03:48:12 +0530 Subject: [PATCH 07/11] fix typos --- docs/experiments-msmarco-passage.md | 46 +++++++++++++++-------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/docs/experiments-msmarco-passage.md b/docs/experiments-msmarco-passage.md index c8367f88..8c957aab 100644 --- a/docs/experiments-msmarco-passage.md +++ b/docs/experiments-msmarco-passage.md @@ -7,19 +7,19 @@ Make sure that you have access to a GPU before running this. Prior to running this, we suggest looking at our first-stage [BM25 ranking instructions](https://github.com/castorini/anserini/blob/master/docs/experiments-msmarco-passage.md). We rerank the BM25 run files that contain ~ 1000 passages per query using both monoBERT and monoT5. -Keeping computational resources in mind, our instructions primarily focus on a 105 query subset of MS MARCO dev set. -Running the instructions with the entire MS MARCO dev set should give about the same results as that in the corresponding paper. +Keeping computational resources in mind, our instructions primarily focus on a 105 query subset of the MS MARCO dev set. 
+Running these instructions with the entire MS MARCO dev set should give about the same results as that in the corresponding paper. -*Note: Run the following instructions at root of this repo. Installation must have be done from source.* +*Note: Run the following instructions at root of this repo. Installation must have been done from source.* ## Models -+ monoBERT-Large: Passage Re-ranking with BERT [(Rodrigo et al., 2019)](https://arxiv.org/pdf/1901.04085.pdf) ++ monoBERT-Large: Passage Re-ranking with BERT [(Nogueira et al., 2019)](https://arxiv.org/pdf/1901.04085.pdf) + monoT5-base: Document Ranking with a Pretrained Sequence-to-Sequence Model [(Nogueira et al., 2020)](https://arxiv.org/pdf/2003.06713.pdf) ## Data Prep -We're first going to download the queries, qrels and run files corresponding to the MS MARCO subset considered. The run file is generated by following the BM25 ranking instructions. We'll store the files in the `data` directory. +We're first going to download the queries, qrels and run files corresponding to the MS MARCO set considered. The run file is generated by following the BM25 ranking instructions. We'll store all these files in the `data` directory. ``` wget https://www.dropbox.com/s/5xa5vjbjle0c8jv/msmarco_ans_small.zip -P data @@ -48,7 +48,7 @@ QueriesRanked: 105 ##################### ``` -Let's download and extract the pre-built MS MARCO index: +Let's download and extract the pre-built MS MARCO index into the `indexes` directory: ``` wget https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-passage-20191117-0ed488.tar.gz -P indexes @@ -66,7 +66,7 @@ unzip models/monobert_msmarco_large.zip -d models While running the re-ranking script with the monoT5 model, it is automatically downloaded from Google Cloud Storage. -Now, we can begin with re-ranking the set! +Now, we can begin with re-ranking the set. ## Re-Ranking with monoBERT @@ -82,13 +82,7 @@ python -um pygaggle.run.evaluate_passage_ranker --split dev \ --output-file runs/run.monobert.ans_small.dev.tsv ``` -It takes about ~ 52 minutes to re-rank this subset on MS MARCO using a P100. -The type of GPU will directly influence your inference time. -It is possible that the default batch results in a GPU OOM error. -In this case, perhaps manually assigning a batch size (using option `--batch-size`) which is smaller than the default of 96, should help! - -Upon completion, the re-ranked run file `run.monobert.ans_small.dev.tsv` will be available in the `runs` directory. -The following output will also be visible: +Upon completion, the following output will be visible: ``` precision@1 0.2761904761904762 @@ -99,7 +93,13 @@ mrr 0.41089693612003686 mrr@10 0.4026795162509449 ``` -Great, we can also verify that the MRR@10 is indeed right using the official MS MARCO evaluation script: +It takes about ~ 52 minutes to re-rank this subset on MS MARCO using a P100. +The type of GPU will directly influence your inference time. +It is possible that the default batch results in a GPU OOM error. +In this case, assigning a batch size (using option `--batch-size`) which is smaller than the default (96) should help! + +The re-ranked run file `run.monobert.ans_small.dev.tsv` will also be available in the `runs` directory upon completion. 
+We can use this to verify that the MRR@10 is indeed right using the official MS MARCO evaluation script: ``` python evaluate/msmarco/msmarco_eval.py data/msmarco_ans_small/qrels.dev.small.tsv runs/run.monobert.ans_small.dev.tsv @@ -109,7 +109,7 @@ You should see the same result. Great, let's move on to monoT5! ## Re-Ranking with monoT5 -We use the monoT5-base variant as it is the easiest to run without access to larger GPUs/TPUs. Let us now re-rank the set! +We use the monoT5-base variant as it is the easiest to run without access to larger GPUs/TPUs. Let us now re-rank the set: ``` python -um pygaggle.run.evaluate_passage_ranker --split dev \ @@ -123,11 +123,7 @@ python -um pygaggle.run.evaluate_passage_ranker --split dev \ --output-file runs/run.monot5.ans_small.dev.tsv ``` -It takes about ~ 13 minutes to re-rank this subset on MS MARCO using a P100. -It is worth noting again that you might need to modify the batch size to best fit the GPU at hand. - -Upon completion, the re-ranked run file `run.monot5.ans_small.dev.tsv` will be available in the `runs` directory. -The following output will also be visible: +The following output will be visible after it has finished: ``` precision@1 0.26666666666666666 @@ -138,13 +134,19 @@ mrr 0.3973368360121561 mrr@10 0.39044217687074834 ``` +It takes about ~ 13 minutes to re-rank this subset on MS MARCO using a P100. +It is worth noting again that you might need to modify the batch size to best fit the GPU at hand. + +Upon completion, the re-ranked run file `run.monot5.ans_small.dev.tsv` will be available in the `runs` directory. + + Awesome, we can verify that this MRR@10 is indeed right using the official MS MARCO evaluation script: ``` python evaluate/msmarco/msmarco_eval.py data/msmarco_ans_small/qrels.dev.small.tsv runs/run.monot5.ans_small.dev.tsv ``` -You should see the same result. +You should see the same result. If you were able to replicate any of these results, please submit a PR adding to the replication log! From 74cee53fae91339ab247379b9eba19c4492b861a Mon Sep 17 00:00:00 2001 From: ronakice Date: Fri, 22 May 2020 03:54:01 +0530 Subject: [PATCH 08/11] fix typos 2 --- docs/experiments-msmarco-passage.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/experiments-msmarco-passage.md b/docs/experiments-msmarco-passage.md index 8c957aab..7373e174 100644 --- a/docs/experiments-msmarco-passage.md +++ b/docs/experiments-msmarco-passage.md @@ -2,7 +2,6 @@ This page contains instructions for running various neural reranking baselines on the MS MARCO *passage* ranking task. Note that there is also a separate [MS MARCO *document* ranking task](experiments-msmarco-doc.md). -Make sure that you have access to a GPU before running this. Prior to running this, we suggest looking at our first-stage [BM25 ranking instructions](https://github.com/castorini/anserini/blob/master/docs/experiments-msmarco-passage.md). We rerank the BM25 run files that contain ~ 1000 passages per query using both monoBERT and monoT5. @@ -10,7 +9,7 @@ We rerank the BM25 run files that contain ~ 1000 passages per query using both m Keeping computational resources in mind, our instructions primarily focus on a 105 query subset of the MS MARCO dev set. Running these instructions with the entire MS MARCO dev set should give about the same results as that in the corresponding paper. -*Note: Run the following instructions at root of this repo. 
Installation must have been done from source.* +*Note: Run the following instructions at root of this repo. Installation must have been done from source. Make sure that you have access to a GPU* ## Models @@ -27,19 +26,19 @@ wget https://www.dropbox.com/s/5xa5vjbjle0c8jv/msmarco_ans_small.zip -P data To confirm, `msmarco_ans_small.zip` should have MD5 checksum of `65d8007bfb2c72b5fc384738e5572f74`. -Next, we extract the contents of the zip file into runs. +Next, we extract the contents into `data`. ``` unzip msmarco_ans_small.zip -d data ``` -We can evaluate the first-stage retrieved documents using the official MS MARCO evaluation script. +As a sanity check, we can evaluate the first-stage retrieved documents using the official MS MARCO evaluation script. ``` python evaluate/msmarco/msmarco_eval.py data/msmarco_ans_small/qrels.dev.small.tsv data/msmarco_ans_small/run.dev.small.tsv ``` -And the output should be: +The output should be: ``` ##################### @@ -48,7 +47,7 @@ QueriesRanked: 105 ##################### ``` -Let's download and extract the pre-built MS MARCO index into the `indexes` directory: +Let's download and extract the pre-built MS MARCO index into `indexes`: ``` wget https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-passage-20191117-0ed488.tar.gz -P indexes @@ -57,7 +56,7 @@ tar xvfz indexes/index-msmarco-passage-20191117-0ed488.tar.gz -C indexes ## Model Prep -Let's download and extract monoBERT into the `models` directory +Let's download and extract monoBERT into `models`: ``` wget https://www.dropbox.com/s/jr0hpksboh7pa48/monobert_msmarco_large.zip -P models @@ -140,7 +139,7 @@ It is worth noting again that you might need to modify the batch size to best fi Upon completion, the re-ranked run file `run.monot5.ans_small.dev.tsv` will be available in the `runs` directory. -Awesome, we can verify that this MRR@10 is indeed right using the official MS MARCO evaluation script: +We can verify that the MRR@10 is indeed right using the official MS MARCO evaluation script: ``` python evaluate/msmarco/msmarco_eval.py data/msmarco_ans_small/qrels.dev.small.tsv runs/run.monot5.ans_small.dev.tsv @@ -154,6 +153,8 @@ If you were able to replicate any of these results, please submit a PR adding to ## Replication Log ### monoBERT ++ ### monoT5 ++ From 500145dd9d50b958ea6b0b3ba996466f656b9519 Mon Sep 17 00:00:00 2001 From: ronakice Date: Fri, 22 May 2020 03:55:18 +0530 Subject: [PATCH 09/11] fix heading --- docs/experiments-msmarco-passage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/experiments-msmarco-passage.md b/docs/experiments-msmarco-passage.md index 7373e174..84c41c63 100644 --- a/docs/experiments-msmarco-passage.md +++ b/docs/experiments-msmarco-passage.md @@ -1,4 +1,4 @@ -# PyGaggle: Neural Baselines on [MS MARCO Passage Retrieval](https://github.com/microsoft/MSMARCO-Passage-Ranking) +# PyGaggle: Neural Ranking Baselines on [MS MARCO Passage Retrieval](https://github.com/microsoft/MSMARCO-Passage-Ranking) This page contains instructions for running various neural reranking baselines on the MS MARCO *passage* ranking task. Note that there is also a separate [MS MARCO *document* ranking task](experiments-msmarco-doc.md). 
From d35cd0fe9455fdc71f82b1b4b2d41b5399087615 Mon Sep 17 00:00:00 2001 From: Ronak Date: Fri, 22 May 2020 17:07:45 +0530 Subject: [PATCH 10/11] Update docs/experiments-msmarco-passage.md Co-authored-by: Rodrigo Frassetto Nogueira --- docs/experiments-msmarco-passage.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/experiments-msmarco-passage.md b/docs/experiments-msmarco-passage.md index 84c41c63..27ddf22e 100644 --- a/docs/experiments-msmarco-passage.md +++ b/docs/experiments-msmarco-passage.md @@ -9,7 +9,9 @@ We rerank the BM25 run files that contain ~ 1000 passages per query using both m Keeping computational resources in mind, our instructions primarily focus on a 105 query subset of the MS MARCO dev set. Running these instructions with the entire MS MARCO dev set should give about the same results as that in the corresponding paper. -*Note: Run the following instructions at root of this repo. Installation must have been done from source. Make sure that you have access to a GPU* +Note 1: Run the following instructions at root of this repo. +Note 2: Installation must have been done from source. +Note 3: Make sure that you have access to a GPU ## Models @@ -157,4 +159,3 @@ If you were able to replicate any of these results, please submit a PR adding to ### monoT5 + - From 4dde3383ab3c6dc17cd78c7bc0f868adeeeed5e4 Mon Sep 17 00:00:00 2001 From: Ronak Date: Fri, 22 May 2020 17:23:26 +0530 Subject: [PATCH 11/11] Update experiments-msmarco-passage.md resolve comments --- docs/experiments-msmarco-passage.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/docs/experiments-msmarco-passage.md b/docs/experiments-msmarco-passage.md index 27ddf22e..72e7f330 100644 --- a/docs/experiments-msmarco-passage.md +++ b/docs/experiments-msmarco-passage.md @@ -1,12 +1,13 @@ # PyGaggle: Neural Ranking Baselines on [MS MARCO Passage Retrieval](https://github.com/microsoft/MSMARCO-Passage-Ranking) This page contains instructions for running various neural reranking baselines on the MS MARCO *passage* ranking task. -Note that there is also a separate [MS MARCO *document* ranking task](experiments-msmarco-doc.md). +Note that there is also a separate [MS MARCO *document* ranking task](https://github.com/castorini/anserini/blob/master/docs/experiments-msmarco-doc.md). Prior to running this, we suggest looking at our first-stage [BM25 ranking instructions](https://github.com/castorini/anserini/blob/master/docs/experiments-msmarco-passage.md). -We rerank the BM25 run files that contain ~ 1000 passages per query using both monoBERT and monoT5. +We rerank the BM25 run files that contain ~1000 passages per query using both monoBERT and monoT5. +monoBERT and monoT5 are pointwise rerankers. This means that each document is scored independently using either BERT or T5 respectively. -Keeping computational resources in mind, our instructions primarily focus on a 105 query subset of the MS MARCO dev set. +Since it can take many hours to run these models on all of the 6980 queries from the MS MARCO dev set, we will instead use a subset of 105 queries randomly sampled from the dev set. Running these instructions with the entire MS MARCO dev set should give about the same results as that in the corresponding paper. Note 1: Run the following instructions at root of this repo. @@ -94,13 +95,14 @@ mrr 0.41089693612003686 mrr@10 0.4026795162509449 ``` -It takes about ~ 52 minutes to re-rank this subset on MS MARCO using a P100. 
+It takes about ~52 minutes to re-rank this subset on MS MARCO using a P100. The type of GPU will directly influence your inference time. It is possible that the default batch results in a GPU OOM error. In this case, assigning a batch size (using option `--batch-size`) which is smaller than the default (96) should help! -The re-ranked run file `run.monobert.ans_small.dev.tsv` will also be available in the `runs` directory upon completion. -We can use this to verify that the MRR@10 is indeed right using the official MS MARCO evaluation script: +The re-ranked run file `run.monobert.ans_small.dev.tsv` will also be available in the `runs` directory upon completion. + +We can use the official MS MARCO evaluation script to verify the MRR@10: ``` python evaluate/msmarco/msmarco_eval.py data/msmarco_ans_small/qrels.dev.small.tsv runs/run.monobert.ans_small.dev.tsv @@ -135,13 +137,12 @@ mrr 0.3973368360121561 mrr@10 0.39044217687074834 ``` -It takes about ~ 13 minutes to re-rank this subset on MS MARCO using a P100. +It takes about ~13 minutes to re-rank this subset on MS MARCO using a P100. It is worth noting again that you might need to modify the batch size to best fit the GPU at hand. Upon completion, the re-ranked run file `run.monot5.ans_small.dev.tsv` will be available in the `runs` directory. - -We can verify that the MRR@10 is indeed right using the official MS MARCO evaluation script: +We can use the official MS MARCO evaluation script to verify the MRR@10: ``` python evaluate/msmarco/msmarco_eval.py data/msmarco_ans_small/qrels.dev.small.tsv runs/run.monot5.ans_small.dev.tsv
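
The pointwise reranking that the documentation above describes (each passage scored independently against the query, then sorted) can be pictured with a minimal sketch. This is only an illustration, not the code path that `pygaggle/run/evaluate_passage_ranker.py` actually executes: it assumes a Hugging Face `transformers`-compatible monoBERT-style sequence-classification checkpoint, and the `model_dir` value below is a placeholder rather than a file shipped by this PR.

```python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


def pointwise_rerank(query, passages, model_dir="models/monobert_msmarco_large"):
    """Score each (query, passage) pair independently, then sort by that score."""
    # model_dir is a placeholder; any transformers-compatible monoBERT-style
    # sequence-classification checkpoint would work here.
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()

    scored = []
    for passage in passages:
        # The query and passage are encoded together as a single sentence pair.
        inputs = tokenizer(query, passage, truncation=True,
                           max_length=512, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        # Probability of the "relevant" class (index 1) is the pointwise score.
        score = torch.softmax(logits, dim=-1)[0, 1].item()
        scored.append((passage, score))

    # Every passage is scored in isolation, so the final ranking is just a sort.
    return sorted(scored, key=lambda pair: pair[1], reverse=True)
```

Swapping in monoT5 changes how each pair is scored (a T5 model judges relevance by generating a true/false token instead of using a classification head) but not the independent, per-passage structure of the loop, which is what produces the `run.*.dev.tsv` files that the official MS MARCO script then evaluates.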